├── .editorconfig ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.rst ├── docs ├── Makefile ├── conf.py └── index.rst ├── requirements.txt ├── setup.py ├── src ├── .ignore ├── _arrayops.h ├── arrayops.pxi ├── bitcount.h ├── bitops.pxi ├── block.pxi ├── immutablerb.pxi ├── macros.h ├── multirb.pxi ├── rbbinaryops.pxi └── roaringbitmap.pyx └── tests ├── benchmarks.py └── unittests.py /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org/ 2 | root = true 3 | 4 | [*] 5 | end_of_line = lf 6 | insert_final_newline = true 7 | 8 | [*.{py,pyx,pxd,pxi,c,h,cpp,css,js}] 9 | charset = utf-8 10 | indent_style = tab 11 | indent_size = 4 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | MANIFEST 2 | src/.*.swp 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | 8 | # Cython-generated files 9 | src/*.c 10 | src/*.html 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | env/ 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | 57 | # Sphinx documentation 58 | docs/_build/ 59 | 60 | # PyBuilder 61 | target/ 62 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 
27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. 
The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 
97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 
128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. 
However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. 
Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. 
Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. 
For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 
279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | {description} 294 | Copyright (C) {year} {fullname} 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 
319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | {signature of Ty Coon}, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 
340 | 341 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst LICENSE setup.py 2 | recursive-include src *.h *.c 3 | recursive-exclude src *.pyx *.pxi *.pxd 4 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | python3 setup.py install --user 3 | 4 | clean: 5 | rm -rf build/ src/roaringbitmap.h 6 | find src/ -name '*.c' -delete 7 | find src/ -name '*.so' -delete 8 | find src/ -name '*.pyc' -delete 9 | find src/ -name '*.html' -delete 10 | find tests/ -name '*.pyc' -delete 11 | rm -rf src/__pycache__ tests/__pycache__ 12 | 13 | test: all 14 | ulimit -Sv 500000; python3 -m pytest tests/unittests.py 15 | 16 | bench: all 17 | ulimit -Sv 500000; python3 tests/benchmarks.py 18 | 19 | lint: 20 | pycodestyle --ignore=E1,W1,W503 tests/*.py \ 21 | && pycodestyle --ignore=E1,W1,F,E901,E225,E227,E211,W503 \ 22 | src/*.pyx src/*.pxi 23 | 24 | py2: 25 | python2 setup.py install --user 26 | 27 | test2: py2 28 | python2 -m pytest tests/unittests.py 29 | 30 | bench2: all 31 | ulimit -Sv 500000; python2 tests/benchmarks.py 32 | 33 | debug: 34 | python3-dbg setup.py install --user --debug 35 | 36 | debug2: 37 | python2-dbg setup.py install --user --debug 38 | 39 | testdebug: debug 40 | gdb -ex run --args python3-dbg -m pytest tests/unittests.py -v 41 | 42 | testdebug2: debug2 43 | gdb -ex run --args python2-dbg -m pytest tests/unittests.py -v 44 | 45 | valgrind: 46 | python3-dbg setup.py install --user --debug 47 | valgrind --tool=memcheck --suppressions=valgrind-python.supp \ 48 | --leak-check=full --show-leak-kinds=definite \ 49 | python3.5-dbg -m pytest tests/unittests.py -v 50 | -------------------------------------------------------------------------------- /README.rst: 
-------------------------------------------------------------------------------- 1 | Roaring Bitmap in Cython 2 | ======================== 3 | 4 | A roaring bitmap is an efficient compressed data structure to store a set 5 | of integers. A roaring bitmap stores a set of 32-bit integers in a series of 6 | arrays and bitmaps, whichever takes the least space (which is always 7 | ``2 ** 16`` bits or less per block). 8 | 9 | This data structure is useful for storing a large number of integers, e.g., for 10 | an inverted index used by search engines and databases. In particular, it is 11 | possible to quickly compute the intersection of a series of sets, which can be 12 | used to implement a query as the conjunction of subqueries. 13 | 14 | This implementation is based on the Java and C implementations at 15 | https://github.com/lemire/RoaringBitmap 16 | and https://github.com/lemire/CRoaring 17 | 18 | Additional features of this implementation: 19 | 20 | - Inverted list representation: blocks that are mostly full are stored 21 | compactly as an array of non-members (instead of as an array of members or a 22 | fixed-size bitmap). 23 | - Collections of immutable roaring bitmaps can be efficiently serialized with 24 | ``mmap`` in a single file. 25 | 26 | Missing features w.r.t. CRoaring: 27 | 28 | - Run-length encoded blocks 29 | - Various AVX2 / SSE optimizations 30 | 31 | See also PyRoaringBitmap, a Python wrapper of CRoaring: 32 | https://github.com/Ezibenroc/PyRoaringBitMap 33 | 34 | License, requirements 35 | --------------------- 36 | The code is licensed under GNU GPL v2, or any later version at your option. 37 | 38 | - Python 2.7+/3.3+ http://www.python.org (headers required, e.g.
python-dev package) 39 | - Cython 0.20+ http://www.cython.org 40 | 41 | Installation, usage 42 | ------------------- 43 | 44 | :: 45 | 46 | $ git clone https://github.com/andreasvc/roaringbitmap.git 47 | $ cd roaringbitmap 48 | $ make 49 | 50 | (or ``make py2`` for Python 2) 51 | 52 | A ``RoaringBitmap()`` can be used as a replacement for a normal (mutable) 53 | Python set containing (unsigned) 32-bit integers: 54 | 55 | .. code-block:: python 56 | 57 | >>> from roaringbitmap import RoaringBitmap 58 | >>> RoaringBitmap(range(10)) & RoaringBitmap(range(5, 15)) 59 | RoaringBitmap({5, 6, 7, 8, 9}) 60 | 61 | ``ImmutableRoaringBitmap`` is an immutable variant (analogous to ``frozenset``) 62 | which is stored compactly as a contiguous block of memory. 63 | 64 | A sequence of immutable RoaringBitmaps can be stored in a single file and 65 | accessed efficiently with ``mmap``, without needing to copy or deserialize: 66 | 67 | .. code-block:: python 68 | 69 | >>> from roaringbitmap import MultiRoaringBitmap 70 | >>> mrb = MultiRoaringBitmap([range(n, n + 5) for n in range(10)], filename='index') 71 | 72 | >>> mrb = MultiRoaringBitmap.fromfile('index') 73 | >>> mrb[5] 74 | ImmutableRoaringBitmap({5, 6, 7, 8, 9}) 75 | 76 | For API documentation cf. http://roaringbitmap.readthedocs.io 77 | 78 | Benchmarks 79 | ---------- 80 | Output of ``$ make bench``:: 81 | 82 | small sparse set 83 | 100 runs with sets of 200 random elements n s.t. 0 <= n < 40000 84 | set() RoaringBitmap() ratio 85 | init 0.000834 0.00138 0.603 86 | initsort 0.00085 0.000394 2.16 87 | and 0.00102 8.49e-05 12.1 88 | or 0.00171 0.000169 10.1 89 | xor 0.00152 0.000213 7.11 90 | sub 0.000934 0.000197 4.74 91 | iand 1.29e-05 2.97e-06 4.35 92 | ior 9.7e-06 3.26e-06 2.98 93 | ixor 8.98e-06 3.43e-06 2.62 94 | isub 6.83e-06 3.3e-06 2.07 95 | eq 0.000438 1.17e-05 37.6 96 | neq 6.37e-06 7.81e-06 0.816 97 | jaccard 0.0029 0.000126 23.1 98 | 99 | medium load factor 100 | 100 runs with sets of 59392 random elements n s.t. 
0 <= n < 118784 101 | set() RoaringBitmap() ratio 102 | init 0.564 0.324 1.74 103 | initsort 0.696 0.273 2.55 104 | and 0.613 0.000418 1466 105 | or 0.976 0.000292 3344 106 | xor 0.955 0.000294 3250 107 | sub 0.346 0.000316 1092 108 | iand 0.00658 1.14e-05 575 109 | ior 0.00594 1.08e-05 548 110 | ixor 0.00434 1.12e-05 385 111 | isub 0.00431 1.09e-05 397 112 | eq 0.0991 0.000116 851 113 | neq 9.62e-06 1.29e-05 0.743 114 | jaccard 1.62 0.00025 6476 115 | 116 | dense set / high load factor 117 | 100 runs with sets of 39800 random elements n s.t. 0 <= n < 40000 118 | set() RoaringBitmap() ratio 119 | init 0.33 0.0775 4.26 120 | initsort 0.352 0.148 2.38 121 | and 0.24 0.000223 1078 122 | or 0.45 0.000165 2734 123 | xor 0.404 0.000161 2514 124 | sub 0.169 0.000173 973 125 | iand 0.00287 6.02e-06 477 126 | ior 0.00179 6.34e-06 282 127 | ixor 0.00195 5.53e-06 353 128 | isub 0.0017 6.35e-06 267 129 | eq 0.0486 4.65e-05 1045 130 | neq 1.01e-05 1.13e-05 0.888 131 | jaccard 0.722 0.000118 6136 132 | 133 | See https://github.com/Ezibenroc/roaring_analysis/ for a performance comparison 134 | of PyRoaringBitmap and this library. 135 | 136 | References 137 | ---------- 138 | - http://roaringbitmap.org/ 139 | - Chambi, S., Lemire, D., Kaser, O., & Godin, R. (2016). Better bitmap 140 | performance with Roaring bitmaps. Software: practice and experience, 46(5), 141 | pp. 709-719. http://arxiv.org/abs/1402.6407 142 | - The idea of using the inverted list representation is based on 143 | https://issues.apache.org/jira/browse/LUCENE-5983 144 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = python3 `which sphinx-build` 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 
11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make <target>' where <target> is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " singlehtml to make a single large HTML file" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 31 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 32 | @echo " text to make text files" 33 | @echo " man to make manual pages" 34 | @echo " texinfo to make Texinfo files" 35 | @echo " info to make Texinfo files and run them through makeinfo" 36 | @echo " gettext to make PO message catalogs" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | 41 | clean: 42 | -rm -rf $(BUILDDIR)/* 43 | 44 | html: 45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 48 | 49 | dirhtml: 50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 51 | @echo 52 | @echo "Build finished.
The HTML pages are in $(BUILDDIR)/dirhtml." 53 | 54 | singlehtml: 55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 56 | @echo 57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/roaringbitmap.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/roaringbitmap.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/roaringbitmap" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/roaringbitmap" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 
104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 
154 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is execfile()d with the current directory set to its containing dir. 4 | # 5 | # Note that not all possible configuration values are present in this 6 | # autogenerated file. 7 | # 8 | # All configuration values have a default; values that are commented out 9 | # serve to show the default. 10 | 11 | import sys, os 12 | 13 | # If extensions (or modules to document with autodoc) are in another directory, 14 | # add these directories to sys.path here. If the directory is relative to the 15 | # documentation root, use os.path.abspath to make it absolute, like shown here. 16 | #sys.path.insert(0, os.path.abspath('.')) 17 | 18 | # -- General configuration ---------------------------------------------------- 19 | 20 | # If your documentation needs a minimal Sphinx version, state it here. 21 | #needs_sphinx = '1.0' 22 | 23 | # Add any Sphinx extension module names here, as strings. They can be 24 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 25 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode'] 26 | 27 | # Add any paths that contain templates here, relative to this directory. 28 | templates_path = ['_templates'] 29 | 30 | # The suffix of source filenames. 31 | source_suffix = '.rst' 32 | 33 | # The encoding of source files. 34 | #source_encoding = 'utf-8-sig' 35 | 36 | # The master toctree document. 37 | master_doc = 'index' 38 | 39 | # General information about the project. 40 | project = u'roaringbitmap' 41 | copyright = u'2022, Andreas van Cranenburgh' 42 | 43 | # The version info for the project you're documenting, acts as replacement for 44 | # |version| and |release|, also used in various other places throughout the 45 | # built documents. 46 | # 47 | # The short X.Y version. 
48 | version = '0.7' 49 | # The full version, including alpha/beta/rc tags. 50 | release = '0.7.2' 51 | 52 | # The language for content autogenerated by Sphinx. Refer to documentation 53 | # for a list of supported languages. 54 | #language = None 55 | 56 | # There are two options for replacing |today|: either, you set today to some 57 | # non-false value, then it is used: 58 | #today = '' 59 | # Else, today_fmt is used as the format for a strftime call. 60 | #today_fmt = '%B %d, %Y' 61 | 62 | # List of patterns, relative to source directory, that match files and 63 | # directories to ignore when looking for source files. 64 | exclude_patterns = ['_build'] 65 | 66 | # The reST default role (used for this markup: `text`) to use for all documents 67 | #default_role = None 68 | 69 | # If true, '()' will be appended to :func: etc. cross-reference text. 70 | #add_function_parentheses = True 71 | 72 | # If true, the current module name will be prepended to all description 73 | # unit titles (such as .. function::). 74 | #add_module_names = True 75 | 76 | # If true, sectionauthor and moduleauthor directives will be shown in the 77 | # output. They are ignored by default. 78 | #show_authors = False 79 | 80 | # The name of the Pygments (syntax highlighting) style to use. 81 | pygments_style = 'sphinx' 82 | 83 | # A list of ignored prefixes for module index sorting. 
84 | #modindex_common_prefix = [] 85 | 86 | autodoc_member_order = 'bysource' 87 | autodoc_default_flags = ['members'] 88 | 89 | # -- Options for HTML output -------------------------------------------------- 90 | 91 | ## on_rtd is whether we are on readthedocs.org, this line of code grabbed from docs.readthedocs.org 92 | #on_rtd = os.environ.get('READTHEDOCS', None) == 'True' 93 | # 94 | #if not on_rtd: # only import and set the theme if we're building docs locally 95 | # import sphinx_rtd_theme 96 | # html_theme = 'sphinx_rtd_theme' 97 | # html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 98 | ## otherwise, readthedocs.org uses their theme by default, so no need to specify it 99 | 100 | html_theme = 'nature' 101 | 102 | # The name for this set of Sphinx documents. If None, it defaults to 103 | # "<project> v<release> documentation". 104 | #html_title = None 105 | 106 | # A shorter title for the navigation bar. Default is the same as html_title. 107 | #html_short_title = None 108 | 109 | # The name of an image file (relative to this directory) to place at the top 110 | # of the sidebar. 111 | #html_logo = None 112 | 113 | # The name of an image file (within the static path) to use as favicon of the 114 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 115 | # pixels large. 116 | #html_favicon = None 117 | 118 | # Add any paths that contain custom static files (such as style sheets) here, 119 | # relative to this directory. They are copied after the builtin static files, 120 | # so a file named "default.css" will overwrite the builtin "default.css". 121 | html_static_path = [] 122 | 123 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 124 | # using the given strftime format. 125 | #html_last_updated_fmt = '%b %d, %Y' 126 | 127 | # If true, SmartyPants will be used to convert quotes and dashes to 128 | # typographically correct entities. 
129 | #html_use_smartypants = True 130 | 131 | # Custom sidebar templates, maps document names to template names. 132 | html_sidebars = {'**': [ 133 | 'globaltoc.html', 134 | 'searchbox.html', 135 | #'localtoc.html', 136 | #'relations.html', 137 | #'sourcelink.html', 138 | ], } 139 | 140 | # Additional templates that should be rendered to pages, maps page names to 141 | # template names. 142 | #html_additional_pages = {} 143 | 144 | # If false, no module index is generated. 145 | html_domain_indices = False 146 | 147 | # If false, no index is generated. 148 | html_use_index = False 149 | 150 | # If true, the index is split into individual pages for each letter. 151 | #html_split_index = False 152 | 153 | # If true, links to the reST sources are added to the pages. 154 | html_show_sourcelink = False 155 | 156 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 157 | #html_show_sphinx = True 158 | 159 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 160 | #html_show_copyright = True 161 | 162 | # If true, an OpenSearch description file will be output, and all pages will 163 | # contain a <link> tag referring to it. The value of this option must be the 164 | # base URL from which the finished HTML is served. 165 | #html_use_opensearch = '' 166 | 167 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 168 | #html_file_suffix = None 169 | 170 | # Output file base name for HTML help builder. 171 | htmlhelp_basename = 'roaringbitmapdoc' 172 | 173 | # append __init__ docstring to docstring of class 174 | autoclass_content = 'both' 175 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | RoaringBitmap API documentation 2 | =============================== 3 | .. 
automodule:: roaringbitmap 4 | :members: 5 | :undoc-members: 6 | :show-inheritance: 7 | 8 | 9 | Indices and tables 10 | ================== 11 | 12 | * :ref:`genindex` 13 | * :ref:`modindex` 14 | * :ref:`search` 15 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cython>=0.21 2 | sphinx>=1.6.2 3 | pytest>=3.0.0 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """Generic setup.py for Cython code.""" 2 | import os 3 | import sys 4 | from distutils.core import setup 5 | from distutils.extension import Extension 6 | 7 | PY2 = sys.version_info[0] == 2 8 | 9 | # In releases, include C sources but not Cython sources; otherwise, use cython 10 | # to figure out which files may need to be re-cythonized. 11 | USE_CYTHON = os.path.exists('src/roaringbitmap.pyx') 12 | if USE_CYTHON: 13 | try: 14 | from Cython.Build import cythonize 15 | from Cython.Distutils import build_ext 16 | from Cython.Compiler import Options 17 | Options.fast_fail = True 18 | except ImportError: 19 | raise RuntimeError('could not import Cython.') 20 | cmdclass = dict(build_ext=build_ext) 21 | else: 22 | cmdclass = dict() 23 | 24 | DEBUG = '--debug' in sys.argv 25 | if DEBUG: 26 | sys.argv.remove('--debug') 27 | 28 | MTUNE = '--with-mtune' in sys.argv 29 | if MTUNE: 30 | sys.argv.remove('--with-mtune') 31 | 32 | with open('README.rst') as inp: 33 | README = inp.read() 34 | 35 | METADATA = dict(name='roaringbitmap', 36 | version='0.7.2', 37 | description='Roaring Bitmap', 38 | long_description=README, 39 | author='Andreas van Cranenburgh', 40 | author_email='A.W.van.Cranenburgh@rug.nl', 41 | url='http://roaringbitmap.readthedocs.io', 42 | license='GPL', 43 | platforms=['Many'], 44 | classifiers=[ 45 | 'Development Status :: 4 - Beta', 46 | 
'Intended Audience :: Science/Research', 47 | 'License :: OSI Approved :: GNU General Public License (GPL)', 48 | 'Operating System :: POSIX', 49 | 'Programming Language :: Python :: 2.7', 50 | 'Programming Language :: Python :: 3.3', 51 | 'Programming Language :: Cython', 52 | ], 53 | ) 54 | 55 | # some of these directives increase performance, 56 | # but at the cost of failing in mysterious ways. 57 | directives = { 58 | 'profile': False, 59 | 'cdivision': True, 60 | 'nonecheck': False, 61 | 'wraparound': False, 62 | 'boundscheck': False, 63 | 'infer_types': None, 64 | 'embedsignature': True, 65 | 'warn.unused': True, 66 | 'warn.unreachable': True, 67 | 'warn.maybe_uninitialized': True, 68 | 'warn.undeclared': False, 69 | 'warn.unused_arg': False, 70 | 'warn.unused_result': False, 71 | } 72 | 73 | if __name__ == '__main__': 74 | if sys.version_info[:2] < (2, 7) or (3, 0) <= sys.version_info[:2] < (3, 3): 75 | raise RuntimeError('Python version 2.7 or >= 3.3 required.') 76 | os.environ['GCC_COLORS'] = 'auto' 77 | # NB: could also use Cython compile-time definition, 78 | # but this would lead to different C output for Python 2/3. 
79 | extra_compile_args = ['-DPY2=%d' % PY2] # '-fopt-info-vec-missed', 80 | if sys.platform == 'win32': 81 | # https://docs.microsoft.com/en-us/cpp/intrinsics/bitscanforward-bitscanforward64?view=vs-2017 82 | extra_compile_args += ['-EHsc'] 83 | else: 84 | extra_compile_args += [ 85 | '-Wno-strict-prototypes', '-Wno-unreachable-code', '-Wextra'] 86 | extra_link_args = [] 87 | if not DEBUG and sys.platform != 'win32': 88 | extra_compile_args += ['-O3', '-DNDEBUG'] 89 | extra_compile_args += ['-mtune=native'] if MTUNE else ['-march=native'] 90 | extra_link_args += ['-DNDEBUG'] 91 | if USE_CYTHON: 92 | if DEBUG: 93 | directives.update(wraparound=True, boundscheck=True) 94 | if sys.platform == 'win32': 95 | extra_compile_args += ['-DDEBUG', '-Od', '-Zi'] 96 | extra_link_args += ['-DEBUG'] 97 | else: 98 | extra_compile_args += ['-g', '-O0', 99 | # '-fsanitize=address', '-fsanitize=undefined', 100 | '-fno-omit-frame-pointer'] 101 | extra_link_args += ['-g'] 102 | ext_modules = cythonize( 103 | [Extension( 104 | '*', 105 | sources=['src/*.pyx'], 106 | extra_compile_args=extra_compile_args, 107 | extra_link_args=extra_link_args)], 108 | annotate=True, 109 | compiler_directives=directives, 110 | language_level=3) 111 | else: 112 | ext_modules = [Extension( 113 | 'roaringbitmap', 114 | sources=['src/roaringbitmap.c'], 115 | extra_compile_args=extra_compile_args, 116 | extra_link_args=extra_link_args)] 117 | setup( 118 | cmdclass=cmdclass, 119 | ext_modules=ext_modules, 120 | **METADATA) 121 | -------------------------------------------------------------------------------- /src/.ignore: -------------------------------------------------------------------------------- 1 | *.c 2 | *.html 3 | -------------------------------------------------------------------------------- /src/_arrayops.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #if defined(__SSE4_2__) 5 | #if defined(_MSC_VER) 6 | #include 7 | #else 8 | 
#include 9 | #endif 10 | #endif 11 | 12 | /** 13 | * Generic intersection function. Passes unit tests. 14 | * 15 | * From CRoaring, array_util.c 16 | * cf. https://github.com/RoaringBitmap/CRoaring/blob/master/src/array_util.c 17 | */ 18 | int32_t intersect_general16(const uint16_t *A, const size_t lenA, 19 | const uint16_t *B, const size_t lenB, uint16_t *out) { 20 | const uint16_t *initout = out; 21 | const uint16_t *endA; 22 | const uint16_t *endB; 23 | if (lenA == 0 || lenB == 0) return 0; 24 | endA = A + lenA; 25 | endB = B + lenB; 26 | 27 | while (1) { 28 | while (*A < *B) { 29 | SKIP_FIRST_COMPARE: 30 | if (++A == endA) return (int32_t)(out - initout); 31 | } 32 | while (*A > *B) { 33 | if (++B == endB) return (int32_t)(out - initout); 34 | } 35 | if (*A == *B) { 36 | *out++ = *A; 37 | if (++A == endA || ++B == endB) return (int32_t)(out - initout); 38 | } else { 39 | goto SKIP_FIRST_COMPARE; 40 | } 41 | } 42 | return (int32_t)(out - initout); /* NOTREACHED */ 43 | } 44 | 45 | 46 | #if defined(__SSE4_2__) 47 | 48 | static inline int32_t intersect_uint16( 49 | const uint16_t* __restrict a, size_t a_size, 50 | const uint16_t* __restrict b, size_t b_size, 51 | uint16_t* __restrict result) { 52 | /* from https://highlyscalable.wordpress.com/2012/06/05/fast-intersection-sorted-lists-sse/ */ 53 | size_t count = 0; 54 | static __m128i shuffle_mask16[256]; 55 | static int built_shuffle_mask = 0; 56 | int i, j; 57 | if (!built_shuffle_mask) { 58 | built_shuffle_mask = 1; 59 | for (i = 0; i < 256; i++) { 60 | uint8_t mask[16]; 61 | memset(mask, 0xFF, sizeof(mask)); 62 | int counter = 0; 63 | for (j = 0; j < 16; j++) { 64 | if (i & (1 << j)) { 65 | mask[counter++] = 2 * j; 66 | mask[counter++] = 2 * j + 1; 67 | } 68 | } 69 | __m128i v_mask = _mm_loadu_si128((const __m128i *)mask); 70 | shuffle_mask16[i] = v_mask; 71 | } 72 | } 73 | size_t i_a = 0, i_b = 0; 74 | size_t st_a = (a_size / 8) * 8; 75 | size_t st_b = (b_size / 8) * 8; 76 | 77 | while(i_a < st_a && i_b < 
st_b) { 78 | __m128i v_a = _mm_loadu_si128((__m128i *)&a[i_a]); 79 | __m128i v_b = _mm_loadu_si128((__m128i *)&b[i_b]); 80 | __m128i v_cmp = _mm_cmpestrm(v_a, 8, v_b, 8, 81 | _SIDD_UWORD_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_BIT_MASK); 82 | int r = _mm_extract_epi32(v_cmp, 0); 83 | __m128i v_shuf = _mm_shuffle_epi8(v_b, shuffle_mask16[r]); 84 | _mm_storeu_si128((__m128i *)&result[count], v_shuf); 85 | count += _mm_popcnt_u32(r); 86 | uint16_t a_max = _mm_extract_epi16(v_a, 7); 87 | uint16_t b_max = _mm_extract_epi16(v_b, 7); 88 | i_a += (a_max <= b_max) * 8; 89 | i_b += (a_max >= b_max) * 8; 90 | } 91 | a += i_a; 92 | a_size -= i_a; 93 | b += i_b; 94 | b_size -= i_b; 95 | result += count; 96 | return count + intersect_general16(a, a_size, b, b_size, result); 97 | } 98 | 99 | #else /* __SSE4_2__ */ 100 | 101 | int32_t intersect_uint16(const uint16_t *A, size_t s_a, 102 | const uint16_t *B, size_t s_b, uint16_t *C) { 103 | return intersect_general16(A, s_a, B, s_b, C); 104 | } 105 | 106 | #endif /* __SSE4_2__ */ 107 | -------------------------------------------------------------------------------- /src/arrayops.pxi: -------------------------------------------------------------------------------- 1 | # Set / search operations on integer arrays 2 | 3 | cdef inline int binarysearch(uint16_t *data, int begin, int end, 4 | uint16_t elem) nogil: 5 | """Binary search for short `elem` in array `data`. 
6 | 7 | :returns: positive index ``i`` if ``elem`` is found; otherwise return a 8 | negative value ``i`` such that ``-i - 1`` is the index where ``elem`` 9 | should be inserted.""" 10 | cdef int low = begin 11 | cdef int high = end - 1 12 | cdef int middleidx 13 | cdef uint16_t middleval 14 | # accelerate the possibly common case of a just appended value 15 | if end > 0 and data[end - 1] < elem: 16 | return -end - 1 17 | while low <= high: 18 | middleidx = (low + high) >> 1 19 | middleval = data[middleidx] 20 | if middleval < elem: 21 | low = middleidx + 1 22 | elif middleval > elem: 23 | high = middleidx - 1 24 | else: 25 | return middleidx 26 | return -(low + 1) 27 | 28 | 29 | cdef inline int advance(uint16_t *data, int pos, int length, 30 | uint16_t minitem) nogil: 31 | cdef int lower = pos + 1 32 | cdef int spansize = 1 33 | cdef int upper, mid 34 | if lower >= length or data[lower] >= minitem: 35 | return lower 36 | while lower + spansize < length and data[lower + spansize] < minitem: 37 | spansize *= 2 38 | upper = (lower + spansize) if lower + spansize < length else (length - 1) 39 | if data[upper] == minitem: 40 | return upper 41 | if data[upper] < minitem: 42 | return length 43 | lower += spansize >> 1 44 | while lower + 1 != upper: 45 | mid = (lower + upper) >> 1 46 | if data[mid] == minitem: 47 | return mid 48 | elif data[mid] < minitem: 49 | lower = mid 50 | else: 51 | upper = mid 52 | return upper 53 | 54 | 55 | cdef uint32_t intersect2by2(uint16_t *data1, uint16_t *data2, 56 | int length1, int length2, uint16_t *dest) nogil: 57 | if length1 * 64 < length2: 58 | return intersectgalloping(data1, length1, data2, length2, dest) 59 | elif length2 * 64 < length1: 60 | return intersectgalloping(data2, length2, data1, length1, dest) 61 | if dest is NULL: 62 | return intersectcard(data1, data2, length1, length2) 63 | elif data1 is not dest and data2 is not dest: 64 | # NB: dest must have 8 elements extra capacity 65 | return intersect_uint16(data1, length1, 
data2, length2, dest) 66 | return intersect_general16(data1, length1, data2, length2, dest) 67 | # return intersectlocal2by2(data1, length1, data2, length2, dest) 68 | 69 | 70 | cdef inline int intersectlocal2by2(uint16_t *data1, int length1, 71 | uint16_t *data2, int length2, uint16_t *dest) nogil: 72 | cdef int k1 = 0, k2 = 0, pos = 0 73 | if length1 == 0 or length2 == 0: 74 | return 0 75 | while True: 76 | if data2[k2] < data1[k1]: 77 | while True: 78 | k2 += 1 79 | if k2 == length2: 80 | return pos 81 | elif data2[k2] >= data1[k1]: 82 | break 83 | elif data1[k1] < data2[k2]: 84 | while True: 85 | k1 += 1 86 | if k1 == length1: 87 | return pos 88 | elif data1[k1] >= data2[k2]: 89 | break 90 | else: # data1[k1] == data2[k2] 91 | dest[pos] = data1[k1] 92 | pos += 1 93 | k1 += 1 94 | if k1 == length1: 95 | return pos 96 | k2 += 1 97 | if k2 == length2: 98 | return pos 99 | 100 | 101 | cdef inline int intersectcard(uint16_t *data1, uint16_t *data2, 102 | int length1, int length2) nogil: 103 | cdef int k1 = 0, k2 = 0, pos = 0 104 | if length1 == 0 or length2 == 0: 105 | return 0 106 | while True: 107 | if data2[k2] < data1[k1]: 108 | while True: 109 | k2 += 1 110 | if k2 == length2: 111 | return pos 112 | elif data2[k2] >= data1[k1]: 113 | break 114 | elif data1[k1] < data2[k2]: 115 | while True: 116 | k1 += 1 117 | if k1 == length1: 118 | return pos 119 | elif data1[k1] >= data2[k2]: 120 | break 121 | else: # data1[k1] == data2[k2] 122 | pos += 1 123 | k1 += 1 124 | if k1 == length1: 125 | return pos 126 | k2 += 1 127 | if k2 == length2: 128 | return pos 129 | 130 | 131 | cdef inline int intersectgalloping( 132 | uint16_t *small, int lensmall, 133 | uint16_t *large, int lenlarge, 134 | uint16_t *dest) nogil: 135 | cdef int k1 = 0, k2 = 0, pos = 0 136 | if lensmall == 0: 137 | return 0 138 | if dest is NULL: # cardinality only 139 | while True: 140 | if large[k1] < small[k2]: 141 | k1 = advance(large, k1, lenlarge, small[k2]) 142 | if k1 == lenlarge: 143 | return pos 
144 | if small[k2] < large[k1]: 145 | k2 += 1 146 | if k2 == lensmall: 147 | return pos 148 | else: # large[k1] == small[k2] 149 | pos += 1 150 | k2 += 1 151 | if k2 == lensmall: 152 | return pos 153 | k1 = advance(large, k1, lenlarge, small[k2]) 154 | if k1 == lenlarge: 155 | return pos 156 | else: # store result 157 | while True: 158 | if large[k1] < small[k2]: 159 | k1 = advance(large, k1, lenlarge, small[k2]) 160 | if k1 == lenlarge: 161 | return pos 162 | if small[k2] < large[k1]: 163 | k2 += 1 164 | if k2 == lensmall: 165 | return pos 166 | else: # large[k1] == small[k2] 167 | dest[pos] = small[k2] 168 | pos += 1 169 | k2 += 1 170 | if k2 == lensmall: 171 | return pos 172 | k1 = advance(large, k1, lenlarge, small[k2]) 173 | if k1 == lenlarge: 174 | return pos 175 | 176 | 177 | cdef int union2by2(uint16_t *data1, uint16_t *data2, 178 | int length1, int length2, uint16_t *dest) nogil: 179 | cdef int k1 = 0, k2 = 0, pos = 0, n_elems 180 | if length2 == 0: 181 | if dest is not NULL: 182 | memcpy(dest, data1, length1 * sizeof(uint16_t)) 183 | return length1 184 | elif length1 == 0: 185 | if dest is not NULL: 186 | memcpy(dest, data2, length2 * sizeof(uint16_t)) 187 | return length2 188 | elif length1 > length2: 189 | return union2by2(data2, data1, length2, length1, dest) 190 | if dest is NULL: # cardinality only 191 | while True: 192 | if data1[k1] < data2[k2]: 193 | pos += 1 194 | k1 += 1 195 | if k1 >= length1: 196 | break 197 | elif data1[k1] > data2[k2]: 198 | pos += 1 199 | k2 += 1 200 | if k2 >= length2: 201 | break 202 | else: # data1[k1] == data2[k2] 203 | pos += 1 204 | k1 += 1 205 | k2 += 1 206 | if k1 >= length1 or k2 >= length2: 207 | break 208 | else: # store result 209 | while True: 210 | if data1[k1] < data2[k2]: 211 | dest[pos] = data1[k1] 212 | pos += 1 213 | k1 += 1 214 | if k1 >= length1: 215 | break 216 | elif data1[k1] > data2[k2]: 217 | dest[pos] = data2[k2] 218 | pos += 1 219 | k2 += 1 220 | if k2 >= length2: 221 | break 222 | else: # 
data1[k1] == data2[k2] 223 | dest[pos] = data1[k1] 224 | pos += 1 225 | k1 += 1 226 | k2 += 1 227 | if k1 >= length1 or k2 >= length2: 228 | break 229 | if k1 < length1: 230 | n_elems = length1 - k1 231 | if dest is not NULL: 232 | memcpy(&(dest[pos]), &(data1[k1]), n_elems * sizeof(uint16_t)) 233 | pos += n_elems 234 | elif k2 < length2: 235 | n_elems = length2 - k2 236 | if dest is not NULL: 237 | memcpy(&(dest[pos]), &(data2[k2]), n_elems * sizeof(uint16_t)) 238 | pos += n_elems 239 | return pos 240 | 241 | 242 | cdef int union2by2bitmap(uint16_t *data1, uint16_t *data2, 243 | int length1, int length2, uint64_t *dest) nogil: 244 | """Like union2by2, but write result to bitmap.""" 245 | cdef int length = 0, pos = 0 246 | memset(dest, 0, BITMAPSIZE) 247 | for pos in range(length1): 248 | SETBIT(dest, data1[pos]) 249 | length = length1 250 | for pos in range(length2): 251 | length += TESTBIT(dest, data2[pos]) == 0 252 | SETBIT(dest, data2[pos]) 253 | return length 254 | 255 | 256 | cdef int difference(uint16_t *data1, uint16_t *data2, 257 | int length1, int length2, uint16_t *dest) nogil: 258 | cdef int k1 = 0, k2 = 0, pos = 0 259 | if length2 == 0: 260 | if dest is not NULL: 261 | memcpy(dest, data1, length1 * sizeof(uint16_t)) 262 | return length1 263 | elif length1 == 0: 264 | return 0 265 | if dest is NULL: # cardinality only 266 | while True: 267 | if data1[k1] < data2[k2]: 268 | pos += 1 269 | k1 += 1 270 | if k1 >= length1: 271 | return pos 272 | elif data1[k1] == data2[k2]: 273 | k1 += 1 274 | k2 += 1 275 | if k1 >= length1: 276 | return pos 277 | elif k2 >= length2: 278 | break 279 | else: # data1[k1] > data2[k2] 280 | k2 += 1 281 | if k2 >= length2: 282 | break 283 | while k1 < length1: 284 | pos += 1 285 | k1 += 1 286 | else: # store result 287 | while True: 288 | if data1[k1] < data2[k2]: 289 | dest[pos] = data1[k1] 290 | pos += 1 291 | k1 += 1 292 | if k1 >= length1: 293 | return pos 294 | elif data1[k1] == data2[k2]: 295 | k1 += 1 296 | k2 += 1 297 | 
if k1 >= length1: 298 | return pos 299 | elif k2 >= length2: 300 | break 301 | else: # data1[k1] > data2[k2] 302 | k2 += 1 303 | if k2 >= length2: 304 | break 305 | while k1 < length1: 306 | dest[pos] = data1[k1] 307 | pos += 1 308 | k1 += 1 309 | return pos 310 | 311 | 312 | cdef int xor2by2(uint16_t *data1, uint16_t *data2, 313 | int length1, int length2, uint16_t *dest) nogil: 314 | cdef int k1 = 0, k2 = 0, pos = 0 315 | if length2 == 0: 316 | if dest is not NULL: 317 | memcpy(dest, data1, length1 * sizeof(uint16_t)) 318 | return length1 319 | elif length1 == 0: 320 | if dest is not NULL: 321 | memcpy(dest, data2, length2 * sizeof(uint16_t)) 322 | return length2 323 | if dest is NULL: # cardinality only 324 | while True: 325 | if data1[k1] < data2[k2]: 326 | pos += 1 327 | k1 += 1 328 | if k1 >= length1: 329 | break 330 | elif data1[k1] == data2[k2]: 331 | k1 += 1 332 | k2 += 1 333 | if k1 >= length1 or k2 >= length2: 334 | break 335 | else: # data1[k1] > data2[k2] 336 | pos += 1 337 | k2 += 1 338 | if k2 >= length2: 339 | break 340 | if k1 >= length1: 341 | while k2 < length2: 342 | pos += 1 343 | k2 += 1 344 | elif k2 >= length2: 345 | while k1 < length1: 346 | pos += 1 347 | k1 += 1 348 | else: # store result 349 | while True: 350 | if data1[k1] < data2[k2]: 351 | dest[pos] = data1[k1] 352 | pos += 1 353 | k1 += 1 354 | if k1 >= length1: 355 | break 356 | elif data1[k1] == data2[k2]: 357 | k1 += 1 358 | k2 += 1 359 | if k1 >= length1 or k2 >= length2: 360 | break 361 | else: # data1[k1] > data2[k2] 362 | dest[pos] = data2[k2] 363 | pos += 1 364 | k2 += 1 365 | if k2 >= length2: 366 | break 367 | if k1 >= length1: 368 | while k2 < length2: 369 | dest[pos] = data2[k2] 370 | pos += 1 371 | k2 += 1 372 | elif k2 >= length2: 373 | while k1 < length1: 374 | dest[pos] = data1[k1] 375 | pos += 1 376 | k1 += 1 377 | return pos 378 | 379 | 380 | cdef inline int selectinvertedbinarysearch( 381 | uint16_t *data, int begin, int end, uint16_t i) nogil: 382 | """Custom 
binary search to find i'th member given array of non-members.""" 383 | # 0 1 2 3 4 5 6 7 8 9 10 ... indices 384 | # 0 1 2 ... inverted: indices 385 | # 3 7 11 ... inverted: non-members 386 | # 0 1 2 4 5 6 8 9 10 12 13 ... members 387 | cdef int low = begin 388 | cdef int high = end - 1 389 | cdef int middleidx 390 | cdef uint16_t middleval 391 | if end == 0 or data[0] > i: 392 | return i 393 | elif data[high] - high <= i: 394 | return i + high + 1 395 | # find the pair of non-members between which the i'th member lies 396 | while low < high: 397 | middleidx = (low + high) >> 1 398 | middleval = data[middleidx] - middleidx 399 | if middleval > i: 400 | high = middleidx 401 | else: 402 | low = middleidx + 1 403 | # compute member given index 404 | return i + low 405 | -------------------------------------------------------------------------------- /src/bitcount.h: -------------------------------------------------------------------------------- 1 | /* Fast cross-platform bit counting using intrinsic functions 2 | * 3 | * This code is based on https://github.com/Noctune/bitcount 4 | * Adapted for 64-bit integers instead of 32 bits. 
5 | */ 6 | 7 | #ifndef BITCOUNT_H_ 8 | #define BITCOUNT_H_ 9 | 10 | #ifdef __cplusplus 11 | extern "C" { 12 | #endif 13 | 14 | #if !defined(BITCOUNT_NO_AUTODETECT) 15 | #if defined(__GNUC__) || defined(__clang__) 16 | #define BITCOUNT_GCC 17 | // FIXME: disabled for debugging 18 | // #elif defined(_MSC_VER) && defined(_M_X64) 19 | // #define BITCOUNT_VS_X64 20 | // #elif defined(_MSC_VER) && defined(_M_IX86) 21 | // #define BITCOUNT_VS_X86 22 | #endif 23 | #endif 24 | 25 | #ifdef _MSC_VER 26 | #define BITCOUNT_INLINE static __inline 27 | #else 28 | #define BITCOUNT_INLINE static inline 29 | #endif 30 | 31 | #ifdef BITCOUNT_VS_X64 32 | #include <intrin.h> 33 | #pragma intrinsic(_BitScanForward64,_BitScanReverse64,__popcnt64) 34 | #endif 35 | 36 | #ifdef BITCOUNT_VS_X86 37 | #include <intrin.h> 38 | #pragma intrinsic(_BitScanForward,_BitScanReverse,__popcnt) 39 | #endif 40 | 41 | #include <limits.h> 42 | #include <stdint.h> 43 | #define BITCOUNT_BITS (sizeof(uint64_t) * CHAR_BIT) 44 | 45 | /* General implementations for systems without intrinsics */ 46 | unsigned int bit_clz_general(uint64_t); 47 | unsigned int bit_ctz_general(uint64_t); 48 | unsigned int bit_popcount_general(uint64_t); 49 | 50 | /* Returns the number of leading 0-bits in x, starting at the most significant 51 | bit position. If v is 0, the result is undefined. 
*/ 52 | BITCOUNT_INLINE unsigned int bit_clz(uint64_t v) { 53 | #if defined(BITCOUNT_GCC) 54 | return __builtin_clzll(v); 55 | #elif defined(BITCOUNT_VS_X64) 56 | unsigned long result; 57 | _BitScanReverse64(&result, v); 58 | return BITCOUNT_BITS - 1 - result; 59 | #elif defined(BITCOUNT_VS_X86) 60 | unsigned long result; 61 | if ((uint32_t)(v >> 32) != 0) { 62 | _BitScanReverse(&result, (uint32_t)(v >> 32)); 63 | } else { 64 | _BitScanReverse(&result, (uint32_t)v); 65 | result += 32; 66 | } 67 | return BITCOUNT_BITS - 1 - result; 68 | #else 69 | return bit_clz_general(v); 70 | #endif 71 | } 72 | 73 | /* Returns the number of trailing 0-bits in x, starting at the least significant 74 | bit position. If v is 0, the result is undefined. */ 75 | BITCOUNT_INLINE unsigned int bit_ctz(uint64_t v) { 76 | #if defined(BITCOUNT_GCC) 77 | return __builtin_ctzll(v); 78 | #elif defined(BITCOUNT_VS_X64) 79 | unsigned long result; 80 | _BitScanForward64(&result, v); 81 | return result; 82 | #elif defined(BITCOUNT_VS_X86) 83 | unsigned long result; 84 | /* https://github.com/google/re2/commit/35febd432d9e6d8630845285c7f29eabd1df7beb */ 85 | if ((uint32_t)v != 0) { 86 | _BitScanForward(&result, (uint32_t)v); 87 | return (unsigned int)result; 88 | } else { 89 | _BitScanForward(&result, (uint32_t)(v >> 32)); 90 | return (unsigned int)(result) + 32; 91 | } 92 | #else 93 | return bit_ctz_general(v); 94 | #endif 95 | } 96 | 97 | /* Returns the number of 1-bits in v. 
*/ 98 | BITCOUNT_INLINE unsigned int bit_popcount(uint64_t v) { 99 | #if defined(BITCOUNT_GCC) 100 | return __builtin_popcountll(v); 101 | #elif defined(BITCOUNT_VS_X64) 102 | return __popcnt64(v); 103 | #elif defined(BITCOUNT_VS_X86) 104 | return (__popcnt((uint32_t)v) + __popcnt((uint32_t)(v >> 32))); 105 | #else 106 | return bit_popcount_general(v); 107 | #endif 108 | } 109 | 110 | unsigned int bit_clz_general(uint64_t v) { 111 | /* From http://www.codeproject.com/Tips/784635/UInt-Bit-Operations */ 112 | uint64_t i, c; 113 | 114 | i = ~v; 115 | c = ((i ^ (i + 1)) & i) >> 63; 116 | 117 | i = (v >> 32) + 0xffffffff; 118 | i = ((i & 0x100000000) ^ 0x100000000) >> 27; 119 | c += i; v <<= i; 120 | 121 | i = (v >> 48) + 0xffff; 122 | i = ((i & 0x10000) ^ 0x10000) >> 12; 123 | c += i; v <<= i; 124 | 125 | i = (v >> 56) + 0xff; 126 | i = ((i & 0x100) ^ 0x100) >> 5; 127 | c += i; v <<= i; 128 | 129 | i = (v >> 60) + 0xf; 130 | i = ((i & 0x10) ^ 0x10) >> 2; 131 | c += i; v <<= i; 132 | 133 | i = (v >> 62) + 3; 134 | i = ((i & 4) ^ 4) >> 1; 135 | c += i; v <<= i; 136 | 137 | c += (v >> 63) ^ 1; 138 | 139 | return (unsigned int)c; 140 | } 141 | 142 | unsigned int bit_ctz_general(uint64_t v) { 143 | /* From http://www.codeproject.com/Tips/784635/UInt-Bit-Operations */ 144 | uint64_t i = ~v; 145 | uint64_t c = ((i ^ (i + 1)) & i) >> 63; 146 | 147 | i = (v & 0xffffffff) + 0xffffffff; 148 | i = ((i & 0x100000000) ^ 0x100000000) >> 27; 149 | c += i; v >>= i; 150 | 151 | i = (v & 0xffff) + 0xffff; 152 | i = ((i & 0x10000) ^ 0x10000) >> 12; 153 | c += i; v >>= i; 154 | 155 | i = (v & 0xff) + 0xff; 156 | i = ((i & 0x100) ^ 0x100) >> 5; 157 | c += i; v >>= i; 158 | 159 | i = (v & 0xf) + 0xf; 160 | i = ((i & 0x10) ^ 0x10) >> 2; 161 | c += i; v >>= i; 162 | 163 | i = (v & 3) + 3; 164 | i = ((i & 4) ^ 4) >> 1; 165 | c += i; v >>= i; 166 | 167 | c += ((v & 1) ^ 1); 168 | 169 | return (unsigned int)c; 170 | } 171 | 172 | unsigned int bit_popcount_general(uint64_t v) { 173 | /* see 
http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel */ 174 | v -= ((v >> 1) & 0x5555555555555555); 175 | v = (v & 0x3333333333333333) + ((v >> 2) & 0x3333333333333333); 176 | return (((v + (v >> 4)) & 0xF0F0F0F0F0F0F0F) * 0x101010101010101) >> 56; 177 | } 178 | 179 | #ifdef __cplusplus 180 | } 181 | #endif 182 | 183 | #endif /* BITCOUNT_H_ */ 184 | -------------------------------------------------------------------------------- /src/bitops.pxi: -------------------------------------------------------------------------------- 1 | """Operations on fixed-size bitvectors. 2 | 3 | All bitvector operands are assumed to have ``BLOCKSIZE`` elements (bits). 4 | """ 5 | 6 | # Store result, return cardinality 7 | cdef inline uint32_t bitsetintersect(uint64_t *dest, 8 | uint64_t *src1, uint64_t *src2) noexcept nogil: 9 | """dest gets the intersection of src1 and src2. 10 | 11 | :returns: number of set bits in result.""" 12 | cdef size_t n 13 | cdef uint64_t res1, res2 14 | cdef uint32_t result = 0 15 | for n in range(0, (BLOCKSIZE // BITSIZE), 2): 16 | res1 = src1[n] & src2[n] 17 | res2 = src1[n + 1] & src2[n + 1] 18 | dest[n] = res1 19 | dest[n + 1] = res2 20 | result += bit_popcount(res1) 21 | result += bit_popcount(res2) 22 | return result 23 | 24 | 25 | cdef inline uint32_t bitsetunion(uint64_t *dest, 26 | uint64_t *src1, uint64_t *src2) noexcept nogil: 27 | """dest gets the union of src1 and src2. 28 | 29 | :returns: number of set bits in result.""" 30 | cdef size_t n 31 | cdef uint64_t res1, res2 32 | cdef uint32_t result = 0 33 | for n in range(0, (BLOCKSIZE // BITSIZE), 2): 34 | res1 = src1[n] | src2[n] 35 | res2 = src1[n + 1] | src2[n + 1] 36 | dest[n] = res1 37 | dest[n + 1] = res2 38 | result += bit_popcount(res1) 39 | result += bit_popcount(res2) 40 | return result 41 | 42 | 43 | cdef inline uint32_t bitsetxor(uint64_t *dest, 44 | uint64_t *src1, uint64_t *src2) noexcept nogil: 45 | """dest gets the xor of src1 and src2.
46 | 47 | :returns: number of set bits in result.""" 48 | cdef size_t n 49 | cdef uint64_t res1, res2 50 | cdef uint32_t result = 0 51 | for n in range(0, (BLOCKSIZE // BITSIZE), 2): 52 | res1 = src1[n] ^ src2[n] 53 | res2 = src1[n + 1] ^ src2[n + 1] 54 | dest[n] = res1 55 | dest[n + 1] = res2 56 | result += bit_popcount(res1) 57 | result += bit_popcount(res2) 58 | return result 59 | 60 | 61 | cdef inline uint32_t bitsetsubtract(uint64_t *dest, 62 | uint64_t *src1, uint64_t *src2) noexcept nogil: 63 | """dest gets src1 - src2, i.e., the bits of src1 that are not set in src2. 64 | 65 | :returns: number of set bits in result.""" 66 | cdef size_t n 67 | cdef uint64_t res1, res2 68 | cdef uint32_t result = 0 69 | for n in range(0, (BLOCKSIZE // BITSIZE), 2): 70 | res1 = src1[n] & ~src2[n] 71 | res2 = src1[n + 1] & ~src2[n + 1] 72 | dest[n] = res1 73 | dest[n + 1] = res2 74 | result += bit_popcount(res1) 75 | result += bit_popcount(res2) 76 | return result 77 | 78 | 79 | # Only store result, no cardinality 80 | cdef inline void bitsetintersectnocard(uint64_t *dest, 81 | uint64_t *src1, uint64_t *src2) noexcept nogil: 82 | """dest gets the intersection of src1 and src2.""" 83 | cdef size_t n 84 | cdef uint64_t res1, res2 85 | for n in range(0, (BLOCKSIZE // BITSIZE), 2): 86 | res1 = src1[n] & src2[n] 87 | res2 = src1[n + 1] & src2[n + 1] 88 | dest[n] = res1 89 | dest[n + 1] = res2 90 | 91 | 92 | cdef inline void bitsetunionnocard(uint64_t *dest, 93 | uint64_t *src1, uint64_t *src2) noexcept nogil: 94 | """dest gets the union of src1 and src2.""" 95 | cdef size_t n 96 | cdef uint64_t res1, res2 97 | for n in range(0, (BLOCKSIZE // BITSIZE), 2): 98 | res1 = src1[n] | src2[n] 99 | res2 = src1[n + 1] | src2[n + 1] 100 | dest[n] = res1 101 | dest[n + 1] = res2 102 | 103 | 104 | cdef inline void bitsetxornocard(uint64_t *dest, 105 | uint64_t *src1, uint64_t *src2) noexcept nogil: 106 | """dest gets the xor of src1 and src2.""" 107 | cdef size_t n 108 | cdef uint64_t res1, res2 109 | for n in range(0, (BLOCKSIZE // BITSIZE), 2):
110 | res1 = src1[n] ^ src2[n] 111 | res2 = src1[n + 1] ^ src2[n + 1] 112 | dest[n] = res1 113 | dest[n + 1] = res2 114 | 115 | 116 | cdef inline void bitsetsubtractnocard(uint64_t *dest, 117 | uint64_t *src1, uint64_t *src2) noexcept nogil: 118 | """dest gets src1 - src2, i.e., the bits of src1 not set in src2.""" 119 | cdef size_t n 120 | cdef uint64_t res1, res2 121 | for n in range(0, (BLOCKSIZE // BITSIZE), 2): 122 | res1 = src1[n] & ~src2[n] 123 | res2 = src1[n + 1] & ~src2[n + 1] 124 | dest[n] = res1 125 | dest[n + 1] = res2 126 | 127 | 128 | # Count cardinality only 129 | cdef inline uint32_t bitsetintersectcount( 130 | uint64_t *src1, uint64_t *src2) noexcept nogil: 131 | """return the cardinality of the intersection of src1 and src2. 132 | 133 | :returns: number of set bits in result. 134 | Both operands are assumed to have a fixed number of bits ``BLOCKSIZE``.""" 135 | cdef uint32_t result = 0 136 | cdef size_t n 137 | for n in range((BLOCKSIZE // BITSIZE)): 138 | result += bit_popcount(src1[n] & src2[n]) 139 | return result 140 | 141 | 142 | # Other operations 143 | cdef inline int iteratesetbits(uint64_t *vec, 144 | uint64_t *cur, int *idx) noexcept nogil: 145 | """Iterate over set bits in an array of unsigned long. 146 | 147 | :param cur: pointer to variable to maintain state, 148 | ``cur`` should be initialized to the first element of 149 | the bit array ``vec``, i.e., ``cur = vec[idx]``. 150 | :param idx: pointer to variable to maintain state, 151 | ``idx`` should be initialized to 0. 152 | :returns: the index of a set bit, or -1 if there are no more set 153 | bits. The result of calling a stopped iterator is undefined.
154 | 155 | e.g.:: 156 | 157 | int idx = 0 158 | uint64_t vec[4] = {0, 0, 0, 0b10001}, cur = vec[idx] 159 | iteratesetbits(vec, 4, &cur, &idx) # returns 0 160 | iteratesetbits(vec, 4, &cur, &idx) # returns 4 161 | iteratesetbits(vec, 4, &cur, &idx) # returns -1 162 | """ 163 | cdef int tmp 164 | while not cur[0]: 165 | idx[0] += 1 166 | if idx[0] >= (BLOCKSIZE // BITSIZE): 167 | return -1 168 | cur[0] = vec[idx[0]] 169 | tmp = bit_ctz(cur[0]) # index of right-most 1-bit in current slot 170 | cur[0] ^= 1ULL << tmp # TOGGLEBIT(cur, tmp) 171 | return idx[0] * BITSIZE + tmp 172 | 173 | 174 | cdef inline int iterateunsetbits(uint64_t *vec, 175 | uint64_t *cur, int *idx) noexcept nogil: 176 | """Like ``iteratesetbits``, but return indices of zero bits. 177 | 178 | :param cur: should be initialized as: ``cur = ~vec[idx]``. 179 | :param idx: pointer to variables to maintain state, 180 | ``idx`` should be initialized to 0. 181 | """ 182 | cdef int tmp 183 | while not cur[0]: 184 | idx[0] += 1 185 | if idx[0] >= (BLOCKSIZE // BITSIZE): 186 | return -1 187 | cur[0] = ~vec[idx[0]] 188 | tmp = bit_ctz(cur[0]) # index of right-most 0-bit in current slot 189 | cur[0] ^= 1ULL << tmp # TOGGLEBIT(cur, tmp) 190 | return idx[0] * BITSIZE + tmp 191 | 192 | 193 | cdef inline int reviteratesetbits(uint64_t *vec, uint64_t *cur, 194 | int *idx) noexcept nogil: 195 | """Iterate in reverse over set bits in an array of unsigned long. 196 | 197 | :param cur: pointer to variable to maintain state, 198 | ``cur`` should be initialized to the last element of 199 | the bit array ``vec``, i.e., ``cur = vec[idx]``. 200 | :param idx: pointer to variable to maintain state, 201 | ``idx`` should be initialized to ``slots - 1``, where slots is the 202 | number of elements in unsigned long array ``vec``. 203 | :returns: the index of a set bit, or -1 if there are no more set 204 | bits. The result of calling a stopped iterator is undefined. 
205 | 206 | e.g.:: 207 | 208 | int idx = 3 209 | uint64_t vec[4] = {0, 0, 0, 0b10001}, cur = vec[idx] 210 | reviteratesetbits(vec, 4, &cur, &idx) # returns 4 211 | reviteratesetbits(vec, 4, &cur, &idx) # returns 0 212 | reviteratesetbits(vec, 4, &cur, &idx) # returns -1 213 | """ 214 | cdef int tmp 215 | while not cur[0]: 216 | idx[0] -= 1 217 | if idx[0] < 0: 218 | return -1 219 | cur[0] = vec[idx[0]] 220 | tmp = BITSIZE - bit_clz(cur[0]) - 1 # index of left-most 1-bit in cur 221 | cur[0] &= ~(1ULL << tmp) # CLEARBIT(cur, tmp) 222 | return idx[0] * BITSIZE + tmp 223 | 224 | 225 | cdef inline uint32_t extractsetbits(uint16_t *dest, 226 | uint64_t *src) noexcept nogil: 227 | """Store set bits of bitvector in preallocated array. 228 | 229 | :returns: number of elements in result.""" 230 | cdef size_t n, length = 0, base = 0 231 | cdef uint64_t cur 232 | for n in range((BLOCKSIZE // BITSIZE)): 233 | cur = src[n] 234 | while cur: 235 | dest[length] = base + bit_ctz(cur) 236 | length += 1 237 | cur ^= cur & -cur 238 | base += 64 239 | return length 240 | 241 | 242 | cdef inline uint32_t extractunsetbits(uint16_t *dest, 243 | uint64_t *src) noexcept nogil: 244 | """Store zero bits of bitvector in preallocated array. 245 | 246 | :returns: number of elements in result.""" 247 | cdef size_t n, length = 0, base = 0 248 | cdef uint64_t cur 249 | for n in range((BLOCKSIZE // BITSIZE)): 250 | cur = ~src[n] 251 | while cur: 252 | dest[length] = base + bit_ctz(cur) 253 | length += 1 254 | cur ^= cur & -cur 255 | base += 64 256 | return length 257 | 258 | 259 | cdef inline uint32_t extractintersection( 260 | uint16_t *dest, uint64_t *src1, uint64_t *src2) noexcept nogil: 261 | """Compute intersection of bitvectors and store in preallocated array. 
262 | 263 | :returns: number of elements in result.""" 264 | cdef size_t n, length = 0, base = 0 265 | cdef uint64_t cur 266 | for n in range((BLOCKSIZE // BITSIZE)): 267 | cur = src1[n] & src2[n] 268 | while cur: 269 | dest[length] = base + bit_ctz(cur) 270 | length += 1 271 | cur ^= cur & -cur 272 | base += 64 273 | return length 274 | 275 | 276 | cdef inline bint bitsubset(uint64_t *vec1, uint64_t *vec2) noexcept nogil: 277 | """Test whether vec1 is a subset of vec2. 278 | 279 | i.e., all set bits of vec1 should be set in vec2.""" 280 | cdef size_t n 281 | for n in range(0, (BLOCKSIZE // BITSIZE), 2): 282 | if (vec1[n] & vec2[n]) != vec1[n] or ( 283 | vec1[n + 1] & vec2[n + 1]) != vec1[n + 1]: 284 | return False 285 | return True 286 | 287 | 288 | cdef inline bint bitdisjoint(uint64_t *vec1, uint64_t *vec2) noexcept nogil: 289 | """Test whether vec1 is disjoint from vec2. 290 | 291 | i.e., len(vec1 & vec2) = 0.""" 292 | cdef size_t n 293 | for n in range(0, (BLOCKSIZE // BITSIZE), 2): 294 | if (vec1[n] & vec2[n]) or (vec1[n + 1] & vec2[n + 1]): 295 | return False 296 | return True 297 | 298 | 299 | cdef inline int select64(uint64_t w, int i) except -1: 300 | """Given a 64-bit int w, return the position of the ith 1-bit.""" 301 | cdef uint64_t part1 = w & 0xFFFFFFFFUL 302 | cdef int wfirsthalf = bit_popcount(part1) 303 | if wfirsthalf > i: 304 | return select32(part1, i) 305 | else: 306 | return select32((w >> 32), i - wfirsthalf) + 32 307 | 308 | 309 | cdef inline int select32(uint32_t w, int i) except -1: 310 | """Given a 32-bit int w, return the position of the ith 1-bit.""" 311 | cdef uint64_t part1 = w & 0xFFFFUL 312 | cdef int wfirsthalf = bit_popcount(part1) 313 | if wfirsthalf > i: 314 | return select16(part1, i) 315 | else: 316 | return select16(w >> 16, i - wfirsthalf) + 16 317 | 318 | 319 | cdef inline int select16(uint16_t w, int i) except -1: 320 | """Given a 16-bit int w, return the position of the ith 1-bit.""" 321 | cdef int sumtotal = 0, counter 
322 | for counter in range(16): 323 | sumtotal += (w >> counter) & 1 324 | if sumtotal > i: 325 | return counter 326 | raise IndexError('select16: index %d out of range 0..%d.' % ( 327 | i, bit_popcount(w))) 328 | 329 | 330 | cdef inline void setbitcard(uint64_t *bitmap, uint16_t elem, 331 | uint32_t *cardinality) noexcept nogil: 332 | """Set bit and update cardinality without branch.""" 333 | cdef uint32_t i 334 | cdef uint64_t ow, nw 335 | i = BITSLOT(elem) 336 | ow = bitmap[i] 337 | nw = ow | BITMASK(elem) 338 | cardinality[0] += (ow ^ nw) >> (elem % BITSIZE) 339 | bitmap[i] = nw 340 | 341 | 342 | cdef inline void clearbitcard(uint64_t *bitmap, uint16_t elem, 343 | uint32_t *cardinality) noexcept nogil: 344 | """Clear bit and update cardinality without branch.""" 345 | cdef uint32_t i 346 | cdef uint64_t ow, nw 347 | i = BITSLOT(elem) 348 | ow = bitmap[i] 349 | nw = ow & ~BITMASK(elem) 350 | cardinality[0] -= (ow ^ nw) >> (elem % BITSIZE) 351 | bitmap[i] = nw 352 | 353 | 354 | cdef inline void togglebitcard(uint64_t *bitmap, uint16_t elem, 355 | uint32_t *cardinality) noexcept nogil: 356 | """Flip bit and update cardinality without branch.""" 357 | cdef uint32_t i 358 | cdef uint64_t ow, nw 359 | i = BITSLOT(elem) 360 | ow = bitmap[i] 361 | nw = ow ^ BITMASK(elem) 362 | cardinality[0] += (nw >> (elem % BITSIZE)) - (ow >> (elem % BITSIZE)) 363 | bitmap[i] = nw 364 | -------------------------------------------------------------------------------- /src/immutablerb.pxi: -------------------------------------------------------------------------------- 1 | cdef class ImmutableRoaringBitmap(RoaringBitmap): 2 | """A roaring bitmap that does not allow mutation operations. 3 | 4 | Any operation resulting in a new roaring bitmap is returned as a mutable 5 | RoaringBitmap (except for ``freeze()`` and the ``ImmutableRoaringBitmap`` 6 | constructor). Stores data in one contiguous block of memory for efficient 7 | serialization. 
8 | """ 9 | cdef readonly object _ob # object to be kept for ptr to remain valid 10 | cdef char *ptr # the data 11 | cdef size_t bufsize # length in bytes of data 12 | cdef long _hash # cached hash value, computed as needed 13 | 14 | def __init__(self, iterable=None): 15 | """Return a new RoaringBitmap with elements from ``iterable``. 16 | 17 | The elements ``x`` of a RoaringBitmap must be ``0 <= x < 2 ** 32``. 18 | If ``iterable`` is not specified, a new empty RoaringBitmap is 19 | returned. Note that a sorted iterable will significantly speed up the 20 | construction. 21 | ``iterable`` may be a generator, in which case the generator is 22 | consumed incrementally. 23 | ``iterable`` may be a ``range`` (Python 3) or ``xrange`` (Python 2) 24 | object, which will be constructed efficiently.""" 25 | cdef RoaringBitmap ob 26 | cdef ImmutableRoaringBitmap iob 27 | if isinstance(iterable, ImmutableRoaringBitmap): 28 | iob = iterable 29 | self.__setstate__(iob.__getstate__()) 30 | else: 31 | ob = ensurerb(iterable or ()) 32 | self.__setstate__(ob.__getstate__()) 33 | 34 | def __getstate__(self): 35 | """Return a serialized representation (Python array) for pickling.""" 36 | if self._ob is None: 37 | state = array.clone(chararray, self.bufsize, False) 38 | memcpy(state.data.as_chars, self.ptr, self.bufsize) 39 | return state 40 | return self._ob 41 | 42 | def __setstate__(self, array.array state): 43 | """Initialize this object with a serialized representation. 44 | 45 | :param state: a char array with the pickle format of RoaringBitmap. 46 | Instead of copying this data, it will be used directly. 47 | """ 48 | self._ob = state 49 | # FIXME: 32 byte alignment depends on state.data being aligned. 
50 | self._setptr(state.data.as_chars, len(state)) 51 | 52 | cdef void _setptr(self, char *ptr, size_t size) noexcept nogil: 53 | self.ptr = ptr 54 | self.offset = ptr 55 | self.bufsize = size 56 | self._hash = -1 57 | self.size = (ptr)[0] 58 | self.capacity = self.size 59 | self.keys = &(ptr[sizeof(uint32_t)]) 60 | # pointers will be adjusted on the fly with self.offset 61 | self.data = &(ptr[ 62 | sizeof(uint32_t) + self.size * (sizeof(uint16_t))]) 63 | 64 | def __hash__(self): 65 | cdef size_t n 66 | if self._hash == -1: 67 | self._hash = 5381 68 | for n in range(self.bufsize): 69 | self._hash = ((self._hash << 5) + self._hash) + self.ptr[n] 70 | # i.e., self._hash = self._hash * 33 + self.ptr[n] 71 | return self._hash 72 | 73 | def __richcmp__(x, y, int op): 74 | cdef ImmutableRoaringBitmap iob1, iob2 75 | if (isinstance(x, ImmutableRoaringBitmap) 76 | and isinstance(y, ImmutableRoaringBitmap)): 77 | if op == 2: # == 78 | iob1, iob2 = x, y 79 | if (iob1.bufsize != iob2.bufsize 80 | or iob1.__hash__() != iob2.__hash__()): 81 | return False 82 | return memcmp(iob1.ptr, iob2.ptr, iob1.bufsize) == 0 83 | elif op == 3: # != 84 | return not (x == y) 85 | return richcmp(x, y, op) 86 | 87 | def __sizeof__(self): 88 | """Return memory usage in bytes.""" 89 | return self.bufsize # not len(self._ob): _ob may be None (see __getstate__) 90 | 91 | def freeze(self): 92 | """Already immutable, return self.""" 93 | return self 94 | 95 | def __repr__(self): 96 | return 'ImmutableRoaringBitmap(%s)' % str(self) 97 | 98 | def copy(self): 99 | """Return a copy of this RoaringBitmap.""" 100 | cdef ImmutableRoaringBitmap result = ImmutableRoaringBitmap.__new__( 101 | ImmutableRoaringBitmap) 102 | result.__setstate__(array.copy(self.__getstate__())) 103 | return result 104 | 105 | def __iand__(self, x): 106 | """Unsupported method.""" 107 | raise ValueError('ImmutableRoaringBitmap cannot be modified.') 108 | 109 | def __isub__(self, x): 110 | """Unsupported method.""" 111 | raise ValueError('ImmutableRoaringBitmap cannot be modified.') 112 | 113 | def
__ior__(self, x): 114 | """Unsupported method.""" 115 | raise ValueError('ImmutableRoaringBitmap cannot be modified.') 116 | 117 | def __ixor__(self, x): 118 | """Unsupported method.""" 119 | raise ValueError('ImmutableRoaringBitmap cannot be modified.') 120 | 121 | def add(self, uint32_t elem): 122 | """Unsupported method.""" 123 | raise ValueError('ImmutableRoaringBitmap cannot be modified.') 124 | 125 | def discard(self, uint32_t elem): 126 | """Unsupported method.""" 127 | raise ValueError('ImmutableRoaringBitmap cannot be modified.') 128 | 129 | def remove(self, uint32_t elem): 130 | """Unsupported method.""" 131 | raise ValueError('ImmutableRoaringBitmap cannot be modified.') 132 | 133 | def pop(self): 134 | """Unsupported method.""" 135 | raise ValueError('ImmutableRoaringBitmap cannot be modified.') 136 | 137 | def update(self, *bitmaps): 138 | """Unsupported method.""" 139 | raise ValueError('ImmutableRoaringBitmap cannot be modified.') 140 | 141 | def intersection_update(self, *bitmaps): 142 | """Unsupported method.""" 143 | raise ValueError('ImmutableRoaringBitmap cannot be modified.') 144 | 145 | def difference_update(self, *other): 146 | """Unsupported method.""" 147 | raise ValueError('ImmutableRoaringBitmap cannot be modified.') 148 | 149 | def symmetric_difference_update(self, other): 150 | """Unsupported method.""" 151 | raise ValueError('ImmutableRoaringBitmap cannot be modified.') 152 | 153 | def flip_range(self, start, stop): 154 | """Unsupported method.""" 155 | raise ValueError('ImmutableRoaringBitmap cannot be modified.') 156 | 157 | def clear(self): 158 | """Unsupported method.""" 159 | raise ValueError('ImmutableRoaringBitmap cannot be modified.') 160 | -------------------------------------------------------------------------------- /src/macros.h: -------------------------------------------------------------------------------- 1 | /* http://c-faq.com/misc/bitsets.html */ 2 | /* Original, any word size: 3 | #define BITSIZE (8 * 
sizeof(uint64_t)) 4 | #define BITSLOT(b) ((b) / BITSIZE) 5 | #define BITMASK(b) (1ULL << ((b) % BITSIZE)) 6 | #define TESTBIT(a, b) ((a)[BITSLOT(b)] & BITMASK(b)) 7 | NB: TESTBIT returns 0 or a value with bit b set 8 | Fix word size at 64 bits: 9 | */ 10 | #define BITSIZE (64) 11 | #define BITSIZE1 (BITSIZE - 1) 12 | #define BITSLOT(b) ((b) >> 6) 13 | #define BITMASK(b) (1ULL << ((b) & BITSIZE1)) 14 | #define SETBIT(a, b) ((a)[BITSLOT(b)] |= BITMASK(b)) 15 | #define TOGGLEBIT(a, b) ((a)[BITSLOT(b)] ^= BITMASK(b)) 16 | #define CLEARBIT(a, b) ((a)[BITSLOT(b)] &= ~BITMASK(b)) 17 | #define BITNSLOTS(nb) (((nb) + BITSIZE1) / BITSIZE) 18 | #define TESTBIT(a, b) (((a)[BITSLOT(b)] >> (b & BITSIZE1)) & 1) 19 | /* NB: TESTBIT returns 0 or 1*/ 20 | 21 | #ifdef _MSC_VER 22 | #define ALIGNED_INLINE __inline 23 | #else 24 | #define ALIGNED_INLINE inline 25 | #endif 26 | 27 | /* https://stackoverflow.com/q/16376942 */ 28 | ALIGNED_INLINE void* aligned_malloc(size_t size, size_t align) { 29 | void *result; 30 | #ifdef _MSC_VER 31 | result = _aligned_malloc(size, align); 32 | #else 33 | if (posix_memalign(&result, align, size)) 34 | result = 0; 35 | #endif 36 | return result; 37 | } 38 | 39 | ALIGNED_INLINE void aligned_free(void *ptr) { 40 | #ifdef _MSC_VER 41 | _aligned_free(ptr); 42 | #else 43 | free(ptr); 44 | #endif 45 | } 46 | -------------------------------------------------------------------------------- /src/multirb.pxi: -------------------------------------------------------------------------------- 1 | @cython.no_gc_clear 2 | cdef class MultiRoaringBitmap(object): 3 | """A sequence of immutable roaring bitmaps. 4 | 5 | Bitmaps are addressed with 32-bit indices. 6 | Everything is stored in a single contiguous block of memory. 7 | 8 | >>> mrb = MultiRoaringBitmap([ 9 | ... RoaringBitmap({0, 1, 2}), 10 | ... RoaringBitmap({1, 6, 8}), 11 | ... 
RoaringBitmap({1, 7, 2})]) 12 | >>> mrb.intersection(list(range(len(mrb)))) 13 | RoaringBitmap({1}) 14 | >>> mrb[0] | mrb[1] 15 | RoaringBitmap({0, 1, 2, 6, 8}) 16 | """ 17 | cdef uint32_t size # the number of roaring bitmaps 18 | cdef uint32_t *offsets # byte offset in ptr for each roaring bitmap 19 | cdef uint32_t *sizes # the size in bytes of each roaring bitmap 20 | cdef uint32_t *ptr # the data 21 | cdef object _ob # array or mmap which should be kept alive for ptr 22 | cdef object _file # optionally, file with mmap to be kept open 23 | 24 | def __init__(self, list init, filename=None): 25 | """ 26 | :param init: a list of set-like objects (e.g., RoaringBitmaps). 27 | May contain ``None`` elements, which are treated as empty 28 | sets. 29 | :param filename: if given, result is stored in an mmap'd file. 30 | File is overwritten if it already exists.""" 31 | cdef ImmutableRoaringBitmap irb 32 | cdef uint32_t alloc, offset 33 | cdef int alignment = 32 34 | cdef Py_buffer buffer 35 | cdef Py_ssize_t size = 0 36 | cdef char *ptr = NULL 37 | cdef int result 38 | 39 | if filename is not None: 40 | flags = os.O_CREAT | os.O_RDWR 41 | if sys.platform == 'win32': 42 | flags |= os.O_BINARY 43 | self._file = os.open(filename, flags) 44 | 45 | tmp = [None if a is None else ImmutableRoaringBitmap(a) for a in init] 46 | self.size = len(tmp) 47 | alloc = sizeof(uint32_t) + 2 * self.size * sizeof(uint32_t) 48 | extra = alignment - alloc % alignment 49 | alloc += extra 50 | offset = alloc 51 | for irb in tmp: 52 | if irb is not None: 53 | alloc += irb.bufsize 54 | 55 | if filename is not None: 56 | os.ftruncate(self._file, alloc) 57 | self._ob = mmap.mmap( 58 | -1 if filename is None else self._file, 59 | alloc, access=mmap.ACCESS_WRITE) 60 | result = getbufptr(self._ob, &ptr, &size, &buffer) 61 | self.ptr = ptr 62 | if result != 0: 63 | raise ValueError('could not get buffer from mmap.') 64 | 65 | self.ptr[0] = self.size 66 | self.offsets = &(self.ptr[1]) 67 | self.sizes = 
&(self.ptr[1 + self.size]) 68 | for n in range(1 + 2 * self.size, 69 | 1 + 2 * self.size + extra // sizeof(uint32_t)): 70 | self.ptr[n] = 0 71 | for n, irb in enumerate(tmp): 72 | # offset 73 | self.ptr[1 + n] = offset 74 | # size 75 | if irb is None or irb.size == 0: 76 | self.ptr[1 + n + self.size] = 0 77 | continue 78 | self.ptr[1 + n + self.size] = irb.bufsize 79 | # copy data 80 | memcpy(&((self.ptr)[offset]), irb.ptr, irb.bufsize) 81 | offset += irb.bufsize 82 | if filename is not None: 83 | self._ob.flush() 84 | releasebuf(&buffer) 85 | 86 | def __richcmp__(x, y, int op): 87 | if x is None or y is None: 88 | if op == 2 or op == 3: 89 | return op == 3 90 | raise TypeError 91 | if (not isinstance(x, (MultiRoaringBitmap, list)) 92 | or not isinstance(y, (MultiRoaringBitmap, list))): 93 | raise TypeError 94 | if op == 2: # == 95 | if len(x) != len(y): 96 | return False 97 | return all(a == b for a, b in zip(x, y)) 98 | elif op == 3: # != 99 | if len(x) != len(y): 100 | return True 101 | return not all(a == b for a, b in zip(x, y)) 102 | return NotImplemented 103 | 104 | def close(self): 105 | """Close opened file, if any.""" 106 | if hasattr(self._ob, 'close'): 107 | self._ob.close() 108 | self._ob = None 109 | if self._file is not None: 110 | os.close(self._file) 111 | self._file = None 112 | 113 | def __enter__(self): 114 | return self 115 | 116 | def __exit__(self, _type, _value, _traceback): 117 | self.close() 118 | 119 | def __getstate__(self): 120 | """Return a serialized representation (Python array) for pickling.""" 121 | return bytes(self._ob) 122 | 123 | def __setstate__(self, state): 124 | """Initialize this object with a serialized representation.""" 125 | self._ob = state 126 | self.ptr = state 127 | self.size = self.ptr[0] 128 | self.offsets = &(self.ptr[1]) 129 | self.sizes = &(self.ptr[1 + self.size]) 130 | 131 | @classmethod 132 | def fromfile(cls, filename): 133 | """Load a MultiRoaringBitmap from a file using mmap.""" 134 | cdef 
MultiRoaringBitmap ob 135 | cdef Py_buffer buffer 136 | cdef char *ptr = NULL 137 | cdef Py_ssize_t size = 0 138 | ob = MultiRoaringBitmap.__new__(MultiRoaringBitmap) 139 | flags = os.O_RDONLY 140 | if sys.platform == 'win32': 141 | flags |= os.O_BINARY 142 | ob._file = os.open(filename, flags) 143 | ob._ob = mmap.mmap(ob._file, 0, access=mmap.ACCESS_READ) 144 | result = getbufptr(ob._ob, &ptr, &size, &buffer) 145 | ob.ptr = ptr 146 | if result != 0: 147 | raise ValueError('could not get buffer from mmap.') 148 | ob.size = ob.ptr[0] 149 | ob.offsets = &(ob.ptr[1]) 150 | ob.sizes = &(ob.ptr[1 + ob.size]) 151 | # rest is data 152 | releasebuf(&buffer) 153 | return ob 154 | 155 | @classmethod 156 | def frombuffer(cls, data, int offset): 157 | """Load a MultiRoaringBitmap from a Python object using the buffer 158 | interface (e.g. bytes or mmap object), starting at ``offset``.""" 159 | cdef MultiRoaringBitmap ob = MultiRoaringBitmap.__new__( 160 | MultiRoaringBitmap) 161 | cdef char *ptr = NULL 162 | cdef Py_buffer buffer 163 | cdef Py_ssize_t size = 0 164 | result = getbufptr(data, &ptr, &size, &buffer) 165 | ob.ptr = &ptr[offset] 166 | if result != 0: 167 | raise ValueError('could not get buffer from mmap.') 168 | ob.size = ob.ptr[0] 169 | ob.offsets = &(ob.ptr[1]) 170 | ob.sizes = &(ob.ptr[1 + ob.size]) 171 | # rest is data 172 | releasebuf(&buffer) 173 | return ob 174 | 175 | def bufsize(self): 176 | """Return size in number of bytes.""" 177 | return self.offsets[self.size - 1] + self.sizes[self.size - 1] 178 | 179 | def __len__(self): 180 | return self.size 181 | 182 | def __getitem__(self, i): 183 | """Like self.get(), but handle negative indices, slices and raise 184 | IndexError for invalid index.""" 185 | if isinstance(i, slice): 186 | return [self[n] for n in range(*i.indices(self.size))] 187 | elif not isinstance(i, (int, long)): 188 | raise TypeError('Expected integer index or slice object.') 189 | elif i < 0: 190 | i += self.size 191 | result = self.get(i) 
192 | if result is None: 193 | raise IndexError 194 | return result 195 | 196 | cpdef get(self, long i): 197 | """Return bitmap `i` as an ``ImmutableRoaringBitmap``, or ``None`` if 198 | `i` is an invalid index.""" 199 | cdef ImmutableRoaringBitmap ob1 200 | if i < 0 or i >= self.size: 201 | return None 202 | if self.sizes[i] == 0: 203 | return EMPTYIRB 204 | ob1 = ImmutableRoaringBitmap.__new__(ImmutableRoaringBitmap) 205 | ob1._setptr(&(self.ptr)[self.offsets[i]], self.sizes[i]) 206 | return ob1 207 | 208 | def getsize(self, long i): 209 | return self.sizes[i] 210 | 211 | def intersection(self, list indices, 212 | uint32_t start=0, uint32_t stop=0xffffffffUL): 213 | """Compute the intersection of the roaring bitmaps in this collection 214 | selected by the given list of indices. 215 | 216 | :param start: optional start index. 217 | :param stop: optional end index; 218 | if given, only return elements ``n`` s.t. ``start <= n < stop``. 219 | :returns: the intersection as a RoaringBitmap (an immutable one if a 220 | single index is given without ``start``/``stop``). Returns ``None`` when an 221 | invalid index is encountered or an empty result is obtained.
222 | """ 223 | cdef ImmutableRoaringBitmap ob1, ob2 224 | cdef RoaringBitmap result 225 | cdef char *ptr = self.ptr 226 | cdef long i, j, numindices = len(indices) 227 | if numindices == 0: 228 | return None 229 | for i in range(numindices): 230 | j = indices[i] 231 | if j < 0 or j >= self.size or self.sizes[j] == 0: 232 | return None 233 | ob1 = ImmutableRoaringBitmap.__new__(ImmutableRoaringBitmap) 234 | if numindices == 1: 235 | i = indices[0] 236 | ob1._setptr(&(ptr[self.offsets[i]]), self.sizes[i]) 237 | if start or stop < 0xffffffffUL: 238 | return rb_clamp(ob1, start, stop) 239 | return ob1 240 | indices.sort(key=self.getsize) 241 | ob2 = ImmutableRoaringBitmap.__new__(ImmutableRoaringBitmap) 242 | # TODO with nogil?: 243 | i, j = indices[0], indices[1] 244 | ob1._setptr(&(ptr[self.offsets[i]]), self.sizes[i]) 245 | ob2._setptr(&(ptr[self.offsets[j]]), self.sizes[j]) 246 | if start or stop < 0xffffffffUL: 247 | result = rb_clamp(ob1, start, stop) 248 | rb_iand(result, ob2) 249 | else: 250 | result = rb_and(ob1, ob2) 251 | for i in range(2, numindices): 252 | j = indices[i] 253 | # swap out contents of ImmutableRoaringBitmap object 254 | ob1._setptr(&(ptr[self.offsets[j]]), self.sizes[j]) 255 | rb_iand(result, ob1) 256 | if result.size == 0: 257 | return None 258 | return result 259 | 260 | def andor_len_pairwise(self, array.array indices1, array.array indices2, 261 | array.array resultand, array.array resultor): 262 | """Pairwise intersection/union cardinality for pairs of roaring bitmaps 263 | in this collection given by ``zip(indices1, indices2)``. 264 | 265 | :param indices1: input array 266 | :param indices2: input array 267 | :param resultand: result array 268 | :param resultor: result array 269 | 270 | All parameters should be Python arrays of type 'L', all preallocated 271 | with the same length; result arrays need not be initialized. 
272 | 273 | >>> result1 = array.array('L', [0] * 3) 274 | >>> result2 = array.array('L', [0] * 3) 275 | >>> mrb.andor_len_pairwise(array.array('L', [0, 6, 8]), 276 | ... array.array('L', [1, 7, 6]), result1, result2) 277 | >>> result1 278 | array.array('L', [3, 2, 56]) 279 | >>> result2 280 | array.array('L', [6, 4, 123]) 281 | """ 282 | cdef char *ptr = self.ptr 283 | cdef int i, j, n, lenindices1 = len(indices1) 284 | cdef ImmutableRoaringBitmap ob1, ob2 285 | ob1 = ImmutableRoaringBitmap.__new__(ImmutableRoaringBitmap) 286 | ob2 = ImmutableRoaringBitmap.__new__(ImmutableRoaringBitmap) 287 | with nogil: 288 | for n in range(lenindices1): 289 | i, j = indices1.data.as_ulongs[n], indices2.data.as_ulongs[n] 290 | ob1._setptr(&(ptr[self.offsets[i]]), self.sizes[i]) 291 | ob2._setptr(&(ptr[self.offsets[j]]), self.sizes[j]) 292 | if self.sizes[i] and self.sizes[j]: 293 | rb_andor_len(ob1, ob2, &(resultand.data.as_ulongs[n]), 294 | &(resultor.data.as_ulongs[n])) 295 | else: 296 | resultand.data.as_ulongs[n] = 0 297 | resultor.data.as_ulongs[n] = 0 298 | 299 | def jaccard_dist(self, array.array indices1, array.array indices2): 300 | """Compute the Jaccard distances for pairs of roaring bitmaps 301 | in this collection given by ``zip(indices1, indices2)``. 302 | 303 | >>> mrb.jaccard_dist(array.array('L', [0, 6, 8]), 304 | ... array.array('L', [1, 7, 6])) 305 | array.array('d', [0.3, 0.2, 0.56]) 306 | 307 | :param indices1: input array 308 | :param indices2: input array 309 | :returns: a Python array of floats with the jaccard distances. 310 | 311 | ``indices1`` and ``indices2`` should be arrays of unsigned long 312 | integers, created with ``array.array('L')``. Ensure that all indices 313 | `i` are in the range ``0 <= i < len(self)``.
314 | """ 315 | cdef ImmutableRoaringBitmap ob1, ob2 316 | cdef array.array result = array.clone(dblarray, len(indices1), False) 317 | cdef char *ptr = self.ptr 318 | cdef int i, j, n, lenindices1 = len(indices1) 319 | ob1 = ImmutableRoaringBitmap.__new__(ImmutableRoaringBitmap) 320 | ob2 = ImmutableRoaringBitmap.__new__(ImmutableRoaringBitmap) 321 | with nogil: 322 | for n in range(lenindices1): 323 | i, j = indices1.data.as_ulongs[n], indices2.data.as_ulongs[n] 324 | ob1._setptr(&(ptr[self.offsets[i]]), self.sizes[i]) 325 | ob2._setptr(&(ptr[self.offsets[j]]), self.sizes[j]) 326 | result.data.as_doubles[n] = (rb_jaccard_dist(ob1, ob2) 327 | if self.sizes[i] and self.sizes[j] else 1) 328 | return result 329 | 330 | def jaccard_dist_single(self, RoaringBitmap rb): 331 | """Compute the Jaccard distances for `rb` with all roaring bitmaps 332 | in this collection. 333 | 334 | >>> mrb.jaccard_dist_single(RoaringBitmap([1, 6, 19, 22])) 335 | array.array('d', [0.3, 0.2, 0.56]) 336 | 337 | :param rb: a roaring bitmap. 338 | :returns: a Python array of floats with the jaccard distances with 339 | length equal to `len(self)`. 
340 | """ 341 | cdef ImmutableRoaringBitmap ob1, ob2 342 | cdef array.array result = array.clone(dblarray, len(self), False) 343 | cdef char *ptr = self.ptr 344 | cdef uint32_t n 345 | ob1 = ImmutableRoaringBitmap.__new__(ImmutableRoaringBitmap) 346 | ob2 = ImmutableRoaringBitmap(rb) 347 | with nogil: 348 | for n in range(self.size): 349 | ob1._setptr(&(ptr[self.offsets[n]]), self.sizes[n]) 350 | result.data.as_doubles[n] = rb_jaccard_dist(ob1, ob2) 351 | return result 352 | -------------------------------------------------------------------------------- /src/rbbinaryops.pxi: -------------------------------------------------------------------------------- 1 | cdef inline richcmp(x, y, int op): 2 | """Considers comparisons to RoaringBitmaps and sets; 3 | other types raise a TypeError.""" 4 | cdef RoaringBitmap ob1, ob2 5 | cdef size_t n 6 | if x is None or y is None: 7 | if op == 2 or op == 3: 8 | return op == 3 9 | raise TypeError 10 | if (not isinstance(x, (RoaringBitmap, set)) 11 | or not isinstance(y, (RoaringBitmap, set))): 12 | raise TypeError 13 | if op == 2: # == 14 | ob1, ob2 = ensurerb(x), ensurerb(y) 15 | if ob1.size != ob2.size: 16 | return False 17 | if memcmp(ob1.keys, ob2.keys, ob1.size * sizeof(uint16_t)) != 0: 18 | return False 19 | for n in range(ob1.size): 20 | if ob1.data[n].cardinality != ob2.data[n].cardinality: 21 | return False 22 | for n in range(ob1.size): 23 | if memcmp( 24 | (ob1.offset + ob1.data[n].buf.offset), 25 | (ob2.offset + ob2.data[n].buf.offset), 26 | getsize(&(ob1.data[n])) * sizeof(uint16_t)) != 0: 27 | return False 28 | return True 29 | elif op == 3: # != 30 | return not richcmp(x, y, 2) 31 | elif op == 1: # <= 32 | return ensurerb(x).issubset(y) 33 | elif op == 5: # >= 34 | return ensurerb(x).issuperset(y) 35 | elif op == 0: # < 36 | return len(x) < len(y) and ensurerb(x).issubset(y) 37 | elif op == 4: # > 38 | return len(x) > len(y) and ensurerb(x).issuperset(y) 39 | return NotImplemented 40 | 41 | 42 | cdef inline 
RoaringBitmap rb_iand(RoaringBitmap ob1, RoaringBitmap ob2): 43 | cdef uint32_t pos1 = 0, pos2 = 0, res = 0 44 | cdef uint16_t *keys = NULL 45 | cdef Block *data = NULL 46 | cdef Block b2 47 | if ob2.size == 0: 48 | for pos1 in range(ob1.size): 49 | aligned_free(ob1.data[pos1].buf.ptr) 50 | ob1._resize(0) 51 | elif ob1.size > 0: 52 | ob1.capacity = min(ob1.size, ob2.size) 53 | ob1._tmpalloc(ob1.capacity, &keys, &data) 54 | while True: 55 | if ob1.keys[pos1] < ob2.keys[pos2]: 56 | aligned_free(ob1.data[pos1].buf.ptr) 57 | pos1 += 1 58 | if pos1 == ob1.size: 59 | break 60 | elif ob1.keys[pos1] > ob2.keys[pos2]: 61 | pos2 += 1 62 | if pos2 == ob2.size: 63 | break 64 | else: # ob1.keys[pos1] == ob2.keys[pos2]: 65 | block_iand(&(ob1.data[pos1]), ob2._getblk(pos2, &b2)) 66 | if ob1.data[pos1].cardinality > 0: 67 | keys[res] = ob1.keys[pos1] 68 | data[res] = ob1.data[pos1] 69 | res += 1 70 | else: 71 | aligned_free(ob1.data[pos1].buf.ptr) 72 | pos1 += 1 73 | pos2 += 1 74 | if pos1 == ob1.size or pos2 == ob2.size: 75 | break 76 | ob1._replacearrays(keys, data, res) 77 | return ob1 78 | 79 | 80 | cdef inline RoaringBitmap rb_isub(RoaringBitmap ob1, RoaringBitmap ob2): 81 | cdef uint32_t pos1 = 0, pos2 = 0, res = 0 82 | cdef uint16_t *keys = NULL 83 | cdef Block *data = NULL 84 | cdef Block b2 85 | if pos1 < ob1.size and pos2 < ob2.size: 86 | ob1.capacity = ob1.size 87 | ob1._tmpalloc(ob1.capacity, &keys, &data) 88 | while True: 89 | if ob1.keys[pos1] < ob2.keys[pos2]: 90 | keys[res] = ob1.keys[pos1] 91 | data[res] = ob1.data[pos1] 92 | res += 1 93 | pos1 += 1 94 | if pos1 == ob1.size: 95 | break 96 | elif ob1.keys[pos1] > ob2.keys[pos2]: 97 | pos2 += 1 98 | if pos2 == ob2.size: 99 | break 100 | else: # ob1.keys[pos1] == ob2.keys[pos2]: 101 | block_isub(&(ob1.data[pos1]), ob2._getblk(pos2, &b2)) 102 | if ob1.data[pos1].cardinality > 0: 103 | keys[res] = ob1.keys[pos1] 104 | data[res] = ob1.data[pos1] 105 | res += 1 106 | else: 107 | aligned_free(ob1.data[pos1].buf.ptr) 108 | 
# rb_isub, continued: end of the equal-keys branch of the merge loop.
				pos1 += 1
				pos2 += 1
				if pos1 == ob1.size or pos2 == ob2.size:
					break
		if pos2 == ob2.size:
			# ob2 exhausted: the rest of ob1 is kept unchanged
			for pos1 in range(pos1, ob1.size):
				keys[res] = ob1.keys[pos1]
				data[res] = ob1.data[pos1]
				res += 1
		ob1._replacearrays(keys, data, res)
	return ob1


cdef inline RoaringBitmap rb_ior(RoaringBitmap ob1, RoaringBitmap ob2):
	"""In-place union: update ``ob1`` with ``ob1 | ob2``; returns ``ob1``.

	Blocks that only occur in ``ob2`` are copied with ``block_copy``;
	blocks of ``ob1`` are moved to the new arrays."""
	cdef uint32_t pos1 = 0, pos2 = 0, res = 0
	cdef uint16_t *keys = NULL
	cdef Block *data = NULL
	cdef Block b2
	if ob2.size == 0:
		return ob1
	ob1.capacity = ob1.size + ob2.size
	ob1._tmpalloc(ob1.capacity, &keys, &data)
	if pos1 < ob1.size and pos2 < ob2.size:
		while True:
			if ob1.keys[pos1] < ob2.keys[pos2]:
				keys[res] = ob1.keys[pos1]
				data[res] = ob1.data[pos1]
				res += 1
				pos1 += 1
				if pos1 == ob1.size:
					break
			elif ob1.keys[pos1] > ob2.keys[pos2]:
				keys[res] = ob2.keys[pos2]
				block_copy(&(data[res]), ob2._getblk(pos2, &b2))
				res += 1
				pos2 += 1
				if pos2 == ob2.size:
					break
			else:  # ob1.keys[pos1] == ob2.keys[pos2]:
				block_ior(&(ob1.data[pos1]), ob2._getblk(pos2, &b2))
				keys[res] = ob1.keys[pos1]
				data[res] = ob1.data[pos1]
				res += 1
				pos1 += 1
				pos2 += 1
				if pos1 == ob1.size or pos2 == ob2.size:
					break
	# Append whatever remains of the operand that was not exhausted.
	if pos1 == ob1.size:
		for pos2 in range(pos2, ob2.size):
			keys[res] = ob2.keys[pos2]
			block_copy(&(data[res]), ob2._getblk(pos2, &b2))
			res += 1
	elif pos2 == ob2.size:
		for pos1 in range(pos1, ob1.size):
			keys[res] = ob1.keys[pos1]
			data[res] = ob1.data[pos1]
			res += 1
	ob1._replacearrays(keys, data, res)
	return ob1


cdef inline RoaringBitmap rb_ixor(RoaringBitmap ob1, RoaringBitmap ob2):
	"""In-place symmetric difference: update ``ob1`` with ``ob1 ^ ob2``;
	returns ``ob1``. Blocks that become empty are released."""
	cdef uint32_t pos1 = 0, pos2 = 0, res = 0
	cdef uint16_t *keys = NULL
	cdef Block *data = NULL
	cdef Block b2
	ob1.capacity = ob1.size + ob2.size
	ob1._tmpalloc(ob1.capacity, &keys, &data)
	if pos1 < ob1.size and pos2 < ob2.size:
		while True:
			if ob1.keys[pos1] < ob2.keys[pos2]:
				keys[res] = ob1.keys[pos1]
				data[res] = ob1.data[pos1]
				res += 1
				pos1 += 1
				if pos1 == ob1.size:
					break
			elif ob1.keys[pos1] > ob2.keys[pos2]:
				keys[res] = ob2.keys[pos2]
				block_copy(&(data[res]), ob2._getblk(pos2, &b2))
				res += 1
				pos2 += 1
				if pos2 == ob2.size:
					break
			else:  # ob1.keys[pos1] == ob2.keys[pos2]:
				block_ixor(&(ob1.data[pos1]), ob2._getblk(pos2, &b2))
				if ob1.data[pos1].cardinality > 0:
					keys[res] = ob1.keys[pos1]
					data[res] = ob1.data[pos1]
					res += 1
				else:
					aligned_free(ob1.data[pos1].buf.ptr)
				pos1 += 1
				pos2 += 1
				if pos1 == ob1.size or pos2 == ob2.size:
					break
	if pos1 == ob1.size:
		for pos2 in range(pos2, ob2.size):
			keys[res] = ob2.keys[pos2]
			block_copy(&(data[res]), ob2._getblk(pos2, &b2))
			res += 1
	elif pos2 == ob2.size:
		for pos1 in range(pos1, ob1.size):
			keys[res] = ob1.keys[pos1]
			data[res] = ob1.data[pos1]
			res += 1
	ob1._replacearrays(keys, data, res)
	return ob1


cdef inline RoaringBitmap rb_and(RoaringBitmap ob1, RoaringBitmap ob2):
	"""Return a new RoaringBitmap with the intersection ``ob1 & ob2``."""
	cdef RoaringBitmap result = RoaringBitmap()
	cdef uint32_t pos1 = 0, pos2 = 0
	cdef Block b1, b2
	if pos1 < ob1.size and pos2 < ob2.size:
		# initialize to zero so that unallocated blocks can be detected
		result._initarray(min(ob1.size, ob2.size))
		while True:
			if ob1.keys[pos1] < ob2.keys[pos2]:
				pos1 += 1
				if pos1 == ob1.size:
					break
			elif ob1.keys[pos1] > ob2.keys[pos2]:
				pos2 += 1
				if pos2 == ob2.size:
					break
			else:  # ob1.keys[pos1] == ob2.keys[pos2]:
				# The result is written to the next free slot; the slot is
				# only committed (size += 1) when it is non-empty.
				block_and(&(result.data[result.size]),
						ob1._getblk(pos1, &b1), ob2._getblk(pos2, &b2))
				if result.data[result.size].cardinality:
					result.keys[result.size] = ob1.keys[pos1]
					result.size += 1
				pos1 += 1
				pos2 += 1
				if pos1 == ob1.size or pos2 == ob2.size:
					break
		# Release a trailing slot that was written but not committed.
		# NOTE(review): when every allocated slot was committed this reads
		# data[result.size], one past the slots requested from _initarray;
		# confirm that _initarray leaves room for that.
		aligned_free(result.data[result.size].buf.ptr)
		result._resize(result.size)
	return result


cdef inline RoaringBitmap rb_sub(RoaringBitmap ob1, RoaringBitmap ob2):
	"""Return a new RoaringBitmap with the difference ``ob1 - ob2``."""
	cdef RoaringBitmap result = RoaringBitmap()
	cdef uint32_t pos1 = 0, pos2 = 0
	cdef Block b1, b2
	result._initarray(ob1.size)
	if pos1 < ob1.size and pos2 < ob2.size:
		while True:
			if ob1.keys[pos1] < ob2.keys[pos2]:
				result._insertcopy(
						result.size, ob1.keys[pos1], ob1._getblk(pos1, &b1))
				pos1 += 1
				if pos1 == ob1.size:
					break
			elif ob1.keys[pos1] > ob2.keys[pos2]:
				pos2 += 1
				if pos2 == ob2.size:
					break
			else:  # ob1.keys[pos1] == ob2.keys[pos2]:
				block_sub(&(result.data[result.size]),
						ob1._getblk(pos1, &b1), ob2._getblk(pos2, &b2))
				if result.data[result.size].cardinality > 0:
					result.keys[result.size] = ob1.keys[pos1]
					result.size += 1
				pos1 += 1
				pos2 += 1
				if pos1 == ob1.size or pos2 == ob2.size:
					break
	if pos2 == ob2.size:
		# ob2 exhausted: the remaining blocks of ob1 are copied unchanged.
		# A while loop is used so that pos1 ends at ob1.size; the blocks
		# must be copied exactly once (a second copy loop that used to
		# follow _resize re-inserted blocks that were already copied).
		while pos1 < ob1.size:
			result._insertcopy(
					result.size, ob1.keys[pos1], ob1._getblk(pos1, &b1))
			pos1 += 1
	# Release a trailing slot that was written but not committed.
	# NOTE(review): reads data[result.size], which is one past the slots
	# requested from _initarray when all of them are in use -- confirm that
	# the array has room for this.
	aligned_free(result.data[result.size].buf.ptr)
	result._resize(result.size)
	return result


cdef inline RoaringBitmap rb_or(RoaringBitmap ob1, RoaringBitmap ob2):
	"""Return a new RoaringBitmap with the union ``ob1 | ob2``."""
	cdef RoaringBitmap result = RoaringBitmap()
	cdef uint32_t pos1 = 0, pos2 = 0
	cdef Block b1, b2
	if pos1 < ob1.size and pos2 < ob2.size:
		result._initarray(ob1.size + ob2.size)
		while True:
			if ob1.keys[pos1] < ob2.keys[pos2]:
				result._insertcopy(
						result.size, ob1.keys[pos1], ob1._getblk(pos1, &b1))
				pos1 += 1
				if pos1 == ob1.size:
					break
			elif ob1.keys[pos1] > ob2.keys[pos2]:
				result._insertcopy(
						result.size, ob2.keys[pos2], ob2._getblk(pos2, &b2))
				pos2 += 1
				if pos2 == ob2.size:
					break
			else:  # ob1.keys[pos1] == ob2.keys[pos2]:
				block_or(&(result.data[result.size]),
						ob1._getblk(pos1, &b1), ob2._getblk(pos2, &b2))
				result.keys[result.size] = ob1.keys[pos1]
				result.size += 1
				pos1 += 1
				pos2 += 1
				if pos1 == ob1.size or pos2 == ob2.size:
					break
	# Copy whatever remains of the operand that was not exhausted.
	if pos1 == ob1.size:
		result._extendarray(ob2.size - pos2)
		for pos2 in range(pos2, ob2.size):
			result._insertcopy(result.size,
					ob2.keys[pos2], ob2._getblk(pos2, &b2))
	elif pos2 == ob2.size:
		result._extendarray(ob1.size - pos1)
		for pos1 in range(pos1, ob1.size):
			result._insertcopy(
					result.size, ob1.keys[pos1], ob1._getblk(pos1, &b1))
	result._resize(result.size)
	return result


cdef inline RoaringBitmap rb_xor(RoaringBitmap ob1, RoaringBitmap ob2):
	"""Return a new RoaringBitmap with the symmetric difference
	``ob1 ^ ob2``."""
	cdef RoaringBitmap result = RoaringBitmap()
	cdef uint32_t pos1 = 0, pos2 = 0
	cdef Block b1, b2
	if pos1 < ob1.size and pos2 < ob2.size:
		result._initarray(ob1.size + ob2.size)
		while True:
			if ob1.keys[pos1] < ob2.keys[pos2]:
				result._insertcopy(
						result.size, ob1.keys[pos1], ob1._getblk(pos1, &b1))
				pos1 += 1
				if pos1 == ob1.size:
					break
			elif ob1.keys[pos1] > ob2.keys[pos2]:
				result._insertcopy(
						result.size, ob2.keys[pos2], ob2._getblk(pos2, &b2))
				pos2 += 1
				if pos2 == ob2.size:
					break
			else:  # ob1.keys[pos1] == ob2.keys[pos2]:
				block_xor(&(result.data[result.size]),
						ob1._getblk(pos1, &b1), ob2._getblk(pos2, &b2))
				if result.data[result.size].cardinality > 0:
# rb_xor, continued: commit a non-empty block for a key shared by both operands.
					result.keys[result.size] = ob1.keys[pos1]
					result.size += 1
				pos1 += 1
				pos2 += 1
				if pos1 == ob1.size or pos2 == ob2.size:
					break
		# Release a trailing slot that was written but not committed.
		aligned_free(result.data[result.size].buf.ptr)
	if pos1 == ob1.size:
		result._extendarray(ob2.size - pos2)
		for pos2 in range(pos2, ob2.size):
			result._insertcopy(
					result.size, ob2.keys[pos2], ob2._getblk(pos2, &b2))
	elif pos2 == ob2.size:
		result._extendarray(ob1.size - pos1)
		for pos1 in range(pos1, ob1.size):
			result._insertcopy(
					result.size, ob1.keys[pos1], ob1._getblk(pos1, &b1))
	result._resize(result.size)
	return result


cdef bint rb_isdisjoint(RoaringBitmap self, RoaringBitmap ob):
	"""Return True when ``self`` and ``ob`` have no element in common."""
	cdef Block b1, b2
	cdef size_t n
	cdef int i = 0
	if self.size == 0 or ob.size == 0:
		return True
	for n in range(self.size):
		# Search only from the previous position onwards.
		i = ob._binarysearch(i, ob.size, self.keys[n])
		if i < 0:
			# Not found; -i - 1 is used as the position to continue from.
			if -i - 1 >= ob.size:
				return True
			i = -i - 1
		elif not block_isdisjoint(self._getblk(n, &b1), ob._getblk(i, &b2)):
			return False
	return True


cdef inline bint rb_issubset(RoaringBitmap self, RoaringBitmap ob):
	"""Return True when every element of ``self`` is also in ``ob``."""
	cdef Block b1, b2
	cdef size_t n
	cdef int i = 0
	if self.size == 0:
		return True
	elif ob.size == 0:
		return False
	# First pass: every key of self must occur in ob.
	for n in range(self.size):
		i = ob._binarysearch(i, ob.size, self.keys[n])
		if i < 0:
			return False
	# Second pass: compare the contents of the matching blocks.
	i = 0
	for n in range(self.size):
		i = ob._binarysearch(i, ob.size, self.keys[n])
		if not block_issubset(self._getblk(n, &b1), ob._getblk(i, &b2)):
			return False
	return True


cdef inline RoaringBitmap rb_clamp(RoaringBitmap self,
		uint32_t start, uint32_t stop):
	"""Return a new RoaringBitmap with the elements of ``self`` restricted
	to the range given by ``start`` and ``stop``.

	The first and last block in the range are clamped with ``block_clamp``;
	blocks in between are copied whole."""
	cdef Block b1
	cdef RoaringBitmap result = RoaringBitmap()
	# ii / jj: raw lookup results (negative when no block has that key);
	# i / j: indices of the first and last block to consider.
	cdef int ii = self._getindex(highbits(start))
	cdef int jj = ii
	cdef int i = -ii - 1 if ii < 0 else ii
	cdef int j = i
	if highbits(start) != highbits(stop):
		jj = self._getindex(highbits(stop))
		# when block was not found, round down to preceding block
		j = -jj - 2 if jj < 0 else jj
	elif ii < 0:
		# start and stop share one key and there is no block for it;
		# without this check the following block would be copied.
		return result
	# i > j: no block has a key between the keys of start and stop.
	if i >= self.size or j < 0 or i > j:
		return result
	result._initarray(j - i + 1)
	block_clamp(
			&(result.data[0]), self._getblk(i, &b1),
			lowbits(start) if i == ii else 0,
			lowbits(stop) if ii == jj and ii >= 0 else BLOCKSIZE)
	if result.data[result.size].cardinality:
		result.keys[result.size] = self.keys[i]
		result.size += 1
	else:
		aligned_free(result.data[0].buf.ptr)
	for n in range(i + 1, j):
		block_copy(&(result.data[result.size]), self._getblk(n, &b1))
		result.keys[result.size] = self.keys[n]
		result.size += 1
	if i != j:
		block_clamp(
				&(result.data[result.size]), self._getblk(j, &b1),
				0, lowbits(stop) if jj >= 0 else BLOCKSIZE)
		if result.data[result.size].cardinality:
			result.keys[result.size] = self.keys[j]
			result.size += 1
		else:
			aligned_free(result.data[result.size].buf.ptr)
	result._resize(result.size)
	return result


cdef inline void rb_andor_len(RoaringBitmap ob1, RoaringBitmap ob2,
		unsigned long *intersection_result,
		unsigned long *union_result) noexcept nogil:
	"""Store the cardinality of ``ob1 & ob2`` in ``intersection_result[0]``
	and the cardinality of ``ob1 | ob2`` in ``union_result[0]``, without
	constructing either set."""
	cdef Block b1, b2
	cdef uint32_t pos1 = 0, pos2 = 0, tmp1, tmp2
	union_result[0] = intersection_result[0] = 0
	if pos1 < ob1.size and pos2 < ob2.size:
		while True:
			if ob1.keys[pos1] < ob2.keys[pos2]:
				union_result[0] += ob1.data[pos1].cardinality
				pos1 += 1
				if pos1 == ob1.size:
					break
			elif ob1.keys[pos1] > ob2.keys[pos2]:
				union_result[0] += ob2.data[pos2].cardinality
				pos2 += 1
				if pos2 == ob2.size:
					break
			else:
				tmp1 = tmp2 = 0
				block_andorlen(
						ob1._getblk(pos1, &b1),
						ob2._getblk(pos2, &b2),
						&tmp1, &tmp2)
				intersection_result[0] += tmp1
				union_result[0] += tmp2
				pos1 += 1
				pos2 += 1
				if pos1 == ob1.size or pos2 == ob2.size:
					break
	# Blocks left in one operand only contribute to the union.
	if pos1 == ob1.size and pos2 < ob2.size:
		for pos2 in range(pos2, ob2.size):
			union_result[0] += ob2.data[pos2].cardinality
	elif pos2 == ob2.size and pos1 < ob1.size:
		for pos1 in range(pos1, ob1.size):
			union_result[0] += ob1.data[pos1].cardinality


cdef inline double rb_jaccard_dist(RoaringBitmap ob1,
		RoaringBitmap ob2) noexcept nogil:
	"""Return the Jaccard distance ``1 - |ob1 & ob2| / |ob1 | ob2|``;
	the distance is 1 when the union is empty."""
	cdef unsigned long union_result = 0, intersection_result = 0
	rb_andor_len(ob1, ob2, &intersection_result, &union_result)
	if union_result == 0:
		return 1
	# Cast to double so that the quotient is a fraction and not the result
	# of an integer division.
	return 1 - (intersection_result / <double>union_result)

--------------------------------------------------------------------------------
/src/roaringbitmap.pyx:
--------------------------------------------------------------------------------
"""Roaring bitmap in Cython.

A Roaring bitmap stores a set of 32 bit integers compactly while allowing for
efficient set operations. The space of integers is partitioned into blocks
of ``2 ** 16`` integers. The representation for a block depends on the number
of elements it contains:

<= 4096 elements:
	an array of up to ``1 << 12`` 16-bit integers that are part of the set.

>= 61440 elements:
	an array of up to ``1 << 12`` 16-bit integers that are not part of the set.

otherwise:
	a fixed bitmap of ``1 << 16`` (65536) bits with a 1-bit for each element.
16 | 17 | A ``RoaringBitmap`` can be used as a replacement for a mutable 18 | Python ``set`` containing unsigned 32-bit integers: 19 | 20 | >>> from roaringbitmap import RoaringBitmap 21 | >>> RoaringBitmap(range(10)) & RoaringBitmap(range(5, 15)) 22 | RoaringBitmap({5, 6, 7, 8, 9}) 23 | 24 | ``ImmutableRoaringBitmap`` is an immutable variant (analogous to ``frozenset``) 25 | which is stored compactly as a contiguous block of memory. 26 | 27 | ``MultiRoaringBitmap`` stores a sequence of immutable roaring bitmaps 28 | in an efficiently serializable, contiguous block of memory. 29 | """ 30 | # TODOs 31 | # [ ] SSE/AVX2 intrinsics: 32 | # array intersection [x] SSE; [ ] AVX 33 | # bitmap=>array [ ] SSE; [ ] AVX 34 | # [ ] separate cardinality & binary ops for bitmaps 35 | # [ ] and; [-] or; [ ] xor; [ ] sub 36 | # slower in benchmarks 37 | # [ ] check growth strategy of arrays 38 | # [ ] more operations: 39 | # [ ] efficient shifts 40 | # [ ] operate on slices without instantiating range as temp object 41 | # [ ] subclass Set ABC? 
# [ ] error checking, robustness

import io
import os
import sys
import mmap
import heapq
import array

from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t, int32_t
from libc.stdio cimport printf
from libc.stdlib cimport free, malloc, calloc, realloc, abort
from libc.string cimport memset, memcpy, memcmp, memmove
from cpython.buffer cimport PyBUF_SIMPLE, Py_buffer, PyObject_CheckBuffer, \
		PyObject_GetBuffer, PyBuffer_Release
from cpython cimport array
cimport cython

cdef extern from *:
	cdef bint PY2


cdef extern from "Python.h":
	int PyObject_CheckReadBuffer(object)
	int PyObject_AsReadBuffer(object, const void **, Py_ssize_t *)


cdef extern from "macros.h":
	int BITSIZE
	int BITSLOT(int b) nogil
	int BITNSLOTS(int nb) nogil
	void SETBIT(uint64_t a[], int b) nogil
	void CLEARBIT(uint64_t a[], int b) nogil
	uint64_t TESTBIT(uint64_t a[], int b) nogil
	uint64_t BITMASK(int b) nogil
	void *aligned_malloc(size_t size, size_t align) nogil
	void aligned_free(void *ptr) nogil

cdef extern from "bitcount.h":
	unsigned int bit_clz(uint64_t) nogil
	unsigned int bit_ctz(uint64_t) nogil
	unsigned int bit_popcount(uint64_t) nogil
	size_t BITCOUNT_BITS
	size_t UINT64_MAX


cdef extern from "_arrayops.h":
	int32_t intersect_uint16(uint16_t *A, size_t lenA,
			uint16_t *B, size_t lenB, uint16_t *out) nogil
	int32_t intersect_general16(uint16_t *A, size_t lenA,
			uint16_t *B, size_t lenB, uint16_t *out) nogil


cdef union Buffer:
	void *ptr
	uint16_t *sparse
	uint64_t *dense
	size_t offset
	uint64_t _padding  # ensure that this union takes at least 64 bits.


cdef struct Block:
	# A set of 2**16 integers, stored as bitmap or array.
	#
	# This block may contain a bitvector (DENSE) or a sparse array;
	# The array can contain elements corresponding to 0-bits (INVERTED)
	# or 1-bits (POSITIVE).
	Buffer buf  # data: sparse array or fixed-size bitvector
	uint32_t cardinality  # the number of elements
	uint16_t capacity  # number of allocated uint16_t elements
	uint16_t state  # either DENSE, INVERTED, or POSITIVE
	# NB: make state uint16_t so that the struct is 16 bytes without padding.

# The maximum number of elements in a block
DEF BLOCKSIZE = 1 << 16

# The number of bytes to store a bitmap of 2**16 bits:
DEF BITMAPSIZE = BLOCKSIZE // 8

# Maximum length of positive/inverted sparse arrays:
DEF MAXARRAYLENGTH = 1 << 12

# Capacity (elements) to allocate for an empty array
DEF INITCAPACITY = 4

# Extra elements in result to accommodate SSE/AVX vector operations
DEF OVERALLOC = 8

# The different ways a block may store its elements:
DEF DENSE = 0
DEF POSITIVE = 1
DEF INVERTED = 2

include "bitops.pxi"
include "arrayops.pxi"
include "block.pxi"
include "rbbinaryops.pxi"
include "immutablerb.pxi"
include "multirb.pxi"

# Template arrays that are cloned with array.clone() where needed.
chararray = array.array(b'B' if PY2 else 'B')
dblarray = array.array(b'd' if PY2 else 'd')
longarray = array.array(b'L' if PY2 else 'L')
RANGE = xrange if PY2 else range
EMPTYIRB = ImmutableRoaringBitmap()


cdef class RoaringBitmap(object):
	"""A compact, mutable set of 32-bit integers."""
	cdef Block *data  # pointer and size of array/bitmap with elements
	cdef uint16_t *keys  # the high bits of elements in each block
	cdef uint32_t size  # the number of blocks
	cdef uint32_t capacity  # the allocated capacity for blocks
	cdef size_t offset  # used for immutable bitmaps with relative pointers

	def __cinit__(self, *args, **kwargs):
		self.keys = self.data = NULL
		self.capacity = self.size = self.offset = 0

	def __init__(self, iterable=None):
		"""Return a new RoaringBitmap with elements from ``iterable``.

		The elements ``x`` of a RoaringBitmap must be ``0 <= x < 2 ** 32``.
		If ``iterable`` is not specified, a new empty RoaringBitmap is
		returned. Note that a sorted iterable will significantly speed up the
		construction.
		``iterable`` may be a generator, in which case the generator is
		consumed incrementally.
		``iterable`` may be a ``range`` (Python 3) or ``xrange`` (Python 2)
		object, which will be constructed efficiently."""
		cdef size_t n
		cdef Block b1
		cdef RoaringBitmap ob
		if isinstance(iterable, RANGE):
			_, (start, stop, step) = iterable.__reduce__()
			if 0 <= start < stop and step >= 1:
				self._initrange(start, stop, step)
				return
			# fall through on non-trivial use of range()
		if isinstance(iterable, (list, tuple, set, dict, RANGE)):
			self._init2pass(iterable)
		elif isinstance(iterable, RoaringBitmap):
			ob = iterable
			self._extendarray(ob.size)
			for n in range(ob.size):
				self._insertcopy(self.size, ob.keys[n], ob._getblk(n, &b1))
		elif iterable is not None:
			self._inititerator(iterable)

	def __dealloc__(self):
		# Buffers are only owned (and freed) when offset == 0; a non-zero
		# offset is used for immutable bitmaps with relative pointers.
		if self.data is not NULL and self.offset == 0:
			for n in range(self.size):
				aligned_free(self.data[n].buf.ptr)
			free(self.keys)
			free(self.data)
			self.keys = self.data = NULL
			self.size = 0

	def copy(self):
		"""Return a copy of this RoaringBitmap."""
		cdef RoaringBitmap result = RoaringBitmap()
		cdef size_t n
		result._extendarray(self.size)
		for n in range(self.size):
			result._insertcopy(result.size, self.keys[n], &(self.data[n]))
		return result

	def freeze(self):
		"""Return an immutable copy of this RoaringBitmap."""
		cdef ImmutableRoaringBitmap result = ImmutableRoaringBitmap.__new__(
				ImmutableRoaringBitmap)
		result.__setstate__(self.__getstate__())
		return result

	def __contains__(self, uint32_t elem):
		cdef int i = self._getindex(highbits(elem))
		cdef Block b1
		if i >= 0:
			return block_contains(
					self._getblk(i, &b1), lowbits(elem))
		return False

	def __richcmp__(x, y, int op):
		return richcmp(x, y, op)

	def isdisjoint(self, other):
		"""Return True if two RoaringBitmaps have a null intersection."""
		return rb_isdisjoint(self, ensurerb(other))

	def issubset(self, other):
		"""Report whether another set contains this RoaringBitmap."""
		return rb_issubset(self, ensurerb(other))

	def issuperset(self, other):
		"""Report whether this RoaringBitmap contains another set."""
		# Convert with ensurerb(), like issubset() and isdisjoint(), instead
		# of relying on ``other`` having an issubset() method of its own.
		return rb_issubset(ensurerb(other), self)

	def min(self):
		"""Return smallest element in this RoaringBitmap."""
		return self.select(0)

	def max(self):
		"""Return largest element in this RoaringBitmap."""
		return next(reversed(self))

	def __and__(x, y):
		cdef RoaringBitmap ob1 = ensurerb(x), ob2 = ensurerb(y)
		return rb_and(ob1, ob2)

	def __sub__(x, y):
		cdef RoaringBitmap ob1 = ensurerb(x), ob2 = ensurerb(y)
		return rb_sub(ob1, ob2)

	def __or__(x, y):
		cdef RoaringBitmap ob1 = ensurerb(x), ob2 = ensurerb(y)
		return rb_or(ob1, ob2)

	def __xor__(x, y):
		cdef RoaringBitmap ob1 = ensurerb(x), ob2 = ensurerb(y)
		return rb_xor(ob1, ob2)

	def __iand__(self, x):
		cdef RoaringBitmap ob2 = ensurerb(x)
		return rb_iand(self, ob2)

	def __isub__(self, x):
		cdef RoaringBitmap ob2 = ensurerb(x)
		return rb_isub(self, ob2)

	def __ior__(self, x):
		cdef RoaringBitmap ob2 = ensurerb(x)
# RoaringBitmap.__ior__, continued.
		return rb_ior(self, ob2)

	def __ixor__(self, x):
		cdef RoaringBitmap ob2 = ensurerb(x)
		return rb_ixor(self, ob2)

	def add(self, uint32_t elem):
		"""Add an element to the set.

		This has no effect if the element is already present."""
		cdef Block *block
		cdef uint16_t key = highbits(elem)
		cdef int i = self._getindex(key)
		if i >= 0:
			block = &(self.data[i])
		else:
			# No block for this key yet: insert an empty sparse block at
			# the position encoded in the negative lookup result.
			block = self._insertempty(-i - 1, key)
			block.state = POSITIVE
			block.cardinality = 0
			block.buf.sparse = allocsparse(INITCAPACITY)
			block.capacity = INITCAPACITY
		block_add(block, lowbits(elem))
		block_convert(block)

	def clamp(self, uint32_t start, uint32_t stop):
		"""Return new set with range of values restricted to ``(start, stop)``.
		"""
		return rb_clamp(self, start, stop)

	def discard(self, uint32_t elem):
		"""Remove an element from the set if it is a member.

		If the element is not a member, do nothing."""
		cdef int i = self._getindex(highbits(elem))
		if i >= 0:
			block_discard(&(self.data[i]), lowbits(elem))
			if self.data[i].cardinality == 0:
				self._removeatidx(i)

	def remove(self, uint32_t elem):
		"""Remove an element from the set; it must be a member.

		If the element is not a member, raise a KeyError."""
		cdef int i = self._getindex(highbits(elem))
		cdef uint32_t x
		if i >= 0:
			# An unchanged cardinality means the element was not present.
			x = self.data[i].cardinality
			block_discard(&(self.data[i]), lowbits(elem))
			if x == self.data[i].cardinality:
				raise KeyError(elem)
			if self.data[i].cardinality == 0:
				self._removeatidx(i)
		else:
			raise KeyError(elem)

	def pop(self):
		"""Remove and return the largest element."""
		cdef uint32_t high, low
		if self.size == 0:
			raise ValueError('pop from empty roaringbitmap')
		high = self.keys[self.size - 1]
		low = block_pop(&(self.data[self.size - 1]))
		if self.data[self.size - 1].cardinality == 0:
			self._removeatidx(self.size - 1)
		return high << 16 | low

	def clear(self):
		"""Remove all elements from this RoaringBitmap."""
		cdef size_t n
		for n in range(self.size):
			aligned_free(self.data[n].buf.ptr)
		free(self.keys)
		free(self.data)
		self.size = 0
		# malloc returns void *, which must be cast to the field types.
		self.keys = <uint16_t *>malloc(INITCAPACITY * sizeof(uint16_t))
		self.data = <Block *>malloc(INITCAPACITY * sizeof(Block))
		if self.keys is NULL or self.data is NULL:
			raise MemoryError(INITCAPACITY)
		self.capacity = INITCAPACITY

	def __lshift__(self, other):
		return self.__rshift__(-other)

	def __rshift__(self, int other):
		# FIXME: replace with optimized implementation
		return RoaringBitmap([elem + other for elem in self
				if 0 <= elem + other < 1 << 32])

	# def __ilshift__(self, other):
	# 	raise NotImplementedError

	# def __irshift__(self, other):
	# 	raise NotImplementedError

	def __invert__(self):
		"""Return copy with smallest to largest elements inverted."""
		return self.symmetric_difference(
				RANGE(self.min(), self.max() + 1))

	def __iter__(self):
		cdef Block *block
		cdef Block b1
		cdef uint32_t high, i
		cdef uint64_t cur
		cdef int n, idx, low
		for i in range(self.size):
			block = self._getblk(i, &b1)
			# Widen the 16-bit key before shifting it into the high bits.
			high = (<uint32_t>(self.keys[i])) << 16
			if block.cardinality == BLOCKSIZE:
				for low in range(BLOCKSIZE):
					yield high | low
			elif block.state == DENSE:
				idx = 0
				cur = block.buf.dense[idx]
				n = iteratesetbits(block.buf.dense, &cur, &idx)
				while n != -1:
					yield high | n
					n = iteratesetbits(block.buf.dense, &cur, &idx)
			elif block.state == POSITIVE:
				for n in range(block.cardinality):
					low = block.buf.sparse[n]
					yield high | low
			elif block.state == INVERTED:
				# The array lists the absent values: yield the gaps before,
				# between, and after them.
				for low in range(block.buf.sparse[0]):
					yield high | low
				if block.cardinality < BLOCKSIZE - 1:
					for n in range(BLOCKSIZE - block.cardinality - 1):
						for low in range(
								block.buf.sparse[n] + 1,
								block.buf.sparse[n + 1]):
							yield high | low
				for low in range(block.buf.sparse[
						BLOCKSIZE - block.cardinality - 1] + 1, BLOCKSIZE):
					yield high | low

	def __reversed__(self):
		cdef Block *block
		cdef Block b1
		cdef uint32_t high, i
		cdef uint64_t cur
		cdef int n, idx, low
		for i in range(self.size - 1, -1, -1):
			block = self._getblk(i, &b1)
			# Widen the 16-bit key before shifting it into the high bits.
			high = (<uint32_t>(self.keys[i])) << 16
			if block.cardinality == BLOCKSIZE:
				for low in reversed(range(BLOCKSIZE)):
					yield high | low
			elif block.state == POSITIVE:
				for n in reversed(range(block.cardinality)):
					low = block.buf.sparse[n]
					yield high | low
			elif block.state == DENSE:
				idx = BITNSLOTS(BLOCKSIZE) - 1
				cur = block.buf.dense[idx]
				n = reviteratesetbits(block.buf.dense, &cur, &idx)
				while n != -1:
					low = n
					yield high | low
					n = reviteratesetbits(block.buf.dense, &cur, &idx)
			elif block.state == INVERTED:
				# Same gaps as in __iter__, visited from high to low.
				for low in reversed(range(block.buf.sparse[
						BLOCKSIZE - block.cardinality - 1] + 1, BLOCKSIZE)):
					yield high | low
				if block.cardinality < BLOCKSIZE - 1:
					for n in reversed(range(BLOCKSIZE - block.cardinality - 1)):
						for low in reversed(range(
								block.buf.sparse[n] + 1,
								block.buf.sparse[n + 1])):
							yield high | low
				for low in reversed(range(block.buf.sparse[0])):
					yield high | low

	def __len__(self):
		cdef size_t result = 0, n
		for n in range(self.size):
			result += self.data[n].cardinality
		return result

	def __sizeof__(self):
		"""Return memory usage in bytes (incl. overallocation)."""
		cdef uint32_t result = 0
		for n in range(self.size):
			result += (sizeof(uint16_t) + sizeof(Block)
					+ self.data[n].capacity * sizeof(uint16_t))
		return result

	def numelem(self):
		"""Return total number of uint16_t elements stored."""
		cdef uint32_t result = 0
		for n in range(self.size):
			result += 1 + getsize(&(self.data[n]))
		return result

	def __bool__(self):
		# True when there is at least one block.
		return self.size > 0

	def __str__(self):
		return '{%s}' % ', '.join([str(a) for a in self])

	def __repr__(self):
		return 'RoaringBitmap(%s)' % str(self)

	def debuginfo(self, verbose=False):
		"""Return a string describing the internal representation of this set.
def _keys(self):
    """Return the keys of all blocks in use as a Python list.

    The keys are listed in storage order, one per block.
    """
    result = []
    for pos in range(self.size):
        result.append(self.keys[pos])
    return result
def __setstate__(self, array.array state):
    """Initialize this object with a serialized representation.

    ``state`` is expected to be laid out as ``__getstate__`` writes it:
    a ``uint32_t`` block count, the ``uint16_t`` keys, the ``Block``
    headers (whose buffer field holds a byte offset into ``state``
    instead of a pointer), followed by the block buffers themselves.

    :param state: char array produced by ``__getstate__``.
    :raises MemoryError: if the key or block array cannot be resized.
    """
    cdef char *buf = state.data.as_chars
    cdef void *tmp
    cdef Block *data
    cdef uint32_t newsize, newcapacity
    cdef size_t n, size, offset = sizeof(uint32_t)
    self.clear()
    # Stay empty until blocks are really in place, so that a failure part
    # way through never leaves ``size`` covering uninitialized blocks.
    self.size = 0
    newsize = (<uint32_t *>buf)[0]
    newcapacity = max(newsize, INITCAPACITY)
    # Store each pointer as soon as its realloc succeeds: realloc may move
    # the allocation, after which the old pointer is invalid and must not
    # survive a failure of the other call.
    tmp = realloc(self.keys, newcapacity * sizeof(uint16_t))
    if tmp is NULL:
        raise MemoryError(newsize)
    self.keys = <uint16_t *>tmp
    tmp = realloc(self.data, newcapacity * sizeof(Block))
    if tmp is NULL:
        # keys was already resized; never advertise more room than the
        # smaller of the two arrays provides.
        self.capacity = min(self.capacity, newcapacity)
        raise MemoryError(newsize)
    self.data = <Block *>tmp
    self.capacity = newcapacity
    memcpy(self.keys, &(buf[offset]), newsize * sizeof(uint16_t))
    offset += newsize * sizeof(uint16_t)
    data = <Block *>&(buf[offset])
    for n in range(newsize):
        # copy the header, then replace the serialized offset by a
        # freshly allocated buffer holding a copy of the contents
        self.data[n] = data[n]
        offset = data[n].buf.offset
        if data[n].state == DENSE:
            self.data[n].buf.dense = allocdense()
            size = BITMAPSIZE
        else:
            self.data[n].buf.sparse = allocsparse(data[n].capacity)
            size = data[n].capacity * sizeof(uint16_t)
        memcpy(self.data[n].buf.ptr, &(buf[offset]), size)
        # block n is now fully owned by this object
        self.size = n + 1
def difference(self, *other):
    """Return the difference of two or more sets as a new RoaringBitmap.

    (i.e, self - other[0] - other[1] - ...)

    Without arguments, ``self`` itself is returned. Operands that are not
    RoaringBitmaps are converted with ``ensurerb``."""
    cdef RoaringBitmap result
    if not other:
        return self
    # Operands are subtracted from largest to smallest ``numelem()``;
    # presumably so that the result shrinks as early as possible.
    operands = sorted([ensurerb(ob) for ob in other],
            key=RoaringBitmap.numelem, reverse=True)
    result = self - operands[0]
    for n in range(1, len(operands)):
        result -= operands[n]
        if result.size == 0:
            # no blocks left; further subtraction cannot change anything
            break
    return result
def intersection_update(self, *other):
    """Intersect this set in-place with one or more ``RoaringBitmap``
    objects.

    With a single argument, the argument is applied directly with ``&=``;
    with several, they are converted with ``ensurerb`` and applied in
    order of increasing ``numelem()``.

    NB: since range objects are recognized by the constructor, this
    provides an efficient way to restrict the set to a range of elements:

    >>> rb = RoaringBitmap(range(5))
    >>> rb.intersection_update(range(3, 7))
    >>> rb
    RoaringBitmap({3, 4})
    """
    nargs = len(other)
    if nargs == 0:
        return
    if nargs == 1:
        self &= other[0]
        return
    for ob in sorted([ensurerb(a) for a in other],
            key=RoaringBitmap.numelem):
        self &= ob
        if self.size == 0:
            # already empty; remaining operands cannot change the result
            return
def union_len(self, other):
    """Return the cardinality of the union.

    Optimized version of ``len(self | other)``: the sorted key arrays of
    both operands are merged, and cardinalities are summed without
    building the union."""
    cdef RoaringBitmap ob1 = ensurerb(self)
    cdef RoaringBitmap ob2 = ensurerb(other)
    cdef Block b1, b2
    cdef uint32_t pos1 = 0, pos2 = 0
    cdef size_t result = 0
    # merge step: advance whichever side has the smaller key
    while pos1 < ob1.size and pos2 < ob2.size:
        if ob1.keys[pos1] < ob2.keys[pos2]:
            result += ob1.data[pos1].cardinality
            pos1 += 1
        elif ob1.keys[pos1] > ob2.keys[pos2]:
            result += ob2.data[pos2].cardinality
            pos2 += 1
        else:
            # key present on both sides: count the union of the two blocks
            result += block_orlen(
                    ob1._getblk(pos1, &b1),
                    ob2._getblk(pos2, &b2))
            pos1 += 1
            pos2 += 1
    # at most one of the operands still has blocks left; add them as-is
    while pos1 < ob1.size:
        result += ob1.data[pos1].cardinality
        pos1 += 1
    while pos2 < ob2.size:
        result += ob2.data[pos2].cardinality
        pos2 += 1
    return result
def rank(self, uint32_t x):
    """Return the number of elements ``<= x`` that are in this set.

    Blocks whose key is below the high bits of ``x`` are counted whole;
    a block with exactly that key contributes ``block_rank`` of the low
    bits of ``x``."""
    cdef Block tmp
    cdef size_t count = 0, pos = 0
    cdef uint16_t xhigh = highbits(x)
    cdef uint16_t xlow = lowbits(x)
    while pos < self.size and self.keys[pos] < xhigh:
        count += self.data[pos].cardinality
        pos += 1
    if pos < self.size and self.keys[pos] == xhigh:
        count += block_rank(self._getblk(pos, &tmp), xlow)
    return count
def _slice(self, i):
    """Return the range of values for a given range of indices ``i``.

    Start and stop are normalized with ``slice.indices``: negative
    indices count from the end and out-of-range bounds are clamped, as
    for list slicing. The step of ``i`` is ignored.

    :param i: a slice object over element ranks.
    :returns: ``RANGE(first, last + 1)`` where ``first`` and ``last`` are
        the elements with the lowest and highest selected rank; an empty
        range when the slice selects no elements."""
    start, stop, _ = slice(i.start, i.stop).indices(len(self))
    if start >= stop:
        # nothing selected (includes any slice of an empty set); avoid
        # calling select() with an index that does not exist
        return RANGE(0, 0)
    return RANGE(self.select(start), self.select(stop - 1) + 1)
def _init2pass(self, iterable):
    """Add the elements of ``iterable`` using two passes over it.

    The first pass creates a block for every key (``highbits`` of an
    element) that is not present yet and counts, in ``block.capacity``,
    the elements seen for each block. Buffers are then allocated: a
    sparse array when the count is below ``MAXARRAYLENGTH``, a zeroed
    dense bitmap otherwise. The second pass adds the elements.

    ``iterable`` is traversed twice, so a one-shot iterator would
    contribute nothing to the second pass.
    """
    cdef Block *block = NULL
    cdef uint32_t elem
    cdef uint16_t key
    cdef int i, prev = -1
    # gather keys and count elements for each block
    for elem in iterable:
        key = highbits(elem)
        # only look the block up again when the key changes
        if key != prev:
            i = self._getindex(key)
            if i < 0:
                # not found: -i - 1 is the position where the key belongs
                block = self._insertempty(-i - 1, key)
                block.cardinality = block.capacity = 0
            else:
                block = &(self.data[i])
            prev = key
        block.capacity += 1  # NB: wraps to 0 for block with all elements set
    # allocate blocks
    for i in range(self.size):
        block = &(self.data[i])
        if 0 < block.capacity < MAXARRAYLENGTH:
            block.buf.sparse = allocsparse(block.capacity)
            block.state = POSITIVE
        else:  # if necessary, will convert to inverted later
            # also reached when the counter is 0 (see the NB above)
            block.capacity = BITMAPSIZE // sizeof(uint16_t)
            block.buf.dense = allocdense()
            memset(block.buf.dense, 0, BITMAPSIZE)
            block.state = DENSE
    # second pass, add elements for each block
    prev = -1
    for elem in iterable:
        key = highbits(elem)
        if key != prev:
            i = self._getindex(key)
            # the previous block is complete once the key changes
            if prev != -1:
                block_convert(block)
            block = &(self.data[i])
            prev = key
        block_add(block, lowbits(elem))
    # the last block touched has not been converted inside the loop
    if prev != -1:
        block_convert(block)
cdef _initarray(self, int k):
    """Make room for ``k`` more blocks and zero the block array.

    The whole block array is cleared, i.e. all ``capacity`` entries."""
    cdef size_t nbytes
    self._extendarray(k)
    # capacity is read after _extendarray, which may have changed it
    nbytes = self.capacity * sizeof(Block)
    memset(self.data, 0, nbytes)
cdef _resize(self, int k):
    """Set size and if necessary reduce array allocation to k elements.

    The arrays are only shrunk when ``k`` exceeds ``INITCAPACITY`` and is
    less than half of the current capacity.

    :raises MemoryError: if either array cannot be reallocated; the
        pointers held by this object remain valid in that case."""
    cdef void *tmp
    if k > INITCAPACITY and k * 2 < self.capacity:
        # Store each pointer as soon as its realloc succeeds: realloc may
        # move the allocation, after which the old pointer is invalid and
        # must not survive a failure of the other call.
        tmp = realloc(self.keys, k * sizeof(uint16_t))
        if tmp is NULL:
            raise MemoryError((k, self.size, self.capacity))
        self.keys = <uint16_t *>tmp
        tmp = realloc(self.data, k * sizeof(Block))
        if tmp is NULL:
            err = MemoryError((k, self.size, self.capacity))
            # keys holds only k slots now; do not advertise more
            self.capacity = k
            raise err
        self.data = <Block *>tmp
        self.capacity = k
    self.size = k
cdef int _getindex(self, uint16_t key):
    """Find the position of ``key`` among the stored keys.

    :returns: the index of ``key`` if present; otherwise the negative
        value produced by ``_binarysearch`` (``-1`` when there are no
        keys at all)."""
    cdef int last
    if self.size == 0:
        return -1
    last = self.size - 1
    # Common case of appending in last block: skip the search.
    if self.keys[last] != key:
        return self._binarysearch(0, self.size, key)
    return last
def _checkconsistency(self):
    """Verify that arrays are sorted and free of duplicates.

    For every block this asserts that its state is valid, that its
    cardinality lies in the range allowed for that state, that its
    capacity covers its stored size, that keys are strictly increasing,
    and that the values of non-dense blocks are strictly increasing.

    :raises AssertionError: on the first violated invariant."""
    cdef Block b1
    cdef Block *b2
    cdef size_t n, m
    for n in range(self.size):
        assert self.data[n].state in (DENSE, POSITIVE, INVERTED)
        assert 1 <= self.data[n].cardinality < 1 << 16
        assert getsize(&(self.data[n])) <= self.data[n].capacity
        if self.data[n].state == POSITIVE:
            assert 1 <= self.data[n].cardinality < MAXARRAYLENGTH
        elif self.data[n].state == DENSE:
            assert (MAXARRAYLENGTH <= self.data[n].cardinality
                    <= BLOCKSIZE - MAXARRAYLENGTH)
        elif self.data[n].state == INVERTED:
            assert (BLOCKSIZE - MAXARRAYLENGTH < self.data[n].cardinality
                    < BLOCKSIZE)
        if n + 1 < self.size:
            assert self.keys[n] < self.keys[n + 1], (
                    n, self.keys[n], self.keys[n + 1])
        if self.data[n].state != DENSE:
            # the block does not change while its values are compared, so
            # resolve it once instead of once per element
            b2 = self._getblk(n, &b1)
            for m in range(getsize(&(self.data[n])) - 1):
                assert b2.buf.sparse[m] < b2.buf.sparse[m + 1], (
                        m, b2.buf.sparse[m], b2.buf.sparse[m + 1])
cdef inline RoaringBitmap ensurerb(obj):
    """Return ``obj`` as a RoaringBitmap.

    A RoaringBitmap instance is passed through unchanged; anything else
    is handed to the ``RoaringBitmap`` constructor."""
    if not isinstance(obj, RoaringBitmap):
        obj = RoaringBitmap(obj)
    return obj
cdef inline void releasebuf(Py_buffer *buf):
    """Release ``buf`` with ``PyBuffer_Release``, except when ``PY2`` is
    set, in which case nothing is done."""
    if PY2:
        return
    PyBuffer_Release(buf)
def pair(n=None, maxval=None):
    """Return two overlapping lists of pseudo-random benchmark elements.

    The random generator is re-seeded with a fixed seed on every call, so
    equal arguments always produce equal data.

    :param n: number of elements per list; defaults to the module-level
        ``N``.
    :param maxval: exclusive upper bound of the elements; defaults to the
        module-level ``MAX``.
    :returns: a tuple ``(data1, data2)``; ``data2`` consists of the first
        half of ``data1`` followed by ``n // 2`` further random elements.
    """
    if n is None:
        n = N
    if maxval is None:
        maxval = MAX
    random.seed(42)
    # randrange excludes the upper bound, which matches the range reported
    # by main() ("0 <= n < MAX"); randint(0, MAX) could also return MAX.
    data1 = [random.randrange(maxval) for _ in range(n)]
    data2 = data1[:len(data1) // 2]
    data2.extend(random.randrange(maxval) for _ in range(n // 2))
    return data1, data2
def bench_and():
    """Time the ``&`` operator on builtin sets and on RoaringBitmaps.

    Both statements are run ``M`` times on operands built from ``DATA1``
    and ``DATA2``.

    :returns: tuple ``(set_seconds, roaringbitmap_seconds)``.
    """
    set_setup = (
        'from __main__ import DATA1, DATA2; '
        'ref = set(DATA1); ref2 = set(DATA2)')
    rb_setup = (
        'from __main__ import DATA1, DATA2; '
        'from roaringbitmap import RoaringBitmap; '
        'rb = RoaringBitmap(DATA1); '
        'rb2 = RoaringBitmap(DATA2)')
    set_time = timeit.timeit('ref & ref2', setup=set_setup, number=M)
    rb_time = timeit.timeit('rb & rb2', setup=rb_setup, number=M)
    return set_time, rb_time
def bench_iand():
    """Time the in-place ``&=`` operator on sets and on RoaringBitmaps.

    The statement modifies its left operand, so every one of the ``M``
    measurements executes the statement once, on operands freshly built
    by the setup code.

    :returns: tuple ``(set_seconds, roaringbitmap_seconds)`` with the mean
        time of a single execution.
    """
    set_setup = (
        'from __main__ import DATA1, DATA2; '
        'ref = set(DATA1); ref2 = set(DATA2)')
    rb_setup = (
        'from __main__ import DATA1, DATA2; '
        'from roaringbitmap import RoaringBitmap; '
        'rb = RoaringBitmap(DATA1); '
        'rb2 = RoaringBitmap(DATA2)')
    # repeat() re-runs the setup before each of the M single executions
    set_times = timeit.Timer(
        'ref &= ref2', setup=set_setup).repeat(repeat=M, number=1)
    rb_times = timeit.Timer(
        'rb &= rb2', setup=rb_setup).repeat(repeat=M, number=1)
    return sum(set_times) / M, sum(rb_times) / M
def main():
	"""Run every benchmark on each data scenario and print a table.

	For each scenario the module globals ``N`` (number of random elements)
	and ``MAX`` (exclusive upper bound of the elements) are set, a fresh
	``DATA1``/``DATA2`` pair is generated with ``pair()``, and each
	benchmark function is called once. Every row shows the time for
	``set()``, the time for ``RoaringBitmap()`` and the ratio between them.
	"""
	global N, MAX, DATA1, DATA2
	# (label, number of random elements, range of elements)
	scenarios = (
		('small sparse set', 200, 40000),  # positive blocks
		('medium load factor', 59392, 118784),  # bitmap blocks
		('dense set / high load factor', 40000 - 200, 40000),  # inverted
		# A fourth scenario, 'large sparse set' (N = 1 << 12,
		# MAX = 1 << 31; large number of small blocks), used to live in an
		# unreachable branch; it stays disabled:
		# don't use RoaringBitmap for this case
	)
	benchmarks = (
		bench_init, bench_initsort,
		bench_and, bench_or, bench_xor, bench_sub,
		bench_iand, bench_ior, bench_ixor, bench_isub,
		bench_eq, bench_neq,
		# bench_andlen, bench_orlen,
		bench_jaccard)
	fmt = '%12s %8s %16s %8s'
	numfmt = '%8.3g'

	for label, N, MAX in scenarios:
		print(label)
		DATA1, DATA2 = pair()

		print('%d runs with sets of %d random elements n s.t. 0 <= n < %d' % (
				M, N, MAX))
		print(fmt % ('', 'set()', 'RoaringBitmap()', 'ratio'))
		for func in benchmarks:
			a, b = func()
			ratio = a / b
			# Drop the 'bench_' prefix to obtain the row label.
			rowname = func.__name__.split('_', 1)[1].ljust(12)
			# Large ratios are shown as plain integers.
			shown = (numfmt % ratio) if ratio < 100 else int(ratio)
			print(fmt % (rowname, numfmt % a, numfmt % b, shown))
		print()
def abbr(a, keep=500, marker='...'):
	"""Abbreviate a long string to its head and tail.

	:param a: the string to abbreviate.
	:param keep: number of characters to keep at each end (default 500).
	:param marker: text inserted where the middle was removed.
	:returns: ``a`` unchanged when abbreviating would not make it shorter;
		otherwise the first ``keep`` characters, ``marker``, and the last
		``keep`` characters.
	"""
	# For short input the head and tail slices would overlap, so the
	# content would be duplicated (e.g. 'abc' -> 'abc...abc').
	if len(a) <= 2 * keep + len(marker):
		return a
	# Index from the front for the tail: a[-0:] would be the whole string.
	return a[:keep] + marker + a[len(a) - keep:]
16) + 7 146 | ref = set(range(23, n, step)) 147 | rb = RoaringBitmap(range(23, n, step)) 148 | rb._checkconsistency() 149 | assert ref == rb, ('range(23, %d, %d)' % (n, step)) 150 | 151 | def test_inititerableallset(self): 152 | rb = RoaringBitmap(list(range(0, 0xffff + 1))) 153 | assert len(rb) == 0xffff + 1 154 | 155 | def test_add(self, single): 156 | for name, data in single: 157 | ref = set() 158 | rb = RoaringBitmap() 159 | for n in sorted(data): 160 | ref.add(n) 161 | rb.add(n) 162 | assert rb == ref, name 163 | with pytest.raises(OverflowError): 164 | rb.add(-1) 165 | rb.add(1 << 32) 166 | rb.add(0) 167 | rb.add((1 << 32) - 1) 168 | rb._checkconsistency() 169 | 170 | def test_discard(self, single): 171 | for name, data in single: 172 | ref = set() 173 | rb = RoaringBitmap() 174 | for n in sorted(data): 175 | ref.add(n) 176 | rb.add(n) 177 | for n in sorted(data): 178 | ref.discard(n) 179 | rb.discard(n) 180 | rb._checkconsistency() 181 | assert len(ref) == 0, name 182 | assert len(rb) == 0, name 183 | assert rb == ref, name 184 | 185 | def test_pop(self): 186 | rb = RoaringBitmap([60748, 28806, 54664, 28597, 58922, 75684, 56364, 187 | 67421, 52608, 55686, 10427, 48506, 64363, 14506, 73077, 59035, 188 | 70246, 19875, 73145, 40225, 58664, 6597, 65554, 73102, 26636, 189 | 74227, 59566, 19023]) 190 | while rb: 191 | rb.pop() 192 | rb._checkconsistency() 193 | assert len(rb) == 0 194 | 195 | def test_contains(self, single): 196 | for name, data in single: 197 | ref = set(data) 198 | rb = RoaringBitmap(data) 199 | for a in data: 200 | assert a in ref, name 201 | assert a in rb, name 202 | for a in set(range(20000)) - set(data): 203 | assert a not in ref, name 204 | assert a not in rb, name 205 | rb._checkconsistency() 206 | 207 | def test_eq(self, single): 208 | for name, data in single: 209 | ref, ref2 = set(data), set(data) 210 | rb, rb2 = RoaringBitmap(data), RoaringBitmap(data) 211 | assert (ref == ref2) == (rb == rb2), name 212 | 213 | def test_neq(self, 
pair): 214 | for name, data1, data2 in pair: 215 | ref, ref2 = set(data1), set(data2) 216 | rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2) 217 | assert (ref != ref2) == (rb != rb2), name 218 | 219 | def test_iter(self, single): 220 | for name, data in single: 221 | rb = RoaringBitmap(data) 222 | assert list(iter(rb)) == sorted(set(data)), name 223 | 224 | def test_reversed(self, single): 225 | for name, data in single: 226 | rb = RoaringBitmap(data) 227 | for a, b in zip_longest(reversed(rb), reversed(sorted(set(data)))): 228 | assert a == b, name 229 | 230 | def test_iand(self, pair): 231 | for name, data1, data2 in pair: 232 | ref, ref2 = set(data1), set(data2) 233 | rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2) 234 | ref &= ref2 235 | rb &= rb2 236 | rb._checkconsistency() 237 | assert rb == ref, name 238 | 239 | def test_ior(self, pair): 240 | for name, data1, data2 in pair: 241 | ref, ref2 = set(data1), set(data2) 242 | rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2) 243 | ref |= ref2 244 | rb |= rb2 245 | rb._checkconsistency() 246 | assert rb == ref, name 247 | 248 | def test_ixor(self, pair): 249 | for name, data1, data2 in pair: 250 | ref, ref2 = set(data1), set(data2) 251 | rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2) 252 | ref ^= ref2 253 | rb ^= rb2 254 | rb._checkconsistency() 255 | assert len(ref) == len(rb), name 256 | assert ref == rb, name 257 | 258 | def test_isub(self, pair): 259 | for name, data1, data2 in pair: 260 | ref, ref2 = set(data1), set(data2) 261 | rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2) 262 | ref -= ref2 263 | rb -= rb2 264 | rb._checkconsistency() 265 | assert len(ref) <= len(set(data1)) 266 | assert len(rb) <= len(set(data1)), name 267 | assert len(ref) == len(rb), name 268 | assert ref == rb, name 269 | 270 | def test_and(self, pair): 271 | for name, data1, data2 in pair: 272 | ref, ref2 = set(data1), set(data2) 273 | rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2) 274 | assert ref & ref2 
== set(rb & rb2), name 275 | 276 | def test_or(self, pair): 277 | for name, data1, data2 in pair: 278 | ref, ref2 = set(data1), set(data2) 279 | rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2) 280 | assert ref | ref2 == set(rb | rb2), name 281 | 282 | def test_xor(self, pair): 283 | for name, data1, data2 in pair: 284 | ref, ref2 = set(data1), set(data2) 285 | rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2) 286 | assert ref ^ ref2 == set(rb ^ rb2), name 287 | 288 | def test_sub(self, pair): 289 | for name, data1, data2 in pair: 290 | ref, ref2 = set(data1), set(data2) 291 | rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2) 292 | assert ref - ref2 == set(rb - rb2), name 293 | 294 | def test_subset(self, pair): 295 | for name, data1, data2 in pair: 296 | ref, ref2 = set(data1), set(data2) 297 | rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2) 298 | refans = ref <= ref2 299 | assert (set(rb) <= ref2) == refans, name 300 | assert (rb <= rb2) == refans, name 301 | k = len(data2) // 2 302 | ref, rb = set(data2[:k]), RoaringBitmap(data2[:k]) 303 | refans = ref <= ref2 304 | assert (set(rb) <= ref2) == refans, name 305 | assert (ref <= set(rb2)) == refans, name 306 | assert (rb <= rb2) == refans, (name, rb.debuginfo()) 307 | 308 | def test_disjoint(self, pair): 309 | for name, data1, data2 in pair: 310 | ref, ref2 = set(data1), set(data2) 311 | rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2) 312 | refans = ref.isdisjoint(ref2) 313 | assert rb.isdisjoint(rb2) == refans, name 314 | data3 = [a for a in data2 if a not in ref] 315 | ref3, rb3 = set(data3), RoaringBitmap(data3) 316 | refans2 = ref.isdisjoint(ref3) 317 | assert rb.isdisjoint(rb3) == refans2, name 318 | 319 | def test_clamp(self, single): 320 | for name, data in single: 321 | if len(data) == 0: 322 | continue 323 | a, b = sorted(sample(data, 2)) 324 | ref = set(data).intersection(range(a, b)) 325 | rb = RoaringBitmap(data).intersection(range(a, b)) 326 | rb2 = RoaringBitmap(data).clamp(a, b) 
327 | assert a <= rb2.min() and rb2.max() < b, name 328 | assert ref == rb2, (name, a, b) 329 | assert rb == rb2, (name, a, b) 330 | 331 | def test_clamp_issue12(self): 332 | b = RoaringBitmap([1, 2, 3]) 333 | assert b.clamp(0, 65536) == b 334 | assert b.clamp(0, 65537) == b 335 | assert b.clamp(0, 65538) == b 336 | assert b.clamp(0, 65539) == b 337 | 338 | def test_clamp2(self): 339 | a = RoaringBitmap([0x00010001]) 340 | b = RoaringBitmap([0x00030003, 0x00050005]) 341 | c = RoaringBitmap([0x00070007]) 342 | x = a | b | c 343 | assert x.clamp(0, 0x000FFFFF) == x 344 | assert x.clamp(0x000200FF, 0x000FFFFF) == b | c 345 | assert x.clamp(0x00030003, 0x000FFFFF) == b | c 346 | assert x.clamp(0, 0x00060006) == a | b 347 | assert x.clamp(0, 0x00050006) == a | b 348 | assert x.clamp(0, 0x00050005) == a | RoaringBitmap([0x00030003]) 349 | 350 | def test_aggregateand(self, multi): 351 | ref = set(multi[0]) 352 | ref.intersection_update(*[set(a) for a in multi[1:]]) 353 | rb = RoaringBitmap(multi[0]) 354 | rb.intersection_update(*[RoaringBitmap(a) for a in multi[1:]]) 355 | rb._checkconsistency() 356 | assert rb == ref 357 | 358 | def test_aggregateor(self, multi): 359 | ref = set(multi[0]) 360 | ref.update(*[set(a) for a in multi[1:]]) 361 | rb = RoaringBitmap(multi[0]) 362 | rb.update(*[RoaringBitmap(a) for a in multi[1:]]) 363 | rb._checkconsistency() 364 | assert rb == ref 365 | 366 | def test_andlen(self, pair): 367 | for name, data1, data2 in pair: 368 | ref, ref2 = set(data1), set(data2) 369 | rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2) 370 | assert len(rb & rb2) == rb.intersection_len(rb2), name 371 | assert len(ref & ref2) == rb.intersection_len(rb2), name 372 | 373 | def test_orlen(self, pair): 374 | for name, data1, data2 in pair: 375 | ref, ref2 = set(data1), set(data2) 376 | rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2) 377 | assert len(ref | ref2) == rb.union_len(rb2), name 378 | assert len(rb | rb2) == rb.union_len(rb2), name 379 | 380 | def 
test_jaccard_dist(self, pair): 381 | for name, data1, data2 in pair: 382 | if len(data1) == 0 and len(data2) == 0: 383 | continue 384 | ref, ref2 = set(data1), set(data2) 385 | rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2) 386 | assert len(ref & ref2) / float(len(ref | ref2)) == pytest.approx( 387 | rb.intersection_len(rb2) / float(rb.union_len(rb2))), name 388 | assert (1 - (len(ref & ref2) / float(len(ref | ref2))) 389 | == pytest.approx(rb.jaccard_dist(rb2))), name 390 | 391 | def test_rank(self, single): 392 | for name, data in single: 393 | if len(data) == 0: 394 | continue 395 | ref = sorted(set(data)) 396 | rb = RoaringBitmap(data) 397 | for _ in range(10): 398 | x = choice(ref) 399 | assert x in rb, name 400 | assert rb.rank(x) == ref.index(x) + 1, name 401 | 402 | def test_select(self, single): 403 | for name, data in single: 404 | if len(data) == 0: 405 | continue 406 | ref = sorted(set(data)) 407 | rb = RoaringBitmap(data) 408 | lrb = list(rb) 409 | idx = [randint(0, len(ref) - 1) for _ in range(10)] 410 | for i in idx: 411 | assert lrb[i] == ref[i], (name, i, len(ref)) 412 | assert rb.select(i) in rb, name 413 | assert rb.select(i) == ref[i], name 414 | assert rb[i] == ref[i], name 415 | assert rb.rank(rb.select(i)) - 1 == i, name 416 | if rb.select(i) + 1 in rb: 417 | assert rb.rank(rb.select(i) + 1) - 1 == i + 1, name 418 | else: 419 | assert rb.rank(rb.select(i) + 1) - 1 == i, name 420 | 421 | def test_rank2(self): 422 | rb = RoaringBitmap(range(0, 100000, 7)) 423 | rb.update(range(100000, 200000, 1000)) 424 | for k in range(100000): 425 | assert rb.rank(k) == 1 + k // 7 426 | for k in range(100000, 200000): 427 | assert rb.rank(k) == 1 + 100000 // 7 + 1 + (k - 100000) // 1000 428 | 429 | def test_select2(self): 430 | gap = 1 431 | while gap <= 1024: 432 | rb = RoaringBitmap(range(0, 100000, gap)) 433 | for k in range(0, 100000 // gap): 434 | assert rb.select(k) == k * gap 435 | gap *= 2 436 | 437 | def test_select_issue15(self): 438 | rb = 
RoaringBitmap(range(0x10000, 0x1ffff + 1)) 439 | assert rb[0] == 0x10000 440 | rb.discard(0x10010) 441 | assert rb[0] == 0x10000 442 | rb = RoaringBitmap(range(0x10, 0xffff + 1)) 443 | assert rb[0] == 0x10, rb.debuginfo(True) 444 | rb = RoaringBitmap(range(0x10010, 0x1ffff + 1)) 445 | assert rb[0] == 0x10010, rb.debuginfo(True) 446 | lst = list(range(1, 0xccbb)) 447 | lst.extend(range(0xcccc, 0xfffc)) 448 | rb = RoaringBitmap(lst) 449 | for n in (0, 0xcccc, -1): 450 | assert lst[n] == rb[n], (n, lst[n], rb[n]) 451 | 452 | def test_pickle(self, single): 453 | for name, data in single: 454 | rb = RoaringBitmap(data) 455 | rb_pickled = pickle.dumps(rb, protocol=-1) 456 | rb_unpickled = pickle.loads(rb_pickled) 457 | rb._checkconsistency() 458 | assert rb_unpickled == rb, name 459 | 460 | def test_invalid(self): 461 | with pytest.raises(TypeError): 462 | rb = RoaringBitmap([1, 2, 'a']) 463 | with pytest.raises(TypeError): 464 | RoaringBitmap([1, 2]) < [1, 2, 3] 465 | 466 | def test_slices(self): # issue 20 467 | ref = list(range(10)) 468 | rb = RoaringBitmap(range(10)) 469 | assert list(rb[::2]) == ref[::2] 470 | with pytest.raises(ValueError): 471 | _ = rb[::-2] 472 | with pytest.raises(ValueError): 473 | _ = rb[::0] 474 | del rb[::2] 475 | del ref[::2] 476 | assert list(rb) == ref 477 | 478 | def test_minmax(self): 479 | rb = RoaringBitmap(range(0, 61440)) 480 | assert rb.min() == 0 481 | assert rb.max() == 61439 482 | rb1 = RoaringBitmap(range(0, 61441)) 483 | assert rb1.min() == 0 484 | assert rb1.max() == 61440 485 | assert rb1[61440] == 61440 486 | assert list(rb1)[61440] == 61440 487 | 488 | def test_issue19(self): 489 | a = RoaringBitmap() 490 | b = RoaringBitmap(range(4095)) 491 | c = RoaringBitmap(range(2)) 492 | a |= b 493 | a |= c 494 | assert len(a - b - c) == 0 495 | assert len((b | c) - b - c) == 0 496 | 497 | def test_issue22(self): 498 | rb = RoaringBitmap(range(0, 61440)) 499 | rb1 = RoaringBitmap(range(0, 61441)) 500 | assert len(rb ^ rb) == 0 501 | 
assert len(rb - rb) == 0 502 | assert len(rb1 ^ rb1) == 0 503 | assert len(rb1 - rb1) == 0 504 | assert len(~rb) == 0 505 | assert len(~rb1) == 0 506 | 507 | rb1 = RoaringBitmap(range(0, 61441)) 508 | assert len(rb ^ rb) == 0 509 | rb1 ^= rb1 510 | assert len(rb1) == 0 511 | 512 | rb1 = RoaringBitmap(range(0, 61441)) 513 | rb1 -= rb1 514 | assert len(rb1) == 0 515 | 516 | def test_issue24(self): 517 | r = RoaringBitmap(range(131071)) 518 | assert r.pop() == 131070 519 | assert r.pop() == 131069 520 | 521 | rr = r - RoaringBitmap([130752]) 522 | assert 130752 not in rr 523 | assert rr.pop() == 131068 524 | 525 | r.difference_update(RoaringBitmap([130752])) 526 | assert 130752 not in r 527 | assert r.pop() == 131068 528 | 529 | def test_issue25(self): 530 | r = RoaringBitmap({1}) 531 | r.intersection_update(RoaringBitmap([])) 532 | assert len(r) == 0 533 | 534 | def test_issue28(self): 535 | rbm = RoaringBitmap() 536 | rbm.add(3995084765) 537 | r = rbm.clamp(0, 8388607) 538 | assert len(r) == 0 539 | 540 | def test_issue34(self): 541 | seed(232992) 542 | set_a = sample(range(235342), k=169308) 543 | set_b = sample(range(255999), k=255713) 544 | rba = RoaringBitmap(set_a) 545 | rbb = RoaringBitmap(set_b) 546 | assert rba - rbb == set(set_a) - set(set_b) 547 | rba -= rbb 548 | assert rba == set(set_a) - set(set_b) 549 | 550 | 551 | class Test_immutablerb(object): 552 | def test_inittrivial(self): 553 | data = list(range(5)) 554 | ref = set(data) 555 | rb = ImmutableRoaringBitmap(data) 556 | rb._checkconsistency() 557 | assert ref == rb 558 | assert type(rb) == ImmutableRoaringBitmap 559 | 560 | def test_initsorted(self, single): 561 | for name, data in single: 562 | ref = set(sorted(data)) 563 | rb = ImmutableRoaringBitmap(sorted(data)) 564 | rb._checkconsistency() 565 | assert ref == rb, name 566 | 567 | def test_initunsorted(self, single): 568 | for name, data in single: 569 | ref = set(data) 570 | rb = ImmutableRoaringBitmap(data) 571 | rb._checkconsistency() 572 | 
assert ref == rb, name 573 | 574 | def test_inititerator(self, single): 575 | for name, data in single: 576 | ref = set(a for a in data) 577 | rb = ImmutableRoaringBitmap(a for a in data) 578 | rb._checkconsistency() 579 | assert ref == rb, name 580 | 581 | def test_initrange(self): 582 | # creates a positive, dense, and inverted block, respectively 583 | for n in [400, 6000, 61241]: 584 | ref = set(range(23, n)) 585 | rb = ImmutableRoaringBitmap(range(23, n)) 586 | rb._checkconsistency() 587 | assert ref == rb, n 588 | 589 | def test_initrb(self): 590 | r = RoaringBitmap(range(5)) 591 | i = ImmutableRoaringBitmap(r) 592 | r = RoaringBitmap(i) 593 | assert r == i 594 | 595 | i = ImmutableRoaringBitmap(range(5)) 596 | r = RoaringBitmap(i) 597 | assert r == i 598 | 599 | def test_pickle(self, single): 600 | for name, data in single: 601 | rb = ImmutableRoaringBitmap(data) 602 | rb_pickled = pickle.dumps(rb, protocol=-1) 603 | rb_unpickled = pickle.loads(rb_pickled) 604 | rb._checkconsistency() 605 | assert rb_unpickled == rb, name 606 | assert type(rb) == ImmutableRoaringBitmap, name 607 | 608 | def test_and(self, pair): 609 | for name, data1, data2 in pair: 610 | ref, ref2 = set(data1), set(data2) 611 | rb = ImmutableRoaringBitmap(data1) 612 | rb2 = ImmutableRoaringBitmap(data2) 613 | assert ref & ref2 == set(rb & rb2), name 614 | assert type(rb & rb2) == RoaringBitmap, name 615 | 616 | def test_or(self, pair): 617 | for name, data1, data2 in pair: 618 | ref, ref2 = set(data1), set(data2) 619 | rb, rb2 = ImmutableRoaringBitmap(data1), ImmutableRoaringBitmap(data2) 620 | assert ref | ref2 == set(rb | rb2), name 621 | 622 | def test_xor(self, pair): 623 | for name, data1, data2 in pair: 624 | ref, ref2 = set(data1), set(data2) 625 | rb, rb2 = ImmutableRoaringBitmap(data1), ImmutableRoaringBitmap(data2) 626 | assert ref ^ ref2 == set(rb ^ rb2), name 627 | 628 | def test_sub(self, pair): 629 | for name, data1, data2 in pair: 630 | ref, ref2 = set(data1), set(data2) 631 
| rb, rb2 = ImmutableRoaringBitmap(data1), ImmutableRoaringBitmap(data2) 632 | assert ref - ref2 == set(rb - rb2), name 633 | 634 | def test_aggregateand(self, multi): 635 | ref = set(multi[0]) 636 | res1 = ref.intersection(*[set(a) for a in multi[1:]]) 637 | rb = ImmutableRoaringBitmap(multi[0]) 638 | res2 = rb.intersection(*[ImmutableRoaringBitmap(a) for a in multi[1:]]) 639 | res2._checkconsistency() 640 | assert res1 == res2 641 | 642 | def test_aggregateor(self, multi): 643 | ref = set(multi[0]) 644 | res1 = ref.union(*[set(a) for a in multi[1:]]) 645 | rb = ImmutableRoaringBitmap(multi[0]) 646 | res2 = rb.union(*[ImmutableRoaringBitmap(a) for a in multi[1:]]) 647 | res2._checkconsistency() 648 | assert res1 == res2 649 | 650 | def test_andlen(self, pair): 651 | for name, data1, data2 in pair: 652 | ref, ref2 = set(data1), set(data2) 653 | rb = ImmutableRoaringBitmap(data1) 654 | rb2 = ImmutableRoaringBitmap(data2) 655 | assert len(rb & rb2) == rb.intersection_len(rb2), name 656 | assert len(ref & ref2) == rb.intersection_len(rb2), name 657 | 658 | def test_orlen(self, pair): 659 | for name, data1, data2 in pair: 660 | ref, ref2 = set(data1), set(data2) 661 | rb = ImmutableRoaringBitmap(data1) 662 | rb2 = ImmutableRoaringBitmap(data2) 663 | assert len(ref | ref2) == rb.union_len(rb2), name 664 | assert len(rb | rb2) == rb.union_len(rb2), name 665 | 666 | def test_jaccard_dist(self, pair): 667 | for name, data1, data2 in pair: 668 | if len(data1) == 0 and len(data2) == 0: 669 | continue 670 | ref, ref2 = set(data1), set(data2) 671 | rb = ImmutableRoaringBitmap(data1) 672 | rb2 = ImmutableRoaringBitmap(data2) 673 | assert len(ref & ref2) / float(len(ref | ref2)) == pytest.approx( 674 | rb.intersection_len(rb2) / float(rb.union_len(rb2))), name 675 | assert (1 - (len(ref & ref2) / float(len(ref | ref2))) 676 | == pytest.approx(rb.jaccard_dist(rb2))), name 677 | 678 | def test_rank(self, single): 679 | for name, data in single: 680 | if len(data) == 0: 681 | 
continue 682 | ref = sorted(set(data)) 683 | rb = ImmutableRoaringBitmap(data) 684 | for _ in range(10): 685 | x = choice(ref) 686 | assert x in rb, name 687 | assert rb.rank(x) == ref.index(x) + 1, name 688 | 689 | def test_select(self, single): 690 | for name, data in single: 691 | if len(data) == 0: 692 | continue 693 | ref = sorted(set(data)) 694 | rb = ImmutableRoaringBitmap(data) 695 | lrb = list(rb) 696 | idx = [0, 1, 2] + [ 697 | randint(0, len(ref) - 1) for _ in range(10)] + [ 698 | len(ref) - 1, len(ref) - 2] 699 | for i in idx: 700 | assert lrb[i] == ref[i], name 701 | assert rb.select(i) in rb, name 702 | assert rb.select(i) == ref[i], name 703 | assert rb.rank(rb.select(i)) - 1 == i, name 704 | if rb.select(i) + 1 in rb: 705 | assert rb.rank(rb.select(i) + 1) - 1 == i + 1, name 706 | else: 707 | assert rb.rank(rb.select(i) + 1) - 1 == i, name 708 | 709 | def test_rank2(self): 710 | rb = ImmutableRoaringBitmap(range(0, 100000, 7)) 711 | rb = rb.union(range(100000, 200000, 1000)) 712 | for k in range(100000): 713 | assert rb.rank(k) == 1 + k // 7 714 | for k in range(100000, 200000): 715 | assert rb.rank(k) == 1 + 100000 // 7 + 1 + (k - 100000) // 1000 716 | 717 | def test_select2(self): 718 | gap = 1 719 | while gap <= 1024: 720 | rb = ImmutableRoaringBitmap(range(0, 100000, gap)) 721 | for k in range(0, 100000 // gap): 722 | assert rb.select(k) == k * gap 723 | gap *= 2 724 | 725 | 726 | class Test_multirb(object): 727 | def test_init(self, multi): 728 | orig = [RoaringBitmap(a) for a in multi] 729 | mrb = MultiRoaringBitmap(orig) 730 | assert len(orig) == len(mrb) 731 | for rb1, rb2 in zip(orig, mrb): 732 | assert rb1 == rb2 733 | 734 | def test_none(self, multi): 735 | orig = [RoaringBitmap(a) for a in multi] 736 | orig.insert(4, RoaringBitmap()) 737 | mrb = MultiRoaringBitmap(orig) 738 | assert len(orig) == len(mrb) 739 | for rb1, rb2 in zip(orig, mrb): 740 | assert rb1 == rb2 741 | assert mrb.intersection([4, 5]) is None 742 | 743 | def 
test_aggregateand(self, multi): 744 | ref = set(multi[0]) 745 | res1 = ref.intersection(*[set(a) for a in multi[1:]]) 746 | mrb = MultiRoaringBitmap([ImmutableRoaringBitmap(a) for a in multi]) 747 | res2 = mrb.intersection(list(range(len(mrb)))) 748 | assert res1 == res2 749 | 750 | def test_jaccard(self, multi): 751 | mrb = MultiRoaringBitmap([ImmutableRoaringBitmap(a) for a in multi]) 752 | indices1 = array.array(b'L' if PY2 else 'L', [0, 6, 8]) 753 | indices2 = array.array(b'L' if PY2 else 'L', [1, 7, 6]) 754 | res = mrb.jaccard_dist(indices1, indices2) 755 | ref = array.array(b'd' if PY2 else 'd', [mrb[i].jaccard_dist(mrb[j]) 756 | for i, j in zip(indices1, indices2)]) 757 | assert res == ref 758 | 759 | def test_andor_len_pairwise(self, multi): 760 | mrb = MultiRoaringBitmap([ImmutableRoaringBitmap(a) for a in multi]) 761 | indices1 = array.array(b'L' if PY2 else 'L', [0, 6, 8]) 762 | indices2 = array.array(b'L' if PY2 else 'L', [1, 7, 6]) 763 | res1 = array.array(b'L' if PY2 else 'L', [0] * len(indices1)) 764 | res2 = array.array(b'L' if PY2 else 'L', [0] * len(indices1)) 765 | mrb.andor_len_pairwise(indices1, indices2, res1, res2) 766 | ref1 = array.array(b'L' if PY2 else 'L') 767 | ref2 = array.array(b'L' if PY2 else 'L') 768 | for i, j in zip(indices1, indices2): 769 | ref1.append(len(mrb[i] & mrb[j])) 770 | ref2.append(len(mrb[i] | mrb[j])) 771 | assert res1 == ref1 772 | assert res2 == ref2 773 | 774 | def test_clamp(self, multi): 775 | a, b = sorted(sample(multi[0], 2)) 776 | ref = set.intersection( 777 | *[set(x) for x in multi]) & set(range(a, b)) 778 | mrb = MultiRoaringBitmap([RoaringBitmap(x) for x in multi]) 779 | rb = mrb.intersection(list(range(len(mrb))), start=a, stop=b) 780 | assert a <= rb.min() and rb.max() < b 781 | assert ref == rb 782 | 783 | def test_serialize(self, multi): 784 | orig = [RoaringBitmap(a) for a in multi] 785 | mrb = MultiRoaringBitmap(orig) 786 | with tempfile.NamedTemporaryFile(delete=False) as tmp: 787 | mrb2 = 
MultiRoaringBitmap(orig, filename=tmp.name) 788 | del mrb2 789 | mrb_deserialized = MultiRoaringBitmap.fromfile(tmp.name) 790 | assert len(orig) == len(mrb) 791 | assert len(orig) == len(mrb_deserialized) 792 | for rb1, rb2, rb3 in zip(orig, mrb, mrb_deserialized): 793 | assert rb1 == rb2 794 | assert rb1 == rb3 795 | rb3._checkconsistency() 796 | assert type(rb3) == ImmutableRoaringBitmap 797 | 798 | def test_multi1(self): 799 | for_multi = [] 800 | for i in range(5): 801 | for_multi += [RoaringBitmap(sample(range(99999), 200))] 802 | mrb = MultiRoaringBitmap(for_multi) 803 | assert len(mrb) == 5 804 | assert mrb[4] == for_multi[4] 805 | with pytest.raises(IndexError): 806 | mrb[5] 807 | assert mrb[-1] == for_multi[-1] 808 | list(mrb) 809 | for n, rb in enumerate(mrb): 810 | assert rb == for_multi[n], n 811 | 812 | def test_multi2(self): 813 | for_multi_pre = [] 814 | for x in range(3): 815 | for_multi = [] 816 | for i in range(5): 817 | for_multi += [RoaringBitmap(sample(range(99999), 200))] 818 | mrb = MultiRoaringBitmap(for_multi) 819 | for_multi_pre += [mrb[0], mrb[1]] 820 | 821 | assert type(for_multi_pre) is list 822 | for_multi_pre[-1] 823 | list(for_multi_pre) 824 | 825 | def test_eq(self, multi): 826 | orig = [RoaringBitmap(a) for a in multi] 827 | mrb = MultiRoaringBitmap(orig) 828 | mrb2 = MultiRoaringBitmap(orig) 829 | mrb3 = MultiRoaringBitmap(orig[1:]) 830 | assert mrb == orig 831 | assert mrb == mrb2 832 | assert mrb != orig[1:] 833 | assert mrb != mrb3 834 | --------------------------------------------------------------------------------