├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── README.md ├── example.cpp └── include ├── BlockEpsilonTree.hpp ├── LABlock.hpp ├── block_tree ├── BlockTree.h ├── HashString.h ├── RabinKarp.h └── blocks │ ├── BackBlock.h │ ├── Block.h │ ├── InternalBlock.h │ └── LeafBlock.h └── la_vector ├── la_vector.hpp └── piecewise_linear_model.hpp /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.toptal.com/developers/gitignore/api/clion,macos 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=clion,macos 3 | 4 | ### CLion ### 5 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 6 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 7 | 8 | # User-specific stuff 9 | .idea/**/workspace.xml 10 | .idea/**/tasks.xml 11 | .idea/**/usage.statistics.xml 12 | .idea/**/dictionaries 13 | .idea/**/shelf 14 | 15 | # Generated files 16 | .idea/**/contentModel.xml 17 | 18 | # Sensitive or high-churn files 19 | .idea/**/dataSources/ 20 | .idea/**/dataSources.ids 21 | .idea/**/dataSources.local.xml 22 | .idea/**/sqlDataSources.xml 23 | .idea/**/dynamic.xml 24 | .idea/**/uiDesigner.xml 25 | .idea/**/dbnavigator.xml 26 | 27 | # Gradle 28 | .idea/**/gradle.xml 29 | .idea/**/libraries 30 | 31 | # Gradle and Maven with auto-import 32 | # When using Gradle or Maven with auto-import, you should exclude module files, 33 | # since they will be recreated, and may cause churn. Uncomment if using 34 | # auto-import. 
35 | # .idea/artifacts 36 | # .idea/compiler.xml 37 | # .idea/jarRepositories.xml 38 | # .idea/modules.xml 39 | # .idea/*.iml 40 | # .idea/modules 41 | # *.iml 42 | # *.ipr 43 | 44 | # CMake 45 | cmake-build-*/ 46 | 47 | # Mongo Explorer plugin 48 | .idea/**/mongoSettings.xml 49 | 50 | # File-based project format 51 | *.iws 52 | 53 | # IntelliJ 54 | out/ 55 | 56 | # mpeltonen/sbt-idea plugin 57 | .idea_modules/ 58 | 59 | # JIRA plugin 60 | atlassian-ide-plugin.xml 61 | 62 | # Cursive Clojure plugin 63 | .idea/replstate.xml 64 | 65 | # Crashlytics plugin (for Android Studio and IntelliJ) 66 | com_crashlytics_export_strings.xml 67 | crashlytics.properties 68 | crashlytics-build.properties 69 | fabric.properties 70 | 71 | # Editor-based Rest Client 72 | .idea/httpRequests 73 | 74 | # Android studio 3.1+ serialized cache file 75 | .idea/caches/build_file_checksums.ser 76 | 77 | ### CLion Patch ### 78 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 79 | 80 | # *.iml 81 | # modules.xml 82 | # .idea/misc.xml 83 | # *.ipr 84 | 85 | # Sonarlint plugin 86 | .idea/**/sonarlint/ 87 | 88 | # SonarQube Plugin 89 | .idea/**/sonarIssues.xml 90 | 91 | # Markdown Navigator plugin 92 | .idea/**/markdown-navigator.xml 93 | .idea/**/markdown-navigator-enh.xml 94 | .idea/**/markdown-navigator/ 95 | 96 | # Cache file creation bug 97 | # See https://youtrack.jetbrains.com/issue/JBR-2257 98 | .idea/$CACHE_FILE$ 99 | 100 | ### macOS ### 101 | # General 102 | .DS_Store 103 | .AppleDouble 104 | .LSOverride 105 | 106 | # Icon must end with two \r 107 | Icon 108 | 109 | # Thumbnails 110 | ._* 111 | 112 | # Files that might appear in the root of a volume 113 | .DocumentRevisions-V100 114 | .fseventsd 115 | .Spotlight-V100 116 | .TemporaryItems 117 | .Trashes 118 | .VolumeIcon.icns 119 | .com.apple.timemachine.donotpresent 120 | 121 | # Directories potentially created on remote AFP share 122 | .AppleDB 123 | .AppleDesktop 124 | Network Trash Folder 
125 | Temporary Items 126 | .apdisk 127 | 128 | # End of https://www.toptal.com/developers/gitignore/api/clion,macos -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "lib/sdsl-lite"] 2 | path = lib/sdsl-lite 3 | url = https://github.com/xxsds/sdsl-lite.git 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.12) 2 | project(BlockEpsilonTree) 3 | 4 | set(CMAKE_CXX_STANDARD 17) 5 | if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug") 6 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") 7 | endif () 8 | 9 | find_package(OpenMP) 10 | if (OpenMP_CXX_FOUND) 11 | link_libraries(OpenMP::OpenMP_CXX) 12 | endif () 13 | 14 | include_directories(lib/sdsl-lite/include include) 15 | 16 | add_executable(example example.cpp) 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. 
We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 
49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. 
The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 
122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 
155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 
186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. 
This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 
256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 
287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 
317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. 
If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 
386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 
421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | <one line to give the program's name and a brief idea of what it does.> 635 | Copyright (C) <year> <name of author> 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see <https://www.gnu.org/licenses/>. 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | <program> Copyright (C) <year> <name of author> 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | <https://www.gnu.org/licenses/>. 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | <https://www.gnu.org/licenses/why-not-lgpl.html>. 675 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

Block-ε tree

2 | 3 | The [block-ε tree](http://pages.di.unipi.it/vinciguerra/publication/repetition-and-linearity-aware-rank-select-dictionaries/) is a compressed rank/select dictionary that achieves new space-time trade-offs by exploiting the approximate linearity and the repetitiveness of the data. 4 | It is based on a combination of the _LA-vector_ ([paper](https://doi.org/10.1137/1.9781611976472.4), [code](https://github.com/gvinciguerra/la_vector)) and the _block tree_ ([paper](https://doi.org/10.1016/j.jcss.2020.11.002), [code](https://github.com/elarielcl/BlockTrees)). 5 | 6 | ## Usage 7 | 8 | This is a header-only library. To compile the [example](example.cpp), use the following commands: 9 | 10 | ```sh 11 | git clone https://github.com/gvinciguerra/BlockEpsilonTree.git 12 | cd BlockEpsilonTree 13 | cmake . -DCMAKE_BUILD_TYPE=Release 14 | make -j8 15 | ``` 16 | 17 | ## License 18 | 19 | This project is released for academic purposes under the terms of the GNU General Public License v3.0. Some methods implemented in this project are **patent pending**. 20 | 21 | If you use this code for your research, please cite: 22 | 23 | > Paolo Ferragina, Giovanni Manzini, and Giorgio Vinciguerra. Repetition- and linearity-aware rank/select dictionaries. In: Proceedings of the 32nd International Symposium on Algorithms and Computation (ISAAC), 2021. 
24 | 25 | ```bibtex 26 | @inproceedings{Ferragina:2021isaac, 27 | author = {Ferragina, Paolo and Manzini, Giovanni and Vinciguerra, Giorgio}, 28 | booktitle = {Proceedings of the 32nd International Symposium on Algorithms and Computation (ISAAC)}, 29 | title = {Repetition- and linearity-aware rank/select dictionaries}, 30 | year = {2021}} 31 | ``` -------------------------------------------------------------------------------- /example.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <random> 3 | #include <vector> 4 | #include "BlockEpsilonTree.hpp" 5 | // Demo: fills a vector with increasing integers separated by random gaps (default-seeded engine, so every run generates the same data), builds a block-ε tree on it and prints a few statistics. 6 | int main() { 7 | std::mt19937 engine; 8 | std::geometric_distribution distribution(0.85); 9 | std::vector data(1000000); 10 | for (size_t i = 1; i < data.size(); ++i) // size_t index: no signed/unsigned comparison against data.size() 11 | data[i] = data[i - 1] + distribution(engine) + 1; // each gap is at least 1 12 | 13 | BlockEpsilonTree bet(data, 2); 14 | 15 | std::cout << "Bits per integer: " << 8. * bet.size_in_bytes() / data.size() << std::endl 16 | << "Average depth: " << bet.get_metadata()["average_depth"] << std::endl 17 | << "# of elements <= 500: " << bet.rank(500) << std::endl 18 | << "10th smallest element: " << bet.select(10) << std::endl; 19 | 20 | return 0; 21 | } 22 | -------------------------------------------------------------------------------- /include/BlockEpsilonTree.hpp: -------------------------------------------------------------------------------- 1 | // This file is part of . 2 | // Copyright (c) 2021 Giorgio Vinciguerra. 3 | // 4 | // This program is free software: you can redistribute it and/or modify 5 | // it under the terms of the GNU General Public License as published by 6 | // the Free Software Foundation, version 3. 
7 | // 8 | // This program is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 
12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with this program. If not, see . 15 | // 16 | // NOTE: some methods implemented here are patent pending. 17 | 18 | #pragma once 19 | 20 | #include "LABlock.hpp" 21 | #include "block_tree/BlockTree.h" 22 | #include "block_tree/blocks/Block.h" 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | 44 | //#define BLOCK_EPS_DEBUG 45 | 46 | #ifdef BLOCK_EPS_DEBUG 47 | #define IF_DEBUG(X) { X; } 48 | #define DEBUG_OUT(X) { std::cout << X; } 49 | #define DEBUG_OUTLN(X) { std::cout << X << std::endl; } 50 | #else 51 | #define IF_DEBUG(X) 52 | #define DEBUG_OUT(X) 53 | #define DEBUG_OUTLN(X) 54 | #endif 55 | 56 | class BlockEpsilonTree { 57 | using rank_type = sdsl::rank_support_v5<>; 58 | uint8_t branching_factor = 0; 59 | size_t starting_block_size = 0; 60 | uint8_t number_of_levels = 0; 61 | uint8_t ptr_shift = 0; 62 | uint64_t leaves_shift = 0; 63 | 64 | /** Elias-Fano representation without efficient rank support. 
*/ 65 | using elias_fano = sdsl::sd_vector>; 66 | 67 | std::vector bitvectors; ///< 1 if internal block, 0 if either left pointer or an LA-block 68 | std::vector bitvectors_rank; ///< the rank structure associated to the corresponding level bv 69 | std::vector> pointers; ///< one pointer for each 0 in the corresponding level bv 70 | std::vector> difference_values; ///< one value for each 0 in the corresponding level bv 71 | std::vector la_blocks; ///< LA-blocks sorted according to the block tree in-order visit 72 | sdsl::sd_vector<> leaves; 73 | sdsl::sd_vector<>::rank_1_type leaves_rank; 74 | sdsl::sd_vector<>::select_1_type leaves_select; 75 | // sdsl::int_vector<> leaves; 76 | 77 | std::map metadata; 78 | 79 | // Structures needed for rank 80 | std::vector> samples; ///< per-level samples. one sample for each child of an internal block 81 | 82 | class Pointer; 83 | 84 | public: 85 | 86 | BlockEpsilonTree(std::vector &data, 87 | uint8_t branching_factor, 88 | size_t leaf_size = 8, 89 | size_t hint_starting_block_size = 1 << 22) 90 | : branching_factor(branching_factor) { 91 | assert(std::is_sorted(data.begin(), data.end())); 92 | 93 | // (1) Construct the pointer-based block tree 94 | std::basic_string gap_string; 95 | gap_string.reserve(data.size() + hint_starting_block_size); 96 | gap_string.push_back(uint64_t(data.front())); 97 | for (auto it = data.begin() + 1; it != data.end(); ++it) 98 | gap_string.push_back(uint64_t(*it - *std::prev(it))); 99 | 100 | BlockTree bt(gap_string, branching_factor, leaf_size); 101 | bt.process_back_pointers(hint_starting_block_size); 102 | bt.clean_unnecessary_expansions(); 103 | 104 | // (1.1) Find the first level, which is the one containing at least one back block 105 | std::vector first_level = {bt.root_block_}; 106 | while (true) { 107 | auto it = std::find_if(first_level.begin(), first_level.end(), [](auto &b) { return b->is_leaf(); }); 108 | if (it != first_level.end()) 109 | break; 110 | first_level = 
bt.next_level(first_level, false); 111 | } 112 | 113 | // (1.2) Assign a cost to the nodes of the block tree 114 | const auto n = data.size(); 115 | const auto u = data.back(); 116 | const auto log_n = BIT_WIDTH(n); 117 | const auto log_u = BIT_WIDTH(u); 118 | 119 | size_t leaf_elements = 0; 120 | for (auto &b : first_level) 121 | leaf_elements += b->count_leaf_elements(); 122 | const auto leaf_approx_cost = std::ceil(ef_cost(leaf_elements, u - data.front()) / leaf_elements); 123 | 124 | auto cost = 0; 125 | for (auto &b : first_level) 126 | cost += b->la_visit(data.data(), log_n, log_u, 0, leaf_approx_cost); 127 | 128 | // (1.3) Assign IDs to LA blocks 129 | size_t la_blocks_count = 0; 130 | for (auto &b : first_level) 131 | la_blocks_count = b->number_la_blocks(la_blocks_count); 132 | 133 | la_blocks.resize(la_blocks_count); 134 | ptr_shift = la_blocks_count > 0 ? BIT_WIDTH(la_blocks_count - 1) : 0; 135 | if (log_n + ptr_shift + Pointer::selector_width > 64) 136 | throw std::overflow_error("Pointers too large"); 137 | 138 | // (1.4) Compute stats on the pruned tree 139 | size_t depth = 0; 140 | long double depth_sum = 0; 141 | for (auto &b : first_level) { 142 | auto[subtree_max_depth, subtree_depth_sum] = b->compute_depth(0); 143 | depth = std::max(depth, subtree_max_depth); 144 | depth_sum += subtree_depth_sum; 145 | } 146 | const auto average_depth = depth_sum / data.size(); 147 | 148 | // (2) Level-wise compress the tree topology 149 | bitvectors_rank.reserve(depth); 150 | pointers.reserve(depth); 151 | bitvectors.reserve(depth); 152 | difference_values.reserve(depth); 153 | 154 | starting_block_size = first_level[0]->length(); 155 | number_of_levels = 0; 156 | 157 | auto current_level = std::move(first_level); 158 | auto next_level = bt.next_level(current_level, true); 159 | 160 | sdsl::int_vector<> top_samples; 161 | top_samples.reserve(current_level.size()); 162 | for (auto &b : current_level) 163 | top_samples.push_back(b->end_ < n ? 
data[b->end_] : u); 164 | sdsl::util::bit_compress(top_samples); 165 | samples.push_back(std::move(top_samples)); 166 | 167 | while (!next_level.empty()) { 168 | // (2.1) Compress the level 169 | sdsl::bit_vector level_bv(current_level.size()); 170 | sdsl::int_vector<> level_difference_values; 171 | sdsl::int_vector<> level_pointers; 172 | sdsl::int_vector<> level_samples; 173 | level_difference_values.reserve(current_level.size()); 174 | level_pointers.reserve(current_level.size()); 175 | level_samples.reserve(current_level.size()); 176 | DEBUG_OUT("Level " << int(number_of_levels) << " samples: "); 177 | 178 | auto block_size = current_level.front()->length(); 179 | for (size_t i = 0; i < current_level.size(); ++i) { 180 | auto *&block = current_level[i]; 181 | block->level_start_pos_ = i; 182 | 183 | if (block->is_la_leaf()) { 184 | assert(!block->is_leaf()); 185 | 186 | BlockLAType la; 187 | if (!block->get_la(data.data(), la)) 188 | throw std::runtime_error(""); 189 | la_blocks[block->la_block_id_] = la; 190 | DEBUG_OUTLN("Pruned " << block->start_ << " " << block->end_ << " pos=" << block->la_block_id_); 191 | 192 | level_bv[i] = false; 193 | level_pointers.push_back(Pointer::make(0b11, 0, block->la_block_id_, ptr_shift)); 194 | level_difference_values.push_back(0); 195 | } else if (block->is_leaf()) { 196 | auto *&left = block->first_block_; 197 | auto *&right = block->second_block_; 198 | assert(right || block->offset_ == 0); 199 | 200 | level_bv[i] = false; 201 | level_difference_values.push_back(data[block->start_] - data[left->start_ + block->offset_]); 202 | 203 | uint64_t pointer; 204 | uint64_t position; 205 | auto left_is_pruned = left->parent_->is_la_leaf(); 206 | auto right_is_pruned = (right && right->parent_->is_la_leaf()) || (left_is_pruned && !right); 207 | auto selector = uint8_t((left_is_pruned << 1) | right_is_pruned); 208 | switch (selector) { 209 | case 0b00: { 210 | position = left->level_start_pos_ * block_size + block->offset_; 211 | 
pointer = Pointer::make(selector, position, Pointer::nil, ptr_shift); 212 | break; 213 | } 214 | 215 | case 0b01: { 216 | position = left->level_start_pos_ * block_size + block->offset_; 217 | pointer = Pointer::make(selector, position, right->parent_->la_block_id_, ptr_shift); 218 | break; 219 | } 220 | 221 | case 0b10: { 222 | auto offset = int64_t(right->level_start_pos_ * block_size + block->offset_) - block_size; 223 | auto shift = left->start_ - left->parent_->start_; 224 | position = offset + shift; 225 | pointer = Pointer::make(selector, position, left->parent_->la_block_id_, ptr_shift); 226 | break; 227 | } 228 | 229 | case 0b11: { 230 | position = block->offset_ + left->start_ - left->parent_->start_; 231 | pointer = Pointer::make(selector, position, left->parent_->la_block_id_, ptr_shift); 232 | break; 233 | } 234 | } 235 | 236 | level_pointers.push_back(pointer); 237 | } else { 238 | level_bv[i] = true; 239 | for (auto &c : block->children_) 240 | level_samples.push_back(c->end_ >= n ? 
u : data[c->end_]); 241 | } 242 | } 243 | 244 | // (2.2) Store the level 245 | DEBUG_OUTLN("BITV:\t" << level_bv << std::endl 246 | << "DIFF:\t" << level_difference_values << std::endl 247 | << "PTRS:\t" << level_pointers << std::endl 248 | << std::string(80, '-')); 249 | 250 | samples.emplace_back(std::move(level_samples)); 251 | pointers.emplace_back(std::move(level_pointers)); 252 | bitvectors.emplace_back(std::move(level_bv)); 253 | difference_values.emplace_back(std::move(level_difference_values)); 254 | sdsl::util::bit_compress(pointers.back()); 255 | sdsl::util::bit_compress(difference_values.back()); 256 | sdsl::util::bit_compress(samples.back()); 257 | 258 | current_level = std::move(next_level); 259 | next_level = bt.next_level(current_level, true); 260 | ++number_of_levels; 261 | } 262 | 263 | // (3) Init auxiliary structures and prepare leaf string 264 | if (!samples.empty()) 265 | samples.pop_back(); 266 | 267 | for (auto &bv : bitvectors) 268 | bitvectors_rank.emplace_back(&bv); 269 | 270 | ++number_of_levels; 271 | 272 | // std::vector tmp_leaves; 273 | // tmp_leaves.reserve(current_level.size() * bt.leaf_length_); 274 | // for (const auto &b: current_level) { 275 | // std::copy(data.begin() + b->start_, 276 | // std::min(data.end(), data.begin() + b->end_ + 1), 277 | // std::back_inserter(tmp_leaves)); 278 | // } 279 | // auto leaves_count = tmp_leaves.size(); 280 | // leaves = decltype(leaves)(tmp_leaves.begin(), tmp_leaves.end()); 281 | 282 | size_t leaves_count = 0; 283 | for (const auto &b: current_level) 284 | leaves_count += b->actual_length(); 285 | 286 | if (leaves_count) { 287 | leaves_shift = data[current_level.front()->start_]; 288 | auto leaves_u = data[std::min(data.size() - 1, current_level.back()->end_)] - leaves_shift; 289 | sdsl::sd_vector_builder builder(leaves_u + 1, leaves_count); 290 | for (const auto &b: current_level) 291 | for (size_t j = b->start_; j < std::min(data.size(), b->end_ + 1); ++j) 292 | builder.set(data[j] - 
leaves_shift); 293 | leaves = decltype(leaves)(builder); 294 | } 295 | sdsl::util::init_support(leaves_rank, &leaves); 296 | sdsl::util::init_support(leaves_select, &leaves); 297 | 298 | // (4) Compute stats / metadata 299 | size_t la_bytes = 0; 300 | for (auto &l: la_blocks) 301 | la_bytes += l.size_in_bytes(); 302 | 303 | size_t internal_nodes_count = 0; 304 | size_t back_pointers_count = 0; 305 | for (size_t i = 0; i < bitvectors.size(); ++i) { 306 | auto num_ones = bitvectors_rank[i].rank(bitvectors[i].size()); 307 | internal_nodes_count += num_ones; 308 | back_pointers_count += bitvectors[i].size() - num_ones; 309 | } 310 | back_pointers_count -= la_blocks_count; 311 | 312 | auto to_bpi = [&](auto bytes) { return std::to_string(bytes * 8. / n); }; 313 | metadata["bpi"] = to_bpi(size_in_bytes()); 314 | metadata["bitvectors_bpi"] = to_bpi(sdsl::size_in_bytes(bitvectors)); 315 | metadata["bitvectors_rank_bpi"] = to_bpi(sdsl::size_in_bytes(bitvectors_rank)); 316 | metadata["pointers_bpi"] = to_bpi(sdsl::size_in_bytes(pointers)); 317 | metadata["difference_values_bpi"] = to_bpi(sdsl::size_in_bytes(difference_values)); 318 | metadata["samples_bpi"] = to_bpi(sdsl::size_in_bytes(samples)); 319 | metadata["leaves_bpi"] = to_bpi(sdsl::size_in_bytes(leaves)); 320 | metadata["la_blocks_bpi"] = to_bpi(la_bytes); 321 | metadata["la_blocks_count"] = std::to_string(la_blocks_count); 322 | metadata["internal_nodes_count"] = std::to_string(internal_nodes_count); 323 | metadata["back_pointers_count"] = std::to_string(back_pointers_count); 324 | metadata["leaves_count"] = std::to_string(leaves_count); 325 | metadata["depth"] = std::to_string(depth); 326 | metadata["average_depth"] = std::to_string(average_depth); 327 | metadata["starting_block_size"] = std::to_string(starting_block_size); 328 | } 329 | 330 | std::map get_metadata() { return metadata; } 331 | 332 | size_t size_in_bytes() const { 333 | size_t sum = 0; 334 | sum += sdsl::size_in_bytes(bitvectors); 335 | sum += 
sdsl::size_in_bytes(bitvectors_rank); 336 | sum += sdsl::size_in_bytes(pointers); 337 | sum += sdsl::size_in_bytes(difference_values); 338 | sum += sdsl::size_in_bytes(samples); 339 | for (auto &l: la_blocks) 340 | sum += l.size_in_bytes(); 341 | sum += sdsl::size_in_bytes(leaves); 342 | return sum; 343 | } 344 | 345 | size_t rank(uint64_t x) const { 346 | if (samples.empty()) 347 | return leaves_rank(std::max(x, leaves_shift) - leaves_shift); 348 | 349 | auto block_size = starting_block_size; 350 | auto &top_samples = samples.front(); 351 | auto block = size_t(std::lower_bound(top_samples.begin(), top_samples.end(), x) - top_samples.begin()); 352 | auto x_remapped = x; 353 | auto input_shift = block * block_size; 354 | 355 | for (auto level = 0; level < number_of_levels - 1; ++level) { 356 | if (!bitvectors[level][block]) { 357 | auto[diff_val, ptr] = get_leftward_data(level, block); 358 | x_remapped -= diff_val; 359 | 360 | switch (ptr.get_selector()) { 361 | case 0b00: { 362 | block = ptr.get_position() / block_size; 363 | input_shift -= ptr.get_position() % block_size; 364 | auto sample = samples[level][block]; 365 | if (x_remapped > sample) { 366 | ++block; 367 | input_shift += block_size; 368 | } 369 | break; 370 | } 371 | 372 | case 0b01: { 373 | block = ptr.get_position() / block_size; 374 | input_shift -= ptr.get_position() % block_size; 375 | auto sample = samples[level][block]; 376 | if (x_remapped > sample) { 377 | auto &la_block = la_blocks[ptr.get_right_la_block()]; 378 | return input_shift + block_size + la_block.rank(x_remapped, 0, block_size); 379 | } 380 | break; 381 | } 382 | 383 | case 0b10: { 384 | auto &la_block = la_blocks[ptr.get_left_la_block()]; 385 | input_shift -= ptr.get_position() % block_size; 386 | block = ptr.get_position() / block_size + 1 - (la_block.size() - block_size) / block_size; 387 | auto sample = samples[level][block]; 388 | if (x_remapped <= sample) { 389 | auto lo = la_block.size() - block_size; 390 | auto hi = 
la_block.size(); 391 | return input_shift + la_block.rank(x_remapped, lo, hi) - (la_block.size() - block_size); 392 | } 393 | input_shift += block_size; 394 | break; 395 | } 396 | 397 | case 0b11: { 398 | auto k = ptr.get_position(); 399 | input_shift -= k; 400 | auto &left_la_block = la_blocks[ptr.get_left_la_block()]; 401 | if (k < left_la_block.size()) { 402 | auto lo = ptr.get_position(); 403 | auto hi = left_la_block.size(); 404 | return input_shift + left_la_block.rank(x_remapped, lo, hi); 405 | } 406 | auto &right_la_block = la_blocks[ptr.get_right_la_block()]; 407 | auto lo = ptr.get_position() - left_la_block.size(); 408 | auto hi = ptr.get_position() + block_size; 409 | return input_shift + right_la_block.rank(x_remapped, lo, hi); 410 | } 411 | } 412 | } 413 | 414 | block = bitvectors_rank[level].rank(block) * branching_factor; 415 | uint8_t child = 0; 416 | if (level != number_of_levels - 2) { // There are no samples at the last level because we use EF with rank 417 | for (; child < branching_factor - 1; ++child) { 418 | if (block + child >= samples[level + 1].size()) 419 | break; 420 | auto child_ub = samples[level + 1][block + child]; 421 | if (x_remapped <= child_ub) 422 | break; 423 | } 424 | } 425 | block = block + child; 426 | block_size /= branching_factor; 427 | input_shift += child * block_size; 428 | } 429 | 430 | // Uncomment if there is no efficient rank support on the leaves 431 | // auto start = block * block_size; 432 | // auto j = start; 433 | // uint64_t val = 0; 434 | // for (; j < start + block_size; ++j) { 435 | // val = leaves_shift + leaves_select(j + 1); 436 | // if (val >= x_remapped) 437 | // break; 438 | // } 439 | // return input_shift + j - start; 440 | 441 | auto leaves_rank_val = x_remapped <= leaves_shift ? 
0 : leaves_rank(x_remapped - leaves_shift); 442 | return input_shift + leaves_rank_val - block * block_size; 443 | } 444 | 445 | uint64_t select(size_t i) const { 446 | assert(i > 0); 447 | return operator[](i - 1); 448 | } 449 | 450 | uint64_t operator[](size_t i) const { 451 | auto block_size = starting_block_size; 452 | auto block = i / block_size; 453 | auto offset = i % block_size; 454 | auto result = uint64_t(0); 455 | 456 | for (auto level = 0; level < number_of_levels - 1; ++level) { 457 | if (!bitvectors[level][block]) { 458 | auto[diff_val, ptr] = get_leftward_data(level, block); 459 | result += diff_val; 460 | 461 | switch (ptr.get_selector()) { 462 | case 0b00: 463 | offset += ptr.get_position() % block_size; 464 | block = ptr.get_position() / block_size; 465 | if (offset >= block_size) { 466 | ++block; 467 | offset -= block_size; 468 | } 469 | break; 470 | 471 | case 0b01: 472 | offset += ptr.get_position() % block_size; 473 | block = ptr.get_position() / block_size; 474 | if (offset >= block_size) 475 | return result + la_blocks[ptr.get_right_la_block()][offset - block_size]; 476 | break; 477 | 478 | case 0b10: { 479 | offset += ptr.get_position() % block_size; 480 | auto &la_block = la_blocks[ptr.get_left_la_block()]; 481 | if (offset < block_size) 482 | return result + la_block[la_block.size() - block_size + offset]; 483 | block = ptr.get_position() / block_size + 1 - (la_block.size() - block_size) / block_size; 484 | offset -= block_size; 485 | break; 486 | } 487 | 488 | case 0b11: 489 | offset += ptr.get_position(); 490 | auto &left_la_block = la_blocks[ptr.get_left_la_block()]; 491 | if (offset < left_la_block.size()) 492 | return result + left_la_block[offset]; 493 | auto &right_la_block = la_blocks[ptr.get_right_la_block()]; 494 | return result + right_la_block[offset - left_la_block.size()]; 495 | } 496 | } 497 | block_size /= branching_factor; 498 | auto child = offset / block_size; 499 | block = bitvectors_rank[level].rank(block) * 
branching_factor + child; 500 | offset -= child * block_size; 501 | } 502 | 503 | return result + leaves_shift + leaves_select(block * block_size + offset + 1); 504 | // return result + leaves_shift + leaves[block * block_size + offset]; 505 | } 506 | 507 | private: 508 | 509 | class Pointer { 510 | uint8_t selector; 511 | size_t position; 512 | size_t la_block; 513 | 514 | public: 515 | 516 | static constexpr uint8_t selector_width = 2; 517 | static constexpr auto nil = std::numeric_limits::max(); 518 | 519 | Pointer(uint64_t value, uint8_t shift) { 520 | selector = value & sdsl::bits::lo_set[selector_width]; 521 | if (selector) { 522 | la_block = (value >> selector_width) & sdsl::bits::lo_set[shift]; 523 | la_block -= selector == 0b01; 524 | position = value >> (selector_width + shift); 525 | } else 526 | position = value >> selector_width; 527 | } 528 | 529 | static uint64_t make(uint8_t selector, size_t offset, size_t la_block, uint8_t shift) { 530 | if (selector == 0) { 531 | assert(la_block == nil); 532 | return offset << selector_width; 533 | } 534 | 535 | assert(selector <= 3); 536 | assert(la_block != nil); 537 | uint64_t ptr = 0; 538 | ptr |= selector; 539 | ptr |= la_block << selector_width; 540 | ptr |= offset << (selector_width + shift); 541 | return ptr; 542 | } 543 | 544 | uint8_t get_selector() const { return selector; } 545 | size_t get_position() const { return position; } 546 | size_t get_left_la_block() const { return la_block; } 547 | size_t get_right_la_block() const { return la_block + 1; } 548 | }; 549 | 550 | std::pair get_leftward_data(size_t level, size_t block) const { 551 | auto rank0 = block - bitvectors_rank[level].rank(block); 552 | return {difference_values[level][rank0], Pointer(pointers[level][rank0], ptr_shift)}; 553 | } 554 | 555 | }; -------------------------------------------------------------------------------- /include/LABlock.hpp: -------------------------------------------------------------------------------- 1 | // This 
file is part of . 2 | // Copyright (c) 2021 Giorgio Vinciguerra. 3 | // 4 | // This program is free software: you can redistribute it and/or modify 5 | // it under the terms of the GNU General Public License as published by 6 | // the Free Software Foundation, version 3. 7 | // 8 | // This program is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with this program. If not, see . 15 | // 16 | // NOTE: some methods implemented here are patent pending. 17 | 18 | #pragma once 19 | 20 | #include "la_vector/piecewise_linear_model.hpp" 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | /** Computes (bits_per_correction > 0 ? 2^(bits_per_correction-1) - 1 : 0) without the conditional operator. */ 27 | #define BPC_TO_EPSILON(bits_per_correction) (((1ul << (bits_per_correction)) + 1) / 2 - 1) 28 | 29 | /** Computes the number of bits needed to store x, that is, 0 if x is 0, 1 + floor(log2(x)) otherwise. */ 30 | #define BIT_WIDTH(x) ((x) == 0 ? 0 : 64 - __builtin_clzll(x)) 31 | 32 | /** Computes the smallest integral value not less than x / y, where x and y must be positive integers. 
*/ 33 | #define CEIL_UINT_DIV(x, y) ((x) / (y) + ((x) % (y) != 0)) 34 | 35 | #pragma pack(push, 1) 36 | 37 | template 38 | class LABlock { 39 | static constexpr uint8_t field_bits_for_bpc = 5; 40 | static constexpr uint8_t field_bits_for_n = 32 - field_bits_for_bpc; 41 | 42 | K slope_numerator; 43 | K slope_denominator; 44 | std::make_signed_t intercept; 45 | uint32_t n: field_bits_for_n; 46 | uint8_t bpc: field_bits_for_bpc; 47 | std::unique_ptr corrections; 48 | using larger_key_type = typename std::conditional_t; 49 | 50 | public: 51 | 52 | LABlock() = default; 53 | 54 | LABlock(const LABlock &la) 55 | : slope_numerator(la.slope_numerator), 56 | slope_denominator(la.slope_denominator), 57 | intercept(la.intercept), 58 | n(la.n), 59 | bpc(la.bpc), 60 | corrections() { 61 | if (bpc) { 62 | auto s = CEIL_UINT_DIV(bpc * n, 64) + 1; 63 | corrections = std::make_unique(s); 64 | std::copy(la.corrections.get(), la.corrections.get() + s, corrections.get()); 65 | } 66 | } 67 | 68 | LABlock &operator=(const LABlock &la) { 69 | slope_numerator = la.slope_numerator; 70 | slope_denominator = la.slope_denominator; 71 | intercept = la.intercept; 72 | n = la.n; 73 | bpc = la.bpc; 74 | if (bpc) { 75 | auto s = CEIL_UINT_DIV(bpc * n, 64) + 1; 76 | corrections = std::make_unique(s); 77 | std::copy(la.corrections.get(), la.corrections.get() + s, corrections.get()); 78 | } 79 | return *this; 80 | } 81 | 82 | larger_key_type approximate(size_t i) const { 83 | return larger_key_type(slope_numerator * i) / slope_denominator + intercept; 84 | } 85 | 86 | K operator[](size_t i) const { 87 | if (!bpc) 88 | return approximate(i); 89 | auto epsilon = BPC_TO_EPSILON(bpc); 90 | auto j = i * bpc; 91 | auto correction = sdsl::bits::read_int(corrections.get() + (j >> 6u), j & 0x3F, bpc); 92 | return approximate(i) + correction - epsilon; 93 | } 94 | 95 | size_t rank(const K &value) const { 96 | auto[pos, bound] = approximate_position(value); 97 | auto lo = pos <= bound ? 
0 : pos - bound; 98 | auto hi = std::min(pos + bound + 1, n); 99 | 100 | while (lo < hi) { 101 | auto mid = lo + (hi - lo) / 2; 102 | if (operator[](mid) < value) 103 | lo = mid + 1; 104 | else 105 | hi = mid; 106 | } 107 | 108 | return lo; 109 | } 110 | 111 | size_t rank(const K &value, size_t lo_bound, size_t hi_bound) const { 112 | auto[pos, bound] = approximate_position(value); 113 | auto lo = std::max(pos <= bound ? 0 : pos - bound, lo_bound); 114 | auto hi = std::min(pos + bound + 1, hi_bound); 115 | 116 | while (lo < hi) { 117 | auto mid = lo + (hi - lo) / 2; 118 | if (operator[](mid) < value) 119 | lo = mid + 1; 120 | else 121 | hi = mid; 122 | } 123 | 124 | return lo; 125 | } 126 | 127 | std::pair approximate_position(const K &value) const { 128 | auto numerator = std::max(1, slope_numerator); 129 | auto position = ((larger_key_type(value) - intercept) * slope_denominator) / numerator; 130 | auto epsilon = larger_key_type(BPC_TO_EPSILON(this->bpc)); 131 | auto bound = 1 + (epsilon * slope_denominator) / numerator; 132 | return {std::clamp(position, 0, n), bound}; 133 | } 134 | 135 | template 136 | static bool make(RandomIt begin, RandomIt end, uint8_t bpc, LABlock &out) { 137 | const auto n = (size_t) std::distance(begin, end); 138 | if (BIT_WIDTH(n) > field_bits_for_n) 139 | throw std::overflow_error("increase bits assigned to n"); 140 | if (BIT_WIDTH(bpc) > field_bits_for_bpc) 141 | throw std::overflow_error("increase bits assigned to bpc"); 142 | 143 | const auto epsilon = BPC_TO_EPSILON(bpc); 144 | 145 | OptimalPiecewiseLinearModel opt(epsilon); 146 | opt.add_point(0, begin[0]); 147 | 148 | for (size_t i = 1; i < n; ++i) 149 | if (!opt.add_point(i, begin[i])) 150 | return false; 151 | 152 | auto cs = opt.get_segment(); 153 | auto max_slope = cs.rectangle[3] - cs.rectangle[1]; 154 | auto intercept_numerator = cs.rectangle[3].x * cs.rectangle[1].y - cs.rectangle[1].x * cs.rectangle[3].y; 155 | out.slope_numerator = max_slope.dy; 156 | 
out.slope_denominator = max_slope.dx; 157 | out.intercept = max_slope.dx == 0 ? begin[0] : intercept_numerator / max_slope.dx; 158 | out.n = n; 159 | out.bpc = bpc; 160 | if (bpc == 0) { 161 | out.corrections = nullptr; 162 | return true; 163 | } 164 | 165 | out.corrections = std::make_unique(CEIL_UINT_DIV(bpc * n, 64) + 1); 166 | for (size_t i = 0; i < n; ++i) { 167 | auto error = begin[i] - out.approximate(i); 168 | auto correction = uint64_t(error + epsilon); 169 | if (BIT_WIDTH(correction) > bpc) 170 | throw std::overflow_error("Segment correction too large"); 171 | auto j = i * bpc; 172 | sdsl::bits::write_int(out.corrections.get() + (j >> 6), correction, j & 0x3F, bpc); 173 | } 174 | 175 | return true; 176 | } 177 | 178 | size_t size() const { return n; } 179 | 180 | uint8_t get_bpc() const { return bpc; } 181 | 182 | size_t size_in_bytes() const { return sizeof(*this) + n * bpc / 8; } 183 | }; 184 | 185 | #pragma pack(pop) -------------------------------------------------------------------------------- /include/block_tree/BlockTree.h: -------------------------------------------------------------------------------- 1 | // This source file is adapted from Manuel Cáceres's https://github.com/elarielcl/BlockTrees, distributed under GPL-3.0 2 | 3 | #pragma once 4 | 5 | #include "RabinKarp.h" 6 | #include "HashString.h" 7 | #include "blocks/Block.h" 8 | #include "blocks/BackBlock.h" 9 | #include "blocks/InternalBlock.h" 10 | #include "blocks/LeafBlock.h" 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | class BlockTree { 18 | using block_map = std::unordered_map>; 19 | using block_pairs_map = std::unordered_map>>; 20 | 21 | void block_scan(std::vector &, int, block_map &); 22 | public: 23 | uint8_t r_; 24 | size_t leaf_length_; 25 | const std::basic_string &input_; 26 | Block *root_block_; 27 | 28 | BlockTree(const std::basic_string &, uint8_t, size_t); 29 | ~BlockTree(); 30 | 31 | void process_back_pointers(size_t starting_block_size); 32 | 
void clean_unnecessary_expansions(); 33 | 34 | void process_level(std::vector &); 35 | 36 | void forward_window_block_scan(std::vector &level, 37 | uint32_t window_size, 38 | uint32_t N, 39 | block_map &hashtable); 40 | void forward_pair_window_block_scan( 41 | std::vector &level, 42 | uint32_t pair_window_size, 43 | uint32_t N, 44 | block_pairs_map &pair_hashtable); 45 | 46 | std::vector next_level(std::vector &level, bool skip_la_leaf) const; 47 | // Returns a vector of levels of nodes of the tree where 48 | // each level is represented by a vector of its nodes (left-to-right). 49 | // 50 | // A simple levelwise (left-to-right) traversal of the tree would be: 51 | // for (std::vector level : bt->levelwise_iterator()) { 52 | // for (Block* b : level) { 53 | // ... 54 | std::vector> levelwise_iterator(); 55 | }; 56 | 57 | BlockTree::BlockTree(const std::basic_string &input, uint8_t r, size_t leaf_length) 58 | : r_(r), 59 | input_(input), 60 | leaf_length_(leaf_length) { 61 | 62 | if (input_.size() <= leaf_length_ || input_.size() < r) 63 | root_block_ = new LeafBlock(nullptr, 0, input_.size() - 1, input_); 64 | else { 65 | int number_of_leaves = 66 | (input_.size() % leaf_length_ == 0) ? 
input_.size() / leaf_length_ : input_.size() / leaf_length_ + 1; 67 | int height = 0; 68 | 69 | auto nl = number_of_leaves - 1; 70 | auto block_length = leaf_length_; 71 | while (nl) { 72 | height++; 73 | block_length *= r_; 74 | nl /= r_; 75 | } 76 | 77 | root_block_ = new InternalBlock(nullptr, 0, block_length - 1, input_); 78 | } 79 | 80 | } 81 | 82 | BlockTree::~BlockTree() { 83 | delete root_block_; 84 | } 85 | 86 | std::vector> BlockTree::levelwise_iterator() { 87 | std::vector> result = {{root_block_}}; 88 | while (!dynamic_cast(result.back()[0])) { 89 | std::vector next_level; 90 | for (Block *b : result.back()) 91 | for (Block *child : b->children(leaf_length_, r_)) 92 | next_level.push_back(child); 93 | result.push_back(next_level); 94 | } 95 | 96 | return result; 97 | } 98 | 99 | void BlockTree::clean_unnecessary_expansions() { 100 | root_block_->clean_unnecessary_expansions(); 101 | for (std::vector level : levelwise_iterator()) { 102 | for (int i = 0; i < level.size(); ++i) { 103 | level[i]->level_index_ = i; 104 | level[i]->first_occurrence_level_index_ = level[i]->first_block_->level_index_; 105 | } 106 | } 107 | } 108 | 109 | std::vector BlockTree::next_level(std::vector &level, bool skip_la_leaf = false) const { 110 | std::vector next_level; 111 | for (auto &b : level) { 112 | if (skip_la_leaf && b->is_la_leaf()) 113 | continue; 114 | for (Block *child : b->children(leaf_length_, r_)) { // Do it in order 115 | child->level_index_ = next_level.size(); 116 | child->first_occurrence_level_index_ = next_level.size(); 117 | next_level.push_back(child); 118 | } 119 | } 120 | return next_level; 121 | } 122 | 123 | void BlockTree::forward_pair_window_block_scan( 124 | std::vector &level, 125 | uint32_t pair_window_size, 126 | uint32_t N, 127 | block_pairs_map &pair_hashtable) { 128 | for (auto it = level.begin(); it != level.end();) { 129 | Block *b = (*it); 130 | b->right_ = true; 131 | int offset = 0; 132 | RabinKarp rk(input_, (*it)->start_ + offset, 
pair_window_size, N); // position is always 0 here 133 | for (; it != level.end() && ((*it) == b || (*(it - 1))->end_ == (*it)->start_ - 1); it++) { 134 | Block *current = *(it); 135 | bool last_block = ((it + 1) == level.end() || current->end_ != (*(it + 1))->start_ - 1); 136 | for (offset = 0; offset < current->length(); ++offset) { 137 | if (last_block && current->length() - offset < pair_window_size) 138 | break; 139 | HashString hS(rk.hash(), 140 | input_, 141 | current->start_ + offset, 142 | current->start_ + offset + pair_window_size - 1); 143 | auto result = pair_hashtable.find(hS); 144 | if (result != pair_hashtable.end()) { 145 | // Here, It could be that the scanning should have finished with the penultimate, but it never should enter this ''if'' 146 | // when We're on the penultimate block and the window exceeds the last block because if that is a first occurrence should have been occured before in a pair of blocks 147 | // maybe use a condition more like rk's condition below could work fine too 148 | // Same logic: for when passing a window of size 2l + 2 over 2 block of length l 149 | for (std::pair p: result->second) { 150 | if (current->start_ + offset < p.first->start_) { 151 | p.first->left_ = true; 152 | p.second->right_ = true; 153 | } 154 | } 155 | pair_hashtable.erase(hS); 156 | } 157 | if (current->start_ + offset + pair_window_size < input_.size()) 158 | rk.next(); 159 | } 160 | } 161 | (*(it - 1))->left_ = true; 162 | } 163 | } 164 | 165 | void BlockTree::forward_window_block_scan(std::vector &level, 166 | uint32_t window_size, 167 | uint32_t N, 168 | block_map &hashtable) { 169 | int i = 0; 170 | for (auto it = level.begin(); it != level.end();) { 171 | Block *b = (*it); 172 | int offset = 0; 173 | RabinKarp rk(input_, (*it)->start_ + offset, window_size, N); 174 | for (; it != level.end() && ((*it) == b || (*(it - 1))->end_ == (*it)->start_ - 1); it++, i++) { 175 | Block *current = *(it); 176 | bool last_block = ((it + 1) == level.end() 
|| current->end_ != (*(it + 1))->start_ - 1); 177 | for (offset = 0; offset < current->length(); ++offset) { 178 | if (last_block && current->length() - offset < window_size) 179 | break; 180 | HashString hS(rk.hash(), input_, current->start_ + offset, current->start_ + offset + window_size - 1); 181 | auto result = hashtable.find(hS); 182 | if (result != hashtable.end()) { 183 | std::vector blocks = result->second; 184 | for (Block *b : blocks) { 185 | b->first_occurrence_level_index_ = i; 186 | b->first_block_ = current; 187 | b->offset_ = offset; 188 | if (offset + window_size > b->first_block_->length()) b->second_block_ = (*(it + 1)); 189 | else b->second_block_ = nullptr; 190 | } 191 | hashtable.erase(hS); 192 | } 193 | if (current->start_ + offset + window_size < input_.size()) rk.next(); 194 | } 195 | } 196 | } 197 | } 198 | 199 | void BlockTree::block_scan(std::vector &level, int N, block_map &hashtable) { 200 | for (Block *b : level) { 201 | RabinKarp rk(input_, b->start_, b->length(), N); 202 | HashString hS(rk.hash(), input_, b->start_, b->end_); 203 | 204 | auto result = hashtable.find(hS); 205 | if (result == hashtable.end()) 206 | hashtable[hS] = {b}; 207 | else 208 | hashtable[hS].push_back(b); 209 | } 210 | } 211 | 212 | void BlockTree::process_level(std::vector &level) { 213 | auto N = 6700417; //Large prime 214 | auto level_length = level.front()->length(); 215 | 216 | // Block scan 217 | block_map hashtable; 218 | block_scan(level, N, hashtable); 219 | 220 | // Pairs of blocks scan 221 | block_pairs_map pair_hashtable; 222 | for (auto it = level.begin(); it != level.end();) { 223 | for (++it; (it != level.end() && (*(it - 1))->end_ == (*it)->start_ - 1); ++it) { 224 | Block *current = (*(it - 1)); 225 | Block *next = (*it); 226 | RabinKarp rk(input_, current->start_, current->length() + next->length(), N); 227 | HashString hS(rk.hash(), 228 | input_, 229 | current->start_, 230 | current->start_ + current->length() + next->length() - 1); 231 | 
232 | auto result = pair_hashtable.find(hS); 233 | if (result == pair_hashtable.end()) 234 | pair_hashtable[hS] = {{current, next}}; 235 | else 236 | pair_hashtable[hS].push_back({current, next}); 237 | } 238 | } 239 | 240 | // Window block scan 241 | //Establishes first occurrences of blocks 242 | forward_window_block_scan(level, level_length, N, hashtable); 243 | 244 | // Window Pair of blocks scans 245 | if (level.size() > 1) 246 | forward_pair_window_block_scan(level, level_length * 2, N, pair_hashtable); 247 | 248 | // BackBlock creation 249 | for (int i = 0; i < level.size(); ++i) { 250 | Block *b = level[i]; 251 | if (b->left_ && b->right_ && b->first_occurrence_level_index_ < b->level_index_) { 252 | // This doesn't have the bug of the dangling reference fixed with first_occurrence_level_index, because it 253 | // shouldn't happen that block points back to a BackBlock 254 | auto *bb = new BackBlock(b->parent_, b->start_, b->end_, input_, 255 | level[b->first_occurrence_level_index_], 256 | b->second_block_ ? 
level[b->first_occurrence_level_index_ + 1] : nullptr, 257 | b->offset_); 258 | bb->level_index_ = b->level_index_; 259 | bb->first_occurrence_level_index_ = b->first_occurrence_level_index_; 260 | bb->left_ = true; 261 | bb->right_ = true; 262 | b->parent_->replace_child(b, bb); 263 | delete b; 264 | level[i] = bb; 265 | } 266 | } 267 | 268 | } 269 | 270 | void BlockTree::process_back_pointers(size_t starting_block_size = std::numeric_limits::max()) { 271 | std::vector current_level = {root_block_}; 272 | 273 | while (current_level.front()->length() > starting_block_size) 274 | current_level = next_level(current_level, false); 275 | 276 | std::stack none_blocks; 277 | while (!(current_level = next_level(current_level, false)).empty()) { 278 | if (current_level[0]->length() < r_ || current_level[0]->length() <= leaf_length_) break; 279 | while (!current_level.empty() && current_level.back()->end_ >= input_.size()) { 280 | none_blocks.push(current_level.back()); 281 | current_level.pop_back(); 282 | } 283 | process_level(current_level); 284 | while (!none_blocks.empty()) { 285 | current_level.push_back(none_blocks.top()); 286 | none_blocks.pop(); 287 | } 288 | } 289 | } -------------------------------------------------------------------------------- /include/block_tree/HashString.h: -------------------------------------------------------------------------------- 1 | // This source file is adapted from Manuel Cáceres's https://github.com/elarielcl/BlockTrees, distributed under GPL-3.0 2 | 3 | #pragma once 4 | 5 | #include 6 | 7 | class HashString { 8 | public: 9 | uint64_t hash_; 10 | const std::basic_string &ws_; 11 | size_t init_; 12 | size_t end_; 13 | 14 | HashString(uint64_t, const std::basic_string &, size_t, size_t); 15 | 16 | bool operator==(const HashString &) const; 17 | }; 18 | 19 | namespace std { 20 | template<> 21 | struct hash { 22 | std::size_t operator()(const HashString &hS) const { return hS.hash_; } 23 | }; 24 | } 25 | 26 | 
HashString::HashString(uint64_t hash, const std::basic_string &s, size_t init, size_t end) 27 | : hash_(hash), ws_(s), init_(init), end_(end) { 28 | 29 | } 30 | 31 | bool HashString::operator==(const HashString &other) const { 32 | auto length = end_ - init_ + 1; 33 | if (length != other.end_ - other.init_ + 1) 34 | return false; 35 | 36 | for (int i = 0; i < length; ++i) 37 | if (ws_[init_ + i] != other.ws_[other.init_ + i]) 38 | return false; 39 | return true; 40 | } 41 | -------------------------------------------------------------------------------- /include/block_tree/RabinKarp.h: -------------------------------------------------------------------------------- 1 | // This source file is adapted from Manuel Cáceres's https://github.com/elarielcl/BlockTrees, distributed under GPL-3.0 2 | 3 | #pragma once 4 | 5 | #include 6 | 7 | class RabinKarp { 8 | uint64_t kp_; 9 | uint64_t init_; 10 | uint64_t rm_; 11 | 12 | public: 13 | uint64_t sigma_; 14 | uint64_t hash_; 15 | uint64_t size_; 16 | const std::basic_string &ws_; 17 | 18 | RabinKarp(const std::basic_string &s, uint64_t init, uint64_t size, uint64_t range, uint64_t sigma = 257); 19 | 20 | uint64_t hash() const { return hash_; }; 21 | void next(); 22 | }; 23 | 24 | RabinKarp::RabinKarp(const std::basic_string &s, uint64_t init, uint64_t size, uint64_t range, uint64_t sigma) 25 | : sigma_(sigma), size_(size), ws_(s), hash_(0), init_(init), rm_(1), kp_(range) { 26 | for (auto i = init; i < init + size_; ++i) { 27 | uint64_t next = ws_[i]; 28 | next = next % kp_; 29 | hash_ = (sigma_ * hash_ + next) % kp_; // sigma or little prime 30 | } 31 | 32 | for (int i = 0; i < size_ - 1; ++i) 33 | rm_ = (rm_ * sigma_) % kp_; 34 | } 35 | 36 | void RabinKarp::next() { 37 | uint64_t next = ws_[init_]; 38 | next = next % kp_; 39 | hash_ = (hash_ + kp_ - rm_ * next % kp_) % kp_; 40 | init_++; 41 | next = ws_[init_ + size_ - 1]; 42 | next = next % kp_; 43 | hash_ = (hash_ * sigma_ + next) % kp_; 44 | } 45 | 
-------------------------------------------------------------------------------- /include/block_tree/blocks/BackBlock.h: -------------------------------------------------------------------------------- 1 | // This source file is adapted from Manuel Cáceres's https://github.com/elarielcl/BlockTrees, distributed under GPL-3.0 2 | 3 | #pragma once 4 | 5 | #include "Block.h" 6 | 7 | class BackBlock final : public Block { 8 | public: 9 | 10 | BackBlock(Block *parent, 11 | size_t start_index, 12 | size_t end_index, 13 | const std::basic_string &source, 14 | Block *first_block, 15 | Block *second_block, 16 | uint32_t offset) 17 | : Block(parent, start_index, end_index, source) { 18 | first_block_ = first_block; 19 | if (second_block) { 20 | if (second_block->start_ == start_index && second_block->end_ == end_index) second_block_ = this; 21 | else second_block_ = second_block; 22 | } 23 | offset_ = offset; 24 | if (first_block_) 25 | first_block_->pointing_to_me_++; 26 | if (second_block_) 27 | second_block_->pointing_to_me_++; 28 | } 29 | 30 | ~BackBlock() final { 31 | if (first_block_ && first_block_->pointing_to_me_ > 0) 32 | first_block_->pointing_to_me_--; 33 | if (second_block_ && second_block_->pointing_to_me_ > 0) 34 | second_block_->pointing_to_me_--; 35 | } 36 | 37 | size_t la_visit(DataPtr, uint8_t log_n, uint8_t log_u, uint8_t depth, uint8_t) final { 38 | return 2 * log_n + log_u; 39 | } 40 | 41 | virtual std::pair compute_depth(uint8_t depth) { 42 | auto left_pruned = first_block_->parent_->is_la_leaf(); 43 | auto right_pruned = (second_block_ && second_block_->parent_->is_la_leaf()) || (left_pruned && !second_block_); 44 | 45 | auto left_length = length() - offset_; 46 | auto right_length = offset_; 47 | if (left_pruned && right_pruned) 48 | return {depth, depth * actual_length()}; 49 | else if (left_pruned) { 50 | auto right_cost = second_block_ ? 
second_block_->compute_depth(depth).second : 0; 51 | return {depth, depth * left_length + right_cost * right_length / double(length())}; 52 | } else if (right_pruned) { 53 | auto left_cost = first_block_->compute_depth(depth).second; 54 | return {depth, depth * right_length + left_cost * left_length / double(length())}; 55 | } else { 56 | auto left_cost = first_block_->compute_depth(depth).second; 57 | auto right_cost = second_block_ ? second_block_->compute_depth(depth).second : 0; 58 | return {depth, left_cost * left_length / double(length()) + right_cost * right_length / double(length())}; 59 | } 60 | } 61 | }; 62 | 63 | 64 | -------------------------------------------------------------------------------- /include/block_tree/blocks/Block.h: -------------------------------------------------------------------------------- 1 | // This source file is adapted from Manuel Cáceres's https://github.com/elarielcl/BlockTrees, distributed under GPL-3.0 2 | 3 | #pragma once 4 | 5 | #include "LABlock.hpp" 6 | #include 7 | #include 8 | #include 9 | 10 | using BlockLAType = LABlock; 11 | using DataPtr = const uint32_t *; 12 | static uint8_t max_bpc = 14; 13 | 14 | double ef_cost(size_t n, size_t u) { 15 | auto log_n = sdsl::bits::hi(n) + 1; 16 | auto log_u = sdsl::bits::hi(u) + 1; 17 | if (log_n == log_u) 18 | --log_n; // to ensure log_u-log_n > 0 19 | return n * (log_u - log_n) + n + (1ULL << log_n) + 0.2 * n; 20 | // return size_t(n * (2 + std::ceil(std::log2(u / double(n))))); 21 | } 22 | 23 | class Block { 24 | public: 25 | Block *parent_; 26 | size_t start_; 27 | size_t end_; 28 | 29 | const std::basic_string &source_; 30 | 31 | Block *first_block_; 32 | Block *second_block_; 33 | uint32_t offset_; 34 | bool left_: 1; 35 | bool right_: 1; 36 | uint32_t pointing_to_me_; 37 | uint8_t level_index_; 38 | uint32_t first_occurrence_level_index_; 39 | 40 | uint32_t la_block_id_; 41 | uint32_t level_start_pos_; 42 | 43 | std::vector children_; 44 | 45 | size_t length() const { 
return end_ - start_ + 1; }; 46 | size_t actual_length() const { return std::min(end_ + 1, source_.size()) - start_; }; 47 | std::basic_string represented_string() const { return source_.substr(start_, length()); }; 48 | 49 | virtual size_t count_leaf_elements() { return 0; }; 50 | 51 | virtual std::vector &children(int, int) { return children_; }; 52 | virtual void clean_unnecessary_expansions() {}; 53 | 54 | void replace_child(Block *old_child, Block *new_child) { 55 | for (auto &c : children_) { 56 | if (c == old_child) { 57 | c = new_child; 58 | return; 59 | } 60 | } 61 | } 62 | 63 | virtual bool is_leaf() const { return true; }; 64 | virtual bool is_la_leaf() const { return false; }; 65 | 66 | virtual size_t la_visit(DataPtr data, uint8_t log_n, uint8_t log_u, uint8_t depth, uint8_t leaf_approx_cost) = 0; 67 | 68 | virtual bool get_la(DataPtr data, BlockLAType &la) { 69 | // TODO: replace with exponential search from bpc=0 (skipping 1) 70 | for (uint8_t bpc = 0; bpc < max_bpc; bpc += 1 + (bpc == 0)) 71 | if (BlockLAType::make(data + start_, data + std::min(end_ + 1, source_.size()), bpc, la)) 72 | return true; 73 | return false; 74 | } 75 | 76 | virtual std::pair compute_depth(uint8_t depth) = 0; 77 | 78 | virtual uint32_t number_la_blocks(uint32_t id) { 79 | if (is_la_leaf()) { 80 | la_block_id_ = id; 81 | return id + 1; 82 | } 83 | if (is_leaf()) 84 | return id; 85 | 86 | for (auto *&c : children_) 87 | id = c->number_la_blocks(id); 88 | return id; 89 | } 90 | 91 | Block(Block *parent, size_t start_index, size_t end_index, const std::basic_string &source) 92 | : parent_(parent), 93 | start_(start_index), 94 | end_(end_index), 95 | source_(source), 96 | offset_(0), 97 | left_(false), 98 | right_(false), 99 | first_block_(this), 100 | second_block_(nullptr), 101 | pointing_to_me_(0), 102 | level_index_(0), 103 | first_occurrence_level_index_(0), 104 | la_block_id_(0), 105 | level_start_pos_(0) {} 106 | 107 | virtual ~Block() = default; 108 | }; 
-------------------------------------------------------------------------------- /include/block_tree/blocks/InternalBlock.h: -------------------------------------------------------------------------------- 1 | // This source file is adapted from Manuel Cáceres's https://github.com/elarielcl/BlockTrees, distributed under GPL-3.0 2 | 3 | #pragma once 4 | 5 | #include "Block.h" 6 | #include "LeafBlock.h" 7 | #include "BackBlock.h" 8 | 9 | class InternalBlock final : public Block { 10 | bool use_la; 11 | BlockLAType cached_la; 12 | 13 | void collapse_subtree(Block *block, Block *parent) { 14 | for (const auto &c : block->children_) { 15 | c->parent_ = parent; 16 | collapse_subtree(c, parent); 17 | } 18 | } 19 | 20 | public: 21 | 22 | InternalBlock(Block *, size_t, size_t, const std::basic_string &); 23 | ~InternalBlock() final; 24 | 25 | std::vector &children(int, int) final; 26 | void clean_unnecessary_expansions() final; 27 | 28 | bool is_leaf() const final { return false; } 29 | bool is_la_leaf() const final { return use_la; }; 30 | 31 | size_t la_visit(DataPtr data, uint8_t log_n, uint8_t log_u, uint8_t depth, uint8_t leaf_approx_cost) final { 32 | auto tree_bit_cost = children_.size(); // 1 bit for each child 33 | tree_bit_cost += log_u * (children_.size() - 1); // for the samples needed to implement rank 34 | for (auto rit = children_.rbegin(); rit != children_.rend(); ++rit) 35 | tree_bit_cost += (*rit)->la_visit(data, log_n, log_u, depth + 1, leaf_approx_cost); 36 | 37 | auto pruned_bit_cost = std::numeric_limits::max(); 38 | if (get_la(data, cached_la)) { 39 | auto back_block_cost = 2 * log_n + log_u; // An LA block is encoded as a back block at the same level 40 | pruned_bit_cost = cached_la.size_in_bytes() * 8 + back_block_cost; 41 | } 42 | 43 | if (pruned_bit_cost < tree_bit_cost) { 44 | use_la = true; 45 | collapse_subtree(this, this); 46 | this->parent_ = this; 47 | return pruned_bit_cost; 48 | } 49 | 50 | use_la = false; 51 | return tree_bit_cost; 52 | 
}; 53 | 54 | std::pair compute_depth(uint8_t depth) final { 55 | if (is_la_leaf()) 56 | return {depth, depth * actual_length()}; 57 | 58 | size_t max_depth = depth; 59 | long double depth_sum = 0; 60 | for (auto *&c : children_) { 61 | auto[subtree_max_depth, subtree_depth_sum] = c->compute_depth(depth + 1); 62 | max_depth = (uint8_t) std::max(max_depth, subtree_max_depth); 63 | depth_sum += subtree_depth_sum; 64 | } 65 | return {max_depth, depth_sum}; 66 | } 67 | 68 | 69 | size_t count_leaf_elements() final { 70 | size_t sum = 0; 71 | for (auto *&c : children_) 72 | sum += c->count_leaf_elements(); 73 | return sum; 74 | } 75 | 76 | bool get_la(DataPtr data, BlockLAType &la) final { 77 | if (use_la) { 78 | la = cached_la; 79 | return true; 80 | } 81 | return Block::get_la(data, la); 82 | } 83 | }; 84 | 85 | InternalBlock::InternalBlock(Block *parent, 86 | size_t start_index, 87 | size_t end_index, 88 | const std::basic_string &source) 89 | : Block(parent, start_index, end_index, source), 90 | use_la(false), 91 | cached_la() { 92 | } 93 | 94 | InternalBlock::~InternalBlock() { 95 | for (auto rit = children_.rbegin(); rit != children_.rend(); ++rit) 96 | delete *rit; 97 | } 98 | 99 | std::vector &InternalBlock::children(int leaf_length, int r) { 100 | if (children_.empty()) { 101 | auto next_length = length() / r; 102 | if (next_length <= leaf_length) { 103 | for (auto i = 0; i < r; ++i) { 104 | auto init = start_ + i * next_length; 105 | auto end = start_ + (i + 1) * next_length - 1; 106 | if (init < source_.size()) { 107 | Block *child = new LeafBlock(this, init, end, source_); 108 | children_.push_back(child); 109 | } 110 | } 111 | } else { 112 | for (auto i = 0; i < r; ++i) { 113 | auto init = start_ + i * next_length; 114 | auto end = start_ + (i + 1) * next_length - 1; 115 | if (init < source_.size()) { 116 | Block *child = new InternalBlock(this, init, end, source_); 117 | children_.push_back(child); 118 | } 119 | } 120 | } 121 | } 122 | return children_; 123 
| } 124 | 125 | void InternalBlock::clean_unnecessary_expansions() { 126 | for (auto rit = children_.rbegin(); rit != children_.rend(); ++rit) { 127 | Block *b = (*rit); 128 | b->clean_unnecessary_expansions(); 129 | } 130 | 131 | bool all_children_leaves = true; 132 | for (Block *child : children_) 133 | all_children_leaves = all_children_leaves && child->is_leaf(); 134 | 135 | if (all_children_leaves && pointing_to_me_ == 0 && first_block_->start_ < start_ && second_block_ != this) { 136 | auto *bb = new BackBlock(parent_, start_, end_, source_, first_block_, second_block_, offset_); 137 | bb->level_index_ = level_index_; 138 | bb->first_occurrence_level_index_ = first_occurrence_level_index_; 139 | bb->left_ = true; 140 | bb->right_ = true; 141 | parent_->replace_child(this, bb); 142 | delete this; 143 | } else { //To avoid dangling references 144 | first_block_ = this; 145 | second_block_ = nullptr; 146 | } 147 | } -------------------------------------------------------------------------------- /include/block_tree/blocks/LeafBlock.h: -------------------------------------------------------------------------------- 1 | // This source file is adapted from Manuel Cáceres's https://github.com/elarielcl/BlockTrees, distributed under GPL-3.0 2 | 3 | #pragma once 4 | 5 | #include "Block.h" 6 | 7 | class LeafBlock final : public Block { 8 | public: 9 | 10 | LeafBlock(Block *parent, size_t start_index, size_t end_index, const std::basic_string &source) 11 | : Block(parent, start_index, end_index, source) { 12 | } 13 | 14 | ~LeafBlock() final = default; 15 | 16 | size_t la_visit(DataPtr, uint8_t, uint8_t log_u, uint8_t depth, uint8_t leaf_approx_cost) final { 17 | return leaf_approx_cost * actual_length(); 18 | }; 19 | 20 | virtual std::pair compute_depth(uint8_t depth) { 21 | return {depth, depth * actual_length()}; 22 | } 23 | 24 | size_t count_leaf_elements() final { return actual_length(); } 25 | }; 26 | 27 | 
-------------------------------------------------------------------------------- /include/la_vector/la_vector.hpp: -------------------------------------------------------------------------------- 1 | // This file is part of la_vector . 2 | // Copyright (c) 2020 Giorgio Vinciguerra. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | 16 | #pragma once 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include "piecewise_linear_model.hpp" 25 | 26 | /** Computes (bits_per_correction > 0 ? 2^(bits_per_correction-1) - 1 : 0) without the conditional operator. */ 27 | #define BPC_TO_EPSILON(bits_per_correction) (((1ul << (bits_per_correction)) + 1) / 2 - 1) 28 | 29 | /** Computes the smallest integral power of two that is not smaller than x. */ 30 | #define BIT_CEIL(x) ((x) < 2 ? 1u : 1u << (64u - __builtin_clzll((x) - 1))) 31 | 32 | /** Computes the smallest integral value not less than x / y, where x and y must be positive integers. */ 33 | #define CEIL_UINT_DIV(x, y) ((x) / (y) + ((x) % (y) != 0)) 34 | 35 | /** Computes the number of bits needed to store x, that is, 0 if x is 0, 1 + floor(log2(x)) otherwise. */ 36 | #define BIT_WIDTH(x) ((x) == 0 ? 
0 : 64 - __builtin_clzll(x)) 37 | 38 | template 39 | class bucketing_top_level; 40 | 41 | template class t_top_level = bucketing_top_level> 42 | class la_vector { 43 | static_assert(std::is_integral_v); 44 | static_assert(std::is_unsigned_v); 45 | static_assert(t_bpc < sizeof(K) * CHAR_BIT); 46 | 47 | class segment; 48 | class constant_bpc; 49 | class variable_bpc; 50 | class la_iterator; 51 | friend class la_iterator; 52 | 53 | static constexpr bool auto_bpc = t_bpc < 2; 54 | static constexpr size_t cache_line_bits = 64 * CHAR_BIT; 55 | static constexpr size_t extraction_density = auto_bpc ? 1 : BIT_CEIL(4 * cache_line_bits / t_bpc); 56 | 57 | using position_type = typename std::conditional_t; 58 | using signed_position_type = typename std::make_signed_t; 59 | using larger_signed_key_type = typename std::conditional_t; 60 | using top_level_type = t_top_level::const_iterator>; 61 | using canonical_segment = typename OptimalPiecewiseLinearModel::CanonicalSegment; 62 | using base_segment_type = typename std::conditional_t; 63 | 64 | // If auto_bpc, each segment uses a different bit-size for the corrections, stored in segment::bpc. It also stores 65 | // segment::corrections_offset, which is the cumulative sum of the preceding segments' bpc values and is used as a 66 | // (bit) pointer into the corrections array. 67 | // If !auto_bpc, the corrections array is split into two parts. The first part contains the corrections at positions 68 | // multiples of extraction_density. The second part contains the remaining corrections. 69 | 70 | K front; ///< The first element in this container. 71 | K back; ///< The last element in this container. 72 | size_t n; ///< The number of elements in this container. 73 | std::vector segments; ///< The linear models that, together with the corrections, compress the data. 74 | sdsl::int_vector<64> corrections; ///< The corrections for each compressed element. 75 | top_level_type top_level; ///< The top level structure on the segments. 
76 | 77 | public: 78 | 79 | using size_type = size_t; 80 | using iterator = class la_iterator; 81 | 82 | la_vector() = default; 83 | 84 | explicit la_vector(std::vector &data) : la_vector(data.begin(), data.end()) {}; 85 | 86 | template 87 | la_vector(RandomIt begin, RandomIt end) 88 | : front(*begin), 89 | back(*std::prev(end)), 90 | n(std::distance(begin, end)), 91 | segments() { 92 | if (n == 0) 93 | return; 94 | 95 | auto[canonical_segments, bit_size] = make_segmentation(begin, end); 96 | 97 | // Store segments and fill the corrections array 98 | segments.reserve(canonical_segments.size() + 1); 99 | corrections = decltype(corrections)(CEIL_UINT_DIV(bit_size, 64) + 1, 0); 100 | 101 | size_t corrections_offset = 0; 102 | for (auto it = canonical_segments.begin(); it < canonical_segments.end(); ++it) { 103 | auto i = it->get_first_x(); 104 | auto j = std::next(it) != canonical_segments.end() ? std::next(it)->get_first_x() : n; 105 | uint8_t bpc = t_bpc; 106 | if constexpr (auto_bpc) 107 | bpc = it->bpc; 108 | push_segment(*it, bpc, corrections_offset, begin, i, j); 109 | corrections_offset += bpc * (j - i); 110 | } 111 | 112 | segments.emplace_back(n); // extra segment to avoid bound checking in decode() and lower_bound() 113 | top_level = decltype(top_level)(begin, end, segments.begin(), std::prev(segments.end())); 114 | } 115 | 116 | /** 117 | * Returns the element at the specified position. No bounds checking is performed. 118 | * @param i position of the element to return 119 | * @return the element at the specified position 120 | */ 121 | K operator[](size_t i) const { 122 | assert(i < n); 123 | return top_level.segment_for_position(i)->decompress(corrections.data(), n, i); 124 | } 125 | 126 | /** 127 | * Returns an iterator pointing to the first element that is not less than the given value. 
128 | * @param value value to compare the elements to 129 | * @return an iterator pointing to the first element that is not less than value 130 | */ 131 | iterator lower_bound(K value) const { 132 | if (value > back) 133 | return end(); 134 | if (value <= front) 135 | return begin(); 136 | 137 | auto it = top_level.segment_for_value(value); 138 | auto &s = *it; 139 | auto &t = *std::next(it); 140 | auto[pos, bound] = s.approximate_position(value); 141 | pos = std::clamp(pos, s.first, t.first - 1); 142 | 143 | auto lo = pos <= bound + s.first ? s.first : pos - bound; 144 | auto hi = std::min(pos + bound + 1, t.first); 145 | 146 | if (!auto_bpc) { 147 | // Binary search on the samples 148 | auto sample_lo = CEIL_UINT_DIV(lo, extraction_density); 149 | auto sample_hi = (hi - 1) / extraction_density + 1; 150 | 151 | while (sample_lo < sample_hi) { 152 | size_t mid = sample_lo + (sample_hi - sample_lo) / 2; 153 | if (s.decompress(corrections.data(), n, mid * extraction_density) < value) { 154 | sample_lo = mid + 1; 155 | lo = mid * extraction_density; 156 | } else { 157 | sample_hi = mid; 158 | hi = mid * extraction_density; 159 | } 160 | } 161 | 162 | // Binary search on the compressed data 163 | while (lo < hi) { 164 | auto mid = lo + (hi - lo) / 2; 165 | if (s.decompress(corrections.data(), n, mid) < value) 166 | lo = mid + 1; 167 | else 168 | hi = mid; 169 | } 170 | 171 | if (lo == t.first) 172 | return iterator(this, t.first, std::next(it)); 173 | return iterator(this, lo, it); 174 | } 175 | 176 | auto val = s.decompress(corrections.data(), n, pos); 177 | auto search_forward = val <= value; 178 | constexpr auto linear_threshold = 2 * cache_line_bits / (auto_bpc ? 
4 : t_bpc); 179 | 180 | if (hi - lo <= linear_threshold) { 181 | if (search_forward) 182 | while (pos < t.first && val < value) 183 | val = s.decompress(corrections.data(), n, ++pos); 184 | else 185 | while (pos > s.first && s.decompress(corrections.data(), n, pos - 1) >= value) 186 | --pos; 187 | 188 | if (pos == t.first) 189 | return iterator(this, t.first, std::next(it)); 190 | return iterator(this, pos, it); 191 | } 192 | 193 | lo = search_forward ? pos : lo; 194 | hi = search_forward ? hi : pos; 195 | auto val_at_lo = search_forward ? val : s.decompress(corrections.data(), n, lo); 196 | auto val_at_hi = search_forward ? s.decompress(corrections.data(), n, hi - 1) : val; 197 | auto count = hi - lo; 198 | 199 | if (hi == t.first and value > val_at_hi) 200 | return iterator(this, t.first, std::next(it)); 201 | 202 | while (count > linear_threshold) { 203 | auto x = larger_signed_key_type(value) - val_at_lo; 204 | auto dx = val_at_hi - val_at_lo; 205 | auto dy = count - 1; 206 | auto step = x * dy / dx; 207 | auto p = lo + step; 208 | auto val_at_p = s.decompress(corrections.data(), n, p); 209 | 210 | if (value > val_at_p) { 211 | lo = p + 1; 212 | count -= step + 1; 213 | val_at_lo = s.decompress(corrections.data(), n, lo); 214 | if (val_at_lo >= value) { 215 | if (lo == t.first) 216 | return iterator(this, t.first, std::next(it)); 217 | return iterator(this, lo, it); 218 | } 219 | } else { 220 | hi = p; 221 | count = step; 222 | val_at_hi = val_at_p; 223 | } 224 | } 225 | 226 | for (; lo < hi && s.decompress(corrections.data(), n, lo) < value; ++lo); 227 | 228 | if (lo == t.first) 229 | return iterator(this, t.first, std::next(it)); 230 | return iterator(this, lo, it); 231 | } 232 | 233 | /** 234 | * Returns the number of elements in the container that are less than or equal to @p value. 
235 | * @param value value to compare elements to 236 | * @return the number of elements that are less than or equal to @p value 237 | */ 238 | size_t rank(K value) const { return std::distance(begin(), lower_bound(value)); } 239 | 240 | /** 241 | * Returns the i-th smallest element in the container. 242 | * @param i rank of the element, must be between 1 and @ref size() 243 | * @return the i-th smallest element 244 | */ 245 | K select(size_t i) const { 246 | assert(i > 0 && i <= n); 247 | return operator[](i - 1); 248 | } 249 | 250 | /** 251 | * Decodes all the elements of this container into the memory beginning at @p out. The caller is responsible for 252 | * allocating enough memory for @p out, that is, at least @ref size() * sizeof(K) bytes. 253 | * @param out the beginning of the destination of the decoded elements 254 | */ 255 | void decode(K *out) const { 256 | for (auto it = segments.begin(); it != std::prev(segments.end()); ++it) { 257 | auto &s = *it; 258 | auto covered = std::next(it)->first - s.first; 259 | auto significand = s.slope_significand; 260 | auto exponent = s.slope_exponent; 261 | auto intercept = s.intercept - BPC_TO_EPSILON(s.bpc); 262 | 263 | #pragma omp simd 264 | for (auto j = 0; j < covered; ++j) 265 | out[j] = ((j * significand) >> exponent) + intercept; 266 | 267 | for (auto j = 0; j < covered; ++j) 268 | out[j] += s.get_correction(corrections.data(), n, j + s.first); 269 | 270 | out += covered; 271 | } 272 | } 273 | 274 | /** 275 | * Decodes all the elements of this container into a vector. 276 | * @return a vector with the decoded elements 277 | */ 278 | std::vector decode() const { 279 | std::vector out(n); 280 | decode(out.data()); 281 | return out; 282 | } 283 | 284 | /** 285 | * Returns an iterator to the first element. 
286 | * @return an iterator to the first element 287 | */ 288 | iterator begin() const { return iterator(this, 0, segments.begin()); } 289 | 290 | /** 291 | * Returns an iterator to the element following the last element. 292 | * @return an iterator to the element following the last element 293 | */ 294 | iterator end() const { return iterator(this, n, segments.end()); } 295 | 296 | /** 297 | * Returns the number of bytes used by this container to encode its elements. 298 | * @return the size in bytes of this container 299 | */ 300 | size_t size_in_bytes() const { 301 | return corrections.bit_size() / CHAR_BIT + segments_count() * sizeof(segment) + top_level.size_in_bytes(); 302 | } 303 | 304 | /** 305 | * Returns the number of segments (linear models) used in this container to encode its elements. 306 | * @return the number of segments in this container 307 | */ 308 | size_t segments_count() const { return segments.size(); } 309 | 310 | /** 311 | * Returns the average number of bits per element, that is, @ref size_in_bytes() * 8. / @ref size(). 312 | * @return the average number of bits per element 313 | */ 314 | double bits_per_element() const { return size_in_bytes() * CHAR_BIT / (double) n; } 315 | 316 | /** 317 | * Returns the number of elements in this container. 318 | * @return the number of elements in this container 319 | */ 320 | size_t size() const { return n; } 321 | 322 | /** 323 | * Serializes the vector to a stream. 
324 | * @param out output stream 325 | * @param v parent node in the structure tree 326 | * @param name name of the structure tree node 327 | * @return the number of bytes written to out 328 | */ 329 | size_t serialize(std::ostream &out, sdsl::structure_tree_node *v = nullptr, const std::string &name = "") const { 330 | auto child = sdsl::structure_tree::add_child(v, name, sdsl::util::class_name(*this)); 331 | size_t written_bytes = 0; 332 | written_bytes += sdsl::write_member(n, out, child, "size"); 333 | written_bytes += sdsl::write_member(front, out, child, "front"); 334 | written_bytes += sdsl::write_member(back, out, child, "back"); 335 | written_bytes += sdsl::write_member(segments.size(), out, child, "segments.size()"); 336 | written_bytes += sdsl::serialize_vector(segments, out, child, "segments"); 337 | written_bytes += sdsl::serialize(top_level, out, child, "top_level"); 338 | written_bytes += sdsl::serialize(corrections, out, child, "corrections"); 339 | sdsl::structure_tree::add_size(child, written_bytes); 340 | return written_bytes; 341 | } 342 | 343 | /** 344 | * Loads the vector from a stream. 
345 | * @param in input stream 346 | */ 347 | void load(std::istream &in) { 348 | sdsl::read_member(n, in); 349 | sdsl::read_member(front, in); 350 | sdsl::read_member(back, in); 351 | size_t segments_size; 352 | sdsl::read_member(segments_size, in); 353 | segments = decltype(segments)(segments_size); 354 | sdsl::load_vector(segments, in); 355 | top_level.load(in, segments.begin(), std::prev(segments.end())); 356 | sdsl::load(corrections, in); 357 | } 358 | 359 | private: 360 | 361 | struct canonical_segment_bpc : canonical_segment { 362 | uint8_t bpc; 363 | canonical_segment_bpc() = default; 364 | canonical_segment_bpc(const canonical_segment &cs, uint8_t bpc) : canonical_segment(cs), bpc(bpc) {}; 365 | }; 366 | 367 | template = 0> 368 | static std::pair, size_t> make_segmentation(RandomIt begin, RandomIt end) { 369 | auto n = std::distance(begin, end); 370 | auto eps = BPC_TO_EPSILON(t_bpc); 371 | std::vector out; 372 | out.reserve(eps > 0 ? n / (eps * eps) : n / 8); 373 | auto in_fun = [begin](auto i) { return std::pair(i, begin[i]); }; 374 | auto out_fun = [&out](auto cs) { out.push_back(cs); }; 375 | make_segmentation_par(n, eps, in_fun, out_fun); 376 | return {out, n * t_bpc}; 377 | } 378 | 379 | template = 0> 380 | static std::pair, size_t> make_segmentation(RandomIt begin, RandomIt end) { 381 | const auto n = size_t(std::distance(begin, end)); 382 | const auto max_bpc = (1 + uint8_t(std::log2(begin[n - 1]))) / 2; 383 | 384 | std::vector frontier(max_bpc + 1); 385 | std::vector> segmentations; 386 | for (auto bpc = 0; bpc <= max_bpc; ++bpc) 387 | segmentations.emplace_back(BPC_TO_EPSILON(bpc)); 388 | 389 | auto advance_frontier = [&](auto bpc, auto target) { 390 | if (frontier[bpc] > target) 391 | return frontier[bpc]; 392 | 393 | size_t i; 394 | for (i = frontier[bpc]; i < n && segmentations[bpc].add_point(i, begin[i]); ++i) 395 | continue; 396 | return frontier[bpc] = i; 397 | }; 398 | 399 | // Find the shortest path 400 | std::vector distance(n + 1, -1); 
401 | std::vector> parent(n + 1); 402 | distance[0] = 0; 403 | 404 | for (size_t i = 0, next_i = -1; i < n; i = next_i, next_i = -1) { 405 | // For each j adjacent to i, do a relaxation 406 | for (uint8_t bpc = 0; bpc <= max_bpc; bpc += 1 + (bpc == 0)) { 407 | auto j = advance_frontier(bpc, i); 408 | auto weight_ij = bpc * (j - i) + CHAR_BIT * sizeof(segment); 409 | 410 | // Relax edge (i, j) 411 | if (distance[j] > distance[i] + weight_ij) { 412 | distance[j] = distance[i] + weight_ij; 413 | parent[j] = std::make_unique(segmentations[bpc].get_segment().copy(i), bpc); 414 | } 415 | 416 | next_i = std::min(next_i, j); 417 | } 418 | } 419 | 420 | // Traverse the parent links to build the result 421 | if (!parent[n]) 422 | throw std::runtime_error("Cannot reach target vertex"); 423 | 424 | size_t bit_size = 0; 425 | std::vector out; 426 | out.reserve(n / 10); 427 | 428 | for (size_t current = n; current != 0; current = parent[current]->get_first_x()) { 429 | out.push_back(*parent[current]); 430 | bit_size += (current - out.back().get_first_x()) * out.back().bpc; 431 | } 432 | 433 | std::reverse(out.begin(), out.end()); 434 | return {out, bit_size}; 435 | } 436 | 437 | template 438 | void push_segment(const canonical_segment &cs, uint8_t bpc, position_type corrections_offset, 439 | RandomIt data, size_t i, size_t j) { 440 | try { 441 | segment s(cs, bpc, corrections_offset, data, n, i, j, corrections.data()); 442 | segments.push_back(s); 443 | } 444 | catch (const std::overflow_error &) { 445 | auto half = (i + j) / 2; 446 | push_segment(cs, bpc, corrections_offset, data, i, half); 447 | push_segment(cs.copy(half), bpc, corrections_offset + half * bpc, data, half, j); 448 | } 449 | } 450 | }; 451 | 452 | #pragma pack(push, 1) 453 | 454 | template class t_top_level> 455 | struct la_vector::constant_bpc { 456 | static constexpr uint8_t bpc = t_bpc; 457 | uint32_t first_correction: t_bpc; 458 | constant_bpc() = default; 459 | constant_bpc(uint8_t, position_type) : 
first_correction(0) {}; 460 | }; 461 | 462 | template class t_top_level> 463 | struct la_vector::variable_bpc { 464 | uint8_t bpc; 465 | position_type corrections_offset; 466 | uint32_t first_correction: 16; 467 | variable_bpc() = default; 468 | variable_bpc(uint8_t bpc, position_type offset) : bpc(bpc), corrections_offset(offset), first_correction(0) {}; 469 | }; 470 | 471 | template class t_top_level> 472 | struct la_vector::segment : base_segment_type { 473 | using size_type = size_t; 474 | static constexpr auto exponent_bits = 5; 475 | static constexpr auto significand_bits = sizeof(K) <= 4 ? 32 - exponent_bits : 56 - exponent_bits; 476 | position_type first; 477 | signed_position_type intercept; 478 | uint8_t slope_exponent: exponent_bits; 479 | uint64_t slope_significand: significand_bits; 480 | 481 | segment() = default; 482 | 483 | template 484 | segment(const canonical_segment &cs, uint8_t bpc, position_type corrections_offset, 485 | RandomIt data, size_t n, size_t i, size_t j, uint64_t *corrections) 486 | : base_segment_type(bpc, corrections_offset) { 487 | auto epsilon = BPC_TO_EPSILON(bpc); 488 | auto[cs_significand, cs_exponent, cs_intercept] = cs.get_fixed_point_segment(cs.get_first_x(), j - i + 1); 489 | 490 | if (BIT_WIDTH(cs_exponent) > exponent_bits || BIT_WIDTH(cs_significand) > significand_bits) 491 | throw std::overflow_error("Bit fields' sizes are not large enough"); 492 | 493 | first = cs.get_first_x(); 494 | intercept = cs_intercept; 495 | slope_exponent = cs_exponent; 496 | slope_significand = cs_significand; 497 | 498 | for (auto k = i; k < j; k++) { 499 | auto error = data[k] - approximate(k); 500 | auto correction = uint64_t(error + epsilon); 501 | set_correction(corrections, n, k, correction); 502 | } 503 | } 504 | 505 | explicit segment(position_type first) 506 | : base_segment_type(0, 0), 507 | first(first), 508 | slope_significand(0), 509 | slope_exponent(0), 510 | intercept(std::numeric_limits::max()) {} 511 | 512 | size_t 
get_correction_bit_offset(size_t n, size_t i) const { 513 | if constexpr (auto_bpc) 514 | return this->corrections_offset + (i - first) * this->bpc; 515 | if (i % extraction_density == 0) 516 | return this->bpc * (i / extraction_density); 517 | return this->bpc * (i + n / extraction_density - i / extraction_density); 518 | } 519 | 520 | void set_correction(uint64_t *corrections, size_t n, size_t i, uint64_t value) { 521 | if (this->bpc == 0) 522 | return; 523 | if (BIT_WIDTH(value) > this->bpc) 524 | throw std::overflow_error("Segment correction too large"); 525 | if (i == first) 526 | this->first_correction = value; 527 | 528 | auto idx = get_correction_bit_offset(n, i); 529 | sdsl::bits::write_int(corrections + (idx >> 6), value, idx & 0x3F, this->bpc); 530 | } 531 | 532 | K get_correction(const uint64_t *corrections, size_t n, size_t i) const { 533 | auto idx = get_correction_bit_offset(n, i); 534 | return sdsl::bits::read_int(corrections + (idx >> 6u), idx & 0x3F, this->bpc); 535 | } 536 | 537 | larger_signed_key_type approximate(size_t i) const { 538 | return ((larger_signed_key_type(slope_significand) * (i - first)) >> slope_exponent) + intercept; 539 | } 540 | 541 | std::pair approximate_position(const K &value) const { 542 | auto significand = larger_signed_key_type(slope_significand == 0 ? 
1 : slope_significand); 543 | auto p = ((larger_signed_key_type(value) - intercept) << slope_exponent) / significand + first; 544 | auto epsilon = larger_signed_key_type(BPC_TO_EPSILON(this->bpc)); 545 | auto bound = 1 + (epsilon << slope_exponent) / larger_signed_key_type(significand); 546 | return {std::max(0, p), bound}; 547 | } 548 | 549 | K first_key() const { 550 | auto epsilon = BPC_TO_EPSILON(this->bpc); 551 | auto correction = this->first_correction; 552 | return intercept + correction - epsilon; 553 | } 554 | 555 | K decompress(const uint64_t *corrections, size_t n, size_t i) const { 556 | auto epsilon = BPC_TO_EPSILON(this->bpc); 557 | auto correction = get_correction(corrections, n, i); 558 | return approximate(i) + correction - epsilon; 559 | } 560 | 561 | size_t serialize(std::ostream &out, sdsl::structure_tree_node *v = nullptr, const std::string &name = "") const { 562 | return sdsl::write_member(*this, out, v, name); 563 | } 564 | 565 | void load(std::istream &in) { sdsl::read_member(*this, in); } 566 | }; 567 | 568 | #pragma pack(pop) 569 | 570 | template class t_top_level> 571 | class la_vector::la_iterator { 572 | using parent_type = const la_vector; 573 | using segments_iterator = typename decltype(parent_type::segments)::const_iterator; 574 | 575 | parent_type *p; 576 | size_t i; 577 | segments_iterator s_it; 578 | 579 | template 580 | void move_segment_cursor() { 581 | bool is_segment_cursor_invalid = s_it == p->segments.end(); 582 | if (is_segment_cursor_invalid) { 583 | s_it = p->top_level.segment_for_position(i); 584 | return; 585 | } 586 | 587 | if constexpr (Forward) { 588 | if (i >= std::next(s_it)->first) 589 | ++s_it; 590 | } else { 591 | if (s_it > p->segments.begin() && i < s_it->first) 592 | --s_it; 593 | } 594 | } 595 | 596 | public: 597 | using value_type = const K; 598 | using pointer = const K *; 599 | using reference = const K; 600 | using difference_type = std::ptrdiff_t; 601 | using iterator_category = 
std::random_access_iterator_tag; 602 | 603 | la_iterator() : p(nullptr), i(0), s_it(p->segments.begin()) {} 604 | la_iterator(parent_type *p, size_t i, segments_iterator s_it) : p(p), i(i), s_it(s_it) {} 605 | la_iterator(parent_type *p, size_t i) : p(p), i(i), s_it(p->segments.end()) { move_segment_cursor<>(); } 606 | 607 | reference operator*() const { return s_it->decompress(p->corrections.data(), p->n, i); } 608 | reference operator[](difference_type m) const { return (*p)[i + m]; } 609 | 610 | la_iterator &operator++() { 611 | ++i; 612 | move_segment_cursor(); 613 | return *this; 614 | } 615 | 616 | la_iterator &operator--() { 617 | --i; 618 | move_segment_cursor(); 619 | return *this; 620 | } 621 | 622 | la_iterator operator++(int) { 623 | la_iterator r(*this); 624 | ++i; 625 | move_segment_cursor(); 626 | return r; 627 | } 628 | 629 | la_iterator operator--(int) { 630 | la_iterator r(*this); 631 | --i; 632 | move_segment_cursor(); 633 | return r; 634 | } 635 | 636 | la_iterator &operator+=(difference_type d) { 637 | i += d; 638 | if (i < s_it->first || i >= std::next(s_it)->first) { 639 | // Reposition the segment cursor 640 | s_it = p->segments.end(); 641 | move_segment_cursor<>(); 642 | } 643 | return *this; 644 | } 645 | 646 | la_iterator &operator-=(difference_type d) { 647 | *this += -d; 648 | return *this; 649 | } 650 | 651 | la_iterator operator+(difference_type d) const { return la_iterator(p, i + d); } 652 | la_iterator operator-(difference_type d) const { return la_iterator(p, i - d); } 653 | 654 | difference_type operator-(const la_iterator &r) const { return i - r.i; } 655 | 656 | bool operator<(const la_iterator &r) const { return i < r.i; } 657 | bool operator<=(const la_iterator &r) const { return i <= r.i; } 658 | bool operator>(const la_iterator &r) const { return i > r.i; } 659 | bool operator>=(const la_iterator &r) const { return i >= r.i; } 660 | bool operator!=(const la_iterator &r) const { return i != r.i; } 661 | bool operator==(const 
la_iterator &r) const { return i == r.i; } 662 | }; 663 | 664 | template> 665 | ForwardIt upper_bound_branchless(ForwardIt first, ForwardIt last, const T &value, Compare comp = Compare()) { 666 | auto n = std::distance(first, last); 667 | while (n > 1) { 668 | auto half = n / 2; 669 | __builtin_prefetch(&*first + half / 2, 0, 0); 670 | __builtin_prefetch(&*first + half + half / 2, 0, 0); 671 | first = !comp(value, *std::next(first, half)) ? first + half : first; 672 | n -= half; 673 | } 674 | return std::next(first, !comp(value, *first)); 675 | } 676 | 677 | template 678 | class bucketing_top_level { 679 | size_t val_step; ///< The chunk size of val_top_level, in terms of universe values 0,...,back. 680 | size_t pos_step; ///< The chunk size of pos_top_level, in terms of positions 0,...,n. 681 | size_t top_level_size; ///< The number of elements in the two *_top_level structures. 682 | sdsl::int_vector<> val_top_level; ///< Used to speed up segment_for_value, contains positions of segments. 683 | sdsl::int_vector<> pos_top_level; ///< Used to speed up segment_for_position, contains positions of segments. 684 | t_segments_iterator segments_begin; ///< An iterator to the first segment. 
685 | 686 | public: 687 | 688 | using size_type = size_t; 689 | 690 | bucketing_top_level() = default; 691 | 692 | template 693 | bucketing_top_level(It first, It last, t_segments_iterator first_segment, t_segments_iterator last_segment) { 694 | auto n_segments = std::distance(first_segment, last_segment); 695 | auto n = std::distance(first, last); 696 | auto u = *std::prev(last); 697 | 698 | segments_begin = first_segment; 699 | top_level_size = std::min(1u << 16, BIT_CEIL(n_segments)); 700 | val_step = CEIL_UINT_DIV(u, top_level_size); 701 | pos_step = CEIL_UINT_DIV(n, top_level_size); 702 | val_top_level = sdsl::int_vector<>(top_level_size, n_segments, BIT_WIDTH(n_segments)); 703 | pos_top_level = sdsl::int_vector<>(top_level_size, n_segments, BIT_WIDTH(n_segments)); 704 | 705 | for (auto i = 0, j = 0, k = 0; i < top_level_size - 1; ++i) { 706 | while (j < n_segments && first[first_segment[j].first] < (i + 1) * val_step) 707 | ++j; 708 | while (k < n_segments && first_segment[k].first < (i + 1) * pos_step) 709 | ++k; 710 | val_top_level[i] = j; 711 | pos_top_level[i] = k; 712 | } 713 | } 714 | 715 | /** 716 | * Returns an iterator to the segment responsible for decompressing the element at the given position. 717 | * @param i position of the element 718 | * @return an iterator to the segment responsible for the given position 719 | */ 720 | t_segments_iterator segment_for_position(size_t i) const { 721 | auto k = i / pos_step; 722 | auto first = segments_begin + (i < pos_step ? 0 : pos_top_level[k - 1]); 723 | auto last = segments_begin + pos_top_level[k]; 724 | auto cmp = [](size_t x, const auto &s) { return x < s.first; }; 725 | return std::prev(upper_bound_branchless(first, last, i, cmp)); 726 | } 727 | 728 | /** 729 | * Returns an iterator to the segment responsible for decompressing an element that is not less than the given 730 | * value. 
731 | * @param value value of the element 732 | * @return an iterator to the segment responsible for the given value 733 | */ 734 | t_segments_iterator segment_for_value(K value) const { 735 | auto k = value / val_step; 736 | auto first = segments_begin + (value < val_step ? 0 : val_top_level[k - 1]); 737 | auto last = segments_begin + val_top_level[k]; 738 | auto cmp = [](K x, const auto &s) { return x < s.first_key(); }; 739 | auto it = upper_bound_branchless(first, last, value, cmp); 740 | return it == segments_begin ? it : std::prev(it); 741 | } 742 | 743 | /** 744 | * Serializes the top-level structure to a stream. 745 | * @param out output stream 746 | * @param v parent node in the structure tree 747 | * @param name name of the structure tree node 748 | * @return the number of bytes written to out 749 | */ 750 | size_t serialize(std::ostream &out, sdsl::structure_tree_node *v = nullptr, const std::string &name = "") const { 751 | auto child = sdsl::structure_tree::add_child(v, name, sdsl::util::class_name(*this)); 752 | size_t written_bytes = 0; 753 | written_bytes += sdsl::write_member(val_step, out, child, "val_step"); 754 | written_bytes += sdsl::write_member(pos_step, out, child, "pos_step"); 755 | written_bytes += sdsl::write_member(top_level_size, out, child, "top_level.size()"); 756 | written_bytes += sdsl::serialize(val_top_level, out, child, "val_top_level"); 757 | written_bytes += sdsl::serialize(pos_top_level, out, child, "pos_top_level"); 758 | sdsl::structure_tree::add_size(child, written_bytes); 759 | return written_bytes; 760 | } 761 | 762 | /** 763 | * Loads the top-level structure from a stream. 
764 | * @param in input stream 765 | */ 766 | void load(std::istream &in, t_segments_iterator first, t_segments_iterator) { 767 | segments_begin = first; 768 | sdsl::read_member(val_step, in); 769 | sdsl::read_member(pos_step, in); 770 | sdsl::read_member(top_level_size, in); 771 | sdsl::load(val_top_level, in); 772 | sdsl::load(pos_top_level, in); 773 | } 774 | 775 | /** 776 | * Returns the number of bytes used by this top-level structure. 777 | * @return the size in bytes of this top-level structure 778 | */ 779 | size_t size_in_bytes() const { return (val_top_level.bit_size() + pos_top_level.bit_size()) / CHAR_BIT; } 780 | }; -------------------------------------------------------------------------------- /include/la_vector/piecewise_linear_model.hpp: -------------------------------------------------------------------------------- 1 | // This file is part of PGM-index . 2 | // Copyright (c) 2018 Giorgio Vinciguerra. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
#pragma once

#ifdef _OPENMP
#include <omp.h>
#else
#warning Compilation with -fopenmp is recommended
// Minimal stand-in for the OpenMP runtime so that the code below builds without -fopenmp.
using omp_int_t = int;
inline omp_int_t omp_get_max_threads() { return 1; }
#endif

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <tuple>
#include <type_traits>
#include <utility>
#include <vector>

/**
 * Signed type used for intermediate arithmetic on values of type T:
 * long double for floating-point T, int64_t for types narrower than 8 bytes, __int128 otherwise.
 */
template<typename T>
using LargeSigned = std::conditional_t<std::is_floating_point_v<T>,
                                       long double,
                                       std::conditional_t<(sizeof(T) < 8), int64_t, __int128>>;

/**
 * Incrementally builds one segment of a piecewise linear model over points fed in increasing x order.
 *
 * Every point (x, y) is turned into the pair (x, y + epsilon), (x, y - epsilon). The class keeps two hulls
 * of the points seen so far and a "rectangle" of four points: rectangle[0] -> rectangle[2] is the line of
 * minimum slope and rectangle[1] -> rectangle[3] the line of maximum slope that are still compatible with
 * all the points of the current segment.
 *
 * @tparam X type of the x coordinates
 * @tparam Y type of the y coordinates
 */
template<typename X, typename Y>
class OptimalPiecewiseLinearModel {
private:
    using SX = LargeSigned<X>;
    using SY = LargeSigned<Y>;

    /**
     * A direction (dx, dy). Slopes are compared by cross-multiplication, so no division is performed;
     * this ordering matches the ordering of the ratios dy/dx when both dx have the same sign.
     */
    struct Slope {
        SX dx{};
        SY dy{};

        bool operator<(const Slope &p) const { return dy * p.dx < dx * p.dy; }

        bool operator>(const Slope &p) const { return dy * p.dx > dx * p.dy; }

        bool operator==(const Slope &p) const { return dy * p.dx == dx * p.dy; }

        bool operator!=(const Slope &p) const { return dy * p.dx != dx * p.dy; }

        explicit operator long double() const { return dy / static_cast<long double>(dx); }
    };

    /** A point as stored inside a hull: the original coordinates, without the epsilon shift. */
    struct StoredPoint {
        X x;
        Y y;
    };

    /** A point whose y coordinate is kept in the wide signed type, so that y - epsilon may be negative. */
    struct Point {
        X x{};
        SY y{};

        /** Returns the direction going from p to this point. */
        Slope operator-(const Point &p) const { return {SX(x) - p.x, y - p.y}; }
    };

    /**
     * A sequence of StoredPoint that applies the epsilon shift on access.
     * @tparam Upper if true, element i is read as (x, y + epsilon); otherwise as (x, y - epsilon)
     */
    template<bool Upper>
    struct Hull : private std::vector<StoredPoint> {
        const SY epsilon; ///< signed shift added to y by operator[]

        explicit Hull(SY epsilon) : std::vector<StoredPoint>(), epsilon(Upper ? epsilon : -epsilon) {}

        /** Returns the i-th stored point with its y coordinate shifted by the signed epsilon. */
        Point operator[](size_t i) const {
            auto &p = std::vector<StoredPoint>::operator[](i);
            return {p.x, SY(p.y) + epsilon};
        }

        void clear() { std::vector<StoredPoint>::clear(); }

        void resize(size_t n) { std::vector<StoredPoint>::resize(n); }

        void reserve(size_t n) { std::vector<StoredPoint>::reserve(n); }

        size_t size() const { return std::vector<StoredPoint>::size(); }

        void push(X x, Y y) { std::vector<StoredPoint>::emplace_back(StoredPoint{x, y}); }
    };

    const Y epsilon;           ///< maximum allowed vertical error
    Hull<false> lower;         ///< points read as (x, y - epsilon)
    Hull<true> upper;          ///< points read as (x, y + epsilon)
    X first_x = 0;             ///< x of the first point of the current segment
    X last_x = 0;              ///< x of the last point passed to add_point
    size_t lower_start = 0;    ///< index of the first element of `lower` that is still relevant
    size_t upper_start = 0;    ///< index of the first element of `upper` that is still relevant
    size_t points_in_hull = 0; ///< number of points in the current segment (0 after a rejection or reset)
    Point rectangle[4];

    /** Cross product of the directions O->A and O->B. */
    auto cross(const Point &O, const Point &A, const Point &B) const {
        auto OA = A - O;
        auto OB = B - O;
        return (OA.dx * OB.dy) - (OA.dy * OB.dx);
    }

public:

    class CanonicalSegment;

    /**
     * Creates a model builder with the given error bound.
     * @param epsilon the maximum allowed vertical error
     * @throws std::invalid_argument if epsilon is negative
     */
    explicit OptimalPiecewiseLinearModel(Y epsilon) : epsilon(epsilon), lower(epsilon), upper(epsilon) {
        if (epsilon < 0)
            throw std::invalid_argument("epsilon cannot be negative");

        upper.reserve(1u << 16);
        lower.reserve(1u << 16);
    }

    /**
     * Tries to extend the current segment with the point (x, y).
     *
     * When this returns false the point was NOT added and the builder has been reset to an empty segment:
     * the caller is expected to fetch the finished segment with get_segment() and then pass the same
     * point again to start the next segment.
     *
     * @param x the x coordinate, strictly greater than the previous one within a segment
     * @param y the y coordinate
     * @return true if the point was added to the current segment, false if it does not fit
     * @throws std::logic_error if x is not greater than the x of the previous point of the segment
     */
    bool add_point(const X &x, const Y &y) {
        if (points_in_hull > 0 && x <= last_x)
            throw std::logic_error("Points must be increasing by x.");

        last_x = x;
        Point p1{x, SY(y) + epsilon};
        Point p2{x, SY(y) - epsilon};

        // First point of a segment: (re)initialise the hulls and the left side of the rectangle.
        if (points_in_hull == 0) {
            first_x = x;
            rectangle[0] = p1;
            rectangle[1] = p2;
            upper.clear();
            lower.clear();
            upper.push(x, y);
            lower.push(x, y);
            upper_start = lower_start = 0;
            ++points_in_hull;
            return true;
        }

        // Second point: it fixes the right side of the rectangle.
        if (points_in_hull == 1) {
            rectangle[2] = p2;
            rectangle[3] = p1;
            upper.push(x, y);
            lower.push(x, y);
            ++points_in_hull;
            return true;
        }

        // With no tolerance the point is accepted only if it lies exactly on the line through the
        // first two points of the segment.
        if (epsilon == 0) {
            auto p1_on_line1 = p1 - rectangle[0] == rectangle[2] - rectangle[0];
            points_in_hull = p1_on_line1 ? points_in_hull + 1 : 0;
            return p1_on_line1;
        }

        auto slope1 = rectangle[2] - rectangle[0];
        auto slope2 = rectangle[3] - rectangle[1];
        bool outside_line1 = p1 - rectangle[2] < slope1;
        bool outside_line2 = p2 - rectangle[3] > slope2;

        if (outside_line1 || outside_line2) {
            points_in_hull = 0;
            return false;
        }

        if (p1 - rectangle[1] < slope2) {
            // Find extreme slope
            auto min = lower[lower_start] - p1;
            auto min_i = lower_start;
            for (auto i = lower_start + 1; i < lower.size(); i++) {
                auto val = (lower[i] - p1);
                if (val > min)
                    break;
                min = val;
                min_i = i;
            }

            rectangle[1] = lower[min_i];
            rectangle[3] = p1;
            lower_start = min_i;

            // Hull update
            auto end = upper.size();
            for (; end >= upper_start + 2 && cross(upper[end - 2], upper[end - 1], p1) <= 0; --end);
            upper.resize(end);
            upper.push(x, y);
        }

        if (p2 - rectangle[0] > slope1) {
            // Find extreme slope
            auto max = upper[upper_start] - p2;
            auto max_i = upper_start;
            for (auto i = upper_start + 1; i < upper.size(); i++) {
                auto val = (upper[i] - p2);
                if (val < max)
                    break;
                max = val;
                max_i = i;
            }

            rectangle[0] = upper[max_i];
            rectangle[2] = p2;
            upper_start = max_i;

            // Hull update
            auto end = lower.size();
            for (; end >= lower_start + 2 && cross(lower[end - 2], lower[end - 1], p2) >= 0; --end);
            lower.resize(end);
            lower.push(x, y);
        }

        ++points_in_hull;
        return true;
    }

    /**
     * Returns the segment described by the current rectangle.
     * @return a segment built from rectangle[0] and rectangle[1] only when exactly one point is in the
     *         segment, and from the whole rectangle otherwise
     */
    CanonicalSegment get_segment() const {
        if (points_in_hull == 1)
            return CanonicalSegment(rectangle[0], rectangle[1], first_x);
        return CanonicalSegment(rectangle, first_x);
    }

    /** Discards the points of the current segment, so that the next add_point starts a new one. */
    void reset() {
        points_in_hull = 0;
        lower.clear();
        upper.clear();
    }
};

/**
 * The outcome of a segment construction: the four rectangle points delimiting the feasible lines and
 * the x coordinate of the first point covered by the segment.
 */
template<typename X, typename Y>
class OptimalPiecewiseLinearModel<X, Y>::CanonicalSegment {
    friend class OptimalPiecewiseLinearModel;

public:

    Point rectangle[4];
    X first{}; // value-initialised so that a default-constructed segment has a defined first x

    /** Builds a segment whose rectangle is degenerate: p0 and p1 are used for both sides. */
    CanonicalSegment(const Point &p0, const Point &p1, X first) : rectangle{p0, p1, p0, p1}, first(first) {}

    /** Builds a segment from a full rectangle. */
    CanonicalSegment(const Point (&rectangle)[4], X first)
        : rectangle{rectangle[0], rectangle[1], rectangle[2], rectangle[3]}, first(first) {}

    /** Returns true if both sides of the rectangle coincide, that is, the segment covers one point. */
    bool one_point() const {
        return rectangle[0].x == rectangle[2].x && rectangle[0].y == rectangle[2].y
            && rectangle[1].x == rectangle[3].x && rectangle[1].y == rectangle[3].y;
    }

    CanonicalSegment() = default;

    /** Builds a degenerate segment located at (first, 0). */
    explicit CanonicalSegment(X first) : CanonicalSegment({first, 0}, {first, 0}, first) {}

    /** Returns the x coordinate of the first point covered by this segment. */
    X get_first_x() const { return first; }

    /** Returns a copy of this segment whose first x is replaced by the given one. */
    CanonicalSegment copy(X x) const {
        auto c(*this);
        c.first = x;
        return c;
    }

    /**
     * Returns a fixed-point representation of the maximum-slope line of this segment, as a triple
     * (significand, exponent, intercept) where the slope is significand / 2^exponent and the intercept
     * is the rounded value of the line at x = origin.
     *
     * A one-point segment yields slope 0 and the midpoint of the two rectangle points as intercept.
     * The exponent is 0 when the slope is an integer and ceil(log2(max_input)) + 1 otherwise.
     *
     * @param origin the x coordinate at which the intercept is computed
     * @param max_input the value whose base-2 logarithm sizes the exponent; values below 2 give exponent 1
     * @return the tuple (slope significand, slope exponent, intercept)
     */
    std::tuple<SY, uint8_t, SY> get_fixed_point_segment(X origin, X max_input) const {
        if (one_point())
            return {0, 0, (rectangle[0].y + rectangle[1].y) / 2};

        auto &p1 = rectangle[1];
        auto max_slope = rectangle[3] - rectangle[1];

        auto is_slope_integral = max_slope.dy % max_slope.dx == 0;
        int slope_exponent = 0;
        if (!is_slope_integral) {
            // log2 is -inf at 0, and converting that to an integer is undefined behaviour, so inputs
            // below 2 use the exponent that max_input == 1 gives (ceil(log2(1)) + 1 == 1).
            slope_exponent = max_input > 1 ? static_cast<uint8_t>(std::ceil(std::log2(max_input))) + 1 : 1;
        }
        auto slope_significand = (max_slope.dy << slope_exponent) / max_slope.dx;

        auto intercept_n = max_slope.dy * (SX(origin) - p1.x);
        auto intercept_d = max_slope.dx;
        // Half of the denominator, signed like the quotient, so that the division below rounds to nearest.
        auto rounding_term = ((intercept_n < 0) ^ (intercept_d < 0) ? -1 : +1) * intercept_d / 2;
        auto intercept = (intercept_n + rounding_term) / intercept_d + p1.y;

        return {slope_significand, slope_exponent, intercept};
    }
};

/**
 * Splits a sequence of points into segments with maximum error epsilon.
 *
 * Points whose x equals the x of the previously accepted point are skipped.
 *
 * @param n the number of input points
 * @param epsilon the maximum allowed vertical error
 * @param in a callable such that in(i) returns the i-th point as a pair (x, y)
 * @param out a callable invoked once per produced segment, in order
 * @return the number of segments produced (0 if n is 0)
 */
template<typename Fin, typename Fout>
size_t make_segmentation(size_t n, size_t epsilon, Fin in, Fout out) {
    if (n == 0)
        return 0;

    using X = typename std::invoke_result_t<Fin, size_t>::first_type;
    using Y = typename std::invoke_result_t<Fin, size_t>::second_type;
    size_t c = 0;
    size_t start = 0;
    auto p = in(0);

    OptimalPiecewiseLinearModel<X, Y> opt(epsilon);
    opt.add_point(p.first, p.second);

    for (size_t i = 1; i < n; ++i) {
        auto next_p = in(i);
        if (i != start && next_p.first == p.first)
            continue;
        p = next_p;
        if (!opt.add_point(p.first, p.second)) {
            // The point does not fit: emit the finished segment and revisit index i as the first
            // point of the next segment.
            out(opt.get_segment());
            start = i;
            --i;
            ++c;
        }
    }

    out(opt.get_segment());
    return ++c;
}

/**
 * Same contract as make_segmentation, but the input is cut into chunks that are segmented
 * independently inside an OpenMP parallel loop; the resulting segments are then passed to `out`
 * in chunk order. Falls back to make_segmentation when only one thread is available or n < 2^15.
 *
 * @param n the number of input points
 * @param epsilon the maximum allowed vertical error
 * @param in a callable such that in(i) returns the i-th point as a pair (x, y)
 * @param out a callable invoked once per produced segment
 * @return the number of segments produced
 */
template<typename Fin, typename Fout>
size_t make_segmentation_par(size_t n, size_t epsilon, Fin in, Fout out) {
    const auto parallelism = std::min<size_t>(omp_get_max_threads(), 20);

    if (parallelism == 1 || n < 1ull << 15)
        return make_segmentation(n, epsilon, in, out);

    const auto chunk_size = n / parallelism;
    auto c = 0ull;

    using X = typename std::invoke_result_t<Fin, size_t>::first_type;
    using Y = typename std::invoke_result_t<Fin, size_t>::second_type;
    using canonical_segment = typename OptimalPiecewiseLinearModel<X, Y>::CanonicalSegment;
    std::vector<std::vector<canonical_segment>> results(parallelism);

    #pragma omp parallel for reduction(+:c) num_threads(parallelism)
    for (auto i = 0ull; i < parallelism; ++i) {
        auto first = i * chunk_size;
        auto last = i == parallelism - 1 ? n : first + chunk_size;
        if (first > 0) {
            // Move the chunk start forward past points sharing the x of the point before the chunk.
            for (; first < last; ++first)
                if (in(first).first != in(first - 1).first)
                    break;
            if (first == last)
                continue;
        }

        auto in_fun = [in, first](auto j) { return in(first + j); };
        auto out_fun = [&results, i](const auto &cs) { results[i].emplace_back(cs); };
        results[i].reserve(chunk_size / (epsilon > 0 ? epsilon * epsilon : 16));
        c += make_segmentation(last - first, epsilon, in_fun, out_fun);
    }

    for (auto &v : results)
        for (auto &cs : v)
            out(cs);

    return c;
}