├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── README.md ├── data ├── SARS-CoV-2.access └── example_fasta │ └── example.fasta ├── include ├── CMakeLists.txt ├── common │ ├── CMakeLists.txt │ └── common.hpp └── r_index_f │ ├── CMakeLists.txt │ ├── LF_table.hpp │ ├── block_table.hpp │ ├── ds │ ├── ACGT_map.hpp │ ├── CMakeLists.txt │ ├── base_bv.hpp │ ├── base_interpolate.hpp │ ├── base_sample.hpp │ ├── heads_bv_w.hpp │ ├── heads_wt_w.hpp │ ├── idx_bit_vector.hpp │ ├── idx_list.hpp │ ├── interval_block.hpp │ ├── interval_pos.hpp │ ├── intervals_rank_w.hpp │ └── symbol_map.hpp │ └── r_index_f.hpp ├── pipeline └── rif ├── test └── src │ ├── CMakeLists.txt │ ├── build_rif.cpp │ ├── count_query.cpp │ └── rif_tests.cpp └── thirdparty └── CMakeLists.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/github/gitignore/ C++ .gitignore template 2 | 3 | # Prerequisites 4 | *.d 5 | 6 | # Compiled Object files 7 | *.slo 8 | *.lo 9 | *.o 10 | *.obj 11 | 12 | # Precompiled Headers 13 | *.gch 14 | *.pch 15 | 16 | # Compiled Dynamic libraries 17 | *.so 18 | *.dylib 19 | *.dll 20 | 21 | # Fortran module files 22 | *.mod 23 | *.smod 24 | 25 | # Compiled Static libraries 26 | *.lai 27 | *.la 28 | *.a 29 | *.lib 30 | 31 | # Executables 32 | *.exe 33 | *.out 34 | *.app 35 | 36 | # Build Folder 37 | build 38 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # @brief : Root cmake file. 
# @author : Enrico Fraccaroli
# @create : 23/03/2021
# @update : 23/03/2021
# -----------------------------------------------------------------------------

# -----------------------------------------------------------------------------
# Set the minimum CMake version, the project name and default build type.
# -----------------------------------------------------------------------------
cmake_minimum_required(VERSION 3.15)

# Set the project name.
project(r-index-f)

# Default to a Release build when the user did not choose a build type.
# Multi-config generators (Visual Studio, Xcode, Ninja Multi-Config) ignore
# CMAKE_BUILD_TYPE, so the default is only applied to single-config generators.
get_property(rif_is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
if(NOT rif_is_multi_config AND NOT CMAKE_BUILD_TYPE)
    message(STATUS "Setting build type to 'Release' as none was specified.")
    # FORCE only replaces an empty value here (see the guard above), so an
    # explicit user choice such as -DCMAKE_BUILD_TYPE=Debug is never overridden.
    set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build." FORCE)
    # Offer the standard configurations as a drop-down in cmake-gui / ccmake.
    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
        "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif()

# Git is optional at configure time: when it is missing only a warning is
# printed and configuration continues.
find_package(Git)
if(GIT_FOUND)
    message(STATUS "git found: ${GIT_EXECUTABLE}")
else()
    message(WARNING "git not found. Cloning of submodules will not work.")
endif()

# -----------------------------------------------------------------------------
# Set the compilation flags.
# -----------------------------------------------------------------------------
# Set C++ Standard. Marking it as required makes CMake stop with an error
# instead of silently falling back to an older standard when the compiler
# does not support C++17.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Set the actual flags.
# Warnings enabled for every C++ translation unit in every configuration.
# Stricter diagnostics (-Werror, -pedantic, -pedantic-errors) are currently
# not enabled.
string(APPEND CMAKE_CXX_FLAGS " -Wall")

# Per-configuration flags. Using the CMAKE_CXX_FLAGS_<CONFIG> variables rather
# than comparing CMAKE_BUILD_TYPE against a literal means the flags are applied
# regardless of how the build type is capitalised (e.g. "debug") and also under
# multi-config generators, where CMAKE_BUILD_TYPE is not used.
string(APPEND CMAKE_CXX_FLAGS_DEBUG " -g3 -ggdb -O0")
string(APPEND CMAKE_CXX_FLAGS_RELEASE " -O3")

# Report the selected optimisation mode at configure time. This is only
# meaningful for single-config generators; the comparison is case-insensitive
# to match how CMake itself selects the per-configuration flags.
string(TOUPPER "${CMAKE_BUILD_TYPE}" rif_build_type_upper)
if("${rif_build_type_upper}" STREQUAL "DEBUG")
    message(STATUS "Disabling optimizations.")
elseif("${rif_build_type_upper}" STREQUAL "RELEASE")
    message(STATUS "Enabling optimizations.")
endif()

# -----------------------------------------------------------------------------
# Add the third-party dependencies.
# -----------------------------------------------------------------------------
add_subdirectory(thirdparty)

# -----------------------------------------------------------------------------
# Add the project sources and tests.
# -----------------------------------------------------------------------------
# CMake has support for adding tests to a project; enable it before the
# subdirectories are processed.
enable_testing()
# Add the project's own headers, then the sources under test/src.
add_subdirectory(include)
add_subdirectory(test/src)

# Place the `rif` pipeline script next to the build outputs.
# NOTE(review): without COPYONLY or @ONLY, configure_file() substitutes any
# ${VAR} / @VAR@ reference found in the script -- confirm whether pipeline/rif
# relies on that substitution before restricting it.
configure_file("${PROJECT_SOURCE_DIR}/pipeline/rif" "${PROJECT_BINARY_DIR}/rif")
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 
43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". 
"Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 
117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 
146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 
180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 
216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 
246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. 
If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 
309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 
336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 
360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. 
If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 
428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | <one line to give the program's name and a brief idea of what it does.> 635 | Copyright (C) <year> <name of author> 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see <https://www.gnu.org/licenses/>. 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | <program> Copyright (C) <year> <name of author> 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | <https://www.gnu.org/licenses/>. 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | <https://www.gnu.org/licenses/why-not-lgpl.html>. 675 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # R-Index-F 2 | 11 | 12 | R-Index-F Library for String Indexing 13 | 14 | Implemented and adapted from original work by Takaaki Nishimoto and Yasuo Tabei [1]. 15 | 16 | This library uses a simplified approach which follows the theory of the original paper. We store intervals consisting of full BWT-runs rather than sub-runs, representing a maximal interval mapping, and a custom block compression [2]. Joint work with Travis Gagie and Massimiliano Rossi. 
To reproduce experiments shown in RLBWT Tricks [2], accession codes for SARS-CoV-2 genomes from the [Covid-19 Data Portal](https://www.covid19dataportal.org/) are listed in the example data. 17 | 18 | Efficiently performs decompression and count queries using interval mapping of BWT-runs. 19 | 20 | *Current Version:* 0.2.0 21 | 22 | # Example 23 | ### Download and Compile 24 | 25 | ```console 26 | git clone https://github.com/drnatebrown/r-index-f.git 27 | cd r-index-f 28 | 29 | mkdir build && cd build 30 | cmake .. 31 | make 32 | ``` 33 | 34 | ### Build 35 | Builds the data structure on the example fasta file given, creating [filename].rif as output. The -f flag specifies we read in a fasta format. Other build flags affect the BWT build and are described in [Big-BWT](https://github.com/alshai/Big-BWT.git). 36 | ```console 37 | python3 rif ../data/example_fasta/example.fasta -f 38 | ``` 39 | 40 | If using row splitting from [r-permute](https://github.com/drnatebrown/r-permute) to bound LF to $O(1)$ and $O(r)$-space, use the `-d` option (` = d`). First, copy the output of r-permute to rename using the split parameter. 41 | ```console 42 | cp example.fasta.d_col example.fasta._col 43 | python3 rif ../data/example_fasta/example.fasta -f -d 44 | ``` 45 | 46 | ### Queries 47 | The data structure should be imported and loaded as described in r-index-f.hpp once built, and supports LF computation needed to perform count queries. An example command prints the count query for a pattern to stdout, assuming the table was built using default settings. 48 | 49 | To give the pattern explicitly, e.g. `"GATTACAT"`: 50 | ```console 51 | ./test/src/count_query ../data/example_fasta/example.fasta -p GATTACAT 52 | ``` 53 | 54 | To give multiple patterns from a file (one per line), e.g. 
`"pattern.txt"`, use the -f option: 55 | ```console 56 | ./test/src/count_query ../data/example_fasta/example.fasta -f -p pattern.txt 57 | ``` 58 | 59 | # External Dependencies 60 | 61 | * [Big-BWT](https://github.com/alshai/Big-BWT.git) 62 | * [gSACA-K](https://github.com/felipelouza/gsa-is.git) 63 | * [malloc_count](https://github.com/bingmann/malloc_count) 64 | * [pfp_thresholds](https://github.com/maxrossi91/pfp-thresholds) 65 | * [sdsl-lite](https://github.com/simongog/sdsl-lite) 66 | * [divufsort](https://github.com/simongog/libdivsufsort) 67 | * [Google Benchmark](https://github.com/google/benchmark.git) 68 | * [Google Test](https://github.com/google/googletest) 69 | 70 | # Authors 71 | 72 | ### Implementation: 73 | 74 | * [Nathaniel Brown](https://github.com/drnatebrown) 75 | * [Massimiliano Rossi](https://github.com/maxrossi91) 76 | 77 | ### Theory 78 | * Nathaniel Brown 79 | * Travis Gagie 80 | * Massimiliano Rossi 81 | 82 | # Citation 83 | Please cite the original paper by Nishimoto and Tabei [1] if you refer only to their data structure 84 | 85 | If you use the implementation in an academic setting, or the style of our data structure, please cite both the former as well as RLBWT Tricks [2]. 86 | 87 | # References 88 | 89 | [1] Nishimoto, T., & Tabei, Y. (2020). Optimal-Time Queries on BWT-runs Compressed Indexes. arXiv preprint arXiv:2006.05104. 90 | [2] Brown, N.K., Gagie, T., & Rossi, M. (2022). RLBWT Tricks. arXiv preprint arXiv:2112.04271. 
91 | -------------------------------------------------------------------------------- /include/CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drnatebrown/r-index-f/90c8b390a26912db51cc6b11551c55bbb7fd6ef9/include/CMakeLists.txt -------------------------------------------------------------------------------- /include/common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(COMMON_SOURCES common.hpp) 2 | 3 | add_library(common OBJECT ${COMMON_SOURCES}) 4 | target_link_libraries(common sdsl) -------------------------------------------------------------------------------- /include/common/common.hpp: -------------------------------------------------------------------------------- 1 | /* common 2 | Copyright (C) 2020 Massimiliano Rossi 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU General Public License for more details. 11 | You should have received a copy of the GNU General Public License 12 | along with this program. If not, see http://www.gnu.org/licenses/ . 13 | */ 14 | /*! 15 | \file common.hpp 16 | \brief common.hpp contains common features. 
17 | \author Massimiliano Rossi 18 | \date 12/03/2020 19 | */ 20 | 21 | #ifndef _COMMON_HH 22 | #define _COMMON_HH 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | #include 30 | 31 | #include 32 | #include 33 | #include 34 | 35 | #include // std::stringstream 36 | #include // std::vector 37 | #include // high_resolution_clock 38 | #include 39 | #include 40 | #include 41 | 42 | #include 43 | 44 | #define ALPHABET_SIZE 256 45 | 46 | static const uint8_t TERMINATOR = 1; 47 | typedef unsigned long int ulint; 48 | typedef unsigned char uchar; 49 | 50 | std::string NowTime(); 51 | void _internal_messageInfo(const std::string message); 52 | void _internal_messageWarning( const std::string file, const unsigned int line, const std::string message); 53 | void _internal_messageError( const std::string file, const unsigned int line,const std::string message); 54 | 55 | 56 | std::string NowTime() 57 | { 58 | struct timeval tv; 59 | gettimeofday(&tv, 0); 60 | char buffer[100]; 61 | tm r; 62 | strftime(buffer, sizeof(buffer), "%X", localtime_r(&tv.tv_sec, &r)); 63 | char result[100]; 64 | snprintf(result, 100, "%s"/*.%06ld"*/, buffer/*, (long)tv.tv_usec*/); 65 | return result; 66 | } 67 | 68 | 69 | template 70 | inline void _internal_message_helper(std::stringstream &ss, T const &first) { ss << first; } 71 | template 72 | inline void _internal_message_helper(std::stringstream &ss, T const &first, const Args&... args) { ss << first << " "; _internal_message_helper(ss,args...); } 73 | template 74 | inline std::string _internal_message(T const &first, const Args&... 
args) { std::stringstream ss; _internal_message_helper(ss,first,args...); return ss.str(); } 75 | 76 | 77 | void _internal_messageInfo(const std::string message) 78 | { 79 | std::cout << "[INFO] " << NowTime() << " - " << "Message: " << message << std::endl; 80 | } 81 | 82 | void _internal_messageWarning( const std::string file, const unsigned int line, 83 | const std::string message) 84 | { 85 | std::cout << "[WARNING] " << NowTime() << " - " 86 | << "File: " << file << '\n' 87 | << "Line: " << line << '\n' 88 | << "Message: " << message << std::endl; 89 | } 90 | 91 | void _internal_messageError( const std::string file, const unsigned int line, 92 | const std::string message) 93 | { 94 | std::cerr << "[ERROR] " << NowTime() << " - " 95 | << "File: " << file << '\n' 96 | << "Line: " << line << '\n' 97 | << "Message: " << message << std::endl; 98 | assert( false ); 99 | exit( 1 ); 100 | } 101 | 102 | 103 | 104 | #define info( args... ) \ 105 | _internal_messageInfo( _internal_message(args) ) 106 | 107 | #ifdef VERBOSE 108 | #define verbose( args... ) \ 109 | _internal_messageInfo( _internal_message(args) ) 110 | #else 111 | #define verbose( args... ) 112 | #endif 113 | 114 | #define warning( args... ) \ 115 | _internal_messageWarning( __FILE__, __LINE__, _internal_message(args) ) 116 | 117 | #define error( args... 
) \ 118 | _internal_messageError( __FILE__, __LINE__, _internal_message(args) ) 119 | 120 | //*********************** Argument options *************************************** 121 | // struct containing command line parameters and other globals 122 | struct Args 123 | { 124 | std::string filename = ""; 125 | bool store = false; // store the data structure in the file 126 | bool memo = false; // print the memory usage 127 | bool rle = true; // outpt RLBWT 128 | size_t th = 1; // number of threads 129 | bool is_fasta = false; // read a fasta file 130 | size_t d = 0; // Use run-splitting [must have bitvector marking runs present] 131 | std::string pattern = ""; 132 | }; 133 | 134 | void parseArgs(int argc, char *const argv[], Args &arg) 135 | { 136 | int c; 137 | extern char *optarg; 138 | extern int optind; 139 | 140 | std::string sarg; 141 | while ((c = getopt(argc, argv, "w:smcfl:rhp:t:d:p:")) != -1) 142 | { 143 | switch (c) 144 | { 145 | case 's': 146 | arg.store = true; 147 | break; 148 | case 'm': 149 | arg.memo = true; 150 | break; 151 | case 'r': 152 | arg.rle = true; 153 | break; 154 | case 't': 155 | sarg.assign(optarg); 156 | arg.th = stoi(sarg); 157 | break; 158 | case 'p': 159 | arg.pattern.assign(optarg); 160 | break; 161 | case 'f': 162 | arg.is_fasta = true; 163 | break; 164 | case 'd': 165 | sarg.assign(optarg); 166 | arg.d = stoi(sarg); 167 | break; 168 | case '?': 169 | error("Unknown option.\n"); 170 | break; 171 | } 172 | } 173 | // the only input parameter is the file name 174 | if (argc == optind + 1) 175 | { 176 | arg.filename.assign(argv[optind]); 177 | } 178 | else 179 | { 180 | error("Invalid number of arguments\n"); 181 | } 182 | } 183 | 184 | //********** end argument options ******************** 185 | 186 | // Convert boolean vector to specified bit vector 187 | template 188 | B bool_to_bit_vec(std::vector &b) 189 | { 190 | if(b.size()==0) return B(); 191 | 192 | sdsl::bit_vector bv(b.size()); 193 | 194 | for(size_t i = 0; i < b.size(); 
++i) 195 | bv[i] = b[i]; 196 | 197 | return B(bv); 198 | } 199 | 200 | #endif /* end of include guard: _COMMON_HH */ -------------------------------------------------------------------------------- /include/r_index_f/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(ds) 2 | 3 | set(RIF_SOURCES r_index_f.hpp block_table.hpp LF_table.hpp) 4 | 5 | add_library(r_index_f OBJECT ${RIF_SOURCES}) 6 | target_link_libraries(r_index_f ds common sdsl) -------------------------------------------------------------------------------- /include/r_index_f/LF_table.hpp: -------------------------------------------------------------------------------- 1 | /* Lf_table - Uncompressed version of OptBWTR (LF table) 2 | Copyright (C) 2021 Nathaniel Brown 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU General Public License for more details. 11 | You should have received a copy of the GNU General Public License 12 | along with this program. If not, see http://www.gnu.org/licenses/ . 13 | */ 14 | /*! 
15 | \file LF_table.hpp 16 | \brief LF_table.hpp Uncompressed version of OptBWTR (LF table) 17 | \author Nathaniel Brown 18 | \author Massimiliano Rossi 19 | \date 19/11/2021 20 | */ 21 | 22 | #ifndef _LF_TABLE_HH 23 | #define _LF_TABLE_HH 24 | 25 | #include 26 | 27 | #include 28 | #include 29 | 30 | using namespace std; 31 | 32 | class LF_table 33 | { 34 | public: 35 | // Row of the LF table 36 | typedef struct LF_row 37 | { 38 | char character; 39 | ulint length; 40 | ulint interval; 41 | ulint offset; 42 | 43 | size_t serialize(std::ostream &out, sdsl::structure_tree_node *v = nullptr, std::string name ="") 44 | { 45 | sdsl::structure_tree_node *child = sdsl::structure_tree::add_child(v, name, sdsl::util::class_name(*this)); 46 | size_t written_bytes = 0; 47 | 48 | out.write((char *)&character, sizeof(character)); 49 | written_bytes += sizeof(character); 50 | 51 | out.write((char *)&interval, sizeof(interval)); 52 | written_bytes += sizeof(interval); 53 | 54 | out.write((char *)&length, sizeof(length)); 55 | written_bytes += sizeof(length); 56 | 57 | out.write((char *)&offset, sizeof(offset)); 58 | written_bytes += sizeof(offset); 59 | 60 | return written_bytes; 61 | } 62 | 63 | void load(std::istream &in) 64 | { 65 | in.read((char *)&character, sizeof(character)); 66 | in.read((char *)&interval, sizeof(interval)); 67 | in.read((char *)&length, sizeof(length)); 68 | in.read((char *)&offset, sizeof(offset)); 69 | } 70 | }; 71 | 72 | LF_table() {} 73 | 74 | LF_table(std::ifstream &heads, std::ifstream &lengths, ulint max_run = 0) 75 | { 76 | heads.clear(); 77 | heads.seekg(0); 78 | lengths.clear(); 79 | lengths.seekg(0); 80 | 81 | LF_runs = vector(); 82 | vector> L_block_indices = vector>(ALPHABET_SIZE); 83 | 84 | char c; 85 | ulint i = 0; 86 | r = 0; 87 | n = 0; 88 | while ((c = heads.get()) != EOF) 89 | { 90 | size_t length = 0; 91 | lengths.read((char *)&length, 5); 92 | if (c <= TERMINATOR) c = TERMINATOR; 93 | 94 | if (max_run > 0 && length > max_run) { 95 | 
ulint max_splits = length/max_run; 96 | for (size_t split = 0; split < max_splits; ++split) 97 | { 98 | LF_runs.push_back({c, max_run, 0, 0}); 99 | L_block_indices[c].push_back(i++); 100 | } 101 | 102 | if (length % max_run != 0) 103 | { 104 | LF_runs.push_back({c, length % max_run, 0, 0}); 105 | L_block_indices[c].push_back(i++); 106 | } 107 | } 108 | else { 109 | LF_runs.push_back({c, length, 0, 0}); 110 | L_block_indices[c].push_back(i++); 111 | } 112 | n+=length; 113 | } 114 | r = LF_runs.size(); 115 | 116 | ulint curr_L_num = 0; 117 | ulint L_seen = 0; 118 | ulint F_seen = 0; 119 | for(size_t i = 0; i < L_block_indices.size(); ++i) 120 | { 121 | for(size_t j = 0; j < L_block_indices[i].size(); ++j) 122 | { 123 | ulint pos = L_block_indices[i][j]; 124 | 125 | LF_runs[pos].interval = curr_L_num; 126 | LF_runs[pos].offset = F_seen - L_seen; 127 | 128 | F_seen += LF_runs[pos].length; 129 | 130 | while (curr_L_num < r && F_seen >= L_seen + LF_runs[curr_L_num].length) 131 | { 132 | L_seen += LF_runs[curr_L_num].length; 133 | ++curr_L_num; 134 | } 135 | } 136 | } 137 | 138 | mem_stats(); 139 | } 140 | 141 | LF_table(std::ifstream &bwt, ulint max_run = 0) 142 | { 143 | bwt.clear(); 144 | bwt.seekg(0); 145 | 146 | LF_runs = vector(); 147 | vector> L_block_indices = vector>(ALPHABET_SIZE); 148 | 149 | char last_c; 150 | char c; 151 | ulint i = 0; 152 | r = 0; 153 | n = 0; 154 | size_t length = 0; 155 | while ((c = bwt.get()) != EOF) 156 | { 157 | if (c <= TERMINATOR) c = TERMINATOR; 158 | 159 | if (i != 0 && c != last_c) 160 | { 161 | if (max_run > 0 && length > max_run) { 162 | ulint max_splits = length/max_run; 163 | for (size_t split = 0; split < max_splits; ++split) 164 | { 165 | LF_runs.push_back({last_c, max_run, 0, 0}); 166 | L_block_indices[last_c].push_back(i++); 167 | } 168 | 169 | if (length % max_run != 0) 170 | { 171 | LF_runs.push_back({last_c, length % max_run, 0, 0}); 172 | L_block_indices[last_c].push_back(i++); 173 | } 174 | } 175 | else { 176 | 
LF_runs.push_back({last_c, length, 0, 0}); 177 | L_block_indices[last_c].push_back(i++); 178 | } 179 | n+=length; 180 | length = 0; 181 | } 182 | ++length; 183 | last_c = c; 184 | } 185 | // Step for final character 186 | if (max_run > 0 && length > max_run) { 187 | ulint max_splits = length/max_run; 188 | for (size_t split = 0; split < max_splits; ++split) 189 | { 190 | LF_runs.push_back({last_c, max_run, 0, 0}); 191 | L_block_indices[last_c].push_back(i++); 192 | } 193 | 194 | if (length % max_run != 0) 195 | { 196 | LF_runs.push_back({last_c, length % max_run, 0, 0}); 197 | L_block_indices[last_c].push_back(i++); 198 | } 199 | } 200 | else { 201 | LF_runs.push_back({last_c, length, 0, 0}); 202 | L_block_indices[last_c].push_back(i++); 203 | } 204 | n+=length; 205 | 206 | r = LF_runs.size(); 207 | 208 | ulint curr_L_num = 0; 209 | ulint L_seen = 0; 210 | ulint F_seen = 0; 211 | for(size_t i = 0; i < L_block_indices.size(); ++i) 212 | { 213 | for(size_t j = 0; j < L_block_indices[i].size(); ++j) 214 | { 215 | ulint pos = L_block_indices[i][j]; 216 | 217 | LF_runs[pos].interval = curr_L_num; 218 | LF_runs[pos].offset = F_seen - L_seen; 219 | 220 | F_seen += LF_runs[pos].length; 221 | 222 | while (curr_L_num < r && F_seen >= L_seen + LF_runs[curr_L_num].length) 223 | { 224 | L_seen += LF_runs[curr_L_num].length; 225 | ++curr_L_num; 226 | } 227 | } 228 | } 229 | 230 | mem_stats(); 231 | } 232 | 233 | LF_table(std::ifstream &heads, std::ifstream &lengths, sdsl::bit_vector splits) 234 | { 235 | heads.clear(); 236 | heads.seekg(0); 237 | lengths.clear(); 238 | lengths.seekg(0); 239 | 240 | LF_runs = vector(); 241 | vector> L_block_indices = vector>(ALPHABET_SIZE); 242 | 243 | char c; 244 | ulint i = 0; 245 | r = 0; 246 | n = 0; 247 | while ((c = heads.get()) != EOF) 248 | { 249 | size_t length = 0; 250 | lengths.read((char *)&length, 5); 251 | if (c <= TERMINATOR) c = TERMINATOR; 252 | 253 | size_t curr_len = 1; // Assume we start at a run-head 254 | for (size_t bwt_i = 
n+1; bwt_i < n + length; bwt_i++) 255 | { 256 | if (splits[bwt_i]) 257 | { 258 | LF_runs.push_back({c, curr_len, 0, 0}); 259 | L_block_indices[c].push_back(i++); 260 | curr_len = 0; 261 | } 262 | curr_len++; 263 | } 264 | LF_runs.push_back({c, curr_len, 0, 0}); 265 | L_block_indices[c].push_back(i++); 266 | 267 | } 268 | r = LF_runs.size(); 269 | 270 | ulint curr_L_num = 0; 271 | ulint L_seen = 0; 272 | ulint F_seen = 0; 273 | for(size_t i = 0; i < L_block_indices.size(); ++i) 274 | { 275 | for(size_t j = 0; j < L_block_indices[i].size(); ++j) 276 | { 277 | ulint pos = L_block_indices[i][j]; 278 | 279 | LF_runs[pos].interval = curr_L_num; 280 | LF_runs[pos].offset = F_seen - L_seen; 281 | 282 | F_seen += LF_runs[pos].length; 283 | 284 | while (curr_L_num < r && F_seen >= L_seen + LF_runs[curr_L_num].length) 285 | { 286 | L_seen += LF_runs[curr_L_num].length; 287 | ++curr_L_num; 288 | } 289 | } 290 | } 291 | 292 | mem_stats(); 293 | } 294 | 295 | const LF_row get(size_t i) 296 | { 297 | assert(i < LF_runs.size()); 298 | return LF_runs[i]; 299 | } 300 | 301 | ulint size() 302 | { 303 | return n; 304 | } 305 | 306 | ulint runs() 307 | { 308 | return r; 309 | } 310 | 311 | void invert(std::string outfile) 312 | { 313 | std::ofstream out(outfile); 314 | 315 | ulint interval = 0; 316 | ulint offset = 0; 317 | 318 | char c; 319 | while((c = get_char(interval)) > TERMINATOR) 320 | { 321 | out << c; 322 | std::pair pos = LF(interval, offset); 323 | interval = pos.first; 324 | offset = pos.second; 325 | } 326 | } 327 | 328 | /* 329 | * \param Run position (RLE intervals) 330 | * \param Current character offset in block 331 | * \return block position and offset of preceding character 332 | */ 333 | std::pair LF(ulint run, ulint offset) 334 | { 335 | ulint next_interval = LF_runs[run].interval; 336 | ulint next_offset = LF_runs[run].offset + offset; 337 | 338 | while (next_offset >= LF_runs[next_interval].length) 339 | { 340 | next_offset -= LF_runs[next_interval++].length; 
341 | } 342 | 343 | return std::make_pair(next_interval, next_offset); 344 | } 345 | 346 | uchar get_char(ulint i) 347 | { 348 | return get(i).character; 349 | } 350 | 351 | std::string get_file_extension() const 352 | { 353 | return ".LF_table"; 354 | } 355 | 356 | void mem_stats() 357 | { 358 | sdsl::nullstream ns; 359 | 360 | verbose("Memory consumption (bytes)."); 361 | verbose(" LF table: ", serialize(ns)); 362 | } 363 | 364 | void bwt_stats() 365 | { 366 | ulint n = size(); 367 | ulint r = runs(); 368 | verbose("Number of BWT equal-letter runs: r = ", r); 369 | verbose("Length of complete BWT: n = ", n); 370 | verbose("Rate n/r = ", double(n) / r); 371 | verbose("log2(r) = ", log2(double(r))); 372 | verbose("log2(n/r) = ", log2(double(n) / r)); 373 | } 374 | 375 | /* serialize to the ostream 376 | * \param out the ostream 377 | */ 378 | size_t serialize(std::ostream &out, sdsl::structure_tree_node *v = nullptr, std::string name ="") 379 | { 380 | sdsl::structure_tree_node *child = sdsl::structure_tree::add_child(v, name, sdsl::util::class_name(*this)); 381 | size_t written_bytes = 0; 382 | 383 | out.write((char *)&n, sizeof(n)); 384 | written_bytes += sizeof(n); 385 | 386 | out.write((char *)&r, sizeof(r)); 387 | written_bytes += sizeof(r); 388 | 389 | size_t size = LF_runs.size(); 390 | out.write((char *)&size, sizeof(size)); 391 | written_bytes += sizeof(size); 392 | 393 | for(size_t i = 0; i < size; ++i) 394 | { 395 | written_bytes += LF_runs[i].serialize(out, v, "LF_run_" + std::to_string(i)); 396 | } 397 | 398 | return written_bytes; 399 | } 400 | 401 | /* load from the istream 402 | * \param in the istream 403 | */ 404 | void load(std::istream &in) 405 | { 406 | size_t size; 407 | 408 | in.read((char *)&n, sizeof(n)); 409 | in.read((char *)&r, sizeof(r)); 410 | 411 | in.read((char *)&size, sizeof(size)); 412 | LF_runs = std::vector(size); 413 | for(size_t i = 0; i < size; ++i) 414 | { 415 | LF_runs[i].load(in); 416 | } 417 | } 418 | 419 | private: 420 | 
ulint n; // Length of BWT 421 | ulint r; // Runs of BWT 422 | 423 | vector LF_runs; 424 | }; 425 | 426 | #endif /* end of include guard: _LF_TABLE_HH */ -------------------------------------------------------------------------------- /include/r_index_f/block_table.hpp: -------------------------------------------------------------------------------- 1 | /* interval_block - Wrapper for vector holding interval blocks 2 | Copyright (C) 2021 Nathaniel Brown 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU General Public License for more details. 11 | You should have received a copy of the GNU General Public License 12 | along with this program. If not, see http://www.gnu.org/licenses/ . 13 | */ 14 | /*! 
15 | \file block_table.hpp 16 | \brief block_table.hpp Wrapper for vector holding interval blocks 17 | \author Nathaniel Brown 18 | \author Massimiliano Rossi 19 | \date 19/11/2021 20 | */ 21 | 22 | #ifndef _BLOCK_TABLE_HH 23 | #define _BLOCK_TABLE_HH 24 | 25 | #include 26 | #include 27 | 28 | #include 29 | #include 30 | #include 31 | 32 | using namespace sdsl; 33 | /* block_table: keeps the runs in fixed-size groups. Run i is stored in blocks[i / block_size] at row i % block_size. idx_samples is built from one flag per BWT position, set at the first position of every idx_rate-th run. */ 34 | template < ulint block_size = 1048576, // 2^20 35 | ulint idx_rate = 8, 36 | class idx_vec = idx_bit_vector<>, 37 | class block = interval_block<>> 38 | class block_table 39 | { 40 | private: 41 | /* blocks: the groups of runs; idx_samples: the sampled positions; r and n: LF_rows.runs() and LF_rows.size() at construction, or the values read by load(). */ 42 | vector blocks; 43 | idx_vec idx_samples; 44 | 45 | ulint r; 46 | ulint n; 47 | /* Length of run `run`, asked from its block at the run's row. */ 48 | ulint get_length(ulint run) 49 | { 50 | return get_block(run).get_length(row(run)); 51 | } 52 | 53 | ulint get_length(interval_pos pos) 54 | { 55 | return get_length(pos.run); 56 | } 57 | /* Row of a run inside its block: run % block_size. */ 58 | ulint row(ulint run) 59 | { 60 | return run % block_size; 61 | } 62 | 63 | ulint row(interval_pos pos) 64 | { 65 | return row(pos.run); 66 | } 67 | 68 | public: 69 | block_table() {} 70 | /* Builds the table from LF_rows. Each row's character, interval, length and offset are collected per block; a block is emitted every block_size runs and at the last run, and receives the prior_LF map accumulated from the blocks before it. The first time a character appears in a later block, earlier blocks that have no next_LF for it yet are back-filled. */ 71 | block_table(LF_table LF_rows) 72 | { 73 | r = LF_rows.runs(); 74 | n = LF_rows.size(); 75 | 76 | // Round up if quotient not whole 77 | ulint B_len = (r / block_size) + ((r % block_size) != 0); 78 | blocks = vector(B_len); 79 | 80 | std::vector block_chars = std::vector(); 81 | std::vector block_intervals = std::vector(); 82 | std::vector block_lens = std::vector(); 83 | std::vector block_offsets = std::vector(); 84 | 85 | // Where characters prior to block mapped to, in case we can't find that character when we LF 86 | std::unordered_map prior_LF = std::unordered_map(); 87 | 88 | std::vector sampled_runs = std::vector(); 89 | 90 | // Where the last character's position was wrt. 
current block 91 | std::unordered_map last_c_pos = std::unordered_map(); 92 | /* b: index of the block being filled; i: index of the run being read. */ 93 | ulint b = 0; 94 | ulint i = 0; 95 | while (i < r) 96 | { 97 | LF_table::LF_row curr = LF_rows.get(i); 98 | 99 | block_chars.push_back(curr.character); 100 | block_intervals.push_back(curr.interval); 101 | block_lens.push_back(curr.length); 102 | block_offsets.push_back(curr.offset); 103 | /* One flag per BWT position: true only at the first position of every idx_rate-th run. */ 104 | sampled_runs.push_back(i % idx_rate == 0); 105 | for (size_t j = 1; j < curr.length; j++) 106 | { 107 | sampled_runs.push_back(false); 108 | } 109 | /* First occurrence of this character inside the current block. */ 110 | if (!last_c_pos.count(curr.character)) { 111 | last_c_pos[curr.character] = block_chars.size() - 1; 112 | 113 | // For all blocks prior without set values to find the next c's mapping, loop back and set 114 | if (b > 0) 115 | { 116 | auto[k, d] = LF_rows.LF(i, 0); 117 | interval_pos next_c = interval_pos(k, d); 118 | 119 | ulint b_curr = b; 120 | while (b_curr > 0 && !blocks[b_curr-1].has_next_LF(curr.character)) 121 | { 122 | blocks[b_curr-1].set_next_LF(curr.character, next_c); 123 | --b_curr; 124 | } 125 | } 126 | } 127 | last_c_pos[curr.character] = block_chars.size() - 1; 128 | 129 | ++i; 130 | 131 | // End of block of intervals, update block table 132 | if (i % block_size == 0 || i >= r) 133 | { 134 | blocks[b] = block(block_chars, block_intervals, block_lens, block_offsets, prior_LF); 135 | 136 | for(auto const& [c, pos] : last_c_pos) 137 | { 138 | // Since pos is wrt. 
current block, add the runs seen prior before computing LF 139 | ulint run = b*block_size + pos; 140 | // Perform LF step from the last seen character in this run (which is at offset equal to last character, one minus length) 141 | auto[k, d] = LF_rows.LF(run, block_lens[pos] - 1); 142 | 143 | prior_LF[c] = interval_pos(k, d); 144 | } 145 | /* Start the next block with empty per-block vectors; prior_LF is kept and keeps accumulating. */ 146 | block_chars = std::vector(); 147 | block_intervals = std::vector(); 148 | block_lens = std::vector(); 149 | block_offsets = std::vector(); 150 | 151 | last_c_pos = std::unordered_map(); 152 | 153 | ++b; 154 | } 155 | } 156 | 157 | idx_samples = idx_vec(sampled_runs); 158 | } 159 | /* Block that holds run `run` (blocks[run / block_size]); asserts run < r. */ 160 | block& get_block(ulint run) 161 | { 162 | assert(run < r); 163 | return blocks[run / block_size]; 164 | } 165 | 166 | block& get_block(interval_pos pos) 167 | { 168 | return get_block(pos.run); 169 | } 170 | /* Character of a run, fetched from its block at the run's row. */ 171 | uchar get_char(ulint run) 172 | { 173 | return (uchar) get_block(run).get_char(row(run)); 174 | } 175 | 176 | uchar get_char(interval_pos pos) 177 | { 178 | return get_char(pos.run); 179 | } 180 | /* Returns r. */ 181 | ulint runs() 182 | { 183 | return r; 184 | } 185 | /* Returns n. */ 186 | ulint size() 187 | { 188 | return n; 189 | } 190 | /* LF, LF_prior and LF_next delegate to the block that holds pos (passing the row and offset, plus c for the last two) and normalise the answer with reduced_pos(). */ 191 | interval_pos LF(interval_pos pos) 192 | { 193 | return reduced_pos(get_block(pos).LF(row(pos), pos.offset)); 194 | } 195 | 196 | interval_pos LF_prior(interval_pos pos, uchar c) 197 | { 198 | return reduced_pos(get_block(pos).LF_prior(row(pos), pos.offset, c)); 199 | } 200 | 201 | interval_pos LF_next(interval_pos pos, uchar c) 202 | { 203 | return reduced_pos(get_block(pos).LF_next(row(pos), pos.offset, c)); 204 | } 205 | /* Returns pos unchanged when it is not set. Otherwise applies the owning block's reduce() repeatedly until the offset is smaller than the length of the run the position names. */ 206 | interval_pos reduced_pos(interval_pos pos) 207 | { 208 | if (!pos.is_set()) 209 | { 210 | return pos; 211 | } 212 | 213 | interval_pos curr = pos; 214 | while (curr.offset >= get_length(curr)) 215 | { 216 | curr = get_block(curr).reduce(curr, row(curr)); 217 | } 218 | 219 | return curr; 220 | } 221 | /* First position: run 0, offset 0. */ 222 | interval_pos begin() 223 | { 224 | return interval_pos(0, 0); 225 | } 226 | /* Last position: run r-1 at its last offset. Note that r-1 wraps around if r is 0 (r is ulint) - presumably never called on an empty table; confirm. */ 227 | 
interval_pos end() 228 | { 229 | return interval_pos(r-1, get_length(r-1)-1); 230 | } 231 | /* Starts from the sampled BWT position of run (pos.run / idx_rate) * idx_rate, adds the lengths of the runs between that run and pos.run, then adds pos.offset. */ 232 | // For a general interval position, return the idx wrt. the BWT 233 | ulint interval_to_idx(interval_pos pos) 234 | { 235 | ulint sample_rank = pos.run / idx_rate; 236 | ulint sample_run = sample_rank*idx_rate; 237 | ulint idx = idx_samples.sample(sample_rank); 238 | while (sample_run < pos.run) 239 | { 240 | idx += get_length(sample_run++); 241 | } 242 | idx += pos.offset; 243 | 244 | return idx; 245 | } 246 | /* NOTE(review): with the default idx_bit_vector, predecessor(idx) is a rank over the sample flags (pred(idx + 1)), not a BWT position. Verify that base*idx_rate is the intended run and that idx-base is the intended offset before relying on this function. */ 247 | // For a general index on the BWT, return the corresponding interval position 248 | interval_pos idx_to_interval(ulint idx) 249 | { 250 | assert(idx < n); 251 | // Get first sampled run idx equal to or greater than idx 252 | ulint base = idx_samples.predecessor(idx); 253 | // Offset is difference between predecessor and true value, reduce to find true position 254 | return reduced_pos(interval_pos(base*idx_rate, idx-base)); 255 | } 256 | 257 | /* serialize the block table to the ostream: writes n, r, the number of blocks, every block in order, then idx_samples 258 | * \param out the ostream 259 | * \param v structure-tree node handed to add_child and to every member * \param name label handed to add_child * \return number of bytes written. The node returned by add_child is kept in `child` but not used afterwards. */ 260 | size_t serialize(std::ostream &out, sdsl::structure_tree_node *v = nullptr, std::string name ="") 261 | { 262 | sdsl::structure_tree_node *child = sdsl::structure_tree::add_child(v, name, sdsl::util::class_name(*this)); 263 | size_t written_bytes = 0; 264 | 265 | out.write((char *)&n, sizeof(n)); 266 | written_bytes += sizeof(n); 267 | 268 | out.write((char *)&r, sizeof(r)); 269 | written_bytes += sizeof(r); 270 | 271 | size_t size = blocks.size(); 272 | out.write((char *)&size, sizeof(size)); 273 | written_bytes += sizeof(size); 274 | 275 | for(size_t i = 0; i < size; ++i) 276 | { 277 | written_bytes += blocks[i].serialize(out,v,"block_table_" + std::to_string(i)); 278 | } 279 | 280 | written_bytes += idx_samples.serialize(out, v, "idx_samples"); 281 | 282 | return written_bytes; 283 | } 284 | 285 | /* load the block table from the istream: reads n, r, the block count, each block and finally idx_samples, in the order serialize writes them 286 | * \param in the istream 287 | */ 288 | void load(std::istream &in) 289 | { 
290 | size_t size; 291 | 292 | in.read((char *)&n, sizeof(n)); 293 | in.read((char *)&r, sizeof(r)); 294 | 295 | in.read((char *)&size, sizeof(size)); 296 | blocks = std::vector(size); 297 | for(size_t i = 0; i < size; ++i) 298 | { 299 | blocks[i].load(in); 300 | } 301 | 302 | idx_samples.load(in); 303 | } 304 | }; 305 | 306 | #endif /* end of include guard: _BLOCK_TABLE_HH */ -------------------------------------------------------------------------------- /include/r_index_f/ds/ACGT_map.hpp: -------------------------------------------------------------------------------- 1 | /* ACGT_map Map which only accepts values for characters ACGT 2 | Copyright (C) 2021 Nathaniel Brown 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU General Public License for more details. 11 | You should have received a copy of the GNU General Public License 12 | along with this program. If not, see http://www.gnu.org/licenses/ . 13 | */ 14 | /*! 
15 | \file ACGT_map.hpp 16 | \brief ACGT_map.hpp maps only for characters ACGT 17 | \author Nathaniel Brown 18 | \date 18/12/2021 19 | */ 20 | 21 | #ifndef _ACGT_MAP_HH 22 | #define _ACGT_MAP_HH 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | #define ACGT_SIZE 4 30 | /* ACGT_map: fixed map from the four characters 'A', 'C', 'G', 'T' to a value of type T. One T slot exists per character; map_contains holds one flag per character (index 0..3 in the order A, C, G, T) telling whether that slot has been filled. Every other character is rejected or ignored. */ 31 | template 32 | class ACGT_map 33 | { 34 | private: 35 | bit_vector map_contains; 36 | T a_type; 37 | T c_type; 38 | T g_type; 39 | T t_type; 40 | /* Slot of character c, whether or not it has been filled. Throws std::out_of_range for any character other than A, C, G, T; the message carries the numeric code of the character. */ 41 | T& find(uchar c) 42 | { 43 | switch(c) 44 | { 45 | case 'A': return a_type; 46 | case 'C': return c_type; 47 | case 'G': return g_type; 48 | case 'T': return t_type; 49 | default: throw std::out_of_range("Symbol " + std::to_string(c) + " not in map"); 50 | } 51 | } 52 | /* True exactly for 'A', 'C', 'G' and 'T'. */ 53 | bool allowed(uchar c) 54 | { 55 | switch(c) 56 | { 57 | case 'A': return true; 58 | case 'C': return true; 59 | case 'G': return true; 60 | case 'T': return true; 61 | default: return false; 62 | } 63 | } 64 | /* Sets the filled-flag of c to val; silently does nothing for other characters. */ 65 | void set_contains(uchar c, bool val) 66 | { 67 | switch(c) 68 | { 69 | case 'A': 70 | map_contains[0] = val; 71 | break; 72 | case 'C': 73 | map_contains[1] = val; 74 | break; 75 | case 'G': 76 | map_contains[2] = val; 77 | break; 78 | case 'T': 79 | map_contains[3] = val; 80 | break; 81 | default: 82 | return; 83 | } 84 | } 85 | 86 | public: 87 | /* Empty map: all four flags cleared. */ 88 | ACGT_map() { 89 | map_contains = bit_vector(ACGT_SIZE, false); 90 | } 91 | /* Copies the entries of `map` whose key is one of A, C, G, T; all other entries are skipped. */ 92 | ACGT_map(std::unordered_map map) 93 | { 94 | map_contains = bit_vector(ACGT_SIZE, false); 95 | 96 | for(auto const& [c, val] : map) 97 | { 98 | if(allowed(c)) 99 | { 100 | set_contains(c, true); 101 | find(c) = val; 102 | } 103 | } 104 | } 105 | /* True when c is one of A, C, G, T and its slot has been filled. */ 106 | bool contains(uchar c) 107 | { 108 | switch(c) 109 | { 110 | case 'A': return map_contains[0]; 111 | case 'C': return map_contains[1]; 112 | case 'G': return map_contains[2]; 113 | case 'T': return map_contains[3]; 114 | default: return false; 115 | } 116 | } 117 | /* Stores kv.second under kv.first if the key is allowed and not present yet. Returns true when the value was stored, false otherwise (an existing value is never overwritten). */ 118 | bool insert(std::pair kv) 119 | { 120 | if (allowed(kv.first) && !contains(kv.first)) { 121 | 
set_contains(kv.first, true); 122 | find(kv.first) = kv.second; 123 | 124 | return true; 125 | } 126 | else { 127 | return false; 128 | } 129 | } 130 | /* Checked access: throws std::out_of_range unless contains(c). */ 131 | T& at(const uchar c) 132 | { 133 | if (!contains(c)) 134 | { 135 | throw std::out_of_range("Symbol " + std::to_string(c) + " not in map"); 136 | } 137 | 138 | return find(c); 139 | } 140 | /* Unchecked access to the slot of c. Unlike insert() it does not mark the slot as filled, and it still throws std::out_of_range (from find) for characters other than A, C, G, T. */ 141 | T& operator[](const uchar c) 142 | { 143 | return find(c); 144 | } 145 | 146 | /* serialize the structure to the ostream: writes map_contains, then the A, C, G and T slots (all four, filled or not) 147 | * \param out the ostream 148 | * \param v structure-tree node handed to add_child and to every member * \param name label handed to add_child * \return number of bytes written */ 149 | size_t serialize(std::ostream &out, sdsl::structure_tree_node *v = nullptr, std::string name = "") const 150 | { 151 | sdsl::structure_tree_node *child = sdsl::structure_tree::add_child(v, name, sdsl::util::class_name(*this)); 152 | size_t written_bytes = 0; 153 | 154 | written_bytes += map_contains.serialize(out, v, "map_contains"); 155 | written_bytes += a_type.serialize(out, v, "a_type"); 156 | written_bytes += c_type.serialize(out, v, "c_type"); 157 | written_bytes += g_type.serialize(out, v, "g_type"); 158 | written_bytes += t_type.serialize(out, v, "t_type"); 159 | 160 | return written_bytes; 161 | } 162 | 163 | /* load the structure from the istream, in the order serialize writes it 164 | * \param in the istream 165 | */ 166 | void load(std::istream &in) 167 | { 168 | map_contains.load(in); 169 | a_type.load(in); 170 | c_type.load(in); 171 | g_type.load(in); 172 | t_type.load(in); 173 | } 174 | }; 175 | 176 | #endif /* end of include guard: _ACGT_MAP_HH */ -------------------------------------------------------------------------------- /include/r_index_f/ds/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(DS_SOURCES interval_block.hpp interval_pos.hpp idx_bit_vector.hpp idx_list.hpp symbol_map.hpp intervals_rank_w.hpp base_bv.hpp base_interpolate.hpp base_sample.hpp heads_wt_w.hpp ACGT_map.hpp heads_bv_w.hpp) 2 | add_library(ds OBJECT ${DS_SOURCES}) 3 | target_link_libraries(ds common sdsl) 
-------------------------------------------------------------------------------- /include/r_index_f/ds/base_bv.hpp: -------------------------------------------------------------------------------- 1 | /* base_bv - Holds interval as base and diff computed using bit vector 2 | Copyright (C) 2021 Nathaniel Brown 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU General Public License for more details. 11 | You should have received a copy of the GNU General Public License 12 | along with this program. If not, see http://www.gnu.org/licenses/ . 13 | */ 14 | /*! 15 | \file base_bv.hpp 16 | \brief base_bv.hpp Returns intervals by calculating a difference from a stored base (intervals non-decreasing sequence wrt. 
character) 17 | \author Nathaniel Brown 18 | \date 18/12/2021 19 | */ 20 | 21 | #ifndef _BASE_BV_HH 22 | #define _BASE_BV_HH 23 | 24 | #include 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | using namespace sdsl; 32 | /* base_bv: stores a base value plus a sequence of diffs in unary. Each diff d becomes d zero bits followed by a one bit in diff_bv, so the (rank+1)-th one bit sits at position diffs[0] + ... + diffs[rank] + rank. */ 33 | template 34 | class base_bv 35 | { 36 | private: 37 | typedef typename bv_t::select_1_type bv_select_1; 38 | 39 | ulint base; 40 | bv_t diff_bv; 41 | bv_select_1 diff_select; 42 | 43 | public: 44 | 45 | base_bv() {} 46 | /* \param b the base value * \param diffs the diffs to encode, in order. Builds the unary bit sequence, converts it with bool_to_bit_vec and attaches the select structure to diff_bv. */ 47 | base_bv(ulint b, std::vector diffs) 48 | { 49 | std::vector bit_diff = std::vector(); 50 | for(size_t i = 0; i < diffs.size(); ++i) 51 | { 52 | ulint diff = diffs[i]; 53 | while (diff > 0) { 54 | bit_diff.push_back(false); 55 | --diff; 56 | } 57 | bit_diff.push_back(true); 58 | } 59 | 60 | base = b; 61 | diff_bv = bool_to_bit_vec(bit_diff); 62 | diff_select = bv_select_1(&diff_bv); 63 | } 64 | /* Returns base plus the sum of diffs[0..rank]: the position of the (rank+1)-th one bit minus the rank one bits that precede it. rank is 0-based. */ 65 | ulint get(ulint rank) const 66 | { 67 | return base + diff_select(rank+1) - rank; 68 | } 69 | 70 | /* serialize the structure to the ostream: writes base and diff_bv; the select structure is not written 71 | * \param out the ostream 72 | * \param v structure-tree node handed to add_child and to diff_bv * \param name label handed to add_child * \return number of bytes written */ 73 | size_t serialize(std::ostream &out, sdsl::structure_tree_node *v = nullptr, std::string name = "") const 74 | { 75 | sdsl::structure_tree_node *child = sdsl::structure_tree::add_child(v, name, sdsl::util::class_name(*this)); 76 | size_t written_bytes = 0; 77 | 78 | out.write((char *)&base, sizeof(base)); 79 | written_bytes += sizeof(base); 80 | 81 | written_bytes += diff_bv.serialize(out, v, "diff_bv"); 82 | 83 | return written_bytes; 84 | } 85 | 86 | /* load the structure from the istream: reads base and diff_bv, then rebuilds the select structure on the loaded bit vector 87 | * \param in the istream 88 | */ 89 | void load(std::istream &in) 90 | { 91 | in.read((char *)&base, sizeof(base)); 92 | diff_bv.load(in); 93 | diff_select= bv_select_1(&diff_bv); 94 | } 95 | }; 96 | 97 | #endif /* end of include guard: _BASE_BV_HH */ -------------------------------------------------------------------------------- /include/r_index_f/ds/base_interpolate.hpp: 
-------------------------------------------------------------------------------- 1 | /* base_interpolate - Holds interval as base and diff computed using dac/interpolative coding 2 | Copyright (C) 2021 Nathaniel Brown 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU General Public License for more details. 11 | You should have received a copy of the GNU General Public License 12 | along with this program. If not, see http://www.gnu.org/licenses/ . 13 | */ 14 | /*! 15 | \file base_interpolate.hpp 16 | \brief base_interpolate.hpp Returns interval by computing from a base and diff retrieved from interpolation, sampling absolute diff positions 17 | \author Nathaniel Brown 18 | \date 18/12/2021 19 | */ 20 | 21 | #ifndef _BASE_INTERPOLATE_HH 22 | #define _BASE_INTERPOLATE_HH 23 | 24 | #include 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | using namespace sdsl; 32 | /* base_interpolate: stores a base value and the running sums of a diff sequence. The running sum is kept explicitly at every sample_rate-th index and at the last index (sampled_diffs). For every other index only the distance to a linear interpolation between the neighbouring samples is kept: its magnitude in interp_diffs and its sign in interp_neg. */ 33 | template < ulint sample_rate = 16, 34 | class bv_t = bit_vector, 35 | class vec_t = dac_vector_dp<> > 36 | class base_interpolate 37 | { 38 | private: 39 | ulint base; 40 | std::vector sampled_diffs; 41 | bv_t interp_neg; 42 | vec_t interp_diffs; 43 | /* Running sum of the diffs up to and including index rank (0-based). */ 44 | ulint get_diff(ulint rank) const 45 | { 46 | // If value is sampled (at sample rate or last value) then return it 47 | if (rank % sample_rate == 0) return sampled_diffs[rank/sample_rate]; 48 | if (rank == interp_diffs.size() - 1) return sampled_diffs[sampled_diffs.size() - 1]; 49 | 50 | // Find the last and next samples for the given position 51 | ulint x = sampled_diffs[rank/sample_rate]; 52 | ulint z = 
sampled_diffs[rank/sample_rate+1]; 53 | /* NOTE(review): sign is an int multiplied with a stored magnitude; when sign is -1 this presumably relies on unsigned wrap-around to subtract - confirm against the element type of vec_t. */ 54 | int sign = (interp_neg[rank]) ? -1 : 1; 55 | // Add weighted sum of samples to stored encoding, which returns y 56 | return sign*interp_diffs[rank] + (x + ((z - x)*(rank - sample_rate*(rank/sample_rate))/sample_rate)); 57 | } 58 | 59 | public: 60 | 61 | base_interpolate() {} 62 | /* \param b the base value * \param diffs the diffs, in order. Accumulates the running sum, stores it as a sample at every sample_rate-th index and at the last index, and whenever a new sample (other than the first) is stored encodes the indices between it and the previous sample. */ 63 | base_interpolate(ulint b, std::vector diffs) 64 | { 65 | base = b; 66 | 67 | ulint absolute_diff = 0; 68 | vector encoding = vector(diffs.size(), 0); 69 | interp_neg = bv_t(diffs.size(), 0); 70 | vector full_diffs = vector(diffs.size()); 71 | for(size_t i = 0; i < diffs.size(); ++i) 72 | { 73 | absolute_diff += diffs[i]; 74 | full_diffs[i] = absolute_diff; 75 | if (i % sample_rate == 0 || i == diffs.size() - 1) 76 | { 77 | sampled_diffs.push_back(absolute_diff); 78 | if (i != 0) 79 | { 80 | // Last sample 81 | ulint x = sampled_diffs[sampled_diffs.size() - 2]; 82 | // Next sample 83 | ulint z = absolute_diff; 84 | 85 | // If not at last entry, the last sampled is always a distance sample_rate away, otherwise the distance past the last sample 86 | ulint last_dist = i % sample_rate; 87 | if (last_dist == 0) last_dist = sample_rate; 88 | 89 | // Iterate between last and next sample 90 | for (ulint j = i - last_dist + 1; j < i; ++j) 91 | { 92 | // current diff 93 | ulint y = full_diffs[j]; 94 | // Get the difference from the weighted average of the next/last samples and the current diff 95 | long int encode = y - (x + ((z - x)*(j - sample_rate*(j/sample_rate))/sample_rate)); 96 | if (encode < 0) { 97 | interp_neg[j] = true; 98 | encode*=-1; 99 | } 100 | 101 | encoding[j] = encode; 102 | } 103 | } 104 | } 105 | } 106 | 107 | interp_diffs = vec_t(encoding); 108 | } 109 | /* Returns base plus the running sum of the diffs up to index rank. */ 110 | ulint get(ulint rank) const 111 | { 112 | return base + get_diff(rank); 113 | } 114 | 115 | /* serialize the structure to the ostream: writes base, the number of samples, every sample, then interp_neg and interp_diffs 116 | * \param out the ostream 117 | * \param v structure-tree node handed to add_child and to the sdsl members * \param name label handed to add_child * \return number of bytes written */ 118 | size_t serialize(std::ostream &out, sdsl::structure_tree_node *v = nullptr, std::string name = 
"") const 119 | { 120 | sdsl::structure_tree_node *child = sdsl::structure_tree::add_child(v, name, sdsl::util::class_name(*this)); 121 | size_t written_bytes = 0; 122 | 123 | out.write((char *)&base, sizeof(base)); 124 | written_bytes += sizeof(base); 125 | 126 | size_t size = sampled_diffs.size(); 127 | out.write((char *)&size, sizeof(size)); 128 | written_bytes += sizeof(size); 129 | for(size_t i = 0; i < size; ++i) 130 | { 131 | out.write((char *)&sampled_diffs[i], sizeof(sampled_diffs[i])); 132 | written_bytes += sizeof(sampled_diffs[i]); 133 | } 134 | 135 | written_bytes += interp_neg.serialize(out, v, "interp_neg"); 136 | written_bytes += interp_diffs.serialize(out, v, "interp_diffs"); 137 | 138 | return written_bytes; 139 | } 140 | 141 | /* load the structure from the istream, in the order serialize writes it 142 | * \param in the istream 143 | */ 144 | void load(std::istream &in) 145 | { 146 | in.read((char *)&base, sizeof(base)); 147 | 148 | size_t size; 149 | in.read((char *)&size, sizeof(size)); 150 | sampled_diffs = std::vector(size); 151 | for(size_t i = 0; i < size; ++i) 152 | { 153 | in.read((char *)&sampled_diffs[i], sizeof(sampled_diffs[i])); 154 | } 155 | 156 | interp_neg.load(in); 157 | interp_diffs.load(in); 158 | } 159 | }; 160 | 161 | #endif /* end of include guard: _BASE_INTERPOLATE_HH */ -------------------------------------------------------------------------------- /include/r_index_f/ds/base_sample.hpp: -------------------------------------------------------------------------------- 1 | /* base_sample - Holds interval as base and diff computed using dac and sampled diffs 2 | Copyright (C) 2021 Nathaniel Brown 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 
7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU General Public License for more details. 11 | You should have received a copy of the GNU General Public License 12 | along with this program. If not, see http://www.gnu.org/licenses/ . 13 | */ 14 | /*! 15 | \file base_sample.hpp 16 | \brief base_sample.hpp Returns interval by computing from a base and diff, sampling absolute diff positions 17 | \author Nathaniel Brown 18 | \date 18/12/2021 19 | */ 20 | 21 | #ifndef _BASE_SAMPLE_HH 22 | #define _BASE_SAMPLE_HH 23 | 24 | #include 25 | 26 | #include 27 | #include 28 | #include 29 | 30 | using namespace sdsl; 31 | /* base_sample: stores a base value, every individual diff (partial_diffs) and the running sum of the diffs at every sample_rate-th index (sampled_diffs). A lookup starts at the nearest sample at or before the index and adds the individual diffs after it. */ 32 | template < ulint sample_rate = 10, 33 | class vec_t = dac_vector<> > 34 | class base_sample 35 | { 36 | private: 37 | ulint base; 38 | std::vector sampled_diffs; 39 | vec_t partial_diffs; 40 | /* Running sum of the diffs up to and including index rank (0-based): the sample for index (rank / sample_rate) * sample_rate plus the diffs at the indices after it up to rank. */ 41 | ulint get_diff(ulint rank) const 42 | { 43 | ulint sample_rank = rank / sample_rate; 44 | ulint sample_next = sample_rank*sample_rate + 1; 45 | ulint diff = sampled_diffs[sample_rank]; 46 | while (sample_next <= rank) 47 | { 48 | diff += partial_diffs[sample_next++]; 49 | } 50 | 51 | return diff; 52 | } 53 | 54 | public: 55 | 56 | base_sample() {} 57 | /* \param b the base value * \param diffs the diffs, in order. Records the running sum at indices 0, sample_rate, 2*sample_rate, ... and keeps all diffs in partial_diffs. */ 58 | base_sample(ulint b, std::vector diffs) 59 | { 60 | base = b; 61 | 62 | ulint absolute_diff = 0; 63 | for(size_t i = 0; i < diffs.size(); ++i) 64 | { 65 | absolute_diff += diffs[i]; 66 | if (i % sample_rate == 0) 67 | { 68 | sampled_diffs.push_back(absolute_diff); 69 | } 70 | } 71 | 72 | partial_diffs = vec_t(diffs); 73 | } 74 | /* Returns base plus the running sum of the diffs up to index rank. */ 75 | ulint get(ulint rank) const 76 | { 77 | return base + get_diff(rank); 78 | } 79 | 80 | /* serialize the structure to the ostream: writes base, the number of samples, every sample, then partial_diffs 81 | * \param out the ostream 82 | * \param v structure-tree node handed to add_child and to partial_diffs * \param name label handed to add_child * \return number of bytes written */ 83 | size_t serialize(std::ostream &out, sdsl::structure_tree_node *v = nullptr, std::string name = "") const 84 | { 85 | sdsl::structure_tree_node *child = 
sdsl::structure_tree::add_child(v, name, sdsl::util::class_name(*this)); 86 | size_t written_bytes = 0; 87 | 88 | out.write((char *)&base, sizeof(base)); 89 | written_bytes += sizeof(base); 90 | 91 | size_t size = sampled_diffs.size(); 92 | out.write((char *)&size, sizeof(size)); 93 | written_bytes += sizeof(size); 94 | for(size_t i = 0; i < size; ++i) 95 | { 96 | out.write((char *)&sampled_diffs[i], sizeof(sampled_diffs[i])); 97 | written_bytes += sizeof(sampled_diffs[i]); 98 | } 99 | 100 | written_bytes += partial_diffs.serialize(out, v, "partial_diffs"); 101 | 102 | return written_bytes; 103 | } 104 | 105 | /* load the structure from the istream, in the order serialize writes it 106 | * \param in the istream 107 | */ 108 | void load(std::istream &in) 109 | { 110 | in.read((char *)&base, sizeof(base)); 111 | 112 | size_t size; 113 | in.read((char *)&size, sizeof(size)); 114 | sampled_diffs = std::vector(size); 115 | for(size_t i = 0; i < size; ++i) 116 | { 117 | in.read((char *)&sampled_diffs[i], sizeof(sampled_diffs[i])); 118 | } 119 | 120 | partial_diffs.load(in); 121 | } 122 | }; 123 | 124 | #endif /* end of include guard: _BASE_SAMPLE_HH */ -------------------------------------------------------------------------------- /include/r_index_f/ds/heads_bv_w.hpp: -------------------------------------------------------------------------------- 1 | /* heads_bv_w - Wrapper to store the heads in full bit vectors (access is bad, trying all bit vectors) 2 | Copyright (C) 2021 Nathaniel Brown 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU General Public License for more details. 
11 | You should have received a copy of the GNU General Public License 12 | along with this program. If not, see http://www.gnu.org/licenses/ . 13 | */ 14 | /*! 15 | \file heads_bv_w.hpp 16 | \brief heads_bv_w Wrapper to store the heads in a wavelet tree 17 | \author Nathaniel Brown 18 | \date 18/12/2021 19 | */ 20 | 21 | #ifndef _HEADS_BV_W_HH 22 | #define _HEADS_BV_W_HH 23 | 24 | #include 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | using namespace sdsl; 33 | 34 | template< class bv_t = bit_vector, 35 | template class char_map_t = ACGT_map > 36 | class heads_bv_w 37 | { 38 | private: 39 | typedef typename bv_t::select_1_type bv_select_1; 40 | typedef typename bv_t::rank_1_type bv_rank_1; 41 | 42 | struct rank_select_bv { 43 | bv_t bv; 44 | bv_select_1 select; 45 | bv_rank_1 rank; 46 | 47 | rank_select_bv() {} 48 | 49 | rank_select_bv(ulint size) 50 | { 51 | bv = bv_t(size, false); 52 | } 53 | 54 | size_t serialize(std::ostream &out, sdsl::structure_tree_node *v = nullptr, std::string name = "") const 55 | { 56 | sdsl::structure_tree_node *child = sdsl::structure_tree::add_child(v, name, sdsl::util::class_name(*this)); 57 | size_t written_bytes = 0; 58 | 59 | written_bytes += bv.serialize(out, v, "bv"); 60 | 61 | return written_bytes; 62 | } 63 | 64 | void load(std::istream &in) 65 | { 66 | bv.load(in); 67 | select = bv_select_1(&bv); 68 | rank = bv_rank_1(&bv); 69 | } 70 | }; 71 | 72 | typedef char_map_t bv_map; 73 | 74 | bv_map bit_vecs; 75 | ulint bv_size; 76 | 77 | uchar scan(ulint idx) 78 | { 79 | for (size_t i = 0; i < ALPHABET_SIZE; ++i) 80 | { 81 | if(bit_vecs.contains(i)) 82 | { 83 | if (bit_vecs[i].bv[idx]) 84 | { 85 | return i; 86 | } 87 | } 88 | } 89 | 90 | return 0; 91 | } 92 | 93 | public: 94 | heads_bv_w() {} 95 | 96 | heads_bv_w(std::vector chars) { 97 | bit_vecs = bv_map(); 98 | bv_size = chars.size(); 99 | 100 | for (size_t i = 0; i < chars.size(); ++i) 101 | { 102 | uchar c = chars[i]; 103 | if 
(!bit_vecs.contains(c)) 104 | { 105 | bit_vecs.insert(std::pair(c, rank_select_bv(bv_size))); 106 | } 107 | 108 | if (bit_vecs.contains(c)) 109 | { 110 | bit_vecs[c].bv[i] = true; 111 | } 112 | } 113 | 114 | for (size_t i = 0; i < ALPHABET_SIZE; ++i) 115 | { 116 | if(bit_vecs.contains(i)) 117 | { 118 | bit_vecs[i].select = bv_select_1(&bit_vecs[i].bv); 119 | bit_vecs[i].rank = bv_rank_1(&bit_vecs[i].bv); 120 | } 121 | } 122 | } 123 | 124 | ulint rank(ulint idx, uchar c) 125 | { 126 | return bit_vecs[c].rank(idx); 127 | } 128 | 129 | ulint select(ulint idx, uchar c) 130 | { 131 | return bit_vecs[c].select(idx); 132 | } 133 | 134 | std::pair inverse_select(ulint idx) 135 | { 136 | uchar c = scan(idx); 137 | ulint r = 0; 138 | if (bit_vecs.contains(c)) 139 | { 140 | r = rank(idx, c); 141 | } 142 | return std::pair(r, c); 143 | } 144 | 145 | uchar operator[](size_t idx) { 146 | scan(idx); 147 | } 148 | 149 | ulint size() 150 | { 151 | return bv_size;; 152 | } 153 | 154 | /* serialize the structure to the ostream 155 | * \param out the ostream 156 | */ 157 | size_t serialize(std::ostream &out, sdsl::structure_tree_node *v = nullptr, std::string name = "") const 158 | { 159 | sdsl::structure_tree_node *child = sdsl::structure_tree::add_child(v, name, sdsl::util::class_name(*this)); 160 | size_t written_bytes = 0; 161 | 162 | written_bytes += bit_vecs.serialize(out, v, "symbols"); 163 | out.write((char *)&bv_size, sizeof(bv_size)); 164 | written_bytes += sizeof(bv_size); 165 | 166 | return written_bytes; 167 | } 168 | 169 | /* load the structure from the istream 170 | * \param in the istream 171 | */ 172 | void load(std::istream &in) 173 | { 174 | bit_vecs.load(in); 175 | in.read((char *)&bv_size, sizeof(bv_size)); 176 | } 177 | }; 178 | 179 | #endif /* end of include guard: _HEADS_WT_W_HH */ -------------------------------------------------------------------------------- /include/r_index_f/ds/heads_wt_w.hpp: 
-------------------------------------------------------------------------------- 1 | /* heads_wt_w - Wrapper to store the heads in a wavelet tree 2 | Copyright (C) 2021 Nathaniel Brown 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU General Public License for more details. 11 | You should have received a copy of the GNU General Public License 12 | along with this program. If not, see http://www.gnu.org/licenses/ . 13 | */ 14 | /*! 15 | \file heads_wt_w.hpp 16 | \brief heads_wt_w Wrapper to store the heads in a wavelet tree 17 | \author Nathaniel Brown 18 | \date 18/12/2021 19 | */ 20 | 21 | #ifndef _HEADS_WT_W_HH 22 | #define _HEADS_WT_W_HH 23 | 24 | #include 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | using namespace sdsl; 32 | /* heads_wt_w: thin wrapper that keeps a character sequence in a single wt_t object and forwards rank, select, inverse_select, element access, size, serialize and load to it. */ 33 | template< class wt_t = wt_huff<> > 34 | class heads_wt_w 35 | { 36 | private: 37 | wt_t symbols; 38 | 39 | public: 40 | heads_wt_w() {} 41 | /* \param chars the character sequence. NOTE(review): the sequence is handed to construct_im through c_str(); if construct_im treats that pointer as a NUL-terminated string, a 0 byte inside chars would cut the input short - confirm against the sdsl overload that is selected. */ 42 | heads_wt_w(std::vector chars) { 43 | construct_im(symbols, std::string(chars.begin(), chars.end()).c_str(), 1); 44 | } 45 | /* Forwards to symbols.rank(idx, c). */ 46 | ulint rank(ulint idx, uchar c) 47 | { 48 | return symbols.rank(idx, c); 49 | } 50 | /* Forwards to symbols.select(idx, c). */ 51 | ulint select(ulint idx, uchar c) 52 | { 53 | return symbols.select(idx, c); 54 | } 55 | /* Forwards to symbols.inverse_select(idx). */ 56 | std::pair inverse_select(ulint idx) 57 | { 58 | return symbols.inverse_select(idx); 59 | } 60 | /* Element i of the stored sequence, as returned by symbols[i]. */ 61 | uchar operator[](size_t i) const { 62 | return symbols[i]; 63 | } 64 | /* Forwards to symbols.size(). */ 65 | ulint size() 66 | { 67 | return symbols.size(); 68 | } 69 | 70 | /* serialize the structure to the ostream: writes symbols 71 | * \param out the ostream 72 | * \param v structure-tree node handed to add_child and to symbols * \param name label handed to add_child * \return number of bytes written */ 73 | size_t 
serialize(std::ostream &out, sdsl::structure_tree_node *v = nullptr, std::string name = "") const 74 | { 75 | sdsl::structure_tree_node *child = sdsl::structure_tree::add_child(v, name, sdsl::util::class_name(*this)); 76 | size_t written_bytes = 0; 77 | 78 | written_bytes += symbols.serialize(out, v, "symbols"); 79 | 80 | return written_bytes; 81 | } 82 | 83 | /* load the structure from the istream: reads symbols 84 | * \param in the istream 85 | */ 86 | void load(std::istream &in) 87 | { 88 | symbols.load(in); 89 | } 90 | }; 91 | 92 | #endif /* end of include guard: _HEADS_WT_W_HH */ -------------------------------------------------------------------------------- /include/r_index_f/ds/idx_bit_vector.hpp: -------------------------------------------------------------------------------- 1 | /* idx_bit_vector.hpp - Sampling idx using bit vector approach 2 | Copyright (C) 2021 Nathaniel Brown 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU General Public License for more details. 11 | You should have received a copy of the GNU General Public License 12 | along with this program. If not, see http://www.gnu.org/licenses/ . 13 | */ 14 | /*! 
15 | \file idx_bit_vector.hpp 16 | \brief idx_bit_vector.hpp template class wrapper used to access idx sampling using bit vector approach 17 | \author Nathaniel Brown 18 | \date 16/12/2021 19 | */ 20 | 21 | #ifndef _IDX_BV_HH 22 | #define _IDX_BV_HH 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | template < class bit_vec = sd_vector<> > 31 | class idx_bit_vector 32 | { 33 | private: 34 | typedef typename bit_vec::rank_1_type idx_rank; 35 | typedef typename bit_vec::select_1_type idx_select; 36 | 37 | bit_vec samples; 38 | idx_rank pred; 39 | idx_select run_sample; 40 | 41 | public: 42 | 43 | idx_bit_vector() {} 44 | 45 | idx_bit_vector(vector vec) { 46 | samples = bool_to_bit_vec(vec); 47 | pred = idx_rank(&samples); 48 | run_sample = idx_select(&samples); 49 | } 50 | 51 | ulint sample(ulint rank) 52 | { 53 | return run_sample(rank + 1); 54 | } 55 | 56 | ulint predecessor(ulint idx) { 57 | return pred(idx + 1); 58 | } 59 | 60 | /* serialize the structure to the ostream 61 | * \param out the ostream 62 | */ 63 | size_t serialize(std::ostream &out, sdsl::structure_tree_node *v = nullptr, std::string name = "") const 64 | { 65 | sdsl::structure_tree_node *child = sdsl::structure_tree::add_child(v, name, sdsl::util::class_name(*this)); 66 | size_t written_bytes = 0; 67 | 68 | written_bytes += samples.serialize(out, v, "idx_bit_vec"); 69 | 70 | return written_bytes; 71 | } 72 | 73 | /* load the structure from the istream 74 | * \param in the istream 75 | */ 76 | void load(std::istream &in) 77 | { 78 | samples.load(in); 79 | pred = idx_rank(&samples); 80 | run_sample = idx_select(&samples); 81 | } 82 | }; 83 | 84 | #endif /* end of include guard: _IDX_BV_HH */ -------------------------------------------------------------------------------- /include/r_index_f/ds/idx_list.hpp: -------------------------------------------------------------------------------- 1 | /* idx_list.hpp - Sampling idx using bexplicit list (vector) 2 | Copyright 
(C) 2021 Nathaniel Brown 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU General Public License for more details. 11 | You should have received a copy of the GNU General Public License 12 | along with this program. If not, see http://www.gnu.org/licenses/ . 13 | */ 14 | /*! 15 | \file idx_list.hpp 16 | \brief idx_list.hpp template class wrapper used to access idx sampling using list approach 17 | \author Nathaniel Brown 18 | \date 16/12/2021 19 | */ 20 | 21 | #ifndef _IDX_LIST_HH 22 | #define _IDX_LIST_HH 23 | 24 | #include 25 | 26 | #include 27 | #include 28 | 29 | class idx_list 30 | { 31 | private: 32 | std::vector samples; 33 | 34 | public: 35 | 36 | idx_list() {} 37 | 38 | idx_list(std::vector vec) { 39 | samples = std::vector(); 40 | 41 | for(size_t i = 0; i < vec.size(); ++i) 42 | if (vec[i]) samples.push_back(i); 43 | } 44 | 45 | ulint sample(ulint rank) 46 | { 47 | assert(rank < samples.size()); 48 | return samples[rank]; 49 | } 50 | 51 | ulint predecessor(ulint idx) { 52 | // Get first element equal to or greater than idx (runs are sorted, so O(lg n) using binary search) 53 | auto pred = std::lower_bound(samples.begin(), samples.end(), idx); 54 | if(*pred != idx) 55 | { 56 | // Index in sampling array of predecessor (minus 1, since it is first element greater) 57 | pred -= 1; 58 | } 59 | 60 | std::distance(samples.begin(), pred); 61 | } 62 | 63 | /* serialize the structure to the ostream 64 | * \param out the ostream 65 | */ 66 | size_t serialize(std::ostream &out, sdsl::structure_tree_node *v = nullptr, std::string name = "") const 67 | { 68 | 
sdsl::structure_tree_node *child = sdsl::structure_tree::add_child(v, name, sdsl::util::class_name(*this)); 69 | size_t written_bytes = 0; 70 | 71 | size_t size = samples.size(); 72 | out.write((char *)&size, sizeof(size)); 73 | written_bytes += sizeof(size); 74 | 75 | for(size_t i = 0; i < size; ++i) 76 | { 77 | out.write((char *)&samples[i], sizeof(samples[i])); 78 | written_bytes += sizeof(samples[i]); 79 | } 80 | 81 | return written_bytes; 82 | } 83 | 84 | /* load the structure from the istream 85 | * \param in the istream 86 | */ 87 | void load(std::istream &in) 88 | { 89 | size_t size; 90 | in.read((char *)&size, sizeof(size)); 91 | samples = std::vector(size); 92 | for(size_t i = 0; i < size; ++i) 93 | { 94 | in.read((char *)&samples[i], sizeof(samples[i])); 95 | } 96 | } 97 | }; 98 | 99 | #endif /* end of include guard: _IDX_LIST_HH */ -------------------------------------------------------------------------------- /include/r_index_f/ds/interval_block.hpp: -------------------------------------------------------------------------------- 1 | /* interval_block - Compresses intervals into a block using SDSL/template 2 | Copyright (C) 2021 Nathaniel Brown 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU General Public License for more details. 11 | You should have received a copy of the GNU General Public License 12 | along with this program. If not, see http://www.gnu.org/licenses/ . 13 | */ 14 | /*! 
15 | \file interval_block.hpp 16 | \brief interval_block.hpp Compact representation of intervals as a block 17 | \author Nathaniel Brown 18 | \author Massimiliano Rossi 19 | \date 09/07/2020 20 | */ 21 | 22 | #ifndef _I_BLOCK_HH 23 | #define _I_BLOCK_HH 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | #include 33 | #include 34 | 35 | using namespace sdsl; 36 | 37 | template < class heads_t = heads_wt_w<>, 38 | class intervals_t = intervals_rank_w<>, 39 | class lengths_t = dac_vector_dp<>, 40 | class offsets_t = dac_vector_dp<>, 41 | template class char_map_t = ACGT_map > 42 | class interval_block 43 | { 44 | private: 45 | typedef char_map_t pos_map; 46 | 47 | // Heads supporting rank/select/access for character c 48 | heads_t heads; 49 | // Intervals which return the mapping given a character c and its rank in block 50 | intervals_t intervals; 51 | // Lengths supporting access 52 | lengths_t lengths; 53 | // Offsets supporting access 54 | offsets_t offsets; 55 | 56 | // Stores prior and next block LF mapping for a character (for block overruns) 57 | pos_map prior_block_LF; 58 | pos_map next_block_LF; 59 | 60 | // For a row k with offset d, character c of rank c_rank, compute it's LF 61 | interval_pos LF(ulint k, ulint d, uchar c, ulint c_rank) 62 | { 63 | ulint q = get_interval(c, c_rank); 64 | ulint d_prime = d + get_offset(k); 65 | 66 | return interval_pos(q, d_prime); 67 | } 68 | 69 | public: 70 | interval_block() {} 71 | 72 | interval_block(std::vector chars, std::vector ints, std::vector lens, std::vector offs, std::unordered_map prior_LF) { 73 | heads = heads_t(chars); 74 | intervals = intervals_t(chars, ints); 75 | lengths = lengths_t(lens); 76 | offsets = offsets_t(offs); 77 | 78 | prior_block_LF = pos_map(prior_LF); 79 | next_block_LF = pos_map(); 80 | } 81 | 82 | // Return the character at row k 83 | const ulint get_char(const ulint k) 84 | { 85 | return heads[k]; 86 | } 87 | 88 | // For a given character 
and it's rank, return the interval mapping (LF) 89 | const ulint get_interval(const uchar c, const ulint c_rank) 90 | { 91 | return intervals.get(c, c_rank); 92 | } 93 | 94 | // Get the length at row k 95 | const ulint get_length(const ulint k) 96 | { 97 | return lengths[k]; 98 | } 99 | 100 | // Get the offset at row k 101 | const ulint get_offset(const ulint k) 102 | { 103 | return offsets[k]; 104 | } 105 | 106 | bool has_prior_LF(const uchar c) 107 | { 108 | return prior_block_LF.contains(c); 109 | } 110 | 111 | bool has_next_LF(const uchar c) 112 | { 113 | return next_block_LF.contains(c); 114 | } 115 | 116 | void set_next_LF(const uchar c, interval_pos next_LF) 117 | { 118 | next_block_LF.insert(std::pair(c, next_LF)); 119 | } 120 | 121 | // For row k wih offset d, compute the LF mapping 122 | interval_pos LF(const ulint k, const ulint d) 123 | { 124 | const auto [c_rank, c] = heads.inverse_select(k); 125 | return LF(k, d, c, c_rank); 126 | } 127 | 128 | // Perform the LF mapping for character c prior or at position k with offset d 129 | interval_pos LF_prior(const ulint k, const ulint d, const uchar c) 130 | { 131 | // Look in row ahead so that the rank includes current position 132 | ulint c_rank = heads.rank(k + 1, c); 133 | // If there are no c prior to position in block, return LF of prior c in another block 134 | if (c_rank == 0) 135 | { 136 | if (has_prior_LF(c)) 137 | { 138 | return prior_block_LF[c]; 139 | } 140 | else 141 | { 142 | return interval_pos(); 143 | } 144 | } 145 | // We subtract 1 to maintain 0-based rank after ensuring it is not 0, since we use unsigned values 146 | else 147 | { 148 | c_rank -= 1; 149 | } 150 | 151 | ulint k_prime = heads.select(c_rank + 1, c); 152 | // If our k changed, set the offset to the last character in that prior run 153 | ulint d_prime = (k != k_prime) ? 
lengths[k_prime] - 1 : d; 154 | 155 | return LF(k_prime, d_prime, c, c_rank); 156 | } 157 | 158 | // Perform the LF mapping for character c succeding or at position k with offset d 159 | interval_pos LF_next(const ulint k, const ulint d, const uchar c) 160 | { 161 | // Count occ of c before position 162 | ulint c_rank = heads.rank(k, c); 163 | // If the c of rank at or succeding our position overruns the block, return LF of next c in another block 164 | if (c_rank + 1 > heads.rank(heads.size(), c)) 165 | { 166 | if (has_next_LF(c)) 167 | { 168 | return next_block_LF[c]; 169 | } 170 | else 171 | { 172 | return interval_pos(); 173 | } 174 | } 175 | ulint k_prime = heads.select(c_rank + 1, c); 176 | // If k changed, set it to the first character of the next run 177 | ulint d_prime = (k != k_prime) ? 0 : d; 178 | 179 | return LF(k_prime, d_prime, c, c_rank); 180 | } 181 | 182 | // Reduces position until offset shorter than length of interval, or returns if at end of block 183 | interval_pos reduce(interval_pos pos, ulint k) 184 | { 185 | ulint q = pos.run; 186 | ulint d = pos.offset; 187 | ulint next_len; 188 | while (k < lengths.size() && d >= (next_len = get_length(k))) 189 | { 190 | d -= next_len; 191 | ++k; 192 | ++q; 193 | } 194 | 195 | return interval_pos(q, d); 196 | } 197 | 198 | /* serialize the interval block to the ostream 199 | * \param out the ostream 200 | */ 201 | size_t serialize(std::ostream &out, sdsl::structure_tree_node *v = nullptr, std::string name = "") // const 202 | { 203 | sdsl::structure_tree_node *child = sdsl::structure_tree::add_child(v, name, sdsl::util::class_name(*this)); 204 | size_t written_bytes = 0; 205 | 206 | 207 | written_bytes += heads.serialize(out,v,"Heads"); 208 | written_bytes += intervals.serialize(out,v,"Intervals"); 209 | written_bytes += lengths.serialize(out,v,"Lengths"); 210 | written_bytes += offsets.serialize(out,v,"Offsets"); 211 | 212 | written_bytes += prior_block_LF.serialize(out,v,"Prior_Block_LF"); 213 | 
written_bytes += next_block_LF.serialize(out,v,"Next_Block_LF"); 214 | 215 | sdsl::structure_tree::add_size(child, written_bytes); 216 | return written_bytes; 217 | } 218 | 219 | /* load the interval block from the istream 220 | * \param in the istream 221 | */ 222 | void load(std::istream &in) 223 | { 224 | heads.load(in); 225 | intervals.load(in); 226 | lengths.load(in); 227 | offsets.load(in); 228 | 229 | prior_block_LF.load(in); 230 | next_block_LF.load(in); 231 | } 232 | }; 233 | 234 | #endif /* end of include guard: _I_BLOCK_HH */ -------------------------------------------------------------------------------- /include/r_index_f/ds/interval_pos.hpp: -------------------------------------------------------------------------------- 1 | /* interval_pos - Pair describing run/offset access of r-index-f 2 | Copyright (C) 2021 Nathaniel Brown 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU General Public License for more details. 11 | You should have received a copy of the GNU General Public License 12 | along with this program. If not, see http://www.gnu.org/licenses/ . 13 | */ 14 | /*! 
15 | \file interval_pos.hpp 16 | \brief interval_pos.hpp Pair describing run/offset access of r-index-f 17 | \author Nathaniel Brown 18 | \date 11/19/2021 19 | */ 20 | 21 | #ifndef _INTERVAL_POS_HH 22 | #define _INTERVAL_POS_HH 23 | 24 | #include 25 | 26 | #include 27 | #include 28 | 29 | class interval_pos 30 | { 31 | private: 32 | bool set; 33 | 34 | public: 35 | ulint run; 36 | ulint offset; 37 | 38 | interval_pos() { 39 | run = 0; 40 | offset = 0; 41 | set = false; 42 | } 43 | 44 | interval_pos(ulint r, ulint o) { 45 | run = r; 46 | offset = o; 47 | set = true; 48 | } 49 | 50 | bool is_set() 51 | { 52 | return set; 53 | } 54 | 55 | interval_pos& operator++() 56 | { 57 | ++offset; 58 | } 59 | 60 | interval_pos operator++(int) 61 | { 62 | interval_pos old = *this; 63 | operator++(); 64 | return old; 65 | } 66 | 67 | inline bool operator< (const interval_pos& pos){ return (run == pos.run) ? (offset < pos.offset) : (run < pos.run); } 68 | inline bool operator> (const interval_pos& pos){ return (run == pos.run) ? 
(offset > pos.offset) : (run > pos.run); } 69 | inline bool operator<=(const interval_pos& pos){ return !(*this > pos); } 70 | inline bool operator>=(const interval_pos& pos){ return !(*this < pos); } 71 | inline bool operator==(const interval_pos& pos){ return run == pos.run && offset == pos.offset; } 72 | inline bool operator!=(const interval_pos& pos){ return !(*this == pos); } 73 | 74 | /* serialize the structure to the ostream 75 | * \param out the ostream 76 | */ 77 | size_t serialize(std::ostream &out, sdsl::structure_tree_node *v = nullptr, std::string name = "") const 78 | { 79 | sdsl::structure_tree_node *child = sdsl::structure_tree::add_child(v, name, sdsl::util::class_name(*this)); 80 | size_t written_bytes = 0; 81 | 82 | out.write((char *)&run, sizeof(run)); 83 | written_bytes += sizeof(run); 84 | 85 | out.write((char *)&offset, sizeof(offset)); 86 | written_bytes += sizeof(offset); 87 | 88 | return written_bytes; 89 | } 90 | 91 | /* load the structure from the istream 92 | * \param in the istream 93 | */ 94 | void load(std::istream &in) 95 | { 96 | set = true; 97 | in.read((char *)&run, sizeof(run)); 98 | in.read((char *)&offset, sizeof(offset)); 99 | } 100 | }; 101 | 102 | #endif /* end of include guard: _INTERVAL_POS_HH */ -------------------------------------------------------------------------------- /include/r_index_f/ds/intervals_rank_w.hpp: -------------------------------------------------------------------------------- 1 | /* intervals_rank_w - Wrapper which accesses intervals by seperating their mapping wrt. character, i.e. compute given character rank 2 | Copyright (C) 2021 Nathaniel Brown 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 
7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU General Public License for more details. 11 | You should have received a copy of the GNU General Public License 12 | along with this program. If not, see http://www.gnu.org/licenses/ . 13 | */ 14 | /*! 15 | \file intervals_rank_w.hpp 16 | \brief intervals_rank_w Returns intervals for respective character and rank of that character 17 | \author Nathaniel Brown 18 | \date 18/12/2021 19 | */ 20 | 21 | #ifndef _INTERVALS_RANK_W_HH 22 | #define _INTERVALS_RANK_W_HH 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | #include 34 | 35 | using namespace sdsl; 36 | 37 | template< class interval_t = base_bv<>, 38 | template class char_map_t = ACGT_map > 39 | class intervals_rank_w 40 | { 41 | private: 42 | typedef char_map_t interval_map; 43 | 44 | interval_map char_map; 45 | 46 | public: 47 | intervals_rank_w() {} 48 | 49 | intervals_rank_w(std::vector characters, std::vector intervals) { 50 | assert(characters.size() == intervals.size()); 51 | 52 | // Concerned with Interval sectioned by character (break into base pointer and difference from prior interval) 53 | std::unordered_map last_c_map = std::unordered_map(); 54 | std::unordered_map block_c_map = std::unordered_map(); 55 | std::unordered_map> diff = std::unordered_map>(); 56 | 57 | for(size_t i = 0; i < characters.size(); ++i) 58 | { 59 | uchar character = characters[i]; 60 | ulint interval = intervals[i]; 61 | 62 | if (!block_c_map.count(character)) { 63 | block_c_map[character] = interval; 64 | last_c_map[character] = interval; 65 | diff[character] = std::vector(); 66 | } 67 | 68 | diff[character].push_back(interval - last_c_map[character]); 69 | last_c_map[character] = interval; 70 | } 71 | 72 | for(size_t i = 0; i < ALPHABET_SIZE; 
++i) 73 | { 74 | if(diff.count(i)) 75 | { 76 | char_map.insert(std::pair(i, interval_t(block_c_map[i], diff[i]))); 77 | } 78 | } 79 | } 80 | 81 | ulint get(uchar c, ulint c_rank) 82 | { 83 | return char_map[c].get(c_rank); 84 | } 85 | 86 | /* serialize the structure to the ostream 87 | * \param out the ostream 88 | */ 89 | size_t serialize(std::ostream &out, sdsl::structure_tree_node *v = nullptr, std::string name = "") const 90 | { 91 | sdsl::structure_tree_node *child = sdsl::structure_tree::add_child(v, name, sdsl::util::class_name(*this)); 92 | size_t written_bytes = 0; 93 | 94 | written_bytes += char_map.serialize(out, v, "char_map"); 95 | 96 | return written_bytes; 97 | } 98 | 99 | /* load the structure from the istream 100 | * \param in the istream 101 | */ 102 | void load(std::istream &in) 103 | { 104 | char_map.load(in); 105 | } 106 | }; 107 | 108 | #endif /* end of include guard: _BASE_BV_HH */ -------------------------------------------------------------------------------- /include/r_index_f/ds/symbol_map.hpp: -------------------------------------------------------------------------------- 1 | /* symbol_map - Implements a simple map taking character (byte) positions 2 | Copyright (C) 2021 Nathaniel Brown 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU General Public License for more details. 11 | You should have received a copy of the GNU General Public License 12 | along with this program. If not, see http://www.gnu.org/licenses/ . 13 | */ 14 | /*! 
15 | \file symbol_map.hpp 16 | \brief symbol_map.hpp Bitvector for contains and vector for access 17 | \author Nathaniel Brown 18 | \date 18/12/2021 19 | */ 20 | 21 | #ifndef _SYMBOL_MAP_HH 22 | #define _SYMBOL_MAP_HH 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | template 30 | class symbol_map 31 | { 32 | private: 33 | bit_vector map_contains; 34 | std::vector access; 35 | 36 | public: 37 | 38 | symbol_map() { 39 | map_contains = bit_vector(ALPHABET_SIZE, false); 40 | access = std::vector(ALPHABET_SIZE); 41 | } 42 | 43 | symbol_map(std::unordered_map map) { 44 | map_contains = bit_vector(ALPHABET_SIZE, false); 45 | access = std::vector(ALPHABET_SIZE); 46 | 47 | for(auto const& [c, val] : map) 48 | { 49 | map_contains[c] = true; 50 | access[c] = val; 51 | } 52 | } 53 | 54 | bool contains(uchar c) const 55 | { 56 | return map_contains[c]; 57 | } 58 | 59 | bool insert(std::pair kv) 60 | { 61 | if (!contains(kv.first)) { 62 | map_contains[kv.first] = true; 63 | access[kv.first] = kv.second; 64 | 65 | return true; 66 | } 67 | else { 68 | return false; 69 | } 70 | } 71 | 72 | T& at(const uchar c) const 73 | { 74 | if (!contains(c)) 75 | { 76 | throw std::out_of_range("Symbol" + std::to_string(c) + "not in map"); 77 | } 78 | 79 | return access[c]; 80 | } 81 | 82 | T& operator[](const uchar c) const 83 | { 84 | return access[c]; 85 | } 86 | 87 | /* serialize the structure to the ostream 88 | * \param out the ostream 89 | */ 90 | size_t serialize(std::ostream &out, sdsl::structure_tree_node *v = nullptr, std::string name = "") const 91 | { 92 | sdsl::structure_tree_node *child = sdsl::structure_tree::add_child(v, name, sdsl::util::class_name(*this)); 93 | size_t written_bytes = 0; 94 | 95 | map_contains.serialize(out, v, "char_next"); 96 | for (size_t i = 0; i < ALPHABET_SIZE; ++i) 97 | { 98 | if(contains(i)) 99 | { 100 | written_bytes += access[i].serialize(out, v, "access_" + std::to_string(i)); 101 | } 102 | } 103 | 104 | return written_bytes; 
105 | } 106 | 107 | /* load the structure from the istream 108 | * \param in the istream 109 | */ 110 | void load(std::istream &in) 111 | { 112 | map_contains.load(in); 113 | access = std::vector(ALPHABET_SIZE); 114 | for(size_t i = 0; i < ALPHABET_SIZE; ++i) 115 | { 116 | if (contains(i)) 117 | { 118 | access[i].load(in); 119 | } 120 | } 121 | } 122 | }; 123 | 124 | #endif /* end of include guard: _SYMBOL_MAP_HH */ -------------------------------------------------------------------------------- /include/r_index_f/r_index_f.hpp: -------------------------------------------------------------------------------- 1 | /* r-index-f - Computes the simple r-index-f block compressed table 2 | Copyright (C) 2021 Nathaniel Brown 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU General Public License for more details. 11 | You should have received a copy of the GNU General Public License 12 | along with this program. If not, see http://www.gnu.org/licenses/ . 13 | */ 14 | /*! 
15 | \file r_index_f.hpp 16 | \brief r_index_f.hpp Computes the r-Index-f block table from RLBWT 17 | \author Nathaniel Brown 18 | \author Massimiliano Rossi 19 | \date 11/19/2021 20 | */ 21 | 22 | #ifndef _R_INDEX_F_HH 23 | #define _R_INDEX_F_HH 24 | 25 | #include 26 | #include 27 | #include 28 | 29 | #include 30 | #include 31 | #include 32 | 33 | #include 34 | 35 | #include 36 | #include 37 | #include 38 | #include 39 | 40 | using namespace sdsl; 41 | using namespace std; 42 | 43 | template > 44 | class r_index_f 45 | { 46 | public: 47 | typedef std::pair range_t; 48 | 49 | r_index_f() {} 50 | 51 | r_index_f(std::string filename, uint16_t splitting = 0, bool rle = true) 52 | { 53 | verbose("Building the R-Index-F using Block Table Compression"); 54 | 55 | std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now(); 56 | 57 | std::string bwt_fname = filename + ".bwt"; 58 | 59 | if (rle) 60 | { 61 | std::string bwt_heads_fname = bwt_fname + ".heads"; 62 | std::ifstream ifs_heads(bwt_heads_fname); 63 | std::string bwt_len_fname = bwt_fname + ".len"; 64 | std::ifstream ifs_len(bwt_len_fname); 65 | ifs_heads.seekg(0); 66 | ifs_len.seekg(0); 67 | 68 | LF_table temp; 69 | 70 | if (splitting) 71 | { 72 | std::string splitting_filename = filename + "." 
+ std::to_string(splitting) + "_col"; 73 | std::ifstream ifs_split(splitting_filename); 74 | bit_vector run_splits; 75 | run_splits.load(ifs_split); 76 | 77 | temp = LF_table(ifs_heads, ifs_len, run_splits); 78 | } 79 | else { 80 | temp = LF_table(ifs_heads, ifs_len); 81 | } 82 | B_table = table(temp); 83 | } 84 | else 85 | { 86 | std::ifstream ifs_bwt(bwt_fname); 87 | 88 | ifs_bwt.seekg(0); 89 | LF_table temp(ifs_bwt); 90 | B_table = table(temp); 91 | } 92 | 93 | std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now(); 94 | 95 | verbose("Block-Table construction complete"); 96 | verbose("Memory peak: ", malloc_count_peak()); 97 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 98 | mem_stats(); 99 | bwt_stats(); 100 | } 101 | 102 | r_index_f(LF_table t) { 103 | verbose("Building the R-Index-F using Block Table Compression from LF Table Construction"); 104 | 105 | std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now(); 106 | B_table = table(t); 107 | 108 | std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now(); 109 | 110 | verbose("Block-Table construction complete"); 111 | verbose("Memory peak: ", malloc_count_peak()); 112 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 113 | mem_stats(); 114 | bwt_stats(); 115 | } 116 | 117 | ulint runs() 118 | { 119 | return B_table.runs(); 120 | } 121 | 122 | ulint size() 123 | { 124 | return B_table.size(); 125 | } 126 | 127 | size_t count(const std::string &pattern){ 128 | range_t range = full_range(); 129 | ulint m = pattern.size(); 130 | for (ulint i=0; i < m && range.second >= range.first; ++i){ 131 | range = LF(range, pattern[m - i - 1]); 132 | } 133 | return interval_to_idx(range.second) - interval_to_idx(range.first) + 1; 134 | } 135 | 136 | // void invert() { 137 | // std::string 
outfile = args.filename + ".inverted"; 138 | // std::ofstream out(outfile); 139 | 140 | // interval_pos i = {0,0}; 141 | // char c; 142 | // while((c = bwt.get_char(i)) > TERMINATOR) 143 | // out << c; 144 | // i = bwt.LF(i), ++steps; 145 | // out.close(); 146 | // } 147 | 148 | interval_pos LF(interval_pos pos) 149 | { 150 | return B_table.LF(pos); 151 | } 152 | 153 | range_t LF(range_t range, uchar c) 154 | { 155 | return range_t(B_table.LF_next(range.first, c), B_table.LF_prior(range.second, c)); 156 | } 157 | 158 | range_t full_range() 159 | { 160 | return range_t(B_table.begin(), B_table.end()); 161 | } 162 | 163 | ulint interval_to_idx(interval_pos pos) 164 | { 165 | return B_table.interval_to_idx(pos); 166 | } 167 | 168 | uchar get_char(interval_pos pos) 169 | { 170 | return B_table.get_char(pos); 171 | } 172 | 173 | // Return underlying table (not recommended, add methods to access its capabilities) 174 | table get_table() 175 | { 176 | return B_table; 177 | } 178 | 179 | void mem_stats() 180 | { 181 | sdsl::nullstream ns; 182 | 183 | verbose("Memory consumption (bytes)."); 184 | verbose(" Block table: ", serialize(ns)); 185 | } 186 | 187 | void bwt_stats() 188 | { 189 | ulint n = size(); 190 | ulint r = runs(); 191 | verbose("Number of BWT equal-letter runs: r = ", r); 192 | verbose("Length of complete BWT: n = ", n); 193 | verbose("Rate n/r = ", double(n) / r); 194 | verbose("log2(r) = ", log2(double(r))); 195 | verbose("log2(n/r) = ", log2(double(n) / r)); 196 | } 197 | 198 | /* serialize the structure to the ostream 199 | * \param out the ostream 200 | */ 201 | size_t serialize(std::ostream &out, sdsl::structure_tree_node *v = nullptr, std::string name = "") // const 202 | { 203 | sdsl::structure_tree_node *child = sdsl::structure_tree::add_child(v, name, sdsl::util::class_name(*this)); 204 | size_t written_bytes = 0; 205 | 206 | written_bytes += B_table.serialize(out, v, "B_table"); 207 | 208 | sdsl::structure_tree::add_size(child, written_bytes); 209 
| return written_bytes; 210 | } 211 | 212 | std::string get_file_extension() const 213 | { 214 | return ".rif"; 215 | } 216 | 217 | /* load the structure from the istream 218 | * \param in the istream 219 | */ 220 | void load(std::istream &in) 221 | { 222 | B_table.load(in); 223 | } 224 | 225 | private: 226 | table B_table; 227 | }; 228 | 229 | #endif /* end of include guard: _R_INDEX_F_HH */ -------------------------------------------------------------------------------- /pipeline/rif: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Edited from bigbwt/pfpthresholds script file 4 | 5 | import sys, time, argparse, subprocess, os.path, threading 6 | 7 | Description = """ 8 | ____ ___ _ _____ 9 | | _ \ |_ _|_ __ __| | _____ __ | ___| 10 | | |_) | _____ | || '_ \ / _` |/ _ \ \/ / _____ | |_ 11 | | _ < |_____| | || | | | (_| | __/> < |_____| | _| 12 | |_| \_\ |___|_| |_|\__,_|\___/_/\_\ |_ 13 | "R-Index-F Library for String Indexing" 14 | by Nathaniel Brown, Travis Gagie and Massimiliano Rossi 15 | The input file cannot contain the characters 0, 1 or 2 which are 16 | used internally by the algorithm. 
"""

# Directory holding this script; every tool path below is resolved against it.
dirname = os.path.dirname(os.path.abspath(__file__))

# Locations of the third-party builds and of this project's own executables.
bigbwt_dirname = os.path.join(dirname, "_deps/bigbwt-build")
bigrepair_dirname = os.path.join(dirname, "_deps/bigrepair-src")
thresholds_dirname = os.path.join(
    dirname, "_deps/pfp_thresholds-build/test/src")
rif_dirname = os.path.join(dirname, "test/src")

# Parsing executables.
parse_exe = os.path.join(bigbwt_dirname, "pscan.x")
parse_fasta_exe = os.path.join(bigbwt_dirname, "newscan.x")
parseNT_exe = os.path.join(bigbwt_dirname, "newscanNT.x")

# BWT-of-parse and final-BWT executables (plain / NT / 64 variants).
parsebwt_exe = os.path.join(bigbwt_dirname, "bwtparse")
parsebwt_exe64 = os.path.join(bigbwt_dirname, "bwtparse64")
pfbwt_exe = os.path.join(bigbwt_dirname, "pfbwt.x")
pfbwtNT_exe = os.path.join(bigbwt_dirname, "pfbwtNT.x")
pfbwt_exe64 = os.path.join(bigbwt_dirname, "pfbwt64.x")
pfbwtNT_exe64 = os.path.join(bigbwt_dirname, "pfbwtNT64.x")

pfp_thresholds = os.path.join(thresholds_dirname, "pfp_thresholds")
pfp_thresholds64 = os.path.join(thresholds_dirname, "pfp_thresholds64")

rif_build_exe = os.path.join(rif_dirname, "build_rif")

# Temporarily borrowed to build RLE compressed BWT files
class PFPthresholds(threading.Thread):
    """Thread that runs the pfp_thresholds executable with ``-r`` on ``args.input``."""

    def __init__(self, name, counter, args):
        """Store the thread name, a counter (also kept as ``threadID``) and the parsed arguments."""
        threading.Thread.__init__(self)
        self.threadID = counter
        self.name = name
        self.counter = counter
        self.args = args

    def run(self):
        """Pick the 32- or 64-bit executable from the parse/dictionary sizes and run it.

        Returns early, without printing the elapsed time, when
        ``execute_command`` does not return ``True``.
        """
        args = self.args
        logfile = args.logfile
        logfile_name = args.logfile_name
        print("{} PFP started!".format(self.getName()))  # "Thread-x started!"

        start = time.time()
        # The .parse file size is divided by 4 to get the number of parse words.
        parse_size = os.path.getsize(args.input+".parse")/4
        dictionary_size = os.path.getsize(args.input+".dict")

        # NOTE(review): the executable paths are built from an absolute dirname, so
        # os.path.join presumably discards args.bigbwt_dir here -- confirm intent.
        if(parse_size >= (2**31-1) or dictionary_size >= (2**31-4) ):
            command = "{exe} {file} -w {wsize}".format(
                exe = os.path.join(args.bigbwt_dir,pfp_thresholds64),
                wsize=args.wsize, file=args.input)
        else:
            command = "{exe} {file} -w {wsize}".format(
                exe = os.path.join(args.bigbwt_dir,pfp_thresholds),
                wsize=args.wsize, file=args.input)

        command += " -r"

        print("==== Computing RLE BWT Files. Command:", command)
        # execute_command is defined outside this part of the file.
        if(execute_command(command,logfile,logfile_name)!=True):
            return
        print("RLE BWT Files Elapsed time: {0:.4f}".format(time.time()-start));



class BWT(threading.Thread):
    """Thread that computes the BWT of the parse and then the final BWT."""

    def __init__(self, name, counter, args):
        """Store the thread name, a counter (also kept as ``threadID``) and the parsed arguments."""
        threading.Thread.__init__(self)
        self.threadID = counter
        self.name = name
        self.counter = counter
        self.args = args

    def run(self):
        """Run the bwtparse step followed by the pfbwt step.

        Each step returns early when ``execute_command`` does not return
        ``True``. A parse with 2**32-1 or more words prints an error and
        calls ``sys.exit(1)``.
        """
        args = self.args
        logfile = args.logfile
        logfile_name = args.logfile_name
        print("{} BWT started!".format(self.getName()))  # "Thread-x started!"

        # ----------- computation of the BWT of the parsing
        start = time.time()
        parse_size = os.path.getsize(args.input+".parse")/4
        if(parse_size >= (2**32-1) ):
            print("Sorry, the parse contains %d words" % parse_size )
            print("which is more than my current limit 2^32-2")
            print("Please re-run the program with a larger modulus (currently %d)" % args.mod)
            sys.exit(1)
        elif(parse_size >= (2**31-1) ):
            command = "{exe} {file}".format(
                exe = os.path.join(args.bigbwt_dir,parsebwt_exe64), file=args.input)
        else:
            command = "{exe} {file}".format(
                exe = os.path.join(args.bigbwt_dir,parsebwt_exe), file=args.input)
        # if (args.s or args.e or args.S): command += " -s"
        command += " -s"
        if (args.t>0): command += " -t " + str(args.t)
        print("==== Computing BWT of parsing. Command:", command)
        if(execute_command(command,logfile,logfile_name)!=True):
            return
        print("Elapsed time: {0:.4f}".format(time.time()-start));

        # ----------- compute final BWT using dictionary and BWT of parse
        start = time.time()
        if(os.path.getsize(args.input+".dict") >= (2**31-4) ):
            # 64 bit version with and without threads
            if args.t>0 and args.s==False and args.e==False:
                command = "{exe} -w {wsize} {file} -t {th}".format(
                    exe = os.path.join(args.bigbwt_dir,pfbwt_exe64),
                    wsize=args.wsize, file=args.input, th=args.t)
            else:
                command = "{exe} -w {wsize} {file}".format(
                    exe = os.path.join(args.bigbwt_dir,pfbwtNT_exe64),
                    wsize=args.wsize, file=args.input)
        else:  # 32 bit version
            if args.t>0 and args.s==False and args.e==False:
                command = "{exe} -w {wsize} {file} -t {th}".format(
                    exe = os.path.join(args.bigbwt_dir,pfbwt_exe),
                    wsize=args.wsize, file=args.input, th=args.t)
            else:
                command = "{exe} -w {wsize} {file}".format(
                    exe = os.path.join(args.bigbwt_dir,pfbwtNT_exe),
                    wsize=args.wsize, file=args.input)
        # Both flags are always passed, whatever args.s / args.e say.
        command += " -s"
        command += " -e"
        # if args.S: command += " -S"

        print("==== Computing final BWT. Command:", command)
        if(execute_command(command,logfile,logfile_name)!=True):
            return
        print("BWT Elapsed time: {0:.4f}".format(time.time()-start))






class build_rif(threading.Thread):
    """Thread that runs the build_rif executable on ``args.input``."""

    def __init__(self, name, counter, args):
        """Store the thread name, a counter (also kept as ``threadID``) and the parsed arguments."""
        threading.Thread.__init__(self)
        self.threadID = counter
        self.name = name
        self.counter = counter
        self.args = args

    def run(self):
        """Build and run the build_rif command; return early if it does not report success."""
        args = self.args
        logfile = args.logfile
        logfile_name = args.logfile_name
        print("{} r-index-f started!".format(self.getName()))

        # ----------- computation of the BWT of the parsing
        print("==== Building the r-index-f. ", flush=True)
        start = time.time()

        command = "{exe} {file}".format(exe=os.path.join(
            args.bigbwt_dir, rif_build_exe), file=args.input)

        print("==== Building the r-index-f. Command:", command, flush=True)
        if(execute_command(command, logfile, logfile_name) != True):
            return
        print("Building the r-index-f Elapsed time: {0:.4f}".format(
            time.time()-start), flush=True)





def main():
    parser = argparse.ArgumentParser(description=Description, formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('input', help='input file name', type=str)
    parser.add_argument('-w', '--wsize', help='sliding window size (def. 10)', default=10, type=int)
    parser.add_argument('-p', '--mod', help='hash modulus (def. 100)', default=100, type=int)
    parser.add_argument('-t', help='number of helper threads (def.
None)', default=0, type=int) 188 | parser.add_argument('-k', help='keep temporary files',action='store_true') 189 | parser.add_argument('-v', help='verbose',action='store_true') 190 | parser.add_argument('-f', help='read fasta',action='store_true') 191 | parser.add_argument('-m', help='print memory usage',action='store_true') 192 | # parser.add_argument('--sum', help='compute output files sha256sum',action='store_true') 193 | parser.add_argument('--parsing', help='stop after the parsing phase (debug only)',action='store_true') 194 | parser.add_argument('--compress', help='compress output of the parsing phase (debug only)',action='store_true') 195 | args = parser.parse_args() 196 | 197 | if args.f and args.t > 0 and (".fq" in args.input or ".fastq" in args.input or ".fnq" in args.input): 198 | print("bigbwt does not current support FASTQ format! Exiting...") 199 | return 200 | 201 | 202 | 203 | logfile_name = args.input + ".log" 204 | # get main bigbwt directory 205 | args.bigbwt_dir = os.path.split(sys.argv[0])[0] 206 | print("Sending logging messages to file:", logfile_name) 207 | with open(logfile_name,"a") as logfile: 208 | args.logfile = logfile 209 | args.logfile_name = logfile_name 210 | # ---------- parsing of the input file 211 | start0 = start = time.time() 212 | if args.t>0: 213 | if args.f: 214 | command = "{exe} {file} -w {wsize} -p {modulus} -t {th} -f".format( 215 | exe = os.path.join(args.bigbwt_dir,parse_fasta_exe), 216 | wsize=args.wsize, modulus = args.mod, th=args.t, file=args.input) 217 | else: 218 | command = "{exe} {file} -w {wsize} -p {modulus} -t {th}".format( 219 | exe = os.path.join(args.bigbwt_dir,parse_exe), 220 | wsize=args.wsize, modulus = args.mod, th=args.t, file=args.input) 221 | else: 222 | if args.f: 223 | command = "{exe} {file} -w {wsize} -p {modulus} -t {th} -f".format( 224 | exe = os.path.join(args.bigbwt_dir,parseNT_exe), 225 | wsize=args.wsize, modulus = args.mod, th=args.t, file=args.input) 226 | else: 227 | command = 
"{exe} {file} -w {wsize} -p {modulus} -t".format( 228 | exe = os.path.join(args.bigbwt_dir,parseNT_exe), 229 | wsize=args.wsize, modulus = args.mod, file=args.input) 230 | if args.v: command += " -v" 231 | # if args.f: command += " -f" 232 | command += " -s" 233 | print("==== Parsing. Command:", command) 234 | if(execute_command(command,logfile,logfile_name)!=True): 235 | return 236 | print("Elapsed time: {0:.4f}".format(time.time()-start)) 237 | if args.parsing: 238 | # delete temporary parsing files 239 | command = "rm -f {file}.parse_old {file}.last".format(file=args.input) # check format when -t is used 240 | if(execute_command(command,logfile,logfile_name)!=True): 241 | return 242 | print("==== Stopping after the parsing phase as requested") 243 | return 244 | elif args.compress: 245 | # save parsing files 246 | start = time.time() 247 | command = "tar -cJf {file}.parse.txz {file}.parse {file}.dict".format(file=args.input) 248 | print("==== Compressing. Command:", command) 249 | if(execute_command(command,logfile,logfile_name,env={"XZ_OPT":"-9"})!=True): 250 | return 251 | print("Elapsed time: {0:.4f}".format(time.time()-start)) 252 | delete_temp_files(args,logfile,logfile_name) 253 | print("==== Done: Parsing output xz-compressed as requested") 254 | return 255 | 256 | # ----------- computation of the PFP data structures 257 | 258 | pfpds_thread = PFPthresholds(name = "{}".format(args.input), args=args, counter=1) # ...Instantiate a thread and pass a unique ID to it 259 | pfpds_thread.start() # ...Start the thread, invoke the run method 260 | # bwt_thread = BWT(name = "{}".format(args.input), args=args, counter=2) # ...Instantiate a thread and pass a unique ID to it 261 | # bwt_thread.start() # ...Start the thread, invoke the run method 262 | 263 | pfpds_thread.join() 264 | # bwt_thread.join() 265 | 266 | # ----------- build the r-index-f 267 | build_rif_thread = build_rif( 268 | name="{}".format(args.input), args=args, counter=2) 269 | 
build_rif_thread.start() 270 | build_rif_thread.join() 271 | 272 | print("Total construction time: {0:.4f}".format(time.time()-start0)) 273 | # ---- print elapsed time to file 274 | command = "echo Total construction time: {0:.4f}".format(time.time()-start0) 275 | if(execute_command(command,logfile,logfile_name)!=True): 276 | return 277 | 278 | # ---- delete intermediate files 279 | delete_files(args,logfile,logfile_name) 280 | 281 | # --- start checking --- 282 | 283 | # --- end checking --- 284 | 285 | print("==== Done") 286 | 287 | # delete intermediate and unused files 288 | def delete_files(args,logfile,logfile_name): 289 | if args.k==False: 290 | print("==== Deleting temporary files.") # no need to show the command 291 | command = "rm -f {file}.parse {file}.parse_old {file}.last {file}.bwlast {file}.dict {file}.ilist {file}.occ".format(file=args.input) 292 | if(execute_command(command,logfile,logfile_name)!=True): 293 | return 294 | for i in range(args.t): 295 | command = "rm -f {file}.{i}.parse_old {file}.{i}.last".format(file=args.input, i=i) 296 | if(execute_command(command,logfile,logfile_name)!=True): 297 | return 298 | 299 | command = "rm -f {file}.sai {file}.bwsai".format(file=args.input); 300 | if(execute_command(command,logfile,logfile_name)!=True): 301 | return 302 | for i in range(args.t): 303 | command = "rm -f {file}.{i}.sai".format(file=args.input, i=i) 304 | if(execute_command(command,logfile,logfile_name)!=True): 305 | return 306 | 307 | command = "rm -f {file}.thr {file}.thr_pos {file}.ssa {file}.esa".format(file=args.input); 308 | if(execute_command(command,logfile,logfile_name)!=True): 309 | return 310 | 311 | 312 | # compute hash digest for a file 313 | def file_digest(name,logfile): 314 | try: 315 | hash_command = "{exe} {infile}".format(exe=shasum_exe, infile=name) 316 | hashsum = subprocess.check_output(hash_command.split(),stderr=logfile) 317 | hashsum = hashsum.decode("utf-8").split()[0] 318 | except: 319 | hashsum = "Error!" 
320 | return hashsum 321 | 322 | # execute command: return True is everything OK, False otherwise 323 | def execute_command(command,logfile,logfile_name,env=None): 324 | try: 325 | #subprocess.run(command.split(),stdout=logfile,stderr=logfile,check=True,env=env) 326 | subprocess.check_call(command.split(),stdout=logfile,stderr=logfile,env=env) 327 | except subprocess.CalledProcessError: 328 | print("Error executing command line:") 329 | print("\t"+ command) 330 | print("Check log file: " + logfile_name) 331 | return False 332 | return True 333 | 334 | 335 | 336 | if __name__ == '__main__': 337 | main() -------------------------------------------------------------------------------- /test/src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(build_rif build_rif.cpp) 2 | target_link_libraries(build_rif sdsl malloc_count divsufsort divsufsort64) 3 | target_include_directories(build_rif PUBLIC "../../include/r_index_f" 4 | "../../include/common" 5 | "../../include/block_compression" 6 | ) 7 | target_compile_options(build_rif PUBLIC "-std=c++17") 8 | 9 | add_executable(rif_tests rif_tests.cpp) 10 | target_link_libraries(rif_tests sdsl divsufsort divsufsort64 malloc_count) 11 | target_include_directories(rif_tests PUBLIC "../../include/r_index_f" 12 | "../../include/common" 13 | "../../include/block_compression" 14 | ) 15 | target_compile_options(rif_tests PUBLIC "-std=c++17") 16 | 17 | add_executable(count_query count_query.cpp) 18 | target_link_libraries(count_query sdsl divsufsort divsufsort64 malloc_count) 19 | target_include_directories(count_query PUBLIC "../../include/r_index_f" 20 | "../../include/common" 21 | "../../include/block_compression" 22 | ) 23 | target_compile_options(count_query PUBLIC "-std=c++17") -------------------------------------------------------------------------------- /test/src/build_rif.cpp: -------------------------------------------------------------------------------- 1 | 
/* build_rif - Build the simple R-Index-F tablke 2 | Copyright (C) 2021 Nathaniel Brown 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU General Public License for more details. 11 | You should have received a copy of the GNU General Public License 12 | along with this program. If not, see http://www.gnu.org/licenses/ . 13 | */ 14 | /*! 15 | \file build_LF_table.cpp 16 | \brief build_LF_table.cpp Build the simple R-Index-F table. 17 | \author Nathaniel Brown 18 | \author Massimiliano Rossi 19 | \date 02/11/2021 20 | */ 21 | 22 | #include 23 | 24 | #define VERBOSE 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | 32 | int main(int argc, char *const argv[]) 33 | { 34 | Args args; 35 | parseArgs(argc, argv, args); 36 | 37 | // Building the r-index-f table 38 | 39 | verbose("Building the R-Index-F"); 40 | std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now(); 41 | 42 | r_index_f<> rif(args.filename, args.d, args.rle); 43 | 44 | std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now(); 45 | 46 | verbose("Construction Complete"); 47 | verbose("Memory peak: ", malloc_count_peak()); 48 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 49 | 50 | verbose("Serializing Table"); 51 | 52 | std::string outfile = args.filename + rif.get_file_extension(); 53 | std::ofstream out(outfile); 54 | rif.serialize(out); 55 | out.close(); 56 | t_insert_end = std::chrono::high_resolution_clock::now(); 57 | 58 | 
verbose("Memory peak: ", malloc_count_peak()); 59 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 60 | 61 | return 0; 62 | } -------------------------------------------------------------------------------- /test/src/count_query.cpp: -------------------------------------------------------------------------------- 1 | /* build_rif - Build the simple R-Index-F tablke 2 | Copyright (C) 2021 Nathaniel Brown 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU General Public License for more details. 11 | You should have received a copy of the GNU General Public License 12 | along with this program. If not, see http://www.gnu.org/licenses/ . 13 | */ 14 | /*! 15 | \file build_LF_table.cpp 16 | \brief build_LF_table.cpp Build the simple R-Index-F table. 
17 | \author Nathaniel Brown 18 | \author Massimiliano Rossi 19 | \date 02/11/2021 20 | */ 21 | 22 | #include 23 | 24 | #define VERBOSE 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | 32 | int main(int argc, char *const argv[]) 33 | { 34 | Args args; 35 | parseArgs(argc, argv, args); 36 | 37 | if (args.pattern == "") { 38 | error("-p flag is required for count query"); 39 | return 1; 40 | } 41 | 42 | verbose("Loading the R-Index-F from B-Table"); 43 | std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now(); 44 | 45 | r_index_f<> rif; 46 | std::string filename_rif = args.filename + rif.get_file_extension(); 47 | 48 | ifstream fs_rif(filename_rif); 49 | rif.load(fs_rif); 50 | fs_rif.close(); 51 | 52 | std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now(); 53 | 54 | verbose("R-Index-F load complete"); 55 | verbose("Memory peak: ", malloc_count_peak()); 56 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 57 | 58 | rif.mem_stats(); 59 | rif.bwt_stats(); 60 | 61 | verbose("Computing count query"); 62 | t_insert_start = std::chrono::high_resolution_clock::now(); 63 | 64 | if (args.is_fasta) { 65 | cout << "\tCOUNT: " << rif.count(args.pattern) << endl; 66 | } 67 | else { 68 | std::string pattern_file = args.pattern; 69 | ifstream fs_pattern(pattern_file); 70 | std::string pattern; 71 | size_t count = 0; 72 | while (std::getline(fs_pattern, pattern)) { 73 | cout << "P_LINE: " << count << "\tCOUNT: " << rif.count(pattern) << endl; 74 | ++count; 75 | } 76 | } 77 | t_insert_end = std::chrono::high_resolution_clock::now(); 78 | 79 | verbose("Count query complete"); 80 | verbose("Memory peak: ", malloc_count_peak()); 81 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 82 | 83 | return 0; 84 | } 
-------------------------------------------------------------------------------- /test/src/rif_tests.cpp: -------------------------------------------------------------------------------- 1 | /* rif_tests Performs benchmarks on the constructed R-Index-F --- must serialize both r-index-f and LF_table 2 | Copyright (C) 2021 Nathaniel Brown 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU General Public License for more details. 11 | You should have received a copy of the GNU General Public License 12 | along with this program. If not, see http://www.gnu.org/licenses/ . 13 | */ 14 | /*! 
15 | \file rif_tests.cpp 16 | \brief rif_tests Benchmark tests on the R-Index-F 17 | \author Nathaniel Brown 18 | \author Massimiliano Rossi 19 | \date 02/11/2021 20 | */ 21 | 22 | #include 23 | #include 24 | 25 | #define VERBOSE 26 | 27 | #include 28 | #include 29 | #include 30 | 31 | // Check inversion is correct (compared to explicit table) - FAIL FOR ACGT_MAP 32 | void test_invert(r_index_f<> rif, LF_table table) 33 | { 34 | ulint steps = 0; 35 | interval_pos pos(0,0); 36 | char c_rif; 37 | while((c_rif = rif.get_char(pos)) > TERMINATOR) 38 | { 39 | auto[table_k, table_d] = table.LF(pos.run, pos.offset); 40 | pos = rif.LF(pos); 41 | 42 | assert(pos.run == table_k); 43 | assert(pos.offset == table_d); 44 | ++steps; 45 | } 46 | } 47 | 48 | void test_prior_LF(r_index_f<> rif, LF_table table) 49 | { 50 | std::string pattern = "CGATATCGCACAGATC"; // Occurs in example, should implement dynamic test 51 | interval_pos curr = rif.get_table().end(); 52 | for (size_t i = 0; i < pattern.size(); i++) 53 | { 54 | char c = pattern[i]; 55 | ulint k = curr.run; 56 | ulint d = curr.offset; 57 | while(table.get(k).character != c) 58 | { 59 | d = table.get(--k).length - 1; 60 | } 61 | 62 | auto[k_prime, d_prime] = table.LF(k, d); 63 | curr = rif.get_table().LF_prior(curr, c); 64 | 65 | assert(k_prime == curr.run); 66 | assert(d_prime == curr.offset); 67 | } 68 | } 69 | 70 | void test_next_LF(r_index_f<> rif, LF_table table) 71 | { 72 | std::string pattern = "CGATATCGCACAGATC"; // Occurs in example, should implement dynamic test 73 | interval_pos curr = rif.get_table().begin(); 74 | for (size_t i = 0; i < pattern.size(); i++) 75 | { 76 | char c = pattern[i]; 77 | ulint k = curr.run; 78 | ulint d = curr.offset; 79 | while(table.get(k).character != c) 80 | { 81 | k++; 82 | d = 0; 83 | } 84 | 85 | auto[k_prime, d_prime] = table.LF(k, d); 86 | curr = rif.get_table().LF_next(curr, c); 87 | 88 | assert(k_prime == curr.run); 89 | assert(d_prime == curr.offset); 90 | } 91 | } 92 | 93 | // 
Test pos to idx 94 | void test_idx_samples(r_index_f<> rif) 95 | { 96 | interval_pos curr = interval_pos(0,0); 97 | for (int i = 0; i < rif.size(); i++) 98 | { 99 | curr = rif.get_table().reduced_pos(curr); 100 | assert(i == rif.interval_to_idx(curr)); 101 | 102 | curr++; 103 | } 104 | } 105 | 106 | int main(int argc, char *const argv[]) 107 | { 108 | Args args; 109 | parseArgs(argc, argv, args); 110 | 111 | verbose("Loading the R-Index-F from B-Table"); 112 | std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now(); 113 | 114 | r_index_f<> rif; 115 | std::string filename_rif = args.filename + rif.get_file_extension(); 116 | 117 | ifstream fs_rif(filename_rif); 118 | rif.load(fs_rif); 119 | fs_rif.close(); 120 | 121 | LF_table table; 122 | std::string filename_LF = args.filename + table.get_file_extension(); 123 | 124 | ifstream fs_table(filename_LF); 125 | table.load(fs_table); 126 | fs_table.close(); 127 | 128 | std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now(); 129 | 130 | verbose("R-Index-F load complete"); 131 | verbose("Memory peak: ", malloc_count_peak()); 132 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 133 | 134 | rif.mem_stats(); 135 | rif.bwt_stats(); 136 | test_prior_LF(rif, table); 137 | verbose("R-Index-F Prior Steps Successful"); 138 | test_next_LF(rif, table); 139 | verbose("R-Index-F Next Steps Successful"); 140 | test_invert(rif, table); 141 | verbose("R-Index-F Inversion Successful"); 142 | test_idx_samples(rif); 143 | verbose("R-Index-F Indices Successful"); 144 | 145 | return 0; 146 | } -------------------------------------------------------------------------------- /thirdparty/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(FetchContent) 2 | 3 | ## Add malloc_count 4 | FetchContent_Declare( 5 | malloc_count 6 | GIT_REPOSITORY 
https://github.com/bingmann/malloc_count 7 | ) 8 | 9 | FetchContent_GetProperties(malloc_count) 10 | if(NOT malloc_count_POPULATED) 11 | FetchContent_Populate(malloc_count) 12 | 13 | add_library(malloc_count OBJECT ${malloc_count_SOURCE_DIR}/malloc_count.c ${malloc_count_SOURCE_DIR}/malloc_count.h) 14 | target_link_libraries(malloc_count dl) 15 | target_include_directories(malloc_count PUBLIC "${malloc_count_SOURCE_DIR}") 16 | 17 | add_library(memprofile OBJECT ${malloc_count_SOURCE_DIR}/memprofile.h) 18 | target_include_directories(memprofile PUBLIC "${malloc_count_SOURCE_DIR}") 19 | endif() 20 | 21 | ## Add Big-BWT 22 | FetchContent_Declare( 23 | bigbwt 24 | GIT_REPOSITORY https://github.com/alshai/Big-BWT.git 25 | ) 26 | 27 | FetchContent_GetProperties(bigbwt) 28 | if(NOT bigbwt_POPULATED) 29 | FetchContent_Populate(bigbwt) 30 | add_subdirectory(${bigbwt_SOURCE_DIR} ${bigbwt_BINARY_DIR}) 31 | endif() 32 | 33 | 34 | 35 | ## Add gsacak 36 | FetchContent_Declare( 37 | gsacak 38 | GIT_REPOSITORY https://github.com/felipelouza/gsa-is.git 39 | ) 40 | 41 | FetchContent_GetProperties(gsacak) 42 | if(NOT gsacak_POPULATED) 43 | FetchContent_Populate(gsacak) 44 | add_library(gsacak OBJECT ${gsacak_SOURCE_DIR}/gsacak.c ${gsacak_SOURCE_DIR}/gsacak.h) 45 | target_include_directories(gsacak PUBLIC "${gsacak_SOURCE_DIR}") 46 | 47 | add_library(gsacak64 OBJECT ${gsacak_SOURCE_DIR}/gsacak.c ${gsacak_SOURCE_DIR}/gsacak.h) 48 | target_include_directories(gsacak64 PUBLIC "${gsacak_SOURCE_DIR}") 49 | target_compile_options(gsacak64 PUBLIC -DM64) 50 | endif() 51 | 52 | ## Add pfp-thresholds 53 | FetchContent_Declare( 54 | pfp_thresholds 55 | GIT_REPOSITORY https://github.com/maxrossi91/pfp-thresholds.git 56 | GIT_TAG develop 57 | ) 58 | 59 | FetchContent_GetProperties(pfp_thresholds) 60 | if(NOT pfp_thresholds_POPULATED) 61 | FetchContent_Populate(pfp_thresholds) 62 | add_subdirectory(${pfp_thresholds_SOURCE_DIR} ${pfp_thresholds_BINARY_DIR}) 63 | 64 | endif() 65 | 66 | ## Google 
benchmark 67 | FetchContent_Declare( 68 | benchmark 69 | GIT_REPOSITORY https://github.com/google/benchmark.git 70 | GIT_TAG master 71 | ) 72 | 73 | FetchContent_GetProperties(benchmark) 74 | if(NOT benchmark_POPULATED) 75 | FetchContent_Populate(benchmark) 76 | set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "Enable testing of the benchmark library." FORCE) 77 | add_subdirectory(${benchmark_SOURCE_DIR} ${benchmark_BINARY_DIR} EXCLUDE_FROM_ALL) 78 | endif() 79 | 80 | 81 | ## Add sdsl 82 | FetchContent_Declare( 83 | sdsl 84 | # GIT_REPOSITORY https://github.com/elarielcl/sdsl-lite.git 85 | GIT_REPOSITORY https://github.com/simongog/sdsl-lite 86 | ) 87 | 88 | FetchContent_GetProperties(sdsl) 89 | if(NOT sdsl_POPULATED) 90 | FetchContent_Populate(sdsl) 91 | 92 | set(GENERATE_DOC OFF CACHE BOOL "Do not generate doxygen for sdsl-lite") 93 | 94 | add_subdirectory(${sdsl_SOURCE_DIR} ${sdsl_BINARY_DIR} EXCLUDE_FROM_ALL) 95 | endif() 96 | 97 | 98 | ## Add divsuffsort 99 | FetchContent_Declare( 100 | divsufsort 101 | GIT_REPOSITORY https://github.com/simongog/libdivsufsort.git 102 | GIT_TAG 2.0.1 103 | ) 104 | 105 | FetchContent_GetProperties(divsufsort) 106 | if(NOT divsufsort_POPULATED) 107 | FetchContent_Populate(divsufsort) 108 | 109 | set(BUILD_SHARED_LIBS OFF CACHE BOOL "Do not build a shared library for libdivsufsort") 110 | set(BUILD_EXAMPLES OFF CACHE BOOL "Do not build libdivsufsort example") 111 | set(BUILD_DIVSUFSORT64 ON CACHE BOOL "Build libdivsufsort in 64-bits mode") 112 | 113 | add_subdirectory(${divsufsort_SOURCE_DIR} ${divsufsort_BINARY_DIR} EXCLUDE_FROM_ALL) 114 | 115 | target_include_directories(divsufsort PUBLIC "${divsufsort_BINARY_DIR}/include") 116 | target_include_directories(divsufsort64 PUBLIC "${divsufsort_BINARY_DIR}/include") 117 | endif() --------------------------------------------------------------------------------