├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── blank_issue.md │ └── bugs.md └── dependabot.yml ├── .gitignore ├── Cargo.toml ├── LICENSE ├── README.md ├── src ├── bin │ └── tiny_lsm_bench.rs ├── fuzz.rs ├── lib.rs └── tearable.rs └── trophy_case ├── 00.json └── 01.json /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: spacejam # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/blank_issue.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Blank Issue (do not use this for bug reports or feature requests) 3 | about: Create an issue with a blank template. 4 | --- 5 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bugs.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Report 3 | about: Report a correctness issue or violated expectation 4 | labels: bug 5 | --- 6 | 7 | Bug reports must include all of the following items: 8 | 9 | 1. expected result 10 | 1. actual result 11 | 1. tiny-lsm version 12 | 1. rustc version 13 | 1. operating system 14 | 1. 
minimal code sample that helps to reproduce the issue 15 | 1. logs, panic messages, stack traces 16 | 17 | Incomplete bug reports will be closed. 18 | 19 | Do not open bug reports for documentation issues. Please just open a PR with the proposed documentation change. 20 | 21 | Thank you for understanding :) 22 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: cargo 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | time: "10:00" 8 | open-pull-requests-limit: 10 9 | ignore: 10 | - dependency-name: crdts 11 | versions: 12 | - ">= 2.a, < 3" 13 | - dependency-name: zerocopy 14 | versions: 15 | - 0.4.0 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | tiny_lsm_bench 2 | target 3 | Cargo.lock 4 | fuzz* 5 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tiny-lsm" 3 | version = "0.4.6" 4 | edition = "2021" 5 | authors = ["Tyler Neely "] 6 | description = "a dead-simple in-memory blocking LSM tree for constant-sized keys and values" 7 | license = "GPL-3.0" 8 | repository = "https://github.com/spacejam/tiny-lsm" 9 | documentation = "https://docs.rs/tiny-lsm/" 10 | 11 | [features] 12 | no_fuzz = [] 13 | 14 | [profile.dev] 15 | panic = "abort" 16 | 17 | [dependencies] 18 | crc32fast = "1.3.0" 19 | zstd = "0.12.3" 20 | log = "0.4.14" 21 | 22 | [dev-dependencies] 23 | env_logger = "0.10.0" 24 | fuzzcheck = "0.12.1" 25 | serde = { version = "1.0.130", features = ["derive"] } 26 | serde_json = "1.0.72" 27 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. 
You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 
70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. 
If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. 
For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 
174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 
209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. 
Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. 
If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 
309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 
336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 
360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. 
If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 
428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | <one line to give the program's name and a brief idea of what it does.> 635 | Copyright (C) <year> <name of author> 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see <https://www.gnu.org/licenses/>. 649 | 650 | Also add information on how to contact you by electronic and paper mail.
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | <program> Copyright (C) <year> <name of author> 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | <https://www.gnu.org/licenses/>. 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | <https://www.gnu.org/licenses/why-not-lgpl.html>. 675 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tiny-lsm 2 | 3 | Super simple in-memory blocking LSM for constant-size keys and values. 4 | 5 | Despite being single-threaded and blocking, this is still capable 6 | of outperforming a wide range of other storage systems.
7 | 8 | This is a great choice when you: 9 | * want to fit the whole data set in-memory 10 | * can model your keys and values in a bounded number of bytes 11 | 12 | Tested with [fuzzcheck](https://docs.rs/fuzzcheck), and the API and 13 | internals are intentionally being kept minimal to reduce bugs and 14 | improve performance for the use cases that this works well for. 15 | 16 | Pairs extremely well with the [zerocopy](https://docs.rs/zerocopy) 17 | crate for viewing the fixed size byte arrays as typed data without 18 | paying expensive deserialization costs. 19 | -------------------------------------------------------------------------------- /src/bin/tiny_lsm_bench.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | let before_recovery = std::time::Instant::now(); 3 | let mut lsm = tiny_lsm::Lsm::<8, 8>::recover("tiny_lsm_bench").unwrap(); 4 | dbg!(before_recovery.elapsed()); 5 | 6 | if let Some((k, _v)) = lsm.iter().next_back() { 7 | println!("max key recovered: {:?}", u64::from_le_bytes(*k)); 8 | } else { 9 | println!("starting from scratch"); 10 | } 11 | 12 | let before_writes = std::time::Instant::now(); 13 | for i in 1_u64..1_000_000_000 { 14 | lsm.insert(i.to_le_bytes(), [0; 8]).unwrap(); 15 | if i % 1_000_000 == 0 { 16 | println!( 17 | "{:.2} million wps - stats: {:?}", 18 | i as f64 / (before_writes.elapsed().as_micros() + 1) as f64, 19 | lsm.stats(), 20 | ) 21 | } 22 | } 23 | lsm.flush().unwrap(); 24 | dbg!(before_writes.elapsed()); 25 | 26 | std::thread::sleep(std::time::Duration::from_secs(200)); 27 | } 28 | -------------------------------------------------------------------------------- /src/fuzz.rs: -------------------------------------------------------------------------------- 1 | use std::collections::BTreeMap; 2 | use std::ops::Deref; 3 | use std::sync::atomic::{AtomicUsize, Ordering}; 4 | 5 | use serde::{Deserialize, Serialize}; 6 | 7 | #[derive(Debug, Clone, Serialize, Deserialize, 
fuzzcheck::DefaultMutator)] 8 | enum Operation { 9 | Insert(u8, u8), 10 | Remove(u8), 11 | Contains(u8), 12 | Batch(Vec<(u8, Option)>), 13 | TornBatch(Vec<(u8, Option)>, usize), 14 | Restart, 15 | } 16 | 17 | #[derive(Debug, Clone, Serialize, Deserialize, fuzzcheck::DefaultMutator)] 18 | struct Args { 19 | ops: Vec, 20 | config: crate::Config, 21 | } 22 | 23 | fn compare_with_btree_map(args: &Args) { 24 | static NDB: AtomicUsize = AtomicUsize::new(0); 25 | 26 | let path = format!( 27 | "test_fuzz_db/fuzzcheck-test-{}", 28 | NDB.fetch_add(1, Ordering::SeqCst) 29 | ); 30 | 31 | let _ = std::fs::remove_dir_all(&path); 32 | 33 | let mut lsm = crate::Lsm::<1, 1>::recover_with_config(&path, args.config).unwrap(); 34 | let mut map = BTreeMap::<[u8; 1], [u8; 1]>::new(); 35 | for op in &args.ops { 36 | match op { 37 | Operation::Insert(key, value) => { 38 | let a = lsm.insert([*key], [*value]).unwrap(); 39 | let b = map.insert([*key], [*value]); 40 | assert_eq!(a, b); 41 | } 42 | Operation::Remove(key) => { 43 | let a = lsm.remove(&[*key]).unwrap(); 44 | let b = map.remove(&[*key]); 45 | assert_eq!(a, b); 46 | } 47 | Operation::Contains(key) => { 48 | let a = lsm.contains_key(&[*key]); 49 | let b = map.contains_key(&[*key]); 50 | assert_eq!(a, b); 51 | } 52 | Operation::Batch(batch) => { 53 | let mut wb = vec![]; 54 | for (k, v) in batch { 55 | if let Some(v) = v { 56 | map.insert([*k], [*v]); 57 | wb.push(([*k], Some([*v]))); 58 | } else { 59 | map.remove(&[*k]); 60 | wb.push(([*k], None)); 61 | } 62 | } 63 | 64 | lsm.write_batch(&wb).unwrap(); 65 | } 66 | Operation::TornBatch(batch, tear_offset) => { 67 | // this tests torn batches which 68 | // should not be present in the 69 | // db after recovering. 
70 | 71 | lsm.flush().unwrap(); 72 | 73 | lsm.log.begin_tear(); 74 | 75 | let mut wb = vec![]; 76 | for (k, v) in batch { 77 | if let Some(v) = v { 78 | wb.push(([*k], Some([*v]))); 79 | } else { 80 | wb.push(([*k], None)); 81 | } 82 | } 83 | 84 | lsm.write_batch(&wb).unwrap(); 85 | 86 | lsm.log.apply_tear(*tear_offset, false); 87 | 88 | drop(lsm); 89 | 90 | lsm = crate::Lsm::recover_with_config(&path, args.config).unwrap(); 91 | 92 | // lsm should be the same as if the batch was never applied 93 | } 94 | Operation::Restart => { 95 | log::info!("restarting in test"); 96 | lsm.flush().unwrap(); 97 | drop(lsm); 98 | lsm = crate::Lsm::recover_with_config(&path, args.config).unwrap(); 99 | } 100 | } 101 | assert_eq!( 102 | lsm.deref(), 103 | &map, 104 | "lsm and map diverged after op {:?}:\nlsm: {:?}\nmap:{:?}", 105 | op, 106 | lsm.deref(), 107 | map 108 | ); 109 | } 110 | let _ = std::fs::remove_dir_all(&path); 111 | } 112 | 113 | #[cfg(not(feature = "no_fuzz"))] 114 | #[test] 115 | fn check() { 116 | env_logger::init(); 117 | let _ = std::fs::remove_dir_all("test_fuzz_db"); 118 | let result = fuzzcheck::fuzz_test(compare_with_btree_map) 119 | .default_options() 120 | .stop_after_first_test_failure(true) 121 | .launch(); 122 | let _ = std::fs::remove_dir_all("test_fuzz_db"); 123 | assert!(!result.found_test_failure); 124 | } 125 | 126 | #[test] 127 | fn test_corruption() { 128 | static NDB: AtomicUsize = AtomicUsize::new(0); 129 | 130 | let wb = [([0], None), ([1], Some([1])), ([2], None)]; 131 | 132 | let _ = std::fs::remove_dir_all("test_corruption_db"); 133 | for i in 0..100 { 134 | let path = format!( 135 | "test_corruption_db/corruption-test-{}", 136 | NDB.fetch_add(1, Ordering::SeqCst) 137 | ); 138 | 139 | let _ = std::fs::remove_dir_all(&path); 140 | 141 | let mut lsm = crate::Lsm::<1, 1>::recover(&path).unwrap(); 142 | lsm.flush().unwrap(); 143 | 144 | lsm.log.begin_tear(); 145 | lsm.write_batch(&wb).unwrap(); 146 | 147 | lsm.log.apply_tear(i, true); 148 | 149 
| drop(lsm); 150 | 151 | lsm = crate::Lsm::recover(&path).unwrap(); 152 | 153 | assert!(lsm.is_empty(), "corruption test at slot {}", i); 154 | } 155 | let _ = std::fs::remove_dir_all("test_corruption_db"); 156 | } 157 | 158 | #[test] 159 | fn trophies() { 160 | for _ in 0..1 { 161 | for i in 0..2 { 162 | let json = std::fs::read_to_string(format!("trophy_case/0{}.json", i)).unwrap(); 163 | let args: Args = serde_json::from_str(&json).unwrap(); 164 | compare_with_btree_map(&args); 165 | } 166 | } 167 | } 168 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! `tiny-lsm` is a dead-simple in-memory LSM for managing 2 | //! fixed-size metadata in more complex systems. 3 | //! 4 | //! Uses crc32fast to checksum all key-value pairs in the log and 5 | //! sstables. Uses zstd to compress all sstables. Performs sstable 6 | //! compaction in the background. 7 | //! 8 | //! Because the data is in-memory, there is no need to put bloom 9 | //! filters on the sstables, and read operations cannot fail due 10 | //! to IO issues. 11 | //! 12 | //! `Lsm<K, V>` implements `Deref<Target = BTreeMap<[u8; K], [u8; V]>>` 13 | //! to immutably access the data directly without any IO or 14 | //! blocking. 15 | //! 16 | //! `Lsm::insert` writes all data into a 32-kb `BufWriter` 17 | //! in front of a log file, so it will block for very 18 | //! short periods of time here and there. SST compaction 19 | //! is handled completely in the background. 20 | //! 21 | //! This is a bad choice for large data sets if you 22 | //! require quick recovery time because it needs to read all of 23 | //! the sstables and the write ahead log when starting up. 24 | //! 25 | //! The benefit to using tiered sstables at all, despite being 26 | //! in-memory, is that they act as an effective log-deduplication 27 | //! mechanism, keeping space amplification very low. 28 | //! 29 | //!
Maximum throughput is not the goal of this project. Low space 30 | //! amplification and very simple code is the goal, because this 31 | //! is intended to maintain metadata in more complex systems. 32 | //! 33 | //! There is currently no compaction throttling. You can play 34 | //! with the `Config` options around compaction to change compaction 35 | //! characteristics. 36 | //! 37 | //! Never change the constant size of keys or values for an existing 38 | //! database. 39 | //! 40 | //! # Examples 41 | //! 42 | //! ``` 43 | //! // open up the LSM 44 | //! let mut lsm = tiny_lsm::Lsm::recover("path/to/base/dir").expect("recover lsm"); 45 | //! 46 | //! // store some things 47 | //! let key: [u8; 8] = 8_u64.to_le_bytes(); 48 | //! let value: [u8; 1] = 255_u8.to_le_bytes(); 49 | //! lsm.insert(key, value); 50 | //! 51 | //! assert_eq!(lsm.get(&key), Some(&value)); 52 | //! 53 | //! ``` 54 | #![cfg_attr(test, feature(no_coverage))] 55 | 56 | use std::collections::BTreeMap; 57 | use std::fs; 58 | use std::io::{self, prelude::*, BufReader, BufWriter, Result}; 59 | use std::path::{Path, PathBuf}; 60 | use std::sync::{ 61 | atomic::{AtomicU64, Ordering}, 62 | mpsc, Arc, 63 | }; 64 | 65 | const SSTABLE_DIR: &str = "sstables"; 66 | const U64_SZ: usize = std::mem::size_of::(); 67 | 68 | #[derive(Debug, Clone, Copy)] 69 | #[cfg_attr( 70 | test, 71 | derive(serde::Serialize, serde::Deserialize, fuzzcheck::DefaultMutator) 72 | )] 73 | pub struct Config { 74 | /// If on-disk uncompressed sstable data exceeds in-memory usage 75 | /// by this proportion, a full-compaction of all sstables will 76 | /// occur. This is only likely to happen in situations where 77 | /// multiple versions of most of the database's keys exist 78 | /// in multiple sstables, but should never happen for workloads 79 | /// where mostly new keys are being written. 
80 | pub max_space_amp: u8, 81 | /// When the log file exceeds this size, a new compressed 82 | /// and compacted sstable will be flushed to disk and the 83 | /// log file will be truncated. 84 | pub max_log_length: usize, 85 | /// When the background compactor thread looks for contiguous 86 | /// ranges of sstables to merge, it will require all sstables 87 | /// to be at least 1/`merge_ratio` * the size of the first sstable 88 | /// in the contiguous window under consideration. 89 | pub merge_ratio: u8, 90 | /// When the background compactor thread looks for ranges of 91 | /// sstables to merge, it will require ranges to be at least 92 | /// this long. 93 | pub merge_window: u8, 94 | /// All inserts go directly to a `BufWriter` wrapping the log 95 | /// file. This option determines how large that in-memory buffer 96 | /// is. 97 | pub log_bufwriter_size: u32, 98 | /// The level of compression to use for the sstables with zstd. 99 | pub zstd_sstable_compression_level: u8, 100 | } 101 | 102 | impl Default for Config { 103 | fn default() -> Config { 104 | Config { 105 | max_space_amp: 2, 106 | max_log_length: 32 * 1024 * 1024, 107 | merge_ratio: 3, 108 | merge_window: 10, 109 | log_bufwriter_size: 32 * 1024, 110 | zstd_sstable_compression_level: 3, 111 | } 112 | } 113 | } 114 | 115 | struct WorkerStats { 116 | read_bytes: AtomicU64, 117 | written_bytes: AtomicU64, 118 | } 119 | 120 | #[derive(Debug, Clone, Copy)] 121 | pub struct Stats { 122 | pub resident_bytes: u64, 123 | pub on_disk_bytes: u64, 124 | pub logged_bytes: u64, 125 | pub written_bytes: u64, 126 | pub read_bytes: u64, 127 | pub space_amp: f64, 128 | pub write_amp: f64, 129 | } 130 | 131 | fn hash(k: &[u8; K], v: &Option<[u8; V]>) -> u32 { 132 | let mut hasher = crc32fast::Hasher::new(); 133 | hasher.update(&[v.is_some() as u8]); 134 | hasher.update(&*k); 135 | 136 | if let Some(v) = v { 137 | hasher.update(v); 138 | } else { 139 | hasher.update(&[0; V]); 140 | } 141 | 142 | // we XOR the hash to make 
sure it's something other than 0 when empty, 143 | // because 0 is an easy value to create accidentally or via corruption. 144 | hasher.finalize() ^ 0xFF 145 | } 146 | 147 | #[inline] 148 | fn hash_batch_len(len: usize) -> u32 { 149 | let mut hasher = crc32fast::Hasher::new(); 150 | hasher.update(&(len as u64).to_le_bytes()); 151 | 152 | hasher.finalize() ^ 0xFF 153 | } 154 | 155 | enum WorkerMessage { 156 | NewSST { id: u64, sst_sz: u64, db_sz: u64 }, 157 | Stop(mpsc::Sender<()>), 158 | Heartbeat(mpsc::Sender<()>), 159 | } 160 | 161 | struct Worker { 162 | sstable_directory: BTreeMap, 163 | inbox: mpsc::Receiver, 164 | db_sz: u64, 165 | path: PathBuf, 166 | config: Config, 167 | stats: Arc, 168 | } 169 | 170 | impl Worker { 171 | #[cfg(not(test))] 172 | fn run(mut self) { 173 | while self.tick() {} 174 | log::info!("tiny-lsm compaction worker quitting"); 175 | } 176 | 177 | fn tick(&mut self) -> bool { 178 | match self.inbox.recv() { 179 | Ok(message) => { 180 | if !self.handle_message(message) { 181 | return false; 182 | } 183 | } 184 | Err(mpsc::RecvError) => { 185 | return false; 186 | } 187 | } 188 | 189 | // only compact one run at a time before checking 190 | // for new messages. 
191 | if let Err(e) = self.sstable_maintenance() { 192 | log::error!( 193 | "error while compacting sstables \ 194 | in the background: {:?}", 195 | e 196 | ); 197 | } 198 | 199 | true 200 | } 201 | 202 | fn handle_message(&mut self, message: WorkerMessage) -> bool { 203 | match message { 204 | WorkerMessage::NewSST { id, sst_sz, db_sz } => { 205 | self.db_sz = db_sz; 206 | self.sstable_directory.insert(id, sst_sz); 207 | true 208 | } 209 | WorkerMessage::Stop(dropper) => { 210 | drop(dropper); 211 | false 212 | } 213 | WorkerMessage::Heartbeat(dropper) => { 214 | drop(dropper); 215 | true 216 | } 217 | } 218 | } 219 | 220 | fn sstable_maintenance(&mut self) -> Result<()> { 221 | let on_disk_size: u64 = self.sstable_directory.values().sum(); 222 | 223 | log::debug!("disk size: {} mem size: {}", on_disk_size, self.db_sz); 224 | if self.sstable_directory.len() > 1 225 | && on_disk_size / (self.db_sz + 1) > self.config.max_space_amp as u64 226 | { 227 | log::debug!( 228 | "performing full compaction, decompressed on-disk \ 229 | database size has grown beyond {}x the in-memory size", 230 | self.config.max_space_amp 231 | ); 232 | let run_to_compact: Vec = self.sstable_directory.keys().copied().collect(); 233 | 234 | self.compact_sstable_run(&run_to_compact)?; 235 | return Ok(()); 236 | } 237 | 238 | if self.sstable_directory.len() < self.config.merge_window.max(2) as usize { 239 | return Ok(()); 240 | } 241 | 242 | for window in self 243 | .sstable_directory 244 | .iter() 245 | .collect::>() 246 | .windows(self.config.merge_window.max(2) as usize) 247 | { 248 | if window 249 | .iter() 250 | .skip(1) 251 | .all(|w| *w.1 * self.config.merge_ratio as u64 > *window[0].1) 252 | { 253 | let run_to_compact: Vec = window.into_iter().map(|(id, _sum)| **id).collect(); 254 | 255 | self.compact_sstable_run(&run_to_compact)?; 256 | return Ok(()); 257 | } 258 | } 259 | 260 | Ok(()) 261 | } 262 | 263 | // This function must be able to crash at any point without 264 | // leaving the 
system in an unrecoverable state, or without 265 | // losing data. This function must be nullipotent from the 266 | // external API surface's perspective. 267 | fn compact_sstable_run(&mut self, sstable_ids: &[u64]) -> Result<()> { 268 | log::debug!( 269 | "trying to compact sstable_ids {:?}", 270 | sstable_ids 271 | .iter() 272 | .map(|id| id_format(*id)) 273 | .collect::>() 274 | ); 275 | 276 | let mut map = BTreeMap::new(); 277 | 278 | let mut read_pairs = 0; 279 | 280 | for sstable_id in sstable_ids { 281 | for (k, v) in read_sstable::(&self.path, *sstable_id)? { 282 | map.insert(k, v); 283 | read_pairs += 1; 284 | } 285 | } 286 | 287 | self.stats 288 | .read_bytes 289 | .fetch_add(read_pairs * (4 + 1 + K + V) as u64, Ordering::Relaxed); 290 | 291 | let sst_id = sstable_ids 292 | .iter() 293 | .max() 294 | .expect("compact_sstable_run called with empty set of sst ids"); 295 | 296 | write_sstable(&self.path, *sst_id, &map, true, &self.config)?; 297 | 298 | self.stats 299 | .written_bytes 300 | .fetch_add(map.len() as u64 * (4 + 1 + K + V) as u64, Ordering::Relaxed); 301 | 302 | let sst_sz = map.len() as u64 * (4 + K + V) as u64; 303 | self.sstable_directory.insert(*sst_id, sst_sz); 304 | 305 | log::debug!("compacted range into sstable {}", id_format(*sst_id)); 306 | 307 | for sstable_id in sstable_ids { 308 | if sstable_id == sst_id { 309 | continue; 310 | } 311 | fs::remove_file(self.path.join(SSTABLE_DIR).join(id_format(*sstable_id)))?; 312 | self.sstable_directory 313 | .remove(sstable_id) 314 | .expect("compacted sst not present in sstable_directory"); 315 | } 316 | fs::File::open(self.path.join(SSTABLE_DIR))?.sync_all()?; 317 | 318 | Ok(()) 319 | } 320 | } 321 | 322 | fn id_format(id: u64) -> String { 323 | format!("{:016x}", id) 324 | } 325 | 326 | fn list_sstables(path: &Path, remove_tmp: bool) -> Result> { 327 | let mut sstable_map = BTreeMap::new(); 328 | 329 | for dir_entry_res in fs::read_dir(path.join(SSTABLE_DIR))? 
{ 330 | let dir_entry = dir_entry_res?; 331 | let file_name = if let Ok(f) = dir_entry.file_name().into_string() { 332 | f 333 | } else { 334 | continue; 335 | }; 336 | 337 | if let Ok(id) = u64::from_str_radix(&file_name, 16) { 338 | let metadata = dir_entry.metadata()?; 339 | 340 | sstable_map.insert(id, metadata.len()); 341 | } else { 342 | if remove_tmp && file_name.ends_with("-tmp") { 343 | log::warn!("removing incomplete sstable rewrite {}", file_name); 344 | fs::remove_file(path.join(SSTABLE_DIR).join(file_name))?; 345 | } 346 | } 347 | } 348 | 349 | Ok(sstable_map) 350 | } 351 | 352 | fn write_sstable( 353 | path: &Path, 354 | id: u64, 355 | items: &BTreeMap<[u8; K], Option<[u8; V]>>, 356 | tmp_mv: bool, 357 | config: &Config, 358 | ) -> Result<()> { 359 | let sst_dir_path = path.join(SSTABLE_DIR); 360 | let sst_path = if tmp_mv { 361 | sst_dir_path.join(format!("{:x}-tmp", id)) 362 | } else { 363 | sst_dir_path.join(id_format(id)) 364 | }; 365 | 366 | let file = fs::OpenOptions::new() 367 | .create(true) 368 | .write(true) 369 | .open(&sst_path)?; 370 | 371 | let max_zstd_level = zstd::compression_level_range(); 372 | let zstd_level = config 373 | .zstd_sstable_compression_level 374 | .min(*max_zstd_level.end() as u8); 375 | 376 | let mut bw = 377 | BufWriter::new(zstd::Encoder::new(file, zstd_level as _).expect("zstd encoder failure")); 378 | 379 | bw.write_all(&(items.len() as u64).to_le_bytes())?; 380 | 381 | for (k, v) in items { 382 | let crc: u32 = hash(k, v); 383 | bw.write_all(&crc.to_le_bytes())?; 384 | bw.write_all(&[v.is_some() as u8])?; 385 | bw.write_all(k)?; 386 | 387 | if let Some(v) = v { 388 | bw.write_all(v)?; 389 | } else { 390 | bw.write_all(&[0; V])?; 391 | } 392 | } 393 | 394 | bw.flush()?; 395 | 396 | bw.get_mut().get_mut().sync_all()?; 397 | fs::File::open(path.join(SSTABLE_DIR))?.sync_all()?; 398 | 399 | if tmp_mv { 400 | let new_path = sst_dir_path.join(id_format(id)); 401 | fs::rename(sst_path, new_path)?; 402 | } 403 | 404 | 
Ok(()) 405 | } 406 | 407 | fn read_sstable( 408 | path: &Path, 409 | id: u64, 410 | ) -> Result)>> { 411 | let file = fs::OpenOptions::new() 412 | .read(true) 413 | .open(path.join(SSTABLE_DIR).join(id_format(id)))?; 414 | 415 | let mut reader = zstd::Decoder::new(BufReader::with_capacity(16 * 1024 * 1024, file)).unwrap(); 416 | 417 | // crc + tombstone discriminant + key + value 418 | let mut buf = vec![0; 4 + 1 + K + V]; 419 | 420 | let len_buf = &mut [0; 8]; 421 | 422 | reader.read_exact(len_buf)?; 423 | 424 | let expected_len: u64 = u64::from_le_bytes(*len_buf); 425 | let mut sstable = Vec::with_capacity(expected_len as usize); 426 | 427 | while let Ok(()) = reader.read_exact(&mut buf) { 428 | let crc_expected: u32 = u32::from_le_bytes(buf[0..4].try_into().unwrap()); 429 | let d: bool = match buf[4] { 430 | 0 => false, 431 | 1 => true, 432 | _ => { 433 | log::warn!("detected torn-write while reading sstable {:016x}", id); 434 | break; 435 | } 436 | }; 437 | let k: [u8; K] = buf[5..K + 5].try_into().unwrap(); 438 | let v: Option<[u8; V]> = if d { 439 | Some(buf[K + 5..5 + K + V].try_into().unwrap()) 440 | } else { 441 | None 442 | }; 443 | let crc_actual: u32 = hash(&k, &v); 444 | 445 | if crc_expected != crc_actual { 446 | log::warn!("detected torn-write while reading sstable {:016x}", id); 447 | break; 448 | } 449 | 450 | sstable.push((k, v)); 451 | } 452 | 453 | if sstable.len() as u64 != expected_len { 454 | log::warn!( 455 | "sstable {:016x} tear detected - process probably crashed \ 456 | before full sstable could be written out", 457 | id 458 | ); 459 | } 460 | 461 | Ok(sstable) 462 | } 463 | 464 | pub struct Lsm { 465 | // `BufWriter` flushes on drop 466 | memtable: BTreeMap<[u8; K], Option<[u8; V]>>, 467 | db: BTreeMap<[u8; K], [u8; V]>, 468 | worker_outbox: mpsc::Sender, 469 | next_sstable_id: u64, 470 | dirty_bytes: usize, 471 | #[cfg(test)] 472 | worker: Worker, 473 | #[cfg(test)] 474 | pub log: tearable::Tearable, 475 | #[cfg(not(test))] 476 | log: 
BufWriter, 477 | path: PathBuf, 478 | config: Config, 479 | stats: Stats, 480 | worker_stats: Arc, 481 | } 482 | 483 | impl Drop for Lsm { 484 | fn drop(&mut self) { 485 | let (tx, rx) = mpsc::channel(); 486 | 487 | if self.worker_outbox.send(WorkerMessage::Stop(tx)).is_err() { 488 | log::error!("failed to shut down compaction worker on Lsm drop"); 489 | return; 490 | } 491 | 492 | #[cfg(test)] 493 | assert!(!self.worker.tick()); 494 | 495 | for _ in rx {} 496 | } 497 | } 498 | 499 | impl std::ops::Deref for Lsm { 500 | type Target = BTreeMap<[u8; K], [u8; V]>; 501 | 502 | fn deref(&self) -> &Self::Target { 503 | &self.db 504 | } 505 | } 506 | 507 | impl Lsm { 508 | /// Recover the LSM off disk. Make sure to never 509 | /// recover a DB using different K, V parameters than 510 | /// it was created with, or there may be data loss. 511 | /// 512 | /// This is an O(N) operation and involves reading 513 | /// all previously written sstables and the log, 514 | /// to recover all data into an in-memory `BTreeMap`. 515 | pub fn recover>(p: P) -> Result> { 516 | Lsm::recover_with_config(p, Config::default()) 517 | } 518 | 519 | /// Recover the LSM, and provide custom options 520 | /// around IO and merging. All values in the `Config` 521 | /// object are safe to change across restarts, unlike 522 | /// the fixed K and V lengths for data in the database. 
523 | pub fn recover_with_config>(p: P, config: Config) -> Result> { 524 | let path = p.as_ref(); 525 | if !path.exists() { 526 | fs::create_dir_all(path)?; 527 | fs::create_dir(path.join(SSTABLE_DIR))?; 528 | fs::File::open(path.join(SSTABLE_DIR))?.sync_all()?; 529 | fs::File::open(path)?.sync_all()?; 530 | let mut parent_opt = path.parent(); 531 | 532 | // need to recursively fsync parents since 533 | // we used create_dir_all 534 | while let Some(parent) = parent_opt { 535 | if parent.file_name().is_none() { 536 | break; 537 | } 538 | if fs::File::open(parent).and_then(|f| f.sync_all()).is_err() { 539 | // we made a reasonable attempt, but permissions 540 | // can sometimes get in the way, and at this point it's 541 | // becoming pedantic. 542 | break; 543 | } 544 | parent_opt = parent.parent(); 545 | } 546 | } 547 | 548 | let sstable_directory = list_sstables(path, true)?; 549 | 550 | let mut db = BTreeMap::new(); 551 | for sstable_id in sstable_directory.keys() { 552 | for (k, v) in read_sstable::(path, *sstable_id)? { 553 | if let Some(v) = v { 554 | db.insert(k, v); 555 | } else { 556 | db.remove(&k); 557 | } 558 | } 559 | } 560 | 561 | let max_sstable_id = sstable_directory.keys().next_back().copied(); 562 | 563 | let log = fs::OpenOptions::new() 564 | .create(true) 565 | .read(true) 566 | .write(true) 567 | .open(path.join("log"))?; 568 | 569 | let mut reader = BufReader::new(log); 570 | 571 | let tuple_sz = U64_SZ.max(K + V); 572 | let header_sz = 5; 573 | let header_tuple_sz = header_sz + tuple_sz; 574 | let mut buf = vec![0; header_tuple_sz]; 575 | 576 | let mut memtable = BTreeMap::new(); 577 | let mut recovered = 0; 578 | 579 | // write_batch is the pending memtable updates, the number 580 | // of remaining items in the write batch, and the number of 581 | // bytes that have been recovered in the write batch. 
582 | let mut write_batch: Option<(_, usize, u64)> = None; 583 | while let Ok(()) = reader.read_exact(&mut buf) { 584 | let crc_expected: u32 = u32::from_le_bytes(buf[0..4].try_into().unwrap()); 585 | let d: bool = match buf[4] { 586 | 0 => false, 587 | 1 => true, 588 | 2 if write_batch.is_none() => { 589 | // begin batch 590 | let batch_sz_buf: [u8; 8] = buf[5..5 + 8].try_into().unwrap(); 591 | let batch_sz: u64 = u64::from_le_bytes(batch_sz_buf); 592 | log::debug!("processing batch of len {}", batch_sz); 593 | 594 | let crc_actual = hash_batch_len(usize::try_from(batch_sz).unwrap()); 595 | if crc_expected != crc_actual { 596 | log::warn!("crc mismatch for batch size marker"); 597 | break; 598 | } 599 | 600 | if !buf[5 + U64_SZ..].iter().all(|e| *e == 0) { 601 | log::warn!( 602 | "expected all pad bytes after logged \ 603 | batch manifests to be zero, but some \ 604 | corruption was detected" 605 | ); 606 | break; 607 | } 608 | 609 | if batch_sz > usize::MAX as u64 { 610 | return Err(io::Error::new( 611 | io::ErrorKind::InvalidInput, 612 | "recovering a batch size over usize::MAX is not supported", 613 | )); 614 | } 615 | 616 | let wb_remaining = batch_sz as usize; 617 | let wb_recovered = buf.len() as u64; 618 | 619 | if wb_remaining > 0 { 620 | write_batch = Some(( 621 | Vec::with_capacity(batch_sz as usize), 622 | wb_remaining, 623 | wb_recovered, 624 | )); 625 | } else { 626 | recovered += buf.len() as u64; 627 | } 628 | 629 | continue; 630 | } 631 | _ => { 632 | log::warn!("invalid log message discriminant detected: {}", buf[4]); 633 | break; 634 | } 635 | }; 636 | let k: [u8; K] = buf[5..5 + K].try_into().unwrap(); 637 | let v: Option<[u8; V]> = if d { 638 | Some(buf[5 + K..5 + K + V].try_into().unwrap()) 639 | } else { 640 | None 641 | }; 642 | 643 | let crc_actual: u32 = hash(&k, &v); 644 | 645 | if crc_expected != crc_actual { 646 | log::warn!( 647 | "crc mismatch for kv pair {:?}-{:?}: expected {} actual {}, torn log detected", 648 | k, 649 | v, 650 | 
crc_expected, 651 | crc_actual 652 | ); 653 | break; 654 | } 655 | 656 | let pad_start = if v.is_some() { 5 + K + V } else { 5 + K }; 657 | 658 | if !buf[pad_start..].iter().all(|e| *e == 0) { 659 | log::warn!( 660 | "expected all pad bytes for logged kv entries \ 661 | to be zero, but some corruption was detected" 662 | ); 663 | break; 664 | } 665 | 666 | if let Some((mut wb, mut wb_remaining, mut wb_recovered)) = write_batch.take() { 667 | wb.push((k, v)); 668 | wb_remaining = wb_remaining.checked_sub(1).unwrap(); 669 | wb_recovered = wb_recovered.checked_add(buf.len() as u64).unwrap(); 670 | 671 | // apply the write batch all at once 672 | // or never at all 673 | if wb_remaining == 0 { 674 | for (k, v) in wb { 675 | memtable.insert(k, v); 676 | 677 | if let Some(v) = v { 678 | db.insert(k, v); 679 | } else { 680 | db.remove(&k); 681 | } 682 | } 683 | recovered += wb_recovered; 684 | } else { 685 | write_batch = Some((wb, wb_remaining, wb_recovered)); 686 | } 687 | } else { 688 | memtable.insert(k, v); 689 | 690 | if let Some(v) = v { 691 | db.insert(k, v); 692 | } else { 693 | db.remove(&k); 694 | } 695 | 696 | recovered += buf.len() as u64; 697 | } 698 | } 699 | 700 | // need to back up a few bytes to chop off the torn log 701 | log::debug!("recovered {} kv pairs", db.len()); 702 | log::debug!("rewinding log down to length {}", recovered); 703 | let log_file = reader.get_mut(); 704 | log_file.seek(io::SeekFrom::Start(recovered))?; 705 | log_file.set_len(recovered)?; 706 | log_file.sync_all()?; 707 | fs::File::open(path.join(SSTABLE_DIR))?.sync_all()?; 708 | 709 | let (tx, rx) = mpsc::channel(); 710 | 711 | let worker_stats = Arc::new(WorkerStats { 712 | read_bytes: 0.into(), 713 | written_bytes: 0.into(), 714 | }); 715 | 716 | let worker: Worker = Worker { 717 | path: path.clone().into(), 718 | sstable_directory, 719 | inbox: rx, 720 | db_sz: db.len() as u64 * (K + V) as u64, 721 | config, 722 | stats: worker_stats.clone(), 723 | }; 724 | 725 | 
#[cfg(not(test))] 726 | std::thread::spawn(move || worker.run()); 727 | 728 | let (hb_tx, hb_rx) = mpsc::channel(); 729 | tx.send(WorkerMessage::Heartbeat(hb_tx)).unwrap(); 730 | 731 | #[cfg(test)] 732 | let mut worker = worker; 733 | 734 | #[cfg(test)] 735 | assert!(worker.tick()); 736 | 737 | for _ in hb_rx {} 738 | 739 | let lsm = Lsm { 740 | #[cfg(not(test))] 741 | log: BufWriter::with_capacity(config.log_bufwriter_size as usize, reader.into_inner()), 742 | #[cfg(test)] 743 | log: tearable::Tearable::new(reader.into_inner()), 744 | #[cfg(test)] 745 | worker, 746 | path: path.into(), 747 | next_sstable_id: max_sstable_id.unwrap_or(0) + 1, 748 | dirty_bytes: recovered as usize, 749 | worker_outbox: tx, 750 | config, 751 | stats: Stats { 752 | logged_bytes: recovered, 753 | on_disk_bytes: 0, 754 | read_bytes: 0, 755 | written_bytes: 0, 756 | resident_bytes: db.len() as u64 * (K + V) as u64, 757 | space_amp: 0., 758 | write_amp: 0., 759 | }, 760 | worker_stats, 761 | db, 762 | memtable, 763 | }; 764 | 765 | Ok(lsm) 766 | } 767 | 768 | /// Writes a KV pair into the `Lsm`, returning the 769 | /// previous value if it existed. This operation might 770 | /// involve blocking for a very brief moment as a 32kb 771 | /// `BufWriter` wrapping the log file is flushed. 772 | /// 773 | /// If you require blocking until all written data is 774 | /// durable, use the `Lsm::flush` method below. 775 | pub fn insert(&mut self, k: [u8; K], v: [u8; V]) -> Result> { 776 | self.log_mutation(k, Some(v))?; 777 | 778 | if self.dirty_bytes > self.config.max_log_length { 779 | self.flush()?; 780 | } 781 | 782 | Ok(self.db.insert(k, v)) 783 | } 784 | 785 | /// Removes a KV pair from the `Lsm`, returning the 786 | /// previous value if it existed. This operation might 787 | /// involve blocking for a very brief moment as a 32kb 788 | /// `BufWriter` wrapping the log file is flushed. 
789 | /// 790 | /// If you require blocking until all written data is 791 | /// durable, use the `Lsm::flush` method below. 792 | pub fn remove(&mut self, k: &[u8; K]) -> Result> { 793 | self.log_mutation(*k, None)?; 794 | 795 | if self.dirty_bytes > self.config.max_log_length { 796 | self.flush()?; 797 | } 798 | 799 | Ok(self.db.remove(k)) 800 | } 801 | 802 | /// Apply a set of updates to the `Lsm` and 803 | /// log them to disk in a way that will 804 | /// be recovered only if every update is 805 | /// present. 806 | pub fn write_batch(&mut self, write_batch: &[([u8; K], Option<[u8; V]>)]) -> Result<()> { 807 | let batch_len: [u8; 8] = (write_batch.len() as u64).to_le_bytes(); 808 | let crc = hash_batch_len(write_batch.len()); 809 | 810 | self.log.write_all(&crc.to_le_bytes())?; 811 | self.log.write_all(&[2_u8])?; 812 | self.log.write_all(&batch_len)?; 813 | 814 | // the zero pad is necessary because every log 815 | // entry must have the same length, whether 816 | // it's a batch size or actual kv tuple. 
817 | let tuple_sz = U64_SZ.max(K + V); 818 | let pad_sz = tuple_sz - U64_SZ; 819 | let pad = [0; U64_SZ]; 820 | self.log.write_all(&pad[..pad_sz])?; 821 | 822 | for (k, v_opt) in write_batch { 823 | if let Some(v) = v_opt { 824 | self.db.insert(*k, *v); 825 | } else { 826 | self.db.remove(k); 827 | } 828 | 829 | self.log_mutation(*k, *v_opt)?; 830 | self.memtable.insert(*k, *v_opt); 831 | } 832 | 833 | if self.dirty_bytes > self.config.max_log_length { 834 | self.flush()?; 835 | } 836 | 837 | Ok(()) 838 | } 839 | 840 | fn log_mutation(&mut self, k: [u8; K], v: Option<[u8; V]>) -> Result<()> { 841 | let crc: u32 = hash(&k, &v); 842 | self.log.write_all(&crc.to_le_bytes())?; 843 | self.log.write_all(&[v.is_some() as u8])?; 844 | self.log.write_all(&k)?; 845 | 846 | if let Some(v) = v { 847 | self.log.write_all(&v)?; 848 | } else { 849 | self.log.write_all(&[0; V])?; 850 | }; 851 | 852 | // the zero pad is necessary because every log 853 | // entry must have the same length, whether 854 | // it's a batch size or actual kv tuple. 855 | let min_tuple_sz = U64_SZ.max(K + V); 856 | let pad_sz = min_tuple_sz - (K + V); 857 | let pad = [0; U64_SZ]; 858 | self.log.write_all(&pad[..pad_sz])?; 859 | 860 | let logged_bytes = 4 + 1 + min_tuple_sz; 861 | 862 | self.memtable.insert(k, v); 863 | 864 | self.dirty_bytes += logged_bytes; 865 | self.stats.logged_bytes += logged_bytes as u64; 866 | self.stats.written_bytes += logged_bytes as u64; 867 | 868 | Ok(()) 869 | } 870 | 871 | /// Blocks until all log data has been 872 | /// written out to disk and fsynced. If 873 | /// the log file has grown above a certain 874 | /// threshold, it will be compacted into 875 | /// a new sstable and the log file will 876 | /// be truncated after the sstable has 877 | /// been written, fsynced, and the sstable 878 | /// directory has been fsyced. 
pub fn flush(&mut self) -> Result<()> {
    // while a simulated tear is pending, nothing is pushed to disk
    #[cfg(test)]
    {
        if self.log.tearing {
            return Ok(());
        }
    }

    self.log.flush()?;
    self.log.get_mut().sync_all()?;

    // below the threshold, making the log durable is all there is to do
    let needs_compaction = self.dirty_bytes > self.config.max_log_length;
    if !needs_compaction {
        return Ok(());
    }

    log::debug!("compacting log to sstable");
    let sstable_id = self.next_sstable_id;
    let pending = std::mem::take(&mut self.memtable);
    match write_sstable(&self.path, sstable_id, &pending, false, &self.config) {
        Ok(_) => {}
        Err(e) => {
            // put memtable back together before returning
            self.memtable = pending;
            log::error!("failed to flush lsm log to sstable: {:?}", e);
            return Err(e.into());
        }
    }

    let sst_entry_sz = (4 + K + V) as u64;
    let sst_sz = 8 + (pending.len() as u64 * sst_entry_sz);
    let db_sz = self.db.len() as u64 * (K + V) as u64;

    let notice = WorkerMessage::NewSST {
        id: sstable_id,
        sst_sz,
        db_sz,
    };
    if let Err(e) = self.worker_outbox.send(notice) {
        log::error!("failed to send message to worker: {:?}", e);
        log::logger().flush();
        panic!("failed to send message to worker: {:?}", e);
    }

    #[cfg(test)]
    assert!(self.worker.tick());

    self.next_sstable_id += 1;

    // the sstable now holds everything that was logged, so the
    // log can be emptied and the directory entry made durable
    let log_file: &mut fs::File = self.log.get_mut();
    log_file.seek(io::SeekFrom::Start(0))?;
    log_file.set_len(0)?;
    log_file.sync_all()?;
    fs::File::open(self.path.join(SSTABLE_DIR))?.sync_all()?;

    self.dirty_bytes = 0;

    Ok(())
}

/// Folds the worker's byte counters into the local statistics
/// (resetting the worker's counters to zero), recomputes the
/// resident and on-disk sizes, and returns a copy of the result.
///
/// The on-disk size is the length of the log file plus the lengths
/// reported by `list_sstables`. Both amplification ratios clamp
/// their divisor to at least 1.
pub fn stats(&mut self) -> Result<Stats> {
    let worker_written = self.worker_stats.written_bytes.swap(0, Ordering::Relaxed);
    self.stats.written_bytes += worker_written;
    let worker_read = self.worker_stats.read_bytes.swap(0, Ordering::Relaxed);
    self.stats.read_bytes += worker_read;
    self.stats.resident_bytes = self.db.len() as u64 * (K + V) as u64;

    let log_len: u64 = std::fs::metadata(self.path.join("log"))?.len();
    let sstables_len = list_sstables(&self.path, false)?
        .into_iter()
        .map(|(_, len)| len)
        .sum::<u64>();

    self.stats.on_disk_bytes = log_len + sstables_len;

    let on_disk = self.stats.on_disk_bytes;
    self.stats.write_amp = self.stats.written_bytes as f64 / on_disk.max(1) as f64;
    self.stats.space_amp = on_disk as f64 / self.stats.resident_bytes.max(1) as f64;
    Ok(self.stats)
}
}

#[cfg(test)]
mod tearable;

#[cfg(test)]
mod fuzz;

--------------------------------------------------------------------------------
/src/tearable.rs:
--------------------------------------------------------------------------------
use std::io::{self, Write};

/// A writer that can facilitate corruption and torn writes
/// for testing purposes.
///
/// Everything written is held in an in-memory buffer until
/// `flush` is called while no tear is pending.
pub struct Tearable<W> {
    inner: W,
    buffer: Vec<u8>,
    pub tearing: bool,
}

impl<W: Write> Tearable<W> {
    /// Wraps `inner` with an empty pending buffer and no tear in progress.
    pub fn new(inner: W) -> Self {
        Self {
            inner,
            buffer: Vec::new(),
            tearing: false,
        }
    }

    /// Starts holding back flushes. Panics if a tear is already pending.
    pub fn begin_tear(&mut self) {
        assert!(!self.tearing);
        self.tearing = true;
    }

    /// Damages the pending buffer at `offset` (taken modulo the buffer
    /// length), either by inverting that byte or by truncating the
    /// buffer there, then ends the tear and flushes what is left.
    ///
    /// Returns early, with the tear still pending, when nothing is
    /// buffered. Panics if no tear is pending.
    pub fn apply_tear(&mut self, offset: usize, corrupt: bool) {
        assert!(self.tearing);

        if self.buffer.is_empty() {
            return;
        }

        let at = offset % self.buffer.len();

        if corrupt {
            let before = self.buffer[at];
            let after = before ^ 0xFF;
            log::debug!("corrupting {} to {} at idx {}", before, after, at);

            self.buffer[at] = after;
        } else {
            log::debug!("truncating pending write buffer to length {}", at);
            self.buffer.truncate(at);
        }

        self.tearing = false;
        self.flush().unwrap();
    }

    /// Gives mutable access to the wrapped writer.
    pub fn get_mut(&mut self) -> &mut W {
        &mut self.inner
    }
}

impl<W: Write> Write for Tearable<W> {
    /// Buffers `buf` in memory; nothing reaches the inner writer here.
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        self.buffer.extend_from_slice(buf);
        Ok(buf.len())
    }

    /// Writes the buffered bytes through to the inner writer and
    /// clears the buffer, unless a tear is pending.
    fn flush(&mut self) -> io::Result<()> {
        if self.tearing {
            return Ok(());
        }

        self.inner.write_all(&self.buffer)?;
        self.inner.flush()?;
        log::debug!("flushed {} buffered log bytes to disk", self.buffer.len());
        self.buffer.clear();
        Ok(())
    }
}

--------------------------------------------------------------------------------
/trophy_case/00.json:
--------------------------------------------------------------------------------
{"ops":[{"TornBatch":[[[145, null], [73, 67], [45, null], [51, 146], [231, null], [87, 216], [231, 82], [32, 85], [20, null], [87, 145], [232, 182], [73, null], [74, null], [55, null], [216, 203], [70, 18], [174, 232], [213, null], [89, 56], [141, 214], [10, 75], [42, null], [44, null], [177, 174], [91, null], [174, null], [151, null], [85, 19], [144, null], [181, null], [234, null], [51, 51], [53, 94], [56, null], [212, 32], [194, null], [233, 138], [186, 8], [251, null], [238, 23], [62, 133], [67, 126], [94, 153], [158, null], [100, 104], [198, 206], [124, 27], [142, 160], [209, null], [0, 107], [1, 81], [149, null], [120, 42], [99, null]], 16779932764131070072 ]}], "config":{"max_space_amp":48, "max_log_length":15648256160790557864, "merge_ratio":13, "merge_window":252, "log_bufwriter_size":2633668034, "zstd_sstable_compression_level":251}}

--------------------------------------------------------------------------------
/trophy_case/01.json:
--------------------------------------------------------------------------------
{"ops":[],"config":{"max_space_amp":23,"max_log_length":2967489108446635161,"merge_ratio":190,"merge_window":0,"log_bufwriter_size":1301844105,"zstd_sstable_compression_level":168}}
--------------------------------------------------------------------------------