├── .github ├── dependabot.yml └── workflows │ └── ci.yml ├── .gitignore ├── .gitmodules ├── COMPARE.md ├── Cargo.toml ├── LICENSE-AGPL-3.0 ├── README.md ├── benches └── compare.rs ├── bors.toml ├── ci └── script.sh ├── examples └── generate-test-data.rs ├── rustfmt.toml ├── src ├── bup.rs ├── buzhash.rs ├── buzhash_table.rs ├── fastcdc.rs ├── gear.rs ├── gear_table │ ├── gear32.rs │ ├── gear64.rs │ └── mod.rs ├── gzip.rs ├── lib.rs ├── mii.rs ├── pigz.rs ├── ram.rs ├── range.rs ├── zpaq.rs └── zstd.rs └── tests ├── cuts.rs ├── cuts_qc.proptest-regressions ├── cuts_qc.rs ├── fastcdc.rs ├── oracle_zpaq.rs ├── qc_bh.rs ├── qc_bup.rs └── rsyncable.rs /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: cargo 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | time: "10:00" 8 | open-pull-requests-limit: 10 9 | rebase-strategy: disabled 10 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches-ignore: 4 | - '**.tmp' 5 | 6 | name: ci 7 | 8 | jobs: 9 | test: 10 | runs-on: ubuntu-20.04 11 | strategy: 12 | matrix: 13 | rust: 14 | - stable 15 | - beta 16 | - nightly 17 | - 1.47.0 18 | 19 | steps: 20 | - uses: actions/checkout@v2 21 | 22 | - uses: actions-rs/toolchain@v1 23 | with: 24 | profile: minimal 25 | toolchain: ${{ matrix.rust }} 26 | override: true 27 | 28 | - name: Cache Rust dependencies 29 | uses: Swatinem/rust-cache@v1 30 | 31 | - name: Run all tests 32 | uses: actions-rs/cargo@v1 33 | with: 34 | command: test 35 | args: --all 36 | check: 37 | runs-on: ubuntu-20.04 38 | 39 | steps: 40 | - uses: actions/checkout@v2 41 | 42 | - uses: actions-rs/toolchain@v1 43 | with: 44 | profile: minimal 45 | toolchain: beta 46 | override: true 47 | components: rustfmt, clippy 48 | 49 | - name: Cache Rust 
dependencies 50 | uses: Swatinem/rust-cache@v1 51 | 52 | - uses: actions-rs/cargo@v1 53 | with: 54 | command: fmt 55 | args: --all -- --check 56 | 57 | - uses: actions-rs/cargo@v1 58 | with: 59 | command: clippy 60 | args: -- -D warnings 61 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | Cargo.lock 3 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codyps/hash-roll/a964a874bed4398a02d6c8136cd7d0bdd8064f36/.gitmodules -------------------------------------------------------------------------------- /COMPARE.md: -------------------------------------------------------------------------------- 1 | 2 | # Algorithms 3 | 4 | - Gear 5 | - FastCDC 6 | - Zpaq 7 | - AE 8 | - RollSum 9 | - BuzHash 10 | - LMC (chunking) 11 | - RAM Chunking (Rapid Asymmetric Maximum) 12 | - doi:10.1016/j.future.2017.02.013 13 | - MII (minimal incrimental interval) 14 | - doi:10.1109/access.2019.2926195 15 | - [TTTD](https://scholarworks.sjsu.edu/cgi/viewcontent.cgi?referer=&httpsredir=1&article=1041&context=etd_projects) 16 | - [FBC](doi:10.1109/mascots.2010.37) 17 | 18 | # Algorithm Points of Comparison 19 | 20 | - Ability to constrain block size 21 | - distribution 22 | - tuneability of distribution 23 | - Speed 24 | - on different distributions 25 | - Common chunk discovery 26 | - on different distributions 27 | - Common chuck discovery after a byte shift 28 | - on different distributions 29 | - Common chuck discovery after edit 30 | - on different data distributions 31 | - under different edit kinds 32 | 33 | # Impl Features 34 | 35 | - Incrimental input: rather than require a single `&[u8]` up front, allow 36 | providing a number of `&[u8]`s over the life of the splitter/hasher. 
37 | 38 | - Slice input vs byte-at-a-time: By allowing algorithms to take in larger 39 | slices of data at a time, it enables them to potentially impliment 40 | optimizations to speed up computation. 41 | 42 | # Implimentations 43 | 44 | - [cdc](https://lib.rs/crates/cdc) 45 | - latest release: 2017-09-09 46 | - inactive development (as of 2020-06-21) 47 | - algorithm(s): "Rabin64" (polynomial based, 64-bit) 48 | - incrimental input: no 49 | - no documentation indicates incrimental input is possible 50 | - while one could use a special impl of `Iterator` that can be 51 | extended, this would only work if the `SeperatorIter` or `ChunkIter` had 52 | not emitted a final incomplete chunk/seperator. 53 | - includes `RollingHash64` trait 54 | - structure includes mutable context, no non-mutable representation 55 | - input format(s): `Iterator`, `u8` 56 | - may limit performance capability 57 | - input is fully buffered by cdc structures 58 | - provides both rolling hash and content splitting features 59 | - has _explicit_ representation for "prefilling" of the rolling hash. 60 | - includes multiple iterator adapters 61 | - splits the concept of a "seperator" (index + hash) vs a "chunk" (index + 62 | hash + size). 63 | - iterator adaptors don't generalize over rolling hashes, they are 64 | hard-coded to the `Rabin64` impl 65 | - documentation is lacking (almost universally missing) 66 | - [fastcdc](https://lib.rs/crates/fastcdc) 67 | - latest release: 2020-03-19, v1.0.3 68 | - active development (as of 2020-06-21) 69 | - algorithm(s): FastCDC 70 | - incrimental input: no 71 | - api: 72 | - input: one `&[u8]` 73 | - output: `Iterator where Chunk: (offset: usize, size: 74 | usize)`. Returns the remaining chunk as the last item even if not 75 | considered a complete chunk. 
76 | - only struct mixes mutable and immutable data, no configuration representation 77 | - "chunks" are an offset and a size 78 | - iow: no rolling hash support 79 | - single struct, no traits 80 | - provides a fixed table for fastcdc (generated via a reproducable mechanism initially) 81 | - [quickcdc](https://lib.rs/crates/quickcdc) 82 | - latest release: 2018-12-17 v1.0.0 (no other releases) 83 | - inactive development (as of 2020-06-21) 84 | - algorithm(s): AE (with modifications/extensions) 85 | - incrimental input: no 86 | - api: 87 | - input: one `&[u8]` 88 | - output: `Iterator` 89 | - no struct representation of configuration (only mixes mutable and immutable) 90 | - api: iterator over slices 91 | - single struct, no traits 92 | - includes improper use of unsafe in a non-public function (passes pointers 93 | into a function that dereferences them but the function is not marked 94 | unsafe). 95 | - [gearhash](https://lib.rs/crates/gearhash) 96 | - latest release: 2020-04-12 v0.1.3 97 | - active development (as of 2020-06-21) 98 | - algorithm(s): gear 99 | - incrimental input: yes 100 | - provides simd & scalar impls 101 | - includes a static table for gearhash 102 | - api: call `next_match()` repeatedly with new slices. Returns a 103 | `Option` indicating where a split point is (if any) in the slice 104 | passed to `next_match()`. 105 | - `Hasher` struct provides both content splitting and rolling hash features. 106 | - in-place splitting 107 | - lacks helpers present in `cdchunking` 108 | - single struct, no traits 109 | - no struct representation of configuration (only mixes mutable and immutable data) 110 | - [cdchunking](https://lib.rs/crates/cdchunking) 111 | - latest release: 2019-11-02 v1.0.0 112 | - inactive development (as of 2020-06-21) 113 | - algorithm(s): Zpaq 114 | - provides a chunker-impl trait 115 | - api: call `next_boundary()` repeatedly with new slices. Returns a 116 | `Option` indicating what a split point is (if any) in the slice. 
117 | - must explicitly call a `reset()` after a match to reset internal state 118 | for subsequent matches. 119 | - provides a `Chunker` which takes a `ChunkerImpl` and provides a number of ease-of-use apis: 120 | - from a `Read` into a `Iterator>>` 121 | - from a `Read` into a `Result>>` 122 | - from a `Read` into a series of one of `Data(&[u8])` or `End`, where the 123 | `Data(&[u8])` are references to an internal buffer and `End` indicate 124 | the end of a chunk. 125 | - from a `Read` to an iterator of (start, len, end) (ie: no data returned) 126 | - from a `&[u8]` to an `Iterator` 127 | - [rollsum](https://lib.rs/crates/rollsum) aka [rsroll](https://github.com/aidanhs/rsroll) 128 | - latest release: (commit 2019-12-22, publish 2020-09-27) v0.3.0 129 | - uncertain inactive development (as of 2020-10-08) 130 | - algorithm(s): 131 | - rollsum (based on bupsplit, based on rsync chunking) 132 | - gear 133 | - incrimental input: yes 134 | - includes a static table for gearhash 135 | - low level trait has byte-by-byte and slice based interfaces 136 | - exposes conditionality of chunk edge (ie: like a rolling-sum) in trait, 137 | but provides a helper on the specific struct that uses it's defaults. 138 | - requires explicit state resets after finding a chunk edge to find the next 139 | chunk edge (doesn't reset internal state) 140 | - api: call `find_chunk_edge()` with different slices until Some((usize, Sum)) is 141 | returned. the `usize` here is the offset after the end of the chunk (ie: 142 | start of the next chunk). 
143 | - provides access to the underlying Sum on each edge 144 | - [rededup-cdc](https://lib.rs/crates/rdedup-cdc) 145 | - `rollsum` fork 146 | - [bitar](https://lib.rs/crates/bitar) 147 | - latest release: 2020-06-09 v0.7.0 148 | - active development (as of 2020-06-21) 149 | - algorithms(s): BuzHash, RollSum 150 | - uses enum to abstract over algorithms (`Config` and `Chunker`) 151 | - includes seperate immutable "configuration object" concept (`Config`) 152 | - supports/requires use of `tokio::AsyncRead` as input 153 | - api: provide a `AsyncRead` when constructing the `Chunker`. Use the 154 | `futures::Stream>` it returns 155 | - low-level trait for each hash is byte-at-a-time 156 | - many other items included in the library (designed to support the cmdline tool `bita`) 157 | - [zvault](https://github.com/dswd/zvault) 158 | - algorithm(s): AE, fastcdc, rabin, fixed (non content defined) 159 | - low level trait requires a Read & a Write instance 160 | - provides run-time generic over creation & extraction of some details (`Chunker`) 161 | - Instantiation for each provides a seed and average size 162 | - inactive development (last change 2018-03-08 (as of 2020-05-10)) 163 | - includes many non-chunking items 164 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hash-roll" 3 | version = "0.3.0" 4 | authors = ["Cody P Schafer "] 5 | description = "Rolling hashes & Content Defined Chunking (cdc)" 6 | keywords = [ "hash", "rolling", "incremental", "split" , "cdc"] 7 | license = "AGPL-3.0-or-later" 8 | repository = "https://github.com/jmesmon/hash-roll.git" 9 | documentation = "https://docs.rs/hash-roll" 10 | include = ["Cargo.toml", "**/*.rs", "README.md", "COMPARE.md"] 11 | edition = "2018" 12 | 13 | [features] 14 | default = [ 15 | "bup", 16 | "buzhash", 17 | "fastcdc", 18 | "gear", 19 | "gzip", 20 | "mii", 21 | 
"pigz", 22 | "ram", 23 | "zpaq", 24 | "zstd" 25 | ] 26 | 27 | bup = [] 28 | buzhash = [] 29 | fastcdc = [] 30 | gear = [] 31 | gzip = [] 32 | mii = [] 33 | pigz = [] 34 | ram = [] 35 | zpaq = [] 36 | zstd = [] 37 | 38 | [dependencies] 39 | fmt-extra = "0.2" 40 | #circbuf = "0.1.4" 41 | 42 | [dev-dependencies] 43 | rand = "0.7.3" 44 | histogram = "0.6" 45 | quickcheck = "0.9" 46 | rollsum = "0.3" 47 | criterion = "0.3" 48 | rand_pcg = "0.2.1" 49 | proptest = "0.10.0" 50 | 51 | [[bench]] 52 | name = "compare" 53 | harness = false 54 | -------------------------------------------------------------------------------- /LICENSE-AGPL-3.0: -------------------------------------------------------------------------------- 1 | GNU AFFERO GENERAL PUBLIC LICENSE 2 | Version 3, 19 November 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU Affero General Public License is a free, copyleft license for 11 | software and other kinds of works, specifically designed to ensure 12 | cooperation with the community in the case of network server software. 13 | 14 | The licenses for most software and other practical works are designed 15 | to take away your freedom to share and change the works. By contrast, 16 | our General Public Licenses are intended to guarantee your freedom to 17 | share and change all versions of a program--to make sure it remains free 18 | software for all its users. 19 | 20 | When we speak of free software, we are referring to freedom, not 21 | price. Our General Public Licenses are designed to make sure that you 22 | have the freedom to distribute copies of free software (and charge for 23 | them if you wish), that you receive source code or can get it if you 24 | want it, that you can change the software or use pieces of it in new 25 | free programs, and that you know you can do these things. 
26 | 27 | Developers that use our General Public Licenses protect your rights 28 | with two steps: (1) assert copyright on the software, and (2) offer 29 | you this License which gives you legal permission to copy, distribute 30 | and/or modify the software. 31 | 32 | A secondary benefit of defending all users' freedom is that 33 | improvements made in alternate versions of the program, if they 34 | receive widespread use, become available for other developers to 35 | incorporate. Many developers of free software are heartened and 36 | encouraged by the resulting cooperation. However, in the case of 37 | software used on network servers, this result may fail to come about. 38 | The GNU General Public License permits making a modified version and 39 | letting the public access it on a server without ever releasing its 40 | source code to the public. 41 | 42 | The GNU Affero General Public License is designed specifically to 43 | ensure that, in such cases, the modified source code becomes available 44 | to the community. It requires the operator of a network server to 45 | provide the source code of the modified version running there to the 46 | users of that server. Therefore, public use of a modified version, on 47 | a publicly accessible server, gives the public access to the source 48 | code of the modified version. 49 | 50 | An older license, called the Affero General Public License and 51 | published by Affero, was designed to accomplish similar goals. This is 52 | a different license, not a version of the Affero GPL, but Affero has 53 | released a new version of the Affero GPL which permits relicensing under 54 | this license. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | TERMS AND CONDITIONS 60 | 61 | 0. Definitions. 62 | 63 | "This License" refers to version 3 of the GNU Affero General Public License. 
64 | 65 | "Copyright" also means copyright-like laws that apply to other kinds of 66 | works, such as semiconductor masks. 67 | 68 | "The Program" refers to any copyrightable work licensed under this 69 | License. Each licensee is addressed as "you". "Licensees" and 70 | "recipients" may be individuals or organizations. 71 | 72 | To "modify" a work means to copy from or adapt all or part of the work 73 | in a fashion requiring copyright permission, other than the making of an 74 | exact copy. The resulting work is called a "modified version" of the 75 | earlier work or a work "based on" the earlier work. 76 | 77 | A "covered work" means either the unmodified Program or a work based 78 | on the Program. 79 | 80 | To "propagate" a work means to do anything with it that, without 81 | permission, would make you directly or secondarily liable for 82 | infringement under applicable copyright law, except executing it on a 83 | computer or modifying a private copy. Propagation includes copying, 84 | distribution (with or without modification), making available to the 85 | public, and in some countries other activities as well. 86 | 87 | To "convey" a work means any kind of propagation that enables other 88 | parties to make or receive copies. Mere interaction with a user through 89 | a computer network, with no transfer of a copy, is not conveying. 90 | 91 | An interactive user interface displays "Appropriate Legal Notices" 92 | to the extent that it includes a convenient and prominently visible 93 | feature that (1) displays an appropriate copyright notice, and (2) 94 | tells the user that there is no warranty for the work (except to the 95 | extent that warranties are provided), that licensees may convey the 96 | work under this License, and how to view a copy of this License. If 97 | the interface presents a list of user commands or options, such as a 98 | menu, a prominent item in the list meets this criterion. 99 | 100 | 1. Source Code. 
101 | 102 | The "source code" for a work means the preferred form of the work 103 | for making modifications to it. "Object code" means any non-source 104 | form of a work. 105 | 106 | A "Standard Interface" means an interface that either is an official 107 | standard defined by a recognized standards body, or, in the case of 108 | interfaces specified for a particular programming language, one that 109 | is widely used among developers working in that language. 110 | 111 | The "System Libraries" of an executable work include anything, other 112 | than the work as a whole, that (a) is included in the normal form of 113 | packaging a Major Component, but which is not part of that Major 114 | Component, and (b) serves only to enable use of the work with that 115 | Major Component, or to implement a Standard Interface for which an 116 | implementation is available to the public in source code form. A 117 | "Major Component", in this context, means a major essential component 118 | (kernel, window system, and so on) of the specific operating system 119 | (if any) on which the executable work runs, or a compiler used to 120 | produce the work, or an object code interpreter used to run it. 121 | 122 | The "Corresponding Source" for a work in object code form means all 123 | the source code needed to generate, install, and (for an executable 124 | work) run the object code and to modify the work, including scripts to 125 | control those activities. However, it does not include the work's 126 | System Libraries, or general-purpose tools or generally available free 127 | programs which are used unmodified in performing those activities but 128 | which are not part of the work. 
For example, Corresponding Source 129 | includes interface definition files associated with source files for 130 | the work, and the source code for shared libraries and dynamically 131 | linked subprograms that the work is specifically designed to require, 132 | such as by intimate data communication or control flow between those 133 | subprograms and other parts of the work. 134 | 135 | The Corresponding Source need not include anything that users 136 | can regenerate automatically from other parts of the Corresponding 137 | Source. 138 | 139 | The Corresponding Source for a work in source code form is that 140 | same work. 141 | 142 | 2. Basic Permissions. 143 | 144 | All rights granted under this License are granted for the term of 145 | copyright on the Program, and are irrevocable provided the stated 146 | conditions are met. This License explicitly affirms your unlimited 147 | permission to run the unmodified Program. The output from running a 148 | covered work is covered by this License only if the output, given its 149 | content, constitutes a covered work. This License acknowledges your 150 | rights of fair use or other equivalent, as provided by copyright law. 151 | 152 | You may make, run and propagate covered works that you do not 153 | convey, without conditions so long as your license otherwise remains 154 | in force. You may convey covered works to others for the sole purpose 155 | of having them make modifications exclusively for you, or provide you 156 | with facilities for running those works, provided that you comply with 157 | the terms of this License in conveying all material for which you do 158 | not control copyright. Those thus making or running the covered works 159 | for you must do so exclusively on your behalf, under your direction 160 | and control, on terms that prohibit them from making any copies of 161 | your copyrighted material outside their relationship with you. 
162 | 163 | Conveying under any other circumstances is permitted solely under 164 | the conditions stated below. Sublicensing is not allowed; section 10 165 | makes it unnecessary. 166 | 167 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 168 | 169 | No covered work shall be deemed part of an effective technological 170 | measure under any applicable law fulfilling obligations under article 171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 172 | similar laws prohibiting or restricting circumvention of such 173 | measures. 174 | 175 | When you convey a covered work, you waive any legal power to forbid 176 | circumvention of technological measures to the extent such circumvention 177 | is effected by exercising rights under this License with respect to 178 | the covered work, and you disclaim any intention to limit operation or 179 | modification of the work as a means of enforcing, against the work's 180 | users, your or third parties' legal rights to forbid circumvention of 181 | technological measures. 182 | 183 | 4. Conveying Verbatim Copies. 184 | 185 | You may convey verbatim copies of the Program's source code as you 186 | receive it, in any medium, provided that you conspicuously and 187 | appropriately publish on each copy an appropriate copyright notice; 188 | keep intact all notices stating that this License and any 189 | non-permissive terms added in accord with section 7 apply to the code; 190 | keep intact all notices of the absence of any warranty; and give all 191 | recipients a copy of this License along with the Program. 192 | 193 | You may charge any price or no price for each copy that you convey, 194 | and you may offer support or warranty protection for a fee. 195 | 196 | 5. Conveying Modified Source Versions. 
197 | 198 | You may convey a work based on the Program, or the modifications to 199 | produce it from the Program, in the form of source code under the 200 | terms of section 4, provided that you also meet all of these conditions: 201 | 202 | a) The work must carry prominent notices stating that you modified 203 | it, and giving a relevant date. 204 | 205 | b) The work must carry prominent notices stating that it is 206 | released under this License and any conditions added under section 207 | 7. This requirement modifies the requirement in section 4 to 208 | "keep intact all notices". 209 | 210 | c) You must license the entire work, as a whole, under this 211 | License to anyone who comes into possession of a copy. This 212 | License will therefore apply, along with any applicable section 7 213 | additional terms, to the whole of the work, and all its parts, 214 | regardless of how they are packaged. This License gives no 215 | permission to license the work in any other way, but it does not 216 | invalidate such permission if you have separately received it. 217 | 218 | d) If the work has interactive user interfaces, each must display 219 | Appropriate Legal Notices; however, if the Program has interactive 220 | interfaces that do not display Appropriate Legal Notices, your 221 | work need not make them do so. 222 | 223 | A compilation of a covered work with other separate and independent 224 | works, which are not by their nature extensions of the covered work, 225 | and which are not combined with it such as to form a larger program, 226 | in or on a volume of a storage or distribution medium, is called an 227 | "aggregate" if the compilation and its resulting copyright are not 228 | used to limit the access or legal rights of the compilation's users 229 | beyond what the individual works permit. Inclusion of a covered work 230 | in an aggregate does not cause this License to apply to the other 231 | parts of the aggregate. 232 | 233 | 6. 
Conveying Non-Source Forms. 234 | 235 | You may convey a covered work in object code form under the terms 236 | of sections 4 and 5, provided that you also convey the 237 | machine-readable Corresponding Source under the terms of this License, 238 | in one of these ways: 239 | 240 | a) Convey the object code in, or embodied in, a physical product 241 | (including a physical distribution medium), accompanied by the 242 | Corresponding Source fixed on a durable physical medium 243 | customarily used for software interchange. 244 | 245 | b) Convey the object code in, or embodied in, a physical product 246 | (including a physical distribution medium), accompanied by a 247 | written offer, valid for at least three years and valid for as 248 | long as you offer spare parts or customer support for that product 249 | model, to give anyone who possesses the object code either (1) a 250 | copy of the Corresponding Source for all the software in the 251 | product that is covered by this License, on a durable physical 252 | medium customarily used for software interchange, for a price no 253 | more than your reasonable cost of physically performing this 254 | conveying of source, or (2) access to copy the 255 | Corresponding Source from a network server at no charge. 256 | 257 | c) Convey individual copies of the object code with a copy of the 258 | written offer to provide the Corresponding Source. This 259 | alternative is allowed only occasionally and noncommercially, and 260 | only if you received the object code with such an offer, in accord 261 | with subsection 6b. 262 | 263 | d) Convey the object code by offering access from a designated 264 | place (gratis or for a charge), and offer equivalent access to the 265 | Corresponding Source in the same way through the same place at no 266 | further charge. You need not require recipients to copy the 267 | Corresponding Source along with the object code. 
If the place to 268 | copy the object code is a network server, the Corresponding Source 269 | may be on a different server (operated by you or a third party) 270 | that supports equivalent copying facilities, provided you maintain 271 | clear directions next to the object code saying where to find the 272 | Corresponding Source. Regardless of what server hosts the 273 | Corresponding Source, you remain obligated to ensure that it is 274 | available for as long as needed to satisfy these requirements. 275 | 276 | e) Convey the object code using peer-to-peer transmission, provided 277 | you inform other peers where the object code and Corresponding 278 | Source of the work are being offered to the general public at no 279 | charge under subsection 6d. 280 | 281 | A separable portion of the object code, whose source code is excluded 282 | from the Corresponding Source as a System Library, need not be 283 | included in conveying the object code work. 284 | 285 | A "User Product" is either (1) a "consumer product", which means any 286 | tangible personal property which is normally used for personal, family, 287 | or household purposes, or (2) anything designed or sold for incorporation 288 | into a dwelling. In determining whether a product is a consumer product, 289 | doubtful cases shall be resolved in favor of coverage. For a particular 290 | product received by a particular user, "normally used" refers to a 291 | typical or common use of that class of product, regardless of the status 292 | of the particular user or of the way in which the particular user 293 | actually uses, or expects or is expected to use, the product. A product 294 | is a consumer product regardless of whether the product has substantial 295 | commercial, industrial or non-consumer uses, unless such uses represent 296 | the only significant mode of use of the product. 
297 | 298 | "Installation Information" for a User Product means any methods, 299 | procedures, authorization keys, or other information required to install 300 | and execute modified versions of a covered work in that User Product from 301 | a modified version of its Corresponding Source. The information must 302 | suffice to ensure that the continued functioning of the modified object 303 | code is in no case prevented or interfered with solely because 304 | modification has been made. 305 | 306 | If you convey an object code work under this section in, or with, or 307 | specifically for use in, a User Product, and the conveying occurs as 308 | part of a transaction in which the right of possession and use of the 309 | User Product is transferred to the recipient in perpetuity or for a 310 | fixed term (regardless of how the transaction is characterized), the 311 | Corresponding Source conveyed under this section must be accompanied 312 | by the Installation Information. But this requirement does not apply 313 | if neither you nor any third party retains the ability to install 314 | modified object code on the User Product (for example, the work has 315 | been installed in ROM). 316 | 317 | The requirement to provide Installation Information does not include a 318 | requirement to continue to provide support service, warranty, or updates 319 | for a work that has been modified or installed by the recipient, or for 320 | the User Product in which it has been modified or installed. Access to a 321 | network may be denied when the modification itself materially and 322 | adversely affects the operation of the network or violates the rules and 323 | protocols for communication across the network. 
324 | 325 | Corresponding Source conveyed, and Installation Information provided, 326 | in accord with this section must be in a format that is publicly 327 | documented (and with an implementation available to the public in 328 | source code form), and must require no special password or key for 329 | unpacking, reading or copying. 330 | 331 | 7. Additional Terms. 332 | 333 | "Additional permissions" are terms that supplement the terms of this 334 | License by making exceptions from one or more of its conditions. 335 | Additional permissions that are applicable to the entire Program shall 336 | be treated as though they were included in this License, to the extent 337 | that they are valid under applicable law. If additional permissions 338 | apply only to part of the Program, that part may be used separately 339 | under those permissions, but the entire Program remains governed by 340 | this License without regard to the additional permissions. 341 | 342 | When you convey a copy of a covered work, you may at your option 343 | remove any additional permissions from that copy, or from any part of 344 | it. (Additional permissions may be written to require their own 345 | removal in certain cases when you modify the work.) You may place 346 | additional permissions on material, added by you to a covered work, 347 | for which you have or can give appropriate copyright permission. 
348 | 349 | Notwithstanding any other provision of this License, for material you 350 | add to a covered work, you may (if authorized by the copyright holders of 351 | that material) supplement the terms of this License with terms: 352 | 353 | a) Disclaiming warranty or limiting liability differently from the 354 | terms of sections 15 and 16 of this License; or 355 | 356 | b) Requiring preservation of specified reasonable legal notices or 357 | author attributions in that material or in the Appropriate Legal 358 | Notices displayed by works containing it; or 359 | 360 | c) Prohibiting misrepresentation of the origin of that material, or 361 | requiring that modified versions of such material be marked in 362 | reasonable ways as different from the original version; or 363 | 364 | d) Limiting the use for publicity purposes of names of licensors or 365 | authors of the material; or 366 | 367 | e) Declining to grant rights under trademark law for use of some 368 | trade names, trademarks, or service marks; or 369 | 370 | f) Requiring indemnification of licensors and authors of that 371 | material by anyone who conveys the material (or modified versions of 372 | it) with contractual assumptions of liability to the recipient, for 373 | any liability that these contractual assumptions directly impose on 374 | those licensors and authors. 375 | 376 | All other non-permissive additional terms are considered "further 377 | restrictions" within the meaning of section 10. If the Program as you 378 | received it, or any part of it, contains a notice stating that it is 379 | governed by this License along with a term that is a further 380 | restriction, you may remove that term. 
If a license document contains 381 | a further restriction but permits relicensing or conveying under this 382 | License, you may add to a covered work material governed by the terms 383 | of that license document, provided that the further restriction does 384 | not survive such relicensing or conveying. 385 | 386 | If you add terms to a covered work in accord with this section, you 387 | must place, in the relevant source files, a statement of the 388 | additional terms that apply to those files, or a notice indicating 389 | where to find the applicable terms. 390 | 391 | Additional terms, permissive or non-permissive, may be stated in the 392 | form of a separately written license, or stated as exceptions; 393 | the above requirements apply either way. 394 | 395 | 8. Termination. 396 | 397 | You may not propagate or modify a covered work except as expressly 398 | provided under this License. Any attempt otherwise to propagate or 399 | modify it is void, and will automatically terminate your rights under 400 | this License (including any patent licenses granted under the third 401 | paragraph of section 11). 402 | 403 | However, if you cease all violation of this License, then your 404 | license from a particular copyright holder is reinstated (a) 405 | provisionally, unless and until the copyright holder explicitly and 406 | finally terminates your license, and (b) permanently, if the copyright 407 | holder fails to notify you of the violation by some reasonable means 408 | prior to 60 days after the cessation. 409 | 410 | Moreover, your license from a particular copyright holder is 411 | reinstated permanently if the copyright holder notifies you of the 412 | violation by some reasonable means, this is the first time you have 413 | received notice of violation of this License (for any work) from that 414 | copyright holder, and you cure the violation prior to 30 days after 415 | your receipt of the notice. 
416 | 417 | Termination of your rights under this section does not terminate the 418 | licenses of parties who have received copies or rights from you under 419 | this License. If your rights have been terminated and not permanently 420 | reinstated, you do not qualify to receive new licenses for the same 421 | material under section 10. 422 | 423 | 9. Acceptance Not Required for Having Copies. 424 | 425 | You are not required to accept this License in order to receive or 426 | run a copy of the Program. Ancillary propagation of a covered work 427 | occurring solely as a consequence of using peer-to-peer transmission 428 | to receive a copy likewise does not require acceptance. However, 429 | nothing other than this License grants you permission to propagate or 430 | modify any covered work. These actions infringe copyright if you do 431 | not accept this License. Therefore, by modifying or propagating a 432 | covered work, you indicate your acceptance of this License to do so. 433 | 434 | 10. Automatic Licensing of Downstream Recipients. 435 | 436 | Each time you convey a covered work, the recipient automatically 437 | receives a license from the original licensors, to run, modify and 438 | propagate that work, subject to this License. You are not responsible 439 | for enforcing compliance by third parties with this License. 440 | 441 | An "entity transaction" is a transaction transferring control of an 442 | organization, or substantially all assets of one, or subdividing an 443 | organization, or merging organizations. 
If propagation of a covered 444 | work results from an entity transaction, each party to that 445 | transaction who receives a copy of the work also receives whatever 446 | licenses to the work the party's predecessor in interest had or could 447 | give under the previous paragraph, plus a right to possession of the 448 | Corresponding Source of the work from the predecessor in interest, if 449 | the predecessor has it or can get it with reasonable efforts. 450 | 451 | You may not impose any further restrictions on the exercise of the 452 | rights granted or affirmed under this License. For example, you may 453 | not impose a license fee, royalty, or other charge for exercise of 454 | rights granted under this License, and you may not initiate litigation 455 | (including a cross-claim or counterclaim in a lawsuit) alleging that 456 | any patent claim is infringed by making, using, selling, offering for 457 | sale, or importing the Program or any portion of it. 458 | 459 | 11. Patents. 460 | 461 | A "contributor" is a copyright holder who authorizes use under this 462 | License of the Program or a work on which the Program is based. The 463 | work thus licensed is called the contributor's "contributor version". 464 | 465 | A contributor's "essential patent claims" are all patent claims 466 | owned or controlled by the contributor, whether already acquired or 467 | hereafter acquired, that would be infringed by some manner, permitted 468 | by this License, of making, using, or selling its contributor version, 469 | but do not include claims that would be infringed only as a 470 | consequence of further modification of the contributor version. For 471 | purposes of this definition, "control" includes the right to grant 472 | patent sublicenses in a manner consistent with the requirements of 473 | this License. 
474 | 475 | Each contributor grants you a non-exclusive, worldwide, royalty-free 476 | patent license under the contributor's essential patent claims, to 477 | make, use, sell, offer for sale, import and otherwise run, modify and 478 | propagate the contents of its contributor version. 479 | 480 | In the following three paragraphs, a "patent license" is any express 481 | agreement or commitment, however denominated, not to enforce a patent 482 | (such as an express permission to practice a patent or covenant not to 483 | sue for patent infringement). To "grant" such a patent license to a 484 | party means to make such an agreement or commitment not to enforce a 485 | patent against the party. 486 | 487 | If you convey a covered work, knowingly relying on a patent license, 488 | and the Corresponding Source of the work is not available for anyone 489 | to copy, free of charge and under the terms of this License, through a 490 | publicly available network server or other readily accessible means, 491 | then you must either (1) cause the Corresponding Source to be so 492 | available, or (2) arrange to deprive yourself of the benefit of the 493 | patent license for this particular work, or (3) arrange, in a manner 494 | consistent with the requirements of this License, to extend the patent 495 | license to downstream recipients. "Knowingly relying" means you have 496 | actual knowledge that, but for the patent license, your conveying the 497 | covered work in a country, or your recipient's use of the covered work 498 | in a country, would infringe one or more identifiable patents in that 499 | country that you have reason to believe are valid. 
500 | 501 | If, pursuant to or in connection with a single transaction or 502 | arrangement, you convey, or propagate by procuring conveyance of, a 503 | covered work, and grant a patent license to some of the parties 504 | receiving the covered work authorizing them to use, propagate, modify 505 | or convey a specific copy of the covered work, then the patent license 506 | you grant is automatically extended to all recipients of the covered 507 | work and works based on it. 508 | 509 | A patent license is "discriminatory" if it does not include within 510 | the scope of its coverage, prohibits the exercise of, or is 511 | conditioned on the non-exercise of one or more of the rights that are 512 | specifically granted under this License. You may not convey a covered 513 | work if you are a party to an arrangement with a third party that is 514 | in the business of distributing software, under which you make payment 515 | to the third party based on the extent of your activity of conveying 516 | the work, and under which the third party grants, to any of the 517 | parties who would receive the covered work from you, a discriminatory 518 | patent license (a) in connection with copies of the covered work 519 | conveyed by you (or copies made from those copies), or (b) primarily 520 | for and in connection with specific products or compilations that 521 | contain the covered work, unless you entered into that arrangement, 522 | or that patent license was granted, prior to 28 March 2007. 523 | 524 | Nothing in this License shall be construed as excluding or limiting 525 | any implied license or other defenses to infringement that may 526 | otherwise be available to you under applicable patent law. 527 | 528 | 12. No Surrender of Others' Freedom. 529 | 530 | If conditions are imposed on you (whether by court order, agreement or 531 | otherwise) that contradict the conditions of this License, they do not 532 | excuse you from the conditions of this License. 
If you cannot convey a 533 | covered work so as to satisfy simultaneously your obligations under this 534 | License and any other pertinent obligations, then as a consequence you may 535 | not convey it at all. For example, if you agree to terms that obligate you 536 | to collect a royalty for further conveying from those to whom you convey 537 | the Program, the only way you could satisfy both those terms and this 538 | License would be to refrain entirely from conveying the Program. 539 | 540 | 13. Remote Network Interaction; Use with the GNU General Public License. 541 | 542 | Notwithstanding any other provision of this License, if you modify the 543 | Program, your modified version must prominently offer all users 544 | interacting with it remotely through a computer network (if your version 545 | supports such interaction) an opportunity to receive the Corresponding 546 | Source of your version by providing access to the Corresponding Source 547 | from a network server at no charge, through some standard or customary 548 | means of facilitating copying of software. This Corresponding Source 549 | shall include the Corresponding Source for any work covered by version 3 550 | of the GNU General Public License that is incorporated pursuant to the 551 | following paragraph. 552 | 553 | Notwithstanding any other provision of this License, you have 554 | permission to link or combine any covered work with a work licensed 555 | under version 3 of the GNU General Public License into a single 556 | combined work, and to convey the resulting work. The terms of this 557 | License will continue to apply to the part which is the covered work, 558 | but the work with which it is combined will remain governed by version 559 | 3 of the GNU General Public License. 560 | 561 | 14. Revised Versions of this License. 562 | 563 | The Free Software Foundation may publish revised and/or new versions of 564 | the GNU Affero General Public License from time to time. 
Such new versions 565 | will be similar in spirit to the present version, but may differ in detail to 566 | address new problems or concerns. 567 | 568 | Each version is given a distinguishing version number. If the 569 | Program specifies that a certain numbered version of the GNU Affero General 570 | Public License "or any later version" applies to it, you have the 571 | option of following the terms and conditions either of that numbered 572 | version or of any later version published by the Free Software 573 | Foundation. If the Program does not specify a version number of the 574 | GNU Affero General Public License, you may choose any version ever published 575 | by the Free Software Foundation. 576 | 577 | If the Program specifies that a proxy can decide which future 578 | versions of the GNU Affero General Public License can be used, that proxy's 579 | public statement of acceptance of a version permanently authorizes you 580 | to choose that version for the Program. 581 | 582 | Later license versions may give you additional or different 583 | permissions. However, no additional obligations are imposed on any 584 | author or copyright holder as a result of your choosing to follow a 585 | later version. 586 | 587 | 15. Disclaimer of Warranty. 588 | 589 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 590 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 594 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 595 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 597 | 598 | 16. Limitation of Liability. 
599 | 600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 608 | SUCH DAMAGES. 609 | 610 | 17. Interpretation of Sections 15 and 16. 611 | 612 | If the disclaimer of warranty and limitation of liability provided 613 | above cannot be given local legal effect according to their terms, 614 | reviewing courts shall apply local law that most closely approximates 615 | an absolute waiver of all civil liability in connection with the 616 | Program, unless a warranty or assumption of liability accompanies a 617 | copy of the Program in return for a fee. 618 | 619 | END OF TERMS AND CONDITIONS 620 | 621 | How to Apply These Terms to Your New Programs 622 | 623 | If you develop a new program, and you want it to be of the greatest 624 | possible use to the public, the best way to achieve this is to make it 625 | free software which everyone can redistribute and change under these terms. 626 | 627 | To do so, attach the following notices to the program. It is safest 628 | to attach them to the start of each source file to most effectively 629 | state the exclusion of warranty; and each file should have at least 630 | the "copyright" line and a pointer to where the full notice is found. 
631 | 632 | 633 | Copyright (C) 634 | 635 | This program is free software: you can redistribute it and/or modify 636 | it under the terms of the GNU Affero General Public License as published by 637 | the Free Software Foundation, either version 3 of the License, or 638 | (at your option) any later version. 639 | 640 | This program is distributed in the hope that it will be useful, 641 | but WITHOUT ANY WARRANTY; without even the implied warranty of 642 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 643 | GNU Affero General Public License for more details. 644 | 645 | You should have received a copy of the GNU Affero General Public License 646 | along with this program. If not, see . 647 | 648 | Also add information on how to contact you by electronic and paper mail. 649 | 650 | If your software can interact with users remotely through a computer 651 | network, you should also make sure that it provides a way for users to 652 | get its source. For example, if your program is a web application, its 653 | interface could display a "Source" link that leads users to an archive 654 | of the code. There are many ways you could offer source, and different 655 | solutions will be better for different programs; see section 13 for the 656 | specific requirements. 657 | 658 | You should also get your employer (if you work as a programmer) or school, 659 | if any, to sign a "copyright disclaimer" for the program, if necessary. 660 | For more information on this, and how to apply and follow the GNU AGPL, see 661 | . 662 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # hash-roll 2 | 3 | Provides a generic API for abstracting over various implimentations of content 4 | defined chunking. Also provides implimentations of a number of content defined 5 | chunking algorithms. 
6 | 7 | ## Metrics 8 | 9 | - DER: duplicate elimination ratio 10 | 11 | ## CDC References 12 | 13 | - https://www.usenix.org/legacy/events/fast10/tech/full_papers/kruus.pdf 14 | 15 | ## [Comparison of the Chunking options avaliable in Rust](COMPARE.md) 16 | 17 | 18 | ## License 19 | 20 | This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 21 | 22 | This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. 23 | 24 | You should have received a copy of the GNU Affero General Public License along with this program. If not, see . 25 | -------------------------------------------------------------------------------- /benches/compare.rs: -------------------------------------------------------------------------------- 1 | /* 2 | */ 3 | use criterion::{black_box, criterion_group, criterion_main, Criterion}; 4 | use hash_roll::{ChunkIncr, Splitter}; 5 | use rand; 6 | 7 | /* 8 | pub fn split_hashmap(b: &mut Criterion, bytes: usize, init: F) 9 | where F: Fn(&[u8]) -> I, 10 | I: FnMut() -> Option 11 | { 12 | use rand::RngCore; 13 | use std::collections::HashMap; 14 | let mut rng = rand::thread_rng(); 15 | let mut d = vec![0u8; bytes]; 16 | let mut lenghts = HashMap::new(); 17 | b.iter(|| { 18 | rng.fill_bytes(&mut d); 19 | 20 | let mut i = init(&d[..]); 21 | loop { 22 | match i() { 23 | Some(l) => { 24 | let mut v = lenghts.entry(l).or_insert(0u64); 25 | *v = *v + 1; 26 | }, 27 | None => { 28 | break; 29 | } 30 | } 31 | } 32 | }) 33 | 34 | /* TODO: analize length data */ 35 | } 36 | */ 37 | 38 | /* 39 | pub fn split(b: &mut Criterion, bytes: usize, _name: &'static str, init: F) 40 | where for<'a> F: Fn(&'a 
[u8]) -> Box Option + 'a> 41 | { 42 | use rand::RngCore; 43 | let mut rng = rand::thread_rng(); 44 | let mut d = vec![0u8; bytes]; 45 | b.iter(|| { 46 | rng.fill_bytes(&mut d); 47 | let mut i = test::black_box(init(&d[..])); 48 | loop { 49 | match test::black_box(i()) { 50 | None => { 51 | break; 52 | }, 53 | _ => {}, 54 | } 55 | } 56 | }); 57 | } 58 | */ 59 | 60 | pub fn split_histogram(c: &mut Criterion, bytes: usize, name: &'static str, init: F) 61 | where 62 | for<'a> F: Fn(&'a [u8]) -> Box Option + 'a>, 63 | { 64 | use histogram::*; 65 | use rand::RngCore; 66 | use rand::SeedableRng; 67 | let mut rng = rand_pcg::Pcg64::from_rng(rand::thread_rng()).unwrap(); 68 | let mut d = vec![0u8; bytes]; 69 | rng.fill_bytes(&mut d); 70 | let mut lenghts = Histogram::new(); 71 | c.bench_function(name, |b| { 72 | b.iter(|| { 73 | let mut i = black_box(init(&d[..])); 74 | loop { 75 | match black_box(i()) { 76 | Some(l) => { 77 | lenghts.increment(l).unwrap(); 78 | } 79 | None => { 80 | break; 81 | } 82 | } 83 | } 84 | }) 85 | }); 86 | 87 | /* FIXME: for some reason cargo runs this outer code many times over instead of just running 88 | * the inner code many times over, causing this info to be printied far to much. 
89 | */ 90 | /* 91 | println!("{}({} bytes) p50: {} bytes, p90: {} bytes, p99: {} bytes, p999: {} bytes", 92 | name, bytes, 93 | lenghts.percentile(50.0).unwrap(), 94 | lenghts.percentile(90.0).unwrap(), 95 | lenghts.percentile(99.0).unwrap(), 96 | lenghts.percentile(99.9).unwrap(), 97 | ); 98 | */ 99 | } 100 | 101 | /* 4 MiB */ 102 | const BENCH_BYTES: usize = 1024 * 1024 / 2; 103 | 104 | //const BENCH_RANGE : Range = Range { first: Bound::Unbounded, last: Bound::Unbounded }; 105 | 106 | fn bench_rsyncable_vecs(c: &mut Criterion) { 107 | use rand::RngCore; 108 | let mut rng = rand::thread_rng(); 109 | let mut d = vec![0u8; BENCH_BYTES]; 110 | c.bench_function("rsyncable vecs", |b| { 111 | b.iter(|| { 112 | rng.fill_bytes(&mut d); 113 | let s = hash_roll::rsyncable::Rsyncable::default().into_vecs(d.iter().cloned()); 114 | for _ in s {} 115 | }) 116 | }); 117 | } 118 | 119 | fn bench_rsyncable_slices(c: &mut Criterion) { 120 | use rand::RngCore; 121 | let mut rng = rand::thread_rng(); 122 | let mut d = vec![0u8; BENCH_BYTES]; 123 | c.bench_function("rsyncable slices", |b| { 124 | b.iter(|| { 125 | rng.fill_bytes(&mut d); 126 | let s = hash_roll::rsyncable::Rsyncable::default().into_slices(&d[..]); 127 | for _ in s {} 128 | }) 129 | }); 130 | } 131 | 132 | fn bench_zpaq(b: &mut Criterion) { 133 | split_histogram(b, BENCH_BYTES, "bench_zpaq", |data| { 134 | let z = hash_roll::zpaq::Zpaq::default(); 135 | let mut c = &data[..]; 136 | Box::new(move || { 137 | let (a, b) = z.split(c); 138 | if b.is_empty() || a.is_empty() { 139 | None 140 | } else { 141 | c = b; 142 | Some(b.len() as u64) 143 | } 144 | }) 145 | }); 146 | } 147 | 148 | fn bench_zpaq_iter_slice(b: &mut Criterion) { 149 | split_histogram(b, BENCH_BYTES, "zpaq_iter_slice", |data| { 150 | let z = hash_roll::zpaq::Zpaq::default(); 151 | let mut zi = z.into_slices(data); 152 | Box::new(move || zi.next().map(|x| x.len() as u64)) 153 | }) 154 | } 155 | 156 | fn bench_zpaq_iter_vec(b: &mut Criterion) { 157 | 
split_histogram(b, BENCH_BYTES, "zpaq_iter_vec", |data| { 158 | let z = hash_roll::zpaq::Zpaq::default(); 159 | let mut zi = z.into_vecs(data.iter().cloned()); 160 | Box::new(move || zi.next().map(|x| x.len() as u64)) 161 | }) 162 | } 163 | 164 | fn bench_rollsum_bup(b: &mut Criterion) { 165 | split_histogram(b, BENCH_BYTES, "rollsum_bup", |data| { 166 | let mut z = rollsum::Bup::default(); 167 | let mut pos = 0; 168 | Box::new(move || { 169 | let l = z.find_chunk_edge(&data[pos..]).map(|x| (x as u64) + 1); 170 | match l { 171 | Some(x) => pos += x as usize, 172 | None => {} 173 | } 174 | l 175 | }) 176 | }) 177 | } 178 | 179 | fn bench_bup(b: &mut Criterion) { 180 | split_histogram(b, BENCH_BYTES, "bup", |data| { 181 | let mut z = hash_roll::bup::RollSumIncr::default(); 182 | let mut pos = 0; 183 | Box::new(move || { 184 | let l = z.push(&data[pos..]); 185 | match l { 186 | Some(x) => pos += x, 187 | None => {} 188 | } 189 | l.map(|x| x as u64) 190 | }) 191 | }) 192 | } 193 | 194 | criterion_group!( 195 | benches, 196 | bench_bup, 197 | bench_rollsum_bup, 198 | bench_rsyncable_vecs, 199 | bench_zpaq, 200 | bench_zpaq_iter_vec, 201 | bench_zpaq_iter_slice, 202 | bench_rsyncable_slices 203 | ); 204 | criterion_main!(benches); 205 | -------------------------------------------------------------------------------- /bors.toml: -------------------------------------------------------------------------------- 1 | status = [ 2 | "test (stable)", 3 | "test (beta)", 4 | "test (1.47.0)", 5 | "check", 6 | ] 7 | cut_body_after = "---" 8 | delete_merged_branches = true 9 | -------------------------------------------------------------------------------- /ci/script.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/sh 2 | 3 | run_cargo() { 4 | if [ -n "${FEATURES:-}" ]; then 5 | cargo "$@" --verbose --features="$FEATURES" 6 | else 7 | cargo "$@" --verbose 8 | fi 9 | } 10 | 11 | run_cargo test 12 | -------------------------------------------------------------------------------- /examples/generate-test-data.rs: -------------------------------------------------------------------------------- 1 | use rand::RngCore; 2 | use rand_pcg::Pcg64; 3 | use std::io::Write; 4 | 5 | fn main() { 6 | let mut args = std::env::args(); 7 | if args.len() != 3 { 8 | eprintln!("usage: generate-test-data "); 9 | std::process::exit(1); 10 | } 11 | 12 | let _ = args.next().unwrap(); 13 | let seed: u128 = args.next().unwrap().parse().unwrap(); 14 | let len: usize = args.next().unwrap().parse().unwrap(); 15 | 16 | let mut fill_rng = Pcg64::new(seed, 0xa02bdbf7bb3c0a7ac28fa16a64abf96); 17 | // note: original len = 8192 * 4 = 1024 * 32 18 | let mut buf = vec![0u8; 1024 * len]; 19 | fill_rng.fill_bytes(&mut buf[..]); 20 | std::io::stdout().lock().write_all(&buf[..]).unwrap(); 21 | } 22 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | edition="2018" 2 | use_try_shorthand=true 3 | use_field_init_shorthand=true 4 | -------------------------------------------------------------------------------- /src/bup.rs: -------------------------------------------------------------------------------- 1 | use crate::{Chunk, ChunkIncr, ToChunkIncr}; 2 | use std::fmt; 3 | use std::num::Wrapping; 4 | 5 | const BLOBBITS: u8 = 13; 6 | const BLOBSIZE: u32 = 1 << (BLOBBITS as u32); 7 | 8 | const WINDOW_BITS: u8 = 6; 9 | const WINDOW_SIZE: usize = 1 << (WINDOW_BITS as usize); 10 | 11 | const ROLLSUM_CHAR_OFFSET: usize = 31; 12 | 13 | /// Rolling sum used by [`Bup`] for splitting 14 | /// 15 | /// - https://github.com/bup/bup/blob/0ab7c3a958729b4723e6fe254da771aff608c2bf/lib/bup/bupsplit.c 16 | 
/// - https://github.com/bup/bup/blob/0ab7c3a958729b4723e6fe254da771aff608c2bf/lib/bup/bupsplit.h 17 | /// 18 | #[derive(Debug, Clone, PartialEq, Eq)] 19 | pub struct RollSum { 20 | window_len: usize, 21 | } 22 | 23 | impl ToChunkIncr for RollSum { 24 | type Incr = RollSumIncr; 25 | 26 | fn to_chunk_incr(&self) -> Self::Incr { 27 | self.into() 28 | } 29 | } 30 | 31 | impl RollSum { 32 | pub fn with_window(window_len: usize) -> Self { 33 | Self { window_len } 34 | } 35 | } 36 | 37 | impl Chunk for RollSum { 38 | type SearchState = RollSumSearchState; 39 | 40 | fn to_search_state(&self) -> Self::SearchState { 41 | self.into() 42 | } 43 | 44 | fn find_chunk_edge( 45 | &self, 46 | state: &mut Self::SearchState, 47 | data: &[u8], 48 | ) -> (Option, usize) { 49 | for i in state.offset..data.len() { 50 | let a = data[i]; 51 | let d = if i >= self.window_len { 52 | data[i - self.window_len] 53 | } else { 54 | 0 55 | }; 56 | 57 | state.state.add(self.window_len, d, a); 58 | 59 | if state.state.at_split() { 60 | state.reset(self); 61 | return (Some(i + 1), i + 1); 62 | } 63 | } 64 | 65 | // keep k elements = discard all but k 66 | let discard_ct = data.len().saturating_sub(self.window_len); 67 | state.offset = data.len() - discard_ct; 68 | (None, discard_ct) 69 | } 70 | } 71 | 72 | #[derive(Debug, Clone, PartialEq, Eq)] 73 | pub struct RollSumState { 74 | // NOTE: in bup, these are `unsigned`, but masking indicates they'll end up being used as 75 | // u16's. In `librsync`, these are `uint_fast16_t`, which end up being u32 on most platforms. 76 | // Both only require `u16` values to be represented. We use `u32` here as it's likely to be 77 | // somewhat more performant, but this should be examined 78 | s1: Wrapping, 79 | s2: Wrapping, 80 | } 81 | 82 | impl From<&RollSum> for RollSumState { 83 | fn from(s: &RollSum) -> Self { 84 | let ws = Wrapping(s.window_len as u32); 85 | // NOTE: bup uses this initialization, but librsync uses zeros. 
86 | // 87 | // I believe the idea is to allow a slightly different implimentation of the "setup" 88 | // portion of the processing (ie: before the window is filled) 89 | Self { 90 | s1: ws * Wrapping(ROLLSUM_CHAR_OFFSET as u32), 91 | s2: ws * (ws - Wrapping(1)) * Wrapping(ROLLSUM_CHAR_OFFSET as u32), 92 | } 93 | } 94 | } 95 | 96 | impl RollSumState { 97 | fn reset(&mut self, params: &RollSum) { 98 | *self = params.into() 99 | } 100 | } 101 | 102 | #[derive(Debug, Clone, PartialEq, Eq)] 103 | pub struct RollSumSearchState { 104 | state: RollSumState, 105 | offset: usize, 106 | } 107 | 108 | impl From<&RollSum> for RollSumSearchState { 109 | fn from(s: &RollSum) -> Self { 110 | Self { 111 | state: s.into(), 112 | offset: 0, 113 | } 114 | } 115 | } 116 | 117 | impl RollSumSearchState { 118 | fn reset(&mut self, params: &RollSum) { 119 | self.offset = 0; 120 | self.state.reset(params); 121 | } 122 | } 123 | 124 | impl Default for RollSum { 125 | fn default() -> Self { 126 | Self::with_window(WINDOW_SIZE) 127 | } 128 | } 129 | 130 | /// Incrimental instance of [`RollSum`] 131 | /// 132 | /// Performance note: Bup's Roll sum algorithm requires tracking the entire window. As a result, 133 | /// this includes a circular buffer which all inputs are copied through. If your use case allows 134 | /// it, use the non-incrimental variant for improved performance. 
135 | #[derive(Clone, PartialEq, Eq)] 136 | pub struct RollSumIncr { 137 | state: RollSumState, 138 | 139 | /// window offset 140 | wofs: Wrapping, 141 | window: Box<[u8]>, 142 | } 143 | 144 | impl fmt::Debug for RollSumIncr { 145 | fn fmt(&self, f: &mut ::std::fmt::Formatter<'_>) -> Result<(), ::std::fmt::Error> { 146 | f.debug_struct("RollSumIncr") 147 | .field("state", &self.state) 148 | .field("window", &::fmt_extra::Hs(&self.window[..])) 149 | .field("wofs", &self.wofs) 150 | .finish() 151 | } 152 | } 153 | 154 | impl From<&RollSum> for RollSumIncr { 155 | fn from(params: &RollSum) -> Self { 156 | Self { 157 | state: params.into(), 158 | window: vec![0; params.window_len].into_boxed_slice(), 159 | wofs: Wrapping(0), 160 | } 161 | } 162 | } 163 | 164 | impl Default for RollSumIncr { 165 | fn default() -> Self { 166 | (&RollSum::default()).into() 167 | } 168 | } 169 | 170 | impl RollSumState { 171 | fn add(&mut self, window_len: usize, drop: u8, add: u8) { 172 | let d = Wrapping(drop as u32); 173 | self.s1 += Wrapping(add as u32); 174 | self.s1 -= d; 175 | self.s2 += self.s1; 176 | self.s2 -= Wrapping(window_len as u32) * (d + Wrapping(ROLLSUM_CHAR_OFFSET as u32)); 177 | } 178 | 179 | fn digest(&self) -> u32 { 180 | (self.s1.0 << 16) | (self.s2.0 & 0xffff) 181 | } 182 | 183 | fn at_split(&self) -> bool { 184 | (self.digest() & (BLOBSIZE - 1)) == (BLOBSIZE - 1) 185 | } 186 | } 187 | 188 | impl RollSumIncr { 189 | pub fn digest(&self) -> u32 { 190 | self.state.digest() 191 | } 192 | 193 | fn add(&mut self, drop: u8, add: u8) { 194 | self.state.add(self.window.len(), drop, add); 195 | } 196 | 197 | pub fn roll_byte(&mut self, ch: u8) { 198 | let w = self.window[self.wofs.0]; 199 | self.add(w, ch); 200 | self.window[self.wofs.0] = ch; 201 | self.wofs = Wrapping((self.wofs + Wrapping(1)).0 & (self.window.len() - 1)); 202 | } 203 | 204 | #[cfg(test)] 205 | pub(crate) fn roll(&mut self, data: &[u8]) { 206 | for &i in data.iter() { 207 | self.roll_byte(i); 208 | } 209 | 
} 210 | 211 | /* 212 | fn sum(data: &[u8]) -> u32 { 213 | let mut x = Self::default(); 214 | x.roll(data); 215 | x.digest() 216 | } 217 | */ 218 | 219 | pub fn at_split(&self) -> bool { 220 | self.state.at_split() 221 | } 222 | } 223 | 224 | impl ChunkIncr for RollSumIncr { 225 | fn push(&mut self, data: &[u8]) -> Option { 226 | for (i, &v) in data.iter().enumerate() { 227 | self.roll_byte(v); 228 | if self.at_split() { 229 | return Some(i + 1); 230 | } 231 | } 232 | 233 | None 234 | } 235 | } 236 | 237 | #[cfg(test)] 238 | mod test { 239 | use super::*; 240 | use rand::RngCore; 241 | use rollsum::Engine; 242 | 243 | #[test] 244 | fn rs() { 245 | let mut m = RollSumIncr::default(); 246 | m.roll_byte(3); 247 | assert_eq!(m.digest(), 130279491); 248 | } 249 | 250 | #[test] 251 | fn compare_rollsum() { 252 | let mut m1 = RollSumIncr::default(); 253 | let mut m2 = rollsum::Bup::default(); 254 | 255 | assert_eq!(m1.digest(), m2.digest()); 256 | 257 | m1.roll_byte(4); 258 | m2.roll_byte(4); 259 | 260 | assert_eq!(m1.digest(), m2.digest()); 261 | 262 | m1.roll_byte(18); 263 | m2.roll_byte(18); 264 | 265 | assert_eq!(m1.digest(), m2.digest()); 266 | 267 | let mut r = rand::thread_rng(); 268 | let mut b = [0u8; 2048]; 269 | 270 | r.fill_bytes(&mut b); 271 | 272 | for (i, &v) in b.iter().enumerate() { 273 | m1.roll_byte(v); 274 | m2.roll_byte(v); 275 | println!("i={}, v={}", i, v); 276 | assert_eq!(m1.digest(), m2.digest()); 277 | } 278 | 279 | m1.roll(&b); 280 | m2.roll(&b); 281 | 282 | assert_eq!(m1.digest(), m2.digest()); 283 | } 284 | 285 | #[test] 286 | fn compare_bup() { 287 | use super::ChunkIncr; 288 | let mut m1 = RollSumIncr::default(); 289 | let mut m2 = rollsum::Bup::default(); 290 | 291 | let mut r = rand::thread_rng(); 292 | let mut b = [0u8; 2048]; 293 | 294 | r.fill_bytes(&mut b); 295 | 296 | let mut x = &b[..]; 297 | loop { 298 | let v1 = m1.push(&x); 299 | let v2 = m2.find_chunk_edge(&x); 300 | assert_eq!(v1, v2.map(|x| x.0)); 301 | 302 | match v1 { 303 | 
None => break, 304 | Some(v) => { 305 | x = &x[v..]; 306 | } 307 | } 308 | } 309 | } 310 | } 311 | -------------------------------------------------------------------------------- /src/buzhash.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "buzhash")] 2 | 3 | //! BuzHash (aka Cyclic Polynomial Hashing) is a window based rolling hash 4 | //! 5 | //! BuzHash with various chunk-splitting methods is used in: 6 | //! - [Borg](https://github.com/borgbackup/borg) 7 | //! - https://github.com/borgbackup/borg/blob/master/src/borg/_chunker.c 8 | //! - [Attic](https://github.com/jborg/attic) 9 | //! - https://github.com/jborg/attic/blob/master/attic/_chunker.c 10 | //! - [silvasur/buzhash](https://github.com/silvasur/buzhash), in turn used in: 11 | //! - [attic-labs/nom](https://github.com/attic-labs/noms/blob/26620a34bc8c95812037588869d4790b5581b34d/go/types/rolling_value_hasher.go#L15-L21) 12 | //! - [dolt](https://github.com/liquidata-inc/dolt) 13 | //! - [casync](https://github.com/systemd/casync/blob/master/src/cachunker.c) 14 | //! 15 | //! Documentation: 16 | //! 17 | //! - [Recursive Hashing Functions for n-Grams, JONATHAN D. COHEN](https://www.csee.umbc.edu/courses/graduate/676/recursivehashingp291-cohen) 18 | //! - ["Cyclic Polynomial", Rolling Hashes, Wikipedia](https://en.wikipedia.org/wiki/Rolling_hash#cite_ref-3) 19 | //! 20 | use crate::{Chunk, ChunkIncr, ToChunkIncr}; 21 | use std::fmt; 22 | use std::num::Wrapping; 23 | /* Cyclic polynomial (buzhash) 24 | * 25 | * H = s ** (k -1) (h(c_1)) ^ s**(k-2)(h(c_2)) ^ ... ^ s(h(c_(k-1))) ^ h(c_k) 26 | * where s(x) is a barrel shift of x (ABCDEFG becomes BCDEFGA, where each letter is a bit) 27 | * s**y(x) is application of s(n) y times. 
28 | * 29 | * Application: 30 | * 31 | * - H <- s(H) 32 | * - c_1 <- s**k(h(c_1)) 33 | * - H <- s(H) ^ s**k(h(c_1)) ^ h(c_(k+1)) 34 | * 35 | * Where c_1 is the character to remove, 36 | * c_(k+1) is the character to add 37 | * 38 | * Parameters: 39 | * - k: number of inputs to contain (can be un-capped?) 40 | * - h: a hash function from inputs to integers on [2, 2**L) 41 | * 42 | * State: 43 | * - every input contained in the hash (if removal is required) 44 | * - previous hash result 45 | */ 46 | 47 | /// Describes an instance of BuzHash (aka cyclic polynomial hash). 48 | /// 49 | /// Provides parameterization over the window size (`k`), hash function (`h`), chunk edge mask, and 50 | /// max chunk size. 51 | /// 52 | /// Uses fixed 32-bit width for the hash. 53 | /// 54 | /// The trait [`BuzHashHash`] provides the internal hash function, see the implimentations of it 55 | /// for built-in hash options (which include both `Borg` and `silvasur/buzhash`'s internal hash 56 | /// tables). 57 | /// 58 | /// Note that it's helpful for `k` to be prime to prevent repeating strings form resulting 59 | /// in total cancelation of the internal hash, which can cause overly long chunks. 60 | /// 61 | /// Adjusting `mask` changes the average chunk size. 62 | /// 63 | /// # Performance 64 | /// 65 | /// [`BuzHash`] requires storing bytes equal to it's window size (`k`). Because of this, 66 | /// [`BuzHashIncr`] may have poor performance compared to [`BuzHash::find_chunk_edge()`]. 
67 | #[derive(Debug, Clone, PartialEq, Eq)] 68 | pub struct BuzHash { 69 | /// number of characters to consider at once 70 | k: usize, 71 | 72 | /// A hash function over a single byte that emits a 32-bit value 73 | h: H, 74 | 75 | /// the 1 bits indicates the bit in the hash which must be 1 to form a chunk edge 76 | /// (called `pattern` in `attic-labs/nom`) 77 | mask: u32, 78 | 79 | /// if the index grows _above_ this size, a chunk edge is formed 80 | max_chunk_size: u64, 81 | } 82 | 83 | impl BuzHash { 84 | /// Create an instance with the given capacity (k) and chunk termination `mask`, and a internal 85 | /// `hash` function. 86 | /// 87 | /// `capacity` is the number of bytes that are taken into account for a given hash. 88 | /// `mask` affects how chunk edges are determined. 89 | /// `hash` is applied to each byte of input prior to mixing into the rolling hash. 90 | pub fn new(capacity: usize, mask: u32, hash: H, max_chunk_size: u64) -> Self { 91 | assert!(capacity > 0); 92 | BuzHash { 93 | k: capacity, 94 | h: hash, 95 | mask, 96 | max_chunk_size, 97 | } 98 | } 99 | 100 | // fn new_attic() 101 | // fn new_bup() 102 | } 103 | 104 | impl<'a> BuzHash> { 105 | /// Create a buzhash instance using defaults from attic-labs/nom version 7.17 106 | /// 107 | /// - `k: 67` 108 | /// - `hash` is the `silvasur/buzhash` table 109 | /// - `mask: 1<<12 -1` 110 | /// - `max_chunk_size: 1 << 24` 111 | pub fn new_nom(salt: u8) -> Self { 112 | BuzHash::new( 113 | 67, 114 | (1 << 12u32) - 1, 115 | BuzHashTableByteSaltHash::from((salt, &crate::buzhash_table::GO_BUZHASH)), 116 | 1 << 24, 117 | ) 118 | } 119 | } 120 | 121 | impl Chunk for BuzHash { 122 | type SearchState = BuzHashSearchState; 123 | 124 | fn to_search_state(&self) -> Self::SearchState { 125 | Self::SearchState::default() 126 | } 127 | 128 | fn find_chunk_edge( 129 | &self, 130 | state: &mut Self::SearchState, 131 | data: &[u8], 132 | ) -> (Option, usize) { 133 | for i in state.offset..data.len() { 134 | 
state.state.add_buf(data, self, i); 135 | 136 | if (state.state.h & self.mask) == self.mask { 137 | state.reset(); 138 | return (Some(i + 1), i + 1); 139 | } 140 | 141 | /* 142 | * broken: `i` is not the number of bytes since prev chunk. 143 | * need to track internal last chunk 144 | if i as u64 > self.max_chunk_size { 145 | state.reset(); 146 | println!(" <- CHUNK: {}", i + 1); 147 | return (Some(i + 1), i + 1); 148 | } 149 | */ 150 | } 151 | 152 | // keep k elements = discard all but k 153 | let discard_ct = data.len().saturating_sub(self.k); 154 | state.offset = data.len() - discard_ct; 155 | (None, discard_ct) 156 | } 157 | } 158 | 159 | impl From<&BuzHash> for BuzHashIncr { 160 | fn from(src: &BuzHash) -> Self { 161 | src.clone().into() 162 | } 163 | } 164 | 165 | impl ToChunkIncr for BuzHash { 166 | type Incr = BuzHashIncr; 167 | fn to_chunk_incr(&self) -> Self::Incr { 168 | self.into() 169 | } 170 | } 171 | 172 | #[derive(Debug, Clone, PartialEq, Eq, Default)] 173 | pub struct BuzHashSearchState { 174 | offset: usize, 175 | state: BuzHashState, 176 | } 177 | 178 | impl BuzHashSearchState { 179 | fn reset(&mut self) { 180 | self.offset = 0; 181 | self.state.reset(); 182 | } 183 | } 184 | 185 | #[derive(Debug, Clone, PartialEq, Eq, Default)] 186 | struct BuzHashState { 187 | /// current value of the hash. 
188 | h: u32, 189 | } 190 | 191 | impl BuzHashState { 192 | fn reset(&mut self) { 193 | self.h = 0; 194 | } 195 | 196 | fn add_buf(&mut self, data: &[u8], params: &BuzHash, i: usize) { 197 | if i >= params.k { 198 | // need to find and "remove" a entry 199 | let drop_i = i - params.k; 200 | let drop = data[drop_i]; 201 | self.add_overflow(params, data[i], drop); 202 | } else { 203 | // no removal 204 | self.add(params, data[i]); 205 | } 206 | } 207 | 208 | // insert, assuming no overflow 209 | fn add(&mut self, params: &BuzHash, v: u8) { 210 | self.h = self.h.rotate_left(1) ^ params.h.hash(v); 211 | } 212 | 213 | // insert with overflow 214 | fn add_overflow(&mut self, params: &BuzHash, add_v: u8, remove_v: u8) { 215 | let h = self.h.rotate_left(1); 216 | // need to find and "remove" a entry 217 | let drop = params.h.hash(remove_v).rotate_left((params.k % 8) as u32); 218 | self.h = h ^ drop ^ params.h.hash(add_v); 219 | } 220 | } 221 | 222 | /// Self-contained buzhash which buffers it's window of values internally 223 | /// 224 | /// Note that this will be less efficient than using [`BuzHash`] on a slice directly, 225 | /// but may be more convenient. 226 | #[derive(Debug, Clone, PartialEq, Eq)] 227 | pub struct BuzHashIncr { 228 | params: BuzHash, 229 | state: BuzHashState, 230 | buf: Box<[u8]>, 231 | buf_idx: Wrapping, 232 | input_idx: u64, 233 | } 234 | 235 | impl ChunkIncr for BuzHashIncr { 236 | /// Return the index in `data` immeidately following the hash matching. 237 | /// 238 | /// Note that you can call this multiple times to examine "subsequent" `data` slices, but the 239 | /// index returned will always refer to the current `data` slice. 
240 | fn push(&mut self, data: &[u8]) -> Option { 241 | for (i, &v) in data.iter().enumerate() { 242 | self.push_byte(v); 243 | if (self.state.h & self.params.mask) == self.params.mask { 244 | self.reset(); 245 | return Some(i + 1); 246 | } 247 | 248 | if self.input_idx > self.params.max_chunk_size { 249 | self.reset(); 250 | return Some(i + 1); 251 | } 252 | } 253 | 254 | None 255 | } 256 | } 257 | 258 | impl BuzHashIncr { 259 | fn reset(&mut self) { 260 | self.buf_idx = Wrapping(0); 261 | self.input_idx = 0; 262 | self.state.reset(); 263 | } 264 | 265 | fn push_byte(&mut self, val: u8) { 266 | if self.input_idx >= self.params.k as u64 { 267 | let o = self.buf[self.buf_idx.0]; 268 | self.state.add_overflow(&self.params, val, o); 269 | } else { 270 | self.state.add(&self.params, val); 271 | } 272 | 273 | self.buf[self.buf_idx.0] = val; 274 | 275 | self.buf_idx += Wrapping(1); 276 | self.buf_idx.0 %= self.params.k; 277 | self.input_idx += 1; 278 | } 279 | } 280 | 281 | impl From> for BuzHashIncr { 282 | fn from(params: BuzHash) -> Self { 283 | let buf = vec![0; params.k].into_boxed_slice(); 284 | Self { 285 | params, 286 | state: Default::default(), 287 | buf, 288 | buf_idx: Wrapping(0), 289 | input_idx: 0, 290 | } 291 | } 292 | } 293 | 294 | /// The internal byte to u32 mapping used in buzhash 295 | pub trait BuzHashHash { 296 | fn hash(&self, data: u8) -> u32; 297 | } 298 | 299 | /// Use a referenced table to preform the `BuzHashHash` internal hashing 300 | #[derive(Clone)] 301 | pub struct BuzHashTableHash<'a> { 302 | table: &'a [u32; 256], 303 | } 304 | 305 | impl<'a> fmt::Debug for BuzHashTableHash<'a> { 306 | fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { 307 | fmt.debug_struct("BuzHashTableHash").finish() 308 | } 309 | } 310 | 311 | impl<'a> From<&'a [u32; 256]> for BuzHashTableHash<'a> { 312 | fn from(table: &'a [u32; 256]) -> Self { 313 | Self { table } 314 | } 315 | } 316 | 317 | impl<'a> BuzHashHash for BuzHashTableHash<'a> { 318 | fn 
hash(&self, data: u8) -> u32 { 319 | self.table[data as usize] 320 | } 321 | } 322 | 323 | /// Use a owned table to perform the `BuzHashHash` internal hashing 324 | #[derive(Clone)] 325 | pub struct BuzHashTableBufHash { 326 | table: Box<[u32; 256]>, 327 | } 328 | 329 | impl fmt::Debug for BuzHashTableBufHash { 330 | fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { 331 | fmt.debug_struct("BuzHashTableBufHash").finish() 332 | } 333 | } 334 | 335 | impl<'a> From> for BuzHashTableBufHash { 336 | fn from(table: Box<[u32; 256]>) -> Self { 337 | Self { table } 338 | } 339 | } 340 | 341 | impl BuzHashHash for BuzHashTableBufHash { 342 | fn hash(&self, data: u8) -> u32 { 343 | self.table[data as usize] 344 | } 345 | } 346 | 347 | /// Lookup up in a table, after applying a salt via xor to the input byte 348 | /// 349 | /// Used by attic-labs/nom 350 | #[derive(Clone)] 351 | pub struct BuzHashTableByteSaltHash<'a> { 352 | table: &'a [u32; 256], 353 | salt: u8, 354 | } 355 | 356 | impl<'a> fmt::Debug for BuzHashTableByteSaltHash<'a> { 357 | fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { 358 | fmt.debug_struct("BuzHashTableByteSaltHash").finish() 359 | } 360 | } 361 | 362 | impl<'a> From<(u8, &'a [u32; 256])> for BuzHashTableByteSaltHash<'a> { 363 | fn from((salt, table): (u8, &'a [u32; 256])) -> Self { 364 | Self { table, salt } 365 | } 366 | } 367 | 368 | impl<'a> BuzHashHash for BuzHashTableByteSaltHash<'a> { 369 | fn hash(&self, data: u8) -> u32 { 370 | self.table[(data ^ self.salt) as usize] 371 | } 372 | } 373 | -------------------------------------------------------------------------------- /src/buzhash_table.rs: -------------------------------------------------------------------------------- 1 | /// https://github.com/silvasur/buzhash/blob/9bdec3dec7c611fa97beadc374d75bdf02cd880e/hash.go#L13-L57 2 | /// 3 | /// used by attic & attic-labs/nom 4 | pub const GO_BUZHASH: [u32; 256] = [ 5 | 0x12bd9527, 0xf4140cea, 0x987bd6e1, 0x79079850, 
0xafbfd539, 0xd350ce0a, 0x82973931, 0x9fc32b9c, 6 | 0x28003b88, 0xc30c13aa, 0x6b678c34, 0x5844ef1d, 0xaa552c18, 0x4a77d3e8, 0xd1f62ea0, 0x6599417c, 7 | 0xfbe30e7a, 0xf9e2d5ee, 0xa1fca42e, 0x41548969, 0x116d5b59, 0xaeda1e1a, 0xc5191c17, 0x54b9a3cb, 8 | 0x727e492a, 0x5c432f91, 0x31a50bce, 0xc2696af6, 0x217c8020, 0x1262aefc, 0xace75924, 0x9876a04f, 9 | 0xaf300bc2, 0x3ffce3f6, 0xd6680fb5, 0xd0b1ced8, 0x6651f842, 0x736fadef, 0xbc2d3429, 0xb03d2904, 10 | 0x7e634ba4, 0xdfd87d8c, 0x7988d63a, 0x4be4d933, 0x6a8d0382, 0x9e132d62, 0x3ee9c95f, 0xfec05b97, 11 | 0x6907ad34, 0x8616cfcc, 0xa6aabf24, 0x8ad1c92e, 0x4f2affc0, 0xb87519db, 0x6576eaf6, 0x15dbe00a, 12 | 0x63e1dd82, 0xa36b6a81, 0xeead99b3, 0xbc6a4309, 0x3478d1a7, 0x2182bcc0, 0xdd50cfce, 0x7cb25580, 13 | 0x73075483, 0x503b7f42, 0x4cd50d63, 0x3f4d94c9, 0x385fcbb7, 0x90daf16c, 0xece10b8e, 0x11c1cb04, 14 | 0x816a899b, 0x69a29d06, 0xfb090b37, 0xf98ef13c, 0x07653435, 0x9f15dc42, 0x3b43abdf, 0x1334283f, 15 | 0x93f3d9af, 0x0cbdfe71, 0xa788a614, 0x4f54d2f0, 0xd4374fc7, 0x70557ce7, 0xf741fce8, 0xe4b6f661, 16 | 0xc630cb98, 0x387a6366, 0x72f428fd, 0x539009db, 0xc53e3810, 0x1e1a52e5, 0x7d6816b0, 0x040f9b81, 17 | 0x9c99c9fb, 0x9f3af3d2, 0x774d1061, 0xd5c840ea, 0x8e1480fe, 0x6ee4023c, 0x2fbda535, 0xd88eff7a, 18 | 0xd8632a2a, 0x43c4e024, 0x3ef27971, 0xc72866fd, 0xe35cc630, 0x46d96220, 0x437a8384, 0xe92caf0c, 19 | 0x6290a47e, 0xa7bb9238, 0x0e1000f9, 0x49e76bdc, 0x3acfb4b8, 0x03582b8e, 0x6ea2de4e, 0x2ec1008d, 20 | 0xfcc8df69, 0x91c2fe0a, 0xb471c7d9, 0x778be812, 0x70d29ad1, 0x76411cbf, 0xc302e81c, 0x4e445194, 21 | 0x22e3aa72, 0xb65762e9, 0xa280db05, 0x827aa70e, 0x4c531a9d, 0x7a60bf4a, 0x8fd95a44, 0x2289aef0, 22 | 0xcd50ddc4, 0x639aae69, 0x5fe85ed6, 0x4ed724ff, 0x00f04f7d, 0x95a5fcb0, 0x88255d15, 0xa603d2c9, 23 | 0xf6956a5b, 0x53ea7f3e, 0xb570f225, 0x2b3be203, 0xa181e40e, 0xc413cdce, 0xa7cb1ebb, 0xcf258b1f, 24 | 0x516eb016, 0xca204586, 0xd1e69894, 0xe85a73d3, 0x7db2d382, 0xae73b463, 0x3598d643, 0x5087c864, 25 | 0xd91f30b6, 0xe1d4d1e7, 
0x73b3b337, 0xceac1233, 0x8edf7845, 0xa69c45c9, 0xdb5db3ab, 0x28cfade8, 26 | 0xebfa49e7, 0xcbc2a659, 0x59cce971, 0x959a01af, 0x8ee9aae7, 0xfb2f01c6, 0x5a752836, 0x9ed12981, 27 | 0x618d05b6, 0x93ec12b3, 0x4590c779, 0xed1317a2, 0x03fe5835, 0x7ad3c6f7, 0xd4aad5b5, 0x1a995ed7, 28 | 0x247bfaa4, 0x69c2c799, 0x745fa405, 0xc5b9f239, 0xc3d9aebc, 0xa6f60e0b, 0xdf1e91d7, 0xab8e041c, 29 | 0xee3188c6, 0x37377a9e, 0xc0e1a3bf, 0x19a5a9e4, 0x56cb9556, 0xc4d33d3f, 0xfb1eb03e, 0xf9557057, 30 | 0x1be31d37, 0xd1fa65f1, 0xf518d714, 0x570ac722, 0xf26cf66a, 0x24794d47, 0x8ba2e402, 0x3f5137e6, 31 | 0x35be1453, 0x43350478, 0x9f05ee88, 0x364cf9cf, 0x39a23ee7, 0xa4db8d49, 0xc2ebb3d2, 0xc6fb99d5, 32 | 0xe014dfb0, 0x7156d425, 0xe090a87a, 0x4cc12f78, 0x1b30f503, 0x06694a7a, 0x68198cd1, 0x2f8345bd, 33 | 0x9d79198e, 0xd871943f, 0x22ef6cf4, 0xe81b1c15, 0x067b61d8, 0xfc4ea4f5, 0xfe6dab57, 0x1bf744ba, 34 | 0xa70b6a25, 0xafe6e412, 0xc6c1a05c, 0x8ffbe3ce, 0xc4270af1, 0xf3f36373, 0xc4507dd8, 0x5e6fd1e2, 35 | 0x58cd9739, 0x47d3c5b5, 0xe1d5a343, 0x3d4dea4a, 0x893d91ae, 0xbb2a5e2a, 0x0d57b800, 0x652a7cc9, 36 | 0x6a68ccfd, 0x62529f0b, 0xec5f36d6, 0x766cceda, 0x96ca63ef, 0xa0499838, 0xd9030f59, 0x8185f4d2, 37 | ]; 38 | 39 | /// https://github.com/borgbackup/borg/blob/master/src/borg/_chunker.c#L30-L64 40 | /// 41 | /// Note that borg does not use this directly, it xors it with a 32bit seed prior to use 42 | pub const BORG_BUZHASH_BASE: [u32; 256] = [ 43 | 0xe7f831ec, 0xf4026465, 0xafb50cae, 0x6d553c7a, 0xd639efe3, 0x19a7b895, 0x9aba5b21, 0x5417d6d4, 44 | 0x35fd2b84, 0xd1f6a159, 0x3f8e323f, 0xb419551c, 0xf444cebf, 0x21dc3b80, 0xde8d1e36, 0x84a32436, 45 | 0xbeb35a9d, 0xa36f24aa, 0xa4e60186, 0x98d18ffe, 0x3f042f9e, 0xdb228bcd, 0x096474b7, 0x5c20c2f7, 46 | 0xf9eec872, 0xe8625275, 0xb9d38f80, 0xd48eb716, 0x22a950b4, 0x3cbaaeaa, 0xc37cddd3, 0x8fea6f6a, 47 | 0x1d55d526, 0x7fd6d3b3, 0xdaa072ee, 0x4345ac40, 0xa077c642, 0x8f2bd45b, 0x28509110, 0x55557613, 48 | 0xffc17311, 0xd961ffef, 0xe532c287, 0xaab95937, 
0x46d38365, 0xb065c703, 0xf2d91d0f, 0x92cd4bb0, 49 | 0x4007c712, 0xf35509dd, 0x505b2f69, 0x557ead81, 0x310f4563, 0xbddc5be8, 0x9760f38c, 0x701e0205, 50 | 0x00157244, 0x14912826, 0xdc4ca32b, 0x67b196de, 0x5db292e8, 0x8c1b406b, 0x01f34075, 0xfa2520f7, 51 | 0x73bc37ab, 0x1e18bc30, 0xfe2c6cb3, 0x20c522d0, 0x5639e3db, 0x942bda35, 0x899af9d1, 0xced44035, 52 | 0x98cc025b, 0x255f5771, 0x70fefa24, 0xe928fa4d, 0x2c030405, 0xb9325590, 0x20cb63bd, 0xa166305d, 53 | 0x80e52c0a, 0xa8fafe2f, 0x1ad13f7d, 0xcfaf3685, 0x6c83a199, 0x7d26718a, 0xde5dfcd9, 0x79cf7355, 54 | 0x8979d7fb, 0xebf8c55e, 0xebe408e4, 0xcd2affba, 0xe483be6e, 0xe239d6de, 0x5dc1e9e0, 0x0473931f, 55 | 0x851b097c, 0xac5db249, 0x09c0f9f2, 0xd8d2f134, 0xe6f38e41, 0xb1c71bf1, 0x52b6e4db, 0x07224424, 56 | 0x6cf73e85, 0x4f25d89c, 0x782a7d74, 0x10a68dcd, 0x3a868189, 0xd570d2dc, 0x69630745, 0x9542ed86, 57 | 0x331cd6b2, 0xa84b5b28, 0x07879c9d, 0x38372f64, 0x7185db11, 0x25ba7c83, 0x01061523, 0xe6792f9f, 58 | 0xe5df07d1, 0x4321b47f, 0x7d2469d8, 0x1a3a4f90, 0x48be29a3, 0x669071af, 0x8ec8dd31, 0x0810bfbf, 59 | 0x813a06b4, 0x68538345, 0x65865ddc, 0x43a71b8e, 0x78619a56, 0x5a34451d, 0x5bdaa3ed, 0x71edc7e9, 60 | 0x17ac9a20, 0x78d10bfa, 0x6c1e7f35, 0xd51839d9, 0x240cbc51, 0x33513cc1, 0xd2b4f795, 0xccaa8186, 61 | 0x0babe682, 0xa33cf164, 0x18c643ea, 0xc1ca105f, 0x9959147a, 0x6d3d94de, 0x0b654fbe, 0xed902ca0, 62 | 0x7d835cb5, 0x99ba1509, 0x6445c922, 0x495e76c2, 0xf07194bc, 0xa1631d7e, 0x677076a5, 0x89fffe35, 63 | 0x1a49bcf3, 0x8e6c948a, 0x0144c917, 0x8d93aea1, 0x16f87ddf, 0xc8f25d49, 0x1fb11297, 0x27e750cd, 64 | 0x2f422da1, 0xdee89a77, 0x1534c643, 0x457b7b8b, 0xaf172f7a, 0x6b9b09d6, 0x33573f7f, 0xf14e15c4, 65 | 0x526467d5, 0xaf488241, 0x87c3ee0d, 0x33be490c, 0x95aa6e52, 0x43ec242e, 0xd77de99b, 0xd018334f, 66 | 0x5b78d407, 0x498eb66b, 0xb1279fa8, 0xb38b0ea6, 0x90718376, 0xe325dee2, 0x8e2f2cba, 0xcaa5bdec, 67 | 0x9d652c56, 0xad68f5cb, 0xa77591af, 0x88e37ee8, 0xf8faa221, 0xfcbbbe47, 0x4f407786, 0xaf393889, 68 | 0xf444a1d9, 0x15ae1a2f, 
0x40aa7097, 0x6f9486ac, 0x29d232a3, 0xe47609e9, 0xe8b631ff, 0xba8565f4,
    0x11288749, 0x46c9a838, 0xeb1b7cd8, 0xf516bbb1, 0xfb74fda0, 0x010996e6, 0x4c994653, 0x1d889512,
    0x53dcd9a3, 0xdd074697, 0x1e78e17c, 0x637c98bf, 0x930bb219, 0xcf7f75b0, 0xcb9355fb, 0x9e623009,
    0xe466d82c, 0x28f968d3, 0xfeb385d9, 0x238e026c, 0xb8ed0560, 0x0c6a027a, 0x3d6fec4b, 0xbb4b2ec2,
    0xe715031c, 0xeded011d, 0xcdc4d3b9, 0xc456fc96, 0xdd0eea20, 0xb3df8ec9, 0x12351993, 0xd9cbb01c,
    0x603147a2, 0xcf37d17d, 0xf7fcd9dc, 0xd8556fa3, 0x104c8131, 0x13152774, 0xb4715811, 0x6a72c2c9,
    0xc5ae37bb, 0xa76ce12a, 0x8150d8f3, 0x2ec29218, 0xa35f0984, 0x48c0647e, 0x0b5ff98c, 0x71893f7b,
];

/// Build borg's effective buzhash table for a given `seed`.
///
/// borg xors each entry of [`BORG_BUZHASH_BASE`] with a 32-bit seed prior to use.
pub fn borg_buzhash_table(seed: u32) -> Box<[u32; 256]> {
    let mut t = Box::new([0u32; 256]);
    // iterator form instead of `for i in 0..t.len()` — avoids per-element bounds
    // checks and the clippy::needless_range_loop lint (CI runs clippy with -D warnings)
    for (dst, &base) in t.iter_mut().zip(BORG_BUZHASH_BASE.iter()) {
        *dst = base ^ seed;
    }

    t
}
-------------------------------------------------------------------------------- /src/fastcdc.rs: --------------------------------------------------------------------------------
#![cfg(feature = "fastcdc")]

//! FastCDC is a chunking algorithm using some features from [Gear](super::gear)
//!
//! Reference:
//!  - https://www.usenix.org/system/files/conference/atc16/atc16-paper-xia.pdf

use crate::{Chunk, ChunkIncr, ToChunkIncr};
use std::fmt;
use std::num::Wrapping;

// these masks are taken from the paper and could be adjusted/adjustable.
13 | const MASK_S: u64 = 0x0003590703530000; 14 | //const MASK_A: u64 = 0x0000d90303530000; 15 | const MASK_L: u64 = 0x0000d90003530000; 16 | 17 | /// An instance of the "FastCDC" algorithm 18 | /// 19 | /// Default parameters: 20 | /// - Minimum chunk size: 2 KiB 21 | /// - Maximum chunk size: 64 KiB 22 | /// - Normal size: 8 KiB 23 | /// - internal 64-bit gear table: [`super::gear_table::GEAR_64`] 24 | /// 25 | #[derive(Clone, Copy)] 26 | pub struct FastCdc<'a> { 27 | gear: &'a [u64; 256], 28 | min_size: u64, 29 | max_size: u64, 30 | normal_size: u64, 31 | } 32 | 33 | impl<'a> PartialEq for FastCdc<'a> { 34 | fn eq(&self, other: &Self) -> bool { 35 | self.min_size == other.min_size 36 | && self.max_size == other.max_size 37 | && self.normal_size == other.normal_size 38 | && self.gear[..] == other.gear[..] 39 | } 40 | } 41 | 42 | impl<'a> Eq for FastCdc<'a> {} 43 | 44 | impl<'a> Default for FastCdc<'a> { 45 | fn default() -> Self { 46 | FastCdc { 47 | min_size: 2 * 1024, // 2 KiB 48 | max_size: 64 * 1024, // 64 KiB 49 | normal_size: 8 * 1024, // 8 KiB 50 | gear: &super::gear_table::GEAR_64, 51 | } 52 | } 53 | } 54 | 55 | impl<'a> fmt::Debug for FastCdc<'a> { 56 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 57 | f.debug_struct("FastCdc") 58 | .field("gear", &"[...]") 59 | .field("min_size", &self.min_size) 60 | .field("max_size", &self.max_size) 61 | .field("normal_size", &self.normal_size) 62 | .finish() 63 | } 64 | } 65 | 66 | impl<'a> Chunk for FastCdc<'a> { 67 | type SearchState = FastCdcState; 68 | 69 | fn to_search_state(&self) -> Self::SearchState { 70 | Default::default() 71 | } 72 | 73 | fn find_chunk_edge( 74 | &self, 75 | state: &mut Self::SearchState, 76 | data: &[u8], 77 | ) -> (Option, usize) { 78 | match state.push(self, data) { 79 | Some(i) => (Some(i + 1), i + 1), 80 | None => (None, data.len()), 81 | } 82 | } 83 | } 84 | 85 | impl<'a> FastCdc<'a> { 86 | /// Create a custom FastCDC instance 87 | pub fn new(gear: &'a [u64; 256], 
min_size: u64, normal_size: u64, max_size: u64) -> Self { 88 | Self { 89 | gear, 90 | min_size, 91 | max_size, 92 | normal_size, 93 | } 94 | } 95 | } 96 | 97 | impl<'a> ToChunkIncr for FastCdc<'a> { 98 | type Incr = FastCdcIncr<'a>; 99 | 100 | fn to_chunk_incr(&self) -> Self::Incr { 101 | self.into() 102 | } 103 | } 104 | 105 | impl<'a> From<&FastCdc<'a>> for FastCdcIncr<'a> { 106 | fn from(params: &FastCdc<'a>) -> Self { 107 | Self { 108 | params: *params, 109 | state: Default::default(), 110 | } 111 | } 112 | } 113 | 114 | /// FastCdcIncr provides an incrimental interface to `FastCdc` 115 | /// 116 | /// This impl does not buffer data passing through it (the FastCDC algorithm does not require 117 | /// look-back) making it very efficient. 118 | #[derive(Debug, Clone, PartialEq, Eq, Default)] 119 | pub struct FastCdcIncr<'a> { 120 | params: FastCdc<'a>, 121 | state: FastCdcState, 122 | } 123 | 124 | #[derive(Debug, Clone, PartialEq, Eq, Default)] 125 | pub struct FastCdcState { 126 | /// Number of bytes we've "examined" 127 | /// 128 | /// varying state. 129 | l: u64, 130 | 131 | /// Current fingerprint 132 | /// 133 | /// varying state. 134 | fp: Wrapping, 135 | } 136 | 137 | impl FastCdcState { 138 | fn reset(&mut self) { 139 | self.l = 0; 140 | self.fp = Wrapping(0); 141 | } 142 | 143 | fn push(&mut self, params: &FastCdc<'_>, data: &[u8]) -> Option { 144 | // global start/index 145 | let mut gi = self.l; 146 | // global end 147 | let ge = data.len() as u64 + gi; 148 | 149 | if ge <= params.min_size { 150 | // No split, no processing of data, but we've "consumed" the bytes. 
151 | self.l = ge; 152 | return None; 153 | } 154 | 155 | // skip elements prior to MIN_SIZE and track offset of new `data` in argument `data` for 156 | // return value 157 | let mut i = if gi <= params.min_size { 158 | let skip = params.min_size - gi; 159 | gi += skip; 160 | skip 161 | } else { 162 | 0 163 | } as usize; 164 | 165 | let mut fp = self.fp; 166 | 167 | loop { 168 | if i >= data.len() { 169 | break; 170 | } 171 | if gi >= params.normal_size { 172 | // go to next set of matches 173 | break; 174 | } 175 | 176 | let v = data[i]; 177 | fp = (fp << 1) + Wrapping(params.gear[v as usize]); 178 | if (fp.0 & MASK_S) == 0 { 179 | self.reset(); 180 | return Some(i); 181 | } 182 | 183 | gi += 1; 184 | i += 1; 185 | } 186 | 187 | loop { 188 | if gi >= params.max_size { 189 | // no match found, emit fixed match at MAX_SIZE 190 | self.reset(); 191 | return Some(i); 192 | } 193 | if i >= data.len() { 194 | break; 195 | } 196 | 197 | let v = data[i]; 198 | fp = (fp << 1) + Wrapping(params.gear[v as usize]); 199 | if (fp.0 & MASK_L) == 0 { 200 | self.reset(); 201 | return Some(i); 202 | } 203 | 204 | gi += 1; 205 | i += 1; 206 | } 207 | 208 | // no match, but not at MAX_SIZE yet, so store context for next time. 209 | self.fp = fp; 210 | self.l = ge; 211 | 212 | None 213 | } 214 | } 215 | 216 | impl<'a> ChunkIncr for FastCdcIncr<'a> { 217 | fn push(&mut self, src: &[u8]) -> Option { 218 | self.state.push(&self.params, src) 219 | } 220 | } 221 | -------------------------------------------------------------------------------- /src/gear.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "gear")] 2 | 3 | use crate::{Chunk, ChunkIncr, ToChunkIncr}; 4 | use std::fmt; 5 | use std::num::Wrapping; 6 | 7 | /// Gear Content Defined Chunking using 32bit expansion. 8 | /// 9 | /// Reference: 10 | /// 11 | /// Xia, W., Jiang, H., Feng, D., Tian, L., Fu, M., and Zhou, Y. 
Ddelta: A dedulication-inspired 12 | /// fast delta compression approach. Performance Evaluation 79 (2014), 258-271. 13 | /// 14 | /// http://wxia.hustbackup.cn/pub/DElta-PEVA-2014.pdf 15 | #[derive(Clone)] 16 | pub struct Gear32<'a> { 17 | /// A mask with an appropriate number of bits set for the desired average chunk size. 18 | /// 19 | /// fixed configuration. 20 | mask: u32, 21 | 22 | /// value to match (fp & mask) against. 23 | /// 24 | /// fixed configuration. 25 | xxx: u32, 26 | 27 | /// A table to map bytes to 32bit values 28 | /// 29 | /// fixed configuration. 30 | gear: &'a [u32; 256], 31 | } 32 | 33 | #[derive(Debug, Default, PartialEq, Eq, Clone)] 34 | pub struct GearState32 { 35 | /// current fingerprint/hash 36 | /// 37 | /// varying state. 38 | fp: Wrapping, 39 | } 40 | 41 | #[derive(Debug, Clone)] 42 | pub struct GearIncr32<'a> { 43 | params: Gear32<'a>, 44 | 45 | state: GearState32, 46 | } 47 | 48 | impl<'a> Chunk for Gear32<'a> { 49 | type SearchState = GearState32; 50 | 51 | fn to_search_state(&self) -> Self::SearchState { 52 | Default::default() 53 | } 54 | 55 | fn find_chunk_edge( 56 | &self, 57 | state: &mut Self::SearchState, 58 | data: &[u8], 59 | ) -> (Option, usize) { 60 | for (i, v) in data.iter().enumerate() { 61 | if state.push(self, *v) { 62 | *state = self.to_search_state(); 63 | return (Some(i + 1), i + 1); 64 | } 65 | } 66 | 67 | (None, data.len()) 68 | } 69 | } 70 | 71 | impl<'a> ToChunkIncr for Gear32<'a> { 72 | type Incr = GearIncr32<'a>; 73 | 74 | fn to_chunk_incr(&self) -> Self::Incr { 75 | self.into() 76 | } 77 | } 78 | 79 | impl<'a> From<&Gear32<'a>> for GearIncr32<'a> { 80 | fn from(params: &Gear32<'a>) -> Self { 81 | Self { 82 | params: params.clone(), 83 | state: Default::default(), 84 | } 85 | } 86 | } 87 | 88 | impl<'a> fmt::Debug for Gear32<'a> { 89 | fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { 90 | fmt.debug_struct("Gear32") 91 | .field("mask", &self.mask) 92 | .field("xxx", &self.xxx) 93 | 
.field("gear", &&self.gear[..]) 94 | .finish() 95 | } 96 | } 97 | 98 | impl GearState32 { 99 | fn push(&mut self, params: &Gear32<'_>, add: u8) -> bool { 100 | self.fp = (self.fp << 1) + Wrapping(params.gear[add as usize]); 101 | self.fp.0 & params.mask == params.xxx 102 | } 103 | 104 | fn reset(&mut self) { 105 | self.fp.0 = 0; 106 | } 107 | } 108 | 109 | impl<'a> ChunkIncr for GearIncr32<'a> { 110 | fn push(&mut self, data: &[u8]) -> Option { 111 | for (i, v) in data.iter().enumerate() { 112 | if self.state.push(&self.params, *v) { 113 | self.state.reset(); 114 | return Some(i); 115 | } 116 | } 117 | 118 | None 119 | } 120 | } 121 | 122 | fn msb_mask(log2: usize) -> u32 { 123 | // at least 1 bit & not all the bits 124 | // FIXME: probably could relax those requirements with better math. 125 | //debug_assert!(log2 > 0); 126 | //debug_assert!(log2 < 32); 127 | 128 | ((1 << log2) - 1) << (32 - log2) 129 | } 130 | 131 | impl<'a> Gear32<'a> { 132 | /// Create a gear chunker which emits blocks with average size `(1< Self { 135 | Gear32 { 136 | mask: msb_mask(average_size_log2), 137 | xxx: 0, 138 | gear: &super::gear_table::GEAR_32, 139 | } 140 | } 141 | } 142 | 143 | impl<'a> Default for Gear32<'a> { 144 | fn default() -> Self { 145 | // 8KB average size 146 | Self::with_average_size_log2(13) 147 | } 148 | } 149 | 150 | #[cfg(test)] 151 | mod test { 152 | #[test] 153 | fn mm() { 154 | use super::msb_mask; 155 | assert_eq!(0b1 << 31, msb_mask(1)); 156 | assert_eq!(0b11 << 30, msb_mask(2)); 157 | assert_eq!(0b111 << 29, msb_mask(3)); 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /src/gear_table/gear32.rs: -------------------------------------------------------------------------------- 1 | /// An abitrary table originally produced for rsroll that is included here for convenience. 
2 | pub static GEAR_32: [u32; 256] = [ 3 | 0xb088d3a9, 0x5652c7f7, 0x45b28969, 0x6b0a89d5, 0x368f573e, 0x1dc636dc, 0x207a4c4e, 0xa474b346, 4 | 0x3b06a83e, 0x90e78d6c, 0xe1c92df7, 0x8e95053a, 0x5a2ef4f1, 0xa50fac94, 0x0e7303eb, 0x99b07edc, 5 | 0x689d2fb5, 0x00005082, 0xc4b08306, 0x3eb0678a, 0xf19f87ab, 0xf2129fbf, 0x48114957, 0x00000106, 6 | 0x1fba3780, 0x3bf06fd6, 0x99687e97, 0x79a10673, 0xe4accf9e, 0x2520e71f, 0x2bd5d3fd, 0x00de4dcd, 7 | 0xeaa9311c, 0xdb748eb6, 0xaf579a8d, 0x86a6e5da, 0xcc2fc30a, 0x355e2afe, 0x2d99c8f4, 0xbade4b4a, 8 | 0xf7b51872, 0x3286b658, 0x0000b688, 0xa115d6e4, 0x484f7e9c, 0xccca7bb7, 0xbf2584a6, 0xade7e813, 9 | 0x00007094, 0x8ae69108, 0xbd776ad7, 0xfb6b001f, 0xc7a474b8, 0xbaf6f116, 0x09cb1f5b, 0xb0b219e6, 10 | 0x00ccbc38, 0xcc849d0a, 0x73a3ef7d, 0xc807d2d3, 0x7f2ac996, 0xd037a86b, 0xf3f17c66, 0xaca626b0, 11 | 0x755a9937, 0x90837ee6, 0x6ee8ad93, 0x0000d9e1, 0x9e063bb2, 0x07ab77f1, 0xec550255, 0x78fb94a8, 12 | 0xc7510e1b, 0x0000320b, 0x827c3326, 0x14675f0b, 0x267bd3a6, 0xf1916ff9, 0x86221b7f, 0x9dbecee7, 13 | 0xea58f8ca, 0x008d1986, 0x6d38704f, 0xe032cb07, 0x228d21f6, 0x635cb1bf, 0x4620a173, 0xa7e7dfe3, 14 | 0x0c10ca93, 0x2727fee8, 0xa2df1c6d, 0x4dcdd1ac, 0x000070ff, 0xa2ace87b, 0x9892275a, 0xc2861181, 15 | 0xbb9972a0, 0xef70cd37, 0x00000513, 0xc058b618, 0x09e85085, 0x9197fb3b, 0x7e1e626d, 0x520c5450, 16 | 0xbee17971, 0x6fd9ac32, 0x0023957c, 0xa01c7d7e, 0xaba2c758, 0x0d1fa0ce, 0x0bb6a58b, 0x4333dd5b, 17 | 0xc2fd3b7d, 0xfb418024, 0x65a56185, 0xf67a02bd, 0x696f11dd, 0x00002022, 0x8cd6be91, 0x695189b6, 18 | 0xee9453b5, 0xd8fc5ea9, 0xab86bf19, 0x0000c6b5, 0x26731017, 0xed2d101b, 0x3b41ed84, 0x13e62212, 19 | 0xa315f5eb, 0x8816c34e, 0xe9395b9c, 0x002ce920, 0x4283db1d, 0xd77d461a, 0xe2ec17e4, 0xb8e0be40, 20 | 0xdea160c4, 0x7eec86c8, 0x2119ad12, 0xa6ccf46b, 0x2c52cede, 0x2db48711, 0x0000f0d6, 0x3dd5d8c9, 21 | 0x8a1872a2, 0xf282a4c4, 0x8020ec2c, 0x6693b6e0, 0x0000ce19, 0x20cb5735, 0x762ebf37, 0x207bfe82, 22 | 0xd77dc112, 0x9ba78342, 0x217dc513, 
0xb27b1a29, 0x00d5cd98, 0x71e39b80, 0x7e572af0, 0xa2734f2f, 23 | 0xbf82c6b5, 0x5c3beac6, 0xcdc893bb, 0x6d108561, 0x77f8ae30, 0x917c6b81, 0x5b75b699, 0x0000cf6a, 24 | 0xf3c40afa, 0x2063127a, 0x621de622, 0xd188ac1d, 0x107036e2, 0x0000b85f, 0xf2ef4e4c, 0xd9d6de66, 25 | 0xa1fc7955, 0xeb85fd03, 0xbe27502f, 0xe3034251, 0x441364d3, 0x0082b36c, 0xb1459103, 0x021c069c, 26 | 0x2910dfc7, 0x735b353e, 0xce44312c, 0xbc942e45, 0xf05086a7, 0xfec3b215, 0x00ae1055, 0xf54b4084, 27 | 0x00007fd9, 0xbfbd9ef3, 0xa804302f, 0x39ce4957, 0xffb9e2a4, 0x55b9ad1d, 0x00008acb, 0x48e2bfc8, 28 | 0x8be39841, 0x0e271216, 0xd51096e8, 0x1101ba17, 0xc22e770f, 0x1689eff2, 0x00a92a19, 0xbc765990, 29 | 0xc61441e3, 0x07e13a2c, 0x92cbe984, 0x8f4ff572, 0x0b9670c0, 0x62955a58, 0x645f83e5, 0x41fce516, 30 | 0xbbda9748, 0x0000aab2, 0x19761b06, 0x8b8f5e83, 0x3e5d1cfd, 0xec5c1e2c, 0xfaf7e0fe, 0x000000d3, 31 | 0xda3f9017, 0x70ff906d, 0x0527d5a7, 0x22d8e773, 0xc9ab70df, 0xeda4c6dc, 0xecef1f41, 0x0024c2b2, 32 | 0x06740d95, 0x1d7a299b, 0xb3c37cb2, 0xc986e3c7, 0x9fabea36, 0x6da214c5, 0x17a43ed8, 0x6eccec51, 33 | 0xf9cab309, 0x4a5e60c5, 0x00006967, 0x9da51d12, 0x84321e13, 0xfb3d6fb6, 0x60305eed, 0xcbbf4b14, 34 | 0x00004f63, 0x07d5b781, 0xe5a53672, 0x57afb234, 0x18f346f7, 0x636dc655, 0xcc8bab49, 0x63c7a906, 35 | ]; 36 | -------------------------------------------------------------------------------- /src/gear_table/gear64.rs: -------------------------------------------------------------------------------- 1 | /// An abitrary table originally produced for rsroll that is included here for convenience. 
2 | pub static GEAR_64: [u64; 256] = [ 3 | 0xb088d3a9e840f559, 4 | 0x5652c7f739ed20d6, 5 | 0x45b28969898972ab, 6 | 0x6b0a89d5b68ec777, 7 | 0x368f573e8b7a31b7, 8 | 0x1dc636dce936d94b, 9 | 0x207a4c4e5554d5b6, 10 | 0xa474b34628239acb, 11 | 0x3b06a83e1ca3b912, 12 | 0x90e78d6c2f02baf7, 13 | 0xe1c92df7150d9a8a, 14 | 0x8e95053a1086d3ad, 15 | 0x5a2ef4f1b83a0722, 16 | 0xa50fac949f807fae, 17 | 0x0e7303eb80d8d681, 18 | 0x99b07edc1570ad0f, 19 | 0x689d2fb555fd3076, 20 | 0x00005082119ea468, 21 | 0xc4b08306a88fcc28, 22 | 0x3eb0678af6374afd, 23 | 0xf19f87ab86ad7436, 24 | 0xf2129fbfbe6bc736, 25 | 0x481149575c98a4ed, 26 | 0x0000010695477bc5, 27 | 0x1fba37801a9ceacc, 28 | 0x3bf06fd663a49b6d, 29 | 0x99687e9782e3874b, 30 | 0x79a10673aa50d8e3, 31 | 0xe4accf9e6211f420, 32 | 0x2520e71f87579071, 33 | 0x2bd5d3fd781a8a9b, 34 | 0x00de4dcddd11c873, 35 | 0xeaa9311c5a87392f, 36 | 0xdb748eb617bc40ff, 37 | 0xaf579a8df620bf6f, 38 | 0x86a6e5da1b09c2b1, 39 | 0xcc2fc30ac322a12e, 40 | 0x355e2afec1f74267, 41 | 0x2d99c8f4c021a47b, 42 | 0xbade4b4a9404cfc3, 43 | 0xf7b518721d707d69, 44 | 0x3286b6587bf32c20, 45 | 0x0000b68886af270c, 46 | 0xa115d6e4db8a9079, 47 | 0x484f7e9c97b2e199, 48 | 0xccca7bb75713e301, 49 | 0xbf2584a62bb0f160, 50 | 0xade7e813625dbcc8, 51 | 0x000070940d87955a, 52 | 0x8ae69108139e626f, 53 | 0xbd776ad72fde38a2, 54 | 0xfb6b001fc2fcc0cf, 55 | 0xc7a474b8e67bc427, 56 | 0xbaf6f11610eb5d58, 57 | 0x09cb1f5b6de770d1, 58 | 0xb0b219e6977d4c47, 59 | 0x00ccbc386ea7ad4a, 60 | 0xcc849d0adf973f01, 61 | 0x73a3ef7d016af770, 62 | 0xc807d2d386bdbdfe, 63 | 0x7f2ac9966c791730, 64 | 0xd037a86bc6c504da, 65 | 0xf3f17c661eaa609d, 66 | 0xaca626b04daae687, 67 | 0x755a99374f4a5b07, 68 | 0x90837ee65b2caede, 69 | 0x6ee8ad93fd560785, 70 | 0x0000d9e11053edd8, 71 | 0x9e063bb2d21cdbd7, 72 | 0x07ab77f12a01d2b2, 73 | 0xec550255e6641b44, 74 | 0x78fb94a8449c14c6, 75 | 0xc7510e1bc6c0f5f5, 76 | 0x0000320b36e4cae3, 77 | 0x827c33262c8b1a2d, 78 | 0x14675f0b48ea4144, 79 | 0x267bd3a6498deceb, 80 | 0xf1916ff982f5035e, 81 | 
0x86221b7ff434fb88, 82 | 0x9dbecee7386f49d8, 83 | 0xea58f8cac80f8f4a, 84 | 0x008d198692fc64d8, 85 | 0x6d38704fbabf9a36, 86 | 0xe032cb07d1e7be4c, 87 | 0x228d21f6ad450890, 88 | 0x635cb1bfc02589a5, 89 | 0x4620a1739ca2ce71, 90 | 0xa7e7dfe3aae5fb58, 91 | 0x0c10ca932b3c0deb, 92 | 0x2727fee884afed7b, 93 | 0xa2df1c6df9e2ab1f, 94 | 0x4dcdd1ac0774f523, 95 | 0x000070ffad33e24e, 96 | 0xa2ace87bc5977816, 97 | 0x9892275ab4286049, 98 | 0xc2861181ddf18959, 99 | 0xbb9972a042483e19, 100 | 0xef70cd3766513078, 101 | 0x00000513abfc9864, 102 | 0xc058b61858c94083, 103 | 0x09e850859725e0de, 104 | 0x9197fb3bf83e7d94, 105 | 0x7e1e626d12b64bce, 106 | 0x520c54507f7b57d1, 107 | 0xbee1797174e22416, 108 | 0x6fd9ac3222e95587, 109 | 0x0023957c9adfbf3e, 110 | 0xa01c7d7e234bbe15, 111 | 0xaba2c758b8a38cbb, 112 | 0x0d1fa0ceec3e2b30, 113 | 0x0bb6a58b7e60b991, 114 | 0x4333dd5b9fa26635, 115 | 0xc2fd3b7d4001c1a3, 116 | 0xfb41802454731127, 117 | 0x65a56185a50d18cb, 118 | 0xf67a02bd8784b54f, 119 | 0x696f11dd67e65063, 120 | 0x00002022fca814ab, 121 | 0x8cd6be912db9d852, 122 | 0x695189b6e9ae8a57, 123 | 0xee9453b50ada0c28, 124 | 0xd8fc5ea91a78845e, 125 | 0xab86bf191a4aa767, 126 | 0x0000c6b5c86415e5, 127 | 0x267310178e08a22e, 128 | 0xed2d101b078bca25, 129 | 0x3b41ed84b226a8fb, 130 | 0x13e622120f28dc06, 131 | 0xa315f5ebfb706d26, 132 | 0x8816c34e3301bace, 133 | 0xe9395b9cbb71fdae, 134 | 0x002ce9202e721648, 135 | 0x4283db1d2bb3c91c, 136 | 0xd77d461ad2b1a6a5, 137 | 0xe2ec17e46eeb866b, 138 | 0xb8e0be4039fbc47c, 139 | 0xdea160c4d5299d04, 140 | 0x7eec86c8d28c3634, 141 | 0x2119ad129f98a399, 142 | 0xa6ccf46b61a283ef, 143 | 0x2c52cedef658c617, 144 | 0x2db4871169acdd83, 145 | 0x0000f0d6f39ecbe9, 146 | 0x3dd5d8c98d2f9489, 147 | 0x8a1872a22b01f584, 148 | 0xf282a4c40e7b3cf2, 149 | 0x8020ec2ccb1ba196, 150 | 0x6693b6e09e59e313, 151 | 0x0000ce19cc7c83eb, 152 | 0x20cb5735f6479c3b, 153 | 0x762ebf3759d75a5b, 154 | 0x207bfe823d693975, 155 | 0xd77dc112339cd9d5, 156 | 0x9ba7834284627d03, 157 | 0x217dc513e95f51e9, 158 | 
0xb27b1a29fc5e7816, 159 | 0x00d5cd9831bb662d, 160 | 0x71e39b806d75734c, 161 | 0x7e572af006fb1a23, 162 | 0xa2734f2f6ae91f85, 163 | 0xbf82c6b5022cddf2, 164 | 0x5c3beac60761a0de, 165 | 0xcdc893bb47416998, 166 | 0x6d1085615c187e01, 167 | 0x77f8ae30ac277c5d, 168 | 0x917c6b81122a2c91, 169 | 0x5b75b699add16967, 170 | 0x0000cf6ae79a069b, 171 | 0xf3c40afa60de1104, 172 | 0x2063127aa59167c3, 173 | 0x621de62269d1894d, 174 | 0xd188ac1de62b4726, 175 | 0x107036e2154b673c, 176 | 0x0000b85f28553a1d, 177 | 0xf2ef4e4c18236f3d, 178 | 0xd9d6de6611b9f602, 179 | 0xa1fc7955fb47911c, 180 | 0xeb85fd032f298dbd, 181 | 0xbe27502fb3befae1, 182 | 0xe3034251c4cd661e, 183 | 0x441364d354071836, 184 | 0x0082b36c75f2983e, 185 | 0xb145910316fa66f0, 186 | 0x021c069c9847caf7, 187 | 0x2910dfc75a4b5221, 188 | 0x735b353e1c57a8b5, 189 | 0xce44312ce98ed96c, 190 | 0xbc942e4506bdfa65, 191 | 0xf05086a71257941b, 192 | 0xfec3b215d351cead, 193 | 0x00ae1055e0144202, 194 | 0xf54b40846f42e454, 195 | 0x00007fd9c8bcbcc8, 196 | 0xbfbd9ef317de9bfe, 197 | 0xa804302ff2854e12, 198 | 0x39ce4957a5e5d8d4, 199 | 0xffb9e2a45637ba84, 200 | 0x55b9ad1d9ea0818b, 201 | 0x00008acbf319178a, 202 | 0x48e2bfc8d0fbfb38, 203 | 0x8be39841e848b5e8, 204 | 0x0e2712160696a08b, 205 | 0xd51096e84b44242a, 206 | 0x1101ba176792e13a, 207 | 0xc22e770f4531689d, 208 | 0x1689eff272bbc56c, 209 | 0x00a92a197f5650ec, 210 | 0xbc765990bda1784e, 211 | 0xc61441e392fcb8ae, 212 | 0x07e13a2ced31e4a0, 213 | 0x92cbe984234e9d4d, 214 | 0x8f4ff572bb7d8ac5, 215 | 0x0b9670c00b963bd0, 216 | 0x62955a581a03eb01, 217 | 0x645f83e5ea000254, 218 | 0x41fce516cd88f299, 219 | 0xbbda9748da7a98cf, 220 | 0x0000aab2fe4845fa, 221 | 0x19761b069bf56555, 222 | 0x8b8f5e8343b6ad56, 223 | 0x3e5d1cfd144821d9, 224 | 0xec5c1e2ca2b0cd8f, 225 | 0xfaf7e0fea7fbb57f, 226 | 0x000000d3ba12961b, 227 | 0xda3f90178401b18e, 228 | 0x70ff906de33a5feb, 229 | 0x0527d5a7c06970e7, 230 | 0x22d8e773607c13e9, 231 | 0xc9ab70df643c3bac, 232 | 0xeda4c6dc8abe12e3, 233 | 0xecef1f410033e78a, 234 | 0x0024c2b274ac72cb, 235 
| 0x06740d954fa900b4, 236 | 0x1d7a299b323d6304, 237 | 0xb3c37cb298cbead5, 238 | 0xc986e3c76178739b, 239 | 0x9fabea364b46f58a, 240 | 0x6da214c5af85cc56, 241 | 0x17a43ed8b7a38f84, 242 | 0x6eccec511d9adbeb, 243 | 0xf9cab30913335afb, 244 | 0x4a5e60c5f415eed2, 245 | 0x00006967503672b4, 246 | 0x9da51d121454bb87, 247 | 0x84321e13b9bbc816, 248 | 0xfb3d6fb6ab2fdd8d, 249 | 0x60305eed8e160a8d, 250 | 0xcbbf4b14e9946ce8, 251 | 0x00004f63381b10c3, 252 | 0x07d5b7816fcc4e10, 253 | 0xe5a536726a6a8155, 254 | 0x57afb23447a07fdd, 255 | 0x18f346f7abc9d394, 256 | 0x636dc655d61ad33d, 257 | 0xcc8bab4939f7f3f6, 258 | 0x63c7a906c1dd187b, 259 | ]; 260 | -------------------------------------------------------------------------------- /src/gear_table/mod.rs: -------------------------------------------------------------------------------- 1 | mod gear32; 2 | mod gear64; 3 | 4 | pub use self::gear32::GEAR_32; 5 | pub use self::gear64::GEAR_64; 6 | -------------------------------------------------------------------------------- /src/gzip.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "gzip")] 2 | 3 | //! gzip (forks) rsyncable mode that uses a simple window accumulator 4 | //! 5 | //! **WARNING: not validated against gzip or rsyncrypto algorithms. Output may change to fix bugs** 6 | //! 7 | //! A widely distributed patch for gzip adds a `--rsyncable` flag which causes `gzip` to split it's 8 | //! input on "stable" boundaries. This module impliments the algorithm used in that patch. 9 | //! 10 | //! The same algorithm is used by the `rsyncrypto` project. 11 | //! 12 | //! - No maximum block size is provided. 13 | //! - No minimum block size is provided. 14 | //! 15 | //! PDF of block sizes: ??? 16 | //! 17 | //! Note that the defacto-standard parameters allow a slightly more efficient check for a block 18 | //! split (by replacing a modulus with a bitwise-and). This impl currently doesn't allow that 19 | //! 
optimization even if you provide appropriate parameters (we need type-level integers for that). 20 | //! 21 | //! Parameters: 22 | //! 23 | //! - window-len: The maximum number of bytes to be examined when deciding to split a block. 24 | //! set to 8192 by default in gzip-rsyncable & rsyncrypto) 25 | //! - modulus: set to half of window-len (so, 4096) in gzip-rsyncable & rsyncrypto. 26 | //! 27 | //! In-block state: 28 | //! - window of window-len bytes (use of the iterator interface means we also track more bytes than 29 | //! this) 30 | //! - sum (u64) 31 | //! 32 | //! Between-block state: 33 | //! 34 | //! - none 35 | //! 36 | //! References: 37 | //! 38 | //! - http://rsyncrypto.lingnu.com/index.php/Algorithm 39 | //! - https://www.samba.org/~tridge/phd_thesis.pdf 40 | //! 41 | //! S(n) = sum(c_i, var=i, top=n, bottom=n-8196) 42 | //! 43 | //! A(n) = S(n) / 8192 44 | //! 45 | //! H(n) = S(n) mod 4096 46 | //! 47 | //! Trigger splits when H(n) == 0 48 | 49 | use crate::{Chunk, ChunkIncr, ToChunkIncr}; 50 | use std::collections::VecDeque; 51 | use std::num::Wrapping; 52 | 53 | /// Parameters for defining the gzip rsyncable algorithm 54 | #[derive(Clone, Debug, PartialEq, Eq)] 55 | pub struct GzipRsyncable { 56 | /* 57 | * TODO: if we can avoid loading entire files into memory, this could be u64 58 | */ 59 | window_len: usize, 60 | modulus: u64, 61 | } 62 | 63 | impl GzipRsyncable { 64 | pub fn with_window_and_modulus(window: usize, modulus: u64) -> GzipRsyncable { 65 | Self { 66 | window_len: window, 67 | modulus, 68 | } 69 | } 70 | } 71 | 72 | impl Default for GzipRsyncable { 73 | fn default() -> Self { 74 | Self::with_window_and_modulus(8192, 4096) 75 | } 76 | } 77 | 78 | impl Chunk for GzipRsyncable { 79 | type SearchState = GzipRsyncableSearchState; 80 | 81 | fn to_search_state(&self) -> Self::SearchState { 82 | Self::SearchState::default() 83 | } 84 | 85 | fn find_chunk_edge( 86 | &self, 87 | state: &mut Self::SearchState, 88 | data: &[u8], 89 | ) -> 
(Option, usize) { 90 | for i in state.offset..data.len() { 91 | let v = data[i]; 92 | 93 | if state.state.add(data, self, i, v) { 94 | state.reset(); 95 | return (Some(i + 1), i + 1); 96 | } 97 | } 98 | 99 | // keep k elements = discard all but k 100 | let discard_ct = data.len().saturating_sub(self.window_len); 101 | state.offset = data.len() - discard_ct; 102 | (None, discard_ct) 103 | } 104 | } 105 | 106 | impl From<&GzipRsyncable> for GzipRsyncableIncr { 107 | fn from(src: &GzipRsyncable) -> Self { 108 | src.clone().into() 109 | } 110 | } 111 | 112 | impl ToChunkIncr for GzipRsyncable { 113 | type Incr = GzipRsyncableIncr; 114 | fn to_chunk_incr(&self) -> Self::Incr { 115 | self.into() 116 | } 117 | } 118 | 119 | #[derive(Debug, Default, Clone)] 120 | struct GzipRsyncableState { 121 | accum: Wrapping, 122 | } 123 | 124 | impl GzipRsyncableState { 125 | fn reset(&mut self) { 126 | self.accum.0 = 0; 127 | } 128 | } 129 | 130 | /// Intermediate state for [`GzipRsyncable::find_chunk_edge`] 131 | /// 132 | /// Using this avoids re-computation of data when no edge is found 133 | #[derive(Debug, Default, Clone)] 134 | pub struct GzipRsyncableSearchState { 135 | offset: usize, 136 | state: GzipRsyncableState, 137 | } 138 | 139 | impl GzipRsyncableSearchState { 140 | fn reset(&mut self) { 141 | self.offset = 0; 142 | self.state.reset(); 143 | } 144 | } 145 | 146 | /// Provides an incremental interface to [`GzipRsyncable`] 147 | /// 148 | /// Performance Note: [`GzipRsyncable`] requires look-back. As a result, [`GzipRsyncableIncr`] internally 149 | /// buffers data up to the window size. This additional copying may affect performance. If 150 | /// possible for your use case, use the non-incremental interface. 
151 | /// 152 | /// See [`GzipRsyncable`] for details on the underlying algorithm 153 | #[derive(Debug, Clone)] 154 | pub struct GzipRsyncableIncr { 155 | params: GzipRsyncable, 156 | 157 | accum: Wrapping, 158 | // really poor efficiency 159 | window: VecDeque, 160 | } 161 | 162 | impl GzipRsyncableIncr { 163 | fn reset(&mut self) { 164 | self.window.clear(); 165 | self.accum = Wrapping(0); 166 | } 167 | } 168 | 169 | impl From for GzipRsyncableIncr { 170 | fn from(params: GzipRsyncable) -> Self { 171 | let window = VecDeque::with_capacity(params.window_len); 172 | GzipRsyncableIncr { 173 | params, 174 | accum: Wrapping(0), 175 | window, 176 | } 177 | } 178 | } 179 | 180 | impl GzipRsyncableState { 181 | fn add(&mut self, data: &[u8], parent: &GzipRsyncable, i: usize, v: u8) -> bool { 182 | if i >= parent.window_len { 183 | self.accum -= Wrapping(data[i - parent.window_len] as u64); 184 | } 185 | self.accum += Wrapping(v as u64); 186 | (self.accum % Wrapping(parent.modulus)).0 == 0 187 | } 188 | } 189 | 190 | impl ChunkIncr for GzipRsyncableIncr { 191 | fn push(&mut self, data: &[u8]) -> Option { 192 | for (i, &v) in data.iter().enumerate() { 193 | if self.window.len() >= self.params.window_len { 194 | self.accum -= Wrapping(self.window.pop_front().unwrap() as u64); 195 | } 196 | 197 | self.accum += Wrapping(v as u64); 198 | self.window.push_back(v); 199 | 200 | if (self.accum % Wrapping(self.params.modulus)).0 == 0 { 201 | self.reset(); 202 | return Some(i + 1); 203 | } 204 | } 205 | 206 | None 207 | } 208 | } 209 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! hash-roll provides various content defined chunking algorithms 2 | //! 3 | //! Content defined chunking (CDC) algorithms are algorithms that examine a stream of input bytes (often 4 | //! 
represented as a slice like `[u8]`, and provide locations within that input stream to split or 5 | //! chunk the stream into parts. 6 | //! 7 | //! CDC algorithms generally try to optimize for the following: 8 | //! 9 | //! 1. Processing speed (ie: bytes/second) 10 | //! 2. Stability in split locations even when insertions/deletions of bytes occur 11 | //! 3. Reasonable distributions of chunk lengths 12 | //! 13 | //! ## API Concepts 14 | //! 15 | //! - Configured Algorithm Instance (impliments [`Chunk`]). Named plainly using the algorithm name 16 | //! (like [`Bup`]). These can be thought of as "parameters" for an algorithm. 17 | //! - Incrimental (impliments [`ChunkIncr`]). Normally named with `Incr` suffix. These are created 18 | //! using [`ToChunkIncr`] for a configured algorithm instance. 19 | //! 20 | //! Because of the various ways one might use a CDC, and the different CDC algorithm 21 | //! characteristics, hash-roll provides a few ways to use them. 22 | //! 23 | //! Configured Algorithm Instances are created from the set of configuration needed for a given 24 | //! algorithm. For example, this might mean configuring a window size or how to decide where to 25 | //! split. These don't include any mutable data, in other words: they don't keep track of what data 26 | //! is given to them. Configured Algorithm Instances provide the all-at-once APIs, as well as 27 | //! methods to obtain other kinds of APIs, like incrimental style apis. 28 | //! 29 | //! ```rust 30 | //! use hash_roll::ToChunkIncr; 31 | //! let algorithm_instance = hash_roll::mii::Mii::default(); 32 | //! let _incrimental_comp = algorithm_instance.to_chunk_incr(); 33 | //! ``` 34 | //! 35 | //! ## CDC Algorithms and Window Buffering 36 | //! 37 | //! Different CDC algorithms have different constraints about how they process data. Notably, some 38 | //! require a large amount of previous processed data to process additional data. This "large 39 | //! 
amount of previously processed data" is typically referred to as the "window". That said, note 40 | //! that some CDC algorithms that use a window concept don't need previously accessed data. 41 | //! 42 | //! For the window-buffering algorithms, their is an extra cost to certain types of API 43 | //! implimentations. The documentation will note when these occur and suggest alternatives. 44 | //! 45 | //! Generally, CDC interfaces that are incrimental will be slower for window-buffering algorithms. 46 | //! Using an explicitly allocating interface (which emits `Vec` or `Vec>`) will have no 47 | //! worse performance that the incrimental API, but might be more convenient. Using an all-at-once 48 | //! API will provide the best performance due to not requiring any buffering (the input data can be 49 | //! used directly). 50 | //! 51 | //! ## Use Cases that drive API choices 52 | //! 53 | //! - accumulate vecs, emits vecs 54 | //! - incrimental: yes 55 | //! - input: `Vec` 56 | //! - internal state: `Vec>` 57 | //! - output: `Vec>` 58 | //! 59 | //! - stream data through 60 | //! - incrimenal: yes 61 | //! - input: `&[u8]` 62 | //! 63 | //! - mmap (or read entire) file, emit 64 | //! - incrimenal: no 65 | //! - input: `&[u8]` 66 | //! - output: `&[u8]` 67 | 68 | // # API Design Notes 69 | // 70 | // ## What methods should be in a trait? What should be in wrapper structs? 71 | // 72 | // - place methods that might have more optimized variants, but can have common implimentations, 73 | // in a trait. This notably affects window-buffering differences: it's always possible to 74 | // impliment all-at-once processing using incrimental interfaces that internally buffer, but 75 | // it's much more efficient for window-buffering algorithms to provide implimentations that know 76 | // how to look into the input data directly. 77 | 78 | #![warn(rust_2018_idioms, missing_debug_implementations)] 79 | /* TODO: Rabin-Karp 80 | * H = c_1 * a ** (k-1) + c_2 * a ** (k-2) ... 
+ c_k * a ** 0 81 | * where: 82 | * a is a constant 83 | * c_1, ..., c_k are the input characters 84 | * 85 | * All math is done modulo n. Choice of n & a critical 86 | * 87 | * Parameters: 88 | * - n: mululo limit 89 | * - a: a constant 90 | * 91 | * State: 92 | * H 93 | * 94 | * Application: 95 | */ 96 | 97 | /* TODO: 98 | * rollsum of librsync 99 | */ 100 | 101 | use std::mem; 102 | 103 | pub mod bup; 104 | pub mod buzhash; 105 | pub mod buzhash_table; 106 | pub mod fastcdc; 107 | pub mod gear; 108 | pub mod gear_table; 109 | pub mod gzip; 110 | pub mod mii; 111 | pub mod pigz; 112 | pub mod ram; 113 | pub mod range; 114 | pub mod zpaq; 115 | pub mod zstd; 116 | 117 | pub(crate) use range::RangeExt; 118 | 119 | /// Accept incrimental input and provide indexes of split points 120 | /// 121 | /// Compared to [`Chunk`], [`ChunkIncr`] allows avoiding having to buffer all input data in memory, 122 | /// and avoids the need to use a single buffer for storing the input data (even if all data is in 123 | /// memory). 124 | /// 125 | /// Data fed into a given [`ChunkIncr`] instance is considered to be part of the same 126 | /// data "source". This affects chunking algorithms that maintain some state between chunks 127 | /// (like `ZstdRsyncable` does). If you have multiple "sources", one should obtain new instances of 128 | /// [`ChunkIncr`] for each of them (typically via [`ToChunkIncr`]). 129 | /// 130 | /// Note that for some splitting/chunking algorithms, the incrimental api will be less efficient 131 | /// compared to the non-incrimental API. 
In particular, algorithms like [`Rsyncable`] that require 132 | /// the use of previously examined data to shift their "window" (resulting in needing a circular 133 | /// buffer which all inputed data passes through) will perform more poorly using [`ChunkIncr`] 134 | /// compared with non-incrimental interfaces 135 | pub trait ChunkIncr { 136 | /// The data "contained" within a implimentor of this trait is the history of all data slices 137 | /// passed to feed. 138 | /// 139 | /// In other words, all previous data (or no previous data) may be used in determining the 140 | /// point to split. 141 | /// 142 | /// Returns None if the data has no split point. 143 | /// Otherwise, returns an index in the most recently passed `data`. 144 | /// 145 | /// Note that returning the index in the current slice makes most "look-ahead" splitting 146 | /// impossible (as it is permissible to pass 1 byte at a time). 147 | fn push(&mut self, data: &[u8]) -> Option; 148 | 149 | /// Given a [`ChunkIncr`] and a single slice, return a list of slices chunked by the chunker. 150 | /// 151 | /// Will always return enough slices to form the entire content of `data`, even if the trailing 152 | /// part of data is not a chunk (ie: does not end on a chunk boundary) 153 | fn iter_slices(self, data: &[u8]) -> IterSlices<'_, Self> 154 | where 155 | Self: std::marker::Sized, 156 | { 157 | IterSlices { 158 | rem: data, 159 | chunker: self, 160 | } 161 | } 162 | 163 | /// Given a [`ChunkIncr`] and a single slice, return a list of slices chunked by the chunker. 164 | /// Does not return the remainder (if any) in the iteration. Use [`IterSlices::take_rem()`] or 165 | /// [`IterSlices::into_parts()`] to get the remainder. 166 | /// 167 | /// Note that this is a non-incrimental interface. 
Calling this on an already fed chunker or using 168 | /// this multiple times on the same chunker may provide unexpected results 169 | fn iter_slices_strict(self, data: &[u8]) -> IterSlicesStrict<'_, Self> 170 | where 171 | Self: std::marker::Sized, 172 | { 173 | IterSlicesStrict { 174 | rem: data, 175 | chunker: self, 176 | } 177 | } 178 | } 179 | 180 | /// Returned by [`ChunkIncr::iter_slices_strict()`] 181 | /// 182 | /// Always emits _complete_ slices durring iteration. 183 | #[derive(Debug)] 184 | pub struct IterSlicesStrict<'a, C: ChunkIncr> { 185 | rem: &'a [u8], 186 | chunker: C, 187 | } 188 | 189 | impl<'a, C: ChunkIncr> IterSlicesStrict<'a, C> { 190 | /// Take the remainder from this iterator. Leaves an empty slice in it's place. 191 | pub fn take_rem(&mut self) -> &'a [u8] { 192 | let mut l: &[u8] = &[]; 193 | mem::swap(&mut self.rem, &mut l); 194 | l 195 | } 196 | 197 | /// Obtain the internals 198 | /// 199 | /// Useful, for example, after iteration stops to obtain the remaining slice. 200 | pub fn into_parts(self) -> (C, &'a [u8]) { 201 | (self.chunker, self.rem) 202 | } 203 | } 204 | 205 | impl<'a, C: ChunkIncr> Iterator for IterSlicesStrict<'a, C> { 206 | type Item = &'a [u8]; 207 | 208 | fn next(&mut self) -> Option { 209 | match self.chunker.push(self.rem) { 210 | None => None, 211 | Some(l) => { 212 | let (v, rn) = self.rem.split_at(l); 213 | self.rem = rn; 214 | Some(v) 215 | } 216 | } 217 | } 218 | } 219 | 220 | /// Returned by [`ChunkIncr::iter_slices()`] 221 | /// 222 | /// When it runs out of data, it returns the remainder as the last element of the iteration 223 | #[derive(Debug)] 224 | pub struct IterSlices<'a, C: ChunkIncr> { 225 | rem: &'a [u8], 226 | chunker: C, 227 | } 228 | 229 | impl<'a, C: ChunkIncr> IterSlices<'a, C> { 230 | /// Obtain the internals 231 | /// 232 | /// Useful, for example, after iteration stops to obtain the remaining slice. 
233 | pub fn into_parts(self) -> (C, &'a [u8]) { 234 | (self.chunker, self.rem) 235 | } 236 | } 237 | 238 | impl<'a, C: ChunkIncr> Iterator for IterSlices<'a, C> { 239 | type Item = &'a [u8]; 240 | 241 | fn next(&mut self) -> Option { 242 | if self.rem.is_empty() { 243 | return None; 244 | } 245 | 246 | match self.chunker.push(self.rem) { 247 | None => { 248 | let v = self.rem; 249 | self.rem = &[]; 250 | Some(v) 251 | } 252 | Some(l) => { 253 | let (v, rn) = self.rem.split_at(l); 254 | self.rem = rn; 255 | Some(v) 256 | } 257 | } 258 | } 259 | } 260 | 261 | /// Impl on algorthms that define methods of chunking data 262 | /// 263 | /// This is the lowest level (but somewhat restrictive) trait for chunking algorthms. It assumes 264 | /// that the input is provided to it in a contiguous slice. If you don't have your input as a 265 | /// contiguous slice, [`ChunkIncr`] may be a better choice (it allows non-contiguous input, but may 266 | /// be slowing for some chunking algorthms). 267 | pub trait Chunk { 268 | /// `SearchState` allows searching for the chunk edge to resume without duplicating work 269 | /// already done. 270 | type SearchState; 271 | 272 | /* 273 | /// Amount of data from already emitted chunks requried for determining future chunks 274 | /// 275 | /// Indicates the amount of data that _must_ be preserved for [`find_chunk_edge()`]'s 276 | /// `prev_data` argument. If more that this is passed, the last bytes in the slice are used. At 277 | /// the start of an input (where there is no previous data), an empty slice would be used. 278 | /// 279 | /// For most chunking algorithms, this is `0` (zero), indicating that `prev_data` may always be 280 | /// an empty slice. 281 | const CARRY_LEN: usize; 282 | */ 283 | 284 | /// Provide an initial [`SearchState`] for use with [`find_chunk_edge()`]. Generally, for each 285 | /// input one should generate a new [`SearchState`]. 
286 | fn to_search_state(&self) -> Self::SearchState; 287 | 288 | /// Find the next "chunk" in `data` to emit 289 | /// 290 | /// The return value is a pair of a range representing the start and end of the chunk being 291 | /// emitted, and the offset from which subsequent `data` subsets should be passed to the next 292 | /// call to `find_chunk_edge`. 293 | /// 294 | /// `state` is mutated so that it does not rexamine previously examined data, even when a chunk 295 | /// is not emitted. 296 | /// 297 | /// `data` may be extended with additional data between calls to `find_chunk_edge()`. The bytes 298 | /// that were _previously_ in `data` and are not indicated by `discard_ct` must be preserved in 299 | /// the next `data` buffer called. 300 | /// 301 | /// ```rust 302 | /// use hash_roll::Chunk; 303 | /// 304 | /// fn some_chunk() -> impl Chunk { 305 | /// hash_roll::mii::Mii::default() 306 | /// } 307 | /// 308 | /// let chunk = some_chunk(); 309 | /// let orig_data = b"hello"; 310 | /// let mut data = &orig_data[..]; 311 | /// let mut ss = chunk.to_search_state(); 312 | /// let mut prev_cut = 0; 313 | /// 314 | /// loop { 315 | /// let (chunk, discard_ct) = chunk.find_chunk_edge(&mut ss, data); 316 | /// 317 | /// match chunk { 318 | /// Some(cut_point) => { 319 | /// // map `cut_point` from the current slice back into the original slice so we can 320 | /// // have consistent indexes 321 | /// let g_cut = cut_point + orig_data.len() - data.len(); 322 | /// println!("chunk: {:?}", &orig_data[prev_cut..g_cut]); 323 | /// prev_cut = g_cut; 324 | /// }, 325 | /// None => { 326 | /// println!("no chunk, done with data we have"); 327 | /// println!("remain: {:?}", &data[discard_ct..]); 328 | /// break; 329 | /// } 330 | /// } 331 | /// 332 | /// data = &data[discard_ct..]; 333 | /// } 334 | /// ``` 335 | /// 336 | /// Note: call additional times on the same `SearchState` and the required `data` to obtain 337 | /// subsequent chunks in the same input data. 
To handle a seperate input, use a new 338 | /// `SearchState`. 339 | /// 340 | /// Note: calling with a previous `state` with a new `data` that isn't an extention of the 341 | /// previous `data` will result in split points that may not follow the design of the 342 | /// underlying algorithm. Avoid relying on consistent cut points to reason about memory safety. 343 | /// 344 | // NOTE: the reason that we preserve `state` even when chunks are emitted is that some 345 | // algorthims require some state to pass between chunks for a given input. zstd includes an 346 | // example of an algorithm that needs this 347 | // 348 | // Potential pitfal: for better performance, keeping the return value small is a very good 349 | // idea. By returning ~2x64+32, we are might be less performant depending on the ABI selected. 350 | // 351 | // Consider if result should return `(&[u8], &[u8])` instead of an index (which would then be 352 | // given to `.split_at()` 353 | // 354 | // Consider if `state` should have a `reset()` method to avoid reallocating 355 | // 356 | // API: 357 | // - `fn find_chunk_edge(&self, state: &mut Self::SearchState, data: &[u8]) -> (Option<(usize, uszie)>, usize); 358 | // - Problem: unclear what indexes of slices represent: start can't be in the data being 359 | // passed because we don't require `data` include the start of the chunk 360 | // - `fn find_chunk_edge(&self, state: &mut Self::SearchState, data: &[u8]) -> (Option, usize); 361 | // - Problem: user code to track indexing match up is somewhat difficult 362 | // - mostly due to needing an extra index to track to handle the "last chunk" location not 363 | // being the "slice we need to pass start" 364 | fn find_chunk_edge(&self, state: &mut Self::SearchState, data: &[u8]) 365 | -> (Option, usize); 366 | } 367 | 368 | /// Implimented on types which can be converted to/can provide a [`ChunkIncr`] interface. 
///
/// Types that implement this generally represent an instantiation of a chunking algorithm.
// NOTE: we use this instead of just having `From<&C: Chunk> for CI: ChunkIncr` because there is
// _one_ `ChunkIncr` for each `Chunk`, and rust can't infer that when using a `From` or `Into`
// bound.
//
// We could consider adding `type Incr` into `trait Chunk`, or only having `type Incr`
pub trait ToChunkIncr {
    /// `Incr` provides the incremental interface to this chunking instance
    type Incr: ChunkIncr;

    /// `to_chunk_incr()` returns a [`ChunkIncr`] which can be incrementally fed data and emits
    /// chunks.
    ///
    /// Generally, this is a typically low cost operation that copies from the implementor or does
    /// minor computation on its fields and may allocate some memory for storing additional state
    /// needed for incremental computation.
    fn to_chunk_incr(&self) -> Self::Incr;
}
-------------------------------------------------------------------------------- /src/mii.rs: --------------------------------------------------------------------------------
#![cfg(feature = "mii")]
use crate::{ChunkIncr, ToChunkIncr};

/// C. Zhang et al., "MII: A Novel Content Defined Chunking Algorithm for Finding Incremental Data
/// in Data Synchronization," in IEEE Access, vol. 7, pp. 86932-86945, 2019, doi:
/// 10.1109/ACCESS.2019.2926195.
7 | /// 8 | /// https://ieeexplore.ieee.org/abstract/document/8752387 9 | #[derive(Debug, Clone)] 10 | pub struct Mii { 11 | w: u64, 12 | } 13 | 14 | impl Mii { 15 | /// Create a new splitter with parameter `w` 16 | /// 17 | /// `w` is the number of "increments" (positive changes in byte value) after which we split the 18 | /// input 19 | /// 20 | // TODO: determine distribution and expected size of chunks 21 | // 22 | // 1: P(curr > prev) = 0 (prev set to 0xff) 23 | // 2: P(curr > prev) = 0.5 (prev and curr assumed to be randomly distributed) 24 | // 3: P(curr > prev) | t2 = ??? 25 | // P(curr > prev) | !t2 = ??? 26 | pub fn with_w(w: u64) -> Self { 27 | Self { w } 28 | } 29 | } 30 | 31 | impl Default for Mii { 32 | /// The window of 5 is used in the paper for the generated graphs 33 | /// 34 | /// It is compared against Rabin with a window of 7 and AE/LMC/RAM with a window of 700 35 | fn default() -> Self { 36 | Mii::with_w(5) 37 | } 38 | } 39 | 40 | impl crate::Chunk for Mii { 41 | type SearchState = MiiSearchState; 42 | 43 | fn to_search_state(&self) -> Self::SearchState { 44 | Into::::into(self).into() 45 | } 46 | 47 | fn find_chunk_edge( 48 | &self, 49 | state: &mut Self::SearchState, 50 | data: &[u8], 51 | ) -> (Option, usize) { 52 | match state.push(data) { 53 | Some(v) => { 54 | state.reset(); 55 | (Some(v), v) 56 | } 57 | None => (None, data.len()), 58 | } 59 | } 60 | } 61 | 62 | impl From<&Mii> for MiiIncr { 63 | fn from(src: &Mii) -> Self { 64 | src.clone().into() 65 | } 66 | } 67 | 68 | impl ToChunkIncr for Mii { 69 | type Incr = MiiIncr; 70 | 71 | fn to_chunk_incr(&self) -> Self::Incr { 72 | self.into() 73 | } 74 | } 75 | 76 | #[derive(Debug)] 77 | pub struct MiiSearchState { 78 | incr: MiiIncr, 79 | } 80 | 81 | impl MiiSearchState { 82 | fn reset(&mut self) { 83 | self.incr.reset(); 84 | } 85 | } 86 | 87 | impl From for MiiSearchState { 88 | fn from(incr: MiiIncr) -> Self { 89 | Self { incr } 90 | } 91 | } 92 | 93 | impl MiiSearchState { 94 | fn 
push(&mut self, data: &[u8]) -> Option { 95 | self.incr.push(data) 96 | } 97 | } 98 | 99 | #[derive(Debug)] 100 | pub struct MiiIncr { 101 | /// After this many increments, split the file 102 | w: u64, 103 | 104 | /// previous examined byte, if any 105 | prev: u8, 106 | 107 | /// number of times a byte was greater than the previous value 108 | increment: u64, 109 | } 110 | 111 | impl From for MiiIncr { 112 | fn from(p: Mii) -> Self { 113 | MiiIncr { 114 | w: p.w, 115 | // we use 0xff to ensure that the first examined byte does not trigger an increment 116 | prev: 0xff, 117 | increment: 0, 118 | } 119 | } 120 | } 121 | 122 | impl ChunkIncr for MiiIncr { 123 | fn push(&mut self, input: &[u8]) -> Option { 124 | for (i, b) in input.iter().cloned().enumerate() { 125 | if b > self.prev { 126 | self.increment += 1; 127 | if self.increment == self.w { 128 | // this is a split 129 | self.increment = 0; 130 | self.prev = 0; 131 | return Some(i + 1); 132 | } 133 | } else { 134 | self.increment = 0; 135 | } 136 | self.prev = b; 137 | } 138 | 139 | None 140 | } 141 | } 142 | 143 | impl MiiIncr { 144 | fn reset(&mut self) { 145 | self.prev = 0xff; 146 | self.increment = 0; 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /src/pigz.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "pigz")] 2 | use crate::{Chunk, ChunkIncr, ToChunkIncr}; 3 | 4 | #[derive(Clone, Debug, PartialEq, Eq)] 5 | pub struct PigzRsyncable { 6 | bits: u8, 7 | 8 | /// directly derived from `bits` 9 | mask: u32, 10 | /// directly derived from `mask` 11 | hit: u32, 12 | } 13 | 14 | impl PigzRsyncable { 15 | pub fn with_bits(bits: u8) -> PigzRsyncable { 16 | let mask = (1 << bits) - 1; 17 | let hit = mask >> 1; 18 | PigzRsyncable { bits, mask, hit } 19 | } 20 | } 21 | 22 | impl Default for PigzRsyncable { 23 | fn default() -> Self { 24 | Self::with_bits(12) 25 | } 26 | } 27 | 28 | impl Chunk for 
PigzRsyncable { 29 | type SearchState = PigzRsyncableSearchState; 30 | 31 | fn to_search_state(&self) -> Self::SearchState { 32 | self.into() 33 | } 34 | 35 | fn find_chunk_edge( 36 | &self, 37 | state: &mut Self::SearchState, 38 | data: &[u8], 39 | ) -> (Option, usize) { 40 | for (i, v) in data.iter().enumerate() { 41 | if state.state.add(self, *v) { 42 | *state = self.to_search_state(); 43 | return (Some(i + 1), i + 1); 44 | } 45 | } 46 | 47 | (None, data.len()) 48 | } 49 | } 50 | 51 | impl From<&PigzRsyncable> for PigzRsyncableIncr { 52 | fn from(src: &PigzRsyncable) -> Self { 53 | src.clone().into() 54 | } 55 | } 56 | 57 | impl ToChunkIncr for PigzRsyncable { 58 | type Incr = PigzRsyncableIncr; 59 | fn to_chunk_incr(&self) -> Self::Incr { 60 | self.into() 61 | } 62 | } 63 | 64 | #[derive(Debug, Clone)] 65 | struct PigzRsyncableState { 66 | hash: u32, 67 | } 68 | 69 | impl From<&PigzRsyncable> for PigzRsyncableState { 70 | fn from(params: &PigzRsyncable) -> Self { 71 | PigzRsyncableState { hash: params.hit } 72 | } 73 | } 74 | 75 | /// Intermediate state for [`PigzRsyncable::find_chunk_edge`] 76 | /// 77 | /// Using this avoids re-computation of data when no edge is found 78 | #[derive(Debug, Clone)] 79 | pub struct PigzRsyncableSearchState { 80 | state: PigzRsyncableState, 81 | } 82 | 83 | impl From<&PigzRsyncable> for PigzRsyncableSearchState { 84 | fn from(params: &PigzRsyncable) -> Self { 85 | PigzRsyncableSearchState { 86 | state: params.into(), 87 | } 88 | } 89 | } 90 | 91 | /// Provides an incremental interface to [`PigzRsyncable`] 92 | /// 93 | /// Performance Note: [`PigzRsyncable`] requires look-back. As a result, [`PigzRsyncableIncr`] internally 94 | /// buffers data up to the window size. This additional copying may affect performance. If 95 | /// possible for your use case, use the non-incremental interface. 
96 | /// 97 | /// See [`PigzRsyncable`] for details on the underlying algorithm 98 | #[derive(Debug, Clone)] 99 | pub struct PigzRsyncableIncr { 100 | params: PigzRsyncable, 101 | state: PigzRsyncableState, 102 | } 103 | 104 | impl PigzRsyncableIncr {} 105 | 106 | impl From for PigzRsyncableIncr { 107 | fn from(params: PigzRsyncable) -> Self { 108 | let state = (¶ms).into(); 109 | PigzRsyncableIncr { params, state } 110 | } 111 | } 112 | 113 | impl PigzRsyncableState { 114 | fn add(&mut self, parent: &PigzRsyncable, v: u8) -> bool { 115 | self.hash = ((self.hash << 1) ^ (v as u32)) & parent.mask; 116 | self.hash == parent.hit 117 | } 118 | } 119 | 120 | impl ChunkIncr for PigzRsyncableIncr { 121 | fn push(&mut self, data: &[u8]) -> Option { 122 | for (i, &v) in data.iter().enumerate() { 123 | if self.state.add(&self.params, v) { 124 | self.state = (&self.params).into(); 125 | return Some(i + 1); 126 | } 127 | } 128 | 129 | None 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /src/ram.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "ram")] 2 | 3 | //! Rapid Asymmetric Maximum (RAM) is a fast chunking algorithm 4 | //! 5 | //! - Has a minimum block size (it's "window" size) 6 | //! - Does not provide an upper bound on block size (though paper discusses a RAML variant that 7 | //! does). 8 | //! 9 | //! doi:10.1016/j.future.2017.02.013 10 | //! 
/// In-progress state for a single RAM chunk search.
#[derive(Default, Debug, PartialEq, Eq, Clone)]
pub struct RamState {
    /// global index (number of processed bytes since split)
    i: u64,

    // Largest byte value seen so far in the current chunk (the "maximum" in RAM's
    // asymmetric-maximum scheme).
    max_val: u8,
}

impl RamState {
    // Scan `data` for a chunk edge using the window size from `params`.
    //
    // Returns `Some(local_index)` — the index *into `data`* of the byte that triggered
    // the split (callers convert this to a chunk length; `Chunk::find_chunk_edge`
    // reports `local_index + 1`). Returns `None` when `data` is exhausted without an
    // edge.
    fn push(&mut self, params: &Ram, data: &[u8]) -> Option<usize> {
        let i = self.i;

        for (l_i, b) in data.iter().cloned().enumerate() {
            // A byte at least as large as every byte seen so far is a split candidate.
            if b >= self.max_val {
                // minimum block size
                let ri = l_i as u64 + i;
                if ri > params.w {
                    // Split: reset per-chunk state; bytes after `l_i` are not examined
                    // and must be resubmitted by the caller.
                    self.i = 0;
                    self.max_val = 0;
                    return Some(l_i);
                }

                self.max_val = b;
            }
        }

        // No edge in this buffer; remember how many bytes have been consumed.
        // NOTE(review): `self.i` only advances on the no-split path; this assumes the
        // caller resubmits the remainder of `data` after a split — confirm against
        // `RamIncr` callers.
        self.i += data.len() as u64;
        None
    }
}
use std::ops::Bound::{self, *};

/// Extra range queries used by the chunking parameter types.
pub trait RangeExt<T> {
    /// Is `other` past this range's end bound?
    fn exceeds_max(&self, other: &T) -> bool
    where
        T: PartialOrd<T>;

    /// Is `item` before this range's start bound?
    fn under_min(&self, item: &T) -> bool
    where
        T: PartialOrd<T>;

    /// Is `item` within the range (neither under the min nor over the max)?
    fn contains(&self, item: &T) -> bool
    where
        T: PartialOrd<T>;

    /// Convert into a concrete `(start, end)` bound pair.
    fn into_tuple(self) -> (std::ops::Bound<T>, std::ops::Bound<T>)
    where
        T: Copy + std::marker::Sized;
}

impl<T, R: std::ops::RangeBounds<T>> RangeExt<T> for R {
    fn exceeds_max(&self, item: &T) -> bool
    where
        T: PartialOrd<T>,
    {
        // Past the end: beyond an inclusive bound, or at/beyond an exclusive one.
        match self.end_bound() {
            Included(i) => item > i,
            Excluded(i) => item >= i,
            Unbounded => false,
        }
    }

    fn under_min(&self, item: &T) -> bool
    where
        T: PartialOrd<T>,
    {
        // Before the start: below an inclusive bound, or at/below an exclusive one.
        match self.start_bound() {
            Included(i) => item < i,
            Excluded(i) => item <= i,
            Unbounded => false,
        }
    }

    fn contains(&self, item: &T) -> bool
    where
        T: PartialOrd<T>,
    {
        // Inside the range means neither below the start nor past the end.
        !self.under_min(item) && !self.exceeds_max(item)
    }

    fn into_tuple(self) -> (std::ops::Bound<T>, std::ops::Bound<T>)
    where
        T: Copy + std::marker::Sized,
    {
        (
            bound_cloned(self.start_bound()),
            bound_cloned(self.end_bound()),
        )
    }
}

/// Map a `Bound<&T>` to a `Bound<T>` by cloning the contents of the bound.
///
/// # Examples
///
/// ```
/// use std::ops::Bound::*;
/// use std::ops::RangeBounds;
/// use hash_roll::range::bound_cloned;
///
/// assert_eq!((1..12).start_bound(), Included(&1));
/// assert_eq!(bound_cloned((1..12).start_bound()), Included(1));
/// ```
pub fn bound_cloned<T: Clone>(src: std::ops::Bound<&T>) -> std::ops::Bound<T> {
    match src {
        Bound::Included(x) => Bound::Included(x.clone()),
        Bound::Excluded(x) => Bound::Excluded(x.clone()),
        Bound::Unbounded => Bound::Unbounded,
    }
}
    /* these are based on the zpaq (not go-dedup) calculations */
    // Derive the `fragment` ("average size as a power of 2") parameter from a
    // block-size range, using the range's maximum — or guessing one from the minimum.
    fn fragment_ave_from_range<T: RangeBounds<u64>>(range: T) -> u8 {
        let v = match range.end_bound() {
            Bound::Included(i) => *i,
            Bound::Excluded(i) => *i - 1,
            Bound::Unbounded => {
                /* try to guess based on first */
                // With no upper bound, assume max = 64 * min — the same max/min ratio
                // `range_from_max()` uses.
                64 * match range.start_bound() {
                    Bound::Included(i) => *i,
                    Bound::Excluded(i) => *i + 1,
                    Bound::Unbounded => {
                        /* welp, lets use the default */
                        return 16;
                    }
                }
            }
        };

        Self::fragment_ave_from_max(v)
    }
    /**
     * Create a splitter using the defaults from Zpaq (the compressor) given an average size
     * formatted as a power of 2.
     *
     * Corresponds to zpaq's argument "-fragment".
     */
    pub fn with_average_size_pow_2(average_size_pow_2: u8) -> Self {
        // Min/max block sizes are derived from the average, zpaq-style.
        let r = Self::range_from_fragment_ave(average_size_pow_2);
        Self::with_average_and_range(average_size_pow_2, r)
    }
    // Decide whether to split after `index` bytes given the current rolling hash.
    //
    // Split when the hash falls below `max_hash` (the probabilistic edge condition),
    // provided the minimum block size has been reached — or unconditionally once the
    // maximum block size is exceeded.
    fn split_here(&self, hash: u32, index: u64) -> bool {
        (hash < self.max_hash && !self.range.under_min(&index)) || self.range.exceeds_max(&index)
    }
impl Chunk for Zpaq {
    type SearchState = ZpaqSearchState;

    fn to_search_state(&self) -> Self::SearchState {
        Default::default()
    }

    // Scan `data` for the next chunk edge, resuming from `state`.
    //
    // Returns `(Some(i + 1), i + 1)` when a split lands after `data[i]`; the search
    // state is reset so the next call starts a fresh chunk. Otherwise returns
    // `(None, data.len())`.
    fn find_chunk_edge(
        &self,
        state: &mut Self::SearchState,
        data: &[u8],
    ) -> (Option<usize>, usize) {
        for (i, v) in data.iter().enumerate() {
            let h = state.feed(*v);
            // NOTE(review): `feed()` already increments `state.idx`, so `state.idx + 1`
            // here is one larger than the value `ZpaqIncr::push()` passes to
            // `split_here()` for the same byte. One of the two is off by one at the
            // min/max block-size boundaries — confirm against zpaq 7.15 reference
            // output before changing either.
            if self.split_here(h, (state.idx + 1) as u64) {
                *state = self.to_search_state();
                return (Some(i + 1), i + 1);
            }
        }

        (None, data.len())
    }
}
| */ 278 | #[derive(Clone)] 279 | pub struct ZpaqHash { 280 | hash: Wrapping, 281 | last_byte: u8, 282 | predicted_byte: [u8; 256], 283 | } 284 | 285 | impl PartialEq for ZpaqHash { 286 | fn eq(&self, other: &Self) -> bool { 287 | self.hash == other.hash 288 | && self.last_byte == other.last_byte 289 | && self.predicted_byte[..] == other.predicted_byte[..] 290 | } 291 | } 292 | 293 | impl Eq for ZpaqHash {} 294 | 295 | impl fmt::Debug for ZpaqHash { 296 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { 297 | f.debug_struct("ZpaqHash") 298 | .field("hash", &self.hash) 299 | .field("last_byte", &self.last_byte) 300 | .field("predicted_byte", &fmt_extra::Hs(&self.predicted_byte[..])) 301 | .finish() 302 | } 303 | } 304 | 305 | impl Default for ZpaqHash { 306 | fn default() -> Self { 307 | ZpaqHash { 308 | hash: Wrapping(0), 309 | last_byte: 0, 310 | predicted_byte: [0; 256], 311 | } 312 | } 313 | } 314 | 315 | impl ZpaqHash { 316 | /* 317 | * we can only get away with this because Zpaq doesn't need to look at old data to make it's 318 | * splitting decision, it only examines it's state + current value (and the state is 319 | * relatively large, but isn't a window into past data). 320 | */ 321 | fn feed(&mut self, c: u8) -> u32 { 322 | self.hash = if c == self.predicted_byte[self.last_byte as usize] { 323 | (self.hash + Wrapping(c as u32) + Wrapping(1)) * Wrapping(314159265) 324 | } else { 325 | (self.hash + Wrapping(c as u32) + Wrapping(1)) * Wrapping(271828182) 326 | }; 327 | 328 | self.predicted_byte[self.last_byte as usize] = c; 329 | self.last_byte = c; 330 | self.hash.0 331 | } 332 | } 333 | -------------------------------------------------------------------------------- /src/zstd.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "zstd")] 2 | 3 | //! zstd's `--rsyncable` option performs content defined chunking 4 | //! 5 | //! 
//! This has been minimally validated to match the implementation from zstd, with the following
//! caveats:
//!
//! - Maximum chunk size is not implemented
//! - Only 1 test case with a single chunk edge (ie: 2 chunks) has been tested
//!
//! It uses an internal [rolling
//! hash](https://github.com/facebook/zstd/blob/01261bc8b6fcfc77801788f8b1e2a2e5dd2e8e25/lib/compress/zstd_compress_internal.h#L658-L698)
//! with 1 multiply and 2 additions. (see `ZSTD_rollingHash_append()` for core functionality).
//!
//! The rolling hash is then used by
//! [`findSynchronizationPoint()`](https://github.com/facebook/zstd/blob/15c5e200235edc520c1bd678ed126a6dd05736e1/lib/compress/zstdmt_compress.c#L1931-L2001)
//! in various ways to find "synchronization points" (ie: edges of chunks).
//!
//! [This issue thread comment](https://github.com/facebook/zstd/issues/1155#issuecomment-520258862) also
//! includes some explanation on the mechanism.
//!
//! The zstd code _does_ include in its context information about the _previous_ block that was
//! emitted. In other words: the rolling hash isn't "reset" on block emission. (Most chunking
//! algorithms are reset on block emission.)
    /*
     * ```notrust
    /* Aim for the targetsectionSize as the average job size. */
    U32 const jobSizeMB = (U32)(mtctx->targetSectionSize >> 20);
    U32 const rsyncBits = ZSTD_highbit32(jobSizeMB) + 20;
    assert(jobSizeMB >= 1);
    DEBUGLOG(4, "rsyncLog = %u", rsyncBits);
    mtctx->rsync.hash = 0;
    mtctx->rsync.hitMask = (1ULL << rsyncBits) - 1;
    mtctx->rsync.primePower = ZSTD_rollingHash_primePower(RSYNC_LENGTH);
    ```
    */
    /// Configure the splitter the same way zstdmt does for a given target (average)
    /// section size in bytes (see the quoted C above).
    ///
    /// # Panics
    ///
    /// Panics if `target_section_size >> 20` does not fit in `u32`, or if the target
    /// is smaller than 1 MiB (`job_size_mb` would be 0).
    pub fn with_target_section_size(target_section_size: u64) -> Self {
        let job_size_mb: u32 = (target_section_size >> 20).try_into().unwrap();
        assert_ne!(job_size_mb, 0);
        // `leading_zeros() ^ 31` equals `31 - leading_zeros()` for any u32 (31 is
        // 0b11111), i.e. the index of the highest set bit — C's ZSTD_highbit32().
        let rsync_bits = (job_size_mb.leading_zeros() ^ 31) + 20;
        let hit_mask = (1u64 << rsync_bits) - 1;
        // primePower = PRIME^(RSYNC_LENGTH - 1): the factor used to remove the oldest
        // byte's contribution when rotating the hash.
        let prime_power = PRIME_8_BYTES
            .0
            .wrapping_pow((RSYNC_LENGTH - 1).try_into().unwrap());
        Self {
            hit_mask,
            prime_power,
        }
    }
`ZSTD_rollingHash_append()` 91 | fn append(&mut self, data: &[u8]) { 92 | for i in data { 93 | self.hash *= PRIME_8_BYTES; 94 | self.hash += Wrapping(*i as u64) + ROLL_HASH_CHAR_OFFSET; 95 | } 96 | } 97 | 98 | // `ZSTD_rollingHash_rotate()` 99 | fn rotate(&mut self, to_remove: u8, to_add: u8, prime_power: u64) { 100 | self.hash -= (Wrapping(to_remove as u64) + ROLL_HASH_CHAR_OFFSET) * Wrapping(prime_power); 101 | self.hash *= PRIME_8_BYTES; 102 | self.hash += Wrapping(to_add as u64) + ROLL_HASH_CHAR_OFFSET; 103 | } 104 | 105 | fn at_split(&mut self, params: &Zstd) -> bool { 106 | (self.hash.0 & params.hit_mask) == params.hit_mask 107 | } 108 | } 109 | 110 | #[derive(Default, Debug, PartialEq, Eq)] 111 | pub struct ZstdSearchState { 112 | state: ZstdState, 113 | offset: usize, 114 | } 115 | 116 | impl ZstdSearchState { 117 | fn append(&mut self, data: &[u8]) { 118 | self.state.append(data); 119 | } 120 | 121 | fn rotate(&mut self, to_remove: u8, to_add: u8, prime_power: u64) { 122 | self.state.rotate(to_remove, to_add, prime_power); 123 | } 124 | 125 | fn at_split(&mut self, params: &Zstd) -> bool { 126 | self.state.at_split(params) 127 | } 128 | } 129 | 130 | /// Incrimental chunking using Zstd's rsyncable algorithm 131 | /// 132 | /// Performance note: Zstd's chunking requires buffer look back to remove previously inserted data, 133 | /// and as a result requires `ZstdIncr` to maintain an internal buffer. This internal buffer may 134 | /// reduce performance. 135 | #[derive(Debug, PartialEq, Eq)] 136 | pub struct ZstdIncr { 137 | params: Zstd, 138 | 139 | state: ZstdState, 140 | 141 | window: Box<[u8]>, 142 | // insert into the window at this offset 143 | window_offs: usize, 144 | // if true, we need to remove bytes from the window when inserting 145 | // 146 | // NOTE: by pre-filling `self.hash` with an appropriate value, we might be able to remove this 147 | // variable and always treat the window as full (of zeros initially). 
impl Chunk for Zstd {
    type SearchState = ZstdSearchState;

    fn to_search_state(&self) -> Self::SearchState {
        Self::SearchState::default()
    }

    // Scan `data` for a synchronization point.
    //
    // `state.offset` counts how many bytes have been absorbed to seed the rolling
    // hash; no split can be reported until `RSYNC_LENGTH` bytes are in the window.
    // The returned discard count always keeps the trailing `RSYNC_LENGTH` bytes,
    // which are still inside the rolling window.
    //
    // NOTE(review): unlike most chunkers in this crate, the hash state is *not* reset
    // at a split — this mirrors zstd's own behavior (see module docs) but confirm it
    // is intended here.
    fn find_chunk_edge(
        &self,
        state: &mut Self::SearchState,
        data: &[u8],
    ) -> (Option<usize>, usize) {
        if state.offset < RSYNC_LENGTH {
            // push some data in
            // (assumes callers only ever *grow* `data` between calls while seeding,
            // so `state.offset <= data.len()` — TODO confirm against callers)
            let seed_b = &data[state.offset..std::cmp::min(RSYNC_LENGTH, data.len())];
            state.append(seed_b);
            state.offset += seed_b.len();

            if state.offset < RSYNC_LENGTH {
                // not enough data
                return (None, 0);
            }
        }

        // TODO: track input_offs to split over-size blocks

        // we've got enough data, do rotations
        for i in state.offset..data.len() {
            let to_remove = data[i - RSYNC_LENGTH];
            let to_add = data[i];
            state.rotate(to_remove, to_add, self.prime_power);
            if state.at_split(self) {
                let discard_ct = data.len().saturating_sub(RSYNC_LENGTH);
                return (Some(i + 1), discard_ct);
            }
        }

        let discard_ct = data.len().saturating_sub(RSYNC_LENGTH);
        let keep_ct = data.len() - discard_ct;
        state.offset = keep_ct;
        (None, discard_ct)
    }
}
impl ChunkIncr for ZstdIncr {
    // Feed bytes into the splitter; returns the split offset within `data` when a
    // synchronization point is found.
    fn push(&mut self, data: &[u8]) -> Option<usize> {
        // Phase 1: fill the look-back window; the rolling hash is only seeded once
        // RSYNC_LENGTH bytes have been collected.
        let use_len = if !self.window_full {
            let use_len = std::cmp::min(self.window.len() - self.window_offs, data.len());
            self.window[self.window_offs..(self.window_offs + use_len)]
                .copy_from_slice(&data[..use_len]);
            self.window_offs += use_len;

            if self.window_offs != self.window.len() {
                // still not enough data to seed the hash
                return None;
            }

            self.window_full = true;
            self.window_offs = 0;
            self.state.append(&self.window[..]);
            use_len
        } else {
            0
        };

        // TODO: track input_offs to split over-size blocks

        // Phase 2: the window is a ring buffer (`window_offs` points at the oldest
        // byte); rotate each new byte through the hash.
        for (i, &v) in data[use_len..].iter().enumerate() {
            let to_remove = self.window[self.window_offs];
            let to_add = v;
            self.state
                .rotate(to_remove, to_add, self.params.prime_power);
            self.window[self.window_offs] = to_add;
            self.window_offs = (self.window_offs + 1) % self.window.len();

            if self.state.at_split(&self.params) {
                // NOTE: don't clear window — zstd carries hash context across splits
                // NOTE(review): sibling `ChunkIncr` impls return the index one past the
                // triggering byte (`i + 1`); this returns `i + use_len` — confirm the
                // intended convention (the in-crate tests ignore the returned value).
                return Some(i + use_len);
            }
        }

        None
    }
}
// Verify a chunker against known-good split sizes, exercising both the
// `Chunk::find_chunk_edge` interface and the byte-at-a-time `ChunkIncr` interface,
// and requiring the two to agree.
fn cut_test_sz<C: Chunk + ToChunkIncr>(
    seed: u128,
    size: usize,
    chunker: C,
    expected_splits: &[usize],
) {
    let buf = test_data(seed, size);

    // Note: this doesn't validate SearchState at all
    let mut splits = Vec::with_capacity(expected_splits.len());
    {
        let mut state = chunker.to_search_state();
        // `discard_idx`: start of the data still to be fed (earlier bytes were
        // consumed by the chunker); `last_chunk_idx`: global offset of the last edge.
        let mut discard_idx = 0;
        let mut last_chunk_idx = 0;
        loop {
            let b = &buf[discard_idx..];
            let (split_point, discard_ct) = chunker.find_chunk_edge(&mut state, b);
            match split_point {
                Some(split_point) => {
                    let split_point_global = discard_idx + split_point;
                    if last_chunk_idx > split_point_global {
                        panic!("last_chunk_idx: {}, split_point_global: {}, split_point: {}, discard_idx: {}",
                            last_chunk_idx, split_point_global, split_point, discard_idx);
                    }
                    let split_len = split_point_global - last_chunk_idx;
                    last_chunk_idx = split_point_global;
                    splits.push(split_len);
                }
                None => {
                    // no further edges in the remaining data
                    break;
                }
            }
            discard_idx += discard_ct;
        }
    }

    // Note: this is only basic equivalance checking via byte-at-a-time. More full equivalance
    // checking will be done via quickcheck tests.
    let mut incr_splits = Vec::with_capacity(expected_splits.len());
    {
        let mut incr = chunker.to_chunk_incr();
        let buf = &buf[..];
        let mut last_split = 0;
        for (i, v) in buf.iter().enumerate() {
            match incr.push(&[*v]) {
                Some(_split_point) => {
                    let sp = i + 1;
                    incr_splits.push(sp - last_split);
                    last_split = sp;
                }
                None => {}
            }
        }
    }

    // Both interfaces must agree, and must match the expected reference sizes.
    assert_eq!(&splits[..], &incr_splits[..]);
    assert_eq!(expected_splits, &splits[..]);
}
fragment sizes). 155 | // 156 | // cargo run --example generate-test-data 0 >test_data_0.bin 157 | // zpaq a foo.zpaq ~/p/hash-roll/test_data_0.bin -fragment 3 158 | cut_test( 159 | 0, 160 | hash_roll::zpaq::Zpaq::with_average_size_pow_2(13), 161 | &[10785, 6329, 1287, 860, 4716, 7419], 162 | ) 163 | } 164 | 165 | #[cfg(feature = "zpaq")] 166 | #[test] 167 | fn zpaq_cuts_3() { 168 | // These match edges from Zpaq 7.15 (with modification to print the fragment sizes). 169 | // 170 | // cargo run --example generate-test-data 3 >test_data_3.bin 171 | // zpaq a foo.zpaq ~/p/hash-roll/test_data_3.bin -fragment 3 172 | cut_test( 173 | 3, 174 | hash_roll::zpaq::Zpaq::with_average_size_pow_2(13), 175 | &[16353, 2334, 970, 5326, 1557], 176 | ) 177 | } 178 | 179 | #[cfg(feature = "pigz")] 180 | #[test] 181 | fn pigz_cuts_0() { 182 | cut_test( 183 | 0, 184 | hash_roll::pigz::PigzRsyncable::default(), 185 | &[9069, 1191, 3685, 8629, 2119, 2939], 186 | ) 187 | } 188 | 189 | /* 190 | * 0 191 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 0, tss: 8388608 -> (131072, 0) 192 | * 1 193 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 131072, tss: 8388608 -> (131072, 0) 194 | * 2 195 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 262144, tss: 8388608 -> (131072, 0) 196 | * 3 197 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 393216, tss: 8388608 -> (131072, 0) 198 | * 4 199 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 524288, tss: 8388608 -> (131072, 0) 200 | * 5 201 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 655360, tss: 8388608 -> (131072, 0) 202 | * 6 203 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 786432, tss: 8388608 -> (131072, 0) 204 | 7 205 | ../lib/compress/zstdmt_compress.c: 
findSynchronizationPoint: input: (0, 131072), inbf: 917504, tss: 8388608 -> (131072, 0) 206 | 8 207 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 1048576, tss: 8388608 -> (131072, 0) 208 | 9 209 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 1179648, tss: 8388608 -> (131072, 0) 210 | 10 211 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 1310720, tss: 8388608 -> (131072, 0) 212 | 11 213 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 1441792, tss: 8388608 -> (131072, 0) 214 | 12 215 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 1572864, tss: 8388608 -> (87647, 1) 216 | 13 217 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (87647, 131072), inbf: 0, tss: 8388608 -> (43425, 0) 218 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 43425, tss: 8388608 -> (131072, 0) 219 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 174497, tss: 8388608 -> (131072, 0) 220 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 305569, tss: 8388608 -> (131072, 0) 221 | */ 222 | #[cfg(feature = "zstd")] 223 | #[test] 224 | fn zstd_cuts_0_2mb() { 225 | cut_test_sz( 226 | 0, 227 | 1024 * 1024 * 2, 228 | hash_roll::zstd::Zstd::default(), 229 | //&[1660511], 230 | &[12 * 131072 + 87647], 231 | ) 232 | } 233 | 234 | #[cfg(feature = "gear")] 235 | #[test] 236 | fn gear32_cuts_0() { 237 | cut_test(0, hash_roll::gear::Gear32::default(), &[11031, 7789, 10463]) 238 | } 239 | 240 | #[cfg(feature = "fastcdc")] 241 | #[test] 242 | fn fastcdc_cuts_incr_0() { 243 | cut_test_incr( 244 | 0, 245 | 8192 * 4, 246 | hash_roll::fastcdc::FastCdcIncr::default(), 247 | &[8463, 9933, 9029], 248 | ) 249 | } 250 | 251 | #[cfg(feature = "fastcdc")] 252 | #[test] 253 | fn 
fastcdc_cuts_0() { 254 | cut_test( 255 | 0, 256 | hash_roll::fastcdc::FastCdc::default(), 257 | &[8463, 9933, 9029], 258 | ) 259 | } 260 | 261 | #[cfg(feature = "ram")] 262 | #[test] 263 | fn ram_cuts_0() { 264 | cut_test(0, hash_roll::ram::Ram::with_w(8192), &[8264, 8368, 8341]) 265 | } 266 | -------------------------------------------------------------------------------- /tests/cuts_qc.rs: -------------------------------------------------------------------------------- 1 | // check the following are equivalent: 2 | // - find_chunk_edge() with 1 set of buffer sizes vs another set of buffer sizes 3 | // - incrimental with 1 set of buffer sizes vs another set of buffer sizes 4 | // - find_chunk_edge() vs incrimental 5 | // 6 | // - simd vs non-simd algorithms 7 | 8 | use hash_roll::Chunk; 9 | use proptest::prelude::*; 10 | 11 | fn splits_fce(chunker: &C, buf: &[u8], buf_sizes: &[usize]) -> Vec { 12 | let mut splits = Vec::new(); 13 | let mut i = 0; 14 | let mut ss = chunker.to_search_state(); 15 | let mut last_split_point = 0; 16 | let mut curr_discard = 0; 17 | let mut prev_buf_size = 0; 18 | loop { 19 | if buf.len() == curr_discard { 20 | break; 21 | } 22 | 23 | // use adative methods to ensure buf size grows 24 | let buf_size = buf_sizes[i % buf_sizes.len()] + prev_buf_size; 25 | i += 1; 26 | let buf_size = std::cmp::min(buf_size, buf.len() - curr_discard); 27 | 28 | let b = &buf[curr_discard..(buf_size + curr_discard)]; 29 | println!( 30 | "{{ PRE: curr_discard: {}, buf_size: {}", 31 | curr_discard, buf_size 32 | ); 33 | let (split, discard_ct) = chunker.find_chunk_edge(&mut ss, b); 34 | println!( 35 | "}} POST: discard_ct: {}, next_discard: {}", 36 | discard_ct, 37 | curr_discard + discard_ct 38 | ); 39 | 40 | match split { 41 | Some(split_point) => { 42 | // `split_point` is translated into the entire buffer (from the one passed to fce), 43 | // and the length is determined by tracking the previous split. 
44 | let split_point_global = curr_discard + split_point; 45 | let split_len = split_point_global - last_split_point; 46 | splits.push(split_len); 47 | last_split_point = split_point_global; 48 | prev_buf_size = 0; 49 | } 50 | None => { 51 | // at end of buffer without a split point 52 | if buf_size == (buf.len() - curr_discard) { 53 | break; 54 | } 55 | 56 | prev_buf_size = buf_size; 57 | } 58 | } 59 | 60 | curr_discard += discard_ct; 61 | println!("-- curr_discard = {}", curr_discard); 62 | } 63 | 64 | splits 65 | } 66 | 67 | proptest! { 68 | #[test] 69 | #[cfg(feature = "gzip")] 70 | fn gzip_fce_self_consistent_with_varying_buf_size( 71 | data in prop::collection::vec(0u8..=255u8, 0..10000), 72 | buf_sizes_1 in prop::collection::vec(1usize..5000, 1..10000), 73 | buf_sizes_2 in prop::collection::vec(1usize..5000, 1..10000)) 74 | { 75 | let chunker = hash_roll::gzip::GzipRsyncable::default(); 76 | let s1 = splits_fce(&chunker, &data[..], &buf_sizes_1[..]); 77 | let s2 = splits_fce(&chunker, &data[..], &buf_sizes_2[..]); 78 | assert_eq!(s1, s2); 79 | } 80 | 81 | #[test] 82 | #[cfg(feature = "mii")] 83 | fn mii_fce_self_consistent_with_varying_buf_size( 84 | data in prop::collection::vec(0u8..=255u8, 0..10000), 85 | buf_sizes_1 in prop::collection::vec(1usize..5000, 1..10000), 86 | buf_sizes_2 in prop::collection::vec(1usize..5000, 1..10000)) 87 | { 88 | let chunker = hash_roll::mii::Mii::default(); 89 | let s1 = splits_fce(&chunker, &data[..], &buf_sizes_1[..]); 90 | let s2 = splits_fce(&chunker, &data[..], &buf_sizes_2[..]); 91 | assert_eq!(s1, s2); 92 | } 93 | 94 | #[test] 95 | #[cfg(feature = "buzhash_big")] 96 | fn buzhash_fce_self_consistent_with_varying_buf_size( 97 | seed: u8, 98 | data in prop::collection::vec(0u8..=255u8, 0..10000), 99 | buf_sizes_1 in prop::collection::vec(1usize..5000, 1..10000), 100 | buf_sizes_2 in prop::collection::vec(1usize..5000, 1..10000)) 101 | { 102 | let chunker = hash_roll::buzhash::BuzHash::new_nom(seed); 103 | let s1 = 
splits_fce(&chunker, &data[..], &buf_sizes_1[..]); 104 | let s2 = splits_fce(&chunker, &data[..], &buf_sizes_2[..]); 105 | assert_eq!(s1, s2); 106 | } 107 | 108 | #[test] 109 | #[cfg(feature = "buzhash")] 110 | fn buzhash_short_fce_self_consistent_with_varying_buf_size( 111 | data in prop::collection::vec(0u8..=255u8, 0..100), 112 | buf_sizes_1 in prop::collection::vec(1usize..100, 1..100), 113 | buf_sizes_2 in prop::collection::vec(1usize..100, 1..100)) 114 | { 115 | let chunker = hash_roll::buzhash::BuzHash::new( 116 | 7, 117 | (1 << 4u32) - 1, 118 | hash_roll::buzhash::BuzHashTableByteSaltHash::from((0, &hash_roll::buzhash_table::GO_BUZHASH)), 119 | 1 << 10, 120 | ); 121 | let s1 = splits_fce(&chunker, &data[..], &buf_sizes_1[..]); 122 | let s2 = splits_fce(&chunker, &data[..], &buf_sizes_2[..]); 123 | assert_eq!(s1, s2); 124 | } 125 | 126 | #[test] 127 | #[cfg(feature = "zpaq")] 128 | fn zpaq_fce_self_consistent_with_varying_buf_size( 129 | data in prop::collection::vec(0u8..=255u8, 0..10000), 130 | buf_sizes_1 in prop::collection::vec(1usize..5000, 1..10000), 131 | buf_sizes_2 in prop::collection::vec(1usize..5000, 1..10000)) 132 | { 133 | let chunker = hash_roll::zpaq::Zpaq::with_average_size_pow_2(13); 134 | let s1 = splits_fce(&chunker, &data[..], &buf_sizes_1[..]); 135 | let s2 = splits_fce(&chunker, &data[..], &buf_sizes_2[..]); 136 | assert_eq!(s1, s2); 137 | } 138 | 139 | #[test] 140 | #[cfg(feature = "pigz")] 141 | fn pigz_fce_self_consistent_with_varying_buf_size( 142 | data in prop::collection::vec(0u8..=255u8, 0..10000), 143 | buf_sizes_1 in prop::collection::vec(1usize..5000, 1..10000), 144 | buf_sizes_2 in prop::collection::vec(1usize..5000, 1..10000)) 145 | { 146 | let chunker = hash_roll::pigz::PigzRsyncable::default(); 147 | let s1 = splits_fce(&chunker, &data[..], &buf_sizes_1[..]); 148 | let s2 = splits_fce(&chunker, &data[..], &buf_sizes_2[..]); 149 | assert_eq!(s1, s2); 150 | } 151 | 152 | #[test] 153 | #[cfg(feature = "bup")] 154 | fn 
bup_fce_self_consistent_with_varying_buf_size( 155 | data in prop::collection::vec(0u8..=255u8, 0..10000), 156 | buf_sizes_1 in prop::collection::vec(1usize..5000, 1..10000), 157 | buf_sizes_2 in prop::collection::vec(1usize..5000, 1..10000)) 158 | { 159 | let chunker = hash_roll::bup::RollSum::default(); 160 | let s1 = splits_fce(&chunker, &data[..], &buf_sizes_1[..]); 161 | let s2 = splits_fce(&chunker, &data[..], &buf_sizes_2[..]); 162 | assert_eq!(s1, s2); 163 | } 164 | 165 | #[test] 166 | #[cfg(feature = "zstd")] 167 | fn zstd_fce_self_consistent_with_varying_buf_size( 168 | data in prop::collection::vec(0u8..=255u8, 0..100000), 169 | buf_sizes_1 in prop::collection::vec(1usize..5000, 1..10000), 170 | buf_sizes_2 in prop::collection::vec(1usize..5000, 1..10000)) 171 | { 172 | let chunker = hash_roll::zstd::Zstd::default(); 173 | let s1 = splits_fce(&chunker, &data[..], &buf_sizes_1[..]); 174 | let s2 = splits_fce(&chunker, &data[..], &buf_sizes_2[..]); 175 | assert_eq!(s1, s2); 176 | } 177 | 178 | #[test] 179 | #[cfg(feature = "gear")] 180 | fn gear_fce_self_consistent_with_varying_buf_size( 181 | data in prop::collection::vec(0u8..=255u8, 0..100000), 182 | buf_sizes_1 in prop::collection::vec(1usize..5000, 1..10000), 183 | buf_sizes_2 in prop::collection::vec(1usize..5000, 1..10000)) 184 | { 185 | let chunker = hash_roll::gear::Gear32::default(); 186 | let s1 = splits_fce(&chunker, &data[..], &buf_sizes_1[..]); 187 | let s2 = splits_fce(&chunker, &data[..], &buf_sizes_2[..]); 188 | assert_eq!(s1, s2); 189 | } 190 | 191 | #[test] 192 | #[cfg(feature = "fastcdc")] 193 | fn fastcdc_fce_self_consistent_with_varying_buf_size( 194 | data in prop::collection::vec(0u8..=255u8, 0..100000), 195 | buf_sizes_1 in prop::collection::vec(1usize..5000, 1..10000), 196 | buf_sizes_2 in prop::collection::vec(1usize..5000, 1..10000)) 197 | { 198 | let chunker = hash_roll::fastcdc::FastCdc::default(); 199 | let s1 = splits_fce(&chunker, &data[..], &buf_sizes_1[..]); 200 | let s2 
= splits_fce(&chunker, &data[..], &buf_sizes_2[..]); 201 | assert_eq!(s1, s2); 202 | } 203 | 204 | #[test] 205 | #[cfg(feature = "ram")] 206 | fn ram_fce_self_consistent_with_varying_buf_size( 207 | data in prop::collection::vec(0u8..=255u8, 0..100000), 208 | buf_sizes_1 in prop::collection::vec(1usize..5000, 1..10000), 209 | buf_sizes_2 in prop::collection::vec(1usize..5000, 1..10000)) 210 | { 211 | let chunker = hash_roll::ram::Ram::with_w(8192); 212 | let s1 = splits_fce(&chunker, &data[..], &buf_sizes_1[..]); 213 | let s2 = splits_fce(&chunker, &data[..], &buf_sizes_2[..]); 214 | assert_eq!(s1, s2); 215 | } 216 | } 217 | -------------------------------------------------------------------------------- /tests/fastcdc.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "fastcdc")] 2 | 3 | use hash_roll::fastcdc::FastCdcIncr; 4 | use hash_roll::ChunkIncr; 5 | use rand_pcg::Pcg64; 6 | 7 | #[derive(Debug, Clone, PartialEq, Eq)] 8 | struct Vec8K { 9 | data: Vec, 10 | } 11 | 12 | impl quickcheck::Arbitrary for Vec8K { 13 | fn arbitrary(g: &mut G) -> Self { 14 | // FIXME: the intention is to raise this >8KB, but that makes the tests take far too 15 | // long to run. 
16 | let l = 1 * 1024 + g.size(); 17 | 18 | let mut d = vec![0; l]; 19 | 20 | g.fill_bytes(&mut d[..]); 21 | 22 | Vec8K { data: d } 23 | } 24 | 25 | fn shrink(&self) -> Box> { 26 | // use the normal Vec shrinkers 27 | let chain = self.data.shrink().map(|x| Vec8K { data: x }); 28 | Box::new(chain) 29 | } 30 | } 31 | 32 | fn oracle_1(d: Vec8K) -> bool { 33 | let mut cdc = FastCdcIncr::default(); 34 | let v1 = fast_cdc_8kb(&d.data[..]); 35 | let v2 = cdc.push(&d.data[..]); 36 | 37 | v1 == v2.unwrap_or(0) 38 | } 39 | 40 | fn oracle_1_test(data: &[u8]) { 41 | let mut cdc = FastCdcIncr::default(); 42 | let v1 = fast_cdc_8kb(&data[..]); 43 | let v2 = cdc.push(&data[..]).unwrap_or(0); 44 | assert_eq!(v1, v2); 45 | } 46 | 47 | #[test] 48 | fn o1_empty() { 49 | oracle_1_test(&vec![0]); 50 | } 51 | 52 | #[test] 53 | fn o1_qc() { 54 | quickcheck::quickcheck(oracle_1 as fn(Vec8K) -> bool); 55 | } 56 | 57 | fn o1_8k_seed(state: u128) { 58 | use rand::RngCore; 59 | let l = 8 * 1024 * 1024 + 1; 60 | let mut d = Vec::with_capacity(l); 61 | let c = d.capacity(); 62 | unsafe { d.set_len(c) }; 63 | println!("seed: {:#x}", state); 64 | println!("len: {}", c); 65 | let mut rng = Pcg64::new(state, 0xa02bdbf7bb3c0a7ac28fa16a64abf96); 66 | for _ in 0..10 { 67 | rng.fill_bytes(&mut d); 68 | oracle_1_test(&d); 69 | } 70 | } 71 | 72 | #[test] 73 | fn o1_8k1() { 74 | let state: u128 = ::rand::random(); 75 | o1_8k_seed(state); 76 | } 77 | 78 | #[test] 79 | fn o1_8k_t1() { 80 | o1_8k_seed(0x6362eca4ca113c1bd10d40b8b10e9ad4); 81 | } 82 | 83 | #[test] 84 | fn o1_8k_t2() { 85 | o1_8k_seed(0x22e622e48004575fe4229bf0da6341c9); 86 | } 87 | 88 | #[test] 89 | fn feed_until_5_chunks() { 90 | use rand::RngCore; 91 | let mut cdc = FastCdcIncr::default(); 92 | let mut ct = 0; 93 | let mut rng = ::rand::thread_rng(); 94 | let mut d = [0u8; 256]; 95 | rng.fill_bytes(&mut d); 96 | loop { 97 | rng.fill_bytes(&mut d); 98 | let mut data = &d[..]; 99 | loop { 100 | let p = cdc.push(&data[..]); 101 | println!("p: 
{:?}, cdc: {:?}", p, cdc); 102 | 103 | if p == None || p.unwrap() == data.len() { 104 | break; 105 | } else { 106 | ct += 1; 107 | if ct > 5 { 108 | return; 109 | } 110 | data = &data[p.unwrap()..]; 111 | } 112 | } 113 | } 114 | } 115 | 116 | /// A 1-buffer implimentation of FastCDC8KB designed to match the reference pseudocode 117 | fn fast_cdc_8kb(src: &[u8]) -> usize { 118 | use hash_roll::gear_table::GEAR_64; 119 | use std::num::Wrapping; 120 | // these masks are taken from the paper and could be adjusted/adjustable. 121 | const MASK_S: u64 = 0x0003590703530000; 122 | //const MASK_A: u64 = 0x0000d90303530000; 123 | const MASK_L: u64 = 0x0000d90003530000; 124 | const MIN_SIZE: u64 = 2 * 1024; // 2KB 125 | const MAX_SIZE: u64 = 64 * 1024; // 64KB 126 | const NORMAL_SIZE: u64 = 8 * 1024; // 8KB 127 | 128 | let mut fp = Wrapping(0); 129 | let mut n = src.len(); 130 | let mut normal_size = NORMAL_SIZE as usize; 131 | if n <= (MIN_SIZE as usize) { 132 | // Diverge from the reference here: 133 | // return 0 to indicate no split found rather than src.len() 134 | return 0; 135 | } 136 | 137 | if n >= (MAX_SIZE as usize) { 138 | n = MAX_SIZE as usize; 139 | } else if n <= normal_size { 140 | normal_size = n; 141 | } 142 | 143 | for i in (MIN_SIZE as usize)..normal_size { 144 | fp = (fp << 1) + Wrapping(GEAR_64[src[i] as usize]); 145 | if (fp.0 & MASK_S) == 0 { 146 | return i; 147 | } 148 | } 149 | 150 | for i in normal_size..n { 151 | fp = (fp << 1) + Wrapping(GEAR_64[src[i] as usize]); 152 | if (fp.0 & MASK_L) == 0 { 153 | return i; 154 | } 155 | } 156 | 157 | // Diverge from the reference here: 158 | // return MAX_SIZE when we've gotten to MAX_SIZE 159 | // return 0 to indicate no split found rather than src.len() 160 | if n == MAX_SIZE as usize { 161 | n 162 | } else { 163 | 0 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /tests/oracle_zpaq.rs: 
-------------------------------------------------------------------------------- 1 | #![cfg(feature = "zpaq-broken")] 2 | // cdchunking doesn't impliment zpaq exactly correctly 3 | 4 | use hash_roll::{ChunkIncr, ToChunkIncr}; 5 | use quickcheck::quickcheck; 6 | use rand::RngCore; 7 | use rand_pcg::Pcg64; 8 | 9 | fn test_data(seed: u128, size: usize) -> Vec { 10 | let mut fill_rng = Pcg64::new(seed, 0xa02bdbf7bb3c0a7ac28fa16a64abf96); 11 | let mut buf = vec![0u8; size]; 12 | fill_rng.fill_bytes(&mut buf); 13 | buf 14 | } 15 | 16 | quickcheck! { 17 | fn zpaq_eq_cdchunking(xs: Vec) -> bool { 18 | let m1 = hash_roll::zpaq::Zpaq::with_average_and_range_and_m(13, .., 123_456_791, 123_456_791 * 2); 19 | let m2 = cdchunking::Chunker::new(cdchunking::ZPAQ::new(13)); 20 | 21 | let mut i1 = m1.to_chunk_incr().iter_slices(&xs); 22 | let mut i2 = m2.slices(&xs); 23 | 24 | loop { 25 | let v1 = i1.next(); 26 | let v2 = i2.next(); 27 | 28 | if v1 != v2 { 29 | return false; 30 | } 31 | 32 | if v1.is_none() { 33 | return true; 34 | } 35 | } 36 | } 37 | } 38 | 39 | fn c(xs: &[u8]) { 40 | let m1 = 41 | hash_roll::zpaq::Zpaq::with_average_and_range_and_m(13, .., 123_456_791, 123_456_791 * 2); 42 | let m2 = cdchunking::Chunker::new(cdchunking::ZPAQ::new(13)); 43 | 44 | let mut i1 = m1.to_chunk_incr().iter_slices(&xs); 45 | let mut i2 = m2.slices(&xs); 46 | 47 | let mut i = 0; 48 | loop { 49 | let v1 = i1.next(); 50 | let v2 = i2.next(); 51 | 52 | if v1 != v2 { 53 | panic!("i: {}, hr_v: {:?} != cdc_v : {:?}", i, v1, v2); 54 | } 55 | 56 | if v1.is_none() { 57 | break; 58 | } 59 | 60 | i += 1; 61 | } 62 | } 63 | 64 | #[test] 65 | fn zpaq_cdchunking_cuts() { 66 | let buf = test_data(0, 8192 * 4); 67 | let m: Vec = cdchunking::Chunker::new(cdchunking::ZPAQ::new(13)) 68 | .slices(&buf) 69 | .map(|v| v.len()) 70 | .collect(); 71 | assert_eq!(&m[..], &[10785, 6329, 1287, 860, 4716, 7419],); 72 | } 73 | 74 | mod oracle_zpaq { 75 | use super::c; 76 | #[test] 77 | fn t1() { 78 | c(&[ 79 | 25, 5, 
82, 84, 53, 94, 27, 24, 98, 47, 7, 7, 6, 34, 60, 98, 20, 64, 17, 5, 62, 40, 94, 80 | 79, 33, 1, 0, 81 | ]) 82 | } 83 | 84 | #[test] 85 | fn t2() { 86 | c(&[ 87 | 25, 5, 82, 84, 53, 94, 27, 24, 98, 47, 7, 7, 6, 34, 60, 98, 20, 64, 17, 5, 62, 40, 94, 88 | 79, 33, 1, 89 | ]) 90 | } 91 | 92 | #[test] 93 | fn t3() { 94 | c(&[ 95 | 25, 5, 82, 84, 53, 94, 27, 24, 98, 47, 7, 7, 6, 34, 60, 98, 20, 64, 17, 5, 62, 40, 94, 96 | 79, 33, 0, 0, 97 | ]) 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /tests/qc_bh.rs: -------------------------------------------------------------------------------- 1 | /* 2 | extern crate hash_roll; 3 | #[macro_use] 4 | extern crate quickcheck; 5 | */ 6 | 7 | /* 8 | #[derive(Debug,Clone,PartialEq,Eq)] 9 | struct Fma { 10 | data: Vec, 11 | msize: usize, 12 | moffs: usize, 13 | } 14 | 15 | impl quickcheck::Arbitrary for Fma { 16 | fn arbitrary(g: &mut G) -> Self { 17 | // lenght at least 1 18 | let d = { 19 | let mut x = g.gen(); 20 | while x.len() == 0 { 21 | x = g.gen(); 22 | } 23 | }; 24 | 25 | // 1 to d.len() 26 | let s = if d.len() == 1 { 27 | 1 28 | } else { 29 | (g.gen() % (d.len() - 1)) + 1 30 | }; 31 | 32 | // 0 to (d.len() - s) 33 | let o = if d.len() - s == 0 { 34 | 0 35 | } else { 36 | g.gen() % (d.len() - s) 37 | }; 38 | 39 | 40 | Fma { 41 | data: d, 42 | msize: s, 43 | moffs: o, 44 | } 45 | } 46 | 47 | fn shrink(&self) -> Box> { 48 | 49 | } 50 | } 51 | */ 52 | 53 | /* 54 | quickcheck! 
{ 55 | // choose a substring of `data` and use buzhash to find it 56 | fn find_match(data: Vec, size: usize, offs: usize) -> bool { 57 | // d.len() > 0 58 | if data.len() == 0 { 59 | return true 60 | } 61 | // 1..d.len() 62 | let size = if size == 0 { 63 | 1 64 | } else { 65 | if data.len() == 1 { 66 | 1 67 | } else { 68 | (size % (data.len() - 1)) + 1 69 | } 70 | }; 71 | 72 | 73 | let offs = if offs == 0 { 74 | 0 75 | } else { 76 | if data.len() - size == 0 { 77 | 0 78 | } else { 79 | offs % (data.len() - size) 80 | } 81 | }; 82 | let ms = &data[offs..(offs+size)]; 83 | println!("size: {}, offs: {}", size, offs); 84 | let mut b = ::hash_roll::buzhash::BuzHashBuf::with_capacity(size); 85 | let mut b2 = b.clone(); 86 | 87 | b.push(ms); 88 | let h = b.hash(); 89 | 90 | let mut d = &data[..]; 91 | loop { 92 | let f = b2.find_match(h, &d[..]); 93 | if f == 0 { 94 | return false 95 | } 96 | 97 | if f >= offs + size { 98 | return false; 99 | } 100 | 101 | if f > size && (&data[(f-size)..f] == ms) { 102 | return true; 103 | } 104 | 105 | d = &d[f..]; 106 | } 107 | } 108 | } 109 | */ 110 | -------------------------------------------------------------------------------- /tests/qc_bup.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "bup")] 2 | use quickcheck::quickcheck; 3 | 4 | use hash_roll::ChunkIncr; 5 | 6 | quickcheck! 
{
    // One-shot: hash-roll's RollSum must report the same first chunk edge as
    // the upstream `rollsum` crate.
    fn simple_eq(xs: Vec<u8>) -> bool {
        let mut m1 = hash_roll::bup::RollSumIncr::default();
        let mut m2 = rollsum::Bup::default();

        let v1 = m1.push(&xs);
        let v2 = m2.find_chunk_edge(&xs);

        v1 == v2.map(|x| x.0)
    }

    // Iterated: both implementations must agree on every successive edge.
    fn iter_eq(xs: Vec<u8>) -> bool {
        let mut m1 = hash_roll::bup::RollSumIncr::default();
        let mut m2 = rollsum::Bup::default();

        let mut x = &xs[..];
        loop {
            let v1 = m1.push(&x);
            let v2 = m2.find_chunk_edge(&x);

            if v1 != v2.map(|x| x.0) {
                return false
            }

            // clippy: `v1 == None` -> `is_none()`
            if v1.is_none() {
                return true
            }

            let v1 = v1.unwrap();

            x = &x[v1..];
            // clippy: `x.len() == 0` -> `is_empty()`
            if x.is_empty() {
                return true
            }
        }
    }
}

/// Assert hash-roll and `rollsum` agree on the first chunk edge of `x`.
fn chk_a(x: &[u8]) {
    let mut m1 = hash_roll::bup::RollSumIncr::default();
    let mut m2 = rollsum::Bup::default();

    let v1 = m1.push(&x);
    let v2 = m2.find_chunk_edge(&x);

    assert_eq!(v1, v2.map(|x| x.0));
}

/// Byte-at-a-time: digests and split decisions must match at every position.
fn chk_b(x: &[u8]) {
    use rollsum::Engine;
    let mut m1 = hash_roll::bup::RollSumIncr::default();
    let mut m2 = rollsum::Bup::default();
    let cm = (1 << rollsum::bup::CHUNK_BITS) - 1;

    for (i, &v) in x.iter().enumerate() {
        m1.roll_byte(v);
        m2.roll_byte(v);
        println!("i={}, v={}", i, v);
        assert_eq!(m1.digest(), m2.digest());
        assert_eq!(m1.at_split(), (m2.digest() & cm) == cm);
    }
}

#[test]
fn simple_eq_1() {
    chk_a(&[
        92, 6, 28, 35, 68, 82, 35, 71, 34, 19, 9, 45, 97, 17, 11, 6, 53, 39, 93, 49, 29, 17, 37, 6,
        39,
    ]);
}

#[test]
fn simple_eq_1b() {
    chk_b(&[
        92, 6, 28, 35, 68, 82, 35, 71, 34, 19, 9, 45, 97, 17, 11, 6, 53, 39, 93, 49, 29, 17, 37, 6,
        39,
    ]);
}

#[test]
fn simple_eq_2() {
    chk_a(&[67, 3, 23, 73, 86, 64, 26, 25, 81, 53, 26, 82, 98, 86, 28]);
}

#[test]
fn simple_eq_3() {
    chk_a(&[
        40, 58, 57, 0, 16, 2, 32, 88, 0, 22, 23, 74, 90, 88, 95, 99, 86,
    ]);
}
-------------------------------------------------------------------------------- /tests/rsyncable.rs: --------------------------------------------------------------------------------
#![cfg(feature = "rsyncable")]
use hash_roll::gzip::GzipRsyncable;
use hash_roll::Splitter;

#[test]
fn test_rsyncable() {
    use std::collections::HashSet;

    let d1 = b"hello, this is some bytes";
    let mut d2 = d1.clone();
    // flip one byte so the two streams differ mid-way
    // (clippy: `':' as u8` -> byte literal; same value)
    d2[4] = b':';

    let b1 = GzipRsyncable::with_window_and_modulus(4, 8).into_vecs(d1.iter().cloned());
    let b2 = GzipRsyncable::with_window_and_modulus(4, 8).into_vecs(d2.iter().cloned());

    let c1 = b1.clone().count();
    let c2 = b2.clone().count();

    /* XXX: in this contrived case, we generate the same number of blocks.
     * We should generalize this test to guess at "reasonable" differences in block size
     */
    assert_eq!(c1, 4);
    // `abs() < 1` effectively demands c1 == c2; kept as written
    assert!((c1 as i64 - c2 as i64).abs() < 1);

    /* check that some blocks match up */

    let mut blocks = HashSet::with_capacity(c1);
    let mut common_in_b1 = 0u64;
    for b in b1 {
        if !blocks.insert(b) {
            common_in_b1 += 1;
        }
    }

    println!("common in b1: {}", common_in_b1);

    let mut shared_blocks = 0u64;
    for b in b2 {
        if blocks.contains(&b) {
            shared_blocks += 1;
        }
    }

    /* XXX: this is not a generic test, we can't rely on it */
    println!("shared blocks: {}", shared_blocks);
    assert!(shared_blocks > (c1 as u64) / 2);
}