├── .github ├── dependabot.yml └── workflows │ └── ci.yml ├── .gitignore ├── .gitmodules ├── COMPARE.md ├── Cargo.toml ├── LICENSE-AGPL-3.0 ├── README.md ├── benches └── compare.rs ├── bors.toml ├── ci └── script.sh ├── examples └── generate-test-data.rs ├── rustfmt.toml ├── src ├── bup.rs ├── buzhash.rs ├── buzhash_table.rs ├── fastcdc.rs ├── gear.rs ├── gear_table │ ├── gear32.rs │ ├── gear64.rs │ └── mod.rs ├── gzip.rs ├── lib.rs ├── mii.rs ├── pigz.rs ├── ram.rs ├── range.rs ├── zpaq.rs └── zstd.rs └── tests ├── cuts.rs ├── cuts_qc.proptest-regressions ├── cuts_qc.rs ├── fastcdc.rs ├── oracle_zpaq.rs ├── qc_bh.rs ├── qc_bup.rs └── rsyncable.rs /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: cargo 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | time: "10:00" 8 | open-pull-requests-limit: 10 9 | rebase-strategy: disabled 10 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches-ignore: 4 | - '**.tmp' 5 | 6 | name: ci 7 | 8 | jobs: 9 | test: 10 | runs-on: ubuntu-20.04 11 | strategy: 12 | matrix: 13 | rust: 14 | - stable 15 | - beta 16 | - nightly 17 | - 1.47.0 18 | 19 | steps: 20 | - uses: actions/checkout@v2 21 | 22 | - uses: actions-rs/toolchain@v1 23 | with: 24 | profile: minimal 25 | toolchain: ${{ matrix.rust }} 26 | override: true 27 | 28 | - name: Cache Rust dependencies 29 | uses: Swatinem/rust-cache@v1 30 | 31 | - name: Run all tests 32 | uses: actions-rs/cargo@v1 33 | with: 34 | command: test 35 | args: --all 36 | check: 37 | runs-on: ubuntu-20.04 38 | 39 | steps: 40 | - uses: actions/checkout@v2 41 | 42 | - uses: actions-rs/toolchain@v1 43 | with: 44 | profile: minimal 45 | toolchain: beta 46 | override: true 47 | components: rustfmt, clippy 48 | 49 | - name: Cache Rust 
dependencies 50 | uses: Swatinem/rust-cache@v1 51 | 52 | - uses: actions-rs/cargo@v1 53 | with: 54 | command: fmt 55 | args: --all -- --check 56 | 57 | - uses: actions-rs/cargo@v1 58 | with: 59 | command: clippy 60 | args: -- -D warnings 61 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | Cargo.lock 3 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codyps/hash-roll/a964a874bed4398a02d6c8136cd7d0bdd8064f36/.gitmodules -------------------------------------------------------------------------------- /COMPARE.md: -------------------------------------------------------------------------------- 1 | 2 | # Algorithms 3 | 4 | - Gear 5 | - FastCDC 6 | - Zpaq 7 | - AE 8 | - RollSum 9 | - BuzHash 10 | - LMC (chunking) 11 | - RAM Chunking (Rapid Asymmetric Maximum) 12 | - doi:10.1016/j.future.2017.02.013 13 | - MII (minimal incrimental interval) 14 | - doi:10.1109/access.2019.2926195 15 | - [TTTD](https://scholarworks.sjsu.edu/cgi/viewcontent.cgi?referer=&httpsredir=1&article=1041&context=etd_projects) 16 | - [FBC](doi:10.1109/mascots.2010.37) 17 | 18 | # Algorithm Points of Comparison 19 | 20 | - Ability to constrain block size 21 | - distribution 22 | - tuneability of distribution 23 | - Speed 24 | - on different distributions 25 | - Common chunk discovery 26 | - on different distributions 27 | - Common chuck discovery after a byte shift 28 | - on different distributions 29 | - Common chuck discovery after edit 30 | - on different data distributions 31 | - under different edit kinds 32 | 33 | # Impl Features 34 | 35 | - Incrimental input: rather than require a single `&[u8]` up front, allow 36 | providing a number of `&[u8]`s over the life of the splitter/hasher. 
37 | 38 | - Slice input vs byte-at-a-time: By allowing algorithms to take in larger 39 | slices of data at a time, it enables them to potentially impliment 40 | optimizations to speed up computation. 41 | 42 | # Implimentations 43 | 44 | - [cdc](https://lib.rs/crates/cdc) 45 | - latest release: 2017-09-09 46 | - inactive development (as of 2020-06-21) 47 | - algorithm(s): "Rabin64" (polynomial based, 64-bit) 48 | - incrimental input: no 49 | - no documentation indicates incrimental input is possible 50 | - while one could use a special impl of `Iterator` that can be 51 | extended, this would only work if the `SeperatorIter` or `ChunkIter` had 52 | not emitted a final incomplete chunk/seperator. 53 | - includes `RollingHash64` trait 54 | - structure includes mutable context, no non-mutable representation 55 | - input format(s): `Iterator`, `u8` 56 | - may limit performance capability 57 | - input is fully buffered by cdc structures 58 | - provides both rolling hash and content splitting features 59 | - has _explicit_ representation for "prefilling" of the rolling hash. 60 | - includes multiple iterator adapters 61 | - splits the concept of a "seperator" (index + hash) vs a "chunk" (index + 62 | hash + size). 63 | - iterator adaptors don't generalize over rolling hashes, they are 64 | hard-coded to the `Rabin64` impl 65 | - documentation is lacking (almost universally missing) 66 | - [fastcdc](https://lib.rs/crates/fastcdc) 67 | - latest release: 2020-03-19, v1.0.3 68 | - active development (as of 2020-06-21) 69 | - algorithm(s): FastCDC 70 | - incrimental input: no 71 | - api: 72 | - input: one `&[u8]` 73 | - output: `Iterator where Chunk: (offset: usize, size: 74 | usize)`. Returns the remaining chunk as the last item even if not 75 | considered a complete chunk. 
76 | - only struct mixes mutable and immutable data, no configuration representation 77 | - "chunks" are an offset and a size 78 | - iow: no rolling hash support 79 | - single struct, no traits 80 | - provides a fixed table for fastcdc (generated via a reproducable mechanism initially) 81 | - [quickcdc](https://lib.rs/crates/quickcdc) 82 | - latest release: 2018-12-17 v1.0.0 (no other releases) 83 | - inactive development (as of 2020-06-21) 84 | - algorithm(s): AE (with modifications/extensions) 85 | - incrimental input: no 86 | - api: 87 | - input: one `&[u8]` 88 | - output: `Iterator` 89 | - no struct representation of configuration (only mixes mutable and immutable) 90 | - api: iterator over slices 91 | - single struct, no traits 92 | - includes improper use of unsafe in a non-public function (passes pointers 93 | into a function that dereferences them but the function is not marked 94 | unsafe). 95 | - [gearhash](https://lib.rs/crates/gearhash) 96 | - latest release: 2020-04-12 v0.1.3 97 | - active development (as of 2020-06-21) 98 | - algorithm(s): gear 99 | - incrimental input: yes 100 | - provides simd & scalar impls 101 | - includes a static table for gearhash 102 | - api: call `next_match()` repeatedly with new slices. Returns a 103 | `Option` indicating where a split point is (if any) in the slice 104 | passed to `next_match()`. 105 | - `Hasher` struct provides both content splitting and rolling hash features. 106 | - in-place splitting 107 | - lacks helpers present in `cdchunking` 108 | - single struct, no traits 109 | - no struct representation of configuration (only mixes mutable and immutable data) 110 | - [cdchunking](https://lib.rs/crates/cdchunking) 111 | - latest release: 2019-11-02 v1.0.0 112 | - inactive development (as of 2020-06-21) 113 | - algorithm(s): Zpaq 114 | - provides a chunker-impl trait 115 | - api: call `next_boundary()` repeatedly with new slices. Returns a 116 | `Option` indicating what a split point is (if any) in the slice. 
117 | - must explicitly call a `reset()` after a match to reset internal state 118 | for subsequent matches. 119 | - provides a `Chunker` which takes a `ChunkerImpl` and provides a number of ease-of-use apis: 120 | - from a `Read` into a `Iterator>>` 121 | - from a `Read` into a `Result>>` 122 | - from a `Read` into a series of one of `Data(&[u8])` or `End`, where the 123 | `Data(&[u8])` are references to an internal buffer and `End` indicate 124 | the end of a chunk. 125 | - from a `Read` to an iterator of (start, len, end) (ie: no data returned) 126 | - from a `&[u8]` to an `Iterator` 127 | - [rollsum](https://lib.rs/crates/rollsum) aka [rsroll](https://github.com/aidanhs/rsroll) 128 | - latest release: (commit 2019-12-22, publish 2020-09-27) v0.3.0 129 | - uncertain inactive development (as of 2020-10-08) 130 | - algorithm(s): 131 | - rollsum (based on bupsplit, based on rsync chunking) 132 | - gear 133 | - incrimental input: yes 134 | - includes a static table for gearhash 135 | - low level trait has byte-by-byte and slice based interfaces 136 | - exposes conditionality of chunk edge (ie: like a rolling-sum) in trait, 137 | but provides a helper on the specific struct that uses it's defaults. 138 | - requires explicit state resets after finding a chunk edge to find the next 139 | chunk edge (doesn't reset internal state) 140 | - api: call `find_chunk_edge()` with different slices until Some((usize, Sum)) is 141 | returned. the `usize` here is the offset after the end of the chunk (ie: 142 | start of the next chunk). 
143 | - provides access to the underlying Sum on each edge 144 | - [rededup-cdc](https://lib.rs/crates/rdedup-cdc) 145 | - `rollsum` fork 146 | - [bitar](https://lib.rs/crates/bitar) 147 | - latest release: 2020-06-09 v0.7.0 148 | - active development (as of 2020-06-21) 149 | - algorithms(s): BuzHash, RollSum 150 | - uses enum to abstract over algorithms (`Config` and `Chunker`) 151 | - includes seperate immutable "configuration object" concept (`Config`) 152 | - supports/requires use of `tokio::AsyncRead` as input 153 | - api: provide a `AsyncRead` when constructing the `Chunker`. Use the 154 | `futures::Stream>` it returns 155 | - low-level trait for each hash is byte-at-a-time 156 | - many other items included in the library (designed to support the cmdline tool `bita`) 157 | - [zvault](https://github.com/dswd/zvault) 158 | - algorithm(s): AE, fastcdc, rabin, fixed (non content defined) 159 | - low level trait requires a Read & a Write instance 160 | - provides run-time generic over creation & extraction of some details (`Chunker`) 161 | - Instantiation for each provides a seed and average size 162 | - inactive development (last change 2018-03-08 (as of 2020-05-10)) 163 | - includes many non-chunking items 164 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hash-roll" 3 | version = "0.3.0" 4 | authors = ["Cody P Schafer "] 5 | description = "Rolling hashes & Content Defined Chunking (cdc)" 6 | keywords = [ "hash", "rolling", "incremental", "split" , "cdc"] 7 | license = "AGPL-3.0-or-later" 8 | repository = "https://github.com/jmesmon/hash-roll.git" 9 | documentation = "https://docs.rs/hash-roll" 10 | include = ["Cargo.toml", "**/*.rs", "README.md", "COMPARE.md"] 11 | edition = "2018" 12 | 13 | [features] 14 | default = [ 15 | "bup", 16 | "buzhash", 17 | "fastcdc", 18 | "gear", 19 | "gzip", 20 | "mii", 21 | 
"pigz", 22 | "ram", 23 | "zpaq", 24 | "zstd" 25 | ] 26 | 27 | bup = [] 28 | buzhash = [] 29 | fastcdc = [] 30 | gear = [] 31 | gzip = [] 32 | mii = [] 33 | pigz = [] 34 | ram = [] 35 | zpaq = [] 36 | zstd = [] 37 | 38 | [dependencies] 39 | fmt-extra = "0.2" 40 | #circbuf = "0.1.4" 41 | 42 | [dev-dependencies] 43 | rand = "0.7.3" 44 | histogram = "0.6" 45 | quickcheck = "0.9" 46 | rollsum = "0.3" 47 | criterion = "0.3" 48 | rand_pcg = "0.2.1" 49 | proptest = "0.10.0" 50 | 51 | [[bench]] 52 | name = "compare" 53 | harness = false 54 | -------------------------------------------------------------------------------- /LICENSE-AGPL-3.0: -------------------------------------------------------------------------------- 1 | GNU AFFERO GENERAL PUBLIC LICENSE 2 | Version 3, 19 November 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU Affero General Public License is a free, copyleft license for 11 | software and other kinds of works, specifically designed to ensure 12 | cooperation with the community in the case of network server software. 13 | 14 | The licenses for most software and other practical works are designed 15 | to take away your freedom to share and change the works. By contrast, 16 | our General Public Licenses are intended to guarantee your freedom to 17 | share and change all versions of a program--to make sure it remains free 18 | software for all its users. 19 | 20 | When we speak of free software, we are referring to freedom, not 21 | price. Our General Public Licenses are designed to make sure that you 22 | have the freedom to distribute copies of free software (and charge for 23 | them if you wish), that you receive source code or can get it if you 24 | want it, that you can change the software or use pieces of it in new 25 | free programs, and that you know you can do these things. 
26 | 27 | Developers that use our General Public Licenses protect your rights 28 | with two steps: (1) assert copyright on the software, and (2) offer 29 | you this License which gives you legal permission to copy, distribute 30 | and/or modify the software. 31 | 32 | A secondary benefit of defending all users' freedom is that 33 | improvements made in alternate versions of the program, if they 34 | receive widespread use, become available for other developers to 35 | incorporate. Many developers of free software are heartened and 36 | encouraged by the resulting cooperation. However, in the case of 37 | software used on network servers, this result may fail to come about. 38 | The GNU General Public License permits making a modified version and 39 | letting the public access it on a server without ever releasing its 40 | source code to the public. 41 | 42 | The GNU Affero General Public License is designed specifically to 43 | ensure that, in such cases, the modified source code becomes available 44 | to the community. It requires the operator of a network server to 45 | provide the source code of the modified version running there to the 46 | users of that server. Therefore, public use of a modified version, on 47 | a publicly accessible server, gives the public access to the source 48 | code of the modified version. 49 | 50 | An older license, called the Affero General Public License and 51 | published by Affero, was designed to accomplish similar goals. This is 52 | a different license, not a version of the Affero GPL, but Affero has 53 | released a new version of the Affero GPL which permits relicensing under 54 | this license. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | TERMS AND CONDITIONS 60 | 61 | 0. Definitions. 62 | 63 | "This License" refers to version 3 of the GNU Affero General Public License. 
64 | 65 | "Copyright" also means copyright-like laws that apply to other kinds of 66 | works, such as semiconductor masks. 67 | 68 | "The Program" refers to any copyrightable work licensed under this 69 | License. Each licensee is addressed as "you". "Licensees" and 70 | "recipients" may be individuals or organizations. 71 | 72 | To "modify" a work means to copy from or adapt all or part of the work 73 | in a fashion requiring copyright permission, other than the making of an 74 | exact copy. The resulting work is called a "modified version" of the 75 | earlier work or a work "based on" the earlier work. 76 | 77 | A "covered work" means either the unmodified Program or a work based 78 | on the Program. 79 | 80 | To "propagate" a work means to do anything with it that, without 81 | permission, would make you directly or secondarily liable for 82 | infringement under applicable copyright law, except executing it on a 83 | computer or modifying a private copy. Propagation includes copying, 84 | distribution (with or without modification), making available to the 85 | public, and in some countries other activities as well. 86 | 87 | To "convey" a work means any kind of propagation that enables other 88 | parties to make or receive copies. Mere interaction with a user through 89 | a computer network, with no transfer of a copy, is not conveying. 90 | 91 | An interactive user interface displays "Appropriate Legal Notices" 92 | to the extent that it includes a convenient and prominently visible 93 | feature that (1) displays an appropriate copyright notice, and (2) 94 | tells the user that there is no warranty for the work (except to the 95 | extent that warranties are provided), that licensees may convey the 96 | work under this License, and how to view a copy of this License. If 97 | the interface presents a list of user commands or options, such as a 98 | menu, a prominent item in the list meets this criterion. 99 | 100 | 1. Source Code. 
101 | 102 | The "source code" for a work means the preferred form of the work 103 | for making modifications to it. "Object code" means any non-source 104 | form of a work. 105 | 106 | A "Standard Interface" means an interface that either is an official 107 | standard defined by a recognized standards body, or, in the case of 108 | interfaces specified for a particular programming language, one that 109 | is widely used among developers working in that language. 110 | 111 | The "System Libraries" of an executable work include anything, other 112 | than the work as a whole, that (a) is included in the normal form of 113 | packaging a Major Component, but which is not part of that Major 114 | Component, and (b) serves only to enable use of the work with that 115 | Major Component, or to implement a Standard Interface for which an 116 | implementation is available to the public in source code form. A 117 | "Major Component", in this context, means a major essential component 118 | (kernel, window system, and so on) of the specific operating system 119 | (if any) on which the executable work runs, or a compiler used to 120 | produce the work, or an object code interpreter used to run it. 121 | 122 | The "Corresponding Source" for a work in object code form means all 123 | the source code needed to generate, install, and (for an executable 124 | work) run the object code and to modify the work, including scripts to 125 | control those activities. However, it does not include the work's 126 | System Libraries, or general-purpose tools or generally available free 127 | programs which are used unmodified in performing those activities but 128 | which are not part of the work. 
For example, Corresponding Source 129 | includes interface definition files associated with source files for 130 | the work, and the source code for shared libraries and dynamically 131 | linked subprograms that the work is specifically designed to require, 132 | such as by intimate data communication or control flow between those 133 | subprograms and other parts of the work. 134 | 135 | The Corresponding Source need not include anything that users 136 | can regenerate automatically from other parts of the Corresponding 137 | Source. 138 | 139 | The Corresponding Source for a work in source code form is that 140 | same work. 141 | 142 | 2. Basic Permissions. 143 | 144 | All rights granted under this License are granted for the term of 145 | copyright on the Program, and are irrevocable provided the stated 146 | conditions are met. This License explicitly affirms your unlimited 147 | permission to run the unmodified Program. The output from running a 148 | covered work is covered by this License only if the output, given its 149 | content, constitutes a covered work. This License acknowledges your 150 | rights of fair use or other equivalent, as provided by copyright law. 151 | 152 | You may make, run and propagate covered works that you do not 153 | convey, without conditions so long as your license otherwise remains 154 | in force. You may convey covered works to others for the sole purpose 155 | of having them make modifications exclusively for you, or provide you 156 | with facilities for running those works, provided that you comply with 157 | the terms of this License in conveying all material for which you do 158 | not control copyright. Those thus making or running the covered works 159 | for you must do so exclusively on your behalf, under your direction 160 | and control, on terms that prohibit them from making any copies of 161 | your copyrighted material outside their relationship with you. 
162 | 163 | Conveying under any other circumstances is permitted solely under 164 | the conditions stated below. Sublicensing is not allowed; section 10 165 | makes it unnecessary. 166 | 167 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 168 | 169 | No covered work shall be deemed part of an effective technological 170 | measure under any applicable law fulfilling obligations under article 171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 172 | similar laws prohibiting or restricting circumvention of such 173 | measures. 174 | 175 | When you convey a covered work, you waive any legal power to forbid 176 | circumvention of technological measures to the extent such circumvention 177 | is effected by exercising rights under this License with respect to 178 | the covered work, and you disclaim any intention to limit operation or 179 | modification of the work as a means of enforcing, against the work's 180 | users, your or third parties' legal rights to forbid circumvention of 181 | technological measures. 182 | 183 | 4. Conveying Verbatim Copies. 184 | 185 | You may convey verbatim copies of the Program's source code as you 186 | receive it, in any medium, provided that you conspicuously and 187 | appropriately publish on each copy an appropriate copyright notice; 188 | keep intact all notices stating that this License and any 189 | non-permissive terms added in accord with section 7 apply to the code; 190 | keep intact all notices of the absence of any warranty; and give all 191 | recipients a copy of this License along with the Program. 192 | 193 | You may charge any price or no price for each copy that you convey, 194 | and you may offer support or warranty protection for a fee. 195 | 196 | 5. Conveying Modified Source Versions. 
197 | 198 | You may convey a work based on the Program, or the modifications to 199 | produce it from the Program, in the form of source code under the 200 | terms of section 4, provided that you also meet all of these conditions: 201 | 202 | a) The work must carry prominent notices stating that you modified 203 | it, and giving a relevant date. 204 | 205 | b) The work must carry prominent notices stating that it is 206 | released under this License and any conditions added under section 207 | 7. This requirement modifies the requirement in section 4 to 208 | "keep intact all notices". 209 | 210 | c) You must license the entire work, as a whole, under this 211 | License to anyone who comes into possession of a copy. This 212 | License will therefore apply, along with any applicable section 7 213 | additional terms, to the whole of the work, and all its parts, 214 | regardless of how they are packaged. This License gives no 215 | permission to license the work in any other way, but it does not 216 | invalidate such permission if you have separately received it. 217 | 218 | d) If the work has interactive user interfaces, each must display 219 | Appropriate Legal Notices; however, if the Program has interactive 220 | interfaces that do not display Appropriate Legal Notices, your 221 | work need not make them do so. 222 | 223 | A compilation of a covered work with other separate and independent 224 | works, which are not by their nature extensions of the covered work, 225 | and which are not combined with it such as to form a larger program, 226 | in or on a volume of a storage or distribution medium, is called an 227 | "aggregate" if the compilation and its resulting copyright are not 228 | used to limit the access or legal rights of the compilation's users 229 | beyond what the individual works permit. Inclusion of a covered work 230 | in an aggregate does not cause this License to apply to the other 231 | parts of the aggregate. 232 | 233 | 6. 
Conveying Non-Source Forms. 234 | 235 | You may convey a covered work in object code form under the terms 236 | of sections 4 and 5, provided that you also convey the 237 | machine-readable Corresponding Source under the terms of this License, 238 | in one of these ways: 239 | 240 | a) Convey the object code in, or embodied in, a physical product 241 | (including a physical distribution medium), accompanied by the 242 | Corresponding Source fixed on a durable physical medium 243 | customarily used for software interchange. 244 | 245 | b) Convey the object code in, or embodied in, a physical product 246 | (including a physical distribution medium), accompanied by a 247 | written offer, valid for at least three years and valid for as 248 | long as you offer spare parts or customer support for that product 249 | model, to give anyone who possesses the object code either (1) a 250 | copy of the Corresponding Source for all the software in the 251 | product that is covered by this License, on a durable physical 252 | medium customarily used for software interchange, for a price no 253 | more than your reasonable cost of physically performing this 254 | conveying of source, or (2) access to copy the 255 | Corresponding Source from a network server at no charge. 256 | 257 | c) Convey individual copies of the object code with a copy of the 258 | written offer to provide the Corresponding Source. This 259 | alternative is allowed only occasionally and noncommercially, and 260 | only if you received the object code with such an offer, in accord 261 | with subsection 6b. 262 | 263 | d) Convey the object code by offering access from a designated 264 | place (gratis or for a charge), and offer equivalent access to the 265 | Corresponding Source in the same way through the same place at no 266 | further charge. You need not require recipients to copy the 267 | Corresponding Source along with the object code. 
If the place to 268 | copy the object code is a network server, the Corresponding Source 269 | may be on a different server (operated by you or a third party) 270 | that supports equivalent copying facilities, provided you maintain 271 | clear directions next to the object code saying where to find the 272 | Corresponding Source. Regardless of what server hosts the 273 | Corresponding Source, you remain obligated to ensure that it is 274 | available for as long as needed to satisfy these requirements. 275 | 276 | e) Convey the object code using peer-to-peer transmission, provided 277 | you inform other peers where the object code and Corresponding 278 | Source of the work are being offered to the general public at no 279 | charge under subsection 6d. 280 | 281 | A separable portion of the object code, whose source code is excluded 282 | from the Corresponding Source as a System Library, need not be 283 | included in conveying the object code work. 284 | 285 | A "User Product" is either (1) a "consumer product", which means any 286 | tangible personal property which is normally used for personal, family, 287 | or household purposes, or (2) anything designed or sold for incorporation 288 | into a dwelling. In determining whether a product is a consumer product, 289 | doubtful cases shall be resolved in favor of coverage. For a particular 290 | product received by a particular user, "normally used" refers to a 291 | typical or common use of that class of product, regardless of the status 292 | of the particular user or of the way in which the particular user 293 | actually uses, or expects or is expected to use, the product. A product 294 | is a consumer product regardless of whether the product has substantial 295 | commercial, industrial or non-consumer uses, unless such uses represent 296 | the only significant mode of use of the product. 
297 | 298 | "Installation Information" for a User Product means any methods, 299 | procedures, authorization keys, or other information required to install 300 | and execute modified versions of a covered work in that User Product from 301 | a modified version of its Corresponding Source. The information must 302 | suffice to ensure that the continued functioning of the modified object 303 | code is in no case prevented or interfered with solely because 304 | modification has been made. 305 | 306 | If you convey an object code work under this section in, or with, or 307 | specifically for use in, a User Product, and the conveying occurs as 308 | part of a transaction in which the right of possession and use of the 309 | User Product is transferred to the recipient in perpetuity or for a 310 | fixed term (regardless of how the transaction is characterized), the 311 | Corresponding Source conveyed under this section must be accompanied 312 | by the Installation Information. But this requirement does not apply 313 | if neither you nor any third party retains the ability to install 314 | modified object code on the User Product (for example, the work has 315 | been installed in ROM). 316 | 317 | The requirement to provide Installation Information does not include a 318 | requirement to continue to provide support service, warranty, or updates 319 | for a work that has been modified or installed by the recipient, or for 320 | the User Product in which it has been modified or installed. Access to a 321 | network may be denied when the modification itself materially and 322 | adversely affects the operation of the network or violates the rules and 323 | protocols for communication across the network. 
324 | 325 | Corresponding Source conveyed, and Installation Information provided, 326 | in accord with this section must be in a format that is publicly 327 | documented (and with an implementation available to the public in 328 | source code form), and must require no special password or key for 329 | unpacking, reading or copying. 330 | 331 | 7. Additional Terms. 332 | 333 | "Additional permissions" are terms that supplement the terms of this 334 | License by making exceptions from one or more of its conditions. 335 | Additional permissions that are applicable to the entire Program shall 336 | be treated as though they were included in this License, to the extent 337 | that they are valid under applicable law. If additional permissions 338 | apply only to part of the Program, that part may be used separately 339 | under those permissions, but the entire Program remains governed by 340 | this License without regard to the additional permissions. 341 | 342 | When you convey a copy of a covered work, you may at your option 343 | remove any additional permissions from that copy, or from any part of 344 | it. (Additional permissions may be written to require their own 345 | removal in certain cases when you modify the work.) You may place 346 | additional permissions on material, added by you to a covered work, 347 | for which you have or can give appropriate copyright permission. 
348 | 349 | Notwithstanding any other provision of this License, for material you 350 | add to a covered work, you may (if authorized by the copyright holders of 351 | that material) supplement the terms of this License with terms: 352 | 353 | a) Disclaiming warranty or limiting liability differently from the 354 | terms of sections 15 and 16 of this License; or 355 | 356 | b) Requiring preservation of specified reasonable legal notices or 357 | author attributions in that material or in the Appropriate Legal 358 | Notices displayed by works containing it; or 359 | 360 | c) Prohibiting misrepresentation of the origin of that material, or 361 | requiring that modified versions of such material be marked in 362 | reasonable ways as different from the original version; or 363 | 364 | d) Limiting the use for publicity purposes of names of licensors or 365 | authors of the material; or 366 | 367 | e) Declining to grant rights under trademark law for use of some 368 | trade names, trademarks, or service marks; or 369 | 370 | f) Requiring indemnification of licensors and authors of that 371 | material by anyone who conveys the material (or modified versions of 372 | it) with contractual assumptions of liability to the recipient, for 373 | any liability that these contractual assumptions directly impose on 374 | those licensors and authors. 375 | 376 | All other non-permissive additional terms are considered "further 377 | restrictions" within the meaning of section 10. If the Program as you 378 | received it, or any part of it, contains a notice stating that it is 379 | governed by this License along with a term that is a further 380 | restriction, you may remove that term. 
If a license document contains 381 | a further restriction but permits relicensing or conveying under this 382 | License, you may add to a covered work material governed by the terms 383 | of that license document, provided that the further restriction does 384 | not survive such relicensing or conveying. 385 | 386 | If you add terms to a covered work in accord with this section, you 387 | must place, in the relevant source files, a statement of the 388 | additional terms that apply to those files, or a notice indicating 389 | where to find the applicable terms. 390 | 391 | Additional terms, permissive or non-permissive, may be stated in the 392 | form of a separately written license, or stated as exceptions; 393 | the above requirements apply either way. 394 | 395 | 8. Termination. 396 | 397 | You may not propagate or modify a covered work except as expressly 398 | provided under this License. Any attempt otherwise to propagate or 399 | modify it is void, and will automatically terminate your rights under 400 | this License (including any patent licenses granted under the third 401 | paragraph of section 11). 402 | 403 | However, if you cease all violation of this License, then your 404 | license from a particular copyright holder is reinstated (a) 405 | provisionally, unless and until the copyright holder explicitly and 406 | finally terminates your license, and (b) permanently, if the copyright 407 | holder fails to notify you of the violation by some reasonable means 408 | prior to 60 days after the cessation. 409 | 410 | Moreover, your license from a particular copyright holder is 411 | reinstated permanently if the copyright holder notifies you of the 412 | violation by some reasonable means, this is the first time you have 413 | received notice of violation of this License (for any work) from that 414 | copyright holder, and you cure the violation prior to 30 days after 415 | your receipt of the notice. 
416 | 417 | Termination of your rights under this section does not terminate the 418 | licenses of parties who have received copies or rights from you under 419 | this License. If your rights have been terminated and not permanently 420 | reinstated, you do not qualify to receive new licenses for the same 421 | material under section 10. 422 | 423 | 9. Acceptance Not Required for Having Copies. 424 | 425 | You are not required to accept this License in order to receive or 426 | run a copy of the Program. Ancillary propagation of a covered work 427 | occurring solely as a consequence of using peer-to-peer transmission 428 | to receive a copy likewise does not require acceptance. However, 429 | nothing other than this License grants you permission to propagate or 430 | modify any covered work. These actions infringe copyright if you do 431 | not accept this License. Therefore, by modifying or propagating a 432 | covered work, you indicate your acceptance of this License to do so. 433 | 434 | 10. Automatic Licensing of Downstream Recipients. 435 | 436 | Each time you convey a covered work, the recipient automatically 437 | receives a license from the original licensors, to run, modify and 438 | propagate that work, subject to this License. You are not responsible 439 | for enforcing compliance by third parties with this License. 440 | 441 | An "entity transaction" is a transaction transferring control of an 442 | organization, or substantially all assets of one, or subdividing an 443 | organization, or merging organizations. 
If propagation of a covered 444 | work results from an entity transaction, each party to that 445 | transaction who receives a copy of the work also receives whatever 446 | licenses to the work the party's predecessor in interest had or could 447 | give under the previous paragraph, plus a right to possession of the 448 | Corresponding Source of the work from the predecessor in interest, if 449 | the predecessor has it or can get it with reasonable efforts. 450 | 451 | You may not impose any further restrictions on the exercise of the 452 | rights granted or affirmed under this License. For example, you may 453 | not impose a license fee, royalty, or other charge for exercise of 454 | rights granted under this License, and you may not initiate litigation 455 | (including a cross-claim or counterclaim in a lawsuit) alleging that 456 | any patent claim is infringed by making, using, selling, offering for 457 | sale, or importing the Program or any portion of it. 458 | 459 | 11. Patents. 460 | 461 | A "contributor" is a copyright holder who authorizes use under this 462 | License of the Program or a work on which the Program is based. The 463 | work thus licensed is called the contributor's "contributor version". 464 | 465 | A contributor's "essential patent claims" are all patent claims 466 | owned or controlled by the contributor, whether already acquired or 467 | hereafter acquired, that would be infringed by some manner, permitted 468 | by this License, of making, using, or selling its contributor version, 469 | but do not include claims that would be infringed only as a 470 | consequence of further modification of the contributor version. For 471 | purposes of this definition, "control" includes the right to grant 472 | patent sublicenses in a manner consistent with the requirements of 473 | this License. 
474 | 475 | Each contributor grants you a non-exclusive, worldwide, royalty-free 476 | patent license under the contributor's essential patent claims, to 477 | make, use, sell, offer for sale, import and otherwise run, modify and 478 | propagate the contents of its contributor version. 479 | 480 | In the following three paragraphs, a "patent license" is any express 481 | agreement or commitment, however denominated, not to enforce a patent 482 | (such as an express permission to practice a patent or covenant not to 483 | sue for patent infringement). To "grant" such a patent license to a 484 | party means to make such an agreement or commitment not to enforce a 485 | patent against the party. 486 | 487 | If you convey a covered work, knowingly relying on a patent license, 488 | and the Corresponding Source of the work is not available for anyone 489 | to copy, free of charge and under the terms of this License, through a 490 | publicly available network server or other readily accessible means, 491 | then you must either (1) cause the Corresponding Source to be so 492 | available, or (2) arrange to deprive yourself of the benefit of the 493 | patent license for this particular work, or (3) arrange, in a manner 494 | consistent with the requirements of this License, to extend the patent 495 | license to downstream recipients. "Knowingly relying" means you have 496 | actual knowledge that, but for the patent license, your conveying the 497 | covered work in a country, or your recipient's use of the covered work 498 | in a country, would infringe one or more identifiable patents in that 499 | country that you have reason to believe are valid. 
500 | 501 | If, pursuant to or in connection with a single transaction or 502 | arrangement, you convey, or propagate by procuring conveyance of, a 503 | covered work, and grant a patent license to some of the parties 504 | receiving the covered work authorizing them to use, propagate, modify 505 | or convey a specific copy of the covered work, then the patent license 506 | you grant is automatically extended to all recipients of the covered 507 | work and works based on it. 508 | 509 | A patent license is "discriminatory" if it does not include within 510 | the scope of its coverage, prohibits the exercise of, or is 511 | conditioned on the non-exercise of one or more of the rights that are 512 | specifically granted under this License. You may not convey a covered 513 | work if you are a party to an arrangement with a third party that is 514 | in the business of distributing software, under which you make payment 515 | to the third party based on the extent of your activity of conveying 516 | the work, and under which the third party grants, to any of the 517 | parties who would receive the covered work from you, a discriminatory 518 | patent license (a) in connection with copies of the covered work 519 | conveyed by you (or copies made from those copies), or (b) primarily 520 | for and in connection with specific products or compilations that 521 | contain the covered work, unless you entered into that arrangement, 522 | or that patent license was granted, prior to 28 March 2007. 523 | 524 | Nothing in this License shall be construed as excluding or limiting 525 | any implied license or other defenses to infringement that may 526 | otherwise be available to you under applicable patent law. 527 | 528 | 12. No Surrender of Others' Freedom. 529 | 530 | If conditions are imposed on you (whether by court order, agreement or 531 | otherwise) that contradict the conditions of this License, they do not 532 | excuse you from the conditions of this License. 
If you cannot convey a 533 | covered work so as to satisfy simultaneously your obligations under this 534 | License and any other pertinent obligations, then as a consequence you may 535 | not convey it at all. For example, if you agree to terms that obligate you 536 | to collect a royalty for further conveying from those to whom you convey 537 | the Program, the only way you could satisfy both those terms and this 538 | License would be to refrain entirely from conveying the Program. 539 | 540 | 13. Remote Network Interaction; Use with the GNU General Public License. 541 | 542 | Notwithstanding any other provision of this License, if you modify the 543 | Program, your modified version must prominently offer all users 544 | interacting with it remotely through a computer network (if your version 545 | supports such interaction) an opportunity to receive the Corresponding 546 | Source of your version by providing access to the Corresponding Source 547 | from a network server at no charge, through some standard or customary 548 | means of facilitating copying of software. This Corresponding Source 549 | shall include the Corresponding Source for any work covered by version 3 550 | of the GNU General Public License that is incorporated pursuant to the 551 | following paragraph. 552 | 553 | Notwithstanding any other provision of this License, you have 554 | permission to link or combine any covered work with a work licensed 555 | under version 3 of the GNU General Public License into a single 556 | combined work, and to convey the resulting work. The terms of this 557 | License will continue to apply to the part which is the covered work, 558 | but the work with which it is combined will remain governed by version 559 | 3 of the GNU General Public License. 560 | 561 | 14. Revised Versions of this License. 562 | 563 | The Free Software Foundation may publish revised and/or new versions of 564 | the GNU Affero General Public License from time to time. 
Such new versions 565 | will be similar in spirit to the present version, but may differ in detail to 566 | address new problems or concerns. 567 | 568 | Each version is given a distinguishing version number. If the 569 | Program specifies that a certain numbered version of the GNU Affero General 570 | Public License "or any later version" applies to it, you have the 571 | option of following the terms and conditions either of that numbered 572 | version or of any later version published by the Free Software 573 | Foundation. If the Program does not specify a version number of the 574 | GNU Affero General Public License, you may choose any version ever published 575 | by the Free Software Foundation. 576 | 577 | If the Program specifies that a proxy can decide which future 578 | versions of the GNU Affero General Public License can be used, that proxy's 579 | public statement of acceptance of a version permanently authorizes you 580 | to choose that version for the Program. 581 | 582 | Later license versions may give you additional or different 583 | permissions. However, no additional obligations are imposed on any 584 | author or copyright holder as a result of your choosing to follow a 585 | later version. 586 | 587 | 15. Disclaimer of Warranty. 588 | 589 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 590 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 594 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 595 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 597 | 598 | 16. Limitation of Liability. 
599 | 600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 608 | SUCH DAMAGES. 609 | 610 | 17. Interpretation of Sections 15 and 16. 611 | 612 | If the disclaimer of warranty and limitation of liability provided 613 | above cannot be given local legal effect according to their terms, 614 | reviewing courts shall apply local law that most closely approximates 615 | an absolute waiver of all civil liability in connection with the 616 | Program, unless a warranty or assumption of liability accompanies a 617 | copy of the Program in return for a fee. 618 | 619 | END OF TERMS AND CONDITIONS 620 | 621 | How to Apply These Terms to Your New Programs 622 | 623 | If you develop a new program, and you want it to be of the greatest 624 | possible use to the public, the best way to achieve this is to make it 625 | free software which everyone can redistribute and change under these terms. 626 | 627 | To do so, attach the following notices to the program. It is safest 628 | to attach them to the start of each source file to most effectively 629 | state the exclusion of warranty; and each file should have at least 630 | the "copyright" line and a pointer to where the full notice is found. 
631 | 632 | 633 | Copyright (C) 634 | 635 | This program is free software: you can redistribute it and/or modify 636 | it under the terms of the GNU Affero General Public License as published by 637 | the Free Software Foundation, either version 3 of the License, or 638 | (at your option) any later version. 639 | 640 | This program is distributed in the hope that it will be useful, 641 | but WITHOUT ANY WARRANTY; without even the implied warranty of 642 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 643 | GNU Affero General Public License for more details. 644 | 645 | You should have received a copy of the GNU Affero General Public License 646 | along with this program. If not, see . 647 | 648 | Also add information on how to contact you by electronic and paper mail. 649 | 650 | If your software can interact with users remotely through a computer 651 | network, you should also make sure that it provides a way for users to 652 | get its source. For example, if your program is a web application, its 653 | interface could display a "Source" link that leads users to an archive 654 | of the code. There are many ways you could offer source, and different 655 | solutions will be better for different programs; see section 13 for the 656 | specific requirements. 657 | 658 | You should also get your employer (if you work as a programmer) or school, 659 | if any, to sign a "copyright disclaimer" for the program, if necessary. 660 | For more information on this, and how to apply and follow the GNU AGPL, see 661 | . 662 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # hash-roll 2 | 3 | Provides a generic API for abstracting over various implimentations of content 4 | defined chunking. Also provides implimentations of a number of content defined 5 | chunking algorithms. 
6 | 7 | ## Metrics 8 | 9 | - DER: duplicate elimination ratio 10 | 11 | ## CDC References 12 | 13 | - https://www.usenix.org/legacy/events/fast10/tech/full_papers/kruus.pdf 14 | 15 | ## [Comparison of the Chunking options avaliable in Rust](COMPARE.md) 16 | 17 | 18 | ## License 19 | 20 | This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 21 | 22 | This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. 23 | 24 | You should have received a copy of the GNU Affero General Public License along with this program. If not, see . 25 | -------------------------------------------------------------------------------- /benches/compare.rs: -------------------------------------------------------------------------------- 1 | /* 2 | */ 3 | use criterion::{black_box, criterion_group, criterion_main, Criterion}; 4 | use hash_roll::{ChunkIncr, Splitter}; 5 | use rand; 6 | 7 | /* 8 | pub fn split_hashmap(b: &mut Criterion, bytes: usize, init: F) 9 | where F: Fn(&[u8]) -> I, 10 | I: FnMut() -> Option 11 | { 12 | use rand::RngCore; 13 | use std::collections::HashMap; 14 | let mut rng = rand::thread_rng(); 15 | let mut d = vec![0u8; bytes]; 16 | let mut lenghts = HashMap::new(); 17 | b.iter(|| { 18 | rng.fill_bytes(&mut d); 19 | 20 | let mut i = init(&d[..]); 21 | loop { 22 | match i() { 23 | Some(l) => { 24 | let mut v = lenghts.entry(l).or_insert(0u64); 25 | *v = *v + 1; 26 | }, 27 | None => { 28 | break; 29 | } 30 | } 31 | } 32 | }) 33 | 34 | /* TODO: analize length data */ 35 | } 36 | */ 37 | 38 | /* 39 | pub fn split(b: &mut Criterion, bytes: usize, _name: &'static str, init: F) 40 | where for<'a> F: Fn(&'a 
[u8]) -> Box Option + 'a> 41 | { 42 | use rand::RngCore; 43 | let mut rng = rand::thread_rng(); 44 | let mut d = vec![0u8; bytes]; 45 | b.iter(|| { 46 | rng.fill_bytes(&mut d); 47 | let mut i = test::black_box(init(&d[..])); 48 | loop { 49 | match test::black_box(i()) { 50 | None => { 51 | break; 52 | }, 53 | _ => {}, 54 | } 55 | } 56 | }); 57 | } 58 | */ 59 | 60 | pub fn split_histogram(c: &mut Criterion, bytes: usize, name: &'static str, init: F) 61 | where 62 | for<'a> F: Fn(&'a [u8]) -> Box Option + 'a>, 63 | { 64 | use histogram::*; 65 | use rand::RngCore; 66 | use rand::SeedableRng; 67 | let mut rng = rand_pcg::Pcg64::from_rng(rand::thread_rng()).unwrap(); 68 | let mut d = vec![0u8; bytes]; 69 | rng.fill_bytes(&mut d); 70 | let mut lenghts = Histogram::new(); 71 | c.bench_function(name, |b| { 72 | b.iter(|| { 73 | let mut i = black_box(init(&d[..])); 74 | loop { 75 | match black_box(i()) { 76 | Some(l) => { 77 | lenghts.increment(l).unwrap(); 78 | } 79 | None => { 80 | break; 81 | } 82 | } 83 | } 84 | }) 85 | }); 86 | 87 | /* FIXME: for some reason cargo runs this outer code many times over instead of just running 88 | * the inner code many times over, causing this info to be printied far to much. 
89 | */ 90 | /* 91 | println!("{}({} bytes) p50: {} bytes, p90: {} bytes, p99: {} bytes, p999: {} bytes", 92 | name, bytes, 93 | lenghts.percentile(50.0).unwrap(), 94 | lenghts.percentile(90.0).unwrap(), 95 | lenghts.percentile(99.0).unwrap(), 96 | lenghts.percentile(99.9).unwrap(), 97 | ); 98 | */ 99 | } 100 | 101 | /* 4 MiB */ 102 | const BENCH_BYTES: usize = 1024 * 1024 / 2; 103 | 104 | //const BENCH_RANGE : Range = Range { first: Bound::Unbounded, last: Bound::Unbounded }; 105 | 106 | fn bench_rsyncable_vecs(c: &mut Criterion) { 107 | use rand::RngCore; 108 | let mut rng = rand::thread_rng(); 109 | let mut d = vec![0u8; BENCH_BYTES]; 110 | c.bench_function("rsyncable vecs", |b| { 111 | b.iter(|| { 112 | rng.fill_bytes(&mut d); 113 | let s = hash_roll::rsyncable::Rsyncable::default().into_vecs(d.iter().cloned()); 114 | for _ in s {} 115 | }) 116 | }); 117 | } 118 | 119 | fn bench_rsyncable_slices(c: &mut Criterion) { 120 | use rand::RngCore; 121 | let mut rng = rand::thread_rng(); 122 | let mut d = vec![0u8; BENCH_BYTES]; 123 | c.bench_function("rsyncable slices", |b| { 124 | b.iter(|| { 125 | rng.fill_bytes(&mut d); 126 | let s = hash_roll::rsyncable::Rsyncable::default().into_slices(&d[..]); 127 | for _ in s {} 128 | }) 129 | }); 130 | } 131 | 132 | fn bench_zpaq(b: &mut Criterion) { 133 | split_histogram(b, BENCH_BYTES, "bench_zpaq", |data| { 134 | let z = hash_roll::zpaq::Zpaq::default(); 135 | let mut c = &data[..]; 136 | Box::new(move || { 137 | let (a, b) = z.split(c); 138 | if b.is_empty() || a.is_empty() { 139 | None 140 | } else { 141 | c = b; 142 | Some(b.len() as u64) 143 | } 144 | }) 145 | }); 146 | } 147 | 148 | fn bench_zpaq_iter_slice(b: &mut Criterion) { 149 | split_histogram(b, BENCH_BYTES, "zpaq_iter_slice", |data| { 150 | let z = hash_roll::zpaq::Zpaq::default(); 151 | let mut zi = z.into_slices(data); 152 | Box::new(move || zi.next().map(|x| x.len() as u64)) 153 | }) 154 | } 155 | 156 | fn bench_zpaq_iter_vec(b: &mut Criterion) { 157 | 
split_histogram(b, BENCH_BYTES, "zpaq_iter_vec", |data| { 158 | let z = hash_roll::zpaq::Zpaq::default(); 159 | let mut zi = z.into_vecs(data.iter().cloned()); 160 | Box::new(move || zi.next().map(|x| x.len() as u64)) 161 | }) 162 | } 163 | 164 | fn bench_rollsum_bup(b: &mut Criterion) { 165 | split_histogram(b, BENCH_BYTES, "rollsum_bup", |data| { 166 | let mut z = rollsum::Bup::default(); 167 | let mut pos = 0; 168 | Box::new(move || { 169 | let l = z.find_chunk_edge(&data[pos..]).map(|x| (x as u64) + 1); 170 | match l { 171 | Some(x) => pos += x as usize, 172 | None => {} 173 | } 174 | l 175 | }) 176 | }) 177 | } 178 | 179 | fn bench_bup(b: &mut Criterion) { 180 | split_histogram(b, BENCH_BYTES, "bup", |data| { 181 | let mut z = hash_roll::bup::RollSumIncr::default(); 182 | let mut pos = 0; 183 | Box::new(move || { 184 | let l = z.push(&data[pos..]); 185 | match l { 186 | Some(x) => pos += x, 187 | None => {} 188 | } 189 | l.map(|x| x as u64) 190 | }) 191 | }) 192 | } 193 | 194 | criterion_group!( 195 | benches, 196 | bench_bup, 197 | bench_rollsum_bup, 198 | bench_rsyncable_vecs, 199 | bench_zpaq, 200 | bench_zpaq_iter_vec, 201 | bench_zpaq_iter_slice, 202 | bench_rsyncable_slices 203 | ); 204 | criterion_main!(benches); 205 | -------------------------------------------------------------------------------- /bors.toml: -------------------------------------------------------------------------------- 1 | status = [ 2 | "test (stable)", 3 | "test (beta)", 4 | "test (1.47.0)", 5 | "check", 6 | ] 7 | cut_body_after = "---" 8 | delete_merged_branches = true 9 | -------------------------------------------------------------------------------- /ci/script.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/sh 2 | 3 | run_cargo() { 4 | if [ -n "${FEATURES:-}" ]; then 5 | cargo "$@" --verbose --features="$FEATURES" 6 | else 7 | cargo "$@" --verbose 8 | fi 9 | } 10 | 11 | run_cargo test 12 | -------------------------------------------------------------------------------- /examples/generate-test-data.rs: -------------------------------------------------------------------------------- 1 | use rand::RngCore; 2 | use rand_pcg::Pcg64; 3 | use std::io::Write; 4 | 5 | fn main() { 6 | let mut args = std::env::args(); 7 | if args.len() != 3 { 8 | eprintln!("usage: generate-test-data "); 9 | std::process::exit(1); 10 | } 11 | 12 | let _ = args.next().unwrap(); 13 | let seed: u128 = args.next().unwrap().parse().unwrap(); 14 | let len: usize = args.next().unwrap().parse().unwrap(); 15 | 16 | let mut fill_rng = Pcg64::new(seed, 0xa02bdbf7bb3c0a7ac28fa16a64abf96); 17 | // note: original len = 8192 * 4 = 1024 * 32 18 | let mut buf = vec![0u8; 1024 * len]; 19 | fill_rng.fill_bytes(&mut buf[..]); 20 | std::io::stdout().lock().write_all(&buf[..]).unwrap(); 21 | } 22 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | edition="2018" 2 | use_try_shorthand=true 3 | use_field_init_shorthand=true 4 | -------------------------------------------------------------------------------- /src/bup.rs: -------------------------------------------------------------------------------- 1 | use crate::{Chunk, ChunkIncr, ToChunkIncr}; 2 | use std::fmt; 3 | use std::num::Wrapping; 4 | 5 | const BLOBBITS: u8 = 13; 6 | const BLOBSIZE: u32 = 1 << (BLOBBITS as u32); 7 | 8 | const WINDOW_BITS: u8 = 6; 9 | const WINDOW_SIZE: usize = 1 << (WINDOW_BITS as usize); 10 | 11 | const ROLLSUM_CHAR_OFFSET: usize = 31; 12 | 13 | /// Rolling sum used by [`Bup`] for splitting 14 | /// 15 | /// - https://github.com/bup/bup/blob/0ab7c3a958729b4723e6fe254da771aff608c2bf/lib/bup/bupsplit.c 16 | 
/// - https://github.com/bup/bup/blob/0ab7c3a958729b4723e6fe254da771aff608c2bf/lib/bup/bupsplit.h 17 | /// 18 | #[derive(Debug, Clone, PartialEq, Eq)] 19 | pub struct RollSum { 20 | window_len: usize, 21 | } 22 | 23 | impl ToChunkIncr for RollSum { 24 | type Incr = RollSumIncr; 25 | 26 | fn to_chunk_incr(&self) -> Self::Incr { 27 | self.into() 28 | } 29 | } 30 | 31 | impl RollSum { 32 | pub fn with_window(window_len: usize) -> Self { 33 | Self { window_len } 34 | } 35 | } 36 | 37 | impl Chunk for RollSum { 38 | type SearchState = RollSumSearchState; 39 | 40 | fn to_search_state(&self) -> Self::SearchState { 41 | self.into() 42 | } 43 | 44 | fn find_chunk_edge( 45 | &self, 46 | state: &mut Self::SearchState, 47 | data: &[u8], 48 | ) -> (Option, usize) { 49 | for i in state.offset..data.len() { 50 | let a = data[i]; 51 | let d = if i >= self.window_len { 52 | data[i - self.window_len] 53 | } else { 54 | 0 55 | }; 56 | 57 | state.state.add(self.window_len, d, a); 58 | 59 | if state.state.at_split() { 60 | state.reset(self); 61 | return (Some(i + 1), i + 1); 62 | } 63 | } 64 | 65 | // keep k elements = discard all but k 66 | let discard_ct = data.len().saturating_sub(self.window_len); 67 | state.offset = data.len() - discard_ct; 68 | (None, discard_ct) 69 | } 70 | } 71 | 72 | #[derive(Debug, Clone, PartialEq, Eq)] 73 | pub struct RollSumState { 74 | // NOTE: in bup, these are `unsigned`, but masking indicates they'll end up being used as 75 | // u16's. In `librsync`, these are `uint_fast16_t`, which end up being u32 on most platforms. 76 | // Both only require `u16` values to be represented. We use `u32` here as it's likely to be 77 | // somewhat more performant, but this should be examined 78 | s1: Wrapping, 79 | s2: Wrapping, 80 | } 81 | 82 | impl From<&RollSum> for RollSumState { 83 | fn from(s: &RollSum) -> Self { 84 | let ws = Wrapping(s.window_len as u32); 85 | // NOTE: bup uses this initialization, but librsync uses zeros. 
86 | // 87 | // I believe the idea is to allow a slightly different implimentation of the "setup" 88 | // portion of the processing (ie: before the window is filled) 89 | Self { 90 | s1: ws * Wrapping(ROLLSUM_CHAR_OFFSET as u32), 91 | s2: ws * (ws - Wrapping(1)) * Wrapping(ROLLSUM_CHAR_OFFSET as u32), 92 | } 93 | } 94 | } 95 | 96 | impl RollSumState { 97 | fn reset(&mut self, params: &RollSum) { 98 | *self = params.into() 99 | } 100 | } 101 | 102 | #[derive(Debug, Clone, PartialEq, Eq)] 103 | pub struct RollSumSearchState { 104 | state: RollSumState, 105 | offset: usize, 106 | } 107 | 108 | impl From<&RollSum> for RollSumSearchState { 109 | fn from(s: &RollSum) -> Self { 110 | Self { 111 | state: s.into(), 112 | offset: 0, 113 | } 114 | } 115 | } 116 | 117 | impl RollSumSearchState { 118 | fn reset(&mut self, params: &RollSum) { 119 | self.offset = 0; 120 | self.state.reset(params); 121 | } 122 | } 123 | 124 | impl Default for RollSum { 125 | fn default() -> Self { 126 | Self::with_window(WINDOW_SIZE) 127 | } 128 | } 129 | 130 | /// Incrimental instance of [`RollSum`] 131 | /// 132 | /// Performance note: Bup's Roll sum algorithm requires tracking the entire window. As a result, 133 | /// this includes a circular buffer which all inputs are copied through. If your use case allows 134 | /// it, use the non-incrimental variant for improved performance. 
135 | #[derive(Clone, PartialEq, Eq)] 136 | pub struct RollSumIncr { 137 | state: RollSumState, 138 | 139 | /// window offset 140 | wofs: Wrapping, 141 | window: Box<[u8]>, 142 | } 143 | 144 | impl fmt::Debug for RollSumIncr { 145 | fn fmt(&self, f: &mut ::std::fmt::Formatter<'_>) -> Result<(), ::std::fmt::Error> { 146 | f.debug_struct("RollSumIncr") 147 | .field("state", &self.state) 148 | .field("window", &::fmt_extra::Hs(&self.window[..])) 149 | .field("wofs", &self.wofs) 150 | .finish() 151 | } 152 | } 153 | 154 | impl From<&RollSum> for RollSumIncr { 155 | fn from(params: &RollSum) -> Self { 156 | Self { 157 | state: params.into(), 158 | window: vec![0; params.window_len].into_boxed_slice(), 159 | wofs: Wrapping(0), 160 | } 161 | } 162 | } 163 | 164 | impl Default for RollSumIncr { 165 | fn default() -> Self { 166 | (&RollSum::default()).into() 167 | } 168 | } 169 | 170 | impl RollSumState { 171 | fn add(&mut self, window_len: usize, drop: u8, add: u8) { 172 | let d = Wrapping(drop as u32); 173 | self.s1 += Wrapping(add as u32); 174 | self.s1 -= d; 175 | self.s2 += self.s1; 176 | self.s2 -= Wrapping(window_len as u32) * (d + Wrapping(ROLLSUM_CHAR_OFFSET as u32)); 177 | } 178 | 179 | fn digest(&self) -> u32 { 180 | (self.s1.0 << 16) | (self.s2.0 & 0xffff) 181 | } 182 | 183 | fn at_split(&self) -> bool { 184 | (self.digest() & (BLOBSIZE - 1)) == (BLOBSIZE - 1) 185 | } 186 | } 187 | 188 | impl RollSumIncr { 189 | pub fn digest(&self) -> u32 { 190 | self.state.digest() 191 | } 192 | 193 | fn add(&mut self, drop: u8, add: u8) { 194 | self.state.add(self.window.len(), drop, add); 195 | } 196 | 197 | pub fn roll_byte(&mut self, ch: u8) { 198 | let w = self.window[self.wofs.0]; 199 | self.add(w, ch); 200 | self.window[self.wofs.0] = ch; 201 | self.wofs = Wrapping((self.wofs + Wrapping(1)).0 & (self.window.len() - 1)); 202 | } 203 | 204 | #[cfg(test)] 205 | pub(crate) fn roll(&mut self, data: &[u8]) { 206 | for &i in data.iter() { 207 | self.roll_byte(i); 208 | } 209 | 
} 210 | 211 | /* 212 | fn sum(data: &[u8]) -> u32 { 213 | let mut x = Self::default(); 214 | x.roll(data); 215 | x.digest() 216 | } 217 | */ 218 | 219 | pub fn at_split(&self) -> bool { 220 | self.state.at_split() 221 | } 222 | } 223 | 224 | impl ChunkIncr for RollSumIncr { 225 | fn push(&mut self, data: &[u8]) -> Option { 226 | for (i, &v) in data.iter().enumerate() { 227 | self.roll_byte(v); 228 | if self.at_split() { 229 | return Some(i + 1); 230 | } 231 | } 232 | 233 | None 234 | } 235 | } 236 | 237 | #[cfg(test)] 238 | mod test { 239 | use super::*; 240 | use rand::RngCore; 241 | use rollsum::Engine; 242 | 243 | #[test] 244 | fn rs() { 245 | let mut m = RollSumIncr::default(); 246 | m.roll_byte(3); 247 | assert_eq!(m.digest(), 130279491); 248 | } 249 | 250 | #[test] 251 | fn compare_rollsum() { 252 | let mut m1 = RollSumIncr::default(); 253 | let mut m2 = rollsum::Bup::default(); 254 | 255 | assert_eq!(m1.digest(), m2.digest()); 256 | 257 | m1.roll_byte(4); 258 | m2.roll_byte(4); 259 | 260 | assert_eq!(m1.digest(), m2.digest()); 261 | 262 | m1.roll_byte(18); 263 | m2.roll_byte(18); 264 | 265 | assert_eq!(m1.digest(), m2.digest()); 266 | 267 | let mut r = rand::thread_rng(); 268 | let mut b = [0u8; 2048]; 269 | 270 | r.fill_bytes(&mut b); 271 | 272 | for (i, &v) in b.iter().enumerate() { 273 | m1.roll_byte(v); 274 | m2.roll_byte(v); 275 | println!("i={}, v={}", i, v); 276 | assert_eq!(m1.digest(), m2.digest()); 277 | } 278 | 279 | m1.roll(&b); 280 | m2.roll(&b); 281 | 282 | assert_eq!(m1.digest(), m2.digest()); 283 | } 284 | 285 | #[test] 286 | fn compare_bup() { 287 | use super::ChunkIncr; 288 | let mut m1 = RollSumIncr::default(); 289 | let mut m2 = rollsum::Bup::default(); 290 | 291 | let mut r = rand::thread_rng(); 292 | let mut b = [0u8; 2048]; 293 | 294 | r.fill_bytes(&mut b); 295 | 296 | let mut x = &b[..]; 297 | loop { 298 | let v1 = m1.push(&x); 299 | let v2 = m2.find_chunk_edge(&x); 300 | assert_eq!(v1, v2.map(|x| x.0)); 301 | 302 | match v1 { 303 | 
None => break, 304 | Some(v) => { 305 | x = &x[v..]; 306 | } 307 | } 308 | } 309 | } 310 | } 311 | -------------------------------------------------------------------------------- /src/buzhash.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "buzhash")] 2 | 3 | //! BuzHash (aka Cyclic Polynomial Hashing) is a window based rolling hash 4 | //! 5 | //! BuzHash with various chunk-splitting methods is used in: 6 | //! - [Borg](https://github.com/borgbackup/borg) 7 | //! - https://github.com/borgbackup/borg/blob/master/src/borg/_chunker.c 8 | //! - [Attic](https://github.com/jborg/attic) 9 | //! - https://github.com/jborg/attic/blob/master/attic/_chunker.c 10 | //! - [silvasur/buzhash](https://github.com/silvasur/buzhash), in turn used in: 11 | //! - [attic-labs/nom](https://github.com/attic-labs/noms/blob/26620a34bc8c95812037588869d4790b5581b34d/go/types/rolling_value_hasher.go#L15-L21) 12 | //! - [dolt](https://github.com/liquidata-inc/dolt) 13 | //! - [casync](https://github.com/systemd/casync/blob/master/src/cachunker.c) 14 | //! 15 | //! Documentation: 16 | //! 17 | //! - [Recursive Hashing Functions for n-Grams, JONATHAN D. COHEN](https://www.csee.umbc.edu/courses/graduate/676/recursivehashingp291-cohen) 18 | //! - ["Cyclic Polynomial", Rolling Hashes, Wikipedia](https://en.wikipedia.org/wiki/Rolling_hash#cite_ref-3) 19 | //! 20 | use crate::{Chunk, ChunkIncr, ToChunkIncr}; 21 | use std::fmt; 22 | use std::num::Wrapping; 23 | /* Cyclic polynomial (buzhash) 24 | * 25 | * H = s ** (k -1) (h(c_1)) ^ s**(k-2)(h(c_2)) ^ ... ^ s(h(c_(k-1))) ^ h(c_k) 26 | * where s(x) is a barrel shift of x (ABCDEFG becomes BCDEFGA, where each letter is a bit) 27 | * s**y(x) is application of s(n) y times. 
28 | * 29 | * Application: 30 | * 31 | * - H <- s(H) 32 | * - c_1 <- s**k(h(c_1)) 33 | * - H <- s(H) ^ s**k(h(c_1)) ^ h(c_(k+1)) 34 | * 35 | * Where c_1 is the character to remove, 36 | * c_(k+1) is the character to add 37 | * 38 | * Parameters: 39 | * - k: number of inputs to contain (can be un-capped?) 40 | * - h: a hash function from inputs to integers on [2, 2**L) 41 | * 42 | * State: 43 | * - every input contained in the hash (if removal is required) 44 | * - previous hash result 45 | */ 46 | 47 | /// Describes an instance of BuzHash (aka cyclic polynomial hash). 48 | /// 49 | /// Provides parameterization over the window size (`k`), hash function (`h`), chunk edge mask, and 50 | /// max chunk size. 51 | /// 52 | /// Uses fixed 32-bit width for the hash. 53 | /// 54 | /// The trait [`BuzHashHash`] provides the internal hash function, see the implimentations of it 55 | /// for built-in hash options (which include both `Borg` and `silvasur/buzhash`'s internal hash 56 | /// tables). 57 | /// 58 | /// Note that it's helpful for `k` to be prime to prevent repeating strings form resulting 59 | /// in total cancelation of the internal hash, which can cause overly long chunks. 60 | /// 61 | /// Adjusting `mask` changes the average chunk size. 62 | /// 63 | /// # Performance 64 | /// 65 | /// [`BuzHash`] requires storing bytes equal to it's window size (`k`). Because of this, 66 | /// [`BuzHashIncr`] may have poor performance compared to [`BuzHash::find_chunk_edge()`]. 
67 | #[derive(Debug, Clone, PartialEq, Eq)] 68 | pub struct BuzHash { 69 | /// number of characters to consider at once 70 | k: usize, 71 | 72 | /// A hash function over a single byte that emits a 32-bit value 73 | h: H, 74 | 75 | /// the 1 bits indicates the bit in the hash which must be 1 to form a chunk edge 76 | /// (called `pattern` in `attic-labs/nom`) 77 | mask: u32, 78 | 79 | /// if the index grows _above_ this size, a chunk edge is formed 80 | max_chunk_size: u64, 81 | } 82 | 83 | impl BuzHash { 84 | /// Create an instance with the given capacity (k) and chunk termination `mask`, and a internal 85 | /// `hash` function. 86 | /// 87 | /// `capacity` is the number of bytes that are taken into account for a given hash. 88 | /// `mask` affects how chunk edges are determined. 89 | /// `hash` is applied to each byte of input prior to mixing into the rolling hash. 90 | pub fn new(capacity: usize, mask: u32, hash: H, max_chunk_size: u64) -> Self { 91 | assert!(capacity > 0); 92 | BuzHash { 93 | k: capacity, 94 | h: hash, 95 | mask, 96 | max_chunk_size, 97 | } 98 | } 99 | 100 | // fn new_attic() 101 | // fn new_bup() 102 | } 103 | 104 | impl<'a> BuzHash> { 105 | /// Create a buzhash instance using defaults from attic-labs/nom version 7.17 106 | /// 107 | /// - `k: 67` 108 | /// - `hash` is the `silvasur/buzhash` table 109 | /// - `mask: 1<<12 -1` 110 | /// - `max_chunk_size: 1 << 24` 111 | pub fn new_nom(salt: u8) -> Self { 112 | BuzHash::new( 113 | 67, 114 | (1 << 12u32) - 1, 115 | BuzHashTableByteSaltHash::from((salt, &crate::buzhash_table::GO_BUZHASH)), 116 | 1 << 24, 117 | ) 118 | } 119 | } 120 | 121 | impl Chunk for BuzHash { 122 | type SearchState = BuzHashSearchState; 123 | 124 | fn to_search_state(&self) -> Self::SearchState { 125 | Self::SearchState::default() 126 | } 127 | 128 | fn find_chunk_edge( 129 | &self, 130 | state: &mut Self::SearchState, 131 | data: &[u8], 132 | ) -> (Option, usize) { 133 | for i in state.offset..data.len() { 134 | 
state.state.add_buf(data, self, i); 135 | 136 | if (state.state.h & self.mask) == self.mask { 137 | state.reset(); 138 | return (Some(i + 1), i + 1); 139 | } 140 | 141 | /* 142 | * broken: `i` is not the number of bytes since prev chunk. 143 | * need to track internal last chunk 144 | if i as u64 > self.max_chunk_size { 145 | state.reset(); 146 | println!(" <- CHUNK: {}", i + 1); 147 | return (Some(i + 1), i + 1); 148 | } 149 | */ 150 | } 151 | 152 | // keep k elements = discard all but k 153 | let discard_ct = data.len().saturating_sub(self.k); 154 | state.offset = data.len() - discard_ct; 155 | (None, discard_ct) 156 | } 157 | } 158 | 159 | impl From<&BuzHash> for BuzHashIncr { 160 | fn from(src: &BuzHash) -> Self { 161 | src.clone().into() 162 | } 163 | } 164 | 165 | impl ToChunkIncr for BuzHash { 166 | type Incr = BuzHashIncr; 167 | fn to_chunk_incr(&self) -> Self::Incr { 168 | self.into() 169 | } 170 | } 171 | 172 | #[derive(Debug, Clone, PartialEq, Eq, Default)] 173 | pub struct BuzHashSearchState { 174 | offset: usize, 175 | state: BuzHashState, 176 | } 177 | 178 | impl BuzHashSearchState { 179 | fn reset(&mut self) { 180 | self.offset = 0; 181 | self.state.reset(); 182 | } 183 | } 184 | 185 | #[derive(Debug, Clone, PartialEq, Eq, Default)] 186 | struct BuzHashState { 187 | /// current value of the hash. 
188 | h: u32, 189 | } 190 | 191 | impl BuzHashState { 192 | fn reset(&mut self) { 193 | self.h = 0; 194 | } 195 | 196 | fn add_buf(&mut self, data: &[u8], params: &BuzHash, i: usize) { 197 | if i >= params.k { 198 | // need to find and "remove" a entry 199 | let drop_i = i - params.k; 200 | let drop = data[drop_i]; 201 | self.add_overflow(params, data[i], drop); 202 | } else { 203 | // no removal 204 | self.add(params, data[i]); 205 | } 206 | } 207 | 208 | // insert, assuming no overflow 209 | fn add(&mut self, params: &BuzHash, v: u8) { 210 | self.h = self.h.rotate_left(1) ^ params.h.hash(v); 211 | } 212 | 213 | // insert with overflow 214 | fn add_overflow(&mut self, params: &BuzHash, add_v: u8, remove_v: u8) { 215 | let h = self.h.rotate_left(1); 216 | // need to find and "remove" a entry 217 | let drop = params.h.hash(remove_v).rotate_left((params.k % 8) as u32); 218 | self.h = h ^ drop ^ params.h.hash(add_v); 219 | } 220 | } 221 | 222 | /// Self-contained buzhash which buffers it's window of values internally 223 | /// 224 | /// Note that this will be less efficient than using [`BuzHash`] on a slice directly, 225 | /// but may be more convenient. 226 | #[derive(Debug, Clone, PartialEq, Eq)] 227 | pub struct BuzHashIncr { 228 | params: BuzHash, 229 | state: BuzHashState, 230 | buf: Box<[u8]>, 231 | buf_idx: Wrapping, 232 | input_idx: u64, 233 | } 234 | 235 | impl ChunkIncr for BuzHashIncr { 236 | /// Return the index in `data` immeidately following the hash matching. 237 | /// 238 | /// Note that you can call this multiple times to examine "subsequent" `data` slices, but the 239 | /// index returned will always refer to the current `data` slice. 
240 | fn push(&mut self, data: &[u8]) -> Option { 241 | for (i, &v) in data.iter().enumerate() { 242 | self.push_byte(v); 243 | if (self.state.h & self.params.mask) == self.params.mask { 244 | self.reset(); 245 | return Some(i + 1); 246 | } 247 | 248 | if self.input_idx > self.params.max_chunk_size { 249 | self.reset(); 250 | return Some(i + 1); 251 | } 252 | } 253 | 254 | None 255 | } 256 | } 257 | 258 | impl BuzHashIncr { 259 | fn reset(&mut self) { 260 | self.buf_idx = Wrapping(0); 261 | self.input_idx = 0; 262 | self.state.reset(); 263 | } 264 | 265 | fn push_byte(&mut self, val: u8) { 266 | if self.input_idx >= self.params.k as u64 { 267 | let o = self.buf[self.buf_idx.0]; 268 | self.state.add_overflow(&self.params, val, o); 269 | } else { 270 | self.state.add(&self.params, val); 271 | } 272 | 273 | self.buf[self.buf_idx.0] = val; 274 | 275 | self.buf_idx += Wrapping(1); 276 | self.buf_idx.0 %= self.params.k; 277 | self.input_idx += 1; 278 | } 279 | } 280 | 281 | impl From> for BuzHashIncr { 282 | fn from(params: BuzHash) -> Self { 283 | let buf = vec![0; params.k].into_boxed_slice(); 284 | Self { 285 | params, 286 | state: Default::default(), 287 | buf, 288 | buf_idx: Wrapping(0), 289 | input_idx: 0, 290 | } 291 | } 292 | } 293 | 294 | /// The internal byte to u32 mapping used in buzhash 295 | pub trait BuzHashHash { 296 | fn hash(&self, data: u8) -> u32; 297 | } 298 | 299 | /// Use a referenced table to preform the `BuzHashHash` internal hashing 300 | #[derive(Clone)] 301 | pub struct BuzHashTableHash<'a> { 302 | table: &'a [u32; 256], 303 | } 304 | 305 | impl<'a> fmt::Debug for BuzHashTableHash<'a> { 306 | fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { 307 | fmt.debug_struct("BuzHashTableHash").finish() 308 | } 309 | } 310 | 311 | impl<'a> From<&'a [u32; 256]> for BuzHashTableHash<'a> { 312 | fn from(table: &'a [u32; 256]) -> Self { 313 | Self { table } 314 | } 315 | } 316 | 317 | impl<'a> BuzHashHash for BuzHashTableHash<'a> { 318 | fn 
hash(&self, data: u8) -> u32 { 319 | self.table[data as usize] 320 | } 321 | } 322 | 323 | /// Use a owned table to perform the `BuzHashHash` internal hashing 324 | #[derive(Clone)] 325 | pub struct BuzHashTableBufHash { 326 | table: Box<[u32; 256]>, 327 | } 328 | 329 | impl fmt::Debug for BuzHashTableBufHash { 330 | fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { 331 | fmt.debug_struct("BuzHashTableBufHash").finish() 332 | } 333 | } 334 | 335 | impl<'a> From> for BuzHashTableBufHash { 336 | fn from(table: Box<[u32; 256]>) -> Self { 337 | Self { table } 338 | } 339 | } 340 | 341 | impl BuzHashHash for BuzHashTableBufHash { 342 | fn hash(&self, data: u8) -> u32 { 343 | self.table[data as usize] 344 | } 345 | } 346 | 347 | /// Lookup up in a table, after applying a salt via xor to the input byte 348 | /// 349 | /// Used by attic-labs/nom 350 | #[derive(Clone)] 351 | pub struct BuzHashTableByteSaltHash<'a> { 352 | table: &'a [u32; 256], 353 | salt: u8, 354 | } 355 | 356 | impl<'a> fmt::Debug for BuzHashTableByteSaltHash<'a> { 357 | fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { 358 | fmt.debug_struct("BuzHashTableByteSaltHash").finish() 359 | } 360 | } 361 | 362 | impl<'a> From<(u8, &'a [u32; 256])> for BuzHashTableByteSaltHash<'a> { 363 | fn from((salt, table): (u8, &'a [u32; 256])) -> Self { 364 | Self { table, salt } 365 | } 366 | } 367 | 368 | impl<'a> BuzHashHash for BuzHashTableByteSaltHash<'a> { 369 | fn hash(&self, data: u8) -> u32 { 370 | self.table[(data ^ self.salt) as usize] 371 | } 372 | } 373 | -------------------------------------------------------------------------------- /src/buzhash_table.rs: -------------------------------------------------------------------------------- 1 | /// https://github.com/silvasur/buzhash/blob/9bdec3dec7c611fa97beadc374d75bdf02cd880e/hash.go#L13-L57 2 | /// 3 | /// used by attic & attic-labs/nom 4 | pub const GO_BUZHASH: [u32; 256] = [ 5 | 0x12bd9527, 0xf4140cea, 0x987bd6e1, 0x79079850, 
0xafbfd539, 0xd350ce0a, 0x82973931, 0x9fc32b9c, 6 | 0x28003b88, 0xc30c13aa, 0x6b678c34, 0x5844ef1d, 0xaa552c18, 0x4a77d3e8, 0xd1f62ea0, 0x6599417c, 7 | 0xfbe30e7a, 0xf9e2d5ee, 0xa1fca42e, 0x41548969, 0x116d5b59, 0xaeda1e1a, 0xc5191c17, 0x54b9a3cb, 8 | 0x727e492a, 0x5c432f91, 0x31a50bce, 0xc2696af6, 0x217c8020, 0x1262aefc, 0xace75924, 0x9876a04f, 9 | 0xaf300bc2, 0x3ffce3f6, 0xd6680fb5, 0xd0b1ced8, 0x6651f842, 0x736fadef, 0xbc2d3429, 0xb03d2904, 10 | 0x7e634ba4, 0xdfd87d8c, 0x7988d63a, 0x4be4d933, 0x6a8d0382, 0x9e132d62, 0x3ee9c95f, 0xfec05b97, 11 | 0x6907ad34, 0x8616cfcc, 0xa6aabf24, 0x8ad1c92e, 0x4f2affc0, 0xb87519db, 0x6576eaf6, 0x15dbe00a, 12 | 0x63e1dd82, 0xa36b6a81, 0xeead99b3, 0xbc6a4309, 0x3478d1a7, 0x2182bcc0, 0xdd50cfce, 0x7cb25580, 13 | 0x73075483, 0x503b7f42, 0x4cd50d63, 0x3f4d94c9, 0x385fcbb7, 0x90daf16c, 0xece10b8e, 0x11c1cb04, 14 | 0x816a899b, 0x69a29d06, 0xfb090b37, 0xf98ef13c, 0x07653435, 0x9f15dc42, 0x3b43abdf, 0x1334283f, 15 | 0x93f3d9af, 0x0cbdfe71, 0xa788a614, 0x4f54d2f0, 0xd4374fc7, 0x70557ce7, 0xf741fce8, 0xe4b6f661, 16 | 0xc630cb98, 0x387a6366, 0x72f428fd, 0x539009db, 0xc53e3810, 0x1e1a52e5, 0x7d6816b0, 0x040f9b81, 17 | 0x9c99c9fb, 0x9f3af3d2, 0x774d1061, 0xd5c840ea, 0x8e1480fe, 0x6ee4023c, 0x2fbda535, 0xd88eff7a, 18 | 0xd8632a2a, 0x43c4e024, 0x3ef27971, 0xc72866fd, 0xe35cc630, 0x46d96220, 0x437a8384, 0xe92caf0c, 19 | 0x6290a47e, 0xa7bb9238, 0x0e1000f9, 0x49e76bdc, 0x3acfb4b8, 0x03582b8e, 0x6ea2de4e, 0x2ec1008d, 20 | 0xfcc8df69, 0x91c2fe0a, 0xb471c7d9, 0x778be812, 0x70d29ad1, 0x76411cbf, 0xc302e81c, 0x4e445194, 21 | 0x22e3aa72, 0xb65762e9, 0xa280db05, 0x827aa70e, 0x4c531a9d, 0x7a60bf4a, 0x8fd95a44, 0x2289aef0, 22 | 0xcd50ddc4, 0x639aae69, 0x5fe85ed6, 0x4ed724ff, 0x00f04f7d, 0x95a5fcb0, 0x88255d15, 0xa603d2c9, 23 | 0xf6956a5b, 0x53ea7f3e, 0xb570f225, 0x2b3be203, 0xa181e40e, 0xc413cdce, 0xa7cb1ebb, 0xcf258b1f, 24 | 0x516eb016, 0xca204586, 0xd1e69894, 0xe85a73d3, 0x7db2d382, 0xae73b463, 0x3598d643, 0x5087c864, 25 | 0xd91f30b6, 0xe1d4d1e7, 
0x73b3b337, 0xceac1233, 0x8edf7845, 0xa69c45c9, 0xdb5db3ab, 0x28cfade8, 26 | 0xebfa49e7, 0xcbc2a659, 0x59cce971, 0x959a01af, 0x8ee9aae7, 0xfb2f01c6, 0x5a752836, 0x9ed12981, 27 | 0x618d05b6, 0x93ec12b3, 0x4590c779, 0xed1317a2, 0x03fe5835, 0x7ad3c6f7, 0xd4aad5b5, 0x1a995ed7, 28 | 0x247bfaa4, 0x69c2c799, 0x745fa405, 0xc5b9f239, 0xc3d9aebc, 0xa6f60e0b, 0xdf1e91d7, 0xab8e041c, 29 | 0xee3188c6, 0x37377a9e, 0xc0e1a3bf, 0x19a5a9e4, 0x56cb9556, 0xc4d33d3f, 0xfb1eb03e, 0xf9557057, 30 | 0x1be31d37, 0xd1fa65f1, 0xf518d714, 0x570ac722, 0xf26cf66a, 0x24794d47, 0x8ba2e402, 0x3f5137e6, 31 | 0x35be1453, 0x43350478, 0x9f05ee88, 0x364cf9cf, 0x39a23ee7, 0xa4db8d49, 0xc2ebb3d2, 0xc6fb99d5, 32 | 0xe014dfb0, 0x7156d425, 0xe090a87a, 0x4cc12f78, 0x1b30f503, 0x06694a7a, 0x68198cd1, 0x2f8345bd, 33 | 0x9d79198e, 0xd871943f, 0x22ef6cf4, 0xe81b1c15, 0x067b61d8, 0xfc4ea4f5, 0xfe6dab57, 0x1bf744ba, 34 | 0xa70b6a25, 0xafe6e412, 0xc6c1a05c, 0x8ffbe3ce, 0xc4270af1, 0xf3f36373, 0xc4507dd8, 0x5e6fd1e2, 35 | 0x58cd9739, 0x47d3c5b5, 0xe1d5a343, 0x3d4dea4a, 0x893d91ae, 0xbb2a5e2a, 0x0d57b800, 0x652a7cc9, 36 | 0x6a68ccfd, 0x62529f0b, 0xec5f36d6, 0x766cceda, 0x96ca63ef, 0xa0499838, 0xd9030f59, 0x8185f4d2, 37 | ]; 38 | 39 | /// https://github.com/borgbackup/borg/blob/master/src/borg/_chunker.c#L30-L64 40 | /// 41 | /// Note that borg does not use this directly, it xors it with a 32bit seed prior to use 42 | pub const BORG_BUZHASH_BASE: [u32; 256] = [ 43 | 0xe7f831ec, 0xf4026465, 0xafb50cae, 0x6d553c7a, 0xd639efe3, 0x19a7b895, 0x9aba5b21, 0x5417d6d4, 44 | 0x35fd2b84, 0xd1f6a159, 0x3f8e323f, 0xb419551c, 0xf444cebf, 0x21dc3b80, 0xde8d1e36, 0x84a32436, 45 | 0xbeb35a9d, 0xa36f24aa, 0xa4e60186, 0x98d18ffe, 0x3f042f9e, 0xdb228bcd, 0x096474b7, 0x5c20c2f7, 46 | 0xf9eec872, 0xe8625275, 0xb9d38f80, 0xd48eb716, 0x22a950b4, 0x3cbaaeaa, 0xc37cddd3, 0x8fea6f6a, 47 | 0x1d55d526, 0x7fd6d3b3, 0xdaa072ee, 0x4345ac40, 0xa077c642, 0x8f2bd45b, 0x28509110, 0x55557613, 48 | 0xffc17311, 0xd961ffef, 0xe532c287, 0xaab95937, 
0x46d38365, 0xb065c703, 0xf2d91d0f, 0x92cd4bb0, 49 | 0x4007c712, 0xf35509dd, 0x505b2f69, 0x557ead81, 0x310f4563, 0xbddc5be8, 0x9760f38c, 0x701e0205, 50 | 0x00157244, 0x14912826, 0xdc4ca32b, 0x67b196de, 0x5db292e8, 0x8c1b406b, 0x01f34075, 0xfa2520f7, 51 | 0x73bc37ab, 0x1e18bc30, 0xfe2c6cb3, 0x20c522d0, 0x5639e3db, 0x942bda35, 0x899af9d1, 0xced44035, 52 | 0x98cc025b, 0x255f5771, 0x70fefa24, 0xe928fa4d, 0x2c030405, 0xb9325590, 0x20cb63bd, 0xa166305d, 53 | 0x80e52c0a, 0xa8fafe2f, 0x1ad13f7d, 0xcfaf3685, 0x6c83a199, 0x7d26718a, 0xde5dfcd9, 0x79cf7355, 54 | 0x8979d7fb, 0xebf8c55e, 0xebe408e4, 0xcd2affba, 0xe483be6e, 0xe239d6de, 0x5dc1e9e0, 0x0473931f, 55 | 0x851b097c, 0xac5db249, 0x09c0f9f2, 0xd8d2f134, 0xe6f38e41, 0xb1c71bf1, 0x52b6e4db, 0x07224424, 56 | 0x6cf73e85, 0x4f25d89c, 0x782a7d74, 0x10a68dcd, 0x3a868189, 0xd570d2dc, 0x69630745, 0x9542ed86, 57 | 0x331cd6b2, 0xa84b5b28, 0x07879c9d, 0x38372f64, 0x7185db11, 0x25ba7c83, 0x01061523, 0xe6792f9f, 58 | 0xe5df07d1, 0x4321b47f, 0x7d2469d8, 0x1a3a4f90, 0x48be29a3, 0x669071af, 0x8ec8dd31, 0x0810bfbf, 59 | 0x813a06b4, 0x68538345, 0x65865ddc, 0x43a71b8e, 0x78619a56, 0x5a34451d, 0x5bdaa3ed, 0x71edc7e9, 60 | 0x17ac9a20, 0x78d10bfa, 0x6c1e7f35, 0xd51839d9, 0x240cbc51, 0x33513cc1, 0xd2b4f795, 0xccaa8186, 61 | 0x0babe682, 0xa33cf164, 0x18c643ea, 0xc1ca105f, 0x9959147a, 0x6d3d94de, 0x0b654fbe, 0xed902ca0, 62 | 0x7d835cb5, 0x99ba1509, 0x6445c922, 0x495e76c2, 0xf07194bc, 0xa1631d7e, 0x677076a5, 0x89fffe35, 63 | 0x1a49bcf3, 0x8e6c948a, 0x0144c917, 0x8d93aea1, 0x16f87ddf, 0xc8f25d49, 0x1fb11297, 0x27e750cd, 64 | 0x2f422da1, 0xdee89a77, 0x1534c643, 0x457b7b8b, 0xaf172f7a, 0x6b9b09d6, 0x33573f7f, 0xf14e15c4, 65 | 0x526467d5, 0xaf488241, 0x87c3ee0d, 0x33be490c, 0x95aa6e52, 0x43ec242e, 0xd77de99b, 0xd018334f, 66 | 0x5b78d407, 0x498eb66b, 0xb1279fa8, 0xb38b0ea6, 0x90718376, 0xe325dee2, 0x8e2f2cba, 0xcaa5bdec, 67 | 0x9d652c56, 0xad68f5cb, 0xa77591af, 0x88e37ee8, 0xf8faa221, 0xfcbbbe47, 0x4f407786, 0xaf393889, 68 | 0xf444a1d9, 0x15ae1a2f, 
0x40aa7097, 0x6f9486ac, 0x29d232a3, 0xe47609e9, 0xe8b631ff, 0xba8565f4,
    0x11288749, 0x46c9a838, 0xeb1b7cd8, 0xf516bbb1, 0xfb74fda0, 0x010996e6, 0x4c994653, 0x1d889512,
    0x53dcd9a3, 0xdd074697, 0x1e78e17c, 0x637c98bf, 0x930bb219, 0xcf7f75b0, 0xcb9355fb, 0x9e623009,
    0xe466d82c, 0x28f968d3, 0xfeb385d9, 0x238e026c, 0xb8ed0560, 0x0c6a027a, 0x3d6fec4b, 0xbb4b2ec2,
    0xe715031c, 0xeded011d, 0xcdc4d3b9, 0xc456fc96, 0xdd0eea20, 0xb3df8ec9, 0x12351993, 0xd9cbb01c,
    0x603147a2, 0xcf37d17d, 0xf7fcd9dc, 0xd8556fa3, 0x104c8131, 0x13152774, 0xb4715811, 0x6a72c2c9,
    0xc5ae37bb, 0xa76ce12a, 0x8150d8f3, 0x2ec29218, 0xa35f0984, 0x48c0647e, 0x0b5ff98c, 0x71893f7b,
];

/// Build borg's effective buzhash table for a given `seed`.
///
/// borg xors each entry of [`BORG_BUZHASH_BASE`] with a 32-bit seed prior to use.
pub fn borg_buzhash_table(seed: u32) -> Box<[u32; 256]> {
    let mut t = Box::new([0u32; 256]);
    // iterator form instead of `for i in 0..t.len()` — avoids per-element bounds
    // checks and the clippy::needless_range_loop lint (CI runs clippy with -D warnings)
    for (dst, &base) in t.iter_mut().zip(BORG_BUZHASH_BASE.iter()) {
        *dst = base ^ seed;
    }

    t
}
-------------------------------------------------------------------------------- /src/fastcdc.rs: --------------------------------------------------------------------------------
#![cfg(feature = "fastcdc")]

//! FastCDC is a chunking algorithm using some features from [Gear](super::gear)
//!
//! Reference:
//!  - https://www.usenix.org/system/files/conference/atc16/atc16-paper-xia.pdf

use crate::{Chunk, ChunkIncr, ToChunkIncr};
use std::fmt;
use std::num::Wrapping;

// these masks are taken from the paper and could be adjusted/adjustable.
13 | const MASK_S: u64 = 0x0003590703530000; 14 | //const MASK_A: u64 = 0x0000d90303530000; 15 | const MASK_L: u64 = 0x0000d90003530000; 16 | 17 | /// An instance of the "FastCDC" algorithm 18 | /// 19 | /// Default parameters: 20 | /// - Minimum chunk size: 2 KiB 21 | /// - Maximum chunk size: 64 KiB 22 | /// - Normal size: 8 KiB 23 | /// - internal 64-bit gear table: [`super::gear_table::GEAR_64`] 24 | /// 25 | #[derive(Clone, Copy)] 26 | pub struct FastCdc<'a> { 27 | gear: &'a [u64; 256], 28 | min_size: u64, 29 | max_size: u64, 30 | normal_size: u64, 31 | } 32 | 33 | impl<'a> PartialEq for FastCdc<'a> { 34 | fn eq(&self, other: &Self) -> bool { 35 | self.min_size == other.min_size 36 | && self.max_size == other.max_size 37 | && self.normal_size == other.normal_size 38 | && self.gear[..] == other.gear[..] 39 | } 40 | } 41 | 42 | impl<'a> Eq for FastCdc<'a> {} 43 | 44 | impl<'a> Default for FastCdc<'a> { 45 | fn default() -> Self { 46 | FastCdc { 47 | min_size: 2 * 1024, // 2 KiB 48 | max_size: 64 * 1024, // 64 KiB 49 | normal_size: 8 * 1024, // 8 KiB 50 | gear: &super::gear_table::GEAR_64, 51 | } 52 | } 53 | } 54 | 55 | impl<'a> fmt::Debug for FastCdc<'a> { 56 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 57 | f.debug_struct("FastCdc") 58 | .field("gear", &"[...]") 59 | .field("min_size", &self.min_size) 60 | .field("max_size", &self.max_size) 61 | .field("normal_size", &self.normal_size) 62 | .finish() 63 | } 64 | } 65 | 66 | impl<'a> Chunk for FastCdc<'a> { 67 | type SearchState = FastCdcState; 68 | 69 | fn to_search_state(&self) -> Self::SearchState { 70 | Default::default() 71 | } 72 | 73 | fn find_chunk_edge( 74 | &self, 75 | state: &mut Self::SearchState, 76 | data: &[u8], 77 | ) -> (Option, usize) { 78 | match state.push(self, data) { 79 | Some(i) => (Some(i + 1), i + 1), 80 | None => (None, data.len()), 81 | } 82 | } 83 | } 84 | 85 | impl<'a> FastCdc<'a> { 86 | /// Create a custom FastCDC instance 87 | pub fn new(gear: &'a [u64; 256], 
min_size: u64, normal_size: u64, max_size: u64) -> Self { 88 | Self { 89 | gear, 90 | min_size, 91 | max_size, 92 | normal_size, 93 | } 94 | } 95 | } 96 | 97 | impl<'a> ToChunkIncr for FastCdc<'a> { 98 | type Incr = FastCdcIncr<'a>; 99 | 100 | fn to_chunk_incr(&self) -> Self::Incr { 101 | self.into() 102 | } 103 | } 104 | 105 | impl<'a> From<&FastCdc<'a>> for FastCdcIncr<'a> { 106 | fn from(params: &FastCdc<'a>) -> Self { 107 | Self { 108 | params: *params, 109 | state: Default::default(), 110 | } 111 | } 112 | } 113 | 114 | /// FastCdcIncr provides an incrimental interface to `FastCdc` 115 | /// 116 | /// This impl does not buffer data passing through it (the FastCDC algorithm does not require 117 | /// look-back) making it very efficient. 118 | #[derive(Debug, Clone, PartialEq, Eq, Default)] 119 | pub struct FastCdcIncr<'a> { 120 | params: FastCdc<'a>, 121 | state: FastCdcState, 122 | } 123 | 124 | #[derive(Debug, Clone, PartialEq, Eq, Default)] 125 | pub struct FastCdcState { 126 | /// Number of bytes we've "examined" 127 | /// 128 | /// varying state. 129 | l: u64, 130 | 131 | /// Current fingerprint 132 | /// 133 | /// varying state. 134 | fp: Wrapping, 135 | } 136 | 137 | impl FastCdcState { 138 | fn reset(&mut self) { 139 | self.l = 0; 140 | self.fp = Wrapping(0); 141 | } 142 | 143 | fn push(&mut self, params: &FastCdc<'_>, data: &[u8]) -> Option { 144 | // global start/index 145 | let mut gi = self.l; 146 | // global end 147 | let ge = data.len() as u64 + gi; 148 | 149 | if ge <= params.min_size { 150 | // No split, no processing of data, but we've "consumed" the bytes. 
151 | self.l = ge; 152 | return None; 153 | } 154 | 155 | // skip elements prior to MIN_SIZE and track offset of new `data` in argument `data` for 156 | // return value 157 | let mut i = if gi <= params.min_size { 158 | let skip = params.min_size - gi; 159 | gi += skip; 160 | skip 161 | } else { 162 | 0 163 | } as usize; 164 | 165 | let mut fp = self.fp; 166 | 167 | loop { 168 | if i >= data.len() { 169 | break; 170 | } 171 | if gi >= params.normal_size { 172 | // go to next set of matches 173 | break; 174 | } 175 | 176 | let v = data[i]; 177 | fp = (fp << 1) + Wrapping(params.gear[v as usize]); 178 | if (fp.0 & MASK_S) == 0 { 179 | self.reset(); 180 | return Some(i); 181 | } 182 | 183 | gi += 1; 184 | i += 1; 185 | } 186 | 187 | loop { 188 | if gi >= params.max_size { 189 | // no match found, emit fixed match at MAX_SIZE 190 | self.reset(); 191 | return Some(i); 192 | } 193 | if i >= data.len() { 194 | break; 195 | } 196 | 197 | let v = data[i]; 198 | fp = (fp << 1) + Wrapping(params.gear[v as usize]); 199 | if (fp.0 & MASK_L) == 0 { 200 | self.reset(); 201 | return Some(i); 202 | } 203 | 204 | gi += 1; 205 | i += 1; 206 | } 207 | 208 | // no match, but not at MAX_SIZE yet, so store context for next time. 209 | self.fp = fp; 210 | self.l = ge; 211 | 212 | None 213 | } 214 | } 215 | 216 | impl<'a> ChunkIncr for FastCdcIncr<'a> { 217 | fn push(&mut self, src: &[u8]) -> Option { 218 | self.state.push(&self.params, src) 219 | } 220 | } 221 | -------------------------------------------------------------------------------- /src/gear.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "gear")] 2 | 3 | use crate::{Chunk, ChunkIncr, ToChunkIncr}; 4 | use std::fmt; 5 | use std::num::Wrapping; 6 | 7 | /// Gear Content Defined Chunking using 32bit expansion. 8 | /// 9 | /// Reference: 10 | /// 11 | /// Xia, W., Jiang, H., Feng, D., Tian, L., Fu, M., and Zhou, Y. 
Ddelta: A dedulication-inspired 12 | /// fast delta compression approach. Performance Evaluation 79 (2014), 258-271. 13 | /// 14 | /// http://wxia.hustbackup.cn/pub/DElta-PEVA-2014.pdf 15 | #[derive(Clone)] 16 | pub struct Gear32<'a> { 17 | /// A mask with an appropriate number of bits set for the desired average chunk size. 18 | /// 19 | /// fixed configuration. 20 | mask: u32, 21 | 22 | /// value to match (fp & mask) against. 23 | /// 24 | /// fixed configuration. 25 | xxx: u32, 26 | 27 | /// A table to map bytes to 32bit values 28 | /// 29 | /// fixed configuration. 30 | gear: &'a [u32; 256], 31 | } 32 | 33 | #[derive(Debug, Default, PartialEq, Eq, Clone)] 34 | pub struct GearState32 { 35 | /// current fingerprint/hash 36 | /// 37 | /// varying state. 38 | fp: Wrapping, 39 | } 40 | 41 | #[derive(Debug, Clone)] 42 | pub struct GearIncr32<'a> { 43 | params: Gear32<'a>, 44 | 45 | state: GearState32, 46 | } 47 | 48 | impl<'a> Chunk for Gear32<'a> { 49 | type SearchState = GearState32; 50 | 51 | fn to_search_state(&self) -> Self::SearchState { 52 | Default::default() 53 | } 54 | 55 | fn find_chunk_edge( 56 | &self, 57 | state: &mut Self::SearchState, 58 | data: &[u8], 59 | ) -> (Option, usize) { 60 | for (i, v) in data.iter().enumerate() { 61 | if state.push(self, *v) { 62 | *state = self.to_search_state(); 63 | return (Some(i + 1), i + 1); 64 | } 65 | } 66 | 67 | (None, data.len()) 68 | } 69 | } 70 | 71 | impl<'a> ToChunkIncr for Gear32<'a> { 72 | type Incr = GearIncr32<'a>; 73 | 74 | fn to_chunk_incr(&self) -> Self::Incr { 75 | self.into() 76 | } 77 | } 78 | 79 | impl<'a> From<&Gear32<'a>> for GearIncr32<'a> { 80 | fn from(params: &Gear32<'a>) -> Self { 81 | Self { 82 | params: params.clone(), 83 | state: Default::default(), 84 | } 85 | } 86 | } 87 | 88 | impl<'a> fmt::Debug for Gear32<'a> { 89 | fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { 90 | fmt.debug_struct("Gear32") 91 | .field("mask", &self.mask) 92 | .field("xxx", &self.xxx) 93 | 
.field("gear", &&self.gear[..]) 94 | .finish() 95 | } 96 | } 97 | 98 | impl GearState32 { 99 | fn push(&mut self, params: &Gear32<'_>, add: u8) -> bool { 100 | self.fp = (self.fp << 1) + Wrapping(params.gear[add as usize]); 101 | self.fp.0 & params.mask == params.xxx 102 | } 103 | 104 | fn reset(&mut self) { 105 | self.fp.0 = 0; 106 | } 107 | } 108 | 109 | impl<'a> ChunkIncr for GearIncr32<'a> { 110 | fn push(&mut self, data: &[u8]) -> Option { 111 | for (i, v) in data.iter().enumerate() { 112 | if self.state.push(&self.params, *v) { 113 | self.state.reset(); 114 | return Some(i); 115 | } 116 | } 117 | 118 | None 119 | } 120 | } 121 | 122 | fn msb_mask(log2: usize) -> u32 { 123 | // at least 1 bit & not all the bits 124 | // FIXME: probably could relax those requirements with better math. 125 | //debug_assert!(log2 > 0); 126 | //debug_assert!(log2 < 32); 127 | 128 | ((1 << log2) - 1) << (32 - log2) 129 | } 130 | 131 | impl<'a> Gear32<'a> { 132 | /// Create a gear chunker which emits blocks with average size `(1< Self { 135 | Gear32 { 136 | mask: msb_mask(average_size_log2), 137 | xxx: 0, 138 | gear: &super::gear_table::GEAR_32, 139 | } 140 | } 141 | } 142 | 143 | impl<'a> Default for Gear32<'a> { 144 | fn default() -> Self { 145 | // 8KB average size 146 | Self::with_average_size_log2(13) 147 | } 148 | } 149 | 150 | #[cfg(test)] 151 | mod test { 152 | #[test] 153 | fn mm() { 154 | use super::msb_mask; 155 | assert_eq!(0b1 << 31, msb_mask(1)); 156 | assert_eq!(0b11 << 30, msb_mask(2)); 157 | assert_eq!(0b111 << 29, msb_mask(3)); 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /src/gear_table/gear32.rs: -------------------------------------------------------------------------------- 1 | /// An abitrary table originally produced for rsroll that is included here for convenience. 
2 | pub static GEAR_32: [u32; 256] = [ 3 | 0xb088d3a9, 0x5652c7f7, 0x45b28969, 0x6b0a89d5, 0x368f573e, 0x1dc636dc, 0x207a4c4e, 0xa474b346, 4 | 0x3b06a83e, 0x90e78d6c, 0xe1c92df7, 0x8e95053a, 0x5a2ef4f1, 0xa50fac94, 0x0e7303eb, 0x99b07edc, 5 | 0x689d2fb5, 0x00005082, 0xc4b08306, 0x3eb0678a, 0xf19f87ab, 0xf2129fbf, 0x48114957, 0x00000106, 6 | 0x1fba3780, 0x3bf06fd6, 0x99687e97, 0x79a10673, 0xe4accf9e, 0x2520e71f, 0x2bd5d3fd, 0x00de4dcd, 7 | 0xeaa9311c, 0xdb748eb6, 0xaf579a8d, 0x86a6e5da, 0xcc2fc30a, 0x355e2afe, 0x2d99c8f4, 0xbade4b4a, 8 | 0xf7b51872, 0x3286b658, 0x0000b688, 0xa115d6e4, 0x484f7e9c, 0xccca7bb7, 0xbf2584a6, 0xade7e813, 9 | 0x00007094, 0x8ae69108, 0xbd776ad7, 0xfb6b001f, 0xc7a474b8, 0xbaf6f116, 0x09cb1f5b, 0xb0b219e6, 10 | 0x00ccbc38, 0xcc849d0a, 0x73a3ef7d, 0xc807d2d3, 0x7f2ac996, 0xd037a86b, 0xf3f17c66, 0xaca626b0, 11 | 0x755a9937, 0x90837ee6, 0x6ee8ad93, 0x0000d9e1, 0x9e063bb2, 0x07ab77f1, 0xec550255, 0x78fb94a8, 12 | 0xc7510e1b, 0x0000320b, 0x827c3326, 0x14675f0b, 0x267bd3a6, 0xf1916ff9, 0x86221b7f, 0x9dbecee7, 13 | 0xea58f8ca, 0x008d1986, 0x6d38704f, 0xe032cb07, 0x228d21f6, 0x635cb1bf, 0x4620a173, 0xa7e7dfe3, 14 | 0x0c10ca93, 0x2727fee8, 0xa2df1c6d, 0x4dcdd1ac, 0x000070ff, 0xa2ace87b, 0x9892275a, 0xc2861181, 15 | 0xbb9972a0, 0xef70cd37, 0x00000513, 0xc058b618, 0x09e85085, 0x9197fb3b, 0x7e1e626d, 0x520c5450, 16 | 0xbee17971, 0x6fd9ac32, 0x0023957c, 0xa01c7d7e, 0xaba2c758, 0x0d1fa0ce, 0x0bb6a58b, 0x4333dd5b, 17 | 0xc2fd3b7d, 0xfb418024, 0x65a56185, 0xf67a02bd, 0x696f11dd, 0x00002022, 0x8cd6be91, 0x695189b6, 18 | 0xee9453b5, 0xd8fc5ea9, 0xab86bf19, 0x0000c6b5, 0x26731017, 0xed2d101b, 0x3b41ed84, 0x13e62212, 19 | 0xa315f5eb, 0x8816c34e, 0xe9395b9c, 0x002ce920, 0x4283db1d, 0xd77d461a, 0xe2ec17e4, 0xb8e0be40, 20 | 0xdea160c4, 0x7eec86c8, 0x2119ad12, 0xa6ccf46b, 0x2c52cede, 0x2db48711, 0x0000f0d6, 0x3dd5d8c9, 21 | 0x8a1872a2, 0xf282a4c4, 0x8020ec2c, 0x6693b6e0, 0x0000ce19, 0x20cb5735, 0x762ebf37, 0x207bfe82, 22 | 0xd77dc112, 0x9ba78342, 0x217dc513, 
0xb27b1a29, 0x00d5cd98, 0x71e39b80, 0x7e572af0, 0xa2734f2f, 23 | 0xbf82c6b5, 0x5c3beac6, 0xcdc893bb, 0x6d108561, 0x77f8ae30, 0x917c6b81, 0x5b75b699, 0x0000cf6a, 24 | 0xf3c40afa, 0x2063127a, 0x621de622, 0xd188ac1d, 0x107036e2, 0x0000b85f, 0xf2ef4e4c, 0xd9d6de66, 25 | 0xa1fc7955, 0xeb85fd03, 0xbe27502f, 0xe3034251, 0x441364d3, 0x0082b36c, 0xb1459103, 0x021c069c, 26 | 0x2910dfc7, 0x735b353e, 0xce44312c, 0xbc942e45, 0xf05086a7, 0xfec3b215, 0x00ae1055, 0xf54b4084, 27 | 0x00007fd9, 0xbfbd9ef3, 0xa804302f, 0x39ce4957, 0xffb9e2a4, 0x55b9ad1d, 0x00008acb, 0x48e2bfc8, 28 | 0x8be39841, 0x0e271216, 0xd51096e8, 0x1101ba17, 0xc22e770f, 0x1689eff2, 0x00a92a19, 0xbc765990, 29 | 0xc61441e3, 0x07e13a2c, 0x92cbe984, 0x8f4ff572, 0x0b9670c0, 0x62955a58, 0x645f83e5, 0x41fce516, 30 | 0xbbda9748, 0x0000aab2, 0x19761b06, 0x8b8f5e83, 0x3e5d1cfd, 0xec5c1e2c, 0xfaf7e0fe, 0x000000d3, 31 | 0xda3f9017, 0x70ff906d, 0x0527d5a7, 0x22d8e773, 0xc9ab70df, 0xeda4c6dc, 0xecef1f41, 0x0024c2b2, 32 | 0x06740d95, 0x1d7a299b, 0xb3c37cb2, 0xc986e3c7, 0x9fabea36, 0x6da214c5, 0x17a43ed8, 0x6eccec51, 33 | 0xf9cab309, 0x4a5e60c5, 0x00006967, 0x9da51d12, 0x84321e13, 0xfb3d6fb6, 0x60305eed, 0xcbbf4b14, 34 | 0x00004f63, 0x07d5b781, 0xe5a53672, 0x57afb234, 0x18f346f7, 0x636dc655, 0xcc8bab49, 0x63c7a906, 35 | ]; 36 | -------------------------------------------------------------------------------- /src/gear_table/gear64.rs: -------------------------------------------------------------------------------- 1 | /// An abitrary table originally produced for rsroll that is included here for convenience. 
2 | pub static GEAR_64: [u64; 256] = [ 3 | 0xb088d3a9e840f559, 4 | 0x5652c7f739ed20d6, 5 | 0x45b28969898972ab, 6 | 0x6b0a89d5b68ec777, 7 | 0x368f573e8b7a31b7, 8 | 0x1dc636dce936d94b, 9 | 0x207a4c4e5554d5b6, 10 | 0xa474b34628239acb, 11 | 0x3b06a83e1ca3b912, 12 | 0x90e78d6c2f02baf7, 13 | 0xe1c92df7150d9a8a, 14 | 0x8e95053a1086d3ad, 15 | 0x5a2ef4f1b83a0722, 16 | 0xa50fac949f807fae, 17 | 0x0e7303eb80d8d681, 18 | 0x99b07edc1570ad0f, 19 | 0x689d2fb555fd3076, 20 | 0x00005082119ea468, 21 | 0xc4b08306a88fcc28, 22 | 0x3eb0678af6374afd, 23 | 0xf19f87ab86ad7436, 24 | 0xf2129fbfbe6bc736, 25 | 0x481149575c98a4ed, 26 | 0x0000010695477bc5, 27 | 0x1fba37801a9ceacc, 28 | 0x3bf06fd663a49b6d, 29 | 0x99687e9782e3874b, 30 | 0x79a10673aa50d8e3, 31 | 0xe4accf9e6211f420, 32 | 0x2520e71f87579071, 33 | 0x2bd5d3fd781a8a9b, 34 | 0x00de4dcddd11c873, 35 | 0xeaa9311c5a87392f, 36 | 0xdb748eb617bc40ff, 37 | 0xaf579a8df620bf6f, 38 | 0x86a6e5da1b09c2b1, 39 | 0xcc2fc30ac322a12e, 40 | 0x355e2afec1f74267, 41 | 0x2d99c8f4c021a47b, 42 | 0xbade4b4a9404cfc3, 43 | 0xf7b518721d707d69, 44 | 0x3286b6587bf32c20, 45 | 0x0000b68886af270c, 46 | 0xa115d6e4db8a9079, 47 | 0x484f7e9c97b2e199, 48 | 0xccca7bb75713e301, 49 | 0xbf2584a62bb0f160, 50 | 0xade7e813625dbcc8, 51 | 0x000070940d87955a, 52 | 0x8ae69108139e626f, 53 | 0xbd776ad72fde38a2, 54 | 0xfb6b001fc2fcc0cf, 55 | 0xc7a474b8e67bc427, 56 | 0xbaf6f11610eb5d58, 57 | 0x09cb1f5b6de770d1, 58 | 0xb0b219e6977d4c47, 59 | 0x00ccbc386ea7ad4a, 60 | 0xcc849d0adf973f01, 61 | 0x73a3ef7d016af770, 62 | 0xc807d2d386bdbdfe, 63 | 0x7f2ac9966c791730, 64 | 0xd037a86bc6c504da, 65 | 0xf3f17c661eaa609d, 66 | 0xaca626b04daae687, 67 | 0x755a99374f4a5b07, 68 | 0x90837ee65b2caede, 69 | 0x6ee8ad93fd560785, 70 | 0x0000d9e11053edd8, 71 | 0x9e063bb2d21cdbd7, 72 | 0x07ab77f12a01d2b2, 73 | 0xec550255e6641b44, 74 | 0x78fb94a8449c14c6, 75 | 0xc7510e1bc6c0f5f5, 76 | 0x0000320b36e4cae3, 77 | 0x827c33262c8b1a2d, 78 | 0x14675f0b48ea4144, 79 | 0x267bd3a6498deceb, 80 | 0xf1916ff982f5035e, 81 | 
0x86221b7ff434fb88, 82 | 0x9dbecee7386f49d8, 83 | 0xea58f8cac80f8f4a, 84 | 0x008d198692fc64d8, 85 | 0x6d38704fbabf9a36, 86 | 0xe032cb07d1e7be4c, 87 | 0x228d21f6ad450890, 88 | 0x635cb1bfc02589a5, 89 | 0x4620a1739ca2ce71, 90 | 0xa7e7dfe3aae5fb58, 91 | 0x0c10ca932b3c0deb, 92 | 0x2727fee884afed7b, 93 | 0xa2df1c6df9e2ab1f, 94 | 0x4dcdd1ac0774f523, 95 | 0x000070ffad33e24e, 96 | 0xa2ace87bc5977816, 97 | 0x9892275ab4286049, 98 | 0xc2861181ddf18959, 99 | 0xbb9972a042483e19, 100 | 0xef70cd3766513078, 101 | 0x00000513abfc9864, 102 | 0xc058b61858c94083, 103 | 0x09e850859725e0de, 104 | 0x9197fb3bf83e7d94, 105 | 0x7e1e626d12b64bce, 106 | 0x520c54507f7b57d1, 107 | 0xbee1797174e22416, 108 | 0x6fd9ac3222e95587, 109 | 0x0023957c9adfbf3e, 110 | 0xa01c7d7e234bbe15, 111 | 0xaba2c758b8a38cbb, 112 | 0x0d1fa0ceec3e2b30, 113 | 0x0bb6a58b7e60b991, 114 | 0x4333dd5b9fa26635, 115 | 0xc2fd3b7d4001c1a3, 116 | 0xfb41802454731127, 117 | 0x65a56185a50d18cb, 118 | 0xf67a02bd8784b54f, 119 | 0x696f11dd67e65063, 120 | 0x00002022fca814ab, 121 | 0x8cd6be912db9d852, 122 | 0x695189b6e9ae8a57, 123 | 0xee9453b50ada0c28, 124 | 0xd8fc5ea91a78845e, 125 | 0xab86bf191a4aa767, 126 | 0x0000c6b5c86415e5, 127 | 0x267310178e08a22e, 128 | 0xed2d101b078bca25, 129 | 0x3b41ed84b226a8fb, 130 | 0x13e622120f28dc06, 131 | 0xa315f5ebfb706d26, 132 | 0x8816c34e3301bace, 133 | 0xe9395b9cbb71fdae, 134 | 0x002ce9202e721648, 135 | 0x4283db1d2bb3c91c, 136 | 0xd77d461ad2b1a6a5, 137 | 0xe2ec17e46eeb866b, 138 | 0xb8e0be4039fbc47c, 139 | 0xdea160c4d5299d04, 140 | 0x7eec86c8d28c3634, 141 | 0x2119ad129f98a399, 142 | 0xa6ccf46b61a283ef, 143 | 0x2c52cedef658c617, 144 | 0x2db4871169acdd83, 145 | 0x0000f0d6f39ecbe9, 146 | 0x3dd5d8c98d2f9489, 147 | 0x8a1872a22b01f584, 148 | 0xf282a4c40e7b3cf2, 149 | 0x8020ec2ccb1ba196, 150 | 0x6693b6e09e59e313, 151 | 0x0000ce19cc7c83eb, 152 | 0x20cb5735f6479c3b, 153 | 0x762ebf3759d75a5b, 154 | 0x207bfe823d693975, 155 | 0xd77dc112339cd9d5, 156 | 0x9ba7834284627d03, 157 | 0x217dc513e95f51e9, 158 | 
0xb27b1a29fc5e7816, 159 | 0x00d5cd9831bb662d, 160 | 0x71e39b806d75734c, 161 | 0x7e572af006fb1a23, 162 | 0xa2734f2f6ae91f85, 163 | 0xbf82c6b5022cddf2, 164 | 0x5c3beac60761a0de, 165 | 0xcdc893bb47416998, 166 | 0x6d1085615c187e01, 167 | 0x77f8ae30ac277c5d, 168 | 0x917c6b81122a2c91, 169 | 0x5b75b699add16967, 170 | 0x0000cf6ae79a069b, 171 | 0xf3c40afa60de1104, 172 | 0x2063127aa59167c3, 173 | 0x621de62269d1894d, 174 | 0xd188ac1de62b4726, 175 | 0x107036e2154b673c, 176 | 0x0000b85f28553a1d, 177 | 0xf2ef4e4c18236f3d, 178 | 0xd9d6de6611b9f602, 179 | 0xa1fc7955fb47911c, 180 | 0xeb85fd032f298dbd, 181 | 0xbe27502fb3befae1, 182 | 0xe3034251c4cd661e, 183 | 0x441364d354071836, 184 | 0x0082b36c75f2983e, 185 | 0xb145910316fa66f0, 186 | 0x021c069c9847caf7, 187 | 0x2910dfc75a4b5221, 188 | 0x735b353e1c57a8b5, 189 | 0xce44312ce98ed96c, 190 | 0xbc942e4506bdfa65, 191 | 0xf05086a71257941b, 192 | 0xfec3b215d351cead, 193 | 0x00ae1055e0144202, 194 | 0xf54b40846f42e454, 195 | 0x00007fd9c8bcbcc8, 196 | 0xbfbd9ef317de9bfe, 197 | 0xa804302ff2854e12, 198 | 0x39ce4957a5e5d8d4, 199 | 0xffb9e2a45637ba84, 200 | 0x55b9ad1d9ea0818b, 201 | 0x00008acbf319178a, 202 | 0x48e2bfc8d0fbfb38, 203 | 0x8be39841e848b5e8, 204 | 0x0e2712160696a08b, 205 | 0xd51096e84b44242a, 206 | 0x1101ba176792e13a, 207 | 0xc22e770f4531689d, 208 | 0x1689eff272bbc56c, 209 | 0x00a92a197f5650ec, 210 | 0xbc765990bda1784e, 211 | 0xc61441e392fcb8ae, 212 | 0x07e13a2ced31e4a0, 213 | 0x92cbe984234e9d4d, 214 | 0x8f4ff572bb7d8ac5, 215 | 0x0b9670c00b963bd0, 216 | 0x62955a581a03eb01, 217 | 0x645f83e5ea000254, 218 | 0x41fce516cd88f299, 219 | 0xbbda9748da7a98cf, 220 | 0x0000aab2fe4845fa, 221 | 0x19761b069bf56555, 222 | 0x8b8f5e8343b6ad56, 223 | 0x3e5d1cfd144821d9, 224 | 0xec5c1e2ca2b0cd8f, 225 | 0xfaf7e0fea7fbb57f, 226 | 0x000000d3ba12961b, 227 | 0xda3f90178401b18e, 228 | 0x70ff906de33a5feb, 229 | 0x0527d5a7c06970e7, 230 | 0x22d8e773607c13e9, 231 | 0xc9ab70df643c3bac, 232 | 0xeda4c6dc8abe12e3, 233 | 0xecef1f410033e78a, 234 | 0x0024c2b274ac72cb, 235 
| 0x06740d954fa900b4, 236 | 0x1d7a299b323d6304, 237 | 0xb3c37cb298cbead5, 238 | 0xc986e3c76178739b, 239 | 0x9fabea364b46f58a, 240 | 0x6da214c5af85cc56, 241 | 0x17a43ed8b7a38f84, 242 | 0x6eccec511d9adbeb, 243 | 0xf9cab30913335afb, 244 | 0x4a5e60c5f415eed2, 245 | 0x00006967503672b4, 246 | 0x9da51d121454bb87, 247 | 0x84321e13b9bbc816, 248 | 0xfb3d6fb6ab2fdd8d, 249 | 0x60305eed8e160a8d, 250 | 0xcbbf4b14e9946ce8, 251 | 0x00004f63381b10c3, 252 | 0x07d5b7816fcc4e10, 253 | 0xe5a536726a6a8155, 254 | 0x57afb23447a07fdd, 255 | 0x18f346f7abc9d394, 256 | 0x636dc655d61ad33d, 257 | 0xcc8bab4939f7f3f6, 258 | 0x63c7a906c1dd187b, 259 | ]; 260 | -------------------------------------------------------------------------------- /src/gear_table/mod.rs: -------------------------------------------------------------------------------- 1 | mod gear32; 2 | mod gear64; 3 | 4 | pub use self::gear32::GEAR_32; 5 | pub use self::gear64::GEAR_64; 6 | -------------------------------------------------------------------------------- /src/gzip.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "gzip")] 2 | 3 | //! gzip (forks) rsyncable mode that uses a simple window accumulator 4 | //! 5 | //! **WARNING: not validated against gzip or rsyncrypto algorithms. Output may change to fix bugs** 6 | //! 7 | //! A widely distributed patch for gzip adds a `--rsyncable` flag which causes `gzip` to split it's 8 | //! input on "stable" boundaries. This module impliments the algorithm used in that patch. 9 | //! 10 | //! The same algorithm is used by the `rsyncrypto` project. 11 | //! 12 | //! - No maximum block size is provided. 13 | //! - No minimum block size is provided. 14 | //! 15 | //! PDF of block sizes: ??? 16 | //! 17 | //! Note that the defacto-standard parameters allow a slightly more efficient check for a block 18 | //! split (by replacing a modulus with a bitwise-and). This impl currently doesn't allow that 19 | //! 
optimization even if you provide appropriate parameters (we need type-level integers for that). 20 | //! 21 | //! Parameters: 22 | //! 23 | //! - window-len: The maximum number of bytes to be examined when deciding to split a block. 24 | //! set to 8192 by default in gzip-rsyncable & rsyncrypto) 25 | //! - modulus: set to half of window-len (so, 4096) in gzip-rsyncable & rsyncrypto. 26 | //! 27 | //! In-block state: 28 | //! - window of window-len bytes (use of the iterator interface means we also track more bytes than 29 | //! this) 30 | //! - sum (u64) 31 | //! 32 | //! Between-block state: 33 | //! 34 | //! - none 35 | //! 36 | //! References: 37 | //! 38 | //! - http://rsyncrypto.lingnu.com/index.php/Algorithm 39 | //! - https://www.samba.org/~tridge/phd_thesis.pdf 40 | //! 41 | //! S(n) = sum(c_i, var=i, top=n, bottom=n-8196) 42 | //! 43 | //! A(n) = S(n) / 8192 44 | //! 45 | //! H(n) = S(n) mod 4096 46 | //! 47 | //! Trigger splits when H(n) == 0 48 | 49 | use crate::{Chunk, ChunkIncr, ToChunkIncr}; 50 | use std::collections::VecDeque; 51 | use std::num::Wrapping; 52 | 53 | /// Parameters for defining the gzip rsyncable algorithm 54 | #[derive(Clone, Debug, PartialEq, Eq)] 55 | pub struct GzipRsyncable { 56 | /* 57 | * TODO: if we can avoid loading entire files into memory, this could be u64 58 | */ 59 | window_len: usize, 60 | modulus: u64, 61 | } 62 | 63 | impl GzipRsyncable { 64 | pub fn with_window_and_modulus(window: usize, modulus: u64) -> GzipRsyncable { 65 | Self { 66 | window_len: window, 67 | modulus, 68 | } 69 | } 70 | } 71 | 72 | impl Default for GzipRsyncable { 73 | fn default() -> Self { 74 | Self::with_window_and_modulus(8192, 4096) 75 | } 76 | } 77 | 78 | impl Chunk for GzipRsyncable { 79 | type SearchState = GzipRsyncableSearchState; 80 | 81 | fn to_search_state(&self) -> Self::SearchState { 82 | Self::SearchState::default() 83 | } 84 | 85 | fn find_chunk_edge( 86 | &self, 87 | state: &mut Self::SearchState, 88 | data: &[u8], 89 | ) -> 
(Option, usize) { 90 | for i in state.offset..data.len() { 91 | let v = data[i]; 92 | 93 | if state.state.add(data, self, i, v) { 94 | state.reset(); 95 | return (Some(i + 1), i + 1); 96 | } 97 | } 98 | 99 | // keep k elements = discard all but k 100 | let discard_ct = data.len().saturating_sub(self.window_len); 101 | state.offset = data.len() - discard_ct; 102 | (None, discard_ct) 103 | } 104 | } 105 | 106 | impl From<&GzipRsyncable> for GzipRsyncableIncr { 107 | fn from(src: &GzipRsyncable) -> Self { 108 | src.clone().into() 109 | } 110 | } 111 | 112 | impl ToChunkIncr for GzipRsyncable { 113 | type Incr = GzipRsyncableIncr; 114 | fn to_chunk_incr(&self) -> Self::Incr { 115 | self.into() 116 | } 117 | } 118 | 119 | #[derive(Debug, Default, Clone)] 120 | struct GzipRsyncableState { 121 | accum: Wrapping, 122 | } 123 | 124 | impl GzipRsyncableState { 125 | fn reset(&mut self) { 126 | self.accum.0 = 0; 127 | } 128 | } 129 | 130 | /// Intermediate state for [`GzipRsyncable::find_chunk_edge`] 131 | /// 132 | /// Using this avoids re-computation of data when no edge is found 133 | #[derive(Debug, Default, Clone)] 134 | pub struct GzipRsyncableSearchState { 135 | offset: usize, 136 | state: GzipRsyncableState, 137 | } 138 | 139 | impl GzipRsyncableSearchState { 140 | fn reset(&mut self) { 141 | self.offset = 0; 142 | self.state.reset(); 143 | } 144 | } 145 | 146 | /// Provides an incremental interface to [`GzipRsyncable`] 147 | /// 148 | /// Performance Note: [`GzipRsyncable`] requires look-back. As a result, [`GzipRsyncableIncr`] internally 149 | /// buffers data up to the window size. This additional copying may affect performance. If 150 | /// possible for your use case, use the non-incremental interface. 
151 | /// 152 | /// See [`GzipRsyncable`] for details on the underlying algorithm 153 | #[derive(Debug, Clone)] 154 | pub struct GzipRsyncableIncr { 155 | params: GzipRsyncable, 156 | 157 | accum: Wrapping, 158 | // really poor efficiency 159 | window: VecDeque, 160 | } 161 | 162 | impl GzipRsyncableIncr { 163 | fn reset(&mut self) { 164 | self.window.clear(); 165 | self.accum = Wrapping(0); 166 | } 167 | } 168 | 169 | impl From for GzipRsyncableIncr { 170 | fn from(params: GzipRsyncable) -> Self { 171 | let window = VecDeque::with_capacity(params.window_len); 172 | GzipRsyncableIncr { 173 | params, 174 | accum: Wrapping(0), 175 | window, 176 | } 177 | } 178 | } 179 | 180 | impl GzipRsyncableState { 181 | fn add(&mut self, data: &[u8], parent: &GzipRsyncable, i: usize, v: u8) -> bool { 182 | if i >= parent.window_len { 183 | self.accum -= Wrapping(data[i - parent.window_len] as u64); 184 | } 185 | self.accum += Wrapping(v as u64); 186 | (self.accum % Wrapping(parent.modulus)).0 == 0 187 | } 188 | } 189 | 190 | impl ChunkIncr for GzipRsyncableIncr { 191 | fn push(&mut self, data: &[u8]) -> Option { 192 | for (i, &v) in data.iter().enumerate() { 193 | if self.window.len() >= self.params.window_len { 194 | self.accum -= Wrapping(self.window.pop_front().unwrap() as u64); 195 | } 196 | 197 | self.accum += Wrapping(v as u64); 198 | self.window.push_back(v); 199 | 200 | if (self.accum % Wrapping(self.params.modulus)).0 == 0 { 201 | self.reset(); 202 | return Some(i + 1); 203 | } 204 | } 205 | 206 | None 207 | } 208 | } 209 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! hash-roll provides various content defined chunking algorithms 2 | //! 3 | //! Content defined chunking (CDC) algorithms are algorithms that examine a stream of input bytes (often 4 | //! 
represented as a slice like `[u8]`, and provide locations within that input stream to split or 5 | //! chunk the stream into parts. 6 | //! 7 | //! CDC algorithms generally try to optimize for the following: 8 | //! 9 | //! 1. Processing speed (ie: bytes/second) 10 | //! 2. Stability in split locations even when insertions/deletions of bytes occur 11 | //! 3. Reasonable distributions of chunk lengths 12 | //! 13 | //! ## API Concepts 14 | //! 15 | //! - Configured Algorithm Instance (impliments [`Chunk`]). Named plainly using the algorithm name 16 | //! (like [`Bup`]). These can be thought of as "parameters" for an algorithm. 17 | //! - Incrimental (impliments [`ChunkIncr`]). Normally named with `Incr` suffix. These are created 18 | //! using [`ToChunkIncr`] for a configured algorithm instance. 19 | //! 20 | //! Because of the various ways one might use a CDC, and the different CDC algorithm 21 | //! characteristics, hash-roll provides a few ways to use them. 22 | //! 23 | //! Configured Algorithm Instances are created from the set of configuration needed for a given 24 | //! algorithm. For example, this might mean configuring a window size or how to decide where to 25 | //! split. These don't include any mutable data, in other words: they don't keep track of what data 26 | //! is given to them. Configured Algorithm Instances provide the all-at-once APIs, as well as 27 | //! methods to obtain other kinds of APIs, like incrimental style apis. 28 | //! 29 | //! ```rust 30 | //! use hash_roll::ToChunkIncr; 31 | //! let algorithm_instance = hash_roll::mii::Mii::default(); 32 | //! let _incrimental_comp = algorithm_instance.to_chunk_incr(); 33 | //! ``` 34 | //! 35 | //! ## CDC Algorithms and Window Buffering 36 | //! 37 | //! Different CDC algorithms have different constraints about how they process data. Notably, some 38 | //! require a large amount of previous processed data to process additional data. This "large 39 | //! 
amount of previously processed data" is typically referred to as the "window". That said, note 40 | //! that some CDC algorithms that use a window concept don't need previously accessed data. 41 | //! 42 | //! For the window-buffering algorithms, their is an extra cost to certain types of API 43 | //! implimentations. The documentation will note when these occur and suggest alternatives. 44 | //! 45 | //! Generally, CDC interfaces that are incrimental will be slower for window-buffering algorithms. 46 | //! Using an explicitly allocating interface (which emits `Vec` or `Vec>`) will have no 47 | //! worse performance that the incrimental API, but might be more convenient. Using an all-at-once 48 | //! API will provide the best performance due to not requiring any buffering (the input data can be 49 | //! used directly). 50 | //! 51 | //! ## Use Cases that drive API choices 52 | //! 53 | //! - accumulate vecs, emits vecs 54 | //! - incrimental: yes 55 | //! - input: `Vec` 56 | //! - internal state: `Vec>` 57 | //! - output: `Vec>` 58 | //! 59 | //! - stream data through 60 | //! - incrimenal: yes 61 | //! - input: `&[u8]` 62 | //! 63 | //! - mmap (or read entire) file, emit 64 | //! - incrimenal: no 65 | //! - input: `&[u8]` 66 | //! - output: `&[u8]` 67 | 68 | // # API Design Notes 69 | // 70 | // ## What methods should be in a trait? What should be in wrapper structs? 71 | // 72 | // - place methods that might have more optimized variants, but can have common implimentations, 73 | // in a trait. This notably affects window-buffering differences: it's always possible to 74 | // impliment all-at-once processing using incrimental interfaces that internally buffer, but 75 | // it's much more efficient for window-buffering algorithms to provide implimentations that know 76 | // how to look into the input data directly. 77 | 78 | #![warn(rust_2018_idioms, missing_debug_implementations)] 79 | /* TODO: Rabin-Karp 80 | * H = c_1 * a ** (k-1) + c_2 * a ** (k-2) ... 
+ c_k * a ** 0 81 | * where: 82 | * a is a constant 83 | * c_1, ..., c_k are the input characters 84 | * 85 | * All math is done modulo n. Choice of n & a critical 86 | * 87 | * Parameters: 88 | * - n: mululo limit 89 | * - a: a constant 90 | * 91 | * State: 92 | * H 93 | * 94 | * Application: 95 | */ 96 | 97 | /* TODO: 98 | * rollsum of librsync 99 | */ 100 | 101 | use std::mem; 102 | 103 | pub mod bup; 104 | pub mod buzhash; 105 | pub mod buzhash_table; 106 | pub mod fastcdc; 107 | pub mod gear; 108 | pub mod gear_table; 109 | pub mod gzip; 110 | pub mod mii; 111 | pub mod pigz; 112 | pub mod ram; 113 | pub mod range; 114 | pub mod zpaq; 115 | pub mod zstd; 116 | 117 | pub(crate) use range::RangeExt; 118 | 119 | /// Accept incrimental input and provide indexes of split points 120 | /// 121 | /// Compared to [`Chunk`], [`ChunkIncr`] allows avoiding having to buffer all input data in memory, 122 | /// and avoids the need to use a single buffer for storing the input data (even if all data is in 123 | /// memory). 124 | /// 125 | /// Data fed into a given [`ChunkIncr`] instance is considered to be part of the same 126 | /// data "source". This affects chunking algorithms that maintain some state between chunks 127 | /// (like `ZstdRsyncable` does). If you have multiple "sources", one should obtain new instances of 128 | /// [`ChunkIncr`] for each of them (typically via [`ToChunkIncr`]). 129 | /// 130 | /// Note that for some splitting/chunking algorithms, the incrimental api will be less efficient 131 | /// compared to the non-incrimental API. 
In particular, algorithms like [`Rsyncable`] that require 132 | /// the use of previously examined data to shift their "window" (resulting in needing a circular 133 | /// buffer which all inputed data passes through) will perform more poorly using [`ChunkIncr`] 134 | /// compared with non-incrimental interfaces 135 | pub trait ChunkIncr { 136 | /// The data "contained" within a implimentor of this trait is the history of all data slices 137 | /// passed to feed. 138 | /// 139 | /// In other words, all previous data (or no previous data) may be used in determining the 140 | /// point to split. 141 | /// 142 | /// Returns None if the data has no split point. 143 | /// Otherwise, returns an index in the most recently passed `data`. 144 | /// 145 | /// Note that returning the index in the current slice makes most "look-ahead" splitting 146 | /// impossible (as it is permissible to pass 1 byte at a time). 147 | fn push(&mut self, data: &[u8]) -> Option; 148 | 149 | /// Given a [`ChunkIncr`] and a single slice, return a list of slices chunked by the chunker. 150 | /// 151 | /// Will always return enough slices to form the entire content of `data`, even if the trailing 152 | /// part of data is not a chunk (ie: does not end on a chunk boundary) 153 | fn iter_slices(self, data: &[u8]) -> IterSlices<'_, Self> 154 | where 155 | Self: std::marker::Sized, 156 | { 157 | IterSlices { 158 | rem: data, 159 | chunker: self, 160 | } 161 | } 162 | 163 | /// Given a [`ChunkIncr`] and a single slice, return a list of slices chunked by the chunker. 164 | /// Does not return the remainder (if any) in the iteration. Use [`IterSlices::take_rem()`] or 165 | /// [`IterSlices::into_parts()`] to get the remainder. 166 | /// 167 | /// Note that this is a non-incrimental interface. 
Calling this on an already fed chunker or using 168 | /// this multiple times on the same chunker may provide unexpected results 169 | fn iter_slices_strict(self, data: &[u8]) -> IterSlicesStrict<'_, Self> 170 | where 171 | Self: std::marker::Sized, 172 | { 173 | IterSlicesStrict { 174 | rem: data, 175 | chunker: self, 176 | } 177 | } 178 | } 179 | 180 | /// Returned by [`ChunkIncr::iter_slices_strict()`] 181 | /// 182 | /// Always emits _complete_ slices durring iteration. 183 | #[derive(Debug)] 184 | pub struct IterSlicesStrict<'a, C: ChunkIncr> { 185 | rem: &'a [u8], 186 | chunker: C, 187 | } 188 | 189 | impl<'a, C: ChunkIncr> IterSlicesStrict<'a, C> { 190 | /// Take the remainder from this iterator. Leaves an empty slice in it's place. 191 | pub fn take_rem(&mut self) -> &'a [u8] { 192 | let mut l: &[u8] = &[]; 193 | mem::swap(&mut self.rem, &mut l); 194 | l 195 | } 196 | 197 | /// Obtain the internals 198 | /// 199 | /// Useful, for example, after iteration stops to obtain the remaining slice. 200 | pub fn into_parts(self) -> (C, &'a [u8]) { 201 | (self.chunker, self.rem) 202 | } 203 | } 204 | 205 | impl<'a, C: ChunkIncr> Iterator for IterSlicesStrict<'a, C> { 206 | type Item = &'a [u8]; 207 | 208 | fn next(&mut self) -> Option { 209 | match self.chunker.push(self.rem) { 210 | None => None, 211 | Some(l) => { 212 | let (v, rn) = self.rem.split_at(l); 213 | self.rem = rn; 214 | Some(v) 215 | } 216 | } 217 | } 218 | } 219 | 220 | /// Returned by [`ChunkIncr::iter_slices()`] 221 | /// 222 | /// When it runs out of data, it returns the remainder as the last element of the iteration 223 | #[derive(Debug)] 224 | pub struct IterSlices<'a, C: ChunkIncr> { 225 | rem: &'a [u8], 226 | chunker: C, 227 | } 228 | 229 | impl<'a, C: ChunkIncr> IterSlices<'a, C> { 230 | /// Obtain the internals 231 | /// 232 | /// Useful, for example, after iteration stops to obtain the remaining slice. 
233 | pub fn into_parts(self) -> (C, &'a [u8]) { 234 | (self.chunker, self.rem) 235 | } 236 | } 237 | 238 | impl<'a, C: ChunkIncr> Iterator for IterSlices<'a, C> { 239 | type Item = &'a [u8]; 240 | 241 | fn next(&mut self) -> Option { 242 | if self.rem.is_empty() { 243 | return None; 244 | } 245 | 246 | match self.chunker.push(self.rem) { 247 | None => { 248 | let v = self.rem; 249 | self.rem = &[]; 250 | Some(v) 251 | } 252 | Some(l) => { 253 | let (v, rn) = self.rem.split_at(l); 254 | self.rem = rn; 255 | Some(v) 256 | } 257 | } 258 | } 259 | } 260 | 261 | /// Impl on algorthms that define methods of chunking data 262 | /// 263 | /// This is the lowest level (but somewhat restrictive) trait for chunking algorthms. It assumes 264 | /// that the input is provided to it in a contiguous slice. If you don't have your input as a 265 | /// contiguous slice, [`ChunkIncr`] may be a better choice (it allows non-contiguous input, but may 266 | /// be slowing for some chunking algorthms). 267 | pub trait Chunk { 268 | /// `SearchState` allows searching for the chunk edge to resume without duplicating work 269 | /// already done. 270 | type SearchState; 271 | 272 | /* 273 | /// Amount of data from already emitted chunks requried for determining future chunks 274 | /// 275 | /// Indicates the amount of data that _must_ be preserved for [`find_chunk_edge()`]'s 276 | /// `prev_data` argument. If more that this is passed, the last bytes in the slice are used. At 277 | /// the start of an input (where there is no previous data), an empty slice would be used. 278 | /// 279 | /// For most chunking algorithms, this is `0` (zero), indicating that `prev_data` may always be 280 | /// an empty slice. 281 | const CARRY_LEN: usize; 282 | */ 283 | 284 | /// Provide an initial [`SearchState`] for use with [`find_chunk_edge()`]. Generally, for each 285 | /// input one should generate a new [`SearchState`]. 
286 | fn to_search_state(&self) -> Self::SearchState; 287 | 288 | /// Find the next "chunk" in `data` to emit 289 | /// 290 | /// The return value is a pair of a range representing the start and end of the chunk being 291 | /// emitted, and the offset from which subsequent `data` subsets should be passed to the next 292 | /// call to `find_chunk_edge`. 293 | /// 294 | /// `state` is mutated so that it does not rexamine previously examined data, even when a chunk 295 | /// is not emitted. 296 | /// 297 | /// `data` may be extended with additional data between calls to `find_chunk_edge()`. The bytes 298 | /// that were _previously_ in `data` and are not indicated by `discard_ct` must be preserved in 299 | /// the next `data` buffer called. 300 | /// 301 | /// ```rust 302 | /// use hash_roll::Chunk; 303 | /// 304 | /// fn some_chunk() -> impl Chunk { 305 | /// hash_roll::mii::Mii::default() 306 | /// } 307 | /// 308 | /// let chunk = some_chunk(); 309 | /// let orig_data = b"hello"; 310 | /// let mut data = &orig_data[..]; 311 | /// let mut ss = chunk.to_search_state(); 312 | /// let mut prev_cut = 0; 313 | /// 314 | /// loop { 315 | /// let (chunk, discard_ct) = chunk.find_chunk_edge(&mut ss, data); 316 | /// 317 | /// match chunk { 318 | /// Some(cut_point) => { 319 | /// // map `cut_point` from the current slice back into the original slice so we can 320 | /// // have consistent indexes 321 | /// let g_cut = cut_point + orig_data.len() - data.len(); 322 | /// println!("chunk: {:?}", &orig_data[prev_cut..g_cut]); 323 | /// prev_cut = g_cut; 324 | /// }, 325 | /// None => { 326 | /// println!("no chunk, done with data we have"); 327 | /// println!("remain: {:?}", &data[discard_ct..]); 328 | /// break; 329 | /// } 330 | /// } 331 | /// 332 | /// data = &data[discard_ct..]; 333 | /// } 334 | /// ``` 335 | /// 336 | /// Note: call additional times on the same `SearchState` and the required `data` to obtain 337 | /// subsequent chunks in the same input data. 
To handle a seperate input, use a new 338 | /// `SearchState`. 339 | /// 340 | /// Note: calling with a previous `state` with a new `data` that isn't an extention of the 341 | /// previous `data` will result in split points that may not follow the design of the 342 | /// underlying algorithm. Avoid relying on consistent cut points to reason about memory safety. 343 | /// 344 | // NOTE: the reason that we preserve `state` even when chunks are emitted is that some 345 | // algorthims require some state to pass between chunks for a given input. zstd includes an 346 | // example of an algorithm that needs this 347 | // 348 | // Potential pitfal: for better performance, keeping the return value small is a very good 349 | // idea. By returning ~2x64+32, we are might be less performant depending on the ABI selected. 350 | // 351 | // Consider if result should return `(&[u8], &[u8])` instead of an index (which would then be 352 | // given to `.split_at()` 353 | // 354 | // Consider if `state` should have a `reset()` method to avoid reallocating 355 | // 356 | // API: 357 | // - `fn find_chunk_edge(&self, state: &mut Self::SearchState, data: &[u8]) -> (Option<(usize, uszie)>, usize); 358 | // - Problem: unclear what indexes of slices represent: start can't be in the data being 359 | // passed because we don't require `data` include the start of the chunk 360 | // - `fn find_chunk_edge(&self, state: &mut Self::SearchState, data: &[u8]) -> (Option, usize); 361 | // - Problem: user code to track indexing match up is somewhat difficult 362 | // - mostly due to needing an extra index to track to handle the "last chunk" location not 363 | // being the "slice we need to pass start" 364 | fn find_chunk_edge(&self, state: &mut Self::SearchState, data: &[u8]) 365 | -> (Option, usize); 366 | } 367 | 368 | /// Implimented on types which can be converted to/can provide a [`ChunkIncr`] interface. 
///
/// Types that implement this generally represent an instantiation of a chunking algorithm.
// NOTE: we use this instead of just having `From<&C: Chunk> for CI: ChunkIncr` because there is
// _one_ `ChunkIncr` for each `Chunk`, and rust can't infer that when using a `From` or `Into`
// bound.
//
// We could consider adding `type Incr` into `trait Chunk`, or only having `type Incr`
pub trait ToChunkIncr {
    /// `Incr` provides the incremental interface to this chunking instance
    type Incr: ChunkIncr;

    /// `to_chunk_incr()` returns a [`ChunkIncr`] which can be incrementally fed data and emits
    /// chunks.
    ///
    /// Generally, this is a typically low cost operation that copies from the implementor or does
    /// minor computation on its fields and may allocate some memory for storing additional state
    /// needed for incremental computation.
    fn to_chunk_incr(&self) -> Self::Incr;
}
-------------------------------------------------------------------------------- /src/mii.rs: --------------------------------------------------------------------------------
#![cfg(feature = "mii")]
use crate::{ChunkIncr, ToChunkIncr};

/// C. Zhang et al., "MII: A Novel Content Defined Chunking Algorithm for Finding Incremental Data
/// in Data Synchronization," in IEEE Access, vol. 7, pp. 86932-86945, 2019, doi:
/// 10.1109/ACCESS.2019.2926195.
7 | /// 8 | /// https://ieeexplore.ieee.org/abstract/document/8752387 9 | #[derive(Debug, Clone)] 10 | pub struct Mii { 11 | w: u64, 12 | } 13 | 14 | impl Mii { 15 | /// Create a new splitter with parameter `w` 16 | /// 17 | /// `w` is the number of "increments" (positive changes in byte value) after which we split the 18 | /// input 19 | /// 20 | // TODO: determine distribution and expected size of chunks 21 | // 22 | // 1: P(curr > prev) = 0 (prev set to 0xff) 23 | // 2: P(curr > prev) = 0.5 (prev and curr assumed to be randomly distributed) 24 | // 3: P(curr > prev) | t2 = ??? 25 | // P(curr > prev) | !t2 = ??? 26 | pub fn with_w(w: u64) -> Self { 27 | Self { w } 28 | } 29 | } 30 | 31 | impl Default for Mii { 32 | /// The window of 5 is used in the paper for the generated graphs 33 | /// 34 | /// It is compared against Rabin with a window of 7 and AE/LMC/RAM with a window of 700 35 | fn default() -> Self { 36 | Mii::with_w(5) 37 | } 38 | } 39 | 40 | impl crate::Chunk for Mii { 41 | type SearchState = MiiSearchState; 42 | 43 | fn to_search_state(&self) -> Self::SearchState { 44 | Into::::into(self).into() 45 | } 46 | 47 | fn find_chunk_edge( 48 | &self, 49 | state: &mut Self::SearchState, 50 | data: &[u8], 51 | ) -> (Option, usize) { 52 | match state.push(data) { 53 | Some(v) => { 54 | state.reset(); 55 | (Some(v), v) 56 | } 57 | None => (None, data.len()), 58 | } 59 | } 60 | } 61 | 62 | impl From<&Mii> for MiiIncr { 63 | fn from(src: &Mii) -> Self { 64 | src.clone().into() 65 | } 66 | } 67 | 68 | impl ToChunkIncr for Mii { 69 | type Incr = MiiIncr; 70 | 71 | fn to_chunk_incr(&self) -> Self::Incr { 72 | self.into() 73 | } 74 | } 75 | 76 | #[derive(Debug)] 77 | pub struct MiiSearchState { 78 | incr: MiiIncr, 79 | } 80 | 81 | impl MiiSearchState { 82 | fn reset(&mut self) { 83 | self.incr.reset(); 84 | } 85 | } 86 | 87 | impl From for MiiSearchState { 88 | fn from(incr: MiiIncr) -> Self { 89 | Self { incr } 90 | } 91 | } 92 | 93 | impl MiiSearchState { 94 | fn 
push(&mut self, data: &[u8]) -> Option { 95 | self.incr.push(data) 96 | } 97 | } 98 | 99 | #[derive(Debug)] 100 | pub struct MiiIncr { 101 | /// After this many increments, split the file 102 | w: u64, 103 | 104 | /// previous examined byte, if any 105 | prev: u8, 106 | 107 | /// number of times a byte was greater than the previous value 108 | increment: u64, 109 | } 110 | 111 | impl From for MiiIncr { 112 | fn from(p: Mii) -> Self { 113 | MiiIncr { 114 | w: p.w, 115 | // we use 0xff to ensure that the first examined byte does not trigger an increment 116 | prev: 0xff, 117 | increment: 0, 118 | } 119 | } 120 | } 121 | 122 | impl ChunkIncr for MiiIncr { 123 | fn push(&mut self, input: &[u8]) -> Option { 124 | for (i, b) in input.iter().cloned().enumerate() { 125 | if b > self.prev { 126 | self.increment += 1; 127 | if self.increment == self.w { 128 | // this is a split 129 | self.increment = 0; 130 | self.prev = 0; 131 | return Some(i + 1); 132 | } 133 | } else { 134 | self.increment = 0; 135 | } 136 | self.prev = b; 137 | } 138 | 139 | None 140 | } 141 | } 142 | 143 | impl MiiIncr { 144 | fn reset(&mut self) { 145 | self.prev = 0xff; 146 | self.increment = 0; 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /src/pigz.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "pigz")] 2 | use crate::{Chunk, ChunkIncr, ToChunkIncr}; 3 | 4 | #[derive(Clone, Debug, PartialEq, Eq)] 5 | pub struct PigzRsyncable { 6 | bits: u8, 7 | 8 | /// directly derived from `bits` 9 | mask: u32, 10 | /// directly derived from `mask` 11 | hit: u32, 12 | } 13 | 14 | impl PigzRsyncable { 15 | pub fn with_bits(bits: u8) -> PigzRsyncable { 16 | let mask = (1 << bits) - 1; 17 | let hit = mask >> 1; 18 | PigzRsyncable { bits, mask, hit } 19 | } 20 | } 21 | 22 | impl Default for PigzRsyncable { 23 | fn default() -> Self { 24 | Self::with_bits(12) 25 | } 26 | } 27 | 28 | impl Chunk for 
PigzRsyncable { 29 | type SearchState = PigzRsyncableSearchState; 30 | 31 | fn to_search_state(&self) -> Self::SearchState { 32 | self.into() 33 | } 34 | 35 | fn find_chunk_edge( 36 | &self, 37 | state: &mut Self::SearchState, 38 | data: &[u8], 39 | ) -> (Option, usize) { 40 | for (i, v) in data.iter().enumerate() { 41 | if state.state.add(self, *v) { 42 | *state = self.to_search_state(); 43 | return (Some(i + 1), i + 1); 44 | } 45 | } 46 | 47 | (None, data.len()) 48 | } 49 | } 50 | 51 | impl From<&PigzRsyncable> for PigzRsyncableIncr { 52 | fn from(src: &PigzRsyncable) -> Self { 53 | src.clone().into() 54 | } 55 | } 56 | 57 | impl ToChunkIncr for PigzRsyncable { 58 | type Incr = PigzRsyncableIncr; 59 | fn to_chunk_incr(&self) -> Self::Incr { 60 | self.into() 61 | } 62 | } 63 | 64 | #[derive(Debug, Clone)] 65 | struct PigzRsyncableState { 66 | hash: u32, 67 | } 68 | 69 | impl From<&PigzRsyncable> for PigzRsyncableState { 70 | fn from(params: &PigzRsyncable) -> Self { 71 | PigzRsyncableState { hash: params.hit } 72 | } 73 | } 74 | 75 | /// Intermediate state for [`PigzRsyncable::find_chunk_edge`] 76 | /// 77 | /// Using this avoids re-computation of data when no edge is found 78 | #[derive(Debug, Clone)] 79 | pub struct PigzRsyncableSearchState { 80 | state: PigzRsyncableState, 81 | } 82 | 83 | impl From<&PigzRsyncable> for PigzRsyncableSearchState { 84 | fn from(params: &PigzRsyncable) -> Self { 85 | PigzRsyncableSearchState { 86 | state: params.into(), 87 | } 88 | } 89 | } 90 | 91 | /// Provides an incremental interface to [`PigzRsyncable`] 92 | /// 93 | /// Performance Note: [`PigzRsyncable`] requires look-back. As a result, [`PigzRsyncableIncr`] internally 94 | /// buffers data up to the window size. This additional copying may affect performance. If 95 | /// possible for your use case, use the non-incremental interface. 
96 | /// 97 | /// See [`PigzRsyncable`] for details on the underlying algorithm 98 | #[derive(Debug, Clone)] 99 | pub struct PigzRsyncableIncr { 100 | params: PigzRsyncable, 101 | state: PigzRsyncableState, 102 | } 103 | 104 | impl PigzRsyncableIncr {} 105 | 106 | impl From for PigzRsyncableIncr { 107 | fn from(params: PigzRsyncable) -> Self { 108 | let state = (¶ms).into(); 109 | PigzRsyncableIncr { params, state } 110 | } 111 | } 112 | 113 | impl PigzRsyncableState { 114 | fn add(&mut self, parent: &PigzRsyncable, v: u8) -> bool { 115 | self.hash = ((self.hash << 1) ^ (v as u32)) & parent.mask; 116 | self.hash == parent.hit 117 | } 118 | } 119 | 120 | impl ChunkIncr for PigzRsyncableIncr { 121 | fn push(&mut self, data: &[u8]) -> Option { 122 | for (i, &v) in data.iter().enumerate() { 123 | if self.state.add(&self.params, v) { 124 | self.state = (&self.params).into(); 125 | return Some(i + 1); 126 | } 127 | } 128 | 129 | None 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /src/ram.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "ram")] 2 | 3 | //! Rapid Asymmetric Maximum (RAM) is a fast chunking algorithm 4 | //! 5 | //! - Has a minimum block size (it's "window" size) 6 | //! - Does not provide an upper bound on block size (though paper discusses a RAML variant that 7 | //! does). 8 | //! 9 | //! doi:10.1016/j.future.2017.02.013 10 | //! 
/// In-progress state for a single RAM chunk search.
#[derive(Default, Debug, PartialEq, Eq, Clone)]
pub struct RamState {
    /// global index (number of processed bytes since split)
    i: u64,

    // Largest byte value seen so far in the current chunk (the "maximum" in RAM's
    // asymmetric-maximum scheme).
    max_val: u8,
}

impl RamState {
    // Scan `data` for a chunk edge using the window size from `params`.
    //
    // Returns `Some(local_index)` — the index *into `data`* of the byte that triggered
    // the split (callers convert this to a chunk length; `Chunk::find_chunk_edge`
    // reports `local_index + 1`). Returns `None` when `data` is exhausted without an
    // edge.
    fn push(&mut self, params: &Ram, data: &[u8]) -> Option<usize> {
        let i = self.i;

        for (l_i, b) in data.iter().cloned().enumerate() {
            // A byte at least as large as every byte seen so far is a split candidate.
            if b >= self.max_val {
                // minimum block size
                let ri = l_i as u64 + i;
                if ri > params.w {
                    // Split: reset per-chunk state; bytes after `l_i` are not examined
                    // and must be resubmitted by the caller.
                    self.i = 0;
                    self.max_val = 0;
                    return Some(l_i);
                }

                self.max_val = b;
            }
        }

        // No edge in this buffer; remember how many bytes have been consumed.
        // NOTE(review): `self.i` only advances on the no-split path; this assumes the
        // caller resubmits the remainder of `data` after a split — confirm against
        // `RamIncr` callers.
        self.i += data.len() as u64;
        None
    }
}
use std::ops::Bound::{self, *};

/// Extra range queries used by the chunking parameter types.
pub trait RangeExt<T> {
    /// Is `other` past this range's end bound?
    fn exceeds_max(&self, other: &T) -> bool
    where
        T: PartialOrd<T>;

    /// Is `item` before this range's start bound?
    fn under_min(&self, item: &T) -> bool
    where
        T: PartialOrd<T>;

    /// Is `item` within the range (neither under the min nor over the max)?
    fn contains(&self, item: &T) -> bool
    where
        T: PartialOrd<T>;

    /// Convert into a concrete `(start, end)` bound pair.
    fn into_tuple(self) -> (std::ops::Bound<T>, std::ops::Bound<T>)
    where
        T: Copy + std::marker::Sized;
}

impl<T, R: std::ops::RangeBounds<T>> RangeExt<T> for R {
    fn exceeds_max(&self, item: &T) -> bool
    where
        T: PartialOrd<T>,
    {
        // Past the end: beyond an inclusive bound, or at/beyond an exclusive one.
        match self.end_bound() {
            Included(i) => item > i,
            Excluded(i) => item >= i,
            Unbounded => false,
        }
    }

    fn under_min(&self, item: &T) -> bool
    where
        T: PartialOrd<T>,
    {
        // Before the start: below an inclusive bound, or at/below an exclusive one.
        match self.start_bound() {
            Included(i) => item < i,
            Excluded(i) => item <= i,
            Unbounded => false,
        }
    }

    fn contains(&self, item: &T) -> bool
    where
        T: PartialOrd<T>,
    {
        // Inside the range means neither below the start nor past the end.
        !self.under_min(item) && !self.exceeds_max(item)
    }

    fn into_tuple(self) -> (std::ops::Bound<T>, std::ops::Bound<T>)
    where
        T: Copy + std::marker::Sized,
    {
        (
            bound_cloned(self.start_bound()),
            bound_cloned(self.end_bound()),
        )
    }
}

/// Map a `Bound<&T>` to a `Bound<T>` by cloning the contents of the bound.
///
/// # Examples
///
/// ```
/// use std::ops::Bound::*;
/// use std::ops::RangeBounds;
/// use hash_roll::range::bound_cloned;
///
/// assert_eq!((1..12).start_bound(), Included(&1));
/// assert_eq!(bound_cloned((1..12).start_bound()), Included(1));
/// ```
pub fn bound_cloned<T: Clone>(src: std::ops::Bound<&T>) -> std::ops::Bound<T> {
    match src {
        Bound::Included(x) => Bound::Included(x.clone()),
        Bound::Excluded(x) => Bound::Excluded(x.clone()),
        Bound::Unbounded => Bound::Unbounded,
    }
}
    /* these are based on the zpaq (not go-dedup) calculations */
    // Derive the `fragment` ("average size as a power of 2") parameter from a
    // block-size range, using the range's maximum — or guessing one from the minimum.
    fn fragment_ave_from_range<T: RangeBounds<u64>>(range: T) -> u8 {
        let v = match range.end_bound() {
            Bound::Included(i) => *i,
            Bound::Excluded(i) => *i - 1,
            Bound::Unbounded => {
                /* try to guess based on first */
                // With no upper bound, assume max = 64 * min — the same max/min ratio
                // `range_from_max()` uses.
                64 * match range.start_bound() {
                    Bound::Included(i) => *i,
                    Bound::Excluded(i) => *i + 1,
                    Bound::Unbounded => {
                        /* welp, lets use the default */
                        return 16;
                    }
                }
            }
        };

        Self::fragment_ave_from_max(v)
    }
    /**
     * Create a splitter using the defaults from Zpaq (the compressor) given an average size
     * formatted as a power of 2.
     *
     * Corresponds to zpaq's argument "-fragment".
     */
    pub fn with_average_size_pow_2(average_size_pow_2: u8) -> Self {
        // Min/max block sizes are derived from the average, zpaq-style.
        let r = Self::range_from_fragment_ave(average_size_pow_2);
        Self::with_average_and_range(average_size_pow_2, r)
    }
    // Decide whether to split after `index` bytes given the current rolling hash.
    //
    // Split when the hash falls below `max_hash` (the probabilistic edge condition),
    // provided the minimum block size has been reached — or unconditionally once the
    // maximum block size is exceeded.
    fn split_here(&self, hash: u32, index: u64) -> bool {
        (hash < self.max_hash && !self.range.under_min(&index)) || self.range.exceeds_max(&index)
    }
impl Chunk for Zpaq {
    type SearchState = ZpaqSearchState;

    fn to_search_state(&self) -> Self::SearchState {
        Default::default()
    }

    // Scan `data` for the next chunk edge, resuming from `state`.
    //
    // Returns `(Some(i + 1), i + 1)` when a split lands after `data[i]`; the search
    // state is reset so the next call starts a fresh chunk. Otherwise returns
    // `(None, data.len())`.
    fn find_chunk_edge(
        &self,
        state: &mut Self::SearchState,
        data: &[u8],
    ) -> (Option<usize>, usize) {
        for (i, v) in data.iter().enumerate() {
            let h = state.feed(*v);
            // NOTE(review): `feed()` already increments `state.idx`, so `state.idx + 1`
            // here is one larger than the value `ZpaqIncr::push()` passes to
            // `split_here()` for the same byte. One of the two is off by one at the
            // min/max block-size boundaries — confirm against zpaq 7.15 reference
            // output before changing either.
            if self.split_here(h, (state.idx + 1) as u64) {
                *state = self.to_search_state();
                return (Some(i + 1), i + 1);
            }
        }

        (None, data.len())
    }
}
| */ 278 | #[derive(Clone)] 279 | pub struct ZpaqHash { 280 | hash: Wrapping, 281 | last_byte: u8, 282 | predicted_byte: [u8; 256], 283 | } 284 | 285 | impl PartialEq for ZpaqHash { 286 | fn eq(&self, other: &Self) -> bool { 287 | self.hash == other.hash 288 | && self.last_byte == other.last_byte 289 | && self.predicted_byte[..] == other.predicted_byte[..] 290 | } 291 | } 292 | 293 | impl Eq for ZpaqHash {} 294 | 295 | impl fmt::Debug for ZpaqHash { 296 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { 297 | f.debug_struct("ZpaqHash") 298 | .field("hash", &self.hash) 299 | .field("last_byte", &self.last_byte) 300 | .field("predicted_byte", &fmt_extra::Hs(&self.predicted_byte[..])) 301 | .finish() 302 | } 303 | } 304 | 305 | impl Default for ZpaqHash { 306 | fn default() -> Self { 307 | ZpaqHash { 308 | hash: Wrapping(0), 309 | last_byte: 0, 310 | predicted_byte: [0; 256], 311 | } 312 | } 313 | } 314 | 315 | impl ZpaqHash { 316 | /* 317 | * we can only get away with this because Zpaq doesn't need to look at old data to make it's 318 | * splitting decision, it only examines it's state + current value (and the state is 319 | * relatively large, but isn't a window into past data). 320 | */ 321 | fn feed(&mut self, c: u8) -> u32 { 322 | self.hash = if c == self.predicted_byte[self.last_byte as usize] { 323 | (self.hash + Wrapping(c as u32) + Wrapping(1)) * Wrapping(314159265) 324 | } else { 325 | (self.hash + Wrapping(c as u32) + Wrapping(1)) * Wrapping(271828182) 326 | }; 327 | 328 | self.predicted_byte[self.last_byte as usize] = c; 329 | self.last_byte = c; 330 | self.hash.0 331 | } 332 | } 333 | -------------------------------------------------------------------------------- /src/zstd.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "zstd")] 2 | 3 | //! zstd's `--rsyncable` option performs content defined chunking 4 | //! 5 | //! 
//! This has been minimally validated to match the implementation from zstd, with the following
//! caveats:
//!
//! - Maximum chunk size is not implemented
//! - Only 1 test case with a single chunk edge (ie: 2 chunks) has been tested
//!
//! It uses an internal [rolling
//! hash](https://github.com/facebook/zstd/blob/01261bc8b6fcfc77801788f8b1e2a2e5dd2e8e25/lib/compress/zstd_compress_internal.h#L658-L698)
//! with 1 multiply and 2 additions. (see `ZSTD_rollingHash_append()` for core functionality).
//!
//! The rolling hash is then used by
//! [`findSynchronizationPoint()`](https://github.com/facebook/zstd/blob/15c5e200235edc520c1bd678ed126a6dd05736e1/lib/compress/zstdmt_compress.c#L1931-L2001)
//! in various ways to find "synchronization points" (ie: edges of chunks).
//!
//! [This issue thread comment](https://github.com/facebook/zstd/issues/1155#issuecomment-520258862) also
//! includes some explanation on the mechanism.
//!
//! The zstd code _does_ include in its context information about the _previous_ block that was
//! emitted. In other words: the rolling hash isn't "reset" on block emission. (Most chunking
//! algorithms are reset on block emission.)
    /*
     * ```notrust
    /* Aim for the targetsectionSize as the average job size. */
    U32 const jobSizeMB = (U32)(mtctx->targetSectionSize >> 20);
    U32 const rsyncBits = ZSTD_highbit32(jobSizeMB) + 20;
    assert(jobSizeMB >= 1);
    DEBUGLOG(4, "rsyncLog = %u", rsyncBits);
    mtctx->rsync.hash = 0;
    mtctx->rsync.hitMask = (1ULL << rsyncBits) - 1;
    mtctx->rsync.primePower = ZSTD_rollingHash_primePower(RSYNC_LENGTH);
    ```
    */
    /// Configure the splitter the same way zstdmt does for a given target (average)
    /// section size in bytes (see the quoted C above).
    ///
    /// # Panics
    ///
    /// Panics if `target_section_size >> 20` does not fit in `u32`, or if the target
    /// is smaller than 1 MiB (`job_size_mb` would be 0).
    pub fn with_target_section_size(target_section_size: u64) -> Self {
        let job_size_mb: u32 = (target_section_size >> 20).try_into().unwrap();
        assert_ne!(job_size_mb, 0);
        // `leading_zeros() ^ 31` equals `31 - leading_zeros()` for any u32 (31 is
        // 0b11111), i.e. the index of the highest set bit — C's ZSTD_highbit32().
        let rsync_bits = (job_size_mb.leading_zeros() ^ 31) + 20;
        let hit_mask = (1u64 << rsync_bits) - 1;
        // primePower = PRIME^(RSYNC_LENGTH - 1): the factor used to remove the oldest
        // byte's contribution when rotating the hash.
        let prime_power = PRIME_8_BYTES
            .0
            .wrapping_pow((RSYNC_LENGTH - 1).try_into().unwrap());
        Self {
            hit_mask,
            prime_power,
        }
    }
`ZSTD_rollingHash_append()` 91 | fn append(&mut self, data: &[u8]) { 92 | for i in data { 93 | self.hash *= PRIME_8_BYTES; 94 | self.hash += Wrapping(*i as u64) + ROLL_HASH_CHAR_OFFSET; 95 | } 96 | } 97 | 98 | // `ZSTD_rollingHash_rotate()` 99 | fn rotate(&mut self, to_remove: u8, to_add: u8, prime_power: u64) { 100 | self.hash -= (Wrapping(to_remove as u64) + ROLL_HASH_CHAR_OFFSET) * Wrapping(prime_power); 101 | self.hash *= PRIME_8_BYTES; 102 | self.hash += Wrapping(to_add as u64) + ROLL_HASH_CHAR_OFFSET; 103 | } 104 | 105 | fn at_split(&mut self, params: &Zstd) -> bool { 106 | (self.hash.0 & params.hit_mask) == params.hit_mask 107 | } 108 | } 109 | 110 | #[derive(Default, Debug, PartialEq, Eq)] 111 | pub struct ZstdSearchState { 112 | state: ZstdState, 113 | offset: usize, 114 | } 115 | 116 | impl ZstdSearchState { 117 | fn append(&mut self, data: &[u8]) { 118 | self.state.append(data); 119 | } 120 | 121 | fn rotate(&mut self, to_remove: u8, to_add: u8, prime_power: u64) { 122 | self.state.rotate(to_remove, to_add, prime_power); 123 | } 124 | 125 | fn at_split(&mut self, params: &Zstd) -> bool { 126 | self.state.at_split(params) 127 | } 128 | } 129 | 130 | /// Incrimental chunking using Zstd's rsyncable algorithm 131 | /// 132 | /// Performance note: Zstd's chunking requires buffer look back to remove previously inserted data, 133 | /// and as a result requires `ZstdIncr` to maintain an internal buffer. This internal buffer may 134 | /// reduce performance. 135 | #[derive(Debug, PartialEq, Eq)] 136 | pub struct ZstdIncr { 137 | params: Zstd, 138 | 139 | state: ZstdState, 140 | 141 | window: Box<[u8]>, 142 | // insert into the window at this offset 143 | window_offs: usize, 144 | // if true, we need to remove bytes from the window when inserting 145 | // 146 | // NOTE: by pre-filling `self.hash` with an appropriate value, we might be able to remove this 147 | // variable and always treat the window as full (of zeros initially). 
impl Chunk for Zstd {
    type SearchState = ZstdSearchState;

    fn to_search_state(&self) -> Self::SearchState {
        Self::SearchState::default()
    }

    // Scan `data` for a synchronization point.
    //
    // `state.offset` counts how many bytes have been absorbed to seed the rolling
    // hash; no split can be reported until `RSYNC_LENGTH` bytes are in the window.
    // The returned discard count always keeps the trailing `RSYNC_LENGTH` bytes,
    // which are still inside the rolling window.
    //
    // NOTE(review): unlike most chunkers in this crate, the hash state is *not* reset
    // at a split — this mirrors zstd's own behavior (see module docs) but confirm it
    // is intended here.
    fn find_chunk_edge(
        &self,
        state: &mut Self::SearchState,
        data: &[u8],
    ) -> (Option<usize>, usize) {
        if state.offset < RSYNC_LENGTH {
            // push some data in
            // (assumes callers only ever *grow* `data` between calls while seeding,
            // so `state.offset <= data.len()` — TODO confirm against callers)
            let seed_b = &data[state.offset..std::cmp::min(RSYNC_LENGTH, data.len())];
            state.append(seed_b);
            state.offset += seed_b.len();

            if state.offset < RSYNC_LENGTH {
                // not enough data
                return (None, 0);
            }
        }

        // TODO: track input_offs to split over-size blocks

        // we've got enough data, do rotations
        for i in state.offset..data.len() {
            let to_remove = data[i - RSYNC_LENGTH];
            let to_add = data[i];
            state.rotate(to_remove, to_add, self.prime_power);
            if state.at_split(self) {
                let discard_ct = data.len().saturating_sub(RSYNC_LENGTH);
                return (Some(i + 1), discard_ct);
            }
        }

        let discard_ct = data.len().saturating_sub(RSYNC_LENGTH);
        let keep_ct = data.len() - discard_ct;
        state.offset = keep_ct;
        (None, discard_ct)
    }
}
impl ChunkIncr for ZstdIncr {
    // Feed bytes into the splitter; returns the split offset within `data` when a
    // synchronization point is found.
    fn push(&mut self, data: &[u8]) -> Option<usize> {
        // Phase 1: fill the look-back window; the rolling hash is only seeded once
        // RSYNC_LENGTH bytes have been collected.
        let use_len = if !self.window_full {
            let use_len = std::cmp::min(self.window.len() - self.window_offs, data.len());
            self.window[self.window_offs..(self.window_offs + use_len)]
                .copy_from_slice(&data[..use_len]);
            self.window_offs += use_len;

            if self.window_offs != self.window.len() {
                // still not enough data to seed the hash
                return None;
            }

            self.window_full = true;
            self.window_offs = 0;
            self.state.append(&self.window[..]);
            use_len
        } else {
            0
        };

        // TODO: track input_offs to split over-size blocks

        // Phase 2: the window is a ring buffer (`window_offs` points at the oldest
        // byte); rotate each new byte through the hash.
        for (i, &v) in data[use_len..].iter().enumerate() {
            let to_remove = self.window[self.window_offs];
            let to_add = v;
            self.state
                .rotate(to_remove, to_add, self.params.prime_power);
            self.window[self.window_offs] = to_add;
            self.window_offs = (self.window_offs + 1) % self.window.len();

            if self.state.at_split(&self.params) {
                // NOTE: don't clear window — zstd carries hash context across splits
                // NOTE(review): sibling `ChunkIncr` impls return the index one past the
                // triggering byte (`i + 1`); this returns `i + use_len` — confirm the
                // intended convention (the in-crate tests ignore the returned value).
                return Some(i + use_len);
            }
        }

        None
    }
}
// Verify a chunker against known-good split sizes, exercising both the
// `Chunk::find_chunk_edge` interface and the byte-at-a-time `ChunkIncr` interface,
// and requiring the two to agree.
fn cut_test_sz<C: Chunk + ToChunkIncr>(
    seed: u128,
    size: usize,
    chunker: C,
    expected_splits: &[usize],
) {
    let buf = test_data(seed, size);

    // Note: this doesn't validate SearchState at all
    let mut splits = Vec::with_capacity(expected_splits.len());
    {
        let mut state = chunker.to_search_state();
        // `discard_idx`: start of the data still to be fed (earlier bytes were
        // consumed by the chunker); `last_chunk_idx`: global offset of the last edge.
        let mut discard_idx = 0;
        let mut last_chunk_idx = 0;
        loop {
            let b = &buf[discard_idx..];
            let (split_point, discard_ct) = chunker.find_chunk_edge(&mut state, b);
            match split_point {
                Some(split_point) => {
                    let split_point_global = discard_idx + split_point;
                    if last_chunk_idx > split_point_global {
                        panic!("last_chunk_idx: {}, split_point_global: {}, split_point: {}, discard_idx: {}",
                            last_chunk_idx, split_point_global, split_point, discard_idx);
                    }
                    let split_len = split_point_global - last_chunk_idx;
                    last_chunk_idx = split_point_global;
                    splits.push(split_len);
                }
                None => {
                    // no further edges in the remaining data
                    break;
                }
            }
            discard_idx += discard_ct;
        }
    }

    // Note: this is only basic equivalance checking via byte-at-a-time. More full equivalance
    // checking will be done via quickcheck tests.
    let mut incr_splits = Vec::with_capacity(expected_splits.len());
    {
        let mut incr = chunker.to_chunk_incr();
        let buf = &buf[..];
        let mut last_split = 0;
        for (i, v) in buf.iter().enumerate() {
            match incr.push(&[*v]) {
                Some(_split_point) => {
                    let sp = i + 1;
                    incr_splits.push(sp - last_split);
                    last_split = sp;
                }
                None => {}
            }
        }
    }

    // Both interfaces must agree, and must match the expected reference sizes.
    assert_eq!(&splits[..], &incr_splits[..]);
    assert_eq!(expected_splits, &splits[..]);
}
fragment sizes). 155 | // 156 | // cargo run --example generate-test-data 0 >test_data_0.bin 157 | // zpaq a foo.zpaq ~/p/hash-roll/test_data_0.bin -fragment 3 158 | cut_test( 159 | 0, 160 | hash_roll::zpaq::Zpaq::with_average_size_pow_2(13), 161 | &[10785, 6329, 1287, 860, 4716, 7419], 162 | ) 163 | } 164 | 165 | #[cfg(feature = "zpaq")] 166 | #[test] 167 | fn zpaq_cuts_3() { 168 | // These match edges from Zpaq 7.15 (with modification to print the fragment sizes). 169 | // 170 | // cargo run --example generate-test-data 3 >test_data_3.bin 171 | // zpaq a foo.zpaq ~/p/hash-roll/test_data_3.bin -fragment 3 172 | cut_test( 173 | 3, 174 | hash_roll::zpaq::Zpaq::with_average_size_pow_2(13), 175 | &[16353, 2334, 970, 5326, 1557], 176 | ) 177 | } 178 | 179 | #[cfg(feature = "pigz")] 180 | #[test] 181 | fn pigz_cuts_0() { 182 | cut_test( 183 | 0, 184 | hash_roll::pigz::PigzRsyncable::default(), 185 | &[9069, 1191, 3685, 8629, 2119, 2939], 186 | ) 187 | } 188 | 189 | /* 190 | * 0 191 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 0, tss: 8388608 -> (131072, 0) 192 | * 1 193 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 131072, tss: 8388608 -> (131072, 0) 194 | * 2 195 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 262144, tss: 8388608 -> (131072, 0) 196 | * 3 197 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 393216, tss: 8388608 -> (131072, 0) 198 | * 4 199 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 524288, tss: 8388608 -> (131072, 0) 200 | * 5 201 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 655360, tss: 8388608 -> (131072, 0) 202 | * 6 203 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 786432, tss: 8388608 -> (131072, 0) 204 | 7 205 | ../lib/compress/zstdmt_compress.c: 
findSynchronizationPoint: input: (0, 131072), inbf: 917504, tss: 8388608 -> (131072, 0) 206 | 8 207 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 1048576, tss: 8388608 -> (131072, 0) 208 | 9 209 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 1179648, tss: 8388608 -> (131072, 0) 210 | 10 211 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 1310720, tss: 8388608 -> (131072, 0) 212 | 11 213 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 1441792, tss: 8388608 -> (131072, 0) 214 | 12 215 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 1572864, tss: 8388608 -> (87647, 1) 216 | 13 217 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (87647, 131072), inbf: 0, tss: 8388608 -> (43425, 0) 218 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 43425, tss: 8388608 -> (131072, 0) 219 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 174497, tss: 8388608 -> (131072, 0) 220 | ../lib/compress/zstdmt_compress.c: findSynchronizationPoint: input: (0, 131072), inbf: 305569, tss: 8388608 -> (131072, 0) 221 | */ 222 | #[cfg(feature = "zstd")] 223 | #[test] 224 | fn zstd_cuts_0_2mb() { 225 | cut_test_sz( 226 | 0, 227 | 1024 * 1024 * 2, 228 | hash_roll::zstd::Zstd::default(), 229 | //&[1660511], 230 | &[12 * 131072 + 87647], 231 | ) 232 | } 233 | 234 | #[cfg(feature = "gear")] 235 | #[test] 236 | fn gear32_cuts_0() { 237 | cut_test(0, hash_roll::gear::Gear32::default(), &[11031, 7789, 10463]) 238 | } 239 | 240 | #[cfg(feature = "fastcdc")] 241 | #[test] 242 | fn fastcdc_cuts_incr_0() { 243 | cut_test_incr( 244 | 0, 245 | 8192 * 4, 246 | hash_roll::fastcdc::FastCdcIncr::default(), 247 | &[8463, 9933, 9029], 248 | ) 249 | } 250 | 251 | #[cfg(feature = "fastcdc")] 252 | #[test] 253 | fn 
fastcdc_cuts_0() { 254 | cut_test( 255 | 0, 256 | hash_roll::fastcdc::FastCdc::default(), 257 | &[8463, 9933, 9029], 258 | ) 259 | } 260 | 261 | #[cfg(feature = "ram")] 262 | #[test] 263 | fn ram_cuts_0() { 264 | cut_test(0, hash_roll::ram::Ram::with_w(8192), &[8264, 8368, 8341]) 265 | } 266 | -------------------------------------------------------------------------------- /tests/cuts_qc.rs: -------------------------------------------------------------------------------- 1 | // check the following are equivalent: 2 | // - find_chunk_edge() with 1 set of buffer sizes vs another set of buffer sizes 3 | // - incrimental with 1 set of buffer sizes vs another set of buffer sizes 4 | // - find_chunk_edge() vs incrimental 5 | // 6 | // - simd vs non-simd algorithms 7 | 8 | use hash_roll::Chunk; 9 | use proptest::prelude::*; 10 | 11 | fn splits_fce(chunker: &C, buf: &[u8], buf_sizes: &[usize]) -> Vec { 12 | let mut splits = Vec::new(); 13 | let mut i = 0; 14 | let mut ss = chunker.to_search_state(); 15 | let mut last_split_point = 0; 16 | let mut curr_discard = 0; 17 | let mut prev_buf_size = 0; 18 | loop { 19 | if buf.len() == curr_discard { 20 | break; 21 | } 22 | 23 | // use adative methods to ensure buf size grows 24 | let buf_size = buf_sizes[i % buf_sizes.len()] + prev_buf_size; 25 | i += 1; 26 | let buf_size = std::cmp::min(buf_size, buf.len() - curr_discard); 27 | 28 | let b = &buf[curr_discard..(buf_size + curr_discard)]; 29 | println!( 30 | "{{ PRE: curr_discard: {}, buf_size: {}", 31 | curr_discard, buf_size 32 | ); 33 | let (split, discard_ct) = chunker.find_chunk_edge(&mut ss, b); 34 | println!( 35 | "}} POST: discard_ct: {}, next_discard: {}", 36 | discard_ct, 37 | curr_discard + discard_ct 38 | ); 39 | 40 | match split { 41 | Some(split_point) => { 42 | // `split_point` is translated into the entire buffer (from the one passed to fce), 43 | // and the length is determined by tracking the previous split. 
44 | let split_point_global = curr_discard + split_point; 45 | let split_len = split_point_global - last_split_point; 46 | splits.push(split_len); 47 | last_split_point = split_point_global; 48 | prev_buf_size = 0; 49 | } 50 | None => { 51 | // at end of buffer without a split point 52 | if buf_size == (buf.len() - curr_discard) { 53 | break; 54 | } 55 | 56 | prev_buf_size = buf_size; 57 | } 58 | } 59 | 60 | curr_discard += discard_ct; 61 | println!("-- curr_discard = {}", curr_discard); 62 | } 63 | 64 | splits 65 | } 66 | 67 | proptest! { 68 | #[test] 69 | #[cfg(feature = "gzip")] 70 | fn gzip_fce_self_consistent_with_varying_buf_size( 71 | data in prop::collection::vec(0u8..=255u8, 0..10000), 72 | buf_sizes_1 in prop::collection::vec(1usize..5000, 1..10000), 73 | buf_sizes_2 in prop::collection::vec(1usize..5000, 1..10000)) 74 | { 75 | let chunker = hash_roll::gzip::GzipRsyncable::default(); 76 | let s1 = splits_fce(&chunker, &data[..], &buf_sizes_1[..]); 77 | let s2 = splits_fce(&chunker, &data[..], &buf_sizes_2[..]); 78 | assert_eq!(s1, s2); 79 | } 80 | 81 | #[test] 82 | #[cfg(feature = "mii")] 83 | fn mii_fce_self_consistent_with_varying_buf_size( 84 | data in prop::collection::vec(0u8..=255u8, 0..10000), 85 | buf_sizes_1 in prop::collection::vec(1usize..5000, 1..10000), 86 | buf_sizes_2 in prop::collection::vec(1usize..5000, 1..10000)) 87 | { 88 | let chunker = hash_roll::mii::Mii::default(); 89 | let s1 = splits_fce(&chunker, &data[..], &buf_sizes_1[..]); 90 | let s2 = splits_fce(&chunker, &data[..], &buf_sizes_2[..]); 91 | assert_eq!(s1, s2); 92 | } 93 | 94 | #[test] 95 | #[cfg(feature = "buzhash_big")] 96 | fn buzhash_fce_self_consistent_with_varying_buf_size( 97 | seed: u8, 98 | data in prop::collection::vec(0u8..=255u8, 0..10000), 99 | buf_sizes_1 in prop::collection::vec(1usize..5000, 1..10000), 100 | buf_sizes_2 in prop::collection::vec(1usize..5000, 1..10000)) 101 | { 102 | let chunker = hash_roll::buzhash::BuzHash::new_nom(seed); 103 | let s1 = 
splits_fce(&chunker, &data[..], &buf_sizes_1[..]); 104 | let s2 = splits_fce(&chunker, &data[..], &buf_sizes_2[..]); 105 | assert_eq!(s1, s2); 106 | } 107 | 108 | #[test] 109 | #[cfg(feature = "buzhash")] 110 | fn buzhash_short_fce_self_consistent_with_varying_buf_size( 111 | data in prop::collection::vec(0u8..=255u8, 0..100), 112 | buf_sizes_1 in prop::collection::vec(1usize..100, 1..100), 113 | buf_sizes_2 in prop::collection::vec(1usize..100, 1..100)) 114 | { 115 | let chunker = hash_roll::buzhash::BuzHash::new( 116 | 7, 117 | (1 << 4u32) - 1, 118 | hash_roll::buzhash::BuzHashTableByteSaltHash::from((0, &hash_roll::buzhash_table::GO_BUZHASH)), 119 | 1 << 10, 120 | ); 121 | let s1 = splits_fce(&chunker, &data[..], &buf_sizes_1[..]); 122 | let s2 = splits_fce(&chunker, &data[..], &buf_sizes_2[..]); 123 | assert_eq!(s1, s2); 124 | } 125 | 126 | #[test] 127 | #[cfg(feature = "zpaq")] 128 | fn zpaq_fce_self_consistent_with_varying_buf_size( 129 | data in prop::collection::vec(0u8..=255u8, 0..10000), 130 | buf_sizes_1 in prop::collection::vec(1usize..5000, 1..10000), 131 | buf_sizes_2 in prop::collection::vec(1usize..5000, 1..10000)) 132 | { 133 | let chunker = hash_roll::zpaq::Zpaq::with_average_size_pow_2(13); 134 | let s1 = splits_fce(&chunker, &data[..], &buf_sizes_1[..]); 135 | let s2 = splits_fce(&chunker, &data[..], &buf_sizes_2[..]); 136 | assert_eq!(s1, s2); 137 | } 138 | 139 | #[test] 140 | #[cfg(feature = "pigz")] 141 | fn pigz_fce_self_consistent_with_varying_buf_size( 142 | data in prop::collection::vec(0u8..=255u8, 0..10000), 143 | buf_sizes_1 in prop::collection::vec(1usize..5000, 1..10000), 144 | buf_sizes_2 in prop::collection::vec(1usize..5000, 1..10000)) 145 | { 146 | let chunker = hash_roll::pigz::PigzRsyncable::default(); 147 | let s1 = splits_fce(&chunker, &data[..], &buf_sizes_1[..]); 148 | let s2 = splits_fce(&chunker, &data[..], &buf_sizes_2[..]); 149 | assert_eq!(s1, s2); 150 | } 151 | 152 | #[test] 153 | #[cfg(feature = "bup")] 154 | fn 
bup_fce_self_consistent_with_varying_buf_size( 155 | data in prop::collection::vec(0u8..=255u8, 0..10000), 156 | buf_sizes_1 in prop::collection::vec(1usize..5000, 1..10000), 157 | buf_sizes_2 in prop::collection::vec(1usize..5000, 1..10000)) 158 | { 159 | let chunker = hash_roll::bup::RollSum::default(); 160 | let s1 = splits_fce(&chunker, &data[..], &buf_sizes_1[..]); 161 | let s2 = splits_fce(&chunker, &data[..], &buf_sizes_2[..]); 162 | assert_eq!(s1, s2); 163 | } 164 | 165 | #[test] 166 | #[cfg(feature = "zstd")] 167 | fn zstd_fce_self_consistent_with_varying_buf_size( 168 | data in prop::collection::vec(0u8..=255u8, 0..100000), 169 | buf_sizes_1 in prop::collection::vec(1usize..5000, 1..10000), 170 | buf_sizes_2 in prop::collection::vec(1usize..5000, 1..10000)) 171 | { 172 | let chunker = hash_roll::zstd::Zstd::default(); 173 | let s1 = splits_fce(&chunker, &data[..], &buf_sizes_1[..]); 174 | let s2 = splits_fce(&chunker, &data[..], &buf_sizes_2[..]); 175 | assert_eq!(s1, s2); 176 | } 177 | 178 | #[test] 179 | #[cfg(feature = "gear")] 180 | fn gear_fce_self_consistent_with_varying_buf_size( 181 | data in prop::collection::vec(0u8..=255u8, 0..100000), 182 | buf_sizes_1 in prop::collection::vec(1usize..5000, 1..10000), 183 | buf_sizes_2 in prop::collection::vec(1usize..5000, 1..10000)) 184 | { 185 | let chunker = hash_roll::gear::Gear32::default(); 186 | let s1 = splits_fce(&chunker, &data[..], &buf_sizes_1[..]); 187 | let s2 = splits_fce(&chunker, &data[..], &buf_sizes_2[..]); 188 | assert_eq!(s1, s2); 189 | } 190 | 191 | #[test] 192 | #[cfg(feature = "fastcdc")] 193 | fn fastcdc_fce_self_consistent_with_varying_buf_size( 194 | data in prop::collection::vec(0u8..=255u8, 0..100000), 195 | buf_sizes_1 in prop::collection::vec(1usize..5000, 1..10000), 196 | buf_sizes_2 in prop::collection::vec(1usize..5000, 1..10000)) 197 | { 198 | let chunker = hash_roll::fastcdc::FastCdc::default(); 199 | let s1 = splits_fce(&chunker, &data[..], &buf_sizes_1[..]); 200 | let s2 
= splits_fce(&chunker, &data[..], &buf_sizes_2[..]); 201 | assert_eq!(s1, s2); 202 | } 203 | 204 | #[test] 205 | #[cfg(feature = "ram")] 206 | fn ram_fce_self_consistent_with_varying_buf_size( 207 | data in prop::collection::vec(0u8..=255u8, 0..100000), 208 | buf_sizes_1 in prop::collection::vec(1usize..5000, 1..10000), 209 | buf_sizes_2 in prop::collection::vec(1usize..5000, 1..10000)) 210 | { 211 | let chunker = hash_roll::ram::Ram::with_w(8192); 212 | let s1 = splits_fce(&chunker, &data[..], &buf_sizes_1[..]); 213 | let s2 = splits_fce(&chunker, &data[..], &buf_sizes_2[..]); 214 | assert_eq!(s1, s2); 215 | } 216 | } 217 | -------------------------------------------------------------------------------- /tests/fastcdc.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "fastcdc")] 2 | 3 | use hash_roll::fastcdc::FastCdcIncr; 4 | use hash_roll::ChunkIncr; 5 | use rand_pcg::Pcg64; 6 | 7 | #[derive(Debug, Clone, PartialEq, Eq)] 8 | struct Vec8K { 9 | data: Vec, 10 | } 11 | 12 | impl quickcheck::Arbitrary for Vec8K { 13 | fn arbitrary(g: &mut G) -> Self { 14 | // FIXME: the intention is to raise this >8KB, but that makes the tests take far too 15 | // long to run. 
16 | let l = 1 * 1024 + g.size(); 17 | 18 | let mut d = vec![0; l]; 19 | 20 | g.fill_bytes(&mut d[..]); 21 | 22 | Vec8K { data: d } 23 | } 24 | 25 | fn shrink(&self) -> Box> { 26 | // use the normal Vec shrinkers 27 | let chain = self.data.shrink().map(|x| Vec8K { data: x }); 28 | Box::new(chain) 29 | } 30 | } 31 | 32 | fn oracle_1(d: Vec8K) -> bool { 33 | let mut cdc = FastCdcIncr::default(); 34 | let v1 = fast_cdc_8kb(&d.data[..]); 35 | let v2 = cdc.push(&d.data[..]); 36 | 37 | v1 == v2.unwrap_or(0) 38 | } 39 | 40 | fn oracle_1_test(data: &[u8]) { 41 | let mut cdc = FastCdcIncr::default(); 42 | let v1 = fast_cdc_8kb(&data[..]); 43 | let v2 = cdc.push(&data[..]).unwrap_or(0); 44 | assert_eq!(v1, v2); 45 | } 46 | 47 | #[test] 48 | fn o1_empty() { 49 | oracle_1_test(&vec![0]); 50 | } 51 | 52 | #[test] 53 | fn o1_qc() { 54 | quickcheck::quickcheck(oracle_1 as fn(Vec8K) -> bool); 55 | } 56 | 57 | fn o1_8k_seed(state: u128) { 58 | use rand::RngCore; 59 | let l = 8 * 1024 * 1024 + 1; 60 | let mut d = Vec::with_capacity(l); 61 | let c = d.capacity(); 62 | unsafe { d.set_len(c) }; 63 | println!("seed: {:#x}", state); 64 | println!("len: {}", c); 65 | let mut rng = Pcg64::new(state, 0xa02bdbf7bb3c0a7ac28fa16a64abf96); 66 | for _ in 0..10 { 67 | rng.fill_bytes(&mut d); 68 | oracle_1_test(&d); 69 | } 70 | } 71 | 72 | #[test] 73 | fn o1_8k1() { 74 | let state: u128 = ::rand::random(); 75 | o1_8k_seed(state); 76 | } 77 | 78 | #[test] 79 | fn o1_8k_t1() { 80 | o1_8k_seed(0x6362eca4ca113c1bd10d40b8b10e9ad4); 81 | } 82 | 83 | #[test] 84 | fn o1_8k_t2() { 85 | o1_8k_seed(0x22e622e48004575fe4229bf0da6341c9); 86 | } 87 | 88 | #[test] 89 | fn feed_until_5_chunks() { 90 | use rand::RngCore; 91 | let mut cdc = FastCdcIncr::default(); 92 | let mut ct = 0; 93 | let mut rng = ::rand::thread_rng(); 94 | let mut d = [0u8; 256]; 95 | rng.fill_bytes(&mut d); 96 | loop { 97 | rng.fill_bytes(&mut d); 98 | let mut data = &d[..]; 99 | loop { 100 | let p = cdc.push(&data[..]); 101 | println!("p: 
{:?}, cdc: {:?}", p, cdc); 102 | 103 | if p == None || p.unwrap() == data.len() { 104 | break; 105 | } else { 106 | ct += 1; 107 | if ct > 5 { 108 | return; 109 | } 110 | data = &data[p.unwrap()..]; 111 | } 112 | } 113 | } 114 | } 115 | 116 | /// A 1-buffer implimentation of FastCDC8KB designed to match the reference pseudocode 117 | fn fast_cdc_8kb(src: &[u8]) -> usize { 118 | use hash_roll::gear_table::GEAR_64; 119 | use std::num::Wrapping; 120 | // these masks are taken from the paper and could be adjusted/adjustable. 121 | const MASK_S: u64 = 0x0003590703530000; 122 | //const MASK_A: u64 = 0x0000d90303530000; 123 | const MASK_L: u64 = 0x0000d90003530000; 124 | const MIN_SIZE: u64 = 2 * 1024; // 2KB 125 | const MAX_SIZE: u64 = 64 * 1024; // 64KB 126 | const NORMAL_SIZE: u64 = 8 * 1024; // 8KB 127 | 128 | let mut fp = Wrapping(0); 129 | let mut n = src.len(); 130 | let mut normal_size = NORMAL_SIZE as usize; 131 | if n <= (MIN_SIZE as usize) { 132 | // Diverge from the reference here: 133 | // return 0 to indicate no split found rather than src.len() 134 | return 0; 135 | } 136 | 137 | if n >= (MAX_SIZE as usize) { 138 | n = MAX_SIZE as usize; 139 | } else if n <= normal_size { 140 | normal_size = n; 141 | } 142 | 143 | for i in (MIN_SIZE as usize)..normal_size { 144 | fp = (fp << 1) + Wrapping(GEAR_64[src[i] as usize]); 145 | if (fp.0 & MASK_S) == 0 { 146 | return i; 147 | } 148 | } 149 | 150 | for i in normal_size..n { 151 | fp = (fp << 1) + Wrapping(GEAR_64[src[i] as usize]); 152 | if (fp.0 & MASK_L) == 0 { 153 | return i; 154 | } 155 | } 156 | 157 | // Diverge from the reference here: 158 | // return MAX_SIZE when we've gotten to MAX_SIZE 159 | // return 0 to indicate no split found rather than src.len() 160 | if n == MAX_SIZE as usize { 161 | n 162 | } else { 163 | 0 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /tests/oracle_zpaq.rs: 
-------------------------------------------------------------------------------- 1 | #![cfg(feature = "zpaq-broken")] 2 | // cdchunking doesn't impliment zpaq exactly correctly 3 | 4 | use hash_roll::{ChunkIncr, ToChunkIncr}; 5 | use quickcheck::quickcheck; 6 | use rand::RngCore; 7 | use rand_pcg::Pcg64; 8 | 9 | fn test_data(seed: u128, size: usize) -> Vec { 10 | let mut fill_rng = Pcg64::new(seed, 0xa02bdbf7bb3c0a7ac28fa16a64abf96); 11 | let mut buf = vec![0u8; size]; 12 | fill_rng.fill_bytes(&mut buf); 13 | buf 14 | } 15 | 16 | quickcheck! { 17 | fn zpaq_eq_cdchunking(xs: Vec) -> bool { 18 | let m1 = hash_roll::zpaq::Zpaq::with_average_and_range_and_m(13, .., 123_456_791, 123_456_791 * 2); 19 | let m2 = cdchunking::Chunker::new(cdchunking::ZPAQ::new(13)); 20 | 21 | let mut i1 = m1.to_chunk_incr().iter_slices(&xs); 22 | let mut i2 = m2.slices(&xs); 23 | 24 | loop { 25 | let v1 = i1.next(); 26 | let v2 = i2.next(); 27 | 28 | if v1 != v2 { 29 | return false; 30 | } 31 | 32 | if v1.is_none() { 33 | return true; 34 | } 35 | } 36 | } 37 | } 38 | 39 | fn c(xs: &[u8]) { 40 | let m1 = 41 | hash_roll::zpaq::Zpaq::with_average_and_range_and_m(13, .., 123_456_791, 123_456_791 * 2); 42 | let m2 = cdchunking::Chunker::new(cdchunking::ZPAQ::new(13)); 43 | 44 | let mut i1 = m1.to_chunk_incr().iter_slices(&xs); 45 | let mut i2 = m2.slices(&xs); 46 | 47 | let mut i = 0; 48 | loop { 49 | let v1 = i1.next(); 50 | let v2 = i2.next(); 51 | 52 | if v1 != v2 { 53 | panic!("i: {}, hr_v: {:?} != cdc_v : {:?}", i, v1, v2); 54 | } 55 | 56 | if v1.is_none() { 57 | break; 58 | } 59 | 60 | i += 1; 61 | } 62 | } 63 | 64 | #[test] 65 | fn zpaq_cdchunking_cuts() { 66 | let buf = test_data(0, 8192 * 4); 67 | let m: Vec = cdchunking::Chunker::new(cdchunking::ZPAQ::new(13)) 68 | .slices(&buf) 69 | .map(|v| v.len()) 70 | .collect(); 71 | assert_eq!(&m[..], &[10785, 6329, 1287, 860, 4716, 7419],); 72 | } 73 | 74 | mod oracle_zpaq { 75 | use super::c; 76 | #[test] 77 | fn t1() { 78 | c(&[ 79 | 25, 5, 
82, 84, 53, 94, 27, 24, 98, 47, 7, 7, 6, 34, 60, 98, 20, 64, 17, 5, 62, 40, 94, 80 | 79, 33, 1, 0, 81 | ]) 82 | } 83 | 84 | #[test] 85 | fn t2() { 86 | c(&[ 87 | 25, 5, 82, 84, 53, 94, 27, 24, 98, 47, 7, 7, 6, 34, 60, 98, 20, 64, 17, 5, 62, 40, 94, 88 | 79, 33, 1, 89 | ]) 90 | } 91 | 92 | #[test] 93 | fn t3() { 94 | c(&[ 95 | 25, 5, 82, 84, 53, 94, 27, 24, 98, 47, 7, 7, 6, 34, 60, 98, 20, 64, 17, 5, 62, 40, 94, 96 | 79, 33, 0, 0, 97 | ]) 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /tests/qc_bh.rs: -------------------------------------------------------------------------------- 1 | /* 2 | extern crate hash_roll; 3 | #[macro_use] 4 | extern crate quickcheck; 5 | */ 6 | 7 | /* 8 | #[derive(Debug,Clone,PartialEq,Eq)] 9 | struct Fma { 10 | data: Vec, 11 | msize: usize, 12 | moffs: usize, 13 | } 14 | 15 | impl quickcheck::Arbitrary for Fma { 16 | fn arbitrary(g: &mut G) -> Self { 17 | // lenght at least 1 18 | let d = { 19 | let mut x = g.gen(); 20 | while x.len() == 0 { 21 | x = g.gen(); 22 | } 23 | }; 24 | 25 | // 1 to d.len() 26 | let s = if d.len() == 1 { 27 | 1 28 | } else { 29 | (g.gen() % (d.len() - 1)) + 1 30 | }; 31 | 32 | // 0 to (d.len() - s) 33 | let o = if d.len() - s == 0 { 34 | 0 35 | } else { 36 | g.gen() % (d.len() - s) 37 | }; 38 | 39 | 40 | Fma { 41 | data: d, 42 | msize: s, 43 | moffs: o, 44 | } 45 | } 46 | 47 | fn shrink(&self) -> Box> { 48 | 49 | } 50 | } 51 | */ 52 | 53 | /* 54 | quickcheck! 
{ 55 | // choose a substring of `data` and use buzhash to find it 56 | fn find_match(data: Vec, size: usize, offs: usize) -> bool { 57 | // d.len() > 0 58 | if data.len() == 0 { 59 | return true 60 | } 61 | // 1..d.len() 62 | let size = if size == 0 { 63 | 1 64 | } else { 65 | if data.len() == 1 { 66 | 1 67 | } else { 68 | (size % (data.len() - 1)) + 1 69 | } 70 | }; 71 | 72 | 73 | let offs = if offs == 0 { 74 | 0 75 | } else { 76 | if data.len() - size == 0 { 77 | 0 78 | } else { 79 | offs % (data.len() - size) 80 | } 81 | }; 82 | let ms = &data[offs..(offs+size)]; 83 | println!("size: {}, offs: {}", size, offs); 84 | let mut b = ::hash_roll::buzhash::BuzHashBuf::with_capacity(size); 85 | let mut b2 = b.clone(); 86 | 87 | b.push(ms); 88 | let h = b.hash(); 89 | 90 | let mut d = &data[..]; 91 | loop { 92 | let f = b2.find_match(h, &d[..]); 93 | if f == 0 { 94 | return false 95 | } 96 | 97 | if f >= offs + size { 98 | return false; 99 | } 100 | 101 | if f > size && (&data[(f-size)..f] == ms) { 102 | return true; 103 | } 104 | 105 | d = &d[f..]; 106 | } 107 | } 108 | } 109 | */ 110 | -------------------------------------------------------------------------------- /tests/qc_bup.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "bup")] 2 | use quickcheck::quickcheck; 3 | 4 | use hash_roll::ChunkIncr; 5 | 6 | quickcheck! 
{
    // One-shot: hash-roll's RollSum must report the same first chunk edge as
    // the upstream `rollsum` crate.
    fn simple_eq(xs: Vec<u8>) -> bool {
        let mut m1 = hash_roll::bup::RollSumIncr::default();
        let mut m2 = rollsum::Bup::default();

        let v1 = m1.push(&xs);
        let v2 = m2.find_chunk_edge(&xs);

        v1 == v2.map(|x| x.0)
    }

    // Iterated: both implementations must agree on every successive edge.
    fn iter_eq(xs: Vec<u8>) -> bool {
        let mut m1 = hash_roll::bup::RollSumIncr::default();
        let mut m2 = rollsum::Bup::default();

        let mut x = &xs[..];
        loop {
            let v1 = m1.push(&x);
            let v2 = m2.find_chunk_edge(&x);

            if v1 != v2.map(|x| x.0) {
                return false
            }

            // clippy: `v1 == None` -> `is_none()`
            if v1.is_none() {
                return true
            }

            let v1 = v1.unwrap();

            x = &x[v1..];
            // clippy: `x.len() == 0` -> `is_empty()`
            if x.is_empty() {
                return true
            }
        }
    }
}

/// Assert hash-roll and `rollsum` agree on the first chunk edge of `x`.
fn chk_a(x: &[u8]) {
    let mut m1 = hash_roll::bup::RollSumIncr::default();
    let mut m2 = rollsum::Bup::default();

    let v1 = m1.push(&x);
    let v2 = m2.find_chunk_edge(&x);

    assert_eq!(v1, v2.map(|x| x.0));
}

/// Byte-at-a-time: digests and split decisions must match at every position.
fn chk_b(x: &[u8]) {
    use rollsum::Engine;
    let mut m1 = hash_roll::bup::RollSumIncr::default();
    let mut m2 = rollsum::Bup::default();
    let cm = (1 << rollsum::bup::CHUNK_BITS) - 1;

    for (i, &v) in x.iter().enumerate() {
        m1.roll_byte(v);
        m2.roll_byte(v);
        println!("i={}, v={}", i, v);
        assert_eq!(m1.digest(), m2.digest());
        assert_eq!(m1.at_split(), (m2.digest() & cm) == cm);
    }
}

#[test]
fn simple_eq_1() {
    chk_a(&[
        92, 6, 28, 35, 68, 82, 35, 71, 34, 19, 9, 45, 97, 17, 11, 6, 53, 39, 93, 49, 29, 17, 37, 6,
        39,
    ]);
}

#[test]
fn simple_eq_1b() {
    chk_b(&[
        92, 6, 28, 35, 68, 82, 35, 71, 34, 19, 9, 45, 97, 17, 11, 6, 53, 39, 93, 49, 29, 17, 37, 6,
        39,
    ]);
}

#[test]
fn simple_eq_2() {
    chk_a(&[67, 3, 23, 73, 86, 64, 26, 25, 81, 53, 26, 82, 98, 86, 28]);
}

#[test]
fn simple_eq_3() {
    chk_a(&[
        40, 58, 57, 0, 16, 2, 32, 88, 0, 22, 23, 74, 90, 88, 95, 99, 86,
    ]);
}
-------------------------------------------------------------------------------- /tests/rsyncable.rs: --------------------------------------------------------------------------------
#![cfg(feature = "rsyncable")]
use hash_roll::gzip::GzipRsyncable;
use hash_roll::Splitter;

#[test]
fn test_rsyncable() {
    use std::collections::HashSet;

    let d1 = b"hello, this is some bytes";
    let mut d2 = d1.clone();
    // flip one byte so the two streams differ mid-way
    // (clippy: `':' as u8` -> byte literal; same value)
    d2[4] = b':';

    let b1 = GzipRsyncable::with_window_and_modulus(4, 8).into_vecs(d1.iter().cloned());
    let b2 = GzipRsyncable::with_window_and_modulus(4, 8).into_vecs(d2.iter().cloned());

    let c1 = b1.clone().count();
    let c2 = b2.clone().count();

    /* XXX: in this contrived case, we generate the same number of blocks.
     * We should generalize this test to guess at "reasonable" differences in block size
     */
    assert_eq!(c1, 4);
    // `abs() < 1` effectively demands c1 == c2; kept as written
    assert!((c1 as i64 - c2 as i64).abs() < 1);

    /* check that some blocks match up */

    let mut blocks = HashSet::with_capacity(c1);
    let mut common_in_b1 = 0u64;
    for b in b1 {
        if !blocks.insert(b) {
            common_in_b1 += 1;
        }
    }

    println!("common in b1: {}", common_in_b1);

    let mut shared_blocks = 0u64;
    for b in b2 {
        if blocks.contains(&b) {
            shared_blocks += 1;
        }
    }

    /* XXX: this is not a generic test, we can't rely on it */
    println!("shared blocks: {}", shared_blocks);
    assert!(shared_blocks > (c1 as u64) / 2);
}