├── .gitignore
├── .travis.yml
├── Cargo.toml
├── LICENSE-APACHE
├── LICENSE-MIT
├── README.md
└── src
├── bwt
├── dc.rs
├── mod.rs
└── mtf.rs
├── checksum
└── adler.rs
├── data
├── test.large
├── test.large.z.5
├── test.lz4.1
├── test.lz4.2
├── test.lz4.3
├── test.lz4.4
├── test.lz4.5
├── test.lz4.6
├── test.lz4.7
├── test.lz4.8
├── test.lz4.9
├── test.txt
├── test.z.0
├── test.z.1
├── test.z.2
├── test.z.3
├── test.z.4
├── test.z.5
├── test.z.6
├── test.z.7
├── test.z.8
├── test.z.9
└── test.z.go
├── entropy
└── ari
│ ├── apm.rs
│ ├── bin.rs
│ ├── mod.rs
│ ├── table.rs
│ └── test.rs
├── flate.rs
├── lib.rs
├── lz4.rs
├── main.rs
├── rle.rs
└── zlib.rs
/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | /Cargo.lock
3 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: rust
2 | rust:
3 | - stable
4 | - nightly
5 | script:
6 | - cargo build --verbose
7 | - cargo test --verbose
8 | # cargo test already runs doctests now, so no need for rustdoc --test here
9 | # - rustdoc --test src/lib.rs -L target
10 | - cargo doc
11 | after_success: ! '[ $TRAVIS_BRANCH = master ] && [ $TRAVIS_PULL_REQUEST = false ]
12 | && echo '''' > target/doc/index.html
13 | && sudo pip install ghp-import && ghp-import -n target/doc && git push -fq https://${TOKEN}@github.com/${TRAVIS_REPO_SLUG}.git
14 | gh-pages '
15 | env:
16 | global:
17 | - secure: NcLf8VutE7aJ3Sq9IzksEM0qA4yfM+RJxAnD7zpA/y6ipsqtLfo1qUIiiNg7uhJjSGrGLd7fGH/awUDnJfhSYdKLRML87qFt02Dqz4E8gPIRUOe3a6Q2QHzvM/SiLsc6W/tRvHwKlHld0MzqHyrOWO6AMeIfV2+kREU3WbhPHtI=
18 | notifications:
19 | email:
20 | on_success: never
21 |
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 |
3 | name = "compress"
4 | description = "Various compression algorithms written in rust"
5 | repository = "https://github.com/alexcrichton/rust-compress"
6 | version = "0.2.1"
7 | authors = ["Alex Crichton ",
8 | "Dzmitry Malyshau "]
9 | license = "MIT/Apache-2.0"
10 |
11 | [features]
12 | default = ["bwt", "checksum", "entropy", "flate", "lz4", "zlib", "rle"]
13 | bwt = []
14 | checksum = []
15 | entropy = []
16 | flate = []
17 | lz4 = []
18 | zlib = ["flate", "checksum"]
19 | rle = []
20 | unstable = []
21 |
22 | [[bin]]
23 | name = "compress"
24 | doc = false
25 |
26 | [dependencies]
27 | log = "0.4"
28 | num = "0.3"
29 | rand = "0.7"
30 | byteorder = "1.3"
--------------------------------------------------------------------------------
/LICENSE-APACHE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/LICENSE-MIT:
--------------------------------------------------------------------------------
1 | Copyright (c) 2014 Alex Crichton
2 |
3 | Permission is hereby granted, free of charge, to any
4 | person obtaining a copy of this software and associated
5 | documentation files (the "Software"), to deal in the
6 | Software without restriction, including without
7 | limitation the rights to use, copy, modify, merge,
8 | publish, distribute, sublicense, and/or sell copies of
9 | the Software, and to permit persons to whom the Software
10 | is furnished to do so, subject to the following
11 | conditions:
12 |
13 | The above copyright notice and this permission notice
14 | shall be included in all copies or substantial portions
15 | of the Software.
16 |
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | DEALINGS IN THE SOFTWARE.
26 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Rust Compression
2 |
3 | [](https://travis-ci.org/alexcrichton/rust-compress)
4 |
5 | [Documentation](http://alexcrichton.com/rust-compress/compress/index.html)
6 |
7 | **NOTE: This is not a production-quality library, it is a proof of concept. This
8 | library mainly contains *decoders*, not *encoders*.**
9 |
10 | This repository aims to house various implementations of compression algorithms,
11 | all written in rust. This is still very much a work in progress.
12 |
13 | ```
14 | git clone https://github.com/alexcrichton/rust-compress
15 | cd rust-compress
16 | cargo build
17 | ```
18 |
19 | ### Implemented Algorithms
20 |
 21 | The following algorithms are already implemented in the main branch:
22 |
23 | * DEFLATE: standard decoder based on RFC 1951
24 | * LZ4 (Ziv-Lempel modification): dummy encoder, semi-complete decoder
25 | * BWT (Burrows-Wheeler Transform): straightforward encoder, standard decoder
26 | * DC (Distance Coding): basic encoder, standard decoder
27 | * Ari (Arithmetic coding): standard range encoder/decoder
28 | * RLE (Run-Length Encoding): basic encoder/decoder
29 |
30 | ### Desired Algorithms
31 |
32 | The following algorithms are either planned or in development at this point:
33 |
34 | * WFC (Weight-Frequency Coding)
35 | * SA/BWT in linear time
36 |
--------------------------------------------------------------------------------
/src/bwt/dc.rs:
--------------------------------------------------------------------------------
1 | /*!
2 |
3 | DC (Distance Coding) forward and backward transformation.
4 | Designed to be used on BWT block output for compression.
5 |
6 | # Links
7 |
8 | http://www.data-compression.info/Algorithms/DC/
9 |
10 | # Example
11 |
12 | ```rust
13 | use compress::bwt::dc;
14 |
15 | let bytes = b"abracadabra";
 16 | let distances = dc::encode_simple::<u16>(bytes);
17 | let decoded = dc::decode_simple(bytes.len(), &distances[..]);
18 | ```
19 |
20 | # Credit
21 |
22 | This is an original implementation.
23 | Thanks to Edgar Binder for inventing DC!
24 |
25 | */
26 |
27 | use std::io;
28 | use std::iter::{self, repeat};
29 | use std::slice as vec;
30 | use super::num::traits::{NumCast, ToPrimitive};
31 | use super::mtf::MTF;
32 |
33 | pub type Symbol = u8;
34 | pub type Rank = u8;
35 | pub const TOTAL_SYMBOLS: usize = 0x100;
36 |
/// Distance coding context.
/// Has all the information potentially needed by the underlying coding model.
#[derive(PartialEq, Eq, Debug)]
pub struct Context {
    /// current symbol
    pub symbol: Symbol,
    /// last known MTF rank
    pub last_rank: Rank,
    /// maximum possible distance (encoder uses input_size - position)
    pub distance_limit: usize,
}
48 |
49 | impl Context {
50 | /// create a new distance context
51 | pub fn new(s: Symbol, r: Rank, dmax: usize) -> Context {
52 | Context {
53 | symbol: s,
54 | last_rank: r,
55 | distance_limit: dmax,
56 | }
57 | }
58 | }
59 |
60 |
61 | /// DC body iterator, can be used to encode distances
62 | pub struct EncodeIterator<'a,'b, D: 'b> {
63 | data: iter::Enumerate,vec::Iter<'b, D>>>,
64 | pos: [usize; TOTAL_SYMBOLS],
65 | last_active: usize,
66 | size: usize,
67 | }
68 |
69 | impl<'a, 'b, D: NumCast> EncodeIterator<'a,'b, D> {
70 | /// create a new encode iterator
71 | pub fn new(input: &'a [Symbol], dist: &'b [D], init: [usize; TOTAL_SYMBOLS]) -> EncodeIterator<'a,'b,D> {
72 | assert_eq!(input.len(), dist.len());
73 | EncodeIterator {
74 | data: input.iter().zip(dist.iter()).enumerate(),
75 | pos: init,
76 | last_active: 0,
77 | size: input.len()
78 | }
79 | }
80 |
81 | /// get the initial symbol positions, to be called before iteration
82 | pub fn get_init<'c>(&'c self) -> &'c [usize; TOTAL_SYMBOLS] {
83 | assert_eq!(self.last_active, 0);
84 | &self.pos
85 | }
86 | }
87 |
impl<'a, 'b, D> Iterator for EncodeIterator<'a,'b,D>
    where D: Clone + Eq + NumCast + 'b
{
    type Item = (D, Context);
    /// Yields the next real distance together with the coding context at the
    /// point where it was recorded; entries equal to `size` are filler and skipped.
    fn next(&mut self) -> Option<(D,Context)> {
        // `size` is the filler value marking positions that carry no distance
        let filler: D = NumCast::from(self.size).unwrap();
        self.data.find(|&(_,(_,d))| *d != filler).map(|(i,(sym,d))| {
            // rank = how far the scan has moved past this symbol's predicted position
            let rank = self.last_active - self.pos[*sym as usize];
            assert!(rank < TOTAL_SYMBOLS);
            self.last_active = i+1;
            // predict the next occurrence of `sym`: one past here plus the distance
            self.pos[*sym as usize] = i + 1 + d.to_usize().unwrap();
            debug!("Encoding distance {} at pos {} for symbol {}, computed rank {}, predicting next at {}",
                d.to_usize().unwrap(), i, *sym, rank, self.pos[*sym as usize]);
            // context's distance limit is the remaining input length from here
            (d.clone(), Context::new(*sym, rank as Rank, self.size-i))
        })
    }
}
105 |
/// Encode a block of bytes 'input'
/// write output distance stream into 'distances'
/// return: unique bytes encountered in the order they appear
/// with the corresponding initial distances
pub fn encode<'a, 'b, D: Clone + Copy + Eq + NumCast>(input: &'a [Symbol], distances: &'b mut [D], mtf: &mut MTF) -> EncodeIterator<'a,'b,D> {
    let n = input.len();
    assert_eq!(distances.len(), n);
    let mut num_unique = 0;
    // last[s] = index of the most recent occurrence of symbol s (n = not seen yet)
    // init[s] = index of the first occurrence of symbol s (n = never seen)
    let mut last = [n; TOTAL_SYMBOLS];
    let mut init = [n; TOTAL_SYMBOLS];
    // `n` serves as the filler value for positions that carry no distance
    let filler: D = NumCast::from(n).unwrap();
    for (i,&sym) in input.iter().enumerate() {
        distances[i] = filler.clone();
        let base = last[sym as usize];
        last[sym as usize] = i;
        debug!("\tProcessing symbol {} at position {}, last known at {}", sym, i, base);
        if base == n {
            // first sighting of this symbol: register it in the MTF alphabet
            let rank = num_unique;
            mtf.symbols[rank] = sym;
            mtf.encode(sym); //==rank
            // initial distances are not ordered to support re-shuffle
            debug!("\t\tUnique => assigning rank {}, encoding {}", rank, i);
            init[sym as usize] = i;
            num_unique += 1;
        }else {
            let rank = mtf.encode(sym) as usize;
            if rank > 0 {
                // store at the previous occurrence the distance to this one,
                // shortened by the current MTF rank (i - base - 1 - rank)
                debug!("\t\tRegular at rank {}, encoding {}", rank, i-base-rank-1);
                assert!(i >= base+rank+1);
                distances[base] = NumCast::from(i-base-rank-1).unwrap();
            }
        }
    }
    // final sweep: each symbol's last occurrence encodes a distance past the end
    for (rank,&sym) in mtf.symbols[..num_unique].iter().enumerate() {
        let base = last[sym as usize];
        debug!("\tSweep symbol {} of rank {}, last known at {}, encoding {}", sym, rank, base, n-base-rank-1);
        assert!(n >= base+rank+1);
        distances[base] = NumCast::from(n-base-rank-1).unwrap();
    }
    // a basic but expensive check, to be improved
    //assert_eq!(input.iter().zip(input.iter().skip(1)).zip(distances.iter()).
    //    position(|((&a,&b),d)| *d==filler && a!=b), None);
    EncodeIterator::new(input, distances, init)
}
150 |
151 |
152 | /// Encode version with "batteries included" for quick testing
153 | pub fn encode_simple(input: &[Symbol]) -> Vec {
154 | let n = input.len();
155 | let mut raw_dist: Vec = repeat(NumCast::from(0).unwrap()).take(n).collect();
156 | let mut eniter = encode(input, &mut raw_dist, &mut MTF::new());
157 | let init: Vec = (0..TOTAL_SYMBOLS).map(|i| NumCast::from(eniter.get_init()[i]).unwrap()).collect();
158 | init.iter().map(|d| d.clone()).chain(eniter.by_ref().map(|(d,_)| d)).collect()
159 | }
160 |
161 | /// Decode a block of distances given the initial symbol positions
162 | pub fn decode(mut next: [usize; TOTAL_SYMBOLS], output: &mut [Symbol], mtf: &mut MTF,
163 | mut fn_dist: F) -> io::Result<()>
164 | where F: FnMut(Context) -> io::Result
165 | {
166 |
167 | let n = output.len();
168 | let mut i = 0;
169 | for (sym,d) in next.iter().enumerate() {
170 | if *d < n {
171 | let mut j = i;
172 | while j>0 && next[mtf.symbols[j-1] as usize] > *d {
173 | mtf.symbols[j] = mtf.symbols[j-1];
174 | j -= 1;
175 | }
176 | mtf.symbols[j] = sym as Symbol;
177 | i += 1;
178 | }
179 | }
180 | if i<=1 {
181 | // redundant alphabet case
182 | let sym = mtf.symbols[0];
183 | for out in output.iter_mut() {
184 | *out = sym;
185 | }
186 | return Ok(())
187 | }
188 |
189 | let alphabet_size = i;
190 | let mut ranks = [0 as Rank; TOTAL_SYMBOLS];
191 | for rank in 0..i {
192 | let sym = mtf.symbols[rank];
193 | debug!("\tRegistering symbol {} of rank {} at position {}",
194 | sym, rank, next[sym as usize]);
195 | ranks[sym as usize] = 0; //could use 'rank' but don't know how to derive it during encoding
196 | }
197 |
198 | i = 0;
199 | while i stop + d,
210 | Err(e) => return Err(e)
211 | };
212 | debug!("\t\tLooking for future position {}", future);
213 | assert!(future <= n);
214 | let mut rank = 1;
215 | while rank < alphabet_size && future+rank > next[mtf.symbols[rank] as usize] {
216 | mtf.symbols[rank-1] = mtf.symbols[rank];
217 | rank += 1;
218 | }
219 | if rank < alphabet_size {
220 | debug!("\t\tFound sym {} of rank {} at position {}", mtf.symbols[rank],
221 | rank, next[mtf.symbols[rank] as usize]);
222 | }else {
223 | debug!("\t\tNot found");
224 | }
225 | mtf.symbols[rank-1] = sym;
226 | debug!("\t\tAssigning future pos {} for symbol {}", future+rank-1, sym);
227 | next[sym as usize] = future+rank-1;
228 | ranks[sym as usize] = (rank-1) as Rank;
229 | }
230 | assert_eq!(next.iter().position(|&d| d=n+alphabet_size), None);
231 | assert_eq!(i, n);
232 | Ok(())
233 | }
234 |
235 | /// Decode version with "batteries included" for quick testing
236 | pub fn decode_simple(n: usize, distances: &[D]) -> Vec {
237 | let mut output: Vec = repeat(0 as Symbol).take(n).collect();
238 | let mut init = [0; TOTAL_SYMBOLS];
239 | for i in 0..TOTAL_SYMBOLS {
240 | init[i] = distances[i].to_usize().unwrap();
241 | }
242 | let mut di = TOTAL_SYMBOLS;
243 | decode(init, &mut output[..], &mut MTF::new(), |_ctx| {
244 | di += 1;
245 | if di > distances.len() {
246 | Err(io::Error::new(io::ErrorKind::Other, "Unexpected end of file"))
247 | } else {
248 | Ok(distances[di-1].to_usize().unwrap())
249 | }
250 | }).unwrap();
251 | output.into_iter().collect()
252 | }
253 |
254 |
#[cfg(test)]
mod test {
    use std::iter::repeat;

    /// Round-trip encode_simple/decode_simple and compare with the input.
    fn roundtrip(bytes: &[u8]) {
        info!("Roundtrip DC of size {}", bytes.len());
        // NOTE(review): the `::<u16>` turbofish was stripped in the dump; restored.
        let distances = super::encode_simple::<u16>(bytes);
        debug!("Roundtrip DC input: {:?}, distances: {:?}", bytes, distances);
        let decoded = super::decode_simple(bytes.len(), &distances[..]);
        assert_eq!(&decoded[..], bytes);
    }

    /// roundtrip version that compares the coding contexts on the way
    fn roundtrip_ctx(bytes: &[u8]) {
        let n = bytes.len();
        info!("Roundtrip DC context of size {}", n);
        let mut mtf = super::super::mtf::MTF::new();
        // NOTE(review): element-type annotations below were stripped; restored as u16/u8.
        let mut raw_dist: Vec<u16> = repeat(0).take(n).collect();
        let eniter = super::encode(bytes, &mut raw_dist[..], &mut mtf);
        let mut init = [0; super::TOTAL_SYMBOLS];
        for i in 0..super::TOTAL_SYMBOLS {
            init[i] = eniter.get_init()[i];
        }
        // implicit iterator copies, or we can gather in one pass and then split
        let (distances, contexts): (Vec<_>, Vec<_>) = eniter.unzip();
        let mut output: Vec<u8> = repeat(0).take(n).collect();
        let mut di = 0;
        super::decode(init, &mut output[..], &mut mtf, |ctx| {
            assert_eq!(contexts[di], ctx);
            di += 1;
            Ok(distances[di-1] as usize)
        }).unwrap();
        assert_eq!(di, distances.len());
        assert_eq!(&output[..], bytes);
    }

    #[test]
    fn roundtrips() {
        roundtrip(b"teeesst_dc");
        roundtrip(b"");
        roundtrip(include_bytes!("../data/test.txt"));
    }

    #[test]
    fn roundtrips_context() {
        roundtrip_ctx(b"teeesst_dc");
        // bug fix: previously passed the literal path bytes (b"../data/test.txt")
        // instead of the file contents, so the large input was never exercised
        roundtrip_ctx(include_bytes!("../data/test.txt"));
    }
}
304 |
--------------------------------------------------------------------------------
/src/bwt/mod.rs:
--------------------------------------------------------------------------------
1 | /*!
2 |
3 | BWT (Burrows-Wheeler Transform) forward and backward transformation. Requires `bwt` feature, enabled by default
4 |
5 | This module contains a bruteforce implementation of BWT encoding in Rust as well as standard decoding.
6 | These are exposed as a standard `Reader` and `Writer` interfaces wrapping an underlying stream.
7 |
8 | BWT output stream places together symbols with similar leading contexts. This reshaping of the entropy
9 | allows further stages to deal with repeated sequences of symbols for better compression.
10 |
11 | Typical compression schemes are:
12 | BWT + RLE (+ EC)
13 | RLE + BWT + MTF + RLE + EC : bzip2
14 | BWT + DC + EC : ybs
15 |
16 | Where the stage families are:
17 | BWT: BWT (Burrows-Wheeler Transform), ST (Shindler transform)
18 | RLE: RLE (Run-Length Encoding)
19 | MTF: MTF (Move-To-Front), WFC (Weighted Frequency Coding)
20 | DC: DC (Distance Coding), IF (Inverse Frequencies)
21 | EC (Entropy Coder): Huffman, Arithmetic, RC (Range Coder)
22 |
23 |
24 | # Example
25 |
26 | ```rust
27 | use std::io::{BufWriter, BufReader, Read, Write};
28 | use compress::bwt;
29 |
30 | // Encode some text
31 | let text = "some text";
32 | let mut e = bwt::Encoder::new(BufWriter::new(Vec::new()), 4 << 20);
33 | e.write(text.as_bytes()).unwrap();
34 | let (encoded, _) = e.finish();
35 | let inner = encoded.into_inner().unwrap();
36 |
37 | // Decode the encoded text
38 | let mut d = bwt::Decoder::new(BufReader::new(&inner[..]), true);
39 | let mut decoded = Vec::new();
40 | d.read_to_end(&mut decoded).unwrap();
41 |
42 | assert_eq!(&decoded[..], text.as_bytes());
43 | ```
44 |
45 | # Credit
46 |
47 | This is an original (mostly trivial) implementation.
48 |
49 | */
50 |
51 | #![allow(missing_docs)]
52 |
53 | extern crate num;
54 |
55 | use std::{cmp, fmt, slice};
56 | use std::ptr;
57 | use std::iter::{self, Extend, repeat};
58 | use std::io::{self, Read, Write};
59 | use self::num::traits::{NumCast, ToPrimitive};
60 |
61 | use super::byteorder::{LittleEndian, WriteBytesExt, ReadBytesExt};
62 | use super::{byteorder_err_to_io, ReadExact};
63 |
64 | pub mod dc;
65 | pub mod mtf;
66 |
67 | /// A base element for the transformation
68 | pub type Symbol = u8;
69 |
70 | pub const ALPHABET_SIZE: usize = 0x100;
71 |
/// Radix sorting primitive
pub struct Radix {
    /// number of occurrences (frequency) per symbol;
    /// one extra slot lets the table double as an offset/boundary table
    pub freq : [usize; ALPHABET_SIZE+1],
}
77 |
78 | impl Radix {
79 | /// create Radix sort instance
80 | pub fn new() -> Radix {
81 | Radix {
82 | freq : [0; ALPHABET_SIZE+1],
83 | }
84 | }
85 |
86 | /// reset counters
87 | /// allows the struct to be re-used
88 | pub fn reset(&mut self) {
89 | for fr in self.freq.iter_mut() {
90 | *fr = 0;
91 | }
92 | }
93 |
94 | /// count elements in the input
95 | pub fn gather(&mut self, input: &[Symbol]) {
96 | for &b in input.iter() {
97 | self.freq[b as usize] += 1;
98 | }
99 | }
100 |
101 | /// build offset table
102 | pub fn accumulate(&mut self) {
103 | let mut n = 0;
104 | for freq in self.freq.iter_mut() {
105 | let f = *freq;
106 | *freq = n;
107 | n += f;
108 | }
109 | }
110 |
111 | /// return next byte position, advance it internally
112 | pub fn place(&mut self, b: Symbol)-> usize {
113 | let pos = self.freq[b as usize];
114 | assert!(self.freq[b as usize] < self.freq[(b as usize)+1],
115 | "Unable to place symbol {} at offset {}",
116 | b, pos);
117 | self.freq[b as usize] += 1;
118 | pos
119 | }
120 |
121 | /// shift frequences to the left
122 | /// allows the offsets to be re-used after all positions are obtained
123 | pub fn shift(&mut self) {
124 | assert_eq!( self.freq[ALPHABET_SIZE-1], self.freq[ALPHABET_SIZE] );
125 | for i in (0 .. ALPHABET_SIZE).rev() {
126 | self.freq[i+1] = self.freq[i];
127 | }
128 | self.freq[0] = 0;
129 | }
130 | }
131 |
132 |
133 | /// Compute a suffix array from a given input string
134 | /// Resulting suffixes are guaranteed to be alphabetically sorted
135 | /// Run time: O(N^3), memory: N words (suf_array) + ALPHABET_SIZE words (Radix)
136 | pub fn compute_suffixes(input: &[Symbol], suf_array: &mut [SUF]) {
137 | let mut radix = Radix::new();
138 | radix.gather(input);
139 | radix.accumulate();
140 |
141 | debug!("SA compute input: {:?}", input);
142 | debug!("radix offsets: {:?}", &radix.freq[..]);
143 |
144 | for (i,&ch) in input.iter().enumerate() {
145 | let p = radix.place(ch);
146 | suf_array[p] = NumCast::from(i).unwrap();
147 | }
148 |
149 | // bring the original offsets back
150 | radix.shift();
151 |
152 | for i in 0..ALPHABET_SIZE {
153 | let lo = radix.freq[i];
154 | let hi = radix.freq[i+1];
155 | if lo == hi {
156 | continue;
157 | }
158 | let slice = &mut suf_array[lo..hi];
159 | debug!("\tsorting group [{}-{}) for symbol {}", lo, hi, i);
160 | slice.sort_by(|a,b| {
161 | input[(a.to_usize().unwrap())..].cmp(&input[(b.to_usize().unwrap())..])
162 | });
163 | }
164 |
165 | debug!("sorted SA: {:?}", suf_array);
166 | }
167 |
/// An iterator over BWT output
/// NOTE(review): the generic arguments of the field types were stripped in
/// this dump (`iter::Enumerate>` / `Option,`) — presumably
/// `iter::Enumerate<slice::Iter<'a, SUF>>` and `Option<usize>`;
/// confirm against the upstream source before relying on them.
pub struct TransformIterator<'a, SUF: 'a> {
    input     : &'a [Symbol],
    // enumerated walk over the suffix array
    suf_iter  : iter::Enumerate>,
    // index of the original (unrotated) string, discovered during iteration
    origin    : Option,
}
174 |
175 | impl<'a, SUF> TransformIterator<'a, SUF> {
176 | /// create a new BWT iterator from the suffix array
177 | pub fn new(input: &'a [Symbol], suffixes: &'a [SUF]) -> TransformIterator<'a, SUF> {
178 | TransformIterator {
179 | input: input,
180 | suf_iter: suffixes.iter().enumerate(),
181 | origin: None,
182 | }
183 | }
184 |
185 | /// return the index of the original string
186 | pub fn get_origin(&self) -> usize {
187 | self.origin.unwrap()
188 | }
189 | }
190 |
191 | impl<'a, SUF: ToPrimitive + 'a> Iterator for TransformIterator<'a, SUF> {
192 | type Item = Symbol;
193 | fn next(&mut self) -> Option {
194 | self.suf_iter.next().map(|(i,p)| {
195 | if p.to_usize().unwrap() == 0 {
196 | assert!( self.origin.is_none() );
197 | self.origin = Some(i);
198 | *self.input.last().unwrap()
199 | }else {
200 | self.input[p.to_usize().unwrap() - 1]
201 | }
202 | })
203 | }
204 | }
205 |
/// Encode BWT of a given input, using 'suf_array' as scratch space.
/// The suffix array is computed in place; the returned iterator yields the
/// transformed bytes lazily and records the origin index as it goes
/// (retrieve it with `get_origin()` after draining the iterator).
pub fn encode<'a, SUF: NumCast + ToPrimitive + fmt::Debug>(input: &'a [Symbol], suf_array: &'a mut [SUF]) -> TransformIterator<'a, SUF> {
    compute_suffixes(input, suf_array);
    TransformIterator::new(input, suf_array)
}
211 |
212 | /// Transform an input block into the output slice, all-inclusive version.
213 | /// Returns the index of the original string in the output matrix.
214 | pub fn encode_simple(input: &[Symbol]) -> (Vec, usize) {
215 | let mut suf_array: Vec = repeat(0).take(input.len()).collect();
216 | let mut iter = encode(input, &mut suf_array[..]);
217 | let output: Vec = iter.by_ref().collect();
218 | (output, iter.get_origin())
219 | }
220 |
221 |
222 | /// Compute an inversion jump table, needed for BWT decoding
223 | pub fn compute_inversion_table(input: &[Symbol], origin: usize, table: &mut [SUF]) {
224 | assert_eq!(input.len(), table.len());
225 |
226 | let mut radix = Radix::new();
227 | radix.gather(input);
228 | radix.accumulate();
229 |
230 | table[radix.place(input[origin])] = NumCast::from(0).unwrap();
231 | for (i,&ch) in input[..origin].iter().enumerate() {
232 | table[radix.place(ch)] = NumCast::from(i+1).unwrap();
233 | }
234 | for (i,&ch) in input[(origin+1)..].iter().enumerate() {
235 | table[radix.place(ch)] = NumCast::from(origin+2+i).unwrap();
236 | }
237 | //table[-1] = origin;
238 | debug!("inverse table: {:?}", table)
239 | }
240 |
/// An iterator over inverse BWT
/// Run time: O(N), memory: N words (table)
pub struct InverseIterator<'a, SUF: 'a> {
    // the BWT-encoded block being decoded
    input   : &'a [Symbol],
    // inversion jump table (see `compute_inversion_table`)
    table   : &'a [SUF],
    // index of the original string within the sorted matrix
    origin  : usize,
    // current position in the jump chain; usize::MAX marks exhaustion
    current : usize,
}
249 |
250 | impl<'a, SUF> InverseIterator<'a, SUF> {
251 | /// create a new inverse BWT iterator with a given input, origin, and a jump table
252 | pub fn new(input: &'a [Symbol], origin: usize, table: &'a [SUF]) -> InverseIterator<'a, SUF> {
253 | debug!("inverse origin={:?}, input: {:?}", origin, input);
254 | InverseIterator {
255 | input: input,
256 | table: table,
257 | origin: origin,
258 | current: origin,
259 | }
260 | }
261 | }
262 |
263 | impl<'a, SUF: ToPrimitive> Iterator for InverseIterator<'a, SUF> {
264 | type Item = Symbol;
265 |
266 | fn next(&mut self) -> Option {
267 | if self.current == usize::max_value() {
268 | None
269 | } else {
270 | self.current = self.table[self.current].to_usize().unwrap().wrapping_sub(1);
271 | debug!("\tjumped to {}", self.current);
272 |
273 | let p = if self.current != usize::max_value() {
274 | self.current
275 | } else {
276 | self.origin
277 | };
278 |
279 | Some(self.input[p])
280 | }
281 | }
282 | }
283 |
/// Decode a BWT block, given its origin, using 'table' as scratch space.
/// The inversion table is recomputed from scratch on every call; the
/// returned iterator then walks it in O(N).
pub fn decode<'a, SUF: NumCast + fmt::Debug>(input: &'a [Symbol], origin: usize, table: &'a mut [SUF]) -> InverseIterator<'a, SUF> {
    compute_inversion_table(input, origin, table);
    InverseIterator::new(input, origin, table)
}
289 |
290 | /// A simplified BWT decode function, which allocates a temporary suffix array
291 | pub fn decode_simple(input: &[Symbol], origin: usize) -> Vec {
292 | let mut suf: Vec = repeat(0).take(input.len()).collect();
293 | decode(input, origin, &mut suf[..]).take(input.len()).collect()
294 | }
295 |
296 | /// Decode without additional memory, can be greatly optimized
297 | /// Run time: O(n^2), Memory: 0n
298 | fn decode_minimal(input: &[Symbol], origin: usize, output: &mut [Symbol]) {
299 | assert_eq!(input.len(), output.len());
300 | if input.len() == 0 {
301 | assert_eq!(origin, 0);
302 | }
303 |
304 | let mut radix = Radix::new();
305 | radix.gather(input);
306 | radix.accumulate();
307 |
308 | let n = input.len();
309 | (0..n).fold(origin, |i,j| {
310 | let ch = input[i];
311 | output[n-j-1] = ch;
312 | let offset = &input[..i].iter().filter(|&k| *k==ch).count();
313 | radix.freq[ch as usize] + offset
314 | });
315 | }
316 |
317 |
/// This structure is used to decode a stream of BWT blocks. This wraps an
/// internal reader which is read from when this decoder's read method is
/// called.
/// NOTE(review): the type parameter and field element types were stripped in
/// this dump; restored as `<R>`, `Vec<u8>`, `Vec<u8>`, `Vec<u32>` (the u32
/// table matches the test/bench code) — confirm against upstream.
pub struct Decoder<R> {
    /// The internally wrapped reader. This is exposed so it may be moved out
    /// of. Note that if data is read from the reader while decoding is in
    /// progress the output stream will get corrupted.
    pub r: R,
    /// read cursor into `output` for partially consumed blocks
    start: usize,

    /// scratch buffer holding the raw (encoded) block
    temp: Vec<u8>,
    /// decoded contents of the current block
    output: Vec<u8>,
    /// inversion jump table (used only when `extra_memory` is set)
    table: Vec<u32>,

    /// whether the stream header has been read yet
    header: bool,
    /// maximal block size, as declared by the stream header
    max_block_size: usize,
    /// allocate N extra words for the jump table for better performance
    extra_memory: bool,
}
336 |
337 | impl Decoder {
338 | /// Creates a new decoder which will read data from the given stream. The
339 | /// inner stream can be re-acquired by moving out of the `r` field of this
340 | /// structure.
341 | /// 'extra_mem' switch allows allocating extra N words of memory for better performance
342 | pub fn new(r: R, extra_mem: bool) -> Decoder {
343 | Decoder {
344 | r: r,
345 | start: 0,
346 | temp: Vec::new(),
347 | output: Vec::new(),
348 | table: Vec::new(),
349 | header: false,
350 | max_block_size: 0,
351 | extra_memory: extra_mem,
352 | }
353 | }
354 |
355 | /// Resets this decoder back to its initial state. Note that the underlying
356 | /// stream is not seeked on or has any alterations performed on it.
357 | pub fn reset(&mut self) {
358 | self.header = false;
359 | self.start = 0;
360 | }
361 |
362 | fn read_header(&mut self) -> io::Result<()> {
363 | match self.r.read_u32::() {
364 | Ok(size) => {
365 | self.max_block_size = size as usize;
366 | debug!("max size: {}", self.max_block_size);
367 | Ok(())
368 | },
369 | Err(e) => Err(byteorder_err_to_io(e)),
370 | }
371 | }
372 |
373 | fn decode_block(&mut self) -> io::Result {
374 | let n = match self.r.read_u32::() {
375 | Ok(n) => n as usize,
376 | Err(ref e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(false), // EOF
377 | Err(e) => return Err(e),
378 | };
379 |
380 | self.temp.truncate(0);
381 | self.temp.reserve(n);
382 | try!(self.r.push_exactly(n as u64, &mut self.temp));
383 |
384 | let origin = try!(self.r.read_u32::()) as usize;
385 | self.output.truncate(0);
386 | self.output.reserve(n);
387 |
388 | if self.extra_memory {
389 | self.table.truncate(0);
390 | self.table.extend((0..n).map(|_| 0));
391 | for ch in decode(&self.temp[..], origin, &mut self.table[..]) {
392 | self.output.push(ch);
393 | }
394 | }else {
395 | self.output.extend((0..n).map(|_| 0));
396 | decode_minimal(&self.temp[..], origin, &mut self.output[..]);
397 | }
398 |
399 | self.start = 0;
400 | return Ok(true);
401 | }
402 | }
403 |
404 | impl Read for Decoder {
405 | fn read(&mut self, dst: &mut [u8]) -> io::Result {
406 | if !self.header {
407 | try!(self.read_header());
408 | self.header = true;
409 | }
410 | let mut amt = dst.len();
411 | let dst_len = amt;
412 |
413 | while amt > 0 {
414 | if self.output.len() == self.start {
415 | let keep_going = try!(self.decode_block());
416 | if !keep_going {
417 | break
418 | }
419 | }
420 | let n = cmp::min(amt, self.output.len() - self.start);
421 | unsafe { ptr::copy_nonoverlapping(
422 | &self.output[self.start],
423 | &mut dst[dst_len - amt],
424 | n,
425 | )};
426 | self.start += n;
427 | amt -= n;
428 | }
429 |
430 | Ok(dst_len - amt)
431 | }
432 | }
433 |
434 |
/// This structure is used to compress a stream of bytes using the BWT.
/// This is a wrapper around an internal writer which bytes will be written to.
/// NOTE(review): the type parameter and field element types were stripped in
/// this dump; `suf` is restored as `Vec<usize>` to match `encode_block`'s
/// fill expression (`.map(|_| n)` with `n: usize`) — confirm against upstream.
pub struct Encoder<W> {
    /// wrapped output writer
    w: W,
    /// buffered input awaiting a full block
    buf: Vec<u8>,
    /// scratch suffix array, reused between blocks
    suf: Vec<usize>,
    /// whether the stream header has been emitted yet
    wrote_header: bool,
    /// block granularity for the BWT
    block_size: usize,
}
444 |
445 | impl Encoder {
446 | /// Creates a new encoder which will have its output written to the given
447 | /// output stream. The output stream can be re-acquired by calling
448 | /// `finish()`
449 | /// 'block_size' is idealy as big as your input, unless you know for sure that
450 | /// the input consists of multiple parts of different nature. Often set as 4Mb.
451 | pub fn new(w: W, block_size: usize) -> Encoder {
452 | Encoder {
453 | w: w,
454 | buf: Vec::new(),
455 | suf: Vec::new(),
456 | wrote_header: false,
457 | block_size: block_size,
458 | }
459 | }
460 |
461 | fn encode_block(&mut self) -> io::Result<()> {
462 | let n = self.buf.len();
463 | try!(self.w.write_u32::(n as u32));
464 |
465 | self.suf.truncate(0);
466 | self.suf.extend((0..n).map(|_| n));
467 | let w = &mut self.w;
468 |
469 | {
470 | let mut iter = encode(&self.buf[..], &mut self.suf[..]);
471 | for ch in iter.by_ref() {
472 | try!(w.write_u8(ch));
473 | }
474 |
475 | try!(w.write_u32::(iter.get_origin() as u32));
476 | }
477 | self.buf.truncate(0);
478 |
479 | Ok(())
480 | }
481 |
482 | /// This function is used to flag that this session of compression is done
483 | /// with. The stream is finished up (final bytes are written), and then the
484 | /// wrapped writer is returned.
485 | pub fn finish(mut self) -> (W, io::Result<()>) {
486 | let result = self.flush();
487 | (self.w, result)
488 | }
489 | }
490 |
491 | impl Write for Encoder {
492 | fn write(&mut self, mut buf: &[u8]) -> io::Result {
493 | if !self.wrote_header {
494 | try!(self.w.write_u32::(self.block_size as u32));
495 | self.wrote_header = true;
496 | }
497 |
498 | while buf.len() > 0 {
499 | let amt = cmp::min( self.block_size - self.buf.len(), buf.len() );
500 | self.buf.extend(buf[..amt].iter().map(|b| *b));
501 |
502 | if self.buf.len() == self.block_size {
503 | try!(self.encode_block());
504 | }
505 | buf = &buf[amt..];
506 | }
507 | Ok(buf.len())
508 | }
509 |
510 | fn flush(&mut self) -> io::Result<()> {
511 | let ret = if self.buf.len() > 0 {
512 | self.encode_block()
513 | } else {
514 | Ok(())
515 | };
516 | ret.and(self.w.flush())
517 | }
518 | }
519 |
520 |
#[cfg(test)]
mod test {
    use std::io::{BufReader, BufWriter, Read, Write};
    #[cfg(feature="unstable")]
    use test::Bencher;
    use super::{Decoder, Encoder};

    /// Encode `bytes` through the streaming Encoder, decode them back
    /// (with or without the extra-memory jump table), and compare.
    fn roundtrip(bytes: &[u8], extra_mem: bool) {
        let mut e = Encoder::new(BufWriter::new(Vec::new()), 1<<10);
        e.write(bytes).unwrap();
        let (e, err) = e.finish();
        err.unwrap();
        let encoded = e.into_inner().unwrap();

        let mut d = Decoder::new(BufReader::new(&encoded[..]), extra_mem);
        let mut decoded = Vec::new();
        d.read_to_end(&mut decoded).unwrap();
        assert_eq!(&decoded[..], bytes);
    }

    #[test]
    fn some_roundtrips() {
        roundtrip(b"test", true);
        roundtrip(b"", true);
        roundtrip(include_bytes!("../data/test.txt"), true);
    }

    // exercises the zero-extra-memory (`decode_minimal`) path
    #[test]
    fn decode_minimal() {
        roundtrip(b"abracadabra", false);
    }

    #[cfg(feature="unstable")]
    #[bench]
    fn decode_speed(bh: &mut Bencher) {
        use std::iter::repeat;
        use super::{encode, decode};

        let input = include_bytes!("../data/test.txt");
        let n = input.len();
        // NOTE(review): the element type annotations were lost in this dump
        // (`Vec = ...`); presumably `Vec<u32>` and `Vec<u8>` — confirm upstream.
        let mut suf: Vec = repeat(0).take(n).collect();
        let (output, origin) = {
            let mut to_iter = encode(input, &mut suf[..]);
            let out: Vec = to_iter.by_ref().collect();
            (out, to_iter.get_origin())
        };

        bh.iter(|| {
            let from_iter = decode(&output[..], origin, &mut suf[..]);
            from_iter.last().unwrap();
        });
        bh.bytes = n as u64;
    }
}
575 |
--------------------------------------------------------------------------------
/src/bwt/mtf.rs:
--------------------------------------------------------------------------------
1 | /*!
2 |
3 | MTF (Move To Front) encoder/decoder
4 | Produces a rank for each input character based on when it was seen last time.
5 | Useful for BWT output encoding, which produces a lot of zeroes and low ranks.
6 |
7 | # Links
8 |
9 | http://en.wikipedia.org/wiki/Move-to-front_transform
10 |
11 | # Example
12 |
13 | ```rust
14 | use std::io::{self, Read, Write};
15 | use compress::bwt::mtf;
16 |
17 | // Encode a stream of bytes
18 | let bytes = b"abracadabra";
19 | let mut e = mtf::Encoder::new(io::BufWriter::new(Vec::new()));
20 | e.write_all(bytes).unwrap();
21 | let encoded = e.finish().into_inner().unwrap();
22 |
23 | // Decode a stream of ranks
24 | let mut d = mtf::Decoder::new(io::BufReader::new(&encoded[..]));
25 | let mut decoded = Vec::new();
26 | let result = d.read_to_end(&mut decoded).unwrap();
27 | ```
28 |
29 | # Credit
30 |
31 | */
32 |
33 | use std::mem;
34 | use std::io::{self, Read, Write};
35 |
36 | use super::super::byteorder::{WriteBytesExt, ReadBytesExt};
37 |
/// raw input byte
pub type Symbol = u8;
/// MTF rank (recency index) of a symbol
pub type Rank = u8;
/// number of distinct byte values
pub const TOTAL_SYMBOLS: usize = 0x100;
41 |
42 |
43 | /// MoveToFront encoder/decoder
44 | pub struct MTF {
45 | /// rank-ordered list of unique Symbols
46 | pub symbols: [Symbol; TOTAL_SYMBOLS],
47 | }
48 |
49 | impl MTF {
50 | /// create a new zeroed MTF
51 | pub fn new() -> MTF {
52 | MTF { symbols: [0; TOTAL_SYMBOLS] }
53 | }
54 |
55 | /// set the order of symbols to be alphabetical
56 | pub fn reset_alphabetical(&mut self) {
57 | for (i,sym) in self.symbols.iter_mut().enumerate() {
58 | *sym = i as Symbol;
59 | }
60 | }
61 |
62 | /// encode a symbol into its rank
63 | pub fn encode(&mut self, sym: Symbol) -> Rank {
64 | let mut next = self.symbols[0];
65 | if next == sym {
66 | return 0
67 | }
68 | let mut rank: Rank = 1;
69 | loop {
70 | mem::swap(&mut self.symbols[rank as usize], &mut next);
71 | if next == sym {
72 | break;
73 | }
74 | rank += 1;
75 | assert!((rank as usize) < self.symbols.len());
76 | }
77 | self.symbols[0] = sym;
78 | rank
79 | }
80 |
81 | /// decode a rank into its symbol
82 | pub fn decode(&mut self, rank: Rank) -> Symbol {
83 | let sym = self.symbols[rank as usize];
84 | debug!("\tDecoding rank {} with symbol {}", rank, sym);
85 | for i in (0 .. rank as usize).rev() {
86 | self.symbols[i+1] = self.symbols[i];
87 | }
88 | self.symbols[0] = sym;
89 | sym
90 | }
91 | }
92 |
93 |
94 | /// A simple MTF stream encoder
95 | pub struct Encoder {
96 | w: W,
97 | mtf: MTF,
98 | }
99 |
100 | impl Encoder {
101 | /// start encoding into the given writer
102 | pub fn new(w: W) -> Encoder {
103 | let mut mtf = MTF::new();
104 | mtf.reset_alphabetical();
105 | Encoder {
106 | w: w,
107 | mtf: mtf,
108 | }
109 | }
110 |
111 | /// finish encoding and return the wrapped writer
112 | pub fn finish(self) -> W {
113 | self.w
114 | }
115 | }
116 |
117 | impl Write for Encoder {
118 | fn write(&mut self, buf: &[u8]) -> io::Result {
119 | for sym in buf.iter() {
120 | let rank = self.mtf.encode(*sym);
121 | try!(self.w.write_u8(rank));
122 | }
123 | Ok(buf.len())
124 | }
125 |
126 | fn flush(&mut self) -> io::Result<()> {
127 | self.w.flush()
128 | }
129 | }
130 |
131 |
132 | /// A simple MTF stream decoder
133 | pub struct Decoder {
134 | r: R,
135 | mtf: MTF,
136 | }
137 |
138 | impl Decoder {
139 | /// start decoding the given reader
140 | pub fn new(r: R) -> Decoder {
141 | let mut mtf = MTF::new();
142 | mtf.reset_alphabetical();
143 | Decoder {
144 | r: r,
145 | mtf: mtf,
146 | }
147 | }
148 |
149 | /// finish decoder and return the wrapped reader
150 | pub fn finish(self) -> R {
151 | self.r
152 | }
153 | }
154 |
155 | impl Read for Decoder {
156 | fn read(&mut self, dst: &mut [u8]) -> io::Result {
157 | let mut bytes_read = 0;
158 | for sym in dst.iter_mut() {
159 | let rank = match self.r.read_u8() {
160 | Ok(r) => r,
161 | Err(ref e) if e.kind() == io::ErrorKind::UnexpectedEof => break,
162 | Err(e) => return Err(e)
163 | };
164 | bytes_read += 1;
165 | *sym = self.mtf.decode(rank);
166 | }
167 | Ok(bytes_read)
168 | }
169 | }
170 |
171 |
#[cfg(test)]
mod test {
    use std::io::{self, Read, Write};
    #[cfg(feature="unstable")]
    use test::Bencher;
    use super::{Encoder, Decoder};

    /// Encode `bytes` through the streaming MTF pair and verify the
    /// roundtrip reproduces the input exactly.
    fn roundtrip(bytes: &[u8]) {
        info!("Roundtrip MTF of size {}", bytes.len());
        let buf = Vec::new();
        let mut e = Encoder::new(io::BufWriter::new(buf));
        e.write_all(bytes).unwrap();
        let encoded = e.finish().into_inner().unwrap();
        debug!("Roundtrip MTF input: {:?}, ranks: {:?}", bytes, encoded);
        let mut d = Decoder::new(io::BufReader::new(&encoded[..]));
        let mut decoded = Vec::new();
        d.read_to_end(&mut decoded).unwrap();
        assert_eq!(&decoded[..], bytes);
    }

    #[test]
    fn some_roundtrips() {
        roundtrip(b"teeesst_mtf");
        roundtrip(b"");
        roundtrip(include_bytes!("../data/test.txt"));
    }

    #[cfg(feature="unstable")]
    #[bench]
    fn encode_speed(bh: &mut Bencher) {
        let vec = Vec::new();
        let input = include_bytes!("../data/test.txt");
        let mem = io::BufWriter::with_capacity(input.len(), vec);
        let mut e = Encoder::new(mem);
        bh.iter(|| {
            e.write_all(input).unwrap();
        });
        bh.bytes = input.len() as u64;
    }

    #[cfg(feature="unstable")]
    #[bench]
    fn decode_speed(bh: &mut Bencher) {
        let vec = Vec::new();
        let input = include_bytes!("../data/test.txt");
        let mut e = Encoder::new(io::BufWriter::new(vec));
        e.write_all(input).unwrap();
        let encoded = e.finish().into_inner().unwrap();
        bh.iter(|| {
            let mut d = Decoder::new(io::BufReader::new(&encoded[..]));
            let mut buf = Vec::new();
            d.read_to_end(&mut buf).unwrap();
        });
        bh.bytes = input.len() as u64;
    }
}
228 |
--------------------------------------------------------------------------------
/src/checksum/adler.rs:
--------------------------------------------------------------------------------
1 | /*!
2 |
3 | Adler-32 checksum
4 |
5 | This implementation is based off the example found at
6 | http://en.wikipedia.org/wiki/Adler-32.
7 |
8 | # Example
9 |
10 | ```rust
11 | use compress::checksum::adler;
12 | let mut state = adler::State32::new();
13 | state.feed(b"abracadabra");
14 | let checksum = state.result();
15 | ```
16 |
17 | */
18 |
/// The Adler-32 modulus: the largest prime below 2^16.
const MOD_ADLER: u32 = 65521;

/// Adler state for 32 bits
pub struct State32 {
    /// running byte sum (mod MOD_ADLER), seeded with 1
    a: u32,
    /// running sum of `a` values (mod MOD_ADLER)
    b: u32,
}

impl State32 {
    /// Create a new state
    pub fn new() -> State32 {
        State32 { a: 1, b: 0 }
    }

    /// Mutate the state for given data
    pub fn feed(&mut self, buf: &[u8]) {
        for &byte in buf.iter() {
            self.a = (self.a + byte as u32) % MOD_ADLER;
            self.b = (self.b + self.a) % MOD_ADLER;
        }
    }

    /// Get checksum: `b` in the high 16 bits, `a` in the low 16
    pub fn result(&self) -> u32 {
        (self.b << 16) | self.a
    }

    /// Reset the state back to its initial value
    pub fn reset(&mut self) {
        self.a = 1;
        self.b = 0;
    }
}
52 |
--------------------------------------------------------------------------------
/src/data/test.large.z.5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rusty-shell/rust-compress/e16695cac3c8861a2132e0d3f4605a3c11f26078/src/data/test.large.z.5
--------------------------------------------------------------------------------
/src/data/test.lz4.1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rusty-shell/rust-compress/e16695cac3c8861a2132e0d3f4605a3c11f26078/src/data/test.lz4.1
--------------------------------------------------------------------------------
/src/data/test.lz4.2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rusty-shell/rust-compress/e16695cac3c8861a2132e0d3f4605a3c11f26078/src/data/test.lz4.2
--------------------------------------------------------------------------------
/src/data/test.lz4.3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rusty-shell/rust-compress/e16695cac3c8861a2132e0d3f4605a3c11f26078/src/data/test.lz4.3
--------------------------------------------------------------------------------
/src/data/test.lz4.4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rusty-shell/rust-compress/e16695cac3c8861a2132e0d3f4605a3c11f26078/src/data/test.lz4.4
--------------------------------------------------------------------------------
/src/data/test.lz4.5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rusty-shell/rust-compress/e16695cac3c8861a2132e0d3f4605a3c11f26078/src/data/test.lz4.5
--------------------------------------------------------------------------------
/src/data/test.lz4.6:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rusty-shell/rust-compress/e16695cac3c8861a2132e0d3f4605a3c11f26078/src/data/test.lz4.6
--------------------------------------------------------------------------------
/src/data/test.lz4.7:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rusty-shell/rust-compress/e16695cac3c8861a2132e0d3f4605a3c11f26078/src/data/test.lz4.7
--------------------------------------------------------------------------------
/src/data/test.lz4.8:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rusty-shell/rust-compress/e16695cac3c8861a2132e0d3f4605a3c11f26078/src/data/test.lz4.8
--------------------------------------------------------------------------------
/src/data/test.lz4.9:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rusty-shell/rust-compress/e16695cac3c8861a2132e0d3f4605a3c11f26078/src/data/test.lz4.9
--------------------------------------------------------------------------------
/src/data/test.txt:
--------------------------------------------------------------------------------
1 | 0000000: 91c2f55c3215a8d8 c9ffc4e7ef9b6798 ...\2.........g.
2 | 0000010: d61b0d1670a59372 1573c8a561857f9c ....p..r.s..a...
3 | 0000020: 362f1a9d6d953cbf 8e189a2eaa6be88e 6/..m.<......k..
4 | 0000030: 28fe132560ada100 2ef6f85a9e6d1b91 (..%`......Z.m..
5 | 0000040: 8a5d37cc5cbfff89 3537ff5d73aae040 .]7.\...57.]s..@
6 | 0000050: 127c7809f5b183c3 61400ea6b90451ef .|x.....a@....Q.
7 | 0000060: 2272ffbb2cd81b2a 005bbea7bf8f945f "r..,..*.[....._
8 | 0000070: 39e03ab01676bbd0 939be3d936585554 9.:..v......6XUT
9 | 0000080: 41bb97f4410bd105 1cf75f2dee4f35c9 A...A....._-.O5.
10 | 0000090: 8123a6ca320b8951 c8313d50aea11065 .#..2..Q.1=P...e
11 | 00000a0: b269bda147bba414 94575126ed2d2770 .i..G....WQ&.-'p
12 | 00000b0: 422f9292b31978a6 92e24695dbdca27e B/....x...F....~
13 | 00000c0: 419738fc9d2e61bd e1d22e3604c521f9 A.8...a....6..!.
14 | 00000d0: 89b9c7aa81eaf936 589cd50421ea42f9 .......6X...!.B.
15 | 00000e0: e5d716af7e3ba436 e70fcb33b85adff5 ....~;.6...3.Z..
16 | 00000f0: 0b8e32a1e824edc9 30f6abd2c733a354 ..2..$..0....3.T
17 | 0000100: d7a2554901bbbc17 3b8308d8269f96a6 ..UI....;...&...
18 | 0000110: 190985c6f2e5e147 f85c1e61c7a1ce22 .......G.\.a..."
19 | 0000120: 21522181dc5329c3 ff83b2ca5537dca1 !R!..S).....U7..
20 | 0000130: 2a95ab06ccca632f 3f819fc396916676 *.....c/?.....fv
21 | 0000140: 384731c6146aa2a3 e76189389b59ff49 8G1..j...a.8.Y.I
22 | 0000150: a9b1d8dff5a86626 72a11ae65e0b33fc ......f&r...^.3.
23 | 0000160: 89362784bb1a0443 3de0327da454ee20 .6'....C=.2}.T.
24 | 0000170: e1173db62e55dc20 8229a37aaadd4ba6 ..=..U. .).z..K.
25 | 0000180: d4c4a71393b45926 8f34d87fa5c6b28e ......Y&.4......
26 | 0000190: 2f1188a6d1d0711c 9379286d7511a3bd /.....q..y(mu...
27 | 00001a0: 1a2d83fad3ce1bb0 f964a48d8b6f5c8c .-.......d...o\.
28 | 00001b0: 6e992294580e9982 8bbba454fa167dcf n.".X......T..}.
29 | 00001c0: 6c75bd8d6df6cd2b 33b31e56eed09514 lu..m..+3..V....
30 | 00001d0: 0e26ab922c4689bd 43644322d4541da3 .&..,F..CdC".T..
31 | 00001e0: 83ddbe1dcbbf77bd 1da89fad1b162341 ......w.......#A
32 | 00001f0: 43a336bb09b47551 b08eef90e745e832 C.6...uQ.....E.2
33 | 0000200: 4666a92e97425a15 bf6ff63b883eec86 Ff...BZ..o.;.>..
34 | 0000210: 09d23683b90c4218 1d20615003253f40 ..6...B.. aP.%?@
35 | 0000220: 3b91ce6f1ee06042 449a593d61b1e68c ;..o..`BD.Y=a...
36 | 0000230: 9d304c8d8e49c96f b7640996f180401c .0L..I.o.d....@.
37 | 0000240: fd1edd7fc72e9259 06310df7759e5bc9 .......Y.1..u.[.
38 | 0000250: b8674a3c006b8bfa 393858ad1ffd1efd .gJ<.k..98X.....
39 | 0000260: 3f75301efe580293 68a8aa18f2efa649 ?u0..X..h......I
40 | 0000270: e55f21ab282a0f7a 317f8def421150a3 ._!.(*.z1...B.P.
41 | 0000280: a211cfce3587ea16 46be81d9b63646cd ....5...F....6F.
42 | 0000290: 2a30a1be6d917fb9 31015f1b91bbb1dc *0..m...1._.....
43 | 00002a0: 9c52e29b165c4ec3 f8aea285a688d1cb .R...\N.........
44 | 00002b0: 5901ac6448e3686e 061d0d2ff04bbf04 Y..dH.hn.../.K..
45 | 00002c0: 4bf319830a3cfb5a ab051b3c5ad70e6b K....<.Z... Bit {
42 | Bit(FLAT_TOTAL as FlatProbability >> 1)
43 | }
44 |
45 | /// Return flat probability
46 | #[inline]
47 | pub fn to_flat(&self) -> FlatProbability {
48 | let Bit(fp) = *self;
49 | fp
50 | }
51 |
52 | /// Return wide probability
53 | #[inline]
54 | pub fn to_wide(&self) -> WideProbability {
55 | //table_stretch[self.to_flat() as usize]
56 | let p = (self.to_flat() as f32) / (FLAT_TOTAL as f32);
57 | let d = (p / (1.0-p)).ln();
58 | let wp = (d * WIDE_OFFSET as f32).to_i16().unwrap();
59 | wp
60 | }
61 |
    /// Construct from flat probability (no conversion needed:
    /// `Bit` stores the flat value directly)
    #[inline]
    pub fn from_flat(fp: FlatProbability) -> Bit {
        Bit(fp)
    }
67 |
68 | /// Construct from wide probability
69 | #[inline]
70 | pub fn from_wide(wp: WideProbability) -> Bit {
71 | //Bit(table_squash[(wp+WIDE_OFFSET) as usize])
72 | let d = (wp as f32) / (WIDE_OFFSET as f32);
73 | let p = 1.0 / (1.0 + (-d).exp());
74 | let fp = (p * FLAT_TOTAL as f32).to_u16().unwrap();
75 | Bit(fp)
76 | }
77 |
78 | /// Mutate for better zeroes
79 | pub fn update_zero(&mut self, rate: isize, bias: isize) {
80 | let &mut Bit(ref mut fp) = self;
81 | let one = FLAT_TOTAL - bias - (*fp as isize);
82 | *fp += (one >> (rate as usize)) as FlatProbability;
83 | }
84 |
85 | /// Mutate for better ones
86 | pub fn update_one(&mut self, rate: isize, bias: isize) {
87 | let &mut Bit(ref mut fp) = self;
88 | let zero = (*fp as isize) - bias;
89 | *fp -= (zero >> (rate as usize)) as FlatProbability;
90 | }
91 |
92 | /// Mutate for a given value
93 | #[inline]
94 | pub fn update(&mut self, value: bool, rate: isize, bias: isize) {
95 | if !value {
96 | self.update_zero(rate, bias)
97 | }else {
98 | self.update_one(rate, bias)
99 | }
100 | }
101 | }
102 |
103 | impl super::Model for Bit {
104 | fn get_range(&self, value: bool) -> (Border,Border) {
105 | let fp = self.to_flat() as Border;
106 | if !value {
107 | (0, fp)
108 | }else {
109 | (fp, FLAT_TOTAL as Border)
110 | }
111 | }
112 |
113 | fn find_value(&self, offset: Border) -> (bool,Border,Border) {
114 | assert!(offset < FLAT_TOTAL as Border,
115 | "Invalid bit offset {} requested", offset);
116 | let fp = self.to_flat() as Border;
117 | if offset < fp {
118 | (false, 0, fp)
119 | }else {
120 | (true, fp, FLAT_TOTAL as Border)
121 | }
122 | }
123 |
124 | fn get_denominator(&self) -> Border {
125 | FLAT_TOTAL as Border
126 | }
127 | }
128 |
129 |
/// Binary context gate
/// maps an input binary probability into a new one
/// by interpolating between internal maps in non-linear space
pub struct Gate {
    // interpolation anchors, evenly spaced over the wide (stretched) domain
    map: [Bit; PORTAL_BINS],
}

// coordinates of an interpolation lookup, needed to update the same anchors later
pub type BinCoords = (usize, usize); // (index, weight)
138 |
impl Gate {
    /// Create a new gate instance with an identity mapping:
    /// each anchor is initialized to the probability its own position
    /// represents, so passing a bit through a fresh gate is a no-op
    /// (up to fixed-point rounding).
    pub fn new() -> Gate {
        let mut g = Gate {
            map: [Bit::new_equal(); PORTAL_BINS],
        };
        for (i,bit) in g.map.iter_mut().enumerate() {
            // rp spans [-1, +1] across the bins
            let rp = (i as f32)/(PORTAL_OFFSET as f32) - 1.0;
            let wp = (rp * (WIDE_OFFSET as f32)).to_i16().unwrap();
            *bit = Bit::from_wide(wp);
        }
        g
    }

    /// Pass a bit through the gate
    #[inline]
    pub fn pass(&self, bit: &Bit) -> (Bit, BinCoords) {
        let (fp, index) = self.pass_wide(bit.to_wide());
        (Bit::from_flat(fp), index)
    }

    /// Pass a wide probability on input, usable when
    /// you mix it linearly beforehand (libbsc does that)
    /// Linearly interpolates between the two anchors surrounding `wp`.
    pub fn pass_wide(&self, wp: WideProbability) -> (FlatProbability, BinCoords) {
        let index = ((wp + WIDE_OFFSET) >> BIN_WEIGHT_BITS) as usize;
        // NOTE(review): the weight is taken from `wp` itself rather than
        // from `wp + WIDE_OFFSET`; for negative `wp` this relies on
        // two's-complement wrapping and is only equivalent if WIDE_OFFSET
        // is a multiple of BIN_WEIGHT_TOTAL — confirm the constant values.
        let weight = wp as usize & (BIN_WEIGHT_TOTAL-1);
        let z = [
            self.map[index+0].to_flat() as usize,
            self.map[index+1].to_flat() as usize];
        // fixed-point linear interpolation between the two anchors
        let sum = z[0]*(BIN_WEIGHT_TOTAL-weight) + z[1]*weight;
        let fp = (sum >> BIN_WEIGHT_BITS) as FlatProbability;
        (fp, (index, weight))
    }

    //TODO: weight update ratio & bias as well

    /// Mutate for better zeroes: both anchors used in the interpolation
    /// are updated, regardless of the lookup's weight.
    pub fn update_zero(&mut self, bc: BinCoords, rate: isize, bias: isize) {
        let (index, _) = bc;
        self.map[index+0].update_zero(rate, bias);
        self.map[index+1].update_zero(rate, bias);
    }

    /// Mutate for better ones: both anchors used in the interpolation
    /// are updated, regardless of the lookup's weight.
    pub fn update_one(&mut self, bc: BinCoords, rate: isize, bias: isize) {
        let (index, _) = bc;
        self.map[index+0].update_one(rate, bias);
        self.map[index+1].update_one(rate, bias);
    }

    /// Mutate for a given value
    #[inline]
    pub fn update(&mut self, value: bool, bc: BinCoords, rate: isize, bias: isize) {
        if !value {
            self.update_zero(bc, rate, bias)
        }else {
            self.update_one(bc, rate, bias)
        }
    }
}
199 |
--------------------------------------------------------------------------------
/src/entropy/ari/bin.rs:
--------------------------------------------------------------------------------
1 | /*!
2 |
3 | Binary models for the arithmetic coder.
4 | The simplicity of the domain allows for normalized updates in place using bit shifts.
5 |
6 | # Links
7 |
8 | # Example
9 |
10 | # Credit
11 |
12 | */
13 |
14 | use super::Border;
15 |
16 | /// A binary value frequency model
17 | pub struct Model {
18 | /// frequency of bit 0
19 | zero: Border,
20 | /// total frequency (constant)
21 | total: Border,
22 | /// learning rate
23 | pub rate: Border,
24 | }
25 |
26 | impl Model {
27 | /// Create a new flat (50/50 probability) instance
28 | pub fn new_flat(threshold: Border, rate: Border) -> Model {
29 | Model {
30 | zero: threshold>>1,
31 | total: threshold,
32 | rate: rate,
33 | }
34 | }
35 |
36 | /// Create a new instance with a given percentage for zeroes
37 | pub fn new_custom(zero_percent: u8, threshold: Border, rate: Border) -> Model {
38 | assert!(threshold >= 100);
39 | Model {
40 | zero: (zero_percent as Border)*threshold/100,
41 | total: threshold,
42 | rate: rate,
43 | }
44 | }
45 |
46 | /// Reset the model to 50/50 distribution
47 | pub fn reset_flat(&mut self) {
48 | self.zero = self.total>>1;
49 | }
50 |
51 | /// Return the probability of 0
52 | pub fn get_probability_zero(&self) -> Border {
53 | self.zero
54 | }
55 |
56 | /// Return the probability of 1
57 | pub fn get_probability_one(&self) -> Border {
58 | self.total - self.zero
59 | }
60 |
61 | /// Update the frequency of zero
62 | pub fn update_zero(&mut self) {
63 | debug!("\tUpdating zero");
64 | self.zero += (self.total-self.zero) >> (self.rate as usize);
65 | }
66 |
67 | /// Update the frequency of one
68 | pub fn update_one(&mut self) {
69 | debug!("\tUpdating one");
70 | self.zero -= self.zero >> (self.rate as usize);
71 | }
72 |
73 | /// Update frequencies in favor of given 'value'
74 | /// Lower factors produce more aggressive updates
75 | pub fn update(&mut self, value: bool) {
76 | if value {
77 | self.update_one()
78 | }else {
79 | self.update_zero()
80 | }
81 | }
82 | }
83 |
84 | impl super::Model for Model {
85 | fn get_range(&self, value: bool) -> (Border,Border) {
86 | if value {
87 | (self.zero, self.total)
88 | }else {
89 | (0, self.zero)
90 | }
91 | }
92 |
93 | fn find_value(&self, offset: Border) -> (bool,Border,Border) {
94 | assert!(offset < self.total,
95 | "Invalid frequency offset {} requested under total {}",
96 | offset, self.total);
97 | if offset < self.zero {
98 | (false, 0, self.zero)
99 | }else {
100 | (true, self.zero, self.total)
101 | }
102 | }
103 |
104 | fn get_denominator(&self) -> Border {
105 | self.total
106 | }
107 | }
108 |
109 |
/// A proxy model for the combination of two binary models
/// using equation: (wa * A + wb * B) >> ws
pub struct SumProxy<'a> {
    /// first component model
    first: &'a Model,
    /// second component model
    second: &'a Model,
    /// weight applied to the first model
    w_first: Border,
    /// weight applied to the second model
    w_second: Border,
    /// right shift applied to the weighted sum (normalizes the weights)
    w_shift: Border,
}
119 |
120 | impl<'a> SumProxy<'a> {
121 | /// Create a new instance of the binary sum proxy
122 | pub fn new(wa: Border, first: &'a Model, wb: Border, second: &'a Model, shift: Border) -> SumProxy<'a> {
123 | SumProxy {
124 | first: first,
125 | second: second,
126 | w_first: wa,
127 | w_second: wb,
128 | w_shift: shift,
129 | }
130 | }
131 |
132 | fn get_probability_zero(&self) -> Border {
133 | (self.w_first * self.first.get_probability_zero() +
134 | self.w_second * self.second.get_probability_zero()) >>
135 | (self.w_shift as usize)
136 | }
137 | }
138 |
139 | impl<'a> super::Model for SumProxy<'a> {
140 | fn get_range(&self, value: bool) -> (Border,Border) {
141 | let zero = self.get_probability_zero();
142 | if value {
143 | (zero, self.get_denominator())
144 | }else {
145 | (0, zero)
146 | }
147 | }
148 |
149 | fn find_value(&self, offset: Border) -> (bool,Border,Border) {
150 | let zero = self.get_probability_zero();
151 | let total = self.get_denominator();
152 | assert!(offset < total,
153 | "Invalid frequency offset {} requested under total {}",
154 | offset, total);
155 | if offset < zero {
156 | (false, 0, zero)
157 | }else {
158 | (true, zero, total)
159 | }
160 | }
161 |
162 | fn get_denominator(&self) -> Border {
163 | (self.w_first * self.first.get_denominator() +
164 | self.w_second * self.second.get_denominator()) >>
165 | (self.w_shift as usize)
166 | }
167 | }
168 |
--------------------------------------------------------------------------------
/src/entropy/ari/mod.rs:
--------------------------------------------------------------------------------
1 | /*!
2 |
3 | Arithmetic encoder/decoder using the Range encoder underneath. Requires `entropy` feature, enabled by default
4 | Can be used in a general case of entropy coding stage. Supposed to be fast.
5 |
6 | # Links
7 |
8 | http://en.wikipedia.org/wiki/Arithmetic_coding
9 | http://en.wikipedia.org/wiki/Range_encoding
10 |
11 | # Example
12 | ```rust
13 | # #![allow(unused_must_use)]
14 | use std::io::{BufWriter, BufReader, Read, Write};
15 | use compress::entropy::ari;
16 |
17 | // Encode some text
18 | let text = "some text";
19 | let mut e = ari::ByteEncoder::new(BufWriter::new(Vec::new()));
20 | e.write_all(text.as_bytes()).unwrap();
21 | let (encoded, _) = e.finish();
22 | let inner = encoded.into_inner().unwrap();
23 |
24 | // Decode the encoded text
25 | let mut d = ari::ByteDecoder::new(BufReader::new(&inner[..]));
26 | let mut decoded = Vec::new();
27 | d.read_to_end(&mut decoded).unwrap();
28 | ```
29 | # Credit
30 |
31 | This is an original implementation.
32 |
33 | */
34 |
35 | #![allow(missing_docs)]
36 |
37 | use std::fmt::Display;
38 | use std::io::{self, Read, Write};
39 |
40 | use super::super::byteorder::{BigEndian, WriteBytesExt, ReadBytesExt};
41 | use super::super::byteorder_err_to_io;
42 |
43 | pub use self::table::{ByteDecoder, ByteEncoder};
44 |
45 | pub mod apm;
46 | pub mod bin;
47 | pub mod table;
48 | #[cfg(test)]
49 | mod test;
50 |
51 | pub type Symbol = u8;
52 | const SYMBOL_BITS: usize = 8;
53 | const SYMBOL_TOTAL: usize = 1< RangeEncoder {
83 | debug_assert!(max_range > (SYMBOL_TOTAL as Border));
84 | RangeEncoder {
85 | low: 0,
86 | hai: !0,
87 | threshold: max_range,
88 | bits_lost_on_threshold_cut: 0.0,
89 | bits_lost_on_division: 0.0,
90 | }
91 | }
92 |
    /// Reset the current range back to the full interval,
    /// ready to start encoding/decoding a fresh stream.
    pub fn reset(&mut self) {
        self.low = 0;
        self.hai = !0;
    }
98 |
99 | #[cfg(tune)]
100 | fn count_bits(range: Border, total: Border) -> f32 {
101 | -((range as f32) / (total as f32)).log2()
102 | }
103 |
104 | #[cfg(not(tune))]
105 | fn count_bits(_range: Border, _total: Border) -> f32 {
106 | 0.0
107 | }
108 |
109 | /// Return the number of bits lost due to threshold cuts and integer operations
110 | #[cfg(tune)]
111 | pub fn get_bits_lost(&self) -> (f32, f32) {
112 | (self.bits_lost_on_threshold_cut, self.bits_lost_on_division)
113 | }
114 |
115 | /// Process a given interval [from/total,to/total) into the current range
116 | /// write into the output slice, and return the number of symbols produced
117 | pub fn process(&mut self, total: Border, from: Border, to: Border, output: &mut [Symbol]) -> usize {
118 | debug_assert!(from0, "RangeCoder range is too narrow [{}-{}) for the total {}",
122 | self.low, self.hai, total);
123 | debug!("\t\tProcessing [{}-{})/{} with range {}", from, to, total, range);
124 | let mut lo = self.low + range*from;
125 | let mut hi = self.low + range*to;
126 | self.bits_lost_on_division += RangeEncoder::count_bits(range*total, old_range);
127 | let mut num_shift = 0;
128 | loop {
129 | if (lo^hi) & BORDER_SYMBOL_MASK != 0 {
130 | if hi-lo > self.threshold {
131 | break
132 | }
133 | let old_range = hi-lo;
134 | let lim = hi & BORDER_SYMBOL_MASK;
135 | if hi-lim >= lim-lo {lo=lim}
136 | else {hi=lim-1};
137 | debug_assert!(lo < hi);
138 | self.bits_lost_on_threshold_cut += RangeEncoder::count_bits(hi-lo, old_range);
139 | }
140 |
141 | debug!("\t\tShifting on [{}-{}) to symbol {}", lo, hi, lo>>BORDER_EXCESS);
142 | output[num_shift] = (lo>>BORDER_EXCESS) as Symbol;
143 | num_shift += 1;
144 | lo<<=SYMBOL_BITS; hi<<=SYMBOL_BITS;
145 | debug_assert!(lo < hi);
146 | }
147 | self.low = lo;
148 | self.hai = hi;
149 | num_shift
150 | }
151 |
    /// Query the value encoded by 'code' in range [0,total)
    ///
    /// Maps an absolute code point inside the current [low,hai) interval
    /// back to a frequency offset in [0,total) — the inverse of `process`.
    pub fn query(&self, total: Border, code: Border) -> Border {
        debug!("\t\tQuerying code {} of total {} under range [{}-{})",
            code, total, self.low, self.hai);
        debug_assert!(self.low <= code && code < self.hai);
        // integer division here mirrors the rounding on the encoding side
        let range = (self.hai - self.low) / total;
        (code - self.low) / range
    }
160 |
    /// Get the code tail and close the range
    /// used at the end of encoding
    ///
    /// Returns the final `low` bound; the range is zeroed afterwards, so
    /// the encoder cannot be reused without a `reset`.
    pub fn get_code_tail(&mut self) -> Border {
        let tail = self.low;
        self.low = 0;
        self.hai = 0;
        tail
    }
169 | }
170 |
171 |
172 | /// An abstract model to produce probability ranges
173 | /// Can be a table, a mix of tables, or just a smart function.
174 | pub trait Model {
175 | /// Get the probability range of a value
176 | fn get_range(&self, value: V) -> (Border,Border);
177 | /// Find the value by a given probability offset, return with the range
178 | fn find_value(&self, offset: Border) -> (V,Border,Border);
179 | /// Get the sum of all probabilities
180 | fn get_denominator(&self) -> Border;
181 |
182 | /// Encode a value using a range encoder
183 | /// return the number of symbols written
184 | fn encode(&self, value: V, re: &mut RangeEncoder, out: &mut [Symbol]) -> usize {
185 | let (lo, hi) = self.get_range(value);
186 | let total = self.get_denominator();
187 | debug!("\tEncoding value {} of range [{}-{}) with total {}", value, lo, hi, total);
188 | re.process(total, lo, hi, out)
189 | }
190 |
191 | /// Decode a value using given 'code' on the range encoder
192 | /// return a (value, num_symbols_to_shift) pair
193 | fn decode(&self, code: Border, re: &mut RangeEncoder) -> (V, usize) {
194 | let total = self.get_denominator();
195 | let offset = re.query(total, code);
196 | let (value, lo, hi) = self.find_value(offset);
197 | debug!("\tDecoding value {} of offset {} with total {}", value, offset, total);
198 | let mut out = [0 as Symbol; BORDER_BYTES];
199 | let shift = re.process(total, lo, hi, &mut out[..]);
200 | debug_assert_eq!(if shift==0 {0} else {code>>(BORDER_BITS - shift*8)},
201 | out[..shift].iter().fold(0 as Border, |u,&b| (u<<8)+(b as Border)));
202 | (value, shift)
203 | }
204 | }
205 |
206 |
207 | /// An arithmetic encoder helper
208 | pub struct Encoder {
209 | stream: W,
210 | range: RangeEncoder,
211 | }
212 |
213 | impl Encoder {
214 | /// Create a new encoder on top of a given Writer
215 | pub fn new(w: W) -> Encoder {
216 | Encoder {
217 | stream: w,
218 | range: RangeEncoder::new(RANGE_DEFAULT_THRESHOLD),
219 | }
220 | }
221 |
222 | /// Encode an abstract value under the given Model
223 | pub fn encode>(&mut self, value: V, model: &M) -> io::Result<()> {
224 | let mut buf = [0 as Symbol; BORDER_BYTES];
225 | let num = model.encode(value, &mut self.range, &mut buf[..]);
226 | self.stream.write(&buf[..num]).map(|_| ())
227 | }
228 |
229 | /// Finish encoding by writing the code tail word
230 | pub fn finish(mut self) -> (W, io::Result<()>) {
231 | debug_assert!(BORDER_BITS == 32);
232 | let code = self.range.get_code_tail();
233 | let result = self.stream.write_u32::(code)
234 | .map_err(byteorder_err_to_io);
235 | let result = result.and(self.stream.flush());
236 | (self.stream, result)
237 | }
238 |
239 | /// Flush the output stream
240 | pub fn flush(&mut self) -> io::Result<()> {
241 | self.stream.flush()
242 | }
243 |
244 | /// Return the number of bytes lost due to threshold cuts and integer operations
245 | #[cfg(tune)]
246 | pub fn get_bytes_lost(&self) -> (f32, f32) {
247 | let (a,b) = self.range.get_bits_lost();
248 | (a/8.0, b/8.0)
249 | }
250 | }
251 |
252 | /// An arithmetic decoder helper
253 | pub struct Decoder {
254 | stream: R,
255 | range: RangeEncoder,
256 | code: Border,
257 | bytes_pending: usize,
258 | }
259 |
260 | impl Decoder {
261 | /// Create a decoder on top of a given Reader
262 | pub fn new(r: R) -> Decoder {
263 | Decoder {
264 | stream: r,
265 | range: RangeEncoder::new(RANGE_DEFAULT_THRESHOLD),
266 | code: 0,
267 | bytes_pending: BORDER_BYTES,
268 | }
269 | }
270 |
    /// Refill the code word: read however many bytes the previous decode
    /// step shifted out (`bytes_pending`) and append them to `code`,
    /// most significant byte first.
    fn feed(&mut self) -> io::Result<()> {
        while self.bytes_pending != 0 {
            let b = try!(self.stream.read_u8());
            self.code = (self.code<<8) + (b as Border);
            self.bytes_pending -= 1;
        }
        Ok(())
    }
279 |
280 | /// Decode an abstract value based on the given Model
281 | pub fn decode>(&mut self, model: &M) -> io::Result {
282 | self.feed().unwrap();
283 | let (value, shift) = model.decode(self.code, &mut self.range);
284 | self.bytes_pending = shift;
285 | Ok(value)
286 | }
287 |
    /// Finish decoding
    ///
    /// Drains any bytes still pending for the code word, so the underlying
    /// reader is left positioned just past the encoded stream, then returns
    /// the reader along with the result of that final read.
    pub fn finish(mut self) -> (R, io::Result<()>) {
        let err = self.feed();
        (self.stream, err)
    }
293 | }
294 |
--------------------------------------------------------------------------------
/src/entropy/ari/table.rs:
--------------------------------------------------------------------------------
1 | /*!
2 |
3 | Frequency table models for the arithmetic coder.
4 | The module also implements Reader/Writer using simple byte coding.
5 |
6 | # Links
7 |
8 | # Example
9 |
10 | # Credit
11 |
12 | */
13 |
14 | use std::io::{self, Read, Write};
15 | use super::Border;
16 |
17 | pub type Frequency = u16;
18 |
19 | /// A simple table of frequencies.
20 | pub struct Model {
21 | /// sum of frequencies
22 | total: Border,
23 | /// main table: value -> Frequency
24 | table: Vec,
25 | /// maximum allowed sum of frequency,
26 | /// should be smaller than RangeEncoder::threshold
27 | cut_threshold: Border,
28 | /// number of bits to shift on cut
29 | cut_shift: usize,
30 | }
31 |
32 | impl Model {
33 | /// Create a new table with frequencies initialized by a function
34 | pub fn new_custom(num_values: usize, threshold: Border,
35 | mut fn_init: F) -> Model
36 | where F: FnMut(usize) -> Frequency
37 | {
38 | let freq: Vec = (0..num_values).map(|i| fn_init(i)).collect();
39 | let total = freq.iter().fold(0 as Border, |u,&f| u+(f as Border));
40 | let mut ft = Model {
41 | total: total,
42 | table: freq,
43 | cut_threshold: threshold,
44 | cut_shift: 1,
45 | };
46 | // downscale if needed
47 | while ft.total >= threshold {
48 | ft.downscale();
49 | }
50 | ft
51 | }
52 |
    /// Create a new table with all frequencies being equal
    pub fn new_flat(num_values: usize, threshold: Border) -> Model {
        Model::new_custom(num_values, threshold, |_| 1)
    }
57 |
58 | /// Reset the table to the flat state
59 | pub fn reset_flat(&mut self) {
60 | for freq in self.table.iter_mut() {
61 | *freq = 1;
62 | }
63 | self.total = self.table.len() as Border;
64 | }
65 |
    /// Adapt the table in favor of given 'value',
    /// using 'add_log' and 'add_const' to produce the additive factor:
    /// add = (total >> add_log) + add_const.
    /// The higher 'add_log' is, the more conservative is the adaptation.
    pub fn update(&mut self, value: usize, add_log: usize, add_const: Border) {
        let add = (self.total>>add_log) + add_const;
        assert!(add < 2*self.cut_threshold);
        debug!("\tUpdating by adding {} to value {}", add, value);
        // NOTE(review): `add as Frequency` narrows Border -> u16; presumably
        // cut_threshold is chosen small enough that this cannot truncate — confirm.
        self.table[value] += add as Frequency;
        self.total += add;
        // keep the running total below the cut threshold via periodic rescaling
        if self.total >= self.cut_threshold {
            self.downscale();
            assert!(self.total < self.cut_threshold);
        }
    }
80 |
81 | /// Reduce frequencies by 'cut_iter' bits
82 | pub fn downscale(&mut self) {
83 | debug!("\tDownscaling frequencies");
84 | let roundup = (1<> self.cut_shift;
89 | self.total += *freq as Border;
90 | }
91 | }
92 |
93 | /// Return read-only frequencies slice
94 | pub fn get_frequencies<'a>(&'a self) -> &'a [Frequency] {
95 | &self.table[..]
96 | }
97 | }
98 |
99 | impl super::Model for Model {
100 | fn get_range(&self, value: usize) -> (Border,Border) {
101 | let lo = self.table[..value].iter().fold(0, |u,&f| u+(f as Border));
102 | (lo, lo + (self.table[value] as Border))
103 | }
104 |
105 | fn find_value(&self, offset: Border) -> (usize,Border,Border) {
106 | assert!(offset < self.total,
107 | "Invalid frequency offset {} requested under total {}",
108 | offset, self.total);
109 | let mut value = 0;
110 | let mut lo = 0 as Border;
111 | let mut hi;
112 | while {hi=lo+(self.table[value] as Border); hi} <= offset {
113 | lo = hi;
114 | value += 1;
115 | }
116 | (value, lo, hi)
117 | }
118 |
119 | fn get_denominator(&self) -> Border {
120 | self.total
121 | }
122 | }
123 |
124 |
125 | /// A proxy model for the sum of two frequency tables
126 | /// using equation: (wa * A + wb * B) >> ws
127 | pub struct SumProxy<'a> {
128 | first: &'a Model,
129 | second: &'a Model,
130 | w_first: Border,
131 | w_second: Border,
132 | w_shift: Border,
133 | }
134 |
135 | impl<'a> SumProxy<'a> {
136 | /// Create a new instance of the table sum proxy
137 | pub fn new(wa: Border, fa: &'a Model, wb: Border, fb: &'a Model, shift: Border) -> SumProxy<'a> {
138 | assert_eq!(fa.get_frequencies().len(), fb.get_frequencies().len());
139 | SumProxy {
140 | first: fa,
141 | second: fb,
142 | w_first: wa,
143 | w_second: wb,
144 | w_shift: shift,
145 | }
146 | }
147 | }
148 |
149 | impl<'a> super::Model for SumProxy<'a> {
150 | fn get_range(&self, value: usize) -> (Border,Border) {
151 | let (lo0, hi0) = self.first.get_range(value);
152 | let (lo1, hi1) = self.second.get_range(value);
153 | let (wa, wb, ws) = (self.w_first, self.w_second, self.w_shift as usize);
154 | ((wa*lo0 + wb*lo1)>>ws, (wa*hi0 + wb*hi1)>>ws)
155 | }
156 |
157 | fn find_value(&self, offset: Border) -> (usize,Border,Border) {
158 | assert!(offset < self.get_denominator(),
159 | "Invalid frequency offset {} requested under total {}",
160 | offset, self.get_denominator());
161 | let mut value = 0;
162 | let mut lo = 0 as Border;
163 | let mut hi;
164 | while { hi = lo +
165 | (self.w_first * (self.first.get_frequencies()[value] as Border) +
166 | self.w_second * (self.second.get_frequencies()[value] as Border)) >>
167 | (self.w_shift as usize);
168 | hi <= offset } {
169 | lo = hi;
170 | value += 1;
171 | }
172 | (value, lo, hi)
173 | }
174 |
175 | fn get_denominator(&self) -> Border {
176 | (self.w_first * self.first.get_denominator() +
177 | self.w_second * self.second.get_denominator()) >>
178 | (self.w_shift as usize)
179 | }
180 | }
181 |
182 |
183 | /// A basic byte-encoding arithmetic
184 | /// uses a special terminator code to end the stream
185 | pub struct ByteEncoder {
186 | /// A lower level encoder
187 | pub encoder: super::Encoder,
188 | /// A basic frequency table
189 | pub freq: Model,
190 | }
191 |
192 | impl ByteEncoder {
193 | /// Create a new encoder on top of a given Writer
194 | pub fn new(w: W) -> ByteEncoder {
195 | let freq_max = super::RANGE_DEFAULT_THRESHOLD >> 2;
196 | ByteEncoder {
197 | encoder: super::Encoder::new(w),
198 | freq: Model::new_flat(super::SYMBOL_TOTAL+1, freq_max),
199 | }
200 | }
201 |
202 | /// Finish encoding & write the terminator symbol
203 | pub fn finish(mut self) -> (W, io::Result<()>) {
204 | let ret = self.encoder.encode(super::SYMBOL_TOTAL, &self.freq);
205 | let (w,r2) = self.encoder.finish();
206 | (w, ret.and(r2))
207 | }
208 | }
209 |
210 | impl Write for ByteEncoder {
211 | fn write(&mut self, buf: &[u8]) -> io::Result {
212 | for byte in buf.iter() {
213 | let value = *byte as usize;
214 | try!(self.encoder.encode(value, &self.freq));
215 | self.freq.update(value, 10, 1);
216 | }
217 |
218 | Ok(buf.len())
219 | }
220 |
221 | fn flush(&mut self) -> io::Result<()> {
222 | self.encoder.flush()
223 | }
224 | }
225 |
226 |
227 | /// A basic byte-decoding arithmetic
228 | /// expects a special terminator code for the end of the stream
229 | pub struct ByteDecoder {
230 | /// A lower level decoder
231 | pub decoder: super::Decoder,
232 | /// A basic frequency table
233 | pub freq: Model,
234 | /// Remember if we found the terminator code
235 | is_eof: bool,
236 | }
237 |
238 | impl ByteDecoder {
239 | /// Create a decoder on top of a given Reader
240 | pub fn new(r: R) -> ByteDecoder {
241 | let freq_max = super::RANGE_DEFAULT_THRESHOLD >> 2;
242 | ByteDecoder {
243 | decoder: super::Decoder::new(r),
244 | freq: Model::new_flat(super::SYMBOL_TOTAL+1, freq_max),
245 | is_eof: false,
246 | }
247 | }
248 |
249 | /// Finish decoding
250 | pub fn finish(self) -> (R, io::Result<()>) {
251 | self.decoder.finish()
252 | }
253 | }
254 |
255 | impl Read for ByteDecoder {
256 | fn read(&mut self, dst: &mut [u8]) -> io::Result {
257 | if self.is_eof {
258 | return Ok(0)
259 | }
260 | let mut amount = 0;
261 | for out_byte in dst.iter_mut() {
262 | let value = try!(self.decoder.decode(&self.freq));
263 | if value == super::SYMBOL_TOTAL {
264 | self.is_eof = true;
265 | break
266 | }
267 | self.freq.update(value, 10, 1);
268 | *out_byte = value as u8;
269 | amount += 1;
270 | }
271 | Ok(amount)
272 | }
273 | }
274 |
--------------------------------------------------------------------------------
/src/entropy/ari/test.rs:
--------------------------------------------------------------------------------
1 | use std::io::{BufReader, BufWriter, Write, Read};
2 | use std::vec::Vec;
3 | #[cfg(feature="unstable")]
4 | use test::Bencher;
5 |
6 | static TEXT_INPUT: &'static [u8] = include_bytes!("../../data/test.txt");
7 |
8 | fn roundtrip(bytes: &[u8]) {
9 | info!("Roundtrip Ari of size {}", bytes.len());
10 | let mut e = super::table::ByteEncoder::new(BufWriter::new(Vec::new()));
11 | e.write(bytes).unwrap();
12 | let (e, r) = e.finish();
13 | r.unwrap();
14 | let encoded = e.into_inner().unwrap();
15 | debug!("Roundtrip input {:?} encoded {:?}", bytes, encoded);
16 | let mut d = super::ByteDecoder::new(BufReader::new(&encoded[..]));
17 | let mut decoded = Vec::new();
18 | d.read_to_end(&mut decoded).unwrap();
19 | assert_eq!(&bytes[..], &decoded[..]);
20 | }
21 |
22 | fn encode_binary(bytes: &[u8], model: &mut super::bin::Model) -> Vec {
23 | let mut encoder = super::Encoder::new(BufWriter::new(Vec::new()));
24 | for &byte in bytes.iter() {
25 | for i in 0..8 {
26 | let bit = (byte & (1<> 3, factor);
38 | let output = encode_binary(bytes, &mut bm);
39 | bm.reset_flat();
40 | let mut decoder = super::Decoder::new(BufReader::new(&output[..]));
41 | for &byte in bytes.iter() {
42 | let mut value = 0u8;
43 | for i in 0..8 {
44 | let bit = decoder.decode(&bm).unwrap();
45 | bm.update(bit);
46 | value += (bit as u8)<> 3;
96 | let mut t0 = super::table::Model::new_flat(16, threshold);
97 | let mut t1 = super::table::Model::new_flat(16, threshold);
98 | let mut b0 = super::bin::Model::new_flat(threshold, 3);
99 | let mut b1 = super::bin::Model::new_flat(threshold, 5);
100 | // encode (high 4 bits with the proxy table, low 4 bits with the proxy binary)
101 | let mut encoder = super::Encoder::new(BufWriter::new(Vec::new()));
102 | for &byte in bytes.iter() {
103 | let high = (byte>>4) as usize;
104 | {
105 | let proxy = super::table::SumProxy::new(2, &t0, 1, &t1, 0);
106 | encoder.encode(high, &proxy).unwrap();
107 | }
108 | t0.update(high, update0, 1);
109 | t1.update(high, update1, 1);
110 | for i in 0..4 {
111 | let bit = (byte & (1<>i) & 1 != 0;
157 | let (bit_new, coords) = gate.pass(&bit);
158 | encoder.encode(b1, &bit_new).unwrap();
159 | bit.update(b1, 10, 0);
160 | gate.update(b1, coords, 10, 0);
161 | }
162 | }
163 | let (writer, err) = encoder.finish();
164 | err.unwrap();
165 | let output = writer.into_inner().unwrap();
166 | bit = super::apm::Bit::new_equal();
167 | gate = super::apm::Gate::new();
168 | let mut decoder = super::Decoder::new(BufReader::new(&output[..]));
169 | for b8 in bytes.iter() {
170 | let mut decoded = 0u8;
171 | for i in 0..8 {
172 | let (bit_new, coords) = gate.pass(&bit);
173 | let b1 = decoder.decode(&bit_new).unwrap();
174 | if b1 {
175 | decoded += 1< = vec![0u8; TEXT_INPUT.len()];
219 | bh.iter(|| {
220 | let cursor = Cursor::new(&mut storage[..]);
221 | let mut w = BufWriter::new(cursor);
222 | w.seek(SeekFrom::Start(0)).unwrap();
223 | let mut e = super::ByteEncoder::new(w);
224 | e.write(TEXT_INPUT).unwrap();
225 | });
226 | bh.bytes = TEXT_INPUT.len() as u64;
227 | }
228 |
--------------------------------------------------------------------------------
/src/flate.rs:
--------------------------------------------------------------------------------
1 | //! DEFLATE Compression and Decompression. Requires `flate` feature, enabled by default
2 | //!
3 | //! This module contains an implementation of the DEFLATE compression scheme.
4 | //! This format is often used as the underpinning of other compression formats.
5 | //!
6 | //! # Example
7 | //!
8 | //! ```rust,ignore
9 | //! use compress::flate;
10 | //! use std::fs::File;
11 | //! use std::path::Path;
12 | //! use std::io::Read;
13 | //!
14 | //! let stream = File::open(&Path::new("path/to/file.flate")).unwrap();
15 | //! let mut decompressed = Vec::new();
16 | //! flate::Decoder::new(stream).read_to_end(&mut decompressed);
17 | //! ```
18 | //!
19 | //! # Related links
20 | //!
21 | //! * http://tools.ietf.org/html/rfc1951 - RFC that this implementation is based
22 | //! on
23 | //! * http://www.gzip.org/zlib/rfc-deflate.html - simplified version of RFC 1951
24 | //! used as a reference
25 | //! * http://svn.ghostscript.com/ghostscript/trunk/gs/zlib/contrib/puff/puff.c -
26 | //! Much of this code is based on the puff.c implementation found here
27 |
28 | use std::cmp;
29 | use std::ptr::copy_nonoverlapping;
30 | use std::io::{self, Read};
31 | use std::vec::Vec;
32 |
33 | use super::byteorder::{LittleEndian, ReadBytesExt};
34 | use super::ReadExact;
35 |
36 | const MAXBITS: usize = 15;
37 | const MAXLCODES: u16 = 286;
38 | const MAXDCODES: u16 = 30;
39 | const MAXCODES: u16 = MAXLCODES + MAXDCODES;
40 | const HISTORY: usize = 32 * 1024;
41 |
/// Failure modes of the DEFLATE decoder; turned into an `io::Error`
/// (with a matching message) by the `error` helper below.
enum Error {
    HuffmanTreeTooLarge,
    InvalidBlockCode,
    InvalidHuffmanHeaderSymbol,
    InvalidHuffmanTree,
    InvalidHuffmanTreeHeader,
    InvalidHuffmanCode,
    InvalidStaticSize,
    NotEnoughBits,
}
52 |
53 | fn error(e: Error) -> io::Result {
54 | Err(io::Error::new(
55 | io::ErrorKind::InvalidInput,
56 | match e {
57 | Error::HuffmanTreeTooLarge => "huffman tree too large",
58 | Error::InvalidBlockCode => "invalid block code",
59 | Error::InvalidHuffmanHeaderSymbol => "invalid huffman header symbol",
60 | Error::InvalidHuffmanTree => "invalid huffman tree",
61 | Error::InvalidHuffmanTreeHeader => "invalid huffman tree header",
62 | Error::InvalidHuffmanCode => "invalid huffman code",
63 | Error::InvalidStaticSize => "invalid static size",
64 | Error::NotEnoughBits => "not enough bits",
65 | }
66 | ))
67 | }
68 |
69 | struct HuffmanTree {
70 | /// An array which counts the number of codes which can be found at the
71 | /// index's bit length, or count[n] is the number of n-bit codes
72 | pub count: [u16; MAXBITS + 1],
73 |
74 | /// Symbols in this huffman tree in sorted order. This preserves the
75 | /// original huffman codes
76 | pub symbol: [u16; MAXCODES as usize],
77 | }
78 |
79 | impl HuffmanTree {
80 | /// Constructs a new huffman tree for decoding. If the given array has
81 | /// length N, then the huffman tree can be used to decode N symbols. Each
82 | /// entry in the array corresponds to the length of the nth symbol.
83 | fn construct(lens: &[u16]) -> io::Result {
84 | let mut tree = HuffmanTree {
85 | count: [0; MAXBITS + 1],
86 | symbol: [0; MAXCODES as usize],
87 | };
88 | // Collect the lengths of all symbols
89 | for len in lens.iter() {
90 | tree.count[*len as usize] += 1;
91 | }
92 | // If there weren't actually any codes, then we're done
93 | if tree.count[0] as usize == lens.len() { return Ok(tree) }
94 |
95 | // Make sure that this tree is sane. Each bit gives us 2x more codes to
96 | // work with, but if the counts add up to greater than the available
97 | // amount, then this is an invalid table.
98 | let mut left = 1;
99 | for i in 1..(MAXBITS + 1) {
100 | left *= 2;
101 | left -= tree.count[i] as isize;
102 | if left < 0 { return error(Error::InvalidHuffmanTree) }
103 | }
104 |
105 | // Generate the offset of each length into the 'symbol' array
106 | let mut offs = [0; MAXBITS + 1];
107 | for i in 1..MAXBITS {
108 | offs[i + 1] = offs[i] + tree.count[i];
109 | }
110 |
111 | // Insert all symbols into the table, in sorted order using the `offs`
112 | // array generated above.
113 | for (sym, &len) in lens.iter().enumerate() {
114 | if len != 0 {
115 | tree.symbol[offs[len as usize] as usize] = sym as u16;
116 | offs[len as usize] += 1;
117 | }
118 | }
119 | return Ok(tree);
120 | }
121 |
122 | /// Decodes a codepoint from the buffer.
123 | ///
124 | /// This operates by reading bits as long as the code isn't found within the
125 | /// valid range of the codes itself. Remember the codepoints are all encoded
126 | /// by a sequence of lengths. The codepoint being decoded needs to figure
127 | /// out what lengths it's between, and then within that range we can index
128 | /// into the whole symbol array to pluck out the right symbol.
129 | fn decode(&self, s: &mut Decoder) -> io::Result {
130 | // this could be a lot faster.
131 | let mut code = 0;
132 | let mut first = 0;
133 | let mut index = 0;
134 | for len in 1..(MAXBITS + 1) {
135 | code |= try!(s.bits(1));
136 | let count = self.count[len];
137 | if code < first + count {
138 | return Ok(self.symbol[(index + (code - first)) as usize])
139 | }
140 | index += count;
141 | first += count;
142 | first <<= 1;
143 | code <<= 1;
144 | }
145 | return error(Error::NotEnoughBits);
146 | }
147 | }
148 |
149 | #[cfg(genflate)]
150 | fn main() {
151 | static FIXLCODES: usize = 388;
152 | let mut arr = [0; FIXLCODES];
153 | for i in 0..144 { arr[i] = 8; }
154 | for i in 144..256 { arr[i] = 9; }
155 | for i in 256..280 { arr[i] = 7; }
156 | for i in 280..288 { arr[i] = 8; }
157 | println!("{:?}", HuffmanTree::construct(arr[..FIXLCODES]));
158 | for i in 0..MAXDCODES { arr[i] = 5; }
159 | println!("{:?}", HuffmanTree::construct(arr[..MAXDCODES]));
160 | }
161 |
162 | /// The structure that is used to decode an LZ4 data stream. This wraps an
163 | /// internal reader which is used as the source of all data.
164 | pub struct Decoder {
165 | /// Wrapped reader which is exposed to allow getting it back.
166 | pub r: R,
167 |
168 | output: Vec,
169 | outpos: usize,
170 |
171 | block: Vec,
172 | pos: usize,
173 |
174 | bitbuf: usize,
175 | bitcnt: usize,
176 | eof: bool,
177 | }
178 |
179 | impl Decoder {
180 | /// Creates a new flate decoder which will read data from the specified
181 | /// source
182 | pub fn new(r: R) -> Decoder {
183 | Decoder {
184 | r: r,
185 | output: Vec::with_capacity(HISTORY),
186 | outpos: 0,
187 | block: Vec::new(),
188 | pos: 0,
189 | bitbuf: 0,
190 | bitcnt: 0,
191 | eof: false,
192 | }
193 | }
194 |
195 | fn block(&mut self) -> io::Result<()> {
196 | self.pos = 0;
197 | self.block = Vec::with_capacity(4096);
198 | if try!(self.bits(1)) == 1 { self.eof = true; }
199 | match try!(self.bits(2)) {
200 | 0 => self.statik(),
201 | 1 => self.fixed(),
202 | 2 => self.dynamic(),
203 | 3 => error(Error::InvalidBlockCode),
204 | _ => unreachable!(),
205 | }
206 | }
207 |
    /// Copy newly decoded bytes from `self.block[from..]` into the circular
    /// 32K history window (`self.output`), advancing the write cursor
    /// `self.outpos` and wrapping at HISTORY.
    fn update_output(&mut self, mut from: usize) {
        let to = self.block.len();
        // only the last HISTORY bytes can ever matter for back-references
        if to - from > HISTORY {
            from = to - HISTORY;
        }
        let amt = to - from;
        let remaining = HISTORY - self.outpos;
        let n = cmp::min(amt, remaining);
        if self.output.len() < HISTORY {
            // history not yet full: append to the still-growing window
            self.output.extend(self.block[from..(from + n)].iter().map(|b| *b));
        } else if n > 0 {
            assert_eq!(self.output.len(), HISTORY);
            // SAFETY(review): `block` and `output` are distinct Vecs, so the
            // regions cannot overlap, and `n <= HISTORY - outpos` keeps the
            // destination in bounds; presumably `from + n <= block.len()`
            // holds as well — confirm against callers.
            unsafe { copy_nonoverlapping(
                &self.block[from],
                &mut self.output[self.outpos],
                n
            )};
        }
        self.outpos += n;
        if n < amt {
            // wrap around: the remainder goes to the start of the window
            unsafe { copy_nonoverlapping(
                &self.block[from+n],
                &mut self.output[0],
                amt - n
            )};
            self.outpos = amt - n;
        }
    }
236 |
237 | fn statik(&mut self) -> io::Result<()> {
238 | let len = try!(self.r.read_u16::());
239 | let nlen = try!(self.r.read_u16::());
240 | if !nlen != len { return error(Error::InvalidStaticSize) }
241 | try!(self.r.push_exactly(len as u64, &mut self.block));
242 | self.update_output(0);
243 | self.bitcnt = 0;
244 | self.bitbuf = 0;
245 | Ok(())
246 | }
247 |
    // Bytes in the stream are LSB first, so the bitbuf is appended to from the
    // left and consumed from the right.
    //
    // Returns the next `cnt` bits of the stream as the low bits of the
    // result, pulling whole bytes from the reader into `bitbuf` as needed.
    fn bits(&mut self, cnt: usize) -> io::Result {
        while self.bitcnt < cnt {
            let byte = try!(self.r.read_u8());
            self.bitbuf |= (byte as usize) << self.bitcnt;
            self.bitcnt += 8;
        }
        // mask off the requested low bits, then drop them from the buffer
        let ret = self.bitbuf & ((1 << cnt) - 1);
        self.bitbuf >>= cnt;
        self.bitcnt -= cnt;
        return Ok(ret as u16);
    }
261 |
262 | fn codes(&mut self, lens: &HuffmanTree,
263 | dist: &HuffmanTree) -> io::Result<()> {
264 | // extra base length for codes 257-285
265 | static EXTRALENS: [u16; 29] = [
266 | 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, 35, 43, 51,
267 | 59, 67, 83, 99, 115, 131, 163, 195, 227, 258
268 | ];
269 | // extra bits to read for codes 257-285
270 | static EXTRABITS: [u16; 29] = [
271 | 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,
272 | 4, 5, 5, 5, 5, 0,
273 | ];
274 | // base offset for distance codes.
275 | static EXTRADIST: [u16; 30] = [
276 | 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, 257, 385,
277 | 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385,
278 | 24577,
279 | ];
280 | // number of bits to read for distance codes (to add to the offset)
281 | static EXTRADBITS: [u16; 30] = [
282 | 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9,
283 | 10, 10, 11, 11, 12, 12, 13, 13,
284 | ];
285 | let mut last_updated = 0;
286 | loop {
287 | let sym = try!(lens.decode(self));
288 | match sym {
289 | n if n < 256 => { self.block.push(sym as u8); }
290 | 256 => break,
291 | n if n < 290 => {
292 | // figure out len/dist that we're working with
293 | let n = n - 257;
294 | if n as usize > EXTRALENS.len() {
295 | return error(Error::InvalidHuffmanCode)
296 | }
297 | let len = EXTRALENS[n as usize] +
298 | try!(self.bits(EXTRABITS[n as usize] as usize));
299 |
300 | let len = len as usize;
301 |
302 | let dist = try!(dist.decode(self)) as usize;
303 | let dist = EXTRADIST[dist] +
304 | try!(self.bits(EXTRADBITS[dist] as usize));
305 | let dist = dist as usize;
306 |
307 | // update the output buffer with any data we haven't pushed
308 | // into it yet
309 | if last_updated != self.block.len() {
310 | self.update_output(last_updated);
311 | last_updated = self.block.len();
312 | }
313 |
314 | if dist > self.output.len() {
315 | return error(Error::InvalidHuffmanCode)
316 | }
317 |
318 | // Perform the copy
319 | self.block.reserve(dist);
320 | let mut finger = if self.outpos >= dist {
321 | self.outpos - dist
322 | } else {
323 | HISTORY - (dist - self.outpos)
324 | };
325 | let min = cmp::min(dist, len);
326 | let start = self.block.len();
327 | for _ in 0..min {
328 | self.block.push(self.output[finger]);
329 | finger = (finger + 1) % HISTORY;
330 | }
331 | for i in min..len {
332 | let b = self.block[start + i - min];
333 | self.block.push(b);
334 | }
335 | }
336 | _ => return error(Error::InvalidHuffmanCode)
337 | }
338 | }
339 | self.update_output(last_updated);
340 | Ok(())
341 | }
342 |
    /// Decodes a block compressed with the *fixed* Huffman tables defined by
    /// RFC 1951, section 3.2.6. The tables below are precomputed rather than
    /// rebuilt on every block.
    fn fixed(&mut self) -> io::Result<()> {
        // Generated by the main function above
        // Literal/length tree: 8-bit codes for 0-143 and 280-287, 9-bit codes
        // for 144-255, 7-bit codes for 256-279.
        static LEN: HuffmanTree = HuffmanTree {
            count: [100, 0, 0, 0, 0, 0, 0, 24, 152, 112, 0, 0, 0, 0, 0, 0],
            symbol: [
                256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268,
                269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 0, 1, 2,
                3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
                37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
                53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
                69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
                85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100,
                101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113,
                114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
                127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
                140, 141, 142, 143, 280, 281, 282, 283, 284, 285, 286, 287, 144,
                145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157,
                158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170,
                171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183,
                184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196,
                197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
                210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222,
                223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235,
                236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248,
                249, 250, 251, 252, 253, 254, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
            ]
        };
        // Distance tree: all 30 distance codes use 5-bit codes.
        static DIST: HuffmanTree = HuffmanTree {
            count: [0, 0, 0, 0, 0, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            symbol: [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
                18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0
            ]
        };

        self.codes(&LEN, &DIST)
    }
396 |
    /// Decodes a block compressed with *dynamic* Huffman tables: first the
    /// code-length tree is read, then it is used to decode the literal/length
    /// and distance trees, which finally decode the payload via `codes`
    /// (RFC 1951, section 3.2.7).
    fn dynamic(&mut self) -> io::Result<()> {
        let hlit = try!(self.bits(5)) + 257; // number of length codes
        let hdist = try!(self.bits(5)) + 1; // number of distance codes
        let hclen = try!(self.bits(4)) + 4; // number of code length codes
        if hlit > MAXLCODES || hdist > MAXDCODES {
            return error(Error::HuffmanTreeTooLarge);
        }

        // Read off the code length codes, and then build the huffman tree which
        // is then used to decode the actual huffman tree for the rest of the
        // data.
        // Code lengths are stored in this permuted order so the most common
        // ones come first (RFC 1951 fixes this ordering).
        static ORDER: [usize; 19] = [
            16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15,
        ];
        let mut lengths = [0; 19];
        for i in 0..(hclen as usize) {
            lengths[ORDER[i]] = try!(self.bits(3));
        }
        let tree = try!(HuffmanTree::construct(&lengths));

        // Decode all of the length and distance codes in one go, we'll
        // partition them into two huffman trees later
        let mut lengths = [0; MAXCODES as usize];
        let mut i = 0;
        while i < hlit + hdist {
            let symbol = try!(tree.decode(self));
            match symbol {
                // 0-15 are literal code lengths
                n if n < 16 => {
                    lengths[i as usize] = symbol;
                    i += 1;
                }
                // 16 repeats the previous length, which must exist
                16 if i == 0 => return error(Error::InvalidHuffmanHeaderSymbol),
                16 => {
                    let prev = lengths[i as usize - 1];
                    for _ in 0..(try!(self.bits(2)) + 3) {
                        lengths[i as usize] = prev;
                        i += 1;
                    }
                }
                // all codes start out as 0, so these just skip
                17 => { i += try!(self.bits(3)) + 3; }
                18 => { i += try!(self.bits(7)) + 11; }
                _ => return error(Error::InvalidHuffmanHeaderSymbol),
            }
        }
        // A repeat run must not overflow past the declared code count.
        if i > hlit + hdist { return error(Error::InvalidHuffmanTreeHeader) }

        // Use the decoded codes to construct yet another huffman tree
        let arr = &lengths[..(hlit as usize)];
        let lencode = try!(HuffmanTree::construct(arr));
        let arr = &lengths[(hlit as usize)..((hlit + hdist) as usize)];
        let distcode = try!(HuffmanTree::construct(arr));
        self.codes(&lencode, &distcode)
    }
451 |
452 | /// Returns whether this deflate stream has reached the EOF marker
453 | pub fn eof(&self) -> bool {
454 | self.eof && self.pos == self.block.len()
455 | }
456 |
457 | /// Resets this flate decoder. Note that this could corrupt an in-progress
458 | /// decoding of a stream.
459 | pub fn reset(&mut self) {
460 | self.bitbuf = 0;
461 | self.bitcnt = 0;
462 | self.eof = false;
463 | self.block = Vec::new();
464 | self.pos = 0;
465 | }
466 | }
467 |
468 | impl Read for Decoder {
469 | fn read(&mut self, buf: &mut [u8]) -> io::Result {
470 | if self.pos == self.block.len() {
471 | if self.eof { return Ok(0) }
472 | try!(self.block());
473 | }
474 | let n = cmp::min(buf.len(), self.block.len() - self.pos);
475 | match n {
476 | 0 => Ok(0),
477 | _ => {
478 | unsafe { copy_nonoverlapping(
479 | &self.block[self.pos],
480 | &mut buf[0],
481 | n
482 | )};
483 | self.pos += n;
484 | Ok(n)
485 | }
486 | }
487 | }
488 | }
489 |
#[cfg(test)]
#[allow(warnings)]
mod test {
    use std::io::{BufReader, BufWriter, Read, Write};
    use super::super::rand::{random};
    use super::super::byteorder::{LittleEndian, BigEndian, WriteBytesExt, ReadBytesExt};
    use std::str;
    use super::{Decoder};
    #[cfg(feature="unstable")]
    use test;

    // The input data for these tests were all generated from the zpipe.c
    // program found at http://www.zlib.net/zpipe.c and the zlib format has an
    // extra 2 bytes of header with a 4-byte checksum at the end.
    fn fixup<'a>(s: &'a [u8]) -> &'a [u8] {
        // Strip the 2-byte zlib header and the 4-byte trailing adler checksum
        // so we feed raw deflate data to the decoder.
        &s[2..(s.len() - 4)]
    }

    // Decodes a zlib-wrapped fixture and compares it against the reference.
    fn test_decode(input: &[u8], output: &[u8]) {
        let mut d = Decoder::new(BufReader::new(fixup(input)));
        let mut buf = Vec::new();
        d.read_to_end(&mut buf).unwrap();

        assert_eq!(output.len(), buf.len());
        // First mismatching offset, handy when debugging a failure.
        let i = buf.iter().zip(output.iter()).position(|(a, b)| a != b);
        assert!(buf == output);
    }

    // Same as `test_decode` but for inputs that are already raw deflate.
    fn test_decode_pure(input: &[u8], output: &[u8]) {
        let mut d = Decoder::new(BufReader::new(input));
        let mut buf = Vec::new();
        d.read_to_end(&mut buf).unwrap();

        assert_eq!(output.len(), buf.len());
        let i = buf.iter().zip(output.iter()).position(|(a, b)| a != b);
        assert!(buf == output);
    }

    #[test]
    fn decode() {
        // Fixtures cover every zlib compression level, 0 through 9.
        let reference = include_bytes!("data/test.txt");
        test_decode(include_bytes!("data/test.z.0"), reference);
        test_decode(include_bytes!("data/test.z.1"), reference);
        test_decode(include_bytes!("data/test.z.2"), reference);
        test_decode(include_bytes!("data/test.z.3"), reference);
        test_decode(include_bytes!("data/test.z.4"), reference);
        test_decode(include_bytes!("data/test.z.5"), reference);
        test_decode(include_bytes!("data/test.z.6"), reference);
        test_decode(include_bytes!("data/test.z.7"), reference);
        test_decode(include_bytes!("data/test.z.8"), reference);
        test_decode(include_bytes!("data/test.z.9"), reference);
        test_decode_pure(include_bytes!("data/test.z.go"), reference);
    }

    #[test]
    fn large() {
        let reference = include_bytes!("data/test.large");
        test_decode(include_bytes!("data/test.large.z.5"), reference);
    }

    #[test]
    fn one_byte_at_a_time() {
        let input = include_bytes!("data/test.z.1");
        let mut d = Decoder::new(BufReader::new(fixup(input)));
        assert!(!d.eof());
        let mut out = Vec::new();
        loop {
            match d.read_u8() {
                Ok(b) => out.push(b),
                Err(..) => break
            }
        }

        assert!(d.eof());
        assert!(&out[..] == &include_bytes!("data/test.txt")[..]);
    }

    #[test]
    fn random_byte_lengths() {
        let input = include_bytes!("data/test.z.1");
        let mut d = Decoder::new(BufReader::new(fixup(input)));
        let mut out = Vec::new();
        let mut buf = [0u8; 40];
        loop {
            // Read with a random chunk size in 1..=40 to exercise partial reads.
            match d.read(&mut buf[..(1 + random::<usize>() % 40)]) {
                Err(..) | Ok(0) => break,
                Ok(n) => {
                    out.extend(buf[..n].iter().map(|b| *b));
                }
            }
        }
        assert!(&out[..] == &include_bytes!("data/test.txt")[..]);
    }

    //fn roundtrip(bytes: &[u8]) {
    //    let mut e = Encoder::new(MemWriter::new());
    //    e.write(bytes);
    //    let encoded = e.finish().unwrap();
    //
    //    let mut d = Decoder::new(BufReader::new(encoded));
    //    let decoded = d.read_to_end();
    //    assert_eq!(&decoded[..], bytes);
    //}
    //
    //#[test]
    //fn some_roundtrips() {
    //    roundtrip(bytes!("test"));
    //    roundtrip(bytes!(""));
    //    roundtrip(include_bytes!("data/test.txt"));
    //}

    #[cfg(feature="unstable")]
    #[bench]
    fn decompress_speed(bh: &mut test::Bencher) {
        let input = include_bytes!("data/test.z.9");
        let mut d = Decoder::new(BufReader::new(fixup(input)));
        let mut output = [0u8; 65536];
        let mut output_size = 0;
        bh.iter(|| {
            d.r = BufReader::new(fixup(input));
            d.reset();
            output_size = d.read(&mut output).unwrap();
        });
        bh.bytes = output_size as u64;
    }
}
616 |
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | #![deny(missing_docs)]
2 | #![allow(missing_copy_implementations)]
3 | #![allow(deprecated)]
4 |
5 | //! dox (placeholder)
6 |
7 | extern crate byteorder;
8 | extern crate rand;
9 |
10 | #[macro_use]
11 | extern crate log;
12 |
13 | #[cfg(test)]
14 | #[cfg(feature="unstable")]
15 | extern crate test;
16 |
17 | use std::io::{self, Read};
18 |
19 | /// Public exports
20 | #[cfg(feature="checksum")]
21 | pub use self::checksum::adler::State32 as Adler32;
22 |
23 | #[cfg(feature="checksum")]
24 | /// Checksum algorithms. Requires `checksum` feature, enabled by default
25 | // http://en.wikipedia.org/wiki/Checksum
26 | pub mod checksum {
27 | pub mod adler;
28 | }
29 |
30 | #[cfg(feature="bwt")]
31 | pub mod bwt;
32 |
33 | #[cfg(feature="flate")]
34 | pub mod flate;
35 |
36 | #[cfg(feature="lz4")]
37 | pub mod lz4;
38 |
39 | #[cfg(feature="zlib")]
40 | pub mod zlib;
41 |
42 | /// Entropy coder family. Requires `entropy` feature, enabled by default
43 | // http://en.wikipedia.org/wiki/Entropy_encoding
44 | #[cfg(feature="entropy")]
45 | pub mod entropy {
46 | pub mod ari;
47 | }
48 |
49 | #[cfg(feature="rle")]
50 | pub mod rle;
51 |
#[cfg(any(feature = "lz4", feature = "entropy", feature = "bwt"))]
fn byteorder_err_to_io(err: io::Error) -> io::Error {
    // Normalize byteorder's UnexpectedEof into this crate's historical
    // "unexpected end of file" error; every other error passes through as-is.
    if err.kind() == io::ErrorKind::UnexpectedEof {
        io::Error::new(
            io::ErrorKind::Other,
            "unexpected end of file"
        )
    } else {
        err
    }
}
63 |
#[cfg(test)]
mod test {
    use super::{io,byteorder_err_to_io};
    #[cfg(feature="unstable")]
    use test;

    // Provokes a genuine UnexpectedEof from byteorder by asking for 8 bytes
    // out of a 2-byte cursor.
    fn force_byteorder_eof_error() -> io::Result<u64> {
        use byteorder::{BigEndian,ReadBytesExt};
        let mut rdr = io::Cursor::new(vec![1,2]);
        rdr.read_u64::<BigEndian>()
    }

    #[test]
    fn byteorder_err_to_io_with_eof() {

        let err_from_byteorder = force_byteorder_eof_error().unwrap_err();
        let err = byteorder_err_to_io(err_from_byteorder);

        let err_expected = io::Error::new(
            io::ErrorKind::Other,
            "unexpected end of file"
        );
        assert_eq!(err.kind(),err_expected.kind());
    }

    #[test]
    fn byteorder_err_to_io_with_not_eof() {

        // using closure here to produce 2x the same error,
        // as io::Error does not impl Copy trait
        let build_other_io_error = || io::Error::new(
            io::ErrorKind::NotFound,
            "some other io error"
        );

        let err = byteorder_err_to_io(build_other_io_error());
        let err_expected = build_other_io_error();

        assert_eq!(err.kind(),err_expected.kind());
    }
}
105 |
106 |
107 | /// Adds a convenience method for types with the read trait, very similar
108 | /// to push_at_least in the late Reader trait
109 | pub trait ReadExact: Read + Sized {
110 | /// Appends exact number of bytes to a buffer
111 | fn push_exactly(&mut self, bytes: u64, buf: &mut Vec) -> io::Result<()> {
112 | let n = try!(self.by_ref().take(bytes).read_to_end(buf)) as u64;
113 |
114 | if n < bytes {
115 | return Err(io::Error::new(
116 | io::ErrorKind::Other,
117 | "unexpected end of file"
118 | ));
119 | }
120 |
121 | Ok(())
122 | }
123 | }
124 |
125 | impl ReadExact for T where T: Read + Sized {}
126 |
--------------------------------------------------------------------------------
/src/lz4.rs:
--------------------------------------------------------------------------------
1 | /*!
2 |
3 | LZ4 Decompression and Compression. Requires `lz4` feature, enabled by default
4 |
5 | This module contains an implementation in Rust of decompression and compression
6 | of LZ4-encoded streams. These are exposed as a standard `Reader` and `Writer`
7 | interfaces wrapping an underlying stream.
8 |
9 | # Example
10 |
11 | ```rust,ignore
12 | use compress::lz4;
13 | use std::fs::File;
14 | use std::path::Path;
15 | use std::io::Read;
16 |
17 | let stream = File::open(&Path::new("path/to/file.lz4")).unwrap();
18 | let mut decompressed = Vec::new();
19 | lz4::Decoder::new(stream).read_to_end(&mut decompressed);
20 | ```
21 |
22 | # Credit
23 |
24 | This implementation is largely based on Branimir Karadžić's implementation which
25 | can be found at https://github.com/bkaradzic/go-lz4.
26 |
27 | */
28 |
29 | use std::cmp;
30 | use std::ptr::copy_nonoverlapping;
31 | use std::io::{self, Read, Write};
32 | use std::iter::repeat;
33 | use std::vec::Vec;
34 | use std::num::Wrapping;
35 | use std::ops::Shr;
36 |
37 | use super::byteorder::{LittleEndian, WriteBytesExt, ReadBytesExt};
38 | use super::{ReadExact, byteorder_err_to_io};
39 |
// Magic number identifying an LZ4 frame header.
const MAGIC: u32 = 0x184d2204;

// Token layout: high nibble is the literal run length, low nibble the match
// length (both saturate and spill into extra bytes when they hit the mask).
const ML_BITS: u32 = 4;
const ML_MASK: u32 = (1 << ML_BITS as usize) - 1;
const RUN_BITS: u32 = 8 - ML_BITS;
const RUN_MASK: u32 = (1 << RUN_BITS as usize) - 1;

// Encoder tuning parameters.
const MIN_MATCH: u32 = 4;
const HASH_LOG: u32 = 17;
const HASH_TABLE_SIZE: u32 = 1 << (HASH_LOG as usize);
const HASH_SHIFT: u32 = (MIN_MATCH * 8) - HASH_LOG;
// After this many bytes without a match, the encoder starts skipping ahead.
const INCOMPRESSIBLE: u32 = 128;
// Sentinel offset added to hash-table entries so "empty" is distinguishable.
const UNINITHASH: u32 = 0x88888888;
const MAX_INPUT_SIZE: u32 = 0x7e000000;
54 |
/// Streaming state for decoding a single raw LZ4 block from `input` into
/// `output`.
struct BlockDecoder<'a> {
    input: &'a [u8],
    output: &'a mut Vec<u8>,
    // Read cursor into `input`.
    cur: usize,

    // `start..end` delimit the match-copy source window inside `output`.
    start: usize,
    end: usize,
}
63 |
impl<'a> BlockDecoder<'a> {
    /// Decodes this block of data from 'input' to 'output', returning the
    /// number of valid bytes in the output.
    ///
    /// An LZ4 block is a sequence of: token byte, literal run, 2-byte
    /// little-endian match offset, match copy. The final sequence has no
    /// offset/match part.
    fn decode(&mut self) -> usize {
        while self.cur < self.input.len() {
            let code = self.bump();
            debug!("block with code: {:x}", code);
            // Extract a chunk of data from the input to the output.
            {
                // High nibble of the token: literal run length.
                let len = self.length(code >> 4);
                debug!("consume len {}", len);
                if len > 0 {
                    let end = self.end;
                    self.grow_output(end + len);
                    // SAFETY(review): assumes `cur + len <= input.len()` for a
                    // well-formed block; a truncated block would read out of
                    // bounds here — input is trusted to be valid LZ4.
                    unsafe { copy_nonoverlapping(
                        &self.input[self.cur],
                        &mut self.output[end],
                        len
                    )};
                    self.end += len;
                    self.cur += len;
                }
            }
            // Last sequence in a block carries only literals.
            if self.cur == self.input.len() { break }

            // Read off the next i16 offset
            {
                let back = (self.bump() as usize) | ((self.bump() as usize) << 8);
                debug!("found back {}", back);
                self.start = self.end - back;
            }

            // Slosh around some bytes now
            {
                // Low nibble of the token: match length (minus MIN_MATCH).
                let mut len = self.length(code & 0xf);
                let literal = self.end - self.start;
                if literal < 4 {
                    // Overlapping match closer than 4 bytes: copy the first 4
                    // bytes one-by-one with a stride fix-up from DECR.
                    static DECR: [usize; 4] = [0, 3, 2, 3];
                    self.cp(4, DECR[literal]);
                } else {
                    len += 4;
                }
                self.cp(len, 0);
            }
        }
        self.end
    }

    /// Decodes a (possibly extended) length field: a nibble of 0xf means
    /// "keep adding the following bytes until one is != 0xff".
    fn length(&mut self, code: u8) -> usize {
        let mut ret = code as usize;
        if code == 0xf {
            loop {
                let tmp = self.bump();
                ret += tmp as usize;
                if tmp != 0xff { break }
            }
        }
        ret
    }

    /// Reads and returns the next input byte, advancing the cursor.
    fn bump(&mut self) -> u8 {
        let ret = self.input[self.cur];
        self.cur += 1;
        ret
    }

    /// Copies `len` bytes from the match window (`start`) to the end of the
    /// output, byte-by-byte so overlapping matches replicate correctly.
    #[inline]
    fn cp(&mut self, len: usize, decr: usize) {
        let end = self.end;
        self.grow_output(end + len);
        for i in 0..len {
            self.output[end + i] = (*self.output)[self.start + i];
        }

        self.end += len;
        self.start += len - decr;
    }

    // Extends the output vector to a target number of bytes (in total), but
    // does not actually initialize the new data. The length of the vector is
    // updated, but the bytes will all have undefined values. It is assumed that
    // the next operation is to pave over these bytes (so the initialization is
    // unnecessary).
    #[inline]
    fn grow_output(&mut self, target: usize) {
        if self.output.capacity() < target {
            debug!("growing {} to {}", self.output.capacity(), target);
            //let additional = target - self.output.capacity();
            //self.output.reserve(additional);
            while self.output.len() < target {
                self.output.push(0);
            }
        }else {
            // SAFETY(review): exposes uninitialized/stale bytes up to `target`;
            // callers are expected to overwrite them immediately. This relies
            // on capacity >= target, which the branch above guarantees.
            unsafe {
                self.output.set_len(target);
            }
        }
    }
}
163 |
/// Streaming state for compressing `input` into a single raw LZ4 block in
/// `output`.
struct BlockEncoder<'a> {
    input: &'a [u8],
    output: &'a mut Vec<u8>,
    // Maps a hash of 4-byte sequences to the (offset-biased) position where
    // that sequence was last seen.
    hash_table: Vec<u32>,
    // Current scan position in `input`.
    pos: u32,
    // Start of the pending literal run.
    anchor: u32,
    // Write cursor into `output`.
    dest_pos: u32
}
172 |
173 | /// Returns maximum possible size of compressed output
174 | /// given source size
175 | pub fn compression_bound(size: u32) -> Option {
176 | if size > MAX_INPUT_SIZE {
177 | None
178 | } else {
179 | Some(size + (size / 255) + 16 + 4)
180 | }
181 | }
182 |
impl<'a> BlockEncoder<'a> {
    /// Reads the 4-byte little-endian sequence starting at `pos` as a u32;
    /// this is the unit the hash-based matcher works on.
    #[inline(always)]
    fn seq_at(&self, pos: u32) -> u32 {
        (self.input[pos as usize + 3] as u32) << 24
            | (self.input[pos as usize + 2] as u32) << 16
            | (self.input[pos as usize + 1] as u32) << 8
            | (self.input[pos as usize] as u32)
    }

    /// Emits a token byte (literal-run nibble + match-length nibble), any
    /// spill-over length bytes, and then `len` literal bytes starting at `pos`.
    fn write_literals(&mut self, len: u32, ml_len: u32, pos: u32) {
        let mut ln = len;

        // Literal run length saturates at RUN_MASK; the remainder follows as
        // 255-valued continuation bytes.
        let code = if ln > RUN_MASK - 1 { RUN_MASK as u8 } else { ln as u8 };

        if ml_len > ML_MASK - 1 {
            self.output[self.dest_pos as usize] = (code << ML_BITS as usize) + ML_MASK as u8;
        } else {
            self.output[self.dest_pos as usize] = (code << ML_BITS as usize) + ml_len as u8;
        }

        self.dest_pos += 1;

        if code == RUN_MASK as u8 {
            ln -= RUN_MASK;
            while ln > 254 {
                self.output[self.dest_pos as usize] = 255;
                self.dest_pos += 1;
                ln -= 255;
            }

            self.output[self.dest_pos as usize] = ln as u8;
            self.dest_pos += 1;
        }

        // FIXME: find out why slicing syntax fails tests
        //self.output[self.dest_pos as usize .. (self.dest_pos + len) as usize] = self.input[pos as uint.. (pos + len) as uint];
        for i in 0..(len as usize) {
            self.output[self.dest_pos as usize + i] = self.input[pos as usize + i];
        }

        self.dest_pos += len;
    }

    /// Compresses `self.input` into `self.output` as one raw LZ4 block,
    /// returning the number of compressed bytes written (0 if the input is
    /// too large to compress at all).
    fn encode(&mut self) -> u32 {
        let input_len = self.input.len() as u32;

        match compression_bound(input_len) {
            None => 0,
            Some(out_size) => {
                // Pre-size the output to the worst case; it is trimmed to the
                // real size before returning.
                let out_size_usize = out_size as usize;
                if self.output.capacity() < out_size_usize {
                    let additional = out_size_usize - self.output.capacity();
                    self.output.reserve(additional);
                }
                unsafe {self.output.set_len(out_size_usize); }

                // `step` grows while the data looks incompressible so we scan
                // faster; `limit` is the no-match distance that triggers it.
                let mut step = 1u32;
                let mut limit = INCOMPRESSIBLE;

                loop {
                    // The last 12 bytes are always emitted as literals (LZ4
                    // end-of-block restriction).
                    if self.pos + 12 > input_len {
                        let tmp = self.anchor;
                        self.write_literals(self.input.len() as u32 - tmp, 0, tmp);
                        unsafe { self.output.set_len(self.dest_pos as usize) };
                        return self.dest_pos;
                    }

                    let seq = self.seq_at(self.pos);
                    // Fibonacci-style multiplicative hash of the 4-byte window.
                    let hash = (Wrapping(seq) * Wrapping(2654435761)).shr(HASH_SHIFT as usize).0;
                    let mut r = (Wrapping(self.hash_table[hash as usize]) + Wrapping(UNINITHASH)).0;
                    self.hash_table[hash as usize] = (Wrapping(self.pos) - Wrapping(UNINITHASH)).0;

                    // No usable match: candidate too far (> 64KB) or sequence
                    // differs. Possibly accelerate the scan and move on.
                    if (Wrapping(self.pos) - Wrapping(r)).shr(16).0 != 0 || seq != self.seq_at(r) {
                        if self.pos - self.anchor > limit {
                            limit = limit << 1;
                            step += 1 + (step >> 2);
                        }
                        self.pos += step;
                        continue;
                    }

                    // Found a match while skipping: back up and rescan at
                    // single-byte granularity.
                    if step > 1 {
                        self.hash_table[hash as usize] = r - UNINITHASH;
                        self.pos -= step - 1;
                        step = 1;
                        continue;
                    }

                    limit = INCOMPRESSIBLE;

                    let ln = self.pos - self.anchor;
                    let back = self.pos - r;
                    let anchor = self.anchor;

                    self.pos += MIN_MATCH;
                    r += MIN_MATCH;
                    self.anchor = self.pos;

                    // Extend the match as far as the bytes keep agreeing.
                    while (self.pos < input_len - 5) && self.input[self.pos as usize] == self.input[r as usize] {
                        self.pos += 1;
                        r += 1
                    }

                    let mut ml_len = self.pos - self.anchor;

                    // Emit: literals + token, then the 2-byte match offset.
                    self.write_literals(ln, ml_len, anchor);
                    self.output[self.dest_pos as usize] = back as u8;
                    self.output[self.dest_pos as usize + 1] = (back >> 8) as u8;
                    self.dest_pos += 2;

                    // Match length spill-over bytes, same scheme as literals.
                    if ml_len > ML_MASK - 1 {
                        ml_len -= ML_MASK;
                        while ml_len > 254 {
                            ml_len -= 255;

                            self.output[self.dest_pos as usize] = 255;
                            self.dest_pos += 1;
                        }

                        self.output[self.dest_pos as usize] = ml_len as u8;
                        self.dest_pos += 1;
                    }

                    self.anchor = self.pos;
                }
            }
        }
    }
}
312 |
/// This structure is used to decode a stream of LZ4 blocks. This wraps an
/// internal reader which is read from when this decoder's read method is
/// called.
pub struct Decoder<R> {
    /// The internally wrapped reader. This is exposed so it may be moved out
    /// of. Note that if data is read from the reader while decoding is in
    /// progress the output stream will get corrupted.
    pub r: R,

    // Scratch buffer holding one compressed block.
    temp: Vec<u8>,
    // Decompressed bytes of the current block.
    output: Vec<u8>,

    // `start..end` is the not-yet-consumed range of `output`.
    start: usize,
    end: usize,
    eof: bool,

    // Frame-header state.
    header: bool,
    blk_checksum: bool,
    stream_checksum: bool,
    max_block_size: usize,
}
334 |
335 | impl Decoder {
336 | /// Creates a new decoder which will read data from the given stream. The
337 | /// inner stream can be re-acquired by moving out of the `r` field of this
338 | /// structure.
339 | pub fn new(r: R) -> Decoder {
340 | Decoder {
341 | r: r,
342 | temp: Vec::new(),
343 | output: Vec::new(),
344 | header: false,
345 | blk_checksum: false,
346 | stream_checksum: false,
347 | start: 0,
348 | end: 0,
349 | eof: false,
350 | max_block_size: 0,
351 | }
352 | }
353 |
354 | /// Resets this decoder back to its initial state. Note that the underlying
355 | /// stream is not seeked on or has any alterations performed on it.
356 | pub fn reset(&mut self) {
357 | self.header = false;
358 | self.eof = false;
359 | self.start = 0;
360 | self.end = 0;
361 | }
362 |
363 | fn read_header(&mut self) -> io::Result<()> {
364 | // Make sure the magic number is what's expected.
365 | if try!(self.r.read_u32::()) != MAGIC {
366 | return Err(io::Error::new(io::ErrorKind::InvalidInput, ""))
367 | }
368 |
369 | let mut bits = [0; 3];
370 | try!(self.r.read(&mut bits[..2]));
371 | let flg = bits[0];
372 | let bd = bits[1];
373 |
374 | // bits 7/6, the version number. Right now this must be 1
375 | if (flg >> 6) != 0b01 {
376 | return Err(io::Error::new(io::ErrorKind::InvalidInput, ""))
377 | }
378 | // bit 5 is the "block independence", don't care about this yet
379 | // bit 4 is whether blocks have checksums or not
380 | self.blk_checksum = (flg & 0x10) != 0;
381 | // bit 3 is whether there is a following stream size
382 | let stream_size = (flg & 0x08) != 0;
383 | // bit 2 is whether there is a stream checksum
384 | self.stream_checksum = (flg & 0x04) != 0;
385 | // bit 1 is reserved
386 | // bit 0 is whether there is a preset dictionary
387 | let preset_dictionary = (flg & 0x01) != 0;
388 |
389 | static MAX_SIZES: [usize; 8] =
390 | [0, 0, 0, 0, // all N/A
391 | 64 << 10, // 64KB
392 | 256 << 10, // 256 KB
393 | 1 << 20, // 1MB
394 | 4 << 20]; // 4MB
395 |
396 | // bit 7 is reserved
397 | // bits 6-4 are the maximum block size
398 | let max_block_size = MAX_SIZES[(bd >> 4) as usize & 0x7];
399 | // bits 3-0 are reserved
400 |
401 | // read off other portions of the stream
402 | let size = if stream_size {
403 | Some(try!(self.r.read_u64::()))
404 | } else {
405 | None
406 | };
407 | assert!(!preset_dictionary, "preset dictionaries not supported yet");
408 |
409 | debug!("blk: {}", self.blk_checksum);
410 | debug!("stream: {}", self.stream_checksum);
411 | debug!("max size: {}", max_block_size);
412 | debug!("stream size: {:?}", size);
413 |
414 | self.max_block_size = max_block_size;
415 |
416 | // XXX: implement checksums
417 | let cksum = try!(self.r.read_u8());
418 | debug!("ignoring header checksum: {}", cksum);
419 | return Ok(());
420 | }
421 |
422 | fn decode_block(&mut self) -> io::Result {
423 | match try!(self.r.read_u32::()) {
424 | // final block, we're done here
425 | 0 => return Ok(false),
426 |
427 | // raw block to read
428 | n if n & 0x80000000 != 0 => {
429 | let amt = (n & 0x7fffffff) as usize;
430 | self.output.truncate(0);
431 | self.output.reserve(amt);
432 | try!(self.r.push_exactly(amt as u64, &mut self.output));
433 | self.start = 0;
434 | self.end = amt;
435 | }
436 |
437 | // actual block to decompress
438 | n => {
439 | let n = n as usize;
440 | self.temp.truncate(0);
441 | self.temp.reserve(n);
442 | try!(self.r.push_exactly(n as u64, &mut self.temp));
443 |
444 | let target = cmp::min(self.max_block_size, 4 * n / 3);
445 | self.output.truncate(0);
446 | self.output.reserve(target);
447 | let mut decoder = BlockDecoder {
448 | input: &self.temp[..n],
449 | output: &mut self.output,
450 | cur: 0,
451 | start: 0,
452 | end: 0,
453 | };
454 | self.start = 0;
455 | self.end = decoder.decode();
456 | }
457 | }
458 |
459 | if self.blk_checksum {
460 | let cksum = try!(self.r.read_u32::());
461 | debug!("ignoring block checksum {}", cksum);
462 | }
463 | return Ok(true);
464 | }
465 |
466 | /// Tests whether the end of this LZ4 stream has been reached
467 | pub fn eof(&mut self) -> bool { self.eof }
468 | }
469 |
470 | impl Read for Decoder {
471 | fn read(&mut self, dst: &mut [u8]) -> io::Result {
472 | if self.eof { return Ok(0) }
473 | if !self.header {
474 | try!(self.read_header());
475 | self.header = true;
476 | }
477 | let mut amt = dst.len();
478 | let len = amt;
479 |
480 | while amt > 0 {
481 | if self.start == self.end {
482 | let keep_going = try!(self.decode_block());
483 | if !keep_going {
484 | self.eof = true;
485 | break;
486 | }
487 | }
488 | let n = cmp::min(amt, self.end - self.start);
489 | unsafe { copy_nonoverlapping(
490 | &self.output[self.start],
491 | &mut dst[len - amt],
492 | n
493 | )};
494 | self.start += n;
495 | amt -= n;
496 | }
497 |
498 | Ok(len - amt)
499 | }
500 | }
501 |
/// This structure is used to compress a stream of bytes using the LZ4
/// compression algorithm. This is a wrapper around an internal writer which
/// bytes will be written to.
pub struct Encoder<W> {
    w: W,
    // Pending uncompressed input, flushed a block at a time.
    buf: Vec<u8>,
    // Scratch space for a compressed block.
    tmp: Vec<u8>,
    wrote_header: bool,
    // Maximum block size; `buf` is flushed when it reaches this.
    limit: usize,
}
512 |
513 | impl Encoder {
514 | /// Creates a new encoder which will have its output written to the given
515 | /// output stream. The output stream can be re-acquired by calling
516 | /// `finish()`
517 | ///
518 | /// NOTE: compression isn't actually implemented just yet, this is just a
519 | /// skeleton of a future implementation.
520 | pub fn new(w: W) -> Encoder {
521 | Encoder {
522 | w: w,
523 | wrote_header: false,
524 | buf: Vec::with_capacity(1024),
525 | tmp: Vec::new(),
526 | limit: 256 * 1024,
527 | }
528 | }
529 |
530 | fn encode_block(&mut self) -> io::Result<()> {
531 | self.tmp.truncate(0);
532 | if self.compress() {
533 | try!(self.w.write_u32::(self.tmp.len() as u32));
534 | try!(self.w.write(&self.tmp));
535 | } else {
536 | try!(self.w.write_u32::((self.buf.len() as u32) | 0x80000000));
537 | try!(self.w.write(&self.buf));
538 | }
539 | self.buf.truncate(0);
540 | Ok(())
541 | }
542 |
543 | fn compress(&mut self) -> bool {
544 | false
545 | }
546 |
547 | /// This function is used to flag that this session of compression is done
548 | /// with. The stream is finished up (final bytes are written), and then the
549 | /// wrapped writer is returned.
550 | pub fn finish(mut self) -> (W, io::Result<()>) {
551 | let mut result = self.flush();
552 |
553 | for _ in 0..2 {
554 | let tmp = self.w.write_u32::(0)
555 | .map_err(byteorder_err_to_io);
556 |
557 | result = result.and_then(|_| tmp);
558 | }
559 |
560 | (self.w, result)
561 | }
562 | }
563 |
564 | impl Write for Encoder {
565 | fn write(&mut self, mut buf: &[u8]) -> io::Result {
566 | if !self.wrote_header {
567 | try!(self.w.write_u32::(MAGIC));
568 | // version 01, turn on block independence, but turn off
569 | // everything else (we have no checksums right now).
570 | try!(self.w.write_u8(0b01_100000));
571 | // Maximum block size is 256KB
572 | try!(self.w.write_u8(0b0_101_0000));
573 | // XXX: this checksum is just plain wrong.
574 | try!(self.w.write_u8(0));
575 | self.wrote_header = true;
576 | }
577 |
578 | while buf.len() > 0 {
579 | let amt = cmp::min(self.limit - self.buf.len(), buf.len());
580 | self.buf.extend(buf[..amt].iter().map(|b| *b));
581 |
582 | if self.buf.len() == self.limit {
583 | try!(self.encode_block());
584 | }
585 | buf = &buf[amt..];
586 | }
587 |
588 | Ok(buf.len())
589 | }
590 |
591 | fn flush(&mut self) -> io::Result<()> {
592 | if self.buf.len() > 0 {
593 | try!(self.encode_block());
594 | }
595 | self.w.flush()
596 | }
597 | }
598 |
599 |
600 | /// Decodes pure LZ4 block into output. Returns count of bytes
601 | /// processed.
602 | pub fn decode_block(input: &[u8], output: &mut Vec) -> usize {
603 | let mut b = BlockDecoder {
604 | input: input,
605 | output: output,
606 | cur: 0,
607 | start: 0,
608 | end: 0
609 | };
610 | b.decode()
611 | }
612 |
613 |
614 | /// Encodes input into pure LZ4 block. Return count of bytes
615 | /// processed.
616 | pub fn encode_block(input: &[u8], output: &mut Vec) -> usize {
617 | let mut encoder = BlockEncoder {
618 | input: input,
619 | output: output,
620 | hash_table: repeat(0).take(HASH_TABLE_SIZE as usize).collect(),
621 | pos: 0,
622 | anchor: 0,
623 | dest_pos: 0
624 | };
625 |
626 | encoder.encode() as usize
627 | }
628 |
#[cfg(test)]
mod test {
    use std::io::{BufReader, BufWriter, Read, Write};
    use super::super::rand;
    use super::{Decoder, Encoder};
    #[cfg(feature="unstable")]
    use test;

    use super::super::byteorder::ReadBytesExt;

    /// Decodes `input` and asserts the result equals `output` exactly.
    fn test_decode(input: &[u8], output: &[u8]) {
        let mut d = Decoder::new(BufReader::new(input));
        let mut buf = Vec::new();

        d.read_to_end(&mut buf).unwrap();
        assert!(&buf[..] == output);
    }

    #[test]
    fn decode() {
        let reference = include_bytes!("data/test.txt");
        test_decode(include_bytes!("data/test.lz4.1"), reference);
        test_decode(include_bytes!("data/test.lz4.2"), reference);
        test_decode(include_bytes!("data/test.lz4.3"), reference);
        test_decode(include_bytes!("data/test.lz4.4"), reference);
        test_decode(include_bytes!("data/test.lz4.5"), reference);
        test_decode(include_bytes!("data/test.lz4.6"), reference);
        test_decode(include_bytes!("data/test.lz4.7"), reference);
        test_decode(include_bytes!("data/test.lz4.8"), reference);
        test_decode(include_bytes!("data/test.lz4.9"), reference);
    }

    #[test]
    fn raw_encode_block() {
        let data = include_bytes!("data/test.txt");
        let mut encoded = Vec::new();

        super::encode_block(data, &mut encoded);
        let mut decoded = Vec::new();

        super::decode_block(&encoded[..], &mut decoded);

        assert_eq!(&data[..], &decoded[..]);
    }

    #[test]
    fn one_byte_at_a_time() {
        let input = include_bytes!("data/test.lz4.1");
        let mut d = Decoder::new(BufReader::new(&input[..]));
        assert!(!d.eof());
        let mut out = Vec::new();
        loop {
            match d.read_u8() {
                Ok(b) => out.push(b),
                Err(..) => break
            }
        }
        assert!(d.eof());
        assert!(&out[..] == &include_bytes!("data/test.txt")[..]);
    }

    #[test]
    fn random_byte_lengths() {
        let input = include_bytes!("data/test.lz4.1");
        let mut d = Decoder::new(BufReader::new(&input[..]));
        let mut out = Vec::new();
        let mut buf = [0u8; 40];
        loop {
            // Read chunks of random size 1..=40 to exercise partial reads.
            match d.read(&mut buf[..(1 + rand::random::<usize>() % 40)]) {
                Ok(0) => break,
                Ok(n) => {
                    out.extend(buf[..n].iter().map(|b| *b));
                }
                Err(..) => break
            }
        }
        assert!(&out[..] == &include_bytes!("data/test.txt")[..]);
    }

    /// Encodes `bytes`, decodes the result, and checks it round-trips.
    fn roundtrip(bytes: &[u8]) {
        let mut e = Encoder::new(BufWriter::new(Vec::new()));
        e.write(bytes).unwrap();
        let (e, err) = e.finish();
        err.unwrap();
        let encoded = e.into_inner().unwrap();

        let mut d = Decoder::new(BufReader::new(&encoded[..]));
        let mut decoded = Vec::new();
        d.read_to_end(&mut decoded).unwrap();
        assert_eq!(&decoded[..], bytes);
    }

    #[test]
    fn some_roundtrips() {
        roundtrip(b"test");
        roundtrip(b"");
        roundtrip(include_bytes!("data/test.txt"));
    }

    #[cfg(feature="unstable")]
    #[bench]
    fn decompress_speed(bh: &mut test::Bencher) {
        let input = include_bytes!("data/test.lz4.9");
        let mut d = Decoder::new(BufReader::new(&input[..]));
        let mut output = [0u8; 65536];
        let mut output_size = 0;
        bh.iter(|| {
            d.r = BufReader::new(&input[..]);
            d.reset();
            output_size = d.read(&mut output).unwrap();
        });
        bh.bytes = output_size as u64;
    }
}
743 |
--------------------------------------------------------------------------------
/src/main.rs:
--------------------------------------------------------------------------------
1 | #![crate_type = "bin"]
2 |
3 | //! A rust-compress application that allows testing of implemented
4 | //! algorithms and their combinations using a simple command line.
5 | //! Example invocations:
6 | //! echo -n "abracadabra" | ./app bwt | xxd
7 | //! echo "banana" | ./app bwt | ./app -d
8 |
9 | #[macro_use] extern crate log;
10 | extern crate compress;
11 | extern crate byteorder;
12 |
13 | use std::collections::HashMap;
14 | use std::io::{self, Read, Write};
15 | use std::{env, str};
16 | use compress::{bwt, lz4, ReadExact};
17 | use compress::entropy::ari;
18 | use byteorder::{LittleEndian, WriteBytesExt, ReadBytesExt};
19 |
/// Archive signature written at the start of every compressed stream.
static MAGIC : u32 = 0x73632172; //=r!cs

/// Parsed command-line configuration.
struct Config {
    /// Name the program was invoked with (argv[0]).
    exe_name: String,
    /// Names of the compression passes to apply, in order.
    methods: Vec<String>,
    /// BWT block size in bytes (set by `-block`).
    block_size: usize,
    /// True when `-d` was given: decompress instead of compress.
    decompress: bool,
}
28 |
29 | impl Config {
30 | fn query(mut args: I) -> Config where I: Iterator- + Sized {
31 | let mut cfg = Config {
32 | exe_name: args.next().unwrap().clone(),
33 | methods: Vec::new(),
34 | block_size: 1<<16,
35 | decompress: false,
36 | };
37 | let mut handlers: HashMap<&str, Box> =
38 | HashMap::new();
39 | handlers.insert("d", Box::new(|_, cfg| { cfg.decompress = true; }));
40 | handlers.insert("block", Box::new(|b, cfg| {
41 | cfg.block_size = b.parse().unwrap();
42 | }));
43 |
44 | for arg in args {
45 | let slice = &arg[..];
46 | if slice.starts_with("-") {
47 | match handlers.iter_mut().find(|&(&k,_)| slice[1..].starts_with(k)) {
48 | Some((k,h)) => (*h)(&slice[1+k.len()..], &mut cfg),
49 | None => println!("Warning: unrecognized option: {}", &arg[..]),
50 | }
51 | }else {
52 | cfg.methods.push(arg.to_string());
53 | }
54 | }
55 | cfg
56 | }
57 | }
58 |
59 | struct Pass {
60 | encode: Box, &Config)
61 | -> Box + 'static>,
62 | decode: Box, &Config)
63 | -> Box + 'static>,
64 | info: String,
65 | }
66 |
67 |
68 | /// main entry point
69 | pub fn main() {
70 | let mut passes: HashMap = HashMap::new();
71 | passes.insert("dummy".to_string(), Pass {
72 | encode: Box::new(|w,_| w),
73 | decode: Box::new(|r,_| r),
74 | info: "pass-through".to_string(),
75 | });
76 | passes.insert("ari".to_string(), Pass {
77 | encode: Box::new(|w,_c| {
78 | Box::new(ari::ByteEncoder::new(w)) as Box
79 | }),
80 | decode: Box::new(|r,_c| {
81 | Box::new(ari::ByteDecoder::new(r)) as Box
82 | }),
83 | info: "Adaptive arithmetic byte coder".to_string(),
84 | });
85 | passes.insert("bwt".to_string(), Pass {
86 | encode: Box::new(|w,c| {
87 | Box::new(bwt::Encoder::new(w, c.block_size)) as Box
88 | }),
89 | decode: Box::new(|r,_c| {
90 | Box::new(bwt::Decoder::new(r, true)) as Box
91 | }),
92 | info: "Burrows-Wheeler Transformation".to_string(),
93 | });
94 | passes.insert("mtf".to_string(), Pass {
95 | encode: Box::new(|w,_c| {
96 | Box::new(bwt::mtf::Encoder::new(w)) as Box
97 | }),
98 | decode: Box::new(|r,_c| {
99 | Box::new(bwt::mtf::Decoder::new(r)) as Box
100 | }),
101 | info: "Move-To-Front Transformation".to_string(),
102 | });
103 | /* // looks like we are missing the encoder implementation
104 | passes.insert(~"flate", Pass {
105 | encode: |w,_c| {
106 | ~flate::Encoder::new(w, true) as ~Write
107 | },
108 | decode: |r,_c| {
109 | ~flate::Decoder::new(r, true) as ~Read
110 | },
111 | info: ~"Standardized Ziv-Lempel + Huffman variant",
112 | });*/
113 | passes.insert("lz4".to_string(), Pass {
114 | encode: Box::new(|w,_c| {
115 | Box::new(lz4::Encoder::new(w)) as Box
116 | }),
117 | decode: Box::new(|r,_c| { // LZ4 decoder seem to work
118 | Box::new(lz4::Decoder::new(r)) as Box
119 | }),
120 | info: "Ziv-Lempel derivative, focused at speed".to_string(),
121 | });
122 |
123 | let config = Config::query(env::args());
124 | let mut input = io::stdin();
125 | let mut output = io::stdout();
126 | if config.decompress {
127 | assert!(config.methods.is_empty(), "Decompression methods are set in stone");
128 | match input.read_u32::() {
129 | Ok(magic) if magic != MAGIC => {
130 | error!("Input is not a rust-compress archive");
131 | return
132 | },
133 | Err(e) => {
134 | error!("Unable to read input: {:?}", e);
135 | return
136 | },
137 | _ => () //OK
138 | }
139 | let methods: Vec<_> = (0..(input.read_u8().unwrap() as usize)).map(|_| {
140 | let len = input.read_u8().unwrap() as u64;
141 | let mut bytes = Vec::new();
142 | input.push_exactly(len, &mut bytes).unwrap();
143 | str::from_utf8(&bytes[..]).unwrap().to_string()
144 | }).collect();
145 | let mut rsum: Box = Box::new(input);
146 | for met in methods.iter() {
147 | info!("Found pass {}", *met);
148 | match passes.get_mut(met) {
149 | Some(pa) => rsum = (pa.decode)(rsum, &config),
150 | None => panic!("Pass is not implemented"),
151 | }
152 | }
153 | io::copy(&mut rsum, &mut output).unwrap();
154 | }else if config.methods.is_empty() {
155 | println!("rust-compress test application");
156 | println!("Usage:");
157 | println!("\t{} .. output", config.exe_name);
158 | println!("Options:");
159 | println!("\t-d (to decompress)");
160 | println!("\t-block (BWT block size)");
161 | println!("Passes:");
162 | for (name,pa) in passes.iter() {
163 | println!("\t{} = {}", *name, pa.info);
164 | }
165 | }else {
166 | output.write_u32::(MAGIC).unwrap();
167 | output.write_u8(config.methods.len() as u8).unwrap();
168 | for met in config.methods.iter() {
169 | output.write_u8(met.len() as u8).unwrap();
170 | output.write_all(met.as_bytes()).unwrap();
171 | }
172 | let mut wsum: Box = Box::new(output);
173 | for met in config.methods.iter() {
174 | match passes.get_mut(met) {
175 | Some(pa) => wsum = (pa.encode)(wsum, &config),
176 | None => panic!("Pass {} is not implemented", *met)
177 | }
178 | }
179 | io::copy(&mut input, &mut wsum).unwrap();
180 | wsum.flush().unwrap();
181 | }
182 | }
183 |
--------------------------------------------------------------------------------
/src/rle.rs:
--------------------------------------------------------------------------------
1 | /*!
2 |
3 | Run time length encoding and decoding based on byte streams, see
4 | https://en.wikipedia.org/wiki/Run-length_encoding.
5 |
6 | A run is defined as a sequence of identical bytes of length two or greater.
7 | A run of byte a and length n is encoded by two repetitions of a, followed
8 | by a length specification which describes how often these bytes are
9 | repeated. Such a specification is a string of bytes with dynamic length.
10 | The most significant bit of each byte in this string indicates if the byte is
11 | the last byte in the string. The rest of the bits are concatenated using
12 | the Little Endian convention.
13 |
14 | # Example
15 |
16 | ```rust
17 | use compress::rle;
18 | use std::io::{Write, Read};
19 |
20 | let input = b"Helloooo world!!";
21 |
22 | let mut encoder = rle::Encoder::new(Vec::new());
23 | encoder.write_all(&input[..]).unwrap();
24 | let (buf, _): (Vec<u8>, _) = encoder.finish();
25 |
26 | let mut decoder = rle::Decoder::new(&buf[..]);
27 | let mut decoder_buf = Vec::new();
28 | decoder.read_to_end(&mut decoder_buf).unwrap();
29 |
30 | assert_eq!(&input[..], &decoder_buf[..]);
31 | ```
32 |
33 | !*/
34 |
35 | use std::io::{self, Write, Read, Bytes};
36 |
/// This structure is used to compress a stream of bytes using a RLE
/// compression algorithm. This is a wrapper around an internal writer which
/// bytes will be written to.
pub struct Encoder<W> {
    /// The wrapped output stream.
    w: W,
    /// Number of repetitions of `byte` in the current run.
    reps: u64,
    /// The byte the current run consists of.
    byte: u8,
    /// Whether any byte has been seen yet (`byte`/`reps` are only valid
    /// once this is true).
    in_run: bool
}

impl<W: Write> Encoder<W> {
    /// Creates a new encoder which will have its output written to the given
    /// output stream.
    pub fn new(w: W) -> Encoder<W> {
        Encoder {
            w: w,
            reps: 0,
            byte: 0,
            in_run: false
        }
    }

    /// This function is used to flag that this session of compression is done
    /// with. The stream is finished up (final bytes are written), and then the
    /// wrapped writer is returned.
    pub fn finish(mut self) -> (W, io::Result<()>) {
        let result = self.flush();

        (self.w, result)
    }

    /// Extends the current run if `byte` matches it; otherwise emits the
    /// finished run and starts a new one.
    fn process_byte(&mut self, byte: u8) -> io::Result<()> {
        if self.byte == byte {
            self.reps += 1;
        } else {
            self.flush()?;
            self.reps = 1;
            self.byte = byte;
        }

        Ok(())
    }
}

impl<W: Write> Write for Encoder<W> {
    /// Feeds `buf` through the run-length encoder; always consumes the
    /// whole buffer.
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        // Fix: the old code unconditionally sliced `buf[1..]`, panicking on
        // an empty buffer, and silently dropped `buf[0]` on every call
        // after the first (when `in_run` was already set).
        if buf.is_empty() {
            return Ok(0);
        }

        let mut rest = buf;
        if !self.in_run {
            // First byte ever: it seeds the run instead of being processed.
            self.byte = buf[0];
            self.reps = 1;
            self.in_run = true;
            rest = &buf[1..];
        }

        for &byte in rest {
            self.process_byte(byte)?;
        }

        Ok(buf.len())
    }

    /// Emits the pending run: a single byte verbatim, or the byte twice
    /// followed by the (reps - 2) count in 7-bit little-endian groups, with
    /// the high bit marking the final group.
    fn flush(&mut self) -> io::Result<()> {
        if self.reps == 1 {
            self.w.write_all(&[self.byte])?;
        } else if self.reps > 1 {
            let mut buf = [0; 11];
            let mut reps_encode = self.reps - 2;
            let mut index = 2;
            buf[0] = self.byte;
            buf[1] = self.byte;

            loop {
                buf[index] = (reps_encode & 0b0111_1111) as u8;
                reps_encode = reps_encode >> 7;

                if reps_encode == 0 {
                    // High bit flags the last byte of the length string.
                    buf[index] = buf[index] | 0b1000_0000;
                    break;
                }

                index += 1;
            }

            // write_all: a bare `write` may emit only part of the run.
            self.w.write_all(&buf[..(index + 1)])?;
        }

        // Reset so a second flush (or continued writes) does not re-emit
        // or double-count the run just written.
        self.reps = 0;

        Ok(())
    }
}
124 |
/// Incrementally collects the length specification of a run: up to 9
/// bytes, each contributing 7 payload bits, little-endian.
struct RunBuilder {
    /// The byte the run repeats.
    byte: u8,
    /// Raw length-specification bytes gathered so far.
    slice: [u8; 9],
    /// How many entries of `slice` are valid.
    byte_count: u8
}

impl RunBuilder {
    /// Starts a builder for a run of `byte`.
    fn new(byte: u8) -> RunBuilder {
        RunBuilder { byte: byte, slice: [0; 9], byte_count: 0 }
    }

    /// Decodes the collected bytes into a `Run`. Each byte contributes its
    /// low 7 bits (the high bit is only a continuation marker); the value
    /// is biased by 2 since a run is at least two repetitions. Unused
    /// trailing slots are zero and contribute nothing.
    fn to_run(&mut self) -> Run {
        let mut value = 0u64;
        let mut shift = 0u32;
        for &b in self.slice.iter() {
            value |= ((b & 0b0111_1111) as u64) << shift;
            shift += 7;
        }

        Run { byte: self.byte, reps: 2 + value }
    }

    /// Appends one length-specification byte, erroring once the run is
    /// longer than 9 bytes can describe.
    fn add_byte(&mut self, byte: u8) -> io::Result<()> {
        if self.byte_count < 9 {
            self.slice[self.byte_count as usize] = byte;
            self.byte_count += 1;
            Ok(())
        } else {
            Err(io::Error::new(io::ErrorKind::Other, "Overly long run"))
        }
    }
}

/// A fully decoded run: `reps` repetitions of `byte`.
struct Run {
    byte: u8,
    reps: u64
}
166 |
/// Position of the RLE decoder's state machine between input bytes.
enum DecoderState {
    /// Nothing pending; the next input byte begins a new literal or run.
    Clean,
    /// One byte seen; per the encoding, a matching byte next would start a
    /// run, anything else means this was a lone literal.
    Single(u8),
    /// Two identical bytes seen; now collecting the run-length bytes.
    Run(RunBuilder)
}
172 |
173 | /// This structure is used to decode a run length encoded stream. This wraps
174 | /// an internal reader which is read from when this decoder's read method is
175 | /// called.
176 | pub struct Decoder {
177 | buf: Bytes,
178 | state: DecoderState,
179 | run: Option
180 | }
181 |
182 | impl Decoder {
183 | /// Creates a new decoder which will read data from the given stream. The
184 | /// inner stream can be re-acquired by moving out of the `r` field of this
185 | /// structure.
186 | pub fn new(r: R) -> Decoder {
187 | Decoder {
188 | buf: r.bytes(),
189 | state: DecoderState::Clean,
190 | run: None
191 | }
192 | }
193 |
194 | fn read_byte(&mut self) -> io::Result