├── .gitignore ├── .travis.yml ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md └── src ├── bwt ├── dc.rs ├── mod.rs └── mtf.rs ├── checksum └── adler.rs ├── data ├── test.large ├── test.large.z.5 ├── test.lz4.1 ├── test.lz4.2 ├── test.lz4.3 ├── test.lz4.4 ├── test.lz4.5 ├── test.lz4.6 ├── test.lz4.7 ├── test.lz4.8 ├── test.lz4.9 ├── test.txt ├── test.z.0 ├── test.z.1 ├── test.z.2 ├── test.z.3 ├── test.z.4 ├── test.z.5 ├── test.z.6 ├── test.z.7 ├── test.z.8 ├── test.z.9 └── test.z.go ├── entropy └── ari │ ├── apm.rs │ ├── bin.rs │ ├── mod.rs │ ├── table.rs │ └── test.rs ├── flate.rs ├── lib.rs ├── lz4.rs ├── main.rs ├── rle.rs └── zlib.rs /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /Cargo.lock 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: rust 2 | rust: 3 | - stable 4 | - nightly 5 | script: 6 | - cargo build --verbose 7 | - cargo test --verbose 8 | # cargo test already runs doctests now, so no need for rustdoc --test here 9 | # - rustdoc --test src/lib.rs -L target 10 | - cargo doc 11 | after_success: ! 
'[ $TRAVIS_BRANCH = master ] && [ $TRAVIS_PULL_REQUEST = false ] 12 | && echo '''' > target/doc/index.html 13 | && sudo pip install ghp-import && ghp-import -n target/doc && git push -fq https://${TOKEN}@github.com/${TRAVIS_REPO_SLUG}.git 14 | gh-pages ' 15 | env: 16 | global: 17 | - secure: NcLf8VutE7aJ3Sq9IzksEM0qA4yfM+RJxAnD7zpA/y6ipsqtLfo1qUIiiNg7uhJjSGrGLd7fGH/awUDnJfhSYdKLRML87qFt02Dqz4E8gPIRUOe3a6Q2QHzvM/SiLsc6W/tRvHwKlHld0MzqHyrOWO6AMeIfV2+kREU3WbhPHtI= 18 | notifications: 19 | email: 20 | on_success: never 21 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | 3 | name = "compress" 4 | description = "Various compression algorithms written in rust" 5 | repository = "https://github.com/alexcrichton/rust-compress" 6 | version = "0.2.1" 7 | authors = ["Alex Crichton ", 8 | "Dzmitry Malyshau "] 9 | license = "MIT/Apache-2.0" 10 | 11 | [features] 12 | default = ["bwt", "checksum", "entropy", "flate", "lz4", "zlib", "rle"] 13 | bwt = [] 14 | checksum = [] 15 | entropy = [] 16 | flate = [] 17 | lz4 = [] 18 | zlib = ["flate", "checksum"] 19 | rle = [] 20 | unstable = [] 21 | 22 | [[bin]] 23 | name = "compress" 24 | doc = false 25 | 26 | [dependencies] 27 | log = "0.4" 28 | num = "0.3" 29 | rand = "0.7" 30 | byteorder = "1.3" -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014 Alex Crichton 2 | 3 | Permission is hereby granted, free of charge, to any 4 | person obtaining a copy of this software and associated 5 | documentation files (the "Software"), to deal in the 6 | Software without restriction, including without 7 | limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software 10 | is furnished to do so, subject to the following 11 | conditions: 12 | 13 | The above copyright notice and this permission notice 14 | shall be included in all copies or substantial portions 15 | of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT 21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | DEALINGS IN THE SOFTWARE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Rust Compression 2 | 3 | [![Build Status](https://travis-ci.org/alexcrichton/rust-compress.png?branch=master)](https://travis-ci.org/alexcrichton/rust-compress) 4 | 5 | [Documentation](http://alexcrichton.com/rust-compress/compress/index.html) 6 | 7 | **NOTE: This is not a production-quality library, it is a proof of concept. This 8 | library mainly contains *decoders*, not *encoders*.** 9 | 10 | This repository aims to house various implementations of compression algorithms, 11 | all written in rust. This is still very much a work in progress.
12 | 13 | ``` 14 | git clone https://github.com/alexcrichton/rust-compress 15 | cd rust-compress 16 | cargo build 17 | ``` 18 | 19 | ### Implemented Algorithms 20 | 21 | The following algorithms are already implemented in the main branch: 22 | 23 | * DEFLATE: standard decoder based on RFC 1951 24 | * LZ4 (Ziv-Lempel modification): dummy encoder, semi-complete decoder 25 | * BWT (Burrows-Wheeler Transform): straightforward encoder, standard decoder 26 | * DC (Distance Coding): basic encoder, standard decoder 27 | * Ari (Arithmetic coding): standard range encoder/decoder 28 | * RLE (Run-Length Encoding): basic encoder/decoder 29 | 30 | ### Desired Algorithms 31 | 32 | The following algorithms are either planned or in development at this point: 33 | 34 | * WFC (Weight-Frequency Coding) 35 | * SA/BWT in linear time 36 | -------------------------------------------------------------------------------- /src/bwt/dc.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | 3 | DC (Distance Coding) forward and backward transformation. 4 | Designed to be used on BWT block output for compression. 5 | 6 | # Links 7 | 8 | http://www.data-compression.info/Algorithms/DC/ 9 | 10 | # Example 11 | 12 | ```rust 13 | use compress::bwt::dc; 14 | 15 | let bytes = b"abracadabra"; 16 | let distances = dc::encode_simple::(bytes); 17 | let decoded = dc::decode_simple(bytes.len(), &distances[..]); 18 | ``` 19 | 20 | # Credit 21 | 22 | This is an original implementation. 23 | Thanks to Edgar Binder for inventing DC!
24 | 25 | */ 26 | 27 | use std::io; 28 | use std::iter::{self, repeat}; 29 | use std::slice as vec; 30 | use super::num::traits::{NumCast, ToPrimitive}; 31 | use super::mtf::MTF; 32 | 33 | pub type Symbol = u8; 34 | pub type Rank = u8; 35 | pub const TOTAL_SYMBOLS: usize = 0x100; 36 | 37 | /// Distance coding context 38 | /// Has all the information potentially needed by the underlying coding model 39 | #[derive(PartialEq, Eq, Debug)] 40 | pub struct Context { 41 | /// current symbol 42 | pub symbol: Symbol, 43 | /// last known MTF rank 44 | pub last_rank: Rank, 45 | /// maximum possible distance 46 | pub distance_limit: usize, 47 | } 48 | 49 | impl Context { 50 | /// create a new distance context 51 | pub fn new(s: Symbol, r: Rank, dmax: usize) -> Context { 52 | Context { 53 | symbol: s, 54 | last_rank: r, 55 | distance_limit: dmax, 56 | } 57 | } 58 | } 59 | 60 | 61 | /// DC body iterator, can be used to encode distances 62 | pub struct EncodeIterator<'a,'b, D: 'b> { 63 | data: iter::Enumerate,vec::Iter<'b, D>>>, 64 | pos: [usize; TOTAL_SYMBOLS], 65 | last_active: usize, 66 | size: usize, 67 | } 68 | 69 | impl<'a, 'b, D: NumCast> EncodeIterator<'a,'b, D> { 70 | /// create a new encode iterator 71 | pub fn new(input: &'a [Symbol], dist: &'b [D], init: [usize; TOTAL_SYMBOLS]) -> EncodeIterator<'a,'b,D> { 72 | assert_eq!(input.len(), dist.len()); 73 | EncodeIterator { 74 | data: input.iter().zip(dist.iter()).enumerate(), 75 | pos: init, 76 | last_active: 0, 77 | size: input.len() 78 | } 79 | } 80 | 81 | /// get the initial symbol positions, to be called before iteration 82 | pub fn get_init<'c>(&'c self) -> &'c [usize; TOTAL_SYMBOLS] { 83 | assert_eq!(self.last_active, 0); 84 | &self.pos 85 | } 86 | } 87 | 88 | impl<'a, 'b, D> Iterator for EncodeIterator<'a,'b,D> 89 | where D: Clone + Eq + NumCast + 'b 90 | { 91 | type Item = (D, Context); 92 | fn next(&mut self) -> Option<(D,Context)> { 93 | let filler: D = NumCast::from(self.size).unwrap(); 94 | 
self.data.find(|&(_,(_,d))| *d != filler).map(|(i,(sym,d))| { 95 | let rank = self.last_active - self.pos[*sym as usize]; 96 | assert!(rank < TOTAL_SYMBOLS); 97 | self.last_active = i+1; 98 | self.pos[*sym as usize] = i + 1 + d.to_usize().unwrap(); 99 | debug!("Encoding distance {} at pos {} for symbol {}, computed rank {}, predicting next at {}", 100 | d.to_usize().unwrap(), i, *sym, rank, self.pos[*sym as usize]); 101 | (d.clone(), Context::new(*sym, rank as Rank, self.size-i)) 102 | }) 103 | } 104 | } 105 | 106 | /// Encode a block of bytes 'input' 107 | /// write output distance stream into 'distances' 108 | /// return: unique bytes encountered in the order they appear 109 | /// with the corresponding initial distances 110 | pub fn encode<'a, 'b, D: Clone + Copy + Eq + NumCast>(input: &'a [Symbol], distances: &'b mut [D], mtf: &mut MTF) -> EncodeIterator<'a,'b,D> { 111 | let n = input.len(); 112 | assert_eq!(distances.len(), n); 113 | let mut num_unique = 0; 114 | let mut last = [n; TOTAL_SYMBOLS]; 115 | let mut init = [n; TOTAL_SYMBOLS]; 116 | let filler: D = NumCast::from(n).unwrap(); 117 | for (i,&sym) in input.iter().enumerate() { 118 | distances[i] = filler.clone(); 119 | let base = last[sym as usize]; 120 | last[sym as usize] = i; 121 | debug!("\tProcessing symbol {} at position {}, last known at {}", sym, i, base); 122 | if base == n { 123 | let rank = num_unique; 124 | mtf.symbols[rank] = sym; 125 | mtf.encode(sym); //==rank 126 | // initial distances are not ordered to support re-shuffle 127 | debug!("\t\tUnique => assigning rank {}, encoding {}", rank, i); 128 | init[sym as usize] = i; 129 | num_unique += 1; 130 | }else { 131 | let rank = mtf.encode(sym) as usize; 132 | if rank > 0 { 133 | debug!("\t\tRegular at rank {}, encoding {}", rank, i-base-rank-1); 134 | assert!(i >= base+rank+1); 135 | distances[base] = NumCast::from(i-base-rank-1).unwrap(); 136 | } 137 | } 138 | } 139 | for (rank,&sym) in mtf.symbols[..num_unique].iter().enumerate() { 140 | 
let base = last[sym as usize]; 141 | debug!("\tSweep symbol {} of rank {}, last known at {}, encoding {}", sym, rank, base, n-base-rank-1); 142 | assert!(n >= base+rank+1); 143 | distances[base] = NumCast::from(n-base-rank-1).unwrap(); 144 | } 145 | // a basic but expensive check, to be improved 146 | //assert_eq!(input.iter().zip(input.iter().skip(1)).zip(distances.iter()). 147 | // position(|((&a,&b),d)| *d==filler && a!=b), None); 148 | EncodeIterator::new(input, distances, init) 149 | } 150 | 151 | 152 | /// Encode version with "batteries included" for quick testing 153 | pub fn encode_simple(input: &[Symbol]) -> Vec { 154 | let n = input.len(); 155 | let mut raw_dist: Vec = repeat(NumCast::from(0).unwrap()).take(n).collect(); 156 | let mut eniter = encode(input, &mut raw_dist, &mut MTF::new()); 157 | let init: Vec = (0..TOTAL_SYMBOLS).map(|i| NumCast::from(eniter.get_init()[i]).unwrap()).collect(); 158 | init.iter().map(|d| d.clone()).chain(eniter.by_ref().map(|(d,_)| d)).collect() 159 | } 160 | 161 | /// Decode a block of distances given the initial symbol positions 162 | pub fn decode(mut next: [usize; TOTAL_SYMBOLS], output: &mut [Symbol], mtf: &mut MTF, 163 | mut fn_dist: F) -> io::Result<()> 164 | where F: FnMut(Context) -> io::Result 165 | { 166 | 167 | let n = output.len(); 168 | let mut i = 0; 169 | for (sym,d) in next.iter().enumerate() { 170 | if *d < n { 171 | let mut j = i; 172 | while j>0 && next[mtf.symbols[j-1] as usize] > *d { 173 | mtf.symbols[j] = mtf.symbols[j-1]; 174 | j -= 1; 175 | } 176 | mtf.symbols[j] = sym as Symbol; 177 | i += 1; 178 | } 179 | } 180 | if i<=1 { 181 | // redundant alphabet case 182 | let sym = mtf.symbols[0]; 183 | for out in output.iter_mut() { 184 | *out = sym; 185 | } 186 | return Ok(()) 187 | } 188 | 189 | let alphabet_size = i; 190 | let mut ranks = [0 as Rank; TOTAL_SYMBOLS]; 191 | for rank in 0..i { 192 | let sym = mtf.symbols[rank]; 193 | debug!("\tRegistering symbol {} of rank {} at position {}", 194 | sym, 
rank, next[sym as usize]); 195 | ranks[sym as usize] = 0; //could use 'rank' but don't know how to derive it during encoding 196 | } 197 | 198 | i = 0; 199 | while i stop + d, 210 | Err(e) => return Err(e) 211 | }; 212 | debug!("\t\tLooking for future position {}", future); 213 | assert!(future <= n); 214 | let mut rank = 1; 215 | while rank < alphabet_size && future+rank > next[mtf.symbols[rank] as usize] { 216 | mtf.symbols[rank-1] = mtf.symbols[rank]; 217 | rank += 1; 218 | } 219 | if rank < alphabet_size { 220 | debug!("\t\tFound sym {} of rank {} at position {}", mtf.symbols[rank], 221 | rank, next[mtf.symbols[rank] as usize]); 222 | }else { 223 | debug!("\t\tNot found"); 224 | } 225 | mtf.symbols[rank-1] = sym; 226 | debug!("\t\tAssigning future pos {} for symbol {}", future+rank-1, sym); 227 | next[sym as usize] = future+rank-1; 228 | ranks[sym as usize] = (rank-1) as Rank; 229 | } 230 | assert_eq!(next.iter().position(|&d| d=n+alphabet_size), None); 231 | assert_eq!(i, n); 232 | Ok(()) 233 | } 234 | 235 | /// Decode version with "batteries included" for quick testing 236 | pub fn decode_simple(n: usize, distances: &[D]) -> Vec { 237 | let mut output: Vec = repeat(0 as Symbol).take(n).collect(); 238 | let mut init = [0; TOTAL_SYMBOLS]; 239 | for i in 0..TOTAL_SYMBOLS { 240 | init[i] = distances[i].to_usize().unwrap(); 241 | } 242 | let mut di = TOTAL_SYMBOLS; 243 | decode(init, &mut output[..], &mut MTF::new(), |_ctx| { 244 | di += 1; 245 | if di > distances.len() { 246 | Err(io::Error::new(io::ErrorKind::Other, "Unexpected end of file")) 247 | } else { 248 | Ok(distances[di-1].to_usize().unwrap()) 249 | } 250 | }).unwrap(); 251 | output.into_iter().collect() 252 | } 253 | 254 | 255 | #[cfg(test)] 256 | mod test { 257 | use std::iter::repeat; 258 | 259 | fn roundtrip(bytes: &[u8]) { 260 | info!("Roundtrip DC of size {}", bytes.len()); 261 | let distances = super::encode_simple::(bytes); 262 | debug!("Roundtrip DC input: {:?}, distances: {:?}", bytes, 
distances); 263 | let decoded = super::decode_simple(bytes.len(), &distances[..]); 264 | assert_eq!(&decoded[..], bytes); 265 | } 266 | 267 | /// rountrip version that compares the coding contexts on the way 268 | fn roundtrip_ctx(bytes: &[u8]) { 269 | let n = bytes.len(); 270 | info!("Roundtrip DC context of size {}", n); 271 | let mut mtf = super::super::mtf::MTF::new(); 272 | let mut raw_dist: Vec = repeat(0).take(n).collect(); 273 | let eniter = super::encode(bytes, &mut raw_dist[..], &mut mtf); 274 | let mut init = [0; super::TOTAL_SYMBOLS]; 275 | for i in 0..super::TOTAL_SYMBOLS { 276 | init[i] = eniter.get_init()[i]; 277 | } 278 | // implicit iterator copies, or we can gather in one pass and then split 279 | let (distances, contexts): (Vec<_>, Vec<_>) = eniter.unzip(); 280 | let mut output: Vec = repeat(0).take(n).collect(); 281 | let mut di = 0; 282 | super::decode(init, &mut output[..], &mut mtf, |ctx| { 283 | assert_eq!(contexts[di], ctx); 284 | di += 1; 285 | Ok(distances[di-1] as usize) 286 | }).unwrap(); 287 | assert_eq!(di, distances.len()); 288 | assert_eq!(&output[..], bytes); 289 | } 290 | 291 | #[test] 292 | fn roundtrips() { 293 | roundtrip(b"teeesst_dc"); 294 | roundtrip(b""); 295 | roundtrip(include_bytes!("../data/test.txt")); 296 | } 297 | 298 | #[test] 299 | fn roundtrips_context() { 300 | roundtrip_ctx(b"teeesst_dc"); 301 | roundtrip_ctx(b"../data/test.txt"); 302 | } 303 | } 304 | -------------------------------------------------------------------------------- /src/bwt/mod.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | 3 | BWT (Burrows-Wheeler Transform) forward and backward transformation. Requires `bwt` feature, enabled by default 4 | 5 | This module contains a bruteforce implementation of BWT encoding in Rust as well as standard decoding. 6 | These are exposed as a standard `Reader` and `Writer` interfaces wrapping an underlying stream. 
7 | 8 | BWT output stream places together symbols with similar leading contexts. This reshaping of the entropy 9 | allows further stages to deal with repeated sequences of symbols for better compression. 10 | 11 | Typical compression schemes are: 12 | BWT + RLE (+ EC) 13 | RLE + BWT + MTF + RLE + EC : bzip2 14 | BWT + DC + EC : ybs 15 | 16 | Where the stage families are: 17 | BWT: BWT (Burrows-Wheeler Transform), ST (Shindler transform) 18 | RLE: RLE (Run-Length Encoding) 19 | MTF: MTF (Move-To-Front), WFC (Weighted Frequency Coding) 20 | DC: DC (Distance Coding), IF (Inverse Frequencies) 21 | EC (Entropy Coder): Huffman, Arithmetic, RC (Range Coder) 22 | 23 | 24 | # Example 25 | 26 | ```rust 27 | use std::io::{BufWriter, BufReader, Read, Write}; 28 | use compress::bwt; 29 | 30 | // Encode some text 31 | let text = "some text"; 32 | let mut e = bwt::Encoder::new(BufWriter::new(Vec::new()), 4 << 20); 33 | e.write(text.as_bytes()).unwrap(); 34 | let (encoded, _) = e.finish(); 35 | let inner = encoded.into_inner().unwrap(); 36 | 37 | // Decode the encoded text 38 | let mut d = bwt::Decoder::new(BufReader::new(&inner[..]), true); 39 | let mut decoded = Vec::new(); 40 | d.read_to_end(&mut decoded).unwrap(); 41 | 42 | assert_eq!(&decoded[..], text.as_bytes()); 43 | ``` 44 | 45 | # Credit 46 | 47 | This is an original (mostly trivial) implementation. 
48 | 49 | */ 50 | 51 | #![allow(missing_docs)] 52 | 53 | extern crate num; 54 | 55 | use std::{cmp, fmt, slice}; 56 | use std::ptr; 57 | use std::iter::{self, Extend, repeat}; 58 | use std::io::{self, Read, Write}; 59 | use self::num::traits::{NumCast, ToPrimitive}; 60 | 61 | use super::byteorder::{LittleEndian, WriteBytesExt, ReadBytesExt}; 62 | use super::{byteorder_err_to_io, ReadExact}; 63 | 64 | pub mod dc; 65 | pub mod mtf; 66 | 67 | /// A base element for the transformation 68 | pub type Symbol = u8; 69 | 70 | pub const ALPHABET_SIZE: usize = 0x100; 71 | 72 | /// Radix sorting primitive 73 | pub struct Radix { 74 | /// number of occurancies (frequency) per symbox 75 | pub freq : [usize; ALPHABET_SIZE+1], 76 | } 77 | 78 | impl Radix { 79 | /// create Radix sort instance 80 | pub fn new() -> Radix { 81 | Radix { 82 | freq : [0; ALPHABET_SIZE+1], 83 | } 84 | } 85 | 86 | /// reset counters 87 | /// allows the struct to be re-used 88 | pub fn reset(&mut self) { 89 | for fr in self.freq.iter_mut() { 90 | *fr = 0; 91 | } 92 | } 93 | 94 | /// count elements in the input 95 | pub fn gather(&mut self, input: &[Symbol]) { 96 | for &b in input.iter() { 97 | self.freq[b as usize] += 1; 98 | } 99 | } 100 | 101 | /// build offset table 102 | pub fn accumulate(&mut self) { 103 | let mut n = 0; 104 | for freq in self.freq.iter_mut() { 105 | let f = *freq; 106 | *freq = n; 107 | n += f; 108 | } 109 | } 110 | 111 | /// return next byte position, advance it internally 112 | pub fn place(&mut self, b: Symbol)-> usize { 113 | let pos = self.freq[b as usize]; 114 | assert!(self.freq[b as usize] < self.freq[(b as usize)+1], 115 | "Unable to place symbol {} at offset {}", 116 | b, pos); 117 | self.freq[b as usize] += 1; 118 | pos 119 | } 120 | 121 | /// shift frequences to the left 122 | /// allows the offsets to be re-used after all positions are obtained 123 | pub fn shift(&mut self) { 124 | assert_eq!( self.freq[ALPHABET_SIZE-1], self.freq[ALPHABET_SIZE] ); 125 | for i in (0 .. 
ALPHABET_SIZE).rev() { 126 | self.freq[i+1] = self.freq[i]; 127 | } 128 | self.freq[0] = 0; 129 | } 130 | } 131 | 132 | 133 | /// Compute a suffix array from a given input string 134 | /// Resulting suffixes are guaranteed to be alphabetically sorted 135 | /// Run time: O(N^3), memory: N words (suf_array) + ALPHABET_SIZE words (Radix) 136 | pub fn compute_suffixes(input: &[Symbol], suf_array: &mut [SUF]) { 137 | let mut radix = Radix::new(); 138 | radix.gather(input); 139 | radix.accumulate(); 140 | 141 | debug!("SA compute input: {:?}", input); 142 | debug!("radix offsets: {:?}", &radix.freq[..]); 143 | 144 | for (i,&ch) in input.iter().enumerate() { 145 | let p = radix.place(ch); 146 | suf_array[p] = NumCast::from(i).unwrap(); 147 | } 148 | 149 | // bring the original offsets back 150 | radix.shift(); 151 | 152 | for i in 0..ALPHABET_SIZE { 153 | let lo = radix.freq[i]; 154 | let hi = radix.freq[i+1]; 155 | if lo == hi { 156 | continue; 157 | } 158 | let slice = &mut suf_array[lo..hi]; 159 | debug!("\tsorting group [{}-{}) for symbol {}", lo, hi, i); 160 | slice.sort_by(|a,b| { 161 | input[(a.to_usize().unwrap())..].cmp(&input[(b.to_usize().unwrap())..]) 162 | }); 163 | } 164 | 165 | debug!("sorted SA: {:?}", suf_array); 166 | } 167 | 168 | /// An iterator over BWT output 169 | pub struct TransformIterator<'a, SUF: 'a> { 170 | input : &'a [Symbol], 171 | suf_iter : iter::Enumerate>, 172 | origin : Option, 173 | } 174 | 175 | impl<'a, SUF> TransformIterator<'a, SUF> { 176 | /// create a new BWT iterator from the suffix array 177 | pub fn new(input: &'a [Symbol], suffixes: &'a [SUF]) -> TransformIterator<'a, SUF> { 178 | TransformIterator { 179 | input: input, 180 | suf_iter: suffixes.iter().enumerate(), 181 | origin: None, 182 | } 183 | } 184 | 185 | /// return the index of the original string 186 | pub fn get_origin(&self) -> usize { 187 | self.origin.unwrap() 188 | } 189 | } 190 | 191 | impl<'a, SUF: ToPrimitive + 'a> Iterator for TransformIterator<'a, SUF> { 
192 | type Item = Symbol; 193 | fn next(&mut self) -> Option { 194 | self.suf_iter.next().map(|(i,p)| { 195 | if p.to_usize().unwrap() == 0 { 196 | assert!( self.origin.is_none() ); 197 | self.origin = Some(i); 198 | *self.input.last().unwrap() 199 | }else { 200 | self.input[p.to_usize().unwrap() - 1] 201 | } 202 | }) 203 | } 204 | } 205 | 206 | /// Encode BWT of a given input, using the 'suf_array' 207 | pub fn encode<'a, SUF: NumCast + ToPrimitive + fmt::Debug>(input: &'a [Symbol], suf_array: &'a mut [SUF]) -> TransformIterator<'a, SUF> { 208 | compute_suffixes(input, suf_array); 209 | TransformIterator::new(input, suf_array) 210 | } 211 | 212 | /// Transform an input block into the output slice, all-inclusive version. 213 | /// Returns the index of the original string in the output matrix. 214 | pub fn encode_simple(input: &[Symbol]) -> (Vec, usize) { 215 | let mut suf_array: Vec = repeat(0).take(input.len()).collect(); 216 | let mut iter = encode(input, &mut suf_array[..]); 217 | let output: Vec = iter.by_ref().collect(); 218 | (output, iter.get_origin()) 219 | } 220 | 221 | 222 | /// Compute an inversion jump table, needed for BWT decoding 223 | pub fn compute_inversion_table(input: &[Symbol], origin: usize, table: &mut [SUF]) { 224 | assert_eq!(input.len(), table.len()); 225 | 226 | let mut radix = Radix::new(); 227 | radix.gather(input); 228 | radix.accumulate(); 229 | 230 | table[radix.place(input[origin])] = NumCast::from(0).unwrap(); 231 | for (i,&ch) in input[..origin].iter().enumerate() { 232 | table[radix.place(ch)] = NumCast::from(i+1).unwrap(); 233 | } 234 | for (i,&ch) in input[(origin+1)..].iter().enumerate() { 235 | table[radix.place(ch)] = NumCast::from(origin+2+i).unwrap(); 236 | } 237 | //table[-1] = origin; 238 | debug!("inverse table: {:?}", table) 239 | } 240 | 241 | /// An iterator over inverse BWT 242 | /// Run time: O(N), memory: N words (table) 243 | pub struct InverseIterator<'a, SUF: 'a> { 244 | input : &'a [Symbol], 245 | table : &'a 
[SUF], 246 | origin : usize, 247 | current : usize, 248 | } 249 | 250 | impl<'a, SUF> InverseIterator<'a, SUF> { 251 | /// create a new inverse BWT iterator with a given input, origin, and a jump table 252 | pub fn new(input: &'a [Symbol], origin: usize, table: &'a [SUF]) -> InverseIterator<'a, SUF> { 253 | debug!("inverse origin={:?}, input: {:?}", origin, input); 254 | InverseIterator { 255 | input: input, 256 | table: table, 257 | origin: origin, 258 | current: origin, 259 | } 260 | } 261 | } 262 | 263 | impl<'a, SUF: ToPrimitive> Iterator for InverseIterator<'a, SUF> { 264 | type Item = Symbol; 265 | 266 | fn next(&mut self) -> Option { 267 | if self.current == usize::max_value() { 268 | None 269 | } else { 270 | self.current = self.table[self.current].to_usize().unwrap().wrapping_sub(1); 271 | debug!("\tjumped to {}", self.current); 272 | 273 | let p = if self.current != usize::max_value() { 274 | self.current 275 | } else { 276 | self.origin 277 | }; 278 | 279 | Some(self.input[p]) 280 | } 281 | } 282 | } 283 | 284 | /// Decode a BWT block, given it's origin, and using 'table' temporarily 285 | pub fn decode<'a, SUF: NumCast + fmt::Debug>(input: &'a [Symbol], origin: usize, table: &'a mut [SUF]) -> InverseIterator<'a, SUF> { 286 | compute_inversion_table(input, origin, table); 287 | InverseIterator::new(input, origin, table) 288 | } 289 | 290 | /// A simplified BWT decode function, which allocates a temporary suffix array 291 | pub fn decode_simple(input: &[Symbol], origin: usize) -> Vec { 292 | let mut suf: Vec = repeat(0).take(input.len()).collect(); 293 | decode(input, origin, &mut suf[..]).take(input.len()).collect() 294 | } 295 | 296 | /// Decode without additional memory, can be greatly optimized 297 | /// Run time: O(n^2), Memory: 0n 298 | fn decode_minimal(input: &[Symbol], origin: usize, output: &mut [Symbol]) { 299 | assert_eq!(input.len(), output.len()); 300 | if input.len() == 0 { 301 | assert_eq!(origin, 0); 302 | } 303 | 304 | let mut radix = 
Radix::new(); 305 | radix.gather(input); 306 | radix.accumulate(); 307 | 308 | let n = input.len(); 309 | (0..n).fold(origin, |i,j| { 310 | let ch = input[i]; 311 | output[n-j-1] = ch; 312 | let offset = &input[..i].iter().filter(|&k| *k==ch).count(); 313 | radix.freq[ch as usize] + offset 314 | }); 315 | } 316 | 317 | 318 | /// This structure is used to decode a stream of BWT blocks. This wraps an 319 | /// internal reader which is read from when this decoder's read method is 320 | /// called. 321 | pub struct Decoder { 322 | /// The internally wrapped reader. This is exposed so it may be moved out 323 | /// of. Note that if data is read from the reader while decoding is in 324 | /// progress the output stream will get corrupted. 325 | pub r: R, 326 | start : usize, 327 | 328 | temp : Vec, 329 | output : Vec, 330 | table : Vec, 331 | 332 | header : bool, 333 | max_block_size : usize, 334 | extra_memory : bool, 335 | } 336 | 337 | impl Decoder { 338 | /// Creates a new decoder which will read data from the given stream. The 339 | /// inner stream can be re-acquired by moving out of the `r` field of this 340 | /// structure. 341 | /// 'extra_mem' switch allows allocating extra N words of memory for better performance 342 | pub fn new(r: R, extra_mem: bool) -> Decoder { 343 | Decoder { 344 | r: r, 345 | start: 0, 346 | temp: Vec::new(), 347 | output: Vec::new(), 348 | table: Vec::new(), 349 | header: false, 350 | max_block_size: 0, 351 | extra_memory: extra_mem, 352 | } 353 | } 354 | 355 | /// Resets this decoder back to its initial state. Note that the underlying 356 | /// stream is not seeked on or has any alterations performed on it. 
357 | pub fn reset(&mut self) { 358 | self.header = false; 359 | self.start = 0; 360 | } 361 | 362 | fn read_header(&mut self) -> io::Result<()> { 363 | match self.r.read_u32::() { 364 | Ok(size) => { 365 | self.max_block_size = size as usize; 366 | debug!("max size: {}", self.max_block_size); 367 | Ok(()) 368 | }, 369 | Err(e) => Err(byteorder_err_to_io(e)), 370 | } 371 | } 372 | 373 | fn decode_block(&mut self) -> io::Result { 374 | let n = match self.r.read_u32::() { 375 | Ok(n) => n as usize, 376 | Err(ref e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(false), // EOF 377 | Err(e) => return Err(e), 378 | }; 379 | 380 | self.temp.truncate(0); 381 | self.temp.reserve(n); 382 | try!(self.r.push_exactly(n as u64, &mut self.temp)); 383 | 384 | let origin = try!(self.r.read_u32::()) as usize; 385 | self.output.truncate(0); 386 | self.output.reserve(n); 387 | 388 | if self.extra_memory { 389 | self.table.truncate(0); 390 | self.table.extend((0..n).map(|_| 0)); 391 | for ch in decode(&self.temp[..], origin, &mut self.table[..]) { 392 | self.output.push(ch); 393 | } 394 | }else { 395 | self.output.extend((0..n).map(|_| 0)); 396 | decode_minimal(&self.temp[..], origin, &mut self.output[..]); 397 | } 398 | 399 | self.start = 0; 400 | return Ok(true); 401 | } 402 | } 403 | 404 | impl Read for Decoder { 405 | fn read(&mut self, dst: &mut [u8]) -> io::Result { 406 | if !self.header { 407 | try!(self.read_header()); 408 | self.header = true; 409 | } 410 | let mut amt = dst.len(); 411 | let dst_len = amt; 412 | 413 | while amt > 0 { 414 | if self.output.len() == self.start { 415 | let keep_going = try!(self.decode_block()); 416 | if !keep_going { 417 | break 418 | } 419 | } 420 | let n = cmp::min(amt, self.output.len() - self.start); 421 | unsafe { ptr::copy_nonoverlapping( 422 | &self.output[self.start], 423 | &mut dst[dst_len - amt], 424 | n, 425 | )}; 426 | self.start += n; 427 | amt -= n; 428 | } 429 | 430 | Ok(dst_len - amt) 431 | } 432 | } 433 | 434 | 435 | 
/// This structure is used to compress a stream of bytes using the BWT. 436 | /// This is a wrapper around an internal writer which bytes will be written to. 437 | pub struct Encoder { 438 | w: W, 439 | buf: Vec, 440 | suf: Vec, 441 | wrote_header: bool, 442 | block_size: usize, 443 | } 444 | 445 | impl Encoder { 446 | /// Creates a new encoder which will have its output written to the given 447 | /// output stream. The output stream can be re-acquired by calling 448 | /// `finish()` 449 | /// 'block_size' is idealy as big as your input, unless you know for sure that 450 | /// the input consists of multiple parts of different nature. Often set as 4Mb. 451 | pub fn new(w: W, block_size: usize) -> Encoder { 452 | Encoder { 453 | w: w, 454 | buf: Vec::new(), 455 | suf: Vec::new(), 456 | wrote_header: false, 457 | block_size: block_size, 458 | } 459 | } 460 | 461 | fn encode_block(&mut self) -> io::Result<()> { 462 | let n = self.buf.len(); 463 | try!(self.w.write_u32::(n as u32)); 464 | 465 | self.suf.truncate(0); 466 | self.suf.extend((0..n).map(|_| n)); 467 | let w = &mut self.w; 468 | 469 | { 470 | let mut iter = encode(&self.buf[..], &mut self.suf[..]); 471 | for ch in iter.by_ref() { 472 | try!(w.write_u8(ch)); 473 | } 474 | 475 | try!(w.write_u32::(iter.get_origin() as u32)); 476 | } 477 | self.buf.truncate(0); 478 | 479 | Ok(()) 480 | } 481 | 482 | /// This function is used to flag that this session of compression is done 483 | /// with. The stream is finished up (final bytes are written), and then the 484 | /// wrapped writer is returned. 
485 | pub fn finish(mut self) -> (W, io::Result<()>) { 486 | let result = self.flush(); 487 | (self.w, result) 488 | } 489 | } 490 | 491 | impl Write for Encoder { 492 | fn write(&mut self, mut buf: &[u8]) -> io::Result { 493 | if !self.wrote_header { 494 | try!(self.w.write_u32::(self.block_size as u32)); 495 | self.wrote_header = true; 496 | } 497 | 498 | while buf.len() > 0 { 499 | let amt = cmp::min( self.block_size - self.buf.len(), buf.len() ); 500 | self.buf.extend(buf[..amt].iter().map(|b| *b)); 501 | 502 | if self.buf.len() == self.block_size { 503 | try!(self.encode_block()); 504 | } 505 | buf = &buf[amt..]; 506 | } 507 | Ok(buf.len()) 508 | } 509 | 510 | fn flush(&mut self) -> io::Result<()> { 511 | let ret = if self.buf.len() > 0 { 512 | self.encode_block() 513 | } else { 514 | Ok(()) 515 | }; 516 | ret.and(self.w.flush()) 517 | } 518 | } 519 | 520 | 521 | #[cfg(test)] 522 | mod test { 523 | use std::io::{BufReader, BufWriter, Read, Write}; 524 | #[cfg(feature="unstable")] 525 | use test::Bencher; 526 | use super::{Decoder, Encoder}; 527 | 528 | fn roundtrip(bytes: &[u8], extra_mem: bool) { 529 | let mut e = Encoder::new(BufWriter::new(Vec::new()), 1<<10); 530 | e.write(bytes).unwrap(); 531 | let (e, err) = e.finish(); 532 | err.unwrap(); 533 | let encoded = e.into_inner().unwrap(); 534 | 535 | let mut d = Decoder::new(BufReader::new(&encoded[..]), extra_mem); 536 | let mut decoded = Vec::new(); 537 | d.read_to_end(&mut decoded).unwrap(); 538 | assert_eq!(&decoded[..], bytes); 539 | } 540 | 541 | #[test] 542 | fn some_roundtrips() { 543 | roundtrip(b"test", true); 544 | roundtrip(b"", true); 545 | roundtrip(include_bytes!("../data/test.txt"), true); 546 | } 547 | 548 | #[test] 549 | fn decode_minimal() { 550 | roundtrip(b"abracadabra", false); 551 | } 552 | 553 | #[cfg(feature="unstable")] 554 | #[bench] 555 | fn decode_speed(bh: &mut Bencher) { 556 | use std::iter::repeat; 557 | use super::{encode, decode}; 558 | 559 | let input = 
include_bytes!("../data/test.txt"); 560 | let n = input.len(); 561 | let mut suf: Vec = repeat(0).take(n).collect(); 562 | let (output, origin) = { 563 | let mut to_iter = encode(input, &mut suf[..]); 564 | let out: Vec = to_iter.by_ref().collect(); 565 | (out, to_iter.get_origin()) 566 | }; 567 | 568 | bh.iter(|| { 569 | let from_iter = decode(&output[..], origin, &mut suf[..]); 570 | from_iter.last().unwrap(); 571 | }); 572 | bh.bytes = n as u64; 573 | } 574 | } 575 | -------------------------------------------------------------------------------- /src/bwt/mtf.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | 3 | MTF (Move To Front) encoder/decoder 4 | Produces a rank for each input character based on when it was seen last time. 5 | Useful for BWT output encoding, which produces a lot of zeroes and low ranks. 6 | 7 | # Links 8 | 9 | http://en.wikipedia.org/wiki/Move-to-front_transform 10 | 11 | # Example 12 | 13 | ```rust 14 | use std::io::{self, Read, Write}; 15 | use compress::bwt::mtf; 16 | 17 | // Encode a stream of bytes 18 | let bytes = b"abracadabra"; 19 | let mut e = mtf::Encoder::new(io::BufWriter::new(Vec::new())); 20 | e.write_all(bytes).unwrap(); 21 | let encoded = e.finish().into_inner().unwrap(); 22 | 23 | // Decode a stream of ranks 24 | let mut d = mtf::Decoder::new(io::BufReader::new(&encoded[..])); 25 | let mut decoded = Vec::new(); 26 | let result = d.read_to_end(&mut decoded).unwrap(); 27 | ``` 28 | 29 | # Credit 30 | 31 | */ 32 | 33 | use std::mem; 34 | use std::io::{self, Read, Write}; 35 | 36 | use super::super::byteorder::{WriteBytesExt, ReadBytesExt}; 37 | 38 | pub type Symbol = u8; 39 | pub type Rank = u8; 40 | pub const TOTAL_SYMBOLS: usize = 0x100; 41 | 42 | 43 | /// MoveToFront encoder/decoder 44 | pub struct MTF { 45 | /// rank-ordered list of unique Symbols 46 | pub symbols: [Symbol; TOTAL_SYMBOLS], 47 | } 48 | 49 | impl MTF { 50 | /// create a new zeroed MTF 51 | pub fn new() -> MTF { 
52 | MTF { symbols: [0; TOTAL_SYMBOLS] } 53 | } 54 | 55 | /// set the order of symbols to be alphabetical 56 | pub fn reset_alphabetical(&mut self) { 57 | for (i,sym) in self.symbols.iter_mut().enumerate() { 58 | *sym = i as Symbol; 59 | } 60 | } 61 | 62 | /// encode a symbol into its rank 63 | pub fn encode(&mut self, sym: Symbol) -> Rank { 64 | let mut next = self.symbols[0]; 65 | if next == sym { 66 | return 0 67 | } 68 | let mut rank: Rank = 1; 69 | loop { 70 | mem::swap(&mut self.symbols[rank as usize], &mut next); 71 | if next == sym { 72 | break; 73 | } 74 | rank += 1; 75 | assert!((rank as usize) < self.symbols.len()); 76 | } 77 | self.symbols[0] = sym; 78 | rank 79 | } 80 | 81 | /// decode a rank into its symbol 82 | pub fn decode(&mut self, rank: Rank) -> Symbol { 83 | let sym = self.symbols[rank as usize]; 84 | debug!("\tDecoding rank {} with symbol {}", rank, sym); 85 | for i in (0 .. rank as usize).rev() { 86 | self.symbols[i+1] = self.symbols[i]; 87 | } 88 | self.symbols[0] = sym; 89 | sym 90 | } 91 | } 92 | 93 | 94 | /// A simple MTF stream encoder 95 | pub struct Encoder { 96 | w: W, 97 | mtf: MTF, 98 | } 99 | 100 | impl Encoder { 101 | /// start encoding into the given writer 102 | pub fn new(w: W) -> Encoder { 103 | let mut mtf = MTF::new(); 104 | mtf.reset_alphabetical(); 105 | Encoder { 106 | w: w, 107 | mtf: mtf, 108 | } 109 | } 110 | 111 | /// finish encoding and return the wrapped writer 112 | pub fn finish(self) -> W { 113 | self.w 114 | } 115 | } 116 | 117 | impl Write for Encoder { 118 | fn write(&mut self, buf: &[u8]) -> io::Result { 119 | for sym in buf.iter() { 120 | let rank = self.mtf.encode(*sym); 121 | try!(self.w.write_u8(rank)); 122 | } 123 | Ok(buf.len()) 124 | } 125 | 126 | fn flush(&mut self) -> io::Result<()> { 127 | self.w.flush() 128 | } 129 | } 130 | 131 | 132 | /// A simple MTF stream decoder 133 | pub struct Decoder { 134 | r: R, 135 | mtf: MTF, 136 | } 137 | 138 | impl Decoder { 139 | /// start decoding the given reader 
140 | pub fn new(r: R) -> Decoder { 141 | let mut mtf = MTF::new(); 142 | mtf.reset_alphabetical(); 143 | Decoder { 144 | r: r, 145 | mtf: mtf, 146 | } 147 | } 148 | 149 | /// finish decoder and return the wrapped reader 150 | pub fn finish(self) -> R { 151 | self.r 152 | } 153 | } 154 | 155 | impl Read for Decoder { 156 | fn read(&mut self, dst: &mut [u8]) -> io::Result { 157 | let mut bytes_read = 0; 158 | for sym in dst.iter_mut() { 159 | let rank = match self.r.read_u8() { 160 | Ok(r) => r, 161 | Err(ref e) if e.kind() == io::ErrorKind::UnexpectedEof => break, 162 | Err(e) => return Err(e) 163 | }; 164 | bytes_read += 1; 165 | *sym = self.mtf.decode(rank); 166 | } 167 | Ok(bytes_read) 168 | } 169 | } 170 | 171 | 172 | #[cfg(test)] 173 | mod test { 174 | use std::io::{self, Read, Write}; 175 | #[cfg(feature="unstable")] 176 | use test::Bencher; 177 | use super::{Encoder, Decoder}; 178 | 179 | fn roundtrip(bytes: &[u8]) { 180 | info!("Roundtrip MTF of size {}", bytes.len()); 181 | let buf = Vec::new(); 182 | let mut e = Encoder::new(io::BufWriter::new(buf)); 183 | e.write_all(bytes).unwrap(); 184 | let encoded = e.finish().into_inner().unwrap(); 185 | debug!("Roundtrip MTF input: {:?}, ranks: {:?}", bytes, encoded); 186 | let mut d = Decoder::new(io::BufReader::new(&encoded[..])); 187 | let mut decoded = Vec::new(); 188 | d.read_to_end(&mut decoded).unwrap(); 189 | assert_eq!(&decoded[..], bytes); 190 | } 191 | 192 | #[test] 193 | fn some_roundtrips() { 194 | roundtrip(b"teeesst_mtf"); 195 | roundtrip(b""); 196 | roundtrip(include_bytes!("../data/test.txt")); 197 | } 198 | 199 | #[cfg(feature="unstable")] 200 | #[bench] 201 | fn encode_speed(bh: &mut Bencher) { 202 | let vec = Vec::new(); 203 | let input = include_bytes!("../data/test.txt"); 204 | let mem = io::BufWriter::with_capacity(input.len(), vec); 205 | let mut e = Encoder::new(mem); 206 | bh.iter(|| { 207 | e.write_all(input).unwrap(); 208 | }); 209 | bh.bytes = input.len() as u64; 210 | } 211 | 212 | 
#[cfg(feature="unstable")] 213 | #[bench] 214 | fn decode_speed(bh: &mut Bencher) { 215 | let vec = Vec::new(); 216 | let input = include_bytes!("../data/test.txt"); 217 | let mut e = Encoder::new(io::BufWriter::new(vec)); 218 | e.write_all(input).unwrap(); 219 | let encoded = e.finish().into_inner().unwrap(); 220 | bh.iter(|| { 221 | let mut d = Decoder::new(io::BufReader::new(&encoded[..])); 222 | let mut buf = Vec::new(); 223 | d.read_to_end(&mut buf).unwrap(); 224 | }); 225 | bh.bytes = input.len() as u64; 226 | } 227 | } 228 | -------------------------------------------------------------------------------- /src/checksum/adler.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | 3 | Adler-32 checksum 4 | 5 | This implementation is based off the example found at 6 | http://en.wikipedia.org/wiki/Adler-32. 7 | 8 | # Example 9 | 10 | ```rust 11 | use compress::checksum::adler; 12 | let mut state = adler::State32::new(); 13 | state.feed(b"abracadabra"); 14 | let checksum = state.result(); 15 | ``` 16 | 17 | */ 18 | 19 | const MOD_ADLER: u32 = 65521; 20 | 21 | /// Adler state for 32 bits 22 | pub struct State32 { 23 | a: u32, 24 | b: u32, 25 | } 26 | 27 | impl State32 { 28 | /// Create a new state 29 | pub fn new() -> State32 { 30 | State32 { a: 1, b: 0 } 31 | } 32 | 33 | /// Mutate the state for given data 34 | pub fn feed(&mut self, buf: &[u8]) { 35 | for byte in buf.iter() { 36 | self.a = (self.a + *byte as u32) % MOD_ADLER; 37 | self.b = (self.a + self.b) % MOD_ADLER; 38 | } 39 | } 40 | 41 | /// Get checksum 42 | pub fn result(&self) -> u32 { 43 | (self.b << 16) | self.a 44 | } 45 | 46 | /// Reset the state 47 | pub fn reset(&mut self) { 48 | self.a = 1; 49 | self.b = 0; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/data/test.large.z.5: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rusty-shell/rust-compress/e16695cac3c8861a2132e0d3f4605a3c11f26078/src/data/test.large.z.5 -------------------------------------------------------------------------------- /src/data/test.lz4.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rusty-shell/rust-compress/e16695cac3c8861a2132e0d3f4605a3c11f26078/src/data/test.lz4.1 -------------------------------------------------------------------------------- /src/data/test.lz4.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rusty-shell/rust-compress/e16695cac3c8861a2132e0d3f4605a3c11f26078/src/data/test.lz4.2 -------------------------------------------------------------------------------- /src/data/test.lz4.3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rusty-shell/rust-compress/e16695cac3c8861a2132e0d3f4605a3c11f26078/src/data/test.lz4.3 -------------------------------------------------------------------------------- /src/data/test.lz4.4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rusty-shell/rust-compress/e16695cac3c8861a2132e0d3f4605a3c11f26078/src/data/test.lz4.4 -------------------------------------------------------------------------------- /src/data/test.lz4.5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rusty-shell/rust-compress/e16695cac3c8861a2132e0d3f4605a3c11f26078/src/data/test.lz4.5 -------------------------------------------------------------------------------- /src/data/test.lz4.6: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rusty-shell/rust-compress/e16695cac3c8861a2132e0d3f4605a3c11f26078/src/data/test.lz4.6 
-------------------------------------------------------------------------------- /src/data/test.lz4.7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rusty-shell/rust-compress/e16695cac3c8861a2132e0d3f4605a3c11f26078/src/data/test.lz4.7 -------------------------------------------------------------------------------- /src/data/test.lz4.8: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rusty-shell/rust-compress/e16695cac3c8861a2132e0d3f4605a3c11f26078/src/data/test.lz4.8 -------------------------------------------------------------------------------- /src/data/test.lz4.9: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rusty-shell/rust-compress/e16695cac3c8861a2132e0d3f4605a3c11f26078/src/data/test.lz4.9 -------------------------------------------------------------------------------- /src/data/test.txt: -------------------------------------------------------------------------------- 1 | 0000000: 91c2f55c3215a8d8 c9ffc4e7ef9b6798 ...\2.........g. 2 | 0000010: d61b0d1670a59372 1573c8a561857f9c ....p..r.s..a... 3 | 0000020: 362f1a9d6d953cbf 8e189a2eaa6be88e 6/..m.<......k.. 4 | 0000030: 28fe132560ada100 2ef6f85a9e6d1b91 (..%`......Z.m.. 5 | 0000040: 8a5d37cc5cbfff89 3537ff5d73aae040 .]7.\...57.]s..@ 6 | 0000050: 127c7809f5b183c3 61400ea6b90451ef .|x.....a@....Q. 7 | 0000060: 2272ffbb2cd81b2a 005bbea7bf8f945f "r..,..*.[....._ 8 | 0000070: 39e03ab01676bbd0 939be3d936585554 9.:..v......6XUT 9 | 0000080: 41bb97f4410bd105 1cf75f2dee4f35c9 A...A....._-.O5. 10 | 0000090: 8123a6ca320b8951 c8313d50aea11065 .#..2..Q.1=P...e 11 | 00000a0: b269bda147bba414 94575126ed2d2770 .i..G....WQ&.-'p 12 | 00000b0: 422f9292b31978a6 92e24695dbdca27e B/....x...F....~ 13 | 00000c0: 419738fc9d2e61bd e1d22e3604c521f9 A.8...a....6..!. 14 | 00000d0: 89b9c7aa81eaf936 589cd50421ea42f9 .......6X...!.B. 
15 | 00000e0: e5d716af7e3ba436 e70fcb33b85adff5 ....~;.6...3.Z.. 16 | 00000f0: 0b8e32a1e824edc9 30f6abd2c733a354 ..2..$..0....3.T 17 | 0000100: d7a2554901bbbc17 3b8308d8269f96a6 ..UI....;...&... 18 | 0000110: 190985c6f2e5e147 f85c1e61c7a1ce22 .......G.\.a..." 19 | 0000120: 21522181dc5329c3 ff83b2ca5537dca1 !R!..S).....U7.. 20 | 0000130: 2a95ab06ccca632f 3f819fc396916676 *.....c/?.....fv 21 | 0000140: 384731c6146aa2a3 e76189389b59ff49 8G1..j...a.8.Y.I 22 | 0000150: a9b1d8dff5a86626 72a11ae65e0b33fc ......f&r...^.3. 23 | 0000160: 89362784bb1a0443 3de0327da454ee20 .6'....C=.2}.T. 24 | 0000170: e1173db62e55dc20 8229a37aaadd4ba6 ..=..U. .).z..K. 25 | 0000180: d4c4a71393b45926 8f34d87fa5c6b28e ......Y&.4...... 26 | 0000190: 2f1188a6d1d0711c 9379286d7511a3bd /.....q..y(mu... 27 | 00001a0: 1a2d83fad3ce1bb0 f964a48d8b6f5c8c .-.......d...o\. 28 | 00001b0: 6e992294580e9982 8bbba454fa167dcf n.".X......T..}. 29 | 00001c0: 6c75bd8d6df6cd2b 33b31e56eed09514 lu..m..+3..V.... 30 | 00001d0: 0e26ab922c4689bd 43644322d4541da3 .&..,F..CdC".T.. 31 | 00001e0: 83ddbe1dcbbf77bd 1da89fad1b162341 ......w.......#A 32 | 00001f0: 43a336bb09b47551 b08eef90e745e832 C.6...uQ.....E.2 33 | 0000200: 4666a92e97425a15 bf6ff63b883eec86 Ff...BZ..o.;.>.. 34 | 0000210: 09d23683b90c4218 1d20615003253f40 ..6...B.. aP.%?@ 35 | 0000220: 3b91ce6f1ee06042 449a593d61b1e68c ;..o..`BD.Y=a... 36 | 0000230: 9d304c8d8e49c96f b7640996f180401c .0L..I.o.d....@. 37 | 0000240: fd1edd7fc72e9259 06310df7759e5bc9 .......Y.1..u.[. 38 | 0000250: b8674a3c006b8bfa 393858ad1ffd1efd .gJ<.k..98X..... 39 | 0000260: 3f75301efe580293 68a8aa18f2efa649 ?u0..X..h......I 40 | 0000270: e55f21ab282a0f7a 317f8def421150a3 ._!.(*.z1...B.P. 41 | 0000280: a211cfce3587ea16 46be81d9b63646cd ....5...F....6F. 42 | 0000290: 2a30a1be6d917fb9 31015f1b91bbb1dc *0..m...1._..... 43 | 00002a0: 9c52e29b165c4ec3 f8aea285a688d1cb .R...\N......... 44 | 00002b0: 5901ac6448e3686e 061d0d2ff04bbf04 Y..dH.hn.../.K.. 
45 | 00002c0: 4bf319830a3cfb5a ab051b3c5ad70e6b K....<.Z... Bit { 42 | Bit(FLAT_TOTAL as FlatProbability >> 1) 43 | } 44 | 45 | /// Return flat probability 46 | #[inline] 47 | pub fn to_flat(&self) -> FlatProbability { 48 | let Bit(fp) = *self; 49 | fp 50 | } 51 | 52 | /// Return wide probability 53 | #[inline] 54 | pub fn to_wide(&self) -> WideProbability { 55 | //table_stretch[self.to_flat() as usize] 56 | let p = (self.to_flat() as f32) / (FLAT_TOTAL as f32); 57 | let d = (p / (1.0-p)).ln(); 58 | let wp = (d * WIDE_OFFSET as f32).to_i16().unwrap(); 59 | wp 60 | } 61 | 62 | /// Construct from flat probability 63 | #[inline] 64 | pub fn from_flat(fp: FlatProbability) -> Bit { 65 | Bit(fp) 66 | } 67 | 68 | /// Construct from wide probability 69 | #[inline] 70 | pub fn from_wide(wp: WideProbability) -> Bit { 71 | //Bit(table_squash[(wp+WIDE_OFFSET) as usize]) 72 | let d = (wp as f32) / (WIDE_OFFSET as f32); 73 | let p = 1.0 / (1.0 + (-d).exp()); 74 | let fp = (p * FLAT_TOTAL as f32).to_u16().unwrap(); 75 | Bit(fp) 76 | } 77 | 78 | /// Mutate for better zeroes 79 | pub fn update_zero(&mut self, rate: isize, bias: isize) { 80 | let &mut Bit(ref mut fp) = self; 81 | let one = FLAT_TOTAL - bias - (*fp as isize); 82 | *fp += (one >> (rate as usize)) as FlatProbability; 83 | } 84 | 85 | /// Mutate for better ones 86 | pub fn update_one(&mut self, rate: isize, bias: isize) { 87 | let &mut Bit(ref mut fp) = self; 88 | let zero = (*fp as isize) - bias; 89 | *fp -= (zero >> (rate as usize)) as FlatProbability; 90 | } 91 | 92 | /// Mutate for a given value 93 | #[inline] 94 | pub fn update(&mut self, value: bool, rate: isize, bias: isize) { 95 | if !value { 96 | self.update_zero(rate, bias) 97 | }else { 98 | self.update_one(rate, bias) 99 | } 100 | } 101 | } 102 | 103 | impl super::Model for Bit { 104 | fn get_range(&self, value: bool) -> (Border,Border) { 105 | let fp = self.to_flat() as Border; 106 | if !value { 107 | (0, fp) 108 | }else { 109 | (fp, FLAT_TOTAL as Border) 
110 | } 111 | } 112 | 113 | fn find_value(&self, offset: Border) -> (bool,Border,Border) { 114 | assert!(offset < FLAT_TOTAL as Border, 115 | "Invalid bit offset {} requested", offset); 116 | let fp = self.to_flat() as Border; 117 | if offset < fp { 118 | (false, 0, fp) 119 | }else { 120 | (true, fp, FLAT_TOTAL as Border) 121 | } 122 | } 123 | 124 | fn get_denominator(&self) -> Border { 125 | FLAT_TOTAL as Border 126 | } 127 | } 128 | 129 | 130 | /// Binary context gate 131 | /// maps an input binary probability into a new one 132 | /// by interpolating between internal maps in non-linear space 133 | pub struct Gate { 134 | map: [Bit; PORTAL_BINS], 135 | } 136 | 137 | pub type BinCoords = (usize, usize); // (index, weight) 138 | 139 | impl Gate { 140 | /// Create a new gate instance 141 | pub fn new() -> Gate { 142 | let mut g = Gate { 143 | map: [Bit::new_equal(); PORTAL_BINS], 144 | }; 145 | for (i,bit) in g.map.iter_mut().enumerate() { 146 | let rp = (i as f32)/(PORTAL_OFFSET as f32) - 1.0; 147 | let wp = (rp * (WIDE_OFFSET as f32)).to_i16().unwrap(); 148 | *bit = Bit::from_wide(wp); 149 | } 150 | g 151 | } 152 | 153 | /// Pass a bit through the gate 154 | #[inline] 155 | pub fn pass(&self, bit: &Bit) -> (Bit, BinCoords) { 156 | let (fp, index) = self.pass_wide(bit.to_wide()); 157 | (Bit::from_flat(fp), index) 158 | } 159 | 160 | /// Pass a wide probability on input, usable when 161 | /// you mix it linearly beforehand (libbsc does that) 162 | pub fn pass_wide(&self, wp: WideProbability) -> (FlatProbability, BinCoords) { 163 | let index = ((wp + WIDE_OFFSET) >> BIN_WEIGHT_BITS) as usize; 164 | let weight = wp as usize & (BIN_WEIGHT_TOTAL-1); 165 | let z = [ 166 | self.map[index+0].to_flat() as usize, 167 | self.map[index+1].to_flat() as usize]; 168 | let sum = z[0]*(BIN_WEIGHT_TOTAL-weight) + z[1]*weight; 169 | let fp = (sum >> BIN_WEIGHT_BITS) as FlatProbability; 170 | (fp, (index, weight)) 171 | } 172 | 173 | //TODO: weight update ratio & bias as well 174 | 
175 | /// Mutate for better zeroes 176 | pub fn update_zero(&mut self, bc: BinCoords, rate: isize, bias: isize) { 177 | let (index, _) = bc; 178 | self.map[index+0].update_zero(rate, bias); 179 | self.map[index+1].update_zero(rate, bias); 180 | } 181 | 182 | /// Mutate for better ones 183 | pub fn update_one(&mut self, bc: BinCoords, rate: isize, bias: isize) { 184 | let (index, _) = bc; 185 | self.map[index+0].update_one(rate, bias); 186 | self.map[index+1].update_one(rate, bias); 187 | } 188 | 189 | /// Mutate for a given value 190 | #[inline] 191 | pub fn update(&mut self, value: bool, bc: BinCoords, rate: isize, bias: isize) { 192 | if !value { 193 | self.update_zero(bc, rate, bias) 194 | }else { 195 | self.update_one(bc, rate, bias) 196 | } 197 | } 198 | } 199 | -------------------------------------------------------------------------------- /src/entropy/ari/bin.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | 3 | Binary models for the arithmetic coder. 4 | The simplicity of the domain allows for normalized updates in place using bit shifts. 
5 | 6 | # Links 7 | 8 | # Example 9 | 10 | # Credit 11 | 12 | */ 13 | 14 | use super::Border; 15 | 16 | /// A binary value frequency model 17 | pub struct Model { 18 | /// frequency of bit 0 19 | zero: Border, 20 | /// total frequency (constant) 21 | total: Border, 22 | /// learning rate 23 | pub rate: Border, 24 | } 25 | 26 | impl Model { 27 | /// Create a new flat (50/50 probability) instance 28 | pub fn new_flat(threshold: Border, rate: Border) -> Model { 29 | Model { 30 | zero: threshold>>1, 31 | total: threshold, 32 | rate: rate, 33 | } 34 | } 35 | 36 | /// Create a new instance with a given percentage for zeroes 37 | pub fn new_custom(zero_percent: u8, threshold: Border, rate: Border) -> Model { 38 | assert!(threshold >= 100); 39 | Model { 40 | zero: (zero_percent as Border)*threshold/100, 41 | total: threshold, 42 | rate: rate, 43 | } 44 | } 45 | 46 | /// Reset the model to 50/50 distribution 47 | pub fn reset_flat(&mut self) { 48 | self.zero = self.total>>1; 49 | } 50 | 51 | /// Return the probability of 0 52 | pub fn get_probability_zero(&self) -> Border { 53 | self.zero 54 | } 55 | 56 | /// Return the probability of 1 57 | pub fn get_probability_one(&self) -> Border { 58 | self.total - self.zero 59 | } 60 | 61 | /// Update the frequency of zero 62 | pub fn update_zero(&mut self) { 63 | debug!("\tUpdating zero"); 64 | self.zero += (self.total-self.zero) >> (self.rate as usize); 65 | } 66 | 67 | /// Update the frequency of one 68 | pub fn update_one(&mut self) { 69 | debug!("\tUpdating one"); 70 | self.zero -= self.zero >> (self.rate as usize); 71 | } 72 | 73 | /// Update frequencies in favor of given 'value' 74 | /// Lower factors produce more aggressive updates 75 | pub fn update(&mut self, value: bool) { 76 | if value { 77 | self.update_one() 78 | }else { 79 | self.update_zero() 80 | } 81 | } 82 | } 83 | 84 | impl super::Model for Model { 85 | fn get_range(&self, value: bool) -> (Border,Border) { 86 | if value { 87 | (self.zero, self.total) 88 | }else { 
89 | (0, self.zero) 90 | } 91 | } 92 | 93 | fn find_value(&self, offset: Border) -> (bool,Border,Border) { 94 | assert!(offset < self.total, 95 | "Invalid frequency offset {} requested under total {}", 96 | offset, self.total); 97 | if offset < self.zero { 98 | (false, 0, self.zero) 99 | }else { 100 | (true, self.zero, self.total) 101 | } 102 | } 103 | 104 | fn get_denominator(&self) -> Border { 105 | self.total 106 | } 107 | } 108 | 109 | 110 | /// A proxy model for the combination of two binary models 111 | /// using equation: (wa * A + wb * B) >> ws 112 | pub struct SumProxy<'a> { 113 | first: &'a Model, 114 | second: &'a Model, 115 | w_first: Border, 116 | w_second: Border, 117 | w_shift: Border, 118 | } 119 | 120 | impl<'a> SumProxy<'a> { 121 | /// Create a new instance of the binary sum proxy 122 | pub fn new(wa: Border, first: &'a Model, wb: Border, second: &'a Model, shift: Border) -> SumProxy<'a> { 123 | SumProxy { 124 | first: first, 125 | second: second, 126 | w_first: wa, 127 | w_second: wb, 128 | w_shift: shift, 129 | } 130 | } 131 | 132 | fn get_probability_zero(&self) -> Border { 133 | (self.w_first * self.first.get_probability_zero() + 134 | self.w_second * self.second.get_probability_zero()) >> 135 | (self.w_shift as usize) 136 | } 137 | } 138 | 139 | impl<'a> super::Model for SumProxy<'a> { 140 | fn get_range(&self, value: bool) -> (Border,Border) { 141 | let zero = self.get_probability_zero(); 142 | if value { 143 | (zero, self.get_denominator()) 144 | }else { 145 | (0, zero) 146 | } 147 | } 148 | 149 | fn find_value(&self, offset: Border) -> (bool,Border,Border) { 150 | let zero = self.get_probability_zero(); 151 | let total = self.get_denominator(); 152 | assert!(offset < total, 153 | "Invalid frequency offset {} requested under total {}", 154 | offset, total); 155 | if offset < zero { 156 | (false, 0, zero) 157 | }else { 158 | (true, zero, total) 159 | } 160 | } 161 | 162 | fn get_denominator(&self) -> Border { 163 | (self.w_first * 
self.first.get_denominator() + 164 | self.w_second * self.second.get_denominator()) >> 165 | (self.w_shift as usize) 166 | } 167 | } 168 | -------------------------------------------------------------------------------- /src/entropy/ari/mod.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | 3 | Arithmetic encoder/decoder using the Range encoder underneath. Requires `entropy` feature, enabled by default 4 | Can be used in a general case of entropy coding stage. Supposed to be fast. 5 | 6 | # Links 7 | 8 | http://en.wikipedia.org/wiki/Arithmetic_coding 9 | http://en.wikipedia.org/wiki/Range_encoding 10 | 11 | # Example 12 | ```rust 13 | # #![allow(unused_must_use)] 14 | use std::io::{BufWriter, BufReader, Read, Write}; 15 | use compress::entropy::ari; 16 | 17 | // Encode some text 18 | let text = "some text"; 19 | let mut e = ari::ByteEncoder::new(BufWriter::new(Vec::new())); 20 | e.write_all(text.as_bytes()).unwrap(); 21 | let (encoded, _) = e.finish(); 22 | let inner = encoded.into_inner().unwrap(); 23 | 24 | // Decode the encoded text 25 | let mut d = ari::ByteDecoder::new(BufReader::new(&inner[..])); 26 | let mut decoded = Vec::new(); 27 | d.read_to_end(&mut decoded).unwrap(); 28 | ``` 29 | # Credit 30 | 31 | This is an original implementation. 

*/

#![allow(missing_docs)]

use std::fmt::Display;
use std::io::{self, Read, Write};

use super::super::byteorder::{BigEndian, WriteBytesExt, ReadBytesExt};
use super::super::byteorder_err_to_io;

pub use self::table::{ByteDecoder, ByteEncoder};

pub mod apm;
pub mod bin;
pub mod table;
#[cfg(test)]
mod test;

pub type Symbol = u8;
const SYMBOL_BITS: usize = 8;
const SYMBOL_TOTAL: usize = 1<<SYMBOL_BITS;

// NOTE(review): the declarations between SYMBOL_TOTAL and RangeEncoder::new
// were lost to markup stripping in this dump; the following are reconstructed
// from their uses elsewhere in the module (BORDER_BYTES buffers, BORDER_BITS
// assert in Encoder::finish, BORDER_EXCESS/BORDER_SYMBOL_MASK in process(),
// RANGE_DEFAULT_THRESHOLD in Encoder/Decoder/ByteEncoder) -- verify against
// the original source.
pub type Border = u32;
const BORDER_BYTES: usize = 4;
const BORDER_BITS: usize = BORDER_BYTES * 8;
const BORDER_EXCESS: usize = BORDER_BITS - SYMBOL_BITS;
const BORDER_SYMBOL_MASK: u32 = ((SYMBOL_TOTAL - 1) << BORDER_EXCESS) as u32;
/// Default minimum range width before renormalization kicks in
const RANGE_DEFAULT_THRESHOLD: Border = 1 << 14;

/// Range encoder primitive.
/// Keeps the current code interval [low, hai) and renormalizes it by
/// emitting whole code symbols (bytes) once the top bytes agree.
pub struct RangeEncoder {
    /// low border of the current interval (inclusive)
    low: Border,
    /// high border of the current interval (exclusive)
    hai: Border,
    /// minimum interval width that avoids a threshold cut;
    /// has to exceed the largest 'total' passed to process()
    threshold: Border,
    // bit-loss statistics, only meaningful in the 'tune' build
    bits_lost_on_threshold_cut: f32,
    bits_lost_on_division: f32,
}

impl RangeEncoder {
    /// Create a new encoder limited by the given maximum range
    pub fn new(max_range: Border) -> RangeEncoder {
        // the range must be able to hold at least one full symbol's worth
        debug_assert!(max_range > (SYMBOL_TOTAL as Border));
        RangeEncoder {
            low: 0,
            hai: !0,
            threshold: max_range,
            bits_lost_on_threshold_cut: 0.0,
            bits_lost_on_division: 0.0,
        }
    }

    /// Reset the current range
    pub fn reset(&mut self) {
        self.low = 0;
        self.hai = !0;
    }

    #[cfg(tune)]
    fn count_bits(range: Border, total: Border) -> f32 {
        -((range as f32) / (total as f32)).log2()
    }

    #[cfg(not(tune))]
    fn count_bits(_range: Border, _total: Border) -> f32 {
        0.0
    }

    /// Return the number of bits lost due to threshold cuts and integer operations
    #[cfg(tune)]
    pub fn get_bits_lost(&self) -> (f32, f32) {
        (self.bits_lost_on_threshold_cut, self.bits_lost_on_division)
    }

    /// Process a given interval [from/total,to/total) into the current range
    /// write into the output slice, and return the number of symbols produced
    pub fn process(&mut self, total: Border, from: Border, to: Border, output: &mut [Symbol]) -> usize {
        // NOTE(review): the next three statements were partially lost to
        // markup stripping; reconstructed from the uses of `range` and
        // `old_range` below -- verify against the original source.
        debug_assert!(from < to && to <= total);
        let old_range = self.hai - self.low;
        let range = old_range / total;
        debug_assert!(range>0, "RangeCoder range is too narrow [{}-{}) for the total {}",
            self.low, self.hai, total);
        debug!("\t\tProcessing [{}-{})/{} with range {}", from, to, total, range);
        // scale the input interval into the current code range
        let mut lo = self.low + range*from;
        let mut hi = self.low + range*to;
        self.bits_lost_on_division += RangeEncoder::count_bits(range*total, old_range);
        let mut num_shift = 0;
        loop {
            if (lo^hi) & BORDER_SYMBOL_MASK != 0 {
                // top bytes differ: either the range is still wide enough to
                // stop renormalizing, or it has to be cut at the nearest
                // symbol boundary (keeping the larger half) so a byte can
                // be emitted
                if hi-lo > self.threshold {
                    break
                }
                let old_range = hi-lo;
                let lim = hi & BORDER_SYMBOL_MASK;
                if hi-lim >= lim-lo {lo=lim}
                else {hi=lim-1};
                debug_assert!(lo < hi);
                self.bits_lost_on_threshold_cut += RangeEncoder::count_bits(hi-lo, old_range);
            }

            // emit the agreed top byte and shift the interval up
            debug!("\t\tShifting on [{}-{}) to symbol {}", lo, hi, lo>>BORDER_EXCESS);
            output[num_shift] = (lo>>BORDER_EXCESS) as Symbol;
            num_shift += 1;
            lo<<=SYMBOL_BITS; hi<<=SYMBOL_BITS;
            debug_assert!(lo < hi);
        }
        self.low = lo;
        self.hai = hi;
        num_shift
    }

    /// Query the value encoded by 'code' in range [0,total)
    pub fn query(&self, total: Border, code: Border) -> Border {
        debug!("\t\tQuerying code {} of total {} under range [{}-{})",
            code, total, self.low, self.hai);
        debug_assert!(self.low <= code && code < self.hai);
        let range = (self.hai - self.low) / total;
        (code - self.low) / range
    }

    /// Get the code tail and close the range
    /// used at the end of encoding
    pub fn get_code_tail(&mut self) -> Border {
        let tail = self.low;
        self.low = 0;
        self.hai = 0;
        tail
    }
}


/// An abstract model to produce probability ranges
/// Can be a table, a mix of tables, or just a smart function.
174 | pub trait Model { 175 | /// Get the probability range of a value 176 | fn get_range(&self, value: V) -> (Border,Border); 177 | /// Find the value by a given probability offset, return with the range 178 | fn find_value(&self, offset: Border) -> (V,Border,Border); 179 | /// Get the sum of all probabilities 180 | fn get_denominator(&self) -> Border; 181 | 182 | /// Encode a value using a range encoder 183 | /// return the number of symbols written 184 | fn encode(&self, value: V, re: &mut RangeEncoder, out: &mut [Symbol]) -> usize { 185 | let (lo, hi) = self.get_range(value); 186 | let total = self.get_denominator(); 187 | debug!("\tEncoding value {} of range [{}-{}) with total {}", value, lo, hi, total); 188 | re.process(total, lo, hi, out) 189 | } 190 | 191 | /// Decode a value using given 'code' on the range encoder 192 | /// return a (value, num_symbols_to_shift) pair 193 | fn decode(&self, code: Border, re: &mut RangeEncoder) -> (V, usize) { 194 | let total = self.get_denominator(); 195 | let offset = re.query(total, code); 196 | let (value, lo, hi) = self.find_value(offset); 197 | debug!("\tDecoding value {} of offset {} with total {}", value, offset, total); 198 | let mut out = [0 as Symbol; BORDER_BYTES]; 199 | let shift = re.process(total, lo, hi, &mut out[..]); 200 | debug_assert_eq!(if shift==0 {0} else {code>>(BORDER_BITS - shift*8)}, 201 | out[..shift].iter().fold(0 as Border, |u,&b| (u<<8)+(b as Border))); 202 | (value, shift) 203 | } 204 | } 205 | 206 | 207 | /// An arithmetic encoder helper 208 | pub struct Encoder { 209 | stream: W, 210 | range: RangeEncoder, 211 | } 212 | 213 | impl Encoder { 214 | /// Create a new encoder on top of a given Writer 215 | pub fn new(w: W) -> Encoder { 216 | Encoder { 217 | stream: w, 218 | range: RangeEncoder::new(RANGE_DEFAULT_THRESHOLD), 219 | } 220 | } 221 | 222 | /// Encode an abstract value under the given Model 223 | pub fn encode>(&mut self, value: V, model: &M) -> io::Result<()> { 224 | let mut buf = [0 
as Symbol; BORDER_BYTES]; 225 | let num = model.encode(value, &mut self.range, &mut buf[..]); 226 | self.stream.write(&buf[..num]).map(|_| ()) 227 | } 228 | 229 | /// Finish encoding by writing the code tail word 230 | pub fn finish(mut self) -> (W, io::Result<()>) { 231 | debug_assert!(BORDER_BITS == 32); 232 | let code = self.range.get_code_tail(); 233 | let result = self.stream.write_u32::(code) 234 | .map_err(byteorder_err_to_io); 235 | let result = result.and(self.stream.flush()); 236 | (self.stream, result) 237 | } 238 | 239 | /// Flush the output stream 240 | pub fn flush(&mut self) -> io::Result<()> { 241 | self.stream.flush() 242 | } 243 | 244 | /// Return the number of bytes lost due to threshold cuts and integer operations 245 | #[cfg(tune)] 246 | pub fn get_bytes_lost(&self) -> (f32, f32) { 247 | let (a,b) = self.range.get_bits_lost(); 248 | (a/8.0, b/8.0) 249 | } 250 | } 251 | 252 | /// An arithmetic decoder helper 253 | pub struct Decoder { 254 | stream: R, 255 | range: RangeEncoder, 256 | code: Border, 257 | bytes_pending: usize, 258 | } 259 | 260 | impl Decoder { 261 | /// Create a decoder on top of a given Reader 262 | pub fn new(r: R) -> Decoder { 263 | Decoder { 264 | stream: r, 265 | range: RangeEncoder::new(RANGE_DEFAULT_THRESHOLD), 266 | code: 0, 267 | bytes_pending: BORDER_BYTES, 268 | } 269 | } 270 | 271 | fn feed(&mut self) -> io::Result<()> { 272 | while self.bytes_pending != 0 { 273 | let b = try!(self.stream.read_u8()); 274 | self.code = (self.code<<8) + (b as Border); 275 | self.bytes_pending -= 1; 276 | } 277 | Ok(()) 278 | } 279 | 280 | /// Decode an abstract value based on the given Model 281 | pub fn decode>(&mut self, model: &M) -> io::Result { 282 | self.feed().unwrap(); 283 | let (value, shift) = model.decode(self.code, &mut self.range); 284 | self.bytes_pending = shift; 285 | Ok(value) 286 | } 287 | 288 | /// Finish decoding 289 | pub fn finish(mut self) -> (R, io::Result<()>) { 290 | let err = self.feed(); 291 | (self.stream, 
err)
    }
}
--------------------------------------------------------------------------------
/src/entropy/ari/table.rs:
--------------------------------------------------------------------------------
/*!

Frequency table models for the arithmetic coder.
The module also implements Reader/Writer using simple byte coding.

# Links

# Example

# Credit

*/

use std::io::{self, Read, Write};
use super::Border;

pub type Frequency = u16;

/// A simple table of frequencies.
pub struct Model {
    /// sum of frequencies
    total: Border,
    /// main table: value -> Frequency
    /// NOTE(review): the element type was lost to markup stripping;
    /// reconstructed as Vec<Frequency> from get_frequencies() below
    table: Vec<Frequency>,
    /// maximum allowed sum of frequency,
    /// should be smaller than RangeEncoder::threshold
    cut_threshold: Border,
    /// number of bits to shift on cut
    cut_shift: usize,
}

impl Model {
    /// Create a new table with frequencies initialized by a function
    pub fn new_custom<F>(num_values: usize, threshold: Border,
            mut fn_init: F) -> Model
        where F: FnMut(usize) -> Frequency
    {
        let freq: Vec<Frequency> = (0..num_values).map(|i| fn_init(i)).collect();
        let total = freq.iter().fold(0 as Border, |u,&f| u+(f as Border));
        let mut ft = Model {
            total: total,
            table: freq,
            cut_threshold: threshold,
            cut_shift: 1,
        };
        // downscale if needed
        while ft.total >= threshold {
            ft.downscale();
        }
        ft
    }

    /// Create a new table with all frequencies being equal
    pub fn new_flat(num_values: usize, threshold: Border) -> Model {
        Model::new_custom(num_values, threshold, |_| 1)
    }

    /// Reset the table to the flat state
    pub fn reset_flat(&mut self) {
        for freq in self.table.iter_mut() {
            *freq = 1;
        }
        self.total = self.table.len() as Border;
    }

    /// Adapt the table in favor of given 'value'
    /// using 'add_log' and 'add_const' to produce the additive factor
    /// the higher 'add_log' is, the more conservative is the adaptation
    pub fn update(&mut self, value: usize, add_log: usize, add_const: Border) {
        let add = (self.total>>add_log) + add_const;
        assert!(add < 2*self.cut_threshold);
        // NOTE(review): `add` is a Border (u32) cast into a u16 Frequency;
        // this silently truncates if cut_threshold exceeds u16 range --
        // confirm cut_threshold is always kept below 1<<15
        debug!("\tUpdating by adding {} to value {}", add, value);
        self.table[value] += add as Frequency;
        self.total += add;
        if self.total >= self.cut_threshold {
            self.downscale();
            assert!(self.total < self.cut_threshold);
        }
    }

    /// Reduce frequencies by 'cut_shift' bits
    pub fn downscale(&mut self) {
        debug!("\tDownscaling frequencies");
        // NOTE(review): the middle of this function was lost to markup
        // stripping; reconstructed as a round-up shift so that no frequency
        // can drop to zero -- verify against the original source.
        let roundup = (1<<self.cut_shift) - 1;
        self.total = 0;
        for freq in self.table.iter_mut() {
            *freq = (*freq + roundup) >> self.cut_shift;
            self.total += *freq as Border;
        }
    }

    /// Return read-only frequencies slice
    pub fn get_frequencies<'a>(&'a self) -> &'a [Frequency] {
        &self.table[..]
    }
}

impl super::Model<usize> for Model {
    fn get_range(&self, value: usize) -> (Border,Border) {
        // lower border is the prefix sum of all frequencies before 'value'
        let lo = self.table[..value].iter().fold(0, |u,&f| u+(f as Border));
        (lo, lo + (self.table[value] as Border))
    }

    fn find_value(&self, offset: Border) -> (usize,Border,Border) {
        assert!(offset < self.total,
            "Invalid frequency offset {} requested under total {}",
            offset, self.total);
        // linear scan of cumulative frequencies until 'offset' falls inside
        let mut value = 0;
        let mut lo = 0 as Border;
        let mut hi;
        while {hi=lo+(self.table[value] as Border); hi} <= offset {
            lo = hi;
            value += 1;
        }
        (value, lo, hi)
    }

    fn get_denominator(&self) -> Border {
        self.total
    }
}


/// A proxy model for the sum of two frequency tables
/// using equation: (wa * A + wb * B) >> ws
pub struct SumProxy<'a> {
    first: &'a Model,
    second: &'a Model,
    w_first: Border,
    w_second: Border,
    w_shift: Border,
}

impl<'a> SumProxy<'a> {
    /// Create a new instance of the table sum proxy
    pub fn
new(wa: Border, fa: &'a Model, wb: Border, fb: &'a Model, shift: Border) -> SumProxy<'a> {
        assert_eq!(fa.get_frequencies().len(), fb.get_frequencies().len());
        SumProxy {
            first: fa,
            second: fb,
            w_first: wa,
            w_second: wb,
            w_shift: shift,
        }
    }
}

impl<'a> super::Model<usize> for SumProxy<'a> {
    fn get_range(&self, value: usize) -> (Border,Border) {
        let (lo0, hi0) = self.first.get_range(value);
        let (lo1, hi1) = self.second.get_range(value);
        let (wa, wb, ws) = (self.w_first, self.w_second, self.w_shift as usize);
        // borders are the weighted cumulative sums, shifted down as a whole
        ((wa*lo0 + wb*lo1)>>ws, (wa*hi0 + wb*hi1)>>ws)
    }

    fn find_value(&self, offset: Border) -> (usize,Border,Border) {
        assert!(offset < self.get_denominator(),
            "Invalid frequency offset {} requested under total {}",
            offset, self.get_denominator());
        // FIX: the old loop computed `hi = lo + (..) >> shift`, which Rust
        // parses as `(lo + ..) >> shift` (shift binds looser than `+`), so
        // the already-shifted `lo` was shifted again on every iteration.
        // It also shifted each per-value sum separately, which disagrees
        // with get_range() (which shifts the cumulative sum). Accumulate
        // the un-shifted weighted sum and shift it as a whole, so the
        // borders returned here always match get_range().
        let ws = self.w_shift as usize;
        let mut value = 0;
        let mut sum = 0 as Border; // weighted cumulative frequency, pre-shift
        let mut lo = 0 as Border;
        let mut hi;
        loop {
            sum += self.w_first * (self.first.get_frequencies()[value] as Border) +
                self.w_second * (self.second.get_frequencies()[value] as Border);
            hi = sum >> ws;
            if hi > offset {
                break;
            }
            lo = hi;
            value += 1;
        }
        (value, lo, hi)
    }

    fn get_denominator(&self) -> Border {
        (self.w_first * self.first.get_denominator() +
            self.w_second * self.second.get_denominator()) >>
            (self.w_shift as usize)
    }
}


/// A basic byte-encoding arithmetic
/// uses a special terminator code to end the stream
pub struct ByteEncoder<W> {
    /// A lower level encoder
    pub encoder: super::Encoder<W>,
    /// A basic frequency table
    pub freq: Model,
}

impl<W: Write> ByteEncoder<W> {
    /// Create a new encoder on top of a given Writer
    pub fn new(w: W) -> ByteEncoder<W> {
        let freq_max = super::RANGE_DEFAULT_THRESHOLD >> 2;
        ByteEncoder {
            encoder: super::Encoder::new(w),
            // one entry per byte value, plus one for the terminator symbol
            freq: Model::new_flat(super::SYMBOL_TOTAL+1, freq_max),
        }
    }

    /// Finish encoding & write the terminator symbol
    pub fn finish(mut self) -> (W, io::Result<()>) {
        let ret = self.encoder.encode(super::SYMBOL_TOTAL, &self.freq);
        let (w,r2) = self.encoder.finish();
        (w, ret.and(r2))
    }
}

impl<W: Write> Write for ByteEncoder<W> {
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        for byte in buf.iter() {
            let value = *byte as usize;
            try!(self.encoder.encode(value, &self.freq));
            // adapt the table towards the byte just seen
            self.freq.update(value, 10, 1);
        }

        Ok(buf.len())
    }

    fn flush(&mut self) -> io::Result<()> {
        self.encoder.flush()
    }
}


/// A basic byte-decoding arithmetic
/// expects a special terminator code for the end of the stream
pub struct ByteDecoder<R> {
    /// A lower level decoder
    pub decoder: super::Decoder<R>,
    /// A basic frequency table
    pub freq: Model,
    /// Remember if we found the terminator code
    is_eof: bool,
}

impl<R: Read> ByteDecoder<R> {
    /// Create a decoder on top of a given Reader
    pub fn new(r: R) -> ByteDecoder<R> {
        let freq_max = super::RANGE_DEFAULT_THRESHOLD >> 2;
        ByteDecoder {
            decoder: super::Decoder::new(r),
            freq: Model::new_flat(super::SYMBOL_TOTAL+1, freq_max),
            is_eof: false,
        }
    }

    /// Finish decoding
    pub fn finish(self) -> (R, io::Result<()>) {
        self.decoder.finish()
    }
}

impl<R: Read> Read for ByteDecoder<R> {
    fn read(&mut self, dst: &mut [u8]) -> io::Result<usize> {
        if self.is_eof {
            return Ok(0)
        }
        let mut amount = 0;
        for out_byte in dst.iter_mut() {
            let value = try!(self.decoder.decode(&self.freq));
            if value == super::SYMBOL_TOTAL {
                // terminator symbol: no payload bytes follow
                self.is_eof = true;
                break
            }
            self.freq.update(value, 10, 1);
            *out_byte = value as u8;
            amount += 1;
        }
271 | Ok(amount) 272 | } 273 | } 274 | -------------------------------------------------------------------------------- /src/entropy/ari/test.rs: -------------------------------------------------------------------------------- 1 | use std::io::{BufReader, BufWriter, Write, Read}; 2 | use std::vec::Vec; 3 | #[cfg(feature="unstable")] 4 | use test::Bencher; 5 | 6 | static TEXT_INPUT: &'static [u8] = include_bytes!("../../data/test.txt"); 7 | 8 | fn roundtrip(bytes: &[u8]) { 9 | info!("Roundtrip Ari of size {}", bytes.len()); 10 | let mut e = super::table::ByteEncoder::new(BufWriter::new(Vec::new())); 11 | e.write(bytes).unwrap(); 12 | let (e, r) = e.finish(); 13 | r.unwrap(); 14 | let encoded = e.into_inner().unwrap(); 15 | debug!("Roundtrip input {:?} encoded {:?}", bytes, encoded); 16 | let mut d = super::ByteDecoder::new(BufReader::new(&encoded[..])); 17 | let mut decoded = Vec::new(); 18 | d.read_to_end(&mut decoded).unwrap(); 19 | assert_eq!(&bytes[..], &decoded[..]); 20 | } 21 | 22 | fn encode_binary(bytes: &[u8], model: &mut super::bin::Model) -> Vec { 23 | let mut encoder = super::Encoder::new(BufWriter::new(Vec::new())); 24 | for &byte in bytes.iter() { 25 | for i in 0..8 { 26 | let bit = (byte & (1<> 3, factor); 38 | let output = encode_binary(bytes, &mut bm); 39 | bm.reset_flat(); 40 | let mut decoder = super::Decoder::new(BufReader::new(&output[..])); 41 | for &byte in bytes.iter() { 42 | let mut value = 0u8; 43 | for i in 0..8 { 44 | let bit = decoder.decode(&bm).unwrap(); 45 | bm.update(bit); 46 | value += (bit as u8)<> 3; 96 | let mut t0 = super::table::Model::new_flat(16, threshold); 97 | let mut t1 = super::table::Model::new_flat(16, threshold); 98 | let mut b0 = super::bin::Model::new_flat(threshold, 3); 99 | let mut b1 = super::bin::Model::new_flat(threshold, 5); 100 | // encode (high 4 bits with the proxy table, low 4 bits with the proxy binary) 101 | let mut encoder = super::Encoder::new(BufWriter::new(Vec::new())); 102 | for &byte in 
bytes.iter() { 103 | let high = (byte>>4) as usize; 104 | { 105 | let proxy = super::table::SumProxy::new(2, &t0, 1, &t1, 0); 106 | encoder.encode(high, &proxy).unwrap(); 107 | } 108 | t0.update(high, update0, 1); 109 | t1.update(high, update1, 1); 110 | for i in 0..4 { 111 | let bit = (byte & (1<>i) & 1 != 0; 157 | let (bit_new, coords) = gate.pass(&bit); 158 | encoder.encode(b1, &bit_new).unwrap(); 159 | bit.update(b1, 10, 0); 160 | gate.update(b1, coords, 10, 0); 161 | } 162 | } 163 | let (writer, err) = encoder.finish(); 164 | err.unwrap(); 165 | let output = writer.into_inner().unwrap(); 166 | bit = super::apm::Bit::new_equal(); 167 | gate = super::apm::Gate::new(); 168 | let mut decoder = super::Decoder::new(BufReader::new(&output[..])); 169 | for b8 in bytes.iter() { 170 | let mut decoded = 0u8; 171 | for i in 0..8 { 172 | let (bit_new, coords) = gate.pass(&bit); 173 | let b1 = decoder.decode(&bit_new).unwrap(); 174 | if b1 { 175 | decoded += 1< = vec![0u8; TEXT_INPUT.len()]; 219 | bh.iter(|| { 220 | let cursor = Cursor::new(&mut storage[..]); 221 | let mut w = BufWriter::new(cursor); 222 | w.seek(SeekFrom::Start(0)).unwrap(); 223 | let mut e = super::ByteEncoder::new(w); 224 | e.write(TEXT_INPUT).unwrap(); 225 | }); 226 | bh.bytes = TEXT_INPUT.len() as u64; 227 | } 228 | -------------------------------------------------------------------------------- /src/flate.rs: -------------------------------------------------------------------------------- 1 | //! DEFLATE Compression and Decompression. Requires `flate` feature, enabled by default 2 | //! 3 | //! This module contains an implementation of the DEFLATE compression scheme. 4 | //! This format is often used as the underpinning of other compression formats. 5 | //! 6 | //! # Example 7 | //! 8 | //! ```rust,ignore 9 | //! use compress::flate; 10 | //! use std::fs::File; 11 | //! use std::path::Path; 12 | //! use std::io::Read; 13 | //! 14 | //! 
let stream = File::open(&Path::new("path/to/file.flate")).unwrap(); 15 | //! let mut decompressed = Vec::new(); 16 | //! flate::Decoder::new(stream).read_to_end(&mut decompressed); 17 | //! ``` 18 | //! 19 | //! # Related links 20 | //! 21 | //! * http://tools.ietf.org/html/rfc1951 - RFC that this implementation is based 22 | //! on 23 | //! * http://www.gzip.org/zlib/rfc-deflate.html - simplified version of RFC 1951 24 | //! used as a reference 25 | //! * http://svn.ghostscript.com/ghostscript/trunk/gs/zlib/contrib/puff/puff.c - 26 | //! Much of this code is based on the puff.c implementation found here 27 | 28 | use std::cmp; 29 | use std::ptr::copy_nonoverlapping; 30 | use std::io::{self, Read}; 31 | use std::vec::Vec; 32 | 33 | use super::byteorder::{LittleEndian, ReadBytesExt}; 34 | use super::ReadExact; 35 | 36 | const MAXBITS: usize = 15; 37 | const MAXLCODES: u16 = 286; 38 | const MAXDCODES: u16 = 30; 39 | const MAXCODES: u16 = MAXLCODES + MAXDCODES; 40 | const HISTORY: usize = 32 * 1024; 41 | 42 | enum Error { 43 | HuffmanTreeTooLarge, 44 | InvalidBlockCode, 45 | InvalidHuffmanHeaderSymbol, 46 | InvalidHuffmanTree, 47 | InvalidHuffmanTreeHeader, 48 | InvalidHuffmanCode, 49 | InvalidStaticSize, 50 | NotEnoughBits, 51 | } 52 | 53 | fn error(e: Error) -> io::Result { 54 | Err(io::Error::new( 55 | io::ErrorKind::InvalidInput, 56 | match e { 57 | Error::HuffmanTreeTooLarge => "huffman tree too large", 58 | Error::InvalidBlockCode => "invalid block code", 59 | Error::InvalidHuffmanHeaderSymbol => "invalid huffman header symbol", 60 | Error::InvalidHuffmanTree => "invalid huffman tree", 61 | Error::InvalidHuffmanTreeHeader => "invalid huffman tree header", 62 | Error::InvalidHuffmanCode => "invalid huffman code", 63 | Error::InvalidStaticSize => "invalid static size", 64 | Error::NotEnoughBits => "not enough bits", 65 | } 66 | )) 67 | } 68 | 69 | struct HuffmanTree { 70 | /// An array which counts the number of codes which can be found at the 71 | /// index's bit 
length, or count[n] is the number of n-bit codes 72 | pub count: [u16; MAXBITS + 1], 73 | 74 | /// Symbols in this huffman tree in sorted order. This preserves the 75 | /// original huffman codes 76 | pub symbol: [u16; MAXCODES as usize], 77 | } 78 | 79 | impl HuffmanTree { 80 | /// Constructs a new huffman tree for decoding. If the given array has 81 | /// length N, then the huffman tree can be used to decode N symbols. Each 82 | /// entry in the array corresponds to the length of the nth symbol. 83 | fn construct(lens: &[u16]) -> io::Result { 84 | let mut tree = HuffmanTree { 85 | count: [0; MAXBITS + 1], 86 | symbol: [0; MAXCODES as usize], 87 | }; 88 | // Collect the lengths of all symbols 89 | for len in lens.iter() { 90 | tree.count[*len as usize] += 1; 91 | } 92 | // If there weren't actually any codes, then we're done 93 | if tree.count[0] as usize == lens.len() { return Ok(tree) } 94 | 95 | // Make sure that this tree is sane. Each bit gives us 2x more codes to 96 | // work with, but if the counts add up to greater than the available 97 | // amount, then this is an invalid table. 98 | let mut left = 1; 99 | for i in 1..(MAXBITS + 1) { 100 | left *= 2; 101 | left -= tree.count[i] as isize; 102 | if left < 0 { return error(Error::InvalidHuffmanTree) } 103 | } 104 | 105 | // Generate the offset of each length into the 'symbol' array 106 | let mut offs = [0; MAXBITS + 1]; 107 | for i in 1..MAXBITS { 108 | offs[i + 1] = offs[i] + tree.count[i]; 109 | } 110 | 111 | // Insert all symbols into the table, in sorted order using the `offs` 112 | // array generated above. 113 | for (sym, &len) in lens.iter().enumerate() { 114 | if len != 0 { 115 | tree.symbol[offs[len as usize] as usize] = sym as u16; 116 | offs[len as usize] += 1; 117 | } 118 | } 119 | return Ok(tree); 120 | } 121 | 122 | /// Decodes a codepoint from the buffer. 123 | /// 124 | /// This operates by reading bits as long as the code isn't found within the 125 | /// valid range of the codes itself. 
Remember the codepoints are all encoded 126 | /// by a sequence of lengths. The codepoint being decoded needs to figure 127 | /// out what lengths it's between, and then within that range we can index 128 | /// into the whole symbol array to pluck out the right symbol. 129 | fn decode(&self, s: &mut Decoder) -> io::Result { 130 | // this could be a lot faster. 131 | let mut code = 0; 132 | let mut first = 0; 133 | let mut index = 0; 134 | for len in 1..(MAXBITS + 1) { 135 | code |= try!(s.bits(1)); 136 | let count = self.count[len]; 137 | if code < first + count { 138 | return Ok(self.symbol[(index + (code - first)) as usize]) 139 | } 140 | index += count; 141 | first += count; 142 | first <<= 1; 143 | code <<= 1; 144 | } 145 | return error(Error::NotEnoughBits); 146 | } 147 | } 148 | 149 | #[cfg(genflate)] 150 | fn main() { 151 | static FIXLCODES: usize = 388; 152 | let mut arr = [0; FIXLCODES]; 153 | for i in 0..144 { arr[i] = 8; } 154 | for i in 144..256 { arr[i] = 9; } 155 | for i in 256..280 { arr[i] = 7; } 156 | for i in 280..288 { arr[i] = 8; } 157 | println!("{:?}", HuffmanTree::construct(arr[..FIXLCODES])); 158 | for i in 0..MAXDCODES { arr[i] = 5; } 159 | println!("{:?}", HuffmanTree::construct(arr[..MAXDCODES])); 160 | } 161 | 162 | /// The structure that is used to decode an LZ4 data stream. This wraps an 163 | /// internal reader which is used as the source of all data. 164 | pub struct Decoder { 165 | /// Wrapped reader which is exposed to allow getting it back. 
    pub r: R,

    // NOTE(review): generic parameters and Vec element types in this struct
    // and its impl were lost to markup stripping; reconstructed as
    // Decoder<R>/Vec<u8> from their uses -- verify against the original.
    // Sliding window of the most recent HISTORY output bytes,
    // used to resolve back-references
    output: Vec<u8>,
    outpos: usize,

    // bytes decoded for the current block
    block: Vec<u8>,
    pos: usize,

    // bit buffer: bytes are appended at the top, bits consumed from the bottom
    bitbuf: usize,
    bitcnt: usize,
    eof: bool,
}

impl<R: Read> Decoder<R> {
    /// Creates a new flate decoder which will read data from the specified
    /// source
    pub fn new(r: R) -> Decoder<R> {
        Decoder {
            r: r,
            output: Vec::with_capacity(HISTORY),
            outpos: 0,
            block: Vec::new(),
            pos: 0,
            bitbuf: 0,
            bitcnt: 0,
            eof: false,
        }
    }

    /// Decode one DEFLATE block: 1 bit BFINAL, 2 bits BTYPE
    /// (0 = stored, 1 = fixed huffman, 2 = dynamic huffman)
    fn block(&mut self) -> io::Result<()> {
        self.pos = 0;
        self.block = Vec::with_capacity(4096);
        if try!(self.bits(1)) == 1 { self.eof = true; }
        match try!(self.bits(2)) {
            0 => self.statik(),
            1 => self.fixed(),
            2 => self.dynamic(),
            3 => error(Error::InvalidBlockCode),
            _ => unreachable!(),
        }
    }

    /// Append block bytes starting at 'from' into the circular HISTORY
    /// window, keeping at most the last HISTORY bytes
    fn update_output(&mut self, mut from: usize) {
        let to = self.block.len();
        if to - from > HISTORY {
            from = to - HISTORY;
        }
        let amt = to - from;
        let remaining = HISTORY - self.outpos;
        let n = cmp::min(amt, remaining);
        if self.output.len() < HISTORY {
            // window not yet full: grow it instead of overwriting
            self.output.extend(self.block[from..(from + n)].iter().map(|b| *b));
        } else if n > 0 {
            assert_eq!(self.output.len(), HISTORY);
            unsafe { copy_nonoverlapping(
                &self.block[from],
                &mut self.output[self.outpos],
                n
            )};
        }
        self.outpos += n;
        if n < amt {
            // wrap around to the start of the circular window
            unsafe { copy_nonoverlapping(
                &self.block[from+n],
                &mut self.output[0],
                amt - n
            )};
            self.outpos = amt - n;
        }
    }

    /// A stored (uncompressed) block: LEN, its one's complement NLEN,
    /// then LEN raw bytes
    fn statik(&mut self) -> io::Result<()> {
        let len = try!(self.r.read_u16::<LittleEndian>());
        let nlen = try!(self.r.read_u16::<LittleEndian>());
        if !nlen != len { return error(Error::InvalidStaticSize) }
        try!(self.r.push_exactly(len as u64, &mut self.block));
        self.update_output(0);
        // discard the byte-alignment padding; after the 3-bit block header
        // at most 7 bits remain buffered, so reading LEN directly from the
        // reader above did not skip any whole bytes
        self.bitcnt = 0;
        self.bitbuf = 0;
        Ok(())
    }

    // Bytes in the stream are LSB first, so the bitbuf is appended to from the
    // left and consumed from the right.
    fn bits(&mut self, cnt: usize) -> io::Result<u16> {
        while self.bitcnt < cnt {
            let byte = try!(self.r.read_u8());
            self.bitbuf |= (byte as usize) << self.bitcnt;
            self.bitcnt += 8;
        }
        let ret = self.bitbuf & ((1 << cnt) - 1);
        self.bitbuf >>= cnt;
        self.bitcnt -= cnt;
        return Ok(ret as u16);
    }

    /// Decode the compressed payload of a block with the given
    /// length/literal and distance trees
    fn codes(&mut self, lens: &HuffmanTree,
             dist: &HuffmanTree) -> io::Result<()> {
        // extra base length for codes 257-285
        static EXTRALENS: [u16; 29] = [
            3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, 35, 43, 51,
            59, 67, 83, 99, 115, 131, 163, 195, 227, 258
        ];
        // extra bits to read for codes 257-285
        static EXTRABITS: [u16; 29] = [
            0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,
            4, 5, 5, 5, 5, 0,
        ];
        // base offset for distance codes.
275 | static EXTRADIST: [u16; 30] = [ 276 | 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, 257, 385, 277 | 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385, 278 | 24577, 279 | ]; 280 | // number of bits to read for distance codes (to add to the offset) 281 | static EXTRADBITS: [u16; 30] = [ 282 | 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 283 | 10, 10, 11, 11, 12, 12, 13, 13, 284 | ]; 285 | let mut last_updated = 0; 286 | loop { 287 | let sym = try!(lens.decode(self)); 288 | match sym { 289 | n if n < 256 => { self.block.push(sym as u8); } 290 | 256 => break, 291 | n if n < 290 => { 292 | // figure out len/dist that we're working with 293 | let n = n - 257; 294 | if n as usize > EXTRALENS.len() { 295 | return error(Error::InvalidHuffmanCode) 296 | } 297 | let len = EXTRALENS[n as usize] + 298 | try!(self.bits(EXTRABITS[n as usize] as usize)); 299 | 300 | let len = len as usize; 301 | 302 | let dist = try!(dist.decode(self)) as usize; 303 | let dist = EXTRADIST[dist] + 304 | try!(self.bits(EXTRADBITS[dist] as usize)); 305 | let dist = dist as usize; 306 | 307 | // update the output buffer with any data we haven't pushed 308 | // into it yet 309 | if last_updated != self.block.len() { 310 | self.update_output(last_updated); 311 | last_updated = self.block.len(); 312 | } 313 | 314 | if dist > self.output.len() { 315 | return error(Error::InvalidHuffmanCode) 316 | } 317 | 318 | // Perform the copy 319 | self.block.reserve(dist); 320 | let mut finger = if self.outpos >= dist { 321 | self.outpos - dist 322 | } else { 323 | HISTORY - (dist - self.outpos) 324 | }; 325 | let min = cmp::min(dist, len); 326 | let start = self.block.len(); 327 | for _ in 0..min { 328 | self.block.push(self.output[finger]); 329 | finger = (finger + 1) % HISTORY; 330 | } 331 | for i in min..len { 332 | let b = self.block[start + i - min]; 333 | self.block.push(b); 334 | } 335 | } 336 | _ => return error(Error::InvalidHuffmanCode) 337 | } 338 | } 
339 | self.update_output(last_updated); 340 | Ok(()) 341 | } 342 | 343 | fn fixed(&mut self) -> io::Result<()> { 344 | // Generated by the main function above 345 | static LEN: HuffmanTree = HuffmanTree { 346 | count: [100, 0, 0, 0, 0, 0, 0, 24, 152, 112, 0, 0, 0, 0, 0, 0], 347 | symbol: [ 348 | 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 349 | 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 0, 1, 2, 350 | 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 351 | 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 352 | 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 353 | 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 354 | 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 355 | 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 356 | 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 357 | 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 358 | 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 359 | 140, 141, 142, 143, 280, 281, 282, 283, 284, 285, 286, 287, 144, 360 | 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 361 | 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 362 | 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 363 | 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 364 | 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 365 | 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 366 | 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 367 | 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 368 | 249, 250, 251, 252, 253, 254, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 369 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 370 | ] 371 | }; 372 | static DIST: HuffmanTree = HuffmanTree { 373 | count: [0, 0, 0, 0, 0, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 374 | symbol: [ 375 | 0, 1, 2, 3, 
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 376 | 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 0, 0, 0, 0, 0, 377 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 378 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 379 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 380 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 381 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 382 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 383 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 384 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 385 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 386 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 387 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 388 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 389 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 390 | 0, 0, 0, 0, 0, 0, 0, 0 391 | ] 392 | }; 393 | 394 | self.codes(&LEN, &DIST) 395 | } 396 | 397 | fn dynamic(&mut self) -> io::Result<()> { 398 | let hlit = try!(self.bits(5)) + 257; // number of length codes 399 | let hdist = try!(self.bits(5)) + 1; // number of distance codes 400 | let hclen = try!(self.bits(4)) + 4; // number of code length codes 401 | if hlit > MAXLCODES || hdist > MAXDCODES { 402 | return error(Error::HuffmanTreeTooLarge); 403 | } 404 | 405 | // Read off the code length codes, and then build the huffman tree which 406 | // is then used to decode the actual huffman tree for the rest of the 407 | // data. 
408 | static ORDER: [usize; 19] = [ 409 | 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15, 410 | ]; 411 | let mut lengths = [0; 19]; 412 | for i in 0..(hclen as usize) { 413 | lengths[ORDER[i]] = try!(self.bits(3)); 414 | } 415 | let tree = try!(HuffmanTree::construct(&lengths)); 416 | 417 | // Decode all of the length and distance codes in one go, we'll 418 | // partition them into two huffman trees later 419 | let mut lengths = [0; MAXCODES as usize]; 420 | let mut i = 0; 421 | while i < hlit + hdist { 422 | let symbol = try!(tree.decode(self)); 423 | match symbol { 424 | n if n < 16 => { 425 | lengths[i as usize] = symbol; 426 | i += 1; 427 | } 428 | 16 if i == 0 => return error(Error::InvalidHuffmanHeaderSymbol), 429 | 16 => { 430 | let prev = lengths[i as usize - 1]; 431 | for _ in 0..(try!(self.bits(2)) + 3) { 432 | lengths[i as usize] = prev; 433 | i += 1; 434 | } 435 | } 436 | // all codes start out as 0, so these just skip 437 | 17 => { i += try!(self.bits(3)) + 3; } 438 | 18 => { i += try!(self.bits(7)) + 11; } 439 | _ => return error(Error::InvalidHuffmanHeaderSymbol), 440 | } 441 | } 442 | if i > hlit + hdist { return error(Error::InvalidHuffmanTreeHeader) } 443 | 444 | // Use the decoded codes to construct yet another huffman tree 445 | let arr = &lengths[..(hlit as usize)]; 446 | let lencode = try!(HuffmanTree::construct(arr)); 447 | let arr = &lengths[(hlit as usize)..((hlit + hdist) as usize)]; 448 | let distcode = try!(HuffmanTree::construct(arr)); 449 | self.codes(&lencode, &distcode) 450 | } 451 | 452 | /// Returns whether this deflate stream has reached the EOF marker 453 | pub fn eof(&self) -> bool { 454 | self.eof && self.pos == self.block.len() 455 | } 456 | 457 | /// Resets this flate decoder. Note that this could corrupt an in-progress 458 | /// decoding of a stream. 
459 | pub fn reset(&mut self) { 460 | self.bitbuf = 0; 461 | self.bitcnt = 0; 462 | self.eof = false; 463 | self.block = Vec::new(); 464 | self.pos = 0; 465 | } 466 | } 467 | 468 | impl Read for Decoder { 469 | fn read(&mut self, buf: &mut [u8]) -> io::Result { 470 | if self.pos == self.block.len() { 471 | if self.eof { return Ok(0) } 472 | try!(self.block()); 473 | } 474 | let n = cmp::min(buf.len(), self.block.len() - self.pos); 475 | match n { 476 | 0 => Ok(0), 477 | _ => { 478 | unsafe { copy_nonoverlapping( 479 | &self.block[self.pos], 480 | &mut buf[0], 481 | n 482 | )}; 483 | self.pos += n; 484 | Ok(n) 485 | } 486 | } 487 | } 488 | } 489 | 490 | #[cfg(test)] 491 | #[allow(warnings)] 492 | mod test { 493 | use std::io::{BufReader, BufWriter, Read, Write}; 494 | use super::super::rand::{random}; 495 | use super::super::byteorder::{LittleEndian, BigEndian, WriteBytesExt, ReadBytesExt}; 496 | use std::str; 497 | use super::{Decoder}; 498 | #[cfg(feature="unstable")] 499 | use test; 500 | 501 | // The input data for these tests were all generated from the zpipe.c 502 | // program found at http://www.zlib.net/zpipe.c and the zlib format has an 503 | // extra 2 bytes of header with an 4-byte checksum at the end. 
504 | fn fixup<'a>(s: &'a [u8]) -> &'a [u8] { 505 | &s[2..(s.len() - 4)] 506 | } 507 | 508 | fn test_decode(input: &[u8], output: &[u8]) { 509 | let mut d = Decoder::new(BufReader::new(fixup(input))); 510 | let mut buf = Vec::new(); 511 | d.read_to_end(&mut buf).unwrap(); 512 | 513 | assert_eq!(output.len(), buf.len()); 514 | let i = buf.iter().zip(output.iter()).position(|(a, b)| a != b); 515 | assert!(buf == output); 516 | } 517 | 518 | fn test_decode_pure(input: &[u8], output: &[u8]) { 519 | let mut d = Decoder::new(BufReader::new(input)); 520 | let mut buf = Vec::new(); 521 | d.read_to_end(&mut buf).unwrap(); 522 | 523 | assert_eq!(output.len(), buf.len()); 524 | let i = buf.iter().zip(output.iter()).position(|(a, b)| a != b); 525 | assert!(buf == output); 526 | } 527 | 528 | #[test] 529 | fn decode() { 530 | let reference = include_bytes!("data/test.txt"); 531 | test_decode(include_bytes!("data/test.z.0"), reference); 532 | test_decode(include_bytes!("data/test.z.1"), reference); 533 | test_decode(include_bytes!("data/test.z.2"), reference); 534 | test_decode(include_bytes!("data/test.z.3"), reference); 535 | test_decode(include_bytes!("data/test.z.4"), reference); 536 | test_decode(include_bytes!("data/test.z.5"), reference); 537 | test_decode(include_bytes!("data/test.z.6"), reference); 538 | test_decode(include_bytes!("data/test.z.7"), reference); 539 | test_decode(include_bytes!("data/test.z.8"), reference); 540 | test_decode(include_bytes!("data/test.z.9"), reference); 541 | test_decode_pure(include_bytes!("data/test.z.go"), reference); 542 | } 543 | 544 | #[test] 545 | fn large() { 546 | let reference = include_bytes!("data/test.large"); 547 | test_decode(include_bytes!("data/test.large.z.5"), reference); 548 | } 549 | 550 | #[test] 551 | fn one_byte_at_a_time() { 552 | let input = include_bytes!("data/test.z.1"); 553 | let mut d = Decoder::new(BufReader::new(fixup(input))); 554 | assert!(!d.eof()); 555 | let mut out = Vec::new(); 556 | loop { 557 | 
match d.read_u8() { 558 | Ok(b) => out.push(b), 559 | Err(..) => break 560 | } 561 | } 562 | 563 | assert!(d.eof()); 564 | assert!(&out[..] == &include_bytes!("data/test.txt")[..]); 565 | } 566 | 567 | #[test] 568 | fn random_byte_lengths() { 569 | let input = include_bytes!("data/test.z.1"); 570 | let mut d = Decoder::new(BufReader::new(fixup(input))); 571 | let mut out = Vec::new(); 572 | let mut buf = [0u8; 40]; 573 | loop { 574 | match d.read(&mut buf[..(1 + random::() % 40)]) { 575 | Err(..) | Ok(0) => break, 576 | Ok(n) => { 577 | out.extend(buf[..n].iter().map(|b| *b)); 578 | } 579 | } 580 | } 581 | assert!(&out[..] == &include_bytes!("data/test.txt")[..]); 582 | } 583 | 584 | //fn roundtrip(bytes: &[u8]) { 585 | // let mut e = Encoder::new(MemWriter::new()); 586 | // e.write(bytes); 587 | // let encoded = e.finish().unwrap(); 588 | // 589 | // let mut d = Decoder::new(BufReader::new(encoded)); 590 | // let decoded = d.read_to_end(); 591 | // assert_eq!(&decoded[..], bytes); 592 | //} 593 | // 594 | //#[test] 595 | //fn some_roundtrips() { 596 | // roundtrip(bytes!("test")); 597 | // roundtrip(bytes!("")); 598 | // roundtrip(include_bytes!("data/test.txt")); 599 | //} 600 | 601 | #[cfg(feature="unstable")] 602 | #[bench] 603 | fn decompress_speed(bh: &mut test::Bencher) { 604 | let input = include_bytes!("data/test.z.9"); 605 | let mut d = Decoder::new(BufReader::new(fixup(input))); 606 | let mut output = [0u8; 65536]; 607 | let mut output_size = 0; 608 | bh.iter(|| { 609 | d.r = BufReader::new(fixup(input)); 610 | d.reset(); 611 | output_size = d.read(&mut output).unwrap(); 612 | }); 613 | bh.bytes = output_size as u64; 614 | } 615 | } 616 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![deny(missing_docs)] 2 | #![allow(missing_copy_implementations)] 3 | #![allow(deprecated)] 4 | 5 | //! 
dox (placeholder) 6 | 7 | extern crate byteorder; 8 | extern crate rand; 9 | 10 | #[macro_use] 11 | extern crate log; 12 | 13 | #[cfg(test)] 14 | #[cfg(feature="unstable")] 15 | extern crate test; 16 | 17 | use std::io::{self, Read}; 18 | 19 | /// Public exports 20 | #[cfg(feature="checksum")] 21 | pub use self::checksum::adler::State32 as Adler32; 22 | 23 | #[cfg(feature="checksum")] 24 | /// Checksum algorithms. Requires `checksum` feature, enabled by default 25 | // http://en.wikipedia.org/wiki/Checksum 26 | pub mod checksum { 27 | pub mod adler; 28 | } 29 | 30 | #[cfg(feature="bwt")] 31 | pub mod bwt; 32 | 33 | #[cfg(feature="flate")] 34 | pub mod flate; 35 | 36 | #[cfg(feature="lz4")] 37 | pub mod lz4; 38 | 39 | #[cfg(feature="zlib")] 40 | pub mod zlib; 41 | 42 | /// Entropy coder family. Requires `entropy` feature, enabled by default 43 | // http://en.wikipedia.org/wiki/Entropy_encoding 44 | #[cfg(feature="entropy")] 45 | pub mod entropy { 46 | pub mod ari; 47 | } 48 | 49 | #[cfg(feature="rle")] 50 | pub mod rle; 51 | 52 | #[cfg(any(feature = "lz4", feature = "entropy", feature = "bwt"))] 53 | fn byteorder_err_to_io(err: io::Error) -> io::Error { 54 | match err { 55 | e if e.kind() == io::ErrorKind::UnexpectedEof => 56 | io::Error::new( 57 | io::ErrorKind::Other, 58 | "unexpected end of file" 59 | ), 60 | e => e, 61 | } 62 | } 63 | 64 | #[cfg(test)] 65 | mod test { 66 | use super::{io,byteorder_err_to_io}; 67 | #[cfg(feature="unstable")] 68 | use test; 69 | 70 | fn force_byteorder_eof_error()->io::Result{ 71 | use byteorder::{BigEndian,ReadBytesExt}; 72 | let mut rdr = io::Cursor::new(vec![1,2]); 73 | rdr.read_u64::() 74 | } 75 | 76 | #[test] 77 | fn byteorder_err_to_io_with_eof() { 78 | 79 | let err_from_byteorder = force_byteorder_eof_error().unwrap_err(); 80 | let err = byteorder_err_to_io(err_from_byteorder); 81 | 82 | let err_expected = io::Error::new( 83 | io::ErrorKind::Other, 84 | "unexpected end of file" 85 | ); 86 | 
assert_eq!(err.kind(),err_expected.kind()); 87 | } 88 | 89 | #[test] 90 | fn byteorder_err_to_io_with_not_eof() { 91 | 92 | // using closure here to produce 2x the same error, 93 | // as io::Error does not impl Copy trait 94 | let build_other_io_error = || io::Error::new( 95 | io::ErrorKind::NotFound, 96 | "some other io error" 97 | ); 98 | 99 | let err = byteorder_err_to_io(build_other_io_error()); 100 | let err_expected = build_other_io_error(); 101 | 102 | assert_eq!(err.kind(),err_expected.kind()); 103 | } 104 | } 105 | 106 | 107 | /// Adds a convenience method for types with the read trait, very similar 108 | /// to push_at_least in the late Reader trait 109 | pub trait ReadExact: Read + Sized { 110 | /// Appends exact number of bytes to a buffer 111 | fn push_exactly(&mut self, bytes: u64, buf: &mut Vec) -> io::Result<()> { 112 | let n = try!(self.by_ref().take(bytes).read_to_end(buf)) as u64; 113 | 114 | if n < bytes { 115 | return Err(io::Error::new( 116 | io::ErrorKind::Other, 117 | "unexpected end of file" 118 | )); 119 | } 120 | 121 | Ok(()) 122 | } 123 | } 124 | 125 | impl ReadExact for T where T: Read + Sized {} 126 | -------------------------------------------------------------------------------- /src/lz4.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | 3 | LZ4 Decompression and Compression. Requires `lz4` feature, enabled by default 4 | 5 | This module contains an implementation in Rust of decompression and compression 6 | of LZ4-encoded streams. These are exposed as a standard `Reader` and `Writer` 7 | interfaces wrapping an underlying stream. 
8 | 9 | # Example 10 | 11 | ```rust,ignore 12 | use compress::lz4; 13 | use std::fs::File; 14 | use std::path::Path; 15 | use std::io::Read; 16 | 17 | let stream = File::open(&Path::new("path/to/file.lz4")).unwrap(); 18 | let mut decompressed = Vec::new(); 19 | lz4::Decoder::new(stream).read_to_end(&mut decompressed); 20 | ``` 21 | 22 | # Credit 23 | 24 | This implementation is largely based on Branimir Karadžić's implementation which 25 | can be found at https://github.com/bkaradzic/go-lz4. 26 | 27 | */ 28 | 29 | use std::cmp; 30 | use std::ptr::copy_nonoverlapping; 31 | use std::io::{self, Read, Write}; 32 | use std::iter::repeat; 33 | use std::vec::Vec; 34 | use std::num::Wrapping; 35 | use std::ops::Shr; 36 | 37 | use super::byteorder::{LittleEndian, WriteBytesExt, ReadBytesExt}; 38 | use super::{ReadExact, byteorder_err_to_io}; 39 | 40 | const MAGIC: u32 = 0x184d2204; 41 | 42 | const ML_BITS: u32 = 4; 43 | const ML_MASK: u32 = (1 << ML_BITS as usize) - 1; 44 | const RUN_BITS: u32 = 8 - ML_BITS; 45 | const RUN_MASK: u32 = (1 << RUN_BITS as usize) - 1; 46 | 47 | const MIN_MATCH: u32 = 4; 48 | const HASH_LOG: u32 = 17; 49 | const HASH_TABLE_SIZE: u32 = 1 << (HASH_LOG as usize); 50 | const HASH_SHIFT: u32 = (MIN_MATCH * 8) - HASH_LOG; 51 | const INCOMPRESSIBLE: u32 = 128; 52 | const UNINITHASH: u32 = 0x88888888; 53 | const MAX_INPUT_SIZE: u32 = 0x7e000000; 54 | 55 | struct BlockDecoder<'a> { 56 | input: &'a [u8], 57 | output: &'a mut Vec, 58 | cur: usize, 59 | 60 | start: usize, 61 | end: usize, 62 | } 63 | 64 | impl<'a> BlockDecoder<'a> { 65 | /// Decodes this block of data from 'input' to 'output', returning the 66 | /// number of valid bytes in the output. 67 | fn decode(&mut self) -> usize { 68 | while self.cur < self.input.len() { 69 | let code = self.bump(); 70 | debug!("block with code: {:x}", code); 71 | // Extract a chunk of data from the input to the output. 
72 | { 73 | let len = self.length(code >> 4); 74 | debug!("consume len {}", len); 75 | if len > 0 { 76 | let end = self.end; 77 | self.grow_output(end + len); 78 | unsafe { copy_nonoverlapping( 79 | &self.input[self.cur], 80 | &mut self.output[end], 81 | len 82 | )}; 83 | self.end += len; 84 | self.cur += len; 85 | } 86 | } 87 | if self.cur == self.input.len() { break } 88 | 89 | // Read off the next i16 offset 90 | { 91 | let back = (self.bump() as usize) | ((self.bump() as usize) << 8); 92 | debug!("found back {}", back); 93 | self.start = self.end - back; 94 | } 95 | 96 | // Slosh around some bytes now 97 | { 98 | let mut len = self.length(code & 0xf); 99 | let literal = self.end - self.start; 100 | if literal < 4 { 101 | static DECR: [usize; 4] = [0, 3, 2, 3]; 102 | self.cp(4, DECR[literal]); 103 | } else { 104 | len += 4; 105 | } 106 | self.cp(len, 0); 107 | } 108 | } 109 | self.end 110 | } 111 | 112 | fn length(&mut self, code: u8) -> usize { 113 | let mut ret = code as usize; 114 | if code == 0xf { 115 | loop { 116 | let tmp = self.bump(); 117 | ret += tmp as usize; 118 | if tmp != 0xff { break } 119 | } 120 | } 121 | ret 122 | } 123 | 124 | fn bump(&mut self) -> u8 { 125 | let ret = self.input[self.cur]; 126 | self.cur += 1; 127 | ret 128 | } 129 | 130 | #[inline] 131 | fn cp(&mut self, len: usize, decr: usize) { 132 | let end = self.end; 133 | self.grow_output(end + len); 134 | for i in 0..len { 135 | self.output[end + i] = (*self.output)[self.start + i]; 136 | } 137 | 138 | self.end += len; 139 | self.start += len - decr; 140 | } 141 | 142 | // Extends the output vector to a target number of bytes (in total), but 143 | // does not actually initialize the new data. The length of the vector is 144 | // updated, but the bytes will all have undefined values. It is assumed that 145 | // the next operation is to pave over these bytes (so the initialization is 146 | // unnecessary). 
147 | #[inline] 148 | fn grow_output(&mut self, target: usize) { 149 | if self.output.capacity() < target { 150 | debug!("growing {} to {}", self.output.capacity(), target); 151 | //let additional = target - self.output.capacity(); 152 | //self.output.reserve(additional); 153 | while self.output.len() < target { 154 | self.output.push(0); 155 | } 156 | }else { 157 | unsafe { 158 | self.output.set_len(target); 159 | } 160 | } 161 | } 162 | } 163 | 164 | struct BlockEncoder<'a> { 165 | input: &'a [u8], 166 | output: &'a mut Vec, 167 | hash_table: Vec, 168 | pos: u32, 169 | anchor: u32, 170 | dest_pos: u32 171 | } 172 | 173 | /// Returns maximum possible size of compressed output 174 | /// given source size 175 | pub fn compression_bound(size: u32) -> Option { 176 | if size > MAX_INPUT_SIZE { 177 | None 178 | } else { 179 | Some(size + (size / 255) + 16 + 4) 180 | } 181 | } 182 | 183 | impl<'a> BlockEncoder<'a> { 184 | #[inline(always)] 185 | fn seq_at(&self, pos: u32) -> u32 { 186 | (self.input[pos as usize + 3] as u32) << 24 187 | | (self.input[pos as usize + 2] as u32) << 16 188 | | (self.input[pos as usize + 1] as u32) << 8 189 | | (self.input[pos as usize] as u32) 190 | } 191 | 192 | fn write_literals(&mut self, len: u32, ml_len: u32, pos: u32) { 193 | let mut ln = len; 194 | 195 | let code = if ln > RUN_MASK - 1 { RUN_MASK as u8 } else { ln as u8 }; 196 | 197 | if ml_len > ML_MASK - 1 { 198 | self.output[self.dest_pos as usize] = (code << ML_BITS as usize) + ML_MASK as u8; 199 | } else { 200 | self.output[self.dest_pos as usize] = (code << ML_BITS as usize) + ml_len as u8; 201 | } 202 | 203 | self.dest_pos += 1; 204 | 205 | if code == RUN_MASK as u8 { 206 | ln -= RUN_MASK; 207 | while ln > 254 { 208 | self.output[self.dest_pos as usize] = 255; 209 | self.dest_pos += 1; 210 | ln -= 255; 211 | } 212 | 213 | self.output[self.dest_pos as usize] = ln as u8; 214 | self.dest_pos += 1; 215 | } 216 | 217 | // FIXME: find out why slicing syntax fails tests 218 | 
//self.output[self.dest_pos as usize .. (self.dest_pos + len) as usize] = self.input[pos as uint.. (pos + len) as uint]; 219 | for i in 0..(len as usize) { 220 | self.output[self.dest_pos as usize + i] = self.input[pos as usize + i]; 221 | } 222 | 223 | self.dest_pos += len; 224 | } 225 | 226 | fn encode(&mut self) -> u32 { 227 | let input_len = self.input.len() as u32; 228 | 229 | match compression_bound(input_len) { 230 | None => 0, 231 | Some(out_size) => { 232 | let out_size_usize = out_size as usize; 233 | if self.output.capacity() < out_size_usize { 234 | let additional = out_size_usize - self.output.capacity(); 235 | self.output.reserve(additional); 236 | } 237 | unsafe {self.output.set_len(out_size_usize); } 238 | 239 | let mut step = 1u32; 240 | let mut limit = INCOMPRESSIBLE; 241 | 242 | loop { 243 | if self.pos + 12 > input_len { 244 | let tmp = self.anchor; 245 | self.write_literals(self.input.len() as u32 - tmp, 0, tmp); 246 | unsafe { self.output.set_len(self.dest_pos as usize) }; 247 | return self.dest_pos; 248 | } 249 | 250 | let seq = self.seq_at(self.pos); 251 | let hash = (Wrapping(seq) * Wrapping(2654435761)).shr(HASH_SHIFT as usize).0; 252 | let mut r = (Wrapping(self.hash_table[hash as usize]) + Wrapping(UNINITHASH)).0; 253 | self.hash_table[hash as usize] = (Wrapping(self.pos) - Wrapping(UNINITHASH)).0; 254 | 255 | if (Wrapping(self.pos) - Wrapping(r)).shr(16).0 != 0 || seq != self.seq_at(r) { 256 | if self.pos - self.anchor > limit { 257 | limit = limit << 1; 258 | step += 1 + (step >> 2); 259 | } 260 | self.pos += step; 261 | continue; 262 | } 263 | 264 | if step > 1 { 265 | self.hash_table[hash as usize] = r - UNINITHASH; 266 | self.pos -= step - 1; 267 | step = 1; 268 | continue; 269 | } 270 | 271 | limit = INCOMPRESSIBLE; 272 | 273 | let ln = self.pos - self.anchor; 274 | let back = self.pos - r; 275 | let anchor = self.anchor; 276 | 277 | self.pos += MIN_MATCH; 278 | r += MIN_MATCH; 279 | self.anchor = self.pos; 280 | 281 | while 
(self.pos < input_len - 5) && self.input[self.pos as usize] == self.input[r as usize] { 282 | self.pos += 1; 283 | r += 1 284 | } 285 | 286 | let mut ml_len = self.pos - self.anchor; 287 | 288 | self.write_literals(ln, ml_len, anchor); 289 | self.output[self.dest_pos as usize] = back as u8; 290 | self.output[self.dest_pos as usize + 1] = (back >> 8) as u8; 291 | self.dest_pos += 2; 292 | 293 | if ml_len > ML_MASK - 1 { 294 | ml_len -= ML_MASK; 295 | while ml_len > 254 { 296 | ml_len -= 255; 297 | 298 | self.output[self.dest_pos as usize] = 255; 299 | self.dest_pos += 1; 300 | } 301 | 302 | self.output[self.dest_pos as usize] = ml_len as u8; 303 | self.dest_pos += 1; 304 | } 305 | 306 | self.anchor = self.pos; 307 | } 308 | } 309 | } 310 | } 311 | } 312 | 313 | /// This structure is used to decode a stream of LZ4 blocks. This wraps an 314 | /// internal reader which is read from when this decoder's read method is 315 | /// called. 316 | pub struct Decoder { 317 | /// The internally wrapped reader. This is exposed so it may be moved out 318 | /// of. Note that if data is read from the reader while decoding is in 319 | /// progress the output stream will get corrupted. 320 | pub r: R, 321 | 322 | temp: Vec, 323 | output: Vec, 324 | 325 | start: usize, 326 | end: usize, 327 | eof: bool, 328 | 329 | header: bool, 330 | blk_checksum: bool, 331 | stream_checksum: bool, 332 | max_block_size: usize, 333 | } 334 | 335 | impl Decoder { 336 | /// Creates a new decoder which will read data from the given stream. The 337 | /// inner stream can be re-acquired by moving out of the `r` field of this 338 | /// structure. 339 | pub fn new(r: R) -> Decoder { 340 | Decoder { 341 | r: r, 342 | temp: Vec::new(), 343 | output: Vec::new(), 344 | header: false, 345 | blk_checksum: false, 346 | stream_checksum: false, 347 | start: 0, 348 | end: 0, 349 | eof: false, 350 | max_block_size: 0, 351 | } 352 | } 353 | 354 | /// Resets this decoder back to its initial state. 
Note that the underlying 355 | /// stream is not seeked on or has any alterations performed on it. 356 | pub fn reset(&mut self) { 357 | self.header = false; 358 | self.eof = false; 359 | self.start = 0; 360 | self.end = 0; 361 | } 362 | 363 | fn read_header(&mut self) -> io::Result<()> { 364 | // Make sure the magic number is what's expected. 365 | if try!(self.r.read_u32::()) != MAGIC { 366 | return Err(io::Error::new(io::ErrorKind::InvalidInput, "")) 367 | } 368 | 369 | let mut bits = [0; 3]; 370 | try!(self.r.read(&mut bits[..2])); 371 | let flg = bits[0]; 372 | let bd = bits[1]; 373 | 374 | // bits 7/6, the version number. Right now this must be 1 375 | if (flg >> 6) != 0b01 { 376 | return Err(io::Error::new(io::ErrorKind::InvalidInput, "")) 377 | } 378 | // bit 5 is the "block independence", don't care about this yet 379 | // bit 4 is whether blocks have checksums or not 380 | self.blk_checksum = (flg & 0x10) != 0; 381 | // bit 3 is whether there is a following stream size 382 | let stream_size = (flg & 0x08) != 0; 383 | // bit 2 is whether there is a stream checksum 384 | self.stream_checksum = (flg & 0x04) != 0; 385 | // bit 1 is reserved 386 | // bit 0 is whether there is a preset dictionary 387 | let preset_dictionary = (flg & 0x01) != 0; 388 | 389 | static MAX_SIZES: [usize; 8] = 390 | [0, 0, 0, 0, // all N/A 391 | 64 << 10, // 64KB 392 | 256 << 10, // 256 KB 393 | 1 << 20, // 1MB 394 | 4 << 20]; // 4MB 395 | 396 | // bit 7 is reserved 397 | // bits 6-4 are the maximum block size 398 | let max_block_size = MAX_SIZES[(bd >> 4) as usize & 0x7]; 399 | // bits 3-0 are reserved 400 | 401 | // read off other portions of the stream 402 | let size = if stream_size { 403 | Some(try!(self.r.read_u64::())) 404 | } else { 405 | None 406 | }; 407 | assert!(!preset_dictionary, "preset dictionaries not supported yet"); 408 | 409 | debug!("blk: {}", self.blk_checksum); 410 | debug!("stream: {}", self.stream_checksum); 411 | debug!("max size: {}", max_block_size); 412 | 
debug!("stream size: {:?}", size); 413 | 414 | self.max_block_size = max_block_size; 415 | 416 | // XXX: implement checksums 417 | let cksum = try!(self.r.read_u8()); 418 | debug!("ignoring header checksum: {}", cksum); 419 | return Ok(()); 420 | } 421 | 422 | fn decode_block(&mut self) -> io::Result { 423 | match try!(self.r.read_u32::()) { 424 | // final block, we're done here 425 | 0 => return Ok(false), 426 | 427 | // raw block to read 428 | n if n & 0x80000000 != 0 => { 429 | let amt = (n & 0x7fffffff) as usize; 430 | self.output.truncate(0); 431 | self.output.reserve(amt); 432 | try!(self.r.push_exactly(amt as u64, &mut self.output)); 433 | self.start = 0; 434 | self.end = amt; 435 | } 436 | 437 | // actual block to decompress 438 | n => { 439 | let n = n as usize; 440 | self.temp.truncate(0); 441 | self.temp.reserve(n); 442 | try!(self.r.push_exactly(n as u64, &mut self.temp)); 443 | 444 | let target = cmp::min(self.max_block_size, 4 * n / 3); 445 | self.output.truncate(0); 446 | self.output.reserve(target); 447 | let mut decoder = BlockDecoder { 448 | input: &self.temp[..n], 449 | output: &mut self.output, 450 | cur: 0, 451 | start: 0, 452 | end: 0, 453 | }; 454 | self.start = 0; 455 | self.end = decoder.decode(); 456 | } 457 | } 458 | 459 | if self.blk_checksum { 460 | let cksum = try!(self.r.read_u32::()); 461 | debug!("ignoring block checksum {}", cksum); 462 | } 463 | return Ok(true); 464 | } 465 | 466 | /// Tests whether the end of this LZ4 stream has been reached 467 | pub fn eof(&mut self) -> bool { self.eof } 468 | } 469 | 470 | impl Read for Decoder { 471 | fn read(&mut self, dst: &mut [u8]) -> io::Result { 472 | if self.eof { return Ok(0) } 473 | if !self.header { 474 | try!(self.read_header()); 475 | self.header = true; 476 | } 477 | let mut amt = dst.len(); 478 | let len = amt; 479 | 480 | while amt > 0 { 481 | if self.start == self.end { 482 | let keep_going = try!(self.decode_block()); 483 | if !keep_going { 484 | self.eof = true; 485 | break; 
486 | } 487 | } 488 | let n = cmp::min(amt, self.end - self.start); 489 | unsafe { copy_nonoverlapping( 490 | &self.output[self.start], 491 | &mut dst[len - amt], 492 | n 493 | )}; 494 | self.start += n; 495 | amt -= n; 496 | } 497 | 498 | Ok(len - amt) 499 | } 500 | } 501 | 502 | /// This structure is used to compress a stream of bytes using the LZ4 503 | /// compression algorithm. This is a wrapper around an internal writer which 504 | /// bytes will be written to. 505 | pub struct Encoder { 506 | w: W, 507 | buf: Vec, 508 | tmp: Vec, 509 | wrote_header: bool, 510 | limit: usize, 511 | } 512 | 513 | impl Encoder { 514 | /// Creates a new encoder which will have its output written to the given 515 | /// output stream. The output stream can be re-acquired by calling 516 | /// `finish()` 517 | /// 518 | /// NOTE: compression isn't actually implemented just yet, this is just a 519 | /// skeleton of a future implementation. 520 | pub fn new(w: W) -> Encoder { 521 | Encoder { 522 | w: w, 523 | wrote_header: false, 524 | buf: Vec::with_capacity(1024), 525 | tmp: Vec::new(), 526 | limit: 256 * 1024, 527 | } 528 | } 529 | 530 | fn encode_block(&mut self) -> io::Result<()> { 531 | self.tmp.truncate(0); 532 | if self.compress() { 533 | try!(self.w.write_u32::(self.tmp.len() as u32)); 534 | try!(self.w.write(&self.tmp)); 535 | } else { 536 | try!(self.w.write_u32::((self.buf.len() as u32) | 0x80000000)); 537 | try!(self.w.write(&self.buf)); 538 | } 539 | self.buf.truncate(0); 540 | Ok(()) 541 | } 542 | 543 | fn compress(&mut self) -> bool { 544 | false 545 | } 546 | 547 | /// This function is used to flag that this session of compression is done 548 | /// with. The stream is finished up (final bytes are written), and then the 549 | /// wrapped writer is returned. 
550 | pub fn finish(mut self) -> (W, io::Result<()>) { 551 | let mut result = self.flush(); 552 | 553 | for _ in 0..2 { 554 | let tmp = self.w.write_u32::(0) 555 | .map_err(byteorder_err_to_io); 556 | 557 | result = result.and_then(|_| tmp); 558 | } 559 | 560 | (self.w, result) 561 | } 562 | } 563 | 564 | impl Write for Encoder { 565 | fn write(&mut self, mut buf: &[u8]) -> io::Result { 566 | if !self.wrote_header { 567 | try!(self.w.write_u32::(MAGIC)); 568 | // version 01, turn on block independence, but turn off 569 | // everything else (we have no checksums right now). 570 | try!(self.w.write_u8(0b01_100000)); 571 | // Maximum block size is 256KB 572 | try!(self.w.write_u8(0b0_101_0000)); 573 | // XXX: this checksum is just plain wrong. 574 | try!(self.w.write_u8(0)); 575 | self.wrote_header = true; 576 | } 577 | 578 | while buf.len() > 0 { 579 | let amt = cmp::min(self.limit - self.buf.len(), buf.len()); 580 | self.buf.extend(buf[..amt].iter().map(|b| *b)); 581 | 582 | if self.buf.len() == self.limit { 583 | try!(self.encode_block()); 584 | } 585 | buf = &buf[amt..]; 586 | } 587 | 588 | Ok(buf.len()) 589 | } 590 | 591 | fn flush(&mut self) -> io::Result<()> { 592 | if self.buf.len() > 0 { 593 | try!(self.encode_block()); 594 | } 595 | self.w.flush() 596 | } 597 | } 598 | 599 | 600 | /// Decodes pure LZ4 block into output. Returns count of bytes 601 | /// processed. 602 | pub fn decode_block(input: &[u8], output: &mut Vec) -> usize { 603 | let mut b = BlockDecoder { 604 | input: input, 605 | output: output, 606 | cur: 0, 607 | start: 0, 608 | end: 0 609 | }; 610 | b.decode() 611 | } 612 | 613 | 614 | /// Encodes input into pure LZ4 block. Return count of bytes 615 | /// processed. 
616 | pub fn encode_block(input: &[u8], output: &mut Vec) -> usize { 617 | let mut encoder = BlockEncoder { 618 | input: input, 619 | output: output, 620 | hash_table: repeat(0).take(HASH_TABLE_SIZE as usize).collect(), 621 | pos: 0, 622 | anchor: 0, 623 | dest_pos: 0 624 | }; 625 | 626 | encoder.encode() as usize 627 | } 628 | 629 | #[cfg(test)] 630 | mod test { 631 | use std::io::{BufReader, BufWriter, Read, Write}; 632 | use super::super::rand; 633 | use super::{Decoder, Encoder}; 634 | #[cfg(feature="unstable")] 635 | use test; 636 | 637 | use super::super::byteorder::ReadBytesExt; 638 | 639 | fn test_decode(input: &[u8], output: &[u8]) { 640 | let mut d = Decoder::new(BufReader::new(input)); 641 | let mut buf = Vec::new(); 642 | 643 | d.read_to_end(&mut buf).unwrap(); 644 | assert!(&buf[..] == output); 645 | } 646 | 647 | #[test] 648 | fn decode() { 649 | let reference = include_bytes!("data/test.txt"); 650 | test_decode(include_bytes!("data/test.lz4.1"), reference); 651 | test_decode(include_bytes!("data/test.lz4.2"), reference); 652 | test_decode(include_bytes!("data/test.lz4.3"), reference); 653 | test_decode(include_bytes!("data/test.lz4.4"), reference); 654 | test_decode(include_bytes!("data/test.lz4.5"), reference); 655 | test_decode(include_bytes!("data/test.lz4.6"), reference); 656 | test_decode(include_bytes!("data/test.lz4.7"), reference); 657 | test_decode(include_bytes!("data/test.lz4.8"), reference); 658 | test_decode(include_bytes!("data/test.lz4.9"), reference); 659 | } 660 | 661 | #[test] 662 | fn raw_encode_block() { 663 | let data = include_bytes!("data/test.txt"); 664 | let mut encoded = Vec::new(); 665 | 666 | super::encode_block(data, &mut encoded); 667 | let mut decoded = Vec::new(); 668 | 669 | super::decode_block(&encoded[..], &mut decoded); 670 | 671 | assert_eq!(&data[..], &decoded[..]); 672 | } 673 | 674 | #[test] 675 | fn one_byte_at_a_time() { 676 | let input = include_bytes!("data/test.lz4.1"); 677 | let mut d = 
Decoder::new(BufReader::new(&input[..])); 678 | assert!(!d.eof()); 679 | let mut out = Vec::new(); 680 | loop { 681 | match d.read_u8() { 682 | Ok(b) => out.push(b), 683 | Err(..) => break 684 | } 685 | } 686 | assert!(d.eof()); 687 | assert!(&out[..] == &include_bytes!("data/test.txt")[..]); 688 | } 689 | 690 | #[test] 691 | fn random_byte_lengths() { 692 | let input = include_bytes!("data/test.lz4.1"); 693 | let mut d = Decoder::new(BufReader::new(&input[..])); 694 | let mut out = Vec::new(); 695 | let mut buf = [0u8; 40]; 696 | loop { 697 | match d.read(&mut buf[..(1 + rand::random::() % 40)]) { 698 | Ok(0) => break, 699 | Ok(n) => { 700 | out.extend(buf[..n].iter().map(|b| *b)); 701 | } 702 | Err(..) => break 703 | } 704 | } 705 | assert!(&out[..] == &include_bytes!("data/test.txt")[..]); 706 | } 707 | 708 | fn roundtrip(bytes: &[u8]) { 709 | let mut e = Encoder::new(BufWriter::new(Vec::new())); 710 | e.write(bytes).unwrap(); 711 | let (e, err) = e.finish(); 712 | err.unwrap(); 713 | let encoded = e.into_inner().unwrap(); 714 | 715 | let mut d = Decoder::new(BufReader::new(&encoded[..])); 716 | let mut decoded = Vec::new(); 717 | d.read_to_end(&mut decoded).unwrap(); 718 | assert_eq!(&decoded[..], bytes); 719 | } 720 | 721 | #[test] 722 | fn some_roundtrips() { 723 | roundtrip(b"test"); 724 | roundtrip(b""); 725 | roundtrip(include_bytes!("data/test.txt")); 726 | } 727 | 728 | #[cfg(feature="unstable")] 729 | #[bench] 730 | fn decompress_speed(bh: &mut test::Bencher) { 731 | let input = include_bytes!("data/test.lz4.9"); 732 | let mut d = Decoder::new(BufReader::new(&input[..])); 733 | let mut output = [0u8; 65536]; 734 | let mut output_size = 0; 735 | bh.iter(|| { 736 | d.r = BufReader::new(&input[..]); 737 | d.reset(); 738 | output_size = d.read(&mut output).unwrap(); 739 | }); 740 | bh.bytes = output_size as u64; 741 | } 742 | } 743 | -------------------------------------------------------------------------------- /src/main.rs: 
-------------------------------------------------------------------------------- 1 | #![crate_type = "bin"] 2 | 3 | //! A rust-compress application that allows testing of implemented 4 | //! algorithms and their combinations using a simple command line. 5 | //! Example invocations: 6 | //! echo -n "abracadabra" | ./app bwt | xxd 7 | //! echo "banana" | ./app bwt | ./app -d 8 | 9 | #[macro_use] extern crate log; 10 | extern crate compress; 11 | extern crate byteorder; 12 | 13 | use std::collections::HashMap; 14 | use std::io::{self, Read, Write}; 15 | use std::{env, str}; 16 | use compress::{bwt, lz4, ReadExact}; 17 | use compress::entropy::ari; 18 | use byteorder::{LittleEndian, WriteBytesExt, ReadBytesExt}; 19 | 20 | static MAGIC : u32 = 0x73632172; //=r!cs 21 | 22 | struct Config { 23 | exe_name: String, 24 | methods: Vec, 25 | block_size: usize, 26 | decompress: bool, 27 | } 28 | 29 | impl Config { 30 | fn query(mut args: I) -> Config where I: Iterator + Sized { 31 | let mut cfg = Config { 32 | exe_name: args.next().unwrap().clone(), 33 | methods: Vec::new(), 34 | block_size: 1<<16, 35 | decompress: false, 36 | }; 37 | let mut handlers: HashMap<&str, Box> = 38 | HashMap::new(); 39 | handlers.insert("d", Box::new(|_, cfg| { cfg.decompress = true; })); 40 | handlers.insert("block", Box::new(|b, cfg| { 41 | cfg.block_size = b.parse().unwrap(); 42 | })); 43 | 44 | for arg in args { 45 | let slice = &arg[..]; 46 | if slice.starts_with("-") { 47 | match handlers.iter_mut().find(|&(&k,_)| slice[1..].starts_with(k)) { 48 | Some((k,h)) => (*h)(&slice[1+k.len()..], &mut cfg), 49 | None => println!("Warning: unrecognized option: {}", &arg[..]), 50 | } 51 | }else { 52 | cfg.methods.push(arg.to_string()); 53 | } 54 | } 55 | cfg 56 | } 57 | } 58 | 59 | struct Pass { 60 | encode: Box, &Config) 61 | -> Box + 'static>, 62 | decode: Box, &Config) 63 | -> Box + 'static>, 64 | info: String, 65 | } 66 | 67 | 68 | /// main entry point 69 | pub fn main() { 70 | let mut passes: HashMap 
= HashMap::new(); 71 | passes.insert("dummy".to_string(), Pass { 72 | encode: Box::new(|w,_| w), 73 | decode: Box::new(|r,_| r), 74 | info: "pass-through".to_string(), 75 | }); 76 | passes.insert("ari".to_string(), Pass { 77 | encode: Box::new(|w,_c| { 78 | Box::new(ari::ByteEncoder::new(w)) as Box 79 | }), 80 | decode: Box::new(|r,_c| { 81 | Box::new(ari::ByteDecoder::new(r)) as Box 82 | }), 83 | info: "Adaptive arithmetic byte coder".to_string(), 84 | }); 85 | passes.insert("bwt".to_string(), Pass { 86 | encode: Box::new(|w,c| { 87 | Box::new(bwt::Encoder::new(w, c.block_size)) as Box 88 | }), 89 | decode: Box::new(|r,_c| { 90 | Box::new(bwt::Decoder::new(r, true)) as Box 91 | }), 92 | info: "Burrows-Wheeler Transformation".to_string(), 93 | }); 94 | passes.insert("mtf".to_string(), Pass { 95 | encode: Box::new(|w,_c| { 96 | Box::new(bwt::mtf::Encoder::new(w)) as Box 97 | }), 98 | decode: Box::new(|r,_c| { 99 | Box::new(bwt::mtf::Decoder::new(r)) as Box 100 | }), 101 | info: "Move-To-Front Transformation".to_string(), 102 | }); 103 | /* // looks like we are missing the encoder implementation 104 | passes.insert(~"flate", Pass { 105 | encode: |w,_c| { 106 | ~flate::Encoder::new(w, true) as ~Write 107 | }, 108 | decode: |r,_c| { 109 | ~flate::Decoder::new(r, true) as ~Read 110 | }, 111 | info: ~"Standardized Ziv-Lempel + Huffman variant", 112 | });*/ 113 | passes.insert("lz4".to_string(), Pass { 114 | encode: Box::new(|w,_c| { 115 | Box::new(lz4::Encoder::new(w)) as Box 116 | }), 117 | decode: Box::new(|r,_c| { // LZ4 decoder seem to work 118 | Box::new(lz4::Decoder::new(r)) as Box 119 | }), 120 | info: "Ziv-Lempel derivative, focused at speed".to_string(), 121 | }); 122 | 123 | let config = Config::query(env::args()); 124 | let mut input = io::stdin(); 125 | let mut output = io::stdout(); 126 | if config.decompress { 127 | assert!(config.methods.is_empty(), "Decompression methods are set in stone"); 128 | match input.read_u32::() { 129 | Ok(magic) if magic != 
MAGIC => { 130 | error!("Input is not a rust-compress archive"); 131 | return 132 | }, 133 | Err(e) => { 134 | error!("Unable to read input: {:?}", e); 135 | return 136 | }, 137 | _ => () //OK 138 | } 139 | let methods: Vec<_> = (0..(input.read_u8().unwrap() as usize)).map(|_| { 140 | let len = input.read_u8().unwrap() as u64; 141 | let mut bytes = Vec::new(); 142 | input.push_exactly(len, &mut bytes).unwrap(); 143 | str::from_utf8(&bytes[..]).unwrap().to_string() 144 | }).collect(); 145 | let mut rsum: Box = Box::new(input); 146 | for met in methods.iter() { 147 | info!("Found pass {}", *met); 148 | match passes.get_mut(met) { 149 | Some(pa) => rsum = (pa.decode)(rsum, &config), 150 | None => panic!("Pass is not implemented"), 151 | } 152 | } 153 | io::copy(&mut rsum, &mut output).unwrap(); 154 | }else if config.methods.is_empty() { 155 | println!("rust-compress test application"); 156 | println!("Usage:"); 157 | println!("\t{} .. output", config.exe_name); 158 | println!("Options:"); 159 | println!("\t-d (to decompress)"); 160 | println!("\t-block (BWT block size)"); 161 | println!("Passes:"); 162 | for (name,pa) in passes.iter() { 163 | println!("\t{} = {}", *name, pa.info); 164 | } 165 | }else { 166 | output.write_u32::(MAGIC).unwrap(); 167 | output.write_u8(config.methods.len() as u8).unwrap(); 168 | for met in config.methods.iter() { 169 | output.write_u8(met.len() as u8).unwrap(); 170 | output.write_all(met.as_bytes()).unwrap(); 171 | } 172 | let mut wsum: Box = Box::new(output); 173 | for met in config.methods.iter() { 174 | match passes.get_mut(met) { 175 | Some(pa) => wsum = (pa.encode)(wsum, &config), 176 | None => panic!("Pass {} is not implemented", *met) 177 | } 178 | } 179 | io::copy(&mut input, &mut wsum).unwrap(); 180 | wsum.flush().unwrap(); 181 | } 182 | } 183 | -------------------------------------------------------------------------------- /src/rle.rs: -------------------------------------------------------------------------------- 1 | /*! 
2 | 3 | Run time length encoding and decoding based on byte streams, see 4 | https://en.wikipedia.org/wiki/Run-length_encoding. 5 | 6 | A run is defined as a sequence of identical bytes of length two or greater. 7 | A run of byte a and length n is encoded by a two repitions of a, followed 8 | by a length specification which describes how much often these bytes are 9 | repeated. Such a specification is a string of bytes with dynamic length. 10 | The most significat bit of each byte in this string indicates if the byte is 11 | the last byte in the string. The rest of the bits are concatenated using 12 | the Little Endian convention. 13 | 14 | # Example 15 | 16 | ```rust 17 | use compress::rle; 18 | use std::io::{Write, Read}; 19 | 20 | let input = b"Helloooo world!!"; 21 | 22 | let mut encoder = rle::Encoder::new(Vec::new()); 23 | encoder.write_all(&input[..]).unwrap(); 24 | let (buf, _): (Vec, _) = encoder.finish(); 25 | 26 | let mut decoder = rle::Decoder::new(&buf[..]); 27 | let mut decoder_buf = Vec::new(); 28 | decoder.read_to_end(&mut decoder_buf).unwrap(); 29 | 30 | assert_eq!(&input[..], &decoder_buf[..]); 31 | ``` 32 | 33 | !*/ 34 | 35 | use std::io::{self, Write, Read, Bytes}; 36 | 37 | /// This structure is used to compress a stream of bytes using a RLE 38 | /// compression algorithm. This is a wrapper around an internal writer which 39 | /// bytes will be written to. 40 | pub struct Encoder { 41 | w: W, 42 | reps: u64, 43 | byte: u8, 44 | in_run: bool 45 | } 46 | 47 | impl Encoder { 48 | /// Creates a new encoder which will have its output written to the given 49 | /// output stream. 50 | pub fn new(w: W) -> Encoder { 51 | Encoder { 52 | w: w, 53 | reps: 0, 54 | byte: 0, 55 | in_run: false 56 | } 57 | } 58 | 59 | /// This function is used to flag that this session of compression is done 60 | /// with. The stream is finished up (final bytes are written), and then the 61 | /// wrapped writer is returned. 
62 | pub fn finish(mut self) -> (W, io::Result<()>) { 63 | let result = self.flush(); 64 | 65 | (self.w, result) 66 | } 67 | 68 | fn process_byte(&mut self, byte: u8) -> io::Result<()> { 69 | if self.byte == byte { 70 | self.reps += 1; 71 | } else if self.byte != byte { 72 | try!(self.flush()); 73 | self.reps = 1; 74 | self.byte = byte; 75 | } 76 | 77 | Ok(()) 78 | } 79 | } 80 | 81 | impl Write for Encoder { 82 | fn write(&mut self, buf: &[u8]) -> io::Result { 83 | if ! self.in_run && buf.len() > 0 { 84 | self.byte = buf[0]; 85 | self.reps = 1; 86 | self.in_run = true; 87 | } 88 | 89 | for byte in &buf[1..] { 90 | try!(self.process_byte(*byte)); 91 | } 92 | 93 | Ok(buf.len()) 94 | } 95 | 96 | fn flush(&mut self) -> io::Result<()> { 97 | if self.reps == 1 { 98 | try!(self.w.write(&[self.byte])); 99 | } else if self.reps > 1 { 100 | let mut buf = [0; 11]; 101 | let mut reps_encode = self.reps - 2; 102 | let mut index = 2; 103 | buf[0] = self.byte; 104 | buf[1] = self.byte; 105 | 106 | loop { 107 | buf[index] = (reps_encode & 0b0111_1111) as u8; 108 | reps_encode = reps_encode >> 7; 109 | 110 | if reps_encode == 0 { 111 | buf[index] = buf[index] | 0b1000_0000; 112 | break; 113 | } 114 | 115 | index += 1; 116 | } 117 | 118 | try!(self.w.write(&buf[..(index + 1)])); 119 | } 120 | 121 | Ok(()) 122 | } 123 | } 124 | 125 | struct RunBuilder { 126 | byte: u8, 127 | slice: [u8; 9], 128 | byte_count: u8 129 | } 130 | 131 | impl RunBuilder { 132 | fn new(byte: u8) -> RunBuilder { 133 | RunBuilder { 134 | byte: byte, 135 | slice: [0; 9], 136 | byte_count: 0 137 | } 138 | } 139 | 140 | fn to_run(&mut self) -> Run { 141 | let reps = 2 + self.slice.iter().enumerate().fold(0u64, |reps, (i, &byte)| { 142 | reps | (((byte & 0b0111_1111) as u64) << (i as u32 * 7)) 143 | }); 144 | 145 | Run { 146 | byte: self.byte, 147 | reps: reps 148 | } 149 | } 150 | 151 | fn add_byte(&mut self, byte: u8) -> io::Result<()> { 152 | if self.byte_count >= 9 { 153 | 
Err(io::Error::new(io::ErrorKind::Other, "Overly long run")) 154 | } else { 155 | self.slice[self.byte_count as usize] = byte; 156 | self.byte_count += 1; 157 | Ok(()) 158 | } 159 | } 160 | } 161 | 162 | struct Run { 163 | byte: u8, 164 | reps: u64 165 | } 166 | 167 | enum DecoderState { 168 | Clean, 169 | Single(u8), 170 | Run(RunBuilder) 171 | } 172 | 173 | /// This structure is used to decode a run length encoded stream. This wraps 174 | /// an internal reader which is read from when this decoder's read method is 175 | /// called. 176 | pub struct Decoder { 177 | buf: Bytes, 178 | state: DecoderState, 179 | run: Option 180 | } 181 | 182 | impl Decoder { 183 | /// Creates a new decoder which will read data from the given stream. The 184 | /// inner stream can be re-acquired by moving out of the `r` field of this 185 | /// structure. 186 | pub fn new(r: R) -> Decoder { 187 | Decoder { 188 | buf: r.bytes(), 189 | state: DecoderState::Clean, 190 | run: None 191 | } 192 | } 193 | 194 | fn read_byte(&mut self) -> io::Result> { 195 | if let None = self.run { 196 | try!(self.read_run()); 197 | } 198 | 199 | if let Some(Run { byte: b, reps: r }) = self.run { 200 | if r <= 1 { 201 | self.run = None; 202 | } else { 203 | self.run = Some(Run { byte: b, reps: r - 1 }); 204 | } 205 | 206 | return Ok(Some(b)) 207 | } 208 | 209 | Ok(None) 210 | } 211 | 212 | fn read_run(&mut self) -> io::Result<()> { 213 | let mut reset = false; 214 | 215 | while let Some(result) = self.buf.next() { 216 | let byte = try!(result); 217 | 218 | match self.state { 219 | DecoderState::Clean => { 220 | self.state = DecoderState::Single(byte); 221 | }, 222 | DecoderState::Single(current) => { 223 | if byte == current { 224 | self.state = DecoderState::Run(RunBuilder::new(byte)); 225 | } else { 226 | self.run = Some(Run { byte: current, reps: 1 }); 227 | self.state = DecoderState::Single(byte); 228 | break; 229 | } 230 | }, 231 | DecoderState::Run(ref mut run_builder) => { 232 | 
try!(run_builder.add_byte(byte)); 233 | 234 | if Self::is_final_run_byte(byte) { 235 | self.run = Some(run_builder.to_run()); 236 | reset = true; 237 | break; 238 | } 239 | } 240 | } 241 | } 242 | 243 | if reset { 244 | self.state = DecoderState::Clean; 245 | } 246 | 247 | // internal read object exhausted -- flush remaining state into run 248 | if let None = self.run { 249 | self.run = match self.state { 250 | DecoderState::Clean => None, 251 | DecoderState::Single(byte) => Some(Run { byte: byte, reps: 1 }), 252 | DecoderState::Run(ref mut run_builder) => Some(run_builder.to_run()) 253 | }; 254 | 255 | self.state = DecoderState::Clean; 256 | } 257 | 258 | Ok(()) 259 | } 260 | 261 | fn is_final_run_byte(byte: u8) -> bool { 262 | 0b1000_0000 & byte != 0 263 | } 264 | } 265 | 266 | impl Read for Decoder { 267 | fn read(&mut self, buf: &mut [u8]) -> io::Result { 268 | let mut bytes_read = 0; 269 | 270 | for slot in buf { 271 | match try!(self.read_byte()) { 272 | Some(b) => *slot = b, 273 | None => break 274 | } 275 | 276 | bytes_read += 1; 277 | } 278 | 279 | Ok(bytes_read) 280 | } 281 | } 282 | 283 | #[cfg(test)] 284 | mod test { 285 | use super::{Decoder, Encoder}; 286 | use super::super::rand::{RngCore,rngs::OsRng}; 287 | use std::io::{Write, Read}; 288 | use std::iter::{Iterator, repeat}; 289 | #[cfg(feature="unstable")] 290 | use test; 291 | 292 | fn test_encode(input: &[u8], output: &[u8]) { 293 | let mut encoder = Encoder::new(Vec::new()); 294 | encoder.write_all(input).unwrap(); 295 | let (buf, _) = encoder.finish(); 296 | 297 | assert_eq!(output, &buf[..]); 298 | } 299 | 300 | fn test_decode(input: &[u8], output: &[u8]) { 301 | let mut decoder = Decoder::new(input); 302 | let mut buf = Vec::new(); 303 | decoder.read_to_end(&mut buf).unwrap(); 304 | 305 | assert_eq!(output, &buf[..]); 306 | } 307 | 308 | fn test_roundtrip(input: &[u8]) { 309 | let mut encoder = Encoder::new(Vec::new()); 310 | encoder.write_all(input).unwrap(); 311 | let (buf, _) = 
encoder.finish(); 312 | 313 | let mut decoder = Decoder::new(&buf[..]); 314 | let mut decoder_buf = Vec::new(); 315 | decoder.read_to_end(&mut decoder_buf).unwrap(); 316 | 317 | assert_eq!(input, &decoder_buf[..]); 318 | } 319 | 320 | #[test] 321 | fn simple_encoding() { 322 | test_encode(b"", b""); 323 | test_encode(b"a", b"a"); 324 | test_encode(b"abca123", b"abca123"); 325 | test_encode(&[20, 20, 20, 20, 20, 15], &[20, 20, 5 - 2 + 128, 15]); 326 | test_encode(&[0, 0], &[0, 0, 2 - 2 + 128]); 327 | } 328 | 329 | #[test] 330 | fn long_run_encoding() { 331 | let mut data = repeat(5).take(129).collect::>(); 332 | test_encode(&data[..], &[5, 5, 255]); 333 | 334 | data = [1, 3, 4, 4].iter().map(|&x| x).chain(repeat(100).take(2 + 52 + 128)).collect::>(); 335 | test_encode(&data[..], &[1, 3, 4, 4, 0 + 128, 100, 100, 52, 1 + 128]); 336 | } 337 | 338 | #[test] 339 | fn simple_decoding() { 340 | test_decode(b"", b""); 341 | test_decode(b"a", b"a"); 342 | test_decode(b"abca123", b"abca123"); 343 | test_decode(&[20, 20, 5 - 2 + 128, 15], &[20, 20, 20, 20, 20, 15]); 344 | test_decode(&[0, 0, 2 - 2 + 128], &[0, 0]); 345 | } 346 | 347 | #[test] 348 | fn long_run_decoding() { 349 | let data = [1, 3, 4, 4].iter().map(|&x| x).chain(repeat(100).take(2 + 52 + 128)).collect::>(); 350 | 351 | test_decode(&[1, 3, 4, 4, 0 + 128, 100, 100, 52, 1 + 128], &data[..]); 352 | } 353 | 354 | #[test] 355 | fn random_roundtrips() { 356 | for _ in 0..100 { 357 | let mut buf = [0; 13579]; 358 | OsRng.fill_bytes(&mut buf[..]); 359 | test_roundtrip(&buf); 360 | } 361 | } 362 | 363 | // initial speed: 145 MB/s 364 | // after moving check to write: 145 MB/s 365 | 366 | #[cfg(feature="unstable")] 367 | #[bench] 368 | fn compress_speed(bh: &mut test::Bencher) { 369 | let input = include_bytes!("data/test.txt"); 370 | bh.bytes = input.len() as u64; 371 | let output_size = Encoder::new(Vec::new()).write(&input[..]).unwrap(); 372 | let mut buf = Vec::with_capacity(output_size); 373 | 374 | bh.iter(|| { 375 | 
let mut encoder = Encoder::new(&mut buf[..]); 376 | encoder.write(&input[..]).unwrap(); 377 | }); 378 | } 379 | 380 | // initial speed: 91 MB/s 381 | // after using a BufReader instead of VecDeque: 20 MB/s 382 | // after using a byte iterator on a BufReader: 20 MB/s 383 | // after using a byte iterator on the raw read object: 80 MB/s 384 | 385 | #[cfg(feature="unstable")] 386 | #[bench] 387 | fn decompress_speed(bh: &mut test::Bencher) { 388 | let input = include_bytes!("data/test.txt"); 389 | let mut encoder = Encoder::new(Vec::new()); 390 | encoder.write_all(&input[..]).unwrap(); 391 | let (buf, _): (Vec, _) = encoder.finish(); 392 | 393 | let mut output = [0u8; 65536]; 394 | let mut output_size = 0; 395 | 396 | bh.iter(|| { 397 | let mut decoder = Decoder::new(& buf[..]); 398 | output_size = decoder.read(&mut output[..]).unwrap(); 399 | }); 400 | 401 | bh.bytes = output_size as u64; 402 | } 403 | } 404 | -------------------------------------------------------------------------------- /src/zlib.rs: -------------------------------------------------------------------------------- 1 | //! ZLIB Compression and Decompression. Requires `zlib` feature, enabled by default 2 | //! 3 | //! This module contains an implementation of the ZLIB compression scheme. This 4 | //! compression format is based on an underlying DEFLATE-encoded stream. 5 | //! 6 | //! # Example 7 | //! 8 | //! ```rust,ignore 9 | //! use compress::zlib; 10 | //! use std::fs::File; 11 | //! use std::path::Path; 12 | //! use std::io::Read; 13 | //! 14 | //! let stream = File::open(&Path::new("path/to/file.zlib")).unwrap(); 15 | //! let mut decompressed = Vec::new(); 16 | //! zlib::Decoder::new(stream).read_to_end(&mut decompressed); 17 | //! ``` 18 | //! 19 | //! # Related links 20 | //! 21 | //! * http://tools.ietf.org/html/rfc1950 - RFC that this implementation is based 22 | //! 
on 23 | 24 | use std::io::{self, Read}; 25 | use super::byteorder::{BigEndian, ReadBytesExt}; 26 | 27 | use Adler32; 28 | use flate; 29 | 30 | /// Structure used to decode a ZLIB-encoded stream. The wrapped stream can be 31 | /// re-acquired through the unwrap() method. 32 | pub struct Decoder { 33 | hash: Adler32, 34 | inner: flate::Decoder, 35 | read_header: bool, 36 | } 37 | 38 | impl Decoder { 39 | /// Creates a new ZLIB-stream decoder which will wrap the specified reader. 40 | /// This decoder also implements the `Reader` trait, and the underlying 41 | /// reader can be re-acquired through the `unwrap` method. 42 | pub fn new(r: R) -> Decoder { 43 | Decoder { 44 | hash: Adler32::new(), 45 | inner: flate::Decoder::new(r), 46 | read_header: false, 47 | } 48 | } 49 | 50 | /// Destroys this decoder, returning the underlying reader. 51 | pub fn unwrap(self) -> R { 52 | self.inner.r 53 | } 54 | 55 | fn validate_header(&mut self) -> io::Result<()> { 56 | let cmf = try!(self.inner.r.read_u8()); 57 | let flg = try!(self.inner.r.read_u8()); 58 | if cmf & 0xf != 0x8 { 59 | return Err(io::Error::new( 60 | io::ErrorKind::InvalidInput, 61 | "unsupported zlib stream format" 62 | )) 63 | } 64 | 65 | if cmf & 0xf0 != 0x70 { 66 | return Err(io::Error::new( 67 | io::ErrorKind::InvalidInput, 68 | "unsupported zlib window size" 69 | )) 70 | } 71 | 72 | if flg & 0x20 != 0 { 73 | return Err(io::Error::new( 74 | io::ErrorKind::InvalidInput, 75 | "unsupported initial dictionary in the output stream" 76 | )) 77 | } 78 | 79 | if ((cmf as u16) * 256 + (flg as u16)) % 31 != 0 { 80 | return Err(io::Error::new( 81 | io::ErrorKind::InvalidInput, 82 | "invalid zlib header checksum" 83 | )) 84 | } 85 | Ok(()) 86 | } 87 | 88 | /// Tests if this stream has reached the EOF point yet. 
89 | pub fn eof(&self) -> bool { self.inner.eof() } 90 | 91 | #[allow(dead_code)] 92 | fn reset(&mut self) { 93 | self.inner.reset(); 94 | self.hash.reset(); 95 | self.read_header = false; 96 | } 97 | } 98 | 99 | impl Read for Decoder { 100 | fn read(&mut self, buf: &mut [u8]) -> io::Result { 101 | if !self.read_header { 102 | try!(self.validate_header()); 103 | self.read_header = true; 104 | } else if self.inner.eof() { 105 | return Ok(0); 106 | } 107 | match self.inner.read(buf) { 108 | Ok(0) => { 109 | let cksum = try!(self.inner.r.read_u32::()); 110 | if cksum != self.hash.result() { 111 | Err(io::Error::new( 112 | io::ErrorKind::InvalidInput, 113 | "invalid checksum on zlib stream" 114 | )) 115 | } 116 | else { 117 | Ok(0) 118 | } 119 | } 120 | Ok(n) => { 121 | self.hash.feed(&buf[..n]); 122 | Ok(n) 123 | } 124 | Err(e) => Err(e) 125 | } 126 | } 127 | } 128 | 129 | #[cfg(test)] 130 | #[allow(warnings)] 131 | mod test { 132 | use std::io::{BufReader, BufWriter, Read, Write}; 133 | use super::super::rand::{random}; 134 | use super::super::byteorder::{LittleEndian, BigEndian, WriteBytesExt, ReadBytesExt}; 135 | use std::str; 136 | use super::{Decoder}; 137 | #[cfg(feature="unstable")] 138 | use test; 139 | 140 | fn test_decode(input: &[u8], output: &[u8]) { 141 | let mut d = Decoder::new(BufReader::new(input)); 142 | let mut buf = Vec::new(); 143 | 144 | if let Err(e) = d.read_to_end(&mut buf) { 145 | panic!("error reading: {}", e); 146 | } 147 | 148 | assert!(&buf[..] 
== output); 149 | } 150 | 151 | #[test] 152 | fn decode() { 153 | let reference = include_bytes!("data/test.txt"); 154 | test_decode(include_bytes!("data/test.z.0"), reference); 155 | test_decode(include_bytes!("data/test.z.1"), reference); 156 | test_decode(include_bytes!("data/test.z.2"), reference); 157 | test_decode(include_bytes!("data/test.z.3"), reference); 158 | test_decode(include_bytes!("data/test.z.4"), reference); 159 | test_decode(include_bytes!("data/test.z.5"), reference); 160 | test_decode(include_bytes!("data/test.z.6"), reference); 161 | test_decode(include_bytes!("data/test.z.7"), reference); 162 | test_decode(include_bytes!("data/test.z.8"), reference); 163 | test_decode(include_bytes!("data/test.z.9"), reference); 164 | } 165 | 166 | #[test] 167 | fn large() { 168 | let reference = include_bytes!("data/test.large"); 169 | test_decode(include_bytes!("data/test.large.z.5"), reference); 170 | } 171 | 172 | #[test] 173 | fn one_byte_at_a_time() { 174 | let input = include_bytes!("data/test.z.1"); 175 | let mut d = Decoder::new(BufReader::new(&input[..])); 176 | assert!(!d.eof()); 177 | let mut out = Vec::new(); 178 | loop { 179 | match d.read_u8() { 180 | Ok(b) => out.push(b), 181 | Err(..) => break 182 | } 183 | } 184 | assert!(d.eof()); 185 | assert!(&out[..] == &include_bytes!("data/test.txt")[..]); 186 | } 187 | 188 | #[test] 189 | fn random_byte_lengths() { 190 | let input = include_bytes!("data/test.z.1"); 191 | let mut d = Decoder::new(BufReader::new(&input[..])); 192 | let mut out = Vec::new(); 193 | let mut buf = [0u8; 40]; 194 | loop { 195 | match d.read(&mut buf[..(1 + random::() % 40)]) { 196 | Ok(0) | Err(..) => break, 197 | Ok(n) => { 198 | out.extend(buf[..n].iter().map(|b| *b)); 199 | } 200 | } 201 | } 202 | assert!(&out[..] 
== &include_bytes!("data/test.txt")[..]); 203 | } 204 | 205 | //fn roundtrip(bytes: &[u8]) { 206 | // let mut e = Encoder::new(MemWriter::new()); 207 | // e.write(bytes); 208 | // let encoded = e.finish().unwrap(); 209 | // 210 | // let mut d = Decoder::new(BufReader::new(encoded)); 211 | // let decoded = d.read_to_end(); 212 | // assert_eq!(&decoded[..], bytes); 213 | //} 214 | // 215 | //#[test] 216 | //fn some_roundtrips() { 217 | // roundtrip(bytes!("test")); 218 | // roundtrip(bytes!("")); 219 | // roundtrip(include_bytes!("data/test.txt")); 220 | //} 221 | 222 | #[cfg(feature="unstable")] 223 | #[bench] 224 | fn decompress_speed(bh: &mut test::Bencher) { 225 | let input = include_bytes!("data/test.z.9"); 226 | let mut d = Decoder::new(BufReader::new(&input[..])); 227 | let mut output = [0u8; 65536]; 228 | let mut output_size = 0; 229 | bh.iter(|| { 230 | d.inner.r = BufReader::new(input); 231 | d.reset(); 232 | output_size = d.read(&mut output[..]).unwrap(); 233 | }); 234 | bh.bytes = output_size as u64; 235 | } 236 | } 237 | --------------------------------------------------------------------------------