├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── cli ├── Cargo.toml └── src │ └── main.rs ├── cmp.sh ├── src ├── huffman.rs ├── lib.rs ├── lzw.rs └── shared.rs └── stat ├── Cargo.toml └── src └── lib.rs /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | .vscode 3 | ignore 4 | src/bin 5 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 3 4 | 5 | [[package]] 6 | name = "anstream" 7 | version = "0.3.2" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163" 10 | dependencies = [ 11 | "anstyle", 12 | "anstyle-parse", 13 | "anstyle-query", 14 | "anstyle-wincon", 15 | "colorchoice", 16 | "is-terminal", 17 | "utf8parse", 18 | ] 19 | 20 | [[package]] 21 | name = "anstyle" 22 | version = "1.0.0" 23 | source = "registry+https://github.com/rust-lang/crates.io-index" 24 | checksum = "41ed9a86bf92ae6580e0a31281f65a1b1d867c0cc68d5346e2ae128dddfa6a7d" 25 | 26 | [[package]] 27 | name = "anstyle-parse" 28 | version = "0.2.0" 29 | source = "registry+https://github.com/rust-lang/crates.io-index" 30 | checksum = "e765fd216e48e067936442276d1d57399e37bce53c264d6fefbe298080cb57ee" 31 | dependencies = [ 32 | "utf8parse", 33 | ] 34 | 35 | [[package]] 36 | name = "anstyle-query" 37 | version = "1.0.0" 38 | source = "registry+https://github.com/rust-lang/crates.io-index" 39 | checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b" 40 | dependencies = [ 41 | "windows-sys", 42 | ] 43 | 44 | [[package]] 45 | name = "anstyle-wincon" 46 | version = "1.0.1" 47 | source = "registry+https://github.com/rust-lang/crates.io-index" 48 | checksum = 
"180abfa45703aebe0093f79badacc01b8fd4ea2e35118747e5811127f926e188" 49 | dependencies = [ 50 | "anstyle", 51 | "windows-sys", 52 | ] 53 | 54 | [[package]] 55 | name = "bitflags" 56 | version = "1.3.2" 57 | source = "registry+https://github.com/rust-lang/crates.io-index" 58 | checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" 59 | 60 | [[package]] 61 | name = "bitvec" 62 | version = "1.0.1" 63 | source = "registry+https://github.com/rust-lang/crates.io-index" 64 | checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c" 65 | dependencies = [ 66 | "funty", 67 | "radium", 68 | "tap", 69 | "wyz", 70 | ] 71 | 72 | [[package]] 73 | name = "cc" 74 | version = "1.0.79" 75 | source = "registry+https://github.com/rust-lang/crates.io-index" 76 | checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" 77 | 78 | [[package]] 79 | name = "clap" 80 | version = "4.3.0" 81 | source = "registry+https://github.com/rust-lang/crates.io-index" 82 | checksum = "93aae7a4192245f70fe75dd9157fc7b4a5bf53e88d30bd4396f7d8f9284d5acc" 83 | dependencies = [ 84 | "clap_builder", 85 | "clap_derive", 86 | "once_cell", 87 | ] 88 | 89 | [[package]] 90 | name = "clap_builder" 91 | version = "4.3.0" 92 | source = "registry+https://github.com/rust-lang/crates.io-index" 93 | checksum = "4f423e341edefb78c9caba2d9c7f7687d0e72e89df3ce3394554754393ac3990" 94 | dependencies = [ 95 | "anstream", 96 | "anstyle", 97 | "bitflags", 98 | "clap_lex", 99 | "strsim", 100 | ] 101 | 102 | [[package]] 103 | name = "clap_derive" 104 | version = "4.3.0" 105 | source = "registry+https://github.com/rust-lang/crates.io-index" 106 | checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b" 107 | dependencies = [ 108 | "heck", 109 | "proc-macro2", 110 | "quote", 111 | "syn", 112 | ] 113 | 114 | [[package]] 115 | name = "clap_lex" 116 | version = "0.5.0" 117 | source = "registry+https://github.com/rust-lang/crates.io-index" 118 | checksum = 
"2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" 119 | 120 | [[package]] 121 | name = "cmpr" 122 | version = "0.1.0" 123 | dependencies = [ 124 | "clap", 125 | "compressing", 126 | "stat", 127 | ] 128 | 129 | [[package]] 130 | name = "colorchoice" 131 | version = "1.0.0" 132 | source = "registry+https://github.com/rust-lang/crates.io-index" 133 | checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" 134 | 135 | [[package]] 136 | name = "compressing" 137 | version = "0.1.0" 138 | dependencies = [ 139 | "bitvec", 140 | "paste", 141 | ] 142 | 143 | [[package]] 144 | name = "errno" 145 | version = "0.3.1" 146 | source = "registry+https://github.com/rust-lang/crates.io-index" 147 | checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" 148 | dependencies = [ 149 | "errno-dragonfly", 150 | "libc", 151 | "windows-sys", 152 | ] 153 | 154 | [[package]] 155 | name = "errno-dragonfly" 156 | version = "0.1.2" 157 | source = "registry+https://github.com/rust-lang/crates.io-index" 158 | checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" 159 | dependencies = [ 160 | "cc", 161 | "libc", 162 | ] 163 | 164 | [[package]] 165 | name = "funty" 166 | version = "2.0.0" 167 | source = "registry+https://github.com/rust-lang/crates.io-index" 168 | checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" 169 | 170 | [[package]] 171 | name = "heck" 172 | version = "0.4.1" 173 | source = "registry+https://github.com/rust-lang/crates.io-index" 174 | checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" 175 | 176 | [[package]] 177 | name = "hermit-abi" 178 | version = "0.3.1" 179 | source = "registry+https://github.com/rust-lang/crates.io-index" 180 | checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" 181 | 182 | [[package]] 183 | name = "io-lifetimes" 184 | version = "1.0.10" 185 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 186 | checksum = "9c66c74d2ae7e79a5a8f7ac924adbe38ee42a859c6539ad869eb51f0b52dc220" 187 | dependencies = [ 188 | "hermit-abi", 189 | "libc", 190 | "windows-sys", 191 | ] 192 | 193 | [[package]] 194 | name = "is-terminal" 195 | version = "0.4.7" 196 | source = "registry+https://github.com/rust-lang/crates.io-index" 197 | checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f" 198 | dependencies = [ 199 | "hermit-abi", 200 | "io-lifetimes", 201 | "rustix", 202 | "windows-sys", 203 | ] 204 | 205 | [[package]] 206 | name = "libc" 207 | version = "0.2.144" 208 | source = "registry+https://github.com/rust-lang/crates.io-index" 209 | checksum = "2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1" 210 | 211 | [[package]] 212 | name = "linux-raw-sys" 213 | version = "0.3.8" 214 | source = "registry+https://github.com/rust-lang/crates.io-index" 215 | checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" 216 | 217 | [[package]] 218 | name = "once_cell" 219 | version = "1.17.1" 220 | source = "registry+https://github.com/rust-lang/crates.io-index" 221 | checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" 222 | 223 | [[package]] 224 | name = "paste" 225 | version = "1.0.12" 226 | source = "registry+https://github.com/rust-lang/crates.io-index" 227 | checksum = "9f746c4065a8fa3fe23974dd82f15431cc8d40779821001404d10d2e79ca7d79" 228 | 229 | [[package]] 230 | name = "proc-macro2" 231 | version = "1.0.58" 232 | source = "registry+https://github.com/rust-lang/crates.io-index" 233 | checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8" 234 | dependencies = [ 235 | "unicode-ident", 236 | ] 237 | 238 | [[package]] 239 | name = "quote" 240 | version = "1.0.27" 241 | source = "registry+https://github.com/rust-lang/crates.io-index" 242 | checksum = "8f4f29d145265ec1c483c7c654450edde0bfe043d3938d6972630663356d9500" 243 | 
dependencies = [ 244 | "proc-macro2", 245 | ] 246 | 247 | [[package]] 248 | name = "radium" 249 | version = "0.7.0" 250 | source = "registry+https://github.com/rust-lang/crates.io-index" 251 | checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" 252 | 253 | [[package]] 254 | name = "rustix" 255 | version = "0.37.19" 256 | source = "registry+https://github.com/rust-lang/crates.io-index" 257 | checksum = "acf8729d8542766f1b2cf77eb034d52f40d375bb8b615d0b147089946e16613d" 258 | dependencies = [ 259 | "bitflags", 260 | "errno", 261 | "io-lifetimes", 262 | "libc", 263 | "linux-raw-sys", 264 | "windows-sys", 265 | ] 266 | 267 | [[package]] 268 | name = "stat" 269 | version = "0.1.0" 270 | 271 | [[package]] 272 | name = "strsim" 273 | version = "0.10.0" 274 | source = "registry+https://github.com/rust-lang/crates.io-index" 275 | checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" 276 | 277 | [[package]] 278 | name = "syn" 279 | version = "2.0.16" 280 | source = "registry+https://github.com/rust-lang/crates.io-index" 281 | checksum = "a6f671d4b5ffdb8eadec19c0ae67fe2639df8684bd7bc4b83d986b8db549cf01" 282 | dependencies = [ 283 | "proc-macro2", 284 | "quote", 285 | "unicode-ident", 286 | ] 287 | 288 | [[package]] 289 | name = "tap" 290 | version = "1.0.1" 291 | source = "registry+https://github.com/rust-lang/crates.io-index" 292 | checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" 293 | 294 | [[package]] 295 | name = "unicode-ident" 296 | version = "1.0.8" 297 | source = "registry+https://github.com/rust-lang/crates.io-index" 298 | checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4" 299 | 300 | [[package]] 301 | name = "utf8parse" 302 | version = "0.2.1" 303 | source = "registry+https://github.com/rust-lang/crates.io-index" 304 | checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" 305 | 306 | [[package]] 307 | name = "windows-sys" 308 | version = 
"0.48.0" 309 | source = "registry+https://github.com/rust-lang/crates.io-index" 310 | checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" 311 | dependencies = [ 312 | "windows-targets", 313 | ] 314 | 315 | [[package]] 316 | name = "windows-targets" 317 | version = "0.48.0" 318 | source = "registry+https://github.com/rust-lang/crates.io-index" 319 | checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5" 320 | dependencies = [ 321 | "windows_aarch64_gnullvm", 322 | "windows_aarch64_msvc", 323 | "windows_i686_gnu", 324 | "windows_i686_msvc", 325 | "windows_x86_64_gnu", 326 | "windows_x86_64_gnullvm", 327 | "windows_x86_64_msvc", 328 | ] 329 | 330 | [[package]] 331 | name = "windows_aarch64_gnullvm" 332 | version = "0.48.0" 333 | source = "registry+https://github.com/rust-lang/crates.io-index" 334 | checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" 335 | 336 | [[package]] 337 | name = "windows_aarch64_msvc" 338 | version = "0.48.0" 339 | source = "registry+https://github.com/rust-lang/crates.io-index" 340 | checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" 341 | 342 | [[package]] 343 | name = "windows_i686_gnu" 344 | version = "0.48.0" 345 | source = "registry+https://github.com/rust-lang/crates.io-index" 346 | checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" 347 | 348 | [[package]] 349 | name = "windows_i686_msvc" 350 | version = "0.48.0" 351 | source = "registry+https://github.com/rust-lang/crates.io-index" 352 | checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" 353 | 354 | [[package]] 355 | name = "windows_x86_64_gnu" 356 | version = "0.48.0" 357 | source = "registry+https://github.com/rust-lang/crates.io-index" 358 | checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" 359 | 360 | [[package]] 361 | name = "windows_x86_64_gnullvm" 362 | version = "0.48.0" 363 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 364 | checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" 365 | 366 | [[package]] 367 | name = "windows_x86_64_msvc" 368 | version = "0.48.0" 369 | source = "registry+https://github.com/rust-lang/crates.io-index" 370 | checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" 371 | 372 | [[package]] 373 | name = "wyz" 374 | version = "0.5.1" 375 | source = "registry+https://github.com/rust-lang/crates.io-index" 376 | checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed" 377 | dependencies = [ 378 | "tap", 379 | ] 380 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = ["cli", "stat"] 3 | default-members = ["cli"] 4 | 5 | [package] 6 | name = "compressing" 7 | version = "0.1.0" 8 | edition = "2021" 9 | 10 | [dependencies] 11 | bitvec = "1" 12 | 13 | [dev-dependencies] 14 | paste = "1.0.12" 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Luiz Felipe Gonçalves 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Naive implementation of the [LZW] and [Huffman] compression algorithms. 2 | 3 | To run, install the [Rust toolchain][rust-toolchain]. Cargo may be used to 4 | compile the source. 5 | 6 | Example, 7 | 8 | ``` 9 | $ cargo run 10 | 11 | Usage: cmpr [OPTIONS] -a 12 | 13 | Commands: 14 | compress 15 | decompress 16 | help Print this message or the help of the given subcommand(s) 17 | 18 | Options: 19 | -a The algorithm to use for compress or decompress [possible values: lzw, huffman] 20 | --stats Whether the program should show statistics 21 | -h, --help Print help 22 | -V, --version Print version 23 | ``` 24 | 25 | Compress a file using the LZW algorithm (assuming `cargo build --release`): 26 | 27 | ``` 28 | $ ./target/release/cmpr -a lzw --stats compress -o Cargo.lock.lzw Cargo.lock 29 | done. 30 | in 1 ms 31 | saved 35.10% 32 | ``` 33 | 34 | Decompress the same file: 35 | 36 | ``` 37 | $ ./target/release/cmpr -a lzw --stats decompress -o recovered-Cargo.lock Cargo.lock.lzw 38 | done. 39 | in 0 ms 40 | ``` 41 | 42 | The script `cmp.sh` may be used to test the compression algorithm by 43 | compressing, decompressing and comparing with the original file. 
E.g., 44 | 45 | ``` 46 | $ ./cmp.sh Cargo.lock 47 | Finished release [optimized] target(s) in 0.06s 48 | cmpr 0.1.0 49 | compressing [Cargo.lock] into [Cargo.lock.cmp]... 50 | done. 51 | in 1 ms 52 | saved 35.10% 53 | decompressing [Cargo.lock.cmp] into [recovered-Cargo.lock]... 54 | done. 55 | in 2 ms 56 | ok 57 | ``` 58 | 59 | [LZW]: https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Welch 60 | [Huffman]: https://en.wikipedia.org/wiki/Huffman_coding 61 | [rust-toolchain]: https://rustup.rs/ 62 | -------------------------------------------------------------------------------- /cli/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "cmpr" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | [dependencies] 7 | compressing.path = ".." 8 | stat.path = "../stat" 9 | clap = { version = "4", features = ["derive"] } 10 | -------------------------------------------------------------------------------- /cli/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | fs::{File, OpenOptions}, 3 | io::{self, BufReader, BufWriter, Read, Write}, 4 | path::{Path, PathBuf}, 5 | time::{Duration, Instant}, 6 | }; 7 | 8 | use clap::{Args, Parser, Subcommand, ValueEnum}; 9 | use compressing::lzw; 10 | use stat::Stat; 11 | 12 | #[derive(Debug, Parser)] 13 | #[command(version)] 14 | struct Cli { 15 | /// The algorithm to use for compress or decompress. 16 | #[arg(short, value_enum)] 17 | algorithm: Algorithm, 18 | 19 | /// Whether the program should show statistics. 
20 | #[arg(long)] 21 | stats: bool, 22 | 23 | #[command(subcommand)] 24 | action: Action, 25 | } 26 | 27 | #[derive(Copy, Clone, Debug, ValueEnum)] 28 | enum Algorithm { 29 | Lzw, 30 | Huffman, 31 | } 32 | 33 | #[derive(Debug, Subcommand)] 34 | enum Action { 35 | Compress(ActionData), 36 | Decompress(ActionData), 37 | } 38 | 39 | #[derive(Debug, Args)] 40 | struct ActionData { 41 | /// The file to compress or decompress. 42 | input: PathBuf, 43 | 44 | /// The output path. 45 | #[arg(short)] 46 | output: PathBuf, 47 | } 48 | 49 | fn main() -> io::Result<()> { 50 | let cmd = Cli::parse(); 51 | 52 | let data = cmd.action.data(); 53 | let manager = IoManager::new(&data.input, &data.output)?; 54 | 55 | let stats = match cmd.action { 56 | Action::Compress(_) => match cmd.algorithm { 57 | Algorithm::Lzw => manager.run(lzw::enc)?, 58 | Algorithm::Huffman => todo!(), 59 | }, 60 | Action::Decompress(_) => match cmd.algorithm { 61 | Algorithm::Lzw => manager.run(lzw::dec)?, 62 | Algorithm::Huffman => todo!(), 63 | }, 64 | }; 65 | 66 | if cmd.stats { 67 | println!("done."); 68 | println!(" in {} ms", stats.elapsed.as_millis()); 69 | 70 | if cmd.action.is_compress() { 71 | // https://en.wikipedia.org/wiki/Data_compression_ratio 72 | let space_saved = (1.0 - stats.written as f64 / stats.read as f64) * 100.0; 73 | println!(" saved {space_saved:.2}%"); 74 | } 75 | } 76 | 77 | Ok(()) 78 | } 79 | 80 | impl Action { 81 | fn data(&self) -> &ActionData { 82 | match self { 83 | Action::Compress(data) => data, 84 | Action::Decompress(data) => data, 85 | } 86 | } 87 | 88 | fn is_compress(&self) -> bool { 89 | matches!(self, Action::Compress(_)) 90 | } 91 | } 92 | 93 | struct IoManager { 94 | reader: BufReader>, 95 | writer: BufWriter>, 96 | } 97 | 98 | impl IoManager { 99 | /// Opens the given files and constructs a new [`IoManager`]. 
100 | fn new(input: &Path, output: &Path) -> io::Result { 101 | let reader = { 102 | let file = File::open(input)?; 103 | let stat = Stat::new(file); 104 | BufReader::new(stat) 105 | }; 106 | let writer = { 107 | let file = OpenOptions::new().create(true).write(true).open(output)?; 108 | let stat = Stat::new(file); 109 | BufWriter::new(stat) 110 | }; 111 | Ok(Self { reader, writer }) 112 | } 113 | 114 | /// Runs the provided function and collects statistics on the involved I/O 115 | /// operations. 116 | fn run(mut self, f: F) -> io::Result 117 | where 118 | F: Fn(&mut dyn Read, &mut dyn Write) -> io::Result<()>, 119 | { 120 | let start = Instant::now(); 121 | f(&mut self.reader, &mut self.writer)?; 122 | let elapsed = start.elapsed(); 123 | 124 | let stat_r = self.reader.into_inner(); 125 | let stat_w = self.writer.into_inner()?; 126 | 127 | Ok(Stats { 128 | read: stat_r.read_count(), 129 | written: stat_w.write_count(), 130 | elapsed, 131 | }) 132 | } 133 | } 134 | 135 | struct Stats { 136 | read: u64, 137 | written: u64, 138 | elapsed: Duration, 139 | } 140 | -------------------------------------------------------------------------------- /cmp.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | orig=$1 4 | compressed="$orig.cmp" 5 | recovered="recovered-$orig" 6 | 7 | cli() { 8 | ./target/release/cmpr "$@" 9 | } 10 | 11 | cargo build --release 12 | cli --version 13 | 14 | echo "compressing [$orig] into [$compressed]..." 15 | cli -a lzw --stats compress -o "$compressed" "$orig" 16 | 17 | echo "decompressing [$compressed] into [$recovered]..." 
/// Encodes the given data.
///
/// # Errors
///
/// Fails if any of the underlying I/O operations fail (i.e., reading from `src`
/// or writing to `out`).
pub fn enc(_src: &mut dyn io::Read, _out: &mut dyn io::Write) -> io::Result<()> {
    // NOTE(review): placeholder body — neither argument is used and nothing is
    // written to `_out` yet.
    Ok(())
}
52 | pub fn dec(_src: &mut dyn io::Read, _out: &mut dyn io::Write) -> io::Result<()> { 53 | Ok(()) 54 | } 55 | 56 | fn code_map_from_reader(reader: &mut dyn io::Read) -> io::Result { 57 | let freq_map = freq_map_from_reader(reader)?; 58 | let freq_map_len = freq_map.len(); 59 | let tree_arena = tree_from_freq_map(freq_map); 60 | Ok(code_map_from_tree(freq_map_len, &tree_arena)) 61 | } 62 | 63 | fn freq_map_from_reader(reader: &mut dyn io::Read) -> io::Result { 64 | let mut map = HashMap::new(); 65 | while let Some(char) = read_u8(reader)? { 66 | *map.entry(char).or_insert(0) += 1; 67 | } 68 | Ok(map) 69 | } 70 | 71 | fn tree_from_freq_map(map: FreqMap) -> TreeArena { 72 | let mut queue = BinaryHeap::with_capacity(map.len()); 73 | for (char, freq) in map { 74 | let leaf = Tree::Leaf(Stat { char, freq }); 75 | // One needs a minimum heap. 76 | queue.push(Reverse(leaf)); 77 | } 78 | 79 | // A binary tree with `L` leaf nodes may have at most `2L - 1` nodes. 80 | let node_count = queue.len() * 2 - 1; 81 | let mut arena = Vec::with_capacity(node_count); 82 | 83 | // The root will be placed at the first index (i.e., `0`). However, since 84 | // the root node is the last to be inserted, one needs to manually skip 85 | // its position here. 86 | // 87 | // The following is safe since the code below doesn't index `arena[0]`. 88 | unsafe { arena.set_len(1) }; 89 | 90 | while queue.len() >= 2 { 91 | // SAFETY: See `while` predicate. 92 | let fst = unsafe { queue.pop().unwrap_unchecked() }.0; 93 | let snd = unsafe { queue.pop().unwrap_unchecked() }.0; 94 | 95 | let freq = fst.freq() + snd.freq(); 96 | let left = ins(&mut arena, fst); 97 | let right = ins(&mut arena, snd); 98 | 99 | let node = Tree::Node { freq, left, right }; 100 | queue.push(Reverse(node)); 101 | } 102 | 103 | // At the end of each `while` iteration, one always inserts a new node, 104 | // hence the following is safe. 
105 | let root = unsafe { queue.pop().unwrap_unchecked() }.0; 106 | 107 | // `0` is is bounds. 108 | *unsafe { arena.get_unchecked_mut(0) } = root; 109 | 110 | arena 111 | } 112 | 113 | fn code_map_from_tree(size_hint: usize, arena: &TreeArena) -> CodeMap { 114 | fn go(i: usize, arena: &TreeArena, map: &mut CodeMap, vec: BitVec) { 115 | match &arena[i] { 116 | Tree::Node { left, right, .. } => { 117 | let mut left_vec = vec.clone(); 118 | left_vec.push(false); 119 | go(*left, arena, map, left_vec); 120 | 121 | let mut right_vec = vec; 122 | right_vec.push(true); 123 | go(*right, arena, map, right_vec); 124 | } 125 | Tree::Leaf(Stat { char, .. }) => { 126 | map.insert(*char, vec); 127 | } 128 | } 129 | } 130 | 131 | let mut map = HashMap::with_capacity(size_hint); 132 | go(/* root */ 0, arena, &mut map, BitVec::new()); 133 | map 134 | } 135 | 136 | /// Pushes the given element into the vector and returns the inserted-to index. 137 | fn ins(vec: &mut Vec, el: T) -> usize { 138 | let index = vec.len(); 139 | debug_assert_ne!(index, vec.capacity()); // Do not re-alloc, plz. :) 140 | vec.push(el); 141 | index 142 | } 143 | 144 | impl Tree { 145 | fn freq(&self) -> Freq { 146 | match self { 147 | Tree::Node { freq, .. 
} => *freq, 148 | Tree::Leaf(stat) => stat.freq, 149 | } 150 | } 151 | } 152 | 153 | impl PartialOrd for Stat { 154 | fn partial_cmp(&self, other: &Self) -> Option { 155 | self.freq.partial_cmp(&other.freq) 156 | } 157 | } 158 | 159 | impl Ord for Stat { 160 | fn cmp(&self, other: &Self) -> cmp::Ordering { 161 | self.freq.cmp(&other.freq) 162 | } 163 | } 164 | 165 | impl PartialOrd for Tree { 166 | fn partial_cmp(&self, other: &Self) -> Option { 167 | self.freq().partial_cmp(&other.freq()) 168 | } 169 | } 170 | 171 | impl Ord for Tree { 172 | fn cmp(&self, other: &Self) -> cmp::Ordering { 173 | self.freq().cmp(&other.freq()) 174 | } 175 | } 176 | 177 | #[cfg(test)] 178 | mod tests { 179 | use bitvec::{bitvec, order::Lsb0}; 180 | 181 | use super::*; 182 | 183 | #[test] 184 | #[rustfmt::skip] 185 | fn test_ord_impl() { 186 | let a = Stat { freq: 5, char: b'A' }; 187 | let b = Stat { freq: 5, char: b'A' }; 188 | let c = Stat { freq: 6, char: b'A' }; 189 | assert_eq!(a, b); 190 | assert!(a < c); 191 | } 192 | 193 | #[test] 194 | fn test_freq_map() { 195 | let mut src = b"AAABBBAABACD".as_ref(); 196 | let map = freq_map_from_reader(&mut src).unwrap(); 197 | assert_eq!( 198 | map, 199 | HashMap::from([(b'A', 6), (b'B', 4), (b'C', 1), (b'D', 1),]) 200 | ); 201 | } 202 | 203 | #[test] 204 | fn test_code_map() { 205 | let mut src = b"AAABBBAABACD".as_ref(); 206 | let map = code_map_from_reader(&mut src).unwrap(); 207 | 208 | assert_eq!(map[&b'A'], bitvec![usize, Lsb0; 0]); 209 | assert_eq!(map[&b'B'], bitvec![usize, Lsb0; 1, 1]); 210 | 211 | // The order is not specified, just the bit length. 
212 | assert_ne!(map[&b'C'], map[&b'D']); 213 | assert!( 214 | map[&b'C'] == bitvec![usize, Lsb0; 1, 0, 0] 215 | || map[&b'C'] == bitvec![usize, Lsb0; 1, 0, 1] 216 | ); 217 | assert!( 218 | map[&b'D'] == bitvec![usize, Lsb0; 1, 0, 0] 219 | || map[&b'D'] == bitvec![usize, Lsb0; 1, 0, 1] 220 | ); 221 | } 222 | } 223 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod huffman; 2 | pub mod lzw; 3 | 4 | pub mod shared; 5 | -------------------------------------------------------------------------------- /src/lzw.rs: -------------------------------------------------------------------------------- 1 | use crate::shared::{read_u16, read_u8}; 2 | use std::{collections::HashMap, io, mem}; 3 | 4 | pub type Code = u16; 5 | 6 | pub type EncDict = HashMap, Code>; 7 | pub type DecDict = HashMap>; 8 | 9 | /// Encodes the given data. 10 | /// 11 | /// # Errors 12 | /// 13 | /// Fails if any of the underlying I/O operations fail (i.e., reading from `src` 14 | /// or writing to `out`). 15 | /// 16 | /// # Panics 17 | /// 18 | /// Panics if the dictionary grows past `u16`'s bounds. 19 | pub fn enc(src: &mut dyn io::Read, out: &mut dyn io::Write) -> io::Result<()> { 20 | enc_returning_dict(src, out)?; 21 | Ok(()) 22 | } 23 | 24 | #[doc(hidden)] 25 | pub fn enc_returning_dict(src: &mut dyn io::Read, out: &mut dyn io::Write) -> io::Result { 26 | let mut dict = build_default_enc_dict(); 27 | let mut seq = Vec::::new(); 28 | 29 | // Advance while the next char forms a key which is in the map. 30 | // When the next char forms a string which is not in the map, emits it 31 | // and inserts (it + the char) in the map. 32 | while let Some(c) = read_u8(src)? 
{ 33 | seq.push(c); 34 | if !dict.contains_key(&seq) { 35 | let prev_seq = &seq[..(seq.len() - 1)]; 36 | emit(prev_seq, &dict, out)?; 37 | 38 | let code = dict.len().try_into().unwrap(); 39 | dict.insert(mem::replace(&mut seq, vec![c]), code); 40 | } 41 | } 42 | if !seq.is_empty() { 43 | emit(&seq, &dict, out)?; 44 | } 45 | 46 | Ok(dict) 47 | } 48 | 49 | /// Decodes the given data. 50 | /// 51 | /// # Errors 52 | /// 53 | /// Fails if any of the underlying I/O operations fail (i.e., reading from `src` 54 | /// or writing to `out`). 55 | /// 56 | /// # Panics 57 | /// 58 | /// Panics if the dictionary grows past `u16`'s bounds. 59 | pub fn dec(src: &mut dyn io::Read, out: &mut dyn io::Write) -> io::Result<()> { 60 | let mut dict = build_default_dec_dict(); 61 | let mut seq = Vec::::new(); 62 | 63 | while let Some(code) = read_u16(src)? { 64 | let decoded = dict.get(&code).map(Clone::clone).unwrap_or_else(|| { 65 | let mut s = seq.clone(); 66 | s.push(s[0]); 67 | s 68 | }); 69 | out.write_all(&decoded)?; 70 | 71 | if !seq.is_empty() { 72 | let next_code = dict.len().try_into().unwrap(); 73 | dict.insert(next_code, { 74 | let mut s = mem::take(&mut seq); 75 | s.push(decoded[0]); 76 | s 77 | }); 78 | } 79 | 80 | seq = decoded; 81 | } 82 | 83 | Ok(()) 84 | } 85 | 86 | fn emit(seq: &[u8], dict: &EncDict, out: &mut dyn io::Write) -> io::Result<()> { 87 | let code = Code::to_be_bytes(dict[seq]); 88 | out.write_all(&code) 89 | } 90 | 91 | fn build_default_enc_dict() -> EncDict { 92 | let mut dict = HashMap::new(); 93 | for i in u8::MIN..=u8::MAX { 94 | dict.insert(vec![i], i.into()); 95 | } 96 | dict 97 | } 98 | 99 | fn build_default_dec_dict() -> DecDict { 100 | let mut dict = HashMap::new(); 101 | for i in u8::MIN..=u8::MAX { 102 | dict.insert(i.into(), vec![i]); 103 | } 104 | dict 105 | } 106 | 107 | #[cfg(test)] 108 | mod tests { 109 | use super::*; 110 | 111 | macro_rules! test { 112 | ($( ($name:ident, $decoded:expr, $encoded:expr), )+) => { 113 | paste::paste! 
/// Generates functions that read one big-endian integer from a reader.
///
/// Each generated function returns:
///
/// - `Ok(Some(value))` when a whole value was read;
/// - `Ok(None)` when the reader was already at its end, i.e., not a single
///   byte of the value could be read;
/// - an [`UnexpectedEof`](::std::io::ErrorKind::UnexpectedEof) error when the
///   reader ended in the middle of a value, so that truncated input is not
///   mistaken for a clean end of stream;
/// - any other error reported by the reader.
macro_rules! read_fn {
    ($($vis:vis fn $name:ident() -> $ty:ty ;)+) => {
        $(
            #[inline]
            $vis fn $name(src: &mut dyn ::std::io::Read) -> ::std::io::Result<Option<$ty>> {
                let mut buf = [0u8; ::std::mem::size_of::<$ty>()];
                let mut filled = 0;

                // A reader may hand out fewer bytes than requested, hence the loop.
                while filled < buf.len() {
                    match src.read(&mut buf[filled..]) {
                        Ok(0) if filled == 0 => return Ok(None),
                        Ok(0) => {
                            return Err(::std::io::Error::new(
                                ::std::io::ErrorKind::UnexpectedEof,
                                "input ended in the middle of a value",
                            ));
                        }
                        Ok(n) => filled += n,
                        Err(error) if error.kind() == ::std::io::ErrorKind::Interrupted => {}
                        Err(error) => return Err(error),
                    }
                }

                Ok(Some(<$ty>::from_be_bytes(buf)))
            }
        )+
    };
}

read_fn!(
    pub(crate) fn read_u8() -> u8;
    pub(crate) fn read_u16() -> u16;
);
32 | pub fn into_inner(self) -> I { 33 | self.inner 34 | } 35 | } 36 | 37 | impl Read for Stat { 38 | fn read(&mut self, buf: &mut [u8]) -> std::io::Result { 39 | let n = self.inner.read(buf)?; 40 | self.read_count += u64::try_from(n).unwrap(); 41 | Ok(n) 42 | } 43 | } 44 | 45 | impl Write for Stat { 46 | fn write(&mut self, buf: &[u8]) -> std::io::Result { 47 | let n = self.inner.write(buf)?; 48 | self.write_count += u64::try_from(n).unwrap(); 49 | Ok(n) 50 | } 51 | 52 | fn flush(&mut self) -> std::io::Result<()> { 53 | self.inner.flush() 54 | } 55 | } 56 | 57 | #[cfg(test)] 58 | mod tests { 59 | use std::io::{self, BufWriter}; 60 | 61 | use super::*; 62 | 63 | #[test] 64 | fn test_read() { 65 | let mut src = "olá, mundo!".as_bytes(); 66 | let mut stat_r = Stat::new(&mut src); 67 | 68 | io::read_to_string(&mut stat_r).unwrap(); 69 | 70 | assert_eq!(stat_r.read_count(), 12); 71 | assert_eq!(stat_r.write_count(), 0); 72 | } 73 | 74 | #[test] 75 | fn test_write() { 76 | let out = Vec::::new(); 77 | let mut stat_w = Stat::new(out); 78 | 79 | write!(&mut stat_w, "olá, mundo!").unwrap(); 80 | 81 | assert_eq!(stat_w.read_count(), 0); 82 | assert_eq!(stat_w.write_count(), 12); 83 | assert_eq!(stat_w.into_inner(), "olá, mundo!".as_bytes()); 84 | } 85 | 86 | #[test] 87 | fn test_three_level_composition_with_buffering() { 88 | let out = Vec::::new(); 89 | let stat = Stat::new(out); 90 | let mut buf_w = BufWriter::with_capacity(5, stat); 91 | 92 | write!(&mut buf_w, "olá, mundo!").unwrap(); 93 | 94 | let stat_w = buf_w.into_inner().unwrap(); 95 | 96 | assert_eq!(stat_w.read_count(), 0); 97 | assert_eq!(stat_w.write_count(), 12); 98 | assert_eq!(stat_w.into_inner(), "olá, mundo!".as_bytes()); 99 | } 100 | } 101 | --------------------------------------------------------------------------------