├── .gitignore ├── Makefile ├── .github └── workflows │ └── ci.yml ├── README.md ├── LICENSE ├── main.zig └── snappy.zig /.gitignore: -------------------------------------------------------------------------------- 1 | main* 2 | !main.zig 3 | zig-cache/ 4 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | test: snappy.zig 2 | zig test snappy.zig 3 | 4 | bin: main.zig snappy.zig 5 | zig build-exe -O ReleaseFast main.zig 6 | 7 | .PHONY: clean 8 | clean: 9 | \rm -rf zig-cache/ main main.o 10 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | on: [push, pull_request] 2 | 3 | name: CI 4 | 5 | jobs: 6 | test: 7 | name: test 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout sources 11 | uses: actions/checkout@v2 12 | 13 | - name: Zig test 14 | uses: goto-bus-stop/setup-zig@v2 15 | with: 16 | version: 0.11.0 17 | - run: make 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # zig-snappy 2 | [![CI](https://github.com/gsquire/zig-snappy/workflows/CI/badge.svg)](https://github.com/gsquire/zig-snappy/actions) 3 | 4 | This is a rough translation of Go's [snappy](https://github.com/golang/snappy) library for Zig. It 5 | only supports the block format. The streaming format may be added in the future. 6 | 7 | ### Caveat 8 | Expect some sharp edges. This is my first time writing Zig! I would greatly appreciate any issues 9 | or pull requests to improve the code, write tests, or just critique in general. 10 | 11 | ### Roadmap 12 | - More robust tests 13 | - Fuzzing 14 | 15 | ### Usage 16 | See the [binary](main.zig) in the repository. 
17 | 18 | ### License 19 | MIT 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Garrett Squire 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
--------------------------------------------------------------------------------
/main.zig:
--------------------------------------------------------------------------------
const std = @import("std");
const Allocator = std.mem.Allocator;
const fs = std.fs;
const file = fs.File;

const snappy = @import("snappy.zig");

/// Read the whole file at `path` (relative to the current working directory) into memory.
/// The caller owns the returned slice and must free it with `allocator`.
fn readFile(allocator: Allocator, path: []const u8) ![]u8 {
    const f = try fs.cwd().openFile(path, file.OpenFlags{ .mode = .read_only });
    // Release the handle on every exit path, including read or allocation failure.
    defer f.close();

    // Read until end-of-file instead of trusting a size obtained from an earlier `stat()`:
    // the returned slice is always exactly as long as the data that was actually read.
    return f.readToEndAlloc(allocator, std.math.maxInt(usize));
}

// A small sample application demonstrating the snappy block format: it compresses the file
// named "encode" in the current directory and writes the compressed bytes to stdout.
//
// Decoding works the same way:
//
//     const input = try readFile(allocator, "input");
//     defer allocator.free(input);
//
//     const decoded = try snappy.decode(allocator, input);
//     defer allocator.free(decoded);
//
//     try stdout.writeAll(decoded);
pub fn main() !void {
    const allocator = std.heap.page_allocator;
    const stdout = std.io.getStdOut().writer();

    const encInput = try readFile(allocator, "encode");
    defer allocator.free(encInput);

    const encoded = try snappy.encode(allocator, encInput);
    defer allocator.free(encoded);

    // The output is binary, so write the bytes as-is rather than going through the formatter.
    try stdout.writeAll(encoded);
}
--------------------------------------------------------------------------------
/snappy.zig:
--------------------------------------------------------------------------------
const std = @import("std");
const Allocator = std.mem.Allocator;
const crc32 = std.hash.crc;
const mem = std.mem;
const testing = std.testing;

// Element tags stored in the low two bits of the first byte of every block element.
const tagLiteral = 0x00;
const tagCopy1 = 0x01;
const tagCopy2 = 0x02;
const tagCopy4 = 0x03;

const checksumSize = 4;
const chunkHeaderSize = 4;
const magicBody = "sNaPpY";
const magicChunk = "\xff\x06\x00\x00" ++ magicBody;

const maxBlockSize = 65536;
const maxEncodedLenOfMaxBlockSize = 76490;

const inputMargin = 16 - 1;
const minNonLiteralBlockSize = 1 + 1 + inputMargin;

const obufHeaderLen = magicChunk.len + checksumSize + chunkHeaderSize;
const obufLen = obufHeaderLen + maxEncodedLenOfMaxBlockSize;

const chunkTypeCompressedData = 0x00;
const chunkTypeUncompressedData = 0x01;
const chunkTypePadding = 0xfe;
const chunkTypeStreamIdentifier = 0xff;

// Various errors that may occur while decoding.
const SnappyError = error{
    Corrupt,
    TooLarge,
    Unsupported,
};

// Perform the CRC hash per the snappy documentation. We must use wrapping addition since this is
// the default behavior in other languages.
fn crc(b: []const u8) u32 {
    const c = crc32.Crc32SmallWithPoly(.Castagnoli);
    const hash = c.hash(b);
    return @as(u32, hash >> 15 | hash << 17) +% 0xa282ead8;
}

// Represents a variable length integer that we read from a byte stream along with how many bytes
// were read to decode it. `bytesRead == 0` means that no valid value could be decoded.
const Varint = struct {
    value: u64,
    bytesRead: usize,
};

// The largest number of bytes a varint-encoded 64-bit value can occupy.
const maxVarintLen64 = 10;

// https://golang.org/pkg/encoding/binary/#Uvarint
//
// Decode an unsigned varint from the start of `buf`. On success `bytesRead` is the number of
// bytes consumed (1..10). If `buf` ends before the value is complete, or the encoding does not
// fit in 64 bits, the result is `.{ .value = 0, .bytesRead = 0 }`. `bytesRead` is unsigned, so
// both failure modes share the zero sentinel instead of Go's negative count.
fn uvarint(buf: []const u8) Varint {
    const invalid = Varint{ .value = 0, .bytesRead = 0 };

    var x: u64 = 0;
    var s: u6 = 0; // Shift for the next 7-bit group; never exceeds 63.

    for (buf, 0..) |b, i| {
        if (i == maxVarintLen64 - 1) {
            // The tenth byte may only contribute bit 63 and must terminate the value. Deciding
            // here also keeps `s` from being advanced past 63, which would overflow the u6.
            if (b > 1) {
                return invalid;
            }
            return Varint{
                .value = x | (@as(u64, b) << s),
                .bytesRead = i + 1,
            };
        }
        if (b < 0x80) {
            return Varint{
                .value = x | (@as(u64, b) << s),
                .bytesRead = i + 1,
            };
        }
        x |= (@as(u64, b & 0x7f) << s);
        s += 7;
    }

    // Ran out of input before seeing a terminating byte.
    return invalid;
}

// https://golang.org/pkg/encoding/binary/#PutUvarint
//
// Write `x` to the start of `buf` as a varint and return the number of bytes written. `buf` must
// have room for the encoding (at most 10 bytes).
fn putUvarint(buf: []u8, x: u64) usize {
    var i: usize = 0;
    var mutX = x;

    while (mutX >= 0x80) {
        buf[i] = @as(u8, @truncate(mutX)) | 0x80;
        mutX >>= 7;
        i += 1;
    }
    buf[i] = @as(u8, @truncate(mutX));

    return i + 1;
}

// This type represents the size of the snappy block and the header length.
const SnappyBlock = struct {
    blockLen: u64,
    headerLen: usize,
};

// Return the length of the decoded block and the number of bytes that the header occupied.
// Fails with `Corrupt` when the length prefix is missing, truncated or larger than 32 bits, and
// with `TooLarge` when the length cannot be addressed on a 32-bit target.
fn decodedLen(src: []const u8) SnappyError!SnappyBlock {
    const varint = uvarint(src);
    if (varint.bytesRead == 0 or varint.value > 0xffffffff) {
        return SnappyError.Corrupt;
    }

    // Ask the compiler for the target's word size. The Go idiom `32 << (^uint(0) >> 32 & 1)`
    // does not translate: on arbitrary-precision comptime integers it always yields 64.
    if (@bitSizeOf(usize) == 32 and varint.value > 0x7fffffff) {
        return SnappyError.TooLarge;
    }

    return SnappyBlock{
        .blockLen = varint.value,
        .headerLen = varint.bytesRead,
    };
}

test "uvarint rejects truncated and overlong input" {
    // Continuation bit set on the last available byte.
    const truncated = uvarint(&[_]u8{ 0x80, 0x80 });
    try testing.expect(truncated.bytesRead == 0);

    // More than ten bytes can never encode a 64-bit value.
    const overlong = uvarint(&([_]u8{0xff} ** 11));
    try testing.expect(overlong.bytesRead == 0);
    try testing.expect(overlong.value == 0);

    // The tenth byte may only carry a single bit.
    const tooWide = uvarint(&([_]u8{0xff} ** 9 ++ [_]u8{0x02}));
    try testing.expect(tooWide.bytesRead == 0);

    const max = uvarint(&([_]u8{0xff} ** 9 ++ [_]u8{0x01}));
    try testing.expect(max.value == std.math.maxInt(u64));
    try testing.expect(max.bytesRead == 10);

    try testing.expectError(SnappyError.Corrupt, decodedLen(&([_]u8{0xff} ** 11)));
    try testing.expectError(SnappyError.Corrupt, decodedLen(&[_]u8{}));
}

test "putUvarint round trips through uvarint" {
    var buf: [maxVarintLen64]u8 = undefined;
    const values = [_]u64{ 0, 1, 127, 128, 300, 0xffffffff, std.math.maxInt(u64) };
    for (values) |value| {
        const written = putUvarint(&buf, value);
        const decoded = uvarint(buf[0..written]);
        try testing.expect(decoded.value == value);
        try testing.expect(decoded.bytesRead == written);
    }
}

// The block format decoding implementation.
// Decode the block elements in `src` (everything after the length prefix) into `dst`, which must
// be exactly as long as the decoded output. Returns 0 on success and 1 if the input is corrupt:
// an element is truncated, a copy refers to data before the start of the output, or the
// decoded size does not match `dst.len`.
//
// All positions and lengths are unsigned, so the function also builds on 32-bit targets.
fn runDecode(dst: []u8, src: []const u8) u8 {
    var d: usize = 0; // Next write position in dst.
    var s: usize = 0; // Next read position in src.

    while (s < src.len) {
        const tag = src[s];
        var offset: usize = 0;
        var length: usize = 0;

        // The low two bits select the element type, so the switch is exhaustive over u2.
        switch (@as(u2, @truncate(tag))) {
            tagLiteral => {
                // The upper six bits hold either len-1 directly (0..59) or, for 60..63, the
                // number of little-endian bytes (1..4) that follow and hold len-1.
                var x: u32 = tag >> 2;
                if (x < 60) {
                    s += 1;
                } else {
                    const extra: usize = x - 59;
                    s += 1 + extra;
                    if (s > src.len) {
                        return 1;
                    }
                    x = 0;
                    for (src[s - extra .. s], 0..) |byte, i| {
                        x |= @as(u32, byte) << @as(u5, @intCast(8 * i));
                    }
                }

                // Widen before adding one so that x == 0xffffffff cannot overflow.
                const literalLen = @as(u64, x) + 1;
                if (literalLen > dst.len - d or literalLen > src.len - s) {
                    return 1;
                }

                const n = @as(usize, @intCast(literalLen));
                @memcpy(dst[d..][0..n], src[s..][0..n]);
                d += n;
                s += n;
                continue;
            },
            tagCopy1 => {
                // 3-bit length (4..11) and 11-bit offset packed into two bytes.
                s += 2;
                if (s > src.len) {
                    return 1;
                }

                length = 4 + ((@as(usize, src[s - 2]) >> 2) & 0x7);
                offset = ((@as(usize, src[s - 2]) & 0xe0) << 3) | @as(usize, src[s - 1]);
            },
            tagCopy2 => {
                // 6-bit length (1..64) followed by a 16-bit little-endian offset.
                s += 3;
                if (s > src.len) {
                    return 1;
                }

                length = 1 + (@as(usize, src[s - 3]) >> 2);
                offset = @as(usize, src[s - 2]) | (@as(usize, src[s - 1]) << 8);
            },
            tagCopy4 => {
                // 6-bit length (1..64) followed by a 32-bit little-endian offset.
                s += 5;
                if (s > src.len) {
                    return 1;
                }

                length = 1 + (@as(usize, src[s - 5]) >> 2);
                const wide = @as(u32, src[s - 4]) | (@as(u32, src[s - 3]) << 8) |
                    (@as(u32, src[s - 2]) << 16) | (@as(u32, src[s - 1]) << 24);
                offset = wide;
            },
        }

        // A copy must point at bytes that were already produced and must fit in the output.
        if (offset == 0 or d < offset or length > dst.len - d) {
            return 1;
        }

        const from = d - offset;
        if (offset >= length) {
            // Source and destination ranges do not overlap.
            @memcpy(dst[d..][0..length], dst[from..][0..length]);
        } else {
            // Overlapping copy: bytes written by this copy are read again later in the same
            // copy (run-length style), so it has to proceed front to back one byte at a time.
            var i: usize = 0;
            while (i < length) : (i += 1) {
                dst[d + i] = dst[from + i];
            }
        }
        d += length;
    }

    if (d != dst.len) {
        return 1;
    }

    return 0;
}

/// Given a chosen allocator and the source input, decode it using the snappy block format. The
/// returned slice must be freed.
/// Fails with `SnappyError.Corrupt` when `src` is not a valid block and with
/// `SnappyError.TooLarge` when the decoded size cannot be addressed on this target.
pub fn decode(allocator: Allocator, src: []const u8) ![]u8 {
    const block = try decodedLen(src);

    // Never slice past the input, whatever header length was reported.
    if (block.headerLen > src.len) {
        return SnappyError.Corrupt;
    }
    const dstLen = std.math.cast(usize, block.blockLen) orelse return SnappyError.TooLarge;

    const dst = try allocator.alloc(u8, dstLen);
    errdefer allocator.free(dst);

    // Skip past how many bytes we read to get the length.
    if (runDecode(dst, src[block.headerLen..]) != 0) {
        return SnappyError.Corrupt;
    }

    return dst;
}

// TODO: Split up encode and decode into separate files once I better understand modules.

// Write `lit` to `dst` as a single literal element and return the number of bytes written.
// `lit` must hold between 1 and 65536 bytes, and `dst` must have room for the 1-3 byte header
// plus the literal itself.
fn emitLiteral(dst: []u8, lit: []const u8) usize {
    const n = lit.len - 1;
    var i: usize = 0;

    if (n < 60) {
        // Length fits in the tag byte.
        dst[0] = @as(u8, @intCast(n)) << 2 | tagLiteral;
        i = 1;
    } else if (n < 256) {
        // One extra length byte.
        dst[0] = 60 << 2 | tagLiteral;
        dst[1] = @as(u8, @intCast(n));
        i = 2;
    } else {
        // Two extra length bytes, little-endian. The low byte has to be truncated: n is at
        // least 256 here, so a checked @intCast to u8 would always trip.
        dst[0] = 61 << 2 | tagLiteral;
        dst[1] = @as(u8, @truncate(n));
        dst[2] = @as(u8, @truncate(n >> 8));
        i = 3;
    }
    @memcpy(dst[i..][0..lit.len], lit);

    return i + lit.len;
}

// Read a little-endian u32 from `b` starting at index `i`.
fn load32(b: []const u8, i: usize) u32 {
    const v = b[i .. i + 4];
    return @as(u32, v[0]) | @as(u32, v[1]) << 8 | @as(u32, v[2]) << 16 | @as(u32, v[3]) << 24;
}

// Read a little-endian u64 from `b` starting at index `i`.
fn load64(b: []const u8, i: usize) u64 {
    var v: u64 = 0;
    for (b[i .. i + 8], 0..) |byte, k| {
        v |= @as(u64, byte) << @as(u6, @intCast(8 * k));
    }
    return v;
}

// Multiplicative hash of four input bytes; `shift` selects how many of the top bits are kept.
fn snappyHash(u: u32, shift: u32) u32 {
    const s = @as(u5, @intCast(shift));
    return (u *% 0x1e35a7bd) >> s;
}

// Write one or more copy elements that together copy `length` bytes from `offset` bytes back,
// and return the number of bytes written to `dst`. Requires 1 <= offset <= 65535 and
// length >= 4.
fn emitCopy(dst: []u8, offset: usize, length: usize) usize {
    const offsetLow = @as(u8, @truncate(offset));
    const offsetHigh = @as(u8, @truncate(offset >> 8));

    var i: usize = 0;
    var remaining = length;

    // Emit maximum-length (64 byte) copies while at least 4 bytes would still be left over.
    while (remaining >= 68) : (remaining -= 64) {
        dst[i + 0] = 63 << 2 | tagCopy2;
        dst[i + 1] = offsetLow;
        dst[i + 2] = offsetHigh;
        i += 3;
    }

    // 65..67 bytes left: emit a 60 byte copy so that the final element is 5..7 bytes long.
    if (remaining > 64) {
        dst[i + 0] = 59 << 2 | tagCopy2;
        dst[i + 1] = offsetLow;
        dst[i + 2] = offsetHigh;
        i += 3;
        remaining -= 60;
    }

    if (remaining >= 12 or offset >= 2048) {
        // Three byte form: 6-bit length, 16-bit offset.
        dst[i + 0] = (@as(u8, @intCast(remaining)) -% 1) << 2 | tagCopy2;
        dst[i + 1] = offsetLow;
        dst[i + 2] = offsetHigh;
        return i + 3;
    }

    // Two byte form: 3-bit length, 11-bit offset.
    dst[i + 0] = offsetHigh << 5 | (@as(u8, @intCast(remaining)) -% 4) << 2 | tagCopy1;
    dst[i + 1] = offsetLow;
    return i + 2;
}

// Compress one block into `dst` and return the number of bytes written. `src` must hold
// between `minNonLiteralBlockSize` and `maxBlockSize` bytes, and `dst` must have room for the
// worst case (see `maxEncodedLen`). The length prefix is not written here.
fn encodeBlock(dst: []u8, src: []const u8) usize {
    const maxTableSize = 1 << 14;
    const tableMask = maxTableSize - 1;

    // Size the hash table to the input: between 2^8 and 2^14 slots. `shift` keeps exactly
    // log2(tableSize) bits of the hash.
    var shift: u32 = 32 - 8;
    var tableSize: usize = 1 << 8;
    while (tableSize < maxTableSize and tableSize < src.len) : (tableSize *= 2) {
        shift -= 1;
    }

    // Maps a hash to the most recent position with that hash. Positions fit in a u16 because
    // a block never exceeds 65536 bytes.
    var table = mem.zeroes([maxTableSize]u16);

    // The last `inputMargin` bytes are never used as the start of a match, which lets the
    // loads below read ahead without further bounds checks.
    const sLimit = src.len - inputMargin;

    var d: usize = 0; // Bytes written to dst.
    var nextEmit: usize = 0; // Start of the bytes not yet emitted.
    var s: usize = 1; // Current position in src.
    var nextHash = snappyHash(load32(src, s), shift);

    outer: while (true) {
        // Look for a four byte match. The step grows the longer nothing is found, so
        // incompressible input is skipped quickly.
        var skip: usize = 32;
        var nextS = s;
        var candidate: usize = 0;

        while (true) {
            s = nextS;
            const step = skip >> 5;
            nextS = s + step;
            skip += step;
            if (nextS > sLimit) {
                break :outer;
            }
            const slot = nextHash & tableMask;
            candidate = table[slot];
            table[slot] = @as(u16, @intCast(s));
            nextHash = snappyHash(load32(src, nextS), shift);
            if (load32(src, s) == load32(src, candidate)) {
                break;
            }
        }

        // Everything before the match has not been emitted yet.
        d += emitLiteral(dst[d..], src[nextEmit..s]);

        // Emit copies for as long as a new match starts right where the previous one ended.
        while (true) {
            const base = s;
            s += 4;
            var i = candidate + 4;
            while (s < src.len and src[i] == src[s]) {
                i += 1;
                s += 1;
            }

            d += emitCopy(dst[d..], base - candidate, s - base);
            nextEmit = s;
            if (s >= sLimit) {
                break :outer;
            }

            // Record position s-1 and probe position s with a single eight byte load.
            const x = load64(src, s - 1);
            const prevHash = snappyHash(@as(u32, @truncate(x)), shift);
            table[prevHash & tableMask] = @as(u16, @intCast(s - 1));

            const current = @as(u32, @truncate(x >> 8));
            const currHash = snappyHash(current, shift);
            candidate = table[currHash & tableMask];
            table[currHash & tableMask] = @as(u16, @intCast(s));

            if (current != load32(src, candidate)) {
                nextHash = snappyHash(@as(u32, @truncate(x >> 16)), shift);
                s += 1;
                break;
            }
        }
    }

    // Whatever is left goes out as one final literal.
    if (nextEmit < src.len) {
        d += emitLiteral(dst[d..], src[nextEmit..]);
    }

    return d;
}

/// Encode returns the encoded form of the source input. The returned slice must be freed.
/// Fails with `SnappyError.TooLarge` when the input is too long for the block format. `src` is
/// only read.
pub fn encode(allocator: Allocator, src: []const u8) ![]u8 {
    const encodedLen = maxEncodedLen(src.len);
    if (encodedLen < 0) {
        return SnappyError.TooLarge;
    }

    // Worst-case sized scratch buffer; the exact-size result is copied out at the end.
    const scratch = try allocator.alloc(u8, @as(usize, @intCast(encodedLen)));
    defer allocator.free(scratch);

    var d = putUvarint(scratch, src.len);

    // Walk the input in blocks of at most maxBlockSize bytes. The blocks are views into `src`,
    // so nothing is copied or freed per block.
    var rest = src;
    while (rest.len > 0) {
        const n = @min(rest.len, maxBlockSize);
        const block = rest[0..n];
        rest = rest[n..];

        if (block.len < minNonLiteralBlockSize) {
            // Too short for the matcher; store it verbatim.
            d += emitLiteral(scratch[d..], block);
        } else {
            d += encodeBlock(scratch[d..], block);
        }
    }

    const output = try allocator.alloc(u8, d);
    @memcpy(output, scratch[0..d]);

    return output;
}

/// Return the maximum length of a snappy block, given the uncompressed length. Returns -1 when
/// the input is too large to be encoded.
pub fn maxEncodedLen(srcLen: usize) isize {
    const limit = 0xffffffff;

    const n: u64 = srcLen;
    if (n > limit) {
        return -1;
    }

    const worstCase = 32 + n + n / 6;
    if (worstCase > limit) {
        return -1;
    }

    // On 32-bit targets the bound may not fit in an isize.
    return std.math.cast(isize, worstCase) orelse -1;
}

test "snappy crc" {
    try testing.expectEqual(@as(u32, 0x293d0c23), crc("snappy"));
}

test "decoding variable integers" {
    // Taken from the block format description.
    const case1 = uvarint(&[_]u8{0x40});
    try testing.expect(case1.value == 64);
    try testing.expect(case1.bytesRead == 1);

    const case2 = uvarint(&[_]u8{ 0xfe, 0xff, 0x7f });
    try testing.expect(case2.value == 2097150);
    try testing.expect(case2.bytesRead == 3);
}

test "simple encode" {
    const allocator = testing.allocator;

    const output = try encode(allocator, "this");
    defer allocator.free(output);

    try testing.expectEqualSlices(u8, "\x04\x0cthis", output);
}

test "simple decode" {
    const allocator = testing.allocator;

    const decoded = try decode(allocator, "\x19\x1coh snap,\x05\x06,py is cool!\x0a");
    defer allocator.free(decoded);

    try testing.expectEqualSlices(u8, "oh snap, snappy is cool!\n", decoded);
}

test "encode empty input" {
    const allocator = testing.allocator;

    const output = try encode(allocator, "");
    defer allocator.free(output);

    try testing.expectEqualSlices(u8, "\x00", output);
}

test "round trip with literal runs longer than 256 bytes" {
    const allocator = testing.allocator;

    // Pseudo-random bytes contain no useful matches, so they are stored as one long literal
    // that needs the two byte length form.
    var input: [1000]u8 = undefined;
    var state: u32 = 0x2545f491;
    for (&input) |*byte| {
        state = state *% 1664525 +% 1013904223;
        byte.* = @as(u8, @truncate(state >> 24));
    }

    const encoded = try encode(allocator, &input);
    defer allocator.free(encoded);
    const decoded = try decode(allocator, encoded);
    defer allocator.free(decoded);

    try testing.expectEqualSlices(u8, &input, decoded);
}

test "round trip across multiple blocks" {
    const allocator = testing.allocator;

    // Longer than maxBlockSize, alternating incompressible and highly repetitive stretches.
    var input: [maxBlockSize + 4464]u8 = undefined;
    var state: u32 = 0x9e3779b9;
    for (&input, 0..) |*byte, i| {
        state = state *% 1664525 +% 1013904223;
        byte.* = if ((i / 512) % 2 == 0)
            @as(u8, @truncate(state >> 24))
        else
            @as(u8, @truncate(i % 7));
    }

    const encoded = try encode(allocator, &input);
    defer allocator.free(encoded);
    const decoded = try decode(allocator, encoded);
    defer allocator.free(decoded);

    try testing.expectEqualSlices(u8, &input, decoded);
}
--------------------------------------------------------------------------------