├── LICENSE ├── README.md ├── ftlogo.bmp ├── io7.ts ├── logo.io7 └── logo98.io7 /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Pierce Smith 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # io7 2 | An in-depth look into the compression format used to store Windows 95/98's boot graphics (which I am calling "IO7"). 
3 | 4 | ![image](https://github.com/user-attachments/assets/4bde3727-bbaa-4349-a35f-de56d2bc27d5) ![image](https://github.com/user-attachments/assets/73f11aea-abe2-4689-a253-8e26ca4dc19c) 5 | 6 | 7 | ## Background 8 | 9 | In Windows 95 and 98, you can override the default boot splash by placing 10 | your own bitmap in the root of the C: drive with the name `LOGO.SYS`. When 11 | booting, Windows will display your bitmap instead of the default one. 12 | 13 | But where is the original bitmap actually stored? As Wikipedia will tell us, 14 | it's kept in `IO.SYS`, a crucial DOS system file. But it's obscured with some 15 | kind of bespoke proprietary compression, so we cannot extract or replace it. 16 | 17 | Until now. Not being able to find a detailed description of this compression format 18 | anywhere on the internet, I spent days hex-editing, educated guessing, and bug 19 | hunting to fully understand how it worked, so that the world can finally unlock 20 | what Microsoft has been trying to hide from us for almost 30 years. 21 | 22 | ...or, you know, you could just keep using the `LOGO.SYS` trick... 
23 | 24 | ## Contents 25 | 26 | * `io7.ts`: A full description of the IO7 format, and a TypeScript implementation of a decompressor and compressor 27 | * `logo.io7`: Sample file for use with the script: the raw compressed bitmap data extracted directly from Windows 95's `IO.SYS` (offset `0x1B210` to offset `0x2B17A`) 28 | * `logo98.io7`: Sample file for use with the script: the raw compressed bitmap data extracted directly from Windows 98's `IO.SYS` (offset `0x1E030` to offset `0x2E285`) 29 | * `ftlogo.bmp`: Sample file for use with the script: An alternative boot splash in a `LOGO.SYS` compatible format that can be compressed and patched into `IO.SYS` on 95/98 30 | 31 | ## Running 32 | 33 | For the benefit of those not familiar with TypeScript: 34 | 35 | The easiest way to run a TypeScript file directly is to use `npm` to install `ts-node` globally, then run with `npx`: 36 | 37 | ```bash 38 | apt install npm # (or equivalent for your platform) 39 | npm install -g typescript ts-node 40 | npx ts-node io7.ts 41 | ``` 42 | 43 | If that doesn't work, Google "how to run a typescript file" and be very confused for the next 30-60 minutes. 44 | 45 | ## Acknowledgements 46 | 47 | I would have had no chance at understanding the IO7 format if it weren't for [this open source implementation](https://github.com/chenall/grub4dos/blob/3c1d05f39e49ec1d7543caa825df00068b96620b/stage2/builtins.c#L441-L621) 48 | of a decompressor for a very similar format used in Windows ME. Thank you, bean, whoever and wherever you are. 
49 | -------------------------------------------------------------------------------- /ftlogo.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierce-smith1/io7/64ea32e772db062f2300473703e55848c9835ebb/ftlogo.bmp -------------------------------------------------------------------------------- /io7.ts: -------------------------------------------------------------------------------- 1 | // 2 | // io7.ts 3 | // Windows 95/98 Boot Splash Decompressor and Compressor 4 | // 5 | // # BACKGROUND 6 | // 7 | // In Windows 95 and 98, you can override the default boot splash by placing 8 | // your own bitmap in the root of the C: drive with the name LOGO.SYS. When 9 | // booting, Windows will display your bitmap instead of the default one. 10 | // 11 | // But where is the original bitmap actually stored? As Wikipedia will tell us, 12 | // it's kept in IO.SYS, a crucial DOS system file. But it's obscured with some 13 | // kind of bespoke properietary compression, so we cannot extract or replace it. 14 | // 15 | // Until now. 16 | // 17 | // This script implements a (very inefficient) compressor and decompressor for 18 | // the custom compression format used for these graphics. While it can succesfully 19 | // extract and re-package logos for Windows 95 & 98, the main point of the 20 | // script is to serve as education and documentation for how this format works. 21 | // 22 | // The rest of this interlude will provide an exhaustive overview of what the 23 | // data format looks like, including every single gory detail. 24 | // Following this is the code itself, which provides a working 25 | // reference implementation for both compression and decompression. 26 | // 27 | // By default, this script does nothing when run - scroll to the very bottom and 28 | // uncomment one of the function calls to run the compressor or decompressor 29 | // on the example files that have been included with this script. 
30 | // (from https://github.com/pierce-smith1/io7) 31 | // 32 | // # THE FORMAT 33 | // 34 | // In Windows 95's IO.SYS, the compressed logo data begins at offset 0x1b120 35 | // and ends at offset 0x2b17a. There are no other compressed regions in the 36 | // file. It's the only example I know of of this exact compression format 37 | // being used. I'm going to call the format "IO7", after the IO from IO.SYS 38 | // and the fact that 9x's DOS identifies itself as DOS 7. 39 | // 40 | // Fundamentally, IO7 is a variant on LZSS. Data is split up into 41 | // tokens, coming one after another, that represent either literal bytes 42 | // or runs of bytes that have been previously seen. These tokens are grouped 43 | // into blocks that encode 8192 or fewer bytes of uncompressed data. Each 44 | // block has a header that tells how big the compressed size is and whether 45 | // or not it's the last block. Each block in an IO7 "file" comes immediately 46 | // after the last. 47 | // 48 | // We'll now go into each of these things in more detail. 49 | // 50 | // ## BLOCKS 51 | // 52 | // An IO7 file is made entirely of "blocks" - it starts with a block, and blocks 53 | // continue appearing one after another until the final block, after which the 54 | // file simply ends. 55 | // 56 | // Blocks contain an 8-byte header, which crucially contains the size of the 57 | // block, then an arbitrary number of bytes afterwards that contain the 58 | // token stream encoding the compressed data. Unlike the token stream itself, 59 | // blocks must be aligned to bytes, and in fact the end of the token stream 60 | // will be padded with 0 bits to ensure the next block starts on a byte. 61 | // 62 | // Block headers are structured as follows: 63 | // 64 | // byte 65 | // 0----1----2----3----4----5----6----7--- 66 | // size size ltag ltag 0x44 0x53 0x00 0x00 67 | // 68 | // The two size bytes are a 16-bit little-endian integer encoding the size 69 | // of the compressed data in this block. 
For whatever reason, the MSB is 70 | // always set to 1, meaning you must subtract 0x8000 to get the actual size. 71 | // For example the two size bytes may be 0xBE 0x88, encoding an actual size of 72 | // 0x88BE - 0x8000 = 0x08BE. 73 | // 74 | // The two ltag bytes are either [0x00 0x20] or [0x36 0x18]. They are [0x36 0x18] 75 | // if and only if this is the last block in the file. 76 | // 77 | // After the ltag comes the fixed bytes [0x44 0x53 0x00 0x00] (DS..). Looking for 78 | // this DS.. pattern can quickly identify the starts of compressed blocks in a 79 | // hex editor. For the purposes of the block size, this byte pattern is also 80 | // where the compressed data starts, i.e. the block size counts these 81 | // four bytes. 82 | // 83 | // ## TOKEN STREAMS 84 | // 85 | // Immediately after a block header, the compressed data begins. The compressed 86 | // data is a stream of tokens, each token coming one after another. Tokens are 87 | // not aligned to bytes and often have sizes in bits that are not multiples 88 | // of 8, so they must be read and written bit-by-bit. 89 | // 90 | // In the file, streams of bits are encoded by chopping the stream into bytes, 91 | // where each byte contains 8 bits ordered from LSB to MSB. So for example, say 92 | // we want to encode the following stream of bits (where letters a-p are bits): 93 | // 94 | // ---------------> 95 | // abcdefghijklmnop 96 | // 97 | // They will appear in the bytes of the file as such: 98 | // 99 | // <------- <------- 100 | // hgfedcba ponmlkji 101 | // 0 1 102 | // 103 | // The bytes are read left to right, but the bits in those bytes are read 104 | // right to left. Put another way, the "first" bit in a byte is in the 105 | // least significant position. 106 | // 107 | // ### TOKENS 108 | // 109 | // There are two main kinds of token: a literal token and an offset-length 110 | // token. Each one represents one or more bytes of uncompressed data. 
111 | // 112 | // #### LITERAL TOKENS 113 | // 114 | // A literal token encodes a single byte of uncompressed data. For notational 115 | // purposes, we can write these tokens as simply #hh, where hh are the two 116 | // hexadecimal digits of the encoded byte. For example, if we had a stream of 117 | // the following tokens: 118 | // 119 | // #00 #11 #22 120 | // 121 | // This encodes the uncompressed data [0x00 0x11 0x22]. 122 | // 123 | // In the bitstream, literal tokens are 9 bits long. There is a single flag bit 124 | // followed immediately by the 8 bits of the byte. A literal token looks as 125 | // follows in a stream: 126 | // 127 | // ---------> 128 | // ...f Dddddddd... 129 | // ^ ^ 130 | // | | 131 | // ' flag bit 132 | // | 133 | // ' data bits - D is the MSB 134 | // 135 | // The flag bit is the inverse of the MSB of the data. If D is 0, then f is 1 - 136 | // if D is 1, then f is 0. For example, we would encode the byte 0xB5 in a bit 137 | // stream as follows: 138 | // 139 | // ---------> 140 | // ...0 10110101... 141 | // ^ ^ 142 | // | | 143 | // ' flag bit, is 0 because the MSB of the data is 1 144 | // | 145 | // ' the byte (0xB5) as binary 146 | // 147 | // #### OFFSET-LENGTH TOKENS 148 | // 149 | // Offset-length tokens encode a sequence of at least 2 bytes through two 150 | // numbers: an _offset_ into the uncompressed data, and a _length_ of how many 151 | // bytes to read from this offset. 152 | // 153 | // For notational purposes, we can write these tokens as , where O is the 154 | // offset and L is the length. O must be >= 1, and L must be >= 2. 155 | // 156 | // The offset is backwards relative to our current position in the uncompressed 157 | // stream of bytes, and the length is the count of bytes to read. For example, 158 | // say we have the following token stream: 159 | // 160 | // #0a #1b #2c #3d <3,2> 161 | // 162 | // The first four tokens represent the uncompressed data [0x0a 0x1b 0x2c 0x3d]. 
163 | // Now, when the token <3,2> is encountered, this tells us to look back 3 bytes 164 | // from the end of our uncompressed data, then read 2 bytes forward. 165 | // 166 | // [0x0a 0x1b 0x2c 0x3d] 167 | // ^ 168 | // | 169 | // ' start past the end 170 | // 171 | // [0x0a (0x1b 0x2c 0x3d] 172 | // ^ 173 | // | 174 | // ' move 3 bytes backwards... 175 | // 176 | // [0x0a (0x1b 0x2c) 0x3d] 177 | // ^ 178 | // | 179 | // ' ...then 2 bytes forwards 180 | // 181 | // (0x1b 0x2c) is what <3,2> encodes 182 | // 183 | // So, in this stream, <3,2> encodes [0x1b 0x2c], and our final uncompressed data 184 | // is the sequence [0x0a 0x1b 0x2c 0x3d 0x1b 0x2c]. 185 | // 186 | // It is valid for the length to be greater than the offset. In this case, 187 | // bytes read off the end of the uncompressed data repeat the data read from 188 | // from the offset to the end of the data. For example, take this stream of tokens: 189 | // 190 | // #00 #11 #22 #33 <3,7> 191 | // 192 | // To read the <3,7> token: 193 | // 194 | // [0x00 0x11 0x22 0x33] 195 | // ^ 196 | // | 197 | // ' start past the end 198 | // 199 | // [0x00 (0x11 0x22 0x33)] 200 | // ^ 201 | // | 202 | // ' 3 bytes backwards... 203 | // 204 | // [0x00 (0x11 0x22 0x33 ?? ... 205 | // ^ 206 | // | 207 | // ' ...then 7 bytes forwards - 208 | // but uh oh, we're out of bytes to read! 209 | // 210 | // [0x00 (0x11 0x22 0x33 0x11 ... 211 | // ^ ^ 212 | // '---------------| 213 | // ' ...so we pretend the bytes repeat from 214 | // where we started reading! 215 | // 216 | // [0x00 (0x11 0x22 0x33 0x11 0x22 ... 217 | // ^ ^ 218 | // '--------------| 219 | // ' continue... 220 | // 221 | // [0x00 (0x11 0x22 0x33 0x11 0x22 0x33 0x11)] 222 | // ^ ^ 223 | // '--------------| 224 | // ' ...until we've made enough bytes 225 | // 226 | // (0x11 0x22 0x33 0x11 0x22 0x33 0x11) is what <3,7> encodes 227 | // 228 | // It is not valid for the offset to be greater than the length of the 229 | // uncompressed data. 
230 | // 231 | // In the bitstream, offset-length tokens are a variable number of bits. The 232 | // first group is bits encodes the offset and the second group encodes the 233 | // length, both of which are variable size. 234 | // 235 | // Consider the offset first. Depending on the offset's size, we use three 236 | // different encoding strategies. 237 | // 238 | // * If the offset is representable in 6 bits, i.e. if it is less than 64, 239 | // it is a "short" offset, and will be encoded in 8 bits. Start with two 0 bits, 240 | // then write the 6 bits of the offset. Example: An offset of 21 (0x15) will 241 | // appear as such: 242 | // 243 | // --------> 244 | // ...00 010101... 245 | // ^ ^ 246 | // | | 247 | // ' short offset flag 248 | // | 249 | // ' 21 (0x15) in binary 250 | // 251 | // * If the offset - 64 is repesentable in 8 bits, i.e. if it is >= 64 and < 320, 252 | // it is a "medium" offset, and will be encoded in 13 bits. Start with two 1 bits, 253 | // then a 0 bit, then write the 8 bits of the offset - 64. Example: An offset of 254 | // 152 (0x98) will appear as such: 255 | // 256 | // ...110 01011000... 257 | // ^ ^ 258 | // | | 259 | // ' medium offset flag 260 | // | 261 | // ' 152 - 64 = 88 (0x58) in binary 262 | // 263 | // * If the offset - 319 is representable in 12 bits i.e. if it is >= 320 and < 4415, 264 | // it is a "far" offset, and will be encoded in 15 bits. Start with three 1 bits, 265 | // then write the 12 bits of the offset - 320. Example: an offset of 2270 will 266 | // appear as such: 267 | // 268 | // ...111 011110011110... 269 | // ^ ^ 270 | // | | 271 | // ' far offset flag 272 | // | 273 | // ' 2270 - 320 = 1950 (0x79e) in binary 274 | // 275 | // Note that the offset cannot be 4415, even though 4415 - 320 = 4095 can be 276 | // represented in 12 bits as a sequence of all 1s. This would result in a 277 | // sequence of 15 consecutive 1s, which is interpreted as a special token 278 | // we will talk about later. 
279 | // 280 | // Note also that this defines 4414 as the largest representable offset. 281 | // 282 | // Immediately following the offset is the length. Again, the length is encoded 283 | // differently based on how large it is - however, this encoding scheme is more 284 | // mathematical than case-based. 285 | // 286 | // We split the length into two numbers, a "base" and a "tail". The base is the 287 | // largest number that is a power of 2 plus 1 which is still smaller than the 288 | // length, and the tail is simply the length minus the base. For example, if 289 | // the length is 13, the largest number n^2 + 1 that fits within it is 9 (2^3 + 1), 290 | // so the base is 9 and the tail is 13 - 9 = 4. Another example, if the length 291 | // is 40, the base is 33 (2^5 + 1) and the tail is 40 - 33 = 7. 292 | // 293 | // To represent the length in the bitstream, first encode the base by writing 294 | // n 0s, where n is the power of the base. Then write a 1, then finish by 295 | // writing the tail in n bits. 296 | // 297 | // Let's use 13 as an example again. The base is 2^3 + 1, so we write 3 298 | // 0 bits, then a 1 bit. Finally we write the tail, 4, in 3 bits. 299 | // 300 | // ...000 1 100... 301 | // ^ ^ ^ 302 | // | | | 303 | // n 0s, where n = the power of the base (in this case, 3) 304 | // | | 305 | // ' fixed 1 following the 0s 306 | // | 307 | // ' the tail (4) in 3 bits 308 | // 309 | // Another example: to encode a length of 40, the base is 2^5 + 1, so we 310 | // write 5 0 bits, then a 1 bit, and finally the tail of 7 in 5 bits. 311 | // 312 | // ...00000 1 00111... 313 | // ^ ^ ^ 314 | // | | | 315 | // n 0s, where n = the power of the base (in this case, 5) 316 | // | | 317 | // ' fixed 1 following the 0s 318 | // | 319 | // ' the tail (7) in 5 bits 320 | // 321 | // Let's put everything together into a final example. 322 | // 323 | // Say we want to encode the token <343,10>. The offset is 343 and the length 324 | // is 10. 
We start by encoding the offset. 325 | // 326 | // The offset is larger than 320 and smaller than 4415, so it is a "far" 327 | // offset, and it is encoded as follows: 328 | // 329 | // ..111 000000010111... 330 | // ^ ^ 331 | // | | 332 | // ' far offset flag 333 | // | 334 | // ' 343 - 320 = 27 (0x17) in binary in 12 bits 335 | // 336 | // Now immediately afterwards we encode the length. The length is 10, so we 337 | // have a base of 9 (2^3 + 1). The tail is 10 - 9 = 1. This adds to our encoding 338 | // as follows: 339 | // 340 | // ...111000000010111 000 1 001... 341 | // ^ ^ ^ ^ 342 | // | | | | 343 | // ' the offset ' three 0s (since the base is 2^3 + 1) 344 | // | | 345 | // ' fixed 1 following the 0s 346 | // | 347 | // ' the tail (1) in three bits 348 | // 349 | // And the final token is: 350 | // 351 | // ...1110000000101110001001... 352 | // 353 | // #### SENTINEL TOKENS 354 | // 355 | // There is a final type of token that encodes no data. It MUST appear as the 356 | // final token of a block, and it also MUST appear after 512 bytes of 357 | // uncompressed data have been encoded. In other words, every sequence of 358 | // tokens between two sentinel tokens MUST encode 512 or fewer bytes of data. 359 | // This means is also invalid for a single token to encode more than 512 bytes. 360 | // 361 | // Its bitstream representation is simply 15 consecutive 1s. 362 | // 363 | // That's all! That's the compression format in its entirety. 364 | // We now know enough to build a working compressor and decompressor. 365 | // 366 | // The rest of this file will contain a reference implementation for a 367 | // decompressor and compressor, using what we've detailed above. 
But first, I 368 | // want to quickly make an acknowledgement: 369 | // 370 | // # ACKNOWLEDGEMENTS 371 | // 372 | // This work is indebted to a programmer simply credited "bean", who wrote a 373 | // decompressor for the Windows ME IO.SYS format for an open-source project 374 | // called grub4dos. 375 | // 376 | // (https://github.com/chenall/grub4dos/blob/3c1d05f39e49ec1d7543caa825df00068b96620b/stage2/builtins.c#L441-L621) 377 | // 378 | // Their code was instrumental in guiding me through the reverse-engineering 379 | // process. It it wasn't for them, I would never have made it even close to 380 | // understanding the IO7 format and being able to write these algorithms. 381 | // 382 | // Thank you, bean. 383 | // 384 | 385 | // 386 | // # ALGORITHMS 387 | // 388 | // What follows now is some actual code for implementing compression and 389 | // decompression in the IO7 format. 390 | // 391 | 392 | // We define a convenience class that wraps an array of bytes and lets us 393 | // read and write from it as if it were a stream of bits. 394 | class BitStream { 395 | constructor ( 396 | // Throughout this program, we will use the `number` type to implicitly 397 | // mean a byte, and just make the assumption it always stays in the range 398 | // 0 - 255. 399 | private bytes: Array, 400 | 401 | // bit_pos represents our position in the current byte - as such it 402 | // stays between 0 and 7. 403 | // 0 means we're at the LSB of the byte, and 7 the MSB. 404 | private bit_pos = 0, 405 | 406 | // byte_pos is effectively an index into the byte array. 407 | // As we travel bit-by-bit through the stream, byte_pos will increase 408 | // slowly without bound, and bit_pos will cycle between 0 and 7. 
409 | private byte_pos = 0, 410 | ) {} 411 | 412 | get_bytes() { 413 | return [...this.bytes]; 414 | } 415 | 416 | get_bit_offset() { 417 | return this.bit_pos + (this.byte_pos * 8); 418 | } 419 | 420 | advance_bit_pos() { 421 | this.bit_pos++; 422 | 423 | if (this.bit_pos >= 8) { 424 | this.bit_pos = 0; 425 | this.byte_pos++; 426 | } 427 | } 428 | 429 | advance_bit_pos_by(num_bits: number) { 430 | for (let i = 0; i < num_bits; i++) { 431 | this.advance_bit_pos(); 432 | } 433 | } 434 | 435 | reverse_bit_pos() { 436 | this.bit_pos--; 437 | 438 | if (this.bit_pos < 0) { 439 | this.bit_pos = 7; 440 | this.byte_pos--; 441 | } 442 | } 443 | 444 | reverse_bit_pos_by(num_bits: number) { 445 | for (let i = 0; i < num_bits; i++) { 446 | this.reverse_bit_pos(); 447 | } 448 | } 449 | 450 | // This returns the special sentinel "eof" if we run out of bits. 451 | read_bits(num_bits: number): number | "eof" { 452 | const end_bit = (this.bit_pos + num_bits) % 8; 453 | const end_byte = this.byte_pos + Math.floor((this.bit_pos + num_bits) / 8); 454 | 455 | let result = 0; 456 | let bits_read = 0; 457 | while (this.bit_pos !== end_bit || this.byte_pos !== end_byte) { 458 | const byte = this.bytes[this.byte_pos]; 459 | 460 | if (byte === undefined) { 461 | return "eof"; 462 | } 463 | 464 | const next_bit = (byte & (1 << this.bit_pos)) ? 
1 : 0; 465 | result += next_bit << bits_read++; 466 | this.advance_bit_pos(); 467 | } 468 | 469 | return result; 470 | } 471 | 472 | write_bits(n: number, num_bits: number) { 473 | let bits_written = 0; 474 | while (bits_written < num_bits) { 475 | this.write_bit((n >>> bits_written & 1) as 0 | 1); 476 | bits_written++; 477 | } 478 | } 479 | 480 | write_bit(bit: 0 | 1) { 481 | if (this.byte_pos >= this.bytes.length) { 482 | this.bytes.push(0); 483 | } 484 | 485 | this.bytes[this.byte_pos] += bit << this.bit_pos; 486 | 487 | this.advance_bit_pos(); 488 | } 489 | 490 | is_at_end(): boolean { 491 | return this.bytes[this.byte_pos] === undefined; 492 | } 493 | } 494 | 495 | // A model of our tokens will prove very useful. 496 | // They are perfectly described by a tagged union. 497 | type Token = 498 | | {type: "literal", value: number} 499 | | {type: "offset", offset: number, length: number} 500 | | {type: "sentinel"} 501 | 502 | // 503 | // ## DECOMPRESSION ROUTINES 504 | // 505 | 506 | // This function will read a token from a stream of bits, advancing the stream 507 | // to the start of the next token. 508 | // It will return undefined when there are no more valid tokens to read. 509 | // The idea is that you can call this repeatedly on the same bitstream until it 510 | // returns undefined to enumerate all the tokens in the stream. 511 | function read_token(bits: BitStream): Token | undefined { 512 | // To avoid having to constantly check for "eof" after reading from the 513 | // stream, we define this helper that throws an exception on eof. Then in 514 | // our logic, we trap the exception to immediately return undefined. 515 | function read_bits_or_die(num_bits: number): number { 516 | const result = bits.read_bits(num_bits); 517 | if (result === "eof") { 518 | throw new Error("Out of bits"); 519 | } 520 | 521 | return result; 522 | } 523 | 524 | try { 525 | // We need to read at least two bits to start figuring out what kind of 526 | // token we're looking at. 
527 | const tag = read_bits_or_die(2); 528 | 529 | // We can tell if this is a literal byte by if the two bits of the tag 530 | // are different. 531 | if (tag === 1 || tag === 2) { 532 | const literal_byte_end = read_bits_or_die(7); 533 | 534 | // This reconstructs the original byte from part of the tag, since 535 | // they overlap. 536 | const literal_byte = literal_byte_end + ((tag & 1) << 7); 537 | 538 | return {type: "literal", value: literal_byte}; 539 | } 540 | 541 | // If we get here, we must be reading either a sentinel or an 542 | // offset-length token. 543 | // A sentinel resembles a far offset-length token enough that we can 544 | // check for it in the process of parsing an offset-length token, so at 545 | // this point we just assume we're looking at an offset-length token. 546 | const offset = (() => { 547 | if (tag === 0) { 548 | const short_offset = read_bits_or_die(6); 549 | return short_offset; 550 | } 551 | 552 | const is_far_offset = bits.read_bits(1); 553 | if (is_far_offset === 1) { 554 | const far_offset = read_bits_or_die(12); 555 | return far_offset + 0x140; 556 | } 557 | 558 | const med_offset = read_bits_or_die(8); 559 | return med_offset + 0x40; 560 | })(); 561 | 562 | // 0x113F - 0x140 === 0xFFF. So with the three 1s of the tag included, 563 | // if we see this exact offset, we've seen fifteen 1s in a row. 564 | if (offset === 0x113F) { 565 | return {type: "sentinel"}; 566 | } 567 | 568 | const length = (() => { 569 | let bits_in_length = 0; 570 | while (read_bits_or_die(1) === 0) { 571 | bits_in_length++; 572 | } 573 | 574 | const base = (1 << bits_in_length) + 1; 575 | const tail = read_bits_or_die(bits_in_length); 576 | 577 | const length = base + tail; 578 | return length; 579 | })(); 580 | 581 | return {type: "offset", offset, length}; 582 | } catch { 583 | // Using exceptions for control flow - very naughty indeed! 
584 | return undefined; 585 | } 586 | } 587 | 588 | // This function unpacks a stream of tokens into the uncompressed bytes 589 | // they represent. 590 | function decode_tokens(tokens: Array): Array { 591 | // We need to keep track of the bytes we've uncompressed so far so that we 592 | // can decompress offset-length tokens. 593 | let output: Array = []; 594 | 595 | function segment_to_bytes(segment: Token): Array { 596 | switch (segment.type) { 597 | case "literal": return [segment.value]; 598 | case "offset": { 599 | const end_index = -segment.offset + segment.length; 600 | let new_bytes = [...output.slice(-segment.offset, end_index >= 0 ? undefined : end_index)]; 601 | 602 | // If the length was greater than the offset, we tried to read 603 | // past the end of the array. 604 | // We need to fill the missing pieces with the last byte of 605 | // the output. 606 | while (new_bytes.length !== segment.length) { 607 | new_bytes.push(new_bytes.at(-segment.offset)); 608 | } 609 | 610 | return new_bytes; 611 | } 612 | case "sentinel": return []; 613 | } 614 | } 615 | 616 | for (const segment of tokens) { 617 | const bytes = segment_to_bytes(segment); 618 | output = [...output, ...bytes]; 619 | } 620 | 621 | return output; 622 | } 623 | 624 | // This separates a bitstream into blocks of raw compressed data by searching 625 | // for, splitting on, and stripping out DS headers. 626 | // The result is an array of bitstreams that contain raw data from which we 627 | // can read tokens. 628 | function break_into_blocks(bits: BitStream): Array { 629 | let blocks = []; 630 | 631 | let block_size = bits.read_bits(16); 632 | while (!bits.is_at_end() && block_size !== "eof" && block_size >= 0x8000) { 633 | // Skip the rest of the DS header by moving forwards 6 bytes - 634 | // A DS header is 8 bytes but we already read 2 of them. 
635 | bits.advance_bit_pos_by(6 * 8); 636 | 637 | const this_block = []; 638 | for (let i = 0; i < block_size - 0x8000 - 4; i++) { 639 | // We can just assume these reads will never be eofs, since if they 640 | // are, the header lied to us about big the block is..! 641 | const byte = bits.read_bits(8) as number; 642 | this_block.push(byte); 643 | } 644 | 645 | blocks.push(this_block); 646 | 647 | block_size = bits.read_bits(16); 648 | } 649 | 650 | return blocks.map(block => new BitStream(block)); 651 | } 652 | 653 | // Putting everyting together, this function turns a bitstream holding a 654 | // complete IO7 file into a stream of uncompressed bytes! 655 | function decompress(compressed_bits: BitStream): Array { 656 | const blocks = break_into_blocks(compressed_bits); 657 | 658 | const output_bytes = blocks.flatMap(block => { 659 | let tokens = []; 660 | 661 | let next_token: Token | undefined; 662 | while ((next_token = read_token(block))) { 663 | tokens.push(next_token); 664 | } 665 | 666 | const output = decode_tokens(tokens); 667 | return output; 668 | }); 669 | 670 | return output_bytes; 671 | } 672 | 673 | // 674 | // ## COMPRESSION ROUTINES 675 | // 676 | 677 | // This function turns a token into its bit representation and writes it to 678 | // a bitstream. 679 | function write_token(segment: Token, bits: BitStream) { 680 | if (segment.type === "literal") { 681 | const byte = segment.value; 682 | 683 | const high_bit = byte & (1 << 7) ? 1 : 0; 684 | 685 | bits.write_bit(high_bit); 686 | bits.write_bit(high_bit === 0 ? 
1 : 0); 687 | bits.write_bits(byte, 7); 688 | } else if (segment.type === "offset") { 689 | const {offset, length} = segment; 690 | 691 | const offset_size = (() => { 692 | if (offset < 0x40) { 693 | return "short"; 694 | } else if (offset < 0x140) { 695 | return "medium"; 696 | } else { 697 | return "far"; 698 | } 699 | })(); 700 | 701 | if (offset_size === "short") { 702 | bits.write_bits(0, 2); 703 | bits.write_bits(offset, 6); 704 | } else { 705 | bits.write_bits(3, 2); 706 | } 707 | 708 | if (offset_size === "medium") { 709 | bits.write_bit(0); 710 | bits.write_bits(offset - 0x40, 8); 711 | } 712 | 713 | if (offset_size === "far") { 714 | bits.write_bit(1); 715 | bits.write_bits(offset - 0x140, 12); 716 | } 717 | 718 | let bits_in_length = (length => { 719 | let num_bits = 0; 720 | while (length >>> num_bits) { 721 | num_bits++; 722 | } 723 | return num_bits; 724 | })(length - 1); 725 | 726 | for (let i = 0; i < bits_in_length - 1; i++) { 727 | bits.write_bit(0); 728 | } 729 | 730 | bits.write_bit(1); 731 | 732 | bits.write_bits(length - 1 - (1 << bits_in_length), bits_in_length - 1); 733 | } else { 734 | bits.write_bits(0xFFFF, 15); 735 | } 736 | } 737 | 738 | // This function turns a stream of uncompressed bytes into a stream of tokens 739 | // that encode the compressed data. 740 | // This is where the compression magic really happens. 741 | function encode_bytes(bytes: Array): Array { 742 | const bits = new BitStream(bytes); 743 | 744 | // We need to keep track of where we are in the uncompressed data so we can 745 | // correctly count offsets for offset-length tokens. 
746 | let input_cursor = 0; 747 | const output_segments: Array = []; 748 | 749 | function commit_segment(segment: Token, from_bytes: Array) { 750 | input_cursor += from_bytes.length; 751 | output_segments.push(segment); 752 | } 753 | 754 | function array_equal(a1: Array, a2: Array): boolean { 755 | return a1.length === a2.length && a1.every((_, i) => a1[i] === a2[i]); 756 | } 757 | 758 | // A "phrase" is some arbitray run of bytes. This helper searches through 759 | // the input we have currently processed to see if that phrase has 760 | // already appeared. 761 | // It's used to tell if we can use an offset-length token to encode the 762 | // given phrase. 763 | // It returns the index the phrase begins in our input bytes, or undefined 764 | // if it can't be found. 765 | function find_phrase_in_input(phrase: Array): number | undefined { 766 | const seen_input = bytes.slice(0, input_cursor); 767 | 768 | // Using findLastIndex here helps keep the offset small, since offsets 769 | // are negative relative to the end. 770 | const phrase_index = seen_input.findLastIndex((byte, i, bytes) => { 771 | let searched_bytes = bytes.slice(i, i + phrase.length); 772 | 773 | if (searched_bytes.length < phrase.length) { 774 | searched_bytes = [...searched_bytes, ...new Array(phrase.length - searched_bytes.length).fill(seen_input.at(-1))]; 775 | } 776 | 777 | return array_equal(phrase, searched_bytes); 778 | }); 779 | 780 | return phrase_index !== -1 ? phrase_index : undefined; 781 | } 782 | 783 | // The main compression loop, running until we're out of input. 784 | while (!bits.is_at_end()) { 785 | // We want to opportunistically read two bits at a time so we can 786 | // start trying to build a phrase that could be encoded in an 787 | // offset-length token. 788 | // If we only ever read one byte at a time, we can never make a phrase, 789 | // since they have to be at least two bytes. 
790 | let maybe_phrase = [bits.read_bits(8), bits.read_bits(8)]; 791 | 792 | if (maybe_phrase[0] === "eof") { 793 | break; 794 | } 795 | 796 | if (maybe_phrase[1] === "eof") { 797 | commit_segment({type: "literal", value: maybe_phrase[0]}, [maybe_phrase[0]]); 798 | break; 799 | } 800 | 801 | let phrase = maybe_phrase as Array; 802 | 803 | let last_input_phrase_index: number | undefined; 804 | let input_phrase_index: number | undefined; 805 | 806 | // See if our phrase is in the input, and if it is, greedily keep 807 | // trying to grow it as much as possible. 808 | while ((input_phrase_index = find_phrase_in_input(phrase)) !== undefined) { 809 | const next_byte = bits.read_bits(8); 810 | 811 | // The -1 here is an ugly hack to handle the special case of our 812 | // phrase getting cut off by the end of the input. 813 | phrase = [...phrase, next_byte === "eof" ? -1 : next_byte]; 814 | last_input_phrase_index = input_phrase_index; 815 | } 816 | 817 | if (phrase.at(-1) !== -1) { 818 | // Once the above while loop exits, we will have read one byte that 819 | // didn't fit into our phrase - so we need to back up so that we can 820 | // consider that byte again in the next loop around. 821 | // The if condition guards against the case that the phrase was cut 822 | // off by the end of the stream, in which case we DON'T want to 823 | // reverse the stream since nothing was read. 824 | bits.reverse_bit_pos_by(8); 825 | } 826 | 827 | // The last byte we read was NOT part of this phrase. 
828 | phrase = phrase.slice(0, -1); 829 | 830 | if (phrase.length === 1) { 831 | commit_segment({type: "literal", value: phrase[0]}, [phrase[0]]); 832 | continue; 833 | } 834 | 835 | commit_segment({type: "offset", offset: input_cursor - last_input_phrase_index, length: phrase.length}, phrase); 836 | } 837 | 838 | commit_segment({type: "sentinel"}, []); 839 | return output_segments; 840 | } 841 | 842 | // This function reads up to 8192 bits of uncompressed data (enough to make a 843 | // block) from the input stream, and outputs a stream of tokens to store in 844 | // the block. 845 | // It returns undefined when the stream is over and no more blocks can be made. 846 | function encode_next_block(bits: BitStream): Array | undefined { 847 | if (bits.is_at_end()) { 848 | return undefined; 849 | } 850 | 851 | // There must be a sentinel every 512 bytes - 852 | // We call a stretch of 512 bytes a "field". 853 | const max_bytes_in_field = 512; 854 | const max_fields_in_block = 16; 855 | const max_bytes_in_block = max_bytes_in_field * max_fields_in_block; // 8192! 856 | 857 | const block_bytes = (() => { 858 | let bytes: Array = []; 859 | 860 | for (let i = 0; i < max_bytes_in_block; i++) { 861 | const next_byte = bits.read_bits(8); 862 | if (next_byte === "eof") { 863 | break; 864 | } 865 | bytes.push(next_byte); 866 | } 867 | 868 | return bytes; 869 | })(); 870 | 871 | const fields = block_bytes.flatMap((byte, i, bytes) => { 872 | return i % max_bytes_in_field 873 | ? [] 874 | : [bytes.slice(i, i + max_bytes_in_field)]; 875 | }); 876 | 877 | const segments = fields.flatMap(encode_bytes); 878 | return segments; 879 | } 880 | 881 | // Putting everyting together, this function turns a bitstream holding 882 | // arbitrary data into a stream of bytes holding a complete compressed 883 | // IO7 file! 
884 | function compress(input_bits: BitStream): Array { 885 | let blocks: Array> = []; 886 | 887 | let next_block: Array | undefined; 888 | while ((next_block = encode_next_block(input_bits))) { 889 | blocks.push(next_block); 890 | } 891 | 892 | const encoded_blocks = blocks.map(tokens => { 893 | const bits = new BitStream([]); 894 | tokens.forEach(token => write_token(token, bits)); 895 | return bits; 896 | }).map(bits => bits.get_bytes()); 897 | 898 | const output_bytes = []; 899 | encoded_blocks.forEach((encoded_block, i, blocks) => { 900 | // If this is the last block, use the special last block tag 901 | const ds_header_tag = i !== blocks.length - 1 902 | ? [0x00, 0x20] 903 | : [0x36, 0x18]; 904 | 905 | const ds_header = [...ds_header_tag, 0x44, 0x53, 0x00, 0x00]; 906 | 907 | const block_size = encoded_block.length + 0x8004; 908 | const size_header = [block_size & 0xff, ((block_size & 0xff00) >> 8)]; 909 | 910 | output_bytes.push(...size_header, ...ds_header, ...encoded_block); 911 | }); 912 | 913 | return output_bytes; 914 | } 915 | 916 | // Finally, some helpers to facilitate compressing and decompressing to and 917 | // from files. 918 | import * as Fs from "fs"; 919 | 920 | function decompress_file(input_filename: string, output_filename: string) { 921 | const input_bits = new BitStream([...new Uint8Array(Fs.readFileSync(input_filename))]); 922 | 923 | const decompressed_bytes = decompress(input_bits); 924 | 925 | Fs.writeFileSync(output_filename, new Uint8Array(decompressed_bytes)); 926 | } 927 | 928 | function compress_file(input_filename: string, output_filename: string) { 929 | const input_bits = new BitStream([...new Uint8Array(Fs.readFileSync(input_filename))]); 930 | 931 | const compressed_bytes = compress(input_bits); 932 | 933 | Fs.writeFileSync(output_filename, new Uint8Array(compressed_bytes)); 934 | } 935 | 936 | // Try it! 
937 | 938 | //decompress_file("logo.io7", "logo.bmp"); 939 | //decompress_file("logo98.io7", "logo98.bmp"); 940 | //compress_file("ftlogo.bmp", "ftlogo.io7"); 941 | 942 | // And that's all, folks. 943 | // PS <3 944 | -------------------------------------------------------------------------------- /logo.io7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierce-smith1/io7/64ea32e772db062f2300473703e55848c9835ebb/logo.io7 -------------------------------------------------------------------------------- /logo98.io7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierce-smith1/io7/64ea32e772db062f2300473703e55848c9835ebb/logo98.io7 --------------------------------------------------------------------------------