├── LICENSE ├── README.md ├── ftlogo.bmp ├── io7.ts ├── logo.io7 └── logo98.io7 /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Pierce Smith 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # io7 2 | An in-depth look into the compression format used to store Windows 95/98's boot graphics (which I am calling "IO7"). 
3 | 4 | ![image](https://github.com/user-attachments/assets/4bde3727-bbaa-4349-a35f-de56d2bc27d5) ![image](https://github.com/user-attachments/assets/73f11aea-abe2-4689-a253-8e26ca4dc19c) 5 | 6 | 7 | ## Background 8 | 9 | In Windows 95 and 98, you can override the default boot splash by placing 10 | your own bitmap in the root of the C: drive with the name `LOGO.SYS`. When 11 | booting, Windows will display your bitmap instead of the default one. 12 | 13 | But where is the original bitmap actually stored? As Wikipedia will tell us, 14 | it's kept in `IO.SYS`, a crucial DOS system file. But it's obscured with some 15 | kind of bespoke proprietary compression, so we cannot extract or replace it. 16 | 17 | Until now. Not being able to find a detailed description of this compression format 18 | anywhere on the internet, I spent days hex-editing, educated guessing, and bug 19 | hunting to fully understand how it worked, so that the world can finally unlock 20 | what Microsoft has been trying to hide from us for almost 30 years. 21 | 22 | ...or, you know, you could just keep using the `LOGO.SYS` trick... 
23 | 24 | ## Contents 25 | 26 | * `io7.ts`: A full description of the IO7 format, and a TypeScript implementation of a decompressor and compressor 27 | * `logo.io7`: Sample file for use with the script: the raw compressed bitmap data extracted directly from Windows 95's `IO.SYS` (offset `0x1B210` to offset `0x2B17A`) 28 | * `logo98.io7`: Sample file for use with the script: the raw compressed bitmap data extracted directly from Windows 98's `IO.SYS` (offset `0x1E030` to offset `0x2E285`) 29 | * `ftlogo.bmp`: Sample file for use with the script: An alternative boot splash in a `LOGO.SYS` compatible format that can be compressed and patched into `IO.SYS` on 95/98 30 | 31 | ## Running 32 | 33 | For the benefit of those not familiar with TypeScript: 34 | 35 | The easiest way to run a TypeScript file directly is to use `npm` to install `ts-node` globally, then run with `npx`: 36 | 37 | ```bash 38 | apt install npm # (or equivalent for your platform) 39 | npm install -g typescript ts-node 40 | npx ts-node io7.ts 41 | ``` 42 | 43 | If that doesn't work, Google "how to run a typescript file" and be very confused for the next 30-60 minutes. 44 | 45 | ## Acknowledgements 46 | 47 | I would have had no chance at understanding the IO7 format if it weren't for [this open source implementation](https://github.com/chenall/grub4dos/blob/3c1d05f39e49ec1d7543caa825df00068b96620b/stage2/builtins.c#L441-L621) 48 | of a decompressor for a very similar format used in Windows ME. Thank you, bean, whoever and wherever you are. 
49 | -------------------------------------------------------------------------------- /ftlogo.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierce-smith1/io7/64ea32e772db062f2300473703e55848c9835ebb/ftlogo.bmp -------------------------------------------------------------------------------- /io7.ts: -------------------------------------------------------------------------------- 1 | // 2 | // io7.ts 3 | // Windows 95/98 Boot Splash Decompressor and Compressor 4 | // 5 | // # BACKGROUND 6 | // 7 | // In Windows 95 and 98, you can override the default boot splash by placing 8 | // your own bitmap in the root of the C: drive with the name LOGO.SYS. When 9 | // booting, Windows will display your bitmap instead of the default one. 10 | // 11 | // But where is the original bitmap actually stored? As Wikipedia will tell us, 12 | // it's kept in IO.SYS, a crucial DOS system file. But it's obscured with some 13 | // kind of bespoke properietary compression, so we cannot extract or replace it. 14 | // 15 | // Until now. 16 | // 17 | // This script implements a (very inefficient) compressor and decompressor for 18 | // the custom compression format used for these graphics. While it can succesfully 19 | // extract and re-package logos for Windows 95 & 98, the main point of the 20 | // script is to serve as education and documentation for how this format works. 21 | // 22 | // The rest of this interlude will provide an exhaustive overview of what the 23 | // data format looks like, including every single gory detail. 24 | // Following this is the code itself, which provides a working 25 | // reference implementation for both compression and decompression. 26 | // 27 | // By default, this script does nothing when run - scroll to the very bottom and 28 | // uncomment one of the function calls to run the compressor or decompressor 29 | // on the example files that have been included with this script. 
30 | // (from https://github.com/pierce-smith1/io7) 31 | // 32 | // # THE FORMAT 33 | // 34 | // In Windows 95's IO.SYS, the compressed logo data begins at offset 0x1b120 35 | // and ends at offset 0x2b17a. There are no other compressed regions in the 36 | // file. It's the only example I know of of this exact compression format 37 | // being used. I'm going to call the format "IO7", after the IO from IO.SYS 38 | // and the fact that 9x's DOS identifies itself as DOS 7. 39 | // 40 | // Fundamentally, IO7 is a variant on LZSS. Data is split up into 41 | // tokens, coming one after another, that represent either literal bytes 42 | // or runs of bytes that have been previously seen. These tokens are grouped 43 | // into blocks that encode 8192 or fewer bytes of uncompressed data. Each 44 | // block has a header that tells how big the compressed size is and whether 45 | // or not it's the last block. Each block in an IO7 "file" comes immediately 46 | // after the last. 47 | // 48 | // We'll now go into each of these things in more detail. 49 | // 50 | // ## BLOCKS 51 | // 52 | // An IO7 file is made entirely of "blocks" - it starts with a block, and blocks 53 | // continue appearing one after another until the final block, after which the 54 | // file simply ends. 55 | // 56 | // Blocks contain an 8-byte header, which crucially contains the size of the 57 | // block, then an arbitrary number of bytes afterwards that contain the 58 | // token stream encoding the compressed data. Unlike the token stream itself, 59 | // blocks must be aligned to bytes, and in fact the end of the token stream 60 | // will be padded with 0 bits to ensure the next block starts on a byte. 61 | // 62 | // Block headers are structured as follows: 63 | // 64 | // byte 65 | // 0----1----2----3----4----5----6----7--- 66 | // size size ltag ltag 0x44 0x53 0x00 0x00 67 | // 68 | // The two size bytes are a 16-bit little-endian integer encoding the size 69 | // of the compressed data in this block. 
For whatever reason, the MSB is 70 | // always set to 1, meaning you must subtract 0x8000 to get the actual size. 71 | // For example the two size bytes may be 0xBE 0x88, encoding an actual size of 72 | // 0x88BE - 0x8000 = 0x08BE. 73 | // 74 | // The two ltag bytes are either [0x00 0x20] or [0x36 0x18]. They are [0x36 0x18] 75 | // if and only if this is the last block in the file. 76 | // 77 | // After the ltag comes the fixed bytes [0x44 0x53 0x00 0x00] (DS..). Looking for 78 | // this DS.. pattern can quickly identify the starts of compressed blocks in a 79 | // hex editor. For the purposes of the block size, this byte pattern is also 80 | // where the compressed data starts, i.e. the block size counts these 81 | // four bytes. 82 | // 83 | // ## TOKEN STREAMS 84 | // 85 | // Immediately after a block header, the compressed data begins. The compressed 86 | // data is a stream of tokens, each token coming one after another. Tokens are 87 | // not aligned to bytes and often have sizes in bits that are not multiples 88 | // of 8, so they must be read and written bit-by-bit. 89 | // 90 | // In the file, streams of bits are encoded by chopping the stream into bytes, 91 | // where each byte contains 8 bits ordered from LSB to MSB. So for example, say 92 | // we want to encode the following stream of bits (where letters a-p are bits): 93 | // 94 | // ---------------> 95 | // abcdefghijklmnop 96 | // 97 | // They will appear in the bytes of the file as such: 98 | // 99 | // <------- <------- 100 | // hgfedcba ponmlkji 101 | // 0 1 102 | // 103 | // The bytes are read left to right, but the bits in those bytes are read 104 | // right to left. Put another way, the "first" bit in a byte is in the 105 | // least significant position. 106 | // 107 | // ### TOKENS 108 | // 109 | // There are two main kinds of token: a literal token and an offset-length 110 | // token. Each one represents one or more bytes of uncompressed data. 
111 | // 112 | // #### LITERAL TOKENS 113 | // 114 | // A literal token encodes a single byte of uncompressed data. For notational 115 | // purposes, we can write these tokens as simply #hh, where hh are the two 116 | // hexadecimal digits of the encoded byte. For example, if we had a stream of 117 | // the following tokens: 118 | // 119 | // #00 #11 #22 120 | // 121 | // This encodes the uncompressed data [0x00 0x11 0x22]. 122 | // 123 | // In the bitstream, literal tokens are 9 bits long. There is a single flag bit 124 | // followed immediately by the 8 bits of the byte. A literal token looks as 125 | // follows in a stream: 126 | // 127 | // ---------> 128 | // ...f Dddddddd... 129 | // ^ ^ 130 | // | | 131 | // ' flag bit 132 | // | 133 | // ' data bits - D is the MSB 134 | // 135 | // The flag bit is the inverse of the MSB of the data. If D is 0, then f is 1 - 136 | // if D is 1, then f is 0. For example, we would encode the byte 0xB5 in a bit 137 | // stream as follows: 138 | // 139 | // ---------> 140 | // ...0 10110101... 141 | // ^ ^ 142 | // | | 143 | // ' flag bit, is 0 because the MSB of the data is 1 144 | // | 145 | // ' the byte (0xB5) as binary 146 | // 147 | // #### OFFSET-LENGTH TOKENS 148 | // 149 | // Offset-length tokens encode a sequence of at least 2 bytes through two 150 | // numbers: an _offset_ into the uncompressed data, and a _length_ of how many 151 | // bytes to read from this offset. 152 | // 153 | // For notational purposes, we can write these tokens as , where O is the 154 | // offset and L is the length. O must be >= 1, and L must be >= 2. 155 | // 156 | // The offset is backwards relative to our current position in the uncompressed 157 | // stream of bytes, and the length is the count of bytes to read. For example, 158 | // say we have the following token stream: 159 | // 160 | // #0a #1b #2c #3d <3,2> 161 | // 162 | // The first four tokens represent the uncompressed data [0x0a 0x1b 0x2c 0x3d]. 
163 | // Now, when the token <3,2> is encountered, this tells us to look back 3 bytes 164 | // from the end of our uncompressed data, then read 2 bytes forward. 165 | // 166 | // [0x0a 0x1b 0x2c 0x3d] 167 | // ^ 168 | // | 169 | // ' start past the end 170 | // 171 | // [0x0a (0x1b 0x2c 0x3d] 172 | // ^ 173 | // | 174 | // ' move 3 bytes backwards... 175 | // 176 | // [0x0a (0x1b 0x2c) 0x3d] 177 | // ^ 178 | // | 179 | // ' ...then 2 bytes forwards 180 | // 181 | // (0x1b 0x2c) is what <3,2> encodes 182 | // 183 | // So, in this stream, <3,2> encodes [0x1b 0x2c], and our final uncompressed data 184 | // is the sequence [0x0a 0x1b 0x2c 0x3d 0x1b 0x2c]. 185 | // 186 | // It is valid for the length to be greater than the offset. In this case, 187 | // bytes read off the end of the uncompressed data repeat the data read from 188 | // from the offset to the end of the data. For example, take this stream of tokens: 189 | // 190 | // #00 #11 #22 #33 <3,7> 191 | // 192 | // To read the <3,7> token: 193 | // 194 | // [0x00 0x11 0x22 0x33] 195 | // ^ 196 | // | 197 | // ' start past the end 198 | // 199 | // [0x00 (0x11 0x22 0x33)] 200 | // ^ 201 | // | 202 | // ' 3 bytes backwards... 203 | // 204 | // [0x00 (0x11 0x22 0x33 ?? ... 205 | // ^ 206 | // | 207 | // ' ...then 7 bytes forwards - 208 | // but uh oh, we're out of bytes to read! 209 | // 210 | // [0x00 (0x11 0x22 0x33 0x11 ... 211 | // ^ ^ 212 | // '---------------| 213 | // ' ...so we pretend the bytes repeat from 214 | // where we started reading! 215 | // 216 | // [0x00 (0x11 0x22 0x33 0x11 0x22 ... 217 | // ^ ^ 218 | // '--------------| 219 | // ' continue... 220 | // 221 | // [0x00 (0x11 0x22 0x33 0x11 0x22 0x33 0x11)] 222 | // ^ ^ 223 | // '--------------| 224 | // ' ...until we've made enough bytes 225 | // 226 | // (0x11 0x22 0x33 0x11 0x22 0x33 0x11) is what <3,7> encodes 227 | // 228 | // It is not valid for the offset to be greater than the length of the 229 | // uncompressed data. 
230 | // 231 | // In the bitstream, offset-length tokens are a variable number of bits. The 232 | // first group is bits encodes the offset and the second group encodes the 233 | // length, both of which are variable size. 234 | // 235 | // Consider the offset first. Depending on the offset's size, we use three 236 | // different encoding strategies. 237 | // 238 | // * If the offset is representable in 6 bits, i.e. if it is less than 64, 239 | // it is a "short" offset, and will be encoded in 8 bits. Start with two 0 bits, 240 | // then write the 6 bits of the offset. Example: An offset of 21 (0x15) will 241 | // appear as such: 242 | // 243 | // --------> 244 | // ...00 010101... 245 | // ^ ^ 246 | // | | 247 | // ' short offset flag 248 | // | 249 | // ' 21 (0x15) in binary 250 | // 251 | // * If the offset - 64 is repesentable in 8 bits, i.e. if it is >= 64 and < 320, 252 | // it is a "medium" offset, and will be encoded in 13 bits. Start with two 1 bits, 253 | // then a 0 bit, then write the 8 bits of the offset - 64. Example: An offset of 254 | // 152 (0x98) will appear as such: 255 | // 256 | // ...110 01011000... 257 | // ^ ^ 258 | // | | 259 | // ' medium offset flag 260 | // | 261 | // ' 152 - 64 = 88 (0x58) in binary 262 | // 263 | // * If the offset - 319 is representable in 12 bits i.e. if it is >= 320 and < 4415, 264 | // it is a "far" offset, and will be encoded in 15 bits. Start with three 1 bits, 265 | // then write the 12 bits of the offset - 320. Example: an offset of 2270 will 266 | // appear as such: 267 | // 268 | // ...111 011110011110... 269 | // ^ ^ 270 | // | | 271 | // ' far offset flag 272 | // | 273 | // ' 2270 - 320 = 1950 (0x79e) in binary 274 | // 275 | // Note that the offset cannot be 4415, even though 4415 - 320 = 4095 can be 276 | // represented in 12 bits as a sequence of all 1s. This would result in a 277 | // sequence of 15 consecutive 1s, which is interpreted as a special token 278 | // we will talk about later. 
279 | // 280 | // Note also that this defines 4414 as the largest representable offset. 281 | // 282 | // Immediately following the offset is the length. Again, the length is encoded 283 | // differently based on how large it is - however, this encoding scheme is more 284 | // mathematical than case-based. 285 | // 286 | // We split the length into two numbers, a "base" and a "tail". The base is the 287 | // largest number that is a power of 2 plus 1 which is still smaller than the 288 | // length, and the tail is simply the length minus the base. For example, if 289 | // the length is 13, the largest number n^2 + 1 that fits within it is 9 (2^3 + 1), 290 | // so the base is 9 and the tail is 13 - 9 = 4. Another example, if the length 291 | // is 40, the base is 33 (2^5 + 1) and the tail is 40 - 33 = 7. 292 | // 293 | // To represent the length in the bitstream, first encode the base by writing 294 | // n 0s, where n is the power of the base. Then write a 1, then finish by 295 | // writing the tail in n bits. 296 | // 297 | // Let's use 13 as an example again. The base is 2^3 + 1, so we write 3 298 | // 0 bits, then a 1 bit. Finally we write the tail, 4, in 3 bits. 299 | // 300 | // ...000 1 100... 301 | // ^ ^ ^ 302 | // | | | 303 | // n 0s, where n = the power of the base (in this case, 3) 304 | // | | 305 | // ' fixed 1 following the 0s 306 | // | 307 | // ' the tail (4) in 3 bits 308 | // 309 | // Another example: to encode a length of 40, the base is 2^5 + 1, so we 310 | // write 5 0 bits, then a 1 bit, and finally the tail of 7 in 5 bits. 311 | // 312 | // ...00000 1 00111... 313 | // ^ ^ ^ 314 | // | | | 315 | // n 0s, where n = the power of the base (in this case, 5) 316 | // | | 317 | // ' fixed 1 following the 0s 318 | // | 319 | // ' the tail (7) in 5 bits 320 | // 321 | // Let's put everything together into a final example. 322 | // 323 | // Say we want to encode the token <343,10>. The offset is 343 and the length 324 | // is 10. 
We start by encoding the offset. 325 | // 326 | // The offset is larger than 320 and smaller than 4415, so it is a "far" 327 | // offset, and it is encoded as follows: 328 | // 329 | // ..111 000000010111... 330 | // ^ ^ 331 | // | | 332 | // ' far offset flag 333 | // | 334 | // ' 343 - 320 = 27 (0x17) in binary in 12 bits 335 | // 336 | // Now immediately afterwards we encode the length. The length is 10, so we 337 | // have a base of 9 (2^3 + 1). The tail is 10 - 9 = 1. This adds to our encoding 338 | // as follows: 339 | // 340 | // ...111000000010111 000 1 001... 341 | // ^ ^ ^ ^ 342 | // | | | | 343 | // ' the offset ' three 0s (since the base is 2^3 + 1) 344 | // | | 345 | // ' fixed 1 following the 0s 346 | // | 347 | // ' the tail (1) in three bits 348 | // 349 | // And the final token is: 350 | // 351 | // ...1110000000101110001001... 352 | // 353 | // #### SENTINEL TOKENS 354 | // 355 | // There is a final type of token that encodes no data. It MUST appear as the 356 | // final token of a block, and it also MUST appear after 512 bytes of 357 | // uncompressed data have been encoded. In other words, every sequence of 358 | // tokens between two sentinel tokens MUST encode 512 or fewer bytes of data. 359 | // This means is also invalid for a single token to encode more than 512 bytes. 360 | // 361 | // Its bitstream representation is simply 15 consecutive 1s. 362 | // 363 | // That's all! That's the compression format in its entirety. 364 | // We now know enough to build a working compressor and decompressor. 365 | // 366 | // The rest of this file will contain a reference implementation for a 367 | // decompressor and compressor, using what we've detailed above. 
But first, I 368 | // want to quickly make an acknowledgement: 369 | // 370 | // # ACKNOWLEDGEMENTS 371 | // 372 | // This work is indebted to a programmer simply credited "bean", who wrote a 373 | // decompressor for the Windows ME IO.SYS format for an open-source project 374 | // called grub4dos. 375 | // 376 | // (https://github.com/chenall/grub4dos/blob/3c1d05f39e49ec1d7543caa825df00068b96620b/stage2/builtins.c#L441-L621) 377 | // 378 | // Their code was instrumental in guiding me through the reverse-engineering 379 | // process. It it wasn't for them, I would never have made it even close to 380 | // understanding the IO7 format and being able to write these algorithms. 381 | // 382 | // Thank you, bean. 383 | // 384 | 385 | // 386 | // # ALGORITHMS 387 | // 388 | // What follows now is some actual code for implementing compression and 389 | // decompression in the IO7 format. 390 | // 391 | 392 | // We define a convenience class that wraps an array of bytes and lets us 393 | // read and write from it as if it were a stream of bits. 394 | class BitStream { 395 | constructor ( 396 | // Throughout this program, we will use the `number` type to implicitly 397 | // mean a byte, and just make the assumption it always stays in the range 398 | // 0 - 255. 399 | private bytes: Array, 400 | 401 | // bit_pos represents our position in the current byte - as such it 402 | // stays between 0 and 7. 403 | // 0 means we're at the LSB of the byte, and 7 the MSB. 404 | private bit_pos = 0, 405 | 406 | // byte_pos is effectively an index into the byte array. 407 | // As we travel bit-by-bit through the stream, byte_pos will increase 408 | // slowly without bound, and bit_pos will cycle between 0 and 7. 
409 | private byte_pos = 0, 410 | ) {} 411 | 412 | get_bytes() { 413 | return [...this.bytes]; 414 | } 415 | 416 | get_bit_offset() { 417 | return this.bit_pos + (this.byte_pos * 8); 418 | } 419 | 420 | advance_bit_pos() { 421 | this.bit_pos++; 422 | 423 | if (this.bit_pos >= 8) { 424 | this.bit_pos = 0; 425 | this.byte_pos++; 426 | } 427 | } 428 | 429 | advance_bit_pos_by(num_bits: number) { 430 | for (let i = 0; i < num_bits; i++) { 431 | this.advance_bit_pos(); 432 | } 433 | } 434 | 435 | reverse_bit_pos() { 436 | this.bit_pos--; 437 | 438 | if (this.bit_pos < 0) { 439 | this.bit_pos = 7; 440 | this.byte_pos--; 441 | } 442 | } 443 | 444 | reverse_bit_pos_by(num_bits: number) { 445 | for (let i = 0; i < num_bits; i++) { 446 | this.reverse_bit_pos(); 447 | } 448 | } 449 | 450 | // This returns the special sentinel "eof" if we run out of bits. 451 | read_bits(num_bits: number): number | "eof" { 452 | const end_bit = (this.bit_pos + num_bits) % 8; 453 | const end_byte = this.byte_pos + Math.floor((this.bit_pos + num_bits) / 8); 454 | 455 | let result = 0; 456 | let bits_read = 0; 457 | while (this.bit_pos !== end_bit || this.byte_pos !== end_byte) { 458 | const byte = this.bytes[this.byte_pos]; 459 | 460 | if (byte === undefined) { 461 | return "eof"; 462 | } 463 | 464 | const next_bit = (byte & (1 << this.bit_pos)) ? 
1 : 0; 465 | result += next_bit << bits_read++; 466 | this.advance_bit_pos(); 467 | } 468 | 469 | return result; 470 | } 471 | 472 | write_bits(n: number, num_bits: number) { 473 | let bits_written = 0; 474 | while (bits_written < num_bits) { 475 | this.write_bit((n >>> bits_written & 1) as 0 | 1); 476 | bits_written++; 477 | } 478 | } 479 | 480 | write_bit(bit: 0 | 1) { 481 | if (this.byte_pos >= this.bytes.length) { 482 | this.bytes.push(0); 483 | } 484 | 485 | this.bytes[this.byte_pos] += bit << this.bit_pos; 486 | 487 | this.advance_bit_pos(); 488 | } 489 | 490 | is_at_end(): boolean { 491 | return this.bytes[this.byte_pos] === undefined; 492 | } 493 | } 494 | 495 | // A model of our tokens will prove very useful. 496 | // They are perfectly described by a tagged union. 497 | type Token = 498 | | {type: "literal", value: number} 499 | | {type: "offset", offset: number, length: number} 500 | | {type: "sentinel"} 501 | 502 | // 503 | // ## DECOMPRESSION ROUTINES 504 | // 505 | 506 | // This function will read a token from a stream of bits, advancing the stream 507 | // to the start of the next token. 508 | // It will return undefined when there are no more valid tokens to read. 509 | // The idea is that you can call this repeatedly on the same bitstream until it 510 | // returns undefined to enumerate all the tokens in the stream. 511 | function read_token(bits: BitStream): Token | undefined { 512 | // To avoid having to constantly check for "eof" after reading from the 513 | // stream, we define this helper that throws an exception on eof. Then in 514 | // our logic, we trap the exception to immediately return undefined. 515 | function read_bits_or_die(num_bits: number): number { 516 | const result = bits.read_bits(num_bits); 517 | if (result === "eof") { 518 | throw new Error("Out of bits"); 519 | } 520 | 521 | return result; 522 | } 523 | 524 | try { 525 | // We need to read at least two bits to start figuring out what kind of 526 | // token we're looking at. 
527 | const tag = read_bits_or_die(2); 528 | 529 | // We can tell if this is a literal byte by if the two bits of the tag 530 | // are different. 531 | if (tag === 1 || tag === 2) { 532 | const literal_byte_end = read_bits_or_die(7); 533 | 534 | // This reconstructs the original byte from part of the tag, since 535 | // they overlap. 536 | const literal_byte = literal_byte_end + ((tag & 1) << 7); 537 | 538 | return {type: "literal", value: literal_byte}; 539 | } 540 | 541 | // If we get here, we must be reading either a sentinel or an 542 | // offset-length token. 543 | // A sentinel resembles a far offset-length token enough that we can 544 | // check for it in the process of parsing an offset-length token, so at 545 | // this point we just assume we're looking at an offset-length token. 546 | const offset = (() => { 547 | if (tag === 0) { 548 | const short_offset = read_bits_or_die(6); 549 | return short_offset; 550 | } 551 | 552 | const is_far_offset = bits.read_bits(1); 553 | if (is_far_offset === 1) { 554 | const far_offset = read_bits_or_die(12); 555 | return far_offset + 0x140; 556 | } 557 | 558 | const med_offset = read_bits_or_die(8); 559 | return med_offset + 0x40; 560 | })(); 561 | 562 | // 0x113F - 0x140 === 0xFFF. So with the three 1s of the tag included, 563 | // if we see this exact offset, we've seen fifteen 1s in a row. 564 | if (offset === 0x113F) { 565 | return {type: "sentinel"}; 566 | } 567 | 568 | const length = (() => { 569 | let bits_in_length = 0; 570 | while (read_bits_or_die(1) === 0) { 571 | bits_in_length++; 572 | } 573 | 574 | const base = (1 << bits_in_length) + 1; 575 | const tail = read_bits_or_die(bits_in_length); 576 | 577 | const length = base + tail; 578 | return length; 579 | })(); 580 | 581 | return {type: "offset", offset, length}; 582 | } catch { 583 | // Using exceptions for control flow - very naughty indeed! 
584 | return undefined; 585 | } 586 | } 587 | 588 | // This function unpacks a stream of tokens into the uncompressed bytes 589 | // they represent. 590 | function decode_tokens(tokens: Array): Array { 591 | // We need to keep track of the bytes we've uncompressed so far so that we 592 | // can decompress offset-length tokens. 593 | let output: Array = []; 594 | 595 | function segment_to_bytes(segment: Token): Array { 596 | switch (segment.type) { 597 | case "literal": return [segment.value]; 598 | case "offset": { 599 | const end_index = -segment.offset + segment.length; 600 | let new_bytes = [...output.slice(-segment.offset, end_index >= 0 ? undefined : end_index)]; 601 | 602 | // If the length was greater than the offset, we tried to read 603 | // past the end of the array. 604 | // We need to fill the missing pieces with the last byte of 605 | // the output. 606 | while (new_bytes.length !== segment.length) { 607 | new_bytes.push(new_bytes.at(-segment.offset)); 608 | } 609 | 610 | return new_bytes; 611 | } 612 | case "sentinel": return []; 613 | } 614 | } 615 | 616 | for (const segment of tokens) { 617 | const bytes = segment_to_bytes(segment); 618 | output = [...output, ...bytes]; 619 | } 620 | 621 | return output; 622 | } 623 | 624 | // This separates a bitstream into blocks of raw compressed data by searching 625 | // for, splitting on, and stripping out DS headers. 626 | // The result is an array of bitstreams that contain raw data from which we 627 | // can read tokens. 628 | function break_into_blocks(bits: BitStream): Array { 629 | let blocks = []; 630 | 631 | let block_size = bits.read_bits(16); 632 | while (!bits.is_at_end() && block_size !== "eof" && block_size >= 0x8000) { 633 | // Skip the rest of the DS header by moving forwards 6 bytes - 634 | // A DS header is 8 bytes but we already read 2 of them. 
635 | bits.advance_bit_pos_by(6 * 8); 636 | 637 | const this_block = []; 638 | for (let i = 0; i < block_size - 0x8000 - 4; i++) { 639 | // We can just assume these reads will never be eofs, since if they 640 | // are, the header lied to us about big the block is..! 641 | const byte = bits.read_bits(8) as number; 642 | this_block.push(byte); 643 | } 644 | 645 | blocks.push(this_block); 646 | 647 | block_size = bits.read_bits(16); 648 | } 649 | 650 | return blocks.map(block => new BitStream(block)); 651 | } 652 | 653 | // Putting everyting together, this function turns a bitstream holding a 654 | // complete IO7 file into a stream of uncompressed bytes! 655 | function decompress(compressed_bits: BitStream): Array { 656 | const blocks = break_into_blocks(compressed_bits); 657 | 658 | const output_bytes = blocks.flatMap(block => { 659 | let tokens = []; 660 | 661 | let next_token: Token | undefined; 662 | while ((next_token = read_token(block))) { 663 | tokens.push(next_token); 664 | } 665 | 666 | const output = decode_tokens(tokens); 667 | return output; 668 | }); 669 | 670 | return output_bytes; 671 | } 672 | 673 | // 674 | // ## COMPRESSION ROUTINES 675 | // 676 | 677 | // This function turns a token into its bit representation and writes it to 678 | // a bitstream. 679 | function write_token(segment: Token, bits: BitStream) { 680 | if (segment.type === "literal") { 681 | const byte = segment.value; 682 | 683 | const high_bit = byte & (1 << 7) ? 1 : 0; 684 | 685 | bits.write_bit(high_bit); 686 | bits.write_bit(high_bit === 0 ? 
1 : 0); 687 | bits.write_bits(byte, 7); 688 | } else if (segment.type === "offset") { 689 | const {offset, length} = segment; 690 | 691 | const offset_size = (() => { 692 | if (offset < 0x40) { 693 | return "short"; 694 | } else if (offset < 0x140) { 695 | return "medium"; 696 | } else { 697 | return "far"; 698 | } 699 | })(); 700 | 701 | if (offset_size === "short") { 702 | bits.write_bits(0, 2); 703 | bits.write_bits(offset, 6); 704 | } else { 705 | bits.write_bits(3, 2); 706 | } 707 | 708 | if (offset_size === "medium") { 709 | bits.write_bit(0); 710 | bits.write_bits(offset - 0x40, 8); 711 | } 712 | 713 | if (offset_size === "far") { 714 | bits.write_bit(1); 715 | bits.write_bits(offset - 0x140, 12); 716 | } 717 | 718 | let bits_in_length = (length => { 719 | let num_bits = 0; 720 | while (length >>> num_bits) { 721 | num_bits++; 722 | } 723 | return num_bits; 724 | })(length - 1); 725 | 726 | for (let i = 0; i < bits_in_length - 1; i++) { 727 | bits.write_bit(0); 728 | } 729 | 730 | bits.write_bit(1); 731 | 732 | bits.write_bits(length - 1 - (1 << bits_in_length), bits_in_length - 1); 733 | } else { 734 | bits.write_bits(0xFFFF, 15); 735 | } 736 | } 737 | 738 | // This function turns a stream of uncompressed bytes into a stream of tokens 739 | // that encode the compressed data. 740 | // This is where the compression magic really happens. 741 | function encode_bytes(bytes: Array): Array { 742 | const bits = new BitStream(bytes); 743 | 744 | // We need to keep track of where we are in the uncompressed data so we can 745 | // correctly count offsets for offset-length tokens. 
746 | let input_cursor = 0; 747 | const output_segments: Array = []; 748 | 749 | function commit_segment(segment: Token, from_bytes: Array) { 750 | input_cursor += from_bytes.length; 751 | output_segments.push(segment); 752 | } 753 | 754 | function array_equal(a1: Array, a2: Array): boolean { 755 | return a1.length === a2.length && a1.every((_, i) => a1[i] === a2[i]); 756 | } 757 | 758 | // A "phrase" is some arbitray run of bytes. This helper searches through 759 | // the input we have currently processed to see if that phrase has 760 | // already appeared. 761 | // It's used to tell if we can use an offset-length token to encode the 762 | // given phrase. 763 | // It returns the index the phrase begins in our input bytes, or undefined 764 | // if it can't be found. 765 | function find_phrase_in_input(phrase: Array): number | undefined { 766 | const seen_input = bytes.slice(0, input_cursor); 767 | 768 | // Using findLastIndex here helps keep the offset small, since offsets 769 | // are negative relative to the end. 770 | const phrase_index = seen_input.findLastIndex((byte, i, bytes) => { 771 | let searched_bytes = bytes.slice(i, i + phrase.length); 772 | 773 | if (searched_bytes.length < phrase.length) { 774 | searched_bytes = [...searched_bytes, ...new Array(phrase.length - searched_bytes.length).fill(seen_input.at(-1))]; 775 | } 776 | 777 | return array_equal(phrase, searched_bytes); 778 | }); 779 | 780 | return phrase_index !== -1 ? phrase_index : undefined; 781 | } 782 | 783 | // The main compression loop, running until we're out of input. 784 | while (!bits.is_at_end()) { 785 | // We want to opportunistically read two bits at a time so we can 786 | // start trying to build a phrase that could be encoded in an 787 | // offset-length token. 788 | // If we only ever read one byte at a time, we can never make a phrase, 789 | // since they have to be at least two bytes. 
790 | let maybe_phrase = [bits.read_bits(8), bits.read_bits(8)]; 791 | 792 | if (maybe_phrase[0] === "eof") { 793 | break; 794 | } 795 | 796 | if (maybe_phrase[1] === "eof") { 797 | commit_segment({type: "literal", value: maybe_phrase[0]}, [maybe_phrase[0]]); 798 | break; 799 | } 800 | 801 | let phrase = maybe_phrase as Array; 802 | 803 | let last_input_phrase_index: number | undefined; 804 | let input_phrase_index: number | undefined; 805 | 806 | // See if our phrase is in the input, and if it is, greedily keep 807 | // trying to grow it as much as possible. 808 | while ((input_phrase_index = find_phrase_in_input(phrase)) !== undefined) { 809 | const next_byte = bits.read_bits(8); 810 | 811 | // The -1 here is an ugly hack to handle the special case of our 812 | // phrase getting cut off by the end of the input. 813 | phrase = [...phrase, next_byte === "eof" ? -1 : next_byte]; 814 | last_input_phrase_index = input_phrase_index; 815 | } 816 | 817 | if (phrase.at(-1) !== -1) { 818 | // Once the above while loop exits, we will have read one byte that 819 | // didn't fit into our phrase - so we need to back up so that we can 820 | // consider that byte again in the next loop around. 821 | // The if condition guards against the case that the phrase was cut 822 | // off by the end of the stream, in which case we DON'T want to 823 | // reverse the stream since nothing was read. 824 | bits.reverse_bit_pos_by(8); 825 | } 826 | 827 | // The last byte we read was NOT part of this phrase. 
828 | phrase = phrase.slice(0, -1); 829 | 830 | if (phrase.length === 1) { 831 | commit_segment({type: "literal", value: phrase[0]}, [phrase[0]]); 832 | continue; 833 | } 834 | 835 | commit_segment({type: "offset", offset: input_cursor - last_input_phrase_index, length: phrase.length}, phrase); 836 | } 837 | 838 | commit_segment({type: "sentinel"}, []); 839 | return output_segments; 840 | } 841 | 842 | // This function reads up to 8192 bits of uncompressed data (enough to make a 843 | // block) from the input stream, and outputs a stream of tokens to store in 844 | // the block. 845 | // It returns undefined when the stream is over and no more blocks can be made. 846 | function encode_next_block(bits: BitStream): Array | undefined { 847 | if (bits.is_at_end()) { 848 | return undefined; 849 | } 850 | 851 | // There must be a sentinel every 512 bytes - 852 | // We call a stretch of 512 bytes a "field". 853 | const max_bytes_in_field = 512; 854 | const max_fields_in_block = 16; 855 | const max_bytes_in_block = max_bytes_in_field * max_fields_in_block; // 8192! 856 | 857 | const block_bytes = (() => { 858 | let bytes: Array = []; 859 | 860 | for (let i = 0; i < max_bytes_in_block; i++) { 861 | const next_byte = bits.read_bits(8); 862 | if (next_byte === "eof") { 863 | break; 864 | } 865 | bytes.push(next_byte); 866 | } 867 | 868 | return bytes; 869 | })(); 870 | 871 | const fields = block_bytes.flatMap((byte, i, bytes) => { 872 | return i % max_bytes_in_field 873 | ? [] 874 | : [bytes.slice(i, i + max_bytes_in_field)]; 875 | }); 876 | 877 | const segments = fields.flatMap(encode_bytes); 878 | return segments; 879 | } 880 | 881 | // Putting everyting together, this function turns a bitstream holding 882 | // arbitrary data into a stream of bytes holding a complete compressed 883 | // IO7 file! 
884 | function compress(input_bits: BitStream): Array { 885 | let blocks: Array> = []; 886 | 887 | let next_block: Array | undefined; 888 | while ((next_block = encode_next_block(input_bits))) { 889 | blocks.push(next_block); 890 | } 891 | 892 | const encoded_blocks = blocks.map(tokens => { 893 | const bits = new BitStream([]); 894 | tokens.forEach(token => write_token(token, bits)); 895 | return bits; 896 | }).map(bits => bits.get_bytes()); 897 | 898 | const output_bytes = []; 899 | encoded_blocks.forEach((encoded_block, i, blocks) => { 900 | // If this is the last block, use the special last block tag 901 | const ds_header_tag = i !== blocks.length - 1 902 | ? [0x00, 0x20] 903 | : [0x36, 0x18]; 904 | 905 | const ds_header = [...ds_header_tag, 0x44, 0x53, 0x00, 0x00]; 906 | 907 | const block_size = encoded_block.length + 0x8004; 908 | const size_header = [block_size & 0xff, ((block_size & 0xff00) >> 8)]; 909 | 910 | output_bytes.push(...size_header, ...ds_header, ...encoded_block); 911 | }); 912 | 913 | return output_bytes; 914 | } 915 | 916 | // Finally, some helpers to facilitate compressing and decompressing to and 917 | // from files. 918 | import * as Fs from "fs"; 919 | 920 | function decompress_file(input_filename: string, output_filename: string) { 921 | const input_bits = new BitStream([...new Uint8Array(Fs.readFileSync(input_filename))]); 922 | 923 | const decompressed_bytes = decompress(input_bits); 924 | 925 | Fs.writeFileSync(output_filename, new Uint8Array(decompressed_bytes)); 926 | } 927 | 928 | function compress_file(input_filename: string, output_filename: string) { 929 | const input_bits = new BitStream([...new Uint8Array(Fs.readFileSync(input_filename))]); 930 | 931 | const compressed_bytes = compress(input_bits); 932 | 933 | Fs.writeFileSync(output_filename, new Uint8Array(compressed_bytes)); 934 | } 935 | 936 | // Try it! 
937 | 938 | //decompress_file("logo.io7", "logo.bmp"); 939 | //decompress_file("logo98.io7", "logo98.bmp"); 940 | //compress_file("ftlogo.bmp", "ftlogo.io7"); 941 | 942 | // And that's all, folks. 943 | // PS <3 944 | -------------------------------------------------------------------------------- /logo.io7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierce-smith1/io7/64ea32e772db062f2300473703e55848c9835ebb/logo.io7 -------------------------------------------------------------------------------- /logo98.io7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierce-smith1/io7/64ea32e772db062f2300473703e55848c9835ebb/logo98.io7 --------------------------------------------------------------------------------