├── .gitattributes ├── .gitignore ├── ANALYSIS.md ├── README.md ├── example └── index.html ├── eyaler ├── README.md └── index.html ├── jeremyrixon ├── README.md └── index.html ├── mmastrac ├── README.md ├── index.html └── x.png ├── winstonewert ├── README.md ├── index.html └── x.png └── xem ├── README.md └── index.html /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear in the root of a volume 35 | .DocumentRevisions-V100 36 | .fseventsd 37 | .Spotlight-V100 38 | .TemporaryItems 39 | .Trashes 40 | .VolumeIcon.icns 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | -------------------------------------------------------------------------------- 
/README.md: -------------------------------------------------------------------------------- 1 | miniBook 2 | === 3 | 4 | Welcome! 5 | 6 | This is a compression challenge for Web developers. 7 | 8 | The goal is to compress [this HTML page](http://xem.github.io/miniBook/example/index.html) containing the complete works of William Shakespeare,
9 | based on the version hosted on [Project Gutenberg](http://www.gutenberg.org/ebooks/100). 10 | 11 | All means are allowed to compress the file, but the decompression must happen entirely in the browser. 12 | 13 | The page is encoded in UTF-8 with BOM and has a size of 5,324,821 bytes.
14 | All characters are ASCII. (````\n !"&'(),-.0123456789:;<>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz|}````) 15 | 16 | More info on [ANALYSIS.md](https://github.com/xem/miniBook/blob/gh-pages/ANALYSIS.md) 17 | 18 | **Rules** 19 | 20 | - Fork this repo. 21 | - Copy the "example" folder and rename the copy with your Github username. 22 | - Your folder contains a file called index.html, containing a <xmp> HTML tag followed by a long text. 23 | - You can create other text and/or binary files at your convenience. 24 | - Compress the file as much as you can without altering the output. 25 | - The decompression must happen in the browser, without any user input, and use a reasonable amount of time (<30 min) and memory (<4GB) on a modern computer. 26 | - You can use HTML & JavaScript only, no PHP (or any server-side language), no VBScript. 27 | - You CAN use HTML entities to replace any characters. 28 | - You CAN use gzip (or competing) compression algorithms, other charsets than UTF-8, binary packing, neural networks... 29 | - You CAN use temporary elements (text, canvas...), but need to hide them or remove them from the DOM at the end. 30 | - You can NOT perform network queries or read any file that's not present in your folder. 31 | - You can NOT use browser extensions, JS error messages, or the built-in spellchecker. 32 | - You can NOT store data in file names, HTTP headers or anything that wouldn't appear in the byte count. 33 | - Your score is the total size, in bytes, of the file(s) present in your folder. 34 | - Add a readme file explaining your process. 35 | - Open a merge request to appear in the leaderboard. 36 | 37 | **Reward** 38 | 39 | - A first reward of 50€ (via Paypal) will be granted to the first person that beats 1050kb (i.e. 15kb bigger than paq8hp12's score for this file) 40 | - Then, a reward of 10 + X€ will be granted to each person that beats the previous record by Xkb. 
41 | 42 | **Leaderboard** 43 | 44 | - Eyal Gruss: 1 356 386 bytes (using text preprocessing + Burrows-Wheeler + Move-to-front variant + Huffman + Burrows-Wheeler on bits + ECT PNG bootstrap + crEnc) [demo](http://xem.github.io/miniBook/eyaler/index.html), [readme](https://github.com/xem/miniBook/blob/gh-pages/eyaler/README.md) 45 | - Matt Mastrac: 1 488 397 bytes (using better PNG bootstrap) [demo](http://xem.github.io/miniBook/mmastrac/index.html), [readme](https://github.com/xem/miniBook/blob/gh-pages/mmastrac/README.md) 46 | - winstonewert: 1 971 890 bytes (using PNG bootstrap) [demo](http://xem.github.io/miniBook/winstonewert/index.html), [readme](https://github.com/xem/miniBook/blob/gh-pages/winstonewert/README.md) 47 | - jeremyrixon: 2 961 629 bytes (using Unicode and LZW) [demo](http://xem.github.io/miniBook/jeremyrixon/index.html), [readme](https://github.com/xem/miniBook/blob/gh-pages/jeremyrixon/README.md) -------------------------------------------------------------------------------- /eyaler/README.md: -------------------------------------------------------------------------------- 1 | [ZTML](https://github.com/eyaler/ztml) is a compression pipeline I wrote for inline text compression in HTML / JS. 2 | 3 | It uses some text preprocessing + Burrows-Wheeler + Move-to-front variant + Huffman + Burrows-Wheeler on bits + ECT PNG bootstrap + crEnc (a yEnc-like efficient Base64 alternative) + minification. 
4 | 5 | Code to generate index.html: 6 | https://github.com/eyaler/ztml/misc/minibook.py -------------------------------------------------------------------------------- /eyaler/index.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xem/miniBook/bcc97c42ee7f79673d61112140d5794d6a60fcff/eyaler/index.html -------------------------------------------------------------------------------- /jeremyrixon/README.md: -------------------------------------------------------------------------------- 1 | 2 | Uses a unicode encoded string to hold LZW compressed text. 3 | 4 | It doesn't beat the previous winner but it was interesting to work on. 5 | 6 | Things to note: 7 | * The index.html file is UTF-8 encoded but the javascript will see the string as UTF-16 encoded because the browser's javascript interpreter has already parsed the content of the script tag. 8 | * Unicode codepoints 0x0d800 to 0x0dfff are invalid (on their own) and so must be skipped when using codepoints as LZW dictionary codes. 9 | * The javascript LZW code is based on this gist: https://gist.github.com/revolunet/843889 10 | 11 | Here's the code I used to create the UTF-8 encoded string: 12 | 13 | ~~~ 14 | // Based on https://gist.github.com/revolunet/843889 15 | var fs = require('fs'); 16 | 17 | var MINCDE = 0x00080; 18 | var SKPFRM = 0x0d800; 19 | var SKIPTO = 0x0e000; 20 | var MAXCDE = 0xfffff; 21 | 22 | // LZW-compress a string 23 | function lzw_encode(s) { 24 | var dict = {}; 25 | var out = ""; 26 | var phrase = s.charAt(0); 27 | var code = MINCDE; 28 | for (var i=1; i 1 ? String.fromCodePoint(dict[phrase]) : phrase.charAt(0); 34 | dict[phrase + currChar] = code; 35 | code++; 36 | if (code === SKPFRM) { 37 | code = SKIPTO; 38 | } 39 | if (code === MAXCDE) { 40 | dict = {}; 41 | code = MINCDE; 42 | } 43 | phrase=currChar; 44 | } 45 | } 46 | out += phrase.length > 1 ? 
String.fromCodePoint(dict[phrase]) : phrase.charAt(0); 47 | return out; 48 | } 49 | 50 | // Decompress an LZW-encoded string 51 | function lzw_decode(s) { 52 | var dict = {}; 53 | var currChar = s.charAt(0); 54 | var oldPhrase = currChar; 55 | var out = currChar; 56 | var code = MINCDE; 57 | var phrase; 58 | for (var i=1; i 0xffff) { 61 | i++; 62 | } 63 | if (currCode < MINCDE) { 64 | phrase = String.fromCodePoint(currCode); 65 | } 66 | else { 67 | phrase = dict[currCode] ? dict[currCode] : (oldPhrase + currChar); 68 | } 69 | out += phrase; 70 | currChar = phrase.charAt(0); 71 | dict[code] = oldPhrase + currChar; 72 | code++; 73 | if (code === SKPFRM) { 74 | code = SKIPTO; 75 | } 76 | if (code === MAXCDE) { 77 | dict = {}; 78 | code = MINCDE; 79 | } 80 | oldPhrase = phrase; 81 | } 82 | return out; 83 | } 84 | 85 | fs.readFile('input.html', 'utf8', function (err, data1) { 86 | if (err) throw err; 87 | var data2 = lzw_encode(data1); 88 | fs.writeFile ('002.html', data2, function(err) { 89 | if (err) throw err; 90 | fs.readFile('002.html', 'utf8', function (err, data3) { 91 | if (err) throw err; 92 | var data4 = lzw_decode(data3); 93 | fs.writeFile ('recovered.html', data4, function(err) { 94 | if (err) throw err; 95 | console.log((data4.substring(0, 30) + '...' + data4.substring(data4.length-30)).replace(/\s+/g, ' ')); 96 | console.log(data1 === data4 ? 'match' : 'no match'); 97 | }); 98 | }); 99 | }); 100 | 101 | }); 102 | ~~~ 103 | -------------------------------------------------------------------------------- /mmastrac/README.md: -------------------------------------------------------------------------------- 1 | # mmastrac's solution 2 | 3 | This is similar to winstonewert's entry, but we use the BWT transform on the data (from a hacked-up version of compressjs) before encoding to PNG. I used Zopfli's 4 | PNG re-encoder to shrink the PNG even further as well. 
5 | 6 | -------------------------------------------------------------------------------- /mmastrac/index.html: -------------------------------------------------------------------------------- 1 |