├── .gitattributes
├── .gitignore
├── ANALYSIS.md
├── README.md
├── example
└── index.html
├── eyaler
├── README.md
└── index.html
├── jeremyrixon
├── README.md
└── index.html
├── mmastrac
├── README.md
├── index.html
└── x.png
├── winstonewert
├── README.md
├── index.html
└── x.png
└── xem
├── README.md
└── index.html
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 |
7 | # Standard to msysgit
8 | *.doc diff=astextplain
9 | *.DOC diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot diff=astextplain
13 | *.DOT diff=astextplain
14 | *.pdf diff=astextplain
15 | *.PDF diff=astextplain
16 | *.rtf diff=astextplain
17 | *.RTF diff=astextplain
18 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Windows image file caches
2 | Thumbs.db
3 | ehthumbs.db
4 |
5 | # Folder config file
6 | Desktop.ini
7 |
8 | # Recycle Bin used on file shares
9 | $RECYCLE.BIN/
10 |
11 | # Windows Installer files
12 | *.cab
13 | *.msi
14 | *.msm
15 | *.msp
16 |
17 | # Windows shortcuts
18 | *.lnk
19 |
20 | # =========================
21 | # Operating System Files
22 | # =========================
23 |
24 | # OSX
25 | # =========================
26 |
27 | .DS_Store
28 | .AppleDouble
29 | .LSOverride
30 |
31 | # Thumbnails
32 | ._*
33 |
34 | # Files that might appear in the root of a volume
35 | .DocumentRevisions-V100
36 | .fseventsd
37 | .Spotlight-V100
38 | .TemporaryItems
39 | .Trashes
40 | .VolumeIcon.icns
41 |
42 | # Directories potentially created on remote AFP share
43 | .AppleDB
44 | .AppleDesktop
45 | Network Trash Folder
46 | Temporary Items
47 | .apdisk
48 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | miniBook
2 | ===
3 |
4 | Welcome!
5 |
6 | This is a compression challenge for Web developers.
7 |
8 | The goal is to compress [this HTML page](http://xem.github.io/miniBook/example/index.html) containing the complete works of William Shakespeare,
9 | based on the version hosted on [Project Gutenberg](http://www.gutenberg.org/ebooks/100).
10 |
11 | All means are allowed to compress the file, but the decompression must happen entirely in the browser.
12 |
13 | The page is encoded in UTF-8 with BOM and has a size of 5,324,821 bytes.
14 | All characters are ASCII. (````\n !"&'(),-.0123456789:;<>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz|}````)
15 |
16 | More info in [ANALYSIS.md](https://github.com/xem/miniBook/blob/gh-pages/ANALYSIS.md)
17 |
18 | **Rules**
19 |
20 | - Fork this repo.
21 | - Copy the "example" folder and rename the copy with your Github username.
22 | - Your folder contains a file called index.html, containing an `<xmp>` HTML tag followed by a long text.
23 | - You can create other text and/or binary files at your convenience.
24 | - Compress the file as much as you can without altering the output.
25 | - The decompression must happen in the browser, without any user input, and use a reasonable amount of time (<30 min) and memory (<4GB) on a modern computer.
26 | - You can use HTML & JavaScript only, no PHP (or any server-side language), no VBScript.
27 | - You CAN use HTML entities to replace any characters.
28 | - You CAN use gzip (or competing) compression algorithms, other charsets than UTF-8, binary packing, neural networks...
29 | - You CAN use temporary elements (text, canvas...), but need to hide them or remove them from the DOM at the end.
30 | - You can NOT perform network queries or read any file that's not present in your folder.
31 | - You can NOT use browser extensions, JS error messages, or the built-in spellchecker.
32 | - You can NOT store data in file names, HTTP headers or anything that wouldn't appear in the byte count.
33 | - Your score is the total size, in bytes, of the file(s) present in your folder.
34 | - Add a readme file explaining your process.
35 | - Open a merge request to appear in the leaderboard.
36 |
37 | **Reward**
38 |
39 | - A first reward of 50€ (via Paypal) will be granted to the first person that beats 1050kb (i.e. 15kb bigger than paq8hp12's score for this file)
40 | - Then, a reward of 10 + X€ will be granted to each person that beats the previous record by Xkb.
41 |
42 | **Leaderboard**
43 |
44 | - Eyal Gruss: 1 356 386 bytes (using text preprocessing + Burrows-Wheeler + Move-to-front variant + Huffman + Burrows-Wheeler on bits + ECT PNG bootstrap + crEnc) [demo](http://xem.github.io/miniBook/eyaler/index.html), [readme](https://github.com/xem/miniBook/blob/gh-pages/eyaler/README.md)
45 | - Matt Mastrac: 1 488 397 bytes (using better PNG bootstrap) [demo](http://xem.github.io/miniBook/mmastrac/index.html), [readme](https://github.com/xem/miniBook/blob/gh-pages/mmastrac/README.md)
46 | - winstonewert: 1 971 890 bytes (using PNG bootstrap) [demo](http://xem.github.io/miniBook/winstonewert/index.html), [readme](https://github.com/xem/miniBook/blob/gh-pages/winstonewert/README.md)
47 | - jeremyrixon: 2 961 629 bytes (using Unicode and LZW) [demo](http://xem.github.io/miniBook/jeremyrixon/index.html), [readme](https://github.com/xem/miniBook/blob/gh-pages/jeremyrixon/README.md)
--------------------------------------------------------------------------------
/eyaler/README.md:
--------------------------------------------------------------------------------
1 | [ZTML](https://github.com/eyaler/ztml) is a compression pipeline I wrote for inline text compression in HTML / JS.
2 |
3 | It uses some text preprocessing + Burrows-Wheeler + Move-to-front variant + Huffman + Burrows-Wheeler on bits + ECT PNG bootstrap + crEnc (a yEnc-like efficient Base64 alternative) + minification.
4 |
5 | Code to generate index.html:
6 | https://github.com/eyaler/ztml/misc/minibook.py
--------------------------------------------------------------------------------
/eyaler/index.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xem/miniBook/bcc97c42ee7f79673d61112140d5794d6a60fcff/eyaler/index.html
--------------------------------------------------------------------------------
/jeremyrixon/README.md:
--------------------------------------------------------------------------------
1 |
2 | Uses a unicode encoded string to hold LZW compressed text.
3 |
4 | It doesn't beat the previous winner but it was interesting to work on.
5 |
6 | Things to note:
7 | * The index.html file is UTF-8 encoded but the javascript will see the string as UTF-16 encoded because the browser's javascript interpreter has already parsed the content of the script tag.
8 | * Unicode codepoints 0x0d800 to 0x0dfff are invalid (on their own) and so must be skipped when using codepoints as LZW dictionary codes.
9 | * The javascript LZW code is based on this gist: https://gist.github.com/revolunet/843889
10 |
11 | Here's the code I used to create the UTF-8 encoded string:
12 |
13 | ~~~
14 | // Based on https://gist.github.com/revolunet/843889
15 | var fs = require('fs');
16 |
17 | var MINCDE = 0x00080;
18 | var SKPFRM = 0x0d800;
19 | var SKIPTO = 0x0e000;
20 | var MAXCDE = 0xfffff;
21 |
22 | // LZW-compress a string
23 | function lzw_encode(s) {
24 | var dict = {};
25 | var out = "";
26 | var phrase = s.charAt(0);
27 | var code = MINCDE;
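28 | for (var i=1; i<s.length; i++) {
29 | var currChar = s.charAt(i);
30 | if (dict[phrase + currChar] != null) {
31 | phrase += currChar;
32 | } else {
33 | out += phrase.length > 1 ? String.fromCodePoint(dict[phrase]) : phrase.charAt(0);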
34 | dict[phrase + currChar] = code;
35 | code++;
36 | if (code === SKPFRM) {
37 | code = SKIPTO;
38 | }
39 | if (code === MAXCDE) {
40 | dict = {};
41 | code = MINCDE;
42 | }
43 | phrase=currChar;
44 | }
45 | }
46 | out += phrase.length > 1 ? String.fromCodePoint(dict[phrase]) : phrase.charAt(0);
47 | return out;
48 | }
49 |
50 | // Decompress an LZW-encoded string
51 | function lzw_decode(s) {
52 | var dict = {};
53 | var currChar = s.charAt(0);
54 | var oldPhrase = currChar;
55 | var out = currChar;
56 | var code = MINCDE;
57 | var phrase;
58 | for (var i=1; i<s.length; i++) {
59 | var currCode = s.codePointAt(i);
60 | if (currCode > 0xffff) {
61 | i++;
62 | }
63 | if (currCode < MINCDE) {
64 | phrase = String.fromCodePoint(currCode);
65 | }
66 | else {
67 | phrase = dict[currCode] ? dict[currCode] : (oldPhrase + currChar);
68 | }
69 | out += phrase;
70 | currChar = phrase.charAt(0);
71 | dict[code] = oldPhrase + currChar;
72 | code++;
73 | if (code === SKPFRM) {
74 | code = SKIPTO;
75 | }
76 | if (code === MAXCDE) {
77 | dict = {};
78 | code = MINCDE;
79 | }
80 | oldPhrase = phrase;
81 | }
82 | return out;
83 | }
84 |
85 | fs.readFile('input.html', 'utf8', function (err, data1) {
86 | if (err) throw err;
87 | var data2 = lzw_encode(data1);
88 | fs.writeFile ('002.html', data2, function(err) {
89 | if (err) throw err;
90 | fs.readFile('002.html', 'utf8', function (err, data3) {
91 | if (err) throw err;
92 | var data4 = lzw_decode(data3);
93 | fs.writeFile ('recovered.html', data4, function(err) {
94 | if (err) throw err;
95 | console.log((data4.substring(0, 30) + '...' + data4.substring(data4.length-30)).replace(/\s+/g, ' '));
96 | console.log(data1 === data4 ? 'match' : 'no match');
97 | });
98 | });
99 | });
100 |
101 | });
102 | ~~~
103 |
--------------------------------------------------------------------------------
/mmastrac/README.md:
--------------------------------------------------------------------------------
1 | # mmastrac's solution
2 |
3 | This is similar to winstonewert's entry, but we use the BWT transform on the data (from a hacked-up version of compressjs) before encoding to PNG. I used Zopfli's
4 | PNG re-encoder to shrink the PNG even further as well.
5 |
6 |
--------------------------------------------------------------------------------
/mmastrac/index.html:
--------------------------------------------------------------------------------
1 |