├── ztml ├── __init__.py ├── default_vars.py ├── crenc.py ├── huffman.py ├── base125.py ├── tests.py ├── webify.py ├── text_prep.py ├── deflate.py ├── ztml.py ├── bwt_mtf.py └── validation.py ├── .github └── FUNDING.yml ├── ect ├── ect ├── ect.exe ├── ect-ubuntu ├── Build_ECT_Ububtu.ipynb └── License.txt ├── misc ├── reversim2022_slides.pdf ├── minibook.py ├── run_all.bat ├── .htaccess ├── size_checker.py └── example_html.py ├── requirements.txt ├── LICENSE ├── example.py ├── example_image.py ├── TODO.md ├── ZTML.ipynb └── README.md /ztml/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: eyaler 2 | -------------------------------------------------------------------------------- /ect/ect: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eyaler/ztml/HEAD/ect/ect -------------------------------------------------------------------------------- /ect/ect.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eyaler/ztml/HEAD/ect/ect.exe -------------------------------------------------------------------------------- /ect/ect-ubuntu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eyaler/ztml/HEAD/ect/ect-ubuntu -------------------------------------------------------------------------------- /misc/reversim2022_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eyaler/ztml/HEAD/misc/reversim2022_slides.pdf -------------------------------------------------------------------------------- /ztml/default_vars.py: -------------------------------------------------------------------------------- 1 | bitarray = 'b' 2 | bwt_func = 'f' 3 | bytearray = 'o' 4 | image = 'i' 5 | text = 't' 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | bitarray 2 | chardet 3 | numpy 4 | pydivsufsort 5 | pypng 6 | regex 7 | selenium 8 | typing_extensions 9 | webdriver_manager 10 | zopflipy 11 | gutenbergpy 12 | -------------------------------------------------------------------------------- /misc/minibook.py: -------------------------------------------------------------------------------- 1 | # https://xem.github.io/miniBook 2 | 3 | 4 | import sys 5 | from urllib.request import urlopen 6 | 7 | sys.path.append('..') 8 | from ztml import ztml 9 | 10 | 11 | with urlopen('https://xem.github.io/miniBook/example') as f: 12 | out, result = ztml.ztml(f.read(), 'index.html', mtf=80, ect=True, raw=True, validate=True) 13 | print(f'{len(out):,} B') 14 | assert not result 15 | -------------------------------------------------------------------------------- /misc/run_all.bat: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2> nul 2 | 3 | :; trap "exit" INT TERM 4 | :; set -o errexit 5 | :; function goto() { return $?; } 6 | 7 | cd .. 
8 | 9 | python example.py || goto :error 10 | 11 | python example_image.py || goto :error 12 | 13 | cd misc 14 | 15 | python example_html.py || goto :error 16 | 17 | python minibook.py || goto :error 18 | 19 | cd ../ztml 20 | 21 | python tests.py || goto :error 22 | 23 | :; exit 0 24 | exit /b 0 25 | 26 | :error 27 | exit /b %errorlevel% 28 | -------------------------------------------------------------------------------- /misc/.htaccess: -------------------------------------------------------------------------------- 1 | # THIS IS FOR ONLINE TESTING OF OUTPUT FILES 2 | 3 | # SHOW FILES IN FOLDER 4 | Options +Indexes 5 | IndexOptions +FancyIndexing 6 | 7 | # DISABLE CACHING 8 | 9 | ExpiresActive Off 10 | 11 | 12 | FileETag None 13 | Header unset ETag 14 | Header unset Pragma 15 | Header unset Cache-Control 16 | Header unset Last-Modified 17 | Header set Pragma "no-cache" 18 | Header set Cache-Control "max-age=0, no-cache, no-store, must-revalidate" 19 | Header set Expires "Thu, 1 Jan 1970 00:00:00 GMT" 20 | -------------------------------------------------------------------------------- /misc/size_checker.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 5 | old_folder = sys.argv[1] 6 | new_folder = sys.argv[2] 7 | assert new_folder != old_folder 8 | old_files = sorted(os.listdir(old_folder)) 9 | new_files = sorted(os.listdir(new_folder)) 10 | print(f'Old: {old_folder} ({len(old_files)} files)') 11 | print(f'New: {new_folder} ({len(new_files)} files)') 12 | assert new_files == old_files 13 | 14 | for file in old_files: 15 | old_size = os.path.getsize(os.path.join(old_folder, file)) 16 | new_size = os.path.getsize(os.path.join(new_folder, file)) 17 | assert new_size <= old_size, f'{file} grew from {old_size:,} to {new_size:,}' 18 | 19 | print(f'All {len(old_files)} files are equal or smaller.') 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The following license applies to all parts of this software except where a more restrictive license is stated. 2 | 3 | MIT License 4 | 5 | Copyright (c) 2022 Eyal Gruss (https://github.com/eyaler/ztml) 6 | 7 | Copyright (c) 2021-2022 Ethan Halsall (https://github.com/eshaz/simple-yenc) 8 | 9 | Copyright (c) 2016 Kevin Albertson (https://github.com/kevinAlbs/Base122) 10 | 11 | Permission is hereby granted, free of charge, to any person obtaining a copy 12 | of this software and associated documentation files (the "Software"), to deal 13 | in the Software without restriction, including without limitation the rights 14 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 15 | copies of the Software, and to permit persons to whom the Software is 16 | furnished to do so, subject to the following conditions: 17 | 18 | The above copyright notice and this permission notice shall be included in all 19 | copies or substantial portions of the Software. 20 | 21 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 23 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 24 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 25 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 26 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 27 | SOFTWARE. 28 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from time import time 4 | 5 | start_time = time() 6 | 7 | from ztml import validation, ztml 8 | 9 | 10 | books = [30123, 2600] 11 | book_mtf = [0, 80] 12 | book_ect = [False, True] 13 | output_folder = 'output' 14 | skip_download_exists = True 15 | element_id = '' 16 | 17 | 18 | assert len(books) == len(book_mtf) == len(book_ect) 19 | error = False 20 | for item, mtf, ect in zip(books, book_mtf, book_ect): 21 | item_start_time = time() 22 | filenames = dict(raw=f'{item}.txt', 23 | # base64_js=f'{item}_64.js', 24 | base64_html=f'{item}_64.html', 25 | # base125_js=f'{item}_125.js', 26 | base125_html=f'{item}_125.html', 27 | # crenc_js=f'{item}_cr.js', 28 | crenc_html=f'{item}_cr.html') 29 | os.makedirs(output_folder, exist_ok=True) 30 | filenames = {k: os.path.join(output_folder, v) for k, v in filenames.items()} 31 | 32 | # If missing, download an example file from the web 33 | if not skip_download_exists or not os.path.exists(filenames['raw']): 34 | from gutenbergpy.textget import get_text_by_id 35 | with open(filenames['raw'], 'wb') as f: 36 | f.write(get_text_by_id(item)) 37 | 38 | with open(filenames['raw'], 'rb') as f: 39 | data = f.read() 40 | 41 | cnt = 0 42 | for label, filename in filenames.items(): 43 | if label == 'raw': 44 | continue 45 | file = ztml.ztml(data, filename, mtf=mtf, ect=ect, bin2txt=label.rsplit('_', 1)[0], element_id=element_id) 46 | cnt += 1 47 | 48 | print(f'{cnt} encodings of {item} took {(time()-item_start_time) / 60 :.1f} min.') 49 | 50 | # Compare file sizes and validate data is recovered 51 | error |= validation.validate_files(filenames, by='id' * bool(element_id), element=element_id) 52 | print() 53 | 54 | if error: 55 | print('Error: some renderings timed out') 56 | else: 57 | print(f'Total of {len(books)} books took {(time()-start_time) / 60 :.1f} min.') 58 | sys.exit(int(error)) 59 | -------------------------------------------------------------------------------- /misc/example_html.py: -------------------------------------------------------------------------------- 1 | # This is just for testing that ZTML can work on its own HTML outputs in raw mode 2 | 3 | 4 | import os 5 | import sys 6 | from time import time 7 | 8 | start_time = time() 9 | 10 | sys.path.append('..') 11 | from ztml import validation, ztml 12 | 13 | 14 | raw_files = ['30123_64.html', 15 | '30123_125.html', 16 | '30123_cr.html', 17 | 'test_pattern.jpg_64.html', 18 | 'test_pattern.jpg_125.html', 19 | 'test_pattern.jpg_cr.html' 20 | ] 21 | output_folder = '../output' 22 | 23 | 24 | error = False 25 | for url in raw_files: 26 | item_start_time = time() 27 | item = url.replace(os.sep, '/').rsplit('/', 1)[-1] 28 | filenames = dict(raw=item, 29 | # base64_js=f'{item}_64.js', 30 | base64_html=f'{item}_64.html', 31 | # base125_js=f'{item}_125.js', 32 | base125_html=f'{item}_125.html', 33 | # crenc_js=f'{item}_cr.js', 34 | crenc_html=f'{item}_cr.html') 35 | os.makedirs(output_folder, exist_ok=True) 36 | filenames = {k: os.path.join(output_folder, v) for k, v in filenames.items()} 37 | 38 | with 
open(filenames['raw'], 'rb') as f: 39 | data = f.read() 40 | if os.path.splitext(item)[0].endswith('_cr'): 41 | data = data.decode('cp1252', 'backslashreplace') 42 | 43 | cnt = 0 44 | for label, filename in filenames.items(): 45 | if label == 'raw': 46 | continue 47 | file = ztml.ztml(data, filename, bin2txt=label.rsplit('_', 1)[0], raw=True, text_var='z') 48 | cnt += 1 49 | 50 | print(f'{cnt} encodings of {item} took {(time()-item_start_time) / 60 :.1f} min.') 51 | 52 | # Compare file sizes and validate data is recovered 53 | error |= validation.validate_files(filenames, data, content_var='z') 54 | print() 55 | 56 | if error: 57 | print('Error: some renderings timed out') 58 | else: 59 | print(f'Total of {len(raw_files)} raw files took {(time()-start_time) / 60 :.1f} min.') 60 | sys.exit(int(error)) 61 | -------------------------------------------------------------------------------- /ect/Build_ECT_Ububtu.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "private_outputs": true, 7 | "provenance": [], 8 | "authorship_tag": "ABX9TyMGts6SFpEriurAynOgIU2i", 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "language_info": { 16 | "name": "python" 17 | } 18 | }, 19 | "cells": [ 20 | { 21 | "cell_type": "markdown", 22 | "metadata": { 23 | "id": "view-in-github", 24 | "colab_type": "text" 25 | }, 26 | "source": [ 27 | "\"Open" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "source": [ 33 | "%cd /content\n", 34 | "!git clone --recursive https://github.com/fhanau/Efficient-Compression-Tool\n", 35 | "!latestTag=$(git describe --tags `git rev-list --tags --max-count=1`)\n", 36 | "!echo $latestTag\n", 37 | "!git checkout $latestTag\n", 38 | "!apt -y install nasm\n", 39 | "%cd Efficient-Compression-Tool\n", 40 | "!mkdir build\n", 41 | "%cd build\n", 42 | "!cmake ../src\n", 43 | "!make" 44 | ], 45 | "metadata": { 46 | "id": "jh1og550gUD-" 47 | }, 48 | "execution_count": null, 49 | "outputs": [] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "source": [ 54 | "!./ect" 55 | ], 56 | "metadata": { 57 | "id": "hR9UQWk88qsx" 58 | }, 59 | "execution_count": null, 60 | "outputs": [] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "source": [ 65 | "from google.colab import files\n", 66 | "!mv ect ect-ubuntu\n", 67 | "files.download('ect-ubuntu')" 68 | ], 69 | "metadata": { 70 | "id": "b7IETFkT7Y7Z" 71 | }, 72 | "execution_count": null, 73 | "outputs": [] 74 | } 75 | ] 76 | } -------------------------------------------------------------------------------- /example_image.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from time import time 4 | from urllib.request import urlopen 5 | 6 | start_time = time() 7 | 8 | from ztml import validation, ztml 9 | 10 | 11 | image_urls = ['http://wiesmann.codiferes.net/share/bitmaps/test_pattern.bmp', 12 | 'http://wiesmann.codiferes.net/share/bitmaps/test_pattern.gif', 13 | 'http://wiesmann.codiferes.net/share/bitmaps/test_pattern.jpg', 14 | 'http://wiesmann.codiferes.net/share/bitmaps/test_pattern.png', 15 | 'http://wiesmann.codiferes.net/share/bitmaps/test_pattern.webp' 16 | ] 17 | output_folder = 'output' 18 | skip_download_exists = True 19 | element_id = '' 20 | 21 | 22 | error = False 23 | for url in image_urls: 24 | item_start_time = time() 25 | item = url.rsplit('/', 1)[-1] 26 | filenames = 
dict(raw=item, 27 | # base64_js=f'{item}_64.js', 28 | base64_html=f'{item}_64.html', 29 | # base125_js=f'{item}_125.js', 30 | base125_html=f'{item}_125.html', 31 | # crenc_js=f'{item}_cr.js', 32 | crenc_html=f'{item}_cr.html') 33 | os.makedirs(output_folder, exist_ok=True) 34 | filenames = {k: os.path.join(output_folder, v) for k, v in filenames.items()} 35 | 36 | # If missing, download an example file from the web 37 | if not skip_download_exists or not os.path.exists(filenames['raw']): 38 | with urlopen(url) as fin, open(filenames['raw'], 'wb') as fout: 39 | fout.write(fin.read()) 40 | 41 | with open(filenames['raw'], 'rb') as f: 42 | data = f.read() 43 | 44 | cnt = 0 45 | for label, filename in filenames.items(): 46 | if label == 'raw': 47 | continue 48 | file = ztml.ztml(data, filename, bin2txt=label.rsplit('_', 1)[0], element_id=element_id, image=True) 49 | cnt += 1 50 | 51 | print(f'{cnt} encodings of {item} took {(time()-item_start_time) / 60 :.1f} min.') 52 | 53 | # Compare file sizes and validate data is recovered 54 | error |= validation.validate_files(filenames, by='id' * bool(element_id), element=element_id, image=True) 55 | print() 56 | 57 | if error: 58 | print('Error: some renderings timed out') 59 | else: 60 | print(f'Total of {len(image_urls)} images took {(time()-start_time) / 60 :.1f} min.') 61 | sys.exit(int(error)) 62 | -------------------------------------------------------------------------------- /ztml/crenc.py: -------------------------------------------------------------------------------- 1 | """crEnc encoding based on yEnc and optimized for inline HTML / JS text compression and image encoding 2 | 3 | In the spirit of yEnc (why encode?), we only encode symbols where absolutely required. 4 | If the HTML or JS charset can be set to a single-byte encoding as cp1252 (or latin1), 5 | the only symbol requiring special treatment is the carriage-return (CR), hence crEnc, 6 | which can be dealt with by simple backslash escaping. 7 | We embed in JS template literals quotes ``, so we also escape backslash, ` and ${ 8 | giving us an effective 253 byte values out of 256, 9 | with an overhead of ~ 3/256 ~ 1.2% (compared to 33.3% for Base64). 10 | JS does the unescaping, so the decoder only needs to take care of HTML character overrides for NUL and codes in 128 - 159. 11 | An optimal global character modular offset can be applied to minimize escaping, similar to dynEncode (enabled by default). 12 | A minimalistic JS decoder code is generated. 13 | 14 | References: 15 | https://en.wikipedia.org/wiki/Binary-to-text_encoding 16 | http://www.yenc.org 17 | https://github.com/eshaz/simple-yenc 18 | https://github.com/eshaz/simple-yenc#what-is-dynencode 19 | https://html.spec.whatwg.org/multipage/parsing.html#table-charref-overrides 20 | https://stackoverflow.com/questions/10080605/special-character-u0098-read-as-u02dc-using-charcodeat/#10081375 21 | """ 22 | 23 | 24 | from typing import Optional, Tuple 25 | 26 | if not __package__: 27 | import default_vars, webify 28 | else: 29 | # noinspection PyPackages 30 | from . 
import default_vars, webify 31 | 32 | 33 | def encode(data: bytes, offset: int = 0) -> bytes: 34 | if offset: 35 | data = bytes(byte+offset & 255 for byte in data) 36 | return webify.escape(data) 37 | 38 | 39 | def optimize_encode(data: bytes) -> Tuple[bytes, int, int]: 40 | best_offset = 0 41 | for offset in range(256): 42 | out = encode(data, offset) 43 | length = len(out) 44 | if offset == 0: 45 | best_length = length0 = length 46 | if length < best_length: 47 | best_length = length 48 | best_offset = offset 49 | out = encode(data, best_offset) 50 | return out, best_offset, length0 - best_length 51 | 52 | 53 | def get_js_decoder(data: bytes, 54 | offset: Optional[int] = None, 55 | output_var: str = default_vars.bytearray 56 | ) -> bytes: 57 | if offset is None: 58 | encoded, offset, saved = optimize_encode(data) # Time-consuming op. 59 | else: 60 | encoded = encode(data, offset) 61 | first_part = f'{output_var}=Uint8Array.from(`' 62 | function = f"(i=c.charCodeAt()%65533)>>8?129+' \x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c \x8e \x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c \x9e\x9f'.indexOf(c):i" 63 | if offset: 64 | function = f'({function})-{offset}' 65 | last_part = f"`,c=>{function})\n" 66 | return first_part.encode() + encoded + last_part.encode('l1') # Encode with l1 as I used explicit bytes above 67 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | # Todo 2 | 3 | ### Usability 4 | - Simplify the "from ztml.ztml import ztml" hierarchy 5 | - Support encoding video/audio/fonts/PDF/... 6 | - Support encoding multiple media elements 7 | - Provide an easy way to view and edit output HTML in Colab 8 | - Make into a PIP library and start doing versioning 9 | - JS library? 10 | - Expose more parameters and allow skipping steps in ztml() / CLI / Colab, possibly via config file 11 | - Stand-alone online web GUI 12 | - Stand-alone executable (+ script to build it) 13 | 14 | ### Compression 15 | - Ablation benchmarks 16 | - Launch a challenge for smaller decoders 17 | 18 | - #### Entropy coding: 19 | - Auto-caps should use modifiers for next letter/word/sentence/paragraph or block-level, over simple mode instead of falling back to raw. See e.g. 
[Grabowski](https://www.researchgate.net/profile/Szymon-Grabowski-2/publication/258239689_Text_Preprocessing_for_Burrows-Wheeler_Block_Sorting_Compression/links/0046352789a298f289000000), [Batista&Alexandre](https://www.di.ubi.pt/~lfbaa/pubs/dcc2008.pdf) 20 | - Dictionary compression for large texts + add references 21 | - [Fast Huffman one-shift decoder](https://researchgate.net/publication/3159499_On_the_implementation_of_minimum_redundancy_prefix_codes), and follow-up works: [Gagie et al.](https://arxiv.org/pdf/1410.3438.pdf), [Grabowski&Koppl](https://arxiv.org/pdf/2108.05495.pdf) 22 | - Consider [Roadroller](https://lifthrasiir.github.io/roadroller) entropy coder 23 | 24 | #### MTF: 25 | - Improve JS MTF decoding times for large files 26 | - Automatic optimizing over MTF variants 27 | - Benchmark alternatives to MTF + add references 28 | 29 | #### Deflate: 30 | - Investigate effect of PNG aspect ratio on compression / optimize over it 31 | - Investigate Safari canvas size limits 32 | - Use 8/24-bit to overcome canvas size limits when necessary (will not work on Safari, unless we go WebGL) 33 | - Compress metadata into PNG 34 | - [Use WOFF2 as a Brotli container](https://github.com/lifthrasiir/roadroller/issues/9#issuecomment-905580540) 35 | 36 | #### Webification and minification: 37 | - [Base139](https://github.com/kevinAlbs/Base122/issues/3#issuecomment-263787763) 38 | - Compress the JS itself and use [eval](http://perfectionkills.com/global-eval-what-are-the-options), considering also JS packing e.g. [JSCrush](http://iteral.com/jscrush), [JS Crusher](https://jmperezperez.com/js-crusher), [RegPack](https://siorki.github.io/regPack), [Roadroller](https://lifthrasiir.github.io/roadroller) 39 | - Strip whitespace from code lines not part of multi-line content strings (see e.g. above JS packers and [closure-compiler](https://github.com/google/closure-compiler), [jsmin](https://crockford.com/jsmin), [miniMinifier](https://github.com/xem/miniMinifier), [Terser](https://terser.org), [UglifyJS](https://github.com/mishoo/UglifyJS)) 40 | 41 | ### Validation and testing 42 | - Running full tests takes too long 43 | - Linux installation instructions / Enable validation in Colab 44 | - Validation testing for Safari (consider Playwright to test WebKit) 45 | - Fix slow rendering with Selenium in validation 46 | - Tests for text_prep.py: normalize, caps, the 47 | - Automatic testing on GitHub 48 | -------------------------------------------------------------------------------- /ztml/huffman.py: -------------------------------------------------------------------------------- 1 | """Canonical Huffman encoding 2 | 3 | Even though we later compress with DEFLATE which does its own Huffman encoding internally, 4 | I found that for text compression, it is significantly beneficial to pre-encode with Huffman. 5 | Canonical encoding obviates saving or reconstructing an explicit codebook. 6 | Instead, we save a string of symbols and a condensed canonical table of bases and offsets, in a variation of Moffat&Turpin. 7 | A minimalistic JS decoder code is generated. 
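For illustration only (a rough sketch using the same bitarray helper this module already relies on; the names below are just example variables, not part of the module):
    >>> from collections import Counter
    >>> from bitarray.util import canonical_huffman
    >>> codebook, counts, symbols = canonical_huffman(Counter('abracadabra'))
Since canonical codes of a given length are consecutive, the per-length counts and the canonically ordered symbols are enough to rebuild every code, so no explicit codebook has to be shipped to the JS decoder.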
8 | 9 | References: 10 | https://wikipedia.org/wiki/Canonical_Huffman_code 11 | https://github.com/ilanschnell/bitarray/blob/master/doc/canonical.rst 12 | https://researchgate.net/publication/3159499_On_the_implementation_of_minimum_redundancy_prefix_codes (Moffat&Turpin) 13 | https://arxiv.org/pdf/1410.3438.pdf 14 | https://arxiv.org/pdf/2108.05495.pdf 15 | """ 16 | 17 | 18 | from collections import Counter 19 | import sys 20 | from typing import Dict, List, Tuple 21 | 22 | from bitarray import bitarray 23 | from bitarray.util import ba2int, canonical_decode, canonical_huffman 24 | 25 | if not __package__: 26 | import default_vars, webify 27 | else: 28 | # noinspection PyPackages 29 | from . import default_vars, webify 30 | 31 | 32 | DEBUG_SKIP_HUFFMAN = False # This is just for benchmarking and is not implemented in JS decoder 33 | 34 | 35 | def encode(text: str, 36 | validate: bool = True, 37 | verbose: bool = False 38 | ) -> Tuple[List[int], str, str, Dict[str, str]]: 39 | charset = '' 40 | canonical_table = {} 41 | counter = Counter(text) 42 | if DEBUG_SKIP_HUFFMAN: 43 | code_len = len(bin(ord(max(counter, default='\0')))) - 2 44 | codebook = {c: bitarray(bin(ord(c))[2:].zfill(code_len)) for c in counter} 45 | else: 46 | if len(counter): 47 | codebook, counts, symbols = canonical_huffman(counter) 48 | else: 49 | codebook = {} 50 | counts = [] 51 | symbols = [] 52 | charset = ''.join(symbols[::-1]) 53 | canonical_table = {len(code): [2**len(code) - ba2int(code), len(codebook) - i - 1] for i, code in enumerate(codebook.values())} 54 | 55 | bits = bitarray() 56 | if codebook: 57 | bits.encode(codebook, text) 58 | if verbose: 59 | print(sorted([(k, v.to01()) for k, v in codebook.items()], 60 | key=lambda x: -counter[x[0]]), file=sys.stderr) 61 | if charset: 62 | print(len(charset), charset, file=sys.stderr) 63 | print(canonical_table, file=sys.stderr) 64 | if validate: 65 | assert not codebook or ''.join(bits.decode(codebook)) == text 66 | assert DEBUG_SKIP_HUFFMAN or ''.join(canonical_decode(bits, counts, symbols)) == text 67 | canonical_table = ''.join(chr(j) for i in range(max(canonical_table, default=-1) + 1) for j in (canonical_table[i] if i in canonical_table else [2**i + 1, 1])) 68 | rev_codebook = {v.to01(): k for k, v in codebook.items()} 69 | return bits.tolist(), charset, canonical_table, rev_codebook 70 | 71 | 72 | def get_js_decoder(charset: str, 73 | canonical_table: str, 74 | bitarray_var: str = default_vars.bitarray, 75 | text_var: str = default_vars.text, 76 | ) -> str: 77 | # Note that the escaped strings may include more characters requiring safe encoding as regard to encoding domains as well as HTML character overrides 78 | charset = webify.escape(charset, escape_nul=True) 79 | canonical_table = webify.escape(canonical_table, escape_nul=True) 80 | return f'''s=[...`{charset}`] 81 | d=[...`{canonical_table}`] 82 | for(j={text_var}='';j<{bitarray_var}.length;{text_var}+=s[d[k*2-1].codePointAt()+m])for(k=c=0;(m=2**k-d[k++*2].codePointAt()-c)<0;)c+=c+{bitarray_var}[j++] 83 | ''' 84 | 85 | 86 | def encode_and_get_js_decoder(text: str, 87 | bitarray_var: str = default_vars.bitarray, 88 | text_var: str = default_vars.text, 89 | validate: bool = True, 90 | verbose: bool = False 91 | ) -> Tuple[List[int], str]: 92 | bits, charset, canonical_table, _ = encode(text, validate, verbose) 93 | return bits, get_js_decoder(charset, canonical_table, bitarray_var, text_var) 94 | -------------------------------------------------------------------------------- /ztml/base125.py: 
-------------------------------------------------------------------------------- 1 | """Base125 encoding based on Base122 and optimized for inline HTML / JS text compression and image encoding 2 | 3 | If we must use utf8 encoding for HTML or JS, crEnc will not work. 4 | Instead, we can use this original and unnecessarily-optimized version of the variable length Base122. 5 | The original byte stream is split into 7 bit chunks, 6 | which are encoded as a single byte: 0xxxxxxx, to comply with utf8 code point scheme. 7 | We only use 125 byte values out of 128 (excluding CR, backslash and `) 8 | and encode the remaining three with a double byte scheme: 110ssxxx 10xxxxxx, 9 | where ss is 01, 10 or 11, and 9 bits are left for next data. 10 | Alternatively, if these are the final 7 bits, we instead encode as: 1100010x 10xxxxxx. 11 | As, we embed in JS template literals quotes ``, we further escape ${ with backslash. 12 | The overhead is ~ 8/7 * 253/256 + 16/11 * 3/256 - 1 ~ 14.7% (compared to 33.3% for Base64). 13 | The decoder further takes care of HTML character override for NUL. 14 | An optimal global character modular offset can be added to minimize escaping, similar to dynEncode (disabled by default). 15 | A minimalistic JS decoder code is generated. 16 | 17 | References: 18 | https://en.wikipedia.org/wiki/Binary-to-text_encoding 19 | https://blog.kevinalbs.com/base122 20 | https://github.com/kevinAlbs/Base122 21 | https://github.com/eshaz/simple-yenc#what-is-dynencode 22 | """ 23 | 24 | 25 | from typing import Optional, Tuple 26 | 27 | if not __package__: 28 | import default_vars 29 | else: 30 | # noinspection PyPackages 31 | from . import default_vars 32 | 33 | 34 | illegal = ['', 13, 92, 96] 35 | 36 | 37 | def encode(data: bytes, offset: int = 0, validate: bool = True) -> bytes: 38 | cur_index = 0 39 | cur_bit = 0 # Points to current bit needed 40 | out = bytearray() 41 | 42 | # Get 7 or 9 bits of input data. Returns None if there is no input left 43 | def get_bits(length: int) -> Optional[int]: 44 | nonlocal cur_index, cur_bit 45 | if cur_index >= len(data): 46 | return None 47 | 48 | # Shift, mask, unshift to get first part. 
Align it to a 7 or 9 bit chunk 49 | first_part = (255>>cur_bit & data[cur_index]+offset & 255) << cur_bit 50 | diff = 8 - length 51 | if diff > 0: 52 | first_part >>= diff 53 | else: 54 | first_part <<= -diff 55 | # Check if we need to go to the next byte for more bits 56 | cur_bit += length 57 | if cur_bit < 8: 58 | return first_part # Do not need next byte 59 | cur_bit -= 8 60 | cur_index += 1 61 | # Now we want bits [0..cur_bit] of the next byte if it exists 62 | if cur_index >= len(data): 63 | return first_part 64 | # Align it 65 | second_part = (0xff00>>cur_bit & data[cur_index]+offset & 255) >> 8-cur_bit 66 | return first_part | second_part 67 | 68 | while True: 69 | # Grab 7 bits 70 | bits = get_bits(7) 71 | if bits is None: 72 | break 73 | try: 74 | illegal_index = illegal.index(bits) 75 | # Since this will be a two-byte character, get the next chunk of 9 bits 76 | next_bits = get_bits(9) 77 | if next_bits is None: 78 | b1 = 4 79 | next_bits = bits 80 | else: 81 | b1 = illegal_index << 3 82 | # Push first 3 bits onto first byte, remaining 6 onto second 83 | out.extend([192 | b1 | next_bits>>6, 128 | next_bits&63]) 84 | except ValueError: 85 | out.append(bits) 86 | 87 | if validate: 88 | decoded = decode(out, offset) 89 | assert decoded == data, (len(decoded), len(data), decoded[:30], data[:30]) 90 | return out.replace(b'${', b'\\${') 91 | 92 | 93 | def optimize_encode(data: bytes, 94 | validate: bool = True 95 | ) -> Tuple[bytes, int, int]: 96 | best_offset = 0 97 | for offset in range(256): 98 | length = len(encode(data, offset, validate=False)) 99 | if offset == 0: 100 | best_length = length0 = length 101 | if length < best_length: 102 | best_length = length 103 | best_offset = offset 104 | out = encode(data, best_offset, validate) 105 | return out, best_offset, length0 - best_length 106 | 107 | 108 | def decode(data: bytes, offset: int = 0) -> bytes: 109 | out = bytearray() 110 | next_byte = 0 111 | k = 0 112 | 113 | def push_bits(bits: int, length: int = 7) -> None: 114 | nonlocal next_byte, k 115 | next_byte |= bits << (length < 8) >> k >> (length > 8) 116 | k += length 117 | if k > 7: 118 | out.append((next_byte&255)-offset & 255) 119 | k -= 8 120 | next_byte = bits << 8-k 121 | 122 | for byte in data.decode(): 123 | b = ord(byte) 124 | if b > 127: 125 | ss = b >> 9 126 | if ss: 127 | push_bits(illegal[ss]) 128 | push_bits(b<<2*(not ss) & 511, 9) 129 | else: 130 | push_bits(b) 131 | return out 132 | 133 | 134 | def get_js_decoder(data: bytes, 135 | offset: Optional[int] = 0, 136 | output_var: str = default_vars.bytearray, 137 | validate: bool = True 138 | ) -> bytes: 139 | if offset is None: 140 | encoded, offset, saved = optimize_encode(data, validate) # Time-consuming op. 
141 | else: 142 | encoded = encode(data, offset, validate) 143 | illegal_str = ','.join(str(i) for i in illegal) 144 | first_part = f'''k=n=0 145 | p=(b,l=7)=>(n|=b<<(l<8)>>k>>(l>8),k+=l,k>7?(v=n{-offset or ''},k-=8,n=b<<8-k,v):[]) 146 | {output_var}=new Uint8Array([...`''' 147 | last_part = f'`].flatMap(c=>(i=c.charCodeAt()%65533,i>127?(e=i>>9,[e?p([{illegal_str}][e]):[],p(i<<2*!e&511,9)].flat()):p(i))))\n' 148 | return first_part.encode() + encoded + last_part.encode() 149 | 150 | 151 | def test() -> None: 152 | for i in range(100): 153 | for j in range(100): 154 | for offset in [0, 1]: 155 | for symbol in [b'\r', b'\\', b'`']: 156 | encode(b'\0'*i + symbol*j, offset, validate=True) 157 | 158 | 159 | if __name__ == '__main__': 160 | test() 161 | -------------------------------------------------------------------------------- /ztml/tests.py: -------------------------------------------------------------------------------- 1 | import os 2 | from time import time 3 | 4 | start_time = time() 5 | 6 | if not __package__: 7 | import text_prep, bwt_mtf, deflate, validation, webify, ztml 8 | else: 9 | # noinspection PyPackages 10 | from . import text_prep, bwt_mtf, deflate, validation, webify, ztml 11 | 12 | 13 | min_char_code1 = 0 14 | max_char_code1 = 14000 15 | min_char_code2 = 55000 16 | max_char_code2 = 66000 17 | browsers = list(validation.drivers)[:1] 18 | input_encodings = ['utf8', 'cp1252', 'cp1255'] 19 | bin2txt_encodings = ztml.bin2txt_encodings 20 | caps_modes = ['auto', 'simple'] # text_prep.caps_modes 21 | mtf_variants = [None, 0, 52, 80] # bwt_mtf.mtf_variants 22 | bitdepths = deflate.allowed_bitdepths 23 | ect_modes = [False, True] 24 | temp_folder = 'tmp' 25 | cleanup = True 26 | 27 | 28 | all_chars = ''.join(chr(i) for i in range(min_char_code1, min(max_char_code1 or bwt_mtf.max_unicode, bwt_mtf.max_unicode) + 1)) 29 | if min_char_code2 and max_char_code2: 30 | all_chars += ''.join(chr(i) for i in range(min_char_code2, min(max_char_code2 or bwt_mtf.max_unicode, bwt_mtf.max_unicode) + 1) if chr(i) not in all_chars) 31 | os.makedirs(temp_folder, exist_ok=True) 32 | i = 0 33 | for browser in browsers: 34 | with validation.get_browser(browser) as b: 35 | for encoding in input_encodings: 36 | encoding = encoding.lower() 37 | for bin2txt in bin2txt_encodings: 38 | for caps in caps_modes: 39 | for bwtsort in [True, False]: 40 | for mtf in mtf_variants: 41 | for bitdepth in bitdepths: 42 | for ect in ect_modes: 43 | for render_mode in range(3): 44 | element_id = '' 45 | raw = False 46 | if render_mode == 1: 47 | element_id = 'myid' 48 | elif render_mode == 2: 49 | raw = True 50 | test_start_time = time() 51 | i += 1 52 | print(f'{i}/{len(browsers) * len(input_encodings) * len(bin2txt_encodings) * len(caps_modes) * 2 * len(mtf_variants) * len(bitdepths) * len(ect_modes) * 3} browser={browser} input_enc={encoding} bin2txt={bin2txt} caps={caps} bwtsort={bwtsort} mtf={mtf} bitdepth={bitdepth} ect={ect} id={bool(element_id)} raw={raw}') 53 | suffix = f"{browser}_{encoding}_{bin2txt}_{caps}{'_bwtsort' * bwtsort}_{mtf}_{bitdepth}{'_ect' * ect}" 54 | if element_id: 55 | suffix += '_id' 56 | if raw: 57 | suffix += '_raw' 58 | input_filename = os.path.join(temp_folder, f'ztml_test_file_{suffix}.txt') 59 | output_filename = os.path.join(temp_folder, f'ztml_test_file_{suffix}.html') 60 | output_stream = os.path.join(temp_folder, f'ztml_test_stream_{suffix}.html') 61 | text = all_chars 62 | if mtf is not None: 63 | text = ''.join(c for c in text if ord(c) <= bwt_mtf.max_ord_for_mtf) 64 | if 
encoding.replace('-', '') == 'utf8': 65 | text = ''.join(c for c in text if ord(c) < bwt_mtf.surrogate_lo or ord(c) > bwt_mtf.surrogate_hi) 66 | out1, result1 = ztml.ztml(text, unix_newline=False, remove_bom=False, caps=caps, bwtsort=bwtsort, mtf=mtf, bitdepth=bitdepth, ect=ect, bin2txt=bin2txt, element_id=element_id, raw=raw, validate=True, browser=b, verbose=True) 67 | out2, result2 = ztml.ztml(text, output_filename, unix_newline=False, remove_bom=False, caps=caps, bwtsort=bwtsort, mtf=mtf, bitdepth=bitdepth, ect=ect, bin2txt=bin2txt, element_id=element_id, raw=raw, validate=True, browser=b, verbose=True) 68 | with open(output_filename, 'rb') as f: 69 | out = f.read() 70 | assert not result1 and not result2 and out1 == out2 == out, (result1, result2, out1 == out2, out1 == out, out2 == out, len(out1), len(out2), validation.full_path(output_filename), len(out)) 71 | with open(input_filename, 'wb') as f: 72 | f.write(webify.safe_encode(text, encoding)) 73 | bwtsort_arg = '--skip_bwtsort' * (not bwtsort) 74 | ect_arg = '--ect' * ect 75 | element_id_or_raw_arg = '' 76 | if element_id: 77 | element_id_or_raw_arg = f'--element_id "{element_id}"' 78 | if raw: 79 | element_id_or_raw_arg = '--raw' 80 | result1 = os.system(f'python ztml.py "{input_filename}" "{output_filename}" --skip_unix_newline --skip_remove_bom --caps {caps} {bwtsort_arg} --mtf {mtf} --bitdepth {bitdepth} {ect_arg} --bin2txt {bin2txt} {element_id_or_raw_arg} --validate --browser {browser} --verbose') 81 | result2 = os.system(f'python ztml.py "{input_filename}" --skip_unix_newline --skip_remove_bom --caps {caps} {bwtsort_arg} --mtf {mtf} --bitdepth {bitdepth} {ect_arg} --bin2txt {bin2txt} {element_id_or_raw_arg} --validate --browser {browser} --verbose > {output_stream}') 82 | with open(output_filename, 'rb') as f1: 83 | out1 = f1.read() 84 | with open(output_stream, 'rb') as f2: 85 | out2 = f2.read() 86 | if out2.endswith(b'\x1b[0m'): # E.g. due to PyCharm terminal 87 | out2 = out2[:-4] 88 | assert not result1 and not result2 and out1 == out2, (result1, result2, out1 == out2, validation.full_path(output_filename), len(out1), validation.full_path(output_stream), len(out2)) 89 | if cleanup: 90 | for filename in [input_filename, output_filename, output_stream]: 91 | try: 92 | os.remove(filename) 93 | except PermissionError: 94 | pass 95 | print(f'Test took {time() - test_start_time :.0f} sec.\n') 96 | if cleanup: 97 | try: 98 | os.rmdir(temp_folder) 99 | except OSError: 100 | pass 101 | print(f'Total took {(time()-start_time) / 60 :.1f} min.') 102 | -------------------------------------------------------------------------------- /ztml/webify.py: -------------------------------------------------------------------------------- 1 | """ Minification by way of aliasing AKA uglification 2 | 3 | Substitutes recurring element, attribute and function names with short aliases. 4 | This is far from being a full-fledged JS minifier, and only addresses specific forms of aliasing 5 | (with defaults tuned for the author's own hand-minified use cases) 6 | You may be able to reduce your script further with JS minifiers and packers (see references), 7 | however these might not be compatible with ZTML (especially when using the non-utf8 crEnc). 8 | 9 | Warnings: 10 | 1. The two-parameter aliases would miss substitutions involving tag function syntax, i.e. 11 | func`str`, even if you specify such forms explicitly. However, see following examples. 12 | 2. 
While alias substitution does support some level of composition, e.g.: 13 | a.appendChild(b=document.createElement`p`).innerHTML='hi' # => C(a,b=E`p`).C='hi' 14 | More complex compositions would miss later substitutions, e.g.: 15 | a.appendChild(b=document.createElement`p`).appendChild(c) # => C(a,b=E`p`).appendChild(c) 16 | a.appendChild(b=document.createElement`p`).setAttribute('style',c) # => C(a,b=E`p`).setAttribute('style',c) 17 | 3. Non-static method aliases support only specific parameter signatures as appear in 18 | default_aliases. Attempting to specify different signatures will break your code. 19 | 4. You may need to set replace_quoted=False if you do not want e.g. all 'length', "Length" 20 | to be replaced by: L 21 | 5. Aliases to be used in other aliases e.g. document, should be specified before the latter. 22 | 23 | References: 24 | https://github.com/google/closure-compiler 25 | http://iteral.com/jscrush 26 | https://nikhilism.com/post/2012/demystifying-jscrush 27 | https://github.com/possan/jsintros/blob/master/a/src/crush.js 28 | https://jmperezperez.com/js-crusher 29 | https://crockford.com/jsmin 30 | https://github.com/xem/miniMinifier 31 | https://siorki.github.io/regPack 32 | https://lifthrasiir.github.io/roadroller 33 | https://terser.org 34 | https://github.com/mishoo/UglifyJS 35 | """ 36 | 37 | 38 | import re 39 | import sys 40 | from typing import AnyStr 41 | 42 | 43 | raw_extensions = ['htm', 'html', 'svg'] 44 | image_extensions = ['bmp', 'gif', 'jfif', 'jpe', 'jpeg', 'jpg', 'png', 'webp'] 45 | 46 | 47 | default_aliases = ''' 48 | D = document 49 | A = (e, d) => e.setAttribute('style', d) 50 | B = document.body 51 | C = (e, c) => e.appendChild(c) 52 | E = (e='div') => document.createElement(e) 53 | F = String 54 | G = 'target' 55 | H = 'innerHTML' 56 | I = setInterval 57 | J = clearInterval 58 | K = e => e.codePointAt() 59 | L = 'length' 60 | M = Math 61 | N = speechSynthesis 62 | O = setTimeout 63 | ''' 64 | 65 | literals_regex = rf'(`(?:\\.|[^`\\])*`)' 66 | 67 | 68 | def escape(s: AnyStr, escape_nul: bool = False) -> AnyStr: 69 | pattern = r'\\|`|\${' 70 | repl = r'\\\g<0>' 71 | cr = '\r' 72 | esc_cr = '\\r' 73 | nul = '\0' 74 | esc_nul = '\\0' 75 | if isinstance(s, bytes): 76 | pattern = pattern.encode() 77 | repl = repl.encode() 78 | cr = cr.encode() 79 | esc_cr = esc_cr.encode() 80 | s = re.sub(pattern, repl, s).replace(cr, esc_cr) 81 | if escape_nul: 82 | s = s.replace(nul, esc_nul) 83 | return s 84 | 85 | 86 | def safe_encode(s: str, encoding: str, get_back_unused: bool = False) -> bytes: 87 | encoding = encoding.lower() 88 | out = s.encode(encoding, 'strict' if encoding.replace('-', '') == 'utf8' else 'backslashreplace') 89 | out = re.sub(rb'\\U000?([\da-f]{5,6})', rb'\\u{\1}', out) 90 | if get_back_unused and encoding == 'cp1252': 91 | out = out.replace(b'\\x81', b'\x81').replace(b'\\x8d', b'\x8d').replace(b'\\x8f', b'\x8f').replace(b'\\x90', b'\x90').replace(b'\\x9d', b'\x9d') # These actually do not require escaping in HTML 92 | return out 93 | 94 | 95 | def get_len(s: AnyStr, encoding: str) -> int: 96 | return len(safe_encode(s, encoding) if isinstance(s, str) else s) 97 | 98 | 99 | def uglify(script: AnyStr, 100 | aliases: str = default_aliases, 101 | replace_quoted: bool = True, 102 | min_cnt: int = 2, 103 | prevent_grow: bool = True, 104 | add_used_aliases: bool = True, 105 | encoding: str = 'utf8', 106 | ) -> AnyStr: 107 | orig_len = get_len(script, encoding) 108 | shorts = set() 109 | for alias in reversed(aliases.strip().splitlines()): 110 | 
alias = alias.replace(' ', '') 111 | if not alias: 112 | continue 113 | short, long = alias.split('=', 1) 114 | assert short not in shorts, short 115 | shorts.add(short) 116 | prefix = '' 117 | comma = '' 118 | if re.search(r'(\b\w+\b)[^>]*=>[^.]*\b\1\.', long): 119 | prefix = r'(\w([\w.]|\[[^[\]]+\])*)\.' 120 | if re.search('[^,]+,[^>]+=>', long): 121 | comma = ',' 122 | long = re.sub(r'[^>]*(?P\b\w+\b)[^>]*=>[^.]*\b(?P=prefix)\.|[^>]+=>|\([^,)]*\)|,.*', '', long) 123 | if prefix: 124 | short += '(\\1' 125 | if '(' not in long: 126 | long += '(' 127 | short += comma 128 | long = prefix + re.sub('[\'"]', '[\'"]', re.escape(long)) 129 | elif long[0] == long[-1] in '\'"': 130 | short = lambda x, short=short, long=long: f"{'[' * (len(x[0]) < len(long))}{short}{']' * (len(x[0]) < len(long))}" 131 | long = f'\\.{long[1:-1]}' + re.sub('[\'"]', '[\'"]', f'|{long}') * replace_quoted 132 | if re.match('\\w', long[0]): 133 | long = f'\\b{long}' 134 | if re.match('\\w', long[-1]): 135 | long += '\\b' 136 | if isinstance(script, bytes): 137 | long = safe_encode(long, encoding) 138 | if isinstance(short, str): 139 | short = safe_encode(short, encoding) 140 | else: 141 | short = lambda x, short=short: safe_encode(short(x), encoding) 142 | sub = script[:0] 143 | cnt = 0 144 | parts = re.split(safe_encode(literals_regex, encoding) if isinstance(script, bytes) else literals_regex, script) 145 | for i, part in enumerate(parts): 146 | if i % 2 == 0: 147 | part, c = re.subn(long, short, part) 148 | cnt += c 149 | sub += part 150 | if cnt >= min_cnt: 151 | if add_used_aliases: 152 | alias += '\n' 153 | if isinstance(sub, bytes): 154 | alias = safe_encode(alias, encoding) 155 | if alias not in sub: 156 | sub = alias + sub.lstrip() 157 | if not prevent_grow or get_len(sub, encoding) < get_len(script, encoding): 158 | script = sub 159 | new_len = get_len(script, encoding) 160 | if new_len > orig_len: 161 | print(f'Warning: uglified size increased: {new_len} B > {orig_len} B', file=sys.stderr) 162 | return script 163 | 164 | 165 | def html_wrap(script: AnyStr, 166 | aliases: str = default_aliases, 167 | replace_quoted: bool = True, 168 | min_cnt: int = 2, 169 | prevent_grow: bool = True, 170 | lang: str = '', 171 | encoding: str = 'utf8', 172 | mobile: bool = False, 173 | title: str = '', 174 | ) -> AnyStr: 175 | html_lang = f'' * bool(lang) 176 | encoding = encoding.lower() 177 | if encoding == 'utf-8': 178 | encoding = 'utf8' 179 | elif encoding in ['cp1252', 'latin1']: 180 | encoding = 'l1' # HTML5 treats these the same 181 | mobile_meta = '' * mobile 182 | title_element = f'{title}' * bool(title) 183 | html_header = f'{html_lang}{mobile_meta}{title_element}' 185 | sep = '' 186 | if isinstance(script, bytes): 187 | html_header = safe_encode(html_header, encoding) 188 | html_footer = safe_encode(html_footer, encoding) 189 | sep = safe_encode(sep, encoding) 190 | if aliases: 191 | script = uglify(script, aliases, replace_quoted, min_cnt, prevent_grow, encoding=encoding) 192 | return sep.join([html_header, script.strip(), html_footer]) 193 | -------------------------------------------------------------------------------- /ZTML.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "private_outputs": true, 7 | "provenance": [], 8 | "collapsed_sections": [], 9 | "authorship_tag": "ABX9TyOZ/X56cwNPCb8Cs4lZTsRx", 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 
| "display_name": "Python 3" 15 | }, 16 | "language_info": { 17 | "name": "python" 18 | } 19 | }, 20 | "cells": [ 21 | { 22 | "cell_type": "markdown", 23 | "metadata": { 24 | "id": "view-in-github", 25 | "colab_type": "text" 26 | }, 27 | "source": [ 28 | "\"Open" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "source": [ 34 | "# ZTML\n", 35 | "\n", 36 | "### Extreme inline text compression for HTML / JS\n", 37 | "### By [Eyal Gruss](https://eyalgruss.com) ([@eyaler](https://twitter.com/eyaler)\\)\n", 38 | "\n", 39 | "Repo: [github.com/eyaler/ztml](https://github.com/eyaler/ztml)\n", 40 | "\n", 41 | "Shortcut to Colab: [bit.ly/ztml1](https://bit.ly/ztml1)" 42 | ], 43 | "metadata": { 44 | "id": "V__-3LfHyt5l" 45 | } 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "cellView": "form", 52 | "id": "kKLXYZNYynrz" 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "#@title Setup\n", 57 | "%cd /content\n", 58 | "!git clone -q https://github.com/eyaler/ztml\n", 59 | "!pip -q install -r ztml/requirements.txt" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "source": [ 65 | "#@title Enter text or HTML code\n", 66 | "#@markdown Important: for HTML tick `raw` below\n", 67 | "from IPython.display import display\n", 68 | "from ipywidgets import Layout, Textarea\n", 69 | "try:\n", 70 | " text = textarea.value\n", 71 | "except NameError:\n", 72 | " text = ''\n", 73 | "textarea = Textarea(value=text, placeholder='Type something', description='Text:', layout=Layout(width='90%', height='200px'))\n", 74 | "display(textarea)" 75 | ], 76 | "metadata": { 77 | "cellView": "form", 78 | "id": "Z9RJOcFL_HEw" 79 | }, 80 | "execution_count": null, 81 | "outputs": [] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "source": [ 86 | "#@title or Upload text or HTML or image file\n", 87 | "#@markdown Warning: will clear any input to above textarea\n", 88 | "from google.colab import files\n", 89 | "%cd /content\n", 90 | "try:\n", 91 | " files.upload_file('input_file')\n", 92 | "except ValueError:\n", 93 | " pass\n", 94 | "else:\n", 95 | " try:\n", 96 | " textarea.value = ''\n", 97 | " except NameError:\n", 98 | " pass" 99 | ], 100 | "metadata": { 101 | "cellView": "form", 102 | "id": "pzlcSOpCGFXy" 103 | }, 104 | "execution_count": null, 105 | "outputs": [] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "source": [ 110 | "#@title Compress!\n", 111 | "#@markdown Warning: `bitdepth` of `8-bit`, `24-bit` do not work on Safari\n", 112 | "import os\n", 113 | "output_filename = 'index.html' #@param {type: 'string'}\n", 114 | "input_encoding = '' #@param {type: 'string'}\n", 115 | "reduce_whitespace = False #@param {type: 'boolean'}\n", 116 | "unix_newline = True #@param {type: 'boolean'}\n", 117 | "fix_punct = False #@param {type: 'boolean'}\n", 118 | "remove_bom = True #@param {type: 'boolean'} \n", 119 | "caps = 'auto' #@param ['auto', 'lower', 'raw', 'simple', 'upper']\n", 120 | "bwtsort = True #@param {type: 'boolean'}\n", 121 | "mtf = '0' #@param ['none', 0, 1, 2, 50, 52, 60, 70, 80, 90]\n", 122 | "bitdepth = 1 #@param [1, 8, 24]\n", 123 | "ect = False #@param {type: 'boolean'}\n", 124 | "bin2txt = 'crenc' #@param ['base64', 'base125', 'crenc']\n", 125 | "element_id = '' #@param {type: 'string'}\n", 126 | "raw = True #@param {type: 'boolean'}\n", 127 | "image = False #@param {type: 'boolean'}\n", 128 | "js = False #@param {type: 'boolean'}\n", 129 | "uglify = True #@param {type: 'boolean'}\n", 130 | "replace_quoted = True #@param {type: 'boolean'}\n", 131 | "lang = 
'' #@param {type: 'string'}\n", 132 | "mobile = False #@param {type: 'boolean'}\n", 133 | "title = '' #@param {type: 'string'}\n", 134 | "text_var = 't' #@param {type: 'string'}\n", 135 | "\n", 136 | "if ect:\n", 137 | " try:\n", 138 | " have_ect_lib\n", 139 | " except NameError:\n", 140 | " !add-apt-repository -y ppa:ubuntu-toolchain-r/test\n", 141 | " !apt upgrade libstdc++6\n", 142 | " have_ect_lib = True\n", 143 | "\n", 144 | "%cd /content\n", 145 | "input_filename = 'input_file'\n", 146 | "try:\n", 147 | " if textarea.value:\n", 148 | " with open(input_filename, 'wb') as f:\n", 149 | " f.write(textarea.value.encode())\n", 150 | " print('Using input to textarea')\n", 151 | " else:\n", 152 | " print('Using uploaded file')\n", 153 | "except NameError:\n", 154 | " print('Using uploaded file')\n", 155 | "reduce_whitespace_arg = '--reduce_whitespace' * reduce_whitespace\n", 156 | "unix_newline_arg = '--skip_unix_newline' * (not unix_newline)\n", 157 | "fix_punct_arg = '--fix_punct' * fix_punct\n", 158 | "remove_bom_arg = '--skip_remove_bom ' * (not remove_bom)\n", 159 | "bwtsort_arg = '--skip_bwtsort ' * (not bwtsort)\n", 160 | "ect_arg = '--ect' * ect\n", 161 | "raw_arg = '--raw' * raw\n", 162 | "image_arg = '--image' * image\n", 163 | "js_arg = '--js' * js\n", 164 | "uglify_arg = '--skip_uglify' * (not uglify)\n", 165 | "replace_quoted_arg = '--skip_replace_quoted' * (not replace_quoted)\n", 166 | "mobile_arg = '--mobile' * mobile\n", 167 | "!python ztml/ztml/ztml.py \"$input_filename\" \"$output_filename\" --input_encoding $input_encoding $reduce_whitespace_arg $unix_newline_arg $fix_punct_arg $remove_bom_arg --caps $caps $bwtsort_arg --mtf $mtf --bitdepth $bitdepth $ect_arg --bin2txt $bin2txt --element_id $element_id $raw_arg $image_arg $js_arg $uglify_arg $replace_quoted_arg --lang $lang $mobile_arg --title $title --text_var $text_var\n", 168 | "input_size = os.path.getsize(input_filename)\n", 169 | "output_size = os.path.getsize(output_filename)\n", 170 | "print(f'{input_size:,} B -> {output_size:,} B ({output_size / input_size * 100 :.1f}%)')" 171 | ], 172 | "metadata": { 173 | "id": "qg-KcsfG0CpP", 174 | "cellView": "form" 175 | }, 176 | "execution_count": null, 177 | "outputs": [] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "source": [ 182 | "#@title Download output file\n", 183 | "from google.colab import files\n", 184 | "if bin2txt == 'crenc':\n", 185 | " print(f'Note: {output_filename} is encoded in cp1252, which some editors might break')\n", 186 | "files.download(output_filename)" 187 | ], 188 | "metadata": { 189 | "cellView": "form", 190 | "id": "3C9EVO8sFyA0" 191 | }, 192 | "execution_count": null, 193 | "outputs": [] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "source": [ 198 | "#@title Display output as hex dump\n", 199 | "from IPython.display import HTML\n", 200 | "with open(output_filename, 'rb') as f:\n", 201 | " hex = '0x' + f.read().hex()\n", 202 | "print(hex)\n", 203 | "HTML(f\"\")" 204 | ], 205 | "metadata": { 206 | "id": "v0GwtZtnTprz", 207 | "cellView": "form" 208 | }, 209 | "execution_count": null, 210 | "outputs": [] 211 | } 212 | ] 213 | } -------------------------------------------------------------------------------- /ztml/text_prep.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from typing import Optional, Tuple 3 | 4 | import regex 5 | 6 | if not __package__: 7 | import default_vars 8 | else: 9 | # noinspection PyPackages 10 | from . 
import default_vars 11 | 12 | 13 | newline = r'\n\v\f\r\x85\u2028' 14 | single_quote = '[\u2018-\u201b\u05f3\uff07]' 15 | double_quote = '[\u201c-\u201f\u05f4\uff02]' 16 | apos = "['’]" # \\uff07 17 | eos = '[!.?]' # r'\uff01\uff0e\uff1f\ufe52\ufe56\ufe57' 18 | nonword = r'\p{L}\p{M}\p{N}' 19 | caps_modes = ['auto', 'lower', 'raw', 'simple', 'upper'] 20 | default_caps = 'auto' 21 | 22 | 23 | def normalize(text: str, 24 | reduce_whitespace: bool = False, 25 | unix_newline: bool = True, 26 | fix_punct: bool = False, 27 | strip_bom: bool = True 28 | ) -> str: 29 | if reduce_whitespace: 30 | text = regex.sub(rf'\s*[{newline}]\s*[{newline}]\s*', '\n\n', text.replace('\u2029', '\n\n')) 31 | text = regex.sub(rf'[^\S{newline}]*[{newline}][^\S{newline}]*', '\n', text) 32 | text = regex.sub(rf'[^\S{newline}]+', ' ', text) 33 | text = text.strip() 34 | elif unix_newline: 35 | text = regex.sub('\r\n?', '\n', text) 36 | if fix_punct: 37 | text = regex.sub('\\p{Pd}', '-', text) 38 | text = regex.sub(single_quote, "'", text) 39 | text = regex.sub(double_quote, '"', text) 40 | text = regex.sub('\u2026', '...', text) 41 | if strip_bom: 42 | text = regex.sub('^\ufeff', '', text) 43 | return text 44 | 45 | 46 | caps_regex = rf'(((?=(\r\n|[{newline}]))\3){{2,}}|\u2029|^|{eos})\P{{L}}*.|(^|[^{nonword}])i(?![{nonword}])' # Avoid lookbehind to support Safari 47 | 48 | 49 | def decode_caps_simple(text: str) -> str: 50 | return regex.sub(caps_regex, lambda m: m[0].upper(), text) 51 | 52 | 53 | def encode_caps(text: str, caps: str = default_caps) -> str: 54 | assert caps in caps_modes, f"Error: caps='{caps}' not in {caps_modes}" 55 | return text if caps == 'raw' else text.upper() if caps == 'upper' else text.lower() 56 | 57 | 58 | def remove_the(text: str) -> str: 59 | the_str = 'THE' if text == text.upper() else 'the' 60 | return regex.sub(f'(^(?!{the_str}$)| ){the_str}( |$)', r'\1\2', text, flags=regex.MULTILINE) 61 | 62 | 63 | def get_qu_regex(next_letter_case: str, u_caps: Optional[bool] = None) -> str: 64 | u = 'U' if u_caps or u_caps is None and next_letter_case == 'u' else 'u' 65 | return f'(?={apos}?[^{u}\\P{{L{next_letter_case}}}])' 66 | 67 | 68 | def encode_quq(text: str) -> str: 69 | text = regex.sub(f"([Qq])u{get_qu_regex('l')}", '\\1', text) 70 | return regex.sub(f"QU{get_qu_regex('')}", 'Q', text) 71 | 72 | 73 | def decode_quq(text: str, caps: str) -> str: 74 | if caps == 'raw': 75 | text = regex.sub(f"[Qq]{get_qu_regex('l')}", '\\g<0>u', text) 76 | text = regex.sub(f"Q{get_qu_regex('u')}", 'QU', text) 77 | elif caps == 'upper': 78 | text = regex.sub(f"Q{get_qu_regex('', u_caps=True)}", 'QU', text) 79 | else: 80 | text = regex.sub(f"q{get_qu_regex('')}", 'qu', text) 81 | return text 82 | 83 | 84 | def get_quq_js_decoder(caps: str) -> str: 85 | if caps == 'raw': 86 | js_decoder = f".replace(/[Qq]{get_qu_regex('l')}/gu,'$&u').replace(/Q{get_qu_regex('u')}/gu,'QU')" 87 | elif caps == 'upper': 88 | js_decoder = f".replace(/Q{get_qu_regex('', u_caps=True)}/gu,'QU')" 89 | else: 90 | js_decoder = f".replace(/q{get_qu_regex('')}/gu,'qu')" 91 | return js_decoder 92 | 93 | 94 | def count_bad_quq(text: str, caps: str, verbose: bool = False) -> int: 95 | text = encode_caps(text, caps) 96 | recon = decode_quq(encode_quq(text), caps) 97 | text = regex.split('[Qq]', text) 98 | recon = regex.split('[Qq]', recon) 99 | cnt = sum(a != b for a, b in zip(recon, text)) + abs(len(recon) - len(text)) 100 | if verbose and cnt: 101 | print(f'Warning: found {cnt} cases of q followed by a non u, or terminal qu', file=sys.stderr) 
102 | return cnt 103 | 104 | 105 | def encode_with_fallbacks(text: str, 106 | caps: str = default_caps, 107 | the: bool = True, 108 | quq: bool = True, 109 | caps_fallback: bool = True, 110 | the_fallback: bool = True, 111 | quq_fallback: bool = True, 112 | verbose: bool = False 113 | ) -> Tuple[str, str, bool, bool]: 114 | if caps_fallback: 115 | if caps == 'auto' and text != decode_caps_simple(encode_caps(text, caps)): 116 | caps = 'raw' 117 | if verbose: 118 | print(f"Falling back to caps='{caps}'", file=sys.stderr) 119 | if caps == 'raw': 120 | if text == text.lower(): 121 | caps = 'lower' 122 | elif text == text.upper(): 123 | caps = 'upper' 124 | text = encode_caps(text, caps) 125 | 126 | if the: 127 | theless = remove_the(text) 128 | if the_fallback: 129 | if theless == text: 130 | the = False 131 | if the and regex.search('^ | | $', text, regex.MULTILINE): 132 | the = False 133 | if verbose: 134 | print(f'Falling back to the={the}', file=sys.stderr) 135 | if the: 136 | text = theless 137 | 138 | if quq: 139 | quless = encode_quq(text) 140 | if quq_fallback: 141 | if len(text) - len(quless) < len(get_quq_js_decoder(caps)): 142 | quq = False 143 | if quq and count_bad_quq(text, caps, verbose): 144 | quq = False 145 | if verbose: 146 | print(f'Falling back to quq={quq}', file=sys.stderr) 147 | if quq: 148 | text = quless 149 | 150 | return text, caps, the, quq 151 | 152 | 153 | def get_js_decoder(text: Optional[str] = None, 154 | caps: str = default_caps, 155 | the: bool = True, 156 | quq: bool = True, 157 | text_var: str = default_vars.text 158 | ) -> str: 159 | assert caps in caps_modes, f"Error: caps='{caps}' not in {caps_modes}" 160 | if text is not None: 161 | text, caps, the, quq = encode_with_fallbacks(text, caps, the, quq) 162 | js_decoder = '' 163 | if quq: 164 | js_decoder += get_quq_js_decoder(caps) 165 | if the: 166 | the_str = 'THE' if caps == 'upper' else 'the' 167 | js_decoder += f".replace(/(^(?!$)| )( |$)/gm,'$1{the_str}$2')" 168 | if caps in ['auto', 'simple']: 169 | js_decoder += f'.replace(/{caps_regex}/gu,m=>m.toUpperCase())' 170 | if js_decoder: 171 | js_decoder = f'{text_var}={text_var}{js_decoder}\n' 172 | return js_decoder 173 | 174 | 175 | def encode_and_get_js_decoder(text: str, 176 | caps: str = default_caps, 177 | the: bool = True, 178 | quq: bool = True, 179 | caps_fallback: bool = True, 180 | the_fallback: bool = True, 181 | quq_fallback: bool = True, 182 | verbose: bool = False, 183 | text_var: str = default_vars.text 184 | ) -> Tuple[str, str]: 185 | text, caps, the, quq = encode_with_fallbacks(text, caps, the, quq, caps_fallback, the_fallback, quq_fallback, verbose) 186 | return text, get_js_decoder(caps=caps, the=the, quq=quq, text_var=text_var) 187 | 188 | 189 | def test_quq() -> None: 190 | bad = 0 191 | for caps in caps_modes: 192 | for q in 'Qq': 193 | for u in ['U', 'u', ' ', "' "]: 194 | for a in "AaUu'’ ": 195 | for b in 'Bb ': 196 | orig = f'{q}{u}{a}{b}' 197 | text, new_caps, _, _ = encode_with_fallbacks(orig, caps, the=False, quq=False) 198 | enc = encode_quq(text) 199 | dec = decode_quq(text, new_caps) 200 | if text != dec: 201 | print(f'caps={caps:>6}->{new_caps:>5}: orig={orig} -> text={text} -> enc={enc} -> dec={dec}', file=sys.stderr) 202 | bad += 1 203 | print(f'Found {bad} bad qu cases', file=sys.stderr) 204 | 205 | 206 | if __name__ == '__main__': 207 | test_quq() 208 | -------------------------------------------------------------------------------- /ztml/deflate.py: 
-------------------------------------------------------------------------------- 1 | """PNG / DEFLATE encoding optimized for arbitrary data compression 2 | 3 | Encoding data as a PNG image allows efficient DEFLATE compression (similar to ZIP), 4 | while allowing use of the browser's native decompression capability for free, 5 | thus saving the need for an additional decoder, AKA PNG bootstrapping. 6 | The data is then read from the HTML canvas element. 7 | The image aspect ratio is optimized to be squarish (for higher browser compatibility) with minimal padding. 8 | We do not use the alpha channel due to the browser's alpha pre-multiplication in Canvas 2D causing inaccuracies. 9 | In Safari, even without an alpha channel, similar inaccuracies prevent using 8-bit and 24-bit depths for PNGs. 10 | By default, we use Google's optimized Zopfli compression which is compatible with DEFLATE decompression. 11 | Alternatively, you can use ECT which can be beneficial for large texts (but may slightly hurt smaller ones) 12 | (e.g. ECT 0.9.4 gave 1.4% overall improvement over Zopfli on 2600.txt and minibook) 13 | A minimalistic JS decoder code is generated. 14 | 15 | Other experiments: 16 | 8-bit and 24-bit (RGB) give similar overall results to 1-bit (but do not work on Safari) 17 | WEBP gave worse overall results (libwebp/cwebp from 8-bit and 24-bit PNG, but does seem to work on Safari). 18 | 19 | References: 20 | https://web.archive.org/web/20090826082743/http://blog.nihilogic.dk:80/2008/05/compression-using-canvas-and-png.html 21 | https://web.archive.org/web/20130310075429/http://daeken.com/superpacking-js-demos 22 | https://web.archive.org/web/20130219050720/http://alexle.net/archives/306 23 | https://www.iamcal.com/png-store 24 | https://github.com/iamcal/PNGStore 25 | http://bwirl.blogspot.com/2011/11/optimize-web-apps-with-png.html 26 | https://gist.github.com/gasman/2560551 (pnginator) 27 | https://www.pouet.net/prod.php?which=59298 (JsExe) 28 | https://www.pouet.net/topic.php?which=8770 29 | https://github.com/codegolf/zpng 30 | https://github.com/xem/miniBook 31 | https://github.com/google/zopfli 32 | https://github.com/hattya/zopflipy 33 | https://github.com/fhanau/Efficient-Compression-Tool (ECT) 34 | https://encode.su/threads/2274-ECT-an-file-optimizer-with-fast-zopfli-like-deflate-compression 35 | https://stackoverflow.com/questions/60074569/html-canvas-returns-off-by-some-bytes-from-getimagedata 36 | https://stackoverflow.com/questions/23497925/how-can-i-stop-the-alpha-premultiplication-with-canvas-imagedata/#60564905 37 | https://github.com/jhildenbiddle/canvas-size#test-results 38 | https://pqina.nl/blog/canvas-area-exceeds-the-maximum-limit 39 | https://bugs.webkit.org/show_bug.cgi?id=230855 40 | """ 41 | 42 | 43 | from io import BytesIO 44 | import math 45 | import os 46 | import platform 47 | import sys 48 | from tempfile import NamedTemporaryFile 49 | from typing import List, Iterable, Optional 50 | 51 | import png 52 | # noinspection PyPackageRequirements 53 | import zopfli 54 | 55 | if not __package__: 56 | import default_vars 57 | else: 58 | # noinspection PyPackages 59 | from . 
import default_vars 60 | 61 | 62 | max_dim = 32767 63 | max_len = 11180 ** 2 64 | allowed_bitdepths = [1, 8, 24] # Warning: 8-bit and 24-bit do not work on Safari 65 | default_bitdepth = 1 66 | 67 | 68 | def to_png(bits: Iterable[int], 69 | bitdepth: int = default_bitdepth, # 1, 8, 24 70 | compression: Optional[int] = 9, 71 | ect: bool = False, # This will override zop settings 72 | ect_compression: int = 20009, 73 | ect_filters: str = 'allfilters', # 'allfilters', 'allfilters-b' (brute), 'allfilters-c' (cheap) or '' 74 | zop_filters: str = '', # Any subset of 01234mepb or '' for auto 75 | zop_iterations: int = 15, 76 | zop_iterations_large: int = 5, 77 | omit_iend: bool = True, 78 | filename: str = '', 79 | verbose: bool = False) -> bytes: 80 | data = list(bits) 81 | bit_len = len(data) 82 | assert bit_len 83 | assert bitdepth in allowed_bitdepths, f'Error: bitdepth={bitdepth} not in {allowed_bitdepths}' 84 | assert compression is None or -1 <= compression <= 9 85 | pad_bits = (bitdepth - bit_len) % bitdepth 86 | if bitdepth > 1: 87 | data += [data[-1]] * pad_bits 88 | data = [int(''.join(str(b) for b in data[i : i + bitdepth]), 2) for i in range(0, len(data), bitdepth)] 89 | width = height = pad_pixels = 0 90 | length = None 91 | while width * height != length: 92 | if length is not None: 93 | data.append(data[-1]) 94 | pad_pixels += 1 95 | length = len(data) 96 | assert length <= max_len, f'Error: length={length:,} > max_len={max_len:,}' 97 | height = int(math.sqrt(length)) 98 | while length % height and height > 1 and length // (height-1) <= max_dim: 99 | height -= 1 100 | width = length // height 101 | assert width <= max_dim, f'Error: width={width:,} > max_dim={max_dim:,}' 102 | width_with_channels = width 103 | length_with_channels = length 104 | if bitdepth > 8: 105 | data = [b for i in data for b in i.to_bytes(bitdepth // 8, 'big')] 106 | width_with_channels *= bitdepth // 8 107 | length_with_channels *= bitdepth // 8 108 | data = [data[i : i + width_with_channels] for i in range(0, length_with_channels, width_with_channels)] 109 | png_data = BytesIO() 110 | png.Writer(width, height, greyscale=bitdepth <= 8, 111 | bitdepth=1 if bitdepth == 1 else 8, 112 | compression=compression).write(png_data, data) 113 | png_data.seek(0) 114 | png_data = png_data.read() 115 | out = png_data 116 | 117 | if ect: 118 | with NamedTemporaryFile(suffix='.png', delete=False) as f: # See https://github.com/python/cpython/issues/88221 119 | f.write(out) 120 | filename = f.name 121 | ect_filters_arg = f'--{ect_filters}' * bool(ect_filters) 122 | ect_path = os.path.normpath(os.path.join(os.path.dirname(__file__), '..', 'ect', 'ect')) + '-ubuntu' * (platform.system() == 'Linux') 123 | error = os.system(f'{ect_path} -{ect_compression} -strip -quiet --strict {ect_filters_arg} --mt-deflate {filename}') # Time-consuming op. 124 | assert not error, f'Error: could not run {ect_path} - Please install from https://github.com/fhanau/Efficient-Compression-Tool or use ect=False' 125 | with open(filename, 'rb') as f: 126 | out = f.read() 127 | try: 128 | os.remove(filename) 129 | except PermissionError: 130 | pass 131 | elif zop_iterations > 0 and zop_iterations_large > 0: 132 | out = zopfli.ZopfliPNG(filter_strategies=zop_filters, 133 | iterations=zop_iterations, 134 | iterations_large=zop_iterations_large 135 | ).optimize(png_data) # Time-consuming op. 136 | if omit_iend: # Warning: do this only for PNG files 137 | out = out[:-12] # IEND length (4 bytes) + IEND tag (4 bytes) + IEND CRC-32 (4 bytes). 
Note: do not omit the IDAT zlib Adler-32 or the IDAT CRC-32 as this will break Safari 138 | if verbose: 139 | print(f'input_bits={bit_len} pad_bits={pad_bits} width={width} height={height} pad_pixels={pad_pixels} total_pad_bits={length*bitdepth - bit_len} bits={length * bitdepth} bytes={length*bitdepth+7 >> 3} png={len(png_data)} final={len(out)}', file=sys.stderr) 140 | if filename: 141 | with open(filename, 'wb') as f: 142 | f.write(out) 143 | return out 144 | 145 | 146 | encode = to_png 147 | 148 | 149 | def load_png(filename: str) -> List[int]: 150 | return png.Reader(filename=filename).read_flat()[2].tolist() 151 | 152 | 153 | def get_js_create_image(image_var: str = default_vars.image, 154 | bytearray_var: str = default_vars.bytearray 155 | ) -> str: 156 | return f'''{image_var}=new Image 157 | {image_var}.src=URL.createObjectURL(new Blob([{bytearray_var}])) 158 | ''' 159 | 160 | 161 | def get_js_image_data(bit_len: int, 162 | decoder_script: str = '', 163 | bitdepth: int = default_bitdepth, 164 | image_var: str = default_vars.image, 165 | bitarray_var: str = default_vars.bitarray 166 | ) -> str: 167 | assert bitdepth in allowed_bitdepths, f'Error: bitdepth={bitdepth} not in {allowed_bitdepths}' 168 | js_image_data = f'''{image_var}.decode().then(c=>{{ 169 | c=document.createElement`canvas` 170 | x=c.getContext`2d` 171 | c=[c.width={image_var}.width,c.height={image_var}.height] 172 | x.drawImage({image_var},0,0) 173 | s=x.getImageData({bitarray_var}=[],0,...c).data{'.filter((v,i)=>(i+1)%4)' * (bitdepth == 24)} 174 | ''' 175 | if bitdepth == 1: 176 | js_image_data += f'for(j={bit_len};j--;){bitarray_var}[j]=s[j*4]>>7&1\n' # Applying >>7 to deal with Safari PNG rendering inaccuracy 177 | else: # Will break Safari 178 | js_image_data += f'''for(j={(bit_len+(bitdepth-bit_len)%bitdepth) // 8};j--;)for(k=8;k--;){bitarray_var}[j*8+k]=s[j{'*4' * (bitdepth <= 8)}]>>7-k&1 179 | {bitarray_var}.length={bit_len} 180 | ''' 181 | js_image_data += f'{decoder_script.strip()}}})' 182 | return js_image_data 183 | 184 | 185 | def get_js_image_decoder(bit_len: int, 186 | decoder_script: str = '', 187 | bitdepth: int = default_bitdepth, 188 | image_var: str = default_vars.image, 189 | bytearray_var: str = default_vars.bytearray, 190 | bitarray_var: str = default_vars.bitarray 191 | ) -> str: 192 | return get_js_create_image(image_var, bytearray_var) + get_js_image_data( 193 | bit_len, decoder_script, bitdepth, image_var, bitarray_var) 194 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Open In Colab 2 | 3 | # ZTML 4 | 5 | ### Extreme inline text compression for HTML / JS 6 | ### By [Eyal Gruss](https://eyalgruss.com) ([@eyaler](https://twitter.com/eyaler)) 7 | 8 | #### Partially made at [Stochastic Labs](http://stochasticlabs.org) 9 | 10 | On-chain media storage can require efficient compression for text embedded inline in HTML / JS. 11 | ZTML is a custom pipeline that generates stand-alone HTML or JS files which embed competitively compressed self-extracting text, with file sizes of 25% - 40% the original. 12 | These file sizes include the decoder code which is a highly golfed 1 - 1.5 kB (including auxiliary indices and tables). 13 | The approach makes sense and is optimized for small texts (tens of kB), but performs quite well also on large texts. 
14 | The pipeline includes original low-overhead [binary-to-text alternatives](https://en.wikipedia.org/wiki/Binary-to-text_encoding) to Base64 which are also useful for inline images. 15 | 16 | You can find a very high-level overview in these [slides](misc/reversim2022_slides.pdf) from this [5-minute talk](https://www.youtube.com/watch?v=7rz_MfAIJnY) (in Hebrew) at [Reversim Summit 2022](https://summit2022.reversim.com), and some more technical highlights and discussion in the [encode.su forum thread](https://encode.su/threads/3973-ZTML-Extreme-inline-text-compression-for-HTML-JS). 17 | 18 | ### Benchmark 19 | | | File format | [Micromegas (En)](https://gutenberg.org/files/30123/30123-8.txt) | [War and Peace (En)](https://gutenberg.org/files/2600/2600-0.txt) | 20 | |---------------------------------------------------------------------------------------|---------------|------------------------------------------------------------------|-------------------------------------------------------------------| 21 | | Project Gutenberg plain text utf8 | txt | 63.7 kB | 3.2 MB | 22 | | [paq8px_v206fix1](http://www.mattmahoney.net/dc/text.html#1250) -12RT (excl. decoder) | paq | 13.3 kB (21%) | 575 kB (18%) | 23 | | 7-Zip 22.01 9 Ultra PPMd (excl. decoder) | 7z | 20.8 kB (32%) | 746 kB (23%) | 24 | | 7-Zip 22.01 9 Ultra PPMd (self-extracting) | exe | 232 kB (364%) | 958 kB (29%) | 25 | | Zstandard 1.5.2 -22 --ultra (excl. decoder) | zst | 23.4 kB (37%) | 921 kB (28%) | 26 | | [Roadroller](https://github.com/lifthrasiir/roadroller) 2.1.0 -O2 | js | 26.5 kB (42%) | 1.0 MB (30%) | 27 | | **ZTML Base125** | html (utf8) | 26.4 kB (41%) `mtf=0` | 902 kB (28%) `mtf=80` `ect=True` | 28 | | **ZTML crEnc** | html (cp1252) | 23.5 kB (37%) `mtf=0` | 803 kB (24%) `mtf=80` `ect=True` | 29 | 30 | ### Installation 31 | ``` 32 | git clone https://github.com/eyaler/ztml 33 | pip install -r ztml/requirements.txt 34 | ``` 35 | For running validations, you also need to have Chrome, Edge and Firefox installed. 36 | 37 | ### Usage 38 | A standard simplified pipeline can be run by calling `ztml()`: 39 | ``` 40 | from ztml import ztml 41 | ztml.ztml('Input text that is much longer than this one!', 'output.html') 42 | ``` 43 | or running `ztml.py` from the command line (CLI): 44 | ``` 45 | python ztml/ztml.py input.txt output.html 46 | ``` 47 | See [ztml.py](ztml/ztml.py). 48 | Of course, there is also an accessible [Google Colab](https://colab.research.google.com/github/eyaler/ztml/blob/main/ZTML.ipynb) with a simple GUI. Shortcut: [bit.ly/ztml1](https://bit.ly/ztml). 49 | 50 | [crEnc](ztml/crenc.py) gives better compression but requires setting the HTML or JS charset to cp1252. 51 | [Base125](ztml/base125.py) is the second-best option if one must stick with utf8. 52 | 53 | See [example.py](example.py) for a complete example reproducing the ZTML results in the above benchmark, 54 | and [example_image.py](example_image.py) for an example of encoding inline images, by using `image=True` or passing a file with a supported image extension to the CLI. 55 | Outputs of these runs can be accessed at [eyalgruss.com/ztml](https://eyalgruss.com/ztml). 56 | On top of the built-in validations for Chrome, Edge and Firefox, these were also manually tested on macOS Monterey 12.5 Safari 15.6, macOS Ventura 13.2 Safari 16.3 and iOS 16.0, 16.2 Safari. 57 | 58 | A quick-and-dirty way to compress an existing single-page HTML website with embedded inline media is to use `raw=True` or pass a '.html' file to the CLI. 
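For example, a minimal sketch of doing this via the Python API (the file names here are placeholders; see [minibook.py](misc/minibook.py) for a real-world use):
```
from ztml import ztml

with open('page.html', 'rb') as f:  # an existing self-contained single-page site
    out, error = ztml.ztml(f.read(), 'page.min.html', raw=True, validate=True)
assert not error  # with validate=True, ztml() returns (bytes, error_flag)
print(f'{len(out):,} B')
```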
59 | 60 | ### What this is not 61 | 1. Not an HTML inliner 62 | 2. Not an image optimizer 63 | 3. Not a full-fledged JS minifier 64 | 65 | ### Caveats 66 | 1. Files larger than a few MB might not work on [iOS Safari](https://pqina.nl/blog/canvas-area-exceeds-the-maximum-limit) or [macOS Safari 15](https://bugs.webkit.org/show_bug.cgi?id=230855). 67 | 2. This solution favors compression ratio over compression and decompression times. Use `mtf=None` for faster decompression of large files. 68 | 3. For [compressing word lists](http://golf.horse) (sorted lexicographically), solutions such as [Roadroller](https://lifthrasiir.github.io/roadroller) do a much better job. 69 | 70 | ### Pipeline and source code breakdown 71 | | | Stage | Source | Remarks | 72 | |-----|--------------------------------------------|-------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 73 | | 0 | Pipeline and CLI | [ztml.py](ztml/ztml.py) | | 74 | | 1 | Text normalization (lossy) | [text_prep.py](ztml/text_prep.py) | Reduce whitespace; substitute unicode punctuation | 75 | | 2 | Text condensation (lossless) | [text_prep.py](ztml/text_prep.py) | Lowercase with automatic capitalization; substitute common strings such as: the, qu | 76 | | 3 | Burrows–Wheeler + Move-to-front transforms | [bwt_mtf.py](ztml/bwt_mtf.py) | Alphabet pre-sorting; Various MTF variants, including some original ones; Higher MTF settings beneficial for larger texts | 77 | | 4 | Huffman encoding | [huffman.py](ztml/huffman.py) | Canonical encoding with a [codebook-free decoder](https://researchgate.net/publication/3159499_On_the_implementation_of_minimum_redundancy_prefix_codes); Beneficial as a pre-DEFLATE stage | 78 | | 5 | Burrows–Wheeler transform on bits | [bwt_mtf.py](ztml/bwt_mtf.py) | Beneficial for large texts | 79 | | 6 | PNG / DEFLATE compression | [deflate.py](ztml/deflate.py) | ZIP-like compression with native browser decompression; aspect ratio optimized for maximal compatibility and minimal padding; [Zopfli](https://github.com/google/zopfli) or [ECT](https://github.com/fhanau/Efficient-Compression-Tool) optimizations | 80 | | 7 | Binary-to-text encoding | | Embed in template strings; Fix [HTML character overrides](https://html.spec.whatwg.org/multipage/parsing.html#table-charref-overrides); Allow [dynEncode](https://github.com/eshaz/simple-yenc#what-is-dynencode)-like optimal offset | 81 | | 7a | Base125 (utf8) | [base125.py](ztml/base125.py) | An original variant of [Base122](https://blog.kevinalbs.com/base122), with 14.7% overhead | 82 | | 7b | crEnc (cp1252) | [crenc.py](ztml/crenc.py) | An original variant of [yEnc](http://www.yenc.org) with 1.2% overhead; requires single-byte charset | 83 | | 8 | Uglification | [webify.py](ztml/webify.py) | Substitute recurring JS names with short aliases | 84 | | 9 | Validation | [validation.py](ztml/validation.py) | Reproduce input content on Chrome, Edge and Firefox | 85 | 86 | Note: image encoding only uses step 0 and steps 7 and later (see the sketch below). 87 | 88 | See source files for explanations, experiments and more references. 
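Since image encoding skips the text and compression stages, it amounts to binary-to-text encoding plus HTML wrapping of the raw image bytes. A minimal sketch (file names are placeholders; [example_image.py](example_image.py) is the full example):
```
from ztml import ztml

with open('picture.png', 'rb') as f:  # raw image bytes are embedded as-is
    ztml.ztml(f.read(), 'picture.html', image=True)
```
Equivalently, passing a file with a supported image extension to the CLI (e.g. `python ztml/ztml.py picture.png picture.html`) implies `image=True`.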
89 | 90 | ### Projects using this 91 | - [fragium](https://fragium.com) 92 | - [miniBook](https://xem.github.io/miniBook) submission by Eyal Gruss ([source code](misc/minibook.py)) 93 | - [WEBZOS](https://wbtz.github.io) 94 | -------------------------------------------------------------------------------- /ect/License.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /ztml/ztml.py: -------------------------------------------------------------------------------- 1 | """ZTML - Extreme inline text compression for HTML / JS""" 2 | 3 | 4 | import argparse 5 | from base64 import b64encode 6 | import chardet 7 | import os 8 | import sys 9 | from time import time 10 | from typing import AnyStr, Optional, overload, Tuple, Union 11 | 12 | try: 13 | from typing import Literal 14 | except ImportError: 15 | from typing_extensions import Literal 16 | 17 | if not __package__: 18 | import base125, bwt_mtf, crenc, default_vars, deflate, huffman, text_prep, validation, webify 19 | else: 20 | # noinspection PyPackages 21 | from . import base125, bwt_mtf, crenc, default_vars, deflate, huffman, text_prep, validation, webify 22 | 23 | 24 | bin2txt_encodings = ['base64', 'base125', 'crenc'] 25 | default_bin2txt = 'crenc' 26 | 27 | 28 | @overload 29 | def ztml(data: AnyStr, filename: str = ..., reduce_whitespace: bool = ..., 30 | unix_newline: bool = ..., fix_punct: bool = ..., 31 | remove_bom: bool = ..., caps: str = ..., bwtsort: bool = ..., 32 | mtf: Optional[int] = ..., bitdepth: int = ..., ect: bool = ..., 33 | bin2txt: str = ..., element_id: str = ..., raw: bool = ..., 34 | image: bool = ..., js: bool = ..., uglify: bool = ..., 35 | replace_quoted: bool = ..., lang: str = ..., mobile: bool = ..., 36 | title: str = ..., text_var: str = ..., validate: Literal[False] = ..., 37 | ignore_regex: str = ..., browser: validation.BrowserType = ..., 38 | timeout: int = ..., verbose: bool = ...) -> bytes: ... 39 | 40 | 41 | @overload 42 | def ztml(data: AnyStr, filename: str = ..., reduce_whitespace: bool = ..., 43 | unix_newline: bool = ..., fix_punct: bool = ..., ect: bool = ..., 44 | remove_bom: bool = ..., caps: str = ..., bwtsort: bool = ..., 45 | mtf: Optional[int] = ..., bitdepth: int = ..., bin2txt: str = ..., 46 | element_id: str = ..., raw: bool = ..., image: bool = ..., 47 | js: bool = ..., uglify: bool = ..., replace_quoted: bool = ..., 48 | lang: str = ..., mobile: bool = ..., title: str = ..., 49 | text_var: str = ..., validate: Literal[True] = ..., 50 | ignore_regex: str = ..., browser: validation.BrowserType = ..., 51 | timeout: int = ..., verbose: bool = ...) -> Tuple[bytes, int]: ... 52 | 53 | 54 | @overload 55 | def ztml(data: AnyStr, filename: str = ..., reduce_whitespace: bool = ..., 56 | unix_newline: bool = ..., fix_punct: bool = ..., ect: bool = ..., 57 | remove_bom: bool = ..., caps: str = ..., bwtsort: bool = ..., 58 | mtf: Optional[int] = ..., bitdepth: int = ..., bin2txt: str = ..., 59 | element_id: str = ..., raw: bool = ..., image: bool = ..., 60 | js: bool = ..., uglify: bool = ..., replace_quoted: bool = ..., 61 | lang: str = ..., mobile: bool = ..., title: str = ..., 62 | text_var: str = ..., validate: bool = ..., ignore_regex: str = ..., 63 | browser: validation.BrowserType = ..., timeout: int = ..., 64 | verbose: bool = ...) -> Union[bytes, Tuple[bytes, int]]: ... 
65 | 66 | 67 | def ztml(data, 68 | filename='', 69 | reduce_whitespace=False, 70 | unix_newline=True, 71 | fix_punct=False, 72 | remove_bom=True, 73 | caps=text_prep.default_caps, 74 | bwtsort=True, 75 | mtf=bwt_mtf.default_mtf, 76 | bitdepth=deflate.default_bitdepth, 77 | ect=False, 78 | bin2txt=default_bin2txt, 79 | element_id='', 80 | raw=False, 81 | image=False, 82 | js=False, 83 | uglify=True, 84 | replace_quoted=True, 85 | lang='', 86 | mobile=False, 87 | title='', 88 | text_var=default_vars.text, 89 | validate=False, 90 | ignore_regex='', 91 | browser=validation.default_browser, 92 | timeout=validation.default_timeout, 93 | verbose=False 94 | ): 95 | start_time = time() 96 | assert bin2txt in bin2txt_encodings, f'Error: bin2txt={bin2txt} not in {bin2txt_encodings}' 97 | assert not element_id and not image or not raw 98 | if image: 99 | assert isinstance(data, bytes) 100 | image_data = data 101 | else: 102 | if isinstance(data, bytes): 103 | data = data.decode() 104 | data = text_prep.normalize(data, reduce_whitespace, unix_newline, fix_punct, remove_bom) # Reduce whitespace 105 | condensed, string_decoder = text_prep.encode_and_get_js_decoder(data, caps, text_var=text_var) # Lower case and shorten common strings 106 | bwt_mtf_text, bwt_mtf_text_decoder = bwt_mtf.encode_and_get_js_decoder(condensed, bwtsort, mtf, add_bwt_func=False, data_var=text_var) # Burrows-Wheeler + Move-to-front transforms on text. MTF is a time-consuming op. 107 | huffman_bits, huffman_decoder = huffman.encode_and_get_js_decoder(bwt_mtf_text, text_var=text_var) # Huffman encode 108 | bits, bwt_bits_decoder = bwt_mtf.encode_and_get_js_decoder(huffman_bits) # Burrows-Wheeler transform on bits 109 | if raw: 110 | writer = f'document.close(document.write({text_var}))' # document.close() needed to ensure that any style changes added after a script are applied 111 | elif element_id: 112 | writer = f'''document.body.appendChild(document.createElement`pre`).id='{element_id}' 113 | {element_id}.textContent={text_var}''' 114 | else: 115 | writer = f"document.body.style.whiteSpace='pre';document.body.textContent={text_var}" 116 | bits_decoder = f'{bwt_bits_decoder}{huffman_decoder}{bwt_mtf_text_decoder}{string_decoder}{writer}' 117 | image_data = deflate.to_png(bits, bitdepth, ect=ect) # PNG encode. Time-consuming op. 118 | 119 | encoding = 'cp1252' if bin2txt == 'crenc' else 'utf8' 120 | if bin2txt == 'base64': # This is just for benchmarking and is not recommended 121 | image_url = b'data:;base64,' + b64encode(image_data) 122 | if not image: 123 | image_decoder = f"{default_vars.image}=new Image;{default_vars.image}.src='".encode() + image_url + b"'\n" 124 | out = image_decoder + deflate.get_js_image_data(len(bits), bits_decoder, bitdepth).encode() 125 | else: 126 | if bin2txt == 'base125': 127 | bytes_decoder = base125.get_js_decoder(image_data) # Time-consuming op. when offset==None 128 | else: 129 | bytes_decoder = crenc.get_js_decoder(image_data) # Time-consuming op. 
when offset==None 130 | if image: 131 | image_url = f"'+URL.createObjectURL(new Blob([{default_vars.bytearray}]))+'".encode() 132 | else: 133 | image_decoder = deflate.get_js_image_decoder(len(bits), bits_decoder, bitdepth) 134 | out = webify.safe_encode(image_decoder, encoding, get_back_unused=True) 135 | 136 | if image: 137 | if element_id: 138 | out = f"""document.body.appendChild(new Image).id='{element_id}' 139 | {element_id}.src='""".encode() + image_url + b"'" 140 | else: 141 | out = f"document.body.style.background='url(".encode() + image_url + b")no-repeat'" 142 | 143 | if bin2txt != 'base64': 144 | out = bytes_decoder + out 145 | if os.path.splitext(filename)[-1] == '.js': 146 | js = True 147 | if js and uglify: 148 | out = webify.uglify(out, replace_quoted=replace_quoted, encoding=encoding) 149 | elif not js: 150 | out = webify.html_wrap(out, aliases=webify.default_aliases * uglify, 151 | replace_quoted=replace_quoted, lang=lang, 152 | encoding=encoding, mobile=mobile, title=title) 153 | if filename: 154 | with open(filename, 'wb') as f: 155 | f.write(out) 156 | if verbose: 157 | print(f'Encoding took {time() - start_time :,.1f} sec.', file=sys.stderr) 158 | if validate: 159 | file = webify.html_wrap(out, aliases='', encoding=encoding) if js else filename or out 160 | by = element = '' 161 | if element_id: 162 | by = 'id' 163 | element = element_id 164 | valid = validation.validate_html(file, data, caps, by, element, raw, 165 | browser, timeout, 166 | content_var=text_var, 167 | ignore_regex=ignore_regex, 168 | verbose=True) 169 | out = out, not valid 170 | return out 171 | 172 | 173 | if __name__ == '__main__': 174 | parser = argparse.ArgumentParser() 175 | parser.add_argument('input_filename') 176 | parser.add_argument('output_filename', nargs='?', default='') 177 | parser.add_argument('--input_encoding', nargs='?', const='', default='', help='Auto detect by default') 178 | parser.add_argument('--reduce_whitespace', action='store_true') 179 | parser.add_argument('--skip_unix_newline', action='store_true') 180 | parser.add_argument('--fix_punct', action='store_true') 181 | parser.add_argument('--skip_remove_bom', action='store_true') 182 | parser.add_argument('--caps', type=str.lower, choices=text_prep.caps_modes, default=text_prep.default_caps) 183 | parser.add_argument('--skip_bwtsort', action='store_true') 184 | parser.add_argument('--mtf', type=lambda x: None if x.lower() == 'none' else int(x), choices=bwt_mtf.mtf_variants, 185 | default=bwt_mtf.default_mtf) 186 | parser.add_argument('--bitdepth', type=int, choices=deflate.allowed_bitdepths, default=deflate.default_bitdepth, help='Warning: 8-bit and 24-bit do not work on Safari') 187 | parser.add_argument('--ect', action='store_true') 188 | parser.add_argument('--bin2txt', type=str.lower, choices=bin2txt_encodings, default=default_bin2txt) 189 | parser.add_argument('--element_id', nargs='?', const='', default='', help='Warning: must be a valid JS variable name, and watch out for collisions with HTML namespace') 190 | parser.add_argument('--raw', action='store_true', help='Use document.write() to overwrite the document with the raw text. 
May also be implied from input_filename extension') 191 | parser.add_argument('--image', action='store_true', help='May also be implied from input_filename extension') 192 | parser.add_argument('--js', action='store_true', help='May also be implied from output_filename extension') 193 | parser.add_argument('--skip_uglify', action='store_true') 194 | parser.add_argument('--skip_replace_quoted', action='store_true') 195 | parser.add_argument('--lang', nargs='?', const='', default='') 196 | parser.add_argument('--mobile', action='store_true') 197 | parser.add_argument('--title', nargs='?', const='', default='') 198 | parser.add_argument('--text_var', default=default_vars.text) 199 | parser.add_argument('--validate', action='store_true') 200 | parser.add_argument('--ignore_regex', nargs='?', const='', default='') 201 | parser.add_argument('--browser', type=str.lower, choices=list(validation.drivers), default=validation.default_browser) 202 | parser.add_argument('--timeout', type=int, default=validation.default_timeout, help='seconds') 203 | parser.add_argument('--verbose', action='store_true') 204 | args = parser.parse_args(args=None if sys.argv[1:] else ['--help']) 205 | ext = os.path.splitext(args.input_filename)[-1][1:].lower() 206 | if ext in webify.raw_extensions: 207 | args.raw = True 208 | elif ext in webify.image_extensions: 209 | args.image = True 210 | with open(args.input_filename, 'rb') as f: 211 | data = f.read() 212 | if not args.image: 213 | if args.input_encoding: 214 | data = data.decode(args.input_encoding) 215 | else: 216 | encoding = chardet.detect(data)['encoding'] or 'utf8' 217 | try: 218 | data = data.decode(encoding) 219 | except UnicodeDecodeError: 220 | if encoding.replace('-', '') == 'utf8': 221 | raise 222 | out = ztml(data, args.output_filename, args.reduce_whitespace, 223 | not args.skip_unix_newline, args.fix_punct, 224 | not args.skip_remove_bom, args.caps, not args.skip_bwtsort, 225 | args.mtf, args.bitdepth, args.ect, args.bin2txt, 226 | args.element_id, args.raw, args.image, args.js, 227 | not args.skip_uglify, not args.skip_replace_quoted, args.lang, 228 | args.mobile, args.title, args.text_var, args.validate, 229 | args.ignore_regex, args.browser, args.timeout, args.verbose) 230 | result = False 231 | if args.validate: 232 | out, result = out 233 | if not args.output_filename: 234 | sys.stdout.buffer.write(out) 235 | sys.exit(int(result)) 236 | -------------------------------------------------------------------------------- /ztml/bwt_mtf.py: -------------------------------------------------------------------------------- 1 | """Burrows-Wheeler and Move-to-front transforms 2 | 3 | Applies pre-BWT alphabet vowel sorting by default to concentrate the vowels together. 4 | BWT Implementation follows pydivsufsort tests, to obviate adding an EOF token. 5 | MTF includes original variants (50-90) inspired by Fenwick's Sticky MTF, 6 | and larger texts show benefit from higher MTF settings. 7 | Additional BWT on bits (after entropy coding and before DEFLATE) was found beneficial for large texts. 8 | 9 | Other experiments: 10 | Run-length encoding for spaces before BWT gave worse overall results. 11 | Run-length encoding after text BWT, and MTF over run characters (just this part of Neimi&Teuhola) gave worse overall results. 12 | Run-length encoding for zeros (ZLE) after MTF gave worse overall results. 
13 | 14 | References: 15 | https://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf 16 | https://github.com/louisabraham/pydivsufsort/blob/master/tests/reference.py 17 | https://www.cs.auckland.ac.nz/~peter-f/FTPfiles/2002%20VL%20coding%20BWT.pdf (Fenwick) 18 | https://www.juergen-abel.info/files/preprints/preprint_post_bwt_stages.pdf 19 | https://www.juergen-abel.info/files/preprints/preprint_universal_text_preprocessing.pdf 20 | https://home.uncg.edu/cmp/faculty/srtate/papers/bwtsort.pdf 21 | https://www.math.uni-bielefeld.de/sfb343/preprints/pr99133.ps.gz 22 | https://onlinelibrary.wiley.com/doi/full/10.1002/spe.2873 (Neimi&Teuhola) 23 | http://groups.di.unipi.it/~gulli/tutorial/burrows_wheeler.pdf (note: has errors afaict) 24 | """ 25 | 26 | 27 | from typing import Iterable, List, Optional, overload, Tuple, Union 28 | 29 | import numpy as np 30 | from pydivsufsort import divsufsort 31 | 32 | if not __package__: 33 | import default_vars, webify 34 | else: 35 | # noinspection PyPackages 36 | from . import default_vars, webify 37 | 38 | 39 | order1 = 'AOUIEVWXYZaouievwxyz' 40 | order2 = 'VWXYZAOUIEvwxyzaouie' 41 | mtf_variants = [None, 0, 1, 2, 50, 52, 60, 70, 80, 90] 42 | default_mtf = 0 43 | 44 | 45 | bwtsort_table = str.maketrans(order1, order2) 46 | reverse_bwtsort_table = str.maketrans(order2, order1) 47 | surrogate_lo = 55296 48 | surrogate_hi = 57343 49 | max_unicode = 1114111 50 | max_ord_for_mtf = max_unicode - (surrogate_hi-surrogate_lo) - 1 51 | 52 | 53 | def mtf_rank(mtf: int, rank: int, prev: int) -> int: 54 | assert mtf is not None 55 | assert mtf in mtf_variants, f'Error: mtf={mtf} not in {mtf_variants}' 56 | if mtf == 0: 57 | new_rank = 0 58 | elif mtf == 1: 59 | new_rank = rank > 1 60 | elif mtf == 2: 61 | new_rank = rank > 1 or rank == 1 and not prev 62 | elif mtf == 50: 63 | new_rank = rank // 2 64 | elif mtf == 52: 65 | new_rank = rank // 2 if rank > 1 else rank == 1 and not prev 66 | else: 67 | new_rank = int(rank*(mtf/100) + 0.5) # Round in the same way as JS (do not round half to even) 68 | return new_rank 69 | 70 | 71 | def mtf_encode(data: Iterable[int], 72 | mtf: int == default_mtf, 73 | validate=True 74 | ) -> List[int]: 75 | data = list(data) 76 | max_data = max(data, default=-1) 77 | assert max_data <= max_ord_for_mtf, (max_data, max_ord_for_mtf) 78 | ranks = list(range(max_data + 1)) 79 | out = [] 80 | prev = 1 81 | for i in data: 82 | rank = ranks.index(i) # Time-consuming op. 83 | ranks.pop(rank) 84 | ranks.insert(mtf_rank(mtf, rank, prev), i) 85 | prev = rank 86 | if rank >= surrogate_lo: 87 | rank += surrogate_hi - surrogate_lo + 1 88 | out.append(rank) 89 | if validate: 90 | decoded = mtf_decode(out, mtf) 91 | if not hasattr(data, '__getitem__'): 92 | data = type(decoded)(data) 93 | assert decoded == data, (len(decoded), len(data), decoded[:30], data[:30]) 94 | return out 95 | 96 | 97 | def mtf_decode(data: Iterable[int], mtf: int == default_mtf) -> List[int]: 98 | out = list(data) 99 | ranks = list(range(max(out, default=-1) + 1)) 100 | prev = 1 101 | for i, rank in enumerate(out): 102 | if rank > surrogate_lo: 103 | rank -= surrogate_hi - surrogate_lo + 1 104 | out[i] = ranks.pop(rank) 105 | ranks.insert(mtf_rank(mtf, rank, prev), out[i]) 106 | prev = rank 107 | return out 108 | 109 | 110 | @overload 111 | def encode(data: str, bwtsort: bool = ..., mtf: Optional[int] = ..., 112 | validate: bool = ...) -> Tuple[str, int]: ... 
113 | 114 | 115 | @overload 116 | def encode(data: Iterable[int], bwtsort: bool = ..., mtf: Optional[int] = ..., 117 | validate: bool = ...) -> Tuple[List[int], int]: ... 118 | 119 | 120 | def encode(data, bwtsort=True, mtf=default_mtf, validate=True): 121 | is_str = isinstance(data, str) 122 | if not is_str: 123 | data = list(data) 124 | out = list(data) 125 | if bwtsort: 126 | if not is_str: 127 | out = [chr(i) for i in out] 128 | out = ''.join(out).translate(bwtsort_table) 129 | if is_str or bwtsort: 130 | out = [ord(c) for c in out] 131 | sa = divsufsort(np.array(out)) if out else [] 132 | out = out[-1:] + [out[i - 1] for i in sa if i] 133 | index = list(sa).index(0) if out else 0 134 | if mtf is not None: 135 | out = mtf_encode(out, mtf, validate) # Time-consuming op. 136 | if is_str: 137 | out = ''.join(chr(i) for i in out) 138 | if validate: 139 | decoded = decode(out, index, bwtsort, mtf) 140 | if not hasattr(data, '__getitem__'): 141 | data = type(decoded)(data) 142 | assert decoded == data, (len(decoded), len(data), decoded[:30], data[:30]) 143 | return out, index 144 | 145 | 146 | @overload 147 | def decode(data: str, index: int, bwtsort: bool = ..., 148 | mtf: Optional[int] = ...) -> str: ... 149 | 150 | 151 | @overload 152 | def decode(data: Iterable[int], index: int, bwtsort: bool = ..., 153 | mtf: Optional[int] = ...) -> List[int]: ... 154 | 155 | 156 | def decode(data, index, bwtsort=True, mtf=default_mtf): 157 | is_str = isinstance(data, str) 158 | out = list(data) 159 | if mtf is not None: 160 | if is_str: 161 | out = [ord(c) for c in out] 162 | out = mtf_decode(out, mtf) 163 | if is_str: 164 | out = [chr(i) for i in out] 165 | ordered = [(c, i - (i <= index)) for i, c in enumerate(out)] 166 | ordered.sort() 167 | for i in range(len(out)): 168 | out[i], index = ordered[index] 169 | if bwtsort: 170 | if not is_str: 171 | out = [chr(i) for i in out] 172 | out = ''.join(out).translate(reverse_bwtsort_table) 173 | if not is_str: 174 | out = [ord(c) for c in out] 175 | elif is_str: 176 | out = ''.join(out) 177 | return out 178 | 179 | 180 | def get_js_decoder(data: Union[str, Iterable[int]], 181 | index: int, 182 | bwtsort: bool = True, 183 | mtf: Optional[int] = default_mtf, 184 | add_bwt_func: bool = True, 185 | bwt_func_var: str = default_vars.bwt_func, 186 | data_var: str = '' 187 | ) -> str: 188 | assert mtf in mtf_variants, f'Error: mtf={mtf} not in {mtf_variants}' 189 | is_str = isinstance(data, str) 190 | if not is_str: 191 | data = list(data) 192 | if not data_var: 193 | data_var = default_vars.text if is_str else default_vars.bitarray 194 | js_decoder = f'{data_var}=[...{data_var}].map(c=>c.codePointAt())\n' * is_str 195 | if mtf is not None: 196 | if mtf == 0: 197 | mtf_op = f'd.unshift({data_var}[j++]=d.splice(k,1)[0])' 198 | elif mtf == 1: 199 | mtf_op = f'd.splice(k>1,0,{data_var}[j++]=d.splice(k,1)[0])' 200 | elif mtf == 2: 201 | js_decoder += 'n=1\n' 202 | mtf_op = f'd.splice(k>!!n,0,{data_var}[j++]=d.splice(k,1)[0]),n=k' 203 | elif mtf == 50: 204 | mtf_op = f'd.splice(k/2,0,{data_var}[j++]=d.splice(k,1)[0])' 205 | elif mtf == 52: 206 | js_decoder += 'n=1\n' 207 | mtf_op = f'd.splice(k>1?k/2:k>n,0,{data_var}[j++]=d.splice(k,1)[0]),n=k' 208 | else: 209 | mtf_op = f"d.splice(k*{str(mtf / 100).lstrip('0')}+.5,0,{data_var}[j++]=d.splice(k,1)[0])" 210 | if is_str and any(ord(c) > surrogate_lo for c in data): 211 | mtf_op = f'k-={surrogate_hi - surrogate_lo + 1}*(k>{surrogate_lo}),{mtf_op}' 212 | # Use reduce instead of Math.max(...array) due to argument limit: 
https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Function/apply#using_apply_and_built-in_functions 213 | js_decoder += f'''d=[...Array({data_var}.reduce((a,b)=>a>b?a:b+1,0)).keys()] 214 | j=0 215 | for(k of {data_var}){mtf_op} 216 | ''' 217 | if add_bwt_func: 218 | js_decoder += f"{bwt_func_var}=(d,k)=>{{s=d.map((c,i)=>[c,i-(i<=k)]).sort((a,b)=>a[0]-b[0]);for(j in s)[d[j],k]=s[k]}}\n" # Sort on code points to respect order of char above \uffff 219 | js_decoder += f'{bwt_func_var}({data_var},{index})\n' 220 | dyn_orders = None 221 | if bwtsort: 222 | symbols = set(data) 223 | if not is_str: 224 | symbols = {chr(i) for i in symbols} 225 | dyn_orders = list(zip(*[(c1, c2) for c1, c2 in zip(order1, order2) if c1 in symbols])) 226 | if dyn_orders: 227 | dyn_order1, dyn_order2 = dyn_orders 228 | dyn_order1 = webify.escape(''.join(dyn_order1)) 229 | dyn_order2 = webify.escape(''.join(dyn_order2)) 230 | js_decoder += f'''d={{}};[...`{dyn_order2}`].map((c,i)=>d[c]=[...`{dyn_order1}`][i]) 231 | {data_var}={data_var}.map(i=>{'d[c=String.fromCodePoint(i)]||c).join``' if is_str else '(d[c=String.fromCodePoint(i)]||c).codePointAt())'} 232 | ''' 233 | if is_str and not dyn_orders: 234 | # Don't use String.fromCodePoint(...array) due to argument limit: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Function/apply#using_apply_and_built-in_functions 235 | js_decoder += f'{data_var}={data_var}.map(i=>String.fromCodePoint(i)).join``\n' 236 | return js_decoder 237 | 238 | 239 | @overload 240 | def encode_and_get_js_decoder(data: str, 241 | bwtsort: bool = ..., 242 | mtf: Optional[int] = ..., 243 | add_bwt_func: bool = ..., 244 | bwt_func_var: str = ..., 245 | data_var: str = ..., 246 | validate: bool = ... 247 | ) -> Tuple[str, str]: ... 248 | 249 | 250 | @overload 251 | def encode_and_get_js_decoder(data: Iterable[int], 252 | bwtsort: bool = ..., 253 | mtf: Optional[int] = ..., 254 | add_bwt_func: bool = ..., 255 | bwt_func_var: str = ..., 256 | data_var: str = ..., 257 | validate: bool = ... 258 | ) -> Tuple[List[int], str]: ... 
259 | 260 | 261 | def encode_and_get_js_decoder(data, 262 | bwtsort=True, 263 | mtf=default_mtf, 264 | add_bwt_func=True, 265 | bwt_func_var=default_vars.bwt_func, 266 | data_var='', 267 | validate=True 268 | ): 269 | is_str = isinstance(data, str) 270 | if not is_str: 271 | data = list(data) 272 | if not data_var: 273 | data_var = default_vars.text if is_str else default_vars.bitarray 274 | if data_var == default_vars.bitarray: 275 | bwtsort = False 276 | mtf = None 277 | encoded, index = encode(data, bwtsort, mtf, validate) 278 | return encoded, get_js_decoder(data, index, bwtsort, mtf, add_bwt_func, bwt_func_var, data_var) 279 | 280 | 281 | def test() -> None: 282 | mtf_test = [3, 2, 2, 2, 3, 2, 2, 3, 2, 2] 283 | mtf0 = mtf_encode(mtf_test[:], mtf=0, validate=True) 284 | assert mtf0 == [3, 3, 0, 0, 1, 1, 0, 1, 1, 0], mtf0 285 | mtf1 = mtf_encode(mtf_test[:], mtf=1, validate=True) 286 | assert mtf1 == [3, 3, 1, 0, 2, 0, 0, 1, 1, 0], mtf1 287 | mtf2 = mtf_encode(mtf_test[:], mtf=2, validate=True) 288 | assert mtf2 == [3, 3, 1, 0, 2, 0, 0, 1, 0, 0], mtf2 289 | 290 | symbols = ['', '\0', '\1', 'a', 'b', 'א', 'ב', '\ue000', '\uffff', '\U00010000'] 291 | for x in symbols: 292 | for y in symbols: 293 | for z in symbols: 294 | for mtf in mtf_variants: 295 | for bwtsort in [False, True]: 296 | encode(f'{x}{y}{z}', bwtsort=bwtsort, mtf=mtf, validate=True) 297 | 298 | symbols = ['', '0', '1', '97', '255'] 299 | for x in symbols: 300 | for y in symbols: 301 | for z in symbols: 302 | for mtf in mtf_variants: 303 | for bwtsort in [False, True]: 304 | encode([int(c) for c in f'{x}{y}{z}'], bwtsort=bwtsort, mtf=mtf, validate=True) 305 | 306 | 307 | if __name__ == '__main__': 308 | test() 309 | -------------------------------------------------------------------------------- /ztml/validation.py: -------------------------------------------------------------------------------- 1 | from base64 import b64decode 2 | from contextlib import ExitStack, redirect_stdout 3 | import os 4 | import sys 5 | from tempfile import NamedTemporaryFile 6 | from time import sleep, time 7 | from typing import AnyStr, Iterable, Mapping, Optional, overload, TypeVar, Union 8 | 9 | try: 10 | from typing import Literal 11 | except ImportError: 12 | from typing_extensions import Literal 13 | 14 | import regex 15 | from selenium.common.exceptions import JavascriptException, TimeoutException, WebDriverException 16 | from selenium.webdriver import Chrome, Edge, Firefox, chrome, edge, firefox 17 | from selenium.webdriver.common.by import By 18 | from selenium.webdriver.remote.webdriver import WebDriver 19 | from selenium.webdriver.support.ui import WebDriverWait 20 | from webdriver_manager.chrome import ChromeDriverManager 21 | from webdriver_manager.microsoft import EdgeChromiumDriverManager 22 | from webdriver_manager.firefox import GeckoDriverManager 23 | 24 | if not __package__: 25 | import default_vars, text_prep, webify 26 | else: 27 | # noinspection PyPackages 28 | from . 
import default_vars, text_prep, webify 29 | 30 | 31 | default_browser = 'chrome' 32 | default_timeout = 60 33 | default_by = By.TAG_NAME 34 | default_element = 'body' 35 | webdriver_paths_filename = 'webdriver_paths.txt' 36 | 37 | 38 | os.environ['WDM_LOG'] = '0' 39 | drivers = dict(chrome=[Chrome, chrome, ChromeDriverManager], 40 | edge=[Edge, edge, EdgeChromiumDriverManager], 41 | firefox=[Firefox, firefox, GeckoDriverManager] 42 | ) 43 | BrowserType = Union[str, WebDriver] 44 | critical_error_strings = ['executable needs to be', 'unable to find binary', 'unexpectedly'] 45 | 46 | 47 | FilenameOrBytes = TypeVar('FilenameOrBytes', str, bytes) 48 | 49 | 50 | def full_path(filename: str) -> str: 51 | return f"file:///{os.path.realpath(filename).replace(os.sep, '/')}" 52 | 53 | 54 | def get_browser(browser: BrowserType, 55 | stack: Optional[ExitStack] = None 56 | ) -> WebDriver: 57 | if isinstance(browser, WebDriver): 58 | return browser 59 | options = drivers[browser][1].options.Options() 60 | options.headless = True 61 | options.add_argument('--no-sandbox') 62 | if hasattr(options, 'add_experimental_option'): 63 | options.add_experimental_option('excludeSwitches', ['enable-logging']) 64 | try: 65 | with redirect_stdout(None): 66 | service = drivers[browser][2]().install() 67 | folder = os.path.dirname(webdriver_paths_filename) 68 | if folder: 69 | os.makedirs(folder, exist_ok=True) 70 | with open(webdriver_paths_filename, 'a', encoding='utf8') as f: 71 | f.write(f'{browser},{service}\n') 72 | except Exception: 73 | with open(webdriver_paths_filename, encoding='utf8') as f: 74 | for line in reversed(f.read().splitlines()): 75 | b, service = line.split(',', 1) 76 | if b == browser: 77 | break 78 | while isinstance(browser, str): 79 | try: 80 | browser = drivers[browser][0](service=drivers[browser][1].service.Service(service, log_path=os.devnull), options=options) 81 | except WebDriverException as e: 82 | if any(s in e.msg for s in critical_error_strings): 83 | raise 84 | print(e, file=sys.stderr) 85 | sleep(30) 86 | if stack: 87 | browser = stack.enter_context(browser) 88 | return browser 89 | 90 | 91 | @overload 92 | def render_html(file: FilenameOrBytes, by: str = ..., element: str = ..., 93 | raw: bool = ..., image: Literal[True] = ..., 94 | browser: str = ..., timeout: int = ..., content_var: str = ... 95 | ) -> Optional[bytes]: ... 96 | 97 | 98 | @overload 99 | def render_html(file: FilenameOrBytes, by: str = ..., element: str = ..., 100 | raw: bool = ..., image: Literal[False] = ..., 101 | browser: str = ..., timeout: int = ..., content_var: str = ... 102 | ) -> Optional[str]: ... 103 | 104 | 105 | @overload 106 | def render_html(file: FilenameOrBytes, by: str = ..., element: str = ..., 107 | raw: bool = ..., image: bool = ..., 108 | browser: str = ..., timeout: int = ..., content_var: str = ... 109 | ) -> Optional[AnyStr]: ... 
110 | 111 | 112 | def render_html(file, 113 | by=default_by, 114 | element=default_element, 115 | raw=False, 116 | image=False, 117 | browser=default_browser, 118 | timeout=default_timeout, 119 | content_var='' 120 | ): 121 | assert not raw or not image 122 | if not by: 123 | by = default_by 124 | if not element: 125 | element = default_element 126 | with ExitStack() as stack: 127 | browser = get_browser(browser, stack) 128 | if isinstance(file, str): 129 | filename = file 130 | else: 131 | with NamedTemporaryFile(suffix='.html', delete=False) as f: # See https://github.com/python/cpython/issues/88221 132 | f.write(file) 133 | filename = f.name 134 | browser.get(full_path(filename)) 135 | if isinstance(file, bytes): 136 | try: 137 | os.remove(filename) 138 | except PermissionError: 139 | pass 140 | try: 141 | wait = WebDriverWait(browser, timeout) 142 | if image: 143 | if by == By.TAG_NAME and element == 'body': 144 | data_url = wait.until(lambda x: 145 | regex.sub('^none$', '', 146 | x.find_element(by, element) 147 | .value_of_css_property('background-image'))) 148 | else: 149 | data_url = wait.until(lambda x: 150 | x.find_element(by, element) 151 | .get_property('src')) 152 | assert isinstance(data_url, str), type(data_url) 153 | if ';base64,' in data_url: 154 | return b64decode(data_url.split(';base64,', 1)[1].split('"', 1)[0], validate=True) 155 | image_data = browser.execute_script(f'return {content_var or default_vars.bytearray}') 156 | if isinstance(image_data, dict): # Needed for or Firefox, see: https://github.com/SeleniumHQ/selenium/issues/11070 157 | image_data = [v for k, v in sorted(image_data.items(), key=lambda x: int(x[0]))] 158 | return bytes(image_data) 159 | if raw: 160 | sleep(0.1) 161 | get_text = lambda x: x.execute_script(f'return {content_var or default_vars.text}') 162 | else: 163 | get_text = lambda x: x.find_element(by, element).get_property('innerText') 164 | try: 165 | text = wait.until(get_text) 166 | except JavascriptException: 167 | sleep(1) 168 | text = wait.until(get_text) 169 | assert isinstance(text, str), type(text) 170 | return text 171 | except TimeoutException: 172 | return None 173 | except Exception: 174 | print(f'\nError: {browser.name} failed on {full_path(filename)}', file=sys.stderr) 175 | raise 176 | 177 | 178 | def find_first_diff(rendered: AnyStr, data: AnyStr, verbose: bool = True) -> int: 179 | i = -1 180 | for i, (r, t) in enumerate(zip(rendered, data)): 181 | if r != t: 182 | break 183 | else: 184 | i += 1 185 | if verbose: 186 | print(f'\nFirst difference found at {i} / {len(rendered)}', file=sys.stderr) 187 | print(f'Original: {data[max(i - 30, 0) : i]!r} -> {data[i : i + 50]!r}', file=sys.stderr) 188 | print(f'Rendered: {rendered[max(i - 30, 0) : i]!r} -> {rendered[i : i + 50]!r}\n', file=sys.stderr) 189 | return i 190 | 191 | 192 | def validate_html(file: FilenameOrBytes, # Don't use AnyStr as it does not have to be the same type as data 193 | data: AnyStr, 194 | caps: str = text_prep.default_caps, 195 | by: str = default_by, 196 | element: str = default_element, 197 | raw: bool = False, 198 | browser: BrowserType = default_browser, 199 | timeout: int = default_timeout, 200 | unicode_A: int = 0, 201 | ignore_regex: str = '', 202 | content_var: str = '', 203 | verbose: bool = True 204 | ) -> Optional[bool]: 205 | image = isinstance(data, bytes) 206 | assert data, 'Error: Cannot validate against empty data' 207 | rendered = render_html(file, by, element, raw, image, browser, timeout, content_var) 208 | if rendered is None: 209 | return 
210 |     if not image:
211 |         if caps == 'lower':
212 |             data = data.lower()
213 |         elif caps == 'upper':
214 |             data = data.upper()
215 |         elif caps == 'simple':
216 |             data = text_prep.decode_caps_simple(data.lower())
217 |         if not raw:
218 |             if unicode_A:
219 |                 rendered = regex.sub(r'[^\p{Z}\p{C}]', lambda m: chr(ord(m[0]) - unicode_A + 65 + (6 if ord(m[0]) - unicode_A + 65 > 90 else 0)), rendered)
220 |             rendered = regex.sub(ignore_regex, '', rendered)
221 |     if rendered == data:
222 |         return True
223 |     if verbose:
224 |         find_first_diff(rendered, data)
225 |     return False
226 | 
227 | 
228 | def validate_files(filenames: Mapping[str, str],
229 |                    data: Optional[AnyStr] = None,
230 |                    reduce_whitespace: bool = False,
231 |                    unix_newline: bool = True,
232 |                    fix_punct: bool = False,
233 |                    remove_bom: bool = True,
234 |                    caps: str = text_prep.default_caps,
235 |                    by: str = default_by,
236 |                    element: str = default_element,
237 |                    raw: bool = False,
238 |                    image: bool = False,
239 |                    browsers: Optional[Union[BrowserType, Iterable[BrowserType]]] = None,
240 |                    timeout: int = default_timeout,
241 |                    unicode_A: int = 0,
242 |                    ignore_regex: str = '',
243 |                    content_var: str = '',
244 |                    validate: bool = True,
245 |                    verbose: bool = True
246 |                    ) -> bool:
247 |     error = False
248 |     if browsers is None:
249 |         browsers = list(drivers)
250 |     elif isinstance(browsers, (str, WebDriver)):
251 |         browsers = [browsers]
252 |     with ExitStack() as stack:
253 |         if validate:
254 |             browsers = [get_browser(browser, stack) for browser in browsers]
255 |         raw_size = None
256 |         no_overhead_size = None
257 |         for label, filename in sorted(filenames.items(), key=lambda x: (x[0] != 'raw', x[0] != 'base64_html')):
258 |             ext = os.path.splitext(filename)[-1][1:].lower()
259 |             if raw_size is not None and ext != 'html':
260 |                 continue
261 |             if data is None or label == 'raw':
262 |                 if ext in webify.raw_extensions:
263 |                     raw = True
264 |                 elif ext in webify.image_extensions:
265 |                     image = True
266 |             assert not image or (not raw and not isinstance(data, str))
267 |             if data is None:
268 |                 with open(filename, 'rb') as f:
269 |                     data = f.read()
270 |             if raw_size is None:
271 |                 raw_size = len(data.encode() if isinstance(data, str) else data)
272 |             if not image and isinstance(data, bytes):
273 |                 data = text_prep.normalize(data.decode(), reduce_whitespace, unix_newline, fix_punct, remove_bom)  # Assumes raw text file is utf8. Otherwise, pass it as a data argument
274 | 
275 |             if verbose:
276 |                 size = os.path.getsize(filename)
277 |                 if label == 'base64_html':
278 |                     no_overhead_size = size * 3 / 4
279 |                 stats = []
280 |                 if raw_size:
281 |                     stats.append(f'ratio={round(size / raw_size * 100, 1)}%')
282 |                 if no_overhead_size:
283 |                     stats.append(f'overhead={round((size / no_overhead_size - 1) * 100, 1)}%')
284 |                 if ext == 'html' and label not in ['raw', 'base64_html']:
285 |                     with open(filename, 'rb') as f:
286 |                         html = f.read()
287 |                     matches = regex.findall(webify.literals_regex.encode(), html)
288 |                     payload = max(matches, key=len, default=b'').split(b'`', 1)[1].rsplit(b'`', 1)[0]  # The longest backtick literal is the embedded payload
289 |                     html = html.replace(payload, b'')  # Strip it to report the size of the code alone
290 |                     stats.append(f'code: {len(html):,} B = {round(len(html) / 1024, 1):,} kB')
291 |                 stats = ' '.join(stats)
292 |                 if stats:
293 |                     stats = f' ({stats})'
294 |                 mb = size / 1024 ** 2
295 |                 if mb >= 0.1:
296 |                     stats = f' = {round(mb, 1):,} MB{stats}'
297 |                 kb = size / 1024
298 |                 if kb >= 0.1:
299 |                     stats = f' = {round(kb, 1):,} kB{stats}'
300 |                 print(f"{full_path(filename)} {size:,} B{stats}", end='' if validate and ext == 'html' and label != 'raw' else None, file=sys.stderr)
301 | 
302 |             if validate and ext == 'html' and label != 'raw':
303 |                 for i, browser in enumerate(browsers):
304 |                     start_time = time()
305 |                     valid = validate_html(filename, data, caps, by, element,
306 |                                           raw, browser, timeout, unicode_A,
307 |                                           ignore_regex, content_var, verbose)
308 |                     assert valid is not False, filename
309 |                     if not valid:
310 |                         error = True
311 |                     if verbose:
312 |                         if i == 0:
313 |                             print(' rendering secs:', end='', file=sys.stderr)
314 |                         print(f' {browser.name}=' + (f'{time() - start_time :.1f}' if valid else f'{timeout}(TIMEOUT)'), end='', file=sys.stderr)
315 |                 if verbose:
316 |                     print(file=sys.stderr)
317 |     if verbose and validate:
318 |         print('Note: above rendering times from Selenium are much longer than actual browser rendering times.', file=sys.stderr)
319 |     return error
320 | 
--------------------------------------------------------------------------------
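Usage note (editorial addition, not a repository file): a minimal sketch of driving the validator above directly. It assumes a Selenium-supported browser and driver are installed, that the script runs from a sibling folder so the package root is one level up (mirroring the repo's example scripts), and that book.txt / book.html are hypothetical input and output files.

import sys

sys.path.append('..')
from ztml import validation

with open('book.txt', encoding='utf8') as f:  # hypothetical source text
    expected = f.read()

# validate_html() renders book.html in a browser via Selenium and compares the
# recovered text with `expected`; it returns True on a match, False on a mismatch
# (printing the first difference to stderr), or None if rendering timed out.
result = validation.validate_html('book.html', expected)  # hypothetical ZTML output file
print({True: 'match', False: 'mismatch', None: 'timeout'}[result])

For checking a whole set of output files across all installed browsers, validate_files() wraps the same logic and also prints per-file size statistics.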