├── ztml
├── __init__.py
├── default_vars.py
├── crenc.py
├── huffman.py
├── base125.py
├── tests.py
├── webify.py
├── text_prep.py
├── deflate.py
├── ztml.py
├── bwt_mtf.py
└── validation.py
├── .github
└── FUNDING.yml
├── ect
├── ect
├── ect.exe
├── ect-ubuntu
├── Build_ECT_Ububtu.ipynb
└── License.txt
├── misc
├── reversim2022_slides.pdf
├── minibook.py
├── run_all.bat
├── .htaccess
├── size_checker.py
└── example_html.py
├── requirements.txt
├── LICENSE
├── example.py
├── example_image.py
├── TODO.md
├── ZTML.ipynb
└── README.md
/ztml/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: eyaler
2 |
--------------------------------------------------------------------------------
/ect/ect:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eyaler/ztml/HEAD/ect/ect
--------------------------------------------------------------------------------
/ect/ect.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eyaler/ztml/HEAD/ect/ect.exe
--------------------------------------------------------------------------------
/ect/ect-ubuntu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eyaler/ztml/HEAD/ect/ect-ubuntu
--------------------------------------------------------------------------------
/misc/reversim2022_slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eyaler/ztml/HEAD/misc/reversim2022_slides.pdf
--------------------------------------------------------------------------------
/ztml/default_vars.py:
--------------------------------------------------------------------------------
# Single-letter JS variable names shared by the decoder-generation modules
# (crenc.py, base125.py, huffman.py, ...) so every generated snippet agrees
# on which JS global holds which datum. Names intentionally shadow Python
# builtins here; this module is only read for its string values.
bitarray = 'b'   # JS var: decoded bit array
bwt_func = 'f'   # JS var: BWT-related function (presumably inverse BWT -- confirm in bwt_mtf.py)
bytearray = 'o'  # JS var: decoded byte array
image = 'i'      # JS var: image data/element
text = 't'       # JS var: decoded text
6 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | bitarray
2 | chardet
3 | numpy
4 | pydivsufsort
5 | pypng
6 | regex
7 | selenium
8 | typing_extensions
9 | webdriver_manager
10 | zopflipy
11 | gutenbergpy
12 |
--------------------------------------------------------------------------------
/misc/minibook.py:
--------------------------------------------------------------------------------
# https://xem.github.io/miniBook


import sys
from urllib.request import urlopen

sys.path.append('..')  # Allow importing the ztml package when run from misc/
from ztml import ztml


# Fetch xem's miniBook example page and re-encode it with ZTML in raw mode.
# `result` is expected to be falsy when validation succeeds (validate=True).
with urlopen('https://xem.github.io/miniBook/example') as f:
    out, result = ztml.ztml(f.read(), 'index.html', mtf=80, ect=True, raw=True, validate=True)
    print(f'{len(out):,} B')
    assert not result
15 |
--------------------------------------------------------------------------------
/misc/run_all.bat:
--------------------------------------------------------------------------------
1 | #!/bin/bash 2> nul
2 |
3 | :; trap "exit" INT TERM
4 | :; set -o errexit
5 | :; function goto() { return $?; }
6 |
7 | cd ..
8 |
9 | python example.py || goto :error
10 |
11 | python example_image.py || goto :error
12 |
13 | cd misc
14 |
15 | python example_html.py || goto :error
16 |
17 | python minibook.py || goto :error
18 |
19 | cd ../ztml
20 |
21 | python tests.py || goto :error
22 |
23 | :; exit 0
24 | exit /b 0
25 |
26 | :error
27 | exit /b %errorlevel%
28 |
--------------------------------------------------------------------------------
/misc/.htaccess:
--------------------------------------------------------------------------------
1 | # THIS IS FOR ONLINE TESTING OF OUTPUT FILES
2 |
3 | # SHOW FILES IN FOLDER
4 | Options +Indexes
5 | IndexOptions +FancyIndexing
6 |
7 | # DISABLE CACHING
8 |
9 | ExpiresActive Off
10 |
11 |
12 | FileETag None
13 | Header unset ETag
14 | Header unset Pragma
15 | Header unset Cache-Control
16 | Header unset Last-Modified
17 | Header set Pragma "no-cache"
18 | Header set Cache-Control "max-age=0, no-cache, no-store, must-revalidate"
19 | Header set Expires "Thu, 1 Jan 1970 00:00:00 GMT"
20 |
--------------------------------------------------------------------------------
/misc/size_checker.py:
--------------------------------------------------------------------------------
import os
import sys


def main(argv=None) -> int:
    """Check that no file in new_folder is larger than its old_folder counterpart.

    Both folders must contain exactly the same file names. Intended as a
    regression check that re-encoded outputs never grow.

    Args:
        argv: [old_folder, new_folder]; defaults to sys.argv[1:].

    Returns:
        0 on success, 1 on a mismatch or size regression, 2 on usage error.
    """
    if argv is None:
        argv = sys.argv[1:]
    # Explicit checks instead of bare asserts: asserts vanish under `python -O`
    if len(argv) != 2:
        print('Usage: size_checker.py old_folder new_folder', file=sys.stderr)
        return 2
    old_folder, new_folder = argv
    if new_folder == old_folder:
        print('Error: old and new folders must differ', file=sys.stderr)
        return 1
    old_files = sorted(os.listdir(old_folder))
    new_files = sorted(os.listdir(new_folder))
    print(f'Old: {old_folder} ({len(old_files)} files)')
    print(f'New: {new_folder} ({len(new_files)} files)')
    if new_files != old_files:
        print('Error: folder contents differ', file=sys.stderr)
        return 1
    for file in old_files:
        old_size = os.path.getsize(os.path.join(old_folder, file))
        new_size = os.path.getsize(os.path.join(new_folder, file))
        if new_size > old_size:
            print(f'{file} grew from {old_size:,} to {new_size:,}', file=sys.stderr)
            return 1
    print(f'All {len(old_files)} files are equal or smaller.')
    return 0


if __name__ == '__main__':
    sys.exit(main())
20 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The following license applies to all parts of this software except where a more restrictive license is stated.
2 |
3 | MIT License
4 |
5 | Copyright (c) 2022 Eyal Gruss (https://github.com/eyaler/ztml)
6 |
7 | Copyright (c) 2021-2022 Ethan Halsall (https://github.com/eshaz/simple-yenc)
8 |
9 | Copyright (c) 2016 Kevin Albertson (https://github.com/kevinAlbs/Base122)
10 |
11 | Permission is hereby granted, free of charge, to any person obtaining a copy
12 | of this software and associated documentation files (the "Software"), to deal
13 | in the Software without restriction, including without limitation the rights
14 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15 | copies of the Software, and to permit persons to whom the Software is
16 | furnished to do so, subject to the following conditions:
17 |
18 | The above copyright notice and this permission notice shall be included in all
19 | copies or substantial portions of the Software.
20 |
21 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27 | SOFTWARE.
28 |
--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
import os
import sys
from time import time

start_time = time()

from ztml import validation, ztml


# Project Gutenberg book ids with per-book encoder settings (parallel lists)
books = [30123, 2600]
book_mtf = [0, 80]        # MTF variant per book (see bwt_mtf.py)
book_ect = [False, True]  # Whether to post-compress with ECT per book
output_folder = 'output'
skip_download_exists = True  # Reuse a previously downloaded raw file if present
element_id = ''  # Target element id; '' also disables id-based validation below


assert len(books) == len(book_mtf) == len(book_ect)
error = False
for item, mtf, ect in zip(books, book_mtf, book_ect):
    item_start_time = time()
    # One output file per binary-to-text encoding (js variants kept disabled)
    filenames = dict(raw=f'{item}.txt',
                     # base64_js=f'{item}_64.js',
                     base64_html=f'{item}_64.html',
                     # base125_js=f'{item}_125.js',
                     base125_html=f'{item}_125.html',
                     # crenc_js=f'{item}_cr.js',
                     crenc_html=f'{item}_cr.html')
    os.makedirs(output_folder, exist_ok=True)
    filenames = {k: os.path.join(output_folder, v) for k, v in filenames.items()}

    # If missing, download an example file from the web
    if not skip_download_exists or not os.path.exists(filenames['raw']):
        from gutenbergpy.textget import get_text_by_id
        with open(filenames['raw'], 'wb') as f:
            f.write(get_text_by_id(item))

    with open(filenames['raw'], 'rb') as f:
        data = f.read()

    cnt = 0
    for label, filename in filenames.items():
        if label == 'raw':
            continue
        # Label prefix before the last '_' selects the bin2txt encoding
        file = ztml.ztml(data, filename, mtf=mtf, ect=ect, bin2txt=label.rsplit('_', 1)[0], element_id=element_id)
        cnt += 1

    print(f'{cnt} encodings of {item} took {(time()-item_start_time) / 60 :.1f} min.')

    # Compare file sizes and validate data is recovered
    error |= validation.validate_files(filenames, by='id' * bool(element_id), element=element_id)
    print()

if error:
    print('Error: some renderings timed out')
else:
    print(f'Total of {len(books)} books took {(time()-start_time) / 60 :.1f} min.')
sys.exit(int(error))
59 |
--------------------------------------------------------------------------------
/misc/example_html.py:
--------------------------------------------------------------------------------
# This is just for testing that ZTML can work on its own HTML outputs in raw mode


import os
import sys
from time import time

start_time = time()

sys.path.append('..')  # Allow importing the ztml package when run from misc/
from ztml import validation, ztml


# Previously generated ZTML outputs (from example.py / example_image.py)
raw_files = ['30123_64.html',
             '30123_125.html',
             '30123_cr.html',
             'test_pattern.jpg_64.html',
             'test_pattern.jpg_125.html',
             'test_pattern.jpg_cr.html'
             ]
output_folder = '../output'


error = False
for url in raw_files:
    item_start_time = time()
    item = url.replace(os.sep, '/').rsplit('/', 1)[-1]  # Bare filename
    # One output per binary-to-text encoding (js variants kept disabled)
    filenames = dict(raw=item,
                     # base64_js=f'{item}_64.js',
                     base64_html=f'{item}_64.html',
                     # base125_js=f'{item}_125.js',
                     base125_html=f'{item}_125.html',
                     # crenc_js=f'{item}_cr.js',
                     crenc_html=f'{item}_cr.html')
    os.makedirs(output_folder, exist_ok=True)
    filenames = {k: os.path.join(output_folder, v) for k, v in filenames.items()}

    with open(filenames['raw'], 'rb') as f:
        data = f.read()
    # crEnc outputs are single-byte cp1252 text, so feed them as str
    # (see /ztml/crenc.py module docstring)
    if os.path.splitext(item)[0].endswith('_cr'):
        data = data.decode('cp1252', 'backslashreplace')

    cnt = 0
    for label, filename in filenames.items():
        if label == 'raw':
            continue
        file = ztml.ztml(data, filename, bin2txt=label.rsplit('_', 1)[0], raw=True, text_var='z')
        cnt += 1

    print(f'{cnt} encodings of {item} took {(time()-item_start_time) / 60 :.1f} min.')

    # Compare file sizes and validate data is recovered
    error |= validation.validate_files(filenames, data, content_var='z')
    print()

if error:
    print('Error: some renderings timed out')
else:
    print(f'Total of {len(raw_files)} raw files took {(time()-start_time) / 60 :.1f} min.')
sys.exit(int(error))
61 |
--------------------------------------------------------------------------------
/ect/Build_ECT_Ububtu.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "private_outputs": true,
7 | "provenance": [],
8 | "authorship_tag": "ABX9TyMGts6SFpEriurAynOgIU2i",
9 | "include_colab_link": true
10 | },
11 | "kernelspec": {
12 | "name": "python3",
13 | "display_name": "Python 3"
14 | },
15 | "language_info": {
16 | "name": "python"
17 | }
18 | },
19 | "cells": [
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {
23 | "id": "view-in-github",
24 | "colab_type": "text"
25 | },
26 | "source": [
27 | "
"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "source": [
33 | "%cd /content\n",
34 | "!git clone --recursive https://github.com/fhanau/Efficient-Compression-Tool\n",
35 | "!latestTag=$(git describe --tags `git rev-list --tags --max-count=1`)\n",
36 | "!echo $latestTag\n",
37 | "!git checkout $latestTag\n",
38 | "!apt -y install nasm\n",
39 | "%cd Efficient-Compression-Tool\n",
40 | "!mkdir build\n",
41 | "%cd build\n",
42 | "!cmake ../src\n",
43 | "!make"
44 | ],
45 | "metadata": {
46 | "id": "jh1og550gUD-"
47 | },
48 | "execution_count": null,
49 | "outputs": []
50 | },
51 | {
52 | "cell_type": "code",
53 | "source": [
54 | "!./ect"
55 | ],
56 | "metadata": {
57 | "id": "hR9UQWk88qsx"
58 | },
59 | "execution_count": null,
60 | "outputs": []
61 | },
62 | {
63 | "cell_type": "code",
64 | "source": [
65 | "from google.colab import files\n",
66 | "!mv ect ect-ubuntu\n",
67 | "files.download('ect-ubuntu')"
68 | ],
69 | "metadata": {
70 | "id": "b7IETFkT7Y7Z"
71 | },
72 | "execution_count": null,
73 | "outputs": []
74 | }
75 | ]
76 | }
--------------------------------------------------------------------------------
/example_image.py:
--------------------------------------------------------------------------------
import os
import sys
from time import time
from urllib.request import urlopen

start_time = time()

from ztml import validation, ztml


# The same test pattern in several image formats, to exercise image mode
image_urls = ['http://wiesmann.codiferes.net/share/bitmaps/test_pattern.bmp',
              'http://wiesmann.codiferes.net/share/bitmaps/test_pattern.gif',
              'http://wiesmann.codiferes.net/share/bitmaps/test_pattern.jpg',
              'http://wiesmann.codiferes.net/share/bitmaps/test_pattern.png',
              'http://wiesmann.codiferes.net/share/bitmaps/test_pattern.webp'
              ]
output_folder = 'output'
skip_download_exists = True  # Reuse previously downloaded images if present
element_id = ''  # Target element id; '' also disables id-based validation below


error = False
for url in image_urls:
    item_start_time = time()
    item = url.rsplit('/', 1)[-1]  # Bare filename, used to derive output names
    # One output per binary-to-text encoding (js variants kept disabled)
    filenames = dict(raw=item,
                     # base64_js=f'{item}_64.js',
                     base64_html=f'{item}_64.html',
                     # base125_js=f'{item}_125.js',
                     base125_html=f'{item}_125.html',
                     # crenc_js=f'{item}_cr.js',
                     crenc_html=f'{item}_cr.html')
    os.makedirs(output_folder, exist_ok=True)
    filenames = {k: os.path.join(output_folder, v) for k, v in filenames.items()}

    # If missing, download an example file from the web
    if not skip_download_exists or not os.path.exists(filenames['raw']):
        with urlopen(url) as fin, open(filenames['raw'], 'wb') as fout:
            fout.write(fin.read())

    with open(filenames['raw'], 'rb') as f:
        data = f.read()

    cnt = 0
    for label, filename in filenames.items():
        if label == 'raw':
            continue
        file = ztml.ztml(data, filename, bin2txt=label.rsplit('_', 1)[0], element_id=element_id, image=True)
        cnt += 1

    print(f'{cnt} encodings of {item} took {(time()-item_start_time) / 60 :.1f} min.')

    # Compare file sizes and validate data is recovered
    error |= validation.validate_files(filenames, by='id' * bool(element_id), element=element_id, image=True)
    print()

if error:
    print('Error: some renderings timed out')
else:
    print(f'Total of {len(image_urls)} images took {(time()-start_time) / 60 :.1f} min.')
sys.exit(int(error))
62 |
--------------------------------------------------------------------------------
/ztml/crenc.py:
--------------------------------------------------------------------------------
1 | """crEnc encoding based on yEnc and optimized for inline HTML / JS text compression and image encoding
2 |
3 | In the spirit of yEnc (why encode?), we only encode symbols where absolutely required.
4 | If the HTML or JS charset can be set to a single-byte encoding as cp1252 (or latin1),
5 | the only symbol requiring special treatment is the carriage-return (CR), hence crEnc,
6 | which can be dealt with by simple backslash escaping.
7 | We embed in JS template literals quotes ``, so we also escape backslash, ` and ${
8 | giving us an effective 253 byte values out of 256,
9 | with an overhead of ~ 3/256 ~ 1.2% (compared to 33.3% for Base64).
10 | JS does the unescaping, so the decoder only needs to take care of HTML character overrides for NUL and codes in 128 - 159.
11 | An optimal global character modular offset can be applied to minimize escaping, similar to dynEncode (enabled by default).
12 | A minimalistic JS decoder code is generated.
13 |
14 | References:
15 | https://en.wikipedia.org/wiki/Binary-to-text_encoding
16 | http://www.yenc.org
17 | https://github.com/eshaz/simple-yenc
18 | https://github.com/eshaz/simple-yenc#what-is-dynencode
19 | https://html.spec.whatwg.org/multipage/parsing.html#table-charref-overrides
20 | https://stackoverflow.com/questions/10080605/special-character-u0098-read-as-u02dc-using-charcodeat/#10081375
21 | """
22 |
23 |
24 | from typing import Optional, Tuple
25 |
26 | if not __package__:
27 | import default_vars, webify
28 | else:
29 | # noinspection PyPackages
30 | from . import default_vars, webify
31 |
32 |
def encode(data: bytes, offset: int = 0) -> bytes:
    """Apply an optional modular byte offset to `data`, then escape it for embedding."""
    if offset:
        shifted = bytearray((value + offset) & 255 for value in data)
        data = bytes(shifted)
    return webify.escape(data)
37 |
38 |
def optimize_encode(data: bytes) -> Tuple[bytes, int, int]:
    """Try all 256 modular offsets and return the shortest crEnc encoding.

    Keeps the best encoding seen during the search instead of re-encoding
    at the end (the original made one redundant full pass over `data`).

    Returns:
        (encoded, best_offset, saved) where `saved` is the byte count the
        best offset shaves off relative to offset 0. Ties keep the lowest
        offset, so offset 0 wins when nothing is strictly shorter.
    """
    best_offset = 0
    best_out = None
    length0 = 0
    for offset in range(256):
        out = encode(data, offset)
        if offset == 0:
            length0 = len(out)
        # Strict '<' preserves first-minimum (lowest offset) semantics
        if best_out is None or len(out) < len(best_out):
            best_out = out
            best_offset = offset
    return best_out, best_offset, length0 - len(best_out)
51 |
52 |
def get_js_decoder(data: bytes,
                   offset: Optional[int] = None,
                   output_var: str = default_vars.bytearray
                   ) -> bytes:
    """Return JS source that embeds crEnc-encoded `data` and decodes it into a Uint8Array.

    If `offset` is None, the optimal modular offset is searched for (slow);
    otherwise the given offset is used as-is (0 means no shifting).
    """
    if offset is None:
        encoded, offset, saved = optimize_encode(data)  # Time-consuming op.
    else:
        encoded = encode(data, offset)
    first_part = f'{output_var}=Uint8Array.from(`'
    # Undo HTML character overrides: charCodeAt()%65533 folds U+FFFD (the NUL
    # override) to 0; code points above 255 are mapped back to their cp1252
    # byte values (128-159 range) via the literal lookup string below.
    function = f"(i=c.charCodeAt()%65533)>>8?129+' \x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c \x8e \x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c \x9e\x9f'.indexOf(c):i"
    if offset:
        # Subtract the encode-time offset (Uint8Array wraps modulo 256)
        function = f'({function})-{offset}'
    last_part = f"`,c=>{function})\n"
    return first_part.encode() + encoded + last_part.encode('l1')  # Encode with l1 as I used explicit bytes above
67 |
--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
1 | # Todo
2 |
3 | ### Usability
4 | - Simplify the "from ztml.ztml import ztml" hierarchy
5 | - Support encoding video/audio/fonts/PDF/...
6 | - Support encoding multiple media elements
7 | - Provide an easy way to view and edit output HTML in Colab
8 | - Make into a PIP library and start doing versioning
9 | - JS library?
10 | - Expose more parameters and allow skipping steps in ztml() / CLI / Colab, possibly via config file
11 | - Stand-alone online web GUI
12 | - Stand-alone executable (+ script to build it)
13 |
14 | ### Compression
15 | - Ablation benchmarks
16 | - Launch a challenge for smaller decoders
17 |
18 | - #### Entropy coding:
19 | - Auto-caps should use modifiers for next letter/word/sentence/paragraph or block-level, over simple mode instead of falling back to raw. See e.g. [Grabowski](https://www.researchgate.net/profile/Szymon-Grabowski-2/publication/258239689_Text_Preprocessing_for_Burrows-Wheeler_Block_Sorting_Compression/links/0046352789a298f289000000), [Batista&Alexandre](https://www.di.ubi.pt/~lfbaa/pubs/dcc2008.pdf)
20 | - Dictionary compression for large texts + add references
21 | - [Fast Huffman one-shift decoder](https://researchgate.net/publication/3159499_On_the_implementation_of_minimum_redundancy_prefix_codes), and follow-up works: [Gagie et al.](https://arxiv.org/pdf/1410.3438.pdf), [Grabowski&Koppl](https://arxiv.org/pdf/2108.05495.pdf)
22 | - Consider [Roadroller](https://lifthrasiir.github.io/roadroller) entropy coder
23 |
24 | #### MTF:
25 | - Improve JS MTF decoding times for large files
26 | - Automatic optimizing over MTF variants
27 | - Benchmark alternatives to MTF + add references
28 |
29 | #### Deflate:
30 | - Investigate effect of PNG aspect ratio on compression / optimize over it
31 | - Investigate Safari canvas size limits
32 | - Use 8/24-bit to overcome canvas size limits when necessary (will not work on Safari, unless we go WebGL)
33 | - Compress metadata into PNG
34 | - [Use WOFF2 as a Brotli container](https://github.com/lifthrasiir/roadroller/issues/9#issuecomment-905580540)
35 |
36 | #### Webification and minification:
37 | - [Base139](https://github.com/kevinAlbs/Base122/issues/3#issuecomment-263787763)
38 | - Compress the JS itself and use [eval](http://perfectionkills.com/global-eval-what-are-the-options), considering also JS packing e.g. [JSCrush](http://iteral.com/jscrush), [JS Crusher](https://jmperezperez.com/js-crusher), [RegPack](https://siorki.github.io/regPack), [Roadroller](https://lifthrasiir.github.io/roadroller)
39 | - Strip whitespace from code lines not part of multi-line content strings (see e.g. above JS packers and [closure-compiler](https://github.com/google/closure-compiler), [jsmin](https://crockford.com/jsmin), [miniMinifier](https://github.com/xem/miniMinifier), [Terser](https://terser.org), [UglifyJS](https://github.com/mishoo/UglifyJS))
40 |
41 | ### Validation and testing
- Running full tests takes too long
43 | - Linux installation instructions / Enable validation in Colab
44 | - Validation testing for Safari (consider Playwright to test WebKit)
45 | - Fix slow rendering with Selenium in validation
46 | - Tests for text_prep.py: normalize, caps, the
47 | - Automatic testing on GitHub
48 |
--------------------------------------------------------------------------------
/ztml/huffman.py:
--------------------------------------------------------------------------------
1 | """Canonical Huffman encoding
2 |
3 | Even though we later compress with DEFLATE which does its own Huffman encoding internally,
4 | I found that for text compression, it is significantly beneficial to pre-encode with Huffman.
5 | Canonical encoding obviates saving or reconstructing an explicit codebook.
Instead, we save a string of symbols and a condensed canonical table of bases and offsets, in a variation of Moffat&Turpin.
7 | A minimalistic JS decoder code is generated.
8 |
9 | References:
10 | https://wikipedia.org/wiki/Canonical_Huffman_code
11 | https://github.com/ilanschnell/bitarray/blob/master/doc/canonical.rst
12 | https://researchgate.net/publication/3159499_On_the_implementation_of_minimum_redundancy_prefix_codes (Moffat&Turpin)
13 | https://arxiv.org/pdf/1410.3438.pdf
14 | https://arxiv.org/pdf/2108.05495.pdf
15 | """
16 |
17 |
18 | from collections import Counter
19 | import sys
20 | from typing import Dict, List, Tuple
21 |
22 | from bitarray import bitarray
23 | from bitarray.util import ba2int, canonical_decode, canonical_huffman
24 |
25 | if not __package__:
26 | import default_vars, webify
27 | else:
28 | # noinspection PyPackages
29 | from . import default_vars, webify
30 |
31 |
32 | DEBUG_SKIP_HUFFMAN = False # This is just for benchmarking and is not implemented in JS decoder
33 |
34 |
def encode(text: str,
           validate: bool = True,
           verbose: bool = False
           ) -> Tuple[List[int], str, str, Dict[str, str]]:
    """Canonical-Huffman encode `text`.

    Args:
        text: input string (may be empty).
        validate: round-trip the encoding through both decoders as a sanity check.
        verbose: dump the codebook / charset / table to stderr.

    Returns:
        (bits, charset, canonical_table, rev_codebook):
        bits: the encoded bit stream as a list of 0/1 ints.
        charset: symbols in reversed canonical order (consumed by the JS decoder).
        canonical_table: per-code-length pairs packed into a string of code points.
        rev_codebook: mapping from code bit-string to symbol.
    """
    charset = ''
    canonical_table = {}
    counter = Counter(text)
    if DEBUG_SKIP_HUFFMAN:
        # Benchmark-only path: fixed-length codes wide enough for the largest code point
        code_len = len(bin(ord(max(counter, default='\0')))) - 2
        codebook = {c: bitarray(bin(ord(c))[2:].zfill(code_len)) for c in counter}
    else:
        if len(counter):
            codebook, counts, symbols = canonical_huffman(counter)
        else:
            # Empty input: no codes, nothing to emit
            codebook = {}
            counts = []
            symbols = []
        charset = ''.join(symbols[::-1])
        # Per code length keep [2**length - code_value, reversed symbol index];
        # the dict comprehension retains the values for the LAST code of each length
        canonical_table = {len(code): [2**len(code) - ba2int(code), len(codebook) - i - 1] for i, code in enumerate(codebook.values())}

    bits = bitarray()
    if codebook:
        bits.encode(codebook, text)
    if verbose:
        print(sorted([(k, v.to01()) for k, v in codebook.items()],
                     key=lambda x: -counter[x[0]]), file=sys.stderr)
        if charset:
            print(len(charset), charset, file=sys.stderr)
        print(canonical_table, file=sys.stderr)
    if validate:
        # Round-trip with both the codebook decoder and the canonical decoder
        assert not codebook or ''.join(bits.decode(codebook)) == text
        assert DEBUG_SKIP_HUFFMAN or ''.join(canonical_decode(bits, counts, symbols)) == text
    # Pack the table into a flat string of code points; lengths with no codes
    # get the sentinel pair [2**i + 1, 1]
    canonical_table = ''.join(chr(j) for i in range(max(canonical_table, default=-1) + 1) for j in (canonical_table[i] if i in canonical_table else [2**i + 1, 1]))
    rev_codebook = {v.to01(): k for k, v in codebook.items()}
    return bits.tolist(), charset, canonical_table, rev_codebook
70 |
71 |
def get_js_decoder(charset: str,
                   canonical_table: str,
                   bitarray_var: str = default_vars.bitarray,
                   text_var: str = default_vars.text,
                   ) -> str:
    """Return JS source that decodes the bit-array variable into the text variable.

    `charset` and `canonical_table` are the packed strings produced by encode().
    The emitted loop grows a candidate code value c bit by bit (k tracks the
    code length) until the canonical-table test passes, then indexes `s` via
    the table's offset entry.
    """
    # Note that the escaped strings may include more characters requiring safe encoding as regard to encoding domains as well as HTML character overrides
    charset = webify.escape(charset, escape_nul=True)
    canonical_table = webify.escape(canonical_table, escape_nul=True)
    return f'''s=[...`{charset}`]
d=[...`{canonical_table}`]
for(j={text_var}='';j<{bitarray_var}.length;{text_var}+=s[d[k*2-1].codePointAt()+m])for(k=c=0;(m=2**k-d[k++*2].codePointAt()-c)<0;)c+=c+{bitarray_var}[j++]
'''
84 |
85 |
def encode_and_get_js_decoder(text: str,
                              bitarray_var: str = default_vars.bitarray,
                              text_var: str = default_vars.text,
                              validate: bool = True,
                              verbose: bool = False
                              ) -> Tuple[List[int], str]:
    """Huffman-encode `text` and build the matching JS decoder snippet.

    Convenience wrapper around encode() + get_js_decoder(); the reverse
    codebook returned by encode() is not needed here and is discarded.
    """
    bits, charset, canonical_table, _unused_rev_codebook = encode(text, validate, verbose)
    decoder = get_js_decoder(charset, canonical_table, bitarray_var, text_var)
    return bits, decoder
94 |
--------------------------------------------------------------------------------
/ztml/base125.py:
--------------------------------------------------------------------------------
1 | """Base125 encoding based on Base122 and optimized for inline HTML / JS text compression and image encoding
2 |
3 | If we must use utf8 encoding for HTML or JS, crEnc will not work.
4 | Instead, we can use this original and unnecessarily-optimized version of the variable length Base122.
5 | The original byte stream is split into 7 bit chunks,
6 | which are encoded as a single byte: 0xxxxxxx, to comply with utf8 code point scheme.
7 | We only use 125 byte values out of 128 (excluding CR, backslash and `)
8 | and encode the remaining three with a double byte scheme: 110ssxxx 10xxxxxx,
9 | where ss is 01, 10 or 11, and 9 bits are left for next data.
10 | Alternatively, if these are the final 7 bits, we instead encode as: 1100010x 10xxxxxx.
As we embed in JS template literals quotes ``, we further escape ${ with backslash.
12 | The overhead is ~ 8/7 * 253/256 + 16/11 * 3/256 - 1 ~ 14.7% (compared to 33.3% for Base64).
13 | The decoder further takes care of HTML character override for NUL.
14 | An optimal global character modular offset can be added to minimize escaping, similar to dynEncode (disabled by default).
15 | A minimalistic JS decoder code is generated.
16 |
17 | References:
18 | https://en.wikipedia.org/wiki/Binary-to-text_encoding
19 | https://blog.kevinalbs.com/base122
20 | https://github.com/kevinAlbs/Base122
21 | https://github.com/eshaz/simple-yenc#what-is-dynencode
22 | """
23 |
24 |
25 | from typing import Optional, Tuple
26 |
27 | if not __package__:
28 | import default_vars
29 | else:
30 | # noinspection PyPackages
31 | from . import default_vars
32 |
33 |
34 | illegal = ['', 13, 92, 96]
35 |
36 |
def encode(data: bytes, offset: int = 0, validate: bool = True) -> bytes:
    """Base125-encode `data` (see module docstring for the bit scheme).

    Args:
        data: raw bytes to encode.
        offset: modular byte offset applied while reading input bits.
        validate: round-trip the output through decode() as a sanity check.

    Returns:
        Encoded bytes with any literal '${' escaped for JS template literals.
    """
    cur_index = 0
    cur_bit = 0  # Points to current bit needed
    out = bytearray()

    # Get 7 or 9 bits of input data. Returns None if there is no input left
    def get_bits(length: int) -> Optional[int]:
        nonlocal cur_index, cur_bit
        if cur_index >= len(data):
            return None

        # Shift, mask, unshift to get first part. Align it to a 7 or 9 bit chunk
        first_part = (255>>cur_bit & data[cur_index]+offset & 255) << cur_bit
        diff = 8 - length
        if diff > 0:
            first_part >>= diff
        else:
            first_part <<= -diff
        # Check if we need to go to the next byte for more bits
        cur_bit += length
        if cur_bit < 8:
            return first_part  # Do not need next byte
        cur_bit -= 8
        cur_index += 1
        # Now we want bits [0..cur_bit] of the next byte if it exists
        if cur_index >= len(data):
            return first_part
        # Align it
        second_part = (0xff00>>cur_bit & data[cur_index]+offset & 255) >> 8-cur_bit
        return first_part | second_part

    while True:
        # Grab 7 bits
        bits = get_bits(7)
        if bits is None:
            break
        try:
            # EAFP: illegal.index() raises ValueError for ordinary chunks
            illegal_index = illegal.index(bits)
            # Since this will be a two-byte character, get the next chunk of 9 bits
            next_bits = get_bits(9)
            if next_bits is None:
                # Final-chunk form: ss=00, b1's bit 2 set (1100010x 10xxxxxx)
                b1 = 4
                next_bits = bits
            else:
                b1 = illegal_index << 3
            # Push first 3 bits onto first byte, remaining 6 onto second
            out.extend([192 | b1 | next_bits>>6, 128 | next_bits&63])
        except ValueError:
            out.append(bits)

    if validate:
        decoded = decode(out, offset)
        assert decoded == data, (len(decoded), len(data), decoded[:30], data[:30])
    return out.replace(b'${', b'\\${')
91 |
92 |
def optimize_encode(data: bytes,
                    validate: bool = True
                    ) -> Tuple[bytes, int, int]:
    """Search all 256 modular offsets for the shortest Base125 encoding.

    Returns (encoded, best_offset, saved), where `saved` is the byte count
    gained over offset 0. Ties resolve to the lowest offset.
    """
    lengths = [len(encode(data, candidate, validate=False))
               for candidate in range(256)]
    best_length = min(lengths)
    best_offset = lengths.index(best_length)  # First minimum, as in a strict '<' scan
    out = encode(data, best_offset, validate)
    return out, best_offset, lengths[0] - best_length
106 |
107 |
def decode(data: bytes, offset: int = 0) -> bytes:
    """Decode Base125 bytes back to the original stream (Python reference of the JS decoder).

    `offset` must match the value used by encode(); it is subtracted modulo 256
    from every reconstructed byte.
    """
    out = bytearray()
    next_byte = 0  # Bit accumulator, filled from the high end
    k = 0          # Number of bits currently held in the accumulator

    def push_bits(bits: int, length: int = 7) -> None:
        # Merge `length` bits into the accumulator; emit a byte once 8 are ready
        nonlocal next_byte, k
        next_byte |= bits << (length < 8) >> k >> (length > 8)
        k += length
        if k > 7:
            out.append((next_byte&255)-offset & 255)  # Undo the modular offset
            k -= 8
            next_byte = bits << 8-k  # Carry the leftover low bits forward

    # data is utf8: each two-byte sequence decodes to a single code point > 127
    for byte in data.decode():
        b = ord(byte)
        if b > 127:
            # Two-byte form: bits 9-10 of the code point are the ss selector
            ss = b >> 9
            if ss:
                push_bits(illegal[ss])
            # ss == 0 is the final-chunk form (1100010x 10xxxxxx): shift left 2
            push_bits(b<<2*(not ss) & 511, 9)
        else:
            push_bits(b)
    return out
132 |
133 |
def get_js_decoder(data: bytes,
                   offset: Optional[int] = 0,
                   output_var: str = default_vars.bytearray,
                   validate: bool = True
                   ) -> bytes:
    """Return JS source that embeds Base125-encoded `data` and decodes it into a Uint8Array.

    An `offset` of None triggers a slow search for the optimal modular offset;
    the default 0 disables shifting.
    """
    if offset is None:
        encoded, offset, saved = optimize_encode(data, validate)  # Time-consuming op.
    else:
        encoded = encode(data, offset, validate)
    illegal_str = ','.join(str(i) for i in illegal)
    # JS mirror of decode()/push_bits() above: p() accumulates 7- or 9-bit
    # chunks; charCodeAt()%65533 folds U+FFFD back to 0 (HTML NUL override)
    first_part = f'''k=n=0
p=(b,l=7)=>(n|=b<<(l<8)>>k>>(l>8),k+=l,k>7?(v=n{-offset or ''},k-=8,n=b<<8-k,v):[])
{output_var}=new Uint8Array([...`'''
    last_part = f'`].flatMap(c=>(i=c.charCodeAt()%65533,i>127?(e=i>>9,[e?p([{illegal_str}][e]):[],p(i<<2*!e&511,9)].flat()):p(i))))\n'
    return first_part.encode() + encoded + last_part.encode()
149 |
150 |
def test() -> None:
    """Stress-test encode()'s internal round-trip validation over runs of the
    symbols that need two-byte encoding, with and without an offset."""
    problem_symbols = (b'\r', b'\\', b'`')
    for prefix_len in range(100):
        prefix = b'\0' * prefix_len
        for run_len in range(100):
            for offset in [0, 1]:
                for symbol in problem_symbols:
                    # encode() asserts internally when validate=True
                    encode(prefix + symbol * run_len, offset, validate=True)
157 |
158 |
159 | if __name__ == '__main__':
160 | test()
161 |
--------------------------------------------------------------------------------
/ztml/tests.py:
--------------------------------------------------------------------------------
1 | import os
2 | from time import time
3 |
4 | start_time = time()
5 |
6 | if not __package__:
7 | import text_prep, bwt_mtf, deflate, validation, webify, ztml
8 | else:
9 | # noinspection PyPackages
10 | from . import text_prep, bwt_mtf, deflate, validation, webify, ztml
11 |
12 |
# Test-matrix configuration: two code-point ranges for the test corpus,
# and the option values to sweep over.
min_char_code1 = 0
max_char_code1 = 14000
min_char_code2 = 55000
max_char_code2 = 66000
browsers = list(validation.drivers)[:1]  # Only the first available driver
input_encodings = ['utf8', 'cp1252', 'cp1255']
bin2txt_encodings = ztml.bin2txt_encodings
caps_modes = ['auto', 'simple']  # text_prep.caps_modes
mtf_variants = [None, 0, 52, 80]  # bwt_mtf.mtf_variants
bitdepths = deflate.allowed_bitdepths
ect_modes = [False, True]
temp_folder = 'tmp'  # Working folder for generated test files
cleanup = True  # Remove generated files and the temp folder when done
26 |
27 |
# Build the test corpus: a contiguous range of low code points plus an
# optional second range, skipping characters already included.
all_chars = ''.join(chr(i) for i in range(min_char_code1, min(max_char_code1 or bwt_mtf.max_unicode, bwt_mtf.max_unicode) + 1))
if min_char_code2 and max_char_code2:
    all_chars += ''.join(chr(i) for i in range(min_char_code2, min(max_char_code2 or bwt_mtf.max_unicode, bwt_mtf.max_unicode) + 1) if chr(i) not in all_chars)
os.makedirs(temp_folder, exist_ok=True)
i = 0
# Sweep the full option matrix; for each combination verify that the Python
# API (in-memory vs. file output) and the CLI (file vs. stdout) all agree.
for browser in browsers:
    with validation.get_browser(browser) as b:
        for encoding in input_encodings:
            encoding = encoding.lower()
            for bin2txt in bin2txt_encodings:
                for caps in caps_modes:
                    for bwtsort in [True, False]:
                        for mtf in mtf_variants:
                            for bitdepth in bitdepths:
                                for ect in ect_modes:
                                    for render_mode in range(3):
                                        # render_mode: 0 = default, 1 = render into an element id, 2 = raw output
                                        element_id = ''
                                        raw = False
                                        if render_mode == 1:
                                            element_id = 'myid'
                                        elif render_mode == 2:
                                            raw = True
                                        test_start_time = time()
                                        i += 1
                                        print(f'{i}/{len(browsers) * len(input_encodings) * len(bin2txt_encodings) * len(caps_modes) * 2 * len(mtf_variants) * len(bitdepths) * len(ect_modes) * 3} browser={browser} input_enc={encoding} bin2txt={bin2txt} caps={caps} bwtsort={bwtsort} mtf={mtf} bitdepth={bitdepth} ect={ect} id={bool(element_id)} raw={raw}')
                                        suffix = f"{browser}_{encoding}_{bin2txt}_{caps}{'_bwtsort' * bwtsort}_{mtf}_{bitdepth}{'_ect' * ect}"
                                        if element_id:
                                            suffix += '_id'
                                        if raw:
                                            suffix += '_raw'
                                        input_filename = os.path.join(temp_folder, f'ztml_test_file_{suffix}.txt')
                                        output_filename = os.path.join(temp_folder, f'ztml_test_file_{suffix}.html')
                                        output_stream = os.path.join(temp_folder, f'ztml_test_stream_{suffix}.html')
                                        text = all_chars
                                        # MTF only supports code points up to max_ord_for_mtf;
                                        # utf8 cannot encode lone surrogates, so drop them.
                                        if mtf is not None:
                                            text = ''.join(c for c in text if ord(c) <= bwt_mtf.max_ord_for_mtf)
                                        if encoding.replace('-', '') == 'utf8':
                                            text = ''.join(c for c in text if ord(c) < bwt_mtf.surrogate_lo or ord(c) > bwt_mtf.surrogate_hi)
                                        # API round trip: in-memory output must equal file output.
                                        out1, result1 = ztml.ztml(text, unix_newline=False, remove_bom=False, caps=caps, bwtsort=bwtsort, mtf=mtf, bitdepth=bitdepth, ect=ect, bin2txt=bin2txt, element_id=element_id, raw=raw, validate=True, browser=b, verbose=True)
                                        out2, result2 = ztml.ztml(text, output_filename, unix_newline=False, remove_bom=False, caps=caps, bwtsort=bwtsort, mtf=mtf, bitdepth=bitdepth, ect=ect, bin2txt=bin2txt, element_id=element_id, raw=raw, validate=True, browser=b, verbose=True)
                                        with open(output_filename, 'rb') as f:
                                            out = f.read()
                                        assert not result1 and not result2 and out1 == out2 == out, (result1, result2, out1 == out2, out1 == out, out2 == out, len(out1), len(out2), validation.full_path(output_filename), len(out))
                                        # CLI round trip: run ztml.py writing to a file and to stdout.
                                        with open(input_filename, 'wb') as f:
                                            f.write(webify.safe_encode(text, encoding))
                                        bwtsort_arg = '--skip_bwtsort' * (not bwtsort)
                                        ect_arg = '--ect' * ect
                                        element_id_or_raw_arg = ''
                                        if element_id:
                                            element_id_or_raw_arg = f'--element_id "{element_id}"'
                                        if raw:
                                            element_id_or_raw_arg = '--raw'
                                        result1 = os.system(f'python ztml.py "{input_filename}" "{output_filename}" --skip_unix_newline --skip_remove_bom --caps {caps} {bwtsort_arg} --mtf {mtf} --bitdepth {bitdepth} {ect_arg} --bin2txt {bin2txt} {element_id_or_raw_arg} --validate --browser {browser} --verbose')
                                        result2 = os.system(f'python ztml.py "{input_filename}" --skip_unix_newline --skip_remove_bom --caps {caps} {bwtsort_arg} --mtf {mtf} --bitdepth {bitdepth} {ect_arg} --bin2txt {bin2txt} {element_id_or_raw_arg} --validate --browser {browser} --verbose > {output_stream}')
                                        with open(output_filename, 'rb') as f1:
                                            out1 = f1.read()
                                        with open(output_stream, 'rb') as f2:
                                            out2 = f2.read()
                                        if out2.endswith(b'\x1b[0m'):  # E.g. due to PyCharm terminal
                                            out2 = out2[:-4]
                                        assert not result1 and not result2 and out1 == out2, (result1, result2, out1 == out2, validation.full_path(output_filename), len(out1), validation.full_path(output_stream), len(out2))
                                        if cleanup:
                                            for filename in [input_filename, output_filename, output_stream]:
                                                try:
                                                    os.remove(filename)
                                                except PermissionError:
                                                    pass
                                        print(f'Test took {time() - test_start_time :.0f} sec.\n')
if cleanup:
    try:
        os.rmdir(temp_folder)
    except OSError:
        pass
print(f'Total took {(time()-start_time) / 60 :.1f} min.')
102 |
--------------------------------------------------------------------------------
/ztml/webify.py:
--------------------------------------------------------------------------------
1 | """ Minification by way of aliasing AKA uglification
2 |
3 | Substitutes recurring element, attribute and function names with short aliases.
4 | This is far from being a full-fledged JS minifier, and only addresses specific forms of aliasing
5 | (with defaults tuned for the author's own hand-minified use cases)
6 | You may be able to reduce your script further with JS minifiers and packers (see references),
7 | however these might not be compatible with ZTML (especially when using the non-utf8 crEnc).
8 |
9 | Warnings:
10 | 1. The two-parameter aliases would miss substitutions involving tag function syntax, i.e.
11 | func`str`, even if you specify such forms explicitly. However, see following examples.
12 | 2. While alias substitution does support some level of composition, e.g.:
13 | a.appendChild(b=document.createElement`p`).innerHTML='hi' # => C(a,b=E`p`).C='hi'
14 | More complex compositions would miss later substitutions, e.g.:
15 | a.appendChild(b=document.createElement`p`).appendChild(c) # => C(a,b=E`p`).appendChild(c)
16 | a.appendChild(b=document.createElement`p`).setAttribute('style',c) # => C(a,b=E`p`).setAttribute('style',c)
17 | 3. Non-static method aliases support only specific parameter signatures as appear in
18 | default_aliases. Attempting to specify different signatures will break your code.
19 | 4. You may need to set replace_quoted=False if you do not want e.g. all 'length', "Length"
20 | to be replaced by: L
21 | 5. Aliases to be used in other aliases e.g. document, should be specified before the latter.
22 |
23 | References:
24 | https://github.com/google/closure-compiler
25 | http://iteral.com/jscrush
26 | https://nikhilism.com/post/2012/demystifying-jscrush
27 | https://github.com/possan/jsintros/blob/master/a/src/crush.js
28 | https://jmperezperez.com/js-crusher
29 | https://crockford.com/jsmin
30 | https://github.com/xem/miniMinifier
31 | https://siorki.github.io/regPack
32 | https://lifthrasiir.github.io/roadroller
33 | https://terser.org
34 | https://github.com/mishoo/UglifyJS
35 | """
36 |
37 |
38 | import re
39 | import sys
40 | from typing import AnyStr
41 |
42 |
# File extensions treated as raw HTML-like documents.
raw_extensions = ['htm', 'html', 'svg']
# File extensions recognized as images.
image_extensions = ['bmp', 'gif', 'jfif', 'jpe', 'jpeg', 'jpg', 'png', 'webp']


# Default alias table, one "SHORT = LONG" definition per line.
# Tuned for the author's hand-minified scripts (see module docstring);
# aliases used inside other aliases (e.g. document) must come first.
default_aliases = '''
D = document
A = (e, d) => e.setAttribute('style', d)
B = document.body
C = (e, c) => e.appendChild(c)
E = (e='div') => document.createElement(e)
F = String
G = 'target'
H = 'innerHTML'
I = setInterval
J = clearInterval
K = e => e.codePointAt()
L = 'length'
M = Math
N = speechSynthesis
O = setTimeout
'''

# Captures JS template literals (backtick strings) so alias substitution can skip them.
literals_regex = rf'(`(?:\\.|[^`\\])*`)'
66 |
67 |
def escape(s: AnyStr, escape_nul: bool = False) -> AnyStr:
    """Backslash-escape characters that would break a JS template literal.

    Escapes backslash, backtick and '${' (template interpolation), and
    replaces a raw CR with the escape sequence '\\r' (presumably because CR
    would be normalized away when the HTML is parsed — confirm upstream).
    Optionally escapes NUL as '\\0'.

    Accepts str or bytes; the return type matches the input.
    """
    pattern = r'\\|`|\${'
    repl = r'\\\g<0>'
    cr = '\r'
    esc_cr = '\\r'
    nul = '\0'
    esc_nul = '\\0'
    if isinstance(s, bytes):
        # Mirror all patterns/replacements as bytes so re.sub/replace accept bytes input.
        pattern = pattern.encode()
        repl = repl.encode()
        cr = cr.encode()
        esc_cr = esc_cr.encode()
        # Bug fix: nul/esc_nul were left as str, so escape(bytes, escape_nul=True)
        # raised TypeError on the replace below.
        nul = nul.encode()
        esc_nul = esc_nul.encode()
    s = re.sub(pattern, repl, s).replace(cr, esc_cr)
    if escape_nul:
        s = s.replace(nul, esc_nul)
    return s
84 |
85 |
def safe_encode(s: str, encoding: str, get_back_unused: bool = False) -> bytes:
    """Encode s to bytes, backslash-escaping characters the encoding lacks.

    Non-utf8 encodings use the 'backslashreplace' error handler, and astral
    escapes are rewritten from \\Uxxxxxxxx to the JS form \\u{...}.
    With get_back_unused, the five cp1252 bytes that need no escaping in HTML
    are restored to their raw form.
    """
    encoding = encoding.lower()
    errors = 'strict' if encoding.replace('-', '') == 'utf8' else 'backslashreplace'
    out = re.sub(rb'\\U000?([\da-f]{5,6})', rb'\\u{\1}', s.encode(encoding, errors))
    if get_back_unused and encoding == 'cp1252':
        # These actually do not require escaping in HTML
        for code in (0x81, 0x8d, 0x8f, 0x90, 0x9d):
            out = out.replace(b'\\x%02x' % code, bytes([code]))
    return out
93 |
94 |
def get_len(s: AnyStr, encoding: str) -> int:
    """Byte length of s: bytes are measured as-is, str after safe encoding."""
    if isinstance(s, str):
        s = safe_encode(s, encoding)
    return len(s)
97 |
98 |
def uglify(script: AnyStr,
           aliases: str = default_aliases,
           replace_quoted: bool = True,
           min_cnt: int = 2,
           prevent_grow: bool = True,
           add_used_aliases: bool = True,
           encoding: str = 'utf8',
           ) -> AnyStr:
    """Substitute recurring names in script with short aliases (see module docstring).

    Each alias line "SHORT = LONG" is turned into a regex and applied outside
    template literals; an alias is only kept if it matched at least min_cnt
    times and (with prevent_grow) actually shrank the script.
    Accepts str or bytes; returns the same type.
    """
    orig_len = get_len(script, encoding)
    shorts = set()
    # Process in reverse so aliases referenced by later aliases are applied last.
    for alias in reversed(aliases.strip().splitlines()):
        alias = alias.replace(' ', '')
        if not alias:
            continue
        short, long = alias.split('=', 1)
        assert short not in shorts, short
        shorts.add(short)
        prefix = ''
        comma = ''
        # Arrow function calling a method on its first parameter -> match obj.method(...)
        if re.search(r'(\b\w+\b)[^>]*=>[^.]*\b\1\.', long):
            prefix = r'(\w([\w.]|\[[^[\]]+\])*)\.'
        # Two-parameter arrow function -> keep a comma after the object argument.
        if re.search('[^,]+,[^>]+=>', long):
            comma = ','
        # Reduce the alias body to the bare member/callee name.
        # Bug fix: the named group was '(?P\b\w+\b)' (invalid regex) — it must be
        # named 'prefix' for the (?P=prefix) backreference to resolve.
        long = re.sub(r'[^>]*(?P<prefix>\b\w+\b)[^>]*=>[^.]*\b(?P=prefix)\.|[^>]+=>|\([^,)]*\)|,.*', '', long)
        if prefix:
            short += '(\\1'
            if '(' not in long:
                long += '('
            short += comma
            long = prefix + re.sub('[\'"]', '[\'"]', re.escape(long))
        elif long[0] == long[-1] in '\'"':
            # Quoted-string alias: replace both .prop access and (optionally) quoted forms.
            short = lambda x, short=short, long=long: f"{'[' * (len(x[0]) < len(long))}{short}{']' * (len(x[0]) < len(long))}"
            long = f'\\.{long[1:-1]}' + re.sub('[\'"]', '[\'"]', f'|{long}') * replace_quoted
        if re.match('\\w', long[0]):
            long = f'\\b{long}'
        if re.match('\\w', long[-1]):
            long += '\\b'
        if isinstance(script, bytes):
            long = safe_encode(long, encoding)
            if isinstance(short, str):
                short = safe_encode(short, encoding)
            else:
                short = lambda x, short=short: safe_encode(short(x), encoding)
        sub = script[:0]
        cnt = 0
        # Split out template literals (odd indices) so they are never rewritten.
        parts = re.split(safe_encode(literals_regex, encoding) if isinstance(script, bytes) else literals_regex, script)
        for i, part in enumerate(parts):
            if i % 2 == 0:
                part, c = re.subn(long, short, part)
                cnt += c
            sub += part
        if cnt >= min_cnt:
            if add_used_aliases:
                # Prepend the alias definition unless it is already present.
                alias += '\n'
                if isinstance(sub, bytes):
                    alias = safe_encode(alias, encoding)
                if alias not in sub:
                    sub = alias + sub.lstrip()
            if not prevent_grow or get_len(sub, encoding) < get_len(script, encoding):
                script = sub
    new_len = get_len(script, encoding)
    if new_len > orig_len:
        print(f'Warning: uglified size increased: {new_len} B > {orig_len} B', file=sys.stderr)
    return script
163 |
164 |
def html_wrap(script: AnyStr,
              aliases: str = default_aliases,
              replace_quoted: bool = True,
              min_cnt: int = 2,
              prevent_grow: bool = True,
              lang: str = '',
              encoding: str = 'utf8',
              mobile: bool = False,
              title: str = '',
              ) -> AnyStr:
    """Uglify script (optionally) and wrap it in a minimal HTML document.

    NOTE(review): several HTML tag literals below appear to have been
    stripped from this copy of the file (the f'' strings ignore lang, mobile
    and title), and the html_footer assignment is missing entirely although
    it is referenced further down — confirm against the upstream source.
    """
    html_lang = f'' * bool(lang)
    encoding = encoding.lower()
    if encoding == 'utf-8':
        encoding = 'utf8'
    elif encoding in ['cp1252', 'latin1']:
        encoding = 'l1'  # HTML5 treats these the same
    mobile_meta = '' * mobile
    title_element = f'{title}' * bool(title)
    html_header = f'{html_lang}{mobile_meta}{title_element}'
    sep = ''
    if isinstance(script, bytes):
        # Keep header/footer/separator in the same (bytes) domain as the script.
        html_header = safe_encode(html_header, encoding)
        html_footer = safe_encode(html_footer, encoding)
        sep = safe_encode(sep, encoding)
    if aliases:
        script = uglify(script, aliases, replace_quoted, min_cnt, prevent_grow, encoding=encoding)
    return sep.join([html_header, script.strip(), html_footer])
193 |
--------------------------------------------------------------------------------
/ZTML.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "private_outputs": true,
7 | "provenance": [],
8 | "collapsed_sections": [],
9 | "authorship_tag": "ABX9TyOZ/X56cwNPCb8Cs4lZTsRx",
10 | "include_colab_link": true
11 | },
12 | "kernelspec": {
13 | "name": "python3",
14 | "display_name": "Python 3"
15 | },
16 | "language_info": {
17 | "name": "python"
18 | }
19 | },
20 | "cells": [
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {
24 | "id": "view-in-github",
25 | "colab_type": "text"
26 | },
27 | "source": [
28 | "
"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "source": [
34 | "# ZTML\n",
35 | "\n",
36 | "### Extreme inline text compression for HTML / JS\n",
37 | "### By [Eyal Gruss](https://eyalgruss.com) ([@eyaler](https://twitter.com/eyaler)\\)\n",
38 | "\n",
39 | "Repo: [github.com/eyaler/ztml](https://github.com/eyaler/ztml)\n",
40 | "\n",
41 | "Shortcut to Colab: [bit.ly/ztml1](https://bit.ly/ztml1)"
42 | ],
43 | "metadata": {
44 | "id": "V__-3LfHyt5l"
45 | }
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {
51 | "cellView": "form",
52 | "id": "kKLXYZNYynrz"
53 | },
54 | "outputs": [],
55 | "source": [
56 | "#@title Setup\n",
57 | "%cd /content\n",
58 | "!git clone -q https://github.com/eyaler/ztml\n",
59 | "!pip -q install -r ztml/requirements.txt"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "source": [
65 | "#@title Enter text or HTML code\n",
66 | "#@markdown Important: for HTML tick `raw` below\n",
67 | "from IPython.display import display\n",
68 | "from ipywidgets import Layout, Textarea\n",
69 | "try:\n",
70 | " text = textarea.value\n",
71 | "except NameError:\n",
72 | " text = ''\n",
73 | "textarea = Textarea(value=text, placeholder='Type something', description='Text:', layout=Layout(width='90%', height='200px'))\n",
74 | "display(textarea)"
75 | ],
76 | "metadata": {
77 | "cellView": "form",
78 | "id": "Z9RJOcFL_HEw"
79 | },
80 | "execution_count": null,
81 | "outputs": []
82 | },
83 | {
84 | "cell_type": "code",
85 | "source": [
86 | "#@title or Upload text or HTML or image file\n",
87 | "#@markdown Warning: will clear any input to above textarea\n",
88 | "from google.colab import files\n",
89 | "%cd /content\n",
90 | "try:\n",
91 | " files.upload_file('input_file')\n",
92 | "except ValueError:\n",
93 | " pass\n",
94 | "else:\n",
95 | " try:\n",
96 | " textarea.value = ''\n",
97 | " except NameError:\n",
98 | " pass"
99 | ],
100 | "metadata": {
101 | "cellView": "form",
102 | "id": "pzlcSOpCGFXy"
103 | },
104 | "execution_count": null,
105 | "outputs": []
106 | },
107 | {
108 | "cell_type": "code",
109 | "source": [
110 | "#@title Compress!\n",
111 | "#@markdown Warning: `bitdepth` of `8-bit`, `24-bit` do not work on Safari\n",
112 | "import os\n",
113 | "output_filename = 'index.html' #@param {type: 'string'}\n",
114 | "input_encoding = '' #@param {type: 'string'}\n",
115 | "reduce_whitespace = False #@param {type: 'boolean'}\n",
116 | "unix_newline = True #@param {type: 'boolean'}\n",
117 | "fix_punct = False #@param {type: 'boolean'}\n",
118 | "remove_bom = True #@param {type: 'boolean'} \n",
119 | "caps = 'auto' #@param ['auto', 'lower', 'raw', 'simple', 'upper']\n",
120 | "bwtsort = True #@param {type: 'boolean'}\n",
121 | "mtf = '0' #@param ['none', 0, 1, 2, 50, 52, 60, 70, 80, 90]\n",
122 | "bitdepth = 1 #@param [1, 8, 24]\n",
123 | "ect = False #@param {type: 'boolean'}\n",
124 | "bin2txt = 'crenc' #@param ['base64', 'base125', 'crenc']\n",
125 | "element_id = '' #@param {type: 'string'}\n",
126 | "raw = True #@param {type: 'boolean'}\n",
127 | "image = False #@param {type: 'boolean'}\n",
128 | "js = False #@param {type: 'boolean'}\n",
129 | "uglify = True #@param {type: 'boolean'}\n",
130 | "replace_quoted = True #@param {type: 'boolean'}\n",
131 | "lang = '' #@param {type: 'string'}\n",
132 | "mobile = False #@param {type: 'boolean'}\n",
133 | "title = '' #@param {type: 'string'}\n",
134 | "text_var = 't' #@param {type: 'string'}\n",
135 | "\n",
136 | "if ect:\n",
137 | " try:\n",
138 | " have_ect_lib\n",
139 | " except NameError:\n",
140 | " !add-apt-repository -y ppa:ubuntu-toolchain-r/test\n",
141 | " !apt upgrade libstdc++6\n",
142 | " have_ect_lib = True\n",
143 | "\n",
144 | "%cd /content\n",
145 | "input_filename = 'input_file'\n",
146 | "try:\n",
147 | " if textarea.value:\n",
148 | " with open(input_filename, 'wb') as f:\n",
149 | " f.write(textarea.value.encode())\n",
150 | " print('Using input to textarea')\n",
151 | " else:\n",
152 | " print('Using uploaded file')\n",
153 | "except NameError:\n",
154 | " print('Using uploaded file')\n",
155 | "reduce_whitespace_arg = '--reduce_whitespace' * reduce_whitespace\n",
156 | "unix_newline_arg = '--skip_unix_newline' * (not unix_newline)\n",
157 | "fix_punct_arg = '--fix_punct' * fix_punct\n",
158 | "remove_bom_arg = '--skip_remove_bom ' * (not remove_bom)\n",
159 | "bwtsort_arg = '--skip_bwtsort ' * (not bwtsort)\n",
160 | "ect_arg = '--ect' * ect\n",
161 | "raw_arg = '--raw' * raw\n",
162 | "image_arg = '--image' * image\n",
163 | "js_arg = '--js' * js\n",
164 | "uglify_arg = '--skip_uglify' * (not uglify)\n",
165 | "replace_quoted_arg = '--skip_replace_quoted' * (not replace_quoted)\n",
166 | "mobile_arg = '--mobile' * mobile\n",
167 | "!python ztml/ztml/ztml.py \"$input_filename\" \"$output_filename\" --input_encoding $input_encoding $reduce_whitespace_arg $unix_newline_arg $fix_punct_arg $remove_bom_arg --caps $caps $bwtsort_arg --mtf $mtf --bitdepth $bitdepth $ect_arg --bin2txt $bin2txt --element_id $element_id $raw_arg $image_arg $js_arg $uglify_arg $replace_quoted_arg --lang $lang $mobile_arg --title $title --text_var $text_var\n",
168 | "input_size = os.path.getsize(input_filename)\n",
169 | "output_size = os.path.getsize(output_filename)\n",
170 | "print(f'{input_size:,} B -> {output_size:,} B ({output_size / input_size * 100 :.1f}%)')"
171 | ],
172 | "metadata": {
173 | "id": "qg-KcsfG0CpP",
174 | "cellView": "form"
175 | },
176 | "execution_count": null,
177 | "outputs": []
178 | },
179 | {
180 | "cell_type": "code",
181 | "source": [
182 | "#@title Download output file\n",
183 | "from google.colab import files\n",
184 | "if bin2txt == 'crenc':\n",
185 | " print(f'Note: {output_filename} is encoded in cp1252, which some editors might break')\n",
186 | "files.download(output_filename)"
187 | ],
188 | "metadata": {
189 | "cellView": "form",
190 | "id": "3C9EVO8sFyA0"
191 | },
192 | "execution_count": null,
193 | "outputs": []
194 | },
195 | {
196 | "cell_type": "code",
197 | "source": [
198 | "#@title Display output as hex dump\n",
199 | "from IPython.display import HTML\n",
200 | "with open(output_filename, 'rb') as f:\n",
201 | " hex = '0x' + f.read().hex()\n",
202 | "print(hex)\n",
203 | "HTML(f\"\")"
204 | ],
205 | "metadata": {
206 | "id": "v0GwtZtnTprz",
207 | "cellView": "form"
208 | },
209 | "execution_count": null,
210 | "outputs": []
211 | }
212 | ]
213 | }
--------------------------------------------------------------------------------
/ztml/text_prep.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from typing import Optional, Tuple
3 |
4 | import regex
5 |
6 | if not __package__:
7 | import default_vars
8 | else:
9 | # noinspection PyPackages
10 | from . import default_vars
11 |
12 |
# Characters treated as line terminators (used inside regex character classes).
newline = r'\n\v\f\r\x85\u2028'
# Typographic quote variants normalized by fix_punct.
single_quote = '[\u2018-\u201b\u05f3\uff07]'
double_quote = '[\u201c-\u201f\u05f4\uff02]'
apos = "['’]"  # \\uff07
eos = '[!.?]'  # r'\uff01\uff0e\uff1f\ufe52\ufe56\ufe57'
# Word characters: letters, marks, numbers (regex module Unicode classes).
nonword = r'\p{L}\p{M}\p{N}'
caps_modes = ['auto', 'lower', 'raw', 'simple', 'upper']
default_caps = 'auto'
21 |
22 |
def normalize(text: str,
              reduce_whitespace: bool = False,
              unix_newline: bool = True,
              fix_punct: bool = False,
              strip_bom: bool = True
              ) -> str:
    """Whitespace / punctuation normalization applied before compression."""
    if reduce_whitespace:
        # Collapse paragraph breaks to '\n\n', line breaks to '\n', runs of
        # other whitespace to a single space, and trim the ends.
        text = text.replace('\u2029', '\n\n')
        text = regex.sub(rf'\s*[{newline}]\s*[{newline}]\s*', '\n\n', text)
        text = regex.sub(rf'[^\S{newline}]*[{newline}][^\S{newline}]*', '\n', text)
        text = regex.sub(rf'[^\S{newline}]+', ' ', text)
        text = text.strip()
    elif unix_newline:
        text = regex.sub('\r\n?', '\n', text)
    if fix_punct:
        # Normalize dashes, curly quotes and the ellipsis character.
        for pattern, repl in (('\\p{Pd}', '-'),
                              (single_quote, "'"),
                              (double_quote, '"'),
                              ('\u2026', '...')):
            text = regex.sub(pattern, repl, text)
    if strip_bom:
        text = regex.sub('^\ufeff', '', text)
    return text
44 |
45 |
46 | caps_regex = rf'(((?=(\r\n|[{newline}]))\3){{2,}}|\u2029|^|{eos})\P{{L}}*.|(^|[^{nonword}])i(?![{nonword}])' # Avoid lookbehind to support Safari
47 |
48 |
def decode_caps_simple(text: str) -> str:
    """Re-capitalize sentence starts and standalone 'i' (inverse of lowercasing)."""
    def _upper(m):
        return m[0].upper()
    return regex.sub(caps_regex, _upper, text)
51 |
52 |
def encode_caps(text: str, caps: str = default_caps) -> str:
    """Apply the caps transform: 'raw' keeps text, 'upper' uppercases, others lowercase."""
    assert caps in caps_modes, f"Error: caps='{caps}' not in {caps_modes}"
    if caps == 'raw':
        return text
    if caps == 'upper':
        return text.upper()
    return text.lower()
56 |
57 |
def remove_the(text: str) -> str:
    """Drop standalone 'the' (or 'THE' for all-caps text), keeping surrounding spacing."""
    the_str = 'the'
    if text == text.upper():
        the_str = 'THE'
    return regex.sub(f'(^(?!{the_str}$)| ){the_str}( |$)', r'\1\2', text, flags=regex.MULTILINE)
61 |
62 |
def get_qu_regex(next_letter_case: str, u_caps: Optional[bool] = None) -> str:
    """Lookahead matching a following letter (of the given case class) that is not u/U."""
    if u_caps is None:
        u_caps = next_letter_case == 'u'
    u = 'U' if u_caps else 'u'
    return f'(?={apos}?[^{u}\\P{{L{next_letter_case}}}])'
66 |
67 |
def encode_quq(text: str) -> str:
    """Strip the 'u' after q/Q where a decoder can restore it by rule."""
    lowered = regex.sub(f"([Qq])u{get_qu_regex('l')}", '\\1', text)
    return regex.sub(f"QU{get_qu_regex('')}", 'Q', lowered)
71 |
72 |
def decode_quq(text: str, caps: str) -> str:
    """Restore the 'u' dropped by encode_quq, according to the caps mode."""
    if caps == 'upper':
        return regex.sub(f"Q{get_qu_regex('', u_caps=True)}", 'QU', text)
    if caps == 'raw':
        text = regex.sub(f"[Qq]{get_qu_regex('l')}", '\\g<0>u', text)
        return regex.sub(f"Q{get_qu_regex('u')}", 'QU', text)
    return regex.sub(f"q{get_qu_regex('')}", 'qu', text)
82 |
83 |
def get_quq_js_decoder(caps: str) -> str:
    """Return the JS .replace() chain that restores the 'u' after q/Q."""
    if caps == 'upper':
        return f".replace(/Q{get_qu_regex('', u_caps=True)}/gu,'QU')"
    if caps == 'raw':
        return f".replace(/[Qq]{get_qu_regex('l')}/gu,'$&u').replace(/Q{get_qu_regex('u')}/gu,'QU')"
    return f".replace(/q{get_qu_regex('')}/gu,'qu')"
92 |
93 |
def count_bad_quq(text: str, caps: str, verbose: bool = False) -> int:
    """Count q-delimited segments that the quq round trip would not reconstruct."""
    encoded = encode_caps(text, caps)
    recon = decode_quq(encode_quq(encoded), caps)
    orig_parts = regex.split('[Qq]', encoded)
    recon_parts = regex.split('[Qq]', recon)
    cnt = abs(len(recon_parts) - len(orig_parts))
    cnt += sum(a != b for a, b in zip(recon_parts, orig_parts))
    if verbose and cnt:
        print(f'Warning: found {cnt} cases of q followed by a non u, or terminal qu', file=sys.stderr)
    return cnt
103 |
104 |
def encode_with_fallbacks(text: str,
                          caps: str = default_caps,
                          the: bool = True,
                          quq: bool = True,
                          caps_fallback: bool = True,
                          the_fallback: bool = True,
                          quq_fallback: bool = True,
                          verbose: bool = False
                          ) -> Tuple[str, str, bool, bool]:
    """Apply caps / the-removal / quq encodings, disabling any that would not pay off.

    Returns the encoded text along with the (possibly downgraded) caps mode
    and the effective `the` / `quq` flags, which the JS decoder must know.
    """
    if caps_fallback:
        # 'auto' assumes simple re-capitalization restores the text; verify it.
        if caps == 'auto' and text != decode_caps_simple(encode_caps(text, caps)):
            caps = 'raw'
            if verbose:
                print(f"Falling back to caps='{caps}'", file=sys.stderr)
        # Degenerate-case shortcut: uniformly-cased text needs no raw mode.
        if caps == 'raw':
            if text == text.lower():
                caps = 'lower'
            elif text == text.upper():
                caps = 'upper'
    text = encode_caps(text, caps)

    if the:
        theless = remove_the(text)
        if the_fallback:
            # Disable if nothing was removed, or if existing spacing is ambiguous.
            if theless == text:
                the = False
            if the and regex.search('^ | | $', text, regex.MULTILINE):
                the = False
                if verbose:
                    print(f'Falling back to the={the}', file=sys.stderr)
        if the:
            text = theless

    if quq:
        quless = encode_quq(text)
        if quq_fallback:
            # Disable if the saving is smaller than its JS decoder, or lossy.
            if len(text) - len(quless) < len(get_quq_js_decoder(caps)):
                quq = False
            if quq and count_bad_quq(text, caps, verbose):
                quq = False
                if verbose:
                    print(f'Falling back to quq={quq}', file=sys.stderr)
        if quq:
            text = quless

    return text, caps, the, quq
151 |
152 |
def get_js_decoder(text: Optional[str] = None,
                   caps: str = default_caps,
                   the: bool = True,
                   quq: bool = True,
                   text_var: str = default_vars.text
                   ) -> str:
    """Build the JS snippet that reverses the text_prep encodings on text_var."""
    assert caps in caps_modes, f"Error: caps='{caps}' not in {caps_modes}"
    if text is not None:
        text, caps, the, quq = encode_with_fallbacks(text, caps, the, quq)
    parts = []
    if quq:
        parts.append(get_quq_js_decoder(caps))
    if the:
        the_str = 'THE' if caps == 'upper' else 'the'
        parts.append(f".replace(/(^(?!$)| )( |$)/gm,'$1{the_str}$2')")
    if caps in ['auto', 'simple']:
        parts.append(f'.replace(/{caps_regex}/gu,m=>m.toUpperCase())')
    js_decoder = ''.join(parts)
    if js_decoder:
        js_decoder = f'{text_var}={text_var}{js_decoder}\n'
    return js_decoder
173 |
174 |
def encode_and_get_js_decoder(text: str,
                              caps: str = default_caps,
                              the: bool = True,
                              quq: bool = True,
                              caps_fallback: bool = True,
                              the_fallback: bool = True,
                              quq_fallback: bool = True,
                              verbose: bool = False,
                              text_var: str = default_vars.text
                              ) -> Tuple[str, str]:
    """Encode text (with fallbacks) and return it with its JS decoder snippet."""
    encoded, caps, the, quq = encode_with_fallbacks(
        text, caps, the, quq, caps_fallback, the_fallback, quq_fallback, verbose)
    return encoded, get_js_decoder(caps=caps, the=the, quq=quq, text_var=text_var)
187 |
188 |
def test_quq() -> None:
    """Enumerate qu contexts across all caps modes and report round-trip failures."""
    bad = 0
    u_variants = ['U', 'u', ' ', "' "]
    for caps in caps_modes:
        for q in 'Qq':
            for u in u_variants:
                for a in "AaUu'’ ":
                    for b in 'Bb ':
                        orig = q + u + a + b
                        text, new_caps, _, _ = encode_with_fallbacks(orig, caps, the=False, quq=False)
                        enc = encode_quq(text)
                        dec = decode_quq(text, new_caps)
                        if text != dec:
                            print(f'caps={caps:>6}->{new_caps:>5}: orig={orig} -> text={text} -> enc={enc} -> dec={dec}', file=sys.stderr)
                            bad += 1
    print(f'Found {bad} bad qu cases', file=sys.stderr)
204 |
205 |
# Run the qu round-trip self-test when executed as a script.
if __name__ == '__main__':
    test_quq()
208 |
--------------------------------------------------------------------------------
/ztml/deflate.py:
--------------------------------------------------------------------------------
1 | """PNG / DEFLATE encoding optimized for arbitrary data compression
2 |
3 | Encoding data as a PNG image allows efficient DEFLATE compression (similar to ZIP),
4 | while allowing use of the browser's native decompression capability for free,
 5 | thus saving the need of an additional decoder, AKA PNG bootstrapping.
6 | The data is then read from the HTML canvas element.
7 | The image aspect ratio is optimized to be squarish (for higher browser compatibility) with minimal padding.
8 | We do not use the alpha channel due to the browser's alpha pre-multiplication in Canvas 2D causing inaccuracies.
9 | In Safari, even without an alpha channel, similar inaccuracies prevent using 8-bit and 24-bit depths for PNGs.
10 | By default, we use Google's optimized Zopfli compression which is compatible with DEFLATE decompression.
11 | Alternatively, you can use ECT which can be beneficial for large texts (but may slightly hurt smaller ones)
12 | (e.g. ECT 0.9.4 gave 1.4% overall improvement over Zopfli on 2600.txt and minibook)
13 | A minimalistic JS decoder code is generated.
14 |
15 | Other experiments:
16 | 8-bit and 24-bit (RGB) give similar overall results to 1-bit (but does not work on Safari)
17 | WEBP gave worse overall results (libwebp/cwebp from 8-bit and 24-bit PNG, but does seem to work on Safari).
18 |
19 | References:
20 | https://web.archive.org/web/20090826082743/http://blog.nihilogic.dk:80/2008/05/compression-using-canvas-and-png.html
21 | https://web.archive.org/web/20130310075429/http://daeken.com/superpacking-js-demos
22 | https://web.archive.org/web/20130219050720/http://alexle.net/archives/306
23 | https://www.iamcal.com/png-store
24 | https://github.com/iamcal/PNGStore
25 | http://bwirl.blogspot.com/2011/11/optimize-web-apps-with-png.html
26 | https://gist.github.com/gasman/2560551 (pnginator)
27 | https://www.pouet.net/prod.php?which=59298 (JsExe)
28 | https://www.pouet.net/topic.php?which=8770
29 | https://github.com/codegolf/zpng
30 | https://github.com/xem/miniBook
31 | https://github.com/google/zopfli
32 | https://github.com/hattya/zopflipy
33 | https://github.com/fhanau/Efficient-Compression-Tool (ECT)
34 | https://encode.su/threads/2274-ECT-an-file-optimizer-with-fast-zopfli-like-deflate-compression
35 | https://stackoverflow.com/questions/60074569/html-canvas-returns-off-by-some-bytes-from-getimagedata
36 | https://stackoverflow.com/questions/23497925/how-can-i-stop-the-alpha-premultiplication-with-canvas-imagedata/#60564905
37 | https://github.com/jhildenbiddle/canvas-size#test-results
38 | https://pqina.nl/blog/canvas-area-exceeds-the-maximum-limit
39 | https://bugs.webkit.org/show_bug.cgi?id=230855
40 | """
41 |
42 |
43 | from io import BytesIO
44 | import math
45 | import os
46 | import platform
47 | import sys
48 | from tempfile import NamedTemporaryFile
49 | from typing import List, Iterable, Optional
50 |
51 | import png
52 | # noinspection PyPackageRequirements
53 | import zopfli
54 |
55 | if not __package__:
56 | import default_vars
57 | else:
58 | # noinspection PyPackages
59 | from . import default_vars
60 |
61 |
# Maximum image width/height emitted — presumably tied to browser canvas
# limits (see canvas-size reference above); confirm before raising.
max_dim = 32767
max_len = 11180 ** 2  # Cap on total pixels per image
allowed_bitdepths = [1, 8, 24]  # Warning: 8-bit and 24-bit do not work on Safari
default_bitdepth = 1
66 |
67 |
def to_png(bits: Iterable[int],
           bitdepth: int = default_bitdepth,  # 1, 8, 24
           compression: Optional[int] = 9,
           ect: bool = False,  # This will override zop settings
           ect_compression: int = 20009,
           ect_filters: str = 'allfilters',  # 'allfilters', 'allfilters-b' (brute), 'allfilters-c' (cheap) or ''
           zop_filters: str = '',  # Any subset of 01234mepb or '' for auto
           zop_iterations: int = 15,
           zop_iterations_large: int = 5,
           omit_iend: bool = True,
           filename: str = '',
           verbose: bool = False) -> bytes:
    """Pack a bit stream into a squarish PNG, recompressed with Zopfli or ECT.

    The pixel data is padded (repeating the last value) until it factors into
    a width x height no larger than max_dim, then written as a 1/8/24-bit PNG.
    If filename is given, the final PNG bytes are also saved there.
    Returns the PNG bytes (without the IEND chunk when omit_iend is True).
    """
    data = list(bits)
    bit_len = len(data)
    assert bit_len
    assert bitdepth in allowed_bitdepths, f'Error: bitdepth={bitdepth} not in {allowed_bitdepths}'
    assert compression is None or -1 <= compression <= 9
    # Pad bits up to a whole number of pixels, then group bits into pixel values.
    pad_bits = (bitdepth - bit_len) % bitdepth
    if bitdepth > 1:
        data += [data[-1]] * pad_bits
        data = [int(''.join(str(b) for b in data[i : i + bitdepth]), 2) for i in range(0, len(data), bitdepth)]
    # Grow the pixel count one at a time until it factors into a squarish
    # width x height with width <= max_dim.
    width = height = pad_pixels = 0
    length = None
    while width * height != length:
        if length is not None:
            data.append(data[-1])
            pad_pixels += 1
        length = len(data)
        assert length <= max_len, f'Error: length={length:,} > max_len={max_len:,}'
        height = int(math.sqrt(length))
        while length % height and height > 1 and length // (height-1) <= max_dim:
            height -= 1
        width = length // height
        assert width <= max_dim, f'Error: width={width:,} > max_dim={max_dim:,}'
    width_with_channels = width
    length_with_channels = length
    if bitdepth > 8:
        # Split each 24-bit pixel into its RGB byte channels.
        data = [b for i in data for b in i.to_bytes(bitdepth // 8, 'big')]
        width_with_channels *= bitdepth // 8
        length_with_channels *= bitdepth // 8
    data = [data[i : i + width_with_channels] for i in range(0, length_with_channels, width_with_channels)]
    png_data = BytesIO()
    png.Writer(width, height, greyscale=bitdepth <= 8,
               bitdepth=1 if bitdepth == 1 else 8,
               compression=compression).write(png_data, data)
    png_data.seek(0)
    png_data = png_data.read()
    out = png_data

    if ect:
        # ECT works on files, so round-trip through a temp file.
        # Bug fix: the temp path previously clobbered the `filename` parameter
        # (and the shell command lost its target), so the final save below went
        # to the temp path instead of the caller's requested file.
        with NamedTemporaryFile(suffix='.png', delete=False) as f:  # See https://github.com/python/cpython/issues/88221
            f.write(out)
            temp_filename = f.name
        ect_filters_arg = f'--{ect_filters}' * bool(ect_filters)
        ect_path = os.path.normpath(os.path.join(os.path.dirname(__file__), '..', 'ect', 'ect')) + '-ubuntu' * (platform.system() == 'Linux')
        error = os.system(f'{ect_path} -{ect_compression} -strip -quiet --strict {ect_filters_arg} --mt-deflate "{temp_filename}"')  # Time-consuming op.
        assert not error, f'Error: could not run {ect_path} - Please install from https://github.com/fhanau/Efficient-Compression-Tool or use ect=False'
        with open(temp_filename, 'rb') as f:
            out = f.read()
        try:
            os.remove(temp_filename)
        except PermissionError:
            pass
    elif zop_iterations > 0 and zop_iterations_large > 0:
        out = zopfli.ZopfliPNG(filter_strategies=zop_filters,
                               iterations=zop_iterations,
                               iterations_large=zop_iterations_large
                               ).optimize(png_data)  # Time-consuming op.
    if omit_iend:  # Warning: do this only for PNG files
        out = out[:-12]  # IEND length (4 bytes) + IEND tag (4 bytes) + IEND CRC-32 (4 bytes). Note: do not omit the IDAT zlib Adler-32 or the IDAT CRC-32 as this will break Safari
    if verbose:
        print(f'input_bits={bit_len} pad_bits={pad_bits} width={width} height={height} pad_pixels={pad_pixels} total_pad_bits={length*bitdepth - bit_len} bits={length * bitdepth} bytes={length*bitdepth+7 >> 3} png={len(png_data)} final={len(out)}', file=sys.stderr)
    if filename:
        with open(filename, 'wb') as f:
            f.write(out)
    return out
144 |
145 |
146 | encode = to_png
147 |
148 |
def load_png(filename: str) -> List[int]:
    """Read a PNG file and return its flattened pixel values as a list of ints."""
    flat = png.Reader(filename=filename).read_flat()
    return flat[2].tolist()  # index 2 holds the flat pixel array
151 |
152 |
def get_js_create_image(image_var: str = default_vars.image,
                        bytearray_var: str = default_vars.bytearray
                        ) -> str:
    """Return JS that creates an Image whose src is a Blob URL over the byte array variable."""
    statements = [
        f'{image_var}=new Image',
        f'{image_var}.src=URL.createObjectURL(new Blob([{bytearray_var}]))',
    ]
    return '\n'.join(statements) + '\n'
159 |
160 |
def get_js_image_data(bit_len: int,
                      decoder_script: str = '',
                      bitdepth: int = default_bitdepth,
                      image_var: str = default_vars.image,
                      bitarray_var: str = default_vars.bitarray
                      ) -> str:
    """Return JS that draws the decoded image on a canvas, reads its pixels back
    into a bit array of length bit_len, and then runs decoder_script."""
    assert bitdepth in allowed_bitdepths, f'Error: bitdepth={bitdepth} not in {allowed_bitdepths}'
    # For 24-bit images drop every 4th (alpha) channel byte
    rgb_filter = '.filter((v,i)=>(i+1)%4)' if bitdepth == 24 else ''
    parts = [f'''{image_var}.decode().then(c=>{{
c=document.createElement`canvas`
x=c.getContext`2d`
c=[c.width={image_var}.width,c.height={image_var}.height]
x.drawImage({image_var},0,0)
s=x.getImageData({bitarray_var}=[],0,...c).data{rgb_filter}
''']
    if bitdepth == 1:
        # Applying >>7 to deal with Safari PNG rendering inaccuracy
        parts.append(f'for(j={bit_len};j--;){bitarray_var}[j]=s[j*4]>>7&1\n')
    else:
        # Will break Safari
        byte_len = (bit_len + (bitdepth - bit_len) % bitdepth) // 8
        stride = '*4' if bitdepth <= 8 else ''  # greyscale pixels repeat across RGBA
        parts.append(f'''for(j={byte_len};j--;)for(k=8;k--;){bitarray_var}[j*8+k]=s[j{stride}]>>7-k&1
{bitarray_var}.length={bit_len}
''')
    parts.append(f'{decoder_script.strip()}}})')
    return ''.join(parts)
183 |
184 |
def get_js_image_decoder(bit_len: int,
                         decoder_script: str = '',
                         bitdepth: int = default_bitdepth,
                         image_var: str = default_vars.image,
                         bytearray_var: str = default_vars.bytearray,
                         bitarray_var: str = default_vars.bitarray
                         ) -> str:
    """Return combined JS: image creation followed by pixel-data extraction and decoding."""
    creator = get_js_create_image(image_var, bytearray_var)
    extractor = get_js_image_data(bit_len, decoder_script, bitdepth, image_var, bitarray_var)
    return creator + extractor
194 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # ZTML
4 |
5 | ### Extreme inline text compression for HTML / JS
6 | ### By [Eyal Gruss](https://eyalgruss.com) ([@eyaler](https://twitter.com/eyaler))
7 |
8 | #### Partially made at [Stochastic Labs](http://stochasticlabs.org)
9 |
10 | On-chain media storage can require efficient compression for text embedded inline in HTML / JS.
11 | ZTML is a custom pipeline that generates stand-alone HTML or JS files which embed competitively compressed self-extracting text, with file sizes of 25% - 40% the original.
12 | These file sizes include the decoder code which is a highly golfed 1 - 1.5 kB (including auxiliary indices and tables).
13 | The approach makes sense and is optimized for small texts (tens of kB), but performs quite well also on large texts.
14 | The pipeline includes original low-overhead [binary-to-text alternatives](https://en.wikipedia.org/wiki/Binary-to-text_encoding) to Base64 which are also useful for inline images.
15 |
16 | You can find a very high-level overview in these [slides](misc/reversim2022_slides.pdf) from this [5-minute talk](https://www.youtube.com/watch?v=7rz_MfAIJnY) (in Hebrew) at [Reversim Summit 2022](https://summit2022.reversim.com), and some more technical highlights and discussion in the [encode.su forum thread](https://encode.su/threads/3973-ZTML-Extreme-inline-text-compression-for-HTML-JS).
17 |
18 | ### Benchmark
19 | | | File format | [Micromegas (En)](https://gutenberg.org/files/30123/30123-8.txt) | [War and Peace (En)](https://gutenberg.org/files/2600/2600-0.txt) |
20 | |---------------------------------------------------------------------------------------|---------------|------------------------------------------------------------------|-------------------------------------------------------------------|
21 | | Project Gutenberg plain text utf8 | txt | 63.7 kB | 3.2 MB |
22 | | [paq8px_v206fix1](http://www.mattmahoney.net/dc/text.html#1250) -12RT (excl. decoder) | paq | 13.3 kB (21%) | 575 kB (18%) |
23 | | 7-Zip 22.01 9 Ultra PPMd (excl. decoder) | 7z | 20.8 kB (32%) | 746 kB (23%) |
24 | | 7-Zip 22.01 9 Ultra PPMd (self-extracting) | exe | 232 kB (364%) | 958 kB (29%) |
25 | | Zstandard 1.5.2 -22 --ultra (excl. decoder) | zst | 23.4 kB (37%) | 921 kB (28%) |
26 | | [Roadroller](https://github.com/lifthrasiir/roadroller) 2.1.0 -O2 | js | 26.5 kB (42%) | 1.0 MB (30%) |
27 | | **ZTML Base125** | html (utf8) | 26.4 kB (41%) `mtf=0` | 902 kB (28%) `mtf=80` `ect=True` |
28 | | **ZTML crEnc** | html (cp1252) | 23.5 kB (37%) `mtf=0` | 803 kB (24%) `mtf=80` `ect=True` |
29 |
30 | ### Installation
31 | ```
32 | git clone https://github.com/eyaler/ztml
33 | pip install -r ztml/requirements.txt
34 | ```
35 | For running validations, you also need to have Chrome, Edge and Firefox installed.
36 |
37 | ### Usage
38 | A standard simplified pipeline can be run by calling `ztml()`:
39 | ```
40 | from ztml import ztml
41 | ztml.ztml('Input text that is much longer than this one!', 'output.html')
42 | ```
43 | or running `ztml.py` from the command line (CLI):
44 | ```
45 | python ztml/ztml.py input.txt output.html
46 | ```
47 | See [ztml.py](ztml/ztml.py).
48 | Of course, there is also an accessible [Google Colab](https://colab.research.google.com/github/eyaler/ztml/blob/main/ZTML.ipynb) with a simple GUI. Shortcut: [bit.ly/ztml](https://bit.ly/ztml).
49 |
50 | [crEnc](ztml/crenc.py) gives better compression but requires setting the HTML or JS charset to cp1252.
51 | [Base125](ztml/base125.py) is the second-best option if one must stick with utf8.
52 |
53 | See [example.py](example.py) for a complete example reproducing the ZTML results in the above benchmark,
54 | and [example_image.py](example_image.py) for an example of encoding inline images, by using `image=True` or passing a file with a supported image extension to the CLI.
55 | Outputs of these runs can be accessed at [eyalgruss.com/ztml](https://eyalgruss.com/ztml).
56 | On top of the built-in validations for Chrome, Edge and Firefox, these were also manually tested on macOS Monterey 12.5 Safari 15.6, macOS Ventura 13.2 Safari 16.3 and iOS 16.0, 16.2 Safari.
57 |
58 | A quick-and-dirty way to compress an existing single-page HTML website with embedded inline media is to use `raw=True` or pass a '.html' file to the CLI.
59 |
60 | ### What this is not
61 | 1. Not an HTML inliner
62 | 2. Not an image optimizer
63 | 3. Not a full-fledged JS minifier
64 |
65 | ### Caveats
66 | 1. Files larger than a few MB might not work on [iOS Safari](https://pqina.nl/blog/canvas-area-exceeds-the-maximum-limit) or [macOS Safari 15](https://bugs.webkit.org/show_bug.cgi?id=230855).
67 | 2. This solution favors compression rate over compression and decompression times. Use `mtf=None` for faster decompression of large files.
68 | 3. For [compressing word lists](http://golf.horse) (sorted lexicographically), solutions as [Roadroller](https://lifthrasiir.github.io/roadroller) do a much better job.
69 |
70 | ### Pipeline and source code breakdown
71 | | | Stage | Source | Remarks |
72 | |-----|--------------------------------------------|-------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
73 | | 0 | Pipeline and CLI | [ztml.py](ztml/ztml.py) | |
74 | | 1 | Text normalization (lossy) | [text_prep.py](ztml/text_prep.py) | Reduce whitespace; substitute unicode punctuation |
75 | | 2   | Text condensation (lossless)               | [text_prep.py](ztml/text_prep.py)   | Lowercase with automatic capitalization; substitute common strings such as: the, qu |
76 | | 3 | Burrows–Wheeler + Move-to-front transforms | [bwt_mtf.py](ztml/bwt_mtf.py) | Alphabet pre-sorting; Various MTF variants, including some original ones; Higher MTF settings beneficial for larger texts |
77 | | 4   | Huffman encoding                           | [huffman.py](ztml/huffman.py)       | Canonical encoding with a [codebook-free decoder](https://researchgate.net/publication/3159499_On_the_implementation_of_minimum_redundancy_prefix_codes); Beneficial as a pre-DEFLATE stage |
78 | | 5 | Burrows–Wheeler transform on bits | [bwt_mtf.py](ztml/bwt_mtf.py) | Beneficial for large texts |
79 | | 6 | PNG / DEFLATE compression | [deflate.py](ztml/deflate.py) | ZIP-like compression with native browser decompression; aspect ratio optimized for maximal compatibility and minimal padding; [Zopfli](https://github.com/google/zopfli) or [ECT](https://github.com/fhanau/Efficient-Compression-Tool) optimizations |
80 | | 7 | Binary-to-text encoding | | Embed in template strings; Fix [HTML character overrides](https://html.spec.whatwg.org/multipage/parsing.html#table-charref-overrides); Allow [dynEncode](https://github.com/eshaz/simple-yenc#what-is-dynencode)-like optimal offset |
81 | | 7a | Base125 (utf8) | [base125.py](ztml/base125.py) | An original variant of [Base122](https://blog.kevinalbs.com/base122), with 14.7% overhead |
82 | | 7b | crEnc (cp1252) | [crenc.py](ztml/crenc.py) | An original variant of [yEnc](http://www.yenc.org) with 1.2% overhead; requires single-byte charset |
83 | | 8 | Uglification | [webify.py](ztml/webify.py) | Substitute recurring JS names with short aliases |
84 | | 9 | Validation | [validation.py](ztml/validation.py) | Reproduce input content on Chrome, Edge and Firefox |
85 |
86 | Note: image encoding only uses steps 0 and 7 and later.
87 |
88 | See source files for explanations, experiments and more references.
89 |
90 | ### Projects using this
91 | - [fragium](https://fragium.com)
92 | - [miniBook](https://xem.github.io/miniBook) submission by Eyal Gruss ([source code](misc/minibook.py))
93 | - [WEBZOS](https://wbtz.github.io)
94 |
--------------------------------------------------------------------------------
/ect/License.txt:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/ztml/ztml.py:
--------------------------------------------------------------------------------
1 | """ZTML - Extreme inline text compression for HTML / JS"""
2 |
3 |
4 | import argparse
5 | from base64 import b64encode
6 | import chardet
7 | import os
8 | import sys
9 | from time import time
10 | from typing import AnyStr, Optional, overload, Tuple, Union
11 |
12 | try:
13 | from typing import Literal
14 | except ImportError:
15 | from typing_extensions import Literal
16 |
17 | if not __package__:
18 | import base125, bwt_mtf, crenc, default_vars, deflate, huffman, text_prep, validation, webify
19 | else:
20 | # noinspection PyPackages
21 | from . import base125, bwt_mtf, crenc, default_vars, deflate, huffman, text_prep, validation, webify
22 |
23 |
bin2txt_encodings = ['base64', 'base125', 'crenc']  # Supported binary-to-text encodings; crenc requires a cp1252 charset
default_bin2txt = 'crenc'
26 |
27 |
# Overload: validate=False (the default) — returns only the generated output bytes.
@overload
def ztml(data: AnyStr, filename: str = ..., reduce_whitespace: bool = ...,
         unix_newline: bool = ..., fix_punct: bool = ...,
         remove_bom: bool = ..., caps: str = ..., bwtsort: bool = ...,
         mtf: Optional[int] = ..., bitdepth: int = ..., ect: bool = ...,
         bin2txt: str = ..., element_id: str = ..., raw: bool = ...,
         image: bool = ..., js: bool = ..., uglify: bool = ...,
         replace_quoted: bool = ..., lang: str = ..., mobile: bool = ...,
         title: str = ..., text_var: str = ..., validate: Literal[False] = ...,
         ignore_regex: str = ..., browser: validation.BrowserType = ...,
         timeout: int = ..., verbose: bool = ...) -> bytes: ...
39 |
40 |
# Overload: validate=True — returns (output_bytes, error_flag).
# Fix: `ect` moved to follow `bitdepth`, matching the implementation's positional
# parameter order (it was misplaced after `fix_punct`, so positional calls
# would have been type-checked against the wrong parameters).
@overload
def ztml(data: AnyStr, filename: str = ..., reduce_whitespace: bool = ...,
         unix_newline: bool = ..., fix_punct: bool = ...,
         remove_bom: bool = ..., caps: str = ..., bwtsort: bool = ...,
         mtf: Optional[int] = ..., bitdepth: int = ..., ect: bool = ...,
         bin2txt: str = ..., element_id: str = ..., raw: bool = ...,
         image: bool = ..., js: bool = ..., uglify: bool = ...,
         replace_quoted: bool = ..., lang: str = ..., mobile: bool = ...,
         title: str = ..., text_var: str = ..., validate: Literal[True] = ...,
         ignore_regex: str = ..., browser: validation.BrowserType = ...,
         timeout: int = ..., verbose: bool = ...) -> Tuple[bytes, int]: ...
52 |
53 |
# Overload: validate as a non-literal bool — either return shape is possible.
# Fix: `ect` moved to follow `bitdepth`, matching the implementation's positional
# parameter order (it was misplaced after `fix_punct`, so positional calls
# would have been type-checked against the wrong parameters).
@overload
def ztml(data: AnyStr, filename: str = ..., reduce_whitespace: bool = ...,
         unix_newline: bool = ..., fix_punct: bool = ...,
         remove_bom: bool = ..., caps: str = ..., bwtsort: bool = ...,
         mtf: Optional[int] = ..., bitdepth: int = ..., ect: bool = ...,
         bin2txt: str = ..., element_id: str = ..., raw: bool = ...,
         image: bool = ..., js: bool = ..., uglify: bool = ...,
         replace_quoted: bool = ..., lang: str = ..., mobile: bool = ...,
         title: str = ..., text_var: str = ..., validate: bool = ...,
         ignore_regex: str = ..., browser: validation.BrowserType = ...,
         timeout: int = ..., verbose: bool = ...) -> Union[bytes, Tuple[bytes, int]]: ...
65 |
66 |
def ztml(data,
         filename='',
         reduce_whitespace=False,
         unix_newline=True,
         fix_punct=False,
         remove_bom=True,
         caps=text_prep.default_caps,
         bwtsort=True,
         mtf=bwt_mtf.default_mtf,
         bitdepth=deflate.default_bitdepth,
         ect=False,
         bin2txt=default_bin2txt,
         element_id='',
         raw=False,
         image=False,
         js=False,
         uglify=True,
         replace_quoted=True,
         lang='',
         mobile=False,
         title='',
         text_var=default_vars.text,
         validate=False,
         ignore_regex='',
         browser=validation.default_browser,
         timeout=validation.default_timeout,
         verbose=False
         ):
    """Run the full ZTML pipeline: compress text (or an image) into self-extracting HTML / JS.

    Text inputs go through normalization, condensation, BWT+MTF, Huffman coding,
    a bit-level BWT, PNG/DEFLATE packing, and binary-to-text encoding; image
    inputs skip straight to binary-to-text encoding. The output is wrapped as
    HTML unless js=True, and written to filename if given.

    Returns:
        bytes: the generated output; if validate=True, a tuple
        (output_bytes, error_flag) where error_flag is falsy on success.
    """
    start_time = time()
    assert bin2txt in bin2txt_encodings, f'Error: bin2txt={bin2txt} not in {bin2txt_encodings}'
    # raw mode cannot be combined with element_id or image
    assert not element_id and not image or not raw
    if image:
        assert isinstance(data, bytes)
        image_data = data
    else:
        if isinstance(data, bytes):
            data = data.decode()
        data = text_prep.normalize(data, reduce_whitespace, unix_newline, fix_punct, remove_bom)  # Reduce whitespace
        condensed, string_decoder = text_prep.encode_and_get_js_decoder(data, caps, text_var=text_var)  # Lower case and shorten common strings
        bwt_mtf_text, bwt_mtf_text_decoder = bwt_mtf.encode_and_get_js_decoder(condensed, bwtsort, mtf, add_bwt_func=False, data_var=text_var)  # Burrows-Wheeler + Move-to-front transforms on text. MTF is a time-consuming op.
        huffman_bits, huffman_decoder = huffman.encode_and_get_js_decoder(bwt_mtf_text, text_var=text_var)  # Huffman encode
        bits, bwt_bits_decoder = bwt_mtf.encode_and_get_js_decoder(huffman_bits)  # Burrows-Wheeler transform on bits
        if raw:
            writer = f'document.close(document.write({text_var}))'  # document.close() needed to ensure that any style changes added after a script are applied
        elif element_id:
            writer = f'''document.body.appendChild(document.createElement`pre`).id='{element_id}'
{element_id}.textContent={text_var}'''
        else:
            writer = f"document.body.style.whiteSpace='pre';document.body.textContent={text_var}"
        # Decoders run in reverse order of the encoding stages, ending with the writer
        bits_decoder = f'{bwt_bits_decoder}{huffman_decoder}{bwt_mtf_text_decoder}{string_decoder}{writer}'
        image_data = deflate.to_png(bits, bitdepth, ect=ect)  # PNG encode. Time-consuming op.

    encoding = 'cp1252' if bin2txt == 'crenc' else 'utf8'
    if bin2txt == 'base64':  # This is just for benchmarking and is not recommended
        image_url = b'data:;base64,' + b64encode(image_data)
        if not image:
            image_decoder = f"{default_vars.image}=new Image;{default_vars.image}.src='".encode() + image_url + b"'\n"
            out = image_decoder + deflate.get_js_image_data(len(bits), bits_decoder, bitdepth).encode()
    else:
        if bin2txt == 'base125':
            bytes_decoder = base125.get_js_decoder(image_data)  # Time-consuming op. when offset==None
        else:
            bytes_decoder = crenc.get_js_decoder(image_data)  # Time-consuming op. when offset==None
        if image:
            image_url = f"'+URL.createObjectURL(new Blob([{default_vars.bytearray}]))+'".encode()
        else:
            image_decoder = deflate.get_js_image_decoder(len(bits), bits_decoder, bitdepth)
            # Fix: moved into the else branch — image_decoder is only bound on the
            # non-image path, so running this unconditionally raised NameError
            # whenever image=True with a non-base64 encoding
            out = webify.safe_encode(image_decoder, encoding, get_back_unused=True)

    if image:
        if element_id:
            out = f"""document.body.appendChild(new Image).id='{element_id}'
{element_id}.src='""".encode() + image_url + b"'"
        else:
            out = f"document.body.style.background='url(".encode() + image_url + b")no-repeat'"

    if bin2txt != 'base64':
        out = bytes_decoder + out
    if os.path.splitext(filename)[-1] == '.js':
        js = True
    if js and uglify:
        out = webify.uglify(out, replace_quoted=replace_quoted, encoding=encoding)
    elif not js:
        out = webify.html_wrap(out, aliases=webify.default_aliases * uglify,
                               replace_quoted=replace_quoted, lang=lang,
                               encoding=encoding, mobile=mobile, title=title)
    if filename:
        with open(filename, 'wb') as f:
            f.write(out)
    if verbose:
        print(f'Encoding took {time() - start_time :,.1f} sec.', file=sys.stderr)
    if validate:
        # Validate a JS output by wrapping it as a minimal HTML page first
        file = webify.html_wrap(out, aliases='', encoding=encoding) if js else filename or out
        by = element = ''
        if element_id:
            by = 'id'
            element = element_id
        valid = validation.validate_html(file, data, caps, by, element, raw,
                                         browser, timeout,
                                         content_var=text_var,
                                         ignore_regex=ignore_regex,
                                         verbose=True)
        out = out, not valid
    return out
171 |
172 |
if __name__ == '__main__':
    # Command-line interface: mirrors the keyword arguments of ztml() as flags.
    parser = argparse.ArgumentParser()
    parser.add_argument('input_filename')
    parser.add_argument('output_filename', nargs='?', default='')
    parser.add_argument('--input_encoding', nargs='?', const='', default='', help='Auto detect by default')
    parser.add_argument('--reduce_whitespace', action='store_true')
    parser.add_argument('--skip_unix_newline', action='store_true')
    parser.add_argument('--fix_punct', action='store_true')
    parser.add_argument('--skip_remove_bom', action='store_true')
    parser.add_argument('--caps', type=str.lower, choices=text_prep.caps_modes, default=text_prep.default_caps)
    parser.add_argument('--skip_bwtsort', action='store_true')
    parser.add_argument('--mtf', type=lambda x: None if x.lower() == 'none' else int(x), choices=bwt_mtf.mtf_variants,
                        default=bwt_mtf.default_mtf)
    parser.add_argument('--bitdepth', type=int, choices=deflate.allowed_bitdepths, default=deflate.default_bitdepth, help='Warning: 8-bit and 24-bit do not work on Safari')
    parser.add_argument('--ect', action='store_true')
    parser.add_argument('--bin2txt', type=str.lower, choices=bin2txt_encodings, default=default_bin2txt)
    parser.add_argument('--element_id', nargs='?', const='', default='', help='Warning: must be a valid JS variable name, and watch out for collisions with HTML namespace')
    parser.add_argument('--raw', action='store_true', help='Use document.write() to overwrite the document with the raw text. May also be implied from input_filename extension')
    parser.add_argument('--image', action='store_true', help='May also be implied from input_filename extension')
    parser.add_argument('--js', action='store_true', help='May also be implied from output_filename extension')
    parser.add_argument('--skip_uglify', action='store_true')
    parser.add_argument('--skip_replace_quoted', action='store_true')
    parser.add_argument('--lang', nargs='?', const='', default='')
    parser.add_argument('--mobile', action='store_true')
    parser.add_argument('--title', nargs='?', const='', default='')
    parser.add_argument('--text_var', default=default_vars.text)
    parser.add_argument('--validate', action='store_true')
    parser.add_argument('--ignore_regex', nargs='?', const='', default='')
    parser.add_argument('--browser', type=str.lower, choices=list(validation.drivers), default=validation.default_browser)
    parser.add_argument('--timeout', type=int, default=validation.default_timeout, help='seconds')
    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args(args=None if sys.argv[1:] else ['--help'])  # With no arguments at all, print help and exit
    # Infer raw / image modes from the input file extension when not given explicitly
    ext = os.path.splitext(args.input_filename)[-1][1:].lower()
    if ext in webify.raw_extensions:
        args.raw = True
    elif ext in webify.image_extensions:
        args.image = True
    with open(args.input_filename, 'rb') as f:
        data = f.read()
    if not args.image:
        # Text input: decode bytes to str, auto-detecting the encoding unless one was given
        if args.input_encoding:
            data = data.decode(args.input_encoding)
        else:
            encoding = chardet.detect(data)['encoding'] or 'utf8'
            try:
                data = data.decode(encoding)
            except UnicodeDecodeError:
                if encoding.replace('-', '') == 'utf8':
                    raise
                # NOTE(review): when a non-utf8 guess fails to decode, data silently
                # stays bytes here -- confirm ztml() handles bytes for non-image input
    out = ztml(data, args.output_filename, args.reduce_whitespace,
               not args.skip_unix_newline, args.fix_punct,
               not args.skip_remove_bom, args.caps, not args.skip_bwtsort,
               args.mtf, args.bitdepth, args.ect, args.bin2txt,
               args.element_id, args.raw, args.image, args.js,
               not args.skip_uglify, not args.skip_replace_quoted, args.lang,
               args.mobile, args.title, args.text_var, args.validate,
               args.ignore_regex, args.browser, args.timeout, args.verbose)
    result = False
    if args.validate:
        out, result = out  # With validation, ztml() returns (output, error_flag)
    if not args.output_filename:
        sys.stdout.buffer.write(out)  # No output file: dump the page to stdout
    sys.exit(int(result))  # Non-zero exit status signals a validation failure
236 |
--------------------------------------------------------------------------------
/ztml/bwt_mtf.py:
--------------------------------------------------------------------------------
1 | """Burrows-Wheeler and Move-to-front transforms
2 |
3 | Applies pre-BWT alphabet vowel sorting by default to concentrate the vowels together.
4 | BWT Implementation follows pydivsufsort tests, to obviate adding an EOF token.
5 | MTF includes original variants (50-90) inspired by Fenwick's Sticky MTF,
6 | and larger texts show benefit from higher MTF settings.
7 | Additional BWT on bits (after entropy coding and before DEFLATE) was found beneficial for large texts.
8 |
9 | Other experiments:
10 | Run-length encoding for spaces before BWT gave worse overall results.
Run-length encoding after text BWT, and MTF over run characters (just this part of Niemi&Teuhola) gave worse overall results.
12 | Run-length encoding for zeros (ZLE) after MTF gave worse overall results.
13 |
14 | References:
15 | https://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf
16 | https://github.com/louisabraham/pydivsufsort/blob/master/tests/reference.py
17 | https://www.cs.auckland.ac.nz/~peter-f/FTPfiles/2002%20VL%20coding%20BWT.pdf (Fenwick)
18 | https://www.juergen-abel.info/files/preprints/preprint_post_bwt_stages.pdf
19 | https://www.juergen-abel.info/files/preprints/preprint_universal_text_preprocessing.pdf
20 | https://home.uncg.edu/cmp/faculty/srtate/papers/bwtsort.pdf
21 | https://www.math.uni-bielefeld.de/sfb343/preprints/pr99133.ps.gz
https://onlinelibrary.wiley.com/doi/full/10.1002/spe.2873 (Niemi&Teuhola)
23 | http://groups.di.unipi.it/~gulli/tutorial/burrows_wheeler.pdf (note: has errors afaict)
24 | """
25 |
26 |
27 | from typing import Iterable, List, Optional, overload, Tuple, Union
28 |
29 | import numpy as np
30 | from pydivsufsort import divsufsort
31 |
32 | if not __package__:
33 | import default_vars, webify
34 | else:
35 | # noinspection PyPackages
36 | from . import default_vars, webify
37 |
38 |
# Pre-BWT alphabet reordering: vowels and some rare letters are swapped so
# the vowels are concentrated together (see the module docstring).
order1 = 'AOUIEVWXYZaouievwxyz'
order2 = 'VWXYZAOUIEvwxyzaouie'
# Supported move-to-front variants: None disables MTF, 0 is classic MTF,
# 1/2 are sticky variants, 50-90 move the symbol only part of the way front.
mtf_variants = [None, 0, 1, 2, 50, 52, 60, 70, 80, 90]
default_mtf = 0


bwtsort_table = str.maketrans(order1, order2)
reverse_bwtsort_table = str.maketrans(order2, order1)
surrogate_lo = 55296
surrogate_hi = 57343
max_unicode = 1114111
# Ranks at or above surrogate_lo are shifted past the surrogate range by the
# encoder, so the largest encodable symbol must leave room for that shift
max_ord_for_mtf = max_unicode - (surrogate_hi-surrogate_lo) - 1


def mtf_rank(mtf: int, rank: int, prev: int) -> int:
    """Return the list position a symbol of the given rank is moved to.

    mtf selects the variant (see mtf_variants), rank is the symbol's current
    position, and prev is the previous symbol's rank (used by the sticky
    variants 2 and 52). May return a bool, which is a valid int position.
    """
    assert mtf is not None
    assert mtf in mtf_variants, f'Error: mtf={mtf} not in {mtf_variants}'
    if mtf == 0:
        return 0
    if mtf == 1:
        return rank > 1
    if mtf == 2:
        return rank > 1 or rank == 1 and not prev
    if mtf == 50:
        return rank // 2
    if mtf == 52:
        return rank // 2 if rank > 1 else rank == 1 and not prev
    return int(rank*(mtf/100) + 0.5)  # Round in the same way as JS (do not round half to even)
69 |
70 |
def mtf_encode(data: Iterable[int],
               mtf: int = default_mtf,
               validate: bool = True
               ) -> List[int]:
    """Move-to-front encode a sequence of non-negative ints (code points).

    Ranks that fall into the Unicode surrogate range are shifted above it so
    the output remains storable as valid characters. When validate is true,
    the result is round-tripped through mtf_decode and asserted equal to the
    input.

    Bug fix: the signature previously read `mtf: int == default_mtf`, a bogus
    annotation expression that left mtf without a default value.
    """
    data = list(data)
    max_data = max(data, default=-1)
    assert max_data <= max_ord_for_mtf, (max_data, max_ord_for_mtf)
    ranks = list(range(max_data + 1))
    out = []
    prev = 1
    for i in data:
        rank = ranks.index(i)  # Time-consuming op.
        ranks.pop(rank)
        ranks.insert(mtf_rank(mtf, rank, prev), i)
        prev = rank
        if rank >= surrogate_lo:
            rank += surrogate_hi - surrogate_lo + 1  # Skip over the surrogate range
        out.append(rank)
    if validate:
        # data is always a list here, so it can be compared to the decode directly
        decoded = mtf_decode(out, mtf)
        assert decoded == data, (len(decoded), len(data), decoded[:30], data[:30])
    return out
95 |
96 |
def mtf_decode(data: Iterable[int], mtf: int = default_mtf) -> List[int]:
    """Invert mtf_encode: map MTF ranks back to the original code points.

    Ranks above the Unicode surrogate range are shifted back down before
    lookup, undoing the skip applied by mtf_encode.

    Bug fix: the signature previously read `mtf: int == default_mtf`, a bogus
    annotation expression that left mtf without a default value.
    """
    out = list(data)
    ranks = list(range(max(out, default=-1) + 1))
    prev = 1
    for i, rank in enumerate(out):
        if rank > surrogate_lo:
            rank -= surrogate_hi - surrogate_lo + 1  # Undo the surrogate-range skip
        out[i] = ranks.pop(rank)
        ranks.insert(mtf_rank(mtf, rank, prev), out[i])
        prev = rank
    return out
108 |
109 |
@overload
def encode(data: str, bwtsort: bool = ..., mtf: Optional[int] = ...,
           validate: bool = ...) -> Tuple[str, int]: ...


@overload
def encode(data: Iterable[int], bwtsort: bool = ..., mtf: Optional[int] = ...,
           validate: bool = ...) -> Tuple[List[int], int]: ...


def encode(data, bwtsort=True, mtf=default_mtf, validate=True):
    """Apply optional vowel re-sort, then BWT, then optional MTF.

    Returns (transformed data, BWT index); the index is required by decode()
    to invert the transform. The output type matches the input type
    (str in -> str out, ints in -> list of ints out).
    """
    is_str = isinstance(data, str)
    if not is_str:
        data = list(data)
    out = list(data)
    if bwtsort:
        # Reorder the alphabet before the BWT to concentrate the vowels together
        if not is_str:
            out = [chr(i) for i in out]
        out = ''.join(out).translate(bwtsort_table)
    if is_str or bwtsort:
        out = [ord(c) for c in out]
    # Suffix-array based BWT with no EOF sentinel, following pydivsufsort tests
    sa = divsufsort(np.array(out)) if out else []
    out = out[-1:] + [out[i - 1] for i in sa if i]
    index = list(sa).index(0) if out else 0
    if mtf is not None:
        out = mtf_encode(out, mtf, validate)  # Time-consuming op.
    if is_str:
        out = ''.join(chr(i) for i in out)
    if validate:
        # Round-trip check; materialize generator inputs to allow comparison
        decoded = decode(out, index, bwtsort, mtf)
        if not hasattr(data, '__getitem__'):
            data = type(decoded)(data)
        assert decoded == data, (len(decoded), len(data), decoded[:30], data[:30])
    return out, index
144 |
145 |
@overload
def decode(data: str, index: int, bwtsort: bool = ...,
           mtf: Optional[int] = ...) -> str: ...


@overload
def decode(data: Iterable[int], index: int, bwtsort: bool = ...,
           mtf: Optional[int] = ...) -> List[int]: ...


def decode(data, index, bwtsort=True, mtf=default_mtf):
    """Invert encode(): undo MTF, then the BWT (using index), then the vowel
    re-sort. The output type matches the input type."""
    is_str = isinstance(data, str)
    out = list(data)
    if mtf is not None:
        if is_str:
            out = [ord(c) for c in out]
        out = mtf_decode(out, mtf)
        if is_str:
            out = [chr(i) for i in out]
    # Inverse BWT: pair each symbol with its adjusted position, sort to get the
    # first column, then follow the permutation chain starting from index
    ordered = [(c, i - (i <= index)) for i, c in enumerate(out)]
    ordered.sort()
    for i in range(len(out)):
        out[i], index = ordered[index]
    if bwtsort:
        if not is_str:
            out = [chr(i) for i in out]
        out = ''.join(out).translate(reverse_bwtsort_table)
        if not is_str:
            out = [ord(c) for c in out]
    elif is_str:
        out = ''.join(out)
    return out
178 |
179 |
def get_js_decoder(data: Union[str, Iterable[int]],
                   index: int,
                   bwtsort: bool = True,
                   mtf: Optional[int] = default_mtf,
                   add_bwt_func: bool = True,
                   bwt_func_var: str = default_vars.bwt_func,
                   data_var: str = ''
                   ) -> str:
    """Build the JS snippet that inverts MTF, BWT and the vowel re-sort.

    data and index are the ORIGINAL (pre-encode) data and the BWT index
    returned by encode(); data is only inspected to pick variable names,
    detect high code points and build the dynamic re-sort alphabet.
    """
    assert mtf in mtf_variants, f'Error: mtf={mtf} not in {mtf_variants}'
    is_str = isinstance(data, str)
    if not is_str:
        data = list(data)
    if not data_var:
        data_var = default_vars.text if is_str else default_vars.bitarray
    # For string data, first turn the decoded JS string into code points
    js_decoder = f'{data_var}=[...{data_var}].map(c=>c.codePointAt())\n' * is_str
    if mtf is not None:
        # Each variant re-inserts the popped symbol where mtf_rank() would
        if mtf == 0:
            mtf_op = f'd.unshift({data_var}[j++]=d.splice(k,1)[0])'
        elif mtf == 1:
            mtf_op = f'd.splice(k>1,0,{data_var}[j++]=d.splice(k,1)[0])'
        elif mtf == 2:
            js_decoder += 'n=1\n'
            mtf_op = f'd.splice(k>!!n,0,{data_var}[j++]=d.splice(k,1)[0]),n=k'
        elif mtf == 50:
            mtf_op = f'd.splice(k/2,0,{data_var}[j++]=d.splice(k,1)[0])'
        elif mtf == 52:
            js_decoder += 'n=1\n'
            mtf_op = f'd.splice(k>1?k/2:k>n,0,{data_var}[j++]=d.splice(k,1)[0]),n=k'
        else:
            mtf_op = f"d.splice(k*{str(mtf / 100).lstrip('0')}+.5,0,{data_var}[j++]=d.splice(k,1)[0])"
        if is_str and any(ord(c) > surrogate_lo for c in data):
            # Undo the surrogate-range skip applied by mtf_encode()
            mtf_op = f'k-={surrogate_hi - surrogate_lo + 1}*(k>{surrogate_lo}),{mtf_op}'
        # Use reduce instead of Math.max(...array) due to argument limit: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Function/apply#using_apply_and_built-in_functions
        js_decoder += f'''d=[...Array({data_var}.reduce((a,b)=>a>b?a:b+1,0)).keys()]
j=0
for(k of {data_var}){mtf_op}
'''
    if add_bwt_func:
        js_decoder += f"{bwt_func_var}=(d,k)=>{{s=d.map((c,i)=>[c,i-(i<=k)]).sort((a,b)=>a[0]-b[0]);for(j in s)[d[j],k]=s[k]}}\n"  # Sort on code points to respect order of char above \uffff
    js_decoder += f'{bwt_func_var}({data_var},{index})\n'
    dyn_orders = None
    if bwtsort:
        # Emit the reverse vowel-sort mapping restricted to symbols that occur
        symbols = set(data)
        if not is_str:
            symbols = {chr(i) for i in symbols}
        dyn_orders = list(zip(*[(c1, c2) for c1, c2 in zip(order1, order2) if c1 in symbols]))
    if dyn_orders:
        dyn_order1, dyn_order2 = dyn_orders
        dyn_order1 = webify.escape(''.join(dyn_order1))
        dyn_order2 = webify.escape(''.join(dyn_order2))
        js_decoder += f'''d={{}};[...`{dyn_order2}`].map((c,i)=>d[c]=[...`{dyn_order1}`][i])
{data_var}={data_var}.map(i=>{'d[c=String.fromCodePoint(i)]||c).join``' if is_str else '(d[c=String.fromCodePoint(i)]||c).codePointAt())'}
'''
    if is_str and not dyn_orders:
        # Don't use String.fromCodePoint(...array) due to argument limit: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Function/apply#using_apply_and_built-in_functions
        js_decoder += f'{data_var}={data_var}.map(i=>String.fromCodePoint(i)).join``\n'
    return js_decoder
237 |
238 |
@overload
def encode_and_get_js_decoder(data: str,
                              bwtsort: bool = ...,
                              mtf: Optional[int] = ...,
                              add_bwt_func: bool = ...,
                              bwt_func_var: str = ...,
                              data_var: str = ...,
                              validate: bool = ...
                              ) -> Tuple[str, str]: ...


@overload
def encode_and_get_js_decoder(data: Iterable[int],
                              bwtsort: bool = ...,
                              mtf: Optional[int] = ...,
                              add_bwt_func: bool = ...,
                              bwt_func_var: str = ...,
                              data_var: str = ...,
                              validate: bool = ...
                              ) -> Tuple[List[int], str]: ...


def encode_and_get_js_decoder(data,
                              bwtsort=True,
                              mtf=default_mtf,
                              add_bwt_func=True,
                              bwt_func_var=default_vars.bwt_func,
                              data_var='',
                              validate=True
                              ):
    """Encode data and return it together with the matching JS decoder snippet."""
    text_input = isinstance(data, str)
    if not text_input:
        data = list(data)
    if not data_var:
        data_var = default_vars.text if text_input else default_vars.bitarray
    if data_var == default_vars.bitarray:
        # Bit arrays skip the text-oriented transforms entirely
        bwtsort = False
        mtf = None
    encoded, index = encode(data, bwtsort, mtf, validate)
    js_decoder = get_js_decoder(data, index, bwtsort, mtf, add_bwt_func,
                                bwt_func_var, data_var)
    return encoded, js_decoder
279 |
280 |
def test() -> None:
    """Self-test: known MTF vectors plus exhaustive small BWT round trips."""
    sample = [3, 2, 2, 2, 3, 2, 2, 3, 2, 2]
    mtf0 = mtf_encode(sample[:], mtf=0, validate=True)
    assert mtf0 == [3, 3, 0, 0, 1, 1, 0, 1, 1, 0], mtf0
    mtf1 = mtf_encode(sample[:], mtf=1, validate=True)
    assert mtf1 == [3, 3, 1, 0, 2, 0, 0, 1, 1, 0], mtf1
    mtf2 = mtf_encode(sample[:], mtf=2, validate=True)
    assert mtf2 == [3, 3, 1, 0, 2, 0, 0, 1, 0, 0], mtf2

    # encode() validates its own round trip, so just run every combination
    symbols = ['', '\0', '\1', 'a', 'b', 'א', 'ב', '\ue000', '\uffff', '\U00010000']
    for x in symbols:
        for y in symbols:
            for z in symbols:
                text = x + y + z
                for mtf in mtf_variants:
                    for bwtsort in [False, True]:
                        encode(text, bwtsort=bwtsort, mtf=mtf, validate=True)

    symbols = ['', '0', '1', '97', '255']
    for x in symbols:
        for y in symbols:
            for z in symbols:
                ints = [int(c) for c in x + y + z]
                for mtf in mtf_variants:
                    for bwtsort in [False, True]:
                        encode(ints, bwtsort=bwtsort, mtf=mtf, validate=True)
305 |
306 |
if __name__ == '__main__':
    # Run the module self-test when executed directly
    test()
309 |
--------------------------------------------------------------------------------
/ztml/validation.py:
--------------------------------------------------------------------------------
1 | from base64 import b64decode
2 | from contextlib import ExitStack, redirect_stdout
3 | import os
4 | import sys
5 | from tempfile import NamedTemporaryFile
6 | from time import sleep, time
7 | from typing import AnyStr, Iterable, Mapping, Optional, overload, TypeVar, Union
8 |
9 | try:
10 | from typing import Literal
11 | except ImportError:
12 | from typing_extensions import Literal
13 |
14 | import regex
15 | from selenium.common.exceptions import JavascriptException, TimeoutException, WebDriverException
16 | from selenium.webdriver import Chrome, Edge, Firefox, chrome, edge, firefox
17 | from selenium.webdriver.common.by import By
18 | from selenium.webdriver.remote.webdriver import WebDriver
19 | from selenium.webdriver.support.ui import WebDriverWait
20 | from webdriver_manager.chrome import ChromeDriverManager
21 | from webdriver_manager.microsoft import EdgeChromiumDriverManager
22 | from webdriver_manager.firefox import GeckoDriverManager
23 |
24 | if not __package__:
25 | import default_vars, text_prep, webify
26 | else:
27 | # noinspection PyPackages
28 | from . import default_vars, text_prep, webify
29 |
30 |
default_browser = 'chrome'
default_timeout = 60  # Seconds to wait for the page to render
default_by = By.TAG_NAME
default_element = 'body'
# Cache of installed webdriver executable paths, one "browser,path" per line
webdriver_paths_filename = 'webdriver_paths.txt'


os.environ['WDM_LOG'] = '0'  # Silence webdriver_manager logging
# Per-browser triple: WebDriver class, selenium submodule, driver-manager class
drivers = dict(chrome=[Chrome, chrome, ChromeDriverManager],
               edge=[Edge, edge, EdgeChromiumDriverManager],
               firefox=[Firefox, firefox, GeckoDriverManager]
               )
BrowserType = Union[str, WebDriver]  # A browser name or an already-open driver
# Driver-start errors containing these substrings are not retried
critical_error_strings = ['executable needs to be', 'unable to find binary', 'unexpectedly']


# File arguments are either a filename (str) or raw HTML bytes
FilenameOrBytes = TypeVar('FilenameOrBytes', str, bytes)
48 |
49 |
def full_path(filename: str) -> str:
    """Return a file:// URL for filename, resolved to an absolute path."""
    resolved = os.path.realpath(filename).replace(os.sep, '/')
    return 'file:///' + resolved
52 |
53 |
def get_browser(browser: BrowserType,
                stack: Optional[ExitStack] = None
                ) -> WebDriver:
    """Return a headless WebDriver for the named browser.

    An already-open WebDriver instance is returned as-is. Driver executables
    are installed via webdriver_manager and their paths appended to
    webdriver_paths_filename, which serves as a fallback when installation
    fails (e.g. offline). If stack is given, the driver is registered on it
    so it gets closed when the stack exits.
    """
    if isinstance(browser, WebDriver):
        return browser
    options = drivers[browser][1].options.Options()
    options.headless = True  # NOTE(review): removed in newer Selenium versions -- confirm the pinned version supports it
    options.add_argument('--no-sandbox')
    if hasattr(options, 'add_experimental_option'):
        options.add_experimental_option('excludeSwitches', ['enable-logging'])  # Quiet Chromium console logging
    try:
        with redirect_stdout(None):
            service = drivers[browser][2]().install()
        # Remember the installed driver path for later offline reuse
        folder = os.path.dirname(webdriver_paths_filename)
        if folder:
            os.makedirs(folder, exist_ok=True)
        with open(webdriver_paths_filename, 'a', encoding='utf8') as f:
            f.write(f'{browser},{service}\n')
    except Exception:
        # Installation failed: fall back to the most recently cached path
        with open(webdriver_paths_filename, encoding='utf8') as f:
            for line in reversed(f.read().splitlines()):
                b, service = line.split(',', 1)
                if b == browser:
                    break
    while isinstance(browser, str):
        # Keep retrying driver startup unless the error looks unrecoverable
        try:
            browser = drivers[browser][0](service=drivers[browser][1].service.Service(service, log_path=os.devnull), options=options)
        except WebDriverException as e:
            if any(s in e.msg for s in critical_error_strings):
                raise
            print(e, file=sys.stderr)
            sleep(30)
    if stack:
        browser = stack.enter_context(browser)
    return browser
89 |
90 |
@overload
def render_html(file: FilenameOrBytes, by: str = ..., element: str = ...,
                raw: bool = ..., image: Literal[True] = ...,
                browser: str = ..., timeout: int = ..., content_var: str = ...
                ) -> Optional[bytes]: ...


@overload
def render_html(file: FilenameOrBytes, by: str = ..., element: str = ...,
                raw: bool = ..., image: Literal[False] = ...,
                browser: str = ..., timeout: int = ..., content_var: str = ...
                ) -> Optional[str]: ...


@overload
def render_html(file: FilenameOrBytes, by: str = ..., element: str = ...,
                raw: bool = ..., image: bool = ...,
                browser: str = ..., timeout: int = ..., content_var: str = ...
                ) -> Optional[AnyStr]: ...


def render_html(file,
                by=default_by,
                element=default_element,
                raw=False,
                image=False,
                browser=default_browser,
                timeout=default_timeout,
                content_var=''
                ):
    """Render an HTML file (by name) or HTML bytes in a headless browser.

    Returns the element's text, the decoded image bytes when image is true,
    or None if rendering timed out. In raw mode the text is read from the JS
    variable content_var (default: default_vars.text) rather than the DOM.
    """
    assert not raw or not image
    if not by:
        by = default_by
    if not element:
        element = default_element
    with ExitStack() as stack:
        browser = get_browser(browser, stack)
        if isinstance(file, str):
            filename = file
        else:
            # Bytes input: write a temp file so the browser can load it via a file:// URL
            with NamedTemporaryFile(suffix='.html', delete=False) as f:  # See https://github.com/python/cpython/issues/88221
                f.write(file)
                filename = f.name
        browser.get(full_path(filename))
        if isinstance(file, bytes):
            try:
                os.remove(filename)
            except PermissionError:
                pass
        try:
            wait = WebDriverWait(browser, timeout)
            if image:
                # The image may be delivered as a CSS background or an element src
                if by == By.TAG_NAME and element == 'body':
                    data_url = wait.until(lambda x:
                                          regex.sub('^none$', '',
                                                    x.find_element(by, element)
                                                    .value_of_css_property('background-image')))
                else:
                    data_url = wait.until(lambda x:
                                          x.find_element(by, element)
                                          .get_property('src'))
                assert isinstance(data_url, str), type(data_url)
                if ';base64,' in data_url:
                    return b64decode(data_url.split(';base64,', 1)[1].split('"', 1)[0], validate=True)
                # No data URL: read the decoded bytes straight from the JS variable
                image_data = browser.execute_script(f'return {content_var or default_vars.bytearray}')
                if isinstance(image_data, dict):  # Needed for Firefox, see: https://github.com/SeleniumHQ/selenium/issues/11070
                    image_data = [v for k, v in sorted(image_data.items(), key=lambda x: int(x[0]))]
                return bytes(image_data)
            if raw:
                sleep(0.1)
                get_text = lambda x: x.execute_script(f'return {content_var or default_vars.text}')
            else:
                get_text = lambda x: x.find_element(by, element).get_property('innerText')
            try:
                text = wait.until(get_text)
            except JavascriptException:
                # The decoder script may not have defined the variable yet; retry once
                sleep(1)
                text = wait.until(get_text)
            assert isinstance(text, str), type(text)
            return text
        except TimeoutException:
            return None
        except Exception:
            print(f'\nError: {browser.name} failed on {full_path(filename)}', file=sys.stderr)
            raise
176 |
177 |
def find_first_diff(rendered: AnyStr, data: AnyStr, verbose: bool = True) -> int:
    """Return the index of the first mismatch between rendered and data.

    If one argument is a prefix of the other (or they are equal), the length
    of the common region is returned. With verbose, the mismatch position
    and surrounding context are printed to stderr.
    """
    limit = min(len(rendered), len(data))
    i = next((k for k in range(limit) if rendered[k] != data[k]), limit)
    if verbose:
        print(f'\nFirst difference found at {i} / {len(rendered)}', file=sys.stderr)
        print(f'Original: {data[max(i - 30, 0) : i]!r} -> {data[i : i + 50]!r}', file=sys.stderr)
        print(f'Rendered: {rendered[max(i - 30, 0) : i]!r} -> {rendered[i : i + 50]!r}\n', file=sys.stderr)
    return i
190 |
191 |
def validate_html(file: FilenameOrBytes,  # Don't use AnyStr as it does not have to be the same type as data
                  data: AnyStr,
                  caps: str = text_prep.default_caps,
                  by: str = default_by,
                  element: str = default_element,
                  raw: bool = False,
                  browser: BrowserType = default_browser,
                  timeout: int = default_timeout,
                  unicode_A: int = 0,
                  ignore_regex: str = '',
                  content_var: str = '',
                  verbose: bool = True
                  ) -> Optional[bool]:
    """Render file and compare the result against the expected data.

    Returns True on match, False on mismatch (printing the first diff when
    verbose), or None if rendering timed out. data is first normalized
    according to caps; bytes data implies an image comparison.
    """
    image = isinstance(data, bytes)
    assert data, 'Error: Cannot validate against empty data'
    rendered = render_html(file, by, element, raw, image, browser, timeout, content_var)
    if rendered is None:
        return None
    if not image:
        # Apply the same capitalization transform the page itself performs
        if caps == 'lower':
            data = data.lower()
        elif caps == 'upper':
            data = data.upper()
        elif caps == 'simple':
            data = text_prep.decode_caps_simple(data.lower())
        if not raw:
            if unicode_A:
                # Map chars from a custom alphabet starting at unicode_A back to
                # ASCII letters, skipping the gap between 'Z' and 'a'
                rendered = regex.sub(r'[^\p{Z}\p{C}]', lambda m: chr(ord(m[0]) - unicode_A + 65 + (6 if ord(m[0]) - unicode_A + 65 > 90 else 0)), rendered)
            rendered = regex.sub(ignore_regex, '', rendered)
    if rendered == data:
        return True
    if verbose:
        find_first_diff(rendered, data)
    return False
226 |
227 |
def validate_files(filenames: Mapping[str, str],
                   data: Optional[AnyStr] = None,
                   reduce_whitespace: bool = False,
                   unix_newline: bool = True,
                   fix_punct: bool = False,
                   remove_bom: bool = True,
                   caps: str = text_prep.default_caps,
                   by: str = default_by,
                   element: str = default_element,
                   raw: bool = False,
                   image: bool = False,
                   browsers: Optional[Union[BrowserType, Iterable[BrowserType]]] = None,
                   timeout: int = default_timeout,
                   unicode_A: int = 0,
                   ignore_regex: str = '',
                   content_var: str = '',
                   validate: bool = True,
                   verbose: bool = True
                   ) -> bool:
    """Report sizes of the given {label: filename} outputs and validate the
    HTML ones against the reference data in every requested browser.

    The 'raw' entry (or the data argument) supplies the reference content and
    the baseline size; 'base64_html' sets the no-overhead size baseline.
    Returns True if any browser timed out; a content mismatch raises
    AssertionError.
    """
    error = False
    if browsers is None:
        browsers = list(drivers)
    elif isinstance(browsers, (str, WebDriver)):
        browsers = [browsers]
    with ExitStack() as stack:
        if validate:
            browsers = [get_browser(browser, stack) for browser in browsers]
        raw_size = None
        no_overhead_size = None
        # Process 'raw' first (reference data), then 'base64_html' (baseline)
        for label, filename in sorted(filenames.items(), key=lambda x: (x[0] != 'raw', x[0] != 'base64_html')):
            ext = os.path.splitext(filename)[-1][1:].lower()
            if raw_size is not None and ext != 'html':
                continue
            if data is None or label == 'raw':
                if ext in webify.raw_extensions:
                    raw = True
                elif ext in webify.image_extensions:
                    image = True
            assert not image or (not raw and not isinstance(data, str))
            if data is None:
                with open(filename, 'rb') as f:
                    data = f.read()
            if raw_size is None:
                raw_size = len(data.encode() if isinstance(data, str) else data)
            if not image and isinstance(data, bytes):
                data = text_prep.normalize(data.decode(), reduce_whitespace, unix_newline, fix_punct, remove_bom)  # Assumes raw text file is utf8. Otherwise, pass it as a data argument

            if verbose:
                # Collect size statistics: compression ratio, base64 overhead, code size
                size = os.path.getsize(filename)
                if label == 'base64_html':
                    no_overhead_size = size * 3 / 4
                stats = []
                if raw_size:
                    stats.append(f'ratio={round(size / raw_size * 100, 1)}%')
                if no_overhead_size:
                    stats.append(f'overhead={round((size/no_overhead_size-1) * 100, 1)}%')
                if ext == 'html' and label not in ['raw', 'base64_html']:
                    # Code size = file size minus the longest backtick-literal payload
                    # NOTE(review): assumes at least one backtick literal matches --
                    # the split would raise IndexError otherwise; confirm
                    with open(filename, 'rb') as f:
                        html = f.read()
                    matches = regex.findall(webify.literals_regex.encode(), html)
                    payload = max(matches, key=len, default=b'').split(b'`', 1)[1].rsplit(b'`', 1)[0]
                    html = html.replace(payload, b'')
                    stats.append(f'code: {len(html):,} B = {round(len(html) / 1024, 1):,} kB')
                stats = ' '.join(stats)
                if stats:
                    stats = f' ({stats})'
                mb = size / 1024 ** 2
                if mb >= 0.1:
                    stats = f' = {round(mb, 1):,} MB{stats}'
                kb = size / 1024
                if kb >= 0.1:
                    stats = f' = {round(kb, 1):,} kB{stats}'
                print(f"{full_path(filename)} {size:,} B{stats}", end='' if validate and ext == 'html' and label != 'raw' else None, file=sys.stderr)

            if validate and ext == 'html' and label != 'raw':
                for i, browser in enumerate(browsers):
                    start_time = time()
                    valid = validate_html(filename, data, caps, by, element,
                                          raw, browser, timeout, unicode_A,
                                          ignore_regex, content_var, verbose)
                    # A mismatch (False) is fatal; a timeout (None) only sets error
                    assert valid is not False, filename
                    if not valid:
                        error = True
                    if verbose:
                        if i == 0:
                            print(f' rendering secs:', end='', file=sys.stderr)
                        print(f' {browser.name}=' + (f'{time() - start_time :.1f}' if valid else f'{timeout}(TIMEOUT)'), end='', file=sys.stderr)
                if verbose:
                    print(file=sys.stderr)
        if verbose and validate:
            print('Note: above rendering times from Selenium are much longer than actual browser rendering.', file=sys.stderr)
    return error
320 |
--------------------------------------------------------------------------------