├── ztml ├── __init__.py ├── default_vars.py ├── crenc.py ├── huffman.py ├── base125.py ├── tests.py ├── webify.py ├── text_prep.py ├── deflate.py ├── ztml.py ├── bwt_mtf.py └── validation.py ├── .github └── FUNDING.yml ├── ect ├── ect ├── ect.exe ├── ect-ubuntu ├── Build_ECT_Ububtu.ipynb └── License.txt ├── misc ├── reversim2022_slides.pdf ├── minibook.py ├── run_all.bat ├── .htaccess ├── size_checker.py └── example_html.py ├── requirements.txt ├── LICENSE ├── example.py ├── example_image.py ├── TODO.md ├── ZTML.ipynb └── README.md /ztml/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: eyaler 2 | -------------------------------------------------------------------------------- /ect/ect: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eyaler/ztml/HEAD/ect/ect -------------------------------------------------------------------------------- /ect/ect.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eyaler/ztml/HEAD/ect/ect.exe -------------------------------------------------------------------------------- /ect/ect-ubuntu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eyaler/ztml/HEAD/ect/ect-ubuntu -------------------------------------------------------------------------------- /misc/reversim2022_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eyaler/ztml/HEAD/misc/reversim2022_slides.pdf -------------------------------------------------------------------------------- /ztml/default_vars.py: -------------------------------------------------------------------------------- 1 | bitarray = 'b' 2 | bwt_func = 'f' 3 | bytearray = 'o' 4 | image = 'i' 5 | text = 't' 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | bitarray 2 | chardet 3 | numpy 4 | pydivsufsort 5 | pypng 6 | regex 7 | selenium 8 | typing_extensions 9 | webdriver_manager 10 | zopflipy 11 | gutenbergpy 12 | -------------------------------------------------------------------------------- /misc/minibook.py: -------------------------------------------------------------------------------- 1 | # https://xem.github.io/miniBook 2 | 3 | 4 | import sys 5 | from urllib.request import urlopen 6 | 7 | sys.path.append('..') 8 | from ztml import ztml 9 | 10 | 11 | with urlopen('https://xem.github.io/miniBook/example') as f: 12 | out, result = ztml.ztml(f.read(), 'index.html', mtf=80, ect=True, raw=True, validate=True) 13 | print(f'{len(out):,} B') 14 | assert not result 15 | -------------------------------------------------------------------------------- /misc/run_all.bat: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2> nul 2 | 3 | :; trap "exit" INT TERM 4 | :; set -o errexit 5 | :; function goto() { return $?; } 6 | 7 | cd .. 
8 | 9 | python example.py || goto :error 10 | 11 | python example_image.py || goto :error 12 | 13 | cd misc 14 | 15 | python example_html.py || goto :error 16 | 17 | python minibook.py || goto :error 18 | 19 | cd ../ztml 20 | 21 | python tests.py || goto :error 22 | 23 | :; exit 0 24 | exit /b 0 25 | 26 | :error 27 | exit /b %errorlevel% 28 | -------------------------------------------------------------------------------- /misc/.htaccess: -------------------------------------------------------------------------------- 1 | # THIS IS FOR ONLINE TESTING OF OUTPUT FILES 2 | 3 | # SHOW FILES IN FOLDER 4 | Options +Indexes 5 | IndexOptions +FancyIndexing 6 | 7 | # DISABLE CACHING 8 | 9 | ExpiresActive Off 10 | 11 | 12 | FileETag None 13 | Header unset ETag 14 | Header unset Pragma 15 | Header unset Cache-Control 16 | Header unset Last-Modified 17 | Header set Pragma "no-cache" 18 | Header set Cache-Control "max-age=0, no-cache, no-store, must-revalidate" 19 | Header set Expires "Thu, 1 Jan 1970 00:00:00 GMT" 20 | -------------------------------------------------------------------------------- /misc/size_checker.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 5 | old_folder = sys.argv[1] 6 | new_folder = sys.argv[2] 7 | assert new_folder != old_folder 8 | old_files = sorted(os.listdir(old_folder)) 9 | new_files = sorted(os.listdir(new_folder)) 10 | print(f'Old: {old_folder} ({len(old_files)} files)') 11 | print(f'New: {new_folder} ({len(new_files)} files)') 12 | assert new_files == old_files 13 | 14 | for file in old_files: 15 | old_size = os.path.getsize(os.path.join(old_folder, file)) 16 | new_size = os.path.getsize(os.path.join(new_folder, file)) 17 | assert new_size <= old_size, f'{file} grew from {old_size:,} to {new_size:,}' 18 | 19 | print(f'All {len(old_files)} files are equal or smaller.') 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The following license applies to all parts of this software except where a more restrictive license is stated. 2 | 3 | MIT License 4 | 5 | Copyright (c) 2022 Eyal Gruss (https://github.com/eyaler/ztml) 6 | 7 | Copyright (c) 2021-2022 Ethan Halsall (https://github.com/eshaz/simple-yenc) 8 | 9 | Copyright (c) 2016 Kevin Albertson (https://github.com/kevinAlbs/Base122) 10 | 11 | Permission is hereby granted, free of charge, to any person obtaining a copy 12 | of this software and associated documentation files (the "Software"), to deal 13 | in the Software without restriction, including without limitation the rights 14 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 15 | copies of the Software, and to permit persons to whom the Software is 16 | furnished to do so, subject to the following conditions: 17 | 18 | The above copyright notice and this permission notice shall be included in all 19 | copies or substantial portions of the Software. 20 | 21 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 23 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 24 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 25 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 26 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 27 | SOFTWARE. 28 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from time import time 4 | 5 | start_time = time() 6 | 7 | from ztml import validation, ztml 8 | 9 | 10 | books = [30123, 2600] 11 | book_mtf = [0, 80] 12 | book_ect = [False, True] 13 | output_folder = 'output' 14 | skip_download_exists = True 15 | element_id = '' 16 | 17 | 18 | assert len(books) == len(book_mtf) == len(book_ect) 19 | error = False 20 | for item, mtf, ect in zip(books, book_mtf, book_ect): 21 | item_start_time = time() 22 | filenames = dict(raw=f'{item}.txt', 23 | # base64_js=f'{item}_64.js', 24 | base64_html=f'{item}_64.html', 25 | # base125_js=f'{item}_125.js', 26 | base125_html=f'{item}_125.html', 27 | # crenc_js=f'{item}_cr.js', 28 | crenc_html=f'{item}_cr.html') 29 | os.makedirs(output_folder, exist_ok=True) 30 | filenames = {k: os.path.join(output_folder, v) for k, v in filenames.items()} 31 | 32 | # If missing, download an example file from the web 33 | if not skip_download_exists or not os.path.exists(filenames['raw']): 34 | from gutenbergpy.textget import get_text_by_id 35 | with open(filenames['raw'], 'wb') as f: 36 | f.write(get_text_by_id(item)) 37 | 38 | with open(filenames['raw'], 'rb') as f: 39 | data = f.read() 40 | 41 | cnt = 0 42 | for label, filename in filenames.items(): 43 | if label == 'raw': 44 | continue 45 | file = ztml.ztml(data, filename, mtf=mtf, ect=ect, bin2txt=label.rsplit('_', 1)[0], element_id=element_id) 46 | cnt += 1 47 | 48 | print(f'{cnt} encodings of {item} took {(time()-item_start_time) / 60 :.1f} min.') 49 | 50 | # Compare file sizes and validate data is recovered 51 | error |= validation.validate_files(filenames, by='id' * bool(element_id), element=element_id) 52 | print() 53 | 54 | if error: 55 | print('Error: some renderings timed out') 56 | else: 57 | print(f'Total of {len(books)} books took {(time()-start_time) / 60 :.1f} min.') 58 | sys.exit(int(error)) 59 | -------------------------------------------------------------------------------- /misc/example_html.py: -------------------------------------------------------------------------------- 1 | # This is just for testing that ZTML can work on its own HTML outputs in raw mode 2 | 3 | 4 | import os 5 | import sys 6 | from time import time 7 | 8 | start_time = time() 9 | 10 | sys.path.append('..') 11 | from ztml import validation, ztml 12 | 13 | 14 | raw_files = ['30123_64.html', 15 | '30123_125.html', 16 | '30123_cr.html', 17 | 'test_pattern.jpg_64.html', 18 | 'test_pattern.jpg_125.html', 19 | 'test_pattern.jpg_cr.html' 20 | ] 21 | output_folder = '../output' 22 | 23 | 24 | error = False 25 | for url in raw_files: 26 | item_start_time = time() 27 | item = url.replace(os.sep, '/').rsplit('/', 1)[-1] 28 | filenames = dict(raw=item, 29 | # base64_js=f'{item}_64.js', 30 | base64_html=f'{item}_64.html', 31 | # base125_js=f'{item}_125.js', 32 | base125_html=f'{item}_125.html', 33 | # crenc_js=f'{item}_cr.js', 34 | crenc_html=f'{item}_cr.html') 35 | os.makedirs(output_folder, exist_ok=True) 36 | filenames = {k: os.path.join(output_folder, v) for k, v in filenames.items()} 37 | 38 | with 
open(filenames['raw'], 'rb') as f: 39 | data = f.read() 40 | if os.path.splitext(item)[0].endswith('_cr'): 41 | data = data.decode('cp1252', 'backslashreplace') 42 | 43 | cnt = 0 44 | for label, filename in filenames.items(): 45 | if label == 'raw': 46 | continue 47 | file = ztml.ztml(data, filename, bin2txt=label.rsplit('_', 1)[0], raw=True, text_var='z') 48 | cnt += 1 49 | 50 | print(f'{cnt} encodings of {item} took {(time()-item_start_time) / 60 :.1f} min.') 51 | 52 | # Compare file sizes and validate data is recovered 53 | error |= validation.validate_files(filenames, data, content_var='z') 54 | print() 55 | 56 | if error: 57 | print('Error: some renderings timed out') 58 | else: 59 | print(f'Total of {len(raw_files)} raw files took {(time()-start_time) / 60 :.1f} min.') 60 | sys.exit(int(error)) 61 | -------------------------------------------------------------------------------- /ect/Build_ECT_Ububtu.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "private_outputs": true, 7 | "provenance": [], 8 | "authorship_tag": "ABX9TyMGts6SFpEriurAynOgIU2i", 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "language_info": { 16 | "name": "python" 17 | } 18 | }, 19 | "cells": [ 20 | { 21 | "cell_type": "markdown", 22 | "metadata": { 23 | "id": "view-in-github", 24 | "colab_type": "text" 25 | }, 26 | "source": [ 27 | "\"Open" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "source": [ 33 | "%cd /content\n", 34 | "!git clone --recursive https://github.com/fhanau/Efficient-Compression-Tool\n", 35 | "!latestTag=$(git describe --tags `git rev-list --tags --max-count=1`)\n", 36 | "!echo $latestTag\n", 37 | "!git checkout $latestTag\n", 38 | "!apt -y install nasm\n", 39 | "%cd Efficient-Compression-Tool\n", 40 | "!mkdir build\n", 41 | "%cd build\n", 42 | "!cmake ../src\n", 43 | "!make" 44 | ], 45 | "metadata": { 46 | "id": "jh1og550gUD-" 47 | }, 48 | "execution_count": null, 49 | "outputs": [] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "source": [ 54 | "!./ect" 55 | ], 56 | "metadata": { 57 | "id": "hR9UQWk88qsx" 58 | }, 59 | "execution_count": null, 60 | "outputs": [] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "source": [ 65 | "from google.colab import files\n", 66 | "!mv ect ect-ubuntu\n", 67 | "files.download('ect-ubuntu')" 68 | ], 69 | "metadata": { 70 | "id": "b7IETFkT7Y7Z" 71 | }, 72 | "execution_count": null, 73 | "outputs": [] 74 | } 75 | ] 76 | } -------------------------------------------------------------------------------- /example_image.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from time import time 4 | from urllib.request import urlopen 5 | 6 | start_time = time() 7 | 8 | from ztml import validation, ztml 9 | 10 | 11 | image_urls = ['http://wiesmann.codiferes.net/share/bitmaps/test_pattern.bmp', 12 | 'http://wiesmann.codiferes.net/share/bitmaps/test_pattern.gif', 13 | 'http://wiesmann.codiferes.net/share/bitmaps/test_pattern.jpg', 14 | 'http://wiesmann.codiferes.net/share/bitmaps/test_pattern.png', 15 | 'http://wiesmann.codiferes.net/share/bitmaps/test_pattern.webp' 16 | ] 17 | output_folder = 'output' 18 | skip_download_exists = True 19 | element_id = '' 20 | 21 | 22 | error = False 23 | for url in image_urls: 24 | item_start_time = time() 25 | item = url.rsplit('/', 1)[-1] 26 | filenames = 
dict(raw=item, 27 | # base64_js=f'{item}_64.js', 28 | base64_html=f'{item}_64.html', 29 | # base125_js=f'{item}_125.js', 30 | base125_html=f'{item}_125.html', 31 | # crenc_js=f'{item}_cr.js', 32 | crenc_html=f'{item}_cr.html') 33 | os.makedirs(output_folder, exist_ok=True) 34 | filenames = {k: os.path.join(output_folder, v) for k, v in filenames.items()} 35 | 36 | # If missing, download an example file from the web 37 | if not skip_download_exists or not os.path.exists(filenames['raw']): 38 | with urlopen(url) as fin, open(filenames['raw'], 'wb') as fout: 39 | fout.write(fin.read()) 40 | 41 | with open(filenames['raw'], 'rb') as f: 42 | data = f.read() 43 | 44 | cnt = 0 45 | for label, filename in filenames.items(): 46 | if label == 'raw': 47 | continue 48 | file = ztml.ztml(data, filename, bin2txt=label.rsplit('_', 1)[0], element_id=element_id, image=True) 49 | cnt += 1 50 | 51 | print(f'{cnt} encodings of {item} took {(time()-item_start_time) / 60 :.1f} min.') 52 | 53 | # Compare file sizes and validate data is recovered 54 | error |= validation.validate_files(filenames, by='id' * bool(element_id), element=element_id, image=True) 55 | print() 56 | 57 | if error: 58 | print('Error: some renderings timed out') 59 | else: 60 | print(f'Total of {len(image_urls)} images took {(time()-start_time) / 60 :.1f} min.') 61 | sys.exit(int(error)) 62 | -------------------------------------------------------------------------------- /ztml/crenc.py: -------------------------------------------------------------------------------- 1 | """crEnc encoding based on yEnc and optimized for inline HTML / JS text compression and image encoding 2 | 3 | In the spirit of yEnc (why encode?), we only encode symbols where absolutely required. 4 | If the HTML or JS charset can be set to a single-byte encoding as cp1252 (or latin1), 5 | the only symbol requiring special treatment is the carriage-return (CR), hence crEnc, 6 | which can be dealt with by simple backslash escaping. 7 | We embed in JS template literals quotes ``, so we also escape backslash, ` and ${ 8 | giving us an effective 253 byte values out of 256, 9 | with an overhead of ~ 3/256 ~ 1.2% (compared to 33.3% for Base64). 10 | JS does the unescaping, so the decoder only needs to take care of HTML character overrides for NUL and codes in 128 - 159. 11 | An optimal global character modular offset can be applied to minimize escaping, similar to dynEncode (enabled by default). 12 | A minimalistic JS decoder code is generated. 13 | 14 | References: 15 | https://en.wikipedia.org/wiki/Binary-to-text_encoding 16 | http://www.yenc.org 17 | https://github.com/eshaz/simple-yenc 18 | https://github.com/eshaz/simple-yenc#what-is-dynencode 19 | https://html.spec.whatwg.org/multipage/parsing.html#table-charref-overrides 20 | https://stackoverflow.com/questions/10080605/special-character-u0098-read-as-u02dc-using-charcodeat/#10081375 21 | """ 22 | 23 | 24 | from typing import Optional, Tuple 25 | 26 | if not __package__: 27 | import default_vars, webify 28 | else: 29 | # noinspection PyPackages 30 | from . 
import default_vars, webify 31 | 32 | 33 | def encode(data: bytes, offset: int = 0) -> bytes: 34 | if offset: 35 | data = bytes(byte+offset & 255 for byte in data) 36 | return webify.escape(data) 37 | 38 | 39 | def optimize_encode(data: bytes) -> Tuple[bytes, int, int]: 40 | best_offset = 0 41 | for offset in range(256): 42 | out = encode(data, offset) 43 | length = len(out) 44 | if offset == 0: 45 | best_length = length0 = length 46 | if length < best_length: 47 | best_length = length 48 | best_offset = offset 49 | out = encode(data, best_offset) 50 | return out, best_offset, length0 - best_length 51 | 52 | 53 | def get_js_decoder(data: bytes, 54 | offset: Optional[int] = None, 55 | output_var: str = default_vars.bytearray 56 | ) -> bytes: 57 | if offset is None: 58 | encoded, offset, saved = optimize_encode(data) # Time-consuming op. 59 | else: 60 | encoded = encode(data, offset) 61 | first_part = f'{output_var}=Uint8Array.from(`' 62 | function = f"(i=c.charCodeAt()%65533)>>8?129+' \x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c \x8e \x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c \x9e\x9f'.indexOf(c):i" 63 | if offset: 64 | function = f'({function})-{offset}' 65 | last_part = f"`,c=>{function})\n" 66 | return first_part.encode() + encoded + last_part.encode('l1') # Encode with l1 as I used explicit bytes above 67 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | # Todo 2 | 3 | ### Usability 4 | - Simplify the "from ztml.ztml import ztml" hierarchy 5 | - Support encoding video/audio/fonts/PDF/... 6 | - Support encoding multiple media elements 7 | - Provide an easy way to view and edit output HTML in Colab 8 | - Make into a PIP library and start doing versioning 9 | - JS library? 10 | - Expose more parameters and allow skipping steps in ztml() / CLI / Colab, possibly via config file 11 | - Stand-alone online web GUI 12 | - Stand-alone executable (+ script to build it) 13 | 14 | ### Compression 15 | - Ablation benchmarks 16 | - Launch a challenge for smaller decoders 17 | 18 | - #### Entropy coding: 19 | - Auto-caps should use modifiers for next letter/word/sentence/paragraph or block-level, over simple mode instead of falling back to raw. See e.g. 
[Grabowski](https://www.researchgate.net/profile/Szymon-Grabowski-2/publication/258239689_Text_Preprocessing_for_Burrows-Wheeler_Block_Sorting_Compression/links/0046352789a298f289000000), [Batista&Alexandre](https://www.di.ubi.pt/~lfbaa/pubs/dcc2008.pdf) 20 | - Dictionary compression for large texts + add references 21 | - [Fast Huffman one-shift decoder](https://researchgate.net/publication/3159499_On_the_implementation_of_minimum_redundancy_prefix_codes), and follow-up works: [Gagie et al.](https://arxiv.org/pdf/1410.3438.pdf), [Grabowski&Koppl](https://arxiv.org/pdf/2108.05495.pdf) 22 | - Consider [Roadroller](https://lifthrasiir.github.io/roadroller) entropy coder 23 | 24 | #### MTF: 25 | - Improve JS MTF decoding times for large files 26 | - Automatic optimizing over MTF variants 27 | - Benchmark alternatives to MTF + add references 28 | 29 | #### Deflate: 30 | - Investigate effect of PNG aspect ratio on compression / optimize over it 31 | - Investigate Safari canvas size limits 32 | - Use 8/24-bit to overcome canvas size limits when necessary (will not work on Safari, unless we go WebGL) 33 | - Compress metadata into PNG 34 | - [Use WOFF2 as a Brotli container](https://github.com/lifthrasiir/roadroller/issues/9#issuecomment-905580540) 35 | 36 | #### Webification and minification: 37 | - [Base139](https://github.com/kevinAlbs/Base122/issues/3#issuecomment-263787763) 38 | - Compress the JS itself and use [eval](http://perfectionkills.com/global-eval-what-are-the-options), considering also JS packing e.g. [JSCrush](http://iteral.com/jscrush), [JS Crusher](https://jmperezperez.com/js-crusher), [RegPack](https://siorki.github.io/regPack), [Roadroller](https://lifthrasiir.github.io/roadroller) 39 | - Strip whitespace from code lines not part of multi-line content strings (see e.g. above JS packers and [closure-compiler](https://github.com/google/closure-compiler), [jsmin](https://crockford.com/jsmin), [miniMinifier](https://github.com/xem/miniMinifier), [Terser](https://terser.org), [UglifyJS](https://github.com/mishoo/UglifyJS)) 40 | 41 | ### Validation and testing 42 | - Running full tests takes too long 43 | - Linux installation instructions / Enable validation in Colab 44 | - Validation testing for Safari (consider Playwright to test WebKit) 45 | - Fix slow rendering with Selenium in validation 46 | - Tests for text_prep.py: normalize, caps, the 47 | - Automatic testing on GitHub 48 | -------------------------------------------------------------------------------- /ztml/huffman.py: -------------------------------------------------------------------------------- 1 | """Canonical Huffman encoding 2 | 3 | Even though we later compress with DEFLATE which does its own Huffman encoding internally, 4 | I found that for text compression, it is significantly beneficial to pre-encode with Huffman. 5 | Canonical encoding obviates saving or reconstructing an explicit codebook. 6 | Instead, we save a string of symbols and a condensed canonical table of bases and offsets, in a variation of Moffat&Turpin. 7 | A minimalistic JS decoder code is generated. 
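For illustration only (a rough sketch using the same bitarray helper this module already relies on; the names below are just example variables, not part of the module):
    >>> from collections import Counter
    >>> from bitarray.util import canonical_huffman
    >>> codebook, counts, symbols = canonical_huffman(Counter('abracadabra'))
Since canonical codes of a given length are consecutive, the per-length counts and the canonically ordered symbols are enough to rebuild every code, so no explicit codebook has to be shipped to the JS decoder.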
8 | 9 | References: 10 | https://wikipedia.org/wiki/Canonical_Huffman_code 11 | https://github.com/ilanschnell/bitarray/blob/master/doc/canonical.rst 12 | https://researchgate.net/publication/3159499_On_the_implementation_of_minimum_redundancy_prefix_codes (Moffat&Turpin) 13 | https://arxiv.org/pdf/1410.3438.pdf 14 | https://arxiv.org/pdf/2108.05495.pdf 15 | """ 16 | 17 | 18 | from collections import Counter 19 | import sys 20 | from typing import Dict, List, Tuple 21 | 22 | from bitarray import bitarray 23 | from bitarray.util import ba2int, canonical_decode, canonical_huffman 24 | 25 | if not __package__: 26 | import default_vars, webify 27 | else: 28 | # noinspection PyPackages 29 | from . import default_vars, webify 30 | 31 | 32 | DEBUG_SKIP_HUFFMAN = False # This is just for benchmarking and is not implemented in JS decoder 33 | 34 | 35 | def encode(text: str, 36 | validate: bool = True, 37 | verbose: bool = False 38 | ) -> Tuple[List[int], str, str, Dict[str, str]]: 39 | charset = '' 40 | canonical_table = {} 41 | counter = Counter(text) 42 | if DEBUG_SKIP_HUFFMAN: 43 | code_len = len(bin(ord(max(counter, default='\0')))) - 2 44 | codebook = {c: bitarray(bin(ord(c))[2:].zfill(code_len)) for c in counter} 45 | else: 46 | if len(counter): 47 | codebook, counts, symbols = canonical_huffman(counter) 48 | else: 49 | codebook = {} 50 | counts = [] 51 | symbols = [] 52 | charset = ''.join(symbols[::-1]) 53 | canonical_table = {len(code): [2**len(code) - ba2int(code), len(codebook) - i - 1] for i, code in enumerate(codebook.values())} 54 | 55 | bits = bitarray() 56 | if codebook: 57 | bits.encode(codebook, text) 58 | if verbose: 59 | print(sorted([(k, v.to01()) for k, v in codebook.items()], 60 | key=lambda x: -counter[x[0]]), file=sys.stderr) 61 | if charset: 62 | print(len(charset), charset, file=sys.stderr) 63 | print(canonical_table, file=sys.stderr) 64 | if validate: 65 | assert not codebook or ''.join(bits.decode(codebook)) == text 66 | assert DEBUG_SKIP_HUFFMAN or ''.join(canonical_decode(bits, counts, symbols)) == text 67 | canonical_table = ''.join(chr(j) for i in range(max(canonical_table, default=-1) + 1) for j in (canonical_table[i] if i in canonical_table else [2**i + 1, 1])) 68 | rev_codebook = {v.to01(): k for k, v in codebook.items()} 69 | return bits.tolist(), charset, canonical_table, rev_codebook 70 | 71 | 72 | def get_js_decoder(charset: str, 73 | canonical_table: str, 74 | bitarray_var: str = default_vars.bitarray, 75 | text_var: str = default_vars.text, 76 | ) -> str: 77 | # Note that the escaped strings may include more characters requiring safe encoding as regard to encoding domains as well as HTML character overrides 78 | charset = webify.escape(charset, escape_nul=True) 79 | canonical_table = webify.escape(canonical_table, escape_nul=True) 80 | return f'''s=[...`{charset}`] 81 | d=[...`{canonical_table}`] 82 | for(j={text_var}='';j<{bitarray_var}.length;{text_var}+=s[d[k*2-1].codePointAt()+m])for(k=c=0;(m=2**k-d[k++*2].codePointAt()-c)<0;)c+=c+{bitarray_var}[j++] 83 | ''' 84 | 85 | 86 | def encode_and_get_js_decoder(text: str, 87 | bitarray_var: str = default_vars.bitarray, 88 | text_var: str = default_vars.text, 89 | validate: bool = True, 90 | verbose: bool = False 91 | ) -> Tuple[List[int], str]: 92 | bits, charset, canonical_table, _ = encode(text, validate, verbose) 93 | return bits, get_js_decoder(charset, canonical_table, bitarray_var, text_var) 94 | -------------------------------------------------------------------------------- /ztml/base125.py: 
-------------------------------------------------------------------------------- 1 | """Base125 encoding based on Base122 and optimized for inline HTML / JS text compression and image encoding 2 | 3 | If we must use utf8 encoding for HTML or JS, crEnc will not work. 4 | Instead, we can use this original and unnecessarily-optimized version of the variable length Base122. 5 | The original byte stream is split into 7 bit chunks, 6 | which are encoded as a single byte: 0xxxxxxx, to comply with utf8 code point scheme. 7 | We only use 125 byte values out of 128 (excluding CR, backslash and `) 8 | and encode the remaining three with a double byte scheme: 110ssxxx 10xxxxxx, 9 | where ss is 01, 10 or 11, and 9 bits are left for next data. 10 | Alternatively, if these are the final 7 bits, we instead encode as: 1100010x 10xxxxxx. 11 | As, we embed in JS template literals quotes ``, we further escape ${ with backslash. 12 | The overhead is ~ 8/7 * 253/256 + 16/11 * 3/256 - 1 ~ 14.7% (compared to 33.3% for Base64). 13 | The decoder further takes care of HTML character override for NUL. 14 | An optimal global character modular offset can be added to minimize escaping, similar to dynEncode (disabled by default). 15 | A minimalistic JS decoder code is generated. 16 | 17 | References: 18 | https://en.wikipedia.org/wiki/Binary-to-text_encoding 19 | https://blog.kevinalbs.com/base122 20 | https://github.com/kevinAlbs/Base122 21 | https://github.com/eshaz/simple-yenc#what-is-dynencode 22 | """ 23 | 24 | 25 | from typing import Optional, Tuple 26 | 27 | if not __package__: 28 | import default_vars 29 | else: 30 | # noinspection PyPackages 31 | from . import default_vars 32 | 33 | 34 | illegal = ['', 13, 92, 96] 35 | 36 | 37 | def encode(data: bytes, offset: int = 0, validate: bool = True) -> bytes: 38 | cur_index = 0 39 | cur_bit = 0 # Points to current bit needed 40 | out = bytearray() 41 | 42 | # Get 7 or 9 bits of input data. Returns None if there is no input left 43 | def get_bits(length: int) -> Optional[int]: 44 | nonlocal cur_index, cur_bit 45 | if cur_index >= len(data): 46 | return None 47 | 48 | # Shift, mask, unshift to get first part. 
Align it to a 7 or 9 bit chunk 49 | first_part = (255>>cur_bit & data[cur_index]+offset & 255) << cur_bit 50 | diff = 8 - length 51 | if diff > 0: 52 | first_part >>= diff 53 | else: 54 | first_part <<= -diff 55 | # Check if we need to go to the next byte for more bits 56 | cur_bit += length 57 | if cur_bit < 8: 58 | return first_part # Do not need next byte 59 | cur_bit -= 8 60 | cur_index += 1 61 | # Now we want bits [0..cur_bit] of the next byte if it exists 62 | if cur_index >= len(data): 63 | return first_part 64 | # Align it 65 | second_part = (0xff00>>cur_bit & data[cur_index]+offset & 255) >> 8-cur_bit 66 | return first_part | second_part 67 | 68 | while True: 69 | # Grab 7 bits 70 | bits = get_bits(7) 71 | if bits is None: 72 | break 73 | try: 74 | illegal_index = illegal.index(bits) 75 | # Since this will be a two-byte character, get the next chunk of 9 bits 76 | next_bits = get_bits(9) 77 | if next_bits is None: 78 | b1 = 4 79 | next_bits = bits 80 | else: 81 | b1 = illegal_index << 3 82 | # Push first 3 bits onto first byte, remaining 6 onto second 83 | out.extend([192 | b1 | next_bits>>6, 128 | next_bits&63]) 84 | except ValueError: 85 | out.append(bits) 86 | 87 | if validate: 88 | decoded = decode(out, offset) 89 | assert decoded == data, (len(decoded), len(data), decoded[:30], data[:30]) 90 | return out.replace(b'${', b'\\${') 91 | 92 | 93 | def optimize_encode(data: bytes, 94 | validate: bool = True 95 | ) -> Tuple[bytes, int, int]: 96 | best_offset = 0 97 | for offset in range(256): 98 | length = len(encode(data, offset, validate=False)) 99 | if offset == 0: 100 | best_length = length0 = length 101 | if length < best_length: 102 | best_length = length 103 | best_offset = offset 104 | out = encode(data, best_offset, validate) 105 | return out, best_offset, length0 - best_length 106 | 107 | 108 | def decode(data: bytes, offset: int = 0) -> bytes: 109 | out = bytearray() 110 | next_byte = 0 111 | k = 0 112 | 113 | def push_bits(bits: int, length: int = 7) -> None: 114 | nonlocal next_byte, k 115 | next_byte |= bits << (length < 8) >> k >> (length > 8) 116 | k += length 117 | if k > 7: 118 | out.append((next_byte&255)-offset & 255) 119 | k -= 8 120 | next_byte = bits << 8-k 121 | 122 | for byte in data.decode(): 123 | b = ord(byte) 124 | if b > 127: 125 | ss = b >> 9 126 | if ss: 127 | push_bits(illegal[ss]) 128 | push_bits(b<<2*(not ss) & 511, 9) 129 | else: 130 | push_bits(b) 131 | return out 132 | 133 | 134 | def get_js_decoder(data: bytes, 135 | offset: Optional[int] = 0, 136 | output_var: str = default_vars.bytearray, 137 | validate: bool = True 138 | ) -> bytes: 139 | if offset is None: 140 | encoded, offset, saved = optimize_encode(data, validate) # Time-consuming op. 
141 | else: 142 | encoded = encode(data, offset, validate) 143 | illegal_str = ','.join(str(i) for i in illegal) 144 | first_part = f'''k=n=0 145 | p=(b,l=7)=>(n|=b<<(l<8)>>k>>(l>8),k+=l,k>7?(v=n{-offset or ''},k-=8,n=b<<8-k,v):[]) 146 | {output_var}=new Uint8Array([...`''' 147 | last_part = f'`].flatMap(c=>(i=c.charCodeAt()%65533,i>127?(e=i>>9,[e?p([{illegal_str}][e]):[],p(i<<2*!e&511,9)].flat()):p(i))))\n' 148 | return first_part.encode() + encoded + last_part.encode() 149 | 150 | 151 | def test() -> None: 152 | for i in range(100): 153 | for j in range(100): 154 | for offset in [0, 1]: 155 | for symbol in [b'\r', b'\\', b'`']: 156 | encode(b'\0'*i + symbol*j, offset, validate=True) 157 | 158 | 159 | if __name__ == '__main__': 160 | test() 161 | -------------------------------------------------------------------------------- /ztml/tests.py: -------------------------------------------------------------------------------- 1 | import os 2 | from time import time 3 | 4 | start_time = time() 5 | 6 | if not __package__: 7 | import text_prep, bwt_mtf, deflate, validation, webify, ztml 8 | else: 9 | # noinspection PyPackages 10 | from . import text_prep, bwt_mtf, deflate, validation, webify, ztml 11 | 12 | 13 | min_char_code1 = 0 14 | max_char_code1 = 14000 15 | min_char_code2 = 55000 16 | max_char_code2 = 66000 17 | browsers = list(validation.drivers)[:1] 18 | input_encodings = ['utf8', 'cp1252', 'cp1255'] 19 | bin2txt_encodings = ztml.bin2txt_encodings 20 | caps_modes = ['auto', 'simple'] # text_prep.caps_modes 21 | mtf_variants = [None, 0, 52, 80] # bwt_mtf.mtf_variants 22 | bitdepths = deflate.allowed_bitdepths 23 | ect_modes = [False, True] 24 | temp_folder = 'tmp' 25 | cleanup = True 26 | 27 | 28 | all_chars = ''.join(chr(i) for i in range(min_char_code1, min(max_char_code1 or bwt_mtf.max_unicode, bwt_mtf.max_unicode) + 1)) 29 | if min_char_code2 and max_char_code2: 30 | all_chars += ''.join(chr(i) for i in range(min_char_code2, min(max_char_code2 or bwt_mtf.max_unicode, bwt_mtf.max_unicode) + 1) if chr(i) not in all_chars) 31 | os.makedirs(temp_folder, exist_ok=True) 32 | i = 0 33 | for browser in browsers: 34 | with validation.get_browser(browser) as b: 35 | for encoding in input_encodings: 36 | encoding = encoding.lower() 37 | for bin2txt in bin2txt_encodings: 38 | for caps in caps_modes: 39 | for bwtsort in [True, False]: 40 | for mtf in mtf_variants: 41 | for bitdepth in bitdepths: 42 | for ect in ect_modes: 43 | for render_mode in range(3): 44 | element_id = '' 45 | raw = False 46 | if render_mode == 1: 47 | element_id = 'myid' 48 | elif render_mode == 2: 49 | raw = True 50 | test_start_time = time() 51 | i += 1 52 | print(f'{i}/{len(browsers) * len(input_encodings) * len(bin2txt_encodings) * len(caps_modes) * 2 * len(mtf_variants) * len(bitdepths) * len(ect_modes) * 3} browser={browser} input_enc={encoding} bin2txt={bin2txt} caps={caps} bwtsort={bwtsort} mtf={mtf} bitdepth={bitdepth} ect={ect} id={bool(element_id)} raw={raw}') 53 | suffix = f"{browser}_{encoding}_{bin2txt}_{caps}{'_bwtsort' * bwtsort}_{mtf}_{bitdepth}{'_ect' * ect}" 54 | if element_id: 55 | suffix += '_id' 56 | if raw: 57 | suffix += '_raw' 58 | input_filename = os.path.join(temp_folder, f'ztml_test_file_{suffix}.txt') 59 | output_filename = os.path.join(temp_folder, f'ztml_test_file_{suffix}.html') 60 | output_stream = os.path.join(temp_folder, f'ztml_test_stream_{suffix}.html') 61 | text = all_chars 62 | if mtf is not None: 63 | text = ''.join(c for c in text if ord(c) <= bwt_mtf.max_ord_for_mtf) 64 | if 
encoding.replace('-', '') == 'utf8': 65 | text = ''.join(c for c in text if ord(c) < bwt_mtf.surrogate_lo or ord(c) > bwt_mtf.surrogate_hi) 66 | out1, result1 = ztml.ztml(text, unix_newline=False, remove_bom=False, caps=caps, bwtsort=bwtsort, mtf=mtf, bitdepth=bitdepth, ect=ect, bin2txt=bin2txt, element_id=element_id, raw=raw, validate=True, browser=b, verbose=True) 67 | out2, result2 = ztml.ztml(text, output_filename, unix_newline=False, remove_bom=False, caps=caps, bwtsort=bwtsort, mtf=mtf, bitdepth=bitdepth, ect=ect, bin2txt=bin2txt, element_id=element_id, raw=raw, validate=True, browser=b, verbose=True) 68 | with open(output_filename, 'rb') as f: 69 | out = f.read() 70 | assert not result1 and not result2 and out1 == out2 == out, (result1, result2, out1 == out2, out1 == out, out2 == out, len(out1), len(out2), validation.full_path(output_filename), len(out)) 71 | with open(input_filename, 'wb') as f: 72 | f.write(webify.safe_encode(text, encoding)) 73 | bwtsort_arg = '--skip_bwtsort' * (not bwtsort) 74 | ect_arg = '--ect' * ect 75 | element_id_or_raw_arg = '' 76 | if element_id: 77 | element_id_or_raw_arg = f'--element_id "{element_id}"' 78 | if raw: 79 | element_id_or_raw_arg = '--raw' 80 | result1 = os.system(f'python ztml.py "{input_filename}" "{output_filename}" --skip_unix_newline --skip_remove_bom --caps {caps} {bwtsort_arg} --mtf {mtf} --bitdepth {bitdepth} {ect_arg} --bin2txt {bin2txt} {element_id_or_raw_arg} --validate --browser {browser} --verbose') 81 | result2 = os.system(f'python ztml.py "{input_filename}" --skip_unix_newline --skip_remove_bom --caps {caps} {bwtsort_arg} --mtf {mtf} --bitdepth {bitdepth} {ect_arg} --bin2txt {bin2txt} {element_id_or_raw_arg} --validate --browser {browser} --verbose > {output_stream}') 82 | with open(output_filename, 'rb') as f1: 83 | out1 = f1.read() 84 | with open(output_stream, 'rb') as f2: 85 | out2 = f2.read() 86 | if out2.endswith(b'\x1b[0m'): # E.g. due to PyCharm terminal 87 | out2 = out2[:-4] 88 | assert not result1 and not result2 and out1 == out2, (result1, result2, out1 == out2, validation.full_path(output_filename), len(out1), validation.full_path(output_stream), len(out2)) 89 | if cleanup: 90 | for filename in [input_filename, output_filename, output_stream]: 91 | try: 92 | os.remove(filename) 93 | except PermissionError: 94 | pass 95 | print(f'Test took {time() - test_start_time :.0f} sec.\n') 96 | if cleanup: 97 | try: 98 | os.rmdir(temp_folder) 99 | except OSError: 100 | pass 101 | print(f'Total took {(time()-start_time) / 60 :.1f} min.') 102 | -------------------------------------------------------------------------------- /ztml/webify.py: -------------------------------------------------------------------------------- 1 | """ Minification by way of aliasing AKA uglification 2 | 3 | Substitutes recurring element, attribute and function names with short aliases. 4 | This is far from being a full-fledged JS minifier, and only addresses specific forms of aliasing 5 | (with defaults tuned for the author's own hand-minified use cases) 6 | You may be able to reduce your script further with JS minifiers and packers (see references), 7 | however these might not be compatible with ZTML (especially when using the non-utf8 crEnc). 8 | 9 | Warnings: 10 | 1. The two-parameter aliases would miss substitutions involving tag function syntax, i.e. 11 | func`str`, even if you specify such forms explicitly. However, see following examples. 12 | 2. 
While alias substitution does support some level of composition, e.g.: 13 | a.appendChild(b=document.createElement`p`).innerHTML='hi' # => C(a,b=E`p`).C='hi' 14 | More complex compositions would miss later substitutions, e.g.: 15 | a.appendChild(b=document.createElement`p`).appendChild(c) # => C(a,b=E`p`).appendChild(c) 16 | a.appendChild(b=document.createElement`p`).setAttribute('style',c) # => C(a,b=E`p`).setAttribute('style',c) 17 | 3. Non-static method aliases support only specific parameter signatures as appear in 18 | default_aliases. Attempting to specify different signatures will break your code. 19 | 4. You may need to set replace_quoted=False if you do not want e.g. all 'length', "Length" 20 | to be replaced by: L 21 | 5. Aliases to be used in other aliases e.g. document, should be specified before the latter. 22 | 23 | References: 24 | https://github.com/google/closure-compiler 25 | http://iteral.com/jscrush 26 | https://nikhilism.com/post/2012/demystifying-jscrush 27 | https://github.com/possan/jsintros/blob/master/a/src/crush.js 28 | https://jmperezperez.com/js-crusher 29 | https://crockford.com/jsmin 30 | https://github.com/xem/miniMinifier 31 | https://siorki.github.io/regPack 32 | https://lifthrasiir.github.io/roadroller 33 | https://terser.org 34 | https://github.com/mishoo/UglifyJS 35 | """ 36 | 37 | 38 | import re 39 | import sys 40 | from typing import AnyStr 41 | 42 | 43 | raw_extensions = ['htm', 'html', 'svg'] 44 | image_extensions = ['bmp', 'gif', 'jfif', 'jpe', 'jpeg', 'jpg', 'png', 'webp'] 45 | 46 | 47 | default_aliases = ''' 48 | D = document 49 | A = (e, d) => e.setAttribute('style', d) 50 | B = document.body 51 | C = (e, c) => e.appendChild(c) 52 | E = (e='div') => document.createElement(e) 53 | F = String 54 | G = 'target' 55 | H = 'innerHTML' 56 | I = setInterval 57 | J = clearInterval 58 | K = e => e.codePointAt() 59 | L = 'length' 60 | M = Math 61 | N = speechSynthesis 62 | O = setTimeout 63 | ''' 64 | 65 | literals_regex = rf'(`(?:\\.|[^`\\])*`)' 66 | 67 | 68 | def escape(s: AnyStr, escape_nul: bool = False) -> AnyStr: 69 | pattern = r'\\|`|\${' 70 | repl = r'\\\g<0>' 71 | cr = '\r' 72 | esc_cr = '\\r' 73 | nul = '\0' 74 | esc_nul = '\\0' 75 | if isinstance(s, bytes): 76 | pattern = pattern.encode() 77 | repl = repl.encode() 78 | cr = cr.encode() 79 | esc_cr = esc_cr.encode() 80 | s = re.sub(pattern, repl, s).replace(cr, esc_cr) 81 | if escape_nul: 82 | s = s.replace(nul, esc_nul) 83 | return s 84 | 85 | 86 | def safe_encode(s: str, encoding: str, get_back_unused: bool = False) -> bytes: 87 | encoding = encoding.lower() 88 | out = s.encode(encoding, 'strict' if encoding.replace('-', '') == 'utf8' else 'backslashreplace') 89 | out = re.sub(rb'\\U000?([\da-f]{5,6})', rb'\\u{\1}', out) 90 | if get_back_unused and encoding == 'cp1252': 91 | out = out.replace(b'\\x81', b'\x81').replace(b'\\x8d', b'\x8d').replace(b'\\x8f', b'\x8f').replace(b'\\x90', b'\x90').replace(b'\\x9d', b'\x9d') # These actually do not require escaping in HTML 92 | return out 93 | 94 | 95 | def get_len(s: AnyStr, encoding: str) -> int: 96 | return len(safe_encode(s, encoding) if isinstance(s, str) else s) 97 | 98 | 99 | def uglify(script: AnyStr, 100 | aliases: str = default_aliases, 101 | replace_quoted: bool = True, 102 | min_cnt: int = 2, 103 | prevent_grow: bool = True, 104 | add_used_aliases: bool = True, 105 | encoding: str = 'utf8', 106 | ) -> AnyStr: 107 | orig_len = get_len(script, encoding) 108 | shorts = set() 109 | for alias in reversed(aliases.strip().splitlines()): 110 | 
alias = alias.replace(' ', '') 111 | if not alias: 112 | continue 113 | short, long = alias.split('=', 1) 114 | assert short not in shorts, short 115 | shorts.add(short) 116 | prefix = '' 117 | comma = '' 118 | if re.search(r'(\b\w+\b)[^>]*=>[^.]*\b\1\.', long): 119 | prefix = r'(\w([\w.]|\[[^[\]]+\])*)\.' 120 | if re.search('[^,]+,[^>]+=>', long): 121 | comma = ',' 122 | long = re.sub(r'[^>]*(?P\b\w+\b)[^>]*=>[^.]*\b(?P=prefix)\.|[^>]+=>|\([^,)]*\)|,.*', '', long) 123 | if prefix: 124 | short += '(\\1' 125 | if '(' not in long: 126 | long += '(' 127 | short += comma 128 | long = prefix + re.sub('[\'"]', '[\'"]', re.escape(long)) 129 | elif long[0] == long[-1] in '\'"': 130 | short = lambda x, short=short, long=long: f"{'[' * (len(x[0]) < len(long))}{short}{']' * (len(x[0]) < len(long))}" 131 | long = f'\\.{long[1:-1]}' + re.sub('[\'"]', '[\'"]', f'|{long}') * replace_quoted 132 | if re.match('\\w', long[0]): 133 | long = f'\\b{long}' 134 | if re.match('\\w', long[-1]): 135 | long += '\\b' 136 | if isinstance(script, bytes): 137 | long = safe_encode(long, encoding) 138 | if isinstance(short, str): 139 | short = safe_encode(short, encoding) 140 | else: 141 | short = lambda x, short=short: safe_encode(short(x), encoding) 142 | sub = script[:0] 143 | cnt = 0 144 | parts = re.split(safe_encode(literals_regex, encoding) if isinstance(script, bytes) else literals_regex, script) 145 | for i, part in enumerate(parts): 146 | if i % 2 == 0: 147 | part, c = re.subn(long, short, part) 148 | cnt += c 149 | sub += part 150 | if cnt >= min_cnt: 151 | if add_used_aliases: 152 | alias += '\n' 153 | if isinstance(sub, bytes): 154 | alias = safe_encode(alias, encoding) 155 | if alias not in sub: 156 | sub = alias + sub.lstrip() 157 | if not prevent_grow or get_len(sub, encoding) < get_len(script, encoding): 158 | script = sub 159 | new_len = get_len(script, encoding) 160 | if new_len > orig_len: 161 | print(f'Warning: uglified size increased: {new_len} B > {orig_len} B', file=sys.stderr) 162 | return script 163 | 164 | 165 | def html_wrap(script: AnyStr, 166 | aliases: str = default_aliases, 167 | replace_quoted: bool = True, 168 | min_cnt: int = 2, 169 | prevent_grow: bool = True, 170 | lang: str = '', 171 | encoding: str = 'utf8', 172 | mobile: bool = False, 173 | title: str = '', 174 | ) -> AnyStr: 175 | html_lang = f'' * bool(lang) 176 | encoding = encoding.lower() 177 | if encoding == 'utf-8': 178 | encoding = 'utf8' 179 | elif encoding in ['cp1252', 'latin1']: 180 | encoding = 'l1' # HTML5 treats these the same 181 | mobile_meta = '' * mobile 182 | title_element = f'{title}' * bool(title) 183 | html_header = f'{html_lang}{mobile_meta}{title_element}' 185 | sep = '' 186 | if isinstance(script, bytes): 187 | html_header = safe_encode(html_header, encoding) 188 | html_footer = safe_encode(html_footer, encoding) 189 | sep = safe_encode(sep, encoding) 190 | if aliases: 191 | script = uglify(script, aliases, replace_quoted, min_cnt, prevent_grow, encoding=encoding) 192 | return sep.join([html_header, script.strip(), html_footer]) 193 | -------------------------------------------------------------------------------- /ZTML.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "private_outputs": true, 7 | "provenance": [], 8 | "collapsed_sections": [], 9 | "authorship_tag": "ABX9TyOZ/X56cwNPCb8Cs4lZTsRx", 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 
| "display_name": "Python 3" 15 | }, 16 | "language_info": { 17 | "name": "python" 18 | } 19 | }, 20 | "cells": [ 21 | { 22 | "cell_type": "markdown", 23 | "metadata": { 24 | "id": "view-in-github", 25 | "colab_type": "text" 26 | }, 27 | "source": [ 28 | "\"Open" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "source": [ 34 | "# ZTML\n", 35 | "\n", 36 | "### Extreme inline text compression for HTML / JS\n", 37 | "### By [Eyal Gruss](https://eyalgruss.com) ([@eyaler](https://twitter.com/eyaler)\\)\n", 38 | "\n", 39 | "Repo: [github.com/eyaler/ztml](https://github.com/eyaler/ztml)\n", 40 | "\n", 41 | "Shortcut to Colab: [bit.ly/ztml1](https://bit.ly/ztml1)" 42 | ], 43 | "metadata": { 44 | "id": "V__-3LfHyt5l" 45 | } 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "cellView": "form", 52 | "id": "kKLXYZNYynrz" 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "#@title Setup\n", 57 | "%cd /content\n", 58 | "!git clone -q https://github.com/eyaler/ztml\n", 59 | "!pip -q install -r ztml/requirements.txt" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "source": [ 65 | "#@title Enter text or HTML code\n", 66 | "#@markdown Important: for HTML tick `raw` below\n", 67 | "from IPython.display import display\n", 68 | "from ipywidgets import Layout, Textarea\n", 69 | "try:\n", 70 | " text = textarea.value\n", 71 | "except NameError:\n", 72 | " text = ''\n", 73 | "textarea = Textarea(value=text, placeholder='Type something', description='Text:', layout=Layout(width='90%', height='200px'))\n", 74 | "display(textarea)" 75 | ], 76 | "metadata": { 77 | "cellView": "form", 78 | "id": "Z9RJOcFL_HEw" 79 | }, 80 | "execution_count": null, 81 | "outputs": [] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "source": [ 86 | "#@title or Upload text or HTML or image file\n", 87 | "#@markdown Warning: will clear any input to above textarea\n", 88 | "from google.colab import files\n", 89 | "%cd /content\n", 90 | "try:\n", 91 | " files.upload_file('input_file')\n", 92 | "except ValueError:\n", 93 | " pass\n", 94 | "else:\n", 95 | " try:\n", 96 | " textarea.value = ''\n", 97 | " except NameError:\n", 98 | " pass" 99 | ], 100 | "metadata": { 101 | "cellView": "form", 102 | "id": "pzlcSOpCGFXy" 103 | }, 104 | "execution_count": null, 105 | "outputs": [] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "source": [ 110 | "#@title Compress!\n", 111 | "#@markdown Warning: `bitdepth` of `8-bit`, `24-bit` do not work on Safari\n", 112 | "import os\n", 113 | "output_filename = 'index.html' #@param {type: 'string'}\n", 114 | "input_encoding = '' #@param {type: 'string'}\n", 115 | "reduce_whitespace = False #@param {type: 'boolean'}\n", 116 | "unix_newline = True #@param {type: 'boolean'}\n", 117 | "fix_punct = False #@param {type: 'boolean'}\n", 118 | "remove_bom = True #@param {type: 'boolean'} \n", 119 | "caps = 'auto' #@param ['auto', 'lower', 'raw', 'simple', 'upper']\n", 120 | "bwtsort = True #@param {type: 'boolean'}\n", 121 | "mtf = '0' #@param ['none', 0, 1, 2, 50, 52, 60, 70, 80, 90]\n", 122 | "bitdepth = 1 #@param [1, 8, 24]\n", 123 | "ect = False #@param {type: 'boolean'}\n", 124 | "bin2txt = 'crenc' #@param ['base64', 'base125', 'crenc']\n", 125 | "element_id = '' #@param {type: 'string'}\n", 126 | "raw = True #@param {type: 'boolean'}\n", 127 | "image = False #@param {type: 'boolean'}\n", 128 | "js = False #@param {type: 'boolean'}\n", 129 | "uglify = True #@param {type: 'boolean'}\n", 130 | "replace_quoted = True #@param {type: 'boolean'}\n", 131 | "lang = 
'' #@param {type: 'string'}\n", 132 | "mobile = False #@param {type: 'boolean'}\n", 133 | "title = '' #@param {type: 'string'}\n", 134 | "text_var = 't' #@param {type: 'string'}\n", 135 | "\n", 136 | "if ect:\n", 137 | " try:\n", 138 | " have_ect_lib\n", 139 | " except NameError:\n", 140 | " !add-apt-repository -y ppa:ubuntu-toolchain-r/test\n", 141 | " !apt upgrade libstdc++6\n", 142 | " have_ect_lib = True\n", 143 | "\n", 144 | "%cd /content\n", 145 | "input_filename = 'input_file'\n", 146 | "try:\n", 147 | " if textarea.value:\n", 148 | " with open(input_filename, 'wb') as f:\n", 149 | " f.write(textarea.value.encode())\n", 150 | " print('Using input to textarea')\n", 151 | " else:\n", 152 | " print('Using uploaded file')\n", 153 | "except NameError:\n", 154 | " print('Using uploaded file')\n", 155 | "reduce_whitespace_arg = '--reduce_whitespace' * reduce_whitespace\n", 156 | "unix_newline_arg = '--skip_unix_newline' * (not unix_newline)\n", 157 | "fix_punct_arg = '--fix_punct' * fix_punct\n", 158 | "remove_bom_arg = '--skip_remove_bom ' * (not remove_bom)\n", 159 | "bwtsort_arg = '--skip_bwtsort ' * (not bwtsort)\n", 160 | "ect_arg = '--ect' * ect\n", 161 | "raw_arg = '--raw' * raw\n", 162 | "image_arg = '--image' * image\n", 163 | "js_arg = '--js' * js\n", 164 | "uglify_arg = '--skip_uglify' * (not uglify)\n", 165 | "replace_quoted_arg = '--skip_replace_quoted' * (not replace_quoted)\n", 166 | "mobile_arg = '--mobile' * mobile\n", 167 | "!python ztml/ztml/ztml.py \"$input_filename\" \"$output_filename\" --input_encoding $input_encoding $reduce_whitespace_arg $unix_newline_arg $fix_punct_arg $remove_bom_arg --caps $caps $bwtsort_arg --mtf $mtf --bitdepth $bitdepth $ect_arg --bin2txt $bin2txt --element_id $element_id $raw_arg $image_arg $js_arg $uglify_arg $replace_quoted_arg --lang $lang $mobile_arg --title $title --text_var $text_var\n", 168 | "input_size = os.path.getsize(input_filename)\n", 169 | "output_size = os.path.getsize(output_filename)\n", 170 | "print(f'{input_size:,} B -> {output_size:,} B ({output_size / input_size * 100 :.1f}%)')" 171 | ], 172 | "metadata": { 173 | "id": "qg-KcsfG0CpP", 174 | "cellView": "form" 175 | }, 176 | "execution_count": null, 177 | "outputs": [] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "source": [ 182 | "#@title Download output file\n", 183 | "from google.colab import files\n", 184 | "if bin2txt == 'crenc':\n", 185 | " print(f'Note: {output_filename} is encoded in cp1252, which some editors might break')\n", 186 | "files.download(output_filename)" 187 | ], 188 | "metadata": { 189 | "cellView": "form", 190 | "id": "3C9EVO8sFyA0" 191 | }, 192 | "execution_count": null, 193 | "outputs": [] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "source": [ 198 | "#@title Display output as hex dump\n", 199 | "from IPython.display import HTML\n", 200 | "with open(output_filename, 'rb') as f:\n", 201 | " hex = '0x' + f.read().hex()\n", 202 | "print(hex)\n", 203 | "HTML(f\"\")" 204 | ], 205 | "metadata": { 206 | "id": "v0GwtZtnTprz", 207 | "cellView": "form" 208 | }, 209 | "execution_count": null, 210 | "outputs": [] 211 | } 212 | ] 213 | } -------------------------------------------------------------------------------- /ztml/text_prep.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from typing import Optional, Tuple 3 | 4 | import regex 5 | 6 | if not __package__: 7 | import default_vars 8 | else: 9 | # noinspection PyPackages 10 | from . 
import default_vars 11 | 12 | 13 | newline = r'\n\v\f\r\x85\u2028' 14 | single_quote = '[\u2018-\u201b\u05f3\uff07]' 15 | double_quote = '[\u201c-\u201f\u05f4\uff02]' 16 | apos = "['’]" # \\uff07 17 | eos = '[!.?]' # r'\uff01\uff0e\uff1f\ufe52\ufe56\ufe57' 18 | nonword = r'\p{L}\p{M}\p{N}' 19 | caps_modes = ['auto', 'lower', 'raw', 'simple', 'upper'] 20 | default_caps = 'auto' 21 | 22 | 23 | def normalize(text: str, 24 | reduce_whitespace: bool = False, 25 | unix_newline: bool = True, 26 | fix_punct: bool = False, 27 | strip_bom: bool = True 28 | ) -> str: 29 | if reduce_whitespace: 30 | text = regex.sub(rf'\s*[{newline}]\s*[{newline}]\s*', '\n\n', text.replace('\u2029', '\n\n')) 31 | text = regex.sub(rf'[^\S{newline}]*[{newline}][^\S{newline}]*', '\n', text) 32 | text = regex.sub(rf'[^\S{newline}]+', ' ', text) 33 | text = text.strip() 34 | elif unix_newline: 35 | text = regex.sub('\r\n?', '\n', text) 36 | if fix_punct: 37 | text = regex.sub('\\p{Pd}', '-', text) 38 | text = regex.sub(single_quote, "'", text) 39 | text = regex.sub(double_quote, '"', text) 40 | text = regex.sub('\u2026', '...', text) 41 | if strip_bom: 42 | text = regex.sub('^\ufeff', '', text) 43 | return text 44 | 45 | 46 | caps_regex = rf'(((?=(\r\n|[{newline}]))\3){{2,}}|\u2029|^|{eos})\P{{L}}*.|(^|[^{nonword}])i(?![{nonword}])' # Avoid lookbehind to support Safari 47 | 48 | 49 | def decode_caps_simple(text: str) -> str: 50 | return regex.sub(caps_regex, lambda m: m[0].upper(), text) 51 | 52 | 53 | def encode_caps(text: str, caps: str = default_caps) -> str: 54 | assert caps in caps_modes, f"Error: caps='{caps}' not in {caps_modes}" 55 | return text if caps == 'raw' else text.upper() if caps == 'upper' else text.lower() 56 | 57 | 58 | def remove_the(text: str) -> str: 59 | the_str = 'THE' if text == text.upper() else 'the' 60 | return regex.sub(f'(^(?!{the_str}$)| ){the_str}( |$)', r'\1\2', text, flags=regex.MULTILINE) 61 | 62 | 63 | def get_qu_regex(next_letter_case: str, u_caps: Optional[bool] = None) -> str: 64 | u = 'U' if u_caps or u_caps is None and next_letter_case == 'u' else 'u' 65 | return f'(?={apos}?[^{u}\\P{{L{next_letter_case}}}])' 66 | 67 | 68 | def encode_quq(text: str) -> str: 69 | text = regex.sub(f"([Qq])u{get_qu_regex('l')}", '\\1', text) 70 | return regex.sub(f"QU{get_qu_regex('')}", 'Q', text) 71 | 72 | 73 | def decode_quq(text: str, caps: str) -> str: 74 | if caps == 'raw': 75 | text = regex.sub(f"[Qq]{get_qu_regex('l')}", '\\g<0>u', text) 76 | text = regex.sub(f"Q{get_qu_regex('u')}", 'QU', text) 77 | elif caps == 'upper': 78 | text = regex.sub(f"Q{get_qu_regex('', u_caps=True)}", 'QU', text) 79 | else: 80 | text = regex.sub(f"q{get_qu_regex('')}", 'qu', text) 81 | return text 82 | 83 | 84 | def get_quq_js_decoder(caps: str) -> str: 85 | if caps == 'raw': 86 | js_decoder = f".replace(/[Qq]{get_qu_regex('l')}/gu,'$&u').replace(/Q{get_qu_regex('u')}/gu,'QU')" 87 | elif caps == 'upper': 88 | js_decoder = f".replace(/Q{get_qu_regex('', u_caps=True)}/gu,'QU')" 89 | else: 90 | js_decoder = f".replace(/q{get_qu_regex('')}/gu,'qu')" 91 | return js_decoder 92 | 93 | 94 | def count_bad_quq(text: str, caps: str, verbose: bool = False) -> int: 95 | text = encode_caps(text, caps) 96 | recon = decode_quq(encode_quq(text), caps) 97 | text = regex.split('[Qq]', text) 98 | recon = regex.split('[Qq]', recon) 99 | cnt = sum(a != b for a, b in zip(recon, text)) + abs(len(recon) - len(text)) 100 | if verbose and cnt: 101 | print(f'Warning: found {cnt} cases of q followed by a non u, or terminal qu', file=sys.stderr) 
102 | return cnt 103 | 104 | 105 | def encode_with_fallbacks(text: str, 106 | caps: str = default_caps, 107 | the: bool = True, 108 | quq: bool = True, 109 | caps_fallback: bool = True, 110 | the_fallback: bool = True, 111 | quq_fallback: bool = True, 112 | verbose: bool = False 113 | ) -> Tuple[str, str, bool, bool]: 114 | if caps_fallback: 115 | if caps == 'auto' and text != decode_caps_simple(encode_caps(text, caps)): 116 | caps = 'raw' 117 | if verbose: 118 | print(f"Falling back to caps='{caps}'", file=sys.stderr) 119 | if caps == 'raw': 120 | if text == text.lower(): 121 | caps = 'lower' 122 | elif text == text.upper(): 123 | caps = 'upper' 124 | text = encode_caps(text, caps) 125 | 126 | if the: 127 | theless = remove_the(text) 128 | if the_fallback: 129 | if theless == text: 130 | the = False 131 | if the and regex.search('^ | | $', text, regex.MULTILINE): 132 | the = False 133 | if verbose: 134 | print(f'Falling back to the={the}', file=sys.stderr) 135 | if the: 136 | text = theless 137 | 138 | if quq: 139 | quless = encode_quq(text) 140 | if quq_fallback: 141 | if len(text) - len(quless) < len(get_quq_js_decoder(caps)): 142 | quq = False 143 | if quq and count_bad_quq(text, caps, verbose): 144 | quq = False 145 | if verbose: 146 | print(f'Falling back to quq={quq}', file=sys.stderr) 147 | if quq: 148 | text = quless 149 | 150 | return text, caps, the, quq 151 | 152 | 153 | def get_js_decoder(text: Optional[str] = None, 154 | caps: str = default_caps, 155 | the: bool = True, 156 | quq: bool = True, 157 | text_var: str = default_vars.text 158 | ) -> str: 159 | assert caps in caps_modes, f"Error: caps='{caps}' not in {caps_modes}" 160 | if text is not None: 161 | text, caps, the, quq = encode_with_fallbacks(text, caps, the, quq) 162 | js_decoder = '' 163 | if quq: 164 | js_decoder += get_quq_js_decoder(caps) 165 | if the: 166 | the_str = 'THE' if caps == 'upper' else 'the' 167 | js_decoder += f".replace(/(^(?!$)| )( |$)/gm,'$1{the_str}$2')" 168 | if caps in ['auto', 'simple']: 169 | js_decoder += f'.replace(/{caps_regex}/gu,m=>m.toUpperCase())' 170 | if js_decoder: 171 | js_decoder = f'{text_var}={text_var}{js_decoder}\n' 172 | return js_decoder 173 | 174 | 175 | def encode_and_get_js_decoder(text: str, 176 | caps: str = default_caps, 177 | the: bool = True, 178 | quq: bool = True, 179 | caps_fallback: bool = True, 180 | the_fallback: bool = True, 181 | quq_fallback: bool = True, 182 | verbose: bool = False, 183 | text_var: str = default_vars.text 184 | ) -> Tuple[str, str]: 185 | text, caps, the, quq = encode_with_fallbacks(text, caps, the, quq, caps_fallback, the_fallback, quq_fallback, verbose) 186 | return text, get_js_decoder(caps=caps, the=the, quq=quq, text_var=text_var) 187 | 188 | 189 | def test_quq() -> None: 190 | bad = 0 191 | for caps in caps_modes: 192 | for q in 'Qq': 193 | for u in ['U', 'u', ' ', "' "]: 194 | for a in "AaUu'’ ": 195 | for b in 'Bb ': 196 | orig = f'{q}{u}{a}{b}' 197 | text, new_caps, _, _ = encode_with_fallbacks(orig, caps, the=False, quq=False) 198 | enc = encode_quq(text) 199 | dec = decode_quq(text, new_caps) 200 | if text != dec: 201 | print(f'caps={caps:>6}->{new_caps:>5}: orig={orig} -> text={text} -> enc={enc} -> dec={dec}', file=sys.stderr) 202 | bad += 1 203 | print(f'Found {bad} bad qu cases', file=sys.stderr) 204 | 205 | 206 | if __name__ == '__main__': 207 | test_quq() 208 | -------------------------------------------------------------------------------- /ztml/deflate.py: 
-------------------------------------------------------------------------------- 1 | """PNG / DEFLATE encoding optimized for arbitrary data compression 2 | 3 | Encoding data as a PNG image allows efficient DEFLATE compression (similar to ZIP), 4 | while allowing use of the browser's native decompression capability for free, 5 | thus saving the need for an additional decoder, AKA PNG bootstrapping. 6 | The data is then read from the HTML canvas element. 7 | The image aspect ratio is optimized to be squarish (for higher browser compatibility) with minimal padding. 8 | We do not use the alpha channel due to the browser's alpha pre-multiplication in Canvas 2D causing inaccuracies. 9 | In Safari, even without an alpha channel, similar inaccuracies prevent using 8-bit and 24-bit depths for PNGs. 10 | By default, we use Google's optimized Zopfli compression which is compatible with DEFLATE decompression. 11 | Alternatively, you can use ECT which can be beneficial for large texts (but may slightly hurt smaller ones) 12 | (e.g. ECT 0.9.4 gave 1.4% overall improvement over Zopfli on 2600.txt and minibook) 13 | A minimalistic JS decoder code is generated. 14 | 15 | Other experiments: 16 | 8-bit and 24-bit (RGB) give similar overall results to 1-bit (but do not work on Safari) 17 | WEBP gave worse overall results (libwebp/cwebp from 8-bit and 24-bit PNG, but does seem to work on Safari). 18 | 19 | References: 20 | https://web.archive.org/web/20090826082743/http://blog.nihilogic.dk:80/2008/05/compression-using-canvas-and-png.html 21 | https://web.archive.org/web/20130310075429/http://daeken.com/superpacking-js-demos 22 | https://web.archive.org/web/20130219050720/http://alexle.net/archives/306 23 | https://www.iamcal.com/png-store 24 | https://github.com/iamcal/PNGStore 25 | http://bwirl.blogspot.com/2011/11/optimize-web-apps-with-png.html 26 | https://gist.github.com/gasman/2560551 (pnginator) 27 | https://www.pouet.net/prod.php?which=59298 (JsExe) 28 | https://www.pouet.net/topic.php?which=8770 29 | https://github.com/codegolf/zpng 30 | https://github.com/xem/miniBook 31 | https://github.com/google/zopfli 32 | https://github.com/hattya/zopflipy 33 | https://github.com/fhanau/Efficient-Compression-Tool (ECT) 34 | https://encode.su/threads/2274-ECT-an-file-optimizer-with-fast-zopfli-like-deflate-compression 35 | https://stackoverflow.com/questions/60074569/html-canvas-returns-off-by-some-bytes-from-getimagedata 36 | https://stackoverflow.com/questions/23497925/how-can-i-stop-the-alpha-premultiplication-with-canvas-imagedata/#60564905 37 | https://github.com/jhildenbiddle/canvas-size#test-results 38 | https://pqina.nl/blog/canvas-area-exceeds-the-maximum-limit 39 | https://bugs.webkit.org/show_bug.cgi?id=230855 40 | """ 41 | 42 | 43 | from io import BytesIO 44 | import math 45 | import os 46 | import platform 47 | import sys 48 | from tempfile import NamedTemporaryFile 49 | from typing import List, Iterable, Optional 50 | 51 | import png 52 | # noinspection PyPackageRequirements 53 | import zopfli 54 | 55 | if not __package__: 56 | import default_vars 57 | else: 58 | # noinspection PyPackages 59 | from . 
import default_vars 60 | 61 | 62 | max_dim = 32767 63 | max_len = 11180 ** 2 64 | allowed_bitdepths = [1, 8, 24] # Warning: 8-bit and 24-bit do not work on Safari 65 | default_bitdepth = 1 66 | 67 | 68 | def to_png(bits: Iterable[int], 69 | bitdepth: int = default_bitdepth, # 1, 8, 24 70 | compression: Optional[int] = 9, 71 | ect: bool = False, # This will override zop settings 72 | ect_compression: int = 20009, 73 | ect_filters: str = 'allfilters', # 'allfilters', 'allfilters-b' (brute), 'allfilters-c' (cheap) or '' 74 | zop_filters: str = '', # Any subset of 01234mepb or '' for auto 75 | zop_iterations: int = 15, 76 | zop_iterations_large: int = 5, 77 | omit_iend: bool = True, 78 | filename: str = '', 79 | verbose: bool = False) -> bytes: 80 | data = list(bits) 81 | bit_len = len(data) 82 | assert bit_len 83 | assert bitdepth in allowed_bitdepths, f'Error: bitdepth={bitdepth} not in {allowed_bitdepths}' 84 | assert compression is None or -1 <= compression <= 9 85 | pad_bits = (bitdepth - bit_len) % bitdepth 86 | if bitdepth > 1: 87 | data += [data[-1]] * pad_bits 88 | data = [int(''.join(str(b) for b in data[i : i + bitdepth]), 2) for i in range(0, len(data), bitdepth)] 89 | width = height = pad_pixels = 0 90 | length = None 91 | while width * height != length: 92 | if length is not None: 93 | data.append(data[-1]) 94 | pad_pixels += 1 95 | length = len(data) 96 | assert length <= max_len, f'Error: length={length:,} > max_len={max_len:,}' 97 | height = int(math.sqrt(length)) 98 | while length % height and height > 1 and length // (height-1) <= max_dim: 99 | height -= 1 100 | width = length // height 101 | assert width <= max_dim, f'Error: width={width:,} > max_dim={max_dim:,}' 102 | width_with_channels = width 103 | length_with_channels = length 104 | if bitdepth > 8: 105 | data = [b for i in data for b in i.to_bytes(bitdepth // 8, 'big')] 106 | width_with_channels *= bitdepth // 8 107 | length_with_channels *= bitdepth // 8 108 | data = [data[i : i + width_with_channels] for i in range(0, length_with_channels, width_with_channels)] 109 | png_data = BytesIO() 110 | png.Writer(width, height, greyscale=bitdepth <= 8, 111 | bitdepth=1 if bitdepth == 1 else 8, 112 | compression=compression).write(png_data, data) 113 | png_data.seek(0) 114 | png_data = png_data.read() 115 | out = png_data 116 | 117 | if ect: 118 | with NamedTemporaryFile(suffix='.png', delete=False) as f: # See https://github.com/python/cpython/issues/88221 119 | f.write(out) 120 | filename = f.name 121 | ect_filters_arg = f'--{ect_filters}' * bool(ect_filters) 122 | ect_path = os.path.normpath(os.path.join(os.path.dirname(__file__), '..', 'ect', 'ect')) + '-ubuntu' * (platform.system() == 'Linux') 123 | error = os.system(f'{ect_path} -{ect_compression} -strip -quiet --strict {ect_filters_arg} --mt-deflate {filename}') # Time-consuming op. 124 | assert not error, f'Error: could not run {ect_path} - Please install from https://github.com/fhanau/Efficient-Compression-Tool or use ect=False' 125 | with open(filename, 'rb') as f: 126 | out = f.read() 127 | try: 128 | os.remove(filename) 129 | except PermissionError: 130 | pass 131 | elif zop_iterations > 0 and zop_iterations_large > 0: 132 | out = zopfli.ZopfliPNG(filter_strategies=zop_filters, 133 | iterations=zop_iterations, 134 | iterations_large=zop_iterations_large 135 | ).optimize(png_data) # Time-consuming op. 136 | if omit_iend: # Warning: do this only for PNG files 137 | out = out[:-12] # IEND length (4 bytes) + IEND tag (4 bytes) + IEND CRC-32 (4 bytes). 
Note: do not omit the IDAT zlib Adler-32 or the IDAT CRC-32 as this will break Safari 138 | if verbose: 139 | print(f'input_bits={bit_len} pad_bits={pad_bits} width={width} height={height} pad_pixels={pad_pixels} total_pad_bits={length*bitdepth - bit_len} bits={length * bitdepth} bytes={length*bitdepth+7 >> 3} png={len(png_data)} final={len(out)}', file=sys.stderr) 140 | if filename: 141 | with open(filename, 'wb') as f: 142 | f.write(out) 143 | return out 144 | 145 | 146 | encode = to_png 147 | 148 | 149 | def load_png(filename: str) -> List[int]: 150 | return png.Reader(filename=filename).read_flat()[2].tolist() 151 | 152 | 153 | def get_js_create_image(image_var: str = default_vars.image, 154 | bytearray_var: str = default_vars.bytearray 155 | ) -> str: 156 | return f'''{image_var}=new Image 157 | {image_var}.src=URL.createObjectURL(new Blob([{bytearray_var}])) 158 | ''' 159 | 160 | 161 | def get_js_image_data(bit_len: int, 162 | decoder_script: str = '', 163 | bitdepth: int = default_bitdepth, 164 | image_var: str = default_vars.image, 165 | bitarray_var: str = default_vars.bitarray 166 | ) -> str: 167 | assert bitdepth in allowed_bitdepths, f'Error: bitdepth={bitdepth} not in {allowed_bitdepths}' 168 | js_image_data = f'''{image_var}.decode().then(c=>{{ 169 | c=document.createElement`canvas` 170 | x=c.getContext`2d` 171 | c=[c.width={image_var}.width,c.height={image_var}.height] 172 | x.drawImage({image_var},0,0) 173 | s=x.getImageData({bitarray_var}=[],0,...c).data{'.filter((v,i)=>(i+1)%4)' * (bitdepth == 24)} 174 | ''' 175 | if bitdepth == 1: 176 | js_image_data += f'for(j={bit_len};j--;){bitarray_var}[j]=s[j*4]>>7&1\n' # Applying >>7 to deal with Safari PNG rendering inaccuracy 177 | else: # Will break Safari 178 | js_image_data += f'''for(j={(bit_len+(bitdepth-bit_len)%bitdepth) // 8};j--;)for(k=8;k--;){bitarray_var}[j*8+k]=s[j{'*4' * (bitdepth <= 8)}]>>7-k&1 179 | {bitarray_var}.length={bit_len} 180 | ''' 181 | js_image_data += f'{decoder_script.strip()}}})' 182 | return js_image_data 183 | 184 | 185 | def get_js_image_decoder(bit_len: int, 186 | decoder_script: str = '', 187 | bitdepth: int = default_bitdepth, 188 | image_var: str = default_vars.image, 189 | bytearray_var: str = default_vars.bytearray, 190 | bitarray_var: str = default_vars.bitarray 191 | ) -> str: 192 | return get_js_create_image(image_var, bytearray_var) + get_js_image_data( 193 | bit_len, decoder_script, bitdepth, image_var, bitarray_var) 194 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Open In Colab 2 | 3 | # ZTML 4 | 5 | ### Extreme inline text compression for HTML / JS 6 | ### By [Eyal Gruss](https://eyalgruss.com) ([@eyaler](https://twitter.com/eyaler)) 7 | 8 | #### Partially made at [Stochastic Labs](http://stochasticlabs.org) 9 | 10 | On-chain media storage can require efficient compression for text embedded inline in HTML / JS. 11 | ZTML is a custom pipeline that generates stand-alone HTML or JS files which embed competitively compressed self-extracting text, with file sizes of 25% - 40% the original. 12 | These file sizes include the decoder code which is a highly golfed 1 - 1.5 kB (including auxiliary indices and tables). 13 | The approach makes sense and is optimized for small texts (tens of kB), but performs quite well also on large texts. 
14 | The pipeline includes original low-overhead [binary-to-text alternatives](https://en.wikipedia.org/wiki/Binary-to-text_encoding) to Base64 which are also useful for inline images. 15 | 16 | You can find a very high-level overview in these [slides](misc/reversim2022_slides.pdf) from this [5-minute talk](https://www.youtube.com/watch?v=7rz_MfAIJnY) (in Hebrew) at [Reversim Summit 2022](https://summit2022.reversim.com), and some more technical highlights and discussion in the [encode.su forum thread](https://encode.su/threads/3973-ZTML-Extreme-inline-text-compression-for-HTML-JS). 17 | 18 | ### Benchmark 19 | | | File format | [Micromegas (En)](https://gutenberg.org/files/30123/30123-8.txt) | [War and Peace (En)](https://gutenberg.org/files/2600/2600-0.txt) | 20 | |---------------------------------------------------------------------------------------|---------------|------------------------------------------------------------------|-------------------------------------------------------------------| 21 | | Project Gutenberg plain text utf8 | txt | 63.7 kB | 3.2 MB | 22 | | [paq8px_v206fix1](http://www.mattmahoney.net/dc/text.html#1250) -12RT (excl. decoder) | paq | 13.3 kB (21%) | 575 kB (18%) | 23 | | 7-Zip 22.01 9 Ultra PPMd (excl. decoder) | 7z | 20.8 kB (32%) | 746 kB (23%) | 24 | | 7-Zip 22.01 9 Ultra PPMd (self-extracting) | exe | 232 kB (364%) | 958 kB (29%) | 25 | | Zstandard 1.5.2 -22 --ultra (excl. decoder) | zst | 23.4 kB (37%) | 921 kB (28%) | 26 | | [Roadroller](https://github.com/lifthrasiir/roadroller) 2.1.0 -O2 | js | 26.5 kB (42%) | 1.0 MB (30%) | 27 | | **ZTML Base125** | html (utf8) | 26.4 kB (41%) `mtf=0` | 902 kB (28%) `mtf=80` `ect=True` | 28 | | **ZTML crEnc** | html (cp1252) | 23.5 kB (37%) `mtf=0` | 803 kB (24%) `mtf=80` `ect=True` | 29 | 30 | ### Installation 31 | ``` 32 | git clone https://github.com/eyaler/ztml 33 | pip install -r ztml/requirements.txt 34 | ``` 35 | For running validations, you also need to have Chrome, Edge and Firefox installed. 36 | 37 | ### Usage 38 | A standard simplified pipeline can be run by calling `ztml()`: 39 | ``` 40 | from ztml import ztml 41 | ztml.ztml('Input text that is much longer than this one!', 'output.html') 42 | ``` 43 | or running `ztml.py` from the command line (CLI): 44 | ``` 45 | python ztml/ztml.py input.txt output.html 46 | ``` 47 | See [ztml.py](ztml/ztml.py). 48 | Of course, there is also an accessible [Google Colab](https://colab.research.google.com/github/eyaler/ztml/blob/main/ZTML.ipynb) with a simple GUI. Shortcut: [bit.ly/ztml1](https://bit.ly/ztml). 49 | 50 | [crEnc](ztml/crenc.py) gives better compression but requires setting the HTML or JS charset to cp1252. 51 | [Base125](ztml/base125.py) is the second-best option if one must stick with utf8. 52 | 53 | See [example.py](example.py) for a complete example reproducing the ZTML results in the above benchmark, 54 | and [example_image.py](example_image.py) for an example of encoding inline images, by using `image=True` or passing a file with a supported image extension to the CLI. 55 | Outputs of these runs can be accessed at [eyalgruss.com/ztml](https://eyalgruss.com/ztml). 56 | On top of the built-in validations for Chrome, Edge and Firefox, these were also manually tested on macOS Monterey 12.5 Safari 15.6, macOS Ventura 13.2 Safari 16.3 and iOS 16.0, 16.2 Safari. 57 | 58 | A quick-and-dirty way to compress an existing single-page HTML website with embedded inline media is to use `raw=True` or pass a '.html' file to the CLI. 
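For example, a minimal sketch of doing this via the Python API (the file names here are placeholders; see [minibook.py](misc/minibook.py) for a real-world use):
```
from ztml import ztml

with open('page.html', 'rb') as f:  # an existing self-contained single-page site
    out, error = ztml.ztml(f.read(), 'page.min.html', raw=True, validate=True)
assert not error  # with validate=True, ztml() returns (bytes, error_flag)
print(f'{len(out):,} B')
```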
59 | 60 | ### What this is not 61 | 1. Not an HTML inliner 62 | 2. Not an image optimizer 63 | 3. Not a full-fledged JS minifier 64 | 65 | ### Caveats 66 | 1. Files larger than a few MB might not work on [iOS Safari](https://pqina.nl/blog/canvas-area-exceeds-the-maximum-limit) or [macOS Safari 15](https://bugs.webkit.org/show_bug.cgi?id=230855). 67 | 2. This solution favors compression ratio over compression and decompression times. Use `mtf=None` for faster decompression of large files. 68 | 3. For [compressing word lists](http://golf.horse) (sorted lexicographically), solutions such as [Roadroller](https://lifthrasiir.github.io/roadroller) do a much better job. 69 | 70 | ### Pipeline and source code breakdown 71 | | | Stage | Source | Remarks | 72 | |-----|--------------------------------------------|-------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 73 | | 0 | Pipeline and CLI | [ztml.py](ztml/ztml.py) | | 74 | | 1 | Text normalization (lossy) | [text_prep.py](ztml/text_prep.py) | Reduce whitespace; substitute unicode punctuation | 75 | | 2 | Text condensation (lossless) | [text_prep.py](ztml/text_prep.py) | Lowercase with automatic capitalization; substitute common strings such as: the, qu | 76 | | 3 | Burrows–Wheeler + Move-to-front transforms | [bwt_mtf.py](ztml/bwt_mtf.py) | Alphabet pre-sorting; Various MTF variants, including some original ones; Higher MTF settings beneficial for larger texts | 77 | | 4 | Huffman encoding | [huffman.py](ztml/huffman.py) | Canonical encoding with a [codebook-free decoder](https://researchgate.net/publication/3159499_On_the_implementation_of_minimum_redundancy_prefix_codes); Beneficial as a pre-DEFLATE stage | 78 | | 5 | Burrows–Wheeler transform on bits | [bwt_mtf.py](ztml/bwt_mtf.py) | Beneficial for large texts | 79 | | 6 | PNG / DEFLATE compression | [deflate.py](ztml/deflate.py) | ZIP-like compression with native browser decompression; aspect ratio optimized for maximal compatibility and minimal padding; [Zopfli](https://github.com/google/zopfli) or [ECT](https://github.com/fhanau/Efficient-Compression-Tool) optimizations | 80 | | 7 | Binary-to-text encoding | | Embed in template strings; Fix [HTML character overrides](https://html.spec.whatwg.org/multipage/parsing.html#table-charref-overrides); Allow [dynEncode](https://github.com/eshaz/simple-yenc#what-is-dynencode)-like optimal offset | 81 | | 7a | Base125 (utf8) | [base125.py](ztml/base125.py) | An original variant of [Base122](https://blog.kevinalbs.com/base122), with 14.7% overhead | 82 | | 7b | crEnc (cp1252) | [crenc.py](ztml/crenc.py) | An original variant of [yEnc](http://www.yenc.org) with 1.2% overhead; requires single-byte charset | 83 | | 8 | Uglification | [webify.py](ztml/webify.py) | Substitute recurring JS names with short aliases | 84 | | 9 | Validation | [validation.py](ztml/validation.py) | Reproduce input content on Chrome, Edge and Firefox | 85 | 86 | Note: image encoding only uses step 0 and steps 7 and later (see the sketch below). 87 | 88 | See source files for explanations, experiments and more references. 
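Since image encoding skips the text and compression stages, it amounts to binary-to-text encoding plus HTML wrapping of the raw image bytes. A minimal sketch (file names are placeholders; [example_image.py](example_image.py) is the full example):
```
from ztml import ztml

with open('picture.png', 'rb') as f:  # raw image bytes are embedded as-is
    ztml.ztml(f.read(), 'picture.html', image=True)
```
Equivalently, passing a file with a supported image extension to the CLI (e.g. `python ztml/ztml.py picture.png picture.html`) implies `image=True`.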
89 | 90 | ### Projects using this 91 | - [fragium](https://fragium.com) 92 | - [miniBook](https://xem.github.io/miniBook) submission by Eyal Gruss ([source code](misc/minibook.py)) 93 | - [WEBZOS](https://wbtz.github.io) 94 | -------------------------------------------------------------------------------- /ect/License.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /ztml/ztml.py: -------------------------------------------------------------------------------- 1 | """ZTML - Extreme inline text compression for HTML / JS""" 2 | 3 | 4 | import argparse 5 | from base64 import b64encode 6 | import chardet 7 | import os 8 | import sys 9 | from time import time 10 | from typing import AnyStr, Optional, overload, Tuple, Union 11 | 12 | try: 13 | from typing import Literal 14 | except ImportError: 15 | from typing_extensions import Literal 16 | 17 | if not __package__: 18 | import base125, bwt_mtf, crenc, default_vars, deflate, huffman, text_prep, validation, webify 19 | else: 20 | # noinspection PyPackages 21 | from . import base125, bwt_mtf, crenc, default_vars, deflate, huffman, text_prep, validation, webify 22 | 23 | 24 | bin2txt_encodings = ['base64', 'base125', 'crenc'] 25 | default_bin2txt = 'crenc' 26 | 27 | 28 | @overload 29 | def ztml(data: AnyStr, filename: str = ..., reduce_whitespace: bool = ..., 30 | unix_newline: bool = ..., fix_punct: bool = ..., 31 | remove_bom: bool = ..., caps: str = ..., bwtsort: bool = ..., 32 | mtf: Optional[int] = ..., bitdepth: int = ..., ect: bool = ..., 33 | bin2txt: str = ..., element_id: str = ..., raw: bool = ..., 34 | image: bool = ..., js: bool = ..., uglify: bool = ..., 35 | replace_quoted: bool = ..., lang: str = ..., mobile: bool = ..., 36 | title: str = ..., text_var: str = ..., validate: Literal[False] = ..., 37 | ignore_regex: str = ..., browser: validation.BrowserType = ..., 38 | timeout: int = ..., verbose: bool = ...) -> bytes: ... 39 | 40 | 41 | @overload 42 | def ztml(data: AnyStr, filename: str = ..., reduce_whitespace: bool = ..., 43 | unix_newline: bool = ..., fix_punct: bool = ..., ect: bool = ..., 44 | remove_bom: bool = ..., caps: str = ..., bwtsort: bool = ..., 45 | mtf: Optional[int] = ..., bitdepth: int = ..., bin2txt: str = ..., 46 | element_id: str = ..., raw: bool = ..., image: bool = ..., 47 | js: bool = ..., uglify: bool = ..., replace_quoted: bool = ..., 48 | lang: str = ..., mobile: bool = ..., title: str = ..., 49 | text_var: str = ..., validate: Literal[True] = ..., 50 | ignore_regex: str = ..., browser: validation.BrowserType = ..., 51 | timeout: int = ..., verbose: bool = ...) -> Tuple[bytes, int]: ... 52 | 53 | 54 | @overload 55 | def ztml(data: AnyStr, filename: str = ..., reduce_whitespace: bool = ..., 56 | unix_newline: bool = ..., fix_punct: bool = ..., ect: bool = ..., 57 | remove_bom: bool = ..., caps: str = ..., bwtsort: bool = ..., 58 | mtf: Optional[int] = ..., bitdepth: int = ..., bin2txt: str = ..., 59 | element_id: str = ..., raw: bool = ..., image: bool = ..., 60 | js: bool = ..., uglify: bool = ..., replace_quoted: bool = ..., 61 | lang: str = ..., mobile: bool = ..., title: str = ..., 62 | text_var: str = ..., validate: bool = ..., ignore_regex: str = ..., 63 | browser: validation.BrowserType = ..., timeout: int = ..., 64 | verbose: bool = ...) -> Union[bytes, Tuple[bytes, int]]: ... 
65 | 66 | 67 | def ztml(data, 68 | filename='', 69 | reduce_whitespace=False, 70 | unix_newline=True, 71 | fix_punct=False, 72 | remove_bom=True, 73 | caps=text_prep.default_caps, 74 | bwtsort=True, 75 | mtf=bwt_mtf.default_mtf, 76 | bitdepth=deflate.default_bitdepth, 77 | ect=False, 78 | bin2txt=default_bin2txt, 79 | element_id='', 80 | raw=False, 81 | image=False, 82 | js=False, 83 | uglify=True, 84 | replace_quoted=True, 85 | lang='', 86 | mobile=False, 87 | title='', 88 | text_var=default_vars.text, 89 | validate=False, 90 | ignore_regex='', 91 | browser=validation.default_browser, 92 | timeout=validation.default_timeout, 93 | verbose=False 94 | ): 95 | start_time = time() 96 | assert bin2txt in bin2txt_encodings, f'Error: bin2txt={bin2txt} not in {bin2txt_encodings}' 97 | assert not element_id and not image or not raw 98 | if image: 99 | assert isinstance(data, bytes) 100 | image_data = data 101 | else: 102 | if isinstance(data, bytes): 103 | data = data.decode() 104 | data = text_prep.normalize(data, reduce_whitespace, unix_newline, fix_punct, remove_bom) # Reduce whitespace 105 | condensed, string_decoder = text_prep.encode_and_get_js_decoder(data, caps, text_var=text_var) # Lower case and shorten common strings 106 | bwt_mtf_text, bwt_mtf_text_decoder = bwt_mtf.encode_and_get_js_decoder(condensed, bwtsort, mtf, add_bwt_func=False, data_var=text_var) # Burrows-Wheeler + Move-to-front transforms on text. MTF is a time-consuming op. 107 | huffman_bits, huffman_decoder = huffman.encode_and_get_js_decoder(bwt_mtf_text, text_var=text_var) # Huffman encode 108 | bits, bwt_bits_decoder = bwt_mtf.encode_and_get_js_decoder(huffman_bits) # Burrows-Wheeler transform on bits 109 | if raw: 110 | writer = f'document.close(document.write({text_var}))' # document.close() needed to ensure that any style changes added after a script are applied 111 | elif element_id: 112 | writer = f'''document.body.appendChild(document.createElement`pre`).id='{element_id}' 113 | {element_id}.textContent={text_var}''' 114 | else: 115 | writer = f"document.body.style.whiteSpace='pre';document.body.textContent={text_var}" 116 | bits_decoder = f'{bwt_bits_decoder}{huffman_decoder}{bwt_mtf_text_decoder}{string_decoder}{writer}' 117 | image_data = deflate.to_png(bits, bitdepth, ect=ect) # PNG encode. Time-consuming op. 118 | 119 | encoding = 'cp1252' if bin2txt == 'crenc' else 'utf8' 120 | if bin2txt == 'base64': # This is just for benchmarking and is not recommended 121 | image_url = b'data:;base64,' + b64encode(image_data) 122 | if not image: 123 | image_decoder = f"{default_vars.image}=new Image;{default_vars.image}.src='".encode() + image_url + b"'\n" 124 | out = image_decoder + deflate.get_js_image_data(len(bits), bits_decoder, bitdepth).encode() 125 | else: 126 | if bin2txt == 'base125': 127 | bytes_decoder = base125.get_js_decoder(image_data) # Time-consuming op. when offset==None 128 | else: 129 | bytes_decoder = crenc.get_js_decoder(image_data) # Time-consuming op. 
when offset==None 130 | if image: 131 | image_url = f"'+URL.createObjectURL(new Blob([{default_vars.bytearray}]))+'".encode() 132 | else: 133 | image_decoder = deflate.get_js_image_decoder(len(bits), bits_decoder, bitdepth) 134 | out = webify.safe_encode(image_decoder, encoding, get_back_unused=True) 135 | 136 | if image: 137 | if element_id: 138 | out = f"""document.body.appendChild(new Image).id='{element_id}' 139 | {element_id}.src='""".encode() + image_url + b"'" 140 | else: 141 | out = f"document.body.style.background='url(".encode() + image_url + b")no-repeat'" 142 | 143 | if bin2txt != 'base64': 144 | out = bytes_decoder + out 145 | if os.path.splitext(filename)[-1] == '.js': 146 | js = True 147 | if js and uglify: 148 | out = webify.uglify(out, replace_quoted=replace_quoted, encoding=encoding) 149 | elif not js: 150 | out = webify.html_wrap(out, aliases=webify.default_aliases * uglify, 151 | replace_quoted=replace_quoted, lang=lang, 152 | encoding=encoding, mobile=mobile, title=title) 153 | if filename: 154 | with open(filename, 'wb') as f: 155 | f.write(out) 156 | if verbose: 157 | print(f'Encoding took {time() - start_time :,.1f} sec.', file=sys.stderr) 158 | if validate: 159 | file = webify.html_wrap(out, aliases='', encoding=encoding) if js else filename or out 160 | by = element = '' 161 | if element_id: 162 | by = 'id' 163 | element = element_id 164 | valid = validation.validate_html(file, data, caps, by, element, raw, 165 | browser, timeout, 166 | content_var=text_var, 167 | ignore_regex=ignore_regex, 168 | verbose=True) 169 | out = out, not valid 170 | return out 171 | 172 | 173 | if __name__ == '__main__': 174 | parser = argparse.ArgumentParser() 175 | parser.add_argument('input_filename') 176 | parser.add_argument('output_filename', nargs='?', default='') 177 | parser.add_argument('--input_encoding', nargs='?', const='', default='', help='Auto detect by default') 178 | parser.add_argument('--reduce_whitespace', action='store_true') 179 | parser.add_argument('--skip_unix_newline', action='store_true') 180 | parser.add_argument('--fix_punct', action='store_true') 181 | parser.add_argument('--skip_remove_bom', action='store_true') 182 | parser.add_argument('--caps', type=str.lower, choices=text_prep.caps_modes, default=text_prep.default_caps) 183 | parser.add_argument('--skip_bwtsort', action='store_true') 184 | parser.add_argument('--mtf', type=lambda x: None if x.lower() == 'none' else int(x), choices=bwt_mtf.mtf_variants, 185 | default=bwt_mtf.default_mtf) 186 | parser.add_argument('--bitdepth', type=int, choices=deflate.allowed_bitdepths, default=deflate.default_bitdepth, help='Warning: 8-bit and 24-bit do not work on Safari') 187 | parser.add_argument('--ect', action='store_true') 188 | parser.add_argument('--bin2txt', type=str.lower, choices=bin2txt_encodings, default=default_bin2txt) 189 | parser.add_argument('--element_id', nargs='?', const='', default='', help='Warning: must be a valid JS variable name, and watch out for collisions with HTML namespace') 190 | parser.add_argument('--raw', action='store_true', help='Use document.write() to overwrite the document with the raw text. 
May also be implied from input_filename extension') 191 | parser.add_argument('--image', action='store_true', help='May also be implied from input_filename extension') 192 | parser.add_argument('--js', action='store_true', help='May also be implied from output_filename extension') 193 | parser.add_argument('--skip_uglify', action='store_true') 194 | parser.add_argument('--skip_replace_quoted', action='store_true') 195 | parser.add_argument('--lang', nargs='?', const='', default='') 196 | parser.add_argument('--mobile', action='store_true') 197 | parser.add_argument('--title', nargs='?', const='', default='') 198 | parser.add_argument('--text_var', default=default_vars.text) 199 | parser.add_argument('--validate', action='store_true') 200 | parser.add_argument('--ignore_regex', nargs='?', const='', default='') 201 | parser.add_argument('--browser', type=str.lower, choices=list(validation.drivers), default=validation.default_browser) 202 | parser.add_argument('--timeout', type=int, default=validation.default_timeout, help='seconds') 203 | parser.add_argument('--verbose', action='store_true') 204 | args = parser.parse_args(args=None if sys.argv[1:] else ['--help']) 205 | ext = os.path.splitext(args.input_filename)[-1][1:].lower() 206 | if ext in webify.raw_extensions: 207 | args.raw = True 208 | elif ext in webify.image_extensions: 209 | args.image = True 210 | with open(args.input_filename, 'rb') as f: 211 | data = f.read() 212 | if not args.image: 213 | if args.input_encoding: 214 | data = data.decode(args.input_encoding) 215 | else: 216 | encoding = chardet.detect(data)['encoding'] or 'utf8' 217 | try: 218 | data = data.decode(encoding) 219 | except UnicodeDecodeError: 220 | if encoding.replace('-', '') == 'utf8': 221 | raise 222 | out = ztml(data, args.output_filename, args.reduce_whitespace, 223 | not args.skip_unix_newline, args.fix_punct, 224 | not args.skip_remove_bom, args.caps, not args.skip_bwtsort, 225 | args.mtf, args.bitdepth, args.ect, args.bin2txt, 226 | args.element_id, args.raw, args.image, args.js, 227 | not args.skip_uglify, not args.skip_replace_quoted, args.lang, 228 | args.mobile, args.title, args.text_var, args.validate, 229 | args.ignore_regex, args.browser, args.timeout, args.verbose) 230 | result = False 231 | if args.validate: 232 | out, result = out 233 | if not args.output_filename: 234 | sys.stdout.buffer.write(out) 235 | sys.exit(int(result)) 236 | -------------------------------------------------------------------------------- /ztml/bwt_mtf.py: -------------------------------------------------------------------------------- 1 | """Burrows-Wheeler and Move-to-front transforms 2 | 3 | Applies pre-BWT alphabet vowel sorting by default to concentrate the vowels together. 4 | BWT Implementation follows pydivsufsort tests, to obviate adding an EOF token. 5 | MTF includes original variants (50-90) inspired by Fenwick's Sticky MTF, 6 | and larger texts show benefit from higher MTF settings. 7 | Additional BWT on bits (after entropy coding and before DEFLATE) was found beneficial for large texts. 8 | 9 | Other experiments: 10 | Run-length encoding for spaces before BWT gave worse overall results. 11 | Run-length encoding after text BWT, and MTF over run characters (just this part of Neimi&Teuhola) gave worse overall results. 12 | Run-length encoding for zeros (ZLE) after MTF gave worse overall results. 
13 | 14 | References: 15 | https://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf 16 | https://github.com/louisabraham/pydivsufsort/blob/master/tests/reference.py 17 | https://www.cs.auckland.ac.nz/~peter-f/FTPfiles/2002%20VL%20coding%20BWT.pdf (Fenwick) 18 | https://www.juergen-abel.info/files/preprints/preprint_post_bwt_stages.pdf 19 | https://www.juergen-abel.info/files/preprints/preprint_universal_text_preprocessing.pdf 20 | https://home.uncg.edu/cmp/faculty/srtate/papers/bwtsort.pdf 21 | https://www.math.uni-bielefeld.de/sfb343/preprints/pr99133.ps.gz 22 | https://onlinelibrary.wiley.com/doi/full/10.1002/spe.2873 (Neimi&Teuhola) 23 | http://groups.di.unipi.it/~gulli/tutorial/burrows_wheeler.pdf (note: has errors afaict) 24 | """ 25 | 26 | 27 | from typing import Iterable, List, Optional, overload, Tuple, Union 28 | 29 | import numpy as np 30 | from pydivsufsort import divsufsort 31 | 32 | if not __package__: 33 | import default_vars, webify 34 | else: 35 | # noinspection PyPackages 36 | from . import default_vars, webify 37 | 38 | 39 | order1 = 'AOUIEVWXYZaouievwxyz' 40 | order2 = 'VWXYZAOUIEvwxyzaouie' 41 | mtf_variants = [None, 0, 1, 2, 50, 52, 60, 70, 80, 90] 42 | default_mtf = 0 43 | 44 | 45 | bwtsort_table = str.maketrans(order1, order2) 46 | reverse_bwtsort_table = str.maketrans(order2, order1) 47 | surrogate_lo = 55296 48 | surrogate_hi = 57343 49 | max_unicode = 1114111 50 | max_ord_for_mtf = max_unicode - (surrogate_hi-surrogate_lo) - 1 51 | 52 | 53 | def mtf_rank(mtf: int, rank: int, prev: int) -> int: 54 | assert mtf is not None 55 | assert mtf in mtf_variants, f'Error: mtf={mtf} not in {mtf_variants}' 56 | if mtf == 0: 57 | new_rank = 0 58 | elif mtf == 1: 59 | new_rank = rank > 1 60 | elif mtf == 2: 61 | new_rank = rank > 1 or rank == 1 and not prev 62 | elif mtf == 50: 63 | new_rank = rank // 2 64 | elif mtf == 52: 65 | new_rank = rank // 2 if rank > 1 else rank == 1 and not prev 66 | else: 67 | new_rank = int(rank*(mtf/100) + 0.5) # Round in the same way as JS (do not round half to even) 68 | return new_rank 69 | 70 | 71 | def mtf_encode(data: Iterable[int], 72 | mtf: int == default_mtf, 73 | validate=True 74 | ) -> List[int]: 75 | data = list(data) 76 | max_data = max(data, default=-1) 77 | assert max_data <= max_ord_for_mtf, (max_data, max_ord_for_mtf) 78 | ranks = list(range(max_data + 1)) 79 | out = [] 80 | prev = 1 81 | for i in data: 82 | rank = ranks.index(i) # Time-consuming op. 83 | ranks.pop(rank) 84 | ranks.insert(mtf_rank(mtf, rank, prev), i) 85 | prev = rank 86 | if rank >= surrogate_lo: 87 | rank += surrogate_hi - surrogate_lo + 1 88 | out.append(rank) 89 | if validate: 90 | decoded = mtf_decode(out, mtf) 91 | if not hasattr(data, '__getitem__'): 92 | data = type(decoded)(data) 93 | assert decoded == data, (len(decoded), len(data), decoded[:30], data[:30]) 94 | return out 95 | 96 | 97 | def mtf_decode(data: Iterable[int], mtf: int == default_mtf) -> List[int]: 98 | out = list(data) 99 | ranks = list(range(max(out, default=-1) + 1)) 100 | prev = 1 101 | for i, rank in enumerate(out): 102 | if rank > surrogate_lo: 103 | rank -= surrogate_hi - surrogate_lo + 1 104 | out[i] = ranks.pop(rank) 105 | ranks.insert(mtf_rank(mtf, rank, prev), out[i]) 106 | prev = rank 107 | return out 108 | 109 | 110 | @overload 111 | def encode(data: str, bwtsort: bool = ..., mtf: Optional[int] = ..., 112 | validate: bool = ...) -> Tuple[str, int]: ... 
113 | 114 | 115 | @overload 116 | def encode(data: Iterable[int], bwtsort: bool = ..., mtf: Optional[int] = ..., 117 | validate: bool = ...) -> Tuple[List[int], int]: ... 118 | 119 | 120 | def encode(data, bwtsort=True, mtf=default_mtf, validate=True): 121 | is_str = isinstance(data, str) 122 | if not is_str: 123 | data = list(data) 124 | out = list(data) 125 | if bwtsort: 126 | if not is_str: 127 | out = [chr(i) for i in out] 128 | out = ''.join(out).translate(bwtsort_table) 129 | if is_str or bwtsort: 130 | out = [ord(c) for c in out] 131 | sa = divsufsort(np.array(out)) if out else [] 132 | out = out[-1:] + [out[i - 1] for i in sa if i] 133 | index = list(sa).index(0) if out else 0 134 | if mtf is not None: 135 | out = mtf_encode(out, mtf, validate) # Time-consuming op. 136 | if is_str: 137 | out = ''.join(chr(i) for i in out) 138 | if validate: 139 | decoded = decode(out, index, bwtsort, mtf) 140 | if not hasattr(data, '__getitem__'): 141 | data = type(decoded)(data) 142 | assert decoded == data, (len(decoded), len(data), decoded[:30], data[:30]) 143 | return out, index 144 | 145 | 146 | @overload 147 | def decode(data: str, index: int, bwtsort: bool = ..., 148 | mtf: Optional[int] = ...) -> str: ... 149 | 150 | 151 | @overload 152 | def decode(data: Iterable[int], index: int, bwtsort: bool = ..., 153 | mtf: Optional[int] = ...) -> List[int]: ... 154 | 155 | 156 | def decode(data, index, bwtsort=True, mtf=default_mtf): 157 | is_str = isinstance(data, str) 158 | out = list(data) 159 | if mtf is not None: 160 | if is_str: 161 | out = [ord(c) for c in out] 162 | out = mtf_decode(out, mtf) 163 | if is_str: 164 | out = [chr(i) for i in out] 165 | ordered = [(c, i - (i <= index)) for i, c in enumerate(out)] 166 | ordered.sort() 167 | for i in range(len(out)): 168 | out[i], index = ordered[index] 169 | if bwtsort: 170 | if not is_str: 171 | out = [chr(i) for i in out] 172 | out = ''.join(out).translate(reverse_bwtsort_table) 173 | if not is_str: 174 | out = [ord(c) for c in out] 175 | elif is_str: 176 | out = ''.join(out) 177 | return out 178 | 179 | 180 | def get_js_decoder(data: Union[str, Iterable[int]], 181 | index: int, 182 | bwtsort: bool = True, 183 | mtf: Optional[int] = default_mtf, 184 | add_bwt_func: bool = True, 185 | bwt_func_var: str = default_vars.bwt_func, 186 | data_var: str = '' 187 | ) -> str: 188 | assert mtf in mtf_variants, f'Error: mtf={mtf} not in {mtf_variants}' 189 | is_str = isinstance(data, str) 190 | if not is_str: 191 | data = list(data) 192 | if not data_var: 193 | data_var = default_vars.text if is_str else default_vars.bitarray 194 | js_decoder = f'{data_var}=[...{data_var}].map(c=>c.codePointAt())\n' * is_str 195 | if mtf is not None: 196 | if mtf == 0: 197 | mtf_op = f'd.unshift({data_var}[j++]=d.splice(k,1)[0])' 198 | elif mtf == 1: 199 | mtf_op = f'd.splice(k>1,0,{data_var}[j++]=d.splice(k,1)[0])' 200 | elif mtf == 2: 201 | js_decoder += 'n=1\n' 202 | mtf_op = f'd.splice(k>!!n,0,{data_var}[j++]=d.splice(k,1)[0]),n=k' 203 | elif mtf == 50: 204 | mtf_op = f'd.splice(k/2,0,{data_var}[j++]=d.splice(k,1)[0])' 205 | elif mtf == 52: 206 | js_decoder += 'n=1\n' 207 | mtf_op = f'd.splice(k>1?k/2:k>n,0,{data_var}[j++]=d.splice(k,1)[0]),n=k' 208 | else: 209 | mtf_op = f"d.splice(k*{str(mtf / 100).lstrip('0')}+.5,0,{data_var}[j++]=d.splice(k,1)[0])" 210 | if is_str and any(ord(c) > surrogate_lo for c in data): 211 | mtf_op = f'k-={surrogate_hi - surrogate_lo + 1}*(k>{surrogate_lo}),{mtf_op}' 212 | # Use reduce instead of Math.max(...array) due to argument limit: 
https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Function/apply#using_apply_and_built-in_functions 213 | js_decoder += f'''d=[...Array({data_var}.reduce((a,b)=>a>b?a:b+1,0)).keys()] 214 | j=0 215 | for(k of {data_var}){mtf_op} 216 | ''' 217 | if add_bwt_func: 218 | js_decoder += f"{bwt_func_var}=(d,k)=>{{s=d.map((c,i)=>[c,i-(i<=k)]).sort((a,b)=>a[0]-b[0]);for(j in s)[d[j],k]=s[k]}}\n" # Sort on code points to respect order of char above \uffff 219 | js_decoder += f'{bwt_func_var}({data_var},{index})\n' 220 | dyn_orders = None 221 | if bwtsort: 222 | symbols = set(data) 223 | if not is_str: 224 | symbols = {chr(i) for i in symbols} 225 | dyn_orders = list(zip(*[(c1, c2) for c1, c2 in zip(order1, order2) if c1 in symbols])) 226 | if dyn_orders: 227 | dyn_order1, dyn_order2 = dyn_orders 228 | dyn_order1 = webify.escape(''.join(dyn_order1)) 229 | dyn_order2 = webify.escape(''.join(dyn_order2)) 230 | js_decoder += f'''d={{}};[...`{dyn_order2}`].map((c,i)=>d[c]=[...`{dyn_order1}`][i]) 231 | {data_var}={data_var}.map(i=>{'d[c=String.fromCodePoint(i)]||c).join``' if is_str else '(d[c=String.fromCodePoint(i)]||c).codePointAt())'} 232 | ''' 233 | if is_str and not dyn_orders: 234 | # Don't use String.fromCodePoint(...array) due to argument limit: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Function/apply#using_apply_and_built-in_functions 235 | js_decoder += f'{data_var}={data_var}.map(i=>String.fromCodePoint(i)).join``\n' 236 | return js_decoder 237 | 238 | 239 | @overload 240 | def encode_and_get_js_decoder(data: str, 241 | bwtsort: bool = ..., 242 | mtf: Optional[int] = ..., 243 | add_bwt_func: bool = ..., 244 | bwt_func_var: str = ..., 245 | data_var: str = ..., 246 | validate: bool = ... 247 | ) -> Tuple[str, str]: ... 248 | 249 | 250 | @overload 251 | def encode_and_get_js_decoder(data: Iterable[int], 252 | bwtsort: bool = ..., 253 | mtf: Optional[int] = ..., 254 | add_bwt_func: bool = ..., 255 | bwt_func_var: str = ..., 256 | data_var: str = ..., 257 | validate: bool = ... 258 | ) -> Tuple[List[int], str]: ... 
259 | 260 | 261 | def encode_and_get_js_decoder(data, 262 | bwtsort=True, 263 | mtf=default_mtf, 264 | add_bwt_func=True, 265 | bwt_func_var=default_vars.bwt_func, 266 | data_var='', 267 | validate=True 268 | ): 269 | is_str = isinstance(data, str) 270 | if not is_str: 271 | data = list(data) 272 | if not data_var: 273 | data_var = default_vars.text if is_str else default_vars.bitarray 274 | if data_var == default_vars.bitarray: 275 | bwtsort = False 276 | mtf = None 277 | encoded, index = encode(data, bwtsort, mtf, validate) 278 | return encoded, get_js_decoder(data, index, bwtsort, mtf, add_bwt_func, bwt_func_var, data_var) 279 | 280 | 281 | def test() -> None: 282 | mtf_test = [3, 2, 2, 2, 3, 2, 2, 3, 2, 2] 283 | mtf0 = mtf_encode(mtf_test[:], mtf=0, validate=True) 284 | assert mtf0 == [3, 3, 0, 0, 1, 1, 0, 1, 1, 0], mtf0 285 | mtf1 = mtf_encode(mtf_test[:], mtf=1, validate=True) 286 | assert mtf1 == [3, 3, 1, 0, 2, 0, 0, 1, 1, 0], mtf1 287 | mtf2 = mtf_encode(mtf_test[:], mtf=2, validate=True) 288 | assert mtf2 == [3, 3, 1, 0, 2, 0, 0, 1, 0, 0], mtf2 289 | 290 | symbols = ['', '\0', '\1', 'a', 'b', 'א', 'ב', '\ue000', '\uffff', '\U00010000'] 291 | for x in symbols: 292 | for y in symbols: 293 | for z in symbols: 294 | for mtf in mtf_variants: 295 | for bwtsort in [False, True]: 296 | encode(f'{x}{y}{z}', bwtsort=bwtsort, mtf=mtf, validate=True) 297 | 298 | symbols = ['', '0', '1', '97', '255'] 299 | for x in symbols: 300 | for y in symbols: 301 | for z in symbols: 302 | for mtf in mtf_variants: 303 | for bwtsort in [False, True]: 304 | encode([int(c) for c in f'{x}{y}{z}'], bwtsort=bwtsort, mtf=mtf, validate=True) 305 | 306 | 307 | if __name__ == '__main__': 308 | test() 309 | -------------------------------------------------------------------------------- /ztml/validation.py: -------------------------------------------------------------------------------- 1 | from base64 import b64decode 2 | from contextlib import ExitStack, redirect_stdout 3 | import os 4 | import sys 5 | from tempfile import NamedTemporaryFile 6 | from time import sleep, time 7 | from typing import AnyStr, Iterable, Mapping, Optional, overload, TypeVar, Union 8 | 9 | try: 10 | from typing import Literal 11 | except ImportError: 12 | from typing_extensions import Literal 13 | 14 | import regex 15 | from selenium.common.exceptions import JavascriptException, TimeoutException, WebDriverException 16 | from selenium.webdriver import Chrome, Edge, Firefox, chrome, edge, firefox 17 | from selenium.webdriver.common.by import By 18 | from selenium.webdriver.remote.webdriver import WebDriver 19 | from selenium.webdriver.support.ui import WebDriverWait 20 | from webdriver_manager.chrome import ChromeDriverManager 21 | from webdriver_manager.microsoft import EdgeChromiumDriverManager 22 | from webdriver_manager.firefox import GeckoDriverManager 23 | 24 | if not __package__: 25 | import default_vars, text_prep, webify 26 | else: 27 | # noinspection PyPackages 28 | from . 
import default_vars, text_prep, webify 29 | 30 | 31 | default_browser = 'chrome' 32 | default_timeout = 60 33 | default_by = By.TAG_NAME 34 | default_element = 'body' 35 | webdriver_paths_filename = 'webdriver_paths.txt' 36 | 37 | 38 | os.environ['WDM_LOG'] = '0' 39 | drivers = dict(chrome=[Chrome, chrome, ChromeDriverManager], 40 | edge=[Edge, edge, EdgeChromiumDriverManager], 41 | firefox=[Firefox, firefox, GeckoDriverManager] 42 | ) 43 | BrowserType = Union[str, WebDriver] 44 | critical_error_strings = ['executable needs to be', 'unable to find binary', 'unexpectedly'] 45 | 46 | 47 | FilenameOrBytes = TypeVar('FilenameOrBytes', str, bytes) 48 | 49 | 50 | def full_path(filename: str) -> str: 51 | return f"file:///{os.path.realpath(filename).replace(os.sep, '/')}" 52 | 53 | 54 | def get_browser(browser: BrowserType, 55 | stack: Optional[ExitStack] = None 56 | ) -> WebDriver: 57 | if isinstance(browser, WebDriver): 58 | return browser 59 | options = drivers[browser][1].options.Options() 60 | options.headless = True 61 | options.add_argument('--no-sandbox') 62 | if hasattr(options, 'add_experimental_option'): 63 | options.add_experimental_option('excludeSwitches', ['enable-logging']) 64 | try: 65 | with redirect_stdout(None): 66 | service = drivers[browser][2]().install() 67 | folder = os.path.dirname(webdriver_paths_filename) 68 | if folder: 69 | os.makedirs(folder, exist_ok=True) 70 | with open(webdriver_paths_filename, 'a', encoding='utf8') as f: 71 | f.write(f'{browser},{service}\n') 72 | except Exception: 73 | with open(webdriver_paths_filename, encoding='utf8') as f: 74 | for line in reversed(f.read().splitlines()): 75 | b, service = line.split(',', 1) 76 | if b == browser: 77 | break 78 | while isinstance(browser, str): 79 | try: 80 | browser = drivers[browser][0](service=drivers[browser][1].service.Service(service, log_path=os.devnull), options=options) 81 | except WebDriverException as e: 82 | if any(s in e.msg for s in critical_error_strings): 83 | raise 84 | print(e, file=sys.stderr) 85 | sleep(30) 86 | if stack: 87 | browser = stack.enter_context(browser) 88 | return browser 89 | 90 | 91 | @overload 92 | def render_html(file: FilenameOrBytes, by: str = ..., element: str = ..., 93 | raw: bool = ..., image: Literal[True] = ..., 94 | browser: str = ..., timeout: int = ..., content_var: str = ... 95 | ) -> Optional[bytes]: ... 96 | 97 | 98 | @overload 99 | def render_html(file: FilenameOrBytes, by: str = ..., element: str = ..., 100 | raw: bool = ..., image: Literal[False] = ..., 101 | browser: str = ..., timeout: int = ..., content_var: str = ... 102 | ) -> Optional[str]: ... 103 | 104 | 105 | @overload 106 | def render_html(file: FilenameOrBytes, by: str = ..., element: str = ..., 107 | raw: bool = ..., image: bool = ..., 108 | browser: str = ..., timeout: int = ..., content_var: str = ... 109 | ) -> Optional[AnyStr]: ... 
110 | 111 | 112 | def render_html(file, 113 | by=default_by, 114 | element=default_element, 115 | raw=False, 116 | image=False, 117 | browser=default_browser, 118 | timeout=default_timeout, 119 | content_var='' 120 | ): 121 | assert not raw or not image 122 | if not by: 123 | by = default_by 124 | if not element: 125 | element = default_element 126 | with ExitStack() as stack: 127 | browser = get_browser(browser, stack) 128 | if isinstance(file, str): 129 | filename = file 130 | else: 131 | with NamedTemporaryFile(suffix='.html', delete=False) as f: # See https://github.com/python/cpython/issues/88221 132 | f.write(file) 133 | filename = f.name 134 | browser.get(full_path(filename)) 135 | if isinstance(file, bytes): 136 | try: 137 | os.remove(filename) 138 | except PermissionError: 139 | pass 140 | try: 141 | wait = WebDriverWait(browser, timeout) 142 | if image: 143 | if by == By.TAG_NAME and element == 'body': 144 | data_url = wait.until(lambda x: 145 | regex.sub('^none$', '', 146 | x.find_element(by, element) 147 | .value_of_css_property('background-image'))) 148 | else: 149 | data_url = wait.until(lambda x: 150 | x.find_element(by, element) 151 | .get_property('src')) 152 | assert isinstance(data_url, str), type(data_url) 153 | if ';base64,' in data_url: 154 | return b64decode(data_url.split(';base64,', 1)[1].split('"', 1)[0], validate=True) 155 | image_data = browser.execute_script(f'return {content_var or default_vars.bytearray}') 156 | if isinstance(image_data, dict): # Needed for or Firefox, see: https://github.com/SeleniumHQ/selenium/issues/11070 157 | image_data = [v for k, v in sorted(image_data.items(), key=lambda x: int(x[0]))] 158 | return bytes(image_data) 159 | if raw: 160 | sleep(0.1) 161 | get_text = lambda x: x.execute_script(f'return {content_var or default_vars.text}') 162 | else: 163 | get_text = lambda x: x.find_element(by, element).get_property('innerText') 164 | try: 165 | text = wait.until(get_text) 166 | except JavascriptException: 167 | sleep(1) 168 | text = wait.until(get_text) 169 | assert isinstance(text, str), type(text) 170 | return text 171 | except TimeoutException: 172 | return None 173 | except Exception: 174 | print(f'\nError: {browser.name} failed on {full_path(filename)}', file=sys.stderr) 175 | raise 176 | 177 | 178 | def find_first_diff(rendered: AnyStr, data: AnyStr, verbose: bool = True) -> int: 179 | i = -1 180 | for i, (r, t) in enumerate(zip(rendered, data)): 181 | if r != t: 182 | break 183 | else: 184 | i += 1 185 | if verbose: 186 | print(f'\nFirst difference found at {i} / {len(rendered)}', file=sys.stderr) 187 | print(f'Original: {data[max(i - 30, 0) : i]!r} -> {data[i : i + 50]!r}', file=sys.stderr) 188 | print(f'Rendered: {rendered[max(i - 30, 0) : i]!r} -> {rendered[i : i + 50]!r}\n', file=sys.stderr) 189 | return i 190 | 191 | 192 | def validate_html(file: FilenameOrBytes, # Don't use AnyStr as it does not have to be the same type as data 193 | data: AnyStr, 194 | caps: str = text_prep.default_caps, 195 | by: str = default_by, 196 | element: str = default_element, 197 | raw: bool = False, 198 | browser: BrowserType = default_browser, 199 | timeout: int = default_timeout, 200 | unicode_A: int = 0, 201 | ignore_regex: str = '', 202 | content_var: str = '', 203 | verbose: bool = True 204 | ) -> Optional[bool]: 205 | image = isinstance(data, bytes) 206 | assert data, 'Error: Cannot validate against empty data' 207 | rendered = render_html(file, by, element, raw, image, browser, timeout, content_var) 208 | if rendered is None: 209 | return 
210 |     if not image:
211 |         if caps == 'lower':
212 |             data = data.lower()
213 |         elif caps == 'upper':
214 |             data = data.upper()
215 |         elif caps == 'simple':
216 |             data = text_prep.decode_caps_simple(data.lower())
217 |         if not raw:
218 |             if unicode_A:
219 |                 rendered = regex.sub(r'[^\p{Z}\p{C}]', lambda m: chr(ord(m[0]) - unicode_A + 65 + (6 if ord(m[0]) - unicode_A + 65 > 90 else 0)), rendered)
220 |             rendered = regex.sub(ignore_regex, '', rendered)
221 |     if rendered == data:
222 |         return True
223 |     if verbose:
224 |         find_first_diff(rendered, data)
225 |     return False
226 | 
227 | 
228 | def validate_files(filenames: Mapping[str, str],
229 |                    data: Optional[AnyStr] = None,
230 |                    reduce_whitespace: bool = False,
231 |                    unix_newline: bool = True,
232 |                    fix_punct: bool = False,
233 |                    remove_bom: bool = True,
234 |                    caps: str = text_prep.default_caps,
235 |                    by: str = default_by,
236 |                    element: str = default_element,
237 |                    raw: bool = False,
238 |                    image: bool = False,
239 |                    browsers: Optional[Union[BrowserType, Iterable[BrowserType]]] = None,
240 |                    timeout: int = default_timeout,
241 |                    unicode_A: int = 0,
242 |                    ignore_regex: str = '',
243 |                    content_var: str = '',
244 |                    validate: bool = True,
245 |                    verbose: bool = True
246 |                    ) -> bool:
247 |     error = False
248 |     if browsers is None:
249 |         browsers = list(drivers)
250 |     elif isinstance(browsers, (str, WebDriver)):
251 |         browsers = [browsers]
252 |     with ExitStack() as stack:
253 |         if validate:
254 |             browsers = [get_browser(browser, stack) for browser in browsers]
255 |         raw_size = None
256 |         no_overhead_size = None
257 |         for label, filename in sorted(filenames.items(), key=lambda x: (x[0] != 'raw', x[0] != 'base64_html')):
258 |             ext = os.path.splitext(filename)[-1][1:].lower()
259 |             if raw_size is not None and ext != 'html':
260 |                 continue
261 |             if data is None or label == 'raw':
262 |                 if ext in webify.raw_extensions:
263 |                     raw = True
264 |                 elif ext in webify.image_extensions:
265 |                     image = True
266 |             assert not image or (not raw and not isinstance(data, str))
267 |             if data is None:
268 |                 with open(filename, 'rb') as f:
269 |                     data = f.read()
270 |             if raw_size is None:
271 |                 raw_size = len(data.encode() if isinstance(data, str) else data)
272 |             if not image and isinstance(data, bytes):
273 |                 data = text_prep.normalize(data.decode(), reduce_whitespace, unix_newline, fix_punct, remove_bom)  # Assumes raw text file is utf8. Otherwise, pass it as a data argument
274 | 
275 |             if verbose:
276 |                 size = os.path.getsize(filename)
277 |                 if label == 'base64_html':
278 |                     no_overhead_size = size * 3 / 4
279 |                 stats = []
280 |                 if raw_size:
281 |                     stats.append(f'ratio={round(size / raw_size * 100, 1)}%')
282 |                 if no_overhead_size:
283 |                     stats.append(f'overhead={round((size / no_overhead_size - 1) * 100, 1)}%')
284 |                 if ext == 'html' and label not in ['raw', 'base64_html']:
285 |                     with open(filename, 'rb') as f:
286 |                         html = f.read()
287 |                     matches = regex.findall(webify.literals_regex.encode(), html)
288 |                     payload = max(matches, key=len, default=b'').split(b'`', 1)[1].rsplit(b'`', 1)[0]  # The longest backtick literal is the embedded payload
289 |                     html = html.replace(payload, b'')  # Strip it to report the size of the code alone
290 |                     stats.append(f'code: {len(html):,} B = {round(len(html) / 1024, 1):,} kB')
291 |                 stats = ' '.join(stats)
292 |                 if stats:
293 |                     stats = f' ({stats})'
294 |                 mb = size / 1024 ** 2
295 |                 if mb >= 0.1:
296 |                     stats = f' = {round(mb, 1):,} MB{stats}'
297 |                 kb = size / 1024
298 |                 if kb >= 0.1:
299 |                     stats = f' = {round(kb, 1):,} kB{stats}'
300 |                 print(f"{full_path(filename)} {size:,} B{stats}", end='' if validate and ext == 'html' and label != 'raw' else None, file=sys.stderr)
301 | 
302 |             if validate and ext == 'html' and label != 'raw':
303 |                 for i, browser in enumerate(browsers):
304 |                     start_time = time()
305 |                     valid = validate_html(filename, data, caps, by, element,
306 |                                           raw, browser, timeout, unicode_A,
307 |                                           ignore_regex, content_var, verbose)
308 |                     assert valid is not False, filename
309 |                     if not valid:
310 |                         error = True
311 |                     if verbose:
312 |                         if i == 0:
313 |                             print(' rendering secs:', end='', file=sys.stderr)
314 |                         print(f' {browser.name}=' + (f'{time() - start_time :.1f}' if valid else f'{timeout}(TIMEOUT)'), end='', file=sys.stderr)
315 |                 if verbose:
316 |                     print(file=sys.stderr)
317 |     if verbose and validate:
318 |         print('Note: above rendering times from Selenium are much longer than actual browser rendering times.', file=sys.stderr)
319 |     return error
320 | 
--------------------------------------------------------------------------------
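Usage note (editorial addition, not a repository file): a minimal sketch of driving the validator above directly. It assumes a Selenium-supported browser and driver are installed, that the script runs from a sibling folder so the package root is one level up (mirroring the repo's example scripts), and that book.txt / book.html are hypothetical input and output files.

import sys

sys.path.append('..')
from ztml import validation

with open('book.txt', encoding='utf8') as f:  # hypothetical source text
    expected = f.read()

# validate_html() renders book.html in a browser via Selenium and compares the
# recovered text with `expected`; it returns True on a match, False on a mismatch
# (printing the first difference to stderr), or None if rendering timed out.
result = validation.validate_html('book.html', expected)  # hypothetical ZTML output file
print({True: 'match', False: 'mismatch', None: 'timeout'}[result])

For checking a whole set of output files across all installed browsers, validate_files() wraps the same logic and also prints per-file size statistics.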