├── sample-files ├── everyByte ├── lena_std.tif ├── hatetris-wr.bin ├── everyPairOfBytes ├── hatetris-wr-rle.bin └── hatetris-wr-rle2.bin ├── .editorconfig ├── .github ├── dependabot.yml └── workflows │ └── workflow-1.yml ├── .gitattributes ├── package.json ├── .gitignore ├── LICENSE.txt ├── src └── index.js └── README.md /sample-files/everyByte: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qntm/base131072/HEAD/sample-files/everyByte -------------------------------------------------------------------------------- /sample-files/lena_std.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qntm/base131072/HEAD/sample-files/lena_std.tif -------------------------------------------------------------------------------- /sample-files/hatetris-wr.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qntm/base131072/HEAD/sample-files/hatetris-wr.bin -------------------------------------------------------------------------------- /sample-files/everyPairOfBytes: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qntm/base131072/HEAD/sample-files/everyPairOfBytes -------------------------------------------------------------------------------- /sample-files/hatetris-wr-rle.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qntm/base131072/HEAD/sample-files/hatetris-wr-rle.bin -------------------------------------------------------------------------------- /sample-files/hatetris-wr-rle2.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qntm/base131072/HEAD/sample-files/hatetris-wr-rle2.bin -------------------------------------------------------------------------------- 
/.editorconfig: -------------------------------------------------------------------------------- 1 | [*] 2 | insert_final_newline = true 3 | 4 | [{*.js,*.json,*.ts,*.tsx,*.md,*.jsx}] 5 | charset = utf-8 6 | indent_style = space 7 | indent_size = 2 8 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: 'npm' 4 | directory: '/' 5 | schedule: 6 | interval: 'monthly' 7 | ignore: 8 | - dependency-name: '*' 9 | update-types: 10 | - 'version-update:semver-minor' 11 | - 'version-update:semver-patch' 12 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "base131072", 3 | "version": "1.0.0", 4 | "description": "Binary-to-text encoding highly optimised for UTF-32", 5 | "homepage": "https://github.com/qntm/base131072", 6 | "repository": { 7 | "type": "git", 8 | "url": "git://github.com/qntm/base131072.git" 9 | }, 10 | "type": "module", 11 | "main": "src/index.js", 12 | "keywords": [], 13 | "scripts": { 14 | "test": "standard" 15 | }, 16 | "dependencies": {}, 17 | "author": "qntm", 18 | "license": "MIT", 19 | "devDependencies": { 20 | 
"standard": "^17.0.0" 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /.github/workflows/workflow-1.yml: -------------------------------------------------------------------------------- 1 | name: 'Travis CI replacement' 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - '**' 7 | 8 | jobs: 9 | build-job: 10 | runs-on: 'ubuntu-latest' 11 | 12 | strategy: 13 | matrix: 14 | node-version: ['20.x', '22.x', '24.x'] 15 | 16 | steps: 17 | - uses: 'actions/checkout@v2' 18 | 19 | - name: 'Use Node.js ${{ matrix.node-version }}' 20 | uses: 'actions/setup-node@v1' 21 | with: 22 | node-version: '${{ matrix.node-version }}' 23 | 24 | - name: 'Actual npm tasks' 25 | run: | 26 | npm install 27 | npm run test 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear in the root of a volume 35 | .DocumentRevisions-V100 36 | .fseventsd 37 | .Spotlight-V100 38 | .TemporaryItems 39 | .Trashes 40 | .VolumeIcon.icns 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | 49 | node_modules 50 | -------------------------------------------------------------------------------- /LICENSE.txt: 
/*
 * MIT License
 *
 * Copyright (c) 2021 qntm
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/**
 * Base131072 is a binary-to-text encoding optimised for UTF-32 and Twitter.
 *
 * Bytes are read as a bit stream, most significant bit first, and cut into
 * 17-bit values. Each value ("Z" below) maps to one character. Z is usually a
 * uint17, but the final character of an encoding may instead carry a uint9 or
 * a uint1, each drawn from its own character repertoire.
 */

const BITS_PER_CHAR = 17 // This is a 17-bit encoding
const BITS_PER_BYTE = 8

// Compressed representation of inclusive-exclusive ranges of characters used
// in this encoding: each string is a sequence of (first, last) code point
// pairs. Index 0 holds the 17-bit repertoire, index 1 the 9-bit repertoire and
// index 2 the 1-bit repertoire.
// TODO: every pair below is `..`, i.e. an empty range, so the tables built
// from these strings contain no characters yet.
const pairStrings = [
  '..................................',
  '.............',
  '..'
]

/**
 * Expands compressed range strings into the encode and decode tables.
 *
 * @param {string[]} rangeStrings Range strings; the string at index `r`
 *   supplies the characters for values of `BITS_PER_CHAR - BITS_PER_BYTE * r`
 *   bits.
 * @returns {{lookupE: Map<number, Map<number, string>>, lookupD: Map<string, [number, number]>}}
 *   `lookupE` maps bit width -> value -> character; `lookupD` maps
 *   character -> [bit width, value].
 */
const buildLookups = rangeStrings => {
  const lookupE = new Map()
  const lookupD = new Map()

  rangeStrings.forEach((rangeString, r) => {
    const numZBits = BITS_PER_CHAR - BITS_PER_BYTE * r // 0 -> 17, 1 -> 9, 2 -> 1
    const charsByZ = new Map()
    lookupE.set(numZBits, charsByZ)

    // `match` returns null (not an empty array) when nothing matches, which
    // happens for any string shorter than two code points.
    const pairs = rangeString.match(/../gu) || []

    let z = 0
    for (const pair of pairs) {
      const [first, last] = [...pair].map(x => x.codePointAt(0))
      // `last` is exclusive
      for (let codePoint = first; codePoint < last; codePoint++) {
        const chr = String.fromCodePoint(codePoint)
        charsByZ.set(z, chr)
        lookupD.set(chr, [numZBits, z])
        z++
      }
    }
  })

  return { lookupE, lookupD }
}

// Decompression
const { lookupE, lookupD } = buildLookups(pairStrings)

/**
 * Looks up the character representing `z` in the `numZBits`-bit repertoire.
 *
 * @param {number} numZBits Bit width of `z`; must be a key of `lookupE`.
 * @param {number} z Value to encode.
 * @returns {string} The character assigned to `z`.
 * @throws {Error} If the repertoire has no character for `z`. Without this
 *   check the caller would concatenate the text "undefined" into its output.
 */
const charFor = (numZBits, z) => {
  const chr = lookupE.get(numZBits).get(z)
  if (chr === undefined) {
    throw new Error(`No Base131072 character available for ${numZBits}-bit value ${z}`)
  }
  return chr
}

/**
 * Encodes bytes as Base131072 text.
 *
 * @param {Uint8Array} uint8Array Bytes to encode.
 * @returns {string} The encoded text; the empty string for empty input.
 * @throws {Error} If the character tables lack a character for a value that
 *   needs to be emitted.
 */
const encode = uint8Array => {
  const length = uint8Array.length

  let str = ''
  let z = 0
  let numZBits = 0

  for (let i = 0; i < length; i++) {
    const uint8 = uint8Array[i]

    // Take most significant bit first
    for (let j = BITS_PER_BYTE - 1; j >= 0; j--) {
      const bit = (uint8 >> j) & 1

      z = (z << 1) + bit
      numZBits++

      if (numZBits === BITS_PER_CHAR) {
        str += charFor(numZBits, z)
        z = 0
        numZBits = 0
      }
    }
  }

  if (numZBits !== 0) {
    // Final bits require special treatment: pad with 1s until the bit count
    // matches one of the available repertoires (1, 9 or 17 bits).
    while (!lookupE.has(numZBits)) {
      z = (z << 1) + 1
      numZBits++
    }

    str += charFor(numZBits, z)
  }

  return str
}

/**
 * Decodes Base131072 text back into bytes.
 *
 * @param {string} str Text produced by `encode`.
 * @returns {Uint8Array} The decoded bytes, as a view over the start of a
 *   buffer that may be larger than the view.
 * @throws {Error} If `str` contains a character outside the encoding, if any
 *   character follows one of fewer than 17 bits, or if the trailing padding
 *   bits are not all 1s.
 */
const decode = str => {
  const length = str.length

  // This length is an upper bound, not an exact size: `length` counts UTF-16
  // code units, every character occupies at least one of them, and no
  // character carries more than BITS_PER_CHAR bits. We count the bytes
  // actually written and return only those.
  const uint8Array = new Uint8Array(Math.floor(length * BITS_PER_CHAR / BITS_PER_BYTE))
  let numUint8s = 0
  let uint8 = 0
  let numUint8Bits = 0
  let shouldBeNoMoreChars = false

  // Iterate by code point, not by UTF-16 code unit
  for (const chr of str) {
    if (shouldBeNoMoreChars) {
      throw new Error('Secondary character found before end of input')
    }

    const entry = lookupD.get(chr)
    if (entry === undefined) {
      throw new Error(`Unrecognised Base131072 character: ${chr}`)
    }

    const [numZBits, z] = entry

    // Take most significant bit first
    for (let j = numZBits - 1; j >= 0; j--) {
      const bit = (z >> j) & 1

      uint8 = (uint8 << 1) + bit
      numUint8Bits++

      if (numUint8Bits === BITS_PER_BYTE) {
        uint8Array[numUint8s] = uint8
        numUint8s++
        uint8 = 0
        numUint8Bits = 0
      }
    }

    // A character of fewer than 17 bits is only valid as the final character
    if (numZBits !== BITS_PER_CHAR) {
      shouldBeNoMoreChars = true
    }
  }

  // Final padding bits! Requires special consideration!
  // The encoder always pads with 1s, so the leftover partial byte must consist
  // entirely of 1s.
  // Note: there could be 0 such bits, check still works though
  if (uint8 !== ((1 << numUint8Bits) - 1)) {
    throw new Error('Padding mismatch')
  }

  return new Uint8Array(uint8Array.buffer, 0, numUint8s)
}

export { encode, decode }

/*
 * README.md
 *
 * # base131072
 *
 * Base131072 is a binary encoding optimised for UTF-32-encoded text and
 * Twitter; it is the intended successor to
 * [Base65536](https://github.com/ferno/base65536). This JavaScript module,
 * `base131072`, is an implementation of this encoding... however, it can't be
 * used yet because there aren't enough safe Unicode characters.
 *
 * Efficiency ratings are averaged over long inputs. Higher is better.
 */
| Encoding | | Efficiency | | | Bytes per Tweet * |
|---|---|---|---|---|---|
| | | UTF‑8 | UTF‑16 | UTF‑32 | |
| ASCII‑constrained | Unary / Base1 | 0% | 0% | 0% | 1 |
| | Binary | 13% | 6% | 3% | 35 |
| | Hexadecimal | 50% | 25% | 13% | 140 |
| | Base64 | 75% | 38% | 19% | 210 |
| | Base85 † | 80% | 40% | 20% | 224 |
| BMP‑constrained | HexagramEncode | 25% | 38% | 19% | 105 |
| | BrailleEncode | 33% | 50% | 25% | 140 |
| | Base2048 | 56% | 69% | 34% | 385 |
| | Base32768 | 63% | 94% | 47% | 263 |
| Full Unicode | Ecoji | 31% | 31% | 31% | 175 |
| | Base65536 | 56% | 64% | 50% | 280 |
| | Base131072 ‡ | 53%+ | 53%+ | 53% | 297 |