├── sample-files ├── everyByte ├── lena_std.tif ├── hatetris-wr.bin ├── everyPairOfBytes ├── hatetris-wr-rle.bin └── hatetris-wr-rle2.bin ├── .editorconfig ├── .github ├── dependabot.yml └── workflows │ └── workflow-1.yml ├── .gitattributes ├── package.json ├── .gitignore ├── LICENSE.txt ├── src └── index.js └── README.md /sample-files/everyByte: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qntm/base131072/HEAD/sample-files/everyByte -------------------------------------------------------------------------------- /sample-files/lena_std.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qntm/base131072/HEAD/sample-files/lena_std.tif -------------------------------------------------------------------------------- /sample-files/hatetris-wr.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qntm/base131072/HEAD/sample-files/hatetris-wr.bin -------------------------------------------------------------------------------- /sample-files/everyPairOfBytes: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qntm/base131072/HEAD/sample-files/everyPairOfBytes -------------------------------------------------------------------------------- /sample-files/hatetris-wr-rle.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qntm/base131072/HEAD/sample-files/hatetris-wr-rle.bin -------------------------------------------------------------------------------- /sample-files/hatetris-wr-rle2.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qntm/base131072/HEAD/sample-files/hatetris-wr-rle2.bin -------------------------------------------------------------------------------- 
/.editorconfig: -------------------------------------------------------------------------------- 1 | [*] 2 | insert_final_newline = true 3 | 4 | [{*.js,*.json,*.ts,*.tsx,*.md,*.jsx}] 5 | charset = utf-8 6 | indent_style = space 7 | indent_size = 2 8 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: 'npm' 4 | directory: '/' 5 | schedule: 6 | interval: 'monthly' 7 | ignore: 8 | - dependency-name: '*' 9 | update-types: 10 | - 'version-update:semver-minor' 11 | - 'version-update:semver-patch' 12 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "base131072", 3 | "version": "1.0.0", 4 | "description": "Binary-to-text encoding highly optimised for UTF-32", 5 | "homepage": "https://github.com/qntm/base131072", 6 | "repository": { 7 | "type": "git", 8 | "url": "git://github.com/qntm/base131072.git" 9 | }, 10 | "type": "module", 11 | "main": "src/index.js", 12 | "keywords": [], 13 | "scripts": { 14 | "test": "standard" 15 | }, 16 | "dependencies": {}, 17 | "author": "qntm", 18 | "license": "MIT", 19 | "devDependencies": { 20 | 
"standard": "^17.0.0" 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /.github/workflows/workflow-1.yml: -------------------------------------------------------------------------------- 1 | name: 'Travis CI replacement' 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - '**' 7 | 8 | jobs: 9 | build-job: 10 | runs-on: 'ubuntu-latest' 11 | 12 | strategy: 13 | matrix: 14 | node-version: ['20.x', '22.x', '24.x'] 15 | 16 | steps: 17 | - uses: 'actions/checkout@v2' 18 | 19 | - name: 'Use Node.js ${{ matrix.node-version }}' 20 | uses: 'actions/setup-node@v1' 21 | with: 22 | node-version: '${{ matrix.node-version }}' 23 | 24 | - name: 'Actual npm tasks' 25 | run: | 26 | npm install 27 | npm run test 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear in the root of a volume 35 | .DocumentRevisions-V100 36 | .fseventsd 37 | .Spotlight-V100 38 | .TemporaryItems 39 | .Trashes 40 | .VolumeIcon.icns 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | 49 | node_modules 50 | -------------------------------------------------------------------------------- /LICENSE.txt: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 qntm 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/index.js: -------------------------------------------------------------------------------- 1 | /** Base131072 is a binary-to-text encoding optimised for UTF-32 and Twitter. */ 2 | 3 | // Z is a number, usually a uint17 but sometimes a uint9 or a uint1 4 | 5 | const BITS_PER_CHAR = 17 // This is a 17-bit encoding 6 | const BITS_PER_BYTE = 8 7 | 8 | // Compressed representation of inclusive-exclusive ranges of characters used 9 | // in this encoding. TODO 10 | const pairStrings = [ 11 | '..................................', 12 | '.............', 13 | '..' 
14 | ] 15 | 16 | // Decompression 17 | const lookupE = {} 18 | const lookupD = {} 19 | pairStrings.forEach((pairString, r) => { 20 | const numZBits = BITS_PER_CHAR - BITS_PER_BYTE * r // 0 -> 17, 1 -> 9, 2 -> 1 21 | lookupE[numZBits] = {} 22 | let z = 0 23 | pairString.match(/../gu).forEach(pair => { 24 | const [first, last] = [...pair].map(x => x.codePointAt(0)) 25 | for (let codePoint = first; codePoint < last; codePoint++) { 26 | const chr = String.fromCodePoint(codePoint) 27 | lookupE[numZBits][z] = chr 28 | lookupD[chr] = [numZBits, z] 29 | z++ 30 | } 31 | }) 32 | }) 33 | 34 | const encode = uint8Array => { 35 | const length = uint8Array.length 36 | 37 | let str = '' 38 | let z = 0 39 | let numZBits = 0 40 | 41 | for (let i = 0; i < length; i++) { 42 | const uint8 = uint8Array[i] 43 | 44 | // Take most significant bit first 45 | for (let j = BITS_PER_BYTE - 1; j >= 0; j--) { 46 | const bit = (uint8 >> j) & 1 47 | 48 | z = (z << 1) + bit 49 | numZBits++ 50 | 51 | if (numZBits === BITS_PER_CHAR) { 52 | str += lookupE[numZBits][z] 53 | z = 0 54 | numZBits = 0 55 | } 56 | } 57 | } 58 | 59 | if (numZBits !== 0) { 60 | // Final bits require special treatment. 61 | while (!(numZBits in lookupE)) { 62 | z = (z << 1) + 1 63 | numZBits++ 64 | } 65 | 66 | str += lookupE[numZBits][z] 67 | } 68 | 69 | return str 70 | } 71 | 72 | const decode = str => { 73 | const length = str.length 74 | 75 | // This length is a guess. There's a chance we allocate one more byte here 76 | // than we actually need. 
But we can count and slice it off later 77 | const uint8Array = new Uint8Array(Math.floor(length * BITS_PER_CHAR / BITS_PER_BYTE)) 78 | let numUint8s = 0 79 | let uint8 = 0 80 | let numUint8Bits = 0 81 | let shouldBeNoMoreChars = false 82 | 83 | for (const chr of str) { 84 | if (shouldBeNoMoreChars) { 85 | throw new Error('Secondary character found before end of input') 86 | } 87 | 88 | if (!(chr in lookupD)) { 89 | throw new Error(`Unrecognised Base131072 character: ${chr}`) 90 | } 91 | 92 | const [numZBits, z] = lookupD[chr] 93 | 94 | // Take most significant bit first 95 | for (let j = numZBits - 1; j >= 0; j--) { 96 | const bit = (z >> j) & 1 97 | 98 | uint8 = (uint8 << 1) + bit 99 | numUint8Bits++ 100 | 101 | if (numUint8Bits === BITS_PER_BYTE) { 102 | uint8Array[numUint8s] = uint8 103 | numUint8s++ 104 | uint8 = 0 105 | numUint8Bits = 0 106 | } 107 | } 108 | 109 | if (numZBits !== BITS_PER_CHAR) { 110 | shouldBeNoMoreChars = true 111 | } 112 | } 113 | 114 | // Final padding bits! Requires special consideration! 115 | // Remember how we always pad with 1s? 116 | // Note: there could be 0 such bits, check still works though 117 | if (uint8 !== ((1 << numUint8Bits) - 1)) { 118 | throw new Error('Padding mismatch') 119 | } 120 | 121 | return new Uint8Array(uint8Array.buffer, 0, numUint8s) 122 | } 123 | 124 | export { encode, decode } 125 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # base131072 2 | 3 | Base131072 is a binary encoding optimised for UTF-32-encoded text and Twitter; it is the intended successor to [Base65536](https://github.com/ferno/base65536). This JavaScript module, `base131072`, is an implementation of this encoding... however, it can't be used yet because there aren't enough safe Unicode characters. 4 | 5 | Efficiency ratings are averaged over long inputs. Higher is better. 
6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 |
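<table>
  <thead>
    <tr>
      <th colspan="2" rowspan="2">Encoding</th>
      <th colspan="3">Efficiency</th>
      <th rowspan="2">Bytes per Tweet *</th>
    </tr>
    <tr>
      <th>UTF‑8</th>
      <th>UTF‑16</th>
      <th>UTF‑32</th>
    </tr>
  </thead>
  <tbody>
    <tr><td rowspan="5">ASCII‑constrained</td><td>Unary / Base1</td><td>0%</td><td>0%</td><td>0%</td><td>1</td></tr>
    <tr><td>Binary</td><td>13%</td><td>6%</td><td>3%</td><td>35</td></tr>
    <tr><td>Hexadecimal</td><td>50%</td><td>25%</td><td>13%</td><td>140</td></tr>
    <tr><td>Base64</td><td>75%</td><td>38%</td><td>19%</td><td>210</td></tr>
    <tr><td>Base85 †</td><td>80%</td><td>40%</td><td>20%</td><td>224</td></tr>
    <tr><td rowspan="4">BMP‑constrained</td><td>HexagramEncode</td><td>25%</td><td>38%</td><td>19%</td><td>105</td></tr>
    <tr><td>BrailleEncode</td><td>33%</td><td>50%</td><td>25%</td><td>140</td></tr>
    <tr><td>Base2048</td><td>56%</td><td>69%</td><td>34%</td><td>385</td></tr>
    <tr><td>Base32768</td><td>63%</td><td>94%</td><td>47%</td><td>263</td></tr>
    <tr><td rowspan="3">Full Unicode</td><td>Ecoji</td><td>31%</td><td>31%</td><td>31%</td><td>175</td></tr>
    <tr><td>Base65536</td><td>56%</td><td>64%</td><td>50%</td><td>280</td></tr>
    <tr><td>Base131072 ‡</td><td>53%+</td><td>53%+</td><td>53%</td><td>297</td></tr>
  </tbody>
</table>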
110 | 111 | \* New-style "long" Tweets, up to 280 Unicode characters give or take Twitter's complex "weighting" calculation.
112 | † Base85 is listed for completeness but all variants use characters which are considered hazardous for general use in text: escape characters, brackets, punctuation *etc.*
113 | ‡ Base131072 is a work in progress, not yet ready for general use.
114 | 115 | For example, using Base64, up to 210 bytes of binary data can fit in a Tweet. With Base131072, 297 bytes are possible. 116 | 117 | ## How does it work? 118 | 119 | Base131072 is a 17-bit encoding. We take the input binary data as a sequence of 8-bit numbers, compact it into a sequence of bits, then dice the bits up again to make a sequence of 17-bit numbers. We then encode each of these 2<sup>17</sup> = 131,072 possible numbers as a different Unicode code point. 120 | 121 | ### Padding 122 | 123 | Note that the final 17-bit number in the sequence is likely to be "incomplete", i.e. missing some of its bits. We need to signal this fact in the output string somehow. Here's how we handle those cases. 124 | 125 | #### Final 17-bit number has 1 to 7 missing bits 126 | 127 | In the following cases: 128 | 129 | bbbbbbbbcccccccc_ // 1 missing bit 130 | bbbbbbbcccccccc__ // 2 missing bits 131 | bbbbbbcccccccc___ // 3 missing bits 132 | bbbbbcccccccc____ // 4 missing bits (note: this is how a Tweet containing 297 bytes of data will end) 133 | bbbbcccccccc_____ // 5 missing bits 134 | bbbcccccccc______ // 6 missing bits 135 | bbcccccccc_______ // 7 missing bits 136 | 137 | we pad the incomplete 17-bit number out to 17 bits using 1s: 138 | 139 | bbbbbbbbcccccccc1 140 | bbbbbbbcccccccc11 141 | bbbbbbcccccccc111 142 | bbbbbcccccccc1111 143 | bbbbcccccccc11111 144 | bbbcccccccc111111 145 | bbcccccccc1111111 146 | 147 | and then encode as normal using our 2<sup>17</sup>-character repertoire. 
148 | 149 | #### Final 17-bit number has 8 to 15 missing bits 150 | 151 | In the following cases: 152 | 153 | bcccccccc________ // 8 missing bits 154 | cccccccc_________ // 9 missing bits 155 | ccccccc__________ // 10 missing bits 156 | cccccc___________ // 11 missing bits 157 | ccccc____________ // 12 missing bits (note: this is how a Tweet containing 296 bytes of data will end) 158 | cccc_____________ // 13 missing bits 159 | ccc______________ // 14 missing bits 160 | cc_______________ // 15 missing bits 161 | 162 | we encode them differently. We'll pad the incomplete number out to only 9 bits using 1s: 163 | 164 | bcccccccc 165 | cccccccc1 166 | ccccccc11 167 | cccccc111 168 | ccccc1111 169 | cccc11111 170 | ccc111111 171 | cc1111111 172 | 173 | and then encode them using a completely different, 2<sup>9</sup>-character repertoire. On decoding, we will treat that character differently, returning 9 bits, rather than 17 from characters in the main repertoire. 174 | 175 | #### Final 17-bit number has 16 missing bits 176 | 177 | In this final case: 178 | 179 | c________________ // 16 missing bits 180 | 181 | we simply take this as a 1-bit number: 182 | 183 | c 184 | 185 | and encode it using a third, 2<sup>1</sup>-character repertoire. Again, on decoding, this is treated specially, and only 1 bit is added to the stream, rather than 9 or 17 as for the other characters. 186 | 187 | In other words, Base131072 is a slight misnomer. It uses not 131,072 but 2<sup>17</sup> + 2<sup>9</sup> + 2<sup>1</sup> = 131,586 characters for its three repertoires. Of course, Base64 uses a 65th character for its padding too. 
188 | 189 | ### Decoding 190 | 191 | On decoding, we get a series of 8-bit values, the last of which might be incomplete, like so: 192 | 193 | 1_______ // 7 missing bits 194 | 11______ // 6 missing bits 195 | 111_____ // 5 missing bits 196 | 1111____ // 4 missing bits 197 | 11111___ // 3 missing bits 198 | 111111__ // 2 missing bits 199 | 1111111_ // 1 missing bit 200 | 201 | These are the padding 1s added at encoding time. We can check this and discard this final value. 202 | 203 | ## Is this ready yet? 204 | 205 | No. We need 131,586 "safe" characters for this encoding, but as of Unicode 9.0 only 108,397 exist. However, future versions of Unicode may add enough safe characters for this to become possible. In any case, the groundwork can certainly be laid. 206 | --------------------------------------------------------------------------------