├── sample-files
    ├── everyByte
    ├── lena_std.tif
    ├── hatetris-wr.bin
    ├── everyPairOfBytes
    ├── hatetris-wr-rle.bin
    └── hatetris-wr-rle2.bin
├── .editorconfig
├── .github
    ├── dependabot.yml
    └── workflows
    │   └── workflow-1.yml
├── .gitattributes
├── package.json
├── .gitignore
├── LICENSE.txt
├── src
    └── index.js
└── README.md


/sample-files/everyByte:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qntm/base131072/HEAD/sample-files/everyByte


--------------------------------------------------------------------------------
/sample-files/lena_std.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qntm/base131072/HEAD/sample-files/lena_std.tif


--------------------------------------------------------------------------------
/sample-files/hatetris-wr.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qntm/base131072/HEAD/sample-files/hatetris-wr.bin


--------------------------------------------------------------------------------
/sample-files/everyPairOfBytes:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qntm/base131072/HEAD/sample-files/everyPairOfBytes


--------------------------------------------------------------------------------
/sample-files/hatetris-wr-rle.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qntm/base131072/HEAD/sample-files/hatetris-wr-rle.bin


--------------------------------------------------------------------------------
/sample-files/hatetris-wr-rle2.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qntm/base131072/HEAD/sample-files/hatetris-wr-rle2.bin


--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
1 | [*]
2 | insert_final_newline = true
3 | 
4 | [{*.js,*.json,*.ts,*.tsx,*.md,*.jsx}]
5 | charset = utf-8
6 | indent_style = space
7 | indent_size = 2
8 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | updates:
 3 |   - package-ecosystem: 'npm'
 4 |     directory: '/'
 5 |     schedule:
 6 |       interval: 'monthly'
 7 |     ignore:
 8 |     - dependency-name: '*'
 9 |       update-types:
10 |       - 'version-update:semver-minor'
11 |       - 'version-update:semver-patch'
12 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
 1 | # Auto detect text files and perform LF normalization
 2 | * text=auto
 3 | 
 4 | # Custom for Visual Studio
 5 | *.cs     diff=csharp
 6 | 
 7 | # Standard to msysgit
 8 | *.doc	 diff=astextplain
 9 | *.DOC	 diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot  diff=astextplain
13 | *.DOT  diff=astextplain
14 | *.pdf  diff=astextplain
15 | *.PDF	 diff=astextplain
16 | *.rtf	 diff=astextplain
17 | *.RTF	 diff=astextplain
18 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "base131072",
 3 |   "version": "1.0.0",
 4 |   "description": "Binary-to-text encoding highly optimised for UTF-32",
 5 |   "homepage": "https://github.com/qntm/base131072",
 6 |   "repository": {
 7 |     "type": "git",
 8 |     "url": "git://github.com/qntm/base131072.git"
 9 |   },
10 |   "type": "module",
11 |   "main": "src/index.js",
12 |   "keywords": [],
13 |   "scripts": {
14 |     "test": "standard"
15 |   },
16 |   "dependencies": {},
17 |   "author": "qntm",
18 |   "license": "MIT",
19 |   "devDependencies": {
20 |     "standard": "^17.0.0"
21 |   }
22 | }
23 | 


--------------------------------------------------------------------------------
/.github/workflows/workflow-1.yml:
--------------------------------------------------------------------------------
 1 | # Runs the `npm run test` script against several Node.js versions on every
 2 | # pull request.
 3 | name: 'Travis CI replacement'
 4 | 
 5 | on:
 6 |   pull_request:
 7 |     branches:
 8 |     - '**'
 9 | 
10 | jobs:
11 |   build-job:
12 |     runs-on: 'ubuntu-latest'
13 | 
14 |     strategy:
15 |       matrix:
16 |         node-version: ['20.x', '22.x', '24.x']
17 | 
18 |     steps:
19 |     # v4 of both actions: the older v1/v2 releases target retired Node.js action runtimes
20 |     - uses: 'actions/checkout@v4'
21 | 
22 |     - name: 'Use Node.js ${{ matrix.node-version }}'
23 |       uses: 'actions/setup-node@v4'
24 |       with:
25 |         node-version: '${{ matrix.node-version }}'
26 | 
27 |     - name: 'Actual npm tasks'
28 |       run: |
29 |         npm install
30 |         npm run test
31 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Windows image file caches
 2 | Thumbs.db
 3 | ehthumbs.db
 4 | 
 5 | # Folder config file
 6 | Desktop.ini
 7 | 
 8 | # Recycle Bin used on file shares
 9 | $RECYCLE.BIN/
10 | 
11 | # Windows Installer files
12 | *.cab
13 | *.msi
14 | *.msm
15 | *.msp
16 | 
17 | # Windows shortcuts
18 | *.lnk
19 | 
20 | # =========================
21 | # Operating System Files
22 | # =========================
23 | 
24 | # OSX
25 | # =========================
26 | 
27 | .DS_Store
28 | .AppleDouble
29 | .LSOverride
30 | 
31 | # Thumbnails
32 | ._*
33 | 
34 | # Files that might appear in the root of a volume
35 | .DocumentRevisions-V100
36 | .fseventsd
37 | .Spotlight-V100
38 | .TemporaryItems
39 | .Trashes
40 | .VolumeIcon.icns
41 | 
42 | # Directories potentially created on remote AFP share
43 | .AppleDB
44 | .AppleDesktop
45 | Network Trash Folder
46 | Temporary Items
47 | .apdisk
48 | 
49 | node_modules
50 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 qntm
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/src/index.js:
--------------------------------------------------------------------------------
  1 | /** Base131072 is a binary-to-text encoding optimised for UTF-32 and Twitter. */
  2 | 
  3 | // Z is a number, usually a uint17 but sometimes a uint9 or a uint1
  4 | 
  5 | const BITS_PER_CHAR = 17 // This is a 17-bit encoding
  6 | const BITS_PER_BYTE = 8
  7 | 
  8 | // Compressed representation of inclusive-exclusive ranges of characters used
  9 | // in this encoding. TODO
 10 | const pairStrings = [
 11 |   '..................................',
 12 |   '.............',
 13 |   '..'
 14 | ]
 15 | 
 16 | // Decompression
 17 | const lookupE = {}
 18 | const lookupD = {}
 19 | pairStrings.forEach((pairString, r) => {
 20 |   const numZBits = BITS_PER_CHAR - BITS_PER_BYTE * r // 0 -> 17, 1 -> 9, 2 -> 1
 21 |   lookupE[numZBits] = {}
 22 |   let z = 0
 23 |   pairString.match(/../gu).forEach(pair => {
 24 |     const [first, last] = [...pair].map(x => x.codePointAt(0))
 25 |     for (let codePoint = first; codePoint < last; codePoint++) {
 26 |       const chr = String.fromCodePoint(codePoint)
 27 |       lookupE[numZBits][z] = chr
 28 |       lookupD[chr] = [numZBits, z]
 29 |       z++
 30 |     }
 31 |   })
 32 | })
 33 | 
 34 | const encode = uint8Array => {
 35 |   const length = uint8Array.length
 36 | 
 37 |   let str = ''
 38 |   let z = 0
 39 |   let numZBits = 0
 40 | 
 41 |   for (let i = 0; i < length; i++) {
 42 |     const uint8 = uint8Array[i]
 43 | 
 44 |     // Take most significant bit first
 45 |     for (let j = BITS_PER_BYTE - 1; j >= 0; j--) {
 46 |       const bit = (uint8 >> j) & 1
 47 | 
 48 |       z = (z << 1) + bit
 49 |       numZBits++
 50 | 
 51 |       if (numZBits === BITS_PER_CHAR) {
 52 |         str += lookupE[numZBits][z]
 53 |         z = 0
 54 |         numZBits = 0
 55 |       }
 56 |     }
 57 |   }
 58 | 
 59 |   if (numZBits !== 0) {
 60 |     // Final bits require special treatment.
 61 |     while (!(numZBits in lookupE)) {
 62 |       z = (z << 1) + 1
 63 |       numZBits++
 64 |     }
 65 | 
 66 |     str += lookupE[numZBits][z]
 67 |   }
 68 | 
 69 |   return str
 70 | }
 71 | 
 72 | const decode = str => {
 73 |   const length = str.length
 74 | 
 75 |   // This length is a guess. There's a chance we allocate one more byte here
 76 |   // than we actually need. But we can count and slice it off later
 77 |   const uint8Array = new Uint8Array(Math.floor(length * BITS_PER_CHAR / BITS_PER_BYTE))
 78 |   let numUint8s = 0
 79 |   let uint8 = 0
 80 |   let numUint8Bits = 0
 81 |   let shouldBeNoMoreChars = false
 82 | 
 83 |   for (const chr of str) {
 84 |     if (shouldBeNoMoreChars) {
 85 |       throw new Error('Secondary character found before end of input')
 86 |     }
 87 | 
 88 |     if (!(chr in lookupD)) {
 89 |       throw new Error(`Unrecognised Base131072 character: ${chr}`)
 90 |     }
 91 | 
 92 |     const [numZBits, z] = lookupD[chr]
 93 | 
 94 |     // Take most significant bit first
 95 |     for (let j = numZBits - 1; j >= 0; j--) {
 96 |       const bit = (z >> j) & 1
 97 | 
 98 |       uint8 = (uint8 << 1) + bit
 99 |       numUint8Bits++
100 | 
101 |       if (numUint8Bits === BITS_PER_BYTE) {
102 |         uint8Array[numUint8s] = uint8
103 |         numUint8s++
104 |         uint8 = 0
105 |         numUint8Bits = 0
106 |       }
107 |     }
108 | 
109 |     if (numZBits !== BITS_PER_CHAR) {
110 |       shouldBeNoMoreChars = true
111 |     }
112 |   }
113 | 
114 |   // Final padding bits! Requires special consideration!
115 |   // Remember how we always pad with 1s?
116 |   // Note: there could be 0 such bits, check still works though
117 |   if (uint8 !== ((1 << numUint8Bits) - 1)) {
118 |     throw new Error('Padding mismatch')
119 |   }
120 | 
121 |   return new Uint8Array(uint8Array.buffer, 0, numUint8s)
122 | }
123 | 
124 | export { encode, decode }
125 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # base131072
  2 | 
  3 | Base131072 is a binary encoding optimised for UTF-32-encoded text and Twitter; it is the intended successor to [Base65536](https://github.com/ferno/base65536). This JavaScript module, `base131072`, is an implementation of this encoding... however, it can't be used yet because there aren't enough safe Unicode characters.
  4 | 
  5 | Efficiency ratings are averaged over long inputs. Higher is better.
  6 | 
  7 | <table>
  8 |   <thead>
  9 |     <tr>
 10 |       <th colspan="2" rowspan="2">Encoding</th>
 11 |       <th colspan="3">Efficiency</th>
 12 |       <th rowspan="2">Bytes per Tweet *</th>
 13 |     </tr>
 14 |     <tr>
 15 |       <th>UTF&#x2011;8</th>
 16 |       <th>UTF&#x2011;16</th>
 17 |       <th>UTF&#x2011;32</th>
 18 |     </tr>
 19 |   </thead>
 20 |   <tbody>
 21 |     <tr>
 22 |       <td rowspan="5">ASCII&#x2011;constrained</td>
 23 |       <td>Unary / <a href="https://github.com/ferno/base1">Base1</a></td>
 24 |       <td style="text-align: right;">0%</td>
 25 |       <td style="text-align: right;">0%</td>
 26 |       <td style="text-align: right;">0%</td>
 27 |       <td style="text-align: right;">1</td>
 28 |     </tr>
 29 |     <tr>
 30 |       <td>Binary</td>
 31 |       <td style="text-align: right;">13%</td>
 32 |       <td style="text-align: right;">6%</td>
 33 |       <td style="text-align: right;">3%</td>
 34 |       <td style="text-align: right;">35</td>
 35 |     </tr>
 36 |     <tr>
 37 |       <td>Hexadecimal</td>
 38 |       <td style="text-align: right;">50%</td>
 39 |       <td style="text-align: right;">25%</td>
 40 |       <td style="text-align: right;">13%</td>
 41 |       <td style="text-align: right;">140</td>
 42 |     </tr>
 43 |     <tr>
 44 |       <td>Base64</td>
 45 |       <td style="text-align: right;"><strong>75%</strong></td>
 46 |       <td style="text-align: right;">38%</td>
 47 |       <td style="text-align: right;">19%</td>
 48 |       <td style="text-align: right;">210</td>
 49 |     </tr>
 50 |     <tr>
 51 |       <td>Base85 †</td>
 52 |       <td style="text-align: right;">80%</td>
 53 |       <td style="text-align: right;">40%</td>
 54 |       <td style="text-align: right;">20%</td>
 55 |       <td style="text-align: right;">224</td>
 56 |     </tr>
 57 |     <tr>
 58 |       <td rowspan="4">BMP&#x2011;constrained</td>
 59 |       <td><a href="https://github.com/ferno/hexagram-encode">HexagramEncode</a></td>
 60 |       <td style="text-align: right;">25%</td>
 61 |       <td style="text-align: right;">38%</td>
 62 |       <td style="text-align: right;">19%</td>
 63 |       <td style="text-align: right;">105</td>
 64 |     </tr>
 65 |     <tr>
 66 |       <td><a href="https://github.com/ferno/braille-encode">BrailleEncode</a></td>
 67 |       <td style="text-align: right;">33%</td>
 68 |       <td style="text-align: right;">50%</td>
 69 |       <td style="text-align: right;">25%</td>
 70 |       <td style="text-align: right;">140</td>
 71 |     </tr>
 72 |     <tr>
 73 |       <td><a href="https://github.com/qntm/base2048">Base2048</a></td>
 74 |       <td style="text-align: right;">56%</td>
 75 |       <td style="text-align: right;">69%</td>
 76 |       <td style="text-align: right;">34%</td>
 77 |       <td style="text-align: right;"><strong>385</strong></td>
 78 |     </tr>
 79 |     <tr>
 80 |       <td><a href="https://github.com/ferno/base32768">Base32768</a></td>
 81 |       <td style="text-align: right;">63%</td>
 82 |       <td style="text-align: right;"><strong>94%</strong></td>
 83 |       <td style="text-align: right;">47%</td>
 84 |       <td style="text-align: right;">263</td>
 85 |     </tr>
 86 |     <tr>
 87 |       <td rowspan="3">Full Unicode</td>
 88 |       <td><a href="https://github.com/keith-turner/ecoji">Ecoji</a></td>
 89 |       <td style="text-align: right;">31%</td>
 90 |       <td style="text-align: right;">31%</td>
 91 |       <td style="text-align: right;">31%</td>
 92 |       <td style="text-align: right;">175</td>
 93 |     </tr>
 94 |     <tr>
 95 |       <td><a href="https://github.com/ferno/base65536">Base65536</a></td>
 96 |       <td style="text-align: right;">56%</td>
 97 |       <td style="text-align: right;">64%</td>
 98 |       <td style="text-align: right;"><strong>50%</strong></td>
 99 |       <td style="text-align: right;">280</td>
100 |     </tr>
101 |     <tr>
102 |       <td><a href="https://github.com/ferno/base131072">Base131072</a> ‡</td>
103 |       <td style="text-align: right;">53%+</td>
104 |       <td style="text-align: right;">53%+</td>
105 |       <td style="text-align: right;">53%</td>
106 |       <td style="text-align: right;">297</td>
107 |     </tr>
108 |   </tbody>
109 | </table>
110 | 
111 | \* New-style "long" Tweets, up to 280 Unicode characters give or take Twitter's complex "weighting" calculation.<br/>
112 | † Base85 is listed for completeness but all variants use characters which are considered hazardous for general use in text: escape characters, brackets, punctuation *etc.*<br/>
113 | ‡ Base131072 is a work in progress, not yet ready for general use.<br/>
114 | 
115 | For example, using Base64, up to 210 bytes of binary data can fit in a Tweet. With Base131072, 297 bytes are possible.
116 | 
117 | ## How does it work?
118 | 
119 | Base131072 is a 17-bit encoding. We take the input binary data as a sequence of 8-bit numbers, compact it into a sequence of bits, then dice the bits up again to make a sequence of 17-bit numbers. We then encode each of these 2<sup>17</sup> = 131,072 possible numbers as a different Unicode code point.
120 | 
121 | ### Padding
122 | 
123 | Note that the final 17-bit number in the sequence is likely to be "incomplete", i.e. missing some of its bits. We need to signal this fact in the output string somehow. Here's how we handle those cases.
124 | 
125 | #### Final 17-bit number has 1 to 7 missing bits
126 | 
127 | In the following cases:
128 | 
129 | 	bbbbbbbbcccccccc_ // 1 missing bit
130 | 	bbbbbbbcccccccc__ // 2 missing bits
131 | 	bbbbbbcccccccc___ // 3 missing bits
132 | 	bbbbbcccccccc____ // 4 missing bits (note: this is how a Tweet containing 297 bytes of data will end)
133 | 	bbbbcccccccc_____ // 5 missing bits
134 | 	bbbcccccccc______ // 6 missing bits
135 | 	bbcccccccc_______ // 7 missing bits
136 | 
137 | we pad the incomplete 17-bit number out to 17 bits using 1s:
138 | 
139 | 	bbbbbbbbcccccccc1
140 | 	bbbbbbbcccccccc11
141 | 	bbbbbbcccccccc111
142 | 	bbbbbcccccccc1111
143 | 	bbbbcccccccc11111
144 | 	bbbcccccccc111111
145 | 	bbcccccccc1111111
146 | 
147 | and then encode as normal using our 2<sup>17</sup>-character repertoire.
148 | 
149 | #### Final 17-bit number has 8 to 15 missing bits
150 | 
151 | In the following cases:
152 | 
153 | 	bcccccccc________ // 8 missing bits
154 | 	cccccccc_________ // 9 missing bits
155 | 	ccccccc__________ // 10 missing bits
156 | 	cccccc___________ // 11 missing bits
157 | 	ccccc____________ // 12 missing bits (note: this is how a Tweet containing 296 bytes of data will end)
158 | 	cccc_____________ // 13 missing bits
159 | 	ccc______________ // 14 missing bits
160 | 	cc_______________ // 15 missing bits
161 | 
162 | we encode them differently. We'll pad the incomplete number out to only 9 bits using 1s:
163 | 
164 | 	bcccccccc
165 | 	cccccccc1
166 | 	ccccccc11
167 | 	cccccc111
168 | 	ccccc1111
169 | 	cccc11111
170 | 	ccc111111
171 | 	cc1111111
172 | 
173 | and then encode them using a completely different, 2<sup>9</sup>-character repertoire. On decoding, we will treat that character differently, returning 9 bits, rather than 17 from characters in the main repertoire.
174 | 
175 | #### Final 17-bit number has 16 missing bits
176 | 
177 | In this final case:
178 | 
179 | 	c________________ // 16 missing bits
180 | 
181 | we simply take this as a 1-bit number:
182 | 
183 | 	c
184 | 
185 | and encode it using a third, 2<sup>1</sup>-character repertoire. Again, on decoding, this is treated specially, and only 1 bit is added to the stream, rather than 9 or 17 as for the other characters.
186 | 
187 | In other words, Base131072 is a slight misnomer. It uses not 131,072 but 2<sup>17</sup> + 2<sup>9</sup> + 2<sup>1</sup> = 131,586 characters for its three repertoires. Of course, Base64 uses a 65th character for its padding too.
188 | 
189 | ### Decoding
190 | 
191 | On decoding, we get a series of 8-bit values, the last of which might be incomplete, like so:
192 | 
193 | 	1_______ // 7 missing bits
194 | 	11______ // 6 missing bits
195 | 	111_____ // 5 missing bits
196 | 	1111____ // 4 missing bits
197 | 	11111___ // 3 missing bits
198 | 	111111__ // 2 missing bits
199 | 	1111111_ // 1 missing bit
200 | 
201 | These are the padding 1s added at encoding time. We can check this and discard this final value.
202 | 
203 | ## Is this ready yet?
204 | 
205 | No. We need 131,586 "safe" characters for this encoding, but as of Unicode 9.0 only 108,397 exist. However, future versions of Unicode may add enough safe characters for this to become possible. In any case, the groundwork can certainly be laid.
206 | 


--------------------------------------------------------------------------------