├── .gitignore ├── composer.json ├── phpunit.xml.dist ├── README.md ├── tests └── Base64Test.php └── src └── Base65536.php /.gitignore: -------------------------------------------------------------------------------- 1 | vendor/ 2 | composer.lock 3 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "phplang/base65536", 3 | "description": "Base65536 encode/decode utility", 4 | "type": "library", 5 | "keywords": [ 6 | "base65536" 7 | ], 8 | "homepage": "https://github.com/phplang/base65536", 9 | "license": "MIT", 10 | "authors": [ 11 | { 12 | "name": "Sara Golemon", 13 | "email": "pollita@php.net", 14 | "homepage": "https://twitter.com/SaraMG", 15 | "role": "Developer" 16 | } 17 | ], 18 | "autoload": { 19 | "psr-4": { 20 | "PhpLang\\": "src/" 21 | } 22 | }, 23 | "require": { 24 | "php": "^7.0" 25 | }, 26 | "require-dev": { 27 | "phpunit/phpunit": "^6.0.0" 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /phpunit.xml.dist: -------------------------------------------------------------------------------- 1 | 2 | 10 | 11 | tests 12 | 13 | 14 | 15 | 16 | src 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # base65536 2 | 3 | Base65536 is a binary encoding optimised for UTF-32-encoded text and Twitter. This PHP composer package, `phplang/base65536`, is loosely based on [qntm/base65536](https://github.com/qntm/base65536). 
4 | 5 | ## Usage 6 | 7 | ```php 8 | use \PhpLang\Base65536; 9 | 10 | $buf = 'hello world'; 11 | $str = Base65536::encode($buf); 12 | echo $str; // 6 code points, '驨ꍬ啯𒁷ꍲᕤ' 13 | 14 | var_dump($buf === Base65536::decode($str)); // bool(true) 15 | ``` 16 | 17 | #### Note 18 | 19 | Per the spec, the default encoding used for input to `decode()` and output from `encode()` is [CESU-8](https://en.wikipedia.org/wiki/CESU-8), a variant of `UTF-8` which encodes split `UTF-16` surrogate pairs. If you want true `UTF-8` output, you must specify so using the second parameter to `encode()` and `decode()`. 20 | 21 | ## License 22 | 23 | MIT, to match the generous licensing of the original. :D 24 | -------------------------------------------------------------------------------- /tests/Base64Test.php: -------------------------------------------------------------------------------- 1 | assertTrue(strlen($enc) > strlen($data)); 13 | $this->assertSame($data, b64k::decode($enc)); 14 | } 15 | } 16 | 17 | public function testKnown() { 18 | $tests = [ 19 | "\x00" => "\u{1500}", 20 | "\x01" => "\u{1501}", 21 | "\xFE" => "\u{15FE}", 22 | "\xFF" => "\u{15FF}", 23 | "\x00\x00" => "\u{3400}", 24 | "\x01\x00" => "\u{3401}", 25 | "\x00\x01" => "\u{3500}", 26 | "\xFE\xFF" => "\u{285FE}", 27 | "\xFF\xFF" => "\u{285FF}", 28 | ]; 29 | foreach ($tests as $bin => $enc) { 30 | $this->assertSame($enc, b64k::encode($bin, 'UTF8')); 31 | $this->assertSame($bin, b64k::decode($enc, 'UTF8')); 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/Base65536.php: -------------------------------------------------------------------------------- 1 | 0x015FF, // Padding block 14 | 0x03400 => 0x04CFF, 15 | 0x04E00 => 0x09EFF, 16 | 0x0A100 => 0x0A3FF, 17 | 0x0A500 => 0x0A5FF, 18 | 0x10600 => 0x106FF, 19 | 0x12000 => 0x122FF, 20 | 0x13000 => 0x133FF, 21 | 0x14400 => 0x145FF, 22 | 0x16800 => 0x169FF, 23 | 0x20000 => 0x285FF, 24 | ]; 25 | 26 | $block = -1; 27 | 
foreach ($ranges as $start => $end) {
            // One table entry per 0x100-wide block: $block is the second input
            // octet (or -1 for the padding block), $start is the block's base
            // codepoint.
            for (; $start < $end; $start += 0x100, ++$block) {
                self::$encodeTable[$block] = $start;
                self::$decodeTable[$start] = $block;
            }
        }
        $initialized = true;
    }

    /**
     * Main encoding algorithm
     *
     * Consumes the input two octets at a time and yields exactly one
     * integer (UTF-32) codepoint per pair. The first octet of a pair becomes
     * the low 8 bits of the codepoint; the second octet selects the block
     * base via the encode table. A trailing unpaired octet is combined with
     * the padding block (table index -1).
     *
     * @param iterable - Data source, one single-byte string per element
     * @yield int - Two bytes of data mapped to Unicode ordinals
     */
    public static function encode_iterable(iterable $str): \Generator {
        $table = static::$encodeTable;
        $low = null; // first octet of the current pair, or null between pairs
        foreach ($str as $chr) {
            if ($low === null) {
                $low = ord($chr);
                continue;
            }
            yield $low | $table[ord($chr)];
            $low = null;
        }
        if ($low !== null) {
            // Odd-length input: emit the last octet using the padding block.
            yield $low | $table[-1];
        }
    }

    /**
     * String based convenience wrapper for encode_iterable()
     *
     * What any given user will probably want in reality.
     * Translates a string of input to a string of output.
     * Encoding defaults to CESU8 per the "spec" github.com/qntm/base65536
     * UTF8 would produce smaller encoded output, but meh...
     *
     * @param string - Input string of binary data
     * @param string - Output encoding (default: CESU8)
     *
     * @return string - Base65536 encoded $input
     *
     * @throws \InvalidArgumentException - If the result cannot be converted to $encoding
     */
    public static function encode(string $str, string $encoding = 'CESU8'): string {
        // Feed the input to encode_iterable() one byte at a time. The length
        // is computed once rather than on every loop iteration.
        $bytes = (static function (string $data): \Generator {
            $len = strlen($data);
            for ($i = 0; $i < $len; ++$i) {
                yield $data[$i];
            }
        })($str);

        $utf8 = '';
        foreach (static::encode_iterable($bytes) as $cp) {
            $utf8 .= \IntlChar::chr($cp);
        }

        // transcode() signals failure by returning false; surface that as an
        // exception instead of letting it leak through the string return type.
        $out = \UConverter::transcode($utf8, $encoding, 'UTF8');
        if ($out === false) {
            throw new \InvalidArgumentException(sprintf("Unable to convert encoded output to %s", $encoding));
        }
        return $out;
    }


    /**
     * Main decode algorithm
     *
     * Accepts numeric codepoints as iterator inputs and produces
     * 1 or 2 bytes of binary data as output.
     *
     * The low 8 bits of each codepoint are the first octet; the remaining
     * bits identify the block, which the decode table maps to the second
     * octet (or to -1 for the padding block, which carries one octet only).
     * Whitespace codepoints outside the table are skipped.
     *
     * @param iterable - Input data
     * @yields char - One or two octets per input codepoint
     *
     * @throws \InvalidArgumentException - On a non-whitespace codepoint outside the table
     */
    public static function decode_iterable(iterable $str): \Generator {
        $table = static::$decodeTable;
        foreach ($str as $cp) {
            $b2 = $table[$cp & 0xFFFFFF00] ?? null;
            if ($b2 === null) {
                if (\IntlChar::isWhitespace($cp)) {
                    continue;
                }
                throw new \InvalidArgumentException(sprintf("U+%04X %s is not a valid base65536 character",
                                                            $cp, \IntlChar::charName($cp)));
            }
            yield chr($cp & 0xFF);
            if ($b2 !== -1) {
                yield chr($b2);
            }
        }
    }

    /**
     * String based convenience wrapper for decode_iterable()
     *
     * What any given user will probably want in reality.
     * Translates an encoded string of input to a string of binary output.
     *
     * @param string - Input string of Base65536 encoded data
     * @param string - Input encoding (default: CESU8)
     *
     * @return string - Binary data
     *
     * @throws \InvalidArgumentException - If the input cannot be converted from
     *                                     $encoding, is not well-formed, or contains
     *                                     characters outside the Base65536 alphabet
     */
    public static function decode(string $str, string $encoding = 'CESU8'): string {
        // Normalize to UTF-8 first; transcode() returns false on failure.
        $utf8 = \UConverter::transcode($str, 'UTF8', $encoding);
        if ($utf8 === false) {
            throw new \InvalidArgumentException(sprintf("Unable to convert input from %s", $encoding));
        }

        // Split the UTF-8 string into integer codepoints, using the lead byte
        // to determine how many bytes each sequence occupies.
        $codepoints = (static function (string $data): \Generator {
            $len = strlen($data);
            for ($i = 0; $i < $len; $i += $width) {
                $lead = ord($data[$i]);
                if (($lead & 0x80) === 0x00) {
                    $width = 1;
                } elseif (($lead & 0xE0) === 0xC0) {
                    $width = 2;
                } elseif (($lead & 0xF0) === 0xE0) {
                    $width = 3;
                } elseif (($lead & 0xF8) === 0xF0) {
                    $width = 4;
                } else {
                    // Stray continuation byte or invalid lead byte.
                    $width = 0;
                }

                // IntlChar::ord() yields null for a truncated or malformed sequence.
                $cp = ($width > 0) ? \IntlChar::ord(substr($data, $i, $width)) : null;
                if ($cp === null) {
                    throw new \InvalidArgumentException("Encountered invalid characters in input");
                }
                yield $cp;
            }
        })($utf8);

        $ret = '';
        foreach (static::decode_iterable($codepoints) as $chr) {
            $ret .= $chr;
        }
        return $ret;
    }
}
Base65536::init();
--------------------------------------------------------------------------------