├── LICENSE.txt ├── README.md ├── c ├── unibinary.xcodeproj │ ├── project.pbxproj │ ├── project.xcworkspace │ │ ├── contents.xcworkspacedata │ │ ├── xcshareddata │ │ │ └── unibinary.xccheckout │ │ └── xcuserdata │ │ │ └── nst.xcuserdatad │ │ │ ├── UserInterfaceState.xcuserstate │ │ │ └── WorkspaceSettings.xcsettings │ └── xcuserdata │ │ └── nst.xcuserdatad │ │ ├── xcdebugger │ │ └── Breakpoints_v2.xcbkptlist │ │ └── xcschemes │ │ ├── tests.xcscheme │ │ ├── unibinary.xcscheme │ │ └── xcschememanagement.plist └── unibinary │ ├── Makefile │ ├── main.c │ ├── tests.c │ ├── unibinary.1 │ ├── unibinary.c │ └── unibinary.h ├── javascript ├── test │ ├── template.css │ ├── test.html │ ├── test.js │ ├── typedarray.js │ ├── wru.console.js │ └── wru.min.js ├── unibinary.js └── unibinary_tool.js └── python ├── ub_profile.py ├── ub_test.py └── unibinary.py /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Nicolas Seriot 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # UniBinary 2 | 3 | _Encodes data into printable Unicode characters._ 4 | 5 | ### What is UniBinary 6 | 7 | UniBinary is an encoding algorithm which packs arbitrary data into printable Unicode characters. 8 | 9 | It can be used to send data through media such as Twitter which don't allow binary data but allow Unicode characters. 10 | 11 | UniBinary is akin to Base64 but uses much fewer characters. 12 | 13 | UniBinary comes with three parts: 14 | 15 | - this documentation, 16 | - a Python implementation, 17 | - a C implementation. 18 | 19 | ### Python Implementation 20 | 21 | Encode a binary file into a UTF-8 text file: 22 | 23 | $ python unibinary.py -e /bin/date > /tmp/date.txt 24 | $ file /tmp/date.txt 25 | /tmp/date.txt: UTF-8 Unicode text, with very long lines, with no line terminators 26 | 27 | Decode a UTF-8 text file into a binary file: 28 | 29 | $ python unibinary.py -d /tmp/date.txt > /tmp/date 30 | $ file /tmp/date 31 | /tmp/date: Mach-O 64-bit executable x86_64 32 | 33 | It works! 
34 | 35 | $ chmod +x /tmp/date 36 | $ /tmp/date 37 | Thu Jan 17 18:02:24 CET 2013 38 | 39 | Inline string encoding: 40 | 41 | $ python unibinary.py -es "test" 42 | 鬥髴 43 | 44 | Inline string decoding: 45 | 46 | $ python unibinary.py -ds "嫯壭巠唀帀廀帀庀帀庀帀嚀一币帀币帀常帀済靬餯瘷駲餤悀巿巿Ѐ丅戀Ѐ丅榀帀乿巿巰叿巿崀帀丏巿巿崅帀渐帀币帀帐帀丏崀巿嵪焨最帀袁夀劃峀勍嘈凄爪与夑巰一帀ӿ丅丏巿蠀帀夀侃峀勍嘏巿巿巿帀巿崀丏巿" > micro_macho 47 | $ chmod +x micro_macho 48 | $ ./micro_macho 49 | Hello world 50 | 51 | ### C Implementation 52 | 53 | Compile it with `make`: 54 | 55 | $ make 56 | $ make tests 57 | 58 | Run the unit tests: 59 | 60 | $ ./tests 61 | ... 62 | -- ALL TESTS ARE OK -- 63 | 64 | Run the main executable: 65 | 66 | $ ./unibinary 67 | Usage: unibinary [-ed] [-sf] [-b num] [-h] 68 | 69 | UniBinary encodes and decodes data into printable Unicode characters. 70 | 71 | -e, --encode 72 | -d, --decode 73 | -s, --string to be encoded or decoded 74 | -f, --filepath to be encoded or decoded 75 | -b, --break break encoded string into num characters lines 76 | -h, --help show this help message and exit 77 | 78 | Encode a file, break output in lines of 16 characters: 79 | 80 | $ unibinary -b 16 -ef micro_macho 81 | 嫯壭巠唀帀廀帀庀帀庀帀嚀一币Ѐ七 82 | 幀帀氀帀逥餬觠鯯骬蜊丏巿巰一Ѐ七 83 | 戀Ѐ丅榀帀乿巿巰叿巿崀Ѐ七ӿ丄彀 84 | 帀舀帀幀帀戀帀巰仿巶堌蠤Ѐ七袁夀 85 | 劃峀勍嘈凄爪与夑巰一帀ӿ丅丏巿蠀 86 | 帀夀侃峀勍嘏巿ӿ七帀巿崀丏巿 87 | 88 | Encode stdin and decode the output: 89 | 90 | $ echo "test" | unibinary -e | unibinary -d 91 | test 92 | 93 | API (`unibinary.h`) 94 | 95 | // encode 96 | int unibinary_encode(FILE *fd_in, FILE *fd_out, size_t wrap_length); 97 | int unibinary_encode_string(const char* src, wchar_t **dst, size_t wrap_length); 98 | 99 | // decode 100 | int unibinary_decode(FILE *src, FILE *dst); 101 | int unibinary_decode_string(const wchar_t *src, char **dst, long *dst_len); 102 | 103 | Encoding and decoding are efficient and time (worst case) is linear with input size. 104 | 105 | In the following example, 10 times the data take 10 times more time to encode or decode. 
106 | 107 | # generate 50 MB of random data 108 | $ dd if=/dev/urandom bs=1k count=1024*50 > /tmp/50 109 | $ shasum /tmp/50 110 | ca8554834cb036a6f7caf449f771573f82ef8b26 /tmp/50 111 | $ time unibinary -ef /tmp/50 > /tmp/50.txt 112 | user 0m8.062s 113 | $ time unibinary -df /tmp/50.txt > /tmp/50_decoded 114 | user 0m6.712s 115 | $ shasum /tmp/50_decoded 116 | ca8554834cb036a6f7caf449f771573f82ef8b26 /tmp/50_decoded 117 | 118 | # generate 500 MB of random data 119 | $ dd if=/dev/urandom bs=1k count=1024*500 > /tmp/500 120 | $ shasum /tmp/500 121 | a69bacfbe3999a817cab9608d14f463fce9b2cd7 /tmp/500 122 | $ user 1m20.879s 123 | $ time unibinary -df /tmp/500.txt > /tmp/500_decoded 124 | user 1m7.764s 125 | $ shasum /tmp/500_decoded 126 | a69bacfbe3999a817cab9608d14f463fce9b2cd7 /tmp/500_decoded 127 | 128 | ### Encoded Text Size 129 | 130 | UniBinary can store 3 arbitrary bytes or 4 ASCII 7-bits characters into 2 Unicode characters. 131 | 132 | You can compare UniBinary with Base64, which stores 3 bytes into 4 ASCII characters: 133 | 134 | | UniBinary (Unicode) | Base64 (ASCII) 135 | --------+---------------------+---------------- 136 | 6 bits | | 1 character 137 | 12 bits | 1 character | 138 | 2 ASCII | 1 character | 139 | 3 bytes | 2 characters | 4 characters 140 | 6 ASCII | 3 characters | 8 characters 141 | 142 | The worst case of encoding `N` bytes is `(N * 2 / 3 + 2)` Unicode characters. 143 | 144 | `C` Unicode characters can store at least `(C - C % 3) * 3 / 2 + (C % 3)` bytes. 145 | 146 | Hence, UniBinary can pack at least 209 bytes in 140 characters. 147 | 148 | In case of a text only made out of `N` ASCII 7-bits characters, the worst case is `N / 2 + 1` Unicode characters. 149 | 150 | Also, any repeated sequence of character will be compressed with a [run-length encoding](http://en.wikipedia.org/wiki/Run-length_encoding). 151 | 152 | ### Format Description 153 | 154 | #### 1. 
Storing Data into Unicode Code Points 155 | 156 | UniBinary packs data into three ranges of Unicode characters, named `U8`, `U12a` and `U12b`. 157 | 158 | A character in `U8` stores an 8-bit value, a character in `U12a` or `U12b` stores a 12-bit value. 159 | 160 | U8 = [ \u0400, ..., \u0400 + 0x100 [ 161 | 162 | U12b = [ \u4E00, ..., \u4E00 + 0x1000 [ 163 | 164 | U12a_0_0 = [ \u5E00, ..., \u5E00 + 0x1000 [ 165 | U12a_0_1 = [ \u6E00, ..., \u6E00 + 0x1000 [ 166 | U12a_1_0 = [ \u7E00, ..., \u7E00 + 0x1000 [ 167 | U12a_1_1 = [ \u8E00, ..., \u8E00 + 0x1000 [ 168 | 169 | `U8` is actually the "Cyrillic" block, while `U12a` and `U12b` are subsets of the "CJK Unified Ideographs" block. 170 | 171 | `U8` and `U12b` store arbitrary 8 and 12 bits sequences, while the `U12a` blocks are used to store ASCII 7-bits characters. 172 | 173 | The offset in the range represents the bits to be encoded. 174 | 175 | 0xAB (8 bits) gets encoded as \u0400 + 0xAB = \u04AB = ҫ 176 | 0xABC (12 bits) gets encoded as \u4E00 + 0xABC = \u58BC = 墼 177 | 178 | #### 2. Mapping Arbitrary Bytes into Unicode 179 | 180 | UniBinary reads three bytes to yield two Unicode characters in the `U12b` range. 181 | 182 | Here is how UniBinary encodes the 24 bits value `0xABCDEF` into two Unicode characters, and how Base64 does it by comparison: 183 | 184 | UniBinary | Base64 185 | | 186 | A B |C D |E F | A B |C D |E F 187 | 10101011 11001101 11101111 | 10101011 11001101 11101111 188 | [-----------][-----------] | [----][-----][-----][----] 189 | ABC DEF | 101010 111100 110111 101111 190 | \u4E00+0xABC \u4E00+0xDEF | 42 60 55 47 191 | 墼 寯 | q 8 3 v 192 | 193 | If less than three bytes are available, UniBinary reads bytes one by one to yield Unicode characters in `U8`. 194 | 195 | A B 196 | 10101011 197 | [------] 198 | AB 199 | \u0400+0xAB 200 | ҫ 201 | 202 | #### 3. 
Mapping ASCII 7-bits into one Unicode character 203 | 204 | When UniBinary meets 2 ASCII 7-bits characters `a1` and `a2`, it encodes them into one single Unicode character. This character is chosen out of four possible ranges, depending on the value of the ASCII characters: 205 | 206 | U12a_0_0 [ \u5E00, ..., \u5E00 + 0x1000 [ for a1 < 64 and a2 < 64 207 | U12a_0_1 [ \u6E00, ..., \u6E00 + 0x1000 [ for a1 < 64 and a2 >= 64 208 | U12a_1_0 [ \u7E00, ..., \u7E00 + 0x1000 [ for a1 >= 64 and a2 < 64 209 | U12a_1_1 [ \u8E00, ..., \u8E00 + 0x1000 [ for a1 >= 64 and a2 >= 64 210 | 211 | So, we can pack 2 * 6 bits in a `U12a` Unicode character. We use four different ranges to replace the 7th (MSB) missing bit. We use `U12a_1_0` and `U12a_1_1` to add 64 to `a1`, and `U12a_0_1` and `U12a_1_1` to add 64 to `a2`. As a result, we can store any tuple of 2 ASCII 7-bits characters in a single Unicode character. 212 | 213 | #### 4. Run Length Encoding 214 | 215 | UniBinary also takes advantage of repetitions to spare bytes. A byte `B` repeated more than 3 times gets encoded as `(u8, u12)` where `u8` stores `B` and `u12` stores the number of times that `B` is repeated in the `U12b` range. 216 | 217 | #### 5. Format Summary 218 | 219 | - u8 u12b -> byte B (u8) repeated N times (u12) | N in [3, 0xFFF] 220 | - u12a -> 12 bits (2 ASCII characters) 221 | - u12b u12b -> 24 bits (3 bytes) 222 | - u8 -> 8 bits (1 byte) 223 | 224 | UniBinary encoded data can be described with the following regular expression: 225 | 226 | ( u12a | (u12 u12) | (u8 u12) )* u8 {0,2} 227 | 228 | Note that new lines (`\n`) can appear anywhere in the encoded text. The decoding algorithm simply ignores them. 229 | 230 | #### 6. 
Examples 231 | 232 | 0x12 0x34 -> encode 0x12 into U8, encode 0x34 into U8 233 | 0xAB 0xCD 0xEF -> encode 0xABC into U12b, encode 0xDEF into U12b 234 | 0xFF 0xFF 0xFF 0xFF -> encode 0xFF into U8, encode 0x4 into U12b 235 | 236 | AB CD EF FF FF FF FF 00 -> U12(0xABC), U12(0xDEF), U8(4), U12(0xFF), U8(00) -> "墼寯巿巿Ѐ" 237 | 238 | 13808 bytes /usr/bin/true -> 3253 Unicode characters, 9721 bytes UTF-8 file 239 | 240 | ### Encoding Algorithm 241 | 242 | First look for repetitions (no more than `0xFFF` at a time). If no repeat, then try to consume two ASCII chars. If it's not possible, look for three bytes. If less than three bytes are available, encode one byte at a time. 243 | 244 | 1. byte B repeated N times | N >= 3 -> U8(B), U12(N) 245 | 2. ASCII characters A1, A2 -> U12a(A1, A2) 246 | 3. bytes B1, B2, B3 -> U12b(B1 << 4 + B2 >> 4), U12b(((B2 & 0xF) << 8) + B3) 247 | 4. byte B -> U8(B) 248 | 249 | ### Decoding Algorithm 250 | 251 | For each unicode character, use the range to know how to unmarshall data. Extract two ASCII characters out of `U12a`, or `N` times `B` out of `(U8, U12b)`, or three bytes out of `(U12b, U12b)`, or one byte out of `U8`. 252 | 253 | See the source code for implementation details. 254 | -------------------------------------------------------------------------------- /c/unibinary.xcodeproj/project.pbxproj: -------------------------------------------------------------------------------- 1 | // !$*UTF8*$! 
2 | { 3 | archiveVersion = 1; 4 | classes = { 5 | }; 6 | objectVersion = 54; 7 | objects = { 8 | 9 | /* Begin PBXBuildFile section */ 10 | 03495DA218707FF200D81680 /* main.c in Sources */ = {isa = PBXBuildFile; fileRef = 03495DA118707FF200D81680 /* main.c */; }; 11 | 03495DA51870810D00D81680 /* unibinary.c in Sources */ = {isa = PBXBuildFile; fileRef = 03A3B0E61869ED890070BD43 /* unibinary.c */; }; 12 | 03495DA61870810D00D81680 /* tests.c in Sources */ = {isa = PBXBuildFile; fileRef = 03E6BF9718707F9B001C339E /* tests.c */; }; 13 | 03495DAA1870810D00D81680 /* unibinary.1 in CopyFiles */ = {isa = PBXBuildFile; fileRef = 03A3B0E81869ED890070BD43 /* unibinary.1 */; }; 14 | 03A3B0E71869ED890070BD43 /* unibinary.c in Sources */ = {isa = PBXBuildFile; fileRef = 03A3B0E61869ED890070BD43 /* unibinary.c */; }; 15 | 03A3B0E91869ED890070BD43 /* unibinary.1 in CopyFiles */ = {isa = PBXBuildFile; fileRef = 03A3B0E81869ED890070BD43 /* unibinary.1 */; }; 16 | /* End PBXBuildFile section */ 17 | 18 | /* Begin PBXContainerItemProxy section */ 19 | 03E7F1C418A0E5CA0075AF4A /* PBXContainerItemProxy */ = { 20 | isa = PBXContainerItemProxy; 21 | containerPortal = 03A3B0DB1869ED890070BD43 /* Project object */; 22 | proxyType = 1; 23 | remoteGlobalIDString = 03A3B0E21869ED890070BD43; 24 | remoteInfo = unibinary; 25 | }; 26 | /* End PBXContainerItemProxy section */ 27 | 28 | /* Begin PBXCopyFilesBuildPhase section */ 29 | 03495DA91870810D00D81680 /* CopyFiles */ = { 30 | isa = PBXCopyFilesBuildPhase; 31 | buildActionMask = 2147483647; 32 | dstPath = /usr/share/man/man1/; 33 | dstSubfolderSpec = 0; 34 | files = ( 35 | 03495DAA1870810D00D81680 /* unibinary.1 in CopyFiles */, 36 | ); 37 | runOnlyForDeploymentPostprocessing = 1; 38 | }; 39 | 03A3B0E11869ED890070BD43 /* CopyFiles */ = { 40 | isa = PBXCopyFilesBuildPhase; 41 | buildActionMask = 2147483647; 42 | dstPath = /usr/share/man/man1/; 43 | dstSubfolderSpec = 0; 44 | files = ( 45 | 03A3B0E91869ED890070BD43 /* unibinary.1 in CopyFiles */, 
46 | ); 47 | runOnlyForDeploymentPostprocessing = 1; 48 | }; 49 | /* End PBXCopyFilesBuildPhase section */ 50 | 51 | /* Begin PBXFileReference section */ 52 | 03495DA018707FE100D81680 /* unibinary.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = unibinary.h; sourceTree = ""; }; 53 | 03495DA118707FF200D81680 /* main.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = main.c; sourceTree = ""; }; 54 | 03495DAE1870810D00D81680 /* tests */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = tests; sourceTree = BUILT_PRODUCTS_DIR; }; 55 | 03A3B0E31869ED890070BD43 /* unibinary */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = unibinary; sourceTree = BUILT_PRODUCTS_DIR; }; 56 | 03A3B0E61869ED890070BD43 /* unibinary.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; path = unibinary.c; sourceTree = ""; }; 57 | 03A3B0E81869ED890070BD43 /* unibinary.1 */ = {isa = PBXFileReference; lastKnownFileType = text.man; path = unibinary.1; sourceTree = ""; }; 58 | 03E6BF9718707F9B001C339E /* tests.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = tests.c; sourceTree = ""; }; 59 | 03E7F1C318A0E4D80075AF4A /* Makefile */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.make; name = Makefile; path = unibinary/Makefile; sourceTree = ""; }; 60 | /* End PBXFileReference section */ 61 | 62 | /* Begin PBXFrameworksBuildPhase section */ 63 | 03495DA81870810D00D81680 /* Frameworks */ = { 64 | isa = PBXFrameworksBuildPhase; 65 | buildActionMask = 2147483647; 66 | files = ( 67 | ); 68 | runOnlyForDeploymentPostprocessing = 0; 69 | }; 70 | 03A3B0E01869ED890070BD43 /* Frameworks */ = { 71 | isa = PBXFrameworksBuildPhase; 72 | buildActionMask = 2147483647; 73 | files = ( 74 | ); 75 | runOnlyForDeploymentPostprocessing = 0; 76 | }; 77 | /* End 
PBXFrameworksBuildPhase section */ 78 | 79 | /* Begin PBXGroup section */ 80 | 03495DB2187086B200D81680 /* tests */ = { 81 | isa = PBXGroup; 82 | children = ( 83 | 03E6BF9718707F9B001C339E /* tests.c */, 84 | ); 85 | name = tests; 86 | path = unibinary; 87 | sourceTree = ""; 88 | }; 89 | 03A3B0DA1869ED890070BD43 = { 90 | isa = PBXGroup; 91 | children = ( 92 | 03E7F1C318A0E4D80075AF4A /* Makefile */, 93 | 03A3B0E51869ED890070BD43 /* unibinary */, 94 | 03495DB2187086B200D81680 /* tests */, 95 | 03A3B0E41869ED890070BD43 /* Products */, 96 | ); 97 | sourceTree = ""; 98 | }; 99 | 03A3B0E41869ED890070BD43 /* Products */ = { 100 | isa = PBXGroup; 101 | children = ( 102 | 03A3B0E31869ED890070BD43 /* unibinary */, 103 | 03495DAE1870810D00D81680 /* tests */, 104 | ); 105 | name = Products; 106 | sourceTree = ""; 107 | }; 108 | 03A3B0E51869ED890070BD43 /* unibinary */ = { 109 | isa = PBXGroup; 110 | children = ( 111 | 03495DA018707FE100D81680 /* unibinary.h */, 112 | 03A3B0E61869ED890070BD43 /* unibinary.c */, 113 | 03495DA118707FF200D81680 /* main.c */, 114 | 03A3B0E81869ED890070BD43 /* unibinary.1 */, 115 | ); 116 | path = unibinary; 117 | sourceTree = ""; 118 | }; 119 | /* End PBXGroup section */ 120 | 121 | /* Begin PBXNativeTarget section */ 122 | 03495DA31870810D00D81680 /* tests */ = { 123 | isa = PBXNativeTarget; 124 | buildConfigurationList = 03495DAB1870810D00D81680 /* Build configuration list for PBXNativeTarget "tests" */; 125 | buildPhases = ( 126 | 03495DA41870810D00D81680 /* Sources */, 127 | 03495DA81870810D00D81680 /* Frameworks */, 128 | 03495DA91870810D00D81680 /* CopyFiles */, 129 | ); 130 | buildRules = ( 131 | ); 132 | dependencies = ( 133 | 03E7F1C518A0E5CA0075AF4A /* PBXTargetDependency */, 134 | ); 135 | name = tests; 136 | productName = unibinary; 137 | productReference = 03495DAE1870810D00D81680 /* tests */; 138 | productType = "com.apple.product-type.tool"; 139 | }; 140 | 03A3B0E21869ED890070BD43 /* unibinary */ = { 141 | isa = PBXNativeTarget; 142 
| buildConfigurationList = 03A3B0EC1869ED890070BD43 /* Build configuration list for PBXNativeTarget "unibinary" */; 143 | buildPhases = ( 144 | 03A3B0DF1869ED890070BD43 /* Sources */, 145 | 03A3B0E01869ED890070BD43 /* Frameworks */, 146 | 03A3B0E11869ED890070BD43 /* CopyFiles */, 147 | ); 148 | buildRules = ( 149 | ); 150 | dependencies = ( 151 | ); 152 | name = unibinary; 153 | productName = unibinary; 154 | productReference = 03A3B0E31869ED890070BD43 /* unibinary */; 155 | productType = "com.apple.product-type.tool"; 156 | }; 157 | /* End PBXNativeTarget section */ 158 | 159 | /* Begin PBXProject section */ 160 | 03A3B0DB1869ED890070BD43 /* Project object */ = { 161 | isa = PBXProject; 162 | attributes = { 163 | BuildIndependentTargetsInParallel = YES; 164 | LastUpgradeCheck = 1610; 165 | ORGANIZATIONNAME = "Nicolas Seriot"; 166 | }; 167 | buildConfigurationList = 03A3B0DE1869ED890070BD43 /* Build configuration list for PBXProject "unibinary" */; 168 | compatibilityVersion = "Xcode 3.2"; 169 | developmentRegion = en; 170 | hasScannedForEncodings = 0; 171 | knownRegions = ( 172 | en, 173 | Base, 174 | ); 175 | mainGroup = 03A3B0DA1869ED890070BD43; 176 | productRefGroup = 03A3B0E41869ED890070BD43 /* Products */; 177 | projectDirPath = ""; 178 | projectRoot = ""; 179 | targets = ( 180 | 03A3B0E21869ED890070BD43 /* unibinary */, 181 | 03495DA31870810D00D81680 /* tests */, 182 | ); 183 | }; 184 | /* End PBXProject section */ 185 | 186 | /* Begin PBXSourcesBuildPhase section */ 187 | 03495DA41870810D00D81680 /* Sources */ = { 188 | isa = PBXSourcesBuildPhase; 189 | buildActionMask = 2147483647; 190 | files = ( 191 | 03495DA51870810D00D81680 /* unibinary.c in Sources */, 192 | 03495DA61870810D00D81680 /* tests.c in Sources */, 193 | ); 194 | runOnlyForDeploymentPostprocessing = 0; 195 | }; 196 | 03A3B0DF1869ED890070BD43 /* Sources */ = { 197 | isa = PBXSourcesBuildPhase; 198 | buildActionMask = 2147483647; 199 | files = ( 200 | 03A3B0E71869ED890070BD43 /* unibinary.c in 
Sources */, 201 | 03495DA218707FF200D81680 /* main.c in Sources */, 202 | ); 203 | runOnlyForDeploymentPostprocessing = 0; 204 | }; 205 | /* End PBXSourcesBuildPhase section */ 206 | 207 | /* Begin PBXTargetDependency section */ 208 | 03E7F1C518A0E5CA0075AF4A /* PBXTargetDependency */ = { 209 | isa = PBXTargetDependency; 210 | target = 03A3B0E21869ED890070BD43 /* unibinary */; 211 | targetProxy = 03E7F1C418A0E5CA0075AF4A /* PBXContainerItemProxy */; 212 | }; 213 | /* End PBXTargetDependency section */ 214 | 215 | /* Begin XCBuildConfiguration section */ 216 | 03495DAC1870810D00D81680 /* Debug */ = { 217 | isa = XCBuildConfiguration; 218 | buildSettings = { 219 | DEAD_CODE_STRIPPING = YES; 220 | MACOSX_DEPLOYMENT_TARGET = 11.5; 221 | PRODUCT_NAME = tests; 222 | }; 223 | name = Debug; 224 | }; 225 | 03495DAD1870810D00D81680 /* Release */ = { 226 | isa = XCBuildConfiguration; 227 | buildSettings = { 228 | DEAD_CODE_STRIPPING = YES; 229 | MACOSX_DEPLOYMENT_TARGET = 11.5; 230 | PRODUCT_NAME = tests; 231 | }; 232 | name = Release; 233 | }; 234 | 03A3B0EA1869ED890070BD43 /* Debug */ = { 235 | isa = XCBuildConfiguration; 236 | buildSettings = { 237 | ALWAYS_SEARCH_USER_PATHS = NO; 238 | CLANG_ANALYZER_LOCALIZABILITY_NONLOCALIZED = YES; 239 | CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x"; 240 | CLANG_CXX_LIBRARY = "libc++"; 241 | CLANG_ENABLE_OBJC_ARC = YES; 242 | CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; 243 | CLANG_WARN_BOOL_CONVERSION = YES; 244 | CLANG_WARN_COMMA = YES; 245 | CLANG_WARN_CONSTANT_CONVERSION = YES; 246 | CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; 247 | CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; 248 | CLANG_WARN_EMPTY_BODY = YES; 249 | CLANG_WARN_ENUM_CONVERSION = YES; 250 | CLANG_WARN_INFINITE_RECURSION = YES; 251 | CLANG_WARN_INT_CONVERSION = YES; 252 | CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; 253 | CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; 254 | CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; 255 | CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; 256 | 
CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; 257 | CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; 258 | CLANG_WARN_STRICT_PROTOTYPES = YES; 259 | CLANG_WARN_SUSPICIOUS_MOVE = YES; 260 | CLANG_WARN_UNREACHABLE_CODE = YES; 261 | CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; 262 | COPY_PHASE_STRIP = NO; 263 | DEAD_CODE_STRIPPING = YES; 264 | ENABLE_STRICT_OBJC_MSGSEND = YES; 265 | ENABLE_TESTABILITY = YES; 266 | ENABLE_USER_SCRIPT_SANDBOXING = YES; 267 | GCC_C_LANGUAGE_STANDARD = gnu99; 268 | GCC_DYNAMIC_NO_PIC = NO; 269 | GCC_ENABLE_OBJC_EXCEPTIONS = YES; 270 | GCC_NO_COMMON_BLOCKS = YES; 271 | GCC_OPTIMIZATION_LEVEL = 0; 272 | GCC_PREPROCESSOR_DEFINITIONS = ( 273 | "DEBUG=1", 274 | "$(inherited)", 275 | ); 276 | GCC_SYMBOLS_PRIVATE_EXTERN = NO; 277 | GCC_WARN_64_TO_32_BIT_CONVERSION = YES; 278 | GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; 279 | GCC_WARN_UNDECLARED_SELECTOR = YES; 280 | GCC_WARN_UNINITIALIZED_AUTOS = YES; 281 | GCC_WARN_UNUSED_FUNCTION = YES; 282 | GCC_WARN_UNUSED_VARIABLE = YES; 283 | ONLY_ACTIVE_ARCH = YES; 284 | OTHER_CFLAGS = "-Wall"; 285 | SDKROOT = macosx; 286 | }; 287 | name = Debug; 288 | }; 289 | 03A3B0EB1869ED890070BD43 /* Release */ = { 290 | isa = XCBuildConfiguration; 291 | buildSettings = { 292 | ALWAYS_SEARCH_USER_PATHS = NO; 293 | CLANG_ANALYZER_LOCALIZABILITY_NONLOCALIZED = YES; 294 | CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x"; 295 | CLANG_CXX_LIBRARY = "libc++"; 296 | CLANG_ENABLE_OBJC_ARC = YES; 297 | CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; 298 | CLANG_WARN_BOOL_CONVERSION = YES; 299 | CLANG_WARN_COMMA = YES; 300 | CLANG_WARN_CONSTANT_CONVERSION = YES; 301 | CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; 302 | CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; 303 | CLANG_WARN_EMPTY_BODY = YES; 304 | CLANG_WARN_ENUM_CONVERSION = YES; 305 | CLANG_WARN_INFINITE_RECURSION = YES; 306 | CLANG_WARN_INT_CONVERSION = YES; 307 | CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; 308 | CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; 309 | 
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; 310 | CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; 311 | CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; 312 | CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; 313 | CLANG_WARN_STRICT_PROTOTYPES = YES; 314 | CLANG_WARN_SUSPICIOUS_MOVE = YES; 315 | CLANG_WARN_UNREACHABLE_CODE = YES; 316 | CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; 317 | COPY_PHASE_STRIP = YES; 318 | DEAD_CODE_STRIPPING = YES; 319 | DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; 320 | ENABLE_NS_ASSERTIONS = NO; 321 | ENABLE_STRICT_OBJC_MSGSEND = YES; 322 | ENABLE_USER_SCRIPT_SANDBOXING = YES; 323 | GCC_C_LANGUAGE_STANDARD = gnu99; 324 | GCC_ENABLE_OBJC_EXCEPTIONS = YES; 325 | GCC_NO_COMMON_BLOCKS = YES; 326 | GCC_WARN_64_TO_32_BIT_CONVERSION = YES; 327 | GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; 328 | GCC_WARN_UNDECLARED_SELECTOR = YES; 329 | GCC_WARN_UNINITIALIZED_AUTOS = YES; 330 | GCC_WARN_UNUSED_FUNCTION = YES; 331 | GCC_WARN_UNUSED_VARIABLE = YES; 332 | OTHER_CFLAGS = "-Wall"; 333 | SDKROOT = macosx; 334 | }; 335 | name = Release; 336 | }; 337 | 03A3B0ED1869ED890070BD43 /* Debug */ = { 338 | isa = XCBuildConfiguration; 339 | buildSettings = { 340 | DEAD_CODE_STRIPPING = YES; 341 | MACOSX_DEPLOYMENT_TARGET = 11.0; 342 | PRODUCT_NAME = "$(TARGET_NAME)"; 343 | }; 344 | name = Debug; 345 | }; 346 | 03A3B0EE1869ED890070BD43 /* Release */ = { 347 | isa = XCBuildConfiguration; 348 | buildSettings = { 349 | DEAD_CODE_STRIPPING = YES; 350 | MACOSX_DEPLOYMENT_TARGET = 11.0; 351 | PRODUCT_NAME = "$(TARGET_NAME)"; 352 | }; 353 | name = Release; 354 | }; 355 | /* End XCBuildConfiguration section */ 356 | 357 | /* Begin XCConfigurationList section */ 358 | 03495DAB1870810D00D81680 /* Build configuration list for PBXNativeTarget "tests" */ = { 359 | isa = XCConfigurationList; 360 | buildConfigurations = ( 361 | 03495DAC1870810D00D81680 /* Debug */, 362 | 03495DAD1870810D00D81680 /* Release */, 363 | ); 364 | defaultConfigurationIsVisible = 0; 365 | defaultConfigurationName = Release; 
366 | }; 367 | 03A3B0DE1869ED890070BD43 /* Build configuration list for PBXProject "unibinary" */ = { 368 | isa = XCConfigurationList; 369 | buildConfigurations = ( 370 | 03A3B0EA1869ED890070BD43 /* Debug */, 371 | 03A3B0EB1869ED890070BD43 /* Release */, 372 | ); 373 | defaultConfigurationIsVisible = 0; 374 | defaultConfigurationName = Release; 375 | }; 376 | 03A3B0EC1869ED890070BD43 /* Build configuration list for PBXNativeTarget "unibinary" */ = { 377 | isa = XCConfigurationList; 378 | buildConfigurations = ( 379 | 03A3B0ED1869ED890070BD43 /* Debug */, 380 | 03A3B0EE1869ED890070BD43 /* Release */, 381 | ); 382 | defaultConfigurationIsVisible = 0; 383 | defaultConfigurationName = Release; 384 | }; 385 | /* End XCConfigurationList section */ 386 | }; 387 | rootObject = 03A3B0DB1869ED890070BD43 /* Project object */; 388 | } 389 | -------------------------------------------------------------------------------- /c/unibinary.xcodeproj/project.xcworkspace/contents.xcworkspacedata: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /c/unibinary.xcodeproj/project.xcworkspace/xcshareddata/unibinary.xccheckout: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | IDESourceControlProjectFavoriteDictionaryKey 6 | 7 | IDESourceControlProjectIdentifier 8 | 3F27F61D-A4BB-473B-AD19-AAB4617EA8C6 9 | IDESourceControlProjectName 10 | unibinary 11 | IDESourceControlProjectOriginsDictionary 12 | 13 | 88D749B0-2DE6-49D3-B7FD-DE2AD7DF2BAC 14 | https://github.com/nst/UniBinary.git 15 | 16 | IDESourceControlProjectPath 17 | c/unibinary.xcodeproj/project.xcworkspace 18 | IDESourceControlProjectRelativeInstallPathDictionary 19 | 20 | 88D749B0-2DE6-49D3-B7FD-DE2AD7DF2BAC 21 | ../../.. 
22 | 23 | IDESourceControlProjectURL 24 | https://github.com/nst/UniBinary.git 25 | IDESourceControlProjectVersion 26 | 110 27 | IDESourceControlProjectWCCIdentifier 28 | 88D749B0-2DE6-49D3-B7FD-DE2AD7DF2BAC 29 | IDESourceControlProjectWCConfigurations 30 | 31 | 32 | IDESourceControlRepositoryExtensionIdentifierKey 33 | public.vcs.git 34 | IDESourceControlWCCIdentifierKey 35 | 88D749B0-2DE6-49D3-B7FD-DE2AD7DF2BAC 36 | IDESourceControlWCCName 37 | UniBinary 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /c/unibinary.xcodeproj/project.xcworkspace/xcuserdata/nst.xcuserdatad/UserInterfaceState.xcuserstate: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nst/UniBinary/f6a9929f90616540060d329ef7114c1b6edb3a76/c/unibinary.xcodeproj/project.xcworkspace/xcuserdata/nst.xcuserdatad/UserInterfaceState.xcuserstate -------------------------------------------------------------------------------- /c/unibinary.xcodeproj/project.xcworkspace/xcuserdata/nst.xcuserdatad/WorkspaceSettings.xcsettings: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | HasAskedToTakeAutomaticSnapshotBeforeSignificantChanges 6 | 7 | SnapshotAutomaticallyBeforeSignificantChanges 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /c/unibinary.xcodeproj/xcuserdata/nst.xcuserdatad/xcdebugger/Breakpoints_v2.xcbkptlist: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | -------------------------------------------------------------------------------- /c/unibinary.xcodeproj/xcuserdata/nst.xcuserdatad/xcschemes/tests.xcscheme: -------------------------------------------------------------------------------- 1 | 2 | 5 | 8 | 9 | 15 | 21 | 22 | 23 | 24 | 25 | 30 | 31 | 32 | 33 | 39 | 40 | 41 | 42 | 51 | 52 | 58 | 59 | 60 | 61 | 65 | 66 | 67 | 68 | 69 | 
70 | 76 | 77 | 83 | 84 | 85 | 86 | 88 | 89 | 92 | 93 | 94 | -------------------------------------------------------------------------------- /c/unibinary.xcodeproj/xcuserdata/nst.xcuserdatad/xcschemes/unibinary.xcscheme: -------------------------------------------------------------------------------- 1 | 2 | 5 | 8 | 9 | 15 | 21 | 22 | 23 | 24 | 25 | 30 | 31 | 32 | 33 | 39 | 40 | 41 | 42 | 51 | 52 | 58 | 59 | 60 | 61 | 65 | 66 | 67 | 68 | 69 | 70 | 76 | 77 | 83 | 84 | 85 | 86 | 88 | 89 | 92 | 93 | 94 | -------------------------------------------------------------------------------- /c/unibinary.xcodeproj/xcuserdata/nst.xcuserdatad/xcschemes/xcschememanagement.plist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | SchemeUserState 6 | 7 | tests.xcscheme 8 | 9 | orderHint 10 | 1 11 | 12 | unibinary.xcscheme 13 | 14 | orderHint 15 | 0 16 | 17 | 18 | SuppressBuildableAutocreation 19 | 20 | 03495DA31870810D00D81680 21 | 22 | primary 23 | 24 | 25 | 03A3B0E21869ED890070BD43 26 | 27 | primary 28 | 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /c/unibinary/Makefile: -------------------------------------------------------------------------------- 1 | CC=gcc 2 | CFLAGS=-I. -Wall 3 | 4 | unibinary: unibinary.o main.o 5 | $(CC) -o unibinary main.o unibinary.o $(CFLAGS) 6 | 7 | tests: unibinary.o tests.o 8 | $(CC) -o tests unibinary.o tests.o $(CFLAGS) 9 | 10 | clean: 11 | rm -rf *o unibinary tests 12 | -------------------------------------------------------------------------------- /c/unibinary/main.c: -------------------------------------------------------------------------------- 1 | // 2 | // main.c 3 | // unibinary 4 | // 5 | // Created by Nicolas Seriot on 29/12/13. 6 | // Copyright (c) 2013 Nicolas Seriot. All rights reserved. 
7 | // 8 | 9 | #include "unibinary.h" 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | int display_usage(void) { 17 | printf("Usage: unibinary [-ed] [-sf] [-b num] [-h]\n"); 18 | printf("\n"); 19 | printf("UniBinary encodes and decodes data into printable Unicode characters.\n"); 20 | printf("\n"); 21 | printf(" -e, --encode\n"); 22 | printf(" -d, --decode\n"); 23 | printf(" -s, --string to be encoded or decoded\n"); 24 | printf(" -f, --filepath to be encoded or decoded\n"); 25 | printf(" -b, --break break encoded string into num characters lines\n"); 26 | printf(" -h, --help show this help message and exit\n"); 27 | return EXIT_SUCCESS; 28 | } 29 | 30 | static const struct option long_options[] = 31 | { 32 | { "encode", no_argument, 0, 'e' }, 33 | { "decode", no_argument, 0, 'd' }, 34 | { "string", required_argument, 0, 's' }, 35 | { "path", required_argument, 0, 'f' }, 36 | { "break", required_argument, 0, 'b' }, 37 | { "help", no_argument, 0, 'h' }, 38 | { NULL, 0, NULL, 0 } 39 | }; 40 | 41 | struct global_args_t { 42 | short encode; 43 | short decode; 44 | char *string; 45 | const char *path; 46 | short wrap; 47 | } global_args; 48 | 49 | int main(int argc, char * const argv[]) { 50 | 51 | // $ echo test | ./unibinary -e | ./unibinary -d 52 | // test 53 | 54 | char *old_locale = setlocale(LC_ALL, NULL); 55 | char *saved_locale = strdup(old_locale); 56 | if(saved_locale == NULL) return EXIT_FAILURE; 57 | 58 | setlocale(LC_CTYPE, ""); 59 | 60 | static const char *opt_string = "eds:f:b:h"; 61 | 62 | int opt = getopt_long( argc, argv, opt_string, long_options, NULL); 63 | while( opt != -1 ) { 64 | switch( opt ) { 65 | case 'e': 66 | global_args.encode = 1; 67 | break; 68 | case 'd': 69 | global_args.decode = 1; 70 | break; 71 | case 's': 72 | global_args.string = optarg; 73 | break; 74 | case 'f': 75 | global_args.path = optarg; 76 | break; 77 | case 'b': 78 | global_args.wrap = atoi(optarg); 79 | break; 80 | // case 'h': 81 | // 
display_usage(); 82 | // goto exit_failure; 83 | break; 84 | default: 85 | break; 86 | } 87 | 88 | opt = getopt_long( argc, argv, opt_string, long_options, NULL); 89 | } 90 | 91 | if(global_args.encode) { 92 | // encode 93 | 94 | if(global_args.string != NULL) { 95 | // encode string 96 | wchar_t *wcs; 97 | unibinary_encode_string(global_args.string, &wcs, global_args.wrap); 98 | fwprintf(stdout, wcs); 99 | free(wcs); 100 | } else if (global_args.path != NULL) { 101 | // encode path 102 | FILE *fd_in = fopen(global_args.path, "rb"); 103 | if(fd_in == NULL) goto exit_failure; 104 | 105 | int status = unibinary_encode(fd_in, stdout, global_args.wrap); 106 | fclose(fd_in); 107 | 108 | if(status != 0) goto exit_failure; 109 | } else { 110 | // encode stdin 111 | int status = unibinary_encode(stdin, stdout, global_args.wrap); 112 | if(status != 0) goto exit_failure; 113 | } 114 | } else if (global_args.decode) { 115 | // decode 116 | 117 | if(global_args.string != NULL) { 118 | // decode string 119 | size_t max_wchar_bytes = strlen(global_args.string) * MB_CUR_MAX; 120 | wchar_t wcsout[max_wchar_bytes]; 121 | size_t nb_wc = mbstowcs(wcsout, global_args.string, max_wchar_bytes); 122 | if(nb_wc == -1) goto exit_failure; 123 | 124 | char* data; 125 | long dst_len; 126 | unibinary_decode_string(wcsout, &data, &dst_len); 127 | size_t written = fwrite(data, sizeof(char), dst_len, stdout); 128 | free(data); 129 | 130 | if(written != dst_len) goto exit_failure; 131 | 132 | } else if (global_args.path != NULL) { 133 | // decode path 134 | FILE *fd_in = fopen(global_args.path, "rb"); 135 | if(fd_in == NULL) goto exit_failure; 136 | 137 | int status = unibinary_decode(fd_in, stdout); 138 | fclose(fd_in); 139 | 140 | if(status != 0) goto exit_failure; 141 | } else { 142 | // decode stdin 143 | int status = unibinary_decode(stdin, stdout); 144 | if(status != 0) goto exit_failure; 145 | 146 | } 147 | 148 | } else { 149 | display_usage(); 150 | } 151 | 152 | if(global_args.encode || 
global_args.decode) { 153 | // add a newline if stdout is not piped 154 | if (isatty(fileno(stdout)) == 1) { 155 | fflush(stdout); 156 | fprintf(stderr, "\n"); 157 | } 158 | } 159 | 160 | goto exit_success; 161 | 162 | exit_success: 163 | setlocale(LC_ALL, saved_locale); 164 | free(saved_locale); 165 | 166 | return EXIT_SUCCESS; 167 | 168 | exit_failure: 169 | setlocale(LC_ALL, saved_locale); 170 | free(saved_locale); 171 | 172 | return EXIT_FAILURE; 173 | } 174 | -------------------------------------------------------------------------------- /c/unibinary/tests.c: -------------------------------------------------------------------------------- 1 | // 2 | // tests.c 3 | // unibinary 4 | // 5 | // Created by Nicolas Seriot on 29/12/13. 6 | // Copyright (c) 2013 Nicolas Seriot. All rights reserved. 7 | // 8 | 9 | #include "unibinary.h" 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | int number_of_repeated_characters_at_index(const char* src, size_t i, size_t srcSize, int *n); 17 | int unichr_12a_from_two_ascii(unsigned char c0, unsigned char c1, wchar_t *u0); 18 | int two_twelve_bits_values_from_three_bytes(uint8_t c0, uint8_t c1, uint8_t c2, wchar_t *u0, wchar_t *u1); 19 | int two_unichr_to_repeat_byte_ntimes(unsigned char c, int n, wchar_t *u0, wchar_t *u1); 20 | int to_U08(uint8_t i, wchar_t *o); 21 | int to_U12(wchar_t i, wchar_t *o); 22 | int from_U12b(wchar_t i, wchar_t *o); 23 | int is_in_U08b(wchar_t i); 24 | int is_in_U12a(wchar_t i); 25 | int bytes_from_u1_u2(wchar_t u1, wchar_t u2, uint8_t **buffer, size_t *bufferSize); 26 | int U12a_to_8_8(wchar_t u, uint8_t *b0, uint8_t *b1); 27 | int two_bytes_from_unichars(wchar_t u1, wchar_t u2, uint8_t *b1, uint8_t *b2); 28 | int three_bytes_from_unichars(wchar_t u1, wchar_t u2, uint8_t *b1, uint8_t *b2, uint8_t *b3); 29 | 30 | int compareFiles(const char* filename1, const char* filename2) { 31 | // 0 if same contents 32 | 33 | FILE *f1 = fopen(filename1, "r"); 34 | FILE *f2 = fopen(filename2, 
"r"); 35 | 36 | // obtain file size: 37 | fseek (f1 , 0 , SEEK_END); 38 | long size1 = ftell (f1); 39 | rewind (f1); 40 | 41 | // obtain file size: 42 | fseek (f2 , 0 , SEEK_END); 43 | long size2 = ftell (f2); 44 | rewind (f2); 45 | 46 | if (size1 != size2) { 47 | printf("File sizes differ, %ld vs. %ld\n", size1, size2); 48 | fclose(f1); 49 | fclose(f2); 50 | return EXIT_FAILURE; 51 | } 52 | 53 | char tmp1, tmp2; 54 | 55 | int files_are_equal = 1; 56 | 57 | for (int i=0;i 11 | #include 12 | #include 13 | 14 | // encodes ascii 7-bits characters 15 | wchar_t U12a_0_0_start = 0x5E00; // CJK Unified Ideographs (subset) - encodes 12 bits (2 ascii) - MSB 0,0 16 | wchar_t U12a_0_1_start = 0x6E00; // CJK Unified Ideographs (subset) - encodes 12 bits (2 ascii) - MSB 0,1 17 | wchar_t U12a_1_0_start = 0x7E00; // CJK Unified Ideographs (subset) - encodes 12 bits (2 ascii) - MSB 1,0 18 | wchar_t U12a_1_1_start = 0x8E00; // CJK Unified Ideographs (subset) - encodes 12 bits (2 ascii) - MSB 1,1 19 | wchar_t U12a_length = 0x1000; 20 | 21 | // encodes arbitrary bits 22 | wchar_t U12b_start = 0x4E00; // CJK Unified Ideographs (subset) - encodes 12 bits 23 | wchar_t U12b_length = 0x1000; 24 | wchar_t U8_start = 0x0400; // Cyrillic - encodes 8 bits 25 | wchar_t U8_length = 0x0100; 26 | 27 | int is_in_U08b(wchar_t i) { 28 | return i >= U8_start && i < (U8_start + U8_length); 29 | } 30 | 31 | int is_in_U12a(wchar_t u) { 32 | 33 | wchar_t starts[4] = {U12a_0_0_start, U12a_0_1_start, U12a_1_0_start, U12a_1_1_start}; 34 | 35 | for (int i = 0; i < 4; i++) { 36 | wchar_t start = starts[i]; 37 | if(u >= start && u < (start + U12a_length)) { 38 | return 1; 39 | } 40 | } 41 | 42 | return EXIT_SUCCESS; 43 | } 44 | 45 | int is_in_U12b(wchar_t u) { 46 | return u >= U12b_start && u < (U12b_start + U12b_length); 47 | } 48 | 49 | int two_unichr_to_repeat_byte_ntimes(unsigned char c, int n, wchar_t *u0, wchar_t *u1) { 50 | 51 | if (c > 0xFF) return EXIT_FAILURE; 52 | if (n > 0xFFF) return EXIT_FAILURE; 
53 | 54 | *u0 = U8_start + c; 55 | *u1 = U12b_start + n; 56 | 57 | return EXIT_SUCCESS; 58 | } 59 | 60 | int two_twelve_bits_values_from_three_bytes(unsigned char c0, unsigned char c1, unsigned char c2, wchar_t *u0, wchar_t *u1) { 61 | 62 | // (0x12, 0x34, 0x56) -> (0x123, 0x456) 63 | 64 | if (c0 > 0xFF || c1 > 0xFF || c2 > 0xFF) return EXIT_FAILURE; 65 | 66 | *u0 = (c0 << 4) + (c1 >> 4); 67 | *u1 = ((c1 & 0xF) << 8) + c2; 68 | 69 | return EXIT_SUCCESS; 70 | } 71 | 72 | int unichr_12a_from_two_ascii(unsigned char c0, unsigned char c1, wchar_t *u0) { 73 | 74 | int unicode_start = 0; 75 | 76 | if (c0 < 64 && c1 < 64) { 77 | unicode_start = U12a_0_0_start; 78 | } else if (c0 < 64 && c1 >= 64) { 79 | c1 -= 64; 80 | unicode_start = U12a_0_1_start; 81 | } else if (c0 >= 64 && c1 < 64) { 82 | c0 -= 64; 83 | unicode_start = U12a_1_0_start; 84 | } else if (c0 >= 64 && c1 >= 64) { 85 | c0 -= 64; 86 | c1 -= 64; 87 | unicode_start = U12a_1_1_start; 88 | } 89 | 90 | *u0 = unicode_start + (c0 << 6) + c1; 91 | 92 | return EXIT_SUCCESS; 93 | } 94 | 95 | int number_of_repeated_characters_at_index(const char* src, size_t i, size_t srcSize, int *n) { 96 | 97 | int repeats_count = 0; 98 | 99 | unsigned char c = src[i]; 100 | 101 | while(i < srcSize && (c == (unsigned char)src[i])) { 102 | repeats_count += 1; 103 | i++; 104 | } 105 | 106 | *n = repeats_count; 107 | 108 | return EXIT_SUCCESS; 109 | } 110 | 111 | int to_U08(uint8_t i, wchar_t *o) { 112 | 113 | if (i > (U8_start + U8_length)) return EXIT_FAILURE; 114 | 115 | *o = U8_start + i; 116 | 117 | return EXIT_SUCCESS; 118 | } 119 | 120 | int to_U12(wchar_t i, wchar_t *o) { 121 | 122 | if(i > (U12b_start + U12b_length)) return EXIT_FAILURE; 123 | 124 | *o = U12b_start + i; 125 | 126 | return EXIT_SUCCESS; 127 | } 128 | 129 | int from_U12b(wchar_t i, wchar_t *o) { 130 | 131 | if(i < U12b_start || i > (U12b_start + U12b_length)) return EXIT_FAILURE; 132 | 133 | *o = i - U12b_start; 134 | 135 | return EXIT_SUCCESS; 136 | } 137 | 138 | 
int U12a_to_8_8(wchar_t u, uint8_t *b0, uint8_t *b1) { 139 | 140 | wchar_t unicode_start = 0; 141 | wchar_t starts[4] = {U12a_0_0_start, U12a_0_1_start, U12a_1_0_start, U12a_1_1_start}; 142 | 143 | for (int i = 0; i < 4; i++) { 144 | wchar_t start = starts[i]; 145 | if(u >= start && u < (start + U12a_length)) { 146 | unicode_start = start; 147 | break; 148 | } 149 | } 150 | 151 | if(unicode_start == 0) return EXIT_FAILURE; 152 | 153 | wchar_t value = u - unicode_start; 154 | *b0 = (value & 0xFC0) >> 6; 155 | *b1 = u & 0x3F; 156 | 157 | if(unicode_start == U12a_0_0_start) { 158 | // pass 159 | } else if (unicode_start == U12a_0_1_start) { 160 | *b1 += 64; 161 | } else if (unicode_start == U12a_1_0_start) { 162 | *b0 += 64; 163 | } else if (unicode_start == U12a_1_1_start) { 164 | *b0 += 64; 165 | *b1 += 64; 166 | } 167 | 168 | return EXIT_SUCCESS; 169 | } 170 | 171 | int int_from_u08b(wchar_t u, uint8_t* i) { 172 | if(u < U8_start || u > (U8_start + U8_length)) return EXIT_FAILURE; 173 | 174 | *i = u - U8_start; 175 | 176 | return EXIT_SUCCESS; 177 | } 178 | 179 | int int_from_u12b(wchar_t u, wchar_t* i) { 180 | if(u < U12b_start || u > (U12b_start + U12b_length)) return EXIT_FAILURE; 181 | 182 | *i = u - U12b_start; 183 | 184 | return EXIT_SUCCESS; 185 | } 186 | 187 | int repeated_bytes_from_unichars(wchar_t u1, wchar_t u2, uint8_t **dst, size_t *dstSize) { 188 | 189 | uint8_t b; 190 | if(int_from_u08b(u1, &b) != 0) return EXIT_FAILURE; 191 | 192 | wchar_t n; 193 | if(int_from_u12b(u2, &n) != 0) return EXIT_FAILURE; 194 | 195 | if(n > 0xFFF) { 196 | fprintf(stderr, "-- bad number of repeats: 0x%x\n", n); 197 | return EXIT_FAILURE; 198 | } 199 | 200 | uint8_t *out = malloc(n * sizeof(uint8_t)); 201 | if(out == NULL) { 202 | return EXIT_FAILURE; 203 | } 204 | 205 | memset(out, b, n); 206 | 207 | *dstSize = n; 208 | *dst = out; 209 | 210 | return EXIT_SUCCESS; 211 | } 212 | 213 | int three_bytes_from_two_twelve_bits_values(wchar_t i1, wchar_t i2, uint8_t *b1, uint8_t 
*b2, uint8_t *b3) { 214 | // (0x123, 0x456) -> (0x12, 0x34, 0x56) 215 | 216 | if(i1 > 0xFFF) return EXIT_FAILURE; 217 | if(i2 > 0xFFF) return EXIT_FAILURE; 218 | 219 | *b1 = i1 >> 4; 220 | *b2 = ((i1 & 0xF) << 4) + ((i2 & 0xF00) >> 8); 221 | *b3 = (i2 & 0x0FF); 222 | 223 | return EXIT_SUCCESS; 224 | } 225 | 226 | int three_bytes_from_unichars(wchar_t u1, wchar_t u2, uint8_t *b1, uint8_t *b2, uint8_t *b3) { 227 | 228 | wchar_t i1, i2; 229 | 230 | if(int_from_u12b(u1, &i1) != 0) return EXIT_FAILURE; 231 | if(int_from_u12b(u2, &i2) != 0) return EXIT_FAILURE; 232 | 233 | int error = three_bytes_from_two_twelve_bits_values(i1, i2, b1, b2, b3); 234 | if(error != 0) return EXIT_FAILURE; 235 | 236 | return EXIT_SUCCESS; 237 | } 238 | 239 | int two_bytes_from_unichars(wchar_t u1, wchar_t u2, uint8_t *b1, uint8_t *b2) { 240 | 241 | int status = int_from_u08b(u1, b1); 242 | if(status == EXIT_FAILURE) return EXIT_FAILURE; 243 | 244 | int status2 = int_from_u08b(u2, b2); 245 | if(status2 == EXIT_FAILURE) return EXIT_FAILURE; 246 | 247 | return EXIT_SUCCESS; 248 | } 249 | 250 | int bytes_from_u1_u2(wchar_t u1, wchar_t u2, uint8_t **buffer, size_t *bufferSize) { 251 | 252 | int u1_in_U12b = is_in_U12b(u1); 253 | int u2_in_U12b = is_in_U12b(u2); 254 | 255 | int u1_in_U8b = is_in_U08b(u1); 256 | int u2_in_U8b = is_in_U08b(u2); 257 | 258 | if(u1_in_U12b && u2_in_U12b) { 259 | 260 | *buffer = (uint8_t *)malloc(3 * sizeof(uint8_t)); 261 | if(buffer == NULL) { 262 | fprintf(stderr, "-- malloc error\n"); 263 | return EXIT_FAILURE; 264 | } 265 | 266 | *bufferSize = 3; 267 | 268 | uint8_t b0, b1, b2; 269 | 270 | int status = three_bytes_from_unichars(u1, u2, &b0, &b1, &b2); 271 | if(status != 0) return EXIT_FAILURE; 272 | 273 | *(*buffer+0) = b0; 274 | *(*buffer+1) = b1; 275 | *(*buffer+2) = b2; 276 | 277 | return EXIT_SUCCESS; 278 | 279 | } else if (u1_in_U8b && u2_in_U12b) { 280 | 281 | size_t dstSize; 282 | 283 | uint8_t *out; 284 | 285 | int status = repeated_bytes_from_unichars(u1, 
u2, &out, &dstSize); 286 | if(status != 0) return EXIT_FAILURE; 287 | 288 | *bufferSize = dstSize; 289 | 290 | *buffer = out; 291 | 292 | return EXIT_SUCCESS; 293 | } else if (u1_in_U8b && u2_in_U8b) { 294 | 295 | uint8_t b0, b1; 296 | int status = two_bytes_from_unichars(u1, u2, &b0, &b1); 297 | if(status != 0) return EXIT_FAILURE; 298 | 299 | uint8_t *out = malloc(2 * sizeof(uint8_t)); 300 | if(out == NULL) { 301 | fprintf(stderr, "-- malloc error\n"); 302 | return EXIT_FAILURE; 303 | } 304 | 305 | *bufferSize = 2; 306 | 307 | out[0] = b0; 308 | out[1] = b1; 309 | 310 | *buffer = out; 311 | 312 | return EXIT_SUCCESS; 313 | } else if (u1_in_U8b && u2 == '\n') { 314 | 315 | *buffer = (uint8_t *)malloc(1 * sizeof(uint8_t)); 316 | if(buffer == NULL) { 317 | fprintf(stderr, "-- malloc error\n"); 318 | return EXIT_FAILURE; 319 | } 320 | 321 | *bufferSize = 1; 322 | 323 | uint8_t b0; 324 | 325 | int success = int_from_u08b(u1, &b0); 326 | if(success == EXIT_FAILURE) { 327 | fprintf(stderr, "-- error\n"); 328 | return EXIT_FAILURE; 329 | } 330 | 331 | (*buffer)[0] = b0; 332 | return EXIT_SUCCESS; 333 | } 334 | 335 | fprintf(stderr, "-- bytes_from_u1_u2() cannot deal with u1:0x%x u2:0x%x\n", u1, u2); 336 | fprintf(stderr, " u1 in U8b:%d U12b:%d, u2 in U8b:%d U12b:%d\n", u1_in_U8b, u1_in_U12b, u2_in_U8b, u2_in_U12b); 337 | return EXIT_FAILURE; 338 | } 339 | 340 | int next_non_newline_char(FILE *src, wchar_t *wc) { 341 | do { 342 | *wc = fgetwc(src); 343 | } while (*wc == '\n'); 344 | return EXIT_SUCCESS; 345 | } 346 | 347 | int unibinary_decode(FILE *src, FILE *dst) { 348 | 349 | int i = 0; 350 | 351 | wchar_t c, c_next; 352 | 353 | int fwd_chars_read = 0; 354 | 355 | while (1) { 356 | 357 | if(fwd_chars_read == 0) { 358 | next_non_newline_char(src, &c); 359 | next_non_newline_char(src, &c_next); 360 | fwd_chars_read = 2; 361 | } else if (fwd_chars_read == 1) { 362 | c = c_next; 363 | next_non_newline_char(src, &c_next); 364 | fwd_chars_read = 2; 365 | } else { 366 | // 
pass 367 | } 368 | 369 | if(is_in_U12a(c)) { 370 | uint8_t b0; 371 | uint8_t b1; 372 | 373 | int error = U12a_to_8_8(c, &b0, &b1); 374 | if(error != 0) { 375 | fprintf(stderr, "-- error in U12a_to_8_8()\n"); 376 | return EXIT_FAILURE; 377 | } 378 | 379 | fwrite(&b0, 1, 1, dst); 380 | fwrite(&b1, 1, 1, dst); 381 | 382 | i += 1; 383 | 384 | fwd_chars_read -= 1; 385 | 386 | } else if (c != WEOF && c_next != WEOF) { 387 | wchar_t u1 = c; 388 | wchar_t u2 = c_next; 389 | 390 | uint8_t *outBuffer; 391 | size_t outBufferSize; 392 | 393 | int error = bytes_from_u1_u2(u1, u2, &outBuffer, &outBufferSize); 394 | if(error != 0) { 395 | fprintf(stderr, "-- error in bytes_from_u1_u2()\n"); 396 | return EXIT_FAILURE; 397 | } 398 | 399 | if(outBuffer == NULL) { 400 | fprintf(stderr, "-- outBuffer == NULL\n"); 401 | } 402 | 403 | fwrite(outBuffer, 1, outBufferSize, dst); 404 | 405 | i += 2; 406 | 407 | fwd_chars_read -= 2; 408 | 409 | } else if (is_in_U08b(c)) { 410 | uint8_t b0; 411 | int status = int_from_u08b(c, &b0); 412 | if(status != 0) { 413 | fprintf(stderr, "-- error in int_from_u08b()\n"); 414 | return EXIT_FAILURE; 415 | } 416 | 417 | fwrite(&b0, 1, 1, dst); 418 | 419 | i += 1; 420 | 421 | fwd_chars_read -= 1; 422 | 423 | } else if (c == '\n') { 424 | i += 1; 425 | 426 | if(fwd_chars_read > 0) { 427 | fwd_chars_read -= 1; 428 | } 429 | } else if (c == WEOF) { 430 | break; 431 | } else { 432 | fprintf(stderr, "-- cannot decode character at index %u\n", i); 433 | return EXIT_FAILURE; 434 | } 435 | } 436 | 437 | return EXIT_SUCCESS; 438 | } 439 | 440 | int put_wc(FILE *fd_out, wchar_t wc, size_t *count, size_t wrap_length) { 441 | if(fputwc(wc, fd_out) == EOF) return EXIT_FAILURE; 442 | 443 | *count += 1; 444 | if(wrap_length > 0) { 445 | *count = *count % wrap_length; 446 | } 447 | 448 | if(*count == 0) { 449 | if(fputwc('\n', fd_out) == EOF) return EXIT_FAILURE; 450 | } 451 | return EXIT_SUCCESS; 452 | } 453 | 454 | int unibinary_encode_string(const char *src, wchar_t 
**dst, size_t wrap_length) { 455 | 456 | // 1. write src into a temporary file 457 | 458 | FILE *fd_in = tmpfile(); 459 | if(fd_in == NULL) return EXIT_FAILURE; 460 | 461 | size_t src_len = strlen(src); 462 | size_t written = fwrite(src, 1, src_len, fd_in); 463 | if(written != src_len) { 464 | fclose(fd_in); 465 | return EXIT_FAILURE; 466 | } 467 | rewind(fd_in); 468 | 469 | // 2. open another temporary file to write the encoded string 470 | 471 | FILE *fd_out = tmpfile(); 472 | if(fd_in == NULL) { 473 | fclose(fd_in); 474 | return EXIT_FAILURE; 475 | } 476 | 477 | int status = unibinary_encode(fd_in, fd_out, wrap_length); 478 | fclose(fd_in); 479 | 480 | if(status != 0) return EXIT_FAILURE; 481 | 482 | // 3. read the encoded string and fill *dst 483 | 484 | long file_size = ftell(fd_out); 485 | 486 | rewind(fd_out); 487 | 488 | long max_wchar_bytes_possible = file_size * MB_CUR_MAX; 489 | 490 | if(max_wchar_bytes_possible > INTMAX_MAX) { 491 | fclose(fd_out); 492 | return EXIT_FAILURE; 493 | } 494 | 495 | char *map = mmap(0, file_size, PROT_READ, MAP_SHARED, fileno(fd_out), 0); 496 | 497 | fclose(fd_out); 498 | 499 | if(map == NULL) { 500 | return EXIT_FAILURE; 501 | } 502 | 503 | *dst = (wchar_t *)malloc(file_size * MB_CUR_MAX); 504 | if(dst == NULL) { 505 | fprintf(stderr, "-- malloc error\n"); 506 | return EXIT_FAILURE; 507 | } 508 | 509 | size_t length = mbstowcs(*dst, map, file_size * MB_CUR_MAX); 510 | 511 | if(length == -1) { 512 | free(dst); 513 | return EXIT_FAILURE; 514 | } 515 | 516 | return EXIT_SUCCESS; 517 | } 518 | 519 | int unibinary_decode_string(const wchar_t *src, char **dst, long *dst_len) { 520 | 521 | // 1. write src into a temporary file 522 | 523 | FILE *fd_in = tmpfile(); 524 | if(fd_in == NULL) return EXIT_FAILURE; 525 | 526 | int status = fputws(src, fd_in); 527 | if(status != 0) { 528 | fclose(fd_in); 529 | return EXIT_FAILURE; 530 | } 531 | 532 | rewind(fd_in); 533 | 534 | // 2. 
open another temporary file to write decoded data 535 | 536 | FILE *fd_out = tmpfile(); 537 | if(fd_in == NULL) { 538 | fclose(fd_in); 539 | return EXIT_FAILURE; 540 | } 541 | 542 | int status2 = unibinary_decode(fd_in, fd_out); 543 | fclose(fd_in); 544 | 545 | if(status2 != 0) return EXIT_FAILURE; 546 | 547 | // 3. read the resulting string and fill **dst 548 | 549 | long file_size = ftell(fd_out); 550 | 551 | *dst_len = file_size; 552 | 553 | rewind(fd_out); 554 | 555 | *dst = (char *)malloc(file_size * sizeof(char)); 556 | if(dst == NULL) { 557 | fprintf(stderr, "-- malloc error\n"); 558 | return EXIT_FAILURE; 559 | } 560 | 561 | size_t read = fread(*dst, sizeof(char), file_size, fd_out); 562 | fclose(fd_out); 563 | 564 | if(read != file_size) { 565 | return EXIT_FAILURE; 566 | } 567 | 568 | return EXIT_SUCCESS; 569 | } 570 | 571 | int unibinary_encode(FILE *fd_in, FILE *fd_out, size_t wrap_length) { 572 | 573 | size_t out_count = 0; 574 | 575 | while(1) { 576 | 577 | unsigned char c0, c1, c2; 578 | size_t read_c0 = fread(&c0, 1, 1, fd_in); 579 | if(read_c0 == 0) break; 580 | 581 | size_t read_c1 = 0; 582 | 583 | long number_of_repeats = 1; 584 | unsigned char cc; 585 | size_t read_cc = fread(&cc, 1, 1, fd_in); 586 | 587 | while(read_cc != 0 && cc == c0 && number_of_repeats < 0xFFF) { 588 | number_of_repeats += 1; 589 | read_cc = fread(&cc, 1, 1, fd_in); 590 | } 591 | 592 | /**/ 593 | 594 | if(number_of_repeats >= 3) { 595 | if(read_cc != 0) { 596 | if(ungetc(cc, fd_in) == EOF) return EXIT_FAILURE; 597 | } 598 | 599 | long n = number_of_repeats; 600 | 601 | wchar_t u0, u1; 602 | int error = two_unichr_to_repeat_byte_ntimes(c0, (int)n, &u0, &u1); 603 | if(error) return EXIT_FAILURE; 604 | 605 | put_wc(fd_out, u0, &out_count, wrap_length); 606 | put_wc(fd_out, u1, &out_count, wrap_length); 607 | 608 | continue; 609 | } else if (number_of_repeats == 2) { 610 | if(read_cc != 0) { 611 | if(ungetc(cc, fd_in) == EOF) return EXIT_FAILURE; 612 | } 613 | 614 | read_c0 = 
1; 615 | read_c1 = 1; 616 | // c0 = c0; 617 | c1 = c0; 618 | 619 | } else if (read_cc != 0) { 620 | read_c1 = 1; 621 | c1 = cc; 622 | } 623 | 624 | /**/ 625 | 626 | size_t read_c2 = fread(&c2, 1, 1, fd_in); 627 | 628 | int two_ASCII_7bits_chars_available = read_c0 != 0 && read_c1 != 0 && c0 < 128 && c1 < 128; 629 | int three_bytes_available = read_c0 != 0 && read_c1 != 0 && read_c2 != 0; 630 | 631 | if(two_ASCII_7bits_chars_available) { 632 | 633 | // put 2 x 7 bits into a unichar 634 | wchar_t u0; 635 | int error = unichr_12a_from_two_ascii(c0, c1, &u0); 636 | if(error) return EXIT_FAILURE; 637 | 638 | put_wc(fd_out, u0, &out_count, wrap_length); 639 | 640 | if(read_c2 != 0) { 641 | if(ungetc(c2, fd_in) == EOF) return EXIT_FAILURE; 642 | } 643 | 644 | } else if (three_bytes_available) { 645 | // read 3 bytes, yield 2 unichars 646 | wchar_t u0, u1; 647 | int error = two_twelve_bits_values_from_three_bytes(c0, c1, c2, &u0, &u1); 648 | if(error) return EXIT_FAILURE; 649 | 650 | wchar_t o0, o1; 651 | error = to_U12(u0, &o0); 652 | if(error != 0) return EXIT_FAILURE; 653 | error = to_U12(u1, &o1); 654 | if(error != 0) return EXIT_FAILURE; 655 | 656 | put_wc(fd_out, o0, &out_count, wrap_length); 657 | put_wc(fd_out, o1, &out_count, wrap_length); 658 | 659 | } else if (read_c0 != 0) { 660 | // read 1 byte, encode 1 unichar 661 | wchar_t u0; 662 | int error = to_U08(c0, &u0); 663 | if(error) return EXIT_FAILURE; 664 | 665 | put_wc(fd_out, u0, &out_count, wrap_length); 666 | 667 | if(read_c1 != 0) { 668 | if(ungetc(c1, fd_in) == EOF) return EXIT_FAILURE; 669 | } 670 | 671 | } else { 672 | break; 673 | } 674 | 675 | } 676 | 677 | return EXIT_SUCCESS; 678 | } 679 | -------------------------------------------------------------------------------- /c/unibinary/unibinary.h: -------------------------------------------------------------------------------- 1 | // 2 | // unibinary.h 3 | // unibinary 4 | // 5 | // Created by Nicolas Seriot on 29/12/13. 
6 | // Copyright (c) 2013 Nicolas Seriot. All rights reserved. 7 | // 8 | 9 | #include 10 | #include 11 | 12 | #ifndef unibinary_unibinary_h 13 | #define unibinary_unibinary_h 14 | 15 | // encode 16 | 17 | int unibinary_encode(FILE *fd_in, FILE *fd_out, size_t wrap_length); 18 | int unibinary_encode_string(const char* src, wchar_t **dst, size_t wrap_length); 19 | 20 | // decode 21 | 22 | int unibinary_decode(FILE *src, FILE *dst); 23 | int unibinary_decode_string(const wchar_t *src, char **dst, long *dst_len); 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /javascript/test/template.css: -------------------------------------------------------------------------------- 1 | #wru { 2 | font-family: sans-serif; 3 | font-size: 11pt; 4 | border: 1px solid #333; 5 | } 6 | #wru div { 7 | cursor: default; 8 | padding: 0; 9 | color: #000; 10 | } 11 | #wru div span, 12 | #wru div strong { 13 | display: block; 14 | padding: 4px; 15 | margin: 0; 16 | } 17 | #wru div ul { 18 | margin: 0; 19 | padding-bottom: 4px; 20 | } 21 | #wru div.pass { 22 | background: #90EE90; 23 | } 24 | #wru div.fail { 25 | background: #FF6347; 26 | } 27 | #wru div.error { 28 | background: #000; 29 | color: #FFF; 30 | } -------------------------------------------------------------------------------- /javascript/test/test.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | wru :: UniBinary.js unit tests 5 | 6 | 7 | 8 |
9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /javascript/test/test.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Node, Rhino, JSC, and Browser compatible wru test runner. 3 | */ 4 | 5 | UNIBINARY_UNIT_TESTING = true; 6 | 7 | var global; 8 | var Uint8Array; 9 | var unibinary; 10 | var wru; 11 | var console; 12 | 13 | function getGlobal() { 14 | return (function () { 15 | return this 16 | })() 17 | } 18 | 19 | // node, rhino, or web 20 | try { 21 | // node and phantom js 22 | 23 | wru = require("./wru.console.js"); 24 | unibinary = require("../unibinary"); 25 | 26 | go(wru); 27 | } catch (e) { 28 | // rhino & jsc 29 | try { 30 | 31 | global = undefined; // this is a hack to work around a bug in wru & new Rhino versions involving JavaAdapter 32 | 33 | load("../unibinary.js"); 34 | load("./wru.console.js"); 35 | 36 | if (typeof Uint8Array == "undefined") { 37 | load("typedarray.js"); 38 | var window = getGlobal(); 39 | for (var property in exports) { 40 | if (exports.hasOwnProperty(property)) { 41 | try { 42 | window[property] = exports[property] 43 | } catch (e) { 44 | print(e); 45 | } 46 | } 47 | } 48 | } 49 | 50 | if (typeof console == "undefined") { 51 | console = { 52 | error: function (msg) { 53 | print(msg) 54 | }, 55 | log: function (msg) { 56 | print(msg) 57 | } 58 | } 59 | } 60 | 61 | go(wru); 62 | } catch (e) { 63 | // html (assuming test.html is used in same folders structure) 64 | (function (xhr) { 65 | try { 66 | xhr.open("get", "wru.min.js", true); 67 | xhr.onreadystatechange = function () { 68 | if (xhr.readyState == 4) { 69 | try { 70 | Function(xhr.responseText.replace(/var wru=/, "this.wru=")).call(window); 71 | } catch (e) { 72 | alert(e); 73 | } 74 | go(window.wru); 75 | } 76 | }; 77 | xhr.send(null); 78 | } catch (e) { 79 | alert(e.message || e); 80 | } 81 | }(new XMLHttpRequest)); 82 | } 83 | } 84 | 85 | 86 | function go(wru) { 87 | var 
assert = { 88 | equal: function (a, b, m) { 89 | wru.assert(a == b); 90 | }, 91 | notEqual: function (a, b, m) { 92 | wru.assert(a != b); 93 | }, 94 | 95 | ok: function (a, m) { 96 | wru.assert(a); 97 | } 98 | } 99 | 100 | wru.test([ 101 | { 102 | name: "test_unichr_12_encoding_decoding", 103 | test: function () { 104 | 105 | var testArray = [0x0, 0x1, 0xAB, 0x123, 0xABC, 0xF, 0xFF, 0xFFF]; 106 | 107 | for (var j = 0; j < testArray.length; j++) { 108 | i = testArray[j]; 109 | 110 | var u = unibinary.unichr_12_from_int(i); 111 | assert.notEqual(u, i); 112 | 113 | var i2 = unibinary.int_from_u12b(u); 114 | assert.equal(i, i2); 115 | } 116 | 117 | }}, 118 | { 119 | 120 | name: "test_3_to_2_bytes", test: function () { 121 | 122 | var ab = unibinary.two_twelve_bits_values_from_three_bytes(0x12, 0x34, 0x56); 123 | 124 | assert.equal(ab[0], 0x123, "0x" + ab[0].toString(16)); 125 | assert.equal(ab[1], 0x456, "0x" + ab[1].toString(16)); 126 | 127 | }}, 128 | { 129 | 130 | name: "test_2_to_3_bytes", test: function () { 131 | 132 | var abc = unibinary.three_bytes_from_two_twelve_bits_values(0x123, 0x456); 133 | 134 | assert.equal(abc[0], 0x12, "0x" + abc[0].toString(16)); 135 | assert.equal(abc[1], 0x34, "0x" + abc[1].toString(16)); 136 | assert.equal(abc[2], 0x56, "0x" + abc[2].toString(16)); 137 | 138 | }}, 139 | { 140 | 141 | name: "test_encode_3_bytes", test: function () { 142 | 143 | var bytes = [0xab, 0xcd, 0xef]; 144 | var gen = unibinary.encode(bytes); 145 | 146 | assert.ok(typeof gen == "string"); 147 | assert.equal(gen.length, 2); 148 | 149 | assert.equal(gen.charCodeAt(0), unibinary.U12b_start + 0xABC); 150 | assert.equal(gen.charCodeAt(1), unibinary.U12b_start + 0xDEF); 151 | 152 | }}, 153 | { 154 | 155 | name: "test_encode_bytes", test: function () { 156 | var bytes = [0xab, 0xcd, 0xef, 0xff]; 157 | 158 | var gen = unibinary.encode(bytes); 159 | 160 | assert.ok(typeof gen == "string"); 161 | assert.equal(gen.length, 3); 162 | 163 | assert.equal(gen.charCodeAt(0), 
unibinary.U12b_start + 0xABC); 164 | assert.equal(gen.charCodeAt(1), unibinary.U12b_start + 0xDEF); 165 | assert.equal(gen.charCodeAt(2), unibinary.U8_start + 0xFF); 166 | 167 | }}, 168 | { 169 | name: "test_decode_unichars", test: function () { 170 | 171 | var u1 = String.fromCharCode(unibinary.U12b_start + 0xABC); 172 | var u2 = String.fromCharCode(unibinary.U12b_start + 0xDEF); 173 | 174 | var s = u1 + u2; 175 | 176 | var gen = unibinary.decode(s); 177 | 178 | assert.equal(gen.length, 3); 179 | 180 | assert.equal(gen[0], 0xAB); 181 | assert.equal(gen[1], 0xCD); 182 | assert.equal(gen[2], 0xEF); 183 | 184 | }}, 185 | { 186 | 187 | name: "test_is_in_U8b", test: function () { 188 | assert.ok(!unibinary.is_in_U8b(String.fromCharCode(0x03FF))); 189 | assert.ok(unibinary.is_in_U8b(String.fromCharCode(0x400))); 190 | assert.ok(unibinary.is_in_U8b(String.fromCharCode(0x4FF))); 191 | assert.ok(!unibinary.is_in_U8b(String.fromCharCode(0x500))); 192 | }}, 193 | { 194 | 195 | name: "test_unichr_12a_from_two_ascii", test: function () { 196 | var u = unibinary.unichr_12a_from_two_ascii('Z'.charCodeAt(0), 'E'.charCodeAt(0)); 197 | assert.equal(u, String.fromCharCode(0x9485)); 198 | 199 | var u = unibinary.unichr_12a_from_two_ascii('z'.charCodeAt(0), ','.charCodeAt(0)); 200 | assert.equal(u, String.fromCharCode(0x8CAC)); 201 | 202 | }}, 203 | { 204 | 205 | name: "test_ascii_characters_encoding", test: function () { 206 | var s = "abc"; 207 | 208 | var gen = unibinary.encodeString(s); 209 | 210 | assert.ok(typeof gen == "string"); 211 | assert.equal(gen.length, 2); 212 | 213 | assert.equal(gen.charCodeAt(0), 0x9662); 214 | assert.equal(gen.charCodeAt(1), 0x0463); 215 | 216 | }}, 217 | { 218 | 219 | name: "test_ascii_characters_encoding_2", test: function () { 220 | var s = "ZE"; 221 | 222 | var gen = unibinary.encodeString(s); 223 | 224 | assert.ok(typeof gen == "string"); 225 | assert.equal(gen.length, 1); 226 | 227 | assert.equal(gen[0], 
unibinary.unichr_12a_from_two_ascii('Z'.charCodeAt(0), 'E'.charCodeAt(0))); 228 | 229 | }}, 230 | { 231 | 232 | name: "test_two_unichr_to_repeat_byte_ntimes_aaa", test: function () { 233 | 234 | var gen = unibinary.two_unichr_to_repeat_byte_ntimes('a'.charCodeAt(0), 10); 235 | assert.ok(typeof gen == "string"); 236 | assert.equal(gen.length, 2); 237 | 238 | assert.equal(gen.charCodeAt(0), 0x0461); 239 | assert.equal(gen.charCodeAt(1), 0x4E0A); 240 | 241 | }}, 242 | { 243 | 244 | name: "test_two_unichr_to_repeat_byte_ntimes_xxx", test: function () { 245 | 246 | var gen = unibinary.two_unichr_to_repeat_byte_ntimes('x'.charCodeAt(0), 3); 247 | 248 | assert.ok(typeof gen == "string"); 249 | assert.equal(gen.length, 2); 250 | 251 | assert.equal(gen.charCodeAt(0), 0x0478); 252 | assert.equal(gen.charCodeAt(1), 0x4E03); 253 | 254 | }}, 255 | { 256 | 257 | name: "test_repeat", test: function () { 258 | 259 | var s = "xxx"; 260 | 261 | var gen = unibinary.encodeString(s); 262 | 263 | assert.ok(typeof gen == "string"); 264 | assert.equal(gen.length, 2); 265 | 266 | assert.equal(gen.charCodeAt(0), 0x0478); 267 | assert.equal(gen.charCodeAt(1), 0x4E03); 268 | 269 | }}, 270 | { 271 | 272 | name: "test_ascii_characters_decoding", test: function () { 273 | var s = String.fromCharCode(0x9662) + String.fromCharCode(0x0463); 274 | 275 | var s2 = unibinary.decode(s); 276 | 277 | assert.equal(s2[0], 'a'.charCodeAt(0)); 278 | assert.equal(s2[1], 'b'.charCodeAt(0)); 279 | assert.equal(s2[2], 'c'.charCodeAt(0)); 280 | 281 | 282 | }}, 283 | { 284 | 285 | name: "test_ascii_characters_decoding_2", test: function () { 286 | var s = String.fromCharCode(0x9485); 287 | 288 | var s2 = unibinary.decode(s); 289 | 290 | assert.equal(s2[0], 'Z'.charCodeAt(0)); 291 | assert.equal(s2[1], 'E'.charCodeAt(0)); 292 | 293 | 294 | }}, 295 | { 296 | 297 | name: "test_five_bytes_encoding", test: function () { 298 | var bytes = [0xab, 0xcd, 0xef, 0xab, 0xcd]; 299 | 300 | var gen = unibinary.encode(bytes); 301 
| 302 | assert.equal(gen.length, 4); 303 | 304 | assert.equal(gen.charCodeAt(0), unibinary.U12b_start + 0xABC); 305 | assert.equal(gen.charCodeAt(1), unibinary.U12b_start + 0xDEF); 306 | assert.equal(gen.charCodeAt(2), unibinary.U8_start + 0xAB); 307 | assert.equal(gen.charCodeAt(3), unibinary.U8_start + 0xCD); 308 | 309 | }}, 310 | { 311 | 312 | name: "test_ascii_and_bytes_encoding", test: function () { 313 | var bytes = [0xab, 0xcd, 0xef]; 314 | bytes = bytes.concat([0x61, 0x62, 0x63, 0x64, 0x65]); //abcde 315 | 316 | var gen = unibinary.encode(bytes); 317 | 318 | assert.equal(gen.length, 5); 319 | 320 | assert.equal(gen.charCodeAt(0), unibinary.U12b_start + 0xABC); 321 | assert.equal(gen.charCodeAt(1), unibinary.U12b_start + 0xDEF); 322 | assert.equal(gen[2], unibinary.unichr_12a_from_two_ascii('a'.charCodeAt(0), 'b'.charCodeAt(0))); 323 | assert.equal(gen[3], unibinary.unichr_12a_from_two_ascii('c'.charCodeAt(0), 'd'.charCodeAt(0))); 324 | assert.equal(gen[4], unibinary.unichr_08_from_int('e'.charCodeAt(0))); 325 | 326 | 327 | }}, 328 | { 329 | 330 | name: "test_ascii_and_bytes_decoding", test: function () { 331 | var s = String.fromCharCode(unibinary.U12b_start + 0xABC); 332 | s += String.fromCharCode(unibinary.U12b_start + 0xDEF); 333 | s += unibinary.unichr_12a_from_two_ascii('a'.charCodeAt(0), 'b'.charCodeAt(0)); 334 | s += unibinary.unichr_12a_from_two_ascii('c'.charCodeAt(0), 'd'.charCodeAt(0)); 335 | s += unibinary.unichr_08_from_int('e'.charCodeAt(0)); 336 | 337 | var gen = unibinary.decode(s); 338 | 339 | assert.equal(gen.length, 8); 340 | 341 | assert.equal(gen[0], 0xAB); 342 | assert.equal(gen[1], 0xCD); 343 | assert.equal(gen[2], 0xEF); 344 | assert.equal(gen[3], 0x61); 345 | assert.equal(gen[4], 0x62); 346 | assert.equal(gen[5], 0x63); 347 | assert.equal(gen[6], 0x64); 348 | assert.equal(gen[7], 0x65); 349 | 350 | 351 | }}, 352 | { 353 | 354 | name: "test_repeats", test: function () { 355 | var l = [1, 1, 1, 2, 1]; 356 | 357 | var n = 
unibinary.number_of_left_instances_from_index(l, 0); 358 | 359 | assert.equal(n, 3); 360 | 361 | }}, 362 | { 363 | 364 | name: "test_empty_string", test: function () { 365 | var bytes = ""; 366 | 367 | var gen = unibinary.encodeString(bytes); 368 | 369 | assert.equal(gen, ""); 370 | 371 | }}, 372 | { 373 | 374 | name: "test_one_char", test: function () { 375 | var bytes = "a"; 376 | 377 | var gen = unibinary.encodeString(bytes); 378 | 379 | assert.ok(typeof gen == "string"); 380 | assert.equal(gen.length, 1); 381 | 382 | 383 | assert.equal(gen.charCodeAt(0), 0x0461); 384 | 385 | }}, 386 | { 387 | 388 | name: "test_repeats_2", test: function () { 389 | 390 | var bytes = [0xAB, 0xCD, 0xEF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00]; 391 | 392 | var gen = unibinary.encode(bytes); 393 | 394 | assert.equal(gen.length, 5); 395 | 396 | assert.equal(gen.charCodeAt(0), 0x58BC); 397 | assert.equal(gen.charCodeAt(1), 0x5BEF); 398 | assert.equal(gen.charCodeAt(2), 0x04FF); 399 | assert.equal(gen.charCodeAt(3), 0x4E04); 400 | assert.equal(gen.charCodeAt(4), 0x0400); 401 | 402 | }}, 403 | { 404 | 405 | name: "test_encode_macho_header", test: function () { 406 | 407 | var bytes = [0xCF, 0xFA, 0xED, 0xFE, 0x07, 0x00, 0x00, 0x01]; 408 | 409 | var gen = unibinary.encode(bytes); 410 | 411 | assert.equal(gen.length, 5); 412 | 413 | assert.equal(gen.charCodeAt(0), 0x5AFF); 414 | assert.equal(gen.charCodeAt(1), 0x58ED); 415 | assert.equal(gen.charCodeAt(2), 0x5DE0); 416 | assert.equal(gen.charCodeAt(3), 0x5500); 417 | assert.equal(gen.charCodeAt(4), 0x5E01); 418 | 419 | }}, 420 | { 421 | 422 | name: "test_big_repeats_2000_minus_2", test: function () { 423 | 424 | var bytes = []; 425 | for (var i = 0; i < 0x2000 - 2; i++) bytes.push(0xAA); 426 | 427 | var gen = unibinary.encode(bytes); 428 | 429 | assert.equal(gen.length, 4); 430 | 431 | assert.equal(gen.charCodeAt(0), 0x04AA); 432 | assert.equal(gen.charCodeAt(1), 0x5DFF); 433 | assert.equal(gen.charCodeAt(2), 0x04AA); 434 | 
assert.equal(gen.charCodeAt(3), 0x5DFF); 435 | 436 | }}, 437 | { 438 | 439 | name: "test_big_repeats_2000", test: function () { 440 | 441 | var bytes = []; 442 | for (var i = 0; i < 0x2000; i++) bytes.push(0xAA); 443 | 444 | var gen = unibinary.encode(bytes); 445 | assert.equal(gen.length, 6); 446 | 447 | assert.equal(gen.charCodeAt(0), 0x04AA); 448 | assert.equal(gen.charCodeAt(1), 0x5DFF); 449 | assert.equal(gen.charCodeAt(2), 0x04AA); 450 | assert.equal(gen.charCodeAt(3), 0x5DFF); 451 | assert.equal(gen.charCodeAt(4), 0x04AA); 452 | assert.equal(gen.charCodeAt(5), 0x04AA); 453 | 454 | }}, 455 | { 456 | 457 | name: "test_ascii_text_encoding_decoding", test: function () { 458 | 459 | var s = "if I'd listened everything that they said to me, took the time to bleed from all the tiny little arrows shot my way, I wouldn't be here! the ones who don't do anything are always the ones who try to put you down. I'm talking to you: hero time starts right now! time to shine!"; 460 | 461 | var encodeGen = unibinary.encodeString(s); 462 | 463 | var decodeGen = unibinary.decodeString(encodeGen); 464 | 465 | assert.equal(s, decodeGen); 466 | 467 | }}, 468 | { 469 | 470 | name: "test_ascii_text_encoding_decoding_2", test: function () { 471 | 472 | var s = ""; 473 | for (var i = 32; i <= 128; i++) s += String.fromCharCode(i); 474 | 475 | var encodeGen = unibinary.encodeString(s); 476 | 477 | var decodeGen = unibinary.decodeString(encodeGen); 478 | 479 | assert.equal(s, decodeGen); 480 | 481 | }} 482 | ]); 483 | } 484 | -------------------------------------------------------------------------------- /javascript/test/typedarray.js: -------------------------------------------------------------------------------- 1 | /* type array polyfill from https://github.com/substack/typedarray */ 2 | 3 | var exports = {}; 4 | // Beyond this value, index getters/setters (i.e. array[0], array[1]) are so slow to 5 | // create, and consume so much memory, that the browser appears frozen. 
var MAX_ARRAY_LENGTH = 1e5;

/**
 * Approximations of internal ECMAScript conversion functions.
 *
 * References to Object.prototype.toString / hasOwnProperty are captured once,
 * inside the closure, so later monkey-patching by other scripts does not
 * affect these helpers.
 */
var ECMAScript = (function() {
  // Stash a copy in case other scripts modify these
  var opts = Object.prototype.toString,
      ophop = Object.prototype.hasOwnProperty;

  return {
    // Class returns internal [[Class]] property, used to avoid cross-frame instanceof issues:
    Class: function(v) { return opts.call(v).replace(/^\[object *|\]$/g, ''); },
    HasProperty: function(o, p) { return p in o; },
    HasOwnProperty: function(o, p) { return ophop.call(o, p); },
    IsCallable: function(o) { return typeof o === 'function'; },
    ToInt32: function(v) { return v >> 0; },
    ToUint32: function(v) { return v >>> 0; }
  };
}());

// Snapshot intrinsics
var LN2 = Math.LN2,
    abs = Math.abs,
    floor = Math.floor,
    log = Math.log,
    min = Math.min,
    pow = Math.pow,
    round = Math.round;

/**
 * ES5: lock down object properties.
 *
 * Redefines every own property of `obj` with its current value as
 * non-writable, non-enumerable and non-configurable. Does nothing when
 * neither property-enumeration nor property-definition support is available.
 *
 * @param {Object} obj object whose own properties are frozen in place
 */
function configureProperties(obj) {
  if (getOwnPropNames && defineProp) {
    var props = getOwnPropNames(obj), i;
    for (i = 0; i < props.length; i += 1) {
      defineProp(obj, props[i], {
        value: obj[props[i]],
        writable: false,
        enumerable: false,
        configurable: false
      });
    }
  }
}

// emulate ES5 getter/setter API using legacy APIs
// http://blogs.msdn.com/b/ie/archive/2010/09/07/transitioning-existing-code-to-the-es5-getter-setter-apis.aspx
// (second clause tests for Object.defineProperty() in IE<9 that only supports extending DOM prototypes, but
// note that IE<9 does not support __defineGetter__ or __defineSetter__ so it just renders the method harmless)
var defineProp;
if (Object.defineProperty && (function() {
      try {
        Object.defineProperty({}, 'x', {});
        return true;
      } catch (e) {
        return false;
      }
    })()) {
  defineProp = Object.defineProperty;
} else {
  /**
   * Legacy stand-in for Object.defineProperty.
   *
   * Honours `get` / `set` through __defineGetter__ / __defineSetter__ when the
   * engine has them, and `value` through plain assignment. Attribute flags
   * (writable / enumerable / configurable) are ignored.
   *
   * @param {Object} o target object
   * @param {string|number} p property name
   * @param {Object} desc property descriptor
   * @returns {Object} the target object
   * @throws {TypeError} if `o` is not an object
   */
  defineProp = function(o, p, desc) {
    // `!o === Object(o)` compared a boolean with an object and was therefore
    // never true; this is the intended "is not an object" test, matching the
    // getOwnPropNames fallback below.
    if (o !== Object(o)) throw new TypeError("Object.defineProperty called on non-object");
    if (ECMAScript.HasProperty(desc, 'get') && Object.prototype.__defineGetter__) { Object.prototype.__defineGetter__.call(o, p, desc.get); }
    if (ECMAScript.HasProperty(desc, 'set') && Object.prototype.__defineSetter__) { Object.prototype.__defineSetter__.call(o, p, desc.set); }
    if (ECMAScript.HasProperty(desc, 'value')) { o[p] = desc.value; }
    return o;
  };
}

/**
 * Object.getOwnPropertyNames, or a for-in based fallback that collects the
 * own properties a for-in loop visits.
 *
 * @param {Object} o object to inspect
 * @returns {Array} own property names
 * @throws {TypeError} (fallback only) if `o` is not an object
 */
var getOwnPropNames = Object.getOwnPropertyNames || function (o) {
  if (o !== Object(o)) throw new TypeError("Object.getOwnPropertyNames called on non-object");
  var props = [], p;
  for (p in o) {
    if (ECMAScript.HasOwnProperty(o, p)) {
      props.push(p);
    }
  }
  return props;
};

/**
 * ES5: Make obj[index] an alias for obj._getter(index)/obj._setter(index, value)
 * for index in 0 ... obj.length
 *
 * @param {Object} obj object exposing `length`, `_getter` and `_setter`
 * @throws {RangeError} if obj.length exceeds MAX_ARRAY_LENGTH
 */
function makeArrayAccessors(obj) {
  if (!defineProp) { return; }

  if (obj.length > MAX_ARRAY_LENGTH) throw new RangeError("Array too large for polyfill");

  // One closure per index so each accessor pair captures its own `index`.
  function makeArrayAccessor(index) {
    defineProp(obj, index, {
      'get': function() { return obj._getter(index); },
      'set': function(v) { obj._setter(index, v); },
      enumerable: true,
      configurable: false
    });
  }

  var i;
  for (i = 0; i < obj.length; i += 1) {
    makeArrayAccessor(i);
  }
}

// Internal conversion functions:
//    pack<Type>()   - take a number (interpreted as Type), output a byte array
//    unpack<Type>() - take a byte array, output a Type-like number

/** Reinterpret the low `bits` bits of `value` as a signed integer. */
function as_signed(value, bits) { var s = 32 - bits; return (value << s) >> s; }
/** Reinterpret the low `bits` bits of `value` as an unsigned integer. */
function as_unsigned(value, bits) { var s = 32 - bits; return (value << s) >>> s; }

function packI8(n) { return [n & 0xff]; }
function unpackI8(bytes) { return as_signed(bytes[0], 8); }
115 | 116 | function packU8(n) { return [n & 0xff]; } 117 | function unpackU8(bytes) { return as_unsigned(bytes[0], 8); } 118 | 119 | function packU8Clamped(n) { n = round(Number(n)); return [n < 0 ? 0 : n > 0xff ? 0xff : n & 0xff]; } 120 | 121 | function packI16(n) { return [(n >> 8) & 0xff, n & 0xff]; } 122 | function unpackI16(bytes) { return as_signed(bytes[0] << 8 | bytes[1], 16); } 123 | 124 | function packU16(n) { return [(n >> 8) & 0xff, n & 0xff]; } 125 | function unpackU16(bytes) { return as_unsigned(bytes[0] << 8 | bytes[1], 16); } 126 | 127 | function packI32(n) { return [(n >> 24) & 0xff, (n >> 16) & 0xff, (n >> 8) & 0xff, n & 0xff]; } 128 | function unpackI32(bytes) { return as_signed(bytes[0] << 24 | bytes[1] << 16 | bytes[2] << 8 | bytes[3], 32); } 129 | 130 | function packU32(n) { return [(n >> 24) & 0xff, (n >> 16) & 0xff, (n >> 8) & 0xff, n & 0xff]; } 131 | function unpackU32(bytes) { return as_unsigned(bytes[0] << 24 | bytes[1] << 16 | bytes[2] << 8 | bytes[3], 32); } 132 | 133 | function packIEEE754(v, ebits, fbits) { 134 | 135 | var bias = (1 << (ebits - 1)) - 1, 136 | s, e, f, ln, 137 | i, bits, str, bytes; 138 | 139 | function roundToEven(n) { 140 | var w = floor(n), f = n - w; 141 | if (f < 0.5) 142 | return w; 143 | if (f > 0.5) 144 | return w + 1; 145 | return w % 2 ? w + 1 : w; 146 | } 147 | 148 | // Compute sign, exponent, fraction 149 | if (v !== v) { 150 | // NaN 151 | // http://dev.w3.org/2006/webapi/WebIDL/#es-type-mapping 152 | e = (1 << ebits) - 1; f = pow(2, fbits - 1); s = 0; 153 | } else if (v === Infinity || v === -Infinity) { 154 | e = (1 << ebits) - 1; f = 0; s = (v < 0) ? 1 : 0; 155 | } else if (v === 0) { 156 | e = 0; f = 0; s = (1 / v === -Infinity) ? 
1 : 0; 157 | } else { 158 | s = v < 0; 159 | v = abs(v); 160 | 161 | if (v >= pow(2, 1 - bias)) { 162 | e = min(floor(log(v) / LN2), 1023); 163 | f = roundToEven(v / pow(2, e) * pow(2, fbits)); 164 | if (f / pow(2, fbits) >= 2) { 165 | e = e + 1; 166 | f = 1; 167 | } 168 | if (e > bias) { 169 | // Overflow 170 | e = (1 << ebits) - 1; 171 | f = 0; 172 | } else { 173 | // Normalized 174 | e = e + bias; 175 | f = f - pow(2, fbits); 176 | } 177 | } else { 178 | // Denormalized 179 | e = 0; 180 | f = roundToEven(v / pow(2, 1 - bias - fbits)); 181 | } 182 | } 183 | 184 | // Pack sign, exponent, fraction 185 | bits = []; 186 | for (i = fbits; i; i -= 1) { bits.push(f % 2 ? 1 : 0); f = floor(f / 2); } 187 | for (i = ebits; i; i -= 1) { bits.push(e % 2 ? 1 : 0); e = floor(e / 2); } 188 | bits.push(s ? 1 : 0); 189 | bits.reverse(); 190 | str = bits.join(''); 191 | 192 | // Bits to bytes 193 | bytes = []; 194 | while (str.length) { 195 | bytes.push(parseInt(str.substring(0, 8), 2)); 196 | str = str.substring(8); 197 | } 198 | return bytes; 199 | } 200 | 201 | function unpackIEEE754(bytes, ebits, fbits) { 202 | 203 | // Bytes to bits 204 | var bits = [], i, j, b, str, 205 | bias, s, e, f; 206 | 207 | for (i = bytes.length; i; i -= 1) { 208 | b = bytes[i - 1]; 209 | for (j = 8; j; j -= 1) { 210 | bits.push(b % 2 ? 1 : 0); b = b >> 1; 211 | } 212 | } 213 | bits.reverse(); 214 | str = bits.join(''); 215 | 216 | // Unpack sign, exponent, fraction 217 | bias = (1 << (ebits - 1)) - 1; 218 | s = parseInt(str.substring(0, 1), 2) ? -1 : 1; 219 | e = parseInt(str.substring(1, 1 + ebits), 2); 220 | f = parseInt(str.substring(1 + ebits), 2); 221 | 222 | // Produce number 223 | if (e === (1 << ebits) - 1) { 224 | return f !== 0 ? 
NaN : s * Infinity; 225 | } else if (e > 0) { 226 | // Normalized 227 | return s * pow(2, e - bias) * (1 + f / pow(2, fbits)); 228 | } else if (f !== 0) { 229 | // Denormalized 230 | return s * pow(2, -(bias - 1)) * (f / pow(2, fbits)); 231 | } else { 232 | return s < 0 ? -0 : 0; 233 | } 234 | } 235 | 236 | function unpackF64(b) { return unpackIEEE754(b, 11, 52); } 237 | function packF64(v) { return packIEEE754(v, 11, 52); } 238 | function unpackF32(b) { return unpackIEEE754(b, 8, 23); } 239 | function packF32(v) { return packIEEE754(v, 8, 23); } 240 | 241 | 242 | // 243 | // 3 The ArrayBuffer Type 244 | // 245 | 246 | (function() { 247 | 248 | /** @constructor */ 249 | var ArrayBuffer = function ArrayBuffer(length) { 250 | length = ECMAScript.ToInt32(length); 251 | if (length < 0) throw new RangeError('ArrayBuffer size is not a small enough positive integer'); 252 | 253 | this.byteLength = length; 254 | this._bytes = []; 255 | this._bytes.length = length; 256 | 257 | var i; 258 | for (i = 0; i < this.byteLength; i += 1) { 259 | this._bytes[i] = 0; 260 | } 261 | 262 | configureProperties(this); 263 | }; 264 | 265 | // 266 | // 4 The ArrayBufferView Type 267 | // 268 | 269 | // NOTE: this constructor is not exported 270 | /** @constructor */ 271 | var ArrayBufferView = function ArrayBufferView() { 272 | //this.buffer = null; 273 | //this.byteOffset = 0; 274 | //this.byteLength = 0; 275 | }; 276 | 277 | // 278 | // 5 The Typed Array View Types 279 | // 280 | 281 | function makeConstructor(bytesPerElement, pack, unpack) { 282 | // Each TypedArray type requires a distinct constructor instance with 283 | // identical logic, which this produces. 
284 | 285 | var ctor; 286 | ctor = function(buffer, byteOffset, length) { 287 | var array, sequence, i, s; 288 | 289 | if (!arguments.length || typeof arguments[0] === 'number') { 290 | // Constructor(unsigned long length) 291 | this.length = ECMAScript.ToInt32(arguments[0]); 292 | if (length < 0) throw new RangeError('ArrayBufferView size is not a small enough positive integer'); 293 | 294 | this.byteLength = this.length * this.BYTES_PER_ELEMENT; 295 | this.buffer = new ArrayBuffer(this.byteLength); 296 | this.byteOffset = 0; 297 | } else if (typeof arguments[0] === 'object' && arguments[0].constructor === ctor) { 298 | // Constructor(TypedArray array) 299 | array = arguments[0]; 300 | 301 | this.length = array.length; 302 | this.byteLength = this.length * this.BYTES_PER_ELEMENT; 303 | this.buffer = new ArrayBuffer(this.byteLength); 304 | this.byteOffset = 0; 305 | 306 | for (i = 0; i < this.length; i += 1) { 307 | this._setter(i, array._getter(i)); 308 | } 309 | } else if (typeof arguments[0] === 'object' && 310 | !(arguments[0] instanceof ArrayBuffer || ECMAScript.Class(arguments[0]) === 'ArrayBuffer')) { 311 | // Constructor(sequence array) 312 | sequence = arguments[0]; 313 | 314 | this.length = ECMAScript.ToUint32(sequence.length); 315 | this.byteLength = this.length * this.BYTES_PER_ELEMENT; 316 | this.buffer = new ArrayBuffer(this.byteLength); 317 | this.byteOffset = 0; 318 | 319 | for (i = 0; i < this.length; i += 1) { 320 | s = sequence[i]; 321 | this._setter(i, Number(s)); 322 | } 323 | } else if (typeof arguments[0] === 'object' && 324 | (arguments[0] instanceof ArrayBuffer || ECMAScript.Class(arguments[0]) === 'ArrayBuffer')) { 325 | // Constructor(ArrayBuffer buffer, 326 | // optional unsigned long byteOffset, optional unsigned long length) 327 | this.buffer = buffer; 328 | 329 | this.byteOffset = ECMAScript.ToUint32(byteOffset); 330 | if (this.byteOffset > this.buffer.byteLength) { 331 | throw new RangeError("byteOffset out of range"); 332 | } 333 | 
334 | if (this.byteOffset % this.BYTES_PER_ELEMENT) { 335 | // The given byteOffset must be a multiple of the element 336 | // size of the specific type, otherwise an exception is raised. 337 | throw new RangeError("ArrayBuffer length minus the byteOffset is not a multiple of the element size."); 338 | } 339 | 340 | if (arguments.length < 3) { 341 | this.byteLength = this.buffer.byteLength - this.byteOffset; 342 | 343 | if (this.byteLength % this.BYTES_PER_ELEMENT) { 344 | throw new RangeError("length of buffer minus byteOffset not a multiple of the element size"); 345 | } 346 | this.length = this.byteLength / this.BYTES_PER_ELEMENT; 347 | } else { 348 | this.length = ECMAScript.ToUint32(length); 349 | this.byteLength = this.length * this.BYTES_PER_ELEMENT; 350 | } 351 | 352 | if ((this.byteOffset + this.byteLength) > this.buffer.byteLength) { 353 | throw new RangeError("byteOffset and length reference an area beyond the end of the buffer"); 354 | } 355 | } else { 356 | throw new TypeError("Unexpected argument type(s)"); 357 | } 358 | 359 | this.constructor = ctor; 360 | 361 | configureProperties(this); 362 | makeArrayAccessors(this); 363 | }; 364 | 365 | ctor.prototype = new ArrayBufferView(); 366 | ctor.prototype.BYTES_PER_ELEMENT = bytesPerElement; 367 | ctor.prototype._pack = pack; 368 | ctor.prototype._unpack = unpack; 369 | ctor.BYTES_PER_ELEMENT = bytesPerElement; 370 | 371 | // getter type (unsigned long index); 372 | ctor.prototype._getter = function(index) { 373 | if (arguments.length < 1) throw new SyntaxError("Not enough arguments"); 374 | 375 | index = ECMAScript.ToUint32(index); 376 | if (index >= this.length) { 377 | return undefined; 378 | } 379 | 380 | var bytes = [], i, o; 381 | for (i = 0, o = this.byteOffset + index * this.BYTES_PER_ELEMENT; 382 | i < this.BYTES_PER_ELEMENT; 383 | i += 1, o += 1) { 384 | bytes.push(this.buffer._bytes[o]); 385 | } 386 | return this._unpack(bytes); 387 | }; 388 | 389 | // NONSTANDARD: convenience alias for getter: 
type get(unsigned long index); 390 | ctor.prototype.get = ctor.prototype._getter; 391 | 392 | // setter void (unsigned long index, type value); 393 | ctor.prototype._setter = function(index, value) { 394 | if (arguments.length < 2) throw new SyntaxError("Not enough arguments"); 395 | 396 | index = ECMAScript.ToUint32(index); 397 | if (index >= this.length) { 398 | return undefined; 399 | } 400 | 401 | var bytes = this._pack(value), i, o; 402 | for (i = 0, o = this.byteOffset + index * this.BYTES_PER_ELEMENT; 403 | i < this.BYTES_PER_ELEMENT; 404 | i += 1, o += 1) { 405 | this.buffer._bytes[o] = bytes[i]; 406 | } 407 | }; 408 | 409 | // void set(TypedArray array, optional unsigned long offset); 410 | // void set(sequence array, optional unsigned long offset); 411 | ctor.prototype.set = function(index, value) { 412 | if (arguments.length < 1) throw new SyntaxError("Not enough arguments"); 413 | var array, sequence, offset, len, 414 | i, s, d, 415 | byteOffset, byteLength, tmp; 416 | 417 | if (typeof arguments[0] === 'object' && arguments[0].constructor === this.constructor) { 418 | // void set(TypedArray array, optional unsigned long offset); 419 | array = arguments[0]; 420 | offset = ECMAScript.ToUint32(arguments[1]); 421 | 422 | if (offset + array.length > this.length) { 423 | throw new RangeError("Offset plus length of array is out of range"); 424 | } 425 | 426 | byteOffset = this.byteOffset + offset * this.BYTES_PER_ELEMENT; 427 | byteLength = array.length * this.BYTES_PER_ELEMENT; 428 | 429 | if (array.buffer === this.buffer) { 430 | tmp = []; 431 | for (i = 0, s = array.byteOffset; i < byteLength; i += 1, s += 1) { 432 | tmp[i] = array.buffer._bytes[s]; 433 | } 434 | for (i = 0, d = byteOffset; i < byteLength; i += 1, d += 1) { 435 | this.buffer._bytes[d] = tmp[i]; 436 | } 437 | } else { 438 | for (i = 0, s = array.byteOffset, d = byteOffset; 439 | i < byteLength; i += 1, s += 1, d += 1) { 440 | this.buffer._bytes[d] = array.buffer._bytes[s]; 441 | } 442 | } 
443 | } else if (typeof arguments[0] === 'object' && typeof arguments[0].length !== 'undefined') { 444 | // void set(sequence array, optional unsigned long offset); 445 | sequence = arguments[0]; 446 | len = ECMAScript.ToUint32(sequence.length); 447 | offset = ECMAScript.ToUint32(arguments[1]); 448 | 449 | if (offset + len > this.length) { 450 | throw new RangeError("Offset plus length of array is out of range"); 451 | } 452 | 453 | for (i = 0; i < len; i += 1) { 454 | s = sequence[i]; 455 | this._setter(offset + i, Number(s)); 456 | } 457 | } else { 458 | throw new TypeError("Unexpected argument type(s)"); 459 | } 460 | }; 461 | 462 | // TypedArray subarray(long begin, optional long end); 463 | ctor.prototype.subarray = function(start, end) { 464 | function clamp(v, min, max) { return v < min ? min : v > max ? max : v; } 465 | 466 | start = ECMAScript.ToInt32(start); 467 | end = ECMAScript.ToInt32(end); 468 | 469 | if (arguments.length < 1) { start = 0; } 470 | if (arguments.length < 2) { end = this.length; } 471 | 472 | if (start < 0) { start = this.length + start; } 473 | if (end < 0) { end = this.length + end; } 474 | 475 | start = clamp(start, 0, this.length); 476 | end = clamp(end, 0, this.length); 477 | 478 | var len = end - start; 479 | if (len < 0) { 480 | len = 0; 481 | } 482 | 483 | return new this.constructor( 484 | this.buffer, this.byteOffset + start * this.BYTES_PER_ELEMENT, len); 485 | }; 486 | 487 | return ctor; 488 | } 489 | 490 | var Int8Array = makeConstructor(1, packI8, unpackI8); 491 | var Uint8Array = makeConstructor(1, packU8, unpackU8); 492 | var Uint8ClampedArray = makeConstructor(1, packU8Clamped, unpackU8); 493 | var Int16Array = makeConstructor(2, packI16, unpackI16); 494 | var Uint16Array = makeConstructor(2, packU16, unpackU16); 495 | var Int32Array = makeConstructor(4, packI32, unpackI32); 496 | var Uint32Array = makeConstructor(4, packU32, unpackU32); 497 | var Float32Array = makeConstructor(4, packF32, unpackF32); 498 | var 
Float64Array = makeConstructor(8, packF64, unpackF64); 499 | 500 | exports.Int8Array = exports.Int8Array || Int8Array; 501 | exports.Uint8Array = exports.Uint8Array || Uint8Array; 502 | exports.Uint8ClampedArray = exports.Uint8ClampedArray || Uint8ClampedArray; 503 | exports.Int16Array = exports.Int16Array || Int16Array; 504 | exports.Uint16Array = exports.Uint16Array || Uint16Array; 505 | exports.Int32Array = exports.Int32Array || Int32Array; 506 | exports.Uint32Array = exports.Uint32Array || Uint32Array; 507 | exports.Float32Array = exports.Float32Array || Float32Array; 508 | exports.Float64Array = exports.Float64Array || Float64Array; 509 | }()); 510 | 511 | // 512 | // 6 The DataView View Type 513 | // 514 | 515 | (function() { 516 | function r(array, index) { 517 | return ECMAScript.IsCallable(array.get) ? array.get(index) : array[index]; 518 | } 519 | 520 | var IS_BIG_ENDIAN = (function() { 521 | var u16array = new(exports.Uint16Array)([0x1234]), 522 | u8array = new(exports.Uint8Array)(u16array.buffer); 523 | return r(u8array, 0) === 0x12; 524 | }()); 525 | 526 | // Constructor(ArrayBuffer buffer, 527 | // optional unsigned long byteOffset, 528 | // optional unsigned long byteLength) 529 | /** @constructor */ 530 | var DataView = function DataView(buffer, byteOffset, byteLength) { 531 | if (arguments.length === 0) { 532 | buffer = new exports.ArrayBuffer(0); 533 | } else if (!(buffer instanceof exports.ArrayBuffer || ECMAScript.Class(buffer) === 'ArrayBuffer')) { 534 | throw new TypeError("TypeError"); 535 | } 536 | 537 | this.buffer = buffer || new exports.ArrayBuffer(0); 538 | 539 | this.byteOffset = ECMAScript.ToUint32(byteOffset); 540 | if (this.byteOffset > this.buffer.byteLength) { 541 | throw new RangeError("byteOffset out of range"); 542 | } 543 | 544 | if (arguments.length < 3) { 545 | this.byteLength = this.buffer.byteLength - this.byteOffset; 546 | } else { 547 | this.byteLength = ECMAScript.ToUint32(byteLength); 548 | } 549 | 550 | if 
((this.byteOffset + this.byteLength) > this.buffer.byteLength) { 551 | throw new RangeError("byteOffset and length reference an area beyond the end of the buffer"); 552 | } 553 | 554 | configureProperties(this); 555 | }; 556 | 557 | function makeGetter(arrayType) { 558 | return function(byteOffset, littleEndian) { 559 | 560 | byteOffset = ECMAScript.ToUint32(byteOffset); 561 | 562 | if (byteOffset + arrayType.BYTES_PER_ELEMENT > this.byteLength) { 563 | throw new RangeError("Array index out of range"); 564 | } 565 | byteOffset += this.byteOffset; 566 | 567 | var uint8Array = new exports.Uint8Array(this.buffer, byteOffset, arrayType.BYTES_PER_ELEMENT), 568 | bytes = [], i; 569 | for (i = 0; i < arrayType.BYTES_PER_ELEMENT; i += 1) { 570 | bytes.push(r(uint8Array, i)); 571 | } 572 | 573 | if (Boolean(littleEndian) === Boolean(IS_BIG_ENDIAN)) { 574 | bytes.reverse(); 575 | } 576 | 577 | return r(new arrayType(new exports.Uint8Array(bytes).buffer), 0); 578 | }; 579 | } 580 | 581 | DataView.prototype.getUint8 = makeGetter(exports.Uint8Array); 582 | DataView.prototype.getInt8 = makeGetter(exports.Int8Array); 583 | DataView.prototype.getUint16 = makeGetter(exports.Uint16Array); 584 | DataView.prototype.getInt16 = makeGetter(exports.Int16Array); 585 | DataView.prototype.getUint32 = makeGetter(exports.Uint32Array); 586 | DataView.prototype.getInt32 = makeGetter(exports.Int32Array); 587 | DataView.prototype.getFloat32 = makeGetter(exports.Float32Array); 588 | DataView.prototype.getFloat64 = makeGetter(exports.Float64Array); 589 | 590 | function makeSetter(arrayType) { 591 | return function(byteOffset, value, littleEndian) { 592 | 593 | byteOffset = ECMAScript.ToUint32(byteOffset); 594 | if (byteOffset + arrayType.BYTES_PER_ELEMENT > this.byteLength) { 595 | throw new RangeError("Array index out of range"); 596 | } 597 | 598 | // Get bytes 599 | var typeArray = new arrayType([value]), 600 | byteArray = new exports.Uint8Array(typeArray.buffer), 601 | bytes = [], i, byteView; 
602 | 603 | for (i = 0; i < arrayType.BYTES_PER_ELEMENT; i += 1) { 604 | bytes.push(r(byteArray, i)); 605 | } 606 | 607 | // Flip if necessary 608 | if (Boolean(littleEndian) === Boolean(IS_BIG_ENDIAN)) { 609 | bytes.reverse(); 610 | } 611 | 612 | // Write them 613 | byteView = new exports.Uint8Array(this.buffer, byteOffset, arrayType.BYTES_PER_ELEMENT); 614 | byteView.set(bytes); 615 | }; 616 | } 617 | 618 | DataView.prototype.setUint8 = makeSetter(exports.Uint8Array); 619 | DataView.prototype.setInt8 = makeSetter(exports.Int8Array); 620 | DataView.prototype.setUint16 = makeSetter(exports.Uint16Array); 621 | DataView.prototype.setInt16 = makeSetter(exports.Int16Array); 622 | DataView.prototype.setUint32 = makeSetter(exports.Uint32Array); 623 | DataView.prototype.setInt32 = makeSetter(exports.Int32Array); 624 | DataView.prototype.setFloat32 = makeSetter(exports.Float32Array); 625 | DataView.prototype.setFloat64 = makeSetter(exports.Float64Array); 626 | 627 | exports.DataView = exports.DataView || DataView; 628 | 629 | }()); 630 | -------------------------------------------------------------------------------- /javascript/test/wru.console.js: -------------------------------------------------------------------------------- 1 | /*! 
2 | (C) Andrea Giammarchi, @WebReflection - Mit Style License 3 | */ 4 | if(typeof global!="undefined"){var setTimeout=global.setTimeout,setInterval=global.setInterval,clearInterval=global.clearInterval,clearTimeout=global.clearTimeout;setTimeout||(function(h,c,g,a){setInterval=global.setInterval=function b(j,i){return e(j,i,g.call(arguments,2),1)};setTimeout=global.setTimeout=function d(j,i){return e(j,i,g.call(arguments,2))};clearInterval=global.clearInterval=clearTimeout=global.clearTimeout=function f(i){c[i].cancel();h.purge();delete c[i]};function e(l,k,j,i){var m=++a;c[m]=new JavaAdapter(java.util.TimerTask,{run:function(){l.apply(null,j)}});i?h.schedule(c[m],k,k):h.schedule(c[m],k);return m}})(new java.util.Timer(),{},[].slice,0)}else{!function(c,b,a,e){function d(f,g){var h=new Date;while(new Date-h"+al+"";ak.className=aj}function G(){var ai=this.lastChild.style;ai.display=ai.display=="none"?"block":"none"}function c(ai){P[E]+="
    "+D+v.call(ai,d+D)+d+"
";(P.onclick=G).call(P)}function r(){f();s+=a[ah];C+=u[ah];ad+=T[ah];g("("+v.call([a[ah],M=u[ah],T[ah]],", ")+")");P=P.parentNode;T[ah]?c(T,W="error"):(M?c(u,W="fail"):W="pass");P.className=W;M=0;W=i;j()}function b(ai){if(ag(A,ai)){try{A[ai](ab)}catch(aj){aa.call(T,i+aj)}}}function ag(aj,ai){return q.call(aj,ai)}function w(){return F()<0.5?-1:1}function f(){if(R){H(R);R=0}b("teardown")}var Z={timeout:y,assert:function U(aj,ai){if(arguments[ah]==1){ai=aj;aj=Q}z=I;aa.call(ai?a:u,W+aj);return ai},async:function V(ak,an,al,am){var ai=al||Z.timeout||(Z.timeout=y);am=++N;if(typeof ak=="function"){ai=an||Z.timeout;an=ak;ak="asynchronous test #"+am}al=X(function(){am=0;aa.call(u,ak);--N||(R=X(r,0))},L(ai)||Z.timeout);return function aj(){if(!am){return}z=ae;W=ak+": ";try{an.apply(this,arguments)}catch(ao){z=I;aa.call(T,W+ao)}W=i;if(z){H(al);--N||(R=X(r,0))}}},test:function n(ai,aj){Z.after=aj||function(){};m=J.apply(m,[ai]);Z.random&&af.call(m,w);N||j()}},I=true,ae=!I,y=100,i=" ",Q="unknown",ah="length",S="name",e="description",D="
  • ",d="
  • ",k="\\|/-",q=Z.hasOwnProperty,W=i,ac=W.charAt,x=W.slice,m=[],J=m.concat,v=m.join,aa=m.push,K=m.shift,af=m.sort,N=0,M=0,s=0,C=0,ad=0,R=0,E="innerHTML",h=Y.document,O=h.createElement,B,L,F,X,H,A,P,a,u,T,ab,z;B=Y.Math;L=B.abs;F=B.random;X=Y.setTimeout;H=Y.clearTimeout;Z.node=(h.getElementById("wru")||h.body||h.documentElement);Y.setInterval(function(){N&&g(ac.call(k,M++%4))},y);undefined;Z.log=function o(aj,ai){ai?alert(aj):(typeof console!="undefined")&&console.log(aj)};y*=y;Z.random=ae;return Z}(this); -------------------------------------------------------------------------------- /javascript/unibinary.js: -------------------------------------------------------------------------------- 1 | /** 2 | * UniBinary - Encodes and decodes data into printable UniCode characters. 3 | * 4 | * Authors: 5 | * Nicolas Seriot, 2013-01-17 6 | * Toolsley, 2014-12-03 (JavaScript port) 7 | * 8 | * License: BSD 9 | * 10 | */ 11 | (function (root, factory) { 12 | if (typeof define === 'function' && define.amd) { 13 | define([], factory); 14 | } else if (typeof exports === 'object') { 15 | module.exports = factory(); 16 | } else { 17 | root.unibinary = factory(); 18 | } 19 | }(this, function () { 20 | 21 | //encodes ascii characters (7 bits) 22 | var U12a_0_0_start = 0x5E00; // CJK Unified Ideographs (subset) - encodes 12 bits (2 ascii) - MSB 0,0 23 | var U12a_0_1_start = 0x6E00; // CJK Unified Ideographs (subset) - encodes 12 bits (2 ascii) - MSB 0,1 24 | var U12a_1_0_start = 0x7E00; // CJK Unified Ideographs (subset) - encodes 12 bits (2 ascii) - MSB 1,0 25 | var U12a_1_1_start = 0x8E00; // CJK Unified Ideographs (subset) - encodes 12 bits (2 ascii) - MSB 1,1 26 | var U12a_length = 0x1000; 27 | 28 | //encodes arbitrary bits 29 | var U12b_start = 0x4E00; // CJK Unified Ideographs (subset) - encodes 12 bits 30 | var U12b_length = 0x1000; 31 | var U8_start = 0x0400; // Cyrillic - encodes 8 bits 32 | var U8_length = 0x100; 33 | 34 | var two_unichr_to_repeat_byte_ntimes = function (b, 
n) { 35 | 36 | if (n > 0xFFF) throw new Error("ValueError"); 37 | 38 | if (b > 0xFF) throw new Error("ValueError"); 39 | 40 | var uni_b = String.fromCharCode(U8_start + b); 41 | var uni_r = String.fromCharCode(U12b_start + n); 42 | 43 | return uni_b + uni_r; 44 | 45 | } 46 | 47 | var unichr_12a_from_two_ascii = function (a1, a2) { 48 | 49 | var i1 = a1; 50 | var i2 = a2; 51 | 52 | var unicode_start = null; 53 | 54 | if ((i1 < 64) && (i2 < 64)) { 55 | unicode_start = U12a_0_0_start; 56 | } else if ((i1 < 64) && (i2 >= 64)) { 57 | i2 -= 64; 58 | unicode_start = U12a_0_1_start; 59 | } else if ((i1 >= 64) && (i2 < 64)) { 60 | i1 -= 64; 61 | unicode_start = U12a_1_0_start; 62 | } else if ((i1 >= 64) && (i2 >= 64)) { 63 | i1 -= 64; 64 | i2 -= 64; 65 | unicode_start = U12a_1_1_start; 66 | } 67 | 68 | return String.fromCharCode(unicode_start + (i1 << 6) + i2) 69 | } 70 | 71 | var unichr_08_from_int = function (i) { 72 | if (i > (U8_start + U8_length)) { 73 | console.error("-- unichr_08_from_int: 0x" + i.toString(16)); 74 | throw new Error("ValueError"); 75 | } 76 | 77 | return String.fromCharCode(U8_start + i); 78 | } 79 | 80 | var unichr_12_from_int = function (i) { 81 | if (i > (U12b_start + U12b_length)) { 82 | console.error("-- unichr_12_from_int: 0x" + i.toString(16)); 83 | throw new Error("ValueError"); 84 | } 85 | 86 | return String.fromCharCode(U12b_start + i); 87 | } 88 | 89 | var int_from_u08b = function (u) { 90 | i = u.charCodeAt(0); 91 | if ((i < U8_start) || (i > (U8_start + U8_length))) { 92 | console.error("-- int_from_u8: " + u.toString()); 93 | throw new Error("ValueError"); 94 | } 95 | 96 | return i - U8_start; 97 | } 98 | 99 | var two_bytes_from_u12a = function (u) { 100 | var i1 = null; 101 | var i2 = null; 102 | var unicode_start = null; 103 | i = u.charCodeAt(0); 104 | 105 | for (var j = 0, start; start = [U12a_0_0_start, U12a_0_1_start, U12a_1_0_start, U12a_1_1_start][j]; j++) { 106 | if ((i >= start) && (i < (start + U12a_length))) 107 | 
/**
 * Decode one U12b character back into the 12-bit value it encodes.
 *
 * @param {string} u - string whose first UTF-16 code unit is read.
 * @returns {number} offset of that code unit from U12b_start, i.e. a value in [0, U12b_length).
 * @throws {Error} "ValueError" when the code unit is outside [U12b_start, U12b_start + U12b_length).
 */
var int_from_u12b = function (u) {
    var code = u.charCodeAt(0);

    // half-open range, same as is_in_U12b: accepting U12b_start + U12b_length would
    // return 0x1000, which no longer fits in 12 bits
    if ((code < U12b_start) || (code >= (U12b_start + U12b_length))) {
        console.error("-- int_from_u12b: " + u);
        throw new Error("ValueError");
    }

    return code - U12b_start;
};
/**
 * Tell whether the first UTF-16 code unit of `u` lies in one of the four U12a
 * sub-ranges, each U12a_length wide, that encode a pair of 7-bit ASCII characters.
 *
 * @param {string} u - string whose first code unit is tested.
 * @returns {boolean} true when the code unit belongs to any U12a sub-range.
 */
var is_in_U12a = function (u) {
    var code = u.charCodeAt(0);

    // built once per call and walked by index, so the loop does not depend on
    // every start value being truthy to terminate correctly
    var starts = [U12a_0_0_start, U12a_0_1_start, U12a_1_0_start, U12a_1_1_start];

    for (var j = 0; j < starts.length; j++) {
        if ((code >= starts[j]) && (code < (starts[j] + U12a_length))) {
            return true;
        }
    }

    return false;
};
/**
 * Decode a UniBinary string back into the bytes it encodes.
 *
 * Line breaks are stripped first, then the string is consumed left to right:
 *   - one U12a character              -> 2 ASCII bytes
 *   - a pair (U12b, U12b)             -> 3 bytes
 *   - a pair (U8, U12b)               -> one byte repeated N times
 *   - a pair (U8, U8)                 -> 2 bytes
 *   - a trailing single U8 character  -> 1 byte
 *
 * The input is walked twice: once to size the output buffer, once to fill it.
 *
 * @param {string} s - encoded text.
 * @returns {Uint8Array} decoded bytes.
 * @throws {Error} "ValueError" when a character or a pair cannot be decoded
 *                 (a trailing undecodable character used to loop forever).
 */
var gen_decode_bytes_from_string = function (s) {

    // Decode the single unit starting at s[pos]; returns its bytes and the index of the next unit.
    var decode_unit_at = function (pos) {
        if (is_in_U12a(s[pos])) {
            // 1 U12a -> read 2 ascii characters
            return { bytes: two_bytes_from_u12a(s[pos]), next: pos + 1 };
        }

        if ((pos + 1) < s.length) {
            // (U12b, U12b) -> read 3 bytes
            // (U8b, U12b)  -> read repetition
            // (U8b, U8b)   -> read 1 byte, 1 byte
            return { bytes: bytes_from_u1_u2(s[pos], s[pos + 1]), next: pos + 2 };
        }

        if (is_in_U8b(s[pos])) {
            // 1 U8b -> read 1 byte
            return { bytes: [int_from_u08b(s[pos])], next: pos + 1 };
        }

        // nothing matched: fail like the other helpers instead of spinning on the same index
        console.error("cannot decode " + s);
        throw new Error("ValueError");
    };

    // strip linebreaks
    s = s.replace(/(\r\n|\n|\r)/gm, "");

    var i;
    var unit;

    // first pass: determine the output size
    var bufferSize = 0;
    i = 0;
    while (i < s.length) {
        unit = decode_unit_at(i);
        bufferSize += unit.bytes.length;
        i = unit.next;
    }

    // second pass: fill the buffer
    var result = new Uint8Array(bufferSize);
    var resultLoc = 0;
    i = 0;
    while (i < s.length) {
        unit = decode_unit_at(i);
        result.set(unit.bytes, resultLoc);
        resultLoc += unit.bytes.length;
        i = unit.next;
    }

    return result;
};
((charcode >> 6) & 0x3f), 440 | 0x80 | (charcode & 0x3f)); 441 | } 442 | } 443 | 444 | return gen_encode_unichars_from_bytes(utf8); 445 | 446 | } 447 | 448 | if (typeof UNIBINARY_UNIT_TESTING == 'undefined') { 449 | 450 | return { 451 | encode: gen_encode_unichars_from_bytes, 452 | decode: gen_decode_bytes_from_string, 453 | encodeString: encodeString, 454 | decodeString: decodeString 455 | } 456 | 457 | } else { 458 | 459 | return { 460 | encode: gen_encode_unichars_from_bytes, 461 | decode: gen_decode_bytes_from_string, 462 | encodeString: encodeString, 463 | decodeString: decodeString, 464 | two_unichr_to_repeat_byte_ntimes:two_unichr_to_repeat_byte_ntimes, 465 | unichr_12a_from_two_ascii:unichr_12a_from_two_ascii, 466 | unichr_08_from_int:unichr_08_from_int, 467 | unichr_12_from_int:unichr_12_from_int, 468 | int_from_u08b:int_from_u08b, 469 | two_bytes_from_u12a:two_bytes_from_u12a, 470 | int_from_u12b:int_from_u12b, 471 | two_twelve_bits_values_from_three_bytes:two_twelve_bits_values_from_three_bytes, 472 | three_bytes_from_two_twelve_bits_values:three_bytes_from_two_twelve_bits_values, 473 | number_of_left_instances_from_index:number_of_left_instances_from_index, 474 | three_bytes_from_unichars:three_bytes_from_unichars, 475 | repeated_bytes_from_unichars:repeated_bytes_from_unichars, 476 | two_bytes_from_unichars:two_bytes_from_unichars, 477 | is_in_U12a:is_in_U12a, 478 | is_in_U8b:is_in_U8b, 479 | is_in_U12b:is_in_U12b, 480 | bytes_from_u1_u2:bytes_from_u1_u2, 481 | U8_start:U8_start, 482 | U8_length:U8_length, 483 | U12b_start:U12b_start, 484 | U12b_length:U12b_length 485 | } 486 | 487 | } 488 | })); 489 | 490 | -------------------------------------------------------------------------------- /javascript/unibinary_tool.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | /** 4 | * node.js command line tool for unibinary.js 5 | * 6 | * Author: 7 | * Toolsley, 2014-12-03 8 | * 9 | * License: BSD 
/**
 * Copy the bytes of a Uint8Array into a new Node.js Buffer.
 *
 * @param {Uint8Array} view - decoded bytes.
 * @returns {Buffer} an independent Buffer holding the same bytes.
 */
function toBuffer(view) {
    // Buffer.from(typedArray) copies the contents; it replaces the deprecated
    // `new Buffer(size)` constructor and the manual byte-by-byte copy loop.
    return Buffer.from(view);
}
def profile_encode_file(src="/Users/nst/Desktop/sc.png", dst="/tmp/tmp.txt"):
    """Encode the file *src* with UniBinary and write the text to *dst* as UTF-16.

    The defaults are the paths that were previously hard-coded, so calling the
    function without arguments (as the profiler does) behaves as before.  Any
    input of roughly 800 KB gives a useful profile; "/usr/bin/true" is a
    smaller alternative.
    """
    # context managers close both files even when reading or encoding fails
    with open(src, "rb") as f:
        data = f.read()

    with codecs.open(dst, 'w', encoding='utf-16') as f:
        for unichars in gen_encode_unichars_from_bytes(data):
            # each item is either one character or a tuple of characters
            for u in unichars:
                f.write(u)
TestUnidata(unittest.TestCase): 21 | 22 | #def setUp(self): 23 | # pass 24 | 25 | #def tearDown(self): 26 | # pass 27 | 28 | def test_unichr_12_encoding_decoding(self): 29 | 30 | for i in [0x0, 0x1, 0xAB, 0x123, 0xABC, 0xF, 0xFF, 0xFFF]: 31 | 32 | u = unichr_12_from_int(i) 33 | self.assertNotEqual(i, u) 34 | 35 | i2 = int_from_u12b(u) 36 | self.assertEqual(i, i2) 37 | 38 | def test_3_to_2_bytes(self): 39 | 40 | (a, b) = two_twelve_bits_values_from_three_bytes(0x12, 0x34, 0x56) 41 | 42 | self.assertEqual(a, 0x123, "0x%x" % a) 43 | self.assertEqual(b, 0x456, "0x%x" % b) 44 | 45 | def test_2_to_3_bytes(self): 46 | 47 | (a,b,c) = three_bytes_from_two_twelve_bits_values(0x123, 0x456) 48 | 49 | self.assertEqual(a, 0x12, "0x%x" % a) 50 | self.assertEqual(b, 0x34, "0x%x" % b) 51 | self.assertEqual(c, 0x56, "0x%x" % c) 52 | 53 | def test_encode_3_bytes(self): 54 | bytes = "\xab\xcd\xef" 55 | 56 | gen = gen_encode_unichars_from_bytes(bytes) 57 | 58 | (u1, u2) = gen.next() 59 | 60 | self.assertFalse(list(gen)) 61 | 62 | self.assertEqual(u1, unichr(U12b_start + 0xABC)) 63 | self.assertEqual(u2, unichr(U12b_start + 0xDEF)) 64 | 65 | def test_encode_bytes(self): 66 | bytes = "\xab\xcd\xef\xff" 67 | 68 | gen = gen_encode_unichars_from_bytes(bytes) 69 | 70 | (u1, u2) = gen.next() 71 | u3 = gen.next() 72 | 73 | self.assertFalse(list(gen)) 74 | 75 | self.assertEqual(u1, unichr(U12b_start + 0xABC)) 76 | self.assertEqual(u2, unichr(U12b_start + 0xDEF)) 77 | self.assertEqual(u3, unichr(U8_start + 0xFF)) 78 | 79 | def test_decode_unichars(self): 80 | 81 | u1 = unichr(U12b_start + 0xABC) 82 | u2 = unichr(U12b_start + 0xDEF) 83 | 84 | s = u'' 85 | s += u1 86 | s += u2 87 | 88 | gen = gen_decode_bytes_from_string(s) 89 | 90 | (a,b,c) = gen.next() 91 | 92 | self.assertFalse(list(gen)) 93 | 94 | self.assertEqual(a, 0xAB) 95 | self.assertEqual(b, 0xCD) 96 | self.assertEqual(c, 0xEF) 97 | 98 | def test_is_in_U8b(self): 99 | self.assertFalse(is_in_U8b(u"\u03FF")) 100 | 101 | 
def test_encoding_decoding_utf16_file(self):
    """Round-trip a real binary through UTF-8 and UTF-16 text files.

    The binary is encoded to a text file, decoded back to a copy, and the
    SHA-1 digests of the original and the copy are compared.
    """
    import os

    src = "/usr/bin/true"
    tmp = "/tmp/true.txt"
    cpy = "/tmp/true"

    if not os.path.exists(src):
        # report the missing fixture as a skip instead of a silent pass
        self.skipTest("cannot test %s, file does not exist" % src)

    with open(src, "rb") as f:
        data = f.read()

    for e in ['utf-8', 'utf-16']:

        # encode: binary -> text file in encoding e
        with codecs.open(tmp, 'w', encoding=e) as f:
            for unichars in gen_encode_unichars_from_bytes(data):
                for u in unichars:
                    f.write(u)

        # decode: text file -> binary copy
        with codecs.open(tmp, 'r', encoding=e) as f:
            s = f.read()

        with open(cpy, 'wb') as f:
            for chunk in gen_decode_bytes_from_string(s):
                for b in chunk:
                    f.write(struct.pack("B", b))

        self.assertEqual(shasum(src), shasum(cpy))
def test_repeat(self):
    """A run of three identical bytes is encoded as a single (byte, count) pair."""

    s = "xxx"

    gen = gen_encode_unichars_from_bytes(s)

    (u0, u1) = gen.next()

    # the pair must be the only thing the encoder produces
    self.assertFalse(list(gen))

    # the hex values go into the failure message instead of being printed on every run
    self.assertEqual(ord(u0), 0x0478, "0x%x" % ord(u0))  # U8_start + ord('x')
    self.assertEqual(ord(u1), 0x4E03, "0x%x" % ord(u1))  # U12b_start + 3
def test_ascii_and_bytes_decoding(self):
    """Decode 3 arbitrary bytes followed by the ASCII text "abcde"."""

    u1 = unichr(U12b_start + 0xABC)
    u2 = unichr(U12b_start + 0xDEF)
    u3 = unichr_12a_from_two_ascii('a', 'b')
    u4 = unichr_12a_from_two_ascii('c', 'd')
    u5 = unichr_08_from_int(ord('e'))

    s = u''
    s += u1
    s += u2
    s += u3
    s += u4
    s += u5

    gen = gen_decode_bytes_from_string(s)

    (a,b,c) = gen.next()
    (d,e) = gen.next()
    (f,g) = gen.next()
    h = gen.next()

    self.assertFalse(list(gen))

    # (U12b, U12b) pair -> 3 arbitrary bytes
    self.assertEqual(a, 0xAB)
    self.assertEqual(b, 0xCD)
    self.assertEqual(c, 0xEF)

    # each U12a character -> 2 ASCII bytes
    self.assertEqual((d, e), (ord('a'), ord('b')))
    self.assertEqual((f, g), (ord('c'), ord('d')))

    # trailing U8 character -> a chunk holding one byte
    # (other tests in this class iterate over such chunks, so h is treated as iterable)
    self.assertEqual(list(h), [ord('e')])
self.assertEqual(ord(u3), 0x04FF) 338 | self.assertEqual(ord(u4), 0x4E04) 339 | self.assertEqual(ord(u5), 0x0400) 340 | 341 | def test_encode_macho_header(self): 342 | 343 | bytes = "\xCF\xFA\xED\xFE\x07\x00\x00\x01" 344 | 345 | gen = gen_encode_unichars_from_bytes(bytes) 346 | 347 | (u1, u2) = gen.next() 348 | (u3, u4) = gen.next() 349 | u5 = gen.next() 350 | 351 | self.assertFalse(list(gen)) 352 | 353 | self.assertEqual(ord(u1), 0x5AFF) 354 | self.assertEqual(ord(u2), 0x58ED) 355 | self.assertEqual(ord(u3), 0x5DE0) 356 | self.assertEqual(ord(u4), 0x5500) 357 | self.assertEqual(ord(u5), 0x5E01) 358 | 359 | def test_big_repeats_2000_minus_2(self): 360 | 361 | bytes = ["\xAA"] * (0x2000 - 2) 362 | 363 | gen = gen_encode_unichars_from_bytes(bytes) 364 | 365 | (u1, u2) = gen.next() 366 | (u3, u4) = gen.next() 367 | 368 | self.assertFalse(list(gen)) 369 | 370 | self.assertEqual(ord(u1), 0x04AA) 371 | self.assertEqual(ord(u2), 0x5DFF) 372 | self.assertEqual(ord(u3), 0x04AA) 373 | self.assertEqual(ord(u4), 0x5DFF) 374 | 375 | def test_big_repeats_2000(self): 376 | 377 | bytes = ["\xAA"] * 0x2000 378 | 379 | gen = gen_encode_unichars_from_bytes(bytes) 380 | 381 | (u1, u2) = gen.next() 382 | (u3, u4) = gen.next() 383 | u5 = gen.next() 384 | u6 = gen.next() 385 | 386 | self.assertFalse(list(gen)) 387 | 388 | self.assertEqual(ord(u1), 0x04AA) 389 | self.assertEqual(ord(u2), 0x5DFF) 390 | self.assertEqual(ord(u3), 0x04AA) 391 | self.assertEqual(ord(u4), 0x5DFF) 392 | self.assertEqual(ord(u5), 0x04AA) 393 | self.assertEqual(ord(u6), 0x04AA) 394 | 395 | def test_ascii_text_encoding_decoding(self): 396 | 397 | s = "if I'd listened everything that they said to me, took the time to bleed from all the tiny little arrows shot my way, I wouldn't be here! the ones who don't do anything are always the ones who try to put you down. I'm talking to you: hero time starts right now! time to shine!" 
398 | 399 | encode_gen = gen_encode_unichars_from_bytes(s) 400 | 401 | e = [b for b in encode_gen] 402 | 403 | s2 = ''.join([chr(c) for chunk in gen_decode_bytes_from_string(e) for c in chunk]) 404 | 405 | self.assertEqual(s, s2) 406 | 407 | def test_ascii_text_encoding_decoding_2(self): 408 | 409 | s = ''.join([chr(i) for i in range(32, 128)]) 410 | 411 | encode_gen = gen_encode_unichars_from_bytes(s) 412 | 413 | e = [b for b in encode_gen] 414 | 415 | self.assertTrue(len(e) * 2 == len(s)) 416 | 417 | s2 = ''.join([chr(c) for chunk in gen_decode_bytes_from_string(e) for c in chunk]) 418 | 419 | self.assertEqual(s, s2) 420 | 421 | if __name__ == '__main__': 422 | # unittest.main() 423 | suite = unittest.TestLoader().loadTestsFromTestCase(TestUnidata) 424 | unittest.TextTestRunner(verbosity=2).run(suite) 425 | -------------------------------------------------------------------------------- /python/unibinary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Nicolas Seriot, 2013-01-17 3 | 4 | """ 5 | UniBinary, or "Base64 for Unicode". 6 | 7 | Encodes and decodes data into printable Unichode characters. 8 | 9 | 2 ASCII characters -> 1 unicode character 10 | 3 arbitrary bytes -> 2 unicode characters 11 | [3, 0xFFF] repeats -> 2 unicode characters 12 | 13 | The encoded text can be copied / pasted / posted on Twitter and stored as UTF-8 text files. 
def two_unichr_to_repeat_byte_ntimes(b, n):
    """Return the two characters that encode "byte b repeated n times".

    The first character is b mapped into the U8 block, the second is n mapped
    into the U12b block.  ValueError is raised when n is above 0xFFF or b is
    above 0xFF.
    """
    # n is tested first, then b; either violation raises the same bare ValueError
    if n > 0xFFF or b > 0xFF:
        raise ValueError

    return (unichr(U8_start + b), unichr(U12b_start + n))
64 83 | unicode_start = U12a_1_0_start 84 | elif i1 >= 64 and i2 >= 64: 85 | i1 -= 64 86 | i2 -= 64 87 | unicode_start = U12a_1_1_start 88 | 89 | return unichr(unicode_start + (i1 << 6) + i2) 90 | 91 | def unichr_08_from_int(i): 92 | if i > (U8_start + U8_length): 93 | print "-- unichr_08_from_int: 0x%x" % i 94 | raise ValueError 95 | 96 | return unichr(U8_start + i) 97 | 98 | def unichr_12_from_int(i): 99 | if i > (U12b_start + U12b_length): 100 | print "-- unichr_12_from_int: 0x%x" % i 101 | raise ValueError 102 | 103 | return unichr(U12b_start + i) 104 | 105 | def int_from_u12a(u): 106 | 107 | i = ord(u) 108 | 109 | if i < U12a_start or i > (U12a_start + U12a_length): 110 | print "-- int_from_u12a: %c" % u 111 | raise ValueError 112 | 113 | return i - U12a_start 114 | 115 | def int_from_u08b(u): 116 | 117 | i = ord(u) 118 | 119 | if i < U8_start or i > (U8_start + U8_length): 120 | print "-- int_from_u08b: %c" % u 121 | raise ValueError 122 | 123 | return i - U8_start 124 | 125 | def two_bytes_from_u12a(u): 126 | i1 = None 127 | i2 = None 128 | unicode_start = None 129 | 130 | i = ord(u) 131 | for start in (U12a_0_0_start, U12a_0_1_start, U12a_1_0_start, U12a_1_1_start): 132 | if i >= start and i < (start + U12a_length): 133 | unicode_start = start 134 | break 135 | 136 | if not unicode_start: 137 | print "-- two_bytes_from_u12a ord=0x%02x" % ord(u) 138 | raise ValueError 139 | 140 | value = i - unicode_start 141 | b0 = (value & 0xFC0) >> 6 142 | b1 = i & 0x3F 143 | 144 | if unicode_start == U12a_0_0_start: 145 | pass 146 | elif unicode_start == U12a_0_1_start: 147 | b1 += 64 148 | elif unicode_start == U12a_1_0_start: 149 | b0 += 64 150 | elif unicode_start == U12a_1_1_start: 151 | b0 += 64 152 | b1 += 64 153 | 154 | return (b0, b1) 155 | 156 | def int_from_u12b(u): 157 | 158 | i = ord(u) 159 | 160 | if i < U12b_start or i > (U12b_start + U12b_length): 161 | print "-- int_from_u12b: %c" % u 162 | raise ValueError 163 | 164 | return i - U12b_start 165 | 166 | 
def two_twelve_bits_values_from_three_bytes(a, b, c):
    """Split three byte values into two 12-bit values.

    (0x12, 0x34, 0x56) -> (0x123, 0x456)

    Raises ValueError when an argument is outside 0..0xFF.
    """
    for value in (a, b, c):
        if not 0 <= value <= 0xFF:
            raise ValueError("not a byte value: %r" % (value,))

    s1 = (a << 4) + (b >> 4)
    s2 = ((b & 0xF) << 8) + c

    return (s1, s2)


def three_bytes_from_two_twelve_bits_values(i1, i2):
    """Merge two 12-bit values back into three byte values.

    (0x123, 0x456) -> (0x12, 0x34, 0x56)

    Raises ValueError when an argument is outside 0..0xFFF.
    """
    for value in (i1, i2):
        if not 0 <= value <= 0xFFF:
            raise ValueError("not a 12-bit value: %r" % (value,))

    b1 = i1 >> 4
    b2 = ((i1 & 0xF) << 4) + ((i2 & 0xF00) >> 8)
    b3 = i2 & 0x0FF

    return (b1, b2, b3)


def number_of_left_instances_from_index(l, index, limit=None):
    """Count how many consecutive items equal to l[index] start at index.

    The run is scanned towards the end of l. Returns 0 when index is at or
    past the end. When limit is given the scan stops after limit items, so
    a caller that caps the run length anyway does not rescan a very long
    run on every call.
    """
    end = len(l)
    if index >= end:
        return 0
    if limit is not None:
        end = min(end, index + limit)

    first = l[index]
    i = index
    while i < end and l[i] == first:
        i += 1

    return i - index


def three_bytes_from_unichars(u1, u2):
    """Decode a (U12b, U12b) pair into a tuple of three byte values."""
    return three_bytes_from_two_twelve_bits_values(int_from_u12b(u1), int_from_u12b(u2))


def repeated_bytes_from_unichars(u1, u2):
    """Decode a (U8, U12b) pair into a list holding the byte n times."""
    b = int_from_u08b(u1)
    n = int_from_u12b(u2)
    return [b] * n


def two_bytes_from_unichars(u1, u2):
    """Decode a (U8, U8) pair into a tuple of two byte values."""
    return (int_from_u08b(u1), int_from_u08b(u2))


def is_in_U8a(u):
    """Tell whether u lies in the 8-bit block.

    The module defines a single 8-bit block (U8_start / U8_length), so this
    is the same test as is_in_U8b.
    """
    i = ord(u)
    return U8_start <= i < U8_start + U8_length


def is_in_U12a(u):
    """Tell whether u lies in one of the four U12a (two ascii chars) sub-blocks."""
    i = ord(u)

    for unicode_start in (U12a_0_0_start, U12a_0_1_start, U12a_1_0_start, U12a_1_1_start):
        if unicode_start <= i < unicode_start + U12a_length:
            return True

    return False


def is_in_U8b(u):
    """Tell whether u lies in the U8 (one byte) block."""
    i = ord(u)
    return U8_start <= i < U8_start + U8_length


def is_in_U12b(u):
    """Tell whether u lies in the U12b (12 arbitrary bits) block."""
    i = ord(u)
    return U12b_start <= i < U12b_start + U12b_length


def bytes_from_u1_u2(u1, u2):
    """Decode a pair of characters into the byte values it stands for.

    (U12b, U12b) -> tuple of three bytes
    (U8, U12b)   -> list with one byte repeated
    (U8, U8)     -> tuple of two bytes

    Raises ValueError for any other combination.
    """
    u1_in_U8 = is_in_U8b(u1)
    u2_in_U12 = is_in_U12b(u2)

    if is_in_U12b(u1) and u2_in_U12:
        return three_bytes_from_unichars(u1, u2)
    if u1_in_U8 and u2_in_U12:
        return repeated_bytes_from_unichars(u1, u2)
    if u1_in_U8 and is_in_U8b(u2):
        return two_bytes_from_unichars(u1, u2)

    # The diagnostic travels with the exception: stdout carries decoded data.
    raise ValueError("cannot decode pair 0x%x 0x%x" % (ord(u1), ord(u2)))


def _byte_value(item):
    """Return the integer value of one item of a byte sequence.

    Indexing a byte string gives a one-character string on Python 2 and an
    integer on Python 3; both are accepted.
    """
    return item if isinstance(item, int) else ord(item)


def gen_encode_unichars_from_bytes(bytes):
    """Yield the Unicode characters encoding a byte string.

    Every yielded item is either a single character or a tuple of two
    characters; u''.join(item) gives the text for both shapes.

    - a run of 3 or more equal bytes (at most 0xFFF per step) -> (U8, U12b)
    - two 7-bit ascii bytes -> one U12a character
    - any three bytes -> (U12b, U12b)
    - a leftover byte -> one U8 character
    """
    data = bytes  # the parameter name is part of the interface; alias it locally
    total = len(data)
    i = 0

    while i < total:
        run = number_of_left_instances_from_index(data, i, 0xFFF)

        if run >= 3:
            # N equal bytes, 3 <= N <= 0xFFF, encoded as 2 unichars
            yield two_unichr_to_repeat_byte_ntimes(_byte_value(data[i]), run)
            i += run
            continue

        first = _byte_value(data[i])
        remaining = total - i

        if remaining >= 2:
            second = _byte_value(data[i + 1])
            if first < 128 and second < 128:
                # read 2 x 7 bits, encode 1 unichar
                yield unichr_12a_from_two_ascii(chr(first), chr(second))
                i += 2
                continue

        if remaining >= 3:
            # read 3 bytes, encode 2 unichars
            third = _byte_value(data[i + 2])
            (s1, s2) = two_twelve_bits_values_from_three_bytes(first, second, third)
            yield (unichr_12_from_int(s1), unichr_12_from_int(s2))
            i += 3
        else:
            # read 1 byte, encode 1 unichar
            yield unichr_08_from_int(first)
            i += 1


def gen_decode_bytes_from_string(s):
    """Yield groups of byte values decoded from an encoded Unicode string.

    Line breaks are ignored wherever they appear, including after the last
    character. Each yielded item is a tuple or list of integers.

    Exits the process with status 1 when a trailing character cannot be
    decoded on its own; undecodable pairs raise ValueError.
    """
    # Line breaks never carry data: none of the blocks contains them.
    text = s.replace('\n', '').replace('\r', '')
    total = len(text)
    i = 0

    while i < total:
        u1 = text[i]

        if is_in_U12a(u1):
            # 1 U12a -> read 2 ascii characters
            yield two_bytes_from_u12a(u1)
            i += 1
        elif i + 1 < total:
            # (U12b, U12b) -> read 3 bytes
            # (U8b, U12b) -> read repetition
            # (U8b, U8b) -> read 1 byte, 1 byte
            yield bytes_from_u1_u2(u1, text[i + 1])
            i += 2
        elif is_in_U8b(u1):
            # 1 U8b -> read 1 byte
            yield (int_from_u08b(u1),)
            i += 1
        else:
            # stderr, not stdout: stdout carries the decoded binary data
            sys.stderr.write("-- cannot decode trailing character 0x%x\n" % ord(u1))
            sys.exit(1)


# main

def print_decoded_string(s):
    """Decode s and write the resulting raw bytes to standard output."""
    # Python 3 exposes the binary stream as sys.stdout.buffer; on Python 2
    # sys.stdout accepts byte strings directly.
    out = getattr(sys.stdout, 'buffer', sys.stdout)

    for chunk in gen_decode_bytes_from_string(s):
        out.write(bytearray(chunk).decode('latin-1').encode('latin-1'))


def _write_encoded(data):
    """Encode the byte string data and write it to stdout as UTF-8."""
    writer = codecs.getwriter('utf-8')(getattr(sys.stdout, 'buffer', sys.stdout))

    for unichars in gen_encode_unichars_from_bytes(data):
        writer.write(u''.join(unichars))

    writer.flush()
    sys.stdout.flush()
    # The line break goes to stderr so that it never ends up in the output.
    sys.stderr.write('\n')


def main(argv=None):
    """Command-line entry point; argv defaults to sys.argv[1:]."""
    parser = argparse.ArgumentParser(description='UniBinary encodes and decodes data into printable Unicode characters.')
    parser.add_argument('-e','--encode', help='file to encode')
    parser.add_argument('-d','--decode', help='file to decode')
    parser.add_argument('-es','--encode_string', help='utf-8 string to encode')
    parser.add_argument('-ds','--decode_string', help='utf-8 string to decode')
    args = vars(parser.parse_args(argv))

    if args['encode']:
        with open(args['encode'], "rb") as f:
            data = f.read()

        _write_encoded(data)

    elif args['decode']:
        with codecs.open(args['decode'], "r", encoding='utf-8') as f:
            s = f.read()

        print_decoded_string(s)

        sys.stdout.flush()
        sys.stderr.write('\n')

    elif args['encode_string']:
        data = args['encode_string']
        if not isinstance(data, type(b'')):
            # Python 3 passes arguments as text; encode what the user typed.
            data = data.encode('utf-8')

        _write_encoded(data)

    elif args['decode_string']:
        s = args['decode_string']
        if isinstance(s, type(b'')):
            # Python 2 passes arguments as utf-8 byte strings.
            s = s.decode('utf-8')

        print_decoded_string(s)

        sys.stdout.flush()
        sys.stderr.write('\n')

    else:
        parser.print_help()


if __name__ == '__main__':
    main()