├── .github └── workflows │ └── build.yml ├── LICENSE ├── README.md ├── encode.nimble ├── src └── encode.nim └── tests ├── config.nims ├── test_encode.nim ├── utf16be.txt ├── utf16le.txt └── utf8.txt /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Github Actions 2 | on: [push, pull_request] 3 | jobs: 4 | build: 5 | strategy: 6 | fail-fast: false 7 | matrix: 8 | os: [ubuntu-latest, windows-latest] 9 | 10 | runs-on: ${{ matrix.os }} 11 | 12 | steps: 13 | - uses: actions/checkout@v2 14 | - uses: jiro4989/setup-nim-action@v1 15 | - run: nimble test -y 16 | - run: nimble test --gc:orc -y 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Andre von Houck 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Encode 2 | 3 | `nimble install encode` 4 | 5 | ![Github Actions](https://github.com/treeform/encode/workflows/Github%20Actions/badge.svg) 6 | 7 | [API reference](https://nimdocs.com/treeform/encode) 8 | 9 | This library has no dependencies other than the Nim standard library. 10 | 11 | ## About 12 | 13 | This is a "pure" module with a stricter API focused on utf8, utf16, and utf32. 14 | It aims to be a better version of the `encodings` module 15 | because it: 16 | * Does not depend on `libiconv` or the Windows API. 17 | * Supports UTF16 big endian on Windows. 18 | * Has a clearer API. 19 | -------------------------------------------------------------------------------- /encode.nimble: -------------------------------------------------------------------------------- 1 | # Package 2 | version = "0.1.0" 3 | author = "Andre von Houck" 4 | description = "Encode/decode utf8 utf16 and utf32." 
5 | license = "MIT" 6 | 7 | srcDir = "src" 8 | 9 | # Deps 10 | requires "nim >= 1.0.0" 11 | -------------------------------------------------------------------------------- /src/encode.nim: --------------------------------------------------------------------------------
import unicode

const
  tofu = "□"             ## Placeholder (U+25A1) emitted in place of input that cannot be decoded.
  tofuUnit = 0x25A1'u16  ## The same placeholder expressed as a single UTF-16 code unit.

func maybeSwap(u: uint16, swap: bool): uint16 =
  ## Returns `u` with its two bytes exchanged when `swap` is true,
  ## otherwise returns `u` unchanged.
  if swap:
    ((u and 0xFF) shl 8) or ((u and 0xFF00) shr 8)
  else:
    u

proc readUInt16(s: string, i: int): uint16 =
  ## Reads the two bytes at offsets `i` and `i + 1` as a little-endian uint16.
  ## The caller must guarantee `i + 1 < s.len`.
  s[i].uint16 or (s[i + 1].uint16 shl 8)

proc addUInt16(s: var string, v: uint16) =
  ## Appends `v` to `s` as two bytes, least significant byte first.
  s.add(char(v and 0xFF))
  s.add(char(v shr 8))

proc readUInt32(s: string, i: int): uint32 =
  ## Reads the four bytes at offsets `i .. i + 3` as a little-endian uint32.
  ## The caller must guarantee `i + 3 < s.len`.
  s[i].uint32 or
    (s[i + 1].uint32 shl 8) or
    (s[i + 2].uint32 shl 16) or
    (s[i + 3].uint32 shl 24)

proc addUInt32(s: var string, v: uint32) =
  ## Appends `v` to `s` as four bytes, least significant byte first.
  s.add(char(v and 0xFF))
  s.add(char((v shr 8) and 0xFF))
  s.add(char((v shr 16) and 0xFF))
  s.add(char(v shr 24))

proc toUTF16Inner(input: string, swap: bool, bom: bool): string =
  ## Converts UTF8 `input` to UTF16.
  ##
  ## Code units are written little-endian unless `swap` is true, in which case
  ## every unit (including the byte order mark) is byte-swapped to big-endian.
  ## When `bom` is true the output starts with the byte order mark U+FEFF.
  if bom:
    result.addUInt16(0xFEFF'u16.maybeSwap(swap))
  for r in input.runes:
    let u = r.uint32
    if u <= 0xD7FF or (u >= 0xE000 and u <= 0xFFFF):
      # Basic Multilingual Plane: one code unit.
      result.addUInt16(u.uint16.maybeSwap(swap))
    elif u >= 0x10000 and u <= 0x10FFFF:
      # Supplementary plane: split the 20-bit offset into a surrogate pair.
      let offset = u - 0x10000
      result.addUInt16((0xD800 + (offset shr 10)).uint16.maybeSwap(swap))
      result.addUInt16((0xDC00 + (offset and 0x3FF)).uint16.maybeSwap(swap))
    else:
      # Surrogate code points and values above U+10FFFF have no UTF16 form;
      # emit the placeholder instead of silently dropping the character.
      result.addUInt16(tofuUnit.maybeSwap(swap))

proc toUTF16LE*(input: string): string =
  ## Converts UTF8 to UTF16 LE string.
  toUTF16Inner(input, false, false)

proc toUTF16BE*(input: string): string =
  ## Converts UTF8 to UTF16 BE string.
  toUTF16Inner(input, true, false)

proc toUTF16LEWithBom*(input: string): string =
  ## Converts UTF8 to UTF16 LE with byte order mark string.
  toUTF16Inner(input, false, true)

proc toUTF16BEWithBom*(input: string): string =
  ## Converts UTF8 to UTF16 BE with byte order mark string.
  toUTF16Inner(input, true, true)

proc fromUTF16Inner(input: string, i: var int, swap: bool): string =
  ## Converts UTF16 bytes of `input`, starting at byte offset `i`, to a UTF8
  ## string. Code units are read little-endian, or big-endian when `swap` is
  ## true. On return `i` equals `input.len`.
  ##
  ## Malformed input never raises: an unpaired surrogate or a dangling odd
  ## byte at the end produces one tofu character ("□").
  while i < input.len:
    if i + 1 >= input.len:
      # A single leftover byte cannot form a code unit.
      result.add tofu
      i = input.len
      break
    let u1 = input.readUInt16(i).maybeSwap(swap)
    i += 2
    if u1 < 0xD800 or u1 > 0xDFFF:
      result.add Rune(u1.int)
    else:
      var paired = false
      # Only a high surrogate followed by a complete low surrogate is valid.
      if u1 <= 0xDBFF and i + 1 < input.len:
        let u2 = input.readUInt16(i).maybeSwap(swap)
        if (u2 and 0xFC00) == 0xDC00:
          i += 2
          # (u1 shl 10) + u2 - ((0xD800 shl 10) + 0xDC00 - 0x10000)
          result.add Rune((u1.uint32 shl 10) + u2.uint32 - 0x35FDC00)
          paired = true
      if not paired:
        # Error, produce tofu character. The following code unit (if any) is
        # left in place so it is decoded on its own in the next iteration.
        result.add tofu

proc fromUTF16*(input: string): string =
  ## Converts UTF16 trying to read byte order marker to UTF8 string.
  ## Without a byte order mark the input is treated as little-endian.
  ## Inputs shorter than two bytes are handled without raising.
  var
    i = 0
    swap = false
  # Deal with Byte Order Mark (only present if there are at least two bytes).
  if input.len >= 2:
    let bom = input.readUInt16(0)
    if bom == 0xFEFF:
      i = 2
    elif bom == 0xFFFE:
      swap = true
      i = 2
  input.fromUTF16Inner(i, swap)

proc fromUTF16BE*(input: string): string =
  ## Converts UTF16 Big Endian to UTF8 string.
  var i = 0
  input.fromUTF16Inner(i, true)

proc fromUTF16LE*(input: string): string =
  ## Converts UTF16 Little Endian to UTF8 string.
  var i = 0
  input.fromUTF16Inner(i, false)

proc toUTF32*(input: string): string =
  ## Converts UTF8 string to utf32 (little-endian, no byte order mark).
  for r in input.runes:
    result.addUInt32(r.uint32)

proc fromUTF32*(input: string): string =
  ## Converts utf32 (little-endian) to UTF8 string.
  ##
  ## Malformed input never raises: a value that is a surrogate or above
  ## U+10FFFF, or 1-3 leftover bytes at the end, produce one tofu
  ## character ("□").
  var i = 0
  while i + 4 <= input.len:
    let u = input.readUInt32(i)
    i += 4
    if u <= 0x10FFFF and (u < 0xD800 or u > 0xDFFF):
      result.add Rune(u)
    else:
      result.add tofu
  if i < input.len:
    # Trailing bytes that do not make up a whole 32-bit unit.
    result.add tofu
-------------------------------------------------------------------------------- /tests/config.nims: -------------------------------------------------------------------------------- 1 | --path:"../src" 2 | -------------------------------------------------------------------------------- /tests/test_encode.nim: -------------------------------------------------------------------------------- 1 | import encode 2 | 3 | const 4 | utf8 = readFile("tests/utf8.txt") 5 | utf16be = readFile("tests/utf16be.txt") 6 | utf16le = readFile("tests/utf16le.txt") 7 | utf32 = "h\0\0\0i\0\0\0 \0\0\0t\0\0\0h\0\0\0e\0\0\0r\0\0\0e\0\0\0 \0\0\0h\0\0\0o\0\0\0w\0\0\0 \0\0\0a\0\0\0r\0\0\0e\0\0\0 \0\0\0y\0\0\0o\0\0\0u\0\0\0?\0\0\0" 8 | 9 | block: 10 | echo "UTF 16" 11 | assert utf16be.fromUTF16() == utf16le.fromUTF16() 12 | assert utf16be.fromUTF16() == utf8 13 | assert utf16le.fromUTF16() == utf8 14 | 15 | block: 16 | echo "UTF 32" 17 | assert "hi there how are you?".toUTF32() == utf32 18 | assert "hi there how are you?" 
== utf32.fromUTF32() 19 | assert utf8.toUTF32().fromUTF32() == utf8 20 | 21 | block: 22 | echo "UTF 16 LE" 23 | assert utf8.toUTF16LE().fromUTF16LE() == utf8 24 | 25 | block: 26 | echo "UTF 16 BE" 27 | assert utf8.toUTF16BE().fromUTF16BE() == utf8 28 | 29 | block: 30 | echo "UTF 16 BE with Bom" 31 | assert utf8.toUTF16BEWithBom() == utf16be 32 | 33 | block: 34 | echo "UTF 16 LE with Bom" 35 | assert utf8.toUTF16LEWithBom() == utf16le -------------------------------------------------------------------------------- /tests/utf16be.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treeform/encode/4e0dbbb13f4fc7feee9b9ff35c1ca3c2d1894fa9/tests/utf16be.txt -------------------------------------------------------------------------------- /tests/utf16le.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treeform/encode/4e0dbbb13f4fc7feee9b9ff35c1ca3c2d1894fa9/tests/utf16le.txt -------------------------------------------------------------------------------- /tests/utf8.txt: -------------------------------------------------------------------------------- 1 | hi there this is in utf16 2 | 乾隆己酉夏,以編排秘籍 3 | и миндаль составляют его доход 4 | ∑(23 ÷ d) ↔ ∂x 5 | 𐅆𐅇𐅈𐅉𐅊𐅋 6 | 𠀖𠀗 7 | 🂡🂢🂣🂤 8 | 🧥👚👕👖 --------------------------------------------------------------------------------