├── .gitignore ├── elm.json ├── LICENSE ├── README.md └── src ├── Bytes.elm ├── Elm └── Kernel │ └── Bytes.js └── Bytes ├── Encode.elm └── Decode.elm /.gitignore: -------------------------------------------------------------------------------- 1 | elm-stuff 2 | -------------------------------------------------------------------------------- /elm.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "package", 3 | "name": "elm/bytes", 4 | "summary": "Work with sequences of bytes (a.k.a. ArrayBuffer, typed arrays, DataView)", 5 | "license": "BSD-3-Clause", 6 | "version": "1.0.8", 7 | "exposed-modules": [ 8 | "Bytes", 9 | "Bytes.Encode", 10 | "Bytes.Decode" 11 | ], 12 | "elm-version": "0.19.0 <= v < 0.20.0", 13 | "dependencies": { 14 | "elm/core": "1.0.1 <= v < 2.0.0" 15 | }, 16 | "test-dependencies": {} 17 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018-present, Evan Czaplicki 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of Evan Czaplicki nor the names of other 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bytes 2 | 3 | Work with densely packed sequences of bytes. 4 | 5 | The goal of this package is to support **network protocols** such as ProtoBuf. Or to put it another way, the goal is to have packages like `elm/http` send fewer bytes over the wire. 6 | 7 | 8 | ## Motivation = [A vision for data interchange in Elm](https://gist.github.com/evancz/1c5f2cf34939336ecb79b97bb89d9da6) 9 | 10 | Please read it! 11 | 12 | 13 | ## Example 14 | 15 | This package lets you create encoders and decoders for working with sequences of bytes. 
Here is an example for converting between `Point` and `Bytes` values: 16 | 17 | ```elm 18 | import Bytes exposing (Endianness(..)) 19 | import Bytes.Encode as Encode exposing (Encoder) 20 | import Bytes.Decode as Decode exposing (Decoder) 21 | 22 | 23 | -- POINT 24 | 25 | type alias Point = 26 | { x : Float 27 | , y : Float 28 | , z : Float 29 | } 30 | 31 | toPointEncoder : Point -> Encoder 32 | toPointEncoder point = 33 | Encode.sequence 34 | [ Encode.float32 BE point.x 35 | , Encode.float32 BE point.y 36 | , Encode.float32 BE point.z 37 | ] 38 | 39 | pointDecoder : Decoder Point 40 | pointDecoder = 41 | Decode.map3 Point 42 | (Decode.float32 BE) 43 | (Decode.float32 BE) 44 | (Decode.float32 BE) 45 | ``` 46 | 47 | Rather than writing this by hand in client or server code, the hope is that folks implement things like ProtoBuf compilers for Elm. 48 | 49 | Again, the overall plan is described in [**A vision for data interchange in Elm**](https://gist.github.com/evancz/1c5f2cf34939336ecb79b97bb89d9da6)! 50 | 51 | 52 | ## Scope 53 | 54 | **This API is not intended to work like `Int8Array` or `Uint16Array` in JavaScript.** If you have a concrete scenario in which you want to interpret bytes as densely packed arrays of integers or floats, please describe it on [https://discourse.elm-lang.org/](https://discourse.elm-lang.org/) in a friendly and high-level way. What is the project about? What do densely packed arrays do for that project? Is it about perf? What kind of algorithms are you using? Etc. 55 | 56 | If some scenarios require the mutation of entries in place, special care will be required in designing a nice API. All values in Elm are immutable, so the particular API that works well for us will probably depend a lot on the particulars of what folks are trying to do. 
57 | -------------------------------------------------------------------------------- /src/Bytes.elm: -------------------------------------------------------------------------------- 1 | module Bytes exposing 2 | ( Bytes 3 | , width 4 | , Endianness(..) 5 | , getHostEndianness 6 | ) 7 | 8 | 9 | {-| 10 | 11 | # Bytes 12 | @docs Bytes, width 13 | 14 | # Endianness 15 | @docs Endianness, getHostEndianness 16 | 17 | -} 18 | 19 | 20 | import Elm.Kernel.Bytes 21 | import Task exposing (Task) 22 | 23 | 24 | -- BYTES 25 | 26 | 27 | {-| A sequence of bytes. 28 | 29 | A byte is a chunk of eight bits. For example, the letter `j` is usually 30 | represented as the byte `01101010`, and the letter `k` is `01101011`. 31 | 32 | Seeing each byte as a stream of zeros and ones can be quite confusing though, 33 | so it is common to use hexadecimal numbers instead: 34 | 35 | ``` 36 | | Binary | Hex | 37 | +--------+-----+ 38 | | 0000 | 0 | 39 | | 0001 | 1 | 40 | | 0010 | 2 | 41 | | 0011 | 3 | j = 01101010 42 | | 0100 | 4 | \__/\__/ 43 | | 0101 | 5 | | | 44 | | 0110 | 6 | 6 A 45 | | 0111 | 7 | 46 | | 1000 | 8 | k = 01101011 47 | | 1001 | 9 | \__/\__/ 48 | | 1010 | A | | | 49 | | 1011 | B | 6 B 50 | | 1100 | C | 51 | | 1101 | D | 52 | | 1110 | E | 53 | | 1111 | F | 54 | ``` 55 | 56 | So `j` is `6A` and `k` is `6B` in hexadecimal. This more compact representation 57 | is great when you have a sequence of bytes. You can see this even in a short 58 | string like `"jazz"`: 59 | 60 | ``` 61 | binary hexadecimal 62 | 01101010 01100001 01111010 01111010 => 6A 61 7A 7A 63 | ``` 64 | 65 | Anyway, the point is that `Bytes` is a sequence of bytes! 66 | -} 67 | type Bytes = Bytes 68 | 69 | 70 | {-| Get the width of a sequence of bytes. 71 | 72 | So if a sequence has four-hundred bytes, then `width bytes` would give back 73 | `400`. That may be 400 unsigned 8-bit integers, 100 signed 32-bit integers, or 74 | even a UTF-8 string. The content does not matter. 
This is just figuring out 75 | how many bytes there are! 76 | -} 77 | width : Bytes -> Int 78 | width = 79 | Elm.Kernel.Bytes.width 80 | 81 | 82 | 83 | -- ENDIANNESS 84 | 85 | 86 | {-| Different computers store integers and floats slightly differently in 87 | memory. Say we have the integer `0x1A2B3C4D` in our program. It needs four 88 | bytes (32 bits) in memory. It may seem reasonable to lay them out in order: 89 | 90 | ``` 91 | Big-Endian (BE) (Obvious Order) 92 | +----+----+----+----+ 93 | | 1A | 2B | 3C | 4D | 94 | +----+----+----+----+ 95 | ``` 96 | 97 | But some people thought it would be better to store the bytes in the opposite 98 | order: 99 | 100 | ``` 101 | Little-Endian (LE) (Shuffled Order) 102 | +----+----+----+----+ 103 | | 4D | 3C | 2B | 1A | 104 | +----+----+----+----+ 105 | ``` 106 | 107 | Notice that **the _bytes_ are shuffled, not the bits.** It is like if you cut a 108 | photo into four strips and shuffled the strips. It is not a mirror image. 109 | The theory seems to be that an 8-bit `0x1A` and a 32-bit `0x0000001A` both have 110 | `1A` as the first byte in this scheme. Maybe this was helpful when processors 111 | handled one byte at a time. 112 | 113 | **Most processors use little-endian (LE) layout.** This seems to be because 114 | Intel did it this way, and other chip manufacturers followed their convention. 115 | **Most network protocols use big-endian (BE) layout.** I suspect this is 116 | because if you are trying to debug a network protocol, it is nice if your 117 | integers are not all shuffled. 118 | 119 | **Note:** Endianness is relevant for integers and floats, but not strings. 120 | UTF-8 specifies the order of bytes explicitly. 121 | 122 | **Note:** The terms little-endian and big-endian are a reference to an egg joke 123 | in Gulliver's Travels. They first appeared in 1980 in [this essay][essay], and 124 | you can decide for yourself if they stood the test of time. 
I personally find 125 | these terms quite unhelpful, so I say “Obvious Order” and “Shuffled Order” in 126 | my head. I remember which is more common by asking myself, “if things were 127 | obvious, would I have to ask this question?” 128 | 129 | [essay]: http://www.ietf.org/rfc/ien/ien137.txt 130 | -} 131 | type Endianness = LE | BE 132 | 133 | 134 | {-| Is this program running on a big-endian or little-endian machine? 135 | -} 136 | getHostEndianness : Task x Endianness 137 | getHostEndianness = 138 | Elm.Kernel.Bytes.getHostEndianness LE BE 139 | -------------------------------------------------------------------------------- /src/Elm/Kernel/Bytes.js: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | import Bytes.Encode as Encode exposing (getWidth, write) 4 | import Elm.Kernel.Scheduler exposing (binding, succeed) 5 | import Elm.Kernel.Utils exposing (Tuple2, chr) 6 | import Maybe exposing (Just, Nothing) 7 | 8 | */ 9 | 10 | // BYTES 11 | 12 | function _Bytes_width(bytes) 13 | { 14 | return bytes.byteLength; 15 | } 16 | 17 | var _Bytes_getHostEndianness = F2(function(le, be) 18 | { 19 | return __Scheduler_binding(function(callback) 20 | { 21 | callback(__Scheduler_succeed(new Uint8Array(new Uint32Array([1]))[0] === 1 ? 
le : be)); 22 | }); 23 | }); 24 | 25 | 26 | // ENCODERS 27 | /* Builds the final bytes: allocates a DataView over a fresh ArrayBuffer sized by __Encode_getWidth(encoder), then has __Encode_write fill it starting at offset 0. */ 28 | function _Bytes_encode(encoder) 29 | { 30 | var mutableBytes = new DataView(new ArrayBuffer(__Encode_getWidth(encoder))); 31 | __Encode_write(encoder)(mutableBytes)(0); 32 | return mutableBytes; 33 | } 34 | 35 | 36 | // SIGNED INTEGERS 37 | 38 | var _Bytes_write_i8 = F3(function(mb, i, n) { mb.setInt8(i, n); return i + 1; }); 39 | var _Bytes_write_i16 = F4(function(mb, i, n, isLE) { mb.setInt16(i, n, isLE); return i + 2; }); 40 | var _Bytes_write_i32 = F4(function(mb, i, n, isLE) { mb.setInt32(i, n, isLE); return i + 4; }); 41 | 42 | 43 | // UNSIGNED INTEGERS 44 | 45 | var _Bytes_write_u8 = F3(function(mb, i, n) { mb.setUint8(i, n); return i + 1 ;}); 46 | var _Bytes_write_u16 = F4(function(mb, i, n, isLE) { mb.setUint16(i, n, isLE); return i + 2; }); 47 | var _Bytes_write_u32 = F4(function(mb, i, n, isLE) { mb.setUint32(i, n, isLE); return i + 4; }); 48 | 49 | 50 | // FLOATS 51 | 52 | var _Bytes_write_f32 = F4(function(mb, i, n, isLE) { mb.setFloat32(i, n, isLE); return i + 4; }); 53 | var _Bytes_write_f64 = F4(function(mb, i, n, isLE) { mb.setFloat64(i, n, isLE); return i + 8; }); 54 | 55 | 56 | // BYTES 57 | /* Copies every byte of `bytes` into `mb` starting at `offset` and returns the offset just past the copy. The first loop moves four bytes per step while at least four remain (getUint32 and setUint32 are both called without an endianness flag, so the byte order is preserved); the second loop moves the remaining 0-3 bytes one at a time. */ 58 | var _Bytes_write_bytes = F3(function(mb, offset, bytes) 59 | { 60 | for (var i = 0, len = bytes.byteLength, limit = len - 4; i <= limit; i += 4) 61 | { 62 | mb.setUint32(offset + i, bytes.getUint32(i)); 63 | } 64 | for (; i < len; i++) 65 | { 66 | mb.setUint8(offset + i, bytes.getUint8(i)); 67 | } 68 | return offset + len; 69 | }); 70 | 71 | 72 | // STRINGS 73 | /* Counts the UTF-8 bytes needed for a JS string by walking its UTF-16 code units: 1 byte below 0x80, 2 below 0x800, 3 for other units outside 0xD800-0xDBFF, and 4 for a unit inside that range, in which case the following unit is skipped (i++) as the second half of the pair. */ 74 | function _Bytes_getStringWidth(string) 75 | { 76 | for (var width = 0, i = 0; i < string.length; i++) 77 | { 78 | var code = string.charCodeAt(i); 79 | width += 80 | (code < 0x80) ? 1 : 81 | (code < 0x800) ? 2 : 82 | (code < 0xD800 || 0xDBFF < code) ? 
3 : (i++, 4); 83 | } 84 | return width; 85 | } 86 | /* Writes `string` into `mb` as UTF-8 starting at `offset` and returns the offset just past the last byte written. The branches mirror _Bytes_getStringWidth: one, two or three bytes per code unit, or four bytes for a unit in 0xD800-0xDBFF, which is first combined with the next unit (consumed by ++i) into a single code point. setUint16 and setUint32 are called without an endianness flag, so the lead byte of each sequence is written first. */ 87 | var _Bytes_write_string = F3(function(mb, offset, string) 88 | { 89 | for (var i = 0; i < string.length; i++) 90 | { 91 | var code = string.charCodeAt(i); 92 | offset += 93 | (code < 0x80) 94 | ? (mb.setUint8(offset, code) 95 | , 1 96 | ) 97 | : 98 | (code < 0x800) 99 | ? (mb.setUint16(offset, 0xC080 /* 0b1100000010000000 */ 100 | | (code >>> 6 & 0x1F /* 0b00011111 */) << 8 101 | | code & 0x3F /* 0b00111111 */) 102 | , 2 103 | ) 104 | : 105 | (code < 0xD800 || 0xDBFF < code) 106 | ? (mb.setUint16(offset, 0xE080 /* 0b1110000010000000 */ 107 | | (code >>> 12 & 0xF /* 0b00001111 */) << 8 108 | | code >>> 6 & 0x3F /* 0b00111111 */) 109 | , mb.setUint8(offset + 2, 0x80 /* 0b10000000 */ 110 | | code & 0x3F /* 0b00111111 */) 111 | , 3 112 | ) 113 | : 114 | (code = (code - 0xD800) * 0x400 + string.charCodeAt(++i) - 0xDC00 + 0x10000 115 | , mb.setUint32(offset, 0xF0808080 /* 0b11110000100000001000000010000000 */ 116 | | (code >>> 18 & 0x7 /* 0b00000111 */) << 24 117 | | (code >>> 12 & 0x3F /* 0b00111111 */) << 16 118 | | (code >>> 6 & 0x3F /* 0b00111111 */) << 8 119 | | code & 0x3F /* 0b00111111 */) 120 | , 4 121 | ); 122 | } 123 | return offset; 124 | }); 125 | 126 | 127 | // DECODER 128 | /* Runs `decoder` on `bytes` from offset 0 and wraps the `.b` field of the result in Just (presumably the decoded value of the returned (offset, value) tuple -- confirm against the compiled tuple layout). Any exception thrown while decoding is caught and reported as Nothing. */ 129 | var _Bytes_decode = F2(function(decoder, bytes) 130 | { 131 | try { 132 | return __Maybe_Just(A2(decoder, bytes, 0).b); 133 | } catch(e) { 134 | return __Maybe_Nothing; 135 | } 136 | }); 137 | 138 | var _Bytes_read_i8 = F2(function( bytes, offset) { return __Utils_Tuple2(offset + 1, bytes.getInt8(offset)); }); 139 | var _Bytes_read_i16 = F3(function(isLE, bytes, offset) { return __Utils_Tuple2(offset + 2, bytes.getInt16(offset, isLE)); }); 140 | var _Bytes_read_i32 = F3(function(isLE, bytes, offset) { return __Utils_Tuple2(offset + 4, bytes.getInt32(offset, isLE)); }); 141 | var _Bytes_read_u8 = F2(function( bytes, offset) { return __Utils_Tuple2(offset + 1, bytes.getUint8(offset)); }); 142 | var _Bytes_read_u16 = 
F3(function(isLE, bytes, offset) { return __Utils_Tuple2(offset + 2, bytes.getUint16(offset, isLE)); }); 143 | var _Bytes_read_u32 = F3(function(isLE, bytes, offset) { return __Utils_Tuple2(offset + 4, bytes.getUint32(offset, isLE)); }); 144 | var _Bytes_read_f32 = F3(function(isLE, bytes, offset) { return __Utils_Tuple2(offset + 4, bytes.getFloat32(offset, isLE)); }); 145 | var _Bytes_read_f64 = F3(function(isLE, bytes, offset) { return __Utils_Tuple2(offset + 8, bytes.getFloat64(offset, isLE)); }); 146 | 147 | var _Bytes_read_bytes = F3(function(len, bytes, offset) 148 | { 149 | return __Utils_Tuple2(offset + len, new DataView(bytes.buffer, bytes.byteOffset + offset, len)); 150 | }); 151 | 152 | var _Bytes_read_string = F3(function(len, bytes, offset) 153 | { 154 | var string = ''; 155 | var end = offset + len; 156 | for (; offset < end;) 157 | { 158 | var byte = bytes.getUint8(offset++); 159 | string += 160 | (byte < 128) 161 | ? String.fromCharCode(byte) 162 | : 163 | ((byte & 0xE0 /* 0b11100000 */) === 0xC0 /* 0b11000000 */) 164 | ? String.fromCharCode((byte & 0x1F /* 0b00011111 */) << 6 | bytes.getUint8(offset++) & 0x3F /* 0b00111111 */) 165 | : 166 | ((byte & 0xF0 /* 0b11110000 */) === 0xE0 /* 0b11100000 */) 167 | ? 
String.fromCharCode( 168 | (byte & 0xF /* 0b00001111 */) << 12 169 | | (bytes.getUint8(offset++) & 0x3F /* 0b00111111 */) << 6 170 | | bytes.getUint8(offset++) & 0x3F /* 0b00111111 */ 171 | ) 172 | : /* Fallback branch, handled as a four-byte sequence: rebuild the code point from the lead byte and three continuation bytes, subtract 0x10000, and emit it as a UTF-16 surrogate pair. */ 173 | (byte = 174 | ((byte & 0x7 /* 0b00000111 */) << 18 175 | | (bytes.getUint8(offset++) & 0x3F /* 0b00111111 */) << 12 176 | | (bytes.getUint8(offset++) & 0x3F /* 0b00111111 */) << 6 177 | | bytes.getUint8(offset++) & 0x3F /* 0b00111111 */ 178 | ) - 0x10000 179 | , String.fromCharCode(Math.floor(byte / 0x400) + 0xD800, byte % 0x400 + 0xDC00) 180 | ); 181 | } /* NOTE(review): continuation bytes are read with offset++ without comparing against `end`, so a multi-byte sequence that starts just before `end` can leave offset past it -- confirm whether that case should fail instead. */ 182 | return __Utils_Tuple2(offset, string); 183 | }); 184 | /* Always throws; the try/catch in _Bytes_decode turns the exception into Nothing. */ 185 | var _Bytes_decodeFailure = F2(function() { throw 0; }); 186 | -------------------------------------------------------------------------------- /src/Bytes/Encode.elm: -------------------------------------------------------------------------------- 1 | module Bytes.Encode exposing 2 | ( encode 3 | , Encoder 4 | , signedInt8, signedInt16, signedInt32 5 | , unsignedInt8, unsignedInt16, unsignedInt32 6 | , float32, float64 7 | , bytes 8 | , string, getStringWidth 9 | , sequence 10 | ) 11 | 12 | 13 | {-| 14 | 15 | # Encoders 16 | @docs encode, Encoder, sequence 17 | 18 | # Integers 19 | @docs signedInt8, signedInt16, signedInt32, 20 | unsignedInt8, unsignedInt16, unsignedInt32 21 | 22 | # Floats 23 | @docs float32, float64 24 | 25 | # Bytes 26 | @docs bytes 27 | 28 | # Strings 29 | @docs string, getStringWidth 30 | 31 | -} 32 | 33 | 34 | import Bytes exposing (Bytes, Endianness(..)) 35 | 36 | 37 | 38 | -- ENCODER 39 | 40 | 41 | {-| Describes how to generate a sequence of bytes. 42 | 43 | These encoders snap together with [`sequence`](#sequence) so you can start with 44 | small building blocks and put them together into a more complex encoding. 
45 | -} 46 | type Encoder 47 | = I8 Int 48 | | I16 Endianness Int 49 | | I32 Endianness Int 50 | | U8 Int 51 | | U16 Endianness Int 52 | | U32 Endianness Int 53 | | F32 Endianness Float 54 | | F64 Endianness Float 55 | | Seq Int (List Encoder) 56 | | Utf8 Int String 57 | | Bytes Bytes 58 | 59 | 60 | 61 | -- ENCODE 62 | 63 | 64 | {-| Turn an `Encoder` into `Bytes`. 65 | 66 | encode (unsignedInt8 7) -- <07> 67 | encode (unsignedInt16 BE 7) -- <0007> 68 | encode (unsignedInt16 LE 7) -- <0700> 69 | 70 | The `encode` function is designed to minimize allocation. It figures out the 71 | exact width necessary to fit everything in `Bytes` and then generate that 72 | value directly. This is valuable when you are encoding more elaborate data: 73 | 74 | import Bytes exposing (Endianness(..)) 75 | import Bytes.Encode as Encode 76 | 77 | type alias Person = 78 | { age : Int 79 | , name : String 80 | } 81 | 82 | toEncoder : Person -> Encode.Encoder 83 | toEncoder person = 84 | Encode.sequence 85 | [ Encode.unsignedInt16 BE person.age 86 | , Encode.unsignedInt16 BE (Encode.getStringWidth person.name) 87 | , Encode.string person.name 88 | ] 89 | 90 | -- encode (toEncoder (Person 33 "Tom")) == <00210003546F6D> 91 | 92 | Did you know it was going to be seven bytes? How about when you have a hundred 93 | people to serialize? And when some have Japanese and Norwegian names? Having 94 | this intermediate `Encoder` can help reduce allocation quite a lot! 95 | -} 96 | encode : Encoder -> Bytes 97 | encode = 98 | Elm.Kernel.Bytes.encode 99 | 100 | 101 | 102 | -- INTEGERS 103 | 104 | 105 | {-| Encode integers from `-128` to `127` in one byte. 106 | -} 107 | signedInt8 : Int -> Encoder 108 | signedInt8 = 109 | I8 110 | 111 | 112 | {-| Encode integers from `-32768` to `32767` in two bytes. 113 | -} 114 | signedInt16 : Endianness -> Int -> Encoder 115 | signedInt16 = 116 | I16 117 | 118 | 119 | {-| Encode integers from `-2147483648` to `2147483647` in four bytes. 
120 | -} 121 | signedInt32 : Endianness -> Int -> Encoder 122 | signedInt32 = 123 | I32 124 | 125 | 126 | {-| Encode integers from `0` to `255` in one byte. 127 | -} 128 | unsignedInt8 : Int -> Encoder 129 | unsignedInt8 = 130 | U8 131 | 132 | 133 | {-| Encode integers from `0` to `65535` in two bytes. 134 | -} 135 | unsignedInt16 : Endianness -> Int -> Encoder 136 | unsignedInt16 = 137 | U16 138 | 139 | 140 | {-| Encode integers from `0` to `4294967295` in four bytes. 141 | -} 142 | unsignedInt32 : Endianness -> Int -> Encoder 143 | unsignedInt32 = 144 | U32 145 | 146 | 147 | 148 | -- FLOATS 149 | 150 | 151 | {-| Encode 32-bit floating point numbers in four bytes. 152 | -} 153 | float32 : Endianness -> Float -> Encoder 154 | float32 = 155 | F32 156 | 157 | 158 | {-| Encode 64-bit floating point numbers in eight bytes. 159 | -} 160 | float64 : Endianness -> Float -> Encoder 161 | float64 = 162 | F64 163 | 164 | 165 | 166 | -- BYTES 167 | 168 | 169 | {-| Copy bytes directly into the new `Bytes` sequence. This does not record the 170 | width though! You usually want to say something like this: 171 | 172 | import Bytes exposing (Bytes, Endianness(..)) 173 | import Bytes.Encode as Encode 174 | 175 | png : Bytes -> Encode.Encoder 176 | png imageData = 177 | Encode.sequence 178 | [ Encode.unsignedInt32 BE (Bytes.width imageData) 179 | , Encode.bytes imageData 180 | ] 181 | 182 | This allows you to represent the width however is necessary for your protocol. 183 | For example, you can use [Base 128 Varints][pb] for ProtoBuf, 184 | [Variable-Length Integers][sql] for SQLite, or whatever else they dream up. 185 | 186 | [pb]: https://developers.google.com/protocol-buffers/docs/encoding#varints 187 | [sql]: https://www.sqlite.org/src4/doc/trunk/www/varint.wiki 188 | -} 189 | bytes : Bytes -> Encoder 190 | bytes = 191 | Bytes 192 | 193 | 194 | 195 | -- STRINGS 196 | 197 | 198 | {-| Encode a `String` as a bunch of UTF-8 bytes. 
199 | 200 | encode (string "$20") -- <24 32 30> 201 | encode (string "£20") -- <C2A3 32 30> 202 | encode (string "€20") -- <E282AC 32 30> 203 | encode (string "bread") -- <62 72 65 61 64> 204 | encode (string "brød") -- <62 72 C3B8 64> 205 | 206 | Some characters take one byte, while others can take up to four. Read more 207 | about [UTF-8](https://en.wikipedia.org/wiki/UTF-8) to learn the details! 208 | 209 | But if you just encode UTF-8 directly, how can you know when you get to the end 210 | of the string when you are decoding? So most protocols have an integer saying 211 | how many bytes follow, like this: 212 | 213 | sizedString : String -> Encoder 214 | sizedString str = 215 | sequence 216 | [ unsignedInt32 BE (getStringWidth str) 217 | , string str 218 | ] 219 | 220 | You can choose whatever representation you want for the width, which is helpful 221 | because many protocols use different integer representations to save space. For 222 | example: 223 | 224 | - ProtoBuf uses [Base 128 Varints](https://developers.google.com/protocol-buffers/docs/encoding#varints) 225 | - SQLite uses [Variable-Length Integers](https://www.sqlite.org/src4/doc/trunk/www/varint.wiki) 226 | 227 | In both cases, small numbers can fit just one byte, saving some space. (The 228 | SQLite encoding has the benefit that the first byte tells you how long the 229 | number is, making it faster to decode.) In both cases, it is sort of tricky 230 | to make negative numbers small. 231 | -} 232 | string : String -> Encoder 233 | string str = 234 | Utf8 (Elm.Kernel.Bytes.getStringWidth str) str 235 | 236 | 237 | {-| Get the width of a `String` in UTF-8 bytes. 238 | 239 | getStringWidth "$20" == 3 240 | getStringWidth "£20" == 4 241 | getStringWidth "€20" == 5 242 | getStringWidth "bread" == 5 243 | getStringWidth "brød" == 5 244 | 245 | Most protocols need this number to come directly before a chunk of UTF-8 bytes 246 | as a way to know where the string ends! 
247 | 248 | Read more about how UTF-8 works [here](https://en.wikipedia.org/wiki/UTF-8). 249 | -} 250 | getStringWidth : String -> Int 251 | getStringWidth = 252 | Elm.Kernel.Bytes.getStringWidth 253 | 254 | 255 | 256 | -- SEQUENCE 257 | 258 | 259 | {-| Put together a bunch of builders. So if you wanted to encode three `Float` 260 | values for the position of a ball in 3D space, you could say: 261 | 262 | import Bytes exposing (Endianness(..)) 263 | import Bytes.Encode as Encode 264 | 265 | type alias Ball = { x : Float, y : Float, z : Float } 266 | 267 | ball : Ball -> Encode.Encoder 268 | ball {x,y,z} = 269 | Encode.sequence 270 | [ Encode.float32 BE x 271 | , Encode.float32 BE y 272 | , Encode.float32 BE z 273 | ] 274 | 275 | -} 276 | sequence : List Encoder -> Encoder 277 | sequence builders = 278 | Seq (getWidths 0 builders) builders 279 | 280 | 281 | 282 | -- WRITE 283 | 284 | {- Write one encoder into the buffer `mb` at `offset` and return the offset just past what was written. Each primitive case delegates to the matching kernel writer; endianness is passed along as a Bool that is True for `LE`. `Seq` delegates to `writeSequence`, and the cached widths stored in `Seq` and `Utf8` are ignored here. -} 285 | write : Encoder -> Bytes -> Int -> Int 286 | write builder mb offset = 287 | case builder of 288 | I8 n -> Elm.Kernel.Bytes.write_i8 mb offset n 289 | I16 e n -> Elm.Kernel.Bytes.write_i16 mb offset n (e == LE) 290 | I32 e n -> Elm.Kernel.Bytes.write_i32 mb offset n (e == LE) 291 | U8 n -> Elm.Kernel.Bytes.write_u8 mb offset n 292 | U16 e n -> Elm.Kernel.Bytes.write_u16 mb offset n (e == LE) 293 | U32 e n -> Elm.Kernel.Bytes.write_u32 mb offset n (e == LE) 294 | F32 e n -> Elm.Kernel.Bytes.write_f32 mb offset n (e == LE) 295 | F64 e n -> Elm.Kernel.Bytes.write_f64 mb offset n (e == LE) 296 | Seq _ bs -> writeSequence bs mb offset 297 | Utf8 _ s -> Elm.Kernel.Bytes.write_string mb offset s 298 | Bytes bs -> Elm.Kernel.Bytes.write_bytes mb offset bs 299 | 300 | {- Write each encoder in list order, feeding the offset returned by one `write` into the next, and return the final offset. -} 301 | writeSequence : List Encoder -> Bytes -> Int -> Int 302 | writeSequence builders mb offset = 303 | case builders of 304 | [] -> 305 | offset 306 | 307 | b :: bs -> 308 | writeSequence bs mb (write b mb offset) 309 | 310 | 311 | 312 | -- WIDTHS 313 | 314 | {- Number of bytes an encoder occupies. Fixed-size encoders report a constant, `Seq` and `Utf8` report the width cached in the constructor (computed by `sequence` and `string`), and `Bytes` asks the kernel for the width of the wrapped value. -} 315 | getWidth : Encoder -> Int 316 | getWidth builder = 317 | 
case builder of 318 | I8 _ -> 1 319 | I16 _ _ -> 2 320 | I32 _ _ -> 4 321 | U8 _ -> 1 322 | U16 _ _ -> 2 323 | U32 _ _ -> 4 324 | F32 _ _ -> 4 325 | F64 _ _ -> 8 326 | Seq w _ -> w 327 | Utf8 w _ -> w 328 | Bytes bs -> Elm.Kernel.Bytes.width bs 329 | 330 | {- Add the widths of all encoders in the list to the starting `width` and return the total. -} 331 | getWidths : Int -> List Encoder -> Int 332 | getWidths width builders = 333 | case builders of 334 | [] -> 335 | width 336 | 337 | b :: bs -> 338 | getWidths (width + getWidth b) bs 339 | -------------------------------------------------------------------------------- /src/Bytes/Decode.elm: -------------------------------------------------------------------------------- 1 | module Bytes.Decode exposing 2 | ( Decoder, decode 3 | , signedInt8, signedInt16, signedInt32 4 | , unsignedInt8, unsignedInt16, unsignedInt32 5 | , float32, float64 6 | , string 7 | , bytes 8 | , map, map2, map3, map4, map5 9 | , andThen, succeed, fail 10 | , Step(..), loop 11 | ) 12 | 13 | 14 | {-| 15 | 16 | # Decoders 17 | @docs Decoder, decode 18 | 19 | # Integers 20 | @docs signedInt8, signedInt16, signedInt32, 21 | unsignedInt8, unsignedInt16, unsignedInt32 22 | 23 | # Floats 24 | @docs float32, float64 25 | 26 | # Bytes 27 | @docs bytes 28 | 29 | # Strings 30 | @docs string 31 | 32 | # Map 33 | @docs map, map2, map3, map4, map5 34 | 35 | # And Then 36 | @docs andThen, succeed, fail 37 | 38 | # Loop 39 | @docs Step, loop 40 | -} 41 | 42 | 43 | import Bytes exposing (Bytes, Endianness(..)) 44 | 45 | 46 | 47 | -- PARSER 48 | 49 | 50 | {-| Describes how to turn a sequence of bytes into a nice Elm value. 51 | -} 52 | type Decoder a = 53 | Decoder (Bytes -> Int -> (Int, a)) 54 | 55 | 56 | {-| Turn a sequence of bytes into a nice Elm value. 57 | 58 | -- decode (unsignedInt16 BE) <0007> == Just 7 59 | -- decode (unsignedInt16 LE) <0700> == Just 7 60 | -- decode (unsignedInt16 BE) <0700> == Just 1792 61 | -- decode (unsignedInt32 BE) <0700> == Nothing 62 | 63 | The `Decoder` specifies exactly how this should happen. 
This process may fail 64 | if the sequence of bytes is corrupted or unexpected somehow. The examples above 65 | show a case where there are not enough bytes. 66 | -} 67 | decode : Decoder a -> Bytes -> Maybe a 68 | decode (Decoder decoder) bs = 69 | Elm.Kernel.Bytes.decode decoder bs 70 | 71 | 72 | 73 | -- SIGNED INTEGERS 74 | 75 | 76 | {-| Decode one byte into an integer from `-128` to `127`. 77 | -} 78 | signedInt8 : Decoder Int 79 | signedInt8 = 80 | Decoder Elm.Kernel.Bytes.read_i8 81 | 82 | 83 | {-| Decode two bytes into an integer from `-32768` to `32767`. 84 | -} 85 | signedInt16 : Endianness -> Decoder Int 86 | signedInt16 endianness = 87 | Decoder (Elm.Kernel.Bytes.read_i16 (endianness == LE)) 88 | 89 | 90 | {-| Decode four bytes into an integer from `-2147483648` to `2147483647`. 91 | -} 92 | signedInt32 : Endianness -> Decoder Int 93 | signedInt32 endianness = 94 | Decoder (Elm.Kernel.Bytes.read_i32 (endianness == LE)) 95 | 96 | 97 | 98 | -- UNSIGNED INTEGERS 99 | 100 | 101 | {-| Decode one byte into an integer from `0` to `255`. 102 | -} 103 | unsignedInt8 : Decoder Int 104 | unsignedInt8 = 105 | Decoder Elm.Kernel.Bytes.read_u8 106 | 107 | 108 | {-| Decode two bytes into an integer from `0` to `65535`. 109 | -} 110 | unsignedInt16 : Endianness -> Decoder Int 111 | unsignedInt16 endianness = 112 | Decoder (Elm.Kernel.Bytes.read_u16 (endianness == LE)) 113 | 114 | 115 | {-| Decode four bytes into an integer from `0` to `4294967295`. 116 | -} 117 | unsignedInt32 : Endianness -> Decoder Int 118 | unsignedInt32 endianness = 119 | Decoder (Elm.Kernel.Bytes.read_u32 (endianness == LE)) 120 | 121 | 122 | 123 | -- FLOATS 124 | 125 | 126 | {-| Decode four bytes into a floating point number. 127 | -} 128 | float32 : Endianness -> Decoder Float 129 | float32 endianness = 130 | Decoder (Elm.Kernel.Bytes.read_f32 (endianness == LE)) 131 | 132 | 133 | {-| Decode eight bytes into a floating point number. 
134 | -} 135 | float64 : Endianness -> Decoder Float 136 | float64 endianness = 137 | Decoder (Elm.Kernel.Bytes.read_f64 (endianness == LE)) 138 | 139 | 140 | 141 | -- BYTES 142 | 143 | 144 | {-| Copy a given number of bytes into a new `Bytes` sequence. 145 | -} 146 | bytes : Int -> Decoder Bytes 147 | bytes n = 148 | Decoder (Elm.Kernel.Bytes.read_bytes n) 149 | 150 | 151 | 152 | -- STRINGS 153 | 154 | 155 | {-| Decode a given number of UTF-8 bytes into a `String`. 156 | 157 | Most protocols store the width of the string right before the content, so you 158 | will probably write things like this: 159 | 160 | import Bytes exposing (Endianness(..)) 161 | import Bytes.Decode as Decode 162 | 163 | sizedString : Decode.Decoder String 164 | sizedString = 165 | Decode.unsignedInt32 BE 166 | |> Decode.andThen Decode.string 167 | 168 | In this case we read the width as a 32-bit unsigned integer, but you have the 169 | leeway to read the width as a [Base 128 Varint][pb] for ProtoBuf, a 170 | [Variable-Length Integer][sql] for SQLite, or whatever else they dream up. 171 | 172 | [pb]: https://developers.google.com/protocol-buffers/docs/encoding#varints 173 | [sql]: https://www.sqlite.org/src4/doc/trunk/www/varint.wiki 174 | -} 175 | string : Int -> Decoder String 176 | string n = 177 | Decoder (Elm.Kernel.Bytes.read_string n) 178 | 179 | 180 | 181 | -- MAP 182 | 183 | 184 | {-| Transform the value produced by a decoder. If you encode negative numbers 185 | in a special way, you can say something like this: 186 | 187 | negativeInt8 : Decoder Int 188 | negativeInt8 = 189 | map negate unsignedInt8 190 | 191 | In practice you may see something like ProtoBuf’s [ZigZag encoding][zz] which 192 | decreases the size of small negative numbers. 
193 | 194 | [zz]: https://developers.google.com/protocol-buffers/docs/encoding#types 195 | -} 196 | map : (a -> b) -> Decoder a -> Decoder b 197 | map func (Decoder decodeA) = 198 | Decoder <| 199 | \bites offset -> 200 | let 201 | (aOffset, a) = decodeA bites offset 202 | in 203 | (aOffset, func a) 204 | 205 | 206 | {-| Combine two decoders. 207 | 208 | import Bytes exposing (Endianness(..)) 209 | import Bytes.Decode as Decode 210 | 211 | type alias Point = { x : Float, y : Float } 212 | 213 | decoder : Decode.Decoder Point 214 | decoder = 215 | Decode.map2 Point 216 | (Decode.float32 BE) 217 | (Decode.float32 BE) 218 | -} 219 | map2 : (a -> b -> result) -> Decoder a -> Decoder b -> Decoder result 220 | map2 func (Decoder decodeA) (Decoder decodeB) = 221 | Decoder <| 222 | \bites offset -> 223 | let 224 | (aOffset, a) = decodeA bites offset 225 | (bOffset, b) = decodeB bites aOffset 226 | in 227 | (bOffset, func a b) 228 | 229 | 230 | {-| Combine three decoders. 231 | -} 232 | map3 : (a -> b -> c -> result) -> Decoder a -> Decoder b -> Decoder c -> Decoder result 233 | map3 func (Decoder decodeA) (Decoder decodeB) (Decoder decodeC) = 234 | Decoder <| 235 | \bites offset -> 236 | let 237 | (aOffset, a) = decodeA bites offset 238 | (bOffset, b) = decodeB bites aOffset 239 | (cOffset, c) = decodeC bites bOffset 240 | in 241 | (cOffset, func a b c) 242 | 243 | 244 | {-| Combine four decoders. 245 | -} 246 | map4 : (a -> b -> c -> d -> result) -> Decoder a -> Decoder b -> Decoder c -> Decoder d -> Decoder result 247 | map4 func (Decoder decodeA) (Decoder decodeB) (Decoder decodeC) (Decoder decodeD) = 248 | Decoder <| 249 | \bites offset -> 250 | let 251 | (aOffset, a) = decodeA bites offset 252 | (bOffset, b) = decodeB bites aOffset 253 | (cOffset, c) = decodeC bites bOffset 254 | (dOffset, d) = decodeD bites cOffset 255 | in 256 | (dOffset, func a b c d) 257 | 258 | 259 | {-| Combine five decoders. 
If you need to combine more things, it is possible
to define more of these with `map2` or `andThen`.
-}
map5 : (a -> b -> c -> d -> e -> result) -> Decoder a -> Decoder b -> Decoder c -> Decoder d -> Decoder e -> Decoder result
map5 func (Decoder decodeA) (Decoder decodeB) (Decoder decodeC) (Decoder decodeD) (Decoder decodeE) =
    Decoder <|
        \bites offset ->
            let
                -- Each decoder starts at the offset returned by the previous one.
                ( aOffset, a ) =
                    decodeA bites offset

                ( bOffset, b ) =
                    decodeB bites aOffset

                ( cOffset, c ) =
                    decodeC bites bOffset

                ( dOffset, d ) =
                    decodeD bites cOffset

                ( eOffset, e ) =
                    decodeE bites dOffset
            in
            ( eOffset, func a b c d e )



-- AND THEN


{-| Decode something **and then** use that information to decode something
else. This is most common with strings or sequences where you need to read
how long the value is going to be:

    import Bytes exposing (Endianness(..))
    import Bytes.Decode as Decode

    string : Decode.Decoder String
    string =
      Decode.unsignedInt32 BE
        |> Decode.andThen Decode.string

Check out the docs for [`succeed`](#succeed), [`fail`](#fail), and
[`loop`](#loop) to see `andThen` used in more ways!
-}
andThen : (a -> Decoder b) -> Decoder a -> Decoder b
andThen callback (Decoder decodeA) =
    Decoder <|
        \bites offset ->
            let
                ( newOffset, a ) =
                    decodeA bites offset

                -- The second decoder is chosen from the first decoded value.
                (Decoder decodeB) =
                    callback a
            in
            decodeB bites newOffset


{-| A decoder that always succeeds with a certain value.
Maybe we are making
a `Maybe` decoder:

    import Bytes.Decode as Decode exposing (Decoder)

    maybe : Decoder a -> Decoder (Maybe a)
    maybe decoder =
      let
        helper n =
          if n == 0 then
            Decode.succeed Nothing
          else
            Decode.map Just decoder
      in
      Decode.unsignedInt8
        |> Decode.andThen helper

When the first byte is `00000000` the result is `Nothing`; for anything else
we go on to decode the value and wrap it in a `Just`.
-}
succeed : a -> Decoder a
succeed a =
    let
        -- Ignores the bytes and hands the offset back untouched.
        keepOffset _ position =
            ( position, a )
    in
    Decoder keepOffset


{-| A decoder that always fails. It comes in handy together with `andThen`
when decoding custom types:

    import Bytes exposing (Endianness(..))
    import Bytes.Encode as Encode
    import Bytes.Decode as Decode

    type Distance = Yards Float | Meters Float

    toEncoder : Distance -> Encode.Encoder
    toEncoder distance =
      case distance of
        Yards n -> Encode.sequence [ Encode.unsignedInt8 0, Encode.float32 BE n ]
        Meters n -> Encode.sequence [ Encode.unsignedInt8 1, Encode.float32 BE n ]

    decoder : Decode.Decoder Distance
    decoder =
      Decode.unsignedInt8
        |> Decode.andThen pickDecoder

    pickDecoder : Int -> Decode.Decoder Distance
    pickDecoder tag =
      case tag of
        0 -> Decode.map Yards (Decode.float32 BE)
        1 -> Decode.map Meters (Decode.float32 BE)
        _ -> Decode.fail

This encoding spends an 8-bit unsigned integer on a tag that says which
variant follows. A tag of `0` means yards, a tag of `1` means meters, and any
other tag means something went wrong!
-}
fail : Decoder a
fail =
    Elm.Kernel.Bytes.decodeFailure
        |> Decoder



-- LOOP


{-| Decide what steps to take next in your [`loop`](#loop).

If you are `Done`, you give the result of the whole `loop`. If you decide to
`Loop` around again, you give a new state to work from. Maybe you need to add
an item to a list? Or maybe you need to track some information about what you
just saw?

**Note:** It may be helpful to learn about [finite-state machines][fsm] to get
a broader intuition about using `state`. I.e. you may want to create a `type`
that describes four possible states, and then use `Loop` to transition between
them as you consume bytes.

[fsm]: https://en.wikipedia.org/wiki/Finite-state_machine
-}
type Step state a
    = Loop state
    | Done a


{-| A decoder that can loop indefinitely. This can be helpful when parsing
repeated structures, like a list:

    import Bytes exposing (Endianness(..))
    import Bytes.Decode as Decode exposing (..)

    list : Decoder a -> Decoder (List a)
    list decoder =
      unsignedInt32 BE
        |> andThen (\len -> loop (len, []) (listStep decoder))

    listStep : Decoder a -> (Int, List a) -> Decoder (Step (Int, List a) (List a))
    listStep decoder (n, xs) =
      if n <= 0 then
        succeed (Done (List.reverse xs))
      else
        map (\x -> Loop (n - 1, x :: xs)) decoder

The `list` decoder first reads a 32-bit unsigned integer. That determines how
many items will be decoded. From there we use [`loop`](#loop) to track all the
items we have parsed so far and figure out when to stop. Items are prepended
with `::` as they are decoded, so `listStep` reverses the accumulator once at
the end to return them in the order they appeared.
-}
loop : state -> (state -> Decoder (Step state a)) -> Decoder a
loop state callback =
    Decoder <| loopHelp state callback


-- Runs the decoder produced by `callback state` at `offset`. A `Loop` result
-- repeats the process with the new state from the new offset; a `Done` result
-- is returned together with the offset that was reached.
loopHelp : state -> (state -> Decoder (Step state a)) -> Bytes -> Int -> (Int, a)
loopHelp state callback bites offset =
    let
        (Decoder runStep) =
            callback state
    in
    case runStep bites offset of
        ( nextOffset, Loop nextState ) ->
            -- Kept as a direct self-call in tail position.
            loopHelp nextState callback bites nextOffset

        ( nextOffset, Done result ) ->
            ( nextOffset, result )
--------------------------------------------------------------------------------