├── .eqc_ci ├── .github └── FUNDING.yml ├── .gitignore ├── EQC_CI_LICENCE.txt ├── LICENSE ├── NOTICE ├── README.md ├── doc ├── README.md ├── edoc-info ├── erlang.png ├── overview.edoc ├── sext.md └── stylesheet.css ├── examples └── tt_proto.erl ├── rebar.config ├── src ├── sext.app.src └── sext.erl └── test └── sext_eqc.erl /.eqc_ci: -------------------------------------------------------------------------------- 1 | {build,"mkdir -p ebin; erlc -o ebin -DEQC +\\{parse_transform,eqc_cover\\} src/*.erl test/*.erl"}. 2 | {test_path, "ebin"}. 3 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [uwiger] 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .rebar3 2 | _* 3 | .eunit 4 | *.o 5 | *.beam 6 | *.plt 7 | *.swp 8 | *.swo 9 | .erlang.cookie 10 | log 11 | erl_crash.dump 12 | .rebar 13 | logs 14 | _build 15 | Barrel.nonode@nohost 16 | *.iml 17 | .idea 18 | deps 19 | .eunit/ 20 | ebin 21 | *~ 22 | */*~ 23 | erl_crash.dump 24 | current_counterexample.eqc 25 | -------------------------------------------------------------------------------- /EQC_CI_LICENCE.txt: -------------------------------------------------------------------------------- 1 | This file is an agreement between Quviq AB ("Quviq"), Sven Hultins 2 | Gata 9, Gothenburg, Sweden, and the committers to the github 3 | repository in which the file appears ("the owner"). By placing this 4 | file in a github repository, the owner agrees to the terms below. 
5 | 6 | The purpose of the agreement is to enable Quviq AB to provide a 7 | continuous integration service to the owner, whereby the code in the 8 | repository ("the source code") is tested using Quviq's test tools, and 9 | the test results are made available on the web. The test results 10 | include test output, generated test cases, and a copy of the source 11 | code in the repository annotated with coverage information ("the test 12 | results"). 13 | 14 | The owner agrees that Quviq may run the tests in the source code and 15 | display the test results on the web, without obligation. 16 | 17 | The owner warrants that running the tests in the source code and 18 | displaying the test results on the web violates no laws, licences or other 19 | agreements. In the event of such a violation, the owner accepts full 20 | responsibility. 21 | 22 | The owner warrants that the source code is not malicious, and will not 23 | mount an attack on either Quviq's server or any other server--for 24 | example by taking part in a denial of service attack, or by attempting 25 | to send unsolicited emails. 26 | 27 | The owner warrants that the source code does not attempt to reverse 28 | engineer Quviq's code. 29 | 30 | Quviq reserves the right to exclude repositories that break this 31 | agreement from its continuous integration service. 32 | 33 | Any dispute arising from the use of Quviq's service will be resolved 34 | under Swedish law. 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 
12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 
176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Original author: Ulf Wiger, Erlang Solutions, 2009 2 | 3 | Copyright transfered to Ulf Wiger 2014 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # The sext application # 2 | 3 | [Build Status](http://quickcheck-ci.com/p/uwiger/sext) 4 | 5 | __Authors:__ Ulf Wiger ([`ulf@wiger.net`](mailto:ulf@wiger.net)). 6 | 7 | A sortable serialization library 8 | This library offers a serialization format (a la term_to_binary()) that 9 | preserves the Erlang term order. 10 | 11 | ``` 12 | 13 | Copyright 2014-2020 Ulf Wiger 14 | 15 | Licensed under the Apache License, Version 2.0 (the "License"); 16 | you may not use this file except in compliance with the License. 17 | You may obtain a copy of the License at 18 | 19 | http://www.apache.org/licenses/LICENSE-2.0 20 | 21 | Unless required by applicable law or agreed to in writing, software 22 | distributed under the License is distributed on an "AS IS" BASIS, 23 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 24 | See the License for the specific language governing permissions and 25 | limitations under the License. 26 | 27 | ``` 28 | 29 | 30 | # 1. Introduction # 31 | 32 | The idea to this library came out of the need for disk-based storage 33 | with ordered_set semantics in Erlang. One previous solution used Tokyo Cabinet, 34 | in which a C routine is used to hook into the sorting logic of TC. 35 | 36 | I thought a more generic solution would be to be able to have a version 37 | of term_to_binary() that respected the ordering semantics of Erlang terms. 38 | 39 | A new addition is support for 'sb32' encoding. 
This is my own version of 40 | Base32 encoding, with a slightly different alphabet, in order to preserve 41 | sorting properties while generating octet strings that are perfectly safe 42 | to use in file names. 43 | 44 | Another feature is "prefix encoding", which encodes a term and truncates 45 | the result if it encounters a "wildcard" (e.g. `'$1'` 46 | or `'_'`). This is to enable a convenient and efficient mapping 47 | of Erlang match specifications to e.g. prefix matching on the external storage 48 | and subsequent match_spec matching on the found erlang terms. 49 | 50 | The serialization format supports all Erlang types, and preserves the 51 | internal Erlang term order, with a few exceptions: 52 | 53 | * Floats are represented based on the IEEE 754 Binary 64 standard 54 | representation. This is the representation used by Erlang, specifically 55 | the representation used when encoding floats in binaries. To be exact, 56 | `sext` first normalizes the float by encoding it as an Erlang binary, then 57 | serializes it. 58 | 59 | * In Erlang, integers are cast to floats before comparing them to a float. 60 | This means e.g. that the relative sort order of `1` and `1.0` is undefined. 61 | It is not possible for `sext` to preserve this ambiguity after serialization, 62 | since it could only be done by producing identical encodings for the two 63 | terms, thereby sacrificing the property that encoding a value and then 64 | decoding it again, should produce the initial value. 65 | 66 | 67 | # 2. Specification # 68 | 69 | 70 | ## 2.1 Type tags ## 71 | 72 | Each data type is encoded using a type tag (1 byte) that represents its order 73 | in the global Erlang term ordering. The number type is divided into several 74 | subtypes, to facilitate a reasonably efficient representation: 75 | 76 | 77 | 78 |
TypeDescriptionTag
negbigNegative bignum8
neg4Negative 31-bit integer9
pos4Positive 31-bit integer10
posbigPositive bignum11
atomObj of type atom()12
referenceObj of type reference()13
portObj of type port()14
pidObj of type pid()15
tupleObj of type tuple()16
listObj of type map()17, 1
listObj of type list()17
binaryObj of type binary()18
bin_tailImproper-tail marker followed by binary or bitstring19
79 | 80 | 81 | 82 | ## 2.2 Tuples ## 83 | 84 | Tuples are encoded as the tuple tag, followed by a 32-bit size element, 85 | denoting the number of elements in the tuple, followed by each element 86 | in the tuple individually encoded. 87 | 88 | 89 | ## 2.3 Lists ## 90 | 91 | Lists are encoded as the list tag, followed by each element in the list 92 | individually encoded, followed by the number 2 (1 byte). 93 | 94 | Improper lists, e.g. `[1,2|3]`, have the number 1 inserted before the improper 95 | tail. Since this also indicates the last element in the list, no end byte 96 | is needed. This ensures that it sorts *before* any corresponding proper list, 97 | as long as the improper tail is not a binary (binaries are greater than the 98 | missing 'cons', or list, cell). 99 | 100 | Improper lists that have a binary or bitstring as 'tail', e.g. `[1,2|<<1>>]`, 101 | have a ?bin_tail (code 19) inserted before the tail. This ensures that it 102 | sorts after a corresponding proper list. 103 | 104 | 105 | ## 2.4 Binaries and bitstrings ## 106 | 107 | A binary is basically a bitstring whose size is a multiple of 8. From a sorting 108 | perspective, binaries and bitstrings are both sorted as left-aligned bit 109 | arrays. 110 | 111 | ```erlang 112 | 1> bitstring_to_list(<<11111111111:11>>). 113 | [56,<<7:3>>] 114 | ``` 115 | 116 | Binaries and bitstrings are encoded as the binary tag, followed by each whole 117 | byte, each padded with a leading 1 (one bit), followed by a number of 0-bits 118 | to again make the size a multiple of 8 bits, followed by a byte whose 119 | value is Bits, where Bits is the number of "remainder bits"; 8 if the original 120 | binary is 8-bit aligned. 121 | 122 | Example: 123 | 124 | ```erlang 125 | 2> sext:encode(<<1,2,3>>). 126 | <<18,128,192,160,96,8>> 127 | 3> <<18, 1:1,1, 1:1,2, 1:1,3, 0:5, 8>>. 
128 | <<18,128,192,160,96,8>> 129 | ``` 130 | 131 | In the example above, we inserted 3 1-bits, and therefore had to insert 5 more 132 | pad bits (zeroes) at the end. The last byte is 8, signifying that the original 133 | binary was 8-bit aligned. 134 | 135 | If the remainder is not an even 8 bits, the remainder bits are padded with 136 | a 1-bit, just like the others, then left-aligned and padded up to a whole 137 | byte (excluding the 1-bit added in front). 138 | The value of the last byte is the bit size of the remainder. 139 | 140 | Example: 141 | 142 | ```erlang 143 | 4> sext:encode(<<1,2,3,4:3>>). 144 | <<18,128,192,160,120,0,3>> 145 | 5> <<18, 1:1,1, 1:1,2, 1:1,3, 1:1,4:3,0:5, 0:4, 3>>. 146 | <<18,128,192,160,120,0,3>> 147 | ``` 148 | 149 | The first part of the bitstring is encoded exactly like above. The number 4:3 150 | is first padded with 1 then padded at the end to become a whole byte. Then 151 | an additional pad, 0:4, is inserted to compensate for the fact that we have 152 | inserted 4 1-bits. Finally, the last byte is 3, to signify the size of the 153 | remainder. 154 | 155 | 156 | ## 2.5 Positive Numbers ## 157 | 158 | Numbers are encoded as the corresponding type tag, followed by the integer 159 | part, a marker indicating the presence of a fraction part, and the fraction 160 | part, if any. The integer part is encoded differently depending on the size 161 | of the value. The fraction part is encoded as a binary (without the 'binary' 162 | type tag). 163 | 164 | 165 | ### 2.5.1 Positive small integers, pos4 ### 166 | 167 | Integers up to 31 bits are encoded as << ?pos4, I:31, F:1 >> 168 | where I is the integer value, and F is 1 if a fraction part follows; 169 | 0 otherwise. 170 | 171 | 172 | ### 2.5.2 Positive large integers ### 173 | 174 | Larger integers are converted to a byte string and then encoded like 175 | binaries (without the 'binary' type tag), followed by a byte signifying 176 | whether a fraction part follows (1 if yes; 0 otherwise). 
177 | 178 | ```erlang 179 | Bytes = encode_big(I), 180 | << ?pos_big, Bytes/binary, F:8 >> 181 | ``` 182 | 183 | 184 | ### 2.5.3 Fraction part of positive numbers ### 185 | 186 | The representation of floating point numbers is based on the [IEEE 754 Binary 64 standard representation](http://en.wikipedia.org/wiki/Double_precision_floating-point_format). This is also the representation used by Erlang: 187 | 188 | ```erlang 189 | <<Sign:1, Exponent:11, Frac:52>> = <<F:64/float>> 190 | ``` 191 | 192 | The encoding extracts the integer part and encodes it as a positive integer 193 | (either pos4 or pos_big), flags the presence of a fraction part, and encodes 194 | the fraction part as a binary (without the binary tag). 195 | 196 | 197 | ## 2.6 Negative Numbers ## 198 | 199 | 200 | ### 2.6.1 Small negative numbers ### 201 | 202 | ```erlang 203 | << ?neg4:8, IRep:31, F:1 >> 204 | ``` 205 | 206 | A negative number I is encoded as IRep = Max + I, where Max is the largest 207 | possible number that can be represented with the number of bits present for 208 | the given subtype. For example, Max for neg4 is 0x7FFF FFFF (31 bits). 209 | Keep in mind that I < 0. 210 | 211 | The fraction flag is inverted, compared to the pos4 representation, so it will 212 | be 1 if there is no fraction part; 0 otherwise. 213 | 214 | 215 | ### 2.6.2 Large negative numbers ### 216 | 217 | Larger negative numbers are encoded as: 218 | 219 | ```erlang 220 | encode_negbig(I) -> 221 | {Words, Max} = get_max(-I), 222 | Bin = encode_bin_elems(list_to_binary(encode_big(Max + I))), 223 | WordsRep = 16#FFFFffff - Words, 224 | << ?neg_big:8, WordsRep:32, Bin/binary, F:8 >>. 225 | ``` 226 | 227 | That is, get_max() figures out how many 64-bit words are needed to represent 228 | -I (the positive number), and also gives the maximum value that can be 229 | represented in so many words. WordsRep in essence becomes a sub-subtag of 230 | the negative bignum. 
231 | 232 | 233 | ### 2.6.3 Fraction of negative numbers ### 234 | 235 | The fraction is encoded almost like the inverse of the positive fraction 236 | (as a "negative binary", if such a thing existed). Each byte is padded with 237 | a 0-bit rather than a 1-bit, and the byte itself is replaced by 16#ff - Byte. 238 | The sequence is then padded with 1s to become a multiple of 8 bits. 239 | 240 | The last byte, denoting the number of significant bits in the last byte, 241 | is similarly inverted. 242 | 243 | 244 | ## 2.7 Atoms ## 245 | 246 | Atoms are encoded as the atom tag, followed by the string representation of 247 | the atom using the binary encoding described above (but without the binary 248 | tag). 249 | 250 | 251 | ## 2.8 References ## 252 | 253 | The encoding of references is perhaps best described by the code: 254 | 255 | ```erlang 256 | encode_ref(R) -> 257 | RBin = term_to_binary(R), 258 | <<131,114,_Len:16,100,NLen:16,Name:NLen/binary,Rest/binary>> = RBin, 259 | NameEnc = encode_bin_elems(Name), 260 | RestEnc = encode_bin_elems(Rest), 261 | <<?reference, NameEnc/binary, RestEnc/binary>>. 262 | ``` 263 | 264 | where encode_bin_elems(B) encodes the argument B the same way as a binary 265 | (excluding the 'binary' type tag). 266 | 267 | 268 | ## 2.9 Ports ## 269 | 270 | The encoding of ports is perhaps best described by the code: 271 | 272 | ```erlang 273 | encode_port(P) -> 274 | PBin = term_to_binary(P), 275 | <<131,102,100,ALen:16,Name:ALen/binary,Rest:5/binary>> = PBin, 276 | NameEnc = encode_bin_elems(Name), 277 | <<?port, NameEnc/binary, Rest/binary>>. 278 | ``` 279 | 280 | 281 | ## 2.10 Pids ## 282 | 283 | The encoding of pids is perhaps best described by the code: 284 | 285 | ```erlang 286 | encode_pid(P) -> 287 | PBin = term_to_binary(P), 288 | <<131,103,100,ALen:16,Name:ALen/binary,Rest:9/binary>> = PBin, 289 | NameEnc = encode_bin_elems(Name), 290 | <<?pid, NameEnc/binary, Rest/binary>>. 291 | ``` 292 | 293 | 294 | ## 2.11 Maps ## 295 | 296 | The encoding of maps is currently experimental. 297 | Maps sort between tuples and lists. 
Since the smallest list is represented 298 | by `<<17, 2>>`, maps encoding starts with `<<17, 1>>` (introducing a new tag 299 | would break backwards compatibility), followed by the size of the map (4 bytes), 300 | and each Key-Value pair in the map. 301 | 302 | 303 | ## Modules ## 304 | 305 | 306 | 307 |
sext
308 | 309 | -------------------------------------------------------------------------------- /doc/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # The sext application # 4 | 5 | __Authors:__ Ulf Wiger ([`ulf@wiger.net`](mailto:ulf@wiger.net)). 6 | 7 | A sortable serialization library 8 | This library offers a serialization format (a la term_to_binary()) that 9 | preserves the Erlang term order. 10 | 11 | ``` 12 | 13 | Copyright 2010 Erlang Solutions Ltd. 14 | 15 | Licensed under the Apache License, Version 2.0 (the "License"); 16 | you may not use this file except in compliance with the License. 17 | You may obtain a copy of the License at 18 | 19 | http://www.apache.org/licenses/LICENSE-2.0 20 | 21 | Unless required by applicable law or agreed to in writing, software 22 | distributed under the License is distributed on an "AS IS" BASIS, 23 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 24 | See the License for the specific language governing permissions and 25 | limitations under the License. 26 | 27 | ``` 28 | 29 | 30 | # 1. Introduction # 31 | 32 | The idea to this library came out of the need for disk-based storage 33 | with ordered_set semantics in Erlang. One previous solution used Tokyo Cabinet, 34 | in which a C routine is used to hook into the sorting logic of TC. 35 | 36 | I thought a more generic solution would be to be able to have a version 37 | of term_to_binary() that respected the ordering semantics of Erlang terms. 38 | 39 | A new addition is support for 'sb32' encoding. This is my own version of 40 | Base32 encoding, with a slightly different alphabet, in order to preserve 41 | sorting properties while generating octet strings that are perfectly safe 42 | to use in file names. 43 | 44 | Another feature is "prefix encoding", which encodes a term and truncates 45 | the result if it encounters a "wildcard" (e.g. `'$1'` 46 | or `'_'`). 
This is to enable a convenient and efficient mapping 47 | of Erlang match specifications to e.g. prefix matching on the external storage 48 | and subsequent match_spec matching on the found erlang terms. 49 | 50 | The serialization format supports all Erlang types, and preserves the 51 | internal Erlang term order, with a few exceptions: 52 | 53 | * Floats are represented based on the IEEE 754 Binary 64 standard 54 | representation. This is the representation used by Erlang, specifically 55 | the representation used when encoding floats in binaries. To be exact, 56 | `sext` first normalizes the float by encoding it as an Erlang binary, then 57 | serializes it. 58 | 59 | * In Erlang, integers are cast to floats before comparing them to a float. 60 | This means e.g. that the relative sort order of `1` and `1.0` is undefined. 61 | It is not possible for `sext` to preserve this ambiguity after serialization, 62 | since it could only be done by producing identical encodings for the two 63 | terms, thereby sacrificing the property that encoding a value and then 64 | decoding it again, should produce the initial value. 65 | 66 | 67 | # 2. Specification # 68 | 69 | 70 | ## 2.1 Type tags ## 71 | 72 | Each data type is encoded using a type tag (1 byte) that represents its order 73 | in the global Erlang term ordering. The number type is divided into several 74 | subtypes, to facilitate a reasonably efficient representation: 75 | 76 | 77 | 78 |
TypeDescriptionTag
negbigNegative bignum8
neg4Negative 31-bit integer9
pos4Positive 31-bit integer10
posbigPositive bignum11
atomObj of type atom()12
referenceObj of type reference()13
portObj of type port()14
pidObj of type pid()15
tupleObj of type tuple()16
listObj of type map()17, 1
listObj of type list()17
binaryObj of type binary()18
bin_tailImproper-tail marker followed by binary or bitstring19
79 | 80 | 81 | 82 | ## 2.2 Tuples ## 83 | 84 | Tuples are encoded as the tuple tag, followed by a 32-bit size element, 85 | denoting the number of elements in the tuple, followed by each element 86 | in the tuple individually encoded. 87 | 88 | 89 | ## 2.3 Lists ## 90 | 91 | Lists are encoded as the list tag, followed by each element in the list 92 | individually encoded, followed by the number 2 (1 byte). 93 | 94 | Improper lists, e.g. `[1,2|3]`, have the number 1 inserted before the improper 95 | tail. Since this also indicates the last element in the list, no end byte 96 | is needed. This ensures that it sorts *before* any corresponding proper list, 97 | as long as the improper tail is not a binary (binaries are greater than the 98 | missing 'cons', or list, cell). 99 | 100 | Improper lists that have a binary or bitstring as 'tail', e.g. `[1,2|<<1>>]`, 101 | have a ?bin_tail (code 19) inserted before the tail. This ensures that it 102 | sorts after a corresponding proper list. 103 | 104 | 105 | ## 2.4 Binaries and bitstrings ## 106 | 107 | A binary is basically a bitstring whose size is a multiple of 8. From a sorting 108 | perspective, binaries and bitstrings are both sorted as left-aligned bit 109 | arrays. 110 | 111 | ```erlang 112 | 1> bitstring_to_list(<<11111111111:11>>). 113 | [56,<<7:3>>] 114 | ``` 115 | 116 | Binaries and bitstrings are encoded as the binary tag, followed by each whole 117 | byte, each padded with a leading 1 (one bit), followed by a number of 0-bits 118 | to again make the size a multiple of 8 bits, followed by a byte whose 119 | value is Bits, where Bits is the number of "remainder bits"; 8 if the original 120 | binary is 8-bit aligned. 121 | 122 | Example: 123 | 124 | ```erlang 125 | 2> sext:encode(<<1,2,3>>). 126 | <<18,128,192,160,96,8>> 127 | 3> <<18, 1:1,1, 1:1,2, 1:1,3, 0:5, 8>>. 
128 | <<18,128,192,160,96,8>> 129 | ``` 130 | 131 | In the example above, we inserted 3 1-bits, and therefore had to insert 5 more 132 | pad bits (zeroes) at the end. The last byte is 8, signifying that the original 133 | binary was 8-bit aligned. 134 | 135 | If the remainder is not an even 8 bits, the remainder bits are padded with 136 | a 1-bit, just like the others, then left-aligned and padded up to a whole 137 | byte (excluding the 1-bit added in front). 138 | The value of the last byte is the bit size of the remainder. 139 | 140 | Example: 141 | 142 | ```erlang 143 | 4> sext:encode(<<1,2,3,4:3>>). 144 | <<18,128,192,160,120,0,3>> 145 | 5> <<18, 1:1,1, 1:1,2, 1:1,3, 1:1,4:3,0:5, 0:4, 3>>. 146 | <<18,128,192,160,120,0,3>> 147 | ``` 148 | 149 | The first part of the bitstring is encoded exactly like above. The number 4:3 150 | is first padded with 1 then padded at the end to become a whole byte. Then 151 | an additional pad, 0:4, is inserted to compensate for the fact that we have 152 | inserted 4 1-bits. Finally, the last byte is 3, to signify the size of the 153 | remainder. 154 | 155 | 156 | ## 2.5 Positive Numbers ## 157 | 158 | Numbers are encoded as the corresponding type tag, followed by the integer 159 | part, a marker indicating the presence of a fraction part, and the fraction 160 | part, if any. The integer part is encoded differently depending on the size 161 | of the value. The fraction part is encoded as a binary (without the 'binary' 162 | type tag). 163 | 164 | 165 | ### 2.5.1 Positive small integers, pos4 ### 166 | 167 | Integers up to 31 bits are encoded as << ?pos4, I:31, F:1 >> 168 | where I is the integer value, and F is 1 if a fraction part follows; 169 | 0 otherwise. 170 | 171 | 172 | ### 2.5.2 Positive large integers ### 173 | 174 | Larger integers are converted to a byte string and then encoded like 175 | binaries (without the 'binary' type tag), followed by a byte signifying 176 | whether a fraction part follows (1 if yes; 0 otherwise). 
177 | 178 | ```erlang 179 | Bytes = encode_big(I), 180 | << ?pos_big, Bytes/binary, F:8 >> 181 | ``` 182 | 183 | 184 | ### 2.5.3 Fraction part of positive numbers ### 185 | 186 | The representation of floating point numbers is based on the [IEEE 754 Binary 64 standard representation](http://en.wikipedia.org/wiki/Double_precision_floating-point_format). This is also the representation used by Erlang: 187 | 188 | ```erlang 189 | <<Sign:1, Exponent:11, Fraction:52>> = <<F/float>> 190 | ``` 191 | 192 | The encoding extracts the integer part and encodes it as a positive integer 193 | (either pos4 or pos_big), flags the presence of a fraction part, and encodes 194 | the fraction part as a binary (without the binary tag). 195 | 196 | 197 | ## 2.6 Negative Numbers ## 198 | 199 | 200 | ### 2.6.1 Small negative numbers ### 201 | 202 | ```erlang 203 | << ?neg4:8, IRep:31, F:1 >> 204 | ``` 205 | 206 | A negative number I is encoded as IRep = Max + I, where Max is the largest 207 | possible number that can be represented with the number of bits present for 208 | the given subtype. For example, Max for neg4 is 0x7FFF FFFF (31 bits). 209 | Keep in mind that I < 0. 210 | 211 | The fraction flag is inverted, compared to the pos4 representation, so it will 212 | be 1 if there is no fraction part; 0 otherwise. 213 | 214 | 215 | ### 2.6.2 Large negative numbers ### 216 | 217 | Larger negative numbers are encoded as: 218 | 219 | ```erlang 220 | encode_negbig(I) -> 221 | {Words, Max} = get_max(-I), 222 | Bin = encode_bin_elems(list_to_binary(encode_big(Max + I))), 223 | WordsRep = 16#FFFFffff - Words, 224 | << ?neg_big:8, WordsRep:32, Bin/binary, F:8 >>. 225 | ``` 226 | 227 | That is, get_max() figures out how many 64-bit words are needed to represent 228 | -I (the positive number), and also gives the maximum value that can be 229 | represented in so many words. WordsRep in essence becomes a sub-subtag of 230 | the negative bignum.
231 | 232 | 233 | ### 2.6.3 Fraction of negative numbers ### 234 | 235 | The fraction is encoded almost like the inverse of the positive fraction 236 | (as a "negative binary", if such a thing existed). Each byte is padded with 237 | a 0-bit rather than a 1-bit, and the byte itself is replaced by 16#ff - Byte. 238 | The sequence is then padded with 1s to become a multiple of 8 bits. 239 | 240 | The last byte, denoting the number of significant bits in the last byte, 241 | is similarly inverted. 242 | 243 | 244 | ## 2.7 Atoms ## 245 | 246 | Atoms are encoded as the atom tag, followed by the string representation of 247 | the atom using the binary encoding described above (but without the binary 248 | tag). 249 | 250 | 251 | ## 2.8 References ## 252 | 253 | The encoding of references is perhaps best described by the code: 254 | 255 | ```erlang 256 | encode_ref(R) -> 257 | RBin = term_to_binary(R), 258 | <<131,114,_Len:16,100,NLen:16,Name:NLen/binary,Rest/binary>> = RBin, 259 | NameEnc = encode_bin_elems(Name), 260 | RestEnc = encode_bin_elems(Rest), 261 | <<?reference, NameEnc/binary, RestEnc/binary>>. 262 | ``` 263 | 264 | where encode_bin_elems(B) encodes the argument B the same way as a binary 265 | (excluding the 'binary' type tag). 266 | 267 | 268 | ## 2.9 Ports ## 269 | 270 | The encoding of ports is perhaps best described by the code: 271 | 272 | ```erlang 273 | encode_port(P) -> 274 | PBin = term_to_binary(P), 275 | <<131,102,100,ALen:16,Name:ALen/binary,Rest:5/binary>> = PBin, 276 | NameEnc = encode_bin_elems(Name), 277 | <<?port, NameEnc/binary, Rest/binary>>. 278 | ``` 279 | 280 | 281 | ## 2.10 Pids ## 282 | 283 | The encoding of pids is perhaps best described by the code: 284 | 285 | ```erlang 286 | encode_pid(P) -> 287 | PBin = term_to_binary(P), 288 | <<131,103,100,ALen:16,Name:ALen/binary,Rest:9/binary>> = PBin, 289 | NameEnc = encode_bin_elems(Name), 290 | <<?pid, NameEnc/binary, Rest/binary>>. 291 | ``` 292 | 293 | 294 | ## 2.11 Maps ## 295 | 296 | The encoding of maps is currently experimental. 297 | Maps sort between tuples and lists.
Since the smallest list is represented 298 | by `<<17, 2>>`, maps encoding starts with `<<17, 1>>` (introducing a new tag 299 | would break backwards compatibility), followed by the size of the map (4 bytes), 300 | and each Key-Value pair in the map. 301 | 302 | 303 | ## Modules ## 304 | 305 | 306 | 307 |
sext
308 | 309 | -------------------------------------------------------------------------------- /doc/edoc-info: -------------------------------------------------------------------------------- 1 | %% encoding: UTF-8 2 | {application,sext}. 3 | {packages,[]}. 4 | {modules,[sext]}. 5 | -------------------------------------------------------------------------------- /doc/erlang.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uwiger/sext/c22486add9cc374dc8138b1f547c0999a1922a65/doc/erlang.png -------------------------------------------------------------------------------- /doc/overview.edoc: -------------------------------------------------------------------------------- 1 | @author Ulf Wiger 2 | @doc A sortable serialization library 3 | This library offers a serialization format (a la term_to_binary()) that 4 | preserves the Erlang term order. 5 | 6 |
  7 | Copyright 2010 Erlang Solutions Ltd.
  8 | 
  9 | Licensed under the Apache License, Version 2.0 (the "License");
 10 | you may not use this file except in compliance with the License.
 11 | You may obtain a copy of the License at
 12 | 
 13 | http://www.apache.org/licenses/LICENSE-2.0
 14 | 
 15 | Unless required by applicable law or agreed to in writing, software
 16 | distributed under the License is distributed on an "AS IS" BASIS,
 17 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 18 | See the License for the specific language governing permissions and
 19 | limitations under the License.
 20 | 
21 | 22 |

1. Introduction

23 | 24 | The idea for this library came out of the need for disk-based storage 25 | with ordered_set semantics in Erlang. One previous solution used Tokyo Cabinet, 26 | in which a C routine is used to hook into the sorting logic of TC. 27 | 28 | I thought a more generic solution would be to be able to have a version 29 | of term_to_binary() that respected the ordering semantics of Erlang terms. 30 | 31 | A new addition is support for 'sb32' encoding. This is my own version of 32 | Base32 encoding, with a slightly different alphabet, in order to preserve 33 | sorting properties while generating octet strings that are perfectly safe 34 | to use in file names. 35 | 36 | Another feature is "prefix encoding", which encodes a term and truncates 37 | the result if it encounters a "wildcard" (e.g. '$1' 38 | or '_'). This is to enable a convenient and efficient mapping 39 | of Erlang match specifications to e.g. prefix matching on the external storage 40 | and subsequent match_spec matching on the found Erlang terms. 41 | 42 | The serialization format supports all Erlang types, and preserves the 43 | internal Erlang term order, with a few exceptions: 44 | 45 | * Floats are represented based on the IEEE 754 Binary 64 standard 46 | representation. This is the representation used by Erlang, specifically 47 | the representation used when encoding floats in binaries. To be exact, 48 | `sext' first normalizes the float by encoding it as an Erlang binary, then 49 | serializes it. 50 | 51 | * In Erlang, integers are cast to floats before comparing them to a float. 52 | This means e.g. that the relative sort order of `1' and `1.0' is undefined. 53 | It is not possible for `sext' to preserve this ambiguity after serialization, 54 | since it could only be done by producing identical encodings for the two 55 | terms, thereby sacrificing the property that encoding a value and then 56 | decoding it again, should produce the initial value. 57 | 58 |

2. Specification

59 | 60 |

2.1 Type tags

61 | 62 | Each data type is encoded using a type tag (1 byte) that represents its order 63 | in the global Erlang term ordering. The number type is divided into several 64 | subtypes, to facilitate a reasonably efficient representation: 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 |
TypeDescriptionTag
negbigNegative bignum8
neg4Negative 31-bit integer9
pos4Positive 31-bit integer10
posbigPositive bignum11
atomObj of type atom()12
referenceObj of type reference()13
portObj of type port()14
pidObj of type pid()15
tupleObj of type tuple()16
mapObj of type map()17, 1
listObj of type list()17
binaryObj of type binary()18
bin_tailImproper-tail marker followed by binary or bitstring19
138 | 139 |

2.2 Tuples

140 | 141 | Tuples are encoded as the tuple tag, followed by a 32-bit size element, 142 | denoting the number of elements in the tuple, followed by each element 143 | in the tuple individually encoded. 144 | 145 |

2.3 Lists

146 | 147 | Lists are encoded as the list tag, followed by each element in the list 148 | individually encoded, followed by the number 2 (1 byte). 149 | 150 | Improper lists, e.g. `[1,2|3]', have the number 1 inserted before the improper 151 | tail. Since this also indicates the last element in the list, no end byte 152 | is needed. This ensures that it sorts *before* any corresponding proper list, 153 | as long as the improper tail is not a binary (binaries are greater than the 154 | missing 'cons', or list, cell). 155 | 156 | Improper lists that have a binary or bitstring as 'tail', e.g. `[1,2|<<1>>]', 157 | have a ?bin_tail (code 19) inserted before the tail. This ensures that it 158 | sorts after a corresponding proper list. 159 | 160 |

2.4 Binaries and bitstrings

161 | 162 | A binary is basically a bitstring whose size is a multiple of 8. From a sorting 163 | perspective, binaries and bitstrings are both sorted as left-aligned bit 164 | arrays. 165 | 166 |
 bitstring_to_list(<<11111111111:11>>).
167 | [56,<<7:3>>]]]>
168 | 169 | Binaries and bitstrings are encoded as the binary tag, followed by each whole 170 | byte, each padded with a leading 1 (one bit), followed by a number of 0-bits 171 | to pad the size to a multiple of 8 bits again, followed by a byte whose 172 | value is Bits, where Bits is the number of "remainder bits"; 8 if the original 173 | binary is 8-bit aligned. 174 | 175 | Example: 176 | 177 |
 sext:encode(<<1,2,3>>).
178 | <<18,128,192,160,96,8>>
179 | 3> <<18, 1:1,1, 1:1,2, 1:1,3, 0:5, 8>>.
180 | <<18,128,192,160,96,8>>]]>
181 | 182 | In the example above, we inserted 3 1-bits, and therefore had to insert 5 more 183 | pad bits (zeroes) at the end. The last byte is 8, signifying that the original 184 | binary was 8-bit aligned. 185 | 186 | If the remainder is not an even 8 bits, the remainder bits are padded with 187 | a 1-bit, just like the others, then left-aligned and padded up to a whole 188 | byte (excluding the 1-bit added in front). 189 | The value of the last byte is the bit size of the remainder. 190 | 191 | Example: 192 | 193 |
 sext:encode(<<1,2,3,4:3>>).
194 | <<18,128,192,160,120,0,3>>
195 | 5> <<18, 1:1,1, 1:1,2, 1:1,3, 1:1,4:3,0:5, 0:4, 3>>.
196 | <<18,128,192,160,120,0,3>>]]>
197 | 198 | The first part of the bitstring is encoded exactly like above. The number 4:3 199 | is first padded with 1 then padded at the end to become a whole byte. Then 200 | an additional pad, 0:4, is inserted to compensate for the fact that we have 201 | inserted 4 1-bits. Finally, the last byte is 3, to signify the size of the 202 | remainder. 203 | 204 |

2.5 Positive Numbers

205 | 206 | Numbers are encoded as the corresponding type tag, followed by the integer 207 | part, a marker indicating the presence of a fraction part, and the fraction 208 | part, if any. The integer part is encoded differently depending on the size 209 | of the value. The fraction part is encoded as a binary (without the 'binary' 210 | type tag). 211 | 212 |

2.5.1 Positive small integers, pos4

213 | 214 | Integers up to 31 bits are encoded as << ?pos4, I:31, F:1 >> 215 | where I is the integer value, and F is 1 if a fraction part follows; 216 | 0 otherwise. 217 | 218 |

2.5.2 Positive large integers

219 | 220 | Larger integers are converted to a byte string and then encoded like 221 | binaries (without the 'binary' type tag), followed by a byte signifying 222 | whether a fraction part follows (1 if yes; 0 otherwise). 223 | 224 |
>]]>
226 | 227 |

2.5.3 Fraction part of positive numbers

228 | 229 | The representation of floating point numbers is based on the IEEE 754 Binary 64 standard representation. This is also the representation used by Erlang: 230 | 231 |
> = <>]]>
232 | 233 | The encoding extracts the integer part and encodes it as a positive integer 234 | (either pos4 or pos_big), flags the presence of a fraction part, and encodes 235 | the fraction part as a binary (without the binary tag). 236 | 237 |

2.6 Negative Numbers

238 | 239 |

2.6.1 Small negative numbers

240 | 241 |
>]]>
242 | 243 | A negative number I is encoded as IRep = Max + I, where Max is the largest 244 | possible number that can be represented with the number of bits present for 245 | the given subtype. For example, Max for neg4 is 0x7FFF FFFF (31 bits). 246 | Keep in mind that I < 0. 247 | 248 | The fraction flag is inverted, compared to the pos4 representation, so it will 249 | be 1 if there is no fraction part; 0 otherwise. 250 | 251 |

2.6.2 Large negative numbers

252 | 253 | Larger negative numbers are encoded as: 254 | 255 |

256 |     {Words, Max} = get_max(-I),
257 |     Bin = encode_bin_elems(list_to_binary(encode_big(Max + I))),
258 |     WordsRep = 16#FFFFffff - Words,
259 |     << ?neg_big:8, WordsRep:32, Bin/binary, F:8 >>.]]>
260 | 261 | That is, get_max() figures out how many 64-bit words are needed to represent 262 | -I (the positive number), and also gives the maximum value that can be 263 | represented in so many words. WordsRep in essence becomes a sub-subtag of 264 | the negative bignum. 265 | 266 |

2.6.3 Fraction of negative numbers

267 | 268 | The fraction is encoded almost like the inverse of the positive fraction 269 | (as a "negative binary", if such a thing existed). Each byte is padded with 270 | a 0-bit rather than a 1-bit, and the byte itself is replaced by 16#ff - Byte. 271 | The sequence is then padded with 1s to become a multiple of 8 bits. 272 | 273 | The last byte, denoting the number of significant bits in the last byte, 274 | is similarly inverted. 275 | 276 |

2.7 Atoms

277 | 278 | Atoms are encoded as the atom tag, followed by the string representation of 279 | the atom using the binary encoding described above (but without the binary 280 | tag). 281 | 282 |

2.8 References

283 | 284 | The encoding of references is perhaps best described by the code: 285 | 286 |

287 |     RBin = term_to_binary(R),
288 |     <<131,114,_Len:16,100,NLen:16,Name:NLen/binary,Rest/binary>> = RBin,
289 |     NameEnc = encode_bin_elems(Name),
290 |     RestEnc = encode_bin_elems(Rest),
291 |     <>.]]>
292 | 293 | where encode_bin_elems(B) encodes the argument B the same way as a binary 294 | (excluding the 'binary' type tag). 295 | 296 |

2.9 Ports

297 | 298 | The encoding of ports is perhaps best described by the code: 299 | 300 |

301 |     PBin = term_to_binary(P),
302 |     <<131,102,100,ALen:16,Name:ALen/binary,Rest:5/binary>> = PBin,
303 |     NameEnc = encode_bin_elems(Name),
304 |     <>.]]>
305 | 306 |

2.10 Pids

307 | 308 | The encoding of pids is perhaps best described by the code: 309 | 310 |

311 |     PBin = term_to_binary(P),
312 |     <<131,103,100,ALen:16,Name:ALen/binary,Rest:9/binary>> = PBin,
313 |     NameEnc = encode_bin_elems(Name),
314 |     <>.]]>
315 | 316 |

2.11 Maps

317 | 318 | The encoding of maps is currently experimental. 319 | 320 | Maps sort between tuples and lists. Since the smallest list is represented 321 | by `<<17, 2>>', maps encoding starts with `<<17, 1>>' (introducing a new tag 322 | would break backwards compatibility), followed by the size of the map (4 bytes), 323 | and each Key-Value pair in the map. 324 | 325 | @end 326 | -------------------------------------------------------------------------------- /doc/sext.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Module sext # 4 | * [Description](#description) 5 | * [Function Index](#index) 6 | * [Function Details](#functions) 7 | 8 | 9 | Sortable serialization library. 10 | __Authors:__ Ulf Wiger ([`ulf@wiger.net`](mailto:ulf@wiger.net)). 11 | 12 | 13 | ## Function Index ## 14 | 15 | 16 |
decode/1Decodes a binary generated using the function sext:encode/1.
decode_hex/1
decode_next/1Decode a binary stream, returning the next decoded term and the 17 | stream remainder.
decode_sb32/1Decodes a binary generated using the function encode_sb32/1.
encode/1Encodes any Erlang term into a binary.
encode/2Encodes an Erlang term using legacy bignum encoding.
encode_hex/1Encodes any Erlang term into a hex-encoded binary.
encode_sb32/1Encodes any Erlang term into an sb32-encoded binary.
from_hex/1Converts from a hex-encoded binary into a 'normal' binary.
from_sb32/1Converts from an sb32-encoded bitstring into a 'normal' bitstring.
partial_decode/1Decode a sext-encoded term or prefix embedded in a byte stream.
prefix/1Encodes a binary for prefix matching of similar encoded terms.
prefix_hex/1Generates a hex-encoded binary for prefix matching.
prefix_sb32/1Generates an sb32-encoded binary for prefix matching.
to_hex/1Converts a binary into a hex-encoded binary 18 | This is conventional hex encoding, with the proviso that 19 | only capital letters are used, e.g.
to_sb32/1Converts a bitstring into an sb-encoded bitstring.
20 | 21 | 22 | 23 | 24 | ## Function Details ## 25 | 26 | 27 | 28 | ### decode/1 ### 29 | 30 | 31 |

 32 | decode(B::binary()) -> term()
 33 | 
34 |
35 | 36 | Decodes a binary generated using the function [`sext:encode/1`](sext.md#encode-1). 37 | 38 | 39 | ### decode_hex/1 ### 40 | 41 | `decode_hex(Data) -> any()` 42 | 43 | 44 | 45 | 46 | ### decode_next/1 ### 47 | 48 | 49 |

 50 | decode_next(X1::Bin) -> {N, Rest}
 51 | 
52 |
53 | 54 | 55 | Decode a binary stream, returning the next decoded term and the 56 | stream remainder 57 | 58 | 59 | This function will raise an exception if the beginning of `Bin` is not 60 | a valid sext-encoded term. 61 | 62 | 63 | ### decode_sb32/1 ### 64 | 65 | `decode_sb32(Data) -> any()` 66 | 67 | Decodes a binary generated using the function [`encode_sb32/1`](#encode_sb32-1). 68 | 69 | 70 | ### encode/1 ### 71 | 72 | 73 |

 74 | encode(T::term()) -> binary()
 75 | 
76 |
77 | 78 | Encodes any Erlang term into a binary. 79 | The lexical sorting properties of the encoded binary match those of the 80 | original Erlang term. That is, encoded terms sort the same way as the 81 | original terms would. 82 | 83 | 84 | ### encode/2 ### 85 | 86 | 87 |

 88 | encode(T::term(), Legacy::boolean()) -> binary()
 89 | 
90 |
91 | 92 | 93 | Encodes an Erlang term using legacy bignum encoding. 94 | On March 4 2013, Basho noticed that encoded bignums didn't always sort 95 | properly. This bug has been fixed, but the encoding of bignums necessarily 96 | changed in an incompatible way. 97 | 98 | 99 | 100 | The new decode/1 version can read the old bignum format, but the old 101 | version obviously cannot read the new. Using `encode(Term, true)`, the term 102 | will be encoded using the old format. 103 | 104 | 105 | Use only as transition support. This function will be deprecated in time. 106 | 107 | 108 | ### encode_hex/1 ### 109 | 110 | 111 |

112 | encode_hex(Term::any()) -> binary()
113 | 
114 |
115 | 116 | 117 | Encodes any Erlang term into a hex-encoded binary. 118 | This is similar to [`encode/1`](#encode-1), but produces an octet string that 119 | can be used without escaping in file names (containing only the characters 120 | 0..9 and A..F). The sorting properties are preserved. 121 | 122 | 123 | Note: The encoding used is regular hex-encoding, with the proviso that only 124 | capital letters are used (mixing upper- and lowercase characters would break 125 | the sorting property). 126 | 127 | 128 | ### encode_sb32/1 ### 129 | 130 | 131 |

132 | encode_sb32(Term::any()) -> binary()
133 | 
134 |
135 | 136 | 137 | Encodes any Erlang term into an sb32-encoded binary. 138 | This is similar to [`encode/1`](#encode-1), but produces an octet string that 139 | can be used without escaping in file names (containing only the characters 140 | 0..9, A..V and '-'). The sorting properties are preserved. 141 | 142 | 143 | Note: The encoding used is inspired by the base32 encoding described in 144 | RFC3548, but uses a different alphabet in order to preserve the sort order. 145 | 146 | 147 | ### from_hex/1 ### 148 | 149 | 150 |

151 | from_hex(Bin::binary()) -> binary()
152 | 
153 |
154 | 155 | 156 | Converts from a hex-encoded binary into a 'normal' binary 157 | 158 | 159 | This function is the reverse of [`to_hex/1`](#to_hex-1). 160 | 161 | 162 | 163 | ### from_sb32/1 ### 164 | 165 | 166 |

167 | from_sb32(Bits::bitstring()) -> bitstring()
168 | 
169 |
170 | 171 | 172 | Converts from an sb32-encoded bitstring into a 'normal' bitstring 173 | 174 | 175 | This function is the reverse of [`to_sb32/1`](#to_sb32-1). 176 | 177 | 178 | ### partial_decode/1 ### 179 | 180 | 181 |

182 | partial_decode(Other::Bytes) -> {full | partial, DecodedTerm, Rest}
183 | 
184 |
185 | 186 | 187 | Decode a sext-encoded term or prefix embedded in a byte stream. 188 | 189 | 190 | Example: 191 | 192 | ``` 193 | 1> T = sext:encode({a,b,c}). 194 | <<16,0,0,0,3,12,176,128,8,12,177,0,8,12,177,128,8>> 195 | 2> sext:partial_decode(<<T/binary, "tail">>). 196 | {full,{a,b,c},<<"tail">>} 197 | 3> P = sext:prefix({a,b,'_'}). 198 | <<16,0,0,0,3,12,176,128,8,12,177,0,8>> 199 | 4> sext:partial_decode(<<P/binary, "tail">>). 200 | {partial,{a,b,'_'},<<"tail">>} 201 | ``` 202 | 203 | 204 | 205 | Note that a decoded prefix may not be exactly like the encoded prefix. 206 | For example, `['_']` will be encoded as 207 | `<<17>>`, i.e. only the 'list' opcode. The 208 | decoded prefix will be `'_'`, since the encoded prefix would 209 | also match the empty list. The decoded prefix will always be a prefix to 210 | anything to which the original prefix is a prefix. 211 | 212 | 213 | For tuples, `{1,'_',3}` encoded and decoded, will result in 214 | `{1,'_','_'}`, i.e. the tuple size is kept, but the elements 215 | after the first wildcard are replaced with wildcards. 216 | 217 | 218 | ### prefix/1 ### 219 | 220 | 221 |

222 | prefix(X::term()) -> binary()
223 | 
224 |
225 | 226 | Encodes a binary for prefix matching of similar encoded terms. 227 | Lists and tuples can be prefixed by using the `'_'` marker, 228 | similarly to Erlang match specifications. For example: 229 | 230 | * `prefix({1,2,'_','_'})` will result in a binary that is 231 | the same as the first part of any encoded 4-tuple with the first two 232 | elements being 1 and 2. The prefix algorithm will search for the 233 | first `'_'`, and treat all following elements as if they 234 | were `'_'`. 235 | 236 | * `prefix([1,2|'_'])` will result in a binary that is the 237 | same as the first part of any encoded list where the first two elements 238 | are 1 and 2. `prefix([1,2,'_'])` will give the same result, 239 | as the prefix pattern is the same for all lists starting with 240 | `[1,2|...]`. 241 | 242 | * `prefix(Binary)` will result in a binary that is the same as the 243 | encoded version of Binary, except that, instead of padding and 244 | terminating, the encoded binary is truncated to the longest byte-aligned 245 | binary. The same is done for bitstrings. 246 | 247 | * `prefix({1,[1,2|'_'],'_'})` will prefix-encode the second 248 | element, and let it end the resulting binary. This prefix will match 249 | any 3-tuple where the first element is 1 and the second element is a 250 | list where the first two elements are 1 and 2. 251 | 252 | * `prefix([1,[1|'_']|'_'])` will result in a prefix that 253 | matches all lists where the first element is 1 and the second element is 254 | a list where the first element is 1. 255 | 256 | * For all other data types, the prefix is the same as the encoded term. 257 | 258 | 259 | 260 | 261 | ### prefix_hex/1 ### 262 | 263 | 264 |

265 | prefix_hex(X::term()) -> binary()
266 | 
267 |
268 | 269 | Generates a hex-encoded binary for prefix matching. 270 | This is similar to [`prefix/1`](#prefix-1), but generates a prefix for binaries 271 | encoded with [`encode_hex/1`](#encode_hex-1), rather than [`encode/1`](#encode-1). 272 | 273 | 274 | ### prefix_sb32/1 ### 275 | 276 | 277 |

278 | prefix_sb32(X::term()) -> binary()
279 | 
280 |
281 | 282 | Generates an sb32-encoded binary for prefix matching. 283 | This is similar to [`prefix/1`](#prefix-1), but generates a prefix for binaries 284 | encoded with [`encode_sb32/1`](#encode_sb32-1), rather than [`encode/1`](#encode-1). 285 | 286 | 287 | ### to_hex/1 ### 288 | 289 | 290 |

291 | to_hex(Bin::binary()) -> binary()
292 | 
293 |
294 | 295 | Converts a binary into a hex-encoded binary 296 | This is conventional hex encoding, with the proviso that 297 | only capital letters are used, e.g. `0..9A..F`. 298 | 299 | 300 | ### to_sb32/1 ### 301 | 302 | 303 |

304 | to_sb32(Bits::bitstring()) -> binary()
305 | 
306 |
307 | 308 | 309 | Converts a bitstring into an sb-encoded bitstring 310 | 311 | 312 | 313 | sb32 (Sortable base32) is a variant of RFC3548, slightly rearranged to 314 | preserve the lexical sorting properties. Base32 was chosen to avoid 315 | filename-unfriendly characters. Also important is that the padding 316 | character be less than any character in the alphabet 317 | 318 | 319 | sb32 alphabet: 320 | 321 | ``` 322 | 323 | 0 0 6 6 12 C 18 I 24 O 30 U 324 | 1 1 7 7 13 D 19 J 25 P 31 V 325 | 2 2 8 8 14 E 20 K 26 Q (pad) - 326 | 3 3 9 9 15 F 21 L 27 R 327 | 4 4 10 A 16 G 22 M 28 S 328 | 5 5 11 B 17 H 23 N 29 T 329 | ``` 330 | 331 | -------------------------------------------------------------------------------- /doc/stylesheet.css: -------------------------------------------------------------------------------- 1 | /* standard EDoc style sheet */ 2 | body { 3 | font-family: Verdana, Arial, Helvetica, sans-serif; 4 | margin-left: .25in; 5 | margin-right: .2in; 6 | margin-top: 0.2in; 7 | margin-bottom: 0.2in; 8 | color: #000000; 9 | background-color: #ffffff; 10 | } 11 | h1,h2 { 12 | margin-left: -0.2in; 13 | } 14 | div.navbar { 15 | background-color: #add8e6; 16 | padding: 0.2em; 17 | } 18 | h2.indextitle { 19 | padding: 0.4em; 20 | background-color: #add8e6; 21 | } 22 | h3.function,h3.typedecl { 23 | background-color: #add8e6; 24 | padding-left: 1em; 25 | } 26 | div.spec { 27 | margin-left: 2em; 28 | background-color: #eeeeee; 29 | } 30 | a.module,a.package { 31 | text-decoration:none 32 | } 33 | a.module:hover,a.package:hover { 34 | background-color: #eeeeee; 35 | } 36 | ul.definitions { 37 | list-style-type: none; 38 | } 39 | ul.index { 40 | list-style-type: none; 41 | background-color: #eeeeee; 42 | } 43 | 44 | /* 45 | * Minor style tweaks 46 | */ 47 | ul { 48 | list-style-type: square; 49 | } 50 | table { 51 | border-collapse: collapse; 52 | } 53 | td { 54 | padding: 3 55 | } 56 | -------------------------------------------------------------------------------- 
/examples/tt_proto.erl: -------------------------------------------------------------------------------- 1 | %%============================================================================== 2 | %% Copyright 2010 Erlang Solutions Ltd. 3 | %% 4 | %% Licensed under the Apache License, Version 2.0 (the "License"); 5 | %% you may not use this file except in compliance with the License. 6 | %% You may obtain a copy of the License at 7 | %% 8 | %% http://www.apache.org/licenses/LICENSE-2.0 9 | %% 10 | %% Unless required by applicable law or agreed to in writing, software 11 | %% distributed under the License is distributed on an "AS IS" BASIS, 12 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | %% See the License for the specific language governing permissions and 14 | %% limitations under the License. 15 | %%============================================================================== 16 | %% 17 | %% @author Ulf Wiger 18 | %% @doc Bare-bones Tokyo Tyrant interface library. 19 | %% This is an example to illustrate the use of Sortable EXternal Term (sext) 20 | %% encoding. 21 | %% 22 | %% Tokyo Tyrant (TT) is an add-on 23 | %% to Tokyo Cabinet, adding 24 | %% support for concurrent and remote access to Tokyo Cabinet (TC) through a 25 | %% TCP socket interface. TC supports storage of variable-length byte strings 26 | %% as key-value pairs. The storage type can either be RAM-only or disk, and 27 | %% either hash table or B-tree. 28 | %% 29 | %% Using sext-encoded terms in combination with TT's B-tree storage, it is 30 | %% possible to store very large amounts of data on disk while honoring the 31 | %% Erlang Term ordering semantics. Using the `sext:prefix/1' function, it is 32 | %% also possible to perform efficient range queries. 33 | %% 34 | %% Tokyo Tyrant is easy to install and get running. This module does not show 35 | %% how that is done, nor does it automate the task of starting a TT server.
36 | %% 37 | %% @end 38 | -module(tt_proto). 39 | 40 | -behaviour(gen_server). 41 | 42 | -export([open/2, 43 | put/3, 44 | get/2, 45 | mget/2, 46 | keys/2]). 47 | 48 | %% internal exports 49 | -export([init/1, 50 | handle_call/3, 51 | handle_cast/2, 52 | handle_info/2, 53 | terminate/2, 54 | code_change/3]). 55 | 56 | -compile(export_all). 57 | 58 | -define(DEFAULT_PORT, 1978). 59 | 60 | -record(st, {socket}). 61 | 62 | %% @spec open(Name, Opts) -> {ok, pid()} 63 | %% Opts = [Opt] 64 | %% Opt = {regname,atom()} | {port, integer()} 65 | %% 66 | %% @doc Connects to a running Tokyo Tyrant database server. 67 | %% The default port, 1978, will be used unless another port is specified. 68 | %% If the `regname' option is present, the Tokyo Tyrant proxy process will 69 | %% register itself under that name, and the registered name can be used as 70 | %% an alias when accessing the database. 71 | %% @end 72 | %% 73 | open(Name, Opts) -> 74 | case lists:keyfind(regname, 1, Opts) of 75 | false -> 76 | gen_server:start_link(?MODULE, {Name, Opts}, []); 77 | {_,RegName} -> 78 | gen_server:start_link({local,RegName}, ?MODULE, 79 | {Name, Opts}, []) 80 | end. 81 | 82 | %% @spec put(TT, Key::term(), Value::term()) -> ok | {error, Reason} 83 | %% @doc Inserts a `{Key,Value}' tuple in the database TT. 84 | %% @end 85 | %% 86 | put(TT, Key, Value) -> 87 | cmd(TT, {put, encode(Key), encode(Value)}). 88 | 89 | %% @spec get(TT, Key::term()) -> {ok, Value} | {error, Reason} 90 | %% @doc Looks up Key in the database TT. 91 | %% Returns `{ok,Value}' if found, otherwise `{error,Reason}'. 92 | %% @end 93 | %% 94 | get(TT, Key) -> 95 | case ask(TT, {get, encode(Key)}) of 96 | {ok, Vb} -> 97 | {ok, decode(Vb)}; 98 | Err -> 99 | Err 100 | end. 101 | 102 | %% @spec mget(TT, Keys::[term()]) -> {ok, [{K,V}]} | {error,Reason} 103 | %% @doc Fetches multiple objects from the database TT. 104 | %% All objects matching the list of keys will be returned. 
If no objects match, 105 | %% the return value will be `{ok, []}'. 106 | %% @end 107 | %% 108 | mget(TT, Keys) when is_list(Keys) -> 109 | Enc = [encode(K) || K <- Keys], 110 | case ask(TT, {mget, Enc}) of 111 | {ok, KVs} -> 112 | {ok, [{decode(K),decode(V)} || {K,V} <- KVs]}; 113 | Err -> 114 | Err 115 | end. 116 | 117 | %% @spec keys(TT, Prefix) -> {ok, Keys} | {error, Reason} 118 | %% @doc Performs a prefix search in database TT based on Prefix. 119 | %% For details on Prefix, @see sext:prefix/1. 120 | %% @end 121 | %% 122 | keys(TT, Prefix) -> 123 | case ask(TT, {keys, encode_prefix(Prefix), 100}) of 124 | {ok, Keys} -> 125 | {ok, [decode(K) || K <- Keys]}; 126 | Err -> 127 | Err 128 | end. 129 | 130 | 131 | %% Tell TokyoTyrant to perform an operation. No reply other than 132 | %% 0 (success), or non-zero (failure). 133 | %% 134 | cmd(TT, Req) -> 135 | gen_server:call(TT, {cmd, Req}). 136 | 137 | ask(TT, Req) -> 138 | gen_server:call(TT, {ask, Req}). 139 | 140 | encode(Term) -> 141 | sext:encode(Term). 142 | 143 | 144 | decode(Bin) -> 145 | sext:decode(Bin). 146 | 147 | encode_prefix(Term) -> 148 | sext:prefix(Term). 149 | 150 | 151 | %% @hidden 152 | init({_Name, Opts}) -> 153 | %% TTName = tt_name(Name, Opts), 154 | Port = proplists:get_value(port, Opts, ?DEFAULT_PORT), 155 | case gen_tcp:connect({127,0,0,1}, Port, [binary,{active,false}, 156 | {nodelay,true}]) of 157 | {ok, Socket} -> 158 | {ok, #st{socket = Socket}}; 159 | Error -> 160 | Error 161 | end. 162 | 163 | %% @hidden 164 | handle_call({cmd, Req}, _From, #st{socket = Sock} = S) -> 165 | Msg = mk_req(Req), 166 | gen_tcp:send(Sock, Msg), 167 | Reply = cmd_reply(Sock), 168 | {reply, Reply, S}; 169 | handle_call({ask, Req}, _From, #st{socket = Sock} = S) -> 170 | Msg = mk_req(Req), 171 | gen_tcp:send(Sock, Msg), 172 | Reply = ask_reply(Req, Sock), 173 | {reply, Reply, S}. 174 | 175 | 176 | %% @hidden 177 | handle_info(Msg, S) -> 178 | io:fwrite("handle_info(~p, ~p)~n", [Msg, S]), 179 | {noreply, S}. 

%% @hidden
%% No casts are part of the protocol; any cast stops the proxy.
handle_cast(_, S) ->
    {stop, unknown_cast, S}.

%% @hidden
terminate(Reason, S) ->
    io:fwrite("terminate(~p, ~p)~n", [Reason, S]).

%% @hidden
code_change(_FromVsn, S, _Extra) ->
    {ok, S}.


%% Build the wire request for an operation. Every request starts with the
%% byte 16#c8 followed by a command byte; all sizes and counts are packed
%% as 32-bit big-endian integers ahead of the payload.
mk_req({put, K, V}) ->
    KSz = byte_size(K),
    VSz = byte_size(V),
    << 16#c8, 16#10, KSz:32, VSz:32, K/binary, V/binary >>;
mk_req({get, K}) ->
    KSz = byte_size(K),
    << 16#c8, 16#30, KSz:32, K/binary >>;
mk_req({mget, Ks}) ->
    N = length(Ks),
    Packed = pack_values(Ks),
    << 16#c8, 16#31,
       N:32, Packed/binary >>;
mk_req({keys, Prefix, Limit}) ->
    PSz = byte_size(Prefix),
    << 16#c8, 16#58, PSz:32, Limit:32, Prefix/binary >>.

%% Concatenate the binaries in Values, each preceded by its 32-bit size.
pack_values(Values) ->
    pack_values(Values, <<>>).

pack_values([H|T], Acc) ->
    Sz = byte_size(H),
    Bin = << Sz:32, H/binary >>,
    pack_values(T, << Acc/binary, Bin/binary >>);
pack_values([], Acc) ->
    Acc.


%% Read the one-byte status that answers a command:
%% 0 means success, any other value is returned as `{error, Code}'.
cmd_reply(Sock) ->
    case gen_tcp:recv(Sock, 1) of
        {ok, <<0>>} ->
            ok;
        {ok, <<E>>} ->
            {error, E};
        {error,_} = Err ->
            Err
    end.

%% Read the reply to a data-carrying request. The first byte is the status;
%% on success the remainder is parsed according to the request method, and
%% more data is read from the socket as needed. A receive error during
%% parsing is thrown by the reader and converted to `{error, Reason}' here.
ask_reply(Req, Sock) ->
    Method = element(1, Req),
    case gen_tcp:recv(Sock, 0) of
        {ok, <<0, Rest/binary>>} ->
            try get_reply(Method, Rest, Sock)
            catch
                throw:{error,Reason} ->
                    {error, Reason}
            end;
        {ok, <<E, _/binary>>} ->
            %% non-zero status; anything following it is ignored
            {error, E};
        {error,_} = Err ->
            Err
    end.

%% Parse the payload of a successful reply:
%%   get  - a single value
%%   mget - a 32-bit count followed by that many key/value pairs
%%   keys - a 32-bit count followed by that many keys
get_reply(get, Data, Sock) ->
    {Val, _} = get_value(Data, Sock),
    {ok, Val};
get_reply(mget, Data, Sock) ->
    {N, D1} = get_word(Data, Sock),
    Result = get_N(N, D1, fun get_k_v/2, Sock),
    {ok, Result};
get_reply(keys, Data, Sock) ->
    {N, D1} = get_word(Data, Sock),
    Result = get_N(N, D1, fun get_value/2, Sock),
    {ok, Result}.
257 | 258 | get_word(<>, _Sock) -> 259 | {W, Rest}; 260 | get_word(Sofar, Sock) -> 261 | Bin = get_data(Sock), 262 | get_word(<>, Sock). 263 | 264 | get_value(<>, _Sock) -> 265 | {V, Rest}; 266 | get_value(Sofar, Sock) -> 267 | Bin = get_data(Sock), 268 | get_value(<>, Sock). 269 | 270 | get_k_v(<>, _Sock) -> 271 | {{K,V}, Rest}; 272 | get_k_v(Sofar, Sock) -> 273 | Bin = get_data(Sock), 274 | get_k_v(<>, Sock). 275 | 276 | get_N(0, _, _, _) -> 277 | []; 278 | get_N(N, Data, F, Sock) when N > 0 -> 279 | {Item, Rest} = F(Data, Sock), 280 | [Item | get_N(N-1, Rest, F, Sock)]. 281 | 282 | get_data(Sock) -> 283 | case gen_tcp:recv(Sock, 0) of 284 | {ok, Bin} -> 285 | Bin; 286 | {error,_} = Err -> 287 | throw(Err) 288 | end. 289 | 290 | -------------------------------------------------------------------------------- /rebar.config: -------------------------------------------------------------------------------- 1 | %% -*- erlang -*- 2 | {erl_opts, [debug_info]}. 3 | 4 | {profiles, [{docs, [{deps, 5 | [ 6 | {edown, 7 | {git, 8 | "https://github.com/uwiger/edown.git", 9 | {tag, 10 | "0.8"}}} 11 | ]}, 12 | 13 | {edoc_opts, [{doclet, edown_doclet}, 14 | {packages, 15 | false}, 16 | {subpackages, 17 | true}, 18 | {top_level_readme, 19 | {"./README.md", 20 | "http://github.com/uwiger/sext"}}]}]} 21 | ]}. 22 | -------------------------------------------------------------------------------- /src/sext.app.src: -------------------------------------------------------------------------------- 1 | %% -*- erlang-indent-level: 4; indent-tabs-mode: nil -*- 2 | %%============================================================================== 3 | %% Copyright 2014-16 Ulf Wiger 4 | %% 5 | %% Licensed under the Apache License, Version 2.0 (the "License"); 6 | %% you may not use this file except in compliance with the License. 
7 | %% You may obtain a copy of the License at 8 | %% 9 | %% http://www.apache.org/licenses/LICENSE-2.0 10 | %% 11 | %% Unless required by applicable law or agreed to in writing, software 12 | %% distributed under the License is distributed on an "AS IS" BASIS, 13 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | %% See the License for the specific language governing permissions and 15 | %% limitations under the License. 16 | %%============================================================================== 17 | 18 | %% @author Ulf Wiger 19 | %% @doc Sortable serialization of Erlang terms. 20 | %% @end 21 | {application, sext, 22 | [{description, "Sortable serialization library"}, 23 | {vsn, git}, 24 | {modules, []}, 25 | {registered, []}, 26 | {applications, [kernel, stdlib]}, 27 | {env, []}, 28 | 29 | {maintainers, ["Ulf Wiger"]}, 30 | {licenses, ["Apache 2.0"]}, 31 | {links, [{"Github", "https://github.com/uwiger/sext"}]} 32 | ]}. 33 | -------------------------------------------------------------------------------- /src/sext.erl: -------------------------------------------------------------------------------- 1 | %% -*- erlang-indent-level: 4; indent-tabs-mode: nil 2 | %%============================================================================== 3 | %% Copyright 2014-16 Ulf Wiger 4 | %% 5 | %% Licensed under the Apache License, Version 2.0 (the "License"); 6 | %% you may not use this file except in compliance with the License. 7 | %% You may obtain a copy of the License at 8 | %% 9 | %% http://www.apache.org/licenses/LICENSE-2.0 10 | %% 11 | %% Unless required by applicable law or agreed to in writing, software 12 | %% distributed under the License is distributed on an "AS IS" BASIS, 13 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | %% See the License for the specific language governing permissions and 15 | %% limitations under the License. 
16 | %%============================================================================== 17 | %% 18 | %% @author Ulf Wiger 19 | %% @doc Sortable serialization library 20 | %% @end 21 | -module(sext). 22 | 23 | -export([encode/1, encode/2, decode/1, decode_next/1]). 24 | -export([encode_hex/1, decode_hex/1]). 25 | -export([encode_sb32/1, decode_sb32/1]). 26 | -export([prefix/1, 27 | partial_decode/1]). 28 | -export([prefix_hex/1]). 29 | -export([prefix_sb32/1]). 30 | -export([to_sb32/1, from_sb32/1]). 31 | -export([to_hex/1, from_hex/1]). 32 | 33 | -export([reverse_sext/1]). 34 | 35 | -export([pp/1]). % for debugging only 36 | 37 | -define(rev_sext , 4). 38 | %% 39 | -define(negbig , 8). 40 | -define(neg4 , 9). 41 | -define(pos4 , 10). 42 | -define(posbig , 11). 43 | -define(atom , 12). 44 | -define(reference, 13). 45 | -define(port , 14). 46 | -define(pid , 15). 47 | -define(tuple , 16). 48 | -define(list , 17). 49 | -define(binary , 18). 50 | -define(bin_tail , 19). 51 | 52 | -define(is_sext(X), 53 | X==?negbig; 54 | X==?neg4; 55 | X==?pos4; 56 | X==?posbig; 57 | X==?atom; 58 | X==?reference; 59 | X==?port; 60 | X==?pid; 61 | X==?tuple; 62 | X==?list; 63 | X==?binary; 64 | X==?bin_tail). 65 | 66 | -define(IMAX1, 16#ffffFFFFffffFFFF). 67 | 68 | %% -define(dbg(Fmt,Args), 69 | %% case get(dbg) of 70 | %% true -> io:fwrite("~p: " ++ Fmt, [?LINE|Args]); 71 | %% _ -> no_dbg 72 | %% end). 73 | -define(dbg(F,A),no_debug). 74 | 75 | %% @spec encode(T::term()) -> binary() 76 | %% @doc Encodes any Erlang term into a binary. 77 | %% The lexical sorting properties of the encoded binary match those of the 78 | %% original Erlang term. That is, encoded terms sort the same way as the 79 | %% original terms would. 80 | %% @end 81 | %% 82 | encode(X) -> encode(X, false). 83 | 84 | %% @spec encode(T::term(), Legacy::boolean()) -> binary() 85 | %% @doc Encodes an Erlang term using legacy bignum encoding. 
%% On March 4 2013, Basho noticed that encoded bignums didn't always sort
%% properly. This bug has been fixed, but the encoding of bignums necessarily
%% changed in an incompatible way.
%%
%% The new decode/1 version can read the old bignum format, but the old
%% version obviously cannot read the new. Using `encode(Term, true)', the term
%% will be encoded using the old format.
%%
%% Use only as transition support. This function will be deprecated in time.
%% @end
encode(X, Legacy) when is_tuple(X)    -> encode_tuple(X, Legacy);
encode(X, Legacy) when is_map(X)      -> encode_map(X, Legacy);
encode(X, Legacy) when is_list(X)     -> encode_list(X, Legacy);
encode(X, _) when is_pid(X)           -> encode_pid(X);
encode(X, _) when is_port(X)          -> encode_port(X);
encode(X, _) when is_reference(X)     -> encode_ref(X);
encode(X, Legacy) when is_number(X)   -> encode_number(X, Legacy);
%% binaries are also bitstrings, so the is_binary test must come first
encode(X, _) when is_binary(X)        -> encode_binary(X);
encode(X, _) when is_bitstring(X)     -> encode_bitstring(X);
encode(X, _) when is_atom(X)          -> encode_atom(X).

%% @spec reverse_sext(binary()) -> binary()
%% @doc Reverses the sorting properties of a sext-encoded term. Reverted
%% objects compare as smaller than all sext-encoded objects.
%%
%% The argument must start with one of the type tags accepted by the
%% `is_sext' guard; anything else fails with `function_clause'.
%%
%% No hex- or sb32-encoded variants are provided. Use the `to_hex/1' or
%% `to_sb32/1' functions instead.
%% @end
reverse_sext(<<X, _/binary>> = B) when ?is_sext(X) ->
    %% the whole encoded term, tag byte included, is bit-inverted and
    %% prefixed with the rev_sext tag
    NegB = encode_neg_bits(B),
    <<?rev_sext, NegB/binary>>.

%% @spec encode_sb32(Term::any()) -> binary()
%% @doc Encodes any Erlang term into an sb32-encoded binary.
%% This is similar to {@link encode/1}, but produces an octet string that
%% can be used without escaping in file names (containing only the characters
%% 0..9, A..V and '-'). The sorting properties are preserved.
%%
%% Note: The encoding used is inspired by the base32 encoding described in
%% RFC3548, but uses a different alphabet in order to preserve the sort order.
%% @end
%%
encode_sb32(Term) ->
    Encoded = encode(Term),
    to_sb32(Encoded).

%% @spec encode_hex(Term::any()) -> binary()
%% @doc Encodes any Erlang term into a hex-encoded binary.
%% Works like {@link encode/1}, but the result is an octet string made up
%% only of the characters 0..9 and A..F, so it can be used in file names
%% without escaping. The sorting properties are preserved.
%%
%% Note: This is plain hex encoding restricted to capital letters, since
%% mixing upper- and lowercase characters would break the sorting property.
%% @end
%%
encode_hex(Term) ->
    Encoded = encode(Term),
    to_hex(Encoded).

%% @spec prefix(X::term()) -> binary()
%% @doc Encodes a binary for prefix matching of similar encoded terms.
%% Lists and tuples can be prefixed by using the '_' marker,
%% similarly to Erlang match specifications. For example:
%% <ul><li><code>prefix({1,2,'_','_'})</code> will result in a binary that is
%% the same as the first part of any encoded 4-tuple with the first two
%% elements being 1 and 2. The prefix algorithm will search for the
%% first <code>'_'</code>, and treat all following elements as if they
%% were <code>'_'</code>.</li>
%% <li><code>prefix([1,2|'_'])</code> will result in a binary that is the
%% same as the first part of any encoded list where the first two elements
%% are 1 and 2. <code>prefix([1,2,'_'])</code> will give the same result,
%% as the prefix pattern is the same for all lists starting with
%% `[1,2|...]'.</li>
%% <li>`prefix(Binary)' will result in a binary that is the same as the
%% encoded version of Binary, except that, instead of padding and
%% terminating, the encoded binary is truncated to the longest byte-aligned
%% binary. The same is done for bitstrings.</li>
%% <li><code>prefix({1,[1,2|'_'],'_'})</code> will prefix-encode the second
%% element, and let it end the resulting binary. This prefix will match
%% any 3-tuple where the first element is 1 and the second element is a
%% list where the first two elements are 1 and 2.</li>
%% <li><code>prefix([1,[1|'_']|'_'])</code> will result in a prefix that
%% matches all lists where the first element is 1 and the second element is
%% a list where the first element is 1.</li>
%% <li>For all other data types, the prefix is the same as the encoded term.
%% </li>
%% </ul>
%% @end
%%
prefix(X) ->
    {_, P} = enc_prefix(X),
    P.

%% Returns {IsWild, EncodedPrefix}. IsWild is true when a wildcard was
%% found, in which case the prefix ends at that point.
%%
%% Maps have no wildcard support: a map is encoded in full, exactly as
%% encode/1 does it, in line with the documented rule that the prefix of
%% any type other than tuples, lists and binaries is the encoded term.
enc_prefix(X) when is_tuple(X)     -> prefix_tuple(X);
enc_prefix(X) when is_list(X)      -> prefix_list(X);
enc_prefix(X) when is_map(X)       -> {false, encode(X)};
enc_prefix(X) when is_pid(X)       -> {false, encode_pid(X)};
enc_prefix(X) when is_port(X)      -> {false, encode_port(X)};
enc_prefix(X) when is_reference(X) -> {false, encode_ref(X)};
enc_prefix(X) when is_number(X)    -> {false, encode_number(X)};
enc_prefix(X) when is_binary(X)    -> prefix_binary(X);
enc_prefix(X) when is_bitstring(X) -> prefix_bitstring(X);
enc_prefix(X) when is_atom(X) ->
    case is_wild(X) of
        true ->
            {true, <<>>};
        false ->
            {false, encode_atom(X)}
    end.

%% @spec prefix_sb32(X::term()) -> binary()
%% @doc Generates an sb32-encoded binary for prefix matching.
%% This is similar to {@link prefix/1}, but generates a prefix for binaries
%% encoded with {@link encode_sb32/1}, rather than {@link encode/1}.
%% @end
%%
prefix_sb32(X) ->
    chop_prefix_tail(to_sb32(prefix(X))).

%% @spec prefix_hex(X::term()) -> binary()
%% @doc Generates a hex-encoded binary for prefix matching.
%% This is similar to {@link prefix/1}, but generates a prefix for binaries
%% encoded with {@link encode_hex/1}, rather than {@link encode/1}.
%% @end
%%
prefix_hex(X) ->
    to_hex(prefix(X)).

%% Must chop off the pad characters and the last encoded unit (which, if pad
%% characters are present, is not a whole byte).
%%
%% Each pattern below matches one possible run of trailing "-" pad
%% characters and drops it together with the character right before it.
%% A binary without such a tail is returned unchanged. For inputs shorter
%% than a pattern the computed size is negative, which simply makes that
%% pattern fail to match.
%%
chop_prefix_tail(Bin) ->
    Sz = byte_size(Bin),
    Sz6 = Sz-7, Sz4 = Sz - 5, Sz3 = Sz - 4, Sz1 = Sz - 2,
    case Bin of
        << P:Sz6/binary, _, "------" >> -> P;
        << P:Sz4/binary, _, "----" >>   -> P;
        << P:Sz3/binary, _, "---" >>    -> P;
        << P:Sz1/binary, _, "-" >>      -> P;
        _ -> Bin
    end.

%% @spec decode(B::binary()) -> term()
%% @doc Decodes a binary generated using the function {@link sext:encode/1}.
%%
%% Fails with `badarg' if anything is left over after the first encoded term.
%%
%% Note that a reverse-encoded binary (using {@link sext:reverse_sext/1})
%% decodes into the original sext-encoded binary, not into the term itself.
%% In other words, if `R = reverse_sext(encode(T))',
%% then `T = decode(decode(R))'.
%% @end
%%
decode(Elems) ->
    case decode_next(Elems) of
        {Term, <<>>} -> Term;
        %% erlang:error/2 expects the argument list of the current function
        _Other -> erlang:error(badarg, [Elems])
    end.

%% @spec decode_sb32(B::binary()) -> term()
%% @doc Decodes a binary generated using the function {@link encode_sb32/1}.
%% @end
%%
decode_sb32(Data) ->
    decode(from_sb32(Data)).

%% @spec decode_hex(B::binary()) -> term()
%% @doc Decodes a binary generated using the function {@link encode_hex/1}.
%% @end
%%
decode_hex(Data) ->
    decode(from_hex(Data)).

%% Render a bitstring as a string of "0" and "1" characters (debugging aid).
pp(none) -> "";
pp(B) when is_bitstring(B) ->
    [ $0 + I || <<I:1>> <= B ].

%% A tuple is encoded as the tuple tag, the arity as a 32-bit integer, and
%% then the encoded elements in order.
encode_tuple(T, Legacy) ->
    Sz = tuple_size(T),
    encode_tuple_elems(1, Sz, T, <<?tuple, Sz:32>>, Legacy).

prefix_tuple(T) ->
    Sz = tuple_size(T),
    Elems = tuple_to_list(T),
    prefix_tuple_elems(Elems, <<?tuple, Sz:32>>).

%% It's easier to iterate over a tuple by converting it to a list, but
%% since the tuple /can/ be huge, let's do it this way.
encode_tuple_elems(P, Sz, T, Acc, Legacy) when P =< Sz ->
    E = encode(element(P,T), Legacy),
    encode_tuple_elems(P+1, Sz, T, <<Acc/binary, E/binary>>, Legacy);
encode_tuple_elems(_, _, _, Acc, _) ->
    Acc.

%% Encode elements up to the first wildcard. Returns {true, Prefix} when a
%% wildcard (possibly nested inside an element) ended the prefix, and
%% {false, Encoded} when every element was encoded in full.
prefix_tuple_elems([A|T], Acc) when is_atom(A) ->
    case is_wild(A) of
        true ->
            {true, Acc};
        false ->
            E = encode(A),
            prefix_tuple_elems(T, <<Acc/binary, E/binary>>)
    end;
prefix_tuple_elems([H|T], Acc) ->
    case enc_prefix(H) of
        {true, P} ->
            {true, <<Acc/binary, P/binary>>};
        {false, E} ->
            prefix_tuple_elems(T, <<Acc/binary, E/binary>>)
    end;
prefix_tuple_elems([], Acc) ->
    {false, Acc}.
291 | 292 | encode_list(L, Legacy) -> 293 | encode_list_elems(L, <>, Legacy). 294 | 295 | prefix_list(L) -> 296 | prefix_list_elems(L, <>). 297 | 298 | encode_map(M, Legacy) -> 299 | Sz = map_size(M), 300 | maps:fold( 301 | fun(K,V,Acc) -> 302 | <> 304 | end, <>, M). 305 | 306 | 307 | encode_binary(B) -> 308 | Enc = encode_bin_elems(B), 309 | <>. 310 | 311 | prefix_binary(B) -> 312 | Enc = encode_bin_elems(B), 313 | {false, <>}. 314 | 315 | encode_bitstring(B) -> 316 | Enc = encode_bits_elems(B), 317 | <>. 318 | 319 | prefix_bitstring(B) -> 320 | Enc = encode_bits_elems(B), 321 | {false, <>}. 322 | 323 | encode_pid(P) -> 324 | case term_to_binary(P) of 325 | <<131,88,119,ALen:8,Name:ALen/binary,NS:8/binary,C:32>> -> 326 | encode_pid_new(Name, NS, C); 327 | <<131,88,100,ALen:16,Name:ALen/binary,NS:8/binary,C:32>> -> 328 | encode_pid_new(Name, NS, C); 329 | <<131,103,100,ALen:16,Name:ALen/binary,NS:8/binary,C:8>> -> 330 | true = C =< 3, 331 | encode_pid(Name, NS, <>) 332 | end. 333 | 334 | encode_pid_new(Name, NS, C) -> 335 | CBin = 336 | case C > 3 of 337 | true -> <<255, C:32>>; 338 | false -> <> 339 | end, 340 | encode_pid(Name, NS, CBin). 341 | 342 | encode_pid(Name, NS, C) -> 343 | NameEnc = encode_bin_elems(Name), 344 | <>. 345 | 346 | encode_port(P) -> 347 | case term_to_binary(P) of 348 | <<131,120,119,ALen:8,Name:ALen/binary,N:64,C:32>> -> 349 | case N bsr 28 of 350 | 0 -> encode_port_new(Name, <>, C); 351 | _ -> 352 | %% N was limited to 28 bits previously, meaning the initial byte 353 | %% in its binary was =< 15. We therefore prefix the 8-byte N with 354 | %% a byte with value 16 to signal the V4 format, and to ensure V4 355 | %% formats sort consistently with the previous format. In this 356 | %% case we don't need to try shortening the C(reation) field. 
357 | encode_port(Name, <<16,N:64>>, <>) 358 | end; 359 | <<131,89,100,ALen:16,Name:ALen/binary,N:32,C:32>> -> 360 | 0 = N bsr 28, % assert 361 | encode_port_new(Name, <>, C); 362 | <<131,102,100,ALen:16,Name:ALen/binary,N:32,C:8>> -> 363 | 0 = N bsr 28, % assert 364 | true = C =< 3, 365 | encode_port(Name, <>, <>) 366 | end. 367 | 368 | encode_port_new(Name, N, C) -> 369 | CBin = 370 | case C > 3 of 371 | true -> <<255, C:32>>; 372 | false -> <> 373 | end, 374 | encode_port(Name, N, CBin). 375 | 376 | encode_port(Name, N, C) -> 377 | NameEnc = encode_bin_elems(Name), 378 | <>. 379 | 380 | encode_ref(R) -> 381 | case term_to_binary(R) of 382 | <<131,90,_Len:16,119,NLen:8,Name:NLen/binary,C:32,Rest/binary>> -> 383 | encode_ref_newer(Name, C, Rest); 384 | <<131,90,_Len:16,100,NLen:16,Name:NLen/binary,C:32,Rest/binary>> -> 385 | encode_ref_newer(Name, C, Rest); 386 | <<131,114,_Len:16,100,NLen:16,Name:NLen/binary,C:8,Rest/binary>> -> 387 | true = C =< 3, 388 | encode_ref(Name, <>) 389 | end. 390 | 391 | encode_ref_newer(Name, C, Rest) -> 392 | NewRest = 393 | case C > 3 of 394 | true -> <<255, C:32, Rest/binary>>; 395 | false -> <> 396 | end, 397 | encode_ref(Name, NewRest). 398 | 399 | encode_ref(Name, Rest) -> 400 | NameEnc = encode_bin_elems(Name), 401 | RestEnc = encode_bin_elems(Rest), 402 | <>. 403 | 404 | encode_atom(A) -> 405 | Bin = list_to_binary(atom_to_list(A)), 406 | Enc = encode_bin_elems(Bin), 407 | <>. 408 | 409 | encode_number(N) -> 410 | encode_number(N, false). 411 | 412 | encode_number(N, Legacy) when is_integer(N) -> 413 | encode_int(N, none, Legacy); 414 | encode_number(F, _Legacy) when is_float(F) -> 415 | encode_float(F). 
416 | 417 | %% 418 | %% IEEE 764 Binary 64 standard representation 419 | %% http://en.wikipedia.org/wiki/Double_precision_floating-point_format 420 | %% 421 | %% |12345678 12345678 12345678 12345678 12345678 12345678 12345678 12345678 422 | %% |iEEEEEEE EEEEffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff| 423 | %% 424 | %% i: sign bit 425 | %% E: Exponent, 11 bits 426 | %% f: fraction, 52 bits 427 | %% 428 | %% We perform the following operations: 429 | %% - if E < 1023 (see Exponent bias), the integer part is 0 430 | %% 431 | encode_float(F) -> 432 | <> = <>, 433 | ?dbg("F = ~p | Exp0 = ~p | Frac = ~p~n", [cF, Exp0, Frac]), 434 | {Int0, Fraction} = 435 | case Exp0 - 1023 of 436 | NegExp when NegExp < 0 -> 437 | Offs = -NegExp, 438 | ?dbg("NegExp = ~p, Offs = ~p~n" 439 | "Frac = ~p~n", [NegExp, Offs, Frac]), 440 | {0, << 0:Offs, 1:1,Frac:52 >>}; 441 | Exp1 -> 442 | ?dbg("Exp1 = ~p~n", [Exp1]), 443 | if Exp1 >= 52 -> 444 | %% Decimal part will be zero 445 | {trunc(F), <<0:52>>}; 446 | true -> 447 | R = 52-Exp1, 448 | ?dbg("R = ~p~n", [R]), 449 | Exp2 = Exp1 + 1, % add the leading 1-bit 450 | ?dbg("Exp2 = ~p~n", [Exp2]), 451 | <> = <<1:1, Frac:52>>, 452 | ?dbg("I = ~p, Frac1 = ~p~n", [I,Frac1]), 453 | {I, <>} 454 | end 455 | end, 456 | if Sign == 1 -> 457 | %% explicitly encode a negative int, since Int0 can be zero. 458 | Int = if Int0 >= 0 -> -Int0; 459 | true -> Int0 460 | end, 461 | encode_neg_int(Int, Fraction); 462 | Sign == 0 -> 463 | encode_int(Int0, Fraction) 464 | end. 465 | 466 | encode_neg_int(Int, Fraction)-> 467 | encode_neg_int(Int, Fraction,false). 468 | encode_int(I, R) -> 469 | encode_int(I, R, false). 
470 | 471 | encode_int(I,R, _Legacy) when I >= 0, I =< 16#7fffffff -> 472 | ?dbg("encode_int(~p, ~p)~n", [I,R]), 473 | if R == none -> 474 | << ?pos4, I:31, 0:1 >>; 475 | true -> 476 | RSz = bit_size(R), 477 | <> = R, 478 | ?dbg("Fraction = ~p~n", [Fraction]), 479 | if Fraction == 0 -> 480 | << ?pos4, I:31, 1:1, 8:8 >>; 481 | true -> 482 | Rbits = encode_bits_elems(R), 483 | << ?pos4, I:31, 1:1, Rbits/binary >> 484 | end 485 | end; 486 | encode_int(I,R, Legacy) when I > 16#7fffffff -> 487 | ?dbg("encode_int(~p, ~p)~n", [I,R]), 488 | Bytes = encode_big(I, Legacy), 489 | if R == none -> 490 | <>; 491 | true -> 492 | RSz = bit_size(R), 493 | <> = R, 494 | ?dbg("Fraction = ~p~n", [Fraction]), 495 | if Fraction == 0 -> 496 | << ?posbig, Bytes/binary, 1:8, 8:8 >>; 497 | true -> 498 | Rbits = encode_bits_elems(R), 499 | <> 500 | end 501 | end; 502 | encode_int(I, R, Legacy) when I < 0 -> 503 | encode_neg_int(I, R,Legacy). 504 | 505 | encode_neg_int(I,R,_Legacy) when I =< 0, I >= -16#7fffffff -> 506 | ?dbg("encode_neg_int(~p, ~p [sz: ~p])~n", [I,pp(R), try bit_size(R) catch error:_ -> "***" end]), 507 | Adj = max_value(31) + I, % keep in mind that I < 0 508 | ?dbg("Adj = ~p~n", [erlang:integer_to_list(Adj,2)]), 509 | if R == none -> 510 | << ?neg4, Adj:31, 1:1 >>; 511 | true -> 512 | Rbits = encode_neg_bits(R), 513 | ?dbg("R = ~p -> RBits = ~p~n", [pp(R), pp(Rbits)]), 514 | << ?neg4, Adj:31, 0:1, Rbits/binary >> 515 | end; 516 | encode_neg_int(I,R,Legacy) when I < -16#7fFFffFF -> 517 | ?dbg("encode_neg_int(BIG ~p)~n", [I]), 518 | Bytes = encode_big_neg(I,Legacy), 519 | ?dbg("Bytes = ~p~n", [Bytes]), 520 | if R == none -> 521 | <>; 522 | true -> 523 | Rbits = encode_neg_bits(R), 524 | ?dbg("R = ~p -> RBits = ~p~n", [pp(R), pp(Rbits)]), 525 | <> 526 | end. 
527 | 528 | encode_big(I, Legacy) -> 529 | Bl = encode_big1(I), 530 | ?dbg("Bl = ~p~n", [Bl]), 531 | Bb = case Legacy of 532 | false -> 533 | prepend_size(list_to_binary(Bl)); 534 | true -> 535 | list_to_binary(Bl) 536 | end, 537 | ?dbg("Bb = ~p~n", [Bb]), 538 | encode_bin_elems(Bb). 539 | 540 | prepend_size(B) -> 541 | Sz = byte_size(B), 542 | <<255, (encode_size(Sz))/binary, B/binary>>. 543 | 544 | remove_size_bits(<<255, T/binary>>) -> 545 | {_, Rest} = untag_7bits(T, <<>>), 546 | Rest; 547 | remove_size_bits(B) -> 548 | %% legacy bignum 549 | B. 550 | 551 | encode_size(I) when I > 127 -> 552 | B = int_to_binary(I), 553 | tag_7bits(B); 554 | encode_size(I) -> 555 | <>. 556 | 557 | tag_7bits(B) when bit_size(B) > 7 -> 558 | <> = B, 559 | <<1:1, H:7, (tag_7bits(T))/binary>>; 560 | tag_7bits(B) -> 561 | Sz = bit_size(B), 562 | <> = B, 563 | <<0:1, I:7>>. 564 | 565 | untag_7bits(<<1:1, H:7, T/binary>>, Acc) -> 566 | untag_7bits(T, <>); 567 | untag_7bits(<<0:1, H:7, T/binary>>, Acc) -> 568 | AccBits = bit_size(Acc), 569 | HBits = 8 - (AccBits rem 8), 570 | {<>, T}. 571 | 572 | int_to_binary(I) when I =< 16#ff -> <>; 573 | int_to_binary(I) when I =< 16#ffff -> <>; 574 | int_to_binary(I) when I =< 16#ffffff -> <>; 575 | int_to_binary(I) when I =< 16#ffffffff -> <>; 576 | int_to_binary(I) when I =< 16#ffffffffff -> <>; 577 | int_to_binary(I) when I =< 16#ffffffffffff -> <>; 578 | int_to_binary(I) when I =< 16#ffffffffffffff -> <>; 579 | int_to_binary(I) when I =< 16#ffffffffffffffff -> <>; 580 | int_to_binary(I) -> 581 | %% Realm of the ridiculous 582 | list_to_binary( 583 | lists:dropwhile(fun(X) -> X==0 end, binary_to_list(<>))). 584 | 585 | %% This function exists for documentation, but not used right now. 586 | %% It's the reverse of encode_size/1, used for encoding bignums. 
587 | %% 588 | %% decode_size(<<1:1, _/bitstring>> = T) -> 589 | %% {SzBin, Rest} = untag_7bits(T, <<>>), 590 | %% Bits = bit_size(SzBin), 591 | %% <> = SzBin, 592 | %% {Sz, Rest}; 593 | %% decode_size(<<0:1, H:7, T/binary>>) -> 594 | %% {H, T}. 595 | 596 | encode_big_neg(I,Legacy) -> 597 | {Words, Max} = get_max(-I), 598 | ?dbg("Words = ~p | Max = ~p~n", [Words,Max]), 599 | Iadj = Max + I, % keep in mind that I < 0 600 | ?dbg("IAdj = ~p~n", [Iadj]), 601 | Bin = encode_big(Iadj,Legacy), 602 | ?dbg("Bin = ~p~n", [Bin]), 603 | WordsAdj = 16#ffffFFFF - Words, 604 | ?dbg("WordsAdj = ~p~n", [WordsAdj]), 605 | <>. 606 | 607 | encode_big1(I) -> 608 | encode_big1(I, []). 609 | 610 | encode_big1(I, Acc) when I < 16#ff -> 611 | [I|Acc]; 612 | encode_big1(I, Acc) -> 613 | encode_big1(I bsr 8, [I band 16#ff | Acc]). 614 | 615 | encode_list_elems([], Acc, _) -> 616 | <>; 617 | encode_list_elems(B, Acc, Legacy) when is_bitstring(B) -> 618 | %% improper list 619 | <>; 620 | encode_list_elems(E, Acc, Legacy) when not(is_list(E)) -> 621 | %% improper list 622 | <>; 623 | encode_list_elems([H|T], Acc, Legacy) -> 624 | Enc = encode(H,Legacy), 625 | encode_list_elems(T, <>, Legacy). 626 | 627 | prefix_list_elems([], Acc) -> 628 | {false, <>}; 629 | prefix_list_elems(E, Acc) when not(is_list(E)) -> 630 | case is_wild(E) of 631 | true -> 632 | {true, Acc}; 633 | false -> 634 | Marker = if is_bitstring(E) -> ?bin_tail; 635 | true -> 1 636 | end, 637 | {Bool, P} = enc_prefix(E), 638 | {Bool, <>} 639 | end; 640 | prefix_list_elems([H|T], Acc) -> 641 | case enc_prefix(H) of 642 | {true, P} -> 643 | {true, <>}; 644 | {false, E} -> 645 | prefix_list_elems(T, <>) 646 | end. 647 | 648 | is_wild('_') -> 649 | true; 650 | is_wild(A) when is_atom(A) -> 651 | case atom_to_list(A) of 652 | "\$" ++ S -> 653 | try begin 654 | _ = list_to_integer(S), 655 | true 656 | end 657 | catch 658 | error:_ -> 659 | false 660 | end; 661 | _ -> 662 | false 663 | end; 664 | is_wild(_) -> 665 | false. 
666 | 667 | encode_bin_elems(<<>>) -> 668 | <<8>>; 669 | encode_bin_elems(B) -> 670 | Pad = 8 - (size(B) rem 8), 671 | << (<< <<1:1, B1:8>> || <> <= B >>)/bitstring, 0:Pad, 8 >>. 672 | 673 | encode_neg_bits(<<>>) -> 674 | <<247>>; 675 | encode_neg_bits(B) -> 676 | {Padded, TailBits} = pad_neg_bytes(B), 677 | ?dbg("TailBits = ~p~n", [TailBits]), 678 | TailSz0 = bit_size(TailBits), 679 | TailSz = 16#ff - TailSz0, 680 | if TailSz0 == 0 -> 681 | Pad = 8 - (bit_size(Padded) rem 8), 682 | Ip = max_value(Pad), % e.g. max_value(3) -> 2#111 683 | <>; 684 | true -> 685 | ?dbg("TailSz0 = ~p~n", [TailSz0]), 686 | TailPad = 8 - TailSz0, 687 | ?dbg("TailPad = ~p~n", [TailPad]), 688 | Itp = (1 bsl TailPad)-1, 689 | ?dbg("Itp = ~p~n", [Itp]), 690 | Pad = 8 - ((bit_size(Padded) + 1) rem 8), 691 | ?dbg("Pad = ~p~n", [Pad]), 692 | Ip = max_value(Pad), 693 | ?dbg("Ip = ~p~n", [Ip]), 694 | ?dbg("Pad = ~p~n", [Pad]), 695 | ?dbg("TailSz = ~p~n", [TailSz]), 696 | <> 698 | end. 699 | 700 | pad_neg_bytes(Bin) -> 701 | pad_neg_bytes(Bin, <<>>). 702 | 703 | pad_neg_bytes(<>, Acc) -> 704 | H1 = 16#ff - H, 705 | pad_neg_bytes(T, <>); 706 | pad_neg_bytes(Bits, Acc) when is_bitstring(Bits) -> 707 | Sz = bit_size(Bits), 708 | Max = (1 bsl Sz) - 1, 709 | <> = Bits, 710 | I1 = Max - I0, 711 | {Acc, <>}. 712 | 713 | encode_bits_elems(B) -> 714 | {Padded, TailBits} = pad_bytes(B), 715 | TailSz = bit_size(TailBits), 716 | TailPad = 8-TailSz, 717 | Pad = 8 - ((TailSz + TailPad + bit_size(Padded) + 1) rem 8), 718 | <>. 719 | 720 | pad_bytes(Bin) -> 721 | pad_bytes(Bin, <<>>). 722 | 723 | pad_bytes(<>, Acc) -> 724 | pad_bytes(T, <>); 725 | pad_bytes(Bits, Acc) when is_bitstring(Bits) -> 726 | {Acc, Bits}. 727 | 728 | 729 | %% ------------------------------------------------------ 730 | %% Decoding routines 731 | 732 | -spec decode_next(binary()) -> {any(), binary()}. 
733 | %% @spec decode_next(Bin) -> {N, Rest} 734 | %% @doc Decode a binary stream, returning the next decoded term and the 735 | %% stream remainder 736 | %% 737 | %% This function will raise an exception if the beginning of `Bin' is not 738 | %% a valid sext-encoded term. 739 | %% @end 740 | decode_next(<>) -> decode_rev_sext(Rest); 741 | decode_next(<>) -> decode_atom(Rest); 742 | decode_next(<>) -> decode_pid(Rest); 743 | decode_next(<>) -> decode_port(Rest); 744 | decode_next(<>) -> decode_ref(Rest); 745 | decode_next(<>) -> decode_tuple(Sz,Rest); 746 | %% decode_next(<>) -> {[], Rest}; 747 | %% decode_next(<>) -> decode_list(Rest); 748 | decode_next(<>) -> decode_map(Rest); 749 | decode_next(<>) -> decode_list(Rest); 750 | decode_next(<>) -> decode_neg_big(Rest); 751 | decode_next(<>) -> decode_pos_big(Rest); 752 | decode_next(<>) -> decode_neg(I,F,Rest); 753 | decode_next(<>) -> decode_pos(I,F,Rest); 754 | decode_next(<>) -> decode_binary(Rest). 755 | 756 | -spec partial_decode(binary()) -> {full | partial, any(), binary()}. 757 | %% @spec partial_decode(Bytes) -> {full | partial, DecodedTerm, Rest} 758 | %% @doc Decode a sext-encoded term or prefix embedded in a byte stream. 759 | %% 760 | %% Example: 761 | %% ``` 762 | %% 1> T = sext:encode({a,b,c}). 763 | %% <<16,0,0,0,3,12,176,128,8,12,177,0,8,12,177,128,8>> 764 | %% 2> sext:partial_decode(<<T/binary, "tail">>). 765 | %% {full,{a,b,c},<<"tail">>} 766 | %% 3> P = sext:prefix({a,b,'_'}). 767 | %% <<16,0,0,0,3,12,176,128,8,12,177,0,8>> 768 | %% 4> sext:partial_decode(<<P/binary, "tail">>). 769 | %% {partial,{a,b,'_'},<<"tail">>} 770 | %% ''' 771 | %% 772 | %% Note that a decoded prefix may not be exactly like the encoded prefix. 773 | %% For example, ['_'] will be encoded as 774 | %% <<17>>, i.e. only the 'list' opcode. The 775 | %% decoded prefix will be '_', since the encoded prefix would 776 | %% also match the empty list. 
%% The decoded prefix will always be a prefix to
%% anything to which the original prefix is a prefix.
%%
%% For tuples, <code>{1,'_',3}</code> encoded and decoded, will result in
%% <code>{1,'_','_'}</code>, i.e. the tuple size is kept, but the elements
%% after the first wildcard are replaced with wildcards.
%% @end
partial_decode(<<?tuple, Sz:32, Rest/binary>>) ->
    partial_decode_tuple(Sz, Rest);
partial_decode(<<?list, Rest/binary>>) ->
    partial_decode_list(Rest);
partial_decode(Other) ->
    try decode_next(Other) of
        {Dec, Rest} ->
            {full, Dec, Rest}
    catch
        %% no decoder clause matched: the data ends (or stops being sext)
        %% here, so report a wildcard and hand the bytes back untouched
        error:function_clause ->
            {partial, '_', Other}
    end.

decode_rev_sext(B) ->
    decode_neg_binary(B).

%% NOTE(review): list_to_atom/1 creates new atoms from the decoded name.
%% Atoms are never garbage collected, so decoding data from an untrusted
%% source can exhaust the atom table. Kept as is, since switching to
%% list_to_existing_atom/1 would change what can be decoded.
decode_atom(B) ->
    {Bin, Rest} = decode_binary(B),
    {list_to_atom(binary_to_list(Bin)), Rest}.

decode_tuple(Sz, Elems) ->
    decode_tuple(Sz,Elems,[]).

decode_tuple(0, Rest, Acc) ->
    {list_to_tuple(lists:reverse(Acc)), Rest};
decode_tuple(N, Elems, Acc) ->
    {Term, Rest} = decode_next(Elems),
    decode_tuple(N-1, Rest, [Term|Acc]).

partial_decode_tuple(Sz, Elems) ->
    partial_decode_tuple(Sz, Elems, []).

%% N counts the elements still to be decoded. When an element only decodes
%% partially, the remaining N-1 positions are filled with wildcards so that
%% the resulting tuple keeps the encoded arity.
partial_decode_tuple(0, Rest, Acc) ->
    {full, list_to_tuple(lists:reverse(Acc)), Rest};
partial_decode_tuple(N, Elems, Acc) ->
    case partial_decode(Elems) of
        {partial, Term, Rest} ->
            {partial, list_to_tuple(
                        lists:reverse([Term|Acc]) ++ pad_(N-1)), Rest};
        {full, Dec, Rest} ->
            partial_decode_tuple(N-1, Rest, [Dec|Acc])
    end.

%% A list of N wildcards.
pad_(0) ->
    [];
pad_(N) when N > 0 ->
    ['_'|pad_(N-1)].

partial_decode_list(Elems) ->
    partial_decode_list(Elems, []).

%% Decode list elements until an end marker, an improper tail, or
%% something that is not a complete sext term is found.
%%
%% NOTE(review): the heads marked (*) had lost their binary pattern; they
%% are reconstructed from the variables used in the body and from the
%% matching decode_list/2 clauses. The ?bin_tail macro name is assumed.
partial_decode_list(<<>>, Acc) ->
    %% input exhausted before any end marker: open-ended prefix
    {partial, lists:reverse(Acc) ++ '_', <<>>};
partial_decode_list(<<2, Rest/binary>>, Acc) ->
    %% end-of-list marker
    {full, lists:reverse(Acc), Rest};
partial_decode_list(<<?bin_tail, Next/binary>>, Acc) ->          % (*)
    %% improper list, binary tail
    {Term, Rest} = decode_next(Next),
    {full, lists:reverse(Acc) ++ Term, Rest};
partial_decode_list(<<1, Next/binary>>, Acc) ->
    %% improper list, non-binary tail (which may itself be partial)
    {Result, Term, Rest} = partial_decode(Next),
    {Result, lists:reverse(Acc) ++ Term, Rest};
partial_decode_list(<<X, _/binary>> = Next, Acc) when ?is_sext(X) -> % (*)
    case partial_decode(Next) of
        {full, Term, Rest} ->
            partial_decode_list(Rest, [Term|Acc]);
        {partial, Term, Rest} ->
            {partial, lists:reverse([Term|Acc]) ++ '_', Rest}
    end;
partial_decode_list(Rest, Acc) ->
    %% next byte is not a sext tag: treat what follows as foreign data
    {partial, lists:reverse(Acc) ++ '_', Rest}.

%% Decode a map: an element count followed by that many key/value pairs.
%% NOTE(review): the head pattern was lost; the 32-bit count width is
%% assumed by analogy with the tuple size field - confirm against the
%% encoder.
decode_map(<<Sz:32, Rest/binary>>) ->
    decode_map(Sz, Rest, #{}).

decode_map(0, Rest, M) ->
    {M, Rest};
decode_map(N, Bin, M) ->
    {K, Bin1} = decode_next(Bin),
    {V, Bin2} = decode_next(Bin1),
    decode_map(N-1, Bin2, maps:put(K, V, M)).


%% Decode list elements up to the end marker or an improper tail.
decode_list(Elems) ->
    decode_list(Elems, []).

decode_list(<<2, Rest/binary>>, Acc) ->
    {lists:reverse(Acc), Rest};
decode_list(<<?bin_tail, Next/binary>>, Acc) ->                  % (*)
    %% improper list, binary tail
    {Term, Rest} = decode_next(Next),
    {lists:reverse(Acc) ++ Term, Rest};
decode_list(<<1, Next/binary>>, Acc) ->
    %% improper list, non-binary tail
    {Term, Rest} = decode_next(Next),
    {lists:reverse(Acc) ++ Term, Rest};
decode_list(Elems, Acc) ->
    {Term, Rest} = decode_next(Elems),
    decode_list(Rest, [Term|Acc]).

%% Pids, ports and references are decoded by rebuilding their External
%% Term Format representation (version byte 131, node name as ATOM_EXT =
%% 100) and handing it to binary_to_term/1.
%%
%% NOTE(review): the case patterns marked (*) were missing from the text
%% this was restored from. Field widths follow from the ETF layout of the
%% tag being rebuilt, and the 255 marker from the patterns that survived in
%% decode_port/1 and decode_ref/1 - confirm against the encoder.

decode_pid(Bin) ->
    {Name, Rest} = decode_binary(Bin),
    NameSz = byte_size(Name),
    case Rest of
        <<NS:8/binary, 255, C:4/binary, Rest1/binary>> ->          % (*)
            %% NEW_PID_EXT (88): id + serial (8 bytes), 32-bit creation
            {binary_to_term(<<131,88,100,NameSz:16,Name/binary,NS/binary,C/binary>>), Rest1};
        <<NS:8/binary, C, Rest1/binary>> ->                        % (*)
            %% PID_EXT (103): id + serial (8 bytes), 1-byte creation
            true = C =< 3,
            {binary_to_term(<<131,103,100,NameSz:16,Name/binary,NS/binary,C>>), Rest1}
    end.

decode_port(Bin) ->
    {Name, Rest} = decode_binary(Bin),
    NameSz = byte_size(Name),
    case Rest of
        <<16, N:8/binary, 255, C:4/binary, Rest1/binary>> ->
            %% V4_PORT_EXT (120): 64-bit id, 32-bit creation
            {binary_to_term(<<131,120,100,NameSz:16,Name/binary,N/binary,C/binary>>), Rest1};
        <<N:4/binary, 255, C:4/binary, Rest1/binary>> ->           % (*)
            %% NEW_PORT_EXT (89): 32-bit id, 32-bit creation
            {binary_to_term(<<131,89,100,NameSz:16,Name/binary,N/binary,C/binary>>), Rest1};
        <<N:4/binary, C, Rest1/binary>> ->                         % (*)
            %% PORT_EXT (102): 32-bit id, 1-byte creation
            true = C =< 3,
            {binary_to_term(<<131,102,100,NameSz:16,Name/binary,N/binary,C>>), Rest1}
    end.

decode_ref(Bin) ->
    {Name, Rest} = decode_binary(Bin),
    {Tail, Rest1} = decode_binary(Rest),
    NLen = byte_size(Name),
    case Tail of
        <<255, C:4/binary, Tail1/binary>> ->
            %% NEWER_REFERENCE_EXT (90): 32-bit creation, then id words
            Len = byte_size(Tail1) div 4,
            RefBin = <<131,90,Len:16,100,NLen:16,Name/binary,C/binary,Tail1/binary>>,
            {binary_to_term(RefBin), Rest1};
        <<C, Tail1/binary>> ->                                     % (*)
            %% NEW_REFERENCE_EXT (114): 1-byte creation, then id words
            true = C =< 3,
            Len = byte_size(Tail1) div 4,
            RefBin = <<131,114,Len:16,100,NLen:16,Name/binary,C,Tail1/binary>>,
            {binary_to_term(RefBin), Rest1}
    end.

%% Small negative numbers. The flag is 1 for an integer, stored offset by
%% 16#7fffFFFF, and 0 for a float whose integer part is stored inverted.
decode_neg(I, 1, Rest) ->
    {(I - 16#7fffFFFF), Rest};
decode_neg(I0, 0, Bin) -> % for negative numbers, 0 means that it's a float
    I = 16#7fffFFFF - I0,
    ?dbg("decode_neg()... I = ~p | Bin = ~p~n", [I, Bin]),
    decode_neg_float(I, Bin).

%% Rebuild a negative float from its integer part I and the negated
%% fraction bits that follow in Bin.
%%
%% NOTE(review): the patterns marked (*) were missing from the text this
%% was restored from and are reconstructed from the variables in scope -
%% confirm against the encoder.
decode_neg_float(0, Bin) ->
    %% |F| < 1: the fraction holds zero padding, an explicit leading one
    %% and the 52 mantissa bits; the padding length gives the exponent.
    {R, Rest} = decode_neg_binary(Bin),
    ?dbg("Bin = ~p~n", [pp(Bin)]),
    ?dbg("R = ~p | Rest = ~p~n", [pp(R), Rest]),
    Sz = bit_size(R),
    Offs = Sz - 53,
    ?dbg("Offs = ~p | Sz - ~p~n", [Offs, Sz]),
    <<_:Offs, 1:1, I:52>> = R,
    Exp = 1023 - Offs,
    <<F:64/float>> = <<1:1, Exp:11, I:52>>,                       % (*)
    {F, Rest};
decode_neg_float(I, Bin) ->
    {R, Rest} = decode_neg_binary(Bin),
    ?dbg("decode_neg_float: I = ~p | R = ~p~n", [I, R]),
    Sz = bit_size(R),
    ?dbg("Sz = ~p~n", [Sz]),
    <<Ri:Sz>> = R,                                                % (*)
    ?dbg("Ri = ~p~n", [Ri]),
    if Ri == 0 ->
            %% special case
            {0.0-I, Rest};
       true ->
            %% Mantissa = bits of I after its leading one, followed by the
            %% fraction bits, zero-padded to 52 bits.
            IBits = strip_first_one(I),
            ?dbg("IBits = ~p~n", [pp(IBits)]),
            Bits = <<IBits/bitstring, R/bitstring>>,              % (*)
            ?dbg("Bits = ~p (Sz: ~p)~n", [pp(Bits), bit_size(Bits)]),
            Exp = bit_size(IBits) + 1023,
            ?dbg("Exp = ~p~n", [Exp]),
            <<Frac:52, _/bitstring>> = <<Bits/bitstring, 0:52>>,  % (*)
            ?dbg("Frac = ~p~n", [Frac]),
            <<F:64/float>> = <<1:1, Exp:11, Frac:52>>,            % (*)
            {F, Rest}
    end.

%% Small non-negative numbers. The flag is 0 for an integer and 1 for a
%% float whose fraction bits follow as an encoded binary.
decode_pos(I, 0, Rest) ->
    {I, Rest};
decode_pos(0, 1, Bin) ->
    {Real, Rest} = decode_binary(Bin),
    Offs = bit_size(Real) - 53,
    <<0:Offs, 1:1, Frac:52>> = Real,
    Exp = 1023 - Offs,
    <<F:64/float>> = <<0:1, Exp:11, Frac:52>>,                    % (*)
    {F, Rest};
decode_pos(I, 1, Bin) -> % float > 1
    ?dbg("decode_pos(~p, 1, ~p)~n", [I, Bin]),
    %% The fraction is decoded once, here. An earlier revision also bound
    %% {Real, Rest} before this case, decoding the same bytes twice.
    case decode_binary(Bin) of
        {<<>>, Rest} ->
            %% no fraction bits: the float is exactly I
            <<F:64/float>> = <<I:64/float>>,                      % (*)
            {F, Rest};
        {Real, Rest} ->
            ?dbg("Real = ~p~n", [Real]),
            Exp = 52 - bit_size(Real) + 1023,
            ?dbg("Exp = ~p~n", [Exp]),
            %% NOTE(review): the field width for I was lost. 64 bits is
            %% used; strip_one/1 drops the leading zeros, so any width that
            %% holds I gives the same mantissa.
            Bits0 = <<I:64, Real/bitstring>>,                     % (*)
            ?dbg("Bits0 = ~p~n", [Bits0]),
            Bits = strip_one(Bits0),
            <<Frac:52>> = Bits,                                   % (*)
            <<F:64/float>> = <<0:1, Exp:11, Frac:52>>,            % (*)
            {F, Rest}
    end.

%% Large non-negative numbers: the integer part is stored as an encoded
%% binary, followed by a flag that is passed on to decode_pos/3.
%%
%% NOTE(review): the patterns marked (*) were missing from the text this
%% was restored from and are reconstructed from the variables in scope.
%% The 8-bit flag width is assumed from the `F == 16#ff' test in
%% decode_neg_big/1 - confirm against the encoder.
decode_pos_big(Bin) ->
    ?dbg("decode_pos_big(~p)~n", [Bin]),
    {Ib0, Rest} = decode_binary(Bin),
    Ib = remove_size_bits(Ib0),
    ?dbg("Ib = ~p~n", [Ib]),
    ISz = byte_size(Ib) * 8,
    ?dbg("ISz = ~p~n", [ISz]),
    <<I:ISz>> = Ib,                                               % (*)
    ?dbg("I = ~p~n", [I]),
    <<F:8, Rest1/binary>> = Rest,                                 % (*)
    ?dbg("Rest1 = ~p~n", [Rest1]),
    decode_pos(I, F, Rest1).

%% Large negative numbers: an inverted word count, then the magnitude
%% stored as imax(Words) minus the value, then a flag byte (0 = float,
%% 16#ff = integer).
decode_neg_big(Bin) ->
    ?dbg("decode_neg_big(~p)~n", [Bin]),
    <<WordsAdj:32, Rest/binary>> = Bin,                           % (*)
    Words = 16#ffffFFFF - WordsAdj,
    ?dbg("Words = ~p~n", [Words]),
    {Ib0, Rest1} = decode_binary(Rest),
    Ib = remove_size_bits(Ib0),
    ?dbg("Ib = ~p | Rest1 = ~p~n", [Ib, Rest1]),
    ISz = byte_size(Ib) * 8,
    <<I0:ISz>> = Ib,                                              % (*)
    ?dbg("I0 = ~p~n", [I0]),
    Max = imax(Words),
    ?dbg("Max = ~p~n", [Max]),
    I = Max - I0,
    ?dbg("I = ~p~n", [I]),
    <<F:8, Rest2/binary>> = Rest1,                                % (*)
    ?dbg("F = ~p | Rest2 = ~p~n", [F, Rest2]),
    %% any other flag value is invalid and raises if_clause
    if F == 0 ->
            decode_neg_float(I, Rest2);
       F == 16#ff ->
            {-I, Rest2}
    end.

%% Return the bits of I that follow its most significant set bit.
%% optimization - no need to loop through a very large number of zeros.
strip_first_one(I) ->
    Sz = if I < 16#ff -> 8;
            I < 16#ffff -> 16;
            I < 16#ffffff -> 24;
            I < 16#ffffffff -> 32;
            true -> 52
         end,
    strip_one(<<I:Sz>>).                                          % (*)

%% Drop leading zero bits and the first one bit.
strip_one(<<0:1, Rest/bitstring>>) -> strip_one(Rest);
strip_one(<<1:1, Rest/bitstring>>) -> Rest.


%% Decode an encoded binary/bitstring. Each payload byte is preceded by a
%% one bit; the last payload byte is followed by zero padding to the next
%% byte boundary and a byte holding the number of significant bits in it.
%% A lone byte 8 denotes the empty binary.
decode_binary(<<8, Rest/binary>>) -> {<<>>, Rest};
decode_binary(B)                  -> decode_binary(B, 0, <<>>).

decode_binary(<<1:1,H:8,Rest/bitstring>>, N, Acc) ->
    case Rest of
        <<1:1,_/bitstring>> ->
            decode_binary(Rest, N+9, << Acc/binary, H >>);
        _ ->
            Pad = 8 - ((N+9) rem 8),
            <<0:Pad,EndBits,Rest1/binary>> = Rest,
            TailPad = 8-EndBits,
            %% NOTE(review): reconstructed; the unused low bits of the last
            %% byte are required to be zero here - confirm.
            <<Tail:EndBits,0:TailPad>> = <<H>>,                   % (*)
            {<< Acc/binary, Tail:EndBits >>, Rest1}
    end.

%% Decode a negated (bitwise inverted) encoded binary; the mirror image of
%% decode_binary/1. A lone byte 247 denotes the empty binary.
%%
%% NOTE(review): the patterns marked (*) in this section were missing from
%% the text this was restored from and are reconstructed from the
%% variables in scope - confirm against the encoder.
decode_neg_binary(<<247, Rest/binary>>) -> {<<>>, Rest};  % 16#ff - 8
decode_neg_binary(B)                    -> decode_neg_binary(B, 0, <<>>).

decode_neg_binary(<<0:1,H:8,Rest/bitstring>>, N, Acc) ->
    case Rest of
        <<0:1,_/bitstring>> ->
            decode_neg_binary(Rest, N+9, << Acc/binary, (16#ff - H) >>);
        _ ->
            Pad = 8 - ((N+9) rem 8),
            ?dbg("Pad = ~p~n", [Pad]),
            %% padding is all ones in the negated form
            IPad = (1 bsl Pad) - 1,
            <<IPad:Pad,EndBits0,Rest1/binary>> = Rest,            % (*)
            ?dbg("EndBits0 = ~p~n", [EndBits0]),
            EndBits = 16#ff - EndBits0,
            ?dbg("EndBits = ~p~n", [EndBits]),
            if EndBits == 0 ->
                    {<< Acc/binary, (16#ff - H)>>, Rest1};
               true ->
                    <<Tail:EndBits,_/bitstring>> = <<(16#ff - H)>>, % (*)
                    ?dbg("Tail = ~p~n", [Tail]),
                    {<< Acc/binary, Tail:EndBits >>, Rest1}
            end
    end.

%% The largest value that fits in Sz bits
max_value(Sz) ->
    (1 bsl Sz) - 1.

%% The largest value that fits in Words*64 bits.
imax(1) -> max_value(64);
imax(2) -> max_value(128);
imax(Words) -> max_value(Words*64).

%% Get the smallest imax/1 value that's larger than I.
%% Returns {Words, Max}.
get_max(I) -> get_max(I, 1, imax(1)).
get_max(I, W, Max) when I > Max ->
    get_max(I, W+1, (Max bsl 64) bor ?IMAX1);
get_max(_, W, Max) ->
    {W, Max}.

%% @spec to_sb32(Bits::bitstring()) -> binary()
%% @doc Converts a bitstring into an sb-encoded bitstring
%%
%% sb32 (Sortable base32) is a variant of RFC3548, slightly rearranged to
%% preserve the lexical sorting properties. Base32 was chosen to avoid
%% filename-unfriendly characters. Also important is that the padding
%% character be less than any character in the alphabet
%%
%% sb32 alphabet:
%% <pre>
%% 0 0     6 6     12 C     18 I     24 O     30 U
%% 1 1     7 7     13 D     19 J     25 P     31 V
%% 2 2     8 8     14 E     20 K     26 Q  (pad) -
%% 3 3     9 9     15 F     21 L     27 R
%% 4 4    10 A     16 G     22 M     28 S
%% 5 5    11 B     17 H     23 N     29 T
%% </pre>
%% @end
%%
to_sb32(Bits) when is_bitstring(Bits) ->
    Sz = bit_size(Bits),
    %% Chunk is the part made of whole 5-bit groups; Rest is the value of
    %% the 1..4 leftover bits (or <<>> when there are none).
    {Chunk, Rest, Pad} =
        case Sz rem 5 of
            0 -> {Bits, <<>>, <<>>};
            R -> sb32_encode_chunks(Sz, R, Bits)
        end,
    Enc = << << (c2sb32(C1)) >> ||
              <<C1:5>> <= Chunk >>,                               % (*)
    if Rest == << >> ->
            Enc;
       true ->
            << Enc/bitstring, (c2sb32(Rest)):8, Pad/binary >>
    end.

%% Split Bits into the 5-bit-aligned part and the Rem leftover bits, and
%% pick the padding that records how many bits were left over.
sb32_encode_chunks(Sz, Rem, Bits) ->
    ChunkSz = Sz - Rem,
    << C:ChunkSz/bitstring, Rest:Rem >> = Bits,
    Pad = encode_pad(Rem),
    {C, Rest, Pad}.

%% The padding length identifies the number of leftover bits.
encode_pad(3) -> <<"------">>;
encode_pad(1) -> <<"----">>;
encode_pad(4) -> <<"---">>;
encode_pad(2) -> <<"-">>.

%% @spec from_sb32(Bits::bitstring()) -> bitstring()
%% @doc Converts from an sb32-encoded bitstring into a 'normal' bitstring
%%
%% This function is the reverse of {@link to_sb32/1}.
%% @end
%%
from_sb32(<< C:8, "------" >>) -> << (sb322c(C)):3 >>;
from_sb32(<< C:8, "----" >>  ) -> << (sb322c(C)):1 >>;
from_sb32(<< C:8, "---" >>   ) -> << (sb322c(C)):4 >>;
from_sb32(<< C:8, "-" >>     ) -> << (sb322c(C)):2 >>;
from_sb32(<< C:8, Rest/bitstring >>) ->
    << (sb322c(C)):5, (from_sb32(Rest))/bitstring >>;
from_sb32(<< >>) ->
    << >>.

%% 5-bit value -> sb32 character.
c2sb32(I) when 0 =< I, I =< 9 -> $0 + I;
c2sb32(I) when 10 =< I, I =< 31 -> $A + I - 10.

%% sb32 character -> 5-bit value.
sb322c(I) when $0 =< I, I =< $9 -> I - $0;
sb322c(I) when $A =< I, I =< $V -> I - $A + 10.

%% @spec to_hex(Bin::binary()) -> binary()
%% @doc Converts a binary into a hex-encoded binary
%% This is conventional hex encoding, with the proviso that
%% only capital letters are used, e.g. `0..9A..F'.
%% @end
to_hex(Bin) ->
    << << (nib2hex(N)):8 >> || <<N:4>> <= Bin >>.                 % (*)

%% @spec from_hex(Bin::binary()) -> binary()
%% @doc Converts from a hex-encoded binary into a 'normal' binary
%%
%% This function is the reverse of {@link to_hex/1}.
%%
%% NOTE(review): the generator pattern was missing from the text this was
%% restored from; one byte per hex digit follows from hex2nib/1 taking a
%% character.
from_hex(Bin) ->
    << << (hex2nib(H)):4 >> || <<H>> <= Bin >>.

%% 4-bit value -> upper-case hex digit.
nib2hex(N) when 0 =< N, N =< 9 -> $0 + N;
nib2hex(N) when 10 =< N, N =< 15-> $A + N - 10.

%% Upper-case hex digit -> 4-bit value; lower-case digits are rejected.
hex2nib(C) when $0 =< C, C =< $9 -> C - $0;
hex2nib(C) when $A =< C, C =< $F -> C - $A + 10.

-ifdef(TEST).
-include_lib("eunit/include/eunit.hrl").

%% Every term in test_list/0 must survive an encode/decode round trip.
%% An exception from the codec now fails the test with its stack trace
%% instead of being turned into a value by an old-style `catch'.
encode_test() ->
    [?assertEqual(I, decode(encode(I))) || I <- test_list()].

test_list() ->
    [-456453453477456464.45456,
     -5.23423564,
     -1.234234,
     -1.23423,
     -0.345,
     -0.34567,
     -0.0034567,
     0,
     0.00012345,
     0.12345,
     1.2345,
     123.45,
     456453453477456464.45456,
     a,
     aaa,
     {},
     {1},
     {1,2},
     {"","123"},
     {"1","234"},
     <<>>,
     <<1>>,
     <<1,5:3>>,
     <<1,5:4>>,
     [1,2,3],
     [],
     self(),
     spawn(fun() -> ok end),
     make_ref(),
     make_ref()|
     lists:sublist(erlang:ports(),1,2)].

-endif.

--------------------------------------------------------------------------------
/test/sext_eqc.erl:
--------------------------------------------------------------------------------
%% -*- erlang-indent-level: 4; indent-tabs-mode: nil -*-
%%==============================================================================
%% Copyright 2014-16 Ulf Wiger
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%==============================================================================

-module(sext_eqc).

%% Prefer QuickCheck, but otherwise try with Proper (some properties will
%% have trouble under Proper - feel free to investigate).
%% QC ends up defined as the chosen property-testing module; when neither
%% EQC nor PROPER is defined, nothing in this branch defines it.
-ifdef(EQC).
-undef(QC).
-define(QC,eqc).
-include_lib("eqc/include/eqc.hrl").
-else.
-ifdef(PROPER).
-undef(QC).
-define(QC,proper).
-include_lib("proper/include/proper.hrl").
-endif.
-endif.

%% Everything below is compiled only when QC is defined.
-ifdef(QC).
-compile(export_all).
-include_lib("eunit/include/eunit.hrl").

%% Number of test cases to run per property: the integer in the
%% SEXT_TESTS environment variable if it is set, otherwise Default.
%% A non-numeric value makes list_to_integer/1 raise badarg.
get_n(Default) ->
    case os:getenv("SEXT_TESTS") of
        false -> Default;
        Res ->
            list_to_integer(Res)
    end.

%% EUnit generator: one test fun per property, each running the property
%% get_n(500) times, all sharing a 60 second timeout.
sext_test_() ->
    NumTests = get_n(500),
    Props =
        [ {prop_encode, fun prop_encode/0}
        , {prop_encode_rev, fun prop_encode_rev/0}
        , {prop_decode_legacy_big, fun prop_decode_legacy_big/0}
        , {prop_decode_legacy_neg_big, fun prop_decode_legacy_neg_big/0}
        , {prop_prefix_equiv, fun prop_prefix_equiv/0}
        , {prop_sort, fun prop_sort/0}
        , {prop_revsort, fun prop_revsort/0}
        , {prop_sort_big, fun prop_sort_big/0}
        , {prop_sort_neg_big, fun prop_sort_neg_big/0}
        , {prop_revsort_neg_big, fun prop_revsort_neg_big/0}
        , {prop_encode_sb32, fun prop_encode_sb32/0}
        , {prop_sort_sb32, fun prop_sort_sb32/0}
        , {prop_partial_decode1, fun prop_partial_decode1/0}
        , {prop_partial_decode2, fun prop_partial_decode2/0}
        , {prop_partial_decode_plus1, fun prop_partial_decode_plus1/0}
        , {prop_partial_decode_plus2, fun prop_partial_decode_plus2/0}
        , {prop_is_prefix1, fun prop_is_prefix1/0}
        , {prop_is_prefix2, fun prop_is_prefix2/0}
        , {prop_encode_hex, fun prop_encode_hex/0}
        , {prop_sort_hex, fun prop_sort_hex/0}
        , {prop_is_prefix_hex1, fun prop_is_prefix_hex1/0}
        , {prop_is_prefix_hex2, fun prop_is_prefix_hex2/0}
        , {prop_non_proper_sorts, fun prop_non_proper_sorts/0}
        ],
    {timeout, 60,
     [fun() -> t(run(NumTests, Label, Prop)) end || {Label, Prop} <- Props]}.

%% Assert that a property run succeeded. Accepts either the
%% {Label, Result} pair returned by run/3 or a bare result.
t(Result) ->
    Outcome = case Result of
                  {_Label, Res} -> Res;
                  Res -> Res
              end,
    ?assert(Outcome == true).

%% Run the run/1 property set with the default number of tests.
run() ->
    NumTests = good_number_of_tests(),
    run(NumTests).

%% Default number of tests per property for run/0 (overridable through
%% the SEXT_TESTS environment variable, see get_n/1).
good_number_of_tests() ->
    get_n(2000).

%% Run the listed properties Num times each, in order, and return the
%% list of {Label, Result} pairs.
run(Num) ->
    Props =
        [ {prop_encode, fun prop_encode/0}
        , {prop_decode_legacy_big, fun prop_decode_legacy_big/0}
        , {prop_decode_legacy_neg_big, fun prop_decode_legacy_neg_big/0}
        , {prop_prefix_equiv, fun prop_prefix_equiv/0}
        , {prop_sort, fun prop_sort/0}
        , {prop_sort_big, fun prop_sort_big/0}
        , {prop_sort_neg_big, fun prop_sort_neg_big/0}
        , {prop_encode_sb32, fun prop_encode_sb32/0}
        , {prop_sort_sb32, fun prop_sort_sb32/0}
        , {prop_partial_decode1, fun prop_partial_decode1/0}
        , {prop_partial_decode2, fun prop_partial_decode2/0}
        , {prop_partial_decode_plus1, fun prop_partial_decode_plus1/0}
        , {prop_partial_decode_plus2, fun prop_partial_decode_plus2/0}
        , {prop_is_prefix1, fun prop_is_prefix1/0}
        , {prop_is_prefix2, fun prop_is_prefix2/0}
        , {prop_non_proper_sorts, fun prop_non_proper_sorts/0}
        ],
    [run(Num, Label, Prop) || {Label, Prop} <- Props].

%% Run one property Num times, reporting progress on the `user' device.
%% F is a zero-arity fun that returns the property.
run(Num, Lbl, F) ->
    io:fwrite(user, "EQC test: ~p (~p)... ", [Lbl, Num]),
    Limited = ?QC:numtests(Num, F()),
    Res = ?QC:quickcheck(Limited),
    io:fwrite(user, "-> ~p~n", [Res]),
    {Lbl, Res}.


%% In this property, the ?IMPLIES condition guards us against the
%% unfortunate case where {1, 1.0} will have a strict ordering when
%% encoded (in order to satisfy the encode property), but not in Erlang
%% since they compare as equal. It seems a reasonable limitation, that
%% we limit ourselves to testing the sort order of term pairs where the
%% values actually differ.
prop_sort() ->
    ?FORALL({A,B}, {term_(), term_()},
            begin
                EncA = sext:encode(A),
                EncB = sext:encode(B),
                collect(size(term_to_binary({A,B})),
                        comp(EncA,EncB) == comp_i(A,B))
            end).

%% Reversed encodings must sort in the opposite order of the terms.
prop_revsort() ->
    ?FORALL({A,B}, {term_(), term_()},
            begin
                RevA = sext:reverse_sext(sext:encode(A)),
                RevB = sext:reverse_sext(sext:encode(B)),
                collect(size(term_to_binary({A,B})),
                        comp(RevA,RevB) == comp_i(B,A))
            end).

%% Encodings of large positive integers sort like the integers.
prop_sort_big() ->
    ?FORALL({A,B}, {big(), big()},
            begin
                EncA = sext:encode(A),
                EncB = sext:encode(B),
                collect(size(term_to_binary({A,B})),
                        comp(EncA,EncB) == comp_i(A,B))
            end).

%% Encodings of large negative integers sort like the integers.
prop_sort_neg_big() ->
    ?FORALL({A,B}, {neg_big(), neg_big()},
            begin
                EncA = sext:encode(A),
                EncB = sext:encode(B),
                collect(size(term_to_binary({A,B})),
                        comp(EncA,EncB) == comp_i(A,B))
            end).

%% Reversed encodings of large negative integers sort in opposite order.
prop_revsort_neg_big() ->
    ?FORALL({A,B}, {neg_big(), neg_big()},
            begin
                RevA = sext:reverse_sext(sext:encode(A)),
                RevB = sext:reverse_sext(sext:encode(B)),
                collect(size(term_to_binary({A,B})),
                        comp(RevA,RevB) == comp_i(B,A))
            end).

%% sb32 encodings sort like the terms.
prop_sort_sb32() ->
    ?FORALL({A,B}, {term_(), term_()},
            begin
                EncA = sext:encode_sb32(A),
                EncB = sext:encode_sb32(B),
                collect(size(term_to_binary({A,B})),
                        comp(EncA,EncB) == comp_i(A,B))
            end).

%% Hex encodings sort like the terms.
prop_sort_hex() ->
    ?FORALL({A,B}, {term_(), term_()},
            begin
                EncA = sext:encode_hex(A),
                EncB = sext:encode_hex(B),
                collect(size(term_to_binary({A,B})),
                        comp(EncA,EncB) == comp_i(A,B))
            end).


%% Encodings of positive floats sort like the floats.
prop_sort_fs() ->
    ?FORALL({FloatA,FloatB}, {pos_float(),pos_float()},
            begin
                EncA = sext:encode(FloatA),
                EncB = sext:encode(FloatB),
                comp(FloatA,FloatB) == comp(EncA,EncB)
            end).

%% Encodings of negative floats sort like the floats.
prop_sort_neg_fs() ->
    ?FORALL({FloatA,FloatB}, {neg_float(), neg_float()},
            begin
                EncA = sext:encode(FloatA),
                EncB = sext:encode(FloatB),
                comp(FloatA,FloatB) == comp(EncA,EncB)
            end).

%% Encode followed by decode gives the term back.
prop_encode() ->
    ?FORALL(Term, term_(),
            sext:decode(sext:encode(Term)) == Term).

%% Decoding a reversed encoding twice gives the term back.
prop_encode_rev() ->
    ?FORALL(T, term_(),
            sext:decode(sext:decode(
                          sext:reverse_sext(sext:encode(T)))) == T).

%% Large positive integers encoded with the second argument `true'
%% (the "legacy" form, going by the property name) still decode.
prop_decode_legacy_big() ->
    ?FORALL(T, big(),
            sext:decode(sext:encode(T, true)) == T).

%% As above, for large negative integers.
prop_decode_legacy_neg_big() ->
    ?FORALL(T, neg_big(),
            sext:decode(sext:encode(T, true)) == T).

%% sb32 round trip.
prop_encode_sb32() ->
    ?FORALL(T, term_(),
            sext:decode_sb32(sext:encode_sb32(T)) == T).

%% Hex round trip.
prop_encode_hex() ->
    ?FORALL(T, term_(),
            sext:decode_hex(sext:encode_hex(T)) == T).

%% For a term without wildcards, prefix/1 and encode/1 agree.
prop_prefix_equiv() ->
    ?FORALL(T, term_(),
            sext:encode(T) == sext:prefix(T)).

%% Partial-decoding a whole term should give the term back
prop_partial_decode1() ->
    ?FORALL(T, term_(),
            begin
                Enc = sext:encode(T),
                {full, Dec, Rest} = sext:partial_decode(Enc),
                Dec == T andalso Rest == <<>>
            end).

%% Partial-decoding a prefix should give a _comparable_ prefix back
prop_partial_decode2() ->
    ?FORALL(Pat, wild_pat(),
            begin
                Pfx = sext:prefix(Pat),
                case sext:partial_decode(Pfx) of
                    {full, _, _} -> true;
                    {partial, Dec, Rest} ->
                        comp_pat(Dec, Pat) andalso Rest == <<>>
                end
            end).

%% A sext term followed by something not sext-encoded
%% NOTE(review): the argument to partial_decode/1 was missing from the
%% text this was restored from; it is reconstructed from the expected
%% <<"foo">> remainder.
prop_partial_decode_plus1() ->
    ?FORALL(T, term_(),
            begin
                Enc = sext:encode(T),
                {full, Dec, <<"foo">>} =
                    sext:partial_decode(<<Enc/binary, "foo">>),
                Dec == T
            end).

%% A sext prefix followed by something not sext-encoded
%% NOTE(review): argument reconstructed as in prop_partial_decode_plus1/0.
prop_partial_decode_plus2() ->
    ?FORALL(Pat, wild_pat(),
            begin
                Pfx = sext:prefix(Pat),
                case sext:partial_decode(<<Pfx/binary, "foo">>) of
                    {full, Dec, <<"foo">>} ->
                        Dec == Pat;
                    {partial, Dec, <<"foo">>} ->
                        comp_pat(Dec, Pat)
                end
            end).

%% Generator: a prefixable term (non-empty tuple or list) with at least
%% one position, in which one randomly chosen position has been replaced
%% by a wildcard via make_wild/3.
wild_pat() ->
    ?LET({T,W}, {?SUCHTHAT(Tp, prefixable_term(),
                           positions(Tp) > 0),wild()},
         ?LET(P, choose(1, positions(T)),
              make_wild(T, P, W))).

%% True if the decoded prefix (first argument) is compatible with the
%% pattern it was produced from (second argument).
comp_pat(X, X) -> true;
comp_pat(A, B) when is_tuple(A), is_tuple(B), size(A) == size(B) ->
    comp_pat_l(tuple_to_list(A), tuple_to_list(B));
comp_pat(Dec, Pat) when is_list(Dec), is_list(Pat) ->
    comp_pat_l(Dec, Pat);
comp_pat(A, B) -> % A: decoded; B: prefix
    case {is_wild(A), is_wild(B)} of
        {true, true} -> true;
        {true, false} ->
            case B of
                [H|_] ->
                    %% This is because the decoded prefix of [] and ['_'|'_']
                    %% are both '_'
                    is_wild(H);
                _ -> false
            end;
        _ ->
            false
    end.

%% Element-wise comparison; everything after the first wildcard in the
%% decoded list is accepted without being inspected.
comp_pat_l([H1|T1], [H2|T2]) ->
    case is_wild(H1) of
        true -> true;
        false ->
            case comp_pat(H1, H2) of
                true -> comp_pat_l(T1, T2);
                false -> false
            end
    end;
comp_pat_l([], []) -> true;
comp_pat_l(A, _) ->
    %% lists of different length, or an improper tail on the decoded side
    is_wild(A).


%% A prefix built from T with one position made wild must be a byte
%% prefix of the encoding of T.
prop_is_prefix1() ->
    ?FORALL({T,W}, {?SUCHTHAT(Tp, prefixable_term(),
                              positions(Tp) > 0),wild()},
            ?LET(P, choose(1, positions(T)),
                 begin
                     Pfx = sext:prefix(make_wild(T,P,W)),
                     true = is_prefix(Pfx, sext:encode(T))
                 end)).

%% Making an earlier position (P-1) wild must give a byte prefix of the
%% prefix obtained with position P wild.
prop_is_prefix2() ->
    ?FORALL({T,W}, {?SUCHTHAT(Tp, prefixable_term(),
                              positions(Tp) > 2), wild()},
            ?LET(P, choose(2, positions(T)),
                 begin
                     {Pfx1,Pfx2} = {sext:prefix(make_wild(T,P,W)),
                                    sext:prefix(make_wild(T,P-1,W))},
                     true = is_prefix(Pfx2, Pfx1)
                 end)).

%% Same as prop_is_prefix1/0, using the hex-encoded variants.
prop_is_prefix_hex1() ->
    ?FORALL({T,W}, {?SUCHTHAT(Tp, prefixable_term(),
                              positions(Tp) > 0),wild()},
            ?LET(P, choose(1, positions(T)),
                 begin
                     Pfx = sext:prefix_hex(make_wild(T,P,W)),
                     true = is_prefix(Pfx, sext:encode_hex(T))
                 end)).

%% Same as prop_is_prefix2/0, using the hex-encoded variants.
prop_is_prefix_hex2() ->
    ?FORALL({T,W}, {?SUCHTHAT(Tp, prefixable_term(),
                              positions(Tp) > 2), wild()},
            ?LET(P, choose(2, positions(T)),
                 begin
                     {Pfx1,Pfx2} = {sext:prefix_hex(make_wild(T,P,W)),
                                    sext:prefix_hex(make_wild(T,P-1,W))},
                     true = is_prefix(Pfx2, Pfx1)
                 end)).

%% A list, the same list with an improper tail, and the same list with
%% that tail appended as an element must sort the same way encoded as
%% they do as terms.
prop_non_proper_sorts() ->
    ?FORALL({L,T}, {non_empty_list(), simple_term()},
            begin
                List = [{L, 1},
                        {L ++ T, 2},
                        {L ++ [T], 3}],
                Encoded = [{sext:encode(A),B} || {A,B} <- List],
                Sorted1 = lists:keysort(1, List),
                Sorted2 = lists:keysort(1, Encoded),
                [I || {_,I} <- Sorted1]
                    == [J || {_,J} <- Sorted2]
            end).

%% Round trip for negative floats.
prop_encode_neg_fs() ->
    ?FORALL(T, neg_float(),
            sext:decode(sext:encode(T)) == T).

%% Round trip for large positive integers.
prop_encode_big() ->
    ?FORALL(T, big(),
            sext:decode(sext:encode(T)) == T).

%% Round trip for large negative integers.
prop_encode_neg_big() ->
    ?FORALL(T, neg_big(),
            sext:decode(sext:encode(T)) == T).


%% Three-way comparison returning less | equal | more. Values that are
%% `==' but not `=:=' are given a strict order instead of `equal'.
comp(A,B) when A == B, A =/= B ->
    %% can only happen when either is a float and the other an int
    IsMore = if A < 0 ->
                     is_float(B);
                true ->
                     is_float(A)
             end,
    case IsMore of
        true -> more;
        false -> less
    end;
comp(A,B) when A < B -> less;
comp(A,A) -> equal;
comp(_,_) -> more.

%% Like comp/2, but same-size tuples and lists are compared element by
%% element so that the int/float rule above applies inside them too.
comp_i(Ta, Tb) when is_tuple(Ta), is_tuple(Tb),
                    tuple_size(Ta) == tuple_size(Tb) ->
    comp_l(tuple_to_list(Ta), tuple_to_list(Tb));
comp_i(La, Lb) when is_list(La), is_list(Lb) ->
    comp_l(La, Lb);
comp_i(A, B) ->
    comp(A, B).

%% Lexicographic list comparison. Note that elements are compared with
%% comp/2, not comp_i/2, so nested containers are compared as whole terms.
comp_l([]     , []     ) -> equal;
comp_l([]     , [_|_]  ) -> less;
comp_l([_|_]  , []     ) -> more;
comp_l([Ha|Ta],[Hb|Tb]) ->
    case comp(Ha, Hb) of
        equal ->
            comp_l(Ta, Tb);
        Other ->
            Other
    end;
comp_l(A, B) -> % A or B was an improper list
    comp_i(A, B).

%% True if binary A is a byte prefix of binary B.
is_prefix(A, B) ->
    Sz = byte_size(A),
    binary:longest_common_prefix([A,B]) == Sz.

%% Always-true property that only records the external size of the
%% generated terms.
prop_measure_term() ->
    ?FORALL(T,term_(),
            measure(term_size,size(term_to_binary(T)),true)).

%% Generator: one non-container term.
simple_term() ->
    oneof(simple_types()).

%% Generator: an arbitrary term whose depth follows the current size.
term_() ->
    ?SIZED(Size,term(Size)).

term(0) ->
    simple_term();
term(Size) ->
    %% You need ?LAZY for recursive generators!
    ?LAZY(oneof(
            simple_types() ++
                [
                 %% Don't make lists and tuples EXACTLY Size long
                 alist(Size),
                 non_proper_list(Size),
                 atuple(Size),
                 astring(Size)])).

simple_types() ->
    [int(),
     big(),
     pos_float(),
     neg_float(),
     anatom(),
     abin(),
     abitstr()].

%% Generator: a positive integer of at least 16#ffffFFFF.
big() ->
    ?LET({X,M}, {nat(), pos()},
         %% Multiply by the cube of `M'
         %% to get the generator big enough.
         %% Verified w/ `eqc_gen:sample/1'
         (16#ffffFFFF + X) * (M * M * M)).

neg_big() ->
    ?LET(B, big(), -B).

%% Generator: a strictly positive natural number.
pos() ->
    ?SUCHTHAT(N,nat(),N>0).

%% Set the Size just for list generation.

alist() ->
    ?SIZED(Size, alist(Size)).

%% Elements are generated at a third of the list's size.
alist(Size) ->
    list(Size,term(Size div 3)).

non_proper_list(Size) ->
    ?LET(L,alist(Size),make_non_proper(L)).

%% A list generated at size Size whose elements keep the outer size S.
list(Size,G) ->
    ?SIZED(S,resize(Size,list(resize(S,G)))).

atuple(Size) ->
    ?LET(L, alist(Size), list_to_tuple(L)).

anatom() ->
    oneof([a,b,c,aa,bb,cc]).

%% Strings of characters in the range $A..$z.
astring(0) -> "";
astring(Size) ->
    list(Size, choose($A,$z)).

abin() ->
    ?LET(L, list(choose(0,255)), list_to_binary(L)).

%% Generator: a binary followed by 0..7 extra bits.
%% NOTE(review): the constructed bitstring was missing from the text this
%% was restored from; it is reconstructed from the bound variables (N is
%% chosen to fit in Sz bits).
abitstr() ->
    ?LET({Bin, Sz}, {abin(), choose(0, 7)},
         ?LET(N, choose(0, 16#ff bsr (8-Sz)),
              <<Bin/binary, N:Sz>>)).

pos_float() ->
    ?LET(F, ?SUCHTHAT(R, real(), R > 0 andalso is_float(R)),
         norm(F)).

neg_float() ->
    ?LET(F, ?SUCHTHAT(R, real(), R < 0 andalso is_float(R)),
         norm(F)).

%% Pass a float through its binary representation and back.
%% NOTE(review): both binary expressions were missing from the text this
%% was restored from; the default (64-bit) float segment is assumed.
norm(F) when is_float(F) ->
    <<G/float>> = <<F/float>>,
    G.

%% Turn the last two elements of a proper list into an improper tail:
%% [.., A, B] becomes [.., A | B]. Lists shorter than two elements are
%% returned unchanged.
make_non_proper([A,B]) -> [A|B];
make_non_proper([A]) -> [A];
make_non_proper([A|B]) -> [A|make_non_proper(B)];
make_non_proper([]) -> [].


%% Generator: terms that prefix/1 can be given wildcards in.
prefixable_term() ->
    oneof([non_empty_tuple(),
           non_empty_list()]).

non_empty_tuple() ->
    ?LET(L, non_empty_list(),
         list_to_tuple(L)).

non_empty_list() ->
    non_empty(alist()).

%% Number of leaf positions in a term: tuples and lists are descended
%% into, every other term (including an improper tail) counts as one.
positions(T) ->
    positions(T, 0).

positions(T, Acc) when is_tuple(T) ->
    positions(tuple_to_list(T), Acc);
positions([H|T], Acc) ->
    positions(T, positions(H) + Acc);
positions([], Acc) ->
    Acc;
positions(_, Acc) ->
    Acc+1.

%% True for '_' and for atoms of the form '$<integer>'.
is_wild('_') -> true;
is_wild(A) when is_atom(A) ->
    case atom_to_list(A) of
        "\$" ++ Is ->
            try _ = list_to_integer(Is),
                  true
            catch
                error:_ ->
                    false
            end;
        _ ->
            false
    end;
is_wild(_) ->
    false.

%% Replace leaf position P (1-based, counted as in positions/1) of the
%% tuple or list T with the wildcard W.
make_wild(T, P, W) when P > 0 ->
    if is_tuple(T) ->
            {Res,_} = make_wild1(tuple_to_list(T), P, W, []),
            list_to_tuple(Res);
       is_list(T) ->
            {Res,_} = make_wild1(T, P, W, []),
            Res
    end.

%% Worker for make_wild/3. Walks the list while counting down P and
%% returns {NewList, RemainingP}; RemainingP is 0 once the wildcard has
%% been placed. Acc holds the elements already passed, in reverse.
make_wild1(L, 0, _, Acc) ->
    %% wildcard already placed: keep the remainder unchanged
    {lists:reverse(Acc) ++ L, 0};
make_wild1(T, P, W, Acc) when not(is_list(T)) ->
    %% improper tail: counts as one position
    if P == 1 ->
            {lists:reverse(Acc) ++ W, 0};
       true ->
            {lists:reverse(Acc) ++ T, P-1}
    end;
make_wild1([_|T], 1, W, Acc) ->
    %% this is the position to replace; the head is swapped for W whatever
    %% it is, including a nested tuple or list
    {lists:reverse(Acc) ++ [W|T], 0};
make_wild1([H|T], P, W, Acc) ->
    if is_tuple(H) ->
            {H1,P1} = make_wild1(tuple_to_list(H), P, W, []),
            make_wild1(T, P1, W, [list_to_tuple(H1)|Acc]);
       is_list(H) ->
            {H1,P1} = make_wild1(H, P, W, []),
            make_wild1(T, P1, W, [H1|Acc]);
       true ->
            make_wild1(T, P-1, W, [H|Acc])
    end;
make_wild1([], P, _W, Acc) ->
    {lists:reverse(Acc), P}.

%% Generator: one of the wildcard atoms recognised by is_wild/1.
wild() ->
    oneof(['_','$1','$9999']).

%% Replace the P:th element (1-based) of list L with V.
lists_replace(L, P, V) when P > 0, P =< length(L) ->
    {L1, [_|L2]} = lists:split(P-1, L),
    L1 ++ [V] ++ L2.

-endif.


--------------------------------------------------------------------------------