├── .eqc_ci
├── .github
└── FUNDING.yml
├── .gitignore
├── EQC_CI_LICENCE.txt
├── LICENSE
├── NOTICE
├── README.md
├── doc
├── README.md
├── edoc-info
├── erlang.png
├── overview.edoc
├── sext.md
└── stylesheet.css
├── examples
└── tt_proto.erl
├── rebar.config
├── src
├── sext.app.src
└── sext.erl
└── test
└── sext_eqc.erl
/.eqc_ci:
--------------------------------------------------------------------------------
1 | {build,"mkdir -p ebin; erlc -o ebin -DEQC +\\{parse_transform,eqc_cover\\} src/*.erl test/*.erl"}.
2 | {test_path, "ebin"}.
3 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: [uwiger]
4 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .rebar3
2 | _*
3 | .eunit
4 | *.o
5 | *.beam
6 | *.plt
7 | *.swp
8 | *.swo
9 | .erlang.cookie
10 | log
11 | erl_crash.dump
12 | .rebar
13 | logs
14 | _build
15 | Barrel.nonode@nohost
16 | *.iml
17 | .idea
18 | deps
19 | .eunit/
20 | ebin
21 | *~
22 | */*~
23 | erl_crash.dump
24 | current_counterexample.eqc
25 |
--------------------------------------------------------------------------------
/EQC_CI_LICENCE.txt:
--------------------------------------------------------------------------------
1 | This file is an agreement between Quviq AB ("Quviq"), Sven Hultins
2 | Gata 9, Gothenburg, Sweden, and the committers to the github
3 | repository in which the file appears ("the owner"). By placing this
4 | file in a github repository, the owner agrees to the terms below.
5 |
6 | The purpose of the agreement is to enable Quviq AB to provide a
7 | continuous integration service to the owner, whereby the code in the
8 | repository ("the source code") is tested using Quviq's test tools, and
9 | the test results are made available on the web. The test results
10 | include test output, generated test cases, and a copy of the source
11 | code in the repository annotated with coverage information ("the test
12 | results").
13 |
14 | The owner agrees that Quviq may run the tests in the source code and
15 | display the test results on the web, without obligation.
16 |
17 | The owner warrants that running the tests in the source code and
18 | displaying the test results on the web violates no laws, licences or other
19 | agreements. In the event of such a violation, the owner accepts full
20 | responsibility.
21 |
22 | The owner warrants that the source code is not malicious, and will not
23 | mount an attack on either Quviq's server or any other server--for
24 | example by taking part in a denial of service attack, or by attempting
25 | to send unsolicited emails.
26 |
27 | The owner warrants that the source code does not attempt to reverse
28 | engineer Quviq's code.
29 |
30 | Quviq reserves the right to exclude repositories that break this
31 | agreement from its continuous integration service.
32 |
33 | Any dispute arising from the use of Quviq's service will be resolved
34 | under Swedish law.
35 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 |
--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
1 | Original author: Ulf Wiger, Erlang Solutions, 2009
2 |
3 | Copyright transferred to Ulf Wiger 2014
4 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # The sext application #
2 |
3 | [QuickCheck-CI build status](http://quickcheck-ci.com/p/uwiger/sext)
4 |
5 | __Authors:__ Ulf Wiger ([`ulf@wiger.net`](mailto:ulf@wiger.net)).
6 |
7 | A sortable serialization library
8 | This library offers a serialization format (a la term_to_binary()) that
9 | preserves the Erlang term order.
10 |
11 | ```
12 |
13 | Copyright 2014-2020 Ulf Wiger
14 |
15 | Licensed under the Apache License, Version 2.0 (the "License");
16 | you may not use this file except in compliance with the License.
17 | You may obtain a copy of the License at
18 |
19 | http://www.apache.org/licenses/LICENSE-2.0
20 |
21 | Unless required by applicable law or agreed to in writing, software
22 | distributed under the License is distributed on an "AS IS" BASIS,
23 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
24 | See the License for the specific language governing permissions and
25 | limitations under the License.
26 |
27 | ```
28 |
29 |
30 | # 1. Introduction #
31 |
32 | The idea for this library came out of the need for disk-based storage
33 | with ordered_set semantics in Erlang. One previous solution used Tokyo Cabinet,
34 | in which a C routine is used to hook into the sorting logic of TC.
35 |
36 | I thought a more generic solution would be to be able to have a version
37 | of term_to_binary() that respected the ordering semantics of Erlang terms.
38 |
39 | A new addition is support for 'sb32' encoding. This is my own version of
40 | Base32 encoding, with a slightly different alphabet, in order to preserve
41 | sorting properties while generating octet strings that are perfectly safe
42 | to use in file names.
43 |
44 | Another feature is "prefix encoding", which encodes a term and truncates
45 | the result if it encounters a "wildcard" (e.g. `'$1'`
46 | or `'_'`). This is to enable a convenient and efficient mapping
47 | of Erlang match specifications to e.g. prefix matching on the external storage
48 | and subsequent match_spec matching on the found erlang terms.
49 |
50 | The serialization format supports all Erlang types, and preserves the
51 | internal Erlang term order, with a few exceptions:
52 |
53 | * Floats are represented based on the IEEE 754 Binary 64 standard
54 | representation. This is the representation used by Erlang, specifically
55 | the representation used when encoding floats in binaries. To be exact,
56 | `sext` first normalizes the float by encoding it as an Erlang binary, then
57 | serializes it.
58 |
59 | * In Erlang, integers are cast to floats before comparing them to a float.
60 | This means e.g. that the relative sort order of `1` and `1.0` is undefined.
61 | It is not possible for `sext` to preserve this ambiguity after serialization,
62 | since it could only be done by producing identical encodings for the two
63 | terms, thereby sacrificing the property that encoding a value and then
64 | decoding it again, should produce the initial value.
65 |
66 |
67 | # 2. Specification #
68 |
69 |
70 | ## 2.1 Type tags ##
71 |
72 | Each data type is encoded using a type tag (1 byte) that represents its order
73 | in the global Erlang term ordering. The number type is divided into several
74 | subtypes, to facilitate a reasonably efficient representation:
75 |
76 |
77 |
Type | Description | Tag |
---|---|---|
negbig | Negative bignum | 8 |
neg4 | Negative 31-bit integer | 9 |
pos4 | Positive 31-bit integer | 10 |
posbig | Positive bignum | 11 |
atom | Obj of type atom() | 12 |
reference | Obj of type reference() | 13 |
port | Obj of type port() | 14 |
pid | Obj of type pid() | 15 |
tuple | Obj of type tuple() | 16 |
map | Obj of type map() | 17, 1 |
list | Obj of type list() | 17 |
binary | Obj of type binary() | 18 |
bin_tail | Improper-tail marker followed by binary or bitstring | 19 |
78 |
79 |
80 |
81 |
82 | ## 2.2 Tuples ##
83 |
84 | Tuples are encoded as the tuple tag, followed by a 32-bit size element,
85 | denoting the number of elements in the tuple, followed by each element
86 | in the tuple individually encoded.
87 |
88 |
89 | ## 2.3 Lists ##
90 |
91 | Lists are encoded as the list tag, followed by each element in the list
92 | individually encoded, followed by the number 2 (1 byte).
93 |
94 | Improper lists, e.g. `[1,2|3]`, have the number 1 inserted before the improper
95 | tail. Since this also indicates the last element in the list, no end byte
96 | is needed. This ensures that it sorts *before* any corresponding proper list,
97 | as long as the improper tail is not a binary (binaries are greater than the
98 | missing 'cons', or list, cell).
99 |
100 | Improper lists that have a binary or bitstring as 'tail', e.g. `[1,2|<<1>>]`,
101 | have a ?bin_tail (code 19) inserted before the tail. This ensures that it
102 | sorts after a corresponding proper list.
103 |
104 |
105 | ## 2.4 Binaries and bitstrings ##
106 |
107 | A binary is basically a bitstring whose size is a multiple of 8. From a sorting
108 | perspective, binaries and bitstrings are both sorted as left-aligned bit
109 | arrays.
110 |
111 | ```erlang
112 | 1> bitstring_to_list(<<11111111111:11>>).
113 | [56,<<7:3>>]
114 | ```
115 |
116 | Binaries and bitstrings are encoded as the binary tag, followed by each whole
117 | byte, each padded with a leading 1 (one bit), followed by a number of 0-bits
118 | to again make the size a multiple of 8 bits, followed by a byte whose
119 | value is Bits, where Bits is the number of "remainder bits"; 8 if the original
120 | binary is 8-bit aligned.
121 |
122 | Example:
123 |
124 | ```erlang
125 | 2> sext:encode(<<1,2,3>>).
126 | <<18,128,192,160,96,8>>
127 | 3> <<18, 1:1,1, 1:1,2, 1:1,3, 0:5, 8>>.
128 | <<18,128,192,160,96,8>>
129 | ```
130 |
131 | In the example above, we inserted 3 1-bits, and therefore had to insert 5 more
132 | pad bits (zeroes) at the end. The last byte is 8, signifying that the original
133 | binary was 8-bit aligned.
134 |
135 | If the remainder is not an even 8 bits, the remainder bits are padded with
136 | a 1-bit, just like the others, then left-aligned and padded up to a whole
137 | byte (excluding the 1-bit added in front).
138 | The value of the last byte is the bit size of the remainder.
139 |
140 | Example:
141 |
142 | ```erlang
143 | 4> sext:encode(<<1,2,3,4:3>>).
144 | <<18,128,192,160,120,0,3>>
145 | 5> <<18, 1:1,1, 1:1,2, 1:1,3, 1:1,4:3,0:5, 0:4, 3>>.
146 | <<18,128,192,160,120,0,3>>
147 | ```
148 |
149 | The first part of the bitstring is encoded exactly like above. The number 4:3
150 | is first padded with 1 then padded at the end to become a whole byte. Then
151 | an additional pad, 0:4, is inserted to compensate for the fact that we have
152 | inserted 4 1-bits. Finally, the last byte is 3, to signify the size of the
153 | remainder.
154 |
155 |
156 | ## 2.5 Positive Numbers ##
157 |
158 | Numbers are encoded as the corresponding type tag, followed by the integer
159 | part, a marker indicating the presence of a fraction part, and the fraction
160 | part, if any. The integer part is encoded differently depending on the size
161 | of the value. The fraction part is encoded as a binary (without the 'binary'
162 | type tag).
163 |
164 |
165 | ### 2.5.1 Positive small integers, pos4 ###
166 |
167 | Integers up to 31 bits are encoded as << ?pos4, I:31, F:1 >>
168 | where I is the integer value, and F is 1 if a fraction part follows;
169 | 0 otherwise.
170 |
171 |
172 | ### 2.5.2 Positive large integers ###
173 |
174 | Larger integers are converted to a byte string and then encoded like
175 | binaries (without the 'binary' type tag), followed by a byte signifying
176 | whether a fraction part follows (1 if yes; 0 otherwise).
177 |
178 | ```erlang
179 | Bytes = encode_big(I),
180 | << ?pos_big, Bytes/binary, F:8 >>
181 | ```
182 |
183 |
184 | ### 2.5.3 Fraction part of positive numbers ###
185 |
186 | The representation of floating point numbers is based on the [IEEE 754 Binary 64 standard representation](http://en.wikipedia.org/wiki/Double_precision_floating-point_format). This is also the representation used by Erlang:
187 |
188 | ```erlang
189 | <<Sign:1, Exponent:11, Fraction:52>> = <<Value/float>>
190 | ```
191 |
192 | The encoding extracts the integer part and encodes it as a positive integer
193 | (either pos4 or pos_big), flags the presence of a fraction part, and encodes
194 | the fraction part as a binary (without the binary tag).
195 |
196 |
197 | ## 2.6 Negative Numbers ##
198 |
199 |
200 | ### 2.6.1 Small negative numbers ###
201 |
202 | ```erlang
203 | << ?neg4:8, IRep:31, F:1 >>
204 | ```
205 |
206 | A negative number I is encoded as IRep = Max + I, where Max is the largest
207 | possible number that can be represented with the number of bits present for
208 | the given subtype. For example, Max for neg4 is 0x7FFF FFFF (31 bits).
209 | Keep in mind that I < 0.
210 |
211 | The fraction flag is inverted, compared to the pos4 representation, so it will
212 | be 1 if there is no fraction part; 0 otherwise.
213 |
214 |
215 | ### 2.6.2 Large negative numbers ###
216 |
217 | Larger negative numbers are encoded as:
218 |
219 | ```erlang
220 | encode_negbig(I) ->
221 | {Words, Max} = get_max(-I),
222 | Bin = encode_bin_elems(list_to_binary(encode_big(Max + I))),
223 | WordsRep = 16#FFFFffff - Words,
224 | << ?neg_big:8, WordsRep:32, Bin/binary, F:8 >>.
225 | ```
226 |
227 | That is, get_max() figures out how many 64-bit words are needed to represent
228 | -I (the positive number), and also gives the maximum value that can be
229 | represented in so many words. WordsRep in essence becomes a sub-subtag of
230 | the negative bignum.
231 |
232 |
233 | ### 2.6.3 Fraction of negative numbers ###
234 |
235 | The fraction is encoded almost like the inverse of the positive fraction
236 | (as a "negative binary", if such a thing existed). Each byte is padded with
237 | a 0-bit rather than a 1-bit, and the byte itself is replaced by 16#ff - Byte.
238 | The sequence is then padded with 1s to become a multiple of 8 bits.
239 |
240 | The last byte, denoting the number of significant bits in the last byte,
241 | is similarly inverted.
242 |
243 |
244 | ## 2.7 Atoms ##
245 |
246 | Atoms are encoded as the atom tag, followed by the string representation of
247 | the atom using the binary encoding described above (but without the binary
248 | tag).
249 |
250 |
251 | ## 2.8 References ##
252 |
253 | The encoding of references is perhaps best described by the code:
254 |
255 | ```erlang
256 | encode_ref(R) ->
257 | RBin = term_to_binary(R),
258 | <<131,114,_Len:16,100,NLen:16,Name:NLen/binary,Rest/binary>> = RBin,
259 | NameEnc = encode_bin_elems(Name),
260 | RestEnc = encode_bin_elems(Rest),
261 | <<?reference, NameEnc/binary, RestEnc/binary>>.
262 | ```
263 |
264 | where encode_bin_elems(B) encodes the argument B the same way as a binary
265 | (excluding the 'binary' type tag).
266 |
267 |
268 | ## 2.9 Ports ##
269 |
270 | The encoding of ports is perhaps best described by the code:
271 |
272 | ```erlang
273 | encode_port(P) ->
274 | PBin = term_to_binary(P),
275 | <<131,102,100,ALen:16,Name:ALen/binary,Rest:5/binary>> = PBin,
276 | NameEnc = encode_bin_elems(Name),
277 | <<?port, NameEnc/binary, Rest/binary>>.
278 | ```
279 |
280 |
281 | ## 2.10 Pids ##
282 |
283 | The encoding of pids is perhaps best described by the code:
284 |
285 | ```erlang
286 | encode_pid(P) ->
287 | PBin = term_to_binary(P),
288 | <<131,103,100,ALen:16,Name:ALen/binary,Rest:9/binary>> = PBin,
289 | NameEnc = encode_bin_elems(Name),
290 | <<?pid, NameEnc/binary, Rest/binary>>.
291 | ```
292 |
293 |
294 | ## 2.11 Maps ##
295 |
296 | The encoding of maps is currently experimental.
297 | Maps sort between tuples and lists. Since the smallest list is represented
298 | by `<<17, 2>>`, maps encoding starts with `<<17, 1>>` (introducing a new tag
299 | would break backwards compatibility), followed by the size of the map (4 bytes),
300 | and each Key-Value pair in the map.
301 |
302 |
303 | ## Modules ##
304 |
305 |
306 |
308 |
309 |
--------------------------------------------------------------------------------
/doc/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # The sext application #
4 |
5 | __Authors:__ Ulf Wiger ([`ulf@wiger.net`](mailto:ulf@wiger.net)).
6 |
7 | A sortable serialization library
8 | This library offers a serialization format (a la term_to_binary()) that
9 | preserves the Erlang term order.
10 |
11 | ```
12 |
13 | Copyright 2010 Erlang Solutions Ltd.
14 |
15 | Licensed under the Apache License, Version 2.0 (the "License");
16 | you may not use this file except in compliance with the License.
17 | You may obtain a copy of the License at
18 |
19 | http://www.apache.org/licenses/LICENSE-2.0
20 |
21 | Unless required by applicable law or agreed to in writing, software
22 | distributed under the License is distributed on an "AS IS" BASIS,
23 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
24 | See the License for the specific language governing permissions and
25 | limitations under the License.
26 |
27 | ```
28 |
29 |
30 | # 1. Introduction #
31 |
32 | The idea for this library came out of the need for disk-based storage
33 | with ordered_set semantics in Erlang. One previous solution used Tokyo Cabinet,
34 | in which a C routine is used to hook into the sorting logic of TC.
35 |
36 | I thought a more generic solution would be to be able to have a version
37 | of term_to_binary() that respected the ordering semantics of Erlang terms.
38 |
39 | A new addition is support for 'sb32' encoding. This is my own version of
40 | Base32 encoding, with a slightly different alphabet, in order to preserve
41 | sorting properties while generating octet strings that are perfectly safe
42 | to use in file names.
43 |
44 | Another feature is "prefix encoding", which encodes a term and truncates
45 | the result if it encounters a "wildcard" (e.g. `'$1'`
46 | or `'_'`). This is to enable a convenient and efficient mapping
47 | of Erlang match specifications to e.g. prefix matching on the external storage
48 | and subsequent match_spec matching on the found erlang terms.
49 |
50 | The serialization format supports all Erlang types, and preserves the
51 | internal Erlang term order, with a few exceptions:
52 |
53 | * Floats are represented based on the IEEE 754 Binary 64 standard
54 | representation. This is the representation used by Erlang, specifically
55 | the representation used when encoding floats in binaries. To be exact,
56 | `sext` first normalizes the float by encoding it as an Erlang binary, then
57 | serializes it.
58 |
59 | * In Erlang, integers are cast to floats before comparing them to a float.
60 | This means e.g. that the relative sort order of `1` and `1.0` is undefined.
61 | It is not possible for `sext` to preserve this ambiguity after serialization,
62 | since it could only be done by producing identical encodings for the two
63 | terms, thereby sacrificing the property that encoding a value and then
64 | decoding it again, should produce the initial value.
65 |
66 |
67 | # 2. Specification #
68 |
69 |
70 | ## 2.1 Type tags ##
71 |
72 | Each data type is encoded using a type tag (1 byte) that represents its order
73 | in the global Erlang term ordering. The number type is divided into several
74 | subtypes, to facilitate a reasonably efficient representation:
75 |
76 |
77 | Type | Description | Tag |
---|---|---|
negbig | Negative bignum | 8 |
neg4 | Negative 31-bit integer | 9 |
pos4 | Positive 31-bit integer | 10 |
posbig | Positive bignum | 11 |
atom | Obj of type atom() | 12 |
reference | Obj of type reference() | 13 |
port | Obj of type port() | 14 |
pid | Obj of type pid() | 15 |
tuple | Obj of type tuple() | 16 |
map | Obj of type map() | 17, 1 |
list | Obj of type list() | 17 |
binary | Obj of type binary() | 18 |
bin_tail | Improper-tail marker followed by binary or bitstring | 19 |
78 |
79 |
80 |
81 |
82 | ## 2.2 Tuples ##
83 |
84 | Tuples are encoded as the tuple tag, followed by a 32-bit size element,
85 | denoting the number of elements in the tuple, followed by each element
86 | in the tuple individually encoded.
87 |
88 |
89 | ## 2.3 Lists ##
90 |
91 | Lists are encoded as the list tag, followed by each element in the list
92 | individually encoded, followed by the number 2 (1 byte).
93 |
94 | Improper lists, e.g. `[1,2|3]`, have the number 1 inserted before the improper
95 | tail. Since this also indicates the last element in the list, no end byte
96 | is needed. This ensures that it sorts *before* any corresponding proper list,
97 | as long as the improper tail is not a binary (binaries are greater than the
98 | missing 'cons', or list, cell).
99 |
100 | Improper lists that have a binary or bitstring as 'tail', e.g. `[1,2|<<1>>]`,
101 | have a ?bin_tail (code 19) inserted before the tail. This ensures that it
102 | sorts after a corresponding proper list.
103 |
104 |
105 | ## 2.4 Binaries and bitstrings ##
106 |
107 | A binary is basically a bitstring whose size is a multiple of 8. From a sorting
108 | perspective, binaries and bitstrings are both sorted as left-aligned bit
109 | arrays.
110 |
111 | ```erlang
112 | 1> bitstring_to_list(<<11111111111:11>>).
113 | [56,<<7:3>>]
114 | ```
115 |
116 | Binaries and bitstrings are encoded as the binary tag, followed by each whole
117 | byte, each padded with a leading 1 (one bit), followed by a number of 0-bits
118 | to again make the size a multiple of 8 bits, followed by a byte whose
119 | value is Bits, where Bits is the number of "remainder bits"; 8 if the original
120 | binary is 8-bit aligned.
121 |
122 | Example:
123 |
124 | ```erlang
125 | 2> sext:encode(<<1,2,3>>).
126 | <<18,128,192,160,96,8>>
127 | 3> <<18, 1:1,1, 1:1,2, 1:1,3, 0:5, 8>>.
128 | <<18,128,192,160,96,8>>
129 | ```
130 |
131 | In the example above, we inserted 3 1-bits, and therefore had to insert 5 more
132 | pad bits (zeroes) at the end. The last byte is 8, signifying that the original
133 | binary was 8-bit aligned.
134 |
135 | If the remainder is not an even 8 bits, the remainder bits are padded with
136 | a 1-bit, just like the others, then left-aligned and padded up to a whole
137 | byte (excluding the 1-bit added in front).
138 | The value of the last byte is the bit size of the remainder.
139 |
140 | Example:
141 |
142 | ```erlang
143 | 4> sext:encode(<<1,2,3,4:3>>).
144 | <<18,128,192,160,120,0,3>>
145 | 5> <<18, 1:1,1, 1:1,2, 1:1,3, 1:1,4:3,0:5, 0:4, 3>>.
146 | <<18,128,192,160,120,0,3>>
147 | ```
148 |
149 | The first part of the bitstring is encoded exactly like above. The number 4:3
150 | is first padded with 1 then padded at the end to become a whole byte. Then
151 | an additional pad, 0:4, is inserted to compensate for the fact that we have
152 | inserted 4 1-bits. Finally, the last byte is 3, to signify the size of the
153 | remainder.
154 |
155 |
156 | ## 2.5 Positive Numbers ##
157 |
158 | Numbers are encoded as the corresponding type tag, followed by the integer
159 | part, a marker indicating the presence of a fraction part, and the fraction
160 | part, if any. The integer part is encoded differently depending on the size
161 | of the value. The fraction part is encoded as a binary (without the 'binary'
162 | type tag).
163 |
164 |
165 | ### 2.5.1 Positive small integers, pos4 ###
166 |
167 | Integers up to 31 bits are encoded as << ?pos4, I:31, F:1 >>
168 | where I is the integer value, and F is 1 if a fraction part follows;
169 | 0 otherwise.
170 |
171 |
172 | ### 2.5.2 Positive large integers ###
173 |
174 | Larger integers are converted to a byte string and then encoded like
175 | binaries (without the 'binary' type tag), followed by a byte signifying
176 | whether a fraction part follows (1 if yes; 0 otherwise).
177 |
178 | ```erlang
179 | Bytes = encode_big(I),
180 | << ?pos_big, Bytes/binary, F:8 >>
181 | ```
182 |
183 |
184 | ### 2.5.3 Fraction part of positive numbers ###
185 |
186 | The representation of floating point numbers is based on the [IEEE 754 Binary 64 standard representation](http://en.wikipedia.org/wiki/Double_precision_floating-point_format). This is also the representation used by Erlang:
187 |
188 | ```erlang
189 | <<Sign:1, Exponent:11, Frac:52>> = <<Number/float>>
190 | ```
191 |
192 | The encoding extracts the integer part and encodes it as a positive integer
193 | (either pos4 or pos_big), flags the presence of a fraction part, and encodes
194 | the fraction part as a binary (without the binary tag).
195 |
196 |
197 | ## 2.6 Negative Numbers ##
198 |
199 |
200 | ### 2.6.1 Small negative numbers ###
201 |
202 | ```erlang
203 | << ?neg4:8, IRep:31, F:1 >>
204 | ```
205 |
206 | A negative number I is encoded as IRep = Max + I, where Max is the largest
207 | possible number that can be represented with the number of bits present for
208 | the given subtype. For example, Max for neg4 is 0x7FFF FFFF (31 bits).
209 | Keep in mind that I < 0.
210 |
211 | The fraction flag is inverted, compared to the pos4 representation, so it will
212 | be 1 if there is no fraction part; 0 otherwise.
213 |
214 |
215 | ### 2.6.2 Large negative numbers ###
216 |
217 | Larger negative numbers are encoded as:
218 |
219 | ```erlang
220 | encode_negbig(I) ->
221 | {Words, Max} = get_max(-I),
222 | Bin = encode_bin_elems(list_to_binary(encode_big(Max + I))),
223 | WordsRep = 16#FFFFffff - Words,
224 | << ?neg_big:8, WordsRep:32, Bin/binary, F:8 >>.
225 | ```
226 |
227 | That is, get_max() figures out how many 64-bit words are needed to represent
228 | -I (the positive number), and also gives the maximum value that can be
229 | represented in so many words. WordsRep in essence becomes a sub-subtag of
230 | the negative bignum.
231 |
232 |
233 | ### 2.6.3 Fraction of negative numbers ###
234 |
235 | The fraction is encoded almost like the inverse of the positive fraction
236 | (as a "negative binary", if such a thing existed). Each byte is padded with
237 | a 0-bit rather than a 1-bit, and the byte itself is replaced by 16#ff - Byte.
238 | The sequence is then padded with 1s to become a multiple of 8 bits.
239 |
240 | The last byte, denoting the number of significant bits in the last byte,
241 | is similarly inverted.
242 |
243 |
244 | ## 2.7 Atoms ##
245 |
246 | Atoms are encoded as the atom tag, followed by the string representation of
247 | the atom using the binary encoding described above (but without the binary
248 | tag).
249 |
250 |
251 | ## 2.8 References ##
252 |
253 | The encoding of references is perhaps best described by the code:
254 |
255 | ```erlang
256 | encode_ref(R) ->
257 | RBin = term_to_binary(R),
258 | <<131,114,_Len:16,100,NLen:16,Name:NLen/binary,Rest/binary>> = RBin,
259 | NameEnc = encode_bin_elems(Name),
260 | RestEnc = encode_bin_elems(Rest),
261 | <<?reference, NameEnc/binary, RestEnc/binary>>.
262 | ```
263 |
264 | where encode_bin_elems(B) encodes the argument B the same way as a binary
265 | (excluding the 'binary' type tag).
266 |
267 |
268 | ## 2.9 Ports ##
269 |
270 | The encoding of ports is perhaps best described by the code:
271 |
272 | ```erlang
273 | encode_port(P) ->
274 | PBin = term_to_binary(P),
275 | <<131,102,100,ALen:16,Name:ALen/binary,Rest:5/binary>> = PBin,
276 | NameEnc = encode_bin_elems(Name),
277 | <<?port, NameEnc/binary, Rest/binary>>.
278 | ```
279 |
280 |
281 | ## 2.10 Pids ##
282 |
283 | The encoding of pids is perhaps best described by the code:
284 |
285 | ```erlang
286 | encode_pid(P) ->
287 | PBin = term_to_binary(P),
288 | <<131,103,100,ALen:16,Name:ALen/binary,Rest:9/binary>> = PBin,
289 | NameEnc = encode_bin_elems(Name),
290 | <<?pid, NameEnc/binary, Rest/binary>>.
291 | ```
292 |
293 |
294 | ## 2.11 Maps ##
295 |
296 | The encoding of maps is currently experimental.
297 | Maps sort between tuples and lists. Since the smallest list is represented
298 | by `<<17, 2>>`, maps encoding starts with `<<17, 1>>` (introducing a new tag
299 | would break backwards compatibility), followed by the size of the map (4 bytes),
300 | and each Key-Value pair in the map.
301 |
302 |
303 | ## Modules ##
304 |
305 |
306 |
308 |
309 |
--------------------------------------------------------------------------------
/doc/edoc-info:
--------------------------------------------------------------------------------
1 | %% encoding: UTF-8
2 | {application,sext}.
3 | {packages,[]}.
4 | {modules,[sext]}.
5 |
--------------------------------------------------------------------------------
/doc/erlang.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uwiger/sext/c22486add9cc374dc8138b1f547c0999a1922a65/doc/erlang.png
--------------------------------------------------------------------------------
/doc/overview.edoc:
--------------------------------------------------------------------------------
1 | @author Ulf Wiger
2 | @doc A sortable serialization library
3 | This library offers a serialization format (a la term_to_binary()) that
4 | preserves the Erlang term order.
5 |
6 |
7 | Copyright 2010 Erlang Solutions Ltd.
8 |
9 | Licensed under the Apache License, Version 2.0 (the "License");
10 | you may not use this file except in compliance with the License.
11 | You may obtain a copy of the License at
12 |
13 | http://www.apache.org/licenses/LICENSE-2.0
14 |
15 | Unless required by applicable law or agreed to in writing, software
16 | distributed under the License is distributed on an "AS IS" BASIS,
17 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | See the License for the specific language governing permissions and
19 | limitations under the License.
20 |
21 |
22 | 1. Introduction
23 |
24 | The idea for this library came out of the need for disk-based storage
25 | with ordered_set semantics in Erlang. One previous solution used Tokyo Cabinet,
26 | in which a C routine is used to hook into the sorting logic of TC.
27 |
28 | I thought a more generic solution would be to be able to have a version
29 | of term_to_binary() that respected the ordering semantics of Erlang terms.
30 |
31 | A new addition is support for 'sb32' encoding. This is my own version of
32 | Base32 encoding, with a slightly different alphabet, in order to preserve
33 | sorting properties while generating octet strings that are perfectly safe
34 | to use in file names.
35 |
36 | Another feature is "prefix encoding", which encodes a term and truncates
37 | the result if it encounters a "wildcard" (e.g. '$1'
38 | or '_'). This is to enable a convenient and efficient mapping
39 | of Erlang match specifications to e.g. prefix matching on the external storage
40 | and subsequent match_spec matching on the found erlang terms.
41 |
42 | The serialization format supports all Erlang types, and preserves the
43 | internal Erlang term order, with a few exceptions:
44 |
45 | * Floats are represented based on the IEEE 754 Binary 64 standard
46 | representation. This is the representation used by Erlang, specifically
47 | the representation used when encoding floats in binaries. To be exact,
48 | `sext' first normalizes the float by encoding it as an Erlang binary, then
49 | serializes it.
50 |
51 | * In Erlang, integers are cast to floats before comparing them to a float.
52 | This means e.g. that the relative sort order of `1' and `1.0' is undefined.
53 | It is not possible for `sext' to preserve this ambiguity after serialization,
54 | since it could only be done by producing identical encodings for the two
55 | terms, thereby sacrificing the property that encoding a value and then
56 | decoding it again, should produce the initial value.
57 |
58 | 2. Specification
59 |
60 | 2.1 Type tags
61 |
62 | Each data type is encoded using a type tag (1 byte) that represents its order
63 | in the global Erlang term ordering. The number type is divided into several
64 | subtypes, to facilitate a reasonably efficient representation:
65 |
66 |
67 |
68 | Type |
69 | Description |
70 | Tag |
71 |
72 |
73 | negbig |
74 | Negative bignum |
75 | 8 |
76 |
77 |
78 | neg4 |
79 | Negative 31-bit integer |
80 | 9 |
81 |
82 |
83 | pos4 |
84 | Positive 31-bit integer |
85 | 10 |
86 |
87 |
88 | posbig |
89 | Positive bignum |
90 | 11 |
91 |
92 |
93 | atom |
94 | Obj of type atom() |
95 | 12 |
96 |
97 |
98 | reference |
99 | Obj of type reference() |
100 | 13 |
101 |
102 |
103 | port |
104 | Obj of type port() |
105 | 14 |
106 |
107 |
108 | pid |
109 | Obj of type pid() |
110 | 15 |
111 |
112 |
113 | tuple |
114 | Obj of type tuple() |
115 | 16 |
116 |
117 |
118 | map |
119 | Obj of type map() |
120 | 17, 1 |
121 |
122 |
123 | list |
124 | Obj of type list() |
125 | 17 |
126 |
127 |
128 | binary |
129 | Obj of type binary() |
130 | 18 |
131 |
132 |
133 | bin_tail |
134 | Improper-tail marker followed by binary or bitstring |
135 | 19 |
136 |
137 |
138 |
139 | 2.2 Tuples
140 |
141 | Tuples are encoded as the tuple tag, followed by a 32-bit size element,
142 | denoting the number of elements in the tuple, followed by each element
143 | in the tuple individually encoded.
144 |
145 | 2.3 Lists
146 |
147 | Lists are encoded as the list tag, followed by each element in the list
148 | individually encoded, followed by the number 2 (1 byte).
149 |
150 | Improper lists, e.g. `[1,2|3]', have the number 1 inserted before the improper
151 | tail. Since this also indicates the last element in the list, no end byte
152 | is needed. This ensures that it sorts *before* any corresponding proper list,
153 | as long as the improper tail is not a binary (binaries are greater than the
154 | missing 'cons', or list, cell).
155 |
156 | Improper lists that have a binary or bitstring as 'tail', e.g. `[1,2|<<1>>]',
157 | have a ?bin_tail (code 19) inserted before the tail. This ensures that it
158 | sorts after a corresponding proper list.
159 |
160 | 2.4 Binaries and bitstrings
161 |
162 | A binary is basically a bitstring whose size is a multiple of 8. From a sorting
163 | perspective, binaries and bitstrings are both sorted as left-aligned bit
164 | arrays.
165 |
166 | <pre lang="erlang"><![CDATA[1> bitstring_to_list(<<11111111111:11>>).
167 | [56,<<7:3>>]]]></pre>
168 |
169 | Binaries and bitstrings are encoded as the binary tag, followed by each whole
170 | byte, each padded with a leading 1 (one bit), followed by a number of 0-bits
171 | to again make the size a multiple of 8 bits, followed by a byte whose
172 | value is Bits, where Bits is the number of "remainder bits"; 8 if the original
173 | binary is 8-bit aligned.
174 |
175 | Example:
176 |
177 | <pre lang="erlang"><![CDATA[2> sext:encode(<<1,2,3>>).
178 | <<18,128,192,160,96,8>>
179 | 3> <<18, 1:1,1, 1:1,2, 1:1,3, 0:5, 8>>.
180 | <<18,128,192,160,96,8>>]]></pre>
181 |
182 | In the example above, we inserted 3 1-bits, and therefore had to insert 5 more
183 | pad bits (zeroes) at the end. The last byte is 8, signifying that the original
184 | binary was 8-bit aligned.
185 |
186 | If the remainder is not an even 8 bits, the remainder bits are padded with
187 | a 1-bit, just like the others, then left-aligned and padded up to a whole
188 | byte (excluding the 1-bit added in front).
189 | The value of the last byte is the bit size of the remainder.
190 |
191 | Example:
192 |
193 | <pre lang="erlang"><![CDATA[4> sext:encode(<<1,2,3,4:3>>).
194 | <<18,128,192,160,120,0,3>>
195 | 5> <<18, 1:1,1, 1:1,2, 1:1,3, 1:1,4:3,0:5, 0:4, 3>>.
196 | <<18,128,192,160,120,0,3>>]]></pre>
197 |
198 | The first part of the bitstring is encoded exactly like above. The number 4:3
199 | is first padded with 1 then padded at the end to become a whole byte. Then
200 | an additional pad, 0:4, is inserted to compensate for the fact that we have
201 | inserted 4 1-bits. Finally, the last byte is 3, to signify the size of the
202 | remainder.
203 |
204 | 2.5 Positive Numbers
205 |
206 | Numbers are encoded as the corresponding type tag, followed by the integer
207 | part, a marker indicating the presence of a fraction part, and the fraction
208 | part, if any. The integer part is encoded differently depending on the size
209 | of the value. The fraction part is encoded as a binary (without the 'binary'
210 | type tag).
211 |
212 | 2.5.1 Positive small integers, pos4
213 |
214 | Integers up to 31 bits are encoded as << ?pos4, I:31, F:1 >>
215 | where I is the integer value, and F is 1 if a fraction part follows;
216 | 0 otherwise.
217 |
218 | 2.5.2 Positive large integers
219 |
220 | Larger integers are converted to a byte string and then encoded like
221 | binaries (without the 'binary' type tag), followed by a byte signifying
222 | whether a fraction part follows (1 if yes; 0 otherwise).
223 |
224 | <pre lang="erlang"><![CDATA[Bytes = encode_big(I),
225 | << ?pos_big, Bytes/binary, F:8 >>]]></pre>
226 |
227 | 2.5.3 Fraction part of positive numbers
228 |
229 | The representation of floating point numbers is based on the IEEE 754 Binary 64 standard representation. This is also the representation used by Erlang:
230 |
231 | <pre lang="erlang"><![CDATA[<<Sign:1, Exponent:11, Frac:52>> = <<Number/float>>]]></pre>
232 |
233 | The encoding extracts the integer part and encodes it as a positive integer
234 | (either pos4 or pos_big), flags the presence of a fraction part, and encodes
235 | the fraction part as a binary (without the binary tag).
236 |
237 | 2.6 Negative Numbers
238 |
239 | 2.6.1 Small negative numbers
240 |
241 | <pre lang="erlang"><![CDATA[<< ?neg4:8, IRep:31, F:1 >>]]></pre>
242 |
243 | A negative number I is encoded as IRep = Max + I, where Max is the largest
244 | possible number that can be represented with the number of bits present for
245 | the given subtype. For example, Max for neg4 is 0x7FFF FFFF (31 bits).
246 | Keep in mind that I < 0.
247 |
248 | The fraction flag is inverted, compared to the pos4 representation, so it will
249 | be 1 if there is no fraction part; 0 otherwise.
250 |
251 | 2.6.2 Large negative numbers
252 |
253 | Larger negative numbers are encoded as:
254 |
255 | <pre lang="erlang"><![CDATA[encode_negbig(I) ->
256 | {Words, Max} = get_max(-I),
257 | Bin = encode_bin_elems(list_to_binary(encode_big(Max + I))),
258 | WordsRep = 16#FFFFffff - Words,
259 | << ?neg_big:8, WordsRep:32, Bin/binary, F:8 >>.]]></pre>
260 |
261 | That is, get_max() figures out how many 64-bit words are needed to represent
262 | -I (the positive number), and also gives the maximum value that can be
263 | represented in so many words. WordsRep in essence becomes a sub-subtag of
264 | the negative bignum.
265 |
266 | 2.6.3 Fraction of negative numbers
267 |
268 | The fraction is encoded almost like the inverse of the positive fraction
269 | (as a "negative binary", if such a thing existed). Each byte is padded with
270 | a 0-bit rather than a 1-bit, and the byte itself is replaced by 16#ff - Byte.
271 | The sequence is then padded with 1s to become a multiple of 8 bits.
272 |
273 | The last byte, denoting the number of significant bits in the last byte,
274 | is similarly inverted.
275 |
276 | 2.7 Atoms
277 |
278 | Atoms are encoded as the atom tag, followed by the string representation of
279 | the atom using the binary encoding described above (but without the binary
280 | tag).
281 |
282 | 2.8 References
283 |
284 | The encoding of references is perhaps best described by the code:
285 |
286 | <pre lang="erlang"><![CDATA[encode_ref(R) ->
287 | RBin = term_to_binary(R),
288 | <<131,114,_Len:16,100,NLen:16,Name:NLen/binary,Rest/binary>> = RBin,
289 | NameEnc = encode_bin_elems(Name),
290 | RestEnc = encode_bin_elems(Rest),
291 | <<?reference, NameEnc/binary, RestEnc/binary>>.]]></pre>
292 |
293 | where encode_bin_elems(B) encodes the argument B the same way as a binary
294 | (excluding the 'binary' type tag).
295 |
296 | 2.9 Ports
297 |
298 | The encoding of ports is perhaps best described by the code:
299 |
300 | <pre lang="erlang"><![CDATA[encode_port(P) ->
301 | PBin = term_to_binary(P),
302 | <<131,102,100,ALen:16,Name:ALen/binary,Rest:5/binary>> = PBin,
303 | NameEnc = encode_bin_elems(Name),
304 | <<?port, NameEnc/binary, Rest/binary>>.]]></pre>
305 |
306 | 2.10 Pids
307 |
308 | The encoding of pids is perhaps best described by the code:
309 |
310 | <pre lang="erlang"><![CDATA[encode_pid(P) ->
311 | PBin = term_to_binary(P),
312 | <<131,103,100,ALen:16,Name:ALen/binary,Rest:9/binary>> = PBin,
313 | NameEnc = encode_bin_elems(Name),
314 | <<?pid, NameEnc/binary, Rest/binary>>.]]></pre>
315 |
316 | 2.11 Maps
317 |
318 | The encoding of maps is currently experimental.
319 |
320 | Maps sort between tuples and lists. Since the smallest list is represented
321 | by `<<17, 2>>', maps encoding starts with `<<17, 1>>' (introducing a new tag
322 | would break backwards compatibility), followed by the size of the map (4 bytes),
323 | and each Key-Value pair in the map.
324 |
325 | @end
326 |
--------------------------------------------------------------------------------
/doc/sext.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Module sext #
4 | * [Description](#description)
5 | * [Function Index](#index)
6 | * [Function Details](#functions)
7 |
8 |
9 | Sortable serialization library.
10 | __Authors:__ Ulf Wiger ([`ulf@wiger.net`](mailto:ulf@wiger.net)).
11 |
12 |
13 | ## Function Index ##
14 |
15 |
16 | decode/1 | Decodes a binary generated using the function sext:encode/1 . |
decode_hex/1 | |
decode_next/1 | Decode a binary stream, returning the next decoded term and the
17 | stream remainder. |
decode_sb32/1 | Decodes a binary generated using the function encode_sb32/1 . |
encode/1 | Encodes any Erlang term into a binary. |
encode/2 | Encodes an Erlang term using legacy bignum encoding. |
encode_hex/1 | Encodes any Erlang term into a hex-encoded binary. |
encode_sb32/1 | Encodes any Erlang term into an sb32-encoded binary. |
from_hex/1 | Converts from a hex-encoded binary into a 'normal' binary. |
from_sb32/1 | Converts from an sb32-encoded bitstring into a 'normal' bitstring. |
partial_decode/1 | Decode a sext-encoded term or prefix embedded in a byte stream. |
prefix/1 | Encodes a binary for prefix matching of similar encoded terms. |
prefix_hex/1 | Generates a hex-encoded binary for prefix matching. |
prefix_sb32/1 | Generates an sb32-encoded binary for prefix matching. |
to_hex/1 | Converts a binary into a hex-encoded binary
18 | This is conventional hex encoding, with the proviso that
19 | only capital letters are used, e.g. |
to_sb32/1 | Converts a bitstring into an sb-encoded bitstring. |
20 |
21 |
22 |
23 |
24 | ## Function Details ##
25 |
26 |
27 |
28 | ### decode/1 ###
29 |
30 |
31 |
32 | decode(B::binary()) -> term()
33 |
34 |
35 |
36 | Decodes a binary generated using the function [`sext:encode/1`](sext.md#encode-1).
37 |
38 |
39 | ### decode_hex/1 ###
40 |
41 | `decode_hex(Data) -> any()`
42 |
43 |
44 |
45 |
46 | ### decode_next/1 ###
47 |
48 |
49 |
50 | decode_next(X1::Bin) -> {N, Rest}
51 |
52 |
53 |
54 |
55 | Decode a binary stream, returning the next decoded term and the
56 | stream remainder
57 |
58 |
59 | This function will raise an exception if the beginning of `Bin` is not
60 | a valid sext-encoded term.
61 |
62 |
63 | ### decode_sb32/1 ###
64 |
65 | `decode_sb32(Data) -> any()`
66 |
67 | Decodes a binary generated using the function [`encode_sb32/1`](#encode_sb32-1).
68 |
69 |
70 | ### encode/1 ###
71 |
72 |
73 |
74 | encode(T::term()) -> binary()
75 |
76 |
77 |
78 | Encodes any Erlang term into a binary.
79 | The lexical sorting properties of the encoded binary match those of the
80 | original Erlang term. That is, encoded terms sort the same way as the
81 | original terms would.
82 |
83 |
84 | ### encode/2 ###
85 |
86 |
87 |
88 | encode(T::term(), Legacy::boolean()) -> binary()
89 |
90 |
91 |
92 |
93 | Encodes an Erlang term using legacy bignum encoding.
94 | On March 4 2013, Basho noticed that encoded bignums didn't always sort
95 | properly. This bug has been fixed, but the encoding of bignums necessarily
96 | changed in an incompatible way.
97 |
98 |
99 |
100 | The new decode/1 version can read the old bignum format, but the old
101 | version obviously cannot read the new. Using `encode(Term, true)`, the term
102 | will be encoded using the old format.
103 |
104 |
105 | Use only as transition support. This function will be deprecated in time.
106 |
107 |
108 | ### encode_hex/1 ###
109 |
110 |
111 |
112 | encode_hex(Term::any()) -> binary()
113 |
114 |
115 |
116 |
117 | Encodes any Erlang term into a hex-encoded binary.
118 | This is similar to [`encode/1`](#encode-1), but produces an octet string that
119 | can be used without escaping in file names (containing only the characters
120 | 0..9 and A..F). The sorting properties are preserved.
121 |
122 |
123 | Note: The encoding used is regular hex-encoding, with the proviso that only
124 | capital letters are used (mixing upper- and lowercase characters would break
125 | the sorting property).
126 |
127 |
128 | ### encode_sb32/1 ###
129 |
130 |
131 |
132 | encode_sb32(Term::any()) -> binary()
133 |
134 |
135 |
136 |
137 | Encodes any Erlang term into an sb32-encoded binary.
138 | This is similar to [`encode/1`](#encode-1), but produces an octet string that
139 | can be used without escaping in file names (containing only the characters
140 | 0..9, A..V and '-'). The sorting properties are preserved.
141 |
142 |
143 | Note: The encoding used is inspired by the base32 encoding described in
144 | RFC3548, but uses a different alphabet in order to preserve the sort order.
145 |
146 |
147 | ### from_hex/1 ###
148 |
149 |
150 |
151 | from_hex(Bin::binary()) -> binary()
152 |
153 |
154 |
155 |
156 | Converts from a hex-encoded binary into a 'normal' binary
157 |
158 |
159 | This function is the reverse of [`to_hex/1`](#to_hex-1).
160 |
161 |
162 |
163 | ### from_sb32/1 ###
164 |
165 |
166 |
167 | from_sb32(Bits::bitstring()) -> bitstring()
168 |
169 |
170 |
171 |
172 | Converts from an sb32-encoded bitstring into a 'normal' bitstring
173 |
174 |
175 | This function is the reverse of [`to_sb32/1`](#to_sb32-1).
176 |
177 |
178 | ### partial_decode/1 ###
179 |
180 |
181 |
182 | partial_decode(Other::Bytes) -> {full | partial, DecodedTerm, Rest}
183 |
184 |
185 |
186 |
187 | Decode a sext-encoded term or prefix embedded in a byte stream.
188 |
189 |
190 | Example:
191 |
192 | ```
193 | 1> T = sext:encode({a,b,c}).
194 | <<16,0,0,0,3,12,176,128,8,12,177,0,8,12,177,128,8>>
195 | 2> sext:partial_decode(<<T/binary, "tail">>).
196 | {full,{a,b,c},<<"tail">>}
197 | 3> P = sext:prefix({a,b,'_'}).
198 | <<16,0,0,0,3,12,176,128,8,12,177,0,8>>
199 | 4> sext:partial_decode(<<P/binary, "tail">>).
200 | {partial,{a,b,'_'},<<"tail">>}
201 | ```
202 |
203 |
204 |
205 | Note that a decoded prefix may not be exactly like the encoded prefix.
206 | For example, `['_']` will be encoded as
207 | `<<17>>`, i.e. only the 'list' opcode. The
208 | decoded prefix will be `'_'`, since the encoded prefix would
209 | also match the empty list. The decoded prefix will always be a prefix to
210 | anything to which the original prefix is a prefix.
211 |
212 |
213 | For tuples, `{1,'_',3}` encoded and decoded, will result in
214 | `{1,'_','_'}`, i.e. the tuple size is kept, but the elements
215 | after the first wildcard are replaced with wildcards.
216 |
217 |
218 | ### prefix/1 ###
219 |
220 |
221 |
222 | prefix(X::term()) -> binary()
223 |
224 |
225 |
226 | Encodes a term into a binary prefix for matching similarly encoded terms.
227 | Lists and tuples can be prefixed by using the `'_'` marker,
228 | similarly to Erlang match specifications. For example:
229 |
230 | * `prefix({1,2,'_','_'})` will result in a binary that is
231 | the same as the first part of any encoded 4-tuple with the first two
232 | elements being 1 and 2. The prefix algorithm will search for the
233 | first `'_'`, and treat all following elements as if they
234 | were `'_'`.
235 |
236 | * `prefix([1,2|'_'])` will result in a binary that is the
237 | same as the first part of any encoded list where the first two elements
238 | are 1 and 2. `prefix([1,2,'_'])` will give the same result,
239 | as the prefix pattern is the same for all lists starting with
240 | `[1,2|...]`.
241 |
242 | * `prefix(Binary)` will result in a binary that is the same as the
243 | encoded version of Binary, except that, instead of padding and
244 | terminating, the encoded binary is truncated to the longest byte-aligned
245 | binary. The same is done for bitstrings.
246 |
247 | * `prefix({1,[1,2|'_'],'_'})` will prefix-encode the second
248 | element, and let it end the resulting binary. This prefix will match
249 | any 3-tuple where the first element is 1 and the second element is a
250 | list where the first two elements are 1 and 2.
251 |
252 | * `prefix([1,[1|'_']|'_'])` will result in a prefix that
253 | matches all lists where the first element is 1 and the second element is
254 | a list where the first element is 1.
255 |
256 | * For all other data types, the prefix is the same as the encoded term.
257 |
258 |
259 |
260 |
261 | ### prefix_hex/1 ###
262 |
263 |
264 |
265 | prefix_hex(X::term()) -> binary()
266 |
267 |
268 |
269 | Generates a hex-encoded binary for prefix matching.
270 | This is similar to [`prefix/1`](#prefix-1), but generates a prefix for binaries
271 | encoded with [`encode_hex/1`](#encode_hex-1), rather than [`encode/1`](#encode-1).
272 |
273 |
274 | ### prefix_sb32/1 ###
275 |
276 |
277 |
278 | prefix_sb32(X::term()) -> binary()
279 |
280 |
281 |
282 | Generates an sb32-encoded binary for prefix matching.
283 | This is similar to [`prefix/1`](#prefix-1), but generates a prefix for binaries
284 | encoded with [`encode_sb32/1`](#encode_sb32-1), rather than [`encode/1`](#encode-1).
285 |
286 |
287 | ### to_hex/1 ###
288 |
289 |
290 |
291 | to_hex(Bin::binary()) -> binary()
292 |
293 |
294 |
295 | Converts a binary into a hex-encoded binary
296 | This is conventional hex encoding, with the proviso that
297 | only capital letters are used, e.g. `0..9A..F`.
298 |
299 |
300 | ### to_sb32/1 ###
301 |
302 |
303 |
304 | to_sb32(Bits::bitstring()) -> binary()
305 |
306 |
307 |
308 |
309 | Converts a bitstring into an sb32-encoded binary.
310 |
311 |
312 |
313 | sb32 (Sortable base32) is a variant of RFC3548, slightly rearranged to
314 | preserve the lexical sorting properties. Base32 was chosen to avoid
315 | filename-unfriendly characters. Also important is that the padding
316 | character be less than any character in the alphabet
317 |
318 |
319 | sb32 alphabet:
320 |
321 | ```
322 |
323 | 0 0 6 6 12 C 18 I 24 O 30 U
324 | 1 1 7 7 13 D 19 J 25 P 31 V
325 | 2 2 8 8 14 E 20 K 26 Q (pad) -
326 | 3 3 9 9 15 F 21 L 27 R
327 | 4 4 10 A 16 G 22 M 28 S
328 | 5 5 11 B 17 H 23 N 29 T
329 | ```
330 |
331 |
--------------------------------------------------------------------------------
/doc/stylesheet.css:
--------------------------------------------------------------------------------
1 | /* standard EDoc style sheet */
2 | body {
3 | font-family: Verdana, Arial, Helvetica, sans-serif;
4 | margin-left: .25in;
5 | margin-right: .2in;
6 | margin-top: 0.2in;
7 | margin-bottom: 0.2in;
8 | color: #000000;
9 | background-color: #ffffff;
10 | }
11 | h1,h2 {
12 | margin-left: -0.2in;
13 | }
14 | div.navbar {
15 | background-color: #add8e6;
16 | padding: 0.2em;
17 | }
18 | h2.indextitle {
19 | padding: 0.4em;
20 | background-color: #add8e6;
21 | }
22 | h3.function,h3.typedecl {
23 | background-color: #add8e6;
24 | padding-left: 1em;
25 | }
26 | div.spec {
27 | margin-left: 2em;
28 | background-color: #eeeeee;
29 | }
30 | a.module,a.package {
31 | text-decoration:none
32 | }
33 | a.module:hover,a.package:hover {
34 | background-color: #eeeeee;
35 | }
36 | ul.definitions {
37 | list-style-type: none;
38 | }
39 | ul.index {
40 | list-style-type: none;
41 | background-color: #eeeeee;
42 | }
43 |
44 | /*
45 | * Minor style tweaks
46 | */
47 | ul {
48 | list-style-type: square;
49 | }
50 | table {
51 | border-collapse: collapse;
52 | }
53 | td {
54 | padding: 3px; /* unit added: a unitless non-zero length is invalid CSS and is ignored in standards mode */
55 | }
56 |
--------------------------------------------------------------------------------
/examples/tt_proto.erl:
--------------------------------------------------------------------------------
1 | %%==============================================================================
2 | %% Copyright 2010 Erlang Solutions Ltd.
3 | %%
4 | %% Licensed under the Apache License, Version 2.0 (the "License");
5 | %% you may not use this file except in compliance with the License.
6 | %% You may obtain a copy of the License at
7 | %%
8 | %% http://www.apache.org/licenses/LICENSE-2.0
9 | %%
10 | %% Unless required by applicable law or agreed to in writing, software
11 | %% distributed under the License is distributed on an "AS IS" BASIS,
12 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | %% See the License for the specific language governing permissions and
14 | %% limitations under the License.
15 | %%==============================================================================
16 | %%
17 | %% @author Ulf Wiger
18 | %% @doc Bare-bones Tokyo Tyrant interface library.
19 | %% This is an example to illustrate the use of Sortable EXternal Term (sext)
20 | %% encoding.
21 | %%
22 | %% Tokyo Tyrant (TT) is an add-on
23 | %% to Tokyo Cabinet, adding
24 | %% support for concurrent and remote access to Tokyo Cabinet (TC) through a
25 | %% TCP socket interface. TC supports storage of variable-length byte strings
26 | %% as key-value pairs. The storage type can either be RAM-only or disk, and
27 | %% either hash table or B-tree.
28 | %%
29 | %% Using sext-encoded terms in combination with TT's B-tree storage, it is
30 | %% possible to store very large amounts of data on disk while honoring the
31 | %% Erlang Term ordering semantics. Using the `sext:prefix/1' function, it is
32 | %% also possible to perform efficient range queries.
33 | %%
34 | %% Tokyo Tyrant is easy to install and get running. This module does not show
35 | %% how that is done, nor does it automate the task of starting a TT server.
36 | %%
37 | %% @end
38 | -module(tt_proto).
39 |
40 | -behaviour(gen_server).
41 | %% Public API
42 | -export([open/2,
43 | put/3,
44 | get/2,
45 | mget/2,
46 | keys/2]).
47 |
48 | %% gen_server callbacks (exported for the behaviour, not meant to be called directly)
49 | -export([init/1,
50 | handle_call/3,
51 | handle_cast/2,
52 | handle_info/2,
53 | terminate/2,
54 | code_change/3]).
55 | %% NOTE(review): export_all also exposes every internal helper; tolerable in an example, avoid in production code.
56 | -compile(export_all).
57 | %% Port used when no `port' option is given (the open/2 documentation names 1978 as the default).
58 | -define(DEFAULT_PORT, 1978).
59 | %% gen_server state; `socket' presumably holds the connection to the TT server -- TODO confirm in init/1.
60 | -record(st, {socket}).
61 |
62 | %% @spec open(Name, Opts) -> {ok, pid()} | ignore | {error, Reason}
63 | %% Opts = [Opt]
64 | %% Opt = {regname,atom()} | {port, integer()}
65 | %%
66 | %% @doc Connects to a running Tokyo Tyrant database server.
67 | %% The default port, 1978, will be used unless another port is specified.
68 | %% If the `regname' option is present, the Tokyo Tyrant proxy process will
69 | %% register itself locally under that name, and the registered name can be
70 | %% used as an alias when accessing the database.
71 | %% @end
72 | %% The proxy is a gen_server linked to the caller; `{Name, Opts}' is passed to init/1 unchanged.
73 | open(Name, Opts) ->
74 | case lists:keyfind(regname, 1, Opts) of
75 | false ->
76 | gen_server:start_link(?MODULE, {Name, Opts}, []);
77 | {_,RegName} ->
78 | gen_server:start_link({local,RegName}, ?MODULE,
79 | {Name, Opts}, [])
80 | end.
81 |
82 | %% @spec put(TT, Key::term(), Value::term()) -> ok | {error, Reason}
83 | %% @doc Inserts a `{Key,Value}' tuple in the database TT.
84 | %% @end
85 | %%
put(TT, Key, Value) ->
    %% Both key and value are stored in their sext-encoded form.
    EncKey = encode(Key),
    EncValue = encode(Value),
    cmd(TT, {put, EncKey, EncValue}).
88 |
89 | %% @spec get(TT, Key::term()) -> {ok, Value} | {error, Reason}
90 | %% @doc Looks up Key in the database TT.
91 | %% Returns `{ok,Value}' if found, otherwise `{error,Reason}'.
92 | %% @end
93 | %%
get(TT, Key) ->
    Request = {get, encode(Key)},
    case ask(TT, Request) of
        {ok, Encoded} ->
            %% The stored value is sext-encoded; hand back the original term.
            {ok, decode(Encoded)};
        NotOk ->
            %% Anything else is passed through to the caller untouched.
            NotOk
    end.
101 |
102 | %% @spec mget(TT, Keys::[term()]) -> {ok, [{K,V}]} | {error,Reason}
103 | %% @doc Fetches multiple objects from the database TT.
104 | %% All objects matching the list of keys will be returned. If no objects match,
105 | %% the return value will be `{ok, []}'.
106 | %% @end
107 | %%
mget(TT, Keys) when is_list(Keys) ->
    EncodedKeys = [encode(Key) || Key <- Keys],
    Answer = ask(TT, {mget, EncodedKeys}),
    case Answer of
        {ok, Pairs} ->
            %% Decode both halves of every returned key/value pair.
            {ok, [{decode(EncK), decode(EncV)} || {EncK, EncV} <- Pairs]};
        _ ->
            Answer
    end.
116 |
117 | %% @spec keys(TT, Prefix) -> {ok, Keys} | {error, Reason}
118 | %% @doc Performs a prefix search in database TT based on Prefix.
119 | %% For details on Prefix, @see sext:prefix/1.
120 | %% @end
121 | %%
keys(TT, Prefix) ->
    %% Historical default: ask the server for at most 100 keys.
    keys(TT, Prefix, 100).

%% @spec keys(TT, Prefix, Limit::integer()) -> {ok, Keys} | {error, Reason}
%% @doc Like {@link keys/2}, but with a caller-chosen cap on the number of
%% keys requested from the server. `Limit' is placed in a 32-bit field of
%% the request.
%% @end
%%
keys(TT, Prefix, Limit) when is_integer(Limit) ->
    case ask(TT, {keys, encode_prefix(Prefix), Limit}) of
        {ok, Keys} ->
            {ok, [decode(K) || K <- Keys]};
        Err ->
            Err
    end.
129 |
130 |
131 | %% Tell TokyoTyrant to perform an operation. No reply other than
132 | %% 0 (success), or non-zero (failure).
133 | %%
cmd(TT, Req) ->
    %% Tagged as `cmd' so the server reads back only a status byte.
    Call = {cmd, Req},
    gen_server:call(TT, Call).

%% Ask TokyoTyrant a question: the server reads back a status byte followed
%% by a method-specific payload.
ask(TT, Req) ->
    Call = {ask, Req},
    gen_server:call(TT, Call).
139 |
%% Thin wrappers that keep every use of sext in one place.

%% Term -> sortable binary.
encode(T) ->
    sext:encode(T).

%% Sortable binary -> term.
decode(Encoded) ->
    sext:decode(Encoded).

%% Prefix pattern -> binary prefix usable for range/prefix queries.
encode_prefix(Pattern) ->
    sext:prefix(Pattern).
149 |
150 |
151 | %% @hidden
%% Connects to the Tokyo Tyrant server on the loopback interface, using the
%% `port' option (default ?DEFAULT_PORT). The socket is passive and binary,
%% since replies are read with explicit gen_tcp:recv/2 calls.
init({_Name, Opts}) ->
    Port = proplists:get_value(port, Opts, ?DEFAULT_PORT),
    ConnOpts = [binary, {active, false}, {nodelay, true}],
    case gen_tcp:connect({127,0,0,1}, Port, ConnOpts) of
        {ok, Socket} ->
            {ok, #st{socket = Socket}};
        {error, Reason} ->
            %% `{stop, Reason}' is the portable way to fail init/1; a bare
            %% `{error, Reason}' is only accepted by newer OTP releases and
            %% otherwise surfaces as a bad_return_value crash.
            {stop, Reason}
    end.
162 |
163 | %% @hidden
%% Sends the encoded request and reads the reply synchronously.
%% `cmd' requests expect only a status byte; `ask' requests expect a status
%% byte followed by a payload. A failed send is reported to the caller as
%% `{error, Reason}' instead of being ignored.
handle_call({cmd, Req}, _From, #st{socket = Sock} = S) ->
    Reply =
        case gen_tcp:send(Sock, mk_req(Req)) of
            ok ->
                cmd_reply(Sock);
            {error, _} = SendErr ->
                SendErr
        end,
    {reply, Reply, S};
handle_call({ask, Req}, _From, #st{socket = Sock} = S) ->
    Reply =
        case gen_tcp:send(Sock, mk_req(Req)) of
            ok ->
                ask_reply(Req, Sock);
            {error, _} = SendErr ->
                SendErr
        end,
    {reply, Reply, S}.
174 |
175 |
176 | %% @hidden
%% Unexpected messages are printed together with the state and then dropped.
handle_info(Info, State) ->
    io:fwrite("handle_info(~p, ~p)~n", [Info, State]),
    {noreply, State}.
180 |
181 | %% @hidden
%% No casts are part of the protocol; any cast stops the server.
handle_cast(_Unexpected, State) ->
    {stop, unknown_cast, State}.
184 |
185 | %% @hidden
%% Prints the shutdown reason and final state.
terminate(Why, State) ->
    io:fwrite("terminate(~p, ~p)~n", [Why, State]).
188 |
189 | %% @hidden
%% The state needs no conversion between versions.
code_change(_OldVsn, State, _Extra) ->
    {ok, State}.
192 |
193 |
%% Builds the wire representation of a request. Every request starts with
%% the byte 16#c8 followed by a one-byte command code; sizes and counts are
%% 32-bit fields placed before the data they describe.
mk_req({put, K, V}) ->
    <<16#c8, 16#10, (byte_size(K)):32, (byte_size(V)):32, K/binary, V/binary>>;
mk_req({get, K}) ->
    <<16#c8, 16#30, (byte_size(K)):32, K/binary>>;
mk_req({mget, Ks}) ->
    Count = length(Ks),
    Records = pack_values(Ks),
    <<16#c8, 16#31, Count:32, Records/binary>>;
mk_req({keys, Prefix, Limit}) ->
    <<16#c8, 16#58, (byte_size(Prefix)):32, Limit:32, Prefix/binary>>.

%% Concatenates the binaries in Values, each preceded by its 32-bit size.
pack_values(Values) ->
    pack_values(Values, <<>>).

%% Same as pack_values/1, appending to an already packed prefix Acc.
pack_values(Values, Acc) ->
    Packed = << <<(byte_size(V)):32, V/binary>> || V <- Values >>,
    <<Acc/binary, Packed/binary>>.
219 |
220 |
%% Reads the single status byte that answers a `cmd' request.
%% Returns `ok' for status 0, `{error, Code}' for any other status byte, and
%% the gen_tcp error tuple if the read itself fails.
cmd_reply(Sock) ->
    case gen_tcp:recv(Sock, 1) of
        {ok, <<0>>} ->
            ok;
        {ok, <<E>>} ->
            {error, E};
        {error, _} = Err ->
            Err
    end.
230 |
%% Reads the answer to an `ask' request. The first byte is the status:
%% 0 means the rest of the reply is a payload parsed by get_reply/3 according
%% to the request method; any other value is returned as `{error, Code}'.
%% Socket errors raised (via throw) while the payload is being completed are
%% turned back into `{error, Reason}'.
ask_reply(Req, Sock) ->
    Method = element(1, Req),
    case gen_tcp:recv(Sock, 0) of
        {ok, <<0, Rest/binary>>} ->
            try get_reply(Method, Rest, Sock)
            catch
                throw:{error, Reason} ->
                    {error, Reason}
            end;
        {ok, <<E, _/binary>>} ->
            %% Non-zero status; ignore anything that happens to follow it.
            {error, E};
        {error, _} = Err ->
            Err
    end.
245 |
%% Parses the payload of a successful reply according to the request method.
%% `get' yields a single value; `mget' and `keys' yield a counted list whose
%% items are read with get_k_v/2 and get_value/2 respectively.
get_reply(get, Data, Sock) ->
    {Val, _Trailing} = get_value(Data, Sock),
    {ok, Val};
get_reply(Method, Data, Sock) when Method =:= mget; Method =:= keys ->
    ReadItem =
        case Method of
            mget -> fun get_k_v/2;
            keys -> fun get_value/2
        end,
    {Count, Items} = get_word(Data, Sock),
    {ok, get_N(Count, Items, ReadItem, Sock)}.
257 |
%% Extracts a 32-bit integer from the front of the buffer, returning
%% `{Word, Rest}'. If fewer than four bytes are available, more data is read
%% from the socket and appended before trying again.
get_word(<<W:32, Rest/binary>>, _Sock) ->
    {W, Rest};
get_word(Sofar, Sock) ->
    Bin = get_data(Sock),
    get_word(<<Sofar/binary, Bin/binary>>, Sock).
263 |
%% Extracts one size-prefixed binary (32-bit size, then that many bytes) from
%% the front of the buffer, returning `{Value, Rest}'. While the buffer does
%% not yet hold the whole item, more data is read from the socket.
get_value(<<Sz:32, V:Sz/binary, Rest/binary>>, _Sock) ->
    {V, Rest};
get_value(Sofar, Sock) ->
    Bin = get_data(Sock),
    get_value(<<Sofar/binary, Bin/binary>>, Sock).
269 |
%% Extracts one key/value record from the front of the buffer, returning
%% `{{Key, Value}, Rest}'. The record is read as two 32-bit sizes (key, then
%% value) followed by the key bytes and the value bytes.
%% NOTE(review): field order follows the Tokyo Tyrant mget response layout;
%% confirm against the protocol specification.
get_k_v(<<KSz:32, VSz:32, K:KSz/binary, V:VSz/binary, Rest/binary>>, _Sock) ->
    {{K, V}, Rest};
get_k_v(Sofar, Sock) ->
    Bin = get_data(Sock),
    get_k_v(<<Sofar/binary, Bin/binary>>, Sock).
275 |
%% Reads N consecutive items with the reader fun F, which takes
%% (Buffer, Sock) and returns {Item, RestOfBuffer}. Items are returned in the
%% order they were read; whatever follows the last item is discarded.
get_N(N, Data, F, Sock) ->
    case N of
        0 ->
            [];
        _ when N > 0 ->
            {Item, Rest} = F(Data, Sock),
            Tail = get_N(N - 1, Rest, F, Sock),
            [Item | Tail]
    end.
281 |
%% Reads whatever is available on the socket and returns it. A socket error
%% is thrown as `{error, Reason}'; ask_reply/2 catches that throw.
get_data(Sock) ->
    Received = gen_tcp:recv(Sock, 0),
    case Received of
        {error, _} ->
            throw(Received);
        {ok, Bin} ->
            Bin
    end.
289 |
290 |
--------------------------------------------------------------------------------
/rebar.config:
--------------------------------------------------------------------------------
1 | %% -*- erlang -*-
2 | {erl_opts, [debug_info]}.
3 |
4 | {profiles, [{docs, [{deps,
5 | [
6 | {edown,
7 | {git,
8 | "https://github.com/uwiger/edown.git",
9 | {tag,
10 | "0.8"}}}
11 | ]},
12 |
13 | {edoc_opts, [{doclet, edown_doclet},
14 | {packages,
15 | false},
16 | {subpackages,
17 | true},
18 | {top_level_readme,
19 | {"./README.md",
20 | "http://github.com/uwiger/sext"}}]}]}
21 | ]}.
22 |
--------------------------------------------------------------------------------
/src/sext.app.src:
--------------------------------------------------------------------------------
1 | %% -*- erlang-indent-level: 4; indent-tabs-mode: nil -*-
2 | %%==============================================================================
3 | %% Copyright 2014-16 Ulf Wiger
4 | %%
5 | %% Licensed under the Apache License, Version 2.0 (the "License");
6 | %% you may not use this file except in compliance with the License.
7 | %% You may obtain a copy of the License at
8 | %%
9 | %% http://www.apache.org/licenses/LICENSE-2.0
10 | %%
11 | %% Unless required by applicable law or agreed to in writing, software
12 | %% distributed under the License is distributed on an "AS IS" BASIS,
13 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | %% See the License for the specific language governing permissions and
15 | %% limitations under the License.
16 | %%==============================================================================
17 |
18 | %% @author Ulf Wiger
19 | %% @doc Sortable serialization of Erlang terms.
20 | %% @end
21 | {application, sext,
22 | [{description, "Sortable serialization library"},
23 | {vsn, git},
24 | {modules, []},
25 | {registered, []},
26 | {applications, [kernel, stdlib]},
27 | {env, []},
28 |
29 | {maintainers, ["Ulf Wiger"]},
30 | {licenses, ["Apache 2.0"]},
31 | {links, [{"Github", "https://github.com/uwiger/sext"}]}
32 | ]}.
33 |
--------------------------------------------------------------------------------
/src/sext.erl:
--------------------------------------------------------------------------------
1 | %% -*- erlang-indent-level: 4; indent-tabs-mode: nil
2 | %%==============================================================================
3 | %% Copyright 2014-16 Ulf Wiger
4 | %%
5 | %% Licensed under the Apache License, Version 2.0 (the "License");
6 | %% you may not use this file except in compliance with the License.
7 | %% You may obtain a copy of the License at
8 | %%
9 | %% http://www.apache.org/licenses/LICENSE-2.0
10 | %%
11 | %% Unless required by applicable law or agreed to in writing, software
12 | %% distributed under the License is distributed on an "AS IS" BASIS,
13 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | %% See the License for the specific language governing permissions and
15 | %% limitations under the License.
16 | %%==============================================================================
17 | %%
18 | %% @author Ulf Wiger
19 | %% @doc Sortable serialization library
20 | %% @end
21 | -module(sext).
22 |
23 | -export([encode/1, encode/2, decode/1, decode_next/1]).
24 | -export([encode_hex/1, decode_hex/1]).
25 | -export([encode_sb32/1, decode_sb32/1]).
26 | -export([prefix/1,
27 | partial_decode/1]).
28 | -export([prefix_hex/1]).
29 | -export([prefix_sb32/1]).
30 | -export([to_sb32/1, from_sb32/1]).
31 | -export([to_hex/1, from_hex/1]).
32 |
33 | -export([reverse_sext/1]).
34 |
35 | -export([pp/1]). % for debugging only
36 |
37 | -define(rev_sext , 4).
38 | %%
39 | -define(negbig , 8).
40 | -define(neg4 , 9).
41 | -define(pos4 , 10).
42 | -define(posbig , 11).
43 | -define(atom , 12).
44 | -define(reference, 13).
45 | -define(port , 14).
46 | -define(pid , 15).
47 | -define(tuple , 16).
48 | -define(list , 17).
49 | -define(binary , 18).
50 | -define(bin_tail , 19).
51 |
52 | -define(is_sext(X),
53 | X==?negbig;
54 | X==?neg4;
55 | X==?pos4;
56 | X==?posbig;
57 | X==?atom;
58 | X==?reference;
59 | X==?port;
60 | X==?pid;
61 | X==?tuple;
62 | X==?list;
63 | X==?binary;
64 | X==?bin_tail).
65 |
66 | -define(IMAX1, 16#ffffFFFFffffFFFF).
67 |
68 | %% -define(dbg(Fmt,Args),
69 | %% case get(dbg) of
70 | %% true -> io:fwrite("~p: " ++ Fmt, [?LINE|Args]);
71 | %% _ -> no_dbg
72 | %% end).
73 | -define(dbg(F,A),no_debug).
74 |
75 | %% @spec encode(T::term()) -> binary()
76 | %% @doc Encodes any Erlang term into a binary.
77 | %% The lexical sorting properties of the encoded binary match those of the
78 | %% original Erlang term. That is, encoded terms sort the same way as the
79 | %% original terms would.
80 | %% @end
81 | %%
82 | encode(X) -> encode(X, false).
83 |
84 | %% @spec encode(T::term(), Legacy::boolean()) -> binary()
85 | %% @doc Encodes an Erlang term using legacy bignum encoding.
86 | %% On March 4 2013, Basho noticed that encoded bignums didn't always sort
87 | %% properly. This bug has been fixed, but the encoding of bignums necessarily
88 | %% changed in an incompatible way.
89 | %%
90 | %% The new decode/1 version can read the old bignum format, but the old
91 | %% version obviously cannot read the new. Using `encode(Term, true)', the term
92 | %% will be encoded using the old format.
93 | %%
94 | %% Use only as transition support. This function will be deprecated in time.
95 | %% @end
96 | encode(X, Legacy) when is_tuple(X) -> encode_tuple(X, Legacy);
97 | encode(X, Legacy) when is_map(X) -> encode_map(X, Legacy);
98 | encode(X, Legacy) when is_list(X) -> encode_list(X, Legacy);
99 | encode(X, _) when is_pid(X) -> encode_pid(X);
100 | encode(X, _) when is_port(X) -> encode_port(X);
101 | encode(X, _) when is_reference(X) -> encode_ref(X);
102 | encode(X, Legacy) when is_number(X) -> encode_number(X, Legacy);
103 | encode(X, _) when is_binary(X) -> encode_binary(X);
104 | encode(X, _) when is_bitstring(X) -> encode_bitstring(X);
105 | encode(X, _) when is_atom(X) -> encode_atom(X).
106 |
107 | %% @spec reverse_sext(binary()) -> binary()
108 | %% @doc Reverses the sorting properties of a sext-encoded term. Reverted
109 | %% objects compare as smaller than all sext-encoded objects.
110 | %%
111 | %% No hex- or sb32-encoded variants are provided. Use the `to_hex/1' or
112 | %% `to_sb32/1' functions instead.
113 | %% @end
114 | reverse_sext(<> = B) when ?is_sext(X) ->
115 | NegB = encode_neg_bits(B),
116 | <>.
117 |
118 | %% @spec encode_sb32(Term::any()) -> binary()
119 | %% @doc Encodes any Erlang term into an sb32-encoded binary.
120 | %% This is similar to {@link encode/1}, but produces an octet string that
121 | %% can be used without escaping in file names (containing only the characters
122 | %% 0..9, A..V and '-'). The sorting properties are preserved.
123 | %%
124 | %% Note: The encoding used is inspired by the base32 encoding described in
125 | %% RFC3548, but uses a different alphabet in order to preserve the sort order.
126 | %% @end
127 | %%
128 | encode_sb32(Term) ->
129 | to_sb32(encode(Term)).
130 |
131 | %% @spec encode_hex(Term::any()) -> binary()
132 | %% @doc Encodes any Erlang term into a hex-encoded binary.
133 | %% This is similar to {@link encode/1}, but produces an octet string that
134 | %% can be used without escaping in file names (containing only the characters
135 | %% 0..9 and A..F). The sorting properties are preserved.
136 | %%
137 | %% Note: The encoding used is regular hex-encoding, with the proviso that only
138 | %% capital letters are used (mixing upper- and lowercase characters would break
139 | %% the sorting property).
140 | %% @end
141 | %%
142 | encode_hex(Term) ->
143 | to_hex(encode(Term)).
144 |
145 | %% @spec prefix(X::term()) -> binary()
146 | %% @doc Encodes a binary for prefix matching of similar encoded terms.
147 | %% Lists and tuples can be prefixed by using the <code>'_'</code> marker,
148 | %% similarly to Erlang match specifications. For example:
149 | %% <ul>
150 | %% <li><code>prefix({1,2,'_','_'})</code> will result in a binary that is
151 | %% the same as the first part of any encoded 4-tuple with the first two
152 | %% elements being 1 and 2. The prefix algorithm will search for the
153 | %% first <code>'_'</code>, and treat all following elements as if they
154 | %% were <code>'_'</code>.</li>
155 | %% <li><code>prefix([1,2|'_'])</code> will result in a binary that is the
156 | %% same as the first part of any encoded list where the first two elements
157 | %% are 1 and 2. <code>prefix([1,2,'_'])</code> will give the same result,
158 | %% as the prefix pattern is the same for all lists starting with
159 | %% `[1,2|...]'.</li>
160 | %% <li>`prefix(Binary)' will result in a binary that is the same as the
161 | %% encoded version of Binary, except that, instead of padding and
162 | %% terminating, the encoded binary is truncated to the longest byte-aligned
163 | %% binary. The same is done for bitstrings.</li>
164 | %% <li><code>prefix({1,[1,2|'_'],'_'})</code> will prefix-encode the second
165 | %% element, and let it end the resulting binary. This prefix will match
166 | %% any 3-tuple where the first element is 1 and the second element is a
167 | %% list where the first two elements are 1 and 2.</li>
168 | %% <li><code>prefix([1,[1|'_']|'_'])</code> will result in a prefix that
169 | %% matches all lists where the first element is 1 and the second element is
170 | %% a list where the first element is 1.</li>
171 | %% <li>For all other data types, the prefix is the same as the encoded term.
172 | %% </li>
173 | %% </ul>
174 | %% @end
175 | %%
176 | prefix(X) ->
177 | {_, P} = enc_prefix(X),
178 | P.
179 |
%% Prefix-encodes X, returning `{Wild, Bin}'. `Wild' is `true' when a
%% wildcard was reached, in which case `Bin' ends at that point. Tuples and
%% lists are walked element by element; every other type is emitted as its
%% complete encoding.
%%
%% Maps have no element-wise prefix form here, so a map is emitted as its
%% complete encoding (atoms such as '_' inside a map are not treated as
%% wildcards). Without this clause, prefix/1 failed with function_clause on
%% any term containing a map, although encode/2 accepts maps.
enc_prefix(X) when is_tuple(X)     -> prefix_tuple(X);
enc_prefix(X) when is_map(X)       -> {false, encode_map(X, false)};
enc_prefix(X) when is_list(X)      -> prefix_list(X);
enc_prefix(X) when is_pid(X)       -> {false, encode_pid(X)};
enc_prefix(X) when is_port(X)      -> {false, encode_port(X)};
enc_prefix(X) when is_reference(X) -> {false, encode_ref(X)};
enc_prefix(X) when is_number(X)    -> {false, encode_number(X)};
enc_prefix(X) when is_binary(X)    -> prefix_binary(X);
enc_prefix(X) when is_bitstring(X) -> prefix_bitstring(X);
enc_prefix(X) when is_atom(X) ->
    case is_wild(X) of
        true ->
            %% A bare wildcard matches everything: empty prefix.
            {true, <<>>};
        false ->
            {false, encode_atom(X)}
    end.
195 |
196 | %% @spec prefix_sb32(X::term()) -> binary()
197 | %% @doc Generates an sb32-encoded binary for prefix matching.
198 | %% This is similar to {@link prefix/1}, but generates a prefix for binaries
199 | %% encoded with {@link encode_sb32/1}, rather than {@link encode/1}.
200 | %% @end
201 | %%
202 | prefix_sb32(X) ->
203 | chop_prefix_tail(to_sb32(prefix(X))).
204 |
205 | %% @spec prefix_hex(X::term()) -> binary()
206 | %% @doc Generates a hex-encoded binary for prefix matching.
207 | %% This is similar to {@link prefix/1}, but generates a prefix for binaries
208 | %% encoded with {@link encode_hex/1}, rather than {@link encode/1}.
209 | %% @end
210 | %%
211 | prefix_hex(X) ->
212 | to_hex(prefix(X)).
213 |
%% Must chop off the pad characters and the last encoded unit (which, if pad
%% characters are present, is not a whole byte).
%%
%% The pad runs are tried longest first (6, 4, 3, then 1 dash). The first one
%% found at the end of Bin is removed together with the single character in
%% front of it. If none is found, Bin is returned unchanged.
chop_prefix_tail(Bin) ->
    chop_prefix_tail(Bin, [<<"------">>, <<"----">>, <<"---">>, <<"-">>]).

chop_prefix_tail(Bin, []) ->
    Bin;
chop_prefix_tail(Bin, [Pad | Shorter]) ->
    %% Bytes to keep: everything except the pad and the unit before it.
    %% A negative Keep simply makes the pattern fail.
    Keep = byte_size(Bin) - byte_size(Pad) - 1,
    case Bin of
        <<P:Keep/binary, _, Tail/binary>> when Tail =:= Pad ->
            P;
        _ ->
            chop_prefix_tail(Bin, Shorter)
    end.
227 |
228 | %% @spec decode(B::binary()) -> term()
229 | %% @doc Decodes a binary generated using the function {@link sext:encode/1}.
230 | %%
231 | %% Note that a reverse-encoded binary (using {@link sext:reverse_sext/1})
232 | %% decodes into the original sext-encoded binary, not into the term itself.
233 | %% In other words, if `R = reverse_sext(encode(T))',
234 | %% then `T = decode(decode(R))'.
235 | %% @end
236 | %%
decode(Elems) ->
    case decode_next(Elems) of
        {Term, <<>>} ->
            Term;
        _TrailingData ->
            %% The input decoded, but bytes were left over. The second
            %% argument of erlang:error/2 is the argument list of the
            %% current function, so pass `[Elems]' rather than the
            %% decode result.
            erlang:error(badarg, [Elems])
    end.
242 |
243 | %% @spec decode_sb32(B::binary()) -> term()
244 | %% @doc Decodes a binary generated using the function {@link encode_sb32/1}.
245 | %% @end
246 | %%
247 | decode_sb32(Data) ->
248 | decode(from_sb32(Data)).
249 |
250 | decode_hex(Data) ->
251 | decode(from_hex(Data)).
252 |
253 | pp(none) -> "";
254 | pp(B) when is_bitstring(B) ->
255 | [ $0 + I || <> <= B ].
256 |
257 | encode_tuple(T, Legacy) ->
258 | Sz = size(T),
259 | encode_tuple_elems(1, Sz, T, <>, Legacy).
260 |
261 | prefix_tuple(T) ->
262 | Sz = size(T),
263 | Elems = tuple_to_list(T),
264 | prefix_tuple_elems(Elems, <>).
265 |
266 | %% It's easier to iterate over a tuple by converting it to a list, but
267 | %% since the tuple /can/ be huge, let's do it this way.
268 | encode_tuple_elems(P, Sz, T, Acc, Legacy) when P =< Sz ->
269 | E = encode(element(P,T), Legacy),
270 | encode_tuple_elems(P+1, Sz, T, <>, Legacy);
271 | encode_tuple_elems(_, _, _, Acc, _) ->
272 | Acc.
273 |
274 | prefix_tuple_elems([A|T], Acc) when is_atom(A) ->
275 | case is_wild(A) of
276 | true ->
277 | {true, Acc};
278 | false ->
279 | E = encode(A),
280 | prefix_tuple_elems(T, <>)
281 | end;
282 | prefix_tuple_elems([H|T], Acc) ->
283 | case enc_prefix(H) of
284 | {true, P} ->
285 | {true, <>};
286 | {false, E} ->
287 | prefix_tuple_elems(T, <>)
288 | end;
289 | prefix_tuple_elems([], Acc) ->
290 | {false, Acc}.
291 |
292 | encode_list(L, Legacy) ->
293 | encode_list_elems(L, <>, Legacy).
294 |
295 | prefix_list(L) ->
296 | prefix_list_elems(L, <>).
297 |
298 | encode_map(M, Legacy) ->
299 | Sz = map_size(M),
300 | maps:fold(
301 | fun(K,V,Acc) ->
302 | <>
304 | end, <>, M).
305 |
306 |
307 | encode_binary(B) ->
308 | Enc = encode_bin_elems(B),
309 | <>.
310 |
311 | prefix_binary(B) ->
312 | Enc = encode_bin_elems(B),
313 | {false, <>}.
314 |
315 | encode_bitstring(B) ->
316 | Enc = encode_bits_elems(B),
317 | <>.
318 |
319 | prefix_bitstring(B) ->
320 | Enc = encode_bits_elems(B),
321 | {false, <>}.
322 |
323 | encode_pid(P) ->
324 | case term_to_binary(P) of
325 | <<131,88,119,ALen:8,Name:ALen/binary,NS:8/binary,C:32>> ->
326 | encode_pid_new(Name, NS, C);
327 | <<131,88,100,ALen:16,Name:ALen/binary,NS:8/binary,C:32>> ->
328 | encode_pid_new(Name, NS, C);
329 | <<131,103,100,ALen:16,Name:ALen/binary,NS:8/binary,C:8>> ->
330 | true = C =< 3,
331 | encode_pid(Name, NS, <>)
332 | end.
333 |
334 | encode_pid_new(Name, NS, C) ->
335 | CBin =
336 | case C > 3 of
337 | true -> <<255, C:32>>;
338 | false -> <>
339 | end,
340 | encode_pid(Name, NS, CBin).
341 |
342 | encode_pid(Name, NS, C) ->
343 | NameEnc = encode_bin_elems(Name),
344 | <>.
345 |
346 | encode_port(P) ->
347 | case term_to_binary(P) of
348 | <<131,120,119,ALen:8,Name:ALen/binary,N:64,C:32>> ->
349 | case N bsr 28 of
350 | 0 -> encode_port_new(Name, <>, C);
351 | _ ->
352 | %% N was limited to 28 bits previously, meaning the initial byte
353 | %% in its binary was =< 15. We therefore prefix the 8-byte N with
354 | %% a byte with value 16 to signal the V4 format, and to ensure V4
355 | %% formats sort consistently with the previous format. In this
356 | %% case we don't need to try shortening the C(reation) field.
357 | encode_port(Name, <<16,N:64>>, <>)
358 | end;
359 | <<131,89,100,ALen:16,Name:ALen/binary,N:32,C:32>> ->
360 | 0 = N bsr 28, % assert
361 | encode_port_new(Name, <>, C);
362 | <<131,102,100,ALen:16,Name:ALen/binary,N:32,C:8>> ->
363 | 0 = N bsr 28, % assert
364 | true = C =< 3,
365 | encode_port(Name, <>, <>)
366 | end.
367 |
368 | encode_port_new(Name, N, C) ->
369 | CBin =
370 | case C > 3 of
371 | true -> <<255, C:32>>;
372 | false -> <>
373 | end,
374 | encode_port(Name, N, CBin).
375 |
376 | encode_port(Name, N, C) ->
377 | NameEnc = encode_bin_elems(Name),
378 | <>.
379 |
380 | encode_ref(R) ->
381 | case term_to_binary(R) of
382 | <<131,90,_Len:16,119,NLen:8,Name:NLen/binary,C:32,Rest/binary>> ->
383 | encode_ref_newer(Name, C, Rest);
384 | <<131,90,_Len:16,100,NLen:16,Name:NLen/binary,C:32,Rest/binary>> ->
385 | encode_ref_newer(Name, C, Rest);
386 | <<131,114,_Len:16,100,NLen:16,Name:NLen/binary,C:8,Rest/binary>> ->
387 | true = C =< 3,
388 | encode_ref(Name, <>)
389 | end.
390 |
391 | encode_ref_newer(Name, C, Rest) ->
392 | NewRest =
393 | case C > 3 of
394 | true -> <<255, C:32, Rest/binary>>;
395 | false -> <>
396 | end,
397 | encode_ref(Name, NewRest).
398 |
399 | encode_ref(Name, Rest) ->
400 | NameEnc = encode_bin_elems(Name),
401 | RestEnc = encode_bin_elems(Rest),
402 | <>.
403 |
404 | encode_atom(A) ->
405 | Bin = list_to_binary(atom_to_list(A)),
406 | Enc = encode_bin_elems(Bin),
407 | <>.
408 |
409 | encode_number(N) ->
410 | encode_number(N, false).
411 |
412 | encode_number(N, Legacy) when is_integer(N) ->
413 | encode_int(N, none, Legacy);
414 | encode_number(F, _Legacy) when is_float(F) ->
415 | encode_float(F).
416 |
417 | %%
418 | %% IEEE 754 Binary 64 standard representation
419 | %% http://en.wikipedia.org/wiki/Double_precision_floating-point_format
420 | %%
421 | %% |12345678 12345678 12345678 12345678 12345678 12345678 12345678 12345678
422 | %% |iEEEEEEE EEEEffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff|
423 | %%
424 | %% i: sign bit
425 | %% E: Exponent, 11 bits
426 | %% f: fraction, 52 bits
427 | %%
428 | %% We perform the following operations:
429 | %% - if E < 1023 (see Exponent bias), the integer part is 0
430 | %%
431 | encode_float(F) ->
432 | <> = <>,
433 | ?dbg("F = ~p | Exp0 = ~p | Frac = ~p~n", [cF, Exp0, Frac]),
434 | {Int0, Fraction} =
435 | case Exp0 - 1023 of
436 | NegExp when NegExp < 0 ->
437 | Offs = -NegExp,
438 | ?dbg("NegExp = ~p, Offs = ~p~n"
439 | "Frac = ~p~n", [NegExp, Offs, Frac]),
440 | {0, << 0:Offs, 1:1,Frac:52 >>};
441 | Exp1 ->
442 | ?dbg("Exp1 = ~p~n", [Exp1]),
443 | if Exp1 >= 52 ->
444 | %% Decimal part will be zero
445 | {trunc(F), <<0:52>>};
446 | true ->
447 | R = 52-Exp1,
448 | ?dbg("R = ~p~n", [R]),
449 | Exp2 = Exp1 + 1, % add the leading 1-bit
450 | ?dbg("Exp2 = ~p~n", [Exp2]),
451 | <> = <<1:1, Frac:52>>,
452 | ?dbg("I = ~p, Frac1 = ~p~n", [I,Frac1]),
453 | {I, <>}
454 | end
455 | end,
456 | if Sign == 1 ->
457 | %% explicitly encode a negative int, since Int0 can be zero.
458 | Int = if Int0 >= 0 -> -Int0;
459 | true -> Int0
460 | end,
461 | encode_neg_int(Int, Fraction);
462 | Sign == 0 ->
463 | encode_int(Int0, Fraction)
464 | end.
465 |
466 | encode_neg_int(Int, Fraction)->
467 | encode_neg_int(Int, Fraction,false).
468 | encode_int(I, R) ->
469 | encode_int(I, R, false).
470 |
471 | encode_int(I,R, _Legacy) when I >= 0, I =< 16#7fffffff ->
472 | ?dbg("encode_int(~p, ~p)~n", [I,R]),
473 | if R == none ->
474 | << ?pos4, I:31, 0:1 >>;
475 | true ->
476 | RSz = bit_size(R),
477 | <> = R,
478 | ?dbg("Fraction = ~p~n", [Fraction]),
479 | if Fraction == 0 ->
480 | << ?pos4, I:31, 1:1, 8:8 >>;
481 | true ->
482 | Rbits = encode_bits_elems(R),
483 | << ?pos4, I:31, 1:1, Rbits/binary >>
484 | end
485 | end;
486 | encode_int(I,R, Legacy) when I > 16#7fffffff ->
487 | ?dbg("encode_int(~p, ~p)~n", [I,R]),
488 | Bytes = encode_big(I, Legacy),
489 | if R == none ->
490 | <>;
491 | true ->
492 | RSz = bit_size(R),
493 | <> = R,
494 | ?dbg("Fraction = ~p~n", [Fraction]),
495 | if Fraction == 0 ->
496 | << ?posbig, Bytes/binary, 1:8, 8:8 >>;
497 | true ->
498 | Rbits = encode_bits_elems(R),
499 | <>
500 | end
501 | end;
502 | encode_int(I, R, Legacy) when I < 0 ->
503 | encode_neg_int(I, R,Legacy).
504 |
505 | encode_neg_int(I,R,_Legacy) when I =< 0, I >= -16#7fffffff ->
506 | ?dbg("encode_neg_int(~p, ~p [sz: ~p])~n", [I,pp(R), try bit_size(R) catch error:_ -> "***" end]),
507 | Adj = max_value(31) + I, % keep in mind that I < 0
508 | ?dbg("Adj = ~p~n", [erlang:integer_to_list(Adj,2)]),
509 | if R == none ->
510 | << ?neg4, Adj:31, 1:1 >>;
511 | true ->
512 | Rbits = encode_neg_bits(R),
513 | ?dbg("R = ~p -> RBits = ~p~n", [pp(R), pp(Rbits)]),
514 | << ?neg4, Adj:31, 0:1, Rbits/binary >>
515 | end;
516 | encode_neg_int(I,R,Legacy) when I < -16#7fFFffFF ->
517 | ?dbg("encode_neg_int(BIG ~p)~n", [I]),
518 | Bytes = encode_big_neg(I,Legacy),
519 | ?dbg("Bytes = ~p~n", [Bytes]),
520 | if R == none ->
521 | <>;
522 | true ->
523 | Rbits = encode_neg_bits(R),
524 | ?dbg("R = ~p -> RBits = ~p~n", [pp(R), pp(Rbits)]),
525 | <>
526 | end.
527 |
%% Encodes a large non-negative integer. The integer is first turned into
%% its byte sequence; in the current format (Legacy =:= false) that sequence
%% gets a size header, in the legacy format it is used as is. The result is
%% then passed through the regular binary element encoder.
encode_big(I, Legacy) ->
    Bytes = list_to_binary(encode_big1(I)),
    Payload =
        case Legacy of
            true  -> Bytes;
            false -> prepend_size(Bytes)
        end,
    encode_bin_elems(Payload).
539 |
540 | prepend_size(B) ->
541 | Sz = byte_size(B),
542 | <<255, (encode_size(Sz))/binary, B/binary>>.
543 |
544 | remove_size_bits(<<255, T/binary>>) ->
545 | {_, Rest} = untag_7bits(T, <<>>),
546 | Rest;
547 | remove_size_bits(B) ->
548 | %% legacy bignum
549 | B.
550 |
551 | encode_size(I) when I > 127 ->
552 | B = int_to_binary(I),
553 | tag_7bits(B);
554 | encode_size(I) ->
555 | <>.
556 |
557 | tag_7bits(B) when bit_size(B) > 7 ->
558 | <> = B,
559 | <<1:1, H:7, (tag_7bits(T))/binary>>;
560 | tag_7bits(B) ->
561 | Sz = bit_size(B),
562 | <> = B,
563 | <<0:1, I:7>>.
564 |
565 | untag_7bits(<<1:1, H:7, T/binary>>, Acc) ->
566 | untag_7bits(T, <>);
567 | untag_7bits(<<0:1, H:7, T/binary>>, Acc) ->
568 | AccBits = bit_size(Acc),
569 | HBits = 8 - (AccBits rem 8),
570 | {<>, T}.
571 |
572 | int_to_binary(I) when I =< 16#ff -> <>;
573 | int_to_binary(I) when I =< 16#ffff -> <>;
574 | int_to_binary(I) when I =< 16#ffffff -> <>;
575 | int_to_binary(I) when I =< 16#ffffffff -> <>;
576 | int_to_binary(I) when I =< 16#ffffffffff -> <>;
577 | int_to_binary(I) when I =< 16#ffffffffffff -> <>;
578 | int_to_binary(I) when I =< 16#ffffffffffffff -> <>;
579 | int_to_binary(I) when I =< 16#ffffffffffffffff -> <>;
580 | int_to_binary(I) ->
581 | %% Realm of the ridiculous
582 | list_to_binary(
583 | lists:dropwhile(fun(X) -> X==0 end, binary_to_list(<>))).
584 |
585 | %% This function exists for documentation, but not used right now.
586 | %% It's the reverse of encode_size/1, used for encoding bignums.
587 | %%
588 | %% decode_size(<<1:1, _/bitstring>> = T) ->
589 | %% {SzBin, Rest} = untag_7bits(T, <<>>),
590 | %% Bits = bit_size(SzBin),
591 | %% <> = SzBin,
592 | %% {Sz, Rest};
593 | %% decode_size(<<0:1, H:7, T/binary>>) ->
594 | %% {H, T}.
595 |
596 | encode_big_neg(I,Legacy) ->
597 | {Words, Max} = get_max(-I),
598 | ?dbg("Words = ~p | Max = ~p~n", [Words,Max]),
599 | Iadj = Max + I, % keep in mind that I < 0
600 | ?dbg("IAdj = ~p~n", [Iadj]),
601 | Bin = encode_big(Iadj,Legacy),
602 | ?dbg("Bin = ~p~n", [Bin]),
603 | WordsAdj = 16#ffffFFFF - Words,
604 | ?dbg("WordsAdj = ~p~n", [WordsAdj]),
605 | <>.
606 |
%% Splits an integer into a list of bytes, most significant first.
encode_big1(I) ->
    encode_big1(I, []).

%% Peels off the low byte until the remainder is below 16#ff. Note that the
%% comparison is strict, so a remainder of exactly 16#ff is split once more
%% and yields a leading 0 byte (255 -> [0,255]).
encode_big1(I, Acc) ->
    if
        I < 16#ff ->
            [I | Acc];
        true ->
            LowByte = I band 16#ff,
            encode_big1(I bsr 8, [LowByte | Acc])
    end.
614 |
615 | encode_list_elems([], Acc, _) ->
616 | <>;
617 | encode_list_elems(B, Acc, Legacy) when is_bitstring(B) ->
618 | %% improper list
619 | <>;
620 | encode_list_elems(E, Acc, Legacy) when not(is_list(E)) ->
621 | %% improper list
622 | <>;
623 | encode_list_elems([H|T], Acc, Legacy) ->
624 | Enc = encode(H,Legacy),
625 | encode_list_elems(T, <>, Legacy).
626 |
627 | prefix_list_elems([], Acc) ->
628 | {false, <>};
629 | prefix_list_elems(E, Acc) when not(is_list(E)) ->
630 | case is_wild(E) of
631 | true ->
632 | {true, Acc};
633 | false ->
634 | Marker = if is_bitstring(E) -> ?bin_tail;
635 | true -> 1
636 | end,
637 | {Bool, P} = enc_prefix(E),
638 | {Bool, <>}
639 | end;
640 | prefix_list_elems([H|T], Acc) ->
641 | case enc_prefix(H) of
642 | {true, P} ->
643 | {true, <>};
644 | {false, E} ->
645 | prefix_list_elems(T, <>)
646 | end.
647 |
%% Returns true for the wildcard atoms recognised in prefix patterns:
%% '_' and any atom written as '$' followed by text that list_to_integer/1
%% accepts (e.g. '$1'). Everything else, including non-atoms, is not wild.
is_wild('_') ->
    true;
is_wild(A) when is_atom(A) ->
    is_dollar_int(atom_to_list(A));
is_wild(_) ->
    false.

%% True if the string is "$" followed by an integer literal.
is_dollar_int([$$ | Digits]) ->
    try list_to_integer(Digits) of
        _Int -> true
    catch
        error:_ -> false
    end;
is_dollar_int(_) ->
    false.
666 |
667 | encode_bin_elems(<<>>) ->
668 | <<8>>;
669 | encode_bin_elems(B) ->
670 | Pad = 8 - (size(B) rem 8),
671 | << (<< <<1:1, B1:8>> || <> <= B >>)/bitstring, 0:Pad, 8 >>.
672 |
673 | encode_neg_bits(<<>>) ->
674 | <<247>>;
675 | encode_neg_bits(B) ->
676 | {Padded, TailBits} = pad_neg_bytes(B),
677 | ?dbg("TailBits = ~p~n", [TailBits]),
678 | TailSz0 = bit_size(TailBits),
679 | TailSz = 16#ff - TailSz0,
680 | if TailSz0 == 0 ->
681 | Pad = 8 - (bit_size(Padded) rem 8),
682 | Ip = max_value(Pad), % e.g. max_value(3) -> 2#111
683 | <>;
684 | true ->
685 | ?dbg("TailSz0 = ~p~n", [TailSz0]),
686 | TailPad = 8 - TailSz0,
687 | ?dbg("TailPad = ~p~n", [TailPad]),
688 | Itp = (1 bsl TailPad)-1,
689 | ?dbg("Itp = ~p~n", [Itp]),
690 | Pad = 8 - ((bit_size(Padded) + 1) rem 8),
691 | ?dbg("Pad = ~p~n", [Pad]),
692 | Ip = max_value(Pad),
693 | ?dbg("Ip = ~p~n", [Ip]),
694 | ?dbg("Pad = ~p~n", [Pad]),
695 | ?dbg("TailSz = ~p~n", [TailSz]),
696 | <>
698 | end.
699 |
700 | pad_neg_bytes(Bin) ->
701 | pad_neg_bytes(Bin, <<>>).
702 |
703 | pad_neg_bytes(<>, Acc) ->
704 | H1 = 16#ff - H,
705 | pad_neg_bytes(T, <>);
706 | pad_neg_bytes(Bits, Acc) when is_bitstring(Bits) ->
707 | Sz = bit_size(Bits),
708 | Max = (1 bsl Sz) - 1,
709 | <> = Bits,
710 | I1 = Max - I0,
711 | {Acc, <>}.
712 |
713 | encode_bits_elems(B) ->
714 | {Padded, TailBits} = pad_bytes(B),
715 | TailSz = bit_size(TailBits),
716 | TailPad = 8-TailSz,
717 | Pad = 8 - ((TailSz + TailPad + bit_size(Padded) + 1) rem 8),
718 | <>.
719 |
720 | pad_bytes(Bin) ->
721 | pad_bytes(Bin, <<>>).
722 |
723 | pad_bytes(<>, Acc) ->
724 | pad_bytes(T, <>);
725 | pad_bytes(Bits, Acc) when is_bitstring(Bits) ->
726 | {Acc, Bits}.
727 |
728 |
729 | %% ------------------------------------------------------
730 | %% Decoding routines
731 |
%% Entry point for decoding one term: each clause forwards the remainder of
%% the input to the type-specific decoder named on its right-hand side.
%% NOTE(review): every clause head reads `decode_next(<>)', which is not valid
%% Erlang. The bit-syntax patterns that bind `Rest', `Sz', `I' and `F' appear
%% to have been lost from this copy of the file; restore them from the
%% canonical source before compiling.
732 | -spec decode_next(binary()) -> {any(), binary()}.
733 | %% @spec decode_next(Bin) -> {N, Rest}
734 | %% @doc Decode a binary stream, returning the next decoded term and the
735 | %% stream remainder
736 | %%
737 | %% This function will raise an exception if the beginning of `Bin' is not
738 | %% a valid sext-encoded term.
739 | %% @end
740 | decode_next(<>) -> decode_rev_sext(Rest);
741 | decode_next(<>) -> decode_atom(Rest);
742 | decode_next(<>) -> decode_pid(Rest);
743 | decode_next(<>) -> decode_port(Rest);
744 | decode_next(<>) -> decode_ref(Rest);
745 | decode_next(<>) -> decode_tuple(Sz,Rest);
746 | %% decode_next(<>) -> {[], Rest};
747 | %% decode_next(<>) -> decode_list(Rest);
748 | decode_next(<>) -> decode_map(Rest);
749 | decode_next(<>) -> decode_list(Rest);
750 | decode_next(<>) -> decode_neg_big(Rest);
751 | decode_next(<>) -> decode_pos_big(Rest);
752 | decode_next(<>) -> decode_neg(I,F,Rest);
753 | decode_next(<>) -> decode_pos(I,F,Rest);
754 | decode_next(<>) -> decode_binary(Rest).
755 |
756 | -spec partial_decode(binary()) -> {full | partial, any(), binary()}.
757 | %% @spec partial_decode(Bytes) -> {full | partial, DecodedTerm, Rest}
758 | %% @doc Decode a sext-encoded term or prefix embedded in a byte stream.
759 | %%
760 | %% Example:
761 | %% ```
762 | %% 1> T = sext:encode({a,b,c}).
763 | %% <<16,0,0,0,3,12,176,128,8,12,177,0,8,12,177,128,8>>
764 | %% 2> sext:partial_decode(<<T/binary, "tail">>).
765 | %% {full,{a,b,c},<<"tail">>}
766 | %% 3> P = sext:prefix({a,b,'_'}).
767 | %% <<16,0,0,0,3,12,176,128,8,12,177,0,8>>
768 | %% 4> sext:partial_decode(<<P/binary, "tail">>).
769 | %% {partial,{a,b,'_'},<<"tail">>}
770 | %% '''
771 | %%
772 | %% Note that a decoded prefix may not be exactly like the encoded prefix.
773 | %% For example, ['_'] will be encoded as
774 | %% <<17>>, i.e. only the 'list' opcode. The
775 | %% decoded prefix will be '_', since the encoded prefix would
776 | %% also match the empty list. The decoded prefix will always be a prefix to
777 | %% anything to which the original prefix is a prefix.
778 | %%
779 | %% For tuples, {1,'_',3} encoded and decoded, will result in
780 | %% {1,'_','_'}, i.e. the tuple size is kept, but the elements
781 | %% after the first wildcard are replaced with wildcards.
782 | %% @end
%% The first clause hands off to partial_decode_tuple/2 and the second to
%% partial_decode_list/1. Any other input is tried with decode_next/1: success
%% yields `{full, Dec, Rest}', while a `function_clause' error raised during
%% that call is reported as `{partial, '_', Other}' with the input untouched.
%% NOTE(review): the two `partial_decode(<>)' heads are not valid Erlang; the
%% bit-syntax patterns binding `Sz' and `Rest' appear to be missing from this
%% copy of the file -- restore them from the canonical source.
783 | partial_decode(<>) ->
784 | partial_decode_tuple(Sz, Rest);
785 | partial_decode(<>) ->
786 | partial_decode_list(Rest);
787 | partial_decode(Other) ->
788 | try decode_next(Other) of
789 | {Dec, Rest} ->
790 | {full, Dec, Rest}
791 | catch
792 | error:function_clause ->
793 | {partial, '_', Other}
794 | end.
795 |
%% Decoder reached from decode_next/1 for reversed terms. All work is
%% delegated to decode_neg_binary/1, whose `{Decoded, Rest}' result is
%% returned unchanged.
decode_rev_sext(Encoded) ->
    decode_neg_binary(Encoded).
798 |
%% Decode an atom: read the name with decode_binary/1 and convert it to an
%% atom, interpreting each byte as one Latin-1 character.
%% Returns `{Atom, Rest}'.
%% NOTE(review): this creates atoms from decoded input; if the input is not
%% trusted, the atom table can be exhausted.
decode_atom(Encoded) ->
    {Name, Rest} = decode_binary(Encoded),
    {binary_to_atom(Name, latin1), Rest}.
802 |
%% Decode `Sz' consecutive terms from `Elems' and pack them into a tuple.
%% Returns `{Tuple, Rest}'.
decode_tuple(Sz, Elems) ->
    decode_tuple(Sz, Elems, []).

%% `Decoded' collects the elements in reverse order.
decode_tuple(Left, Bin, Decoded) ->
    case Left of
        0 ->
            {list_to_tuple(lists:reverse(Decoded)), Bin};
        _ ->
            {Elem, Tail} = decode_next(Bin),
            decode_tuple(Left - 1, Tail, [Elem | Decoded])
    end.
811 |
%% Decode up to `Sz' tuple elements, stopping at the first element that can
%% only be partially decoded.
%%
%% Returns `{full, Tuple, Rest}' when every element decoded, otherwise
%% `{partial, Tuple, Rest}' where the partial element is kept and all
%% remaining positions are filled with '_' so the tuple size is preserved.
partial_decode_tuple(Sz, Elems) ->
    partial_decode_tuple(Sz, Elems, []).

partial_decode_tuple(0, Rest, Acc) ->
    {full, list_to_tuple(lists:reverse(Acc)), Rest};
partial_decode_tuple(Left, Bin, Acc) ->
    case partial_decode(Bin) of
        {partial, Elem, Tail} ->
            Decoded = lists:reverse([Elem | Acc]),
            {partial, list_to_tuple(Decoded ++ pad_(Left - 1)), Tail};
        {full, Elem, Tail} ->
            partial_decode_tuple(Left - 1, Tail, [Elem | Acc])
    end.
825 |
%% Build a list of `N' wildcard atoms ('_'); `N' must be a non-negative
%% integer.
pad_(N) when is_integer(N), N >= 0 ->
    lists:duplicate(N, '_').
830 |
%% Partial decoder for list bodies. Elements are accumulated in reverse in
%% `Acc'. A leading byte 2 ends a proper list (`full'); a leading byte 1
%% introduces an improper tail, decoded with partial_decode/1. When the input
%% runs out or an element is only partially decodable, the result is
%% `partial' and the list is terminated with '_' as an improper tail.
%% NOTE(review): two clause heads below read `partial_decode_list(<>, Acc)' /
%% `partial_decode_list(<> = Next, Acc)', which is not valid Erlang; the
%% patterns binding `Next' and `X' appear to be missing from this copy of the
%% file -- restore them from the canonical source.
831 | partial_decode_list(Elems) ->
832 | partial_decode_list(Elems, []).
833 |
834 | partial_decode_list(<<>>, Acc) ->
835 | {partial, lists:reverse(Acc) ++ '_', <<>>};
836 | partial_decode_list(<<2, Rest/binary>>, Acc) ->
837 | {full, lists:reverse(Acc), Rest};
838 | partial_decode_list(<>, Acc) ->
839 | %% improper list, binary tail
840 | {Term, Rest} = decode_next(Next),
841 | {full, lists:reverse(Acc) ++ Term, Rest};
842 | partial_decode_list(<<1, Next/binary>>, Acc) ->
843 | {Result, Term, Rest} = partial_decode(Next),
844 | {Result, lists:reverse(Acc) ++ Term, Rest};
845 | partial_decode_list(<> = Next, Acc) when ?is_sext(X) ->
846 | case partial_decode(Next) of
847 | {full, Term, Rest} ->
848 | partial_decode_list(Rest, [Term|Acc]);
849 | {partial, Term, Rest} ->
850 | {partial, lists:reverse([Term|Acc]) ++ '_', Rest}
851 | end;
852 | partial_decode_list(Rest, Acc) ->
853 | {partial, lists:reverse(Acc) ++ '_', Rest}.
854 |
%% Entry point for map decoding: reads the pair count `Sz' from the head of
%% the input and decodes that many key/value pairs into an empty map.
%% NOTE(review): the head `decode_map(<>)' is not valid Erlang; the pattern
%% binding `Sz' and `Rest' appears to be missing from this copy of the file.
855 | decode_map(<>) ->
856 | decode_map(Sz, Rest, #{}).
857 |
%% Decode `N' key/value pairs from `Bin' into the map `M'.
%% Each pair is two consecutive encoded terms: key first, then value.
%% Returns `{Map, Rest}'.
decode_map(0, Rest, M) ->
    {M, Rest};
decode_map(N, Bin, M) ->
    {Key, AfterKey} = decode_next(Bin),
    {Value, AfterValue} = decode_next(AfterKey),
    decode_map(N - 1, AfterValue, M#{Key => Value}).
864 |
865 |
%% Decode a list body. Elements are decoded one at a time with decode_next/1
%% and accumulated in reverse. A leading byte 2 terminates a proper list; a
%% leading byte 1 introduces an improper tail, which is decoded and appended
%% with `++'. Returns `{List, Rest}'.
%% NOTE(review): the second clause head reads `decode_list(<>, Acc)', which is
%% not valid Erlang; the pattern binding `Next' appears to be missing from
%% this copy of the file -- restore it from the canonical source.
866 | decode_list(Elems) ->
867 | decode_list(Elems, []).
868 |
869 | decode_list(<<2, Rest/binary>>, Acc) ->
870 | {lists:reverse(Acc), Rest};
871 | decode_list(<>, Acc) ->
872 | %% improper list, binary tail
873 | {Term, Rest} = decode_next(Next),
874 | {lists:reverse(Acc) ++ Term, Rest};
875 | decode_list(<<1, Next/binary>>, Acc) ->
876 | %% improper list, non-binary tail
877 | {Term, Rest} = decode_next(Next),
878 | {lists:reverse(Acc) ++ Term, Rest};
879 | decode_list(Elems, Acc) ->
880 | {Term, Rest} = decode_next(Elems),
881 | decode_list(Rest, [Term|Acc]).
882 |
%% Decode a pid: the node name is read with decode_binary/1, then an external
%% term format binary is rebuilt and passed to binary_to_term/1.
%% In that format 131 is the version byte, 100 is ATOM_EXT, 88 is NEW_PID_EXT
%% and 103 is PID_EXT. The second branch asserts `C =< 3' before building the
%% PID_EXT form, where `C' is written as a single byte.
%% NOTE(review): both `case' patterns read `<>', which is not valid Erlang;
%% the patterns binding `NS', `C' and `Rest1' appear to be missing from this
%% copy of the file -- restore them from the canonical source.
883 | decode_pid(Bin) ->
884 | {Name, Rest} = decode_binary(Bin),
885 | NameSz = size(Name),
886 | case Rest of
887 | <> ->
888 | {binary_to_term(<<131,88,100,NameSz:16,Name/binary,NS/binary,C/binary>>), Rest1};
889 | <> ->
890 | true = C =< 3,
891 | {binary_to_term(<<131,103,100,NameSz:16,Name/binary,NS/binary,C>>), Rest1}
892 | end.
893 |
%% Decode a port: the node name is read with decode_binary/1, then an external
%% term format binary is rebuilt and passed to binary_to_term/1.
%% In that format 131 is the version byte, 100 is ATOM_EXT, 120 is
%% V4_PORT_EXT, 89 is NEW_PORT_EXT and 102 is PORT_EXT. The first branch
%% matches an 8-byte `N' after a leading byte 16 and a 4-byte `C' after a
%% 255 marker. The last branch asserts `C =< 3' and writes `C' as one byte.
%% NOTE(review): the second and third `case' patterns read `<>', which is not
%% valid Erlang; the patterns binding `N', `C' and `Rest1' appear to be
%% missing from this copy of the file -- restore them from the canonical source.
894 | decode_port(Bin) ->
895 | {Name, Rest} = decode_binary(Bin),
896 | NameSz = size(Name),
897 | case Rest of
898 | <<16, N:8/binary, 255, C:4/binary, Rest1/binary>> ->
899 | {binary_to_term(<<131,120,100,NameSz:16,Name/binary,N/binary,C/binary>>), Rest1};
900 | <> ->
901 | {binary_to_term(<<131,89,100,NameSz:16,Name/binary,N/binary,C/binary>>), Rest1};
902 | <> ->
903 | true = C =< 3,
904 | {binary_to_term(<<131,102,100,NameSz:16,Name/binary,N/binary,C>>), Rest1}
905 | end.
906 |
%% Decode a reference: two encoded binaries are read in turn (node name, then
%% the reference payload), and an external term format binary is rebuilt and
%% passed to binary_to_term/1. In that format 131 is the version byte, 100 is
%% ATOM_EXT, 90 is NEWER_REFERENCE_EXT and 114 is NEW_REFERENCE_EXT.
%% `Len' is the number of 4-byte words in `Tail1'. The first branch expects a
%% 255 marker followed by a 4-byte `C'; the second asserts `C =< 3' and
%% writes `C' as one byte.
%% NOTE(review): the second `case' pattern reads `<>', which is not valid
%% Erlang; the pattern binding `C' and `Tail1' appears to be missing from
%% this copy of the file -- restore it from the canonical source.
907 | decode_ref(Bin) ->
908 | {Name, Rest} = decode_binary(Bin),
909 | {Tail, Rest1} = decode_binary(Rest),
910 | NLen = size(Name),
911 | case Tail of
912 | <<255, C:4/binary, Tail1/binary>> ->
913 | Len = size(Tail1) div 4,
914 | RefBin = <<131,90,Len:16,100,NLen:16,Name/binary,C/binary,Tail1/binary>>,
915 | {binary_to_term(RefBin), Rest1};
916 | <> ->
917 | true = C =< 3,
918 | Len = size(Tail1) div 4,
919 | RefBin = <<131,114,Len:16,100,NLen:16,Name/binary,C,Tail1/binary>>,
920 | {binary_to_term(RefBin), Rest1}
921 | end.
922 |
%% Decode a negative number from its stored integer part and flag.
%%
%% Flag 0 marks a float: the integer part is recovered as
%% `16#7fffFFFF - I0' and decoding continues in decode_neg_float/2.
%% Flag 1 marks an integer: the value is the stored part minus 16#7fffFFFF.
decode_neg(Stored, 0, Bin) ->
    I = 16#7fffFFFF - Stored,
    ?dbg("decode_neg()... I = ~p | Bin = ~p~n", [I, Bin]),
    decode_neg_float(I, Bin);
decode_neg(Stored, 1, Rest) ->
    Value = Stored - 16#7fffFFFF,
    {Value, Rest}.
929 |
%% Decode the fraction part of a negative float whose integer part is `I'.
%% The fraction bits are read with decode_neg_binary/1, then an IEEE 754
%% double is assembled as sign bit 1, an 11-bit exponent and a 52-bit
%% fraction.
%% First clause (`I' is 0): the leading zero bits of the fraction give the
%% exponent, `1023 - Offs'.
%% Second clause: if the fraction is all zeros the result is `0.0 - I';
%% otherwise the bits of `I' after its leading 1 are used and the exponent is
%% their count plus 1023.
%% NOTE(review): several matches below read `<>', which is not valid Erlang;
%% the bit-syntax expressions binding `F', `Ri', `Bits' and `Frac' appear to
%% be missing from this copy of the file -- restore them from the canonical
%% source.
930 | decode_neg_float(0, Bin) ->
931 | {R, Rest} = decode_neg_binary(Bin),
932 | ?dbg("Bin = ~p~n", [pp(Bin)]),
933 | ?dbg("R = ~p | Rest = ~p~n", [pp(R), Rest]),
934 | Sz = bit_size(R),
935 | Offs = Sz - 53,
936 | ?dbg("Offs = ~p | Sz - ~p~n", [Offs, Sz]),
937 | <<_:Offs, 1:1, I:52>> = R,
938 | Exp = 1023 - Offs,
939 | <> = <<1:1, Exp:11, I:52>>,
940 | {F, Rest};
941 | decode_neg_float(I, Bin) ->
942 | {R, Rest} = decode_neg_binary(Bin),
943 | ?dbg("decode_neg_float: I = ~p | R = ~p~n", [I, R]),
944 | Sz = bit_size(R),
945 | ?dbg("Sz = ~p~n", [Sz]),
946 | <> = R,
947 | ?dbg("Ri = ~p~n", [Ri]),
948 | if Ri == 0 ->
949 | %% special case
950 | {0.0-I, Rest};
951 | true ->
952 | IBits = strip_first_one(I),
953 | ?dbg("IBits = ~p~n", [pp(IBits)]),
954 | Bits = <>,
955 | ?dbg("Bits = ~p (Sz: ~p)~n", [pp(Bits), bit_size(Bits)]),
956 | Exp = bit_size(IBits) + 1023,
957 | ?dbg("Exp = ~p~n", [Exp]),
958 | <> = <>,
959 | ?dbg("Frac = ~p~n", [Frac]),
960 | <> = <<1:1, Exp:11, Frac:52>>,
961 | {F, Rest}
962 | end.
963 |
%% Decode a non-negative number from its integer part `I' and flag.
%% Flag 0: the value is the integer `I' itself.
%% Flag 1 with `I' equal to 0: a float below 1; the leading zero bits of the
%% decoded fraction give the exponent, `1023 - Offs'.
%% Flag 1 otherwise: a float above 1, assembled as sign bit 0, an 11-bit
%% exponent and a 52-bit fraction.
%% NOTE(review): the last clause calls decode_binary(Bin) twice; the first
%% call already binds `Real' and `Rest', so the `case' only re-matches them.
%% NOTE(review): several matches below read `<>', which is not valid Erlang;
%% the bit-syntax expressions binding `F', `Bits0' and `Frac' appear to be
%% missing from this copy of the file -- restore them from the canonical
%% source.
964 | decode_pos(I, 0, Rest) ->
965 | {I, Rest};
966 | decode_pos(0, 1, Bin) ->
967 | {Real, Rest} = decode_binary(Bin),
968 | Offs = bit_size(Real) - 53,
969 | <<0:Offs, 1:1, Frac:52>> = Real,
970 | Exp = 1023 - Offs,
971 | <> = <<0:1, Exp:11, Frac:52>>,
972 | {F, Rest};
973 | decode_pos(I, 1, Bin) -> % float > 1
974 | ?dbg("decode_pos(~p, 1, ~p)~n", [I, Bin]),
975 | {Real, Rest} = decode_binary(Bin),
976 | case decode_binary(Bin) of
977 | {<<>>, Rest} ->
978 | <> = <>,
979 | {F, Rest};
980 | {Real, Rest} ->
981 | ?dbg("Real = ~p~n", [Real]),
982 | Exp = 52 - bit_size(Real) + 1023,
983 | ?dbg("Exp = ~p~n", [Exp]),
984 | Bits0 = <>,
985 | ?dbg("Bits0 = ~p~n", [Bits0]),
986 | Bits = strip_one(Bits0),
987 | <> = Bits,
988 | <> = <<0:1, Exp:11, Frac:52>>,
989 | {F, Rest}
990 | end.
991 |
%% Decode a large non-negative number: the integer bytes are read with
%% decode_binary/1, passed through remove_size_bits/1 and converted to an
%% integer `I'; a flag `F' is then taken from the remaining input and the
%% result is finished by decode_pos/3.
%% NOTE(review): the matches `<> = Ib' and `<> = Rest' are not valid Erlang;
%% the patterns binding `I', `F' and `Rest1' appear to be missing from this
%% copy of the file -- restore them from the canonical source.
992 | decode_pos_big(Bin) ->
993 | ?dbg("decode_pos_big(~p)~n", [Bin]),
994 | {Ib0, Rest} = decode_binary(Bin),
995 | Ib = remove_size_bits(Ib0),
996 | ?dbg("Ib = ~p~n", [Ib]),
997 | ISz = size(Ib) * 8,
998 | ?dbg("ISz = ~p~n", [ISz]),
999 | <> = Ib,
1000 | ?dbg("I = ~p~n", [I]),
1001 | <> = Rest,
1002 | ?dbg("Rest1 = ~p~n", [Rest1]),
1003 | decode_pos(I, F, Rest1).
1004 |
%% Decode a large negative number. The word count is recovered as
%% `16#ffffFFFF - WordsAdj', the integer bytes are read with decode_binary/1
%% and remove_size_bits/1, and the magnitude is `imax(Words) - I0'.
%% A trailing flag `F' selects the result: 0 continues as a float in
%% decode_neg_float/2, 16#ff yields the integer `-I'. Any other flag value
%% raises `if_clause'.
%% NOTE(review): the matches `<> = Bin', `<> = Ib' and `<> = Rest1' are not
%% valid Erlang; the patterns binding `WordsAdj', `Rest', `I0', `F' and
%% `Rest2' appear to be missing from this copy of the file -- restore them
%% from the canonical source.
1005 | decode_neg_big(Bin) ->
1006 | ?dbg("decode_neg_big(~p)~n", [Bin]),
1007 | <> = Bin,
1008 | Words = 16#ffffFFFF - WordsAdj,
1009 | ?dbg("Words = ~p~n", [Words]),
1010 | {Ib0, Rest1} = decode_binary(Rest),
1011 | Ib = remove_size_bits(Ib0),
1012 | ?dbg("Ib = ~p | Rest1 = ~p~n", [Ib, Rest1]),
1013 | ISz = size(Ib) * 8,
1014 | <> = Ib,
1015 | ?dbg("I0 = ~p~n", [I0]),
1016 | Max = imax(Words),
1017 | ?dbg("Max = ~p~n", [Max]),
1018 | I = Max - I0,
1019 | ?dbg("I = ~p~n", [I]),
1020 | <> = Rest1,
1021 | ?dbg("F = ~p | Rest2 = ~p~n", [F, Rest2]),
1022 | if F == 0 ->
1023 | decode_neg_float(I, Rest2);
1024 | F == 16#ff ->
1025 | {-I, Rest2}
1026 | end.
1027 |
%% Return the bits of the integer `I' that follow its most significant 1 bit.
%%
%% Optimization: `I' is first written into the smallest of a few fixed widths
%% (8, 16, 24, 32 or 52 bits) so strip_one/1 does not have to loop through a
%% very large number of leading zeros.
%%
%% The argument to strip_one/1 was missing from this copy of the file (`<>');
%% it is restored as `I' written in `Sz' bits, the only use of `Sz'.
strip_first_one(I) ->
    Sz = if I < 16#ff -> 8;
            I < 16#ffff -> 16;
            I < 16#ffffff -> 24;
            I < 16#ffffffff -> 32;
            true -> 52
         end,
    strip_one(<<I:Sz>>).
1037 |
%% Drop leading 0 bits and the first 1 bit; return the bits after it.
%% Fails with `function_clause' if the bitstring contains no 1 bit.
strip_one(<<Bit:1, Rest/bitstring>>) ->
    case Bit of
        0 -> strip_one(Rest);
        1 -> Rest
    end.
1040 |
1041 |
%% Decode an encoded binary/bitstring from the head of the input.
%% Returns `{Decoded, Rest}'.
%%
%% A single byte 8 stands for the empty binary. Otherwise the data is a run
%% of 9-bit groups, each a 1 flag bit followed by a data byte. The last group
%% is the one not followed by another 1 flag; after it come zero bits up to
%% the byte boundary and one byte (`EndBits') giving how many leading bits of
%% the last data byte belong to the result.
%%
%% The match that splits the last data byte was missing from this copy of the
%% file (`<> = <>'); it is restored from the surrounding variables: `Tail' is
%% taken from the top `EndBits' bits of `H', and the low `TailPad' bits must
%% be zero.
decode_binary(<<8, Rest/binary>>) -> {<<>>, Rest};
decode_binary(B) -> decode_binary(B, 0, <<>>).

%% `N' counts the bits consumed so far, `Acc' holds the decoded bytes.
decode_binary(<<1:1,H:8,Rest/bitstring>>, N, Acc) ->
    case Rest of
        <<1:1,_/bitstring>> ->
            decode_binary(Rest, N+9, << Acc/binary, H >>);
        _ ->
            Pad = 8 - ((N+9) rem 8),
            <<0:Pad,EndBits,Rest1/binary>> = Rest,
            TailPad = 8-EndBits,
            <<Tail:EndBits,0:TailPad>> = <<H>>,
            {<< Acc/binary, Tail:EndBits >>, Rest1}
    end.
1056 |
%% Decode a binary/bitstring stored in inverted form. Returns
%% `{Decoded, Rest}'. A single byte 247 (16#ff - 8) stands for the empty
%% binary. Otherwise the data is a run of 9-bit groups, each a 0 flag bit
%% followed by an inverted byte (`16#ff - H' recovers the original). The last
%% group is the one not followed by another 0 flag; `EndBits', recovered as
%% `16#ff - EndBits0', tells how many bits of the last byte belong to the
%% result, with 0 meaning the whole byte.
%% NOTE(review): the matches `<> = Rest' and `<> = <<(16#ff - H)>>' are not
%% valid Erlang; the patterns binding `EndBits0', `Rest1' and `Tail' appear to
%% be missing from this copy of the file -- restore them from the canonical
%% source.
1057 | decode_neg_binary(<<247, Rest/binary>>) -> {<<>>, Rest}; % 16#ff - 8
1058 | decode_neg_binary(B) -> decode_neg_binary(B, 0, <<>>).
1059 |
1060 | decode_neg_binary(<<0:1,H:8,Rest/bitstring>>, N, Acc) ->
1061 | case Rest of
1062 | <<0:1,_/bitstring>> ->
1063 | decode_neg_binary(Rest, N+9, << Acc/binary, (16#ff - H) >>);
1064 | _ ->
1065 | Pad = 8 - ((N+9) rem 8),
1066 | ?dbg("Pad = ~p~n", [Pad]),
1067 | IPad = (1 bsl Pad) - 1,
1068 | <> = Rest,
1069 | ?dbg("EndBits0 = ~p~n", [EndBits0]),
1070 | EndBits = 16#ff - EndBits0,
1071 | ?dbg("EndBits = ~p~n", [EndBits]),
1072 | if EndBits == 0 ->
1073 | {<< Acc/binary, (16#ff - H)>>, Rest1};
1074 | true ->
1075 | <> = <<(16#ff - H)>>,
1076 | ?dbg("Tail = ~p~n", [Tail]),
1077 | {<< Acc/binary, Tail:EndBits >>, Rest1}
1078 | end
1079 | end.
1080 |
%% Largest unsigned integer representable in `Sz' bits, i.e. 2^Sz - 1.
max_value(Sz) ->
    Limit = 1 bsl Sz,
    Limit - 1.
1084 |
%% Largest unsigned integer that fits in `Words' 64-bit words,
%% i.e. 2^(Words*64) - 1.
imax(Words) ->
    (1 bsl (Words * 64)) - 1.
1089 |
%% Find the smallest word count `W' whose maximum value is not below `I'.
%% Returns `{W, Max}'. The search starts at one word (imax(1)); each step
%% shifts the current maximum left by 64 bits and ORs in ?IMAX1.
get_max(I) ->
    get_max(I, 1, imax(1)).

get_max(I, Words, Max) ->
    if I > Max ->
            get_max(I, Words + 1, (Max bsl 64) bor ?IMAX1);
       true ->
            {Words, Max}
    end.
1096 |
%% @spec to_sb32(Bits::bitstring()) -> binary()
%% @doc Converts a bitstring into an sb-encoded bitstring
%%
%% sb32 (Sortable base32) is a variant of RFC3548, slightly rearranged to
%% preserve the lexical sorting properties. Base32 was chosen to avoid
%% filename-unfriendly characters. Also important is that the padding
%% character be less than any character in the alphabet
%%
%% sb32 alphabet:
%% <pre>
%%  0 0     6 6    12 C    18 I    24 O    30 U
%%  1 1     7 7    13 D    19 J    25 P    31 V
%%  2 2     8 8    14 E    20 K    26 Q  (pad) -
%%  3 3     9 9    15 F    21 L    27 R
%%  4 4    10 A    16 G    22 M    28 S
%%  5 5    11 B    17 H    23 N    29 T
%% </pre>
%% @end
%%
%% The input is split into 5-bit groups, each mapped through c2sb32/1. If the
%% bit count is not a multiple of 5, the leftover bits are emitted as one more
%% character followed by the padding returned by sb32_encode_chunks/3.
%%
%% The generator pattern of the comprehension was missing from this copy of
%% the file (`<> <= Chunk'); it is restored as one 5-bit group per character.
to_sb32(Bits) when is_bitstring(Bits) ->
    Sz = bit_size(Bits),
    {Chunk, Rest, Pad} =
        case Sz rem 5 of
            0 -> {Bits, <<>>, <<>>};
            R -> sb32_encode_chunks(Sz, R, Bits)
        end,
    Enc = << << (c2sb32(C1)) >> ||
              <<C1:5>> <= Chunk >>,
    if Rest == << >> ->
            Enc;
       true ->
            << Enc/bitstring, (c2sb32(Rest)):8, Pad/binary >>
    end.
1130 |
%% Split `Bits' (of `Sz' bits) into the part that divides evenly into 5-bit
%% groups and the `Rem' leftover bits. Returns `{Whole, Leftover, Pad}' where
%% `Leftover' is the leftover bits as an integer and `Pad' is the padding
%% string chosen by encode_pad/1.
sb32_encode_chunks(Sz, Rem, Bits) ->
    WholeBits = Sz - Rem,
    <<Whole:WholeBits/bitstring, Leftover:Rem>> = Bits,
    {Whole, Leftover, encode_pad(Rem)}.
1136 |
%% Padding string appended after the last sb32 character, selected by the
%% number of leftover bits (1..4). from_sb32/1 uses the padding length to
%% recover that number.
encode_pad(1) -> <<"----">>;
encode_pad(2) -> <<"-">>;
encode_pad(3) -> <<"------">>;
encode_pad(4) -> <<"---">>.
1141 |
%% @spec from_sb32(Bits::bitstring()) -> bitstring()
%% @doc Converts from an sb32-encoded bitstring into a 'normal' bitstring
%%
%% This function is the reverse of {@link to_sb32/1}.
%% @end
%%
%% Each character contributes 5 bits. A final character followed by padding
%% contributes only 3, 1, 4 or 2 bits, depending on the padding length
%% (6, 4, 3 or 1 dashes respectively).
from_sb32(<<>>) ->
    <<>>;
from_sb32(<<Last:8, "------">>) -> <<(sb322c(Last)):3>>;
from_sb32(<<Last:8, "----">>)   -> <<(sb322c(Last)):1>>;
from_sb32(<<Last:8, "---">>)    -> <<(sb322c(Last)):4>>;
from_sb32(<<Last:8, "-">>)      -> <<(sb322c(Last)):2>>;
from_sb32(<<Char:8, More/bitstring>>) ->
    Value = sb322c(Char),
    Decoded = from_sb32(More),
    <<Value:5, Decoded/bitstring>>.
1156 |
%% Map a 5-bit value (0..31) to its sb32 character:
%% 0..9 -> $0..$9, 10..31 -> $A..$V.
c2sb32(V) when V >= 0, V =< 9 -> V + $0;
c2sb32(V) when V >= 10, V =< 31 -> V + $A - 10.
1159 |
%% Map an sb32 character back to its value:
%% $0..$9 -> 0..9, $A..$V -> 10..31.
sb322c(Ch) when Ch >= $0, Ch =< $9 -> Ch - $0;
sb322c(Ch) when Ch >= $A, Ch =< $V -> Ch - $A + 10.
1162 |
%% @spec to_hex(Bin::binary()) -> binary()
%% @doc Converts a binary into a hex-encoded binary
%% This is conventional hex encoding, with the proviso that
%% only capital letters are used, e.g. `0..9A..F'.
%% @end
%%
%% The generator pattern was missing from this copy of the file (`<> <= Bin');
%% it is restored as one 4-bit nibble per output character, which is what
%% nib2hex/1 accepts.
to_hex(Bin) ->
    << << (nib2hex(N)):8 >> || <<N:4>> <= Bin >>.
1170 |
%% @spec from_hex(Bin::binary()) -> binary()
%% @doc Converts from a hex-encoded binary into a 'normal' binary
%%
%% This function is the reverse of {@link to_hex/1}.
%% @end
%%
%% The generator pattern was missing from this copy of the file (`<> <= Bin');
%% it is restored as one 8-bit character per 4-bit nibble, which is what
%% hex2nib/1 accepts.
from_hex(Bin) ->
    << << (hex2nib(H)):4 >> || <<H:8>> <= Bin >>.
1178 |
%% Map a nibble (0..15) to its upper-case hex character.
nib2hex(Nib) when Nib >= 0, Nib =< 9 -> Nib + $0;
nib2hex(Nib) when Nib >= 10, Nib =< 15 -> Nib + $A - 10.
1181 |
%% Map an upper-case hex character ($0..$9, $A..$F) to its nibble value.
hex2nib(Ch) when Ch >= $0, Ch =< $9 -> Ch - $0;
hex2nib(Ch) when Ch >= $A, Ch =< $F -> Ch - $A + 10.
1184 |
1185 | -ifdef(TEST).
1186 | -include_lib("eunit/include/eunit.hrl").
1187 |
%% Round-trip every term of test_list/0 through encode/1 and decode/1.
%%
%% Uses ?assertEqual instead of an old-style `catch' inside a match, so a
%% crash in encode/decode surfaces as the original exception with its
%% stacktrace, and a mismatch reports both expected and actual values.
encode_test() ->
    lists:foreach(
      fun(Term) ->
              ?assertEqual(Term, decode(encode(Term)))
      end,
      test_list()).
1191 |
%% Sample terms for encode_test/0, grouped by type. The order of the result
%% is: numbers, atoms, tuples, binaries/bitstrings, lists, pids, references,
%% and up to two of the currently open ports.
test_list() ->
    Numbers = [-456453453477456464.45456,
               -5.23423564,
               -1.234234,
               -1.23423,
               -0.345,
               -0.34567,
               -0.0034567,
               0,
               0.00012345,
               0.12345,
               1.2345,
               123.45,
               456453453477456464.45456],
    Atoms = [a, aaa],
    Tuples = [{}, {1}, {1,2}, {"","123"}, {"1","234"}],
    Bins = [<<>>, <<1>>, <<1,5:3>>, <<1,5:4>>],
    Lists = [[1,2,3], []],
    Pids = [self(), spawn(fun() -> ok end)],
    Refs = [make_ref(), make_ref()],
    Ports = lists:sublist(erlang:ports(), 1, 2),
    lists:append([Numbers, Atoms, Tuples, Bins, Lists, Pids, Refs, Ports]).
1224 |
1225 | -endif.
1226 |
--------------------------------------------------------------------------------
/test/sext_eqc.erl:
--------------------------------------------------------------------------------
1 | %% -*- erlang-indent-level: 4; indent-tabs-mode: nil -*-
2 | %%==============================================================================
3 | %% Copyright 2014-16 Ulf Wiger
4 | %%
5 | %% Licensed under the Apache License, Version 2.0 (the "License");
6 | %% you may not use this file except in compliance with the License.
7 | %% You may obtain a copy of the License at
8 | %%
9 | %% http://www.apache.org/licenses/LICENSE-2.0
10 | %%
11 | %% Unless required by applicable law or agreed to in writing, software
12 | %% distributed under the License is distributed on an "AS IS" BASIS,
13 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | %% See the License for the specific language governing permissions and
15 | %% limitations under the License.
16 | %%==============================================================================
17 |
18 | -module(sext_eqc).
19 |
20 | %% Prefer QuickCheck, but otherwise try with Proper (some properties will
21 | %% have trouble under Proper - feel free to investigate).
22 | -ifdef(EQC).
23 | -undef(QC).
24 | -define(QC,eqc).
25 | -include_lib("eqc/include/eqc.hrl").
26 | -else.
27 | -ifdef(PROPER).
28 | -undef(QC).
29 | -define(QC,proper).
30 | -include_lib("proper/include/proper.hrl").
31 | -endif.
32 | -endif.
33 |
34 | -ifdef(QC).
35 | -compile(export_all).
36 | -include_lib("eunit/include/eunit.hrl").
37 |
%% Number of test cases to run: the integer value of the SEXT_TESTS
%% environment variable when it is set, otherwise `Default'.
get_n(Default) ->
    get_n(os:getenv("SEXT_TESTS"), Default).

get_n(false, Default) -> Default;
get_n(Value, _Default) -> list_to_integer(Value).
44 |
%% EUnit generator: runs each listed property with get_n(500) test cases
%% (overridable through SEXT_TESTS) under a `{timeout, 60, ...}' wrapper.
sext_test_() ->
    N = get_n(500),
    Props =
        [{prop_encode, fun prop_encode/0},
         {prop_encode_rev, fun prop_encode_rev/0},
         {prop_decode_legacy_big, fun prop_decode_legacy_big/0},
         {prop_decode_legacy_neg_big, fun prop_decode_legacy_neg_big/0},
         {prop_prefix_equiv, fun prop_prefix_equiv/0},
         {prop_sort, fun prop_sort/0},
         {prop_revsort, fun prop_revsort/0},
         {prop_sort_big, fun prop_sort_big/0},
         {prop_sort_neg_big, fun prop_sort_neg_big/0},
         {prop_revsort_neg_big, fun prop_revsort_neg_big/0},
         {prop_encode_sb32, fun prop_encode_sb32/0},
         {prop_sort_sb32, fun prop_sort_sb32/0},
         {prop_partial_decode1, fun prop_partial_decode1/0},
         {prop_partial_decode2, fun prop_partial_decode2/0},
         {prop_partial_decode_plus1, fun prop_partial_decode_plus1/0},
         {prop_partial_decode_plus2, fun prop_partial_decode_plus2/0},
         {prop_is_prefix1, fun prop_is_prefix1/0},
         {prop_is_prefix2, fun prop_is_prefix2/0},
         {prop_encode_hex, fun prop_encode_hex/0},
         {prop_sort_hex, fun prop_sort_hex/0},
         {prop_is_prefix_hex1, fun prop_is_prefix_hex1/0},
         {prop_is_prefix_hex2, fun prop_is_prefix_hex2/0},
         {prop_non_proper_sorts, fun prop_non_proper_sorts/0}],
    {timeout, 60,
     [fun() -> t(run(N, Lbl, Prop)) end || {Lbl, Prop} <- Props]}.
75 |
%% Assert that a property run succeeded. Accepts either the `{Label, Result}'
%% pair returned by run/3 or a bare result.
t({_Lbl, Res}) ->
    t_assert(Res);
t(Res) ->
    t_assert(Res).

t_assert(Res) ->
    ?assert(Res == true).
80 |
%% Run the standard property set with the default number of test cases.
run() ->
    Num = good_number_of_tests(),
    run(Num).
83 |
%% Default number of test cases for run/0: 2000, unless overridden through
%% the SEXT_TESTS environment variable (see get_n/1).
good_number_of_tests() ->
    Default = 2000,
    get_n(Default).
86 |
%% Run each property of the standard set with `Num' test cases, in the order
%% listed. Returns the list of `{Label, Result}' pairs produced by run/3.
run(Num) ->
    Props =
        [{prop_encode, fun prop_encode/0},
         {prop_decode_legacy_big, fun prop_decode_legacy_big/0},
         {prop_decode_legacy_neg_big, fun prop_decode_legacy_neg_big/0},
         {prop_prefix_equiv, fun prop_prefix_equiv/0},
         {prop_sort, fun prop_sort/0},
         {prop_sort_big, fun prop_sort_big/0},
         {prop_sort_neg_big, fun prop_sort_neg_big/0},
         {prop_encode_sb32, fun prop_encode_sb32/0},
         {prop_sort_sb32, fun prop_sort_sb32/0},
         {prop_partial_decode1, fun prop_partial_decode1/0},
         {prop_partial_decode2, fun prop_partial_decode2/0},
         {prop_partial_decode_plus1, fun prop_partial_decode_plus1/0},
         {prop_partial_decode_plus2, fun prop_partial_decode_plus2/0},
         {prop_is_prefix1, fun prop_is_prefix1/0},
         {prop_is_prefix2, fun prop_is_prefix2/0},
         {prop_non_proper_sorts, fun prop_non_proper_sorts/0}],
    [run(Num, Lbl, Prop) || {Lbl, Prop} <- Props].
107 |
%% Run one property: print a progress line to `user', build the property by
%% calling `F()', check it with `Num' test cases through the selected
%% QuickCheck module (?QC), print the outcome and return `{Lbl, Result}'.
run(Num, Lbl, F) ->
    io:format(user, "EQC test: ~p (~p)... ", [Lbl, Num]),
    Prop = F(),
    Limited = ?QC:numtests(Num, Prop),
    Res = ?QC:quickcheck(Limited),
    io:format(user, "-> ~p~n", [Res]),
    {Lbl, Res}.
113 |
114 |
%% Encoded terms must sort like the terms themselves.
%%
%% The expected ordering comes from comp_i/2, which (through comp/2) gives
%% an integer and a float that compare equal in Erlang, such as 1 and 1.0, a
%% strict order. The encoded forms of such a pair differ, so the comparison
%% of the encodings is strict as well.
prop_sort() ->
    ?FORALL({T1,T2}, {term_(), term_()},
            begin
                X1 = sext:encode(T1),
                X2 = sext:encode(T2),
                Bytes = byte_size(term_to_binary({T1,T2})),
                collect(Bytes, comp(X1,X2) == comp_i(T1,T2))
            end).
128 |
%% Reversed encodings must sort in the opposite order of the terms.
prop_revsort() ->
    ?FORALL({T1,T2}, {term_(), term_()},
            begin
                X1 = sext:reverse_sext(sext:encode(T1)),
                X2 = sext:reverse_sext(sext:encode(T2)),
                Bytes = byte_size(term_to_binary({T1,T2})),
                collect(Bytes, comp(X1,X2) == comp_i(T2,T1))
            end).
137 |
%% Encoded big positive integers must sort like the integers.
prop_sort_big() ->
    ?FORALL({T1,T2}, {big(), big()},
            begin
                X1 = sext:encode(T1),
                X2 = sext:encode(T2),
                Bytes = byte_size(term_to_binary({T1,T2})),
                collect(Bytes, comp(X1,X2) == comp_i(T1,T2))
            end).
145 |
%% Encoded big negative integers must sort like the integers.
prop_sort_neg_big() ->
    ?FORALL({T1,T2}, {neg_big(), neg_big()},
            begin
                X1 = sext:encode(T1),
                X2 = sext:encode(T2),
                Bytes = byte_size(term_to_binary({T1,T2})),
                collect(Bytes, comp(X1,X2) == comp_i(T1,T2))
            end).
153 |
%% Reversed encodings of big negative integers must sort in the opposite
%% order of the integers.
prop_revsort_neg_big() ->
    ?FORALL({T1,T2}, {neg_big(), neg_big()},
            begin
                X1 = sext:reverse_sext(sext:encode(T1)),
                X2 = sext:reverse_sext(sext:encode(T2)),
                Bytes = byte_size(term_to_binary({T1,T2})),
                collect(Bytes, comp(X1,X2) == comp_i(T2,T1))
            end).
162 |
%% sb32-encoded terms must sort like the terms themselves.
prop_sort_sb32() ->
    ?FORALL({T1,T2}, {term_(), term_()},
            begin
                X1 = sext:encode_sb32(T1),
                X2 = sext:encode_sb32(T2),
                Bytes = byte_size(term_to_binary({T1,T2})),
                collect(Bytes, comp(X1,X2) == comp_i(T1,T2))
            end).
170 |
%% Hex-encoded terms must sort like the terms themselves.
prop_sort_hex() ->
    ?FORALL({T1,T2}, {term_(), term_()},
            begin
                X1 = sext:encode_hex(T1),
                X2 = sext:encode_hex(T2),
                Bytes = byte_size(term_to_binary({T1,T2})),
                collect(Bytes, comp(X1,X2) == comp_i(T1,T2))
            end).
178 |
179 |
%% Encoded positive floats must sort like the floats.
prop_sort_fs() ->
    ?FORALL({R1,R2}, {pos_float(),pos_float()},
            begin
                B1 = sext:encode(R1),
                B2 = sext:encode(R2),
                comp(R1,R2) == comp(B1,B2)
            end).
186 |
%% Encoded negative floats must sort like the floats.
prop_sort_neg_fs() ->
    ?FORALL({R1,R2}, {neg_float(), neg_float()},
            begin
                B1 = sext:encode(R1),
                B2 = sext:encode(R2),
                comp(R1,R2) == comp(B1,B2)
            end).
193 |
%% Round trip: decoding an encoded term gives the term back.
prop_encode() ->
    ?FORALL(T, term_(),
            begin
                Enc = sext:encode(T),
                sext:decode(Enc) == T
            end).
197 |
%% Round trip through the reversed form: decoding a reversed encoding yields
%% the plain encoding, and decoding that yields the term.
prop_encode_rev() ->
    ?FORALL(T, term_(),
            begin
                Rev = sext:reverse_sext(sext:encode(T)),
                Enc = sext:decode(Rev),
                sext:decode(Enc) == T
            end).
202 |
%% Big positive integers encoded with the second argument of sext:encode/2
%% set to `true' must still decode to the original value.
prop_decode_legacy_big() ->
    ?FORALL(T, big(),
            begin
                Enc = sext:encode(T, true),
                sext:decode(Enc) == T
            end).
206 |
%% Big negative integers encoded with the second argument of sext:encode/2
%% set to `true' must still decode to the original value.
prop_decode_legacy_neg_big() ->
    ?FORALL(T, neg_big(),
            begin
                Enc = sext:encode(T, true),
                sext:decode(Enc) == T
            end).
210 |
%% Round trip through the sb32 encoding.
prop_encode_sb32() ->
    ?FORALL(T, term_(),
            begin
                Enc = sext:encode_sb32(T),
                sext:decode_sb32(Enc) == T
            end).
214 |
%% Round trip through the hex encoding.
prop_encode_hex() ->
    ?FORALL(T, term_(),
            begin
                Enc = sext:encode_hex(T),
                sext:decode_hex(Enc) == T
            end).
218 |
%% For the generated terms (no wildcards), the prefix equals the encoding.
prop_prefix_equiv() ->
    ?FORALL(T, term_(),
            begin
                Enc = sext:encode(T),
                Enc == sext:prefix(T)
            end).
222 |
%% Partially decoding a complete encoding must report `full', return the
%% original term and leave no remainder.
prop_partial_decode1() ->
    ?FORALL(T, term_(),
            begin
                {full, Dec, Rest} = sext:partial_decode(sext:encode(T)),
                (Dec == T) andalso (Rest == <<>>)
            end).
231 |
%% Partially decoding a prefix must give back a pattern that is comparable
%% to the original one (see comp_pat/2) and leave no remainder. A `full'
%% result is accepted as is.
prop_partial_decode2() ->
    ?FORALL(Pat, wild_pat(),
            begin
                Result = sext:partial_decode(sext:prefix(Pat)),
                case Result of
                    {partial, Dec, Rest} ->
                        comp_pat(Dec, Pat) andalso Rest == <<>>;
                    {full, _, _} ->
                        true
                end
            end).
243 |
%% A sext term followed by something not sext-encoded: the term must decode
%% in full and the trailing bytes must be returned untouched.
%%
%% The argument to sext:partial_decode/1 was missing from this copy of the
%% file (`<>'); it is restored as the encoding followed by "foo", matching
%% the remainder the property expects.
prop_partial_decode_plus1() ->
    ?FORALL(T, term_(),
            begin
                Enc = sext:encode(T),
                {full, Dec, <<"foo">>} =
                    sext:partial_decode(<<Enc/binary, "foo">>),
                Dec == T
            end).
253 |
%% A sext prefix followed by something not sext-encoded: the decoded pattern
%% must equal (full) or be comparable to (partial) the original pattern, and
%% the trailing bytes must be returned untouched.
%%
%% The argument to sext:partial_decode/1 was missing from this copy of the
%% file (`<>'); it is restored as the prefix followed by "foo", matching the
%% remainder both branches expect.
prop_partial_decode_plus2() ->
    ?FORALL(Pat, wild_pat(),
            begin
                Pfx = sext:prefix(Pat),
                case sext:partial_decode(<<Pfx/binary, "foo">>) of
                    {full, Dec, <<"foo">>} ->
                        Dec == Pat;
                    {partial, Dec, <<"foo">>} ->
                        comp_pat(Dec, Pat)
                end
            end).
266 |
%% Generator for match patterns: take a prefixable term with at least one
%% position and replace one randomly chosen position with a wildcard.
wild_pat() ->
    Term = ?SUCHTHAT(Tp, prefixable_term(), positions(Tp) > 0),
    ?LET({T,W}, {Term, wild()},
         ?LET(P, choose(1, positions(T)),
              make_wild(T, P, W))).
272 |
%% Is the decoded pattern (first argument) compatible with the original
%% prefix pattern (second argument)?
%%
%% Identical terms match. Tuples of equal size and lists are compared
%% element-wise by comp_pat_l/2. Otherwise the decoded side must be a
%% wildcard, and the original must be either a wildcard or a list whose head
%% is a wildcard (the decoded prefix of both [] and ['_'|'_'] is '_').
comp_pat(Same, Same) ->
    true;
comp_pat(A, B) when is_tuple(A), is_tuple(B),
                    tuple_size(A) =:= tuple_size(B) ->
    comp_pat_l(tuple_to_list(A), tuple_to_list(B));
comp_pat(Dec, Pat) when is_list(Dec), is_list(Pat) ->
    comp_pat_l(Dec, Pat);
comp_pat(Dec, Pat) ->
    is_wild(Dec) andalso (is_wild(Pat) orelse comp_pat_wild_head(Pat)).

comp_pat_wild_head([H|_]) -> is_wild(H);
comp_pat_wild_head(_) -> false.
292 |
%% Element-wise pattern comparison. A wildcard on the decoded side matches
%% everything from that point on; otherwise heads must be compatible and the
%% comparison continues with the tails. When the shapes differ, the decoded
%% remainder must itself be a wildcard.
comp_pat_l([H1|T1], [H2|T2]) ->
    is_wild(H1) orelse (comp_pat(H1, H2) andalso comp_pat_l(T1, T2));
comp_pat_l([], []) ->
    true;
comp_pat_l(DecRest, _) ->
    is_wild(DecRest).
305 |
306 |
%% The prefix of a term with one position wildcarded must be a byte prefix
%% of the term's full encoding.
prop_is_prefix1() ->
    Term = ?SUCHTHAT(Tp, prefixable_term(), positions(Tp) > 0),
    ?FORALL({T,W}, {Term, wild()},
            ?LET(P, choose(1, positions(T)),
                 begin
                     Pat = make_wild(T,P,W),
                     Pfx = sext:prefix(Pat),
                     true = is_prefix(Pfx, sext:encode(T))
                 end)).
315 |
%% Wildcarding an earlier position must give a byte prefix of the prefix
%% obtained by wildcarding the next position.
prop_is_prefix2() ->
    Term = ?SUCHTHAT(Tp, prefixable_term(), positions(Tp) > 2),
    ?FORALL({T,W}, {Term, wild()},
            ?LET(P, choose(2, positions(T)),
                 begin
                     Pfx1 = sext:prefix(make_wild(T,P,W)),
                     Pfx2 = sext:prefix(make_wild(T,P-1,W)),
                     true = is_prefix(Pfx2, Pfx1)
                 end)).
325 |
%% Hex variant of prop_is_prefix1/0: the hex prefix of a wildcarded term
%% must be a byte prefix of the term's hex encoding.
prop_is_prefix_hex1() ->
    Term = ?SUCHTHAT(Tp, prefixable_term(), positions(Tp) > 0),
    ?FORALL({T,W}, {Term, wild()},
            ?LET(P, choose(1, positions(T)),
                 begin
                     Pat = make_wild(T,P,W),
                     Pfx = sext:prefix_hex(Pat),
                     true = is_prefix(Pfx, sext:encode_hex(T))
                 end)).
334 |
%% Hex variant of prop_is_prefix2/0.
prop_is_prefix_hex2() ->
    Term = ?SUCHTHAT(Tp, prefixable_term(), positions(Tp) > 2),
    ?FORALL({T,W}, {Term, wild()},
            ?LET(P, choose(2, positions(T)),
                 begin
                     Pfx1 = sext:prefix_hex(make_wild(T,P,W)),
                     Pfx2 = sext:prefix_hex(make_wild(T,P-1,W)),
                     true = is_prefix(Pfx2, Pfx1)
                 end)).
344 |
%% A list, the same list with an improper tail, and the same list with one
%% more element must sort the same way before and after encoding.
prop_non_proper_sorts() ->
    ?FORALL({L,T}, {non_empty_list(), simple_term()},
            begin
                Tagged = [{L, 1},
                          {L ++ T, 2},
                          {L ++ [T], 3}],
                Encoded = [{sext:encode(Term), Tag} || {Term, Tag} <- Tagged],
                ByTerm = lists:keysort(1, Tagged),
                ByEncoding = lists:keysort(1, Encoded),
                Tags = fun(Pairs) -> [Tag || {_, Tag} <- Pairs] end,
                Tags(ByTerm) == Tags(ByEncoding)
            end).
357 |
%% Round trip for negative floats.
prop_encode_neg_fs() ->
    ?FORALL(T, neg_float(),
            begin
                Enc = sext:encode(T),
                sext:decode(Enc) == T
            end).
361 |
%% Round trip for big positive integers.
prop_encode_big() ->
    ?FORALL(T, big(),
            begin
                Enc = sext:encode(T),
                sext:decode(Enc) == T
            end).
365 |
%% Round trip for big negative integers.
prop_encode_neg_big() ->
    ?FORALL(T, neg_big(),
            begin
                Enc = sext:encode(T),
                sext:decode(Enc) == T
            end).
369 |
370 |
%% Three-way comparison returning `less', `equal' or `more'.
%%
%% Terms that compare equal with `==' but are not identical (e.g. 1 and 1.0)
%% get a strict order: for non-negative values the float counts as the larger
%% one, for negative values as the smaller one.
comp(A, B) when A == B, A =/= B ->
    %% Pick the side whose float-ness decides that A is "more".
    Decider = if A < 0 -> B;
                 true -> A
              end,
    if is_float(Decider) -> more;
       true -> less
    end;
comp(A, B) when A < B -> less;
comp(A, B) when A =:= B -> equal;
comp(_, _) -> more.
385 |
%% Structural three-way comparison: tuples of equal size and lists are
%% compared element by element (comp_l/2); everything else goes to comp/2.
comp_i(A, B) ->
    if is_tuple(A), is_tuple(B), tuple_size(A) == tuple_size(B) ->
            comp_l(tuple_to_list(A), tuple_to_list(B));
       is_list(A), is_list(B) ->
            comp_l(A, B);
       true ->
            comp(A, B)
    end.
393 |
%% Three-way comparison of two lists. Heads are compared with comp/2 and the
%% first difference decides; a list that runs out first is `less'. If either
%% side turns out to be an improper tail, the tails are compared by comp_i/2.
comp_l([Ha|Ta], [Hb|Tb]) ->
    case comp(Ha, Hb) of
        equal -> comp_l(Ta, Tb);
        Decided -> Decided
    end;
comp_l([], []) -> equal;
comp_l([], [_|_]) -> less;
comp_l([_|_], []) -> more;
comp_l(TailA, TailB) ->
    comp_i(TailA, TailB).
406 |
%% True when the binary `A' is a byte prefix of the binary `B', i.e. their
%% longest common prefix covers all of `A'.
is_prefix(A, B) ->
    byte_size(A) == binary:longest_common_prefix([A, B]).
410 |
%% Always-true property that records the external-format size of generated
%% terms under the label `term_size'.
prop_measure_term() ->
    ?FORALL(T, term_(),
            begin
                Bytes = byte_size(term_to_binary(T)),
                measure(term_size, Bytes, true)
            end).
414 |
%% Generator for one non-recursive term, picked from simple_types/0.
simple_term() ->
    Types = simple_types(),
    oneof(Types).
417 |
%% Generator for arbitrary terms, scaled by the current generation size.
term_() ->
    ?SIZED(Sz, term(Sz)).
420 |
%% Size-bounded term generator. At size 0 only simple terms are produced;
%% otherwise lists, improper lists, tuples and strings are possible too.
%% ?LAZY is required because the generator is recursive.
term(0) ->
    simple_term();
term(Size) ->
    ?LAZY(begin
              Simple = simple_types(),
              %% Containers are not made exactly Size long.
              Compound = [alist(Size),
                          non_proper_list(Size),
                          atuple(Size),
                          astring(Size)],
              oneof(Simple ++ Compound)
          end).
433 |
%% Generators for the non-recursive term types: small and big integers,
%% positive and negative floats, atoms, binaries and bitstrings.
simple_types() ->
    Numbers = [int(), big(), pos_float(), neg_float()],
    Others = [anatom(), abin(), abitstr()],
    Numbers ++ Others.
442 |
%% Generator for large positive integers: at least 16#ffffFFFF, multiplied
%% by the cube of a positive number to make the values big enough
%% (verified with `eqc_gen:sample/1').
big() ->
    ?LET({X,M}, {nat(), pos()},
         begin
             Base = 16#ffffFFFF + X,
             Cube = M * M * M,
             Base * Cube
         end).
449 |
%% Generator for large negative integers: the negation of big/0.
neg_big() ->
    ?LET(Big, big(), -Big).
452 |
%% Generator for strictly positive naturals.
pos() ->
    ?SUCHTHAT(Nat, nat(), Nat > 0).
455 |
%% List generators. alist/0 uses the current generation size; alist/1 sets
%% the size just for list generation, with elements generated at a third of
%% that size.
alist() ->
    ?SIZED(Sz, alist(Sz)).

alist(Size) ->
    Elem = term(Size div 3),
    list(Size, Elem).
463 |
%% Generator for improper lists: a generated list passed through
%% make_non_proper/1.
non_proper_list(Size) ->
    ?LET(Proper, alist(Size), make_non_proper(Proper)).
466 |
%% List generator whose length is governed by `Size', while the elements
%% `G' keep the generation size that was current on entry.
list(Size, G) ->
    ?SIZED(Outer,
           begin
               Elem = resize(Outer, G),
               resize(Size, list(Elem))
           end).
469 |
%% Generator for tuples: a generated list converted to a tuple.
atuple(Size) ->
    ?LET(Elems, alist(Size), list_to_tuple(Elems)).
472 |
%% Generator picking one atom from a small fixed set.
anatom() ->
    Atoms = [a, b, c, aa, bb, cc],
    oneof(Atoms).
475 |
%% Generator for strings of characters in the range $A..$z; the empty
%% string at size 0.
astring(0) ->
    [];
astring(Size) ->
    Char = choose($A, $z),
    list(Size, Char).
479 |
%% Generator for binaries: a list of random bytes converted to a binary.
abin() ->
    Byte = choose(0, 255),
    ?LET(Bytes, list(Byte), list_to_binary(Bytes)).
482 |
%% Generator for bitstrings: a random binary followed by 0..7 extra bits.
%% `N' is drawn from the values that fit in `Sz' bits.
%%
%% The result expression was missing from this copy of the file (`<>'); it is
%% restored from the three bound variables as `Bin' followed by `N' in `Sz'
%% bits.
abitstr() ->
    ?LET({Bin, Sz}, {abin(), choose(0, 7)},
         ?LET(N, choose(0, 16#ff bsr (8-Sz)),
              <<Bin/binary, N:Sz>>)).
487 |
%% Generator for strictly positive floats, passed through norm/1.
pos_float() ->
    Positive = ?SUCHTHAT(R, real(), R > 0 andalso is_float(R)),
    ?LET(F, Positive, norm(F)).
491 |
%% Generator for strictly negative floats, passed through norm/1.
neg_float() ->
    Negative = ?SUCHTHAT(R, real(), R < 0 andalso is_float(R)),
    ?LET(F, Negative, norm(F)).
495 |
%% Normalize a float by writing it as a 64-bit IEEE binary and reading it
%% back.
%%
%% The match was missing from this copy of the file (`<> = <>'); it is
%% restored as a float-to-binary round trip binding `G' from `F'.
%% NOTE(review): inferred from the variable names and the `is_float' guard --
%% confirm against the canonical source.
norm(F) when is_float(F) ->
    <<G/float>> = <<F/float>>,
    G.
499 |
%% Turn a proper list of two or more elements into an improper one by making
%% its last element the tail. Lists with zero or one element are returned
%% unchanged.
make_non_proper([]) ->
    [];
make_non_proper([Only]) ->
    [Only];
make_non_proper([Head, Last]) ->
    [Head | Last];
make_non_proper([Head | More]) ->
    [Head | make_non_proper(More)].
504 |
505 |
%% Generator for terms that can carry a wildcard: a non-empty tuple or a
%% non-empty list.
prefixable_term() ->
    Containers = [non_empty_tuple(), non_empty_list()],
    oneof(Containers).
509 |
%% Generator for tuples with at least one element.
non_empty_tuple() ->
    ?LET(Elems, non_empty_list(), list_to_tuple(Elems)).
513 |
%% Generator for lists with at least one element.
non_empty_list() ->
    Lists = alist(),
    non_empty(Lists).
516 |
%% Count the leaf positions of a term. Tuples and lists are traversed
%% recursively; every other term, including an improper list tail, counts as
%% one position. Empty tuples and empty lists contribute nothing.
positions(T) ->
    positions(T, 0).

positions(Term, Count) ->
    if is_tuple(Term) ->
            positions(tuple_to_list(Term), Count);
       Term =:= [] ->
            Count;
       is_list(Term) ->
            [Head | Tail] = Term,
            positions(Tail, positions(Head) + Count);
       true ->
            Count + 1
    end.
528 |
%% Is the term a match wildcard? True for '_' and for atoms made of a `$'
%% followed by something list_to_integer/1 accepts (e.g. '$1'); false for
%% everything else.
is_wild('_') ->
    true;
is_wild(A) when is_atom(A) ->
    is_wild_name(atom_to_list(A));
is_wild(_) ->
    false.

is_wild_name([$$ | Digits]) ->
    try list_to_integer(Digits) of
        _ -> true
    catch
        error:_ -> false
    end;
is_wild_name(_) ->
    false.
544 |
%% Replace position `P' (1-based, counted as in positions/1) of a tuple or
%% list with the wildcard `W'. The result has the same outer type as the
%% input.
make_wild(T, P, W) when P > 0 ->
    if is_tuple(T) ->
            {Elems, _} = make_wild1(tuple_to_list(T), P, W, []),
            list_to_tuple(Elems);
       is_list(T) ->
            {Elems, _} = make_wild1(T, P, W, []),
            Elems
    end.
553 |
%% Worker for make_wild/3. Walks the elements while counting down `P'.
%% `Acc' holds the already visited elements in reverse.
%% Returns `{NewElems, PLeft}', where `PLeft' is 0 once the wildcard has
%% been placed.
%%
%% Nested tuples and lists are descended into, and the count continues
%% afterwards with whatever is left. A non-list first argument is an
%% improper tail and counts as one position.
make_wild1(Remaining, 0, _, Acc) ->
    %% Wildcard already placed: keep the remainder untouched.
    {lists:reverse(Acc, Remaining), 0};
make_wild1(Tail, P, W, Acc) when not(is_list(Tail)) ->
    if P == 1 ->
            {lists:reverse(Acc, W), 0};
       true ->
            {lists:reverse(Acc, Tail), P-1}
    end;
make_wild1([_|T], 1, W, Acc) ->
    {lists:reverse(Acc, [W|T]), 0};
make_wild1([H|T], P, W, Acc) ->
    if is_tuple(H) ->
            {Inner, PLeft} = make_wild1(tuple_to_list(H), P, W, []),
            make_wild1(T, PLeft, W, [list_to_tuple(Inner)|Acc]);
       is_list(H) ->
            {Inner, PLeft} = make_wild1(H, P, W, []),
            make_wild1(T, PLeft, W, [Inner|Acc]);
       true ->
            make_wild1(T, P-1, W, [H|Acc])
    end;
make_wild1([], P, _W, Acc) ->
    {lists:reverse(Acc), P}.
576 |
%% Generator for wildcard atoms: '_' or a `$N' variable.
wild() ->
    Wildcards = ['_', '$1', '$9999'],
    oneof(Wildcards).
579 |
%% Replace the element at 1-based position `P' of list `L' with `V'.
%% `P' must be within 1..length(L).
lists_replace(L, P, V) when P > 0, P =< length(L) ->
    {Before, [_Old | After]} = lists:split(P - 1, L),
    Before ++ [V | After].
583 |
584 | -endif.
585 |
586 |
--------------------------------------------------------------------------------