├── .gitignore
├── .travis.yml
├── CHANGELOG
├── LICENSE
├── NOTICE
├── README.md
├── bootstrap_travis.sh
├── rebar.config
├── rebar.lock
├── src
    ├── idna.app.src
    ├── idna.erl
    ├── idna_bidi.erl
    ├── idna_context.erl
    ├── idna_data.erl
    ├── idna_logger.hrl
    ├── idna_mapping.erl
    ├── idna_table.erl
    ├── idna_ucs.erl
    └── punycode.erl
├── test
    ├── IdnaTestV2.txt
    ├── compat_test.erl
    ├── idna_test.erl
    ├── punycode_test.erl
    └── uts46_test.erl
└── uc_spec
    ├── ArabicShaping.txt
    ├── IdnaMappingTable.txt
    ├── Scripts.txt
    ├── UnicodeData.txt
    ├── gen_idna_mapping_mod.escript
    ├── gen_idna_table_mod.escript
    ├── gen_idnadata_mod.escript
    └── idna-table.txt


/.gitignore:
--------------------------------------------------------------------------------
 1 | .rebar3
 2 | _*
 3 | .eunit
 4 | *.o
 5 | *.beam
 6 | *.plt
 7 | *.swp
 8 | *.swo
 9 | .erlang.cookie
10 | ebin
11 | log
12 | erl_crash.dump
13 | .rebar
14 | _rel
15 | _deps
16 | _plugins
17 | _tdeps
18 | logs
19 | _build
20 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: erlang
 2 | otp_release:
 3 |   - 20.0
 4 |   - 20.1.7
 5 |   - 20.3.8.22
 6 |   - 21.0.9
 7 |   - 21.1.4
 8 |   - 21.2.7
 9 |   - 21.3.8.1
10 |   - 22.0.7
11 |   - 22.1.8.1
12 |   - 22.2.8
13 |   - 22.3.4
14 |   - 23.0.2
15 | 
16 | 
17 | before_script:
18 |     - "./bootstrap_travis.sh"
19 | script: "./rebar3 eunit"
20 | 


--------------------------------------------------------------------------------
/CHANGELOG:
--------------------------------------------------------------------------------
 1 | # CHANGELOG
 2 | 
 3 | == 7.0.0 - unreleased
 4 | 
 5 | - remove unicode_compat. only support Erlang >= 20
 6 | 
 7 | == 6.1.1 - 2020-12-06
 8 | 
 9 | - fix license information
10 | 
11 | == 6.1.0 - 2020-12-05
12 | 
13 | - update to Unicode 13.0.0
14 | - bump unicode_util_compat to 0.7.0
15 | - remove support of Erlang < 19.3
16 | - remove support of rebar 2
17 | 
18 | == 6.0.1 - 2020-05-14
19 | 
20 | - bump to unicode_compat 0.5.0
21 | 
22 | == 6.0.0 - 2018-08-30
23 | 
24 | - IDNA 2008 support [RFC5891](https://tools.ietf.org/html/rfc5891)
25 | - International Domain Name validation
26 | - fix [Punycode](https://tools.ietf.org/html/rfc3492) algorithm
27 | 
28 | Breaking changes:
29 | - `idna:to_ascii/1` in 5.1.x did not encode or enforce rules if the input is already all ascii
30 | 
31 | == 5.1.2 - 2018-06-09
32 | 
33 | - support build with rebar 2
34 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright the authors and contributors. All rights reserved.
 2 | 
 3 | Permission is hereby granted, free of charge, to any person
 4 | obtaining a copy of this software and associated documentation
 5 | files (the "Software"), to deal in the Software without
 6 | restriction, including without limitation the rights to use,
 7 | copy, modify, merge, publish, distribute, sublicense, and/or sell
 8 | copies of the Software, and to permit persons to whom the
 9 | Software is furnished to do so, subject to the following
10 | conditions:
11 | 
12 | The above copyright notice and this permission notice shall be
13 | included in all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
19 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
20 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 | OTHER DEALINGS IN THE SOFTWARE.
23 | 


--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
 1 | This file is part of erlang-idna released under the MIT license.
 2 | See the LICENSE for more information
 3 | 
 4 | Copyright 2014-2020 Benoît Chesneau <bchesneau@gmail.com>
 5 | Copyright 2009-2014 Tim Fletcher
 6 | 
 7 | Others:
 8 | 
 9 | * idna_ucs.erl:
10 | Under the Apache 2 license
11 | Copyright Ericsson AB 2005-2016. All Rights Reserved


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## erlang-idna
 2 | 
 3 | A pure Erlang IDNA implementation that follows [RFC5891](https://tools.ietf.org/html/rfc5891).
 4 | 
 5 | * support IDNA 2008 and IDNA 2003.
 6 | * label validation:
 7 |     - [x] **check NFC**: Label must be in Normalization Form C
 8 |     - [x] **check hyphen**: The Unicode string MUST NOT contain "--" (two consecutive hyphens) in
 9 |     the third and fourth character positions and MUST NOT start or end
10 |     with a "-" (hyphen).
11 |     - [x]  **Leading Combining Marks**: The Unicode string MUST NOT begin with a combining mark or combining character (see The Unicode Standard, Section 2.11 [Unicode](https://tools.ietf.org/html/rfc5891#ref-Unicode) for an  exact definition).
12 |     - [x] **Contextual Rules**: The Unicode string MUST NOT contain any characters whose validity is
13 |     context-dependent, unless the validity is positively confirmed by a contextual rule.  To check this, each code point identified as  CONTEXTJ or CONTEXTO in the Tables document [RFC5892](https://tools.ietf.org/html/rfc5892#section-2.7) MUST have a  non-null rule.  If such a code point is missing a rule, the label is  invalid.  If the rule exists but the result of applying the rule is  negative or inconclusive, the proposed label is invalid.
14 |     - [x] **check BIDI**: if the label contains any characters from scripts that are
15 |     written from right to left, it MUST meet the Bidi criteria of [RFC5893](https://tools.ietf.org/html/rfc5893)
16 | 
17 | 
18 | 
19 | 
20 | ## Usage
21 | 
22 | 
23 | 
24 | The `idna:encode/{1,2}` and `idna:decode/{1,2}` functions encode or decode
25 | Internationalized Domain Names using the IDNA protocol.
26 | 
27 | Input can be mapped to Unicode using [uts46](https://unicode.org/reports/tr46/#Introduction)
28 | by setting the `uts46` flag to `true` (default is `false`). If transition from IDNA 2003 to
29 | IDNA 2008 is needed, the flag `transitional` can be set to `true` (default is `false`). If
30 | conformance to STD3 is needed, the flag `std3_rules` can be set to `true` (default is `false`).
31 | 
32 | example:
33 | 
34 | ```erlang
35 | 1> idna:encode("日本語。ＪＰ", [uts46]).
36 | "xn--wgv71a119e.xn--jp-"
37 | 2> idna:encode("日本語.ＪＰ", [uts46]).
38 | "xn--wgv71a119e.xn--jp-"
39 | ...
40 | ```
41 | 
42 | 
43 | Legacy support of IDNA 2003 is also available with  `to_ascii` and `to_unicode` functions:
44 | 
45 | 
46 | ```erlang
47 | 1> Domain = "www.詹姆斯.com".
48 | [119,119,119,46,35449,22982,26031,46,99,111,109]
49 | 2> Encoded =  idna:to_ascii("www.詹姆斯.com").
50 | "www.xn--8ws00zhy3a.com"
51 | 3> idna:to_unicode(Encoded).
52 | [119,119,119,46,35449,22982,26031,46,99,111,109]
53 | ```
54 | 
55 | 
56 | 
57 | ## Update Unicode data
58 | 
59 | wget -O test/IdnaTestV2.txt https://www.unicode.org/Public/idna/latest/IdnaTestV2.txt
60 | wget -O uc_spec/ArabicShaping.txt https://www.unicode.org/Public/UNIDATA/ArabicShaping.txt
61 | wget -O uc_spec/IdnaMappingTable.txt https://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt
62 | wget -O uc_spec/Scripts.txt https://www.unicode.org/Public/UNIDATA/Scripts.txt
63 | wget -O uc_spec/UnicodeData.txt https://www.unicode.org/Public/UNIDATA/UnicodeData.txt
64 | 
65 | git clone https://github.com/kjd/idna.git
66 | ./idna/tools/idna-data make-table --version 13.0.0 > uc_spec/idna-table.txt
67 | 
68 | cd uc_spec
69 | ./gen_idnadata_mod.escript
70 | ./gen_idna_table_mod.escript
71 | ./gen_idna_mapping_mod.escript
72 | 


--------------------------------------------------------------------------------
/bootstrap_travis.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | curl -O -L https://s3.amazonaws.com/rebar3/rebar3
4 | chmod +x rebar3
5 | ./rebar3 update
6 | 


--------------------------------------------------------------------------------
/rebar.config:
--------------------------------------------------------------------------------
1 | {erl_opts, []}.
2 | 
3 | {deps, []}.
4 | 


--------------------------------------------------------------------------------
/rebar.lock:
--------------------------------------------------------------------------------
1 | [].
2 | 


--------------------------------------------------------------------------------
/src/idna.app.src:
--------------------------------------------------------------------------------
 1 | {application, idna, [
 2 |   {description, "A pure Erlang IDNA implementation"},
 3 |   {vsn, "7.0.0"},
 4 |   {modules, []},
 5 |   {registered, []},
 6 |   {applications, [kernel, stdlib]},
 7 | 
 8 |   {licenses, ["MIT"]},
 9 |   {links, [{"Github", "https://github.com/benoitc/erlang-idna"}]}
10 | 
11 | ]}.
12 | 


--------------------------------------------------------------------------------
/src/idna.erl:
--------------------------------------------------------------------------------
  1 | %% -*- coding: utf-8 -*-
  2 | %%%
  3 | %%% This file is part of erlang-idna released under the MIT license.
  4 | %%% See the LICENSE for more information.
  5 | %%%
  6 | -module(idna).
  7 | 
  8 | %% API
  9 | -export([encode/1, encode/2,
 10 |          decode/1, decode/2]).
 11 | 
 12 | %% compatibility API
 13 | -export([to_ascii/1,
 14 |          to_unicode/1,
 15 |          utf8_to_ascii/1,
 16 |          from_ascii/1]).
 17 | 
 18 | 
 19 | -export([alabel/1, ulabel/1]).
 20 | 
 21 | -export([check_hyphen/1,
 22 |          check_nfc/1,
 23 |          check_context/1,
 24 |          check_initial_combiner/1,
 25 |          check_label_length/1]).
 26 | 
 27 | -export([check_label/1, check_label/4]).
 28 | 
 29 | -define(ACE_PREFIX, "xn--").
 30 | 
 31 | -ifdef('OTP_RELEASE').
 32 | -define(lower(C), string:lowercase(C)).
 33 | -else.
 34 | -define(lower(C), string:to_lower(C)).
 35 | -endif.
 36 | 
 37 | -include("idna_logger.hrl").
 38 | 
 39 | 
 40 | %% Options accepted by encode/2 and decode/2. Each flag is given either as a
 41 | %% bare atom (read as `true' by proplists) or as a {Flag, boolean()} pair.
 42 | -type idna_flags() :: [uts46 | std3_rules | transitional | strict |
 43 |                        {uts46 | std3_rules | transitional | strict, boolean()}].
 44 | 
 45 | 
 46 | %% @doc Encode an Internationalized Domain Name using the IDNA protocol.
 47 | %% Shorthand for `encode(Domain, [])', i.e. every option keeps its default.
 48 | -spec encode(string()) -> string().
 49 | encode(Domain) ->
 50 |   encode(Domain, []).
 51 | 
 52 | %% @doc Encode an Internationalized Domain Name using the IDNA protocol.
 53 | %% `Options' is checked by validate_options/1; every flag defaults to `false':
 54 | %% `uts46' first maps the input with [uts46](https://unicode.org/reports/tr46/#Introduction);
 55 | %% `transitional' (IDNA 2003 to IDNA 2008 transition) and `std3_rules' (STD3
 56 | %% conformance) tune that mapping and are only read when `uts46' is set;
 57 | %% `strict' splits the labels on "." only instead of on any of ". 。 ． ｡".
 58 | %% Each label is then converted by alabel/1; no label at all exits with `empty_domain'.
 59 | -spec encode(string(), idna_flags()) -> string().
 60 | encode(Domain0, Options) ->
 61 |   ok = validate_options(Options),
 62 |   Flag = fun(Name) -> proplists:get_value(Name, Options, false) end,
 63 |   Domain = case Flag(uts46) of
 64 |              true ->
 65 |                uts46_remap(Domain0, Flag(std3_rules), Flag(transitional));
 66 |              false ->
 67 |                Domain0
 68 |            end,
 69 |   Labels = case Flag(strict) of
 70 |              false ->
 71 |                re:split(Domain, "[.。．｡]", [{return, list}, unicode]);
 72 |              true ->
 73 |                string:tokens(Domain, ".")
 74 |            end,
 75 |   case Labels of
 76 |     [] -> exit(empty_domain);
 77 |     _ -> encode_1(Labels, [])
 78 |   end.
 79 | 
 80 | %% @doc Decode an International Domain Name encoded with the IDNA protocol.
 81 | %% Shorthand for `decode(Domain, [])', i.e. every option keeps its default.
 82 | -spec decode(string()) -> string().
 83 | decode(Domain) ->
 84 |   decode(Domain, []).
 85 | %% @doc Decode an International Domain Name encoded with the IDNA protocol.
 86 | %% Takes the same options as encode/2. After the optional `uts46' mapping the
 87 | %% domain is lowercased, split into labels, and each label goes through
 88 | %% ulabel/1. Exits with `empty_domain' when the split yields no label.
 89 | -spec decode(string(), idna_flags()) -> string().
 90 | decode(Domain0, Options) ->
 91 |   ok = validate_options(Options),
 92 |   Flag = fun(Name) -> proplists:get_value(Name, Options, false) end,
 93 |   Domain = case Flag(uts46) of
 94 |              true ->
 95 |                uts46_remap(Domain0, Flag(std3_rules), Flag(transitional));
 96 |              false ->
 97 |                Domain0
 98 |            end,
 99 |   Lowered = lowercase(Domain),
100 |   Labels = case Flag(strict) of
101 |              false ->
102 |                re:split(Lowered, "[.。．｡]", [{return, list}, unicode]);
103 |              true -> string:tokens(Lowered, ".")
104 |            end,
105 |   case Labels of
106 |     [] -> exit(empty_domain);
107 |     _ -> decode_1(Labels, [])
108 |   end.
109 | 
110 | 
111 | %% Compatibility API
112 | %%
113 | 
114 | %% @doc Encode an International Domain Name with the IDNA protocol
115 | %% (compatibility API); simply calls `encode/1'.
116 | -spec to_ascii(string()) -> string().
117 | to_ascii(Domain) -> encode(Domain).
118 | %% @doc Decode an IDNA-encoded International Domain Name
119 | %% (compatibility API); simply calls `decode/1'.
120 | -spec to_unicode(string()) -> string().
121 | to_unicode(Domain) -> decode(Domain).
122 | %% @doc Convert `Domain' with `idna_ucs:from_utf8/1', then encode the result with `to_ascii/1'.
123 | utf8_to_ascii(Domain) ->
124 |   to_ascii(idna_ucs:from_utf8(Domain)).
125 | 
126 | %% @doc like `to_unicode/1': decode an IDNA-encoded domain name (compatibility API)
127 | -spec from_ascii(nonempty_string()) -> nonempty_string().
128 | from_ascii(Domain) ->
129 |   decode(Domain).
130 | 
131 | 
132 | %% Helper functions
133 | %%
134 | %% Validate an option list. Each entry must be uts46, strict, std3_rules or
135 | %% transitional, as a bare atom or as {Flag, boolean()}. Returns ok; raises
136 | %% error:badarg on the first entry of any other shape, whatever its position.
137 | validate_options([]) -> ok;
138 | validate_options([{Flag, B}|Rs]) when is_atom(Flag), is_boolean(B) ->
139 |   validate_options([Flag|Rs]);
140 | validate_options([Flag|Rs]) ->
141 |   case lists:member(Flag, [uts46, strict, std3_rules, transitional]) of
142 |     true -> validate_options(Rs);
143 |     false -> erlang:error(badarg)
144 |   end.
145 | %% Encode the labels with alabel/1 and join them with dots. Rev is the output
146 | %% built so far, reversed; a dot is only emitted once Rev is non-empty, so
147 | %% labels that encode to "" at the very start leave no trace in the result.
148 | encode_1([], Rev) -> lists:reverse(Rev);
149 | encode_1([L|Ls], []) -> encode_1(Ls, lists:reverse(alabel(L)));
150 | encode_1([L|Ls], Rev) ->
151 |   encode_1(Ls, lists:reverse(alabel(L), [$.|Rev])).
152 | %% @doc ok when Label is already in Normalization Form C, else exit {bad_label, {nfc, Msg}}.
153 | check_nfc(Label) ->
154 |   Normalized = characters_to_nfc_list(Label),
155 |   if
156 |     Normalized =:= Label -> ok;
157 |     true -> erlang:exit({bad_label, {nfc, "Label must be in Normalization Form C"}})
158 |   end.
159 | %% @doc Check the hyphen rules of a label; same as `check_hyphen(Label, true)'.
160 | check_hyphen(Label) -> check_hyphen(Label, true).
161 | 
162 | %% With `true' as second argument, exit with {bad_label, {hyphen, Msg}} when
163 | %% the label has "--" in its 3rd and 4th positions, or starts or ends with
164 | %% "-"; return ok otherwise. With `false' nothing is checked.
165 | check_hyphen(_Label, false) ->
166 |   ok;
167 | check_hyphen([_, _, $-, $-|_] = Label, true) ->
168 |   ErrorMsg = error_msg("Label ~p has disallowed hyphens in 3rd and 4th position", [Label]),
169 |   erlang:exit({bad_label, {hyphen, ErrorMsg}});
170 | check_hyphen(Label, true) ->
171 |   %% lists:nth/2 and lists:last/1 both fail on [], so an empty label crashes
172 |   %% here instead of being reported as valid.
173 |   Edge = (lists:nth(1, Label) == $-) orelse (lists:last(Label) == $-),
174 |   if
175 |     Edge ->
176 |       ErrorMsg = error_msg("Label ~p must not start or end with a hyphen", [Label]),
177 |       erlang:exit({bad_label, {hyphen, ErrorMsg}});
178 |     true ->
179 |       ok
180 |   end.
186 | %% @doc Exit with {bad_label, {initial_combiner, _}} when idna_data:lookup/1 on the
187 | %% first code point returns {[$M|_], _}; return ok for any other lookup result.
188 | check_initial_combiner([First|_]) ->
189 |   case idna_data:lookup(First) of
190 |     {[$M|_], _} ->
191 |       erlang:exit({bad_label, {initial_combiner, "Label begins with an illegal combining character"}});
192 |     _ -> ok
193 |   end.
194 | %% @doc Check every code point of Label against idna_table, contextual rules
195 | %% included; same as check_context(Label, true).
196 | check_context(Label) ->
197 |   check_context(Label, true).
198 | 
199 | %% CheckJoiners says whether the CONTEXTJ/CONTEXTO rules are actually applied.
200 | check_context(Label, CheckJoiners) ->
201 |   check_context(Label, Label, CheckJoiners, 0).
202 | 
203 | %% Walk the code points left to right; Pos is the 0-based index of the head
204 | %% within the complete Label, which the contextual rules need.
205 | check_context([], _Label, _CheckJoiners, _Pos) ->
206 |   ok;
207 | check_context([CP | Rest], Label, CheckJoiners, Pos) ->
208 |   ok = check_codepoint(idna_table:lookup(CP), CP, Label, Pos, CheckJoiners),
209 |   check_context(Rest, Label, CheckJoiners, Pos + 1).
210 | 
211 | %% 'PVALID' passes, 'CONTEXTJ'/'CONTEXTO' defer to their rule, anything else exits.
212 | check_codepoint('PVALID', _CP, _Label, _Pos, _CheckJoiners) -> ok;
213 | check_codepoint('CONTEXTJ', CP, Label, Pos, Check) -> valid_contextj(CP, Label, Pos, Check);
214 | check_codepoint('CONTEXTO', CP, Label, Pos, Check) -> valid_contexto(CP, Label, Pos, Check);
215 | check_codepoint(Status, CP, Label, Pos, _CheckJoiners) ->
216 |   ErrorMsg = error_msg("Codepoint ~p not allowed (~p) at position ~p in ~p", [CP, Status, Pos, Label]),
217 |   erlang:exit({bad_label, {context, ErrorMsg}}).
218 | %% Apply the CONTEXTJ rule of idna_context to CP, found at 0-based position
219 | %% Pos of Label, when the last argument is `true': ok when the rule holds,
220 | %% exit with {bad_label, {contextj, Msg}} when it does not. With `false' the
221 | %% rule is skipped.
222 | valid_contextj(_CP, _Label, _Pos, false) ->
223 |   ok;
224 | valid_contextj(CP, Label, Pos, true) ->
225 |   idna_context:valid_contextj(CP, Label, Pos) orelse
226 |     erlang:exit({bad_label, {contextj,
227 |       error_msg("Joiner ~p not allowed at position ~p in ~p", [CP, Pos, Label])}}),
228 |   ok.
229 | %% Apply the CONTEXTO rule of idna_context to CP, found at 0-based position
230 | %% Pos of Label, when the last argument is `true': ok when the rule holds,
231 | %% exit with {bad_label, {contexto, Msg}} when it does not. With `false' the
232 | %% rule is skipped. CONTEXTO code points are not joiners, hence "Codepoint".
233 | valid_contexto(_CP, _Label, _Pos, false) ->
234 |   ok;
235 | valid_contexto(CP, Label, Pos, true) ->
236 |   idna_context:valid_contexto(CP, Label, Pos) orelse
237 |     erlang:exit({bad_label, {contexto,
238 |       error_msg("Codepoint ~p not allowed at position ~p in ~p", [CP, Pos, Label])}}),
239 |   ok.
240 | 
241 | 
242 | %% @doc Validate a domain label with the hyphen, joiner and bidi checks all enabled.
243 | -spec check_label(string()) -> ok.
244 | check_label(Label) ->
245 |   check_label(Label, true, true, true).
246 | %% @doc Validate a label of a domain. Runs, in order: NFC, hyphens, initial
247 | %% combining mark, context and bidi; the first failing check exits.
248 | -spec check_label(Label, CheckHyphens, CheckJoiners, CheckBidi) -> Result when
249 |     Label :: string(),
250 |     CheckHyphens :: boolean(),
251 |     CheckJoiners :: boolean(),
252 |     CheckBidi :: boolean(),
253 |     Result :: ok.
254 | check_label(Label, CheckHyphens, CheckJoiners, CheckBidi) ->
255 |   Checks = [fun() -> check_nfc(Label) end,
256 |             fun() -> check_hyphen(Label, CheckHyphens) end,
257 |             fun() -> check_initial_combiner(Label) end,
258 |             fun() -> check_context(Label, CheckJoiners) end,
259 |             fun() -> check_bidi(Label, CheckBidi) end],
260 |   lists:foreach(fun(Check) -> ok = Check() end, Checks).
261 | 
262 | %% Run idna_bidi:check_bidi/1 on the label when the flag is `true'; with `false' return ok.
263 | check_bidi(Label, true) ->
264 |   idna_bidi:check_bidi(Label);
265 | check_bidi(_, false) ->
266 |   ok.
267 | %% @doc Exit with {bad_label, {too_long, Msg}} when Label is longer than 63; ok otherwise.
268 | check_label_length(Label) when length(Label) > 63 ->
269 |   ErrorMsg = error_msg("The label ~p  is too long", [Label]),
270 |   erlang:exit({bad_label, {too_long, ErrorMsg}});
271 | check_label_length(_) ->
272 |   ok.
273 | %% @doc Convert one label to its A-label form. An all-ASCII label is returned
274 | %% as is once ulabel/1 accepts it; any other label must pass check_label/1 and
275 | %% is Punycode-encoded behind "xn--". The result must pass check_label_length/1.
276 | alabel(Label0) ->
277 |   IsAscii = lists:all(fun idna_ucs:is_ascii/1, Label0),
278 |   Label = alabel_1(Label0, IsAscii),
279 |   ok = check_label_length(Label),
280 |   Label.
281 | alabel_1(Label0, true) ->
282 |   try ulabel(Label0) of
283 |     _ -> Label0
284 |   catch
285 |     _:Error ->
286 |       ErrorMsg = error_msg("The label ~p  is not a valid A-label: ulabel error=~p", [Label0, Error]),
287 |       erlang:exit({bad_label, {alabel, ErrorMsg}})
288 |   end;
289 | alabel_1(Label0, false) ->
290 |   ok = check_label(Label0),
291 |   ?ACE_PREFIX ++ punycode:encode(Label0).
292 | %% Convert the labels with ulabel/1 and join them with dots. Rev is the output
293 | %% built so far, reversed; a dot is only emitted once Rev is non-empty, so
294 | %% labels that decode to "" at the very start leave no trace in the result.
295 | decode_1([], Rev) -> lists:reverse(Rev);
296 | decode_1([L|Ls], []) -> decode_1(Ls, lists:reverse(ulabel(L)));
297 | decode_1([L|Ls], Rev) ->
298 |   decode_1(Ls, lists:reverse(ulabel(L), [$.|Rev])).
299 | %% @doc Convert one label to its U-label (Unicode) form. An all-ASCII label
300 | %% starting with "xn--" has the rest Punycode-decoded (after lowercasing it);
301 | %% any other label is only lowercased. The result must pass check_label/1.
302 | %% The empty label is returned as is, without any check.
303 | ulabel([]) ->
304 |   [];
305 | ulabel(Label0) ->
306 |   IsAscii = lists:all(fun idna_ucs:is_ascii/1, Label0),
307 |   Label = case {IsAscii, Label0} of
308 |             {true, [$x,$n,$-,$-|Encoded]} -> punycode:decode(lowercase(Encoded));
309 |             _ -> lowercase(Label0)
310 |           end,
311 |   ok = check_label(Label),
312 |   Label.
315 | %% Lowercase all chars in Str. When nothing had to change, the input itself is
316 | %% returned: the workers signal that case by throwing `unchanged'.
317 | -spec lowercase(String::unicode:chardata()) -> unicode:chardata().
318 | lowercase(CD) when is_list(CD) ->
319 |   try lowercase_list(CD, false)
320 |   catch unchanged -> CD
321 |   end;
322 | lowercase(<<CP1/utf8, Rest/binary>>=Orig) ->
323 |   try lowercase_bin(CP1, Rest, false) of
324 |     List -> unicode:characters_to_binary(List)
325 |   catch unchanged -> Orig
326 |   end;
327 | lowercase(<<>>) ->
328 |   <<>>.
329 | %% List worker of lowercase/1. Changed tells whether a code point has been
330 | %% altered so far; reaching the end with Changed still false throws `unchanged'.
331 | lowercase_list([CP1|[CP2|_]=Cont], _Changed) when $A =< CP1, CP1 =< $Z, CP2 < 256 ->
332 |   [CP1+32|lowercase_list(Cont, true)];
333 | lowercase_list([CP1|[CP2|_]=Cont], Changed) when CP1 < 128, CP2 < 256 ->
334 |   [CP1|lowercase_list(Cont, Changed)];
335 | lowercase_list([], true) ->
336 |   [];
337 | lowercase_list([], false) ->
338 |   throw(unchanged);
339 | lowercase_list(CPs0, Changed) ->
340 |   case unicode_util:lowercase(CPs0) of
341 |     [Char|CPs] when Char =:= hd(CPs0) -> [Char|lowercase_list(CPs, Changed)];
342 |     [Char|CPs] -> append(Char,lowercase_list(CPs, true));
343 |     [] -> lowercase_list([], Changed)
344 |   end.
345 | %% Binary worker of lowercase/1: CP1 is the current code point, the binary the rest; throws `unchanged' if nothing was altered.
346 | lowercase_bin(CP1, <<CP2/utf8, Bin/binary>>, _Changed)
347 |   when $A =< CP1, CP1 =< $Z, CP2 < 256 ->
348 |   [CP1+32|lowercase_bin(CP2, Bin, true)];
349 | lowercase_bin(CP1, <<CP2/utf8, Bin/binary>>, Changed)
350 |   when CP1 < 128, CP2 < 256 ->
351 |   [CP1|lowercase_bin(CP2, Bin, Changed)];
352 | lowercase_bin(CP1, Bin, Changed) ->
353 |   case unicode_util:lowercase([CP1|Bin]) of
354 |     [CP1|CPs] ->
355 |       case unicode_util:cp(CPs) of
356 |         [Next|Rest] ->
357 |           [CP1|lowercase_bin(Next, Rest, Changed)];
358 |         [] when Changed ->
359 |           [CP1];
360 |         [] ->
361 |           throw(unchanged)
362 |       end;
363 |     [Char|CPs] ->
364 |       case unicode_util:cp(CPs) of
365 |         [Next|Rest] ->
366 |           [Char|lowercase_bin(Next, Rest, true)];
367 |         [] ->
368 |           [Char]
369 |       end
370 |   end.
371 | 
372 | %% Put Char (one code point, or a list of them) in front of a tail that is either a list or a binary.
373 | append(Char, <<>>) when is_integer(Char) -> [Char];
374 | append(Char, <<>>) when is_list(Char) -> Char;
375 | append(Char, Bin) when is_binary(Bin) -> [Char,Bin];
376 | append(Char, Str) when is_integer(Char) -> [Char|Str];
377 | append(GC, Str) when is_list(GC) -> GC ++ Str.
378 | %% Normalise chardata to NFC by calling unicode_util:nfc/1 repeatedly, and
379 | %% return the result as a flat list of code points.
380 | characters_to_nfc_list(CD) ->
381 |   case unicode_util:nfc(CD) of
382 |     [CPs|Str] when is_list(CPs) -> CPs ++ characters_to_nfc_list(Str);
383 |     [CP|Str] -> [CP|characters_to_nfc_list(Str)];
384 |     [] -> []
385 |   end.
386 | 
387 | %% Map Str code point by code point with uts46_remap_1/3, then normalise the result to NFC.
388 | uts46_remap(Str, Std3Rules, Transitional) ->
389 |   characters_to_nfc_list(uts46_remap_1(Str, Std3Rules, Transitional)).
390 | %% Apply idna_mapping:uts46_map/1 to each code point; exits with {invalid_codepoint, Cp} when no rule below applies.
391 | uts46_remap_1([Cp|Rs], Std3Rules, Transitional) ->
392 |   Row = try idna_mapping:uts46_map(Cp)
393 |         catch
394 |           error:badarg  ->
395 |             ?LOG_ERROR("codepoint ~p not found in mapping list~n", [Cp]),
396 |             erlang:exit({invalid_codepoint, Cp})
397 |         end,
398 |   {Status, Replacement} = case Row of
399 |                             {_, _} -> Row;
400 |                             S -> {S, undefined}  %% row made of a bare status, no replacement
401 |                           end,
402 |   if
403 |     (Status =:= 'V');
404 |     ((Status =:= 'D') andalso (Transitional =:= false));
405 |     ((Status =:= '3') andalso (Std3Rules =:= true) andalso (Replacement =:= undefined)) ->
406 |       [Cp] ++ uts46_remap_1(Rs, Std3Rules, Transitional);  %% keep Cp unchanged
407 |     (Replacement =/= undefined) andalso (
408 |         (Status =:= 'M') orelse
409 |           (Status =:= '3' andalso Std3Rules =:= false) orelse
410 |           (Status =:= 'D' andalso Transitional =:= true)) ->
411 |       Replacement ++ uts46_remap_1(Rs, Std3Rules, Transitional);  %% substitute the mapping
412 |     (Status =:= 'I') ->
413 |       uts46_remap_1(Rs, Std3Rules, Transitional);  %% drop Cp from the output
414 |     true ->
415 |       erlang:exit({invalid_codepoint, Cp})
416 |   end;
417 | uts46_remap_1([], _, _) ->
418 |   [].
419 | %% Format Msg with the argument list Fmt through io_lib:format/2 and flatten the result.
420 | error_msg(Msg, Fmt) ->
421 |   lists:flatten(io_lib:format(Msg, Fmt)).
422 | 


--------------------------------------------------------------------------------
/src/idna_bidi.erl:
--------------------------------------------------------------------------------
 1 | %% -*- coding: utf-8 -*-
 2 | %%%
 3 | %%% This file is part of erlang-idna released under the MIT license.
 4 | %%% See the LICENSE for more information.
 5 | %%%
 6 | 
 7 | -module(idna_bidi).
 8 | -author("benoitc").
 9 | 
10 | %% API
11 | -export([check_bidi/1, check_bidi/2]).
12 | %% @doc Check the bidi rules of a label; same as `check_bidi(Label, false)'.
13 | check_bidi(Label) -> check_bidi(Label, false).
14 | 
15 | %% @doc Apply the bidi rules when the label holds an R, AL or AN code point, or
16 | %% when CheckLtr is not `false'. Returns ok or exits with a bidi `bad_label'.
17 | check_bidi(Label, CheckLtr) ->
18 |   %% Bidi rules should only be applied if string contains RTL characters
19 |   HasRtl = check_rtl(Label, Label),
20 |   if
21 |     not HasRtl, CheckLtr =:= false -> ok;
22 |     true ->
23 |       [First | _Rest] = Label,
24 |       check_bidi1(Label, rtl(First, Label), false, undefined)  % bidi rule 1
25 |   end.
26 | %% true as soon as a code point of bidi class R, AL or AN is met, false when
27 | %% the label holds none. Exits with a bidi `bad_label' reason if
28 | %% idna_data:bidirectional/1 returns false for a code point.
29 | check_rtl([], _Label) ->
30 |   false;
31 | check_rtl([C | Rest], Label) ->
32 |   case idna_data:bidirectional(C) of
33 |     false ->
34 |       erlang:exit(bidi_error("unknown directionality in label=~p c=~w~n", [Label, C]));
35 |     Dir when Dir =:= "R"; Dir =:= "AL"; Dir =:= "AN" -> true;
36 |     _Other -> check_rtl(Rest, Label)
37 |   end.
38 | %% Direction of a label, read from its first code point C: true for class R
39 | %% or AL, false for L; any other class exits with a bidi `bad_label' reason.
40 | rtl(C, Label) ->
41 |   Dir = idna_data:bidirectional(C),
42 |   IsRtl = lists:member(Dir, ["R", "AL"]),
43 |   if
44 |     IsRtl -> true;
45 |     Dir =:= "L" -> false;
46 |     true -> erlang:exit(bidi_error("first codepoint in label ~p must be directionality L, R or AL ", [Label]))
47 |   end.
48 | %% Walk the label once: 2nd arg is the direction (true = RTL), 3rd whether the code points seen so far end validly, 4th the numeral class (AN/EN) met in an RTL label.
49 | check_bidi1([C | Rest], true, ValidEnding, NumberType) ->
50 |   Dir =  idna_data:bidirectional(C),
51 |   %% bidi rule 2: only the classes listed here may appear in a right-to-left label
52 |   ValidEnding2 = case lists:member(Dir, ["R", "AL", "AN", "EN", "ES", "CS", "ET", "ON", "BN", "NSM"]) of
53 |                   true ->
54 |                     % bidi rule 3: the end must be R, AL, AN or EN; an NSM keeps the previous verdict
55 |                     case lists:member(Dir, ["R", "AL", "AN", "EN"]) of
56 |                       true  -> true;
57 |                       false when Dir =/= "NSM" -> false;
58 |                       false -> ValidEnding
59 |                     end;
60 |                   false ->
61 |                     erlang:exit({bad_label, {bidi, "Invalid direction for codepoint  in a right-to-left label"}})
62 |                 end,
63 |   % bidi rule 4: the first AN/EN met fixes the numeral class, the other one is then refused
64 |   NumberType2 = case lists:member(Dir, ["AN", "EN"]) of
65 |                   true when NumberType =:= undefined ->
66 |                     Dir;
67 |                   true when NumberType /= Dir ->
68 |                     erlang:exit({bad_label, {bidi, "Can not mix numeral types in a right-to-left label"}});
69 |                   _ ->
70 |                     NumberType
71 |                 end,
72 |   check_bidi1(Rest, true, ValidEnding2, NumberType2);
73 | check_bidi1([C | Rest], false, ValidEnding, NumberType) ->
74 |   Dir =  idna_data:bidirectional(C),
75 |   % bidi rule 5: only the classes listed here may appear in a left-to-right label
76 |   ValidEnding2 = case lists:member(Dir, ["L", "EN", "ES", "CS", "ET", "ON", "BN", "NSM"]) of
77 |                    true ->
78 |                      % bidi rule 6: the end must be L or EN; an NSM keeps the previous verdict
79 |                      case Dir of
80 |                        "L" -> true;
81 |                        "EN" -> true;
82 |                        _ when Dir /= "NSM" -> false;
83 |                        _ -> ValidEnding
84 |                      end;
85 |                    false ->
86 |                      erlang:exit({bad_label, {bidi, "Invalid direction for codepoint in a left-to-right label"}})
87 |                  end,
88 |   check_bidi1(Rest, false, ValidEnding2, NumberType);
89 | check_bidi1([], _, false, _) ->
90 |   erlang:exit({bad_label, {bidi, "Label ends with illegal codepoint directionality"}});
91 | check_bidi1([], _, true, _) ->
92 |   ok.
93 | %% Build the {bad_label, {bidi, Text}} exit reason, Text being Msg formatted with the argument list Fmt.
94 | bidi_error(Msg, Fmt) ->
95 |   ErrorMsg = lists:flatten(io_lib:format(Msg, Fmt)),
96 |   {bad_label, {bidi, ErrorMsg}}.
97 | 


--------------------------------------------------------------------------------
/src/idna_context.erl:
--------------------------------------------------------------------------------
  1 | %% -*- coding: utf-8 -*-
  2 | %%%
  3 | %%% This file is part of erlang-idna released under the MIT license.
  4 | %%% See the LICENSE for more information.
  5 | %%%
  6 | -module(idna_context).
  7 | -author("benoitc").
  8 | 
  9 | %% API
 10 | -export([
 11 |   valid_contextj/2, valid_contextj/3,
 12 |   valid_contexto/2, valid_contexto/3,
 13 |   contexto_with_rule/1
 14 | ]).
 15 | 
 16 | -define(virama_combining_class, 9).
 17 | 
 18 | %% CONTEXTJ check of the code point at 0-based position Pos of Label; an empty label passes.
 19 | valid_contextj([], _Pos)  -> true;
 20 | 
 21 | valid_contextj(Label, Pos) ->
 22 |   CP = lists:nth(Pos + 1, Label),
 23 |   valid_contextj(CP, Label, Pos).
 24 | %% CONTEXTJ rule for CP, located at 0-based position Pos of Label.
 25 | %% U+200C passes when the previous code point has the virama combining class,
 26 | %% or else when valid_contextj_1/2 accepts its neighbourhood; U+200D passes
 27 | %% only in the virama case. Every other code point fails.
 28 | valid_contextj(16#200c, Label, Pos) ->
 29 |   after_virama(Label, Pos) orelse valid_contextj_1(Label, Pos);
 30 | valid_contextj(16#200d, Label, Pos) ->
 31 |   after_virama(Label, Pos);
 32 | valid_contextj(_, _, _) ->
 33 |   false.
 34 | 
 35 | %% true when Pos > 0 and the code point just before Pos has combining class
 36 | %% ?virama_combining_class according to unicode_util:lookup/1.
 37 | after_virama(Label, Pos) when Pos > 0 ->
 38 |   case unicode_util:lookup(lists:nth(Pos, Label)) of
 39 |     #{ ccc := ?virama_combining_class } -> true;
 40 |     _ -> false
 41 |   end;
 42 | after_virama(_Label, _Pos) ->
 43 |   false.
 44 | %% Joining-type part of the U+200C rule (RFC 5892, Appendix A.1): some code
 45 | %% point before the joiner at 0-based position Pos must have Joining_Type L
 46 | %% or D, and some code point after it R or D. As before, every code point of
 47 | %% another type is skipped, not only those of type T.
 48 | %% NOTE(review): assumes idna_data:joining_types/1 reports "R" the same way
 49 | %% it reports "L" and "D" -- confirm against idna_data.
 50 | valid_contextj_1(Label, Pos) ->
 51 |   %% sublist/2 keeps the Pos code points in front of the joiner; they are
 52 |   %% reversed so that the scan starts with the nearest one.
 53 |   Before = lists:reverse(lists:sublist(Label, Pos)),
 54 |   After = lists:nthtail(Pos + 1, Label),
 55 |   range(Before, ["L", "D"]) andalso range(After, ["R", "D"]).
 56 | 
 57 | %% true when one of the code points has a joining type listed in Wanted.
 58 | range([CP|Rest], Wanted) ->
 59 |   lists:member(idna_data:joining_types(CP), Wanted) orelse range(Rest, Wanted);
 60 | range([], _Wanted) ->
 61 |   false.
 63 | %% CONTEXTO check of the code point at 0-based position Pos of Label.
 64 | %% An empty label has nothing to check and passes.
 65 | valid_contexto([], _Pos) ->
 66 |   true;
 67 | valid_contexto(Label, Pos) ->
 68 |   CP = lists:nth(Pos + 1, Label),
 69 |   valid_contexto(CP, Label, Pos).
 70 | %% CONTEXTO rule for CP, located at 0-based position Pos of Label; returns a
 71 | %% boolean. Code points that have no rule here are rejected.
 72 | valid_contexto(CP, Label, Pos) ->
 73 |   contexto(CP, Label, Pos, length(Label)).
 74 | 
 75 | %% One clause per rule; a rule whose position guard fails falls through to
 76 | %% the final clause and is rejected.
 77 | contexto(16#00B7, Label, Pos, Len) when Pos > 0, Pos < Len - 1 ->
 78 |   % MIDDLE DOT: accepted only between two U+006C
 79 |   case lists:sublist(Label, Pos, 3) of
 80 |     [16#006C, _, 16#006C] -> true;
 81 |     _ -> false
 82 |   end;
 83 | contexto(16#0375, Label, Pos, Len) when Pos < Len - 1, Len > 1 ->
 84 |   % GREEK LOWER NUMERAL SIGN (KERAIA): the next code point must be greek
 85 |   idna_data:scripts(lists:nth(Pos + 2, Label)) =:= "greek";
 86 | contexto(16#30FB, Label, _Pos, _Len) ->
 87 |   % KATAKANA MIDDLE DOT
 88 |   script_ok(Label);
 89 | contexto(CP, Label, Pos, _Len) when (CP == 16#05F3 orelse CP == 16#05F4), Pos > 0 ->
 90 |   % HEBREW PUNCTUATION GERESH or GERSHAYIM: the previous code point must be hebrew
 91 |   idna_data:scripts(lists:nth(Pos, Label)) =:= "hebrew";
 92 | contexto(CP, Label, _Pos, _Len) when CP >= 16#660, CP =< 16#669 ->
 93 |   % ARABIC-INDIC DIGITS: no code point of 16#6F0..16#6F9 in the label
 94 |   contexto_in_range(Label, 16#6F0, 16#6F9);
 95 | contexto(CP, Label, _Pos, _Len) when 16#6F0 =< CP, CP =< 16#6F9 ->
 96 |   % EXTENDED ARABIC-INDIC DIGIT: no code point of 16#660..16#669 in the label
 97 |   contexto_in_range(Label, 16#660, 16#669);
 98 | contexto(_CP, _Label, _Pos, _Len) ->
 99 |   false.
121 | 
122 | 
%% Returns `true' when no code point of `Label' lies within the inclusive
%% range Start..End, `false' as soon as one does.
contexto_in_range(Label, Start, End) ->
  lists:all(fun(CP) -> CP < Start orelse CP > End end, Label).
126 | 
%% True when the label holds at least one code point, other than
%% KATAKANA MIDDLE DOT (U+30FB) itself, whose script is "hiragana",
%% "katakana" or "han".
script_ok([]) ->
  false;
script_ok([16#30fb | Rest]) ->
  script_ok(Rest);
script_ok([C | Rest]) ->
  lists:member(idna_data:scripts(C), ["hiragana", "katakana", "han"])
    orelse script_ok(Rest).
139 | 
%% Tells whether a contextual rule is implemented for the code point:
%%   U+00B7          MIDDLE DOT
%%   U+0375          GREEK LOWER NUMERAL SIGN (KERAIA)
%%   U+05F3          HEBREW PUNCTUATION GERESH
%%   U+05F4          HEBREW PUNCTUATION GERSHAYIM
%%   U+30FB          KATAKANA MIDDLE DOT
%%   U+0660..U+0669  ARABIC-INDIC DIGITS
%%   U+06F0..U+06F9  EXTENDED ARABIC-INDIC DIGITS
contexto_with_rule(CP) ->
  lists:member(CP, [16#00B7, 16#0375, 16#05F3, 16#05F4, 16#30FB])
    orelse (16#0660 =< CP andalso CP =< 16#0669)
    orelse (16#06F0 =< CP andalso CP =< 16#06F9).
155 | 


--------------------------------------------------------------------------------
/src/idna_logger.hrl:
--------------------------------------------------------------------------------
%% Logging compatibility shim.
%% When the OTP_RELEASE macro is defined, the logging macros come from
%% kernel's logger.hrl. Otherwise LOG_INFO, LOG_ERROR and LOG_WARNING are
%% defined here on top of error_logger, taking the same (Format, Args) pair.
-ifdef('OTP_RELEASE').
-include_lib("kernel/include/logger.hrl").
-else.
-define(LOG_INFO(Format, Args), error_logger:info_msg(Format, Args)).
-define(LOG_ERROR(Format, Args), error_logger:error_msg(Format, Args)).
-define(LOG_WARNING(Format, Args), error_logger:warning_msg(Format, Args)).
-endif.


--------------------------------------------------------------------------------
/src/idna_ucs.erl:
--------------------------------------------------------------------------------
  1 | %%% -*- erlang -*-
  2 | %%
  3 | %% Copyright Ericsson AB 2005-2016. All Rights Reserved.
  4 | %%
  5 | %% Licensed under the Apache License, Version 2.0 (the "License");
  6 | %% you may not use this file except in compliance with the License.
  7 | %% You may obtain a copy of the License at
  8 | %%
  9 | %%     http://www.apache.org/licenses/LICENSE-2.0
 10 | %%
 11 | %% Unless required by applicable law or agreed to in writing, software
 12 | %% distributed under the License is distributed on an "AS IS" BASIS,
 13 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | %% See the License for the specific language governing permissions and
 15 | %% limitations under the License.
 16 | 
 17 | 
 18 | -module(idna_ucs).
 19 | 
 20 | -compile([verbose,report_warnings,warn_unused_vars]).
 21 | 
 22 | 
%%% Miscellaneous predicates
 24 | -export([is_iso10646/1, is_unicode/1, is_ascii/1]).
 25 | 
 26 | %%% UTF-8 encoding and decoding
 27 | -export([to_utf8/1, from_utf8/1]).
 28 | 
 29 | %%% Test if Ch is a legitimate ISO-10646 character code
 30 | is_iso10646(Ch) when is_integer(Ch), Ch >= 0 ->
 31 |   if Ch  < 16#D800 -> true;
 32 |     Ch  < 16#E000 -> false;	% Surrogates
 33 |     Ch  < 16#FFFE -> true;
 34 |     Ch =< 16#FFFF -> false;	% FFFE and FFFF (not characters)
 35 |     Ch =< 16#7FFFFFFF -> true;
 36 |     true -> false
 37 |   end;
 38 | is_iso10646(_) -> false.
 39 | 
 40 | %%% Test if Ch is a legitimate ISO-10646 character code capable of
 41 | %%% being encoded in a UTF-16 string.
 42 | is_unicode(Ch) when Ch < 16#110000 -> is_iso10646(Ch);
 43 | is_unicode(_) -> false.
 44 | 
 45 | %%% Test for legitimate ASCII code
 46 | is_ascii(Ch) when is_integer(Ch), Ch >= 0, Ch =< 127 -> true;
 47 | is_ascii(_) -> false.
 48 | 
 49 | 
 50 | %%% UTF-8 encoding and decoding
 51 | to_utf8(List) when is_list(List) -> lists:flatmap(fun to_utf8/1, List);
 52 | to_utf8(Ch) -> char_to_utf8(Ch).
 53 | 
 54 | from_utf8(Bin) when is_binary(Bin) -> from_utf8(binary_to_list(Bin));
 55 | from_utf8(List) ->
 56 |   case expand_utf8(List) of
 57 |     {Result,0} -> Result;
 58 |     {_Res,_NumBadChar} ->
 59 |       exit({ucs,{bad_utf8_character_code}})
 60 |   end.
 61 | 
 62 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 63 | %%% UTF-8 support
 64 | %%% Possible errors encoding UTF-8:
 65 | %%%	- Non-character values (something other than 0 .. 2^31-1).
 66 | %%%	- Surrogate pair code in string.
 67 | %%%	- 16#FFFE or 16#FFFF character in string.
 68 | %%% Possible errors decoding UTF-8:
 69 | %%%	- 10xxxxxx or 1111111x as initial byte.
 70 | %%%	- Insufficient number of 10xxxxxx octets following an initial octet of
 71 | %%%	multi-octet sequence.
 72 | %%% 	- Non-canonical encoding used.
 73 | %%%	- Surrogate-pair code encoded as UTF-8.
 74 | %%%	- 16#FFFE or 16#FFFF character in string.
 75 | char_to_utf8(Ch) when is_integer(Ch), Ch >= 0 ->
 76 |   if Ch < 128 ->
 77 |     %% 0yyyyyyy
 78 |     [Ch];
 79 |     Ch < 16#800 ->
 80 |       %% 110xxxxy 10yyyyyy
 81 |       [16#C0 + (Ch bsr 6),
 82 |         128+(Ch band 16#3F)];
 83 |     Ch < 16#10000 ->
 84 |       %% 1110xxxx 10xyyyyy 10yyyyyy
 85 |       if Ch < 16#D800; Ch > 16#DFFF, Ch < 16#FFFE ->
 86 |         [16#E0 + (Ch bsr 12),
 87 |           128+((Ch bsr 6) band 16#3F),
 88 |           128+(Ch band 16#3F)]
 89 |       end;
 90 |     Ch < 16#200000 ->
 91 |       %% 11110xxx 10xxyyyy 10yyyyyy 10yyyyyy
 92 |       [16#F0+(Ch bsr 18),
 93 |         128+((Ch bsr 12) band 16#3F),
 94 |         128+((Ch bsr 6) band 16#3F),
 95 |         128+(Ch band 16#3F)];
 96 |     Ch < 16#4000000 ->
 97 |       %% 111110xx 10xxxyyy 10yyyyyy 10yyyyyy 10yyyyyy
 98 |       [16#F8+(Ch bsr 24),
 99 |         128+((Ch bsr 18) band 16#3F),
100 |         128+((Ch bsr 12) band 16#3F),
101 |         128+((Ch bsr 6) band 16#3F),
102 |         128+(Ch band 16#3F)];
103 |     Ch < 16#80000000 ->
104 |       %% 1111110x 10xxxxyy 10yyyyyy 10yyyyyy 10yyyyyy 10yyyyyy
105 |       [16#FC+(Ch bsr 30),
106 |         128+((Ch bsr 24) band 16#3F),
107 |         128+((Ch bsr 18) band 16#3F),
108 |         128+((Ch bsr 12) band 16#3F),
109 |         128+((Ch bsr 6) band 16#3F),
110 |         128+(Ch band 16#3F)]
111 |   end.
112 | 
113 | 
114 | 
115 | 
116 | %% expand_utf8([Byte]) -> {[UnicodeChar],NumberOfBadBytes}
117 | %%  Expand UTF8 byte sequences to ISO 10646/Unicode
118 | %%  characters. Any illegal bytes are removed and the number of
119 | %%  bad bytes are returned.
120 | %%
121 | %%  Reference:
122 | %%     RFC 3629: "UTF-8, a transformation format of ISO 10646".
123 | 
%% Decodes a list of UTF-8 bytes. Returns {Chars, Bad}: the decoded
%% character codes in input order, and the number of rejected sequences.
expand_utf8(Str) ->
  expand_utf8_1(Str, [], 0).

%% Acc holds the decoded characters in reverse order, Bad counts the
%% rejected bytes/sequences. A well-formed multi-byte sequence whose value
%% is out of range is dropped as a whole and counted once.
expand_utf8_1([C|Cs], Acc, Bad) when C < 16#80 ->
  %% Plain Ascii character.
  expand_utf8_1(Cs, [C|Acc], Bad);
expand_utf8_1([C1,C2|Cs], Acc, Bad) when C1 band 16#E0 =:= 16#C0,
                                         C2 band 16#C0 =:= 16#80 ->
  case ((C1 band 16#1F) bsl 6) bor (C2 band 16#3F) of
    C when 16#80 =< C ->
      expand_utf8_1(Cs, [C|Acc], Bad);
    _ ->
      %% Bad range (non-canonical encoding).
      expand_utf8_1(Cs, Acc, Bad+1)
  end;
expand_utf8_1([C1,C2,C3|Cs], Acc, Bad) when C1 band 16#F0 =:= 16#E0,
                                            C2 band 16#C0 =:= 16#80,
                                            C3 band 16#C0 =:= 16#80 ->
  case ((((C1 band 16#0F) bsl 6) bor (C2 band 16#3F)) bsl 6) bor
    (C3 band 16#3F) of
    %% Accept 16#800..16#D7FF and everything above the surrogate block;
    %% RFC 3629 forbids UTF-8 encoded surrogates (16#D800..16#DFFF).
    C when 16#800 =< C, C < 16#D800; C > 16#DFFF ->
      expand_utf8_1(Cs, [C|Acc], Bad);
    _ ->
      %% Bad range (non-canonical encoding or surrogate).
      expand_utf8_1(Cs, Acc, Bad+1)
  end;
expand_utf8_1([C1,C2,C3,C4|Cs], Acc, Bad) when C1 band 16#F8 =:= 16#F0,
                                               C2 band 16#C0 =:= 16#80,
                                               C3 band 16#C0 =:= 16#80,
                                               C4 band 16#C0 =:= 16#80 ->
  case ((((((C1 band 16#0F) bsl 6) bor (C2 band 16#3F)) bsl 6) bor
    (C3 band 16#3F)) bsl 6) bor (C4 band 16#3F) of
    %% RFC 3629 restricts UTF-8 to code points up to 16#10FFFF.
    C when 16#10000 =< C, C =< 16#10FFFF ->
      expand_utf8_1(Cs, [C|Acc], Bad);
    _ ->
      %% Bad range (non-canonical encoding or beyond 16#10FFFF).
      expand_utf8_1(Cs, Acc, Bad+1)
  end;
expand_utf8_1([_|Cs], Acc, Bad) ->
  %% Ignore bad character.
  expand_utf8_1(Cs, Acc, Bad+1);
expand_utf8_1([], Acc, Bad) -> {lists:reverse(Acc),Bad}.
166 | 


--------------------------------------------------------------------------------
/src/punycode.erl:
--------------------------------------------------------------------------------
  1 | %% -*- coding: utf-8 -*-
  2 | %%%
  3 | %%% This file is part of erlang-idna released under the MIT license.
  4 | %%% See the LICENSE for more information.
  5 | %%%
  6 | %% @doc Punycode ([RFC 3492](http://tools.ietf.org/html/rfc3492)) implementation.
  7 | 
  8 | -module(punycode).
  9 | 
 10 | 
 11 | -export([encode/1,
 12 |          decode/1]).
 13 | 
 14 | -define(BASE, 36).
 15 | -define(TMIN, 1).
 16 | -define(TMAX, 26).
 17 | -define(SKEW, 38).
 18 | -define(DAMP, 700).
 19 | -define(INITIAL_BIAS, 72).
 20 | -define(INITIAL_N, 128).
 21 | -define(DELIMITER, $-).
 22 | 
 23 | 
 24 | -define(MAX, 1 bsl 32 - 1).
 25 | 
 26 | %% @doc Convert Unicode to Punycode.
 27 | %%
 28 | %% exit with an overflow error on overflow, which can only happen on inputs
 29 | %% that would take more than 63 encoded bytes, the DNS limit on domain name labels.
 30 | -spec encode(string()) -> string().
 31 | encode(Input) ->
 32 |   Output0 = lists:filtermap(fun
 33 |                              (C) when C < 16#80 -> {true, C};
 34 |                              (_) -> false
 35 |                            end, Input),
 36 |   B = length(Output0),
 37 |   Output = case B > 0 of
 38 |              true -> Output0 ++ [?DELIMITER];
 39 |              false -> Output0
 40 |            end,
 41 |   H = B,
 42 |   encode(Input, Output, H, B, ?INITIAL_N, 0, ?INITIAL_BIAS).
 43 | 
 44 | 
 45 | encode(Input, Output, H, B, N, Delta, Bias) when H < length(Input) ->
 46 |   M = lists:min(lists:filter(fun(C) -> C >= N end, Input)),
 47 |   Delta1 = case (M - N) > ((?MAX - Delta) / (H +1)) of
 48 |              false -> Delta +  (M - N) * (H + 1);
 49 |              true -> exit(oveflow)
 50 |            end,
 51 |   {Output2, H2, Delta2, N2, Bias2} = encode1(Input, Output, H, B, M, Delta1, Bias),
 52 |   encode(Input, Output2, H2, B, N2, Delta2, Bias2);
 53 | encode(_, Output, _, _, _, _, _) ->
 54 |   Output.
 55 | 
 56 | encode1([C|Rest], Output, H, B, N, Delta, Bias) when C < N ->
 57 |   Delta2 = Delta + 1,
 58 |   case Delta2 of
 59 |     0 -> exit(oveflow);
 60 |     _ ->
 61 |       encode1(Rest, Output, H, B, N, Delta2, Bias)
 62 |   end;
 63 | encode1([C|Rest], Output, H, B, N, Delta, Bias) when C == N ->
 64 |   encode2(Rest, Output, H, B, N, Delta, Bias, Delta, ?BASE);
 65 | encode1([_|Rest], Output, H, B, N, Delta, Bias) ->
 66 |   encode1(Rest, Output, H, B, N, Delta, Bias);
 67 | encode1([], Output, H, _B, N, Delta, Bias) ->
 68 |   {Output, H, Delta + 1, N +1, Bias}.
 69 | 
 70 | encode2(Rest, Output, H, B, N, Delta, Bias, Q, K) ->
 71 |   T = if
 72 |         K =< Bias -> ?TMIN;
 73 |         K >= (Bias + ?TMAX) -> ?TMAX;
 74 |         true -> K - Bias
 75 |       end,
 76 |   case  Q < T of
 77 |     true ->
 78 |       CodePoint = to_digit(Q),
 79 |       Output2 = Output ++ [CodePoint],
 80 |       Bias2 = adapt(Delta, H +1, H == B),
 81 |       Delta2 = 0,
 82 |       H2 = H + 1,
 83 |       encode1(Rest, Output2, H2, B, N, Delta2, Bias2);
 84 |     false ->
 85 |       CodePoint = to_digit(T + ((Q - T) rem (?BASE - T))),
 86 |       Output2 = Output ++ [CodePoint],
 87 |       Q2 = (Q - T) div (?BASE - T),
 88 |       encode2(Rest, Output2, H, B, N, Delta, Bias, Q2, K + ?BASE)
 89 |   end.
 90 | 
 91 | to_digit(V) when V >= 0, V =< 25 -> V + $a;
 92 | to_digit(V) when V >= 26, V =< 35 -> V - 26 + $0;
 93 | to_digit(_) -> exit(badarg).
 94 | 
 95 | 
 96 | %% @doc Convert Punycode to Unicode.
 97 | %% exit with an overflow or badarg errors if malformed or overflow.
 98 | %% Overflow can only happen on inputs that take more than 63 encoded bytes,
 99 | %% the DNS limit on domain name labels.
100 | -spec decode(string()) -> string().
decode(Input) ->
  %% Everything before the last delimiter is copied verbatim to the
  %% output; what follows it is the encoded part.
  case string:rstr(Input, [?DELIMITER]) of
    0 ->
      decode(Input, "", ?INITIAL_N, ?INITIAL_BIAS, 0);
    Pos ->
      {Basic, [_Delimiter | Encoded]} = lists:split(Pos - 1, Input),
      decode(Encoded, Basic, ?INITIAL_N, ?INITIAL_BIAS, 0)
  end.
108 | 
%% Outer decoding loop: each round decodes one variable-length integer
%% and inserts the resulting code point into Output.
decode([], Output, _, _, _) -> Output;
decode(Input, Output, N, Bias, I) ->
  decode(Input, Output, N, Bias, I, I, 1, ?BASE).

%% Decodes the digits of one variable-length integer into I0.
%% Exits with `badarg' on an invalid digit or when the input ends in the
%% middle of an integer, and with `overflow' when a value would exceed
%% ?MAX.
decode([], _Output, _N, _Bias, _I0, _OldI, _Weight, _K) ->
  %% The previous digit announced a continuation but nothing is left.
  exit(badarg);
decode([C|Rest], Output, N, Bias, I0, OldI, Weight, K) ->
  Digit = digit(C),
  I1 = case Digit > (((?MAX) - I0) div Weight) of
         false -> I0 + (Digit * Weight);
         true -> exit(overflow)
       end,
  T = if
        K =< Bias -> ?TMIN;
        K >= (Bias + ?TMAX) -> ?TMAX;
        true -> K - Bias
      end,
  case Digit < T of
    true ->
      %% Last digit: I1 now encodes both the code point increment and
      %% the insertion position.
      Len = length(Output),
      Bias2 = adapt(I1 - OldI, Len + 1, (OldI =:= 0)),
      {N2, I2}= case (I1 div (Len +1)) > ((?MAX) - N) of
                  false ->
                    {N + (I1 div (Len + 1)), I1 rem (Len + 1)};
                  true ->
                    exit(overflow)
                end,
      Output2 = insert(Output, N2, [], I2),
      decode(Rest, Output2, N2, Bias2, I2+1);
    false ->
      %% ?MAX is parenthesised explicitly so that the division applies
      %% to the whole constant.
      case Weight > ((?MAX) div (?BASE - T)) of
        false ->
          decode(Rest, Output, N, Bias, I1, OldI, Weight * (?BASE - T), K + ?BASE);
        true ->
          exit(overflow)
      end
  end.
144 | 
%% Inserts CP after the first I elements of Tail and prepends Head to the
%% result. Exits with `overflow' when I is larger than the length of Tail.
insert(Tail, CP, Head, I) when is_integer(I), I >= 0 ->
  case I > length(Tail) of
    true ->
      exit(overflow);
    false ->
      %% One split instead of appending to Head element by element.
      {Before, After} = lists:split(I, Tail),
      Head ++ Before ++ [CP | After]
  end.
151 | 
152 | 
%% Maps a character to its digit value: $0..$9 -> 26..35, letters of
%% either case -> 0..25. Any other character exits with `badarg'.
digit(C) ->
  if
    C >= $0, C =< $9 -> 26 + (C - $0);
    C >= $A, C =< $Z -> C - $A;
    C >= $a, C =< $z -> C - $a;
    true -> exit(badarg)
  end.
157 | 
%% Bias adaptation: Delta is scaled down by ?DAMP on the first call and
%% by 2 afterwards, increased by its share per code point, then reduced
%% by adapt/2.
adapt(Delta, NumPoints, FirstTime) ->
  Divisor = case FirstTime of
              true -> ?DAMP;
              false -> 2
            end,
  Scaled = Delta div Divisor,
  adapt(Scaled + (Scaled div NumPoints), 0).

%% Divides Delta by (?BASE - ?TMIN) while it is above the limit, adding
%% ?BASE to K each time, then computes the final bias.
adapt(Delta, K) when Delta > (((?BASE - ?TMIN) * ?TMAX) div 2) ->
  adapt(Delta div (?BASE - ?TMIN), K + ?BASE);
adapt(Delta, K) ->
  K + (((?BASE - ?TMIN + 1) * Delta) div (Delta + ?SKEW)).


--------------------------------------------------------------------------------
/test/compat_test.erl:
--------------------------------------------------------------------------------
 1 | %%%
 2 | %%% This file is part of erlang-idna released under the MIT license.
 3 | %%% See the LICENSE for more information.
 4 | %%%
 5 | -module(compat_test).
 6 | -author("benoitc").
 7 | 
 8 | %% API
 9 | -export([to_ascii_test/0, to_unicode_test/0]).
10 | 
11 | 
12 | -include_lib("eunit/include/eunit.hrl").
13 | 
%% A Unicode label and an already encoded label give the same ASCII form.
to_ascii_test() ->
  Expected = "xn--zckzah.xn--zckzah",
  ?assertEqual(Expected, idna:to_ascii("テスト.xn--zckzah")).
16 | 
%% Both encoded labels decode to the same three code points.
to_unicode_test() ->
  Label = [12486,12473,12488],
  ?assertEqual(Label ++ "." ++ Label, idna:to_unicode("xn--zckzah.xn--zckzah")).


--------------------------------------------------------------------------------
/test/idna_test.erl:
--------------------------------------------------------------------------------
  1 | %%%
  2 | %%% This file is part of erlang-idna released under the MIT license.
  3 | %%% See the LICENSE for more information.
  4 | %%%
  5 | -module(idna_test).
  6 | -author("benoitc").
  7 | 
  8 | 
%% Fixture shared by alabels_test/0 and ulabels_test/0: a list of
%% {ULabel, ALabel} pairs, where ULabel is a list of code points and
%% ALabel its expected "xn--" encoded form.
-define(tld_strings, [
  {[16#6d4b,16#8bd5], "xn--0zwm56d"},
  {[16#092a,16#0930,16#0940,16#0915,16#094d,16#0937,16#093e], "xn--11b5bs3a9aj6g"},
  {[16#d55c,16#ad6d], "xn--3e0b707e"},
  {[16#09ad,16#09be,16#09b0,16#09a4], "xn--45brj9c"},
  {[16#09ac,16#09be,16#0982,16#09b2,16#09be], "xn--54b7fta0cc"},
  {[16#0438,16#0441,16#043f,16#044b,16#0442,16#0430,16#043d,16#0438,16#0435], "xn--80akhbyknj4f"},
  {[16#0441,16#0440,16#0431], "xn--90a3ac"},
  {[16#d14c,16#c2a4,16#d2b8], "xn--9t4b11yi5a"},
  {[16#0b9a,16#0bbf,16#0b99,16#0bcd,16#0b95,16#0baa,16#0bcd,16#0baa,16#0bc2,16#0bb0,16#0bcd], "xn--clchc0ea0b2g2a9gcd"},
  {[16#05d8,16#05e2,16#05e1,16#05d8], "xn--deba0ad"},
  {[16#4e2d,16#56fd], "xn--fiqs8s"},
  {[16#4e2d,16#570b], "xn--fiqz9s"},
  {[16#0c2d,16#0c3e,16#0c30,16#0c24,16#0c4d], "xn--fpcrj9c3d"},
  {[16#0dbd,16#0d82,16#0d9a,16#0dcf], "xn--fzc2c9e2c"},
  {[16#6e2c,16#8a66], "xn--g6w251d"},
  {[16#0aad,16#0abe,16#0ab0,16#0aa4], "xn--gecrj9c"},
  {[16#092d,16#093e,16#0930,16#0924], "xn--h2brj9c"},
  {[16#0622,16#0632,16#0645,16#0627,16#06cc,16#0634,16#06cc], "xn--hgbk6aj7f53bba"},
  {[16#0baa,16#0bb0,16#0bbf,16#0b9f,16#0bcd,16#0b9a,16#0bc8], "xn--hlcj6aya9esc7a"},
  {[16#0443,16#043a,16#0440], "xn--j1amh"},
  {[16#9999,16#6e2f], "xn--j6w193g"},
  {[16#03b4,16#03bf,16#03ba,16#03b9,16#03bc,16#03ae], "xn--jxalpdlp"},
  {[16#0625,16#062e,16#062a,16#0628,16#0627,16#0631], "xn--kgbechtv"},
  {[16#53f0,16#6e7e], "xn--kprw13d"},
  {[16#53f0,16#7063], "xn--kpry57d"},
  {[16#0627,16#0644,16#062c,16#0632,16#0627,16#0626,16#0631], "xn--lgbbat1ad8j"},
  {[16#0639,16#0645,16#0627,16#0646], "xn--mgb9awbf"},
  {[16#0627,16#06cc,16#0631,16#0627,16#0646], "xn--mgba3a4f16a"},
  {[16#0627,16#0645,16#0627,16#0631,16#0627,16#062a], "xn--mgbaam7a8h"},
  {[16#067e,16#0627,16#06a9,16#0633,16#062a,16#0627,16#0646], "xn--mgbai9azgqp6j"},
  {[16#0627,16#0644,16#0627,16#0631,16#062f,16#0646], "xn--mgbayh7gpa"},
  {[16#0628,16#06be,16#0627,16#0631,16#062a], "xn--mgbbh1a71e"},
  {[16#0627,16#0644,16#0645,16#063a,16#0631,16#0628], "xn--mgbc0a9azcg"},
  {[16#0627,16#0644,16#0633,16#0639,16#0648,16#062f,16#064a,16#0629], "xn--mgberp4a5d4ar"},
  {[16#10d2,16#10d4], "xn--node"},
  {[16#0e44,16#0e17,16#0e22], "xn--o3cw4h"},
  {[16#0633,16#0648,16#0631,16#064a,16#0629], "xn--ogbpf8fl"},
  {[16#0440,16#0444], "xn--p1ai"},
  {[16#062a,16#0648,16#0646,16#0633], "xn--pgbs0dh"},
  {[16#0a2d,16#0a3e,16#0a30,16#0a24], "xn--s9brj9c"},
  {[16#0645,16#0635,16#0631], "xn--wgbh1c"},
  {[16#0642,16#0637,16#0631], "xn--wgbl6a"},
  {[16#0b87,16#0bb2,16#0b99,16#0bcd,16#0b95,16#0bc8], "xn--xkc2al3hye2a"},
  {[16#0b87,16#0ba8,16#0bcd,16#0ba4,16#0bbf,16#0baf,16#0bbe], "xn--xkc2dl3a5ee0h"},
  {[16#65b0,16#52a0,16#5761], "xn--yfro4i67o"},
  {[16#0641,16#0644,16#0633,16#0637,16#064a,16#0646], "xn--ygbi2ammx"},
  {[16#30c6,16#30b9,16#30c8], "xn--zckzah"},
  {[16#049b,16#0430,16#0437], "xn--80ao21a"},
  {[16#0645,16#0644,16#064a,16#0633,16#064a,16#0627], "xn--mgbx4cd0ab"},
  {[16#043c,16#043e,16#043d], "xn--l1acc"},
  {[16#0633,16#0648,16#062f,16#0627,16#0646], "xn--mgbpl2fh"}
]).
 62 | 
 63 | -include_lib("eunit/include/eunit.hrl").
 64 | 
 65 | alabels_test() ->
 66 |   lists:foreach(
 67 |     fun({ULabel, ALabel}) ->
 68 |       ?assertEqual(ALabel, idna:alabel(ULabel))
 69 |     end,
 70 |     ?tld_strings
 71 |   ).
 72 | 
 73 | ulabels_test() ->
 74 |   lists:foreach(
 75 |     fun({ULabel, ALabel}) ->
 76 |       ?assertEqual(ULabel, idna:ulabel(ALabel))
 77 |     end,
 78 |     ?tld_strings
 79 |   ).
 80 | 
 81 | check_label_length_test() ->
 82 |   ?assertEqual(ok, idna:check_label_length([$a || _ <- lists:seq(1, 63)])),
 83 |   ?assertExit({bad_label, {too_long, _Error}}, idna:check_label_length([$a || _ <- lists:seq(1, 64)])),
 84 |   ?assertExit({bad_label, {too_long, _Error}}, idna:encode([$a || _ <- lists:seq(1, 64)])).
 85 | 
 86 | check_bidi_test() ->
 87 |   L = [16#0061],
 88 |   R = [16#05d0],
 89 |   AL= [16#0627],
 90 |   AN = [16#0660],
 91 |   EN = [16#0030],
 92 |   ES = [16#002d],
 93 |   CS = [16#002c],
 94 |   ET = [16#0024],
 95 |   ON = [16#0021],
 96 |   BN = [16#200c],
 97 |   NSM = [16#0610],
 98 |   WS = [16#0020],
 99 | 
100 |   %% RFC 5893 Rule 1
101 |   ok = idna_bidi:check_bidi(L),
102 |   ok = idna_bidi:check_bidi(R),
103 |   ok = idna_bidi:check_bidi(AL),
104 |   ?assertExit({bad_label, {bidi, _}}, idna_bidi:check_bidi(AN)),
105 | 
106 |   %% RFC 5893 Rule 2
107 |   ok = idna_bidi:check_bidi(R ++ AL),
108 |   ok = idna_bidi:check_bidi(R ++ AN),
109 |   ok = idna_bidi:check_bidi(R ++ EN),
110 |   ok = idna_bidi:check_bidi(R ++ ES ++ AL),
111 |   ok = idna_bidi:check_bidi(R ++ CS ++ AL),
112 |   ok = idna_bidi:check_bidi(R ++ ET ++ AL),
113 |   ok = idna_bidi:check_bidi(R ++ ON ++ AL),
114 |   ok = idna_bidi:check_bidi(R ++ BN ++ AL),
115 |   ok = idna_bidi:check_bidi(R ++ NSM),
116 |   ?assertExit({bad_label, {bidi, _}}, idna_bidi:check_bidi(R ++ L)),
117 |   ?assertExit({bad_label, {bidi, _}}, idna_bidi:check_bidi(R ++ WS)),
118 | 
119 |   %% RFC 5893 Rule 3
120 |   ok = idna_bidi:check_bidi(R ++ AL),
121 |   ok = idna_bidi:check_bidi(R ++ EN),
122 |   ok = idna_bidi:check_bidi(R ++ AN),
123 |   ok = idna_bidi:check_bidi(R ++ NSM),
124 |   ok = idna_bidi:check_bidi(R ++ NSM ++ NSM),
125 |   ?assertExit({bad_label, {bidi, _}}, idna_bidi:check_bidi(R ++ ON)),
126 | 
127 |   %% RFC 5893 Rule 4
128 |   ok = idna_bidi:check_bidi(R ++ EN),
129 |   ok = idna_bidi:check_bidi(R ++ AN),
130 |   ?assertExit({bad_label, {bidi, _}}, idna_bidi:check_bidi(R ++ EN ++ AN)),
131 |   ?assertExit({bad_label, {bidi, _}}, idna_bidi:check_bidi(R ++ AN ++ EN)),
132 | 
133 |   %% RFC 5893 Rule 5
134 |   ok = idna_bidi:check_bidi(L ++ EN, true),
135 |   ok = idna_bidi:check_bidi(L ++ ES ++ L, true),
136 |   ok = idna_bidi:check_bidi(L ++ CS ++ L, true),
137 |   ok = idna_bidi:check_bidi(L ++ ET ++ L, true),
138 |   ok = idna_bidi:check_bidi(L ++ ON ++ L, true),
139 |   ok = idna_bidi:check_bidi(L ++ BN ++ L, true),
140 |   ok = idna_bidi:check_bidi(L ++ NSM, true),
141 | 
142 |   %% RFC 5893 Rule 6
143 |   ok = idna_bidi:check_bidi(L ++ L, true),
144 |   ok = idna_bidi:check_bidi(L ++ EN, true),
145 |   ok = idna_bidi:check_bidi(L ++ EN ++ NSM, true),
146 |   ok = idna_bidi:check_bidi(L ++ EN ++ NSM ++ NSM, true),
147 |   ?assertExit({bad_label, {bidi, _}}, idna_bidi:check_bidi(L ++ CS, true)).
148 | 
%% A combining mark (U+0300) is accepted after a letter but not as the
%% first code point of a label.
check_initial_combiner_test() ->
  Mark = [16#0300],
  Letter = [16#0061],

  lists:foreach(
    fun(Label) -> ok = idna:check_initial_combiner(Label) end,
    [Letter, Letter ++ Mark]
  ),
  ?assertExit({bad_label, {initial_combiner, _}},idna:check_initial_combiner(Mark ++ Letter)).
156 | 
%% Labels without a leading or trailing hyphen pass; labels that start or
%% end with one are rejected.
check_hyphen_test() ->
  lists:foreach(
    fun(Label) -> ok = idna:check_hyphen(Label) end,
    ["abc", "a--b"]
  ),
  ?assertExit({bad_label, {hyphen, _}},idna:check_hyphen("aa--")),
  ?assertExit({bad_label, {hyphen, _}},idna:check_hyphen("a-")),
  ?assertExit({bad_label, {hyphen, _}},idna:check_hyphen("-a")).
163 | 
164 | 
%% For both joiners: rejected on its own and after a Latin letter,
%% accepted right after a virama.
valid_contextj_test() ->
  Virama = [16#094d],
  Latin = [16#0061],
  Check =
    fun(Joiner) ->
      false = idna_context:valid_contextj(Joiner, 0),
      false = idna_context:valid_contextj(Latin ++ Joiner, 1),
      true = idna_context:valid_contextj(Virama ++ Joiner, 1)
    end,

  % RFC 5892 Appendix A.1 (Zero Width Non-Joiner)
  Check([16#200c]),

  % RFC 5892 Appendix A.2 (Zero Width Joiner)
  Check([16#200d]).
180 | 
181 | 
%% Exercises idna_context:valid_contexto/2 against RFC 5892 rules A.3 to
%% A.9. The second argument is the zero-based offset of the code point
%% under test.
valid_contexto_test() ->
  Latin = [16#0061],
  Latin_l = [16#006c],
  Greek = [16#03b1],
  Hebrew = [16#05d0],
  Katakana = [16#30a1],
  Hiragana = [16#3041],
  Han = [16#6f22],
  Arabic_digit = [16#0660],
  Ext_arabic_digit = [16#06f0],

  % RFC 5892 Rule A.3 (Middle Dot)
  Latin_middle_dot = [16#00b7],
  true = idna_context:valid_contexto(Latin_l ++ Latin_middle_dot ++  Latin_l, 1),
  false = idna_context:valid_contexto(Latin_middle_dot ++ Latin_l, 1),
  false = idna_context:valid_contexto(Latin_l ++ Latin_middle_dot, 0),
  false = idna_context:valid_contexto(Latin_middle_dot, 0),
  false = idna_context:valid_contexto(Latin_l ++ Latin_middle_dot ++ Latin, 1),
  true = idna_context:valid_contexto("ru" ++ Latin_l ++ Latin_middle_dot ++  Latin_l ++ "z", 3),
  false = idna_context:valid_contexto("ru" ++ Latin ++ Latin_middle_dot ++  Latin_l ++ "z", 3),

  % RFC 5892 Rule A.4 (Greek Lower Numeral Sign)
  Glns = [16#0375],
  true = idna_context:valid_contexto(Glns ++ Greek, 0),
  false = idna_context:valid_contexto(Glns ++ Latin, 0),
  false = idna_context:valid_contexto(Glns, 0),
  false = idna_context:valid_contexto(Greek ++ Glns, 1),

  % RFC 5892 Rule A.5 (Hebrew Punctuation Geresh)
  Geresh = [16#05f3],
  true = idna_context:valid_contexto(Hebrew ++ Geresh, 1),
  false = idna_context:valid_contexto(Latin ++ Geresh, 1),

  % RFC 5892 Rule A.6 (Hebrew Punctuation Gershayim)
  Gershayim = [16#05f4],
  true = idna_context:valid_contexto(Hebrew ++ Gershayim, 1),
  false = idna_context:valid_contexto(Latin ++ Gershayim, 1),

  % RFC 5892 Rule A.7 (Katakana Middle Dot)
  Ja_middle_dot = [16#30fb],
  true = idna_context:valid_contexto(Katakana ++ Ja_middle_dot ++ Katakana, 1),
  true = idna_context:valid_contexto(Hiragana ++ Ja_middle_dot ++ Hiragana, 1),
  true = idna_context:valid_contexto(Han ++ Ja_middle_dot ++ Han, 1),
  true = idna_context:valid_contexto(Han ++ Ja_middle_dot ++ Latin, 1),
  true = idna_context:valid_contexto([16#6f22, 16#30fb, 16#5b57], 1),
  false = idna_context:valid_contexto([16#0061, 16#30fb, 16#0061], 1),

  % RFC 5892 Rule A.8 (Arabic-Indic Digits): must not be mixed with
  % extended Arabic-Indic digits.
  true = idna_context:valid_contexto(Arabic_digit ++ Arabic_digit, 0),
  false = idna_context:valid_contexto(Arabic_digit ++ Ext_arabic_digit, 0),

  % RFC 5892 Rule A.9 (Extended Arabic-Indic Digits): must not be mixed
  % with Arabic-Indic digits.
  true = idna_context:valid_contexto(Ext_arabic_digit ++ Ext_arabic_digit, 0),
  false = idna_context:valid_contexto(Ext_arabic_digit ++ Arabic_digit, 0).
236 | 
%% idna:encode/1,2 on already encoded, Unicode, plain ASCII and mixed
%% domains, plus the rejected inputs.
encode_test() ->
  Tesuto = [16#30c6,16#30b9,16#30c8],
  %% Shared tail of the two long labels below.
  Tail = [$-,16#0523,16#0523,$-,$-,$-,$-,$-,16#0521,16#0523,16#0523,16#0523|".aa"],
  MaxLabel = lists:duplicate(63, $a),

  ?assertEqual("xn--zckzah.xn--zckzah", idna:encode("xn--zckzah.xn--zckzah")),
  ?assertEqual("xn--zckzah.xn--zckzah", idna:encode(Tesuto ++ "." ++ Tesuto)),
  ?assertEqual("abc.abc", idna:encode("abc.abc")),
  ?assertEqual("xn--zckzah.abc", idna:encode("xn--zckzah.abc")),
  ?assertEqual("xn--zckzah.abc", idna:encode(Tesuto ++ ".abc")),
  ?assertEqual(
    "xn---------90gglbagaar.aa",
    idna:encode([16#0521,16#0525,16#0523|Tail])
  ),
  ?assertExit(
    {bad_label, {_, _}},
    idna:encode([16#0521,16#0524,16#0523|Tail], [{uts46, false}])
  ),
  ?assertEqual(MaxLabel, idna:encode(MaxLabel)),
  ?assertExit({bad_label, {_, _}}, idna:encode([$a | MaxLabel])).
256 | 
257 | 
%% idna:decode/1 on encoded, mixed, Unicode and plain ASCII domains, plus
%% the rejected inputs.
decode_test() ->
  Tesuto = [16#30c6, 16#30b9, 16#30c8],
  Decoded = Tesuto ++ "." ++ Tesuto,

  ?assertEqual(Decoded, idna:decode("xn--zckzah.xn--zckzah")),
  ?assertEqual(Decoded, idna:decode(Tesuto ++ ".xn--zckzah")),
  ?assertEqual(Decoded, idna:decode(Decoded)),
  ?assertEqual("abc.abc", idna:decode("abc.abc")),
  ?assertEqual(
    [16#0521,16#0525,16#0523,$-,16#0523,16#0523,$-,$-,$-,$-,$-,16#0521,16#0523,16#0523,16#0523|".aa"],
    idna:decode("xn---------90gglbagaar.aa")
  ),
  ?assertExit({bad_label, {_, _}}, idna:decode("XN---------90GGLBAGAAC.AA")),
  ?assertExit({bad_label, {_, _}}, idna:decode("xn---------90gglbagaac.aa")).


--------------------------------------------------------------------------------
/test/punycode_test.erl:
--------------------------------------------------------------------------------
  1 | %%%
  2 | %%% This file is part of erlang-idna released under the MIT license.
  3 | %%% See the LICENSE for more information.
  4 | %%%
  5 | -module(punycode_test).
  6 | -author("benoitc").
  7 | 
  8 | -export([punicode_encode_test/0, punicode_decode_test/0]).
  9 | 
 10 | -include_lib("eunit/include/eunit.hrl").
 11 | 
 12 | data() ->
 13 |   [
 14 |     {"(A) Arabic (Egyptian)",
 15 |       [16#0644, 16#064A, 16#0647, 16#0645, 16#0627, 16#0628, 16#062A, 16#0643,
 16 |         16#0644, 16#0645, 16#0648, 16#0634, 16#0639, 16#0631, 16#0628, 16#064A,
 17 |         16#061F],
 18 |       "egbpdaj6bu4bxfgehfvwxn"
 19 |     },
 20 | 
 21 |     {
 22 |       "(B) Chinese (simplified)",
 23 |       [16#4ED6, 16#4EEC, 16#4E3A, 16#4EC0, 16#4E48, 16#4E0D, 16#8BF4, 16#4E2D,
 24 |         16#6587],
 25 |       "ihqwcrb4cv8a8dqg056pqjye"
 26 | 
 27 |     },
 28 |     {
 29 |       "(C) Chinese (traditional)",
 30 |       [16#4ED6, 16#5011, 16#7232, 16#4EC0, 16#9EBD, 16#4E0D, 16#8AAA, 16#4E2D,
 31 |         16#6587],
 32 |       "ihqwctvzc91f659drss3x8bo0yb"
 33 |     },
 34 |     {
 35 |       "(D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky",
 36 |       [16#0050, 16#0072, 16#006F, 16#010D, 16#0070, 16#0072, 16#006F, 16#0073,
 37 |         16#0074, 16#011B, 16#006E, 16#0065, 16#006D, 16#006C, 16#0075, 16#0076,
 38 |         16#00ED, 16#010D, 16#0065, 16#0073, 16#006B, 16#0079],
 39 |       "Proprostnemluvesky-uyb24dma41a"
 40 |     },
 41 |     {
 42 |       "(E) Hebrew:",
 43 |       [16#05DC, 16#05DE, 16#05D4, 16#05D4, 16#05DD, 16#05E4, 16#05E9, 16#05D5,
 44 |         16#05D8, 16#05DC, 16#05D0, 16#05DE, 16#05D3, 16#05D1, 16#05E8, 16#05D9,
 45 |         16#05DD, 16#05E2, 16#05D1, 16#05E8, 16#05D9, 16#05EA],
 46 |       "4dbcagdahymbxekheh6e0a7fei0b"
 47 |     },
 48 |     {
 49 |       "(F) Hindi (Devanagari):",
 50 |       [16#092F, 16#0939, 16#0932, 16#094B, 16#0917, 16#0939, 16#093F, 16#0928,
 51 |         16#094D, 16#0926, 16#0940, 16#0915, 16#094D, 16#092F, 16#094B, 16#0902,
 52 |         16#0928, 16#0939, 16#0940, 16#0902, 16#092C, 16#094B, 16#0932, 16#0938,
 53 |         16#0915, 16#0924, 16#0947, 16#0939, 16#0948, 16#0902],
 54 |       "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"
 55 |     },
 56 |     {
 57 |       "(G) Japanese (kanji and hiragana):",
 58 |       [16#306A, 16#305C, 16#307F, 16#3093, 16#306A, 16#65E5, 16#672C, 16#8A9E,
 59 |         16#3092, 16#8A71, 16#3057, 16#3066, 16#304F, 16#308C, 16#306A, 16#3044,
 60 |         16#306E, 16#304B],
 61 |       "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"
 62 |     },
 63 |     {
 64 |       "(H) Korean (Hangul syllables):",
 65 |       [ 16#C138, 16#ACC4, 16#C758, 16#BAA8, 16#B4E0, 16#C0AC, 16#B78C, 16#B4E4,
 66 |         16#C774, 16#D55C, 16#AD6D, 16#C5B4, 16#B97C, 16#C774, 16#D574, 16#D55C,
 67 |         16#B2E4, 16#BA74, 16#C5BC, 16#B9C8, 16#B098, 16#C88B, 16#C744, 16#AE4C],
 68 |       "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5jpsd879ccm6fea98c"},
 69 |     {
 70 |       "(I) Russian (Cyrillic):",
 71 |       [16#043F, 16#043E, 16#0447, 16#0435, 16#043C, 16#0443, 16#0436, 16#0435,
 72 |         16#043E, 16#043D, 16#0438, 16#043D, 16#0435, 16#0433, 16#043E, 16#0432,
 73 |         16#043E, 16#0440, 16#044F, 16#0442, 16#043F, 16#043E, 16#0440, 16#0443,
 74 |         16#0441, 16#0441, 16#043A, 16#0438],
 75 |       "b1abfaaepdrnnbgefbadotcwatmq2g4l"
 76 |     },
 77 |     {
 78 |       "(J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol",
 79 |       [16#0050, 16#006F, 16#0072, 16#0071, 16#0075, 16#00E9, 16#006E, 16#006F,
 80 |         16#0070, 16#0075, 16#0065, 16#0064, 16#0065, 16#006E, 16#0073, 16#0069,
 81 |         16#006D, 16#0070, 16#006C, 16#0065, 16#006D, 16#0065, 16#006E, 16#0074,
 82 |         16#0065, 16#0068, 16#0061, 16#0062, 16#006C, 16#0061, 16#0072, 16#0065,
 83 |         16#006E, 16#0045, 16#0073, 16#0070, 16#0061, 16#00F1, 16#006F, 16#006C],
 84 |       "PorqunopuedensimplementehablarenEspaol-fmd56a"
 85 |     },
 86 |     {
 87 |       "(K) Vietnamese:",
 88 |       [16#0054, 16#1EA1, 16#0069, 16#0073, 16#0061, 16#006F, 16#0068, 16#1ECD,
 89 |         16#006B, 16#0068, 16#00F4, 16#006E, 16#0067, 16#0074, 16#0068, 16#1EC3,
 90 |         16#0063, 16#0068, 16#1EC9, 16#006E, 16#00F3, 16#0069, 16#0074, 16#0069,
 91 |         16#1EBF, 16#006E, 16#0067, 16#0056, 16#0069, 16#1EC7, 16#0074],
 92 |       "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"
 93 |     },
 94 |     {
 95 |       "(L) 3<nen>B<gumi><kinpachi><sensei>",
 96 |       [16#0033, 16#5E74, 16#0042, 16#7D44, 16#91D1, 16#516B, 16#5148, 16#751F],
 97 |       "3B-ww4c5e180e575a65lsy2b"
 98 |     },
 99 |     {
100 |       "(M) <amuro><namie>-with-SUPER-MONKEYS",
101 |       [16#5B89, 16#5BA4, 16#5948, 16#7F8E, 16#6075, 16#002D, 16#0077, 16#0069,
102 |         16#0074, 16#0068, 16#002D, 16#0053, 16#0055, 16#0050, 16#0045, 16#0052,
103 |         16#002D, 16#004D, 16#004F, 16#004E, 16#004B, 16#0045, 16#0059, 16#0053],
104 |       "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"
105 |     },
106 |     {
107 |       "(N) Hello-Another-Way-<sorezore><no><basho>",
108 |       [16#0048, 16#0065, 16#006C, 16#006C, 16#006F, 16#002D, 16#0041, 16#006E,
109 |         16#006F, 16#0074, 16#0068, 16#0065, 16#0072, 16#002D, 16#0057, 16#0061,
110 |         16#0079, 16#002D, 16#305D, 16#308C, 16#305E, 16#308C, 16#306E, 16#5834,
111 |         16#6240],
112 |       "Hello-Another-Way--fc4qua05auwb3674vfr0b"
113 |     },
114 |     {
115 |       "(O) <hitotsu><yane><no><shita>2",
116 |       [16#3072, 16#3068, 16#3064, 16#5C4B, 16#6839, 16#306E, 16#4E0B, 16#0032],
117 |       "2-u9tlzr9756bt3uc0v"
118 |     },
119 |     {
120 |       "(P) Maji<de>Koi<suru>5<byou><mae>",
121 |       [16#004D, 16#0061, 16#006A, 16#0069, 16#3067, 16#004B, 16#006F, 16#0069,
122 |         16#3059, 16#308B, 16#0035, 16#79D2, 16#524D],
123 |       "MajiKoi5-783gue6qz075azm5e"
124 |     },
125 |     {
126 |       "(Q) <pafii>de<runba>",
127 |       [16#30D1, 16#30D5, 16#30A3, 16#30FC, 16#0064, 16#0065, 16#30EB, 16#30F3, 16#30D0],
128 |       "de-jg4avhby1noc0d"
129 |     },
130 |     {
131 |       "(R) <sono><supiido><de>",
132 |       [16#305D, 16#306E, 16#30B9, 16#30D4, 16#30FC, 16#30C9, 16#3067],
133 |       "d9juau41awczczp"
134 |     },
135 |     {
136 |       "(S) -> $1.00 <-",
137 |       [16#002D, 16#003E, 16#0020, 16#0024, 16#0031, 16#002E, 16#0030, 16#0030,
138 |         16#0020, 16#003C, 16#002D],
139 |       "-> $1.00 <--"
140 |     }
141 |   ].
142 | 
143 | %% Encoding direction of the sample table: each code point list in data()
144 | %% must encode to the Punycode string stored alongside it.
145 | punicode_encode_test() ->
146 |   Check = fun({_Label, CodePoints, Encoded}) ->
147 |     ?assertEqual(Encoded, punycode:encode(CodePoints))
148 |   end,
149 |   lists:foreach(Check, data()).
150 | 
151 | %% Decoding direction of the sample table: each Punycode string in data()
152 | %% must decode back to the code point list stored alongside it.
153 | punicode_decode_test() ->
154 |   Check = fun({_Label, CodePoints, Encoded}) ->
155 |     ?assertEqual(CodePoints, punycode:decode(Encoded))
156 |   end,
157 |   lists:foreach(Check, data()).
158 | 


--------------------------------------------------------------------------------
/test/uts46_test.erl:
--------------------------------------------------------------------------------
  1 | %% -*- coding: utf-8 -*-
  2 | %%%
  3 | %%% This file is part of erlang-idna released under the MIT license.
  4 | %%% See the LICENSE for more information.
  5 | %%%
  6 | -module(uts46_test).
  7 | -author("benoitc").
  8 | 
  9 | -ifdef('OTP_RELEASE'). %% predefined from OTP 21 on: use the current string API
 10 | -define(chomp(Str), string:chomp(Str)).
 11 | -define(trim(Str, Dir), string:trim(Str, Dir)).
 12 | -define(trim(Str), string:trim(Str, both)).
 13 | -define(lexemes(Str, Pat), string:lexemes(Str, Pat)).
 14 | -else. %% older compilers: same macros on top of the legacy string functions
 15 | -define(chomp(Str), string:strip(Str, right, $\n)).
 16 | -define(trim(Str, Dir), string:strip(Str, Dir)).
 17 | -define(trim(Str), string:strip(Str, both)).
 18 | -define(lexemes(Str, Pat), string:tokens(Str, Pat)). %% tokens/2 returns a list of strings; strip/2 takes one string
 19 | -endif.
 20 | 
 21 | -include_lib("eunit/include/eunit.hrl").
 22 | 
 23 | %% Rows of IdnaTestV2.txt that uts46_conformance_test/0 does not check: a row is
 24 | %% skipped when its Source, ToAsciiN or ToAsciiT field equals one of these values.
 25 | -define(SKIP_TESTS, [
 26 |   [],
 27 |   "xn--r97c.",
 28 |   "𐋷.",
 29 |   "xn--pw9c.xn--fjb8658k",
 30 |   "0.xn--qny",
 31 |   "0.甯",
 32 |   "xn--hwe.xn--ss-ci1ub261a",
 33 |   "ss.xn--lgd921mvv0m",
 34 |   "xn--4xa.xn--1-gocmu97674d.",
 35 |   "xn--ghb2gxqia",
 36 |   "xn--4xa203s.xn--epb",
 37 |   "xn--ghb2g3qq34f",
 38 |   [49,121369,11798,46],
 39 |   [56,52,119355,46,66293,9959],
 40 |   [57,38529,11246,46]]).
 41 | 
 42 | %% Conformance run over IdnaTestV2.txt (loaded by load_file/0): every row that
 43 | %% is not listed in ?SKIP_TESTS is pushed through idna:decode/2 and idna:encode/2
 44 | %% and compared with the expectations stored in the row. Decoding uses
 45 | %% [uts46, {std3_rules, true}]; encoding uses [uts46, {transitional, false}]
 46 | %% and [uts46, {transitional, true}].
 47 | uts46_conformance_test() ->
 48 |   lists:foreach(fun assert_row/1, load_file()).
 49 | 
 50 | %% Row layout, as built by parse_tests/2:
 51 | %%   {Source, ToUnicode, ToUnicodeStatus,
 52 | %%    ToAsciiN, ToAsciiNStatus, ToAsciiT, ToAsciiTStatus}
 53 | %% A row is ignored as soon as its Source, ToAsciiN or ToAsciiT field appears
 54 | %% in ?SKIP_TESTS; otherwise the three comparisons below run in this order.
 55 | assert_row({Source, _, _, ToAsciiN, _, ToAsciiT, _} = Row) ->
 56 |   Keys = [Source, ToAsciiN, ToAsciiT],
 57 |   case lists:any(fun(Key) -> lists:member(Key, ?SKIP_TESTS) end, Keys) of
 58 |     true ->
 59 |       ok;
 60 |     false ->
 61 |       assert_to_unicode(Row),
 62 |       assert_to_ascii_n(Row),
 63 |       assert_to_ascii_t(Row)
 64 |   end.
 65 | 
 66 | %% ToUnicode: compared only when it is non-empty and its status list is empty.
 67 | assert_to_unicode({Source, ToUnicode, [], _, _, _, _}) when ToUnicode /= "" ->
 68 |   ?assertEqual(ToUnicode, idna:decode(Source, [uts46, {std3_rules, true}]));
 69 | assert_to_unicode(_Row) ->
 70 |   ok.
 71 | 
 72 | %% Non-transitional ToASCII: compared only when ToUnicode and ToAsciiN are both
 73 | %% non-empty and the ToAsciiN status list is empty.
 74 | assert_to_ascii_n({Source, ToUnicode, _, ToAsciiN, [], _, _})
 75 |     when ToUnicode /= [], ToAsciiN /= "" ->
 76 |   ?assertEqual(ToAsciiN, idna:encode(Source, [uts46, {transitional, false}]));
 77 | assert_to_ascii_n(_Row) ->
 78 |   ok.
 79 | 
 80 | %% Transitional ToASCII: compared only when ToAsciiT is non-empty and its
 81 | %% status list is empty.
 82 | assert_to_ascii_t({Source, _, _, _, _, ToAsciiT, []}) when ToAsciiT /= "" ->
 83 |   ?assertEqual(ToAsciiT, idna:encode(Source, [uts46, {transitional, true}]));
 84 | assert_to_ascii_t(_Row) ->
 85 |   ok.
 86 | 
 87 | %% Reads test/IdnaTestV2.txt (found via this module's beam path); returns the sorted rows.
 88 | load_file() ->
 89 |   EbinDir = filename:dirname(code:which(?MODULE)),
 90 |   AppPath = filename:dirname(EbinDir),
 91 |   Name = filename:join([AppPath, "test", "IdnaTestV2.txt"]),
 92 |   {ok, Tests} = file:open(Name, [read, {encoding, utf8}, {read_ahead, 1000000}]),
 93 |   try
 94 |     lists:sort(foldl(fun parse_tests/2, [], Tests))
 95 |   after file:close(Tests) %% runs even when a malformed line crashes the fold
 96 |   end.
 97 | 
 98 | %% Fold callback for load_file/0: parses one data line and prepends a 7-tuple to
 99 | %% Acc. Text after "#" is dropped and the rest is split on ";"; a row with fewer
100 | %% than seven fields gets one "" appended. ToAsciiN/ToAsciiT statuses that parse
101 | %% to [] inherit the ToUnicode status.
102 | parse_tests(Line0, Acc) ->
103 |   [Data | _Comment] = tokens(?chomp(Line0), "#"),
104 |   Fields = tokens(Data, ";"),
105 |   [Source, ToUnicode, UStatusStr, ToAsciiN, NStatusStr, ToAsciiT, TStatusStr] =
106 |     if length(Fields) > 6 -> Fields; true -> Fields ++ [""] end,
107 |   UStatus = parse_status(?trim(UStatusStr)),
108 |   Row = {parse_unicode(Source),
109 |          parse_unicode(ToUnicode), UStatus,
110 |          parse_unicode(ToAsciiN), status_or(NStatusStr, UStatus),
111 |          parse_unicode(ToAsciiT), status_or(TStatusStr, UStatus)},
112 |   [Row | Acc].
113 | 
114 | %% Parses a status field, falling back to Default when it parses to [].
115 | status_or(StatusStr, Default) ->
116 |   case parse_status(?trim(StatusStr)) of
117 |     [] -> Default;
118 |     Status -> Status
119 |   end.
120 | 
121 | 
122 | %% Trims both ends of one raw field of a test row; the characters in between
123 | %% are returned exactly as they were read.
124 | parse_unicode(Field) ->
125 |   ?trim(Field, both).
126 | 
127 | %% Returns S unchanged when every element passes idna_ucs:is_unicode/1 and
128 | %% otherwise converts it with idna_ucs:from_utf8/1.
129 | %% NOTE(review): no caller in this module - confirm whether it is still needed.
130 | to_unicode(S) ->
131 |   AllUnicode = lists:all(fun idna_ucs:is_unicode/1, S),
132 |   if AllUnicode -> S; true -> idna_ucs:from_utf8(S) end.
133 | 
134 | %% Parses a status field: "[B1, V6]" becomes ["B1", " V6"] (split on ",", blanks
135 | %% kept), while the empty statuses "", " " and "[]" all become [].
136 | parse_status(Blank) when Blank == ""; Blank == " "; Blank == "[]" -> [];
137 | parse_status("[" ++ Rest) ->
138 |   [Codes] = ?lexemes(Rest, "]"),
139 |   ?lexemes(Codes, ",").
140 | 
141 | 
142 | %% Folds Fun(Line, Acc) over the lines that io:get_line/2 reads from Fd.
143 | foldl(Fun, Acc, Fd) ->
144 |   NextLine = fun() -> io:get_line(Fd, "") end,
145 |   foldl_1(Fun, Acc, NextLine).
146 | 
147 | %% Pulls lines from Get() until it returns eof, feeding each data line to
148 | %% Fun(Line, Acc). Lines starting with "#" and bare "\n" lines are skipped.
149 | %% An accumulator of the form {done, Acc} stops the fold and yields Acc.
150 | foldl_1(_Fun, {done, Acc}, _Get) -> Acc;
151 | foldl_1(Fun, Acc, Get) -> fold_line(Get(), Fun, Acc, Get).
152 | 
153 | %% Dispatches on one result of Get().
154 | fold_line(eof, _Fun, Acc, _Get) -> Acc;
155 | fold_line("#" ++ _Comment, Fun, Acc, Get) -> foldl_1(Fun, Acc, Get);
156 | fold_line("\n", Fun, Acc, Get) -> foldl_1(Fun, Acc, Get);
157 | fold_line(Line, Fun, Acc, Get) -> foldl_1(Fun, Fun(Line, Acc), Get).
158 | 
159 | %% Splits S on the single separator character C. Unlike string:tokens/2, empty
160 | %% fields are kept wherever they occur - leading, trailing or between two
161 | %% separators - so ";a;;b;" yields ["", "a", "", "b", ""] and "" yields [""].
162 | %% Only one-character separator strings are accepted.
163 | tokens(S, [C]) ->
164 |   %% Walk the reversed input so that consing builds both the current token
165 |   %% and the token list in their original left-to-right order.
166 |   tokens(lists:reverse(S), C, [], []).
167 | 
168 | %% Tok is the field being collected, Toks the fields completed so far.
169 | tokens([Sep | Rest], Sep, Tok, Toks) ->
170 |   tokens(Rest, Sep, [], [Tok | Toks]);
171 | tokens([Ch | Rest], Sep, Tok, Toks) ->
172 |   tokens(Rest, Sep, [Ch | Tok], Toks);
173 | tokens([], _Sep, Tok, Toks) ->
174 |   %% Emit the last field even when empty: this is what keeps a leading "".
175 |   [Tok | Toks].
176 | 


--------------------------------------------------------------------------------
/uc_spec/ArabicShaping.txt:
--------------------------------------------------------------------------------
  1 | # ArabicShaping-13.0.0.txt
  2 | # Date: 2020-01-31, 23:55:00 GMT [KW, RP]
  3 | # © 2020 Unicode®, Inc.
  4 | # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
  5 | # For terms of use, see http://www.unicode.org/terms_of_use.html
  6 | #
  7 | # This file is a normative contributory data file in the
  8 | # Unicode Character Database.
  9 | #
 10 | # This file defines the Joining_Type and Joining_Group property
 11 | # values for Arabic, Syriac, N'Ko, Mandaic, and Manichaean positional
 12 | # shaping, repeating in machine readable form the information
 13 | # exemplified in Tables 9-3, 9-8, 9-9, 9-10, 9-14, 9-15, 9-16, 9-19,
 14 | # 9-20, 10-4, 10-5, 10-6, 10-7, and 19-5 of The Unicode Standard core
 15 | # specification. This file also defines Joining_Type values for
 16 | # Mongolian, Phags-pa, Psalter Pahlavi, Sogdian, Chorasmian, and Adlam positional shaping,
 17 | # and Joining_Type and Joining_Group values for Hanifi Rohingya positional shaping,
 18 | # which are not listed in tables in the standard.
 19 | #
 20 | # See Sections 9.2, 9.3, 9.5, 10.5, 10.6, 13.4, 14.3, 14.10, 16.14, 19.4, and 19.9
 21 | # of The Unicode Standard core specification for more information.
 22 | #
 23 | # Each line contains four fields, separated by a semicolon.
 24 | #
 25 | # Field 0: the code point, in 4-digit hexadecimal
 26 | #   form, of a character.
 27 | #
 28 | # Field 1: gives a short schematic name for that character.
 29 | #   The schematic name is descriptive of the shape, based as
 30 | #   consistently as possible on a name for the skeleton and
 31 | #   then the diacritic marks applied to the skeleton, if any.
 32 | #   Note that this schematic name is considered a comment,
 33 | #   and does not constitute a formal property value.
 34 | #
 35 | # Field 2: defines the joining type (property name: Joining_Type)
 36 | #   R Right_Joining
 37 | #   L Left_Joining
 38 | #   D Dual_Joining
 39 | #   C Join_Causing
 40 | #   U Non_Joining
 41 | #   T Transparent
 42 | #
 43 | # See Section 9.2, Arabic for more information on these joining types.
 44 | # Note that for cursive joining scripts which are typically rendered
 45 | # top-to-bottom, rather than right-to-left, Joining_Type=L conventionally
 46 | # refers to bottom joining, and Joining_Type=R conventionally refers
 47 | # to top joining. See Section 14.3, Phags-pa for more information on the
 48 | # interpretation of joining types in vertical layout.
 49 | #
 50 | # Field 3: defines the joining group (property name: Joining_Group)
 51 | #
 52 | # The values of the joining group are based schematically on character
 53 | # names. Where a schematic character name consists of two or more parts
 54 | # separated by spaces, the formal Joining_Group property value, as specified in
 55 | # PropertyValueAliases.txt, consists of the same name parts joined by
 56 | # underscores. Hence, the entry:
 57 | #
 58 | #   0629; TEH MARBUTA; R; TEH MARBUTA
 59 | #
 60 | # corresponds to [Joining_Group = Teh_Marbuta].
 61 | #
 62 | # Note: The property value now designated [Joining_Group = Teh_Marbuta_Goal]
 63 | #   used to apply to both of the following characters
 64 | #   in earlier versions of the standard:
 65 | #
 66 | #   U+06C2 ARABIC LETTER HEH GOAL WITH HAMZA ABOVE
 67 | #   U+06C3 ARABIC LETTER TEH MARBUTA GOAL
 68 | #
 69 | #   However, it currently applies only to U+06C3, and *not* to U+06C2.
 70 | #   To avoid destabilizing existing Joining_Group property aliases, the
 71 | #   prior Joining_Group value for U+06C3 (Hamza_On_Heh_Goal) has been
 72 | #   retained as a property value alias, despite the fact that it
 73 | #   no longer applies to its namesake character, U+06C2.
 74 | #   See PropertyValueAliases.txt.
 75 | #
 76 | # When other cursive scripts are added to the Unicode Standard in the
 77 | # future, the joining group value of all its letters will default to
 78 | # jg=No_Joining_Group in this data file. Other, more specific
 79 | # joining group values will be defined only if an explicit proposal
 80 | # to define those values exactly has been approved by the UTC. This
 81 | # is the convention exemplified by the N'Ko, Mandaic, Mongolian,
 82 | # Phags-pa, Psalter Pahlavi, Sogdian, Chorasmian, and Adlam scripts.
 83 | # Only the Arabic, Manichaean, and Syriac scripts currently have
 84 | # explicit joining group values defined for all characters, including
 85 | # those which have only a single character in a particular Joining_Group
 86 | # class. Hanifi Rohingya has explicit Joining_Group values assigned only for
 87 | # the few characters which share a particular Joining_Group class, but
 88 | # assigns jg=No_Joining_Group to all the singletons.
 89 | #
 90 | # Note: Code points that are not explicitly listed in this file are
 91 | # either of joining type T or U:
 92 | #
 93 | # - Those that are not explicitly listed and that are of General Category Mn, Me, or Cf
 94 | #   have joining type T.
 95 | # - All others not explicitly listed have joining type U.
 96 | #
 97 | # For an explicit listing of all characters of joining type T, see
 98 | # the derived property file DerivedJoiningType.txt.
 99 | #
100 | # #############################################################
101 | 
102 | # Unicode; Schematic Name; Joining Type; Joining Group
103 | 
104 | # Arabic Characters
105 | 
106 | 0600; ARABIC NUMBER SIGN; U; No_Joining_Group
107 | 0601; ARABIC SIGN SANAH; U; No_Joining_Group
108 | 0602; ARABIC FOOTNOTE MARKER; U; No_Joining_Group
109 | 0603; ARABIC SIGN SAFHA; U; No_Joining_Group
110 | 0604; ARABIC SIGN SAMVAT; U; No_Joining_Group
111 | 0605; ARABIC NUMBER MARK ABOVE; U; No_Joining_Group
112 | 0608; ARABIC RAY; U; No_Joining_Group
113 | 060B; AFGHANI SIGN; U; No_Joining_Group
114 | 0620; DOTLESS YEH WITH SEPARATE RING BELOW; D; YEH
115 | 0621; HAMZA; U; No_Joining_Group
116 | 0622; ALEF WITH MADDA ABOVE; R; ALEF
117 | 0623; ALEF WITH HAMZA ABOVE; R; ALEF
118 | 0624; WAW WITH HAMZA ABOVE; R; WAW
119 | 0625; ALEF WITH HAMZA BELOW; R; ALEF
120 | 0626; DOTLESS YEH WITH HAMZA ABOVE; D; YEH
121 | 0627; ALEF; R; ALEF
122 | 0628; BEH; D; BEH
123 | 0629; TEH MARBUTA; R; TEH MARBUTA
124 | 062A; DOTLESS BEH WITH 2 DOTS ABOVE; D; BEH
125 | 062B; DOTLESS BEH WITH 3 DOTS ABOVE; D; BEH
126 | 062C; HAH WITH DOT BELOW; D; HAH
127 | 062D; HAH; D; HAH
128 | 062E; HAH WITH DOT ABOVE; D; HAH
129 | 062F; DAL; R; DAL
130 | 0630; DAL WITH DOT ABOVE; R; DAL
131 | 0631; REH; R; REH
132 | 0632; REH WITH DOT ABOVE; R; REH
133 | 0633; SEEN; D; SEEN
134 | 0634; SEEN WITH 3 DOTS ABOVE; D; SEEN
135 | 0635; SAD; D; SAD
136 | 0636; SAD WITH DOT ABOVE; D; SAD
137 | 0637; TAH; D; TAH
138 | 0638; TAH WITH DOT ABOVE; D; TAH
139 | 0639; AIN; D; AIN
140 | 063A; AIN WITH DOT ABOVE; D; AIN
141 | 063B; KEHEH WITH 2 DOTS ABOVE; D; GAF
142 | 063C; KEHEH WITH 3 DOTS BELOW; D; GAF
143 | 063D; FARSI YEH WITH INVERTED V ABOVE; D; FARSI YEH
144 | 063E; FARSI YEH WITH 2 DOTS ABOVE; D; FARSI YEH
145 | 063F; FARSI YEH WITH 3 DOTS ABOVE; D; FARSI YEH
146 | 0640; TATWEEL; C; No_Joining_Group
147 | 0641; FEH; D; FEH
148 | 0642; QAF; D; QAF
149 | 0643; KAF; D; KAF
150 | 0644; LAM; D; LAM
151 | 0645; MEEM; D; MEEM
152 | 0646; NOON; D; NOON
153 | 0647; HEH; D; HEH
154 | 0648; WAW; R; WAW
155 | 0649; DOTLESS YEH; D; YEH
156 | 064A; YEH; D; YEH
157 | 066E; DOTLESS BEH; D; BEH
158 | 066F; DOTLESS QAF; D; QAF
159 | 0671; ALEF WITH WASLA ABOVE; R; ALEF
160 | 0672; ALEF WITH WAVY HAMZA ABOVE; R; ALEF
161 | 0673; ALEF WITH WAVY HAMZA BELOW; R; ALEF
162 | 0674; HIGH HAMZA; U; No_Joining_Group
163 | 0675; HIGH HAMZA ALEF; R; ALEF
164 | 0676; HIGH HAMZA WAW; R; WAW
165 | 0677; HIGH HAMZA WAW WITH DAMMA ABOVE; R; WAW
166 | 0678; HIGH HAMZA DOTLESS YEH; D; YEH
167 | 0679; DOTLESS BEH WITH TAH ABOVE; D; BEH
168 | 067A; DOTLESS BEH WITH VERTICAL 2 DOTS ABOVE; D; BEH
169 | 067B; DOTLESS BEH WITH VERTICAL 2 DOTS BELOW; D; BEH
170 | 067C; DOTLESS BEH WITH ATTACHED RING BELOW AND 2 DOTS ABOVE; D; BEH
171 | 067D; DOTLESS BEH WITH INVERTED 3 DOTS ABOVE; D; BEH
172 | 067E; DOTLESS BEH WITH 3 DOTS BELOW; D; BEH
173 | 067F; DOTLESS BEH WITH 4 DOTS ABOVE; D; BEH
174 | 0680; DOTLESS BEH WITH 4 DOTS BELOW; D; BEH
175 | 0681; HAH WITH HAMZA ABOVE; D; HAH
176 | 0682; HAH WITH VERTICAL 2 DOTS ABOVE; D; HAH
177 | 0683; HAH WITH 2 DOTS BELOW; D; HAH
178 | 0684; HAH WITH VERTICAL 2 DOTS BELOW; D; HAH
179 | 0685; HAH WITH 3 DOTS ABOVE; D; HAH
180 | 0686; HAH WITH 3 DOTS BELOW; D; HAH
181 | 0687; HAH WITH 4 DOTS BELOW; D; HAH
182 | 0688; DAL WITH TAH ABOVE; R; DAL
183 | 0689; DAL WITH ATTACHED RING BELOW; R; DAL
184 | 068A; DAL WITH DOT BELOW; R; DAL
185 | 068B; DAL WITH DOT BELOW AND TAH ABOVE; R; DAL
186 | 068C; DAL WITH 2 DOTS ABOVE; R; DAL
187 | 068D; DAL WITH 2 DOTS BELOW; R; DAL
188 | 068E; DAL WITH 3 DOTS ABOVE; R; DAL
189 | 068F; DAL WITH INVERTED 3 DOTS ABOVE; R; DAL
190 | 0690; DAL WITH 4 DOTS ABOVE; R; DAL
191 | 0691; REH WITH TAH ABOVE; R; REH
192 | 0692; REH WITH V ABOVE; R; REH
193 | 0693; REH WITH ATTACHED RING BELOW; R; REH
194 | 0694; REH WITH DOT BELOW; R; REH
195 | 0695; REH WITH V BELOW; R; REH
196 | 0696; REH WITH DOT BELOW AND DOT WITHIN; R; REH
197 | 0697; REH WITH 2 DOTS ABOVE; R; REH
198 | 0698; REH WITH 3 DOTS ABOVE; R; REH
199 | 0699; REH WITH 4 DOTS ABOVE; R; REH
200 | 069A; SEEN WITH DOT BELOW AND DOT ABOVE; D; SEEN
201 | 069B; SEEN WITH 3 DOTS BELOW; D; SEEN
202 | 069C; SEEN WITH 3 DOTS BELOW AND 3 DOTS ABOVE; D; SEEN
203 | 069D; SAD WITH 2 DOTS BELOW; D; SAD
204 | 069E; SAD WITH 3 DOTS ABOVE; D; SAD
205 | 069F; TAH WITH 3 DOTS ABOVE; D; TAH
206 | 06A0; AIN WITH 3 DOTS ABOVE; D; AIN
207 | 06A1; DOTLESS FEH; D; FEH
208 | 06A2; DOTLESS FEH WITH DOT BELOW; D; FEH
209 | 06A3; FEH WITH DOT BELOW; D; FEH
210 | 06A4; DOTLESS FEH WITH 3 DOTS ABOVE; D; FEH
211 | 06A5; DOTLESS FEH WITH 3 DOTS BELOW; D; FEH
212 | 06A6; DOTLESS FEH WITH 4 DOTS ABOVE; D; FEH
213 | 06A7; DOTLESS QAF WITH DOT ABOVE; D; QAF
214 | 06A8; DOTLESS QAF WITH 3 DOTS ABOVE; D; QAF
215 | 06A9; KEHEH; D; GAF
216 | 06AA; SWASH KAF; D; SWASH KAF
217 | 06AB; KEHEH WITH ATTACHED RING BELOW; D; GAF
218 | 06AC; KAF WITH DOT ABOVE; D; KAF
219 | 06AD; KAF WITH 3 DOTS ABOVE; D; KAF
220 | 06AE; KAF WITH 3 DOTS BELOW; D; KAF
221 | 06AF; GAF; D; GAF
222 | 06B0; GAF WITH ATTACHED RING BELOW; D; GAF
223 | 06B1; GAF WITH 2 DOTS ABOVE; D; GAF
224 | 06B2; GAF WITH 2 DOTS BELOW; D; GAF
225 | 06B3; GAF WITH VERTICAL 2 DOTS BELOW; D; GAF
226 | 06B4; GAF WITH 3 DOTS ABOVE; D; GAF
227 | 06B5; LAM WITH V ABOVE; D; LAM
228 | 06B6; LAM WITH DOT ABOVE; D; LAM
229 | 06B7; LAM WITH 3 DOTS ABOVE; D; LAM
230 | 06B8; LAM WITH 3 DOTS BELOW; D; LAM
231 | 06B9; NOON WITH DOT BELOW; D; NOON
232 | 06BA; DOTLESS NOON; D; NOON
233 | 06BB; DOTLESS NOON WITH TAH ABOVE; D; NOON
234 | 06BC; NOON WITH ATTACHED RING BELOW; D; NOON
235 | 06BD; NYA; D; NYA
236 | 06BE; KNOTTED HEH; D; KNOTTED HEH
237 | 06BF; HAH WITH 3 DOTS BELOW AND DOT ABOVE; D; HAH
238 | 06C0; DOTLESS TEH MARBUTA WITH HAMZA ABOVE; R; TEH MARBUTA
239 | 06C1; HEH GOAL; D; HEH GOAL
240 | 06C2; HEH GOAL WITH HAMZA ABOVE; D; HEH GOAL
241 | 06C3; TEH MARBUTA GOAL; R; TEH MARBUTA GOAL
242 | 06C4; WAW WITH ATTACHED RING WITHIN; R; WAW
243 | 06C5; WAW WITH BAR; R; WAW
244 | 06C6; WAW WITH V ABOVE; R; WAW
245 | 06C7; WAW WITH DAMMA ABOVE; R; WAW
246 | 06C8; WAW WITH ALEF ABOVE; R; WAW
247 | 06C9; WAW WITH INVERTED V ABOVE; R; WAW
248 | 06CA; WAW WITH 2 DOTS ABOVE; R; WAW
249 | 06CB; WAW WITH 3 DOTS ABOVE; R; WAW
250 | 06CC; FARSI YEH; D; FARSI YEH
251 | 06CD; YEH WITH TAIL; R; YEH WITH TAIL
252 | 06CE; FARSI YEH WITH V ABOVE; D; FARSI YEH
253 | 06CF; WAW WITH DOT ABOVE; R; WAW
254 | 06D0; DOTLESS YEH WITH VERTICAL 2 DOTS BELOW; D; YEH
255 | 06D1; DOTLESS YEH WITH 3 DOTS BELOW; D; YEH
256 | 06D2; YEH BARREE; R; YEH BARREE
257 | 06D3; YEH BARREE WITH HAMZA ABOVE; R; YEH BARREE
258 | 06D5; DOTLESS TEH MARBUTA; R; TEH MARBUTA
259 | 06DD; ARABIC END OF AYAH; U; No_Joining_Group
260 | 06EE; DAL WITH INVERTED V ABOVE; R; DAL
261 | 06EF; REH WITH INVERTED V ABOVE; R; REH
262 | 06FA; SEEN WITH DOT BELOW AND 3 DOTS ABOVE; D; SEEN
263 | 06FB; SAD WITH DOT BELOW AND DOT ABOVE; D; SAD
264 | 06FC; AIN WITH DOT BELOW AND DOT ABOVE; D; AIN
265 | 06FF; KNOTTED HEH WITH INVERTED V ABOVE; D; KNOTTED HEH
266 | 
267 | # Syriac Characters
268 | 
269 | 070F; SYRIAC ABBREVIATION MARK; T; No_Joining_Group
270 | 0710; ALAPH; R; ALAPH
271 | 0712; BETH; D; BETH
272 | 0713; GAMAL; D; GAMAL
273 | 0714; GAMAL GARSHUNI; D; GAMAL
274 | 0715; DALATH; R; DALATH RISH
275 | 0716; DOTLESS DALATH RISH; R; DALATH RISH
276 | 0717; HE; R; HE
277 | 0718; WAW; R; SYRIAC WAW
278 | 0719; ZAIN; R; ZAIN
279 | 071A; HETH; D; HETH
280 | 071B; TETH; D; TETH
281 | 071C; TETH GARSHUNI; D; TETH
282 | 071D; YUDH; D; YUDH
283 | 071E; YUDH HE; R; YUDH HE
284 | 071F; KAPH; D; KAPH
285 | 0720; LAMADH; D; LAMADH
286 | 0721; MIM; D; MIM
287 | 0722; NUN; D; NUN
288 | 0723; SEMKATH; D; SEMKATH
289 | 0724; FINAL SEMKATH; D; FINAL SEMKATH
290 | 0725; E; D; E
291 | 0726; PE; D; PE
292 | 0727; REVERSED PE; D; REVERSED PE
293 | 0728; SADHE; R; SADHE
294 | 0729; QAPH; D; QAPH
295 | 072A; RISH; R; DALATH RISH
296 | 072B; SHIN; D; SHIN
297 | 072C; TAW; R; TAW
298 | 072D; PERSIAN BHETH; D; BETH
299 | 072E; PERSIAN GHAMAL; D; GAMAL
300 | 072F; PERSIAN DHALATH; R; DALATH RISH
301 | 074D; SOGDIAN ZHAIN; R; ZHAIN
302 | 074E; SOGDIAN KHAPH; D; KHAPH
303 | 074F; SOGDIAN FE; D; FE
304 | 
305 | # Arabic Supplement Characters
306 | 
307 | 0750; DOTLESS BEH WITH HORIZONTAL 3 DOTS BELOW; D; BEH
308 | 0751; BEH WITH 3 DOTS ABOVE; D; BEH
309 | 0752; DOTLESS BEH WITH INVERTED 3 DOTS BELOW; D; BEH
310 | 0753; DOTLESS BEH WITH INVERTED 3 DOTS BELOW AND 2 DOTS ABOVE; D; BEH
311 | 0754; DOTLESS BEH WITH 2 DOTS BELOW AND DOT ABOVE; D; BEH
312 | 0755; DOTLESS BEH WITH INVERTED V BELOW; D; BEH
313 | 0756; DOTLESS BEH WITH V ABOVE; D; BEH
314 | 0757; HAH WITH 2 DOTS ABOVE; D; HAH
315 | 0758; HAH WITH INVERTED 3 DOTS BELOW; D; HAH
316 | 0759; DAL WITH VERTICAL 2 DOTS BELOW AND TAH ABOVE; R; DAL
317 | 075A; DAL WITH INVERTED V BELOW; R; DAL
318 | 075B; REH WITH BAR; R; REH
319 | 075C; SEEN WITH 4 DOTS ABOVE; D; SEEN
320 | 075D; AIN WITH 2 DOTS ABOVE; D; AIN
321 | 075E; AIN WITH INVERTED 3 DOTS ABOVE; D; AIN
322 | 075F; AIN WITH VERTICAL 2 DOTS ABOVE; D; AIN
323 | 0760; DOTLESS FEH WITH 2 DOTS BELOW; D; FEH
324 | 0761; DOTLESS FEH WITH INVERTED 3 DOTS BELOW; D; FEH
325 | 0762; KEHEH WITH DOT ABOVE; D; GAF
326 | 0763; KEHEH WITH 3 DOTS ABOVE; D; GAF
327 | 0764; KEHEH WITH INVERTED 3 DOTS BELOW; D; GAF
328 | 0765; MEEM WITH DOT ABOVE; D; MEEM
329 | 0766; MEEM WITH DOT BELOW; D; MEEM
330 | 0767; NOON WITH 2 DOTS BELOW; D; NOON
331 | 0768; NOON WITH TAH ABOVE; D; NOON
332 | 0769; NOON WITH V ABOVE; D; NOON
333 | 076A; LAM WITH BAR; D; LAM
334 | 076B; REH WITH VERTICAL 2 DOTS ABOVE; R; REH
335 | 076C; REH WITH HAMZA ABOVE; R; REH
336 | 076D; SEEN WITH VERTICAL 2 DOTS ABOVE; D; SEEN
337 | 076E; HAH WITH TAH BELOW; D; HAH
338 | 076F; HAH WITH TAH AND 2 DOTS BELOW; D; HAH
339 | 0770; SEEN WITH 2 DOTS AND TAH ABOVE; D; SEEN
340 | 0771; REH WITH 2 DOTS AND TAH ABOVE; R; REH
341 | 0772; HAH WITH TAH ABOVE; D; HAH
342 | 0773; ALEF WITH DIGIT TWO ABOVE; R; ALEF
343 | 0774; ALEF WITH DIGIT THREE ABOVE; R; ALEF
344 | 0775; FARSI YEH WITH DIGIT TWO ABOVE; D; FARSI YEH
345 | 0776; FARSI YEH WITH DIGIT THREE ABOVE; D; FARSI YEH
346 | 0777; DOTLESS YEH WITH DIGIT FOUR BELOW; D; YEH
347 | 0778; WAW WITH DIGIT TWO ABOVE; R; WAW
348 | 0779; WAW WITH DIGIT THREE ABOVE; R; WAW
349 | 077A; BURUSHASKI YEH BARREE WITH DIGIT TWO ABOVE; D; BURUSHASKI YEH BARREE
350 | 077B; BURUSHASKI YEH BARREE WITH DIGIT THREE ABOVE; D; BURUSHASKI YEH BARREE
351 | 077C; HAH WITH DIGIT FOUR BELOW; D; HAH
352 | 077D; SEEN WITH DIGIT FOUR ABOVE; D; SEEN
353 | 077E; SEEN WITH INVERTED V ABOVE; D; SEEN
354 | 077F; KAF WITH 2 DOTS ABOVE; D; KAF
355 | 
356 | # N'Ko Characters
357 | 
358 | 07CA; NKO A; D; No_Joining_Group
359 | 07CB; NKO EE; D; No_Joining_Group
360 | 07CC; NKO I; D; No_Joining_Group
361 | 07CD; NKO E; D; No_Joining_Group
362 | 07CE; NKO U; D; No_Joining_Group
363 | 07CF; NKO OO; D; No_Joining_Group
364 | 07D0; NKO O; D; No_Joining_Group
365 | 07D1; NKO DAGBASINNA; D; No_Joining_Group
366 | 07D2; NKO N; D; No_Joining_Group
367 | 07D3; NKO BA; D; No_Joining_Group
368 | 07D4; NKO PA; D; No_Joining_Group
369 | 07D5; NKO TA; D; No_Joining_Group
370 | 07D6; NKO JA; D; No_Joining_Group
371 | 07D7; NKO CHA; D; No_Joining_Group
372 | 07D8; NKO DA; D; No_Joining_Group
373 | 07D9; NKO RA; D; No_Joining_Group
374 | 07DA; NKO RRA; D; No_Joining_Group
375 | 07DB; NKO SA; D; No_Joining_Group
376 | 07DC; NKO GBA; D; No_Joining_Group
377 | 07DD; NKO FA; D; No_Joining_Group
378 | 07DE; NKO KA; D; No_Joining_Group
379 | 07DF; NKO LA; D; No_Joining_Group
380 | 07E0; NKO NA WOLOSO; D; No_Joining_Group
381 | 07E1; NKO MA; D; No_Joining_Group
382 | 07E2; NKO NYA; D; No_Joining_Group
383 | 07E3; NKO NA; D; No_Joining_Group
384 | 07E4; NKO HA; D; No_Joining_Group
385 | 07E5; NKO WA; D; No_Joining_Group
386 | 07E6; NKO YA; D; No_Joining_Group
387 | 07E7; NKO NYA WOLOSO; D; No_Joining_Group
388 | 07E8; NKO JONA JA; D; No_Joining_Group
389 | 07E9; NKO JONA CHA; D; No_Joining_Group
390 | 07EA; NKO JONA RA; D; No_Joining_Group
391 | 07FA; NKO LAJANYALAN; C; No_Joining_Group
392 | 
393 | # Mandaic Characters
394 | 
395 | 0840; MANDAIC HALQA; R; No_Joining_Group
396 | 0841; MANDAIC AB; D; No_Joining_Group
397 | 0842; MANDAIC AG; D; No_Joining_Group
398 | 0843; MANDAIC AD; D; No_Joining_Group
399 | 0844; MANDAIC AH; D; No_Joining_Group
400 | 0845; MANDAIC USHENNA; D; No_Joining_Group
401 | 0846; MANDAIC AZ; R; No_Joining_Group
402 | 0847; MANDAIC IT; R; No_Joining_Group
403 | 0848; MANDAIC ATT; D; No_Joining_Group
404 | 0849; MANDAIC AKSA; R; No_Joining_Group
405 | 084A; MANDAIC AK; D; No_Joining_Group
406 | 084B; MANDAIC AL; D; No_Joining_Group
407 | 084C; MANDAIC AM; D; No_Joining_Group
408 | 084D; MANDAIC AN; D; No_Joining_Group
409 | 084E; MANDAIC AS; D; No_Joining_Group
410 | 084F; MANDAIC IN; D; No_Joining_Group
411 | 0850; MANDAIC AP; D; No_Joining_Group
412 | 0851; MANDAIC ASZ; D; No_Joining_Group
413 | 0852; MANDAIC AQ; D; No_Joining_Group
414 | 0853; MANDAIC AR; D; No_Joining_Group
415 | 0854; MANDAIC ASH; R; No_Joining_Group
416 | 0855; MANDAIC AT; D; No_Joining_Group
417 | 0856; MANDAIC DUSHENNA; R; No_Joining_Group
418 | 0857; MANDAIC KAD; R; No_Joining_Group
419 | 0858; MANDAIC AIN; R; No_Joining_Group
420 | 
421 | # Syriac Supplement Characters
422 | 
423 | 0860; MALAYALAM NGA; D; MALAYALAM NGA
424 | 0861; MALAYALAM JA; U; MALAYALAM JA
425 | 0862; MALAYALAM NYA; D; MALAYALAM NYA
426 | 0863; MALAYALAM TTA; D; MALAYALAM TTA
427 | 0864; MALAYALAM NNA; D; MALAYALAM NNA
428 | 0865; MALAYALAM NNNA; D; MALAYALAM NNNA
429 | 0866; MALAYALAM BHA; U; MALAYALAM BHA
430 | 0867; MALAYALAM RA; R; MALAYALAM RA
431 | 0868; MALAYALAM LLA; D; MALAYALAM LLA
432 | 0869; MALAYALAM LLLA; R; MALAYALAM LLLA
433 | 086A; MALAYALAM SSA; R; MALAYALAM SSA
434 | 
435 | # Arabic Extended-A Characters
436 | 
437 | 08A0; DOTLESS BEH WITH V BELOW; D; BEH
438 | 08A1; BEH WITH HAMZA ABOVE; D; BEH
439 | 08A2; HAH WITH DOT BELOW AND 2 DOTS ABOVE; D; HAH
440 | 08A3; TAH WITH 2 DOTS ABOVE; D; TAH
441 | 08A4; DOTLESS FEH WITH DOT BELOW AND 3 DOTS ABOVE; D; FEH
442 | 08A5; QAF WITH DOT BELOW; D; QAF
443 | 08A6; LAM WITH DOUBLE BAR; D; LAM
444 | 08A7; MEEM WITH 3 DOTS ABOVE; D; MEEM
445 | 08A8; YEH WITH HAMZA ABOVE; D; YEH
446 | 08A9; YEH WITH DOT ABOVE; D; YEH
447 | 08AA; REH WITH LOOP; R; REH
448 | 08AB; WAW WITH DOT WITHIN; R; WAW
449 | 08AC; ROHINGYA YEH; R; ROHINGYA YEH
450 | 08AD; LOW ALEF; U; No_Joining_Group
451 | 08AE; DAL WITH 3 DOTS BELOW; R; DAL
452 | 08AF; SAD WITH 3 DOTS BELOW; D; SAD
453 | 08B0; KEHEH WITH STROKE BELOW; D; GAF
454 | 08B1; STRAIGHT WAW; R; STRAIGHT WAW
455 | 08B2; REH WITH DOT AND INVERTED V ABOVE; R; REH
456 | 08B3; AIN WITH 3 DOTS BELOW; D; AIN
457 | 08B4; KAF WITH DOT BELOW; D; KAF
458 | 08B6; BEH WITH MEEM ABOVE; D; BEH
459 | 08B7; DOTLESS BEH WITH 3 DOTS BELOW AND MEEM ABOVE; D; BEH
460 | 08B8; DOTLESS BEH WITH TEH ABOVE; D; BEH
461 | 08B9; REH WITH NOON ABOVE; R; REH
462 | 08BA; YEH WITH NOON ABOVE; D; YEH
463 | 08BB; AFRICAN FEH; D; AFRICAN FEH
464 | 08BC; AFRICAN QAF; D; AFRICAN QAF
465 | 08BD; AFRICAN NOON; D; AFRICAN NOON
466 | 08BE; DOTLESS BEH WITH 3 DOTS BELOW AND V ABOVE; D; BEH
467 | 08BF; DOTLESS BEH WITH 2 DOTS AND V ABOVE; D; BEH
468 | 08C0; DOTLESS BEH WITH TAH AND V ABOVE; D; BEH
469 | 08C1; HAH WITH 3 DOTS BELOW AND V ABOVE; D; HAH
470 | 08C2; KEHEH WITH V ABOVE; D; GAF
471 | 08C3; AIN WITH DIAMOND 4 DOTS ABOVE; D; AIN
472 | 08C4; AFRICAN QAF WITH 3 DOTS ABOVE; D; AFRICAN QAF
473 | 08C5; HAH WITH DOT BELOW AND 3 DOTS ABOVE; D; HAH
474 | 08C6; HAH WITH DIAMOND 4 DOTS BELOW; D; HAH
475 | 08C7; LAM WITH TAH ABOVE; D; LAM
476 | 08E2; ARABIC DISPUTED END OF AYAH; U; No_Joining_Group
477 | 
478 | # Mongolian Characters
479 | 
480 | 1806; MONGOLIAN TODO SOFT HYPHEN; U; No_Joining_Group
481 | 1807; MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER; D; No_Joining_Group
482 | 180A; MONGOLIAN NIRUGU; C; No_Joining_Group
483 | 180E; MONGOLIAN VOWEL SEPARATOR; U; No_Joining_Group
484 | 1820; MONGOLIAN A; D; No_Joining_Group
485 | 1821; MONGOLIAN E; D; No_Joining_Group
486 | 1822; MONGOLIAN I; D; No_Joining_Group
487 | 1823; MONGOLIAN O; D; No_Joining_Group
488 | 1824; MONGOLIAN U; D; No_Joining_Group
489 | 1825; MONGOLIAN OE; D; No_Joining_Group
490 | 1826; MONGOLIAN UE; D; No_Joining_Group
491 | 1827; MONGOLIAN EE; D; No_Joining_Group
492 | 1828; MONGOLIAN NA; D; No_Joining_Group
493 | 1829; MONGOLIAN ANG; D; No_Joining_Group
494 | 182A; MONGOLIAN BA; D; No_Joining_Group
495 | 182B; MONGOLIAN PA; D; No_Joining_Group
496 | 182C; MONGOLIAN QA; D; No_Joining_Group
497 | 182D; MONGOLIAN GA; D; No_Joining_Group
498 | 182E; MONGOLIAN MA; D; No_Joining_Group
499 | 182F; MONGOLIAN LA; D; No_Joining_Group
500 | 1830; MONGOLIAN SA; D; No_Joining_Group
501 | 1831; MONGOLIAN SHA; D; No_Joining_Group
502 | 1832; MONGOLIAN TA; D; No_Joining_Group
503 | 1833; MONGOLIAN DA; D; No_Joining_Group
504 | 1834; MONGOLIAN CHA; D; No_Joining_Group
505 | 1835; MONGOLIAN JA; D; No_Joining_Group
506 | 1836; MONGOLIAN YA; D; No_Joining_Group
507 | 1837; MONGOLIAN RA; D; No_Joining_Group
508 | 1838; MONGOLIAN WA; D; No_Joining_Group
509 | 1839; MONGOLIAN FA; D; No_Joining_Group
510 | 183A; MONGOLIAN KA; D; No_Joining_Group
511 | 183B; MONGOLIAN KHA; D; No_Joining_Group
512 | 183C; MONGOLIAN TSA; D; No_Joining_Group
513 | 183D; MONGOLIAN ZA; D; No_Joining_Group
514 | 183E; MONGOLIAN HAA; D; No_Joining_Group
515 | 183F; MONGOLIAN ZRA; D; No_Joining_Group
516 | 1840; MONGOLIAN LHA; D; No_Joining_Group
517 | 1841; MONGOLIAN ZHI; D; No_Joining_Group
518 | 1842; MONGOLIAN CHI; D; No_Joining_Group
519 | 1843; MONGOLIAN TODO LONG VOWEL SIGN; D; No_Joining_Group
520 | 1844; MONGOLIAN TODO E; D; No_Joining_Group
521 | 1845; MONGOLIAN TODO I; D; No_Joining_Group
522 | 1846; MONGOLIAN TODO O; D; No_Joining_Group
523 | 1847; MONGOLIAN TODO U; D; No_Joining_Group
524 | 1848; MONGOLIAN TODO OE; D; No_Joining_Group
525 | 1849; MONGOLIAN TODO UE; D; No_Joining_Group
526 | 184A; MONGOLIAN TODO ANG; D; No_Joining_Group
527 | 184B; MONGOLIAN TODO BA; D; No_Joining_Group
528 | 184C; MONGOLIAN TODO PA; D; No_Joining_Group
529 | 184D; MONGOLIAN TODO QA; D; No_Joining_Group
530 | 184E; MONGOLIAN TODO GA; D; No_Joining_Group
531 | 184F; MONGOLIAN TODO MA; D; No_Joining_Group
532 | 1850; MONGOLIAN TODO TA; D; No_Joining_Group
533 | 1851; MONGOLIAN TODO DA; D; No_Joining_Group
534 | 1852; MONGOLIAN TODO CHA; D; No_Joining_Group
535 | 1853; MONGOLIAN TODO JA; D; No_Joining_Group
536 | 1854; MONGOLIAN TODO TSA; D; No_Joining_Group
537 | 1855; MONGOLIAN TODO YA; D; No_Joining_Group
538 | 1856; MONGOLIAN TODO WA; D; No_Joining_Group
539 | 1857; MONGOLIAN TODO KA; D; No_Joining_Group
540 | 1858; MONGOLIAN TODO GAA; D; No_Joining_Group
541 | 1859; MONGOLIAN TODO HAA; D; No_Joining_Group
542 | 185A; MONGOLIAN TODO JIA; D; No_Joining_Group
543 | 185B; MONGOLIAN TODO NIA; D; No_Joining_Group
544 | 185C; MONGOLIAN TODO DZA; D; No_Joining_Group
545 | 185D; MONGOLIAN SIBE E; D; No_Joining_Group
546 | 185E; MONGOLIAN SIBE I; D; No_Joining_Group
547 | 185F; MONGOLIAN SIBE IY; D; No_Joining_Group
548 | 1860; MONGOLIAN SIBE UE; D; No_Joining_Group
549 | 1861; MONGOLIAN SIBE U; D; No_Joining_Group
550 | 1862; MONGOLIAN SIBE ANG; D; No_Joining_Group
551 | 1863; MONGOLIAN SIBE KA; D; No_Joining_Group
552 | 1864; MONGOLIAN SIBE GA; D; No_Joining_Group
553 | 1865; MONGOLIAN SIBE HA; D; No_Joining_Group
554 | 1866; MONGOLIAN SIBE PA; D; No_Joining_Group
555 | 1867; MONGOLIAN SIBE SHA; D; No_Joining_Group
556 | 1868; MONGOLIAN SIBE TA; D; No_Joining_Group
557 | 1869; MONGOLIAN SIBE DA; D; No_Joining_Group
558 | 186A; MONGOLIAN SIBE JA; D; No_Joining_Group
559 | 186B; MONGOLIAN SIBE FA; D; No_Joining_Group
560 | 186C; MONGOLIAN SIBE GAA; D; No_Joining_Group
561 | 186D; MONGOLIAN SIBE HAA; D; No_Joining_Group
562 | 186E; MONGOLIAN SIBE TSA; D; No_Joining_Group
563 | 186F; MONGOLIAN SIBE ZA; D; No_Joining_Group
564 | 1870; MONGOLIAN SIBE RAA; D; No_Joining_Group
565 | 1871; MONGOLIAN SIBE CHA; D; No_Joining_Group
566 | 1872; MONGOLIAN SIBE ZHA; D; No_Joining_Group
567 | 1873; MONGOLIAN MANCHU I; D; No_Joining_Group
568 | 1874; MONGOLIAN MANCHU KA; D; No_Joining_Group
569 | 1875; MONGOLIAN MANCHU RA; D; No_Joining_Group
570 | 1876; MONGOLIAN MANCHU FA; D; No_Joining_Group
571 | 1877; MONGOLIAN MANCHU ZHA; D; No_Joining_Group
572 | 1878; MONGOLIAN MANCHU CHA WITH 2 DOTS; D; No_Joining_Group
573 | 1880; MONGOLIAN ALI GALI ANUSVARA ONE; U; No_Joining_Group
574 | 1881; MONGOLIAN ALI GALI VISARGA ONE; U; No_Joining_Group
575 | 1882; MONGOLIAN ALI GALI DAMARU; U; No_Joining_Group
576 | 1883; MONGOLIAN ALI GALI UBADAMA; U; No_Joining_Group
577 | 1884; MONGOLIAN ALI GALI INVERTED UBADAMA; U; No_Joining_Group
578 | 1885; MONGOLIAN ALI GALI BALUDA; T; No_Joining_Group
579 | 1886; MONGOLIAN ALI GALI THREE BALUDA; T; No_Joining_Group
580 | 1887; MONGOLIAN ALI GALI A; D; No_Joining_Group
581 | 1888; MONGOLIAN ALI GALI I; D; No_Joining_Group
582 | 1889; MONGOLIAN ALI GALI KA; D; No_Joining_Group
583 | 188A; MONGOLIAN ALI GALI NGA; D; No_Joining_Group
584 | 188B; MONGOLIAN ALI GALI CA; D; No_Joining_Group
585 | 188C; MONGOLIAN ALI GALI TTA; D; No_Joining_Group
586 | 188D; MONGOLIAN ALI GALI TTHA; D; No_Joining_Group
587 | 188E; MONGOLIAN ALI GALI DDA; D; No_Joining_Group
588 | 188F; MONGOLIAN ALI GALI NNA; D; No_Joining_Group
589 | 1890; MONGOLIAN ALI GALI TA; D; No_Joining_Group
590 | 1891; MONGOLIAN ALI GALI DA; D; No_Joining_Group
591 | 1892; MONGOLIAN ALI GALI PA; D; No_Joining_Group
592 | 1893; MONGOLIAN ALI GALI PHA; D; No_Joining_Group
593 | 1894; MONGOLIAN ALI GALI SSA; D; No_Joining_Group
594 | 1895; MONGOLIAN ALI GALI ZHA; D; No_Joining_Group
595 | 1896; MONGOLIAN ALI GALI ZA; D; No_Joining_Group
596 | 1897; MONGOLIAN ALI GALI AH; D; No_Joining_Group
597 | 1898; MONGOLIAN TODO ALI GALI TA; D; No_Joining_Group
598 | 1899; MONGOLIAN TODO ALI GALI ZHA; D; No_Joining_Group
599 | 189A; MONGOLIAN MANCHU ALI GALI GHA; D; No_Joining_Group
600 | 189B; MONGOLIAN MANCHU ALI GALI NGA; D; No_Joining_Group
601 | 189C; MONGOLIAN MANCHU ALI GALI CA; D; No_Joining_Group
602 | 189D; MONGOLIAN MANCHU ALI GALI JHA; D; No_Joining_Group
603 | 189E; MONGOLIAN MANCHU ALI GALI TTA; D; No_Joining_Group
604 | 189F; MONGOLIAN MANCHU ALI GALI DDHA; D; No_Joining_Group
605 | 18A0; MONGOLIAN MANCHU ALI GALI TA; D; No_Joining_Group
606 | 18A1; MONGOLIAN MANCHU ALI GALI DHA; D; No_Joining_Group
607 | 18A2; MONGOLIAN MANCHU ALI GALI SSA; D; No_Joining_Group
608 | 18A3; MONGOLIAN MANCHU ALI GALI CYA; D; No_Joining_Group
609 | 18A4; MONGOLIAN MANCHU ALI GALI ZHA; D; No_Joining_Group
610 | 18A5; MONGOLIAN MANCHU ALI GALI ZA; D; No_Joining_Group
611 | 18A6; MONGOLIAN ALI GALI HALF U; D; No_Joining_Group
612 | 18A7; MONGOLIAN ALI GALI HALF YA; D; No_Joining_Group
613 | 18A8; MONGOLIAN MANCHU ALI GALI BHA; D; No_Joining_Group
614 | 18AA; MONGOLIAN MANCHU ALI GALI LHA; D; No_Joining_Group
615 | 
616 | # Other
617 | 
618 | 200C; ZERO WIDTH NON-JOINER; U; No_Joining_Group
619 | 200D; ZERO WIDTH JOINER; C; No_Joining_Group
620 | 202F; NARROW NO-BREAK SPACE; U; No_Joining_Group
621 | 2066; LEFT-TO-RIGHT ISOLATE; U; No_Joining_Group
622 | 2067; RIGHT-TO-LEFT ISOLATE; U; No_Joining_Group
623 | 2068; FIRST STRONG ISOLATE; U; No_Joining_Group
624 | 2069; POP DIRECTIONAL ISOLATE; U; No_Joining_Group
625 | 
626 | # Phags-Pa Characters
627 | 
628 | A840; PHAGS-PA KA; D; No_Joining_Group
629 | A841; PHAGS-PA KHA; D; No_Joining_Group
630 | A842; PHAGS-PA GA; D; No_Joining_Group
631 | A843; PHAGS-PA NGA; D; No_Joining_Group
632 | A844; PHAGS-PA CA; D; No_Joining_Group
633 | A845; PHAGS-PA CHA; D; No_Joining_Group
634 | A846; PHAGS-PA JA; D; No_Joining_Group
635 | A847; PHAGS-PA NYA; D; No_Joining_Group
636 | A848; PHAGS-PA TA; D; No_Joining_Group
637 | A849; PHAGS-PA THA; D; No_Joining_Group
638 | A84A; PHAGS-PA DA; D; No_Joining_Group
639 | A84B; PHAGS-PA NA; D; No_Joining_Group
640 | A84C; PHAGS-PA PA; D; No_Joining_Group
641 | A84D; PHAGS-PA PHA; D; No_Joining_Group
642 | A84E; PHAGS-PA BA; D; No_Joining_Group
643 | A84F; PHAGS-PA MA; D; No_Joining_Group
644 | A850; PHAGS-PA TSA; D; No_Joining_Group
645 | A851; PHAGS-PA TSHA; D; No_Joining_Group
646 | A852; PHAGS-PA DZA; D; No_Joining_Group
647 | A853; PHAGS-PA WA; D; No_Joining_Group
648 | A854; PHAGS-PA ZHA; D; No_Joining_Group
649 | A855; PHAGS-PA ZA; D; No_Joining_Group
650 | A856; PHAGS-PA SMALL A; D; No_Joining_Group
651 | A857; PHAGS-PA YA; D; No_Joining_Group
652 | A858; PHAGS-PA RA; D; No_Joining_Group
653 | A859; PHAGS-PA LA; D; No_Joining_Group
654 | A85A; PHAGS-PA SHA; D; No_Joining_Group
655 | A85B; PHAGS-PA SA; D; No_Joining_Group
656 | A85C; PHAGS-PA HA; D; No_Joining_Group
657 | A85D; PHAGS-PA A; D; No_Joining_Group
658 | A85E; PHAGS-PA I; D; No_Joining_Group
659 | A85F; PHAGS-PA U; D; No_Joining_Group
660 | A860; PHAGS-PA E; D; No_Joining_Group
661 | A861; PHAGS-PA O; D; No_Joining_Group
662 | A862; PHAGS-PA QA; D; No_Joining_Group
663 | A863; PHAGS-PA XA; D; No_Joining_Group
664 | A864; PHAGS-PA FA; D; No_Joining_Group
665 | A865; PHAGS-PA GGA; D; No_Joining_Group
666 | A866; PHAGS-PA EE; D; No_Joining_Group
667 | A867; PHAGS-PA SUBJOINED WA; D; No_Joining_Group
668 | A868; PHAGS-PA SUBJOINED YA; D; No_Joining_Group
669 | A869; PHAGS-PA TTA; D; No_Joining_Group
670 | A86A; PHAGS-PA TTHA; D; No_Joining_Group
671 | A86B; PHAGS-PA DDA; D; No_Joining_Group
672 | A86C; PHAGS-PA NNA; D; No_Joining_Group
673 | A86D; PHAGS-PA ALTERNATE YA; D; No_Joining_Group
674 | A86E; PHAGS-PA VOICELESS SHA; D; No_Joining_Group
675 | A86F; PHAGS-PA VOICED HA; D; No_Joining_Group
676 | A870; PHAGS-PA ASPIRATED FA; D; No_Joining_Group
677 | A871; PHAGS-PA SUBJOINED RA; D; No_Joining_Group
678 | A872; PHAGS-PA SUPERFIXED RA; L; No_Joining_Group
679 | A873; PHAGS-PA CANDRABINDU; U; No_Joining_Group
680 | 
681 | # Manichaean Characters
682 | 
683 | 10AC0; MANICHAEAN ALEPH; D; MANICHAEAN ALEPH
684 | 10AC1; MANICHAEAN BETH; D; MANICHAEAN BETH
685 | 10AC2; MANICHAEAN BETH WITH 2 DOTS ABOVE; D; MANICHAEAN BETH
686 | 10AC3; MANICHAEAN GIMEL; D; MANICHAEAN GIMEL
687 | 10AC4; MANICHAEAN GIMEL WITH ATTACHED RING BELOW; D; MANICHAEAN GIMEL
688 | 10AC5; MANICHAEAN DALETH; R; MANICHAEAN DALETH
689 | 10AC6; MANICHAEAN HE; U; No_Joining_Group
690 | 10AC7; MANICHAEAN WAW; R; MANICHAEAN WAW
691 | 10AC8; MANICHAEAN UD; U; No_Joining_Group
692 | 10AC9; MANICHAEAN ZAYIN; R; MANICHAEAN ZAYIN
693 | 10ACA; MANICHAEAN ZAYIN WITH 2 DOTS ABOVE; R; MANICHAEAN ZAYIN
694 | 10ACB; MANICHAEAN JAYIN; U; No_Joining_Group
695 | 10ACC; MANICHAEAN JAYIN WITH 2 DOTS ABOVE; U; No_Joining_Group
696 | 10ACD; MANICHAEAN HETH; L; MANICHAEAN HETH
697 | 10ACE; MANICHAEAN TETH; R; MANICHAEAN TETH
698 | 10ACF; MANICHAEAN YODH; R; MANICHAEAN YODH
699 | 10AD0; MANICHAEAN KAPH; R; MANICHAEAN KAPH
700 | 10AD1; MANICHAEAN KAPH WITH DOT ABOVE; R; MANICHAEAN KAPH
701 | 10AD2; MANICHAEAN KAPH WITH 2 DOTS ABOVE; R; MANICHAEAN KAPH
702 | 10AD3; MANICHAEAN LAMEDH; D; MANICHAEAN LAMEDH
703 | 10AD4; MANICHAEAN DHAMEDH; D; MANICHAEAN DHAMEDH
704 | 10AD5; MANICHAEAN THAMEDH; D; MANICHAEAN THAMEDH
705 | 10AD6; MANICHAEAN MEM; D; MANICHAEAN MEM
706 | 10AD7; MANICHAEAN NUN; L; MANICHAEAN NUN
707 | 10AD8; MANICHAEAN SAMEKH; D; MANICHAEAN SAMEKH
708 | 10AD9; MANICHAEAN AYIN; D; MANICHAEAN AYIN
709 | 10ADA; MANICHAEAN AYIN WITH 2 DOTS ABOVE; D; MANICHAEAN AYIN
710 | 10ADB; MANICHAEAN PE; D; MANICHAEAN PE
711 | 10ADC; MANICHAEAN PE WITH DOT ABOVE; D; MANICHAEAN PE
712 | 10ADD; MANICHAEAN SADHE; R; MANICHAEAN SADHE
713 | 10ADE; MANICHAEAN QOPH; D; MANICHAEAN QOPH
714 | 10ADF; MANICHAEAN QOPH WITH DOT ABOVE; D; MANICHAEAN QOPH
715 | 10AE0; MANICHAEAN QOPH WITH 2 DOTS ABOVE; D; MANICHAEAN QOPH
716 | 10AE1; MANICHAEAN RESH; R; MANICHAEAN RESH
717 | 10AE2; MANICHAEAN SHIN; U; No_Joining_Group
718 | 10AE3; MANICHAEAN SHIN WITH 2 DOTS ABOVE; U; No_Joining_Group
719 | 10AE4; MANICHAEAN TAW; R; MANICHAEAN TAW
720 | 10AEB; MANICHAEAN ONE; D; MANICHAEAN ONE
721 | 10AEC; MANICHAEAN FIVE; D; MANICHAEAN FIVE
722 | 10AED; MANICHAEAN TEN; D; MANICHAEAN TEN
723 | 10AEE; MANICHAEAN TWENTY; D; MANICHAEAN TWENTY
724 | 10AEF; MANICHAEAN HUNDRED; R; MANICHAEAN HUNDRED
725 | 
726 | # Psalter Pahlavi Characters
727 | 
728 | 10B80; PSALTER PAHLAVI ALEPH; D; No_Joining_Group
729 | 10B81; PSALTER PAHLAVI BETH; R; No_Joining_Group
730 | 10B82; PSALTER PAHLAVI GIMEL; D; No_Joining_Group
731 | 10B83; PSALTER PAHLAVI DALETH; R; No_Joining_Group
732 | 10B84; PSALTER PAHLAVI HE; R; No_Joining_Group
733 | 10B85; PSALTER PAHLAVI WAW-AYIN-RESH; R; No_Joining_Group
734 | 10B86; PSALTER PAHLAVI ZAYIN; D; No_Joining_Group
735 | 10B87; PSALTER PAHLAVI HETH; D; No_Joining_Group
736 | 10B88; PSALTER PAHLAVI YODH; D; No_Joining_Group
737 | 10B89; PSALTER PAHLAVI KAPH; R; No_Joining_Group
738 | 10B8A; PSALTER PAHLAVI LAMEDH; D; No_Joining_Group
739 | 10B8B; PSALTER PAHLAVI MEM-QOPH; D; No_Joining_Group
740 | 10B8C; PSALTER PAHLAVI NUN; R; No_Joining_Group
741 | 10B8D; PSALTER PAHLAVI SAMEKH; D; No_Joining_Group
742 | 10B8E; PSALTER PAHLAVI PE; R; No_Joining_Group
743 | 10B8F; PSALTER PAHLAVI SADHE; R; No_Joining_Group
744 | 10B90; PSALTER PAHLAVI SHIN; D; No_Joining_Group
745 | 10B91; PSALTER PAHLAVI TAW; R; No_Joining_Group
746 | 10BA9; PSALTER PAHLAVI ONE; R; No_Joining_Group
747 | 10BAA; PSALTER PAHLAVI TWO; R; No_Joining_Group
748 | 10BAB; PSALTER PAHLAVI THREE; R; No_Joining_Group
749 | 10BAC; PSALTER PAHLAVI FOUR; R; No_Joining_Group
750 | 10BAD; PSALTER PAHLAVI TEN; D; No_Joining_Group
751 | 10BAE; PSALTER PAHLAVI TWENTY; D; No_Joining_Group
752 | 10BAF; PSALTER PAHLAVI HUNDRED; U; No_Joining_Group
753 | 
754 | # Hanifi Rohingya Characters
755 | 
756 | 10D00; HANIFI ROHINGYA A; L; No_Joining_Group
757 | 10D01; HANIFI ROHINGYA BA; D; No_Joining_Group
758 | 10D02; HANIFI ROHINGYA PA; D; HANIFI ROHINGYA PA
759 | 10D03; HANIFI ROHINGYA TA; D; No_Joining_Group
760 | 10D04; HANIFI ROHINGYA TTA; D; No_Joining_Group
761 | 10D05; HANIFI ROHINGYA JA; D; No_Joining_Group
762 | 10D06; HANIFI ROHINGYA CA; D; No_Joining_Group
763 | 10D07; HANIFI ROHINGYA HA; D; No_Joining_Group
764 | 10D08; HANIFI ROHINGYA KHA; D; No_Joining_Group
765 | 10D09; HANIFI ROHINGYA PA WITH DOT ABOVE; D; HANIFI ROHINGYA PA
766 | 10D0A; HANIFI ROHINGYA DA; D; No_Joining_Group
767 | 10D0B; HANIFI ROHINGYA DDA; D; No_Joining_Group
768 | 10D0C; HANIFI ROHINGYA RA; D; No_Joining_Group
769 | 10D0D; HANIFI ROHINGYA RRA; D; No_Joining_Group
770 | 10D0E; HANIFI ROHINGYA ZA; D; No_Joining_Group
771 | 10D0F; HANIFI ROHINGYA SA; D; No_Joining_Group
772 | 10D10; HANIFI ROHINGYA SHA; D; No_Joining_Group
773 | 10D11; HANIFI ROHINGYA KA; D; No_Joining_Group
774 | 10D12; HANIFI ROHINGYA GA; D; No_Joining_Group
775 | 10D13; HANIFI ROHINGYA LA; D; No_Joining_Group
776 | 10D14; HANIFI ROHINGYA MA; D; No_Joining_Group
777 | 10D15; HANIFI ROHINGYA NA; D; No_Joining_Group
778 | 10D16; HANIFI ROHINGYA WA; D; No_Joining_Group
779 | 10D17; HANIFI ROHINGYA KINNA WA; D; No_Joining_Group
780 | 10D18; HANIFI ROHINGYA YA; D; No_Joining_Group
781 | 10D19; HANIFI ROHINGYA KINNA YA; D; HANIFI ROHINGYA KINNA YA
782 | 10D1A; HANIFI ROHINGYA NGA; D; No_Joining_Group
783 | 10D1B; HANIFI ROHINGYA NYA; D; No_Joining_Group
784 | 10D1C; HANIFI ROHINGYA PA WITH 3 DOTS ABOVE; D; HANIFI ROHINGYA PA
785 | 10D1D; HANIFI ROHINGYA VOWEL A; D; No_Joining_Group
786 | 10D1E; HANIFI ROHINGYA DOTLESS KINNA YA WITH LEFT-FACING HOOK BELOW; D; HANIFI ROHINGYA KINNA YA
787 | 10D1F; HANIFI ROHINGYA VOWEL U; D; No_Joining_Group
788 | 10D20; HANIFI ROHINGYA DOTLESS KINNA YA WITH RIGHT-FACING HOOK BELOW; D; HANIFI ROHINGYA KINNA YA
789 | 10D21; HANIFI ROHINGYA VOWEL O; D; No_Joining_Group
790 | 10D22; HANIFI ROHINGYA SAKIN; R; No_Joining_Group
791 | 10D23; HANIFI ROHINGYA DOTLESS KINNA YA WITH DOT ABOVE; D; HANIFI ROHINGYA KINNA YA
792 | 
793 | # Sogdian Characters
794 | 
795 | 10F30; SOGDIAN ALEPH; D; No_Joining_Group
796 | 10F31; SOGDIAN BETH; D; No_Joining_Group
797 | 10F32; SOGDIAN GIMEL; D; No_Joining_Group
798 | 10F33; SOGDIAN HE; R; No_Joining_Group
799 | 10F34; SOGDIAN WAW; D; No_Joining_Group
800 | 10F35; SOGDIAN ZAYIN; D; No_Joining_Group
801 | 10F36; SOGDIAN HETH; D; No_Joining_Group
802 | 10F37; SOGDIAN YODH; D; No_Joining_Group
803 | 10F38; SOGDIAN KAPH; D; No_Joining_Group
804 | 10F39; SOGDIAN LAMEDH; D; No_Joining_Group
805 | 10F3A; SOGDIAN MEM; D; No_Joining_Group
806 | 10F3B; SOGDIAN NUN; D; No_Joining_Group
807 | 10F3C; SOGDIAN SAMEKH; D; No_Joining_Group
808 | 10F3D; SOGDIAN AYIN; D; No_Joining_Group
809 | 10F3E; SOGDIAN PE; D; No_Joining_Group
810 | 10F3F; SOGDIAN SADHE; D; No_Joining_Group
811 | 10F40; SOGDIAN RESH-AYIN; D; No_Joining_Group
812 | 10F41; SOGDIAN SHIN; D; No_Joining_Group
813 | 10F42; SOGDIAN TAW; D; No_Joining_Group
814 | 10F43; SOGDIAN FETH; D; No_Joining_Group
815 | 10F44; SOGDIAN LESH; D; No_Joining_Group
816 | 10F45; SOGDIAN INDEPENDENT SHIN; U; No_Joining_Group
817 | 10F51; SOGDIAN ONE; D; No_Joining_Group
818 | 10F52; SOGDIAN TEN; D; No_Joining_Group
819 | 10F53; SOGDIAN TWENTY; D; No_Joining_Group
820 | 10F54; SOGDIAN ONE HUNDRED; R; No_Joining_Group
821 | 
822 | # Chorasmian Characters
823 | 
824 | 10FB0; CHORASMIAN ALEPH; D; No_Joining_Group
825 | 10FB1; CHORASMIAN SMALL ALEPH; U; No_Joining_Group
826 | 10FB2; CHORASMIAN BETH; D; No_Joining_Group
827 | 10FB3; CHORASMIAN GIMEL; D; No_Joining_Group
828 | 10FB4; CHORASMIAN DALETH; R; No_Joining_Group
829 | 10FB5; CHORASMIAN HE; R; No_Joining_Group
830 | 10FB6; CHORASMIAN WAW; R; No_Joining_Group
831 | 10FB7; CHORASMIAN CURLED WAW; U; No_Joining_Group
832 | 10FB8; CHORASMIAN ZAYIN; D; No_Joining_Group
833 | 10FB9; CHORASMIAN HETH; R; No_Joining_Group
834 | 10FBA; CHORASMIAN YODH; R; No_Joining_Group
835 | 10FBB; CHORASMIAN KAPH; D; No_Joining_Group
836 | 10FBC; CHORASMIAN LAMEDH; D; No_Joining_Group
837 | 10FBD; CHORASMIAN MEM; R; No_Joining_Group
838 | 10FBE; CHORASMIAN NUN; D; No_Joining_Group
839 | 10FBF; CHORASMIAN SAMEKH; D; No_Joining_Group
840 | 10FC0; CHORASMIAN AYIN; U; No_Joining_Group
841 | 10FC1; CHORASMIAN PE; D; No_Joining_Group
842 | 10FC2; CHORASMIAN RESH; R; No_Joining_Group
843 | 10FC3; CHORASMIAN SHIN; R; No_Joining_Group
844 | 10FC4; CHORASMIAN TAW; D; No_Joining_Group
845 | 10FC5; CHORASMIAN ONE; U; No_Joining_Group
846 | 10FC6; CHORASMIAN TWO; U; No_Joining_Group
847 | 10FC7; CHORASMIAN THREE; U; No_Joining_Group
848 | 10FC8; CHORASMIAN FOUR; U; No_Joining_Group
849 | 10FC9; CHORASMIAN TEN; R; No_Joining_Group
850 | 10FCA; CHORASMIAN TWENTY; D; No_Joining_Group
851 | 10FCB; CHORASMIAN ONE HUNDRED; L; No_Joining_Group
852 | 
853 | # Kaithi Number Signs
854 | # These are prepended concatenation marks, comparable
855 | # to the number signs in the Arabic script.
856 | # Listed here for consistency in property values.
857 | 
858 | 110BD; KAITHI NUMBER SIGN; U; No_Joining_Group
859 | 110CD; KAITHI NUMBER SIGN ABOVE; U; No_Joining_Group
860 | 
861 | # Adlam Characters
862 | 
863 | 1E900;ADLAM CAPITAL ALIF; D; No_Joining_Group
864 | 1E901;ADLAM CAPITAL DAALI; D; No_Joining_Group
865 | 1E902;ADLAM CAPITAL LAAM; D; No_Joining_Group
866 | 1E903;ADLAM CAPITAL MIIM; D; No_Joining_Group
867 | 1E904;ADLAM CAPITAL BA; D; No_Joining_Group
868 | 1E905;ADLAM CAPITAL SINNYIIYHE; D; No_Joining_Group
869 | 1E906;ADLAM CAPITAL PE; D; No_Joining_Group
870 | 1E907;ADLAM CAPITAL BHE; D; No_Joining_Group
871 | 1E908;ADLAM CAPITAL RA; D; No_Joining_Group
872 | 1E909;ADLAM CAPITAL E; D; No_Joining_Group
873 | 1E90A;ADLAM CAPITAL FA; D; No_Joining_Group
874 | 1E90B;ADLAM CAPITAL I; D; No_Joining_Group
875 | 1E90C;ADLAM CAPITAL O; D; No_Joining_Group
876 | 1E90D;ADLAM CAPITAL DHA; D; No_Joining_Group
877 | 1E90E;ADLAM CAPITAL YHE; D; No_Joining_Group
878 | 1E90F;ADLAM CAPITAL WAW; D; No_Joining_Group
879 | 1E910;ADLAM CAPITAL NUN; D; No_Joining_Group
880 | 1E911;ADLAM CAPITAL KAF; D; No_Joining_Group
881 | 1E912;ADLAM CAPITAL YA; D; No_Joining_Group
882 | 1E913;ADLAM CAPITAL U; D; No_Joining_Group
883 | 1E914;ADLAM CAPITAL JIIM; D; No_Joining_Group
884 | 1E915;ADLAM CAPITAL CHI; D; No_Joining_Group
885 | 1E916;ADLAM CAPITAL HA; D; No_Joining_Group
886 | 1E917;ADLAM CAPITAL QAAF; D; No_Joining_Group
887 | 1E918;ADLAM CAPITAL GA; D; No_Joining_Group
888 | 1E919;ADLAM CAPITAL NYA; D; No_Joining_Group
889 | 1E91A;ADLAM CAPITAL TU; D; No_Joining_Group
890 | 1E91B;ADLAM CAPITAL NHA; D; No_Joining_Group
891 | 1E91C;ADLAM CAPITAL VA; D; No_Joining_Group
892 | 1E91D;ADLAM CAPITAL KHA; D; No_Joining_Group
893 | 1E91E;ADLAM CAPITAL GBE; D; No_Joining_Group
894 | 1E91F;ADLAM CAPITAL ZAL; D; No_Joining_Group
895 | 1E920;ADLAM CAPITAL KPO; D; No_Joining_Group
896 | 1E921;ADLAM CAPITAL SHA; D; No_Joining_Group
897 | 1E922;ADLAM SMALL ALIF; D; No_Joining_Group
898 | 1E923;ADLAM SMALL DAALI; D; No_Joining_Group
899 | 1E924;ADLAM SMALL LAAM; D; No_Joining_Group
900 | 1E925;ADLAM SMALL MIIM; D; No_Joining_Group
901 | 1E926;ADLAM SMALL BA; D; No_Joining_Group
902 | 1E927;ADLAM SMALL SINNYIIYHE; D; No_Joining_Group
903 | 1E928;ADLAM SMALL PE; D; No_Joining_Group
904 | 1E929;ADLAM SMALL BHE; D; No_Joining_Group
905 | 1E92A;ADLAM SMALL RA; D; No_Joining_Group
906 | 1E92B;ADLAM SMALL E; D; No_Joining_Group
907 | 1E92C;ADLAM SMALL FA; D; No_Joining_Group
908 | 1E92D;ADLAM SMALL I; D; No_Joining_Group
909 | 1E92E;ADLAM SMALL O; D; No_Joining_Group
910 | 1E92F;ADLAM SMALL DHA; D; No_Joining_Group
911 | 1E930;ADLAM SMALL YHE; D; No_Joining_Group
912 | 1E931;ADLAM SMALL WAW; D; No_Joining_Group
913 | 1E932;ADLAM SMALL NUN; D; No_Joining_Group
914 | 1E933;ADLAM SMALL KAF; D; No_Joining_Group
915 | 1E934;ADLAM SMALL YA; D; No_Joining_Group
916 | 1E935;ADLAM SMALL U; D; No_Joining_Group
917 | 1E936;ADLAM SMALL JIIM; D; No_Joining_Group
918 | 1E937;ADLAM SMALL CHI; D; No_Joining_Group
919 | 1E938;ADLAM SMALL HA; D; No_Joining_Group
920 | 1E939;ADLAM SMALL QAAF; D; No_Joining_Group
921 | 1E93A;ADLAM SMALL GA; D; No_Joining_Group
922 | 1E93B;ADLAM SMALL NYA; D; No_Joining_Group
923 | 1E93C;ADLAM SMALL TU; D; No_Joining_Group
924 | 1E93D;ADLAM SMALL NHA; D; No_Joining_Group
925 | 1E93E;ADLAM SMALL VA; D; No_Joining_Group
926 | 1E93F;ADLAM SMALL KHA; D; No_Joining_Group
927 | 1E940;ADLAM SMALL GBE; D; No_Joining_Group
928 | 1E941;ADLAM SMALL ZAL; D; No_Joining_Group
929 | 1E942;ADLAM SMALL KPO; D; No_Joining_Group
930 | 1E943;ADLAM SMALL SHA; D; No_Joining_Group
931 | 1E94B;ADLAM NASALIZATION MARK; T; No_Joining_Group
932 | 
933 | # EOF
934 | 


--------------------------------------------------------------------------------
/uc_spec/gen_idna_mapping_mod.escript:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env escript
  2 | %% -*- erlang -*-
  3 | %%! +A0
  4 | 
  5 | %% Generates ../src/idna_mapping.erl (the UTS #46 mapping function
  6 | %% uts46_map/1) from ../uc_spec/IdnaMappingTable.txt.
  7 | %%
  8 | %% Both paths are relative to the current working directory, so the
  9 | %% script must be started from a directory located next to uc_spec/ and
 10 | %% src/ (for example from inside uc_spec/ itself).
 11 | %%
 12 | %% The string functions trim/lexemes/lowercase are called directly: they
 13 | %% exist since OTP 20, the oldest release supported by this project.
 14 | 
 15 | -mode(compile).
 16 | 
 17 | -define(MOD, "idna_mapping").
 18 | 
 19 | -export([main/1]).
 20 | 
 21 | %% Status column of IdnaMappingTable.txt => {Tag, HasMapping}.
 22 | %% Tag is the atom written into the generated clause. When HasMapping is
 23 | %% true the clause returns {Tag, Mapping}, otherwise only Tag.
 24 | -define(UTS46_STATUSES, #{
 25 |   "valid" => {'V', false},
 26 |   "ignored" => {'I', false},
 27 |   "mapped" => {'M', true},
 28 |   "deviation" => {'D', true},
 29 |   "disallowed" => {'X', false},
 30 |   "disallowed_STD3_valid" => {'3', false},
 31 |   "disallowed_STD3_mapped" => {'3', true}
 32 | }).
 33 | 
 34 | 
 35 | %% Entry point; command line arguments are ignored.
 36 | main(_) ->
 37 |   Data = read_mapping_table("../uc_spec/IdnaMappingTable.txt"),
 38 | 
 39 |   %% Make module
 40 |   OutputPath = filename:join(["..", "src", ?MOD++".erl"]),
 41 |   {ok, Out} = file:open(OutputPath, [write]),
 42 |   gen_file(Out, Data),
 43 |   ok = file:close(Out),
 44 |   ok.
 45 | 
 46 | %% Parses every data line of the file at Path. Entries come back in
 47 | %% reverse file order (gen_uts46/2 sorts them before use). The file is
 48 | %% closed even when a line cannot be parsed.
 49 | read_mapping_table(Path) ->
 50 |   {ok, IM} = file:open(Path, [read, raw, {read_ahead, 1000000}]),
 51 |   try
 52 |     foldl(fun parse_idna_mapping/2, [], IM)
 53 |   after
 54 |     file:close(IM)
 55 |   end.
 56 | 
 57 | %% Turns one data line into {Range, {Status, Mapping, Idna2008Status}} and
 58 | %% prepends it to Acc. Text after the first "#" is a comment; what is left
 59 | %% holds two to four ";"-separated fields, and fields that are absent are
 60 | %% stored as 'undefined'. Any other field count crashes with case_clause.
 61 | parse_idna_mapping(Line0, Acc) ->
 62 |   [Line|_Comments] = tokens(Line0, "#"),
 63 |   Entry =
 64 |     case tokens(Line, ";") of
 65 |       [CodePoints, Status] ->
 66 |         {to_range(CodePoints), {string:trim(Status, both), undefined, undefined}};
 67 |       [CodePoints, Status, Mapping] ->
 68 |         {to_range(CodePoints), {string:trim(Status, both), to_mapping(Mapping), undefined}};
 69 |       [CodePoints, Status, Mapping, Idna2008Status] ->
 70 |         {to_range(CodePoints),
 71 |          {string:trim(Status, both), to_mapping(Mapping), to_atom(Idna2008Status)}}
 72 |     end,
 73 |   [Entry | Acc].
 74 | 
 75 | %% Converts a space separated list of hexadecimal code points into a list
 76 | %% of integers. An empty field, or one made of spaces only, gives [].
 77 | to_mapping(Mapping) ->
 78 |   [hex_to_int(C) || C <- string:lexemes(Mapping, " ")].
 79 | 
 80 | %% Parses "XXXX" into {CodePoint, undefined} and "XXXX..YYYY" into
 81 | %% {First, Last}.
 82 | to_range(CodePoints0) ->
 83 |   case tokens(CodePoints0, ".") of
 84 |     [CodePoint] ->
 85 |       {hex_to_int(CodePoint), undefined};
 86 |     [CodePoint1, "", CodePoint2] ->
 87 |       {hex_to_int(CodePoint1), hex_to_int(CodePoint2)}
 88 |   end.
 89 | 
 90 | 
 91 | %% Writes the complete generated module to Fd.
 92 | gen_file(Fd, Data) ->
 93 |   gen_header(Fd),
 94 |   gen_uts46(Fd, Data),
 95 |   ok.
 96 | 
 97 | %% Writes the "generated file" banner followed by the module, compile and
 98 | %% export attributes of the generated module.
 99 | gen_header(Fd) ->
100 |   io:put_chars(Fd, "%%\n%% this file is generated do not modify\n"),
101 |   io:put_chars(Fd, "%% see ../uc_spec/gen_idna_mapping_mod.escript\n\n"),
102 |   io:put_chars(Fd, "-module(" ++ ?MOD ++").\n"),
103 |   io:put_chars(Fd, "-compile(compressed).\n"),
104 |   io:put_chars(Fd, "-export([uts46_map/1]).\n"),
105 |   ok.
106 | 
107 | %% Writes one uts46_map/1 clause per table entry, then a final clause
108 | %% that raises badarg for code points the table does not cover.
109 | gen_uts46(Fd, Data) ->
110 |   lists:foreach(
111 |     fun({CP, {S, M, _Idna2008Status}}) ->
112 |         Head = gen_single_clause(CP),
113 |         case maps:get(S, ?UTS46_STATUSES) of
114 |           {Status, true} ->
115 |             io:format(Fd, "uts46_map~s {~p, ~w};\n", [Head, Status, M]);
116 |           {Status, false} ->
117 |             io:format(Fd, "uts46_map~s ~p;\n", [Head, Status])
118 |         end
119 |     end,
120 |     optimize_ranges(lists:sort(Data))
121 |   ),
122 |   io:put_chars(Fd, "uts46_map(_) -> erlang:error(badarg).\n").
123 | 
124 | %% Renders the head of a generated clause (without the function name): a
125 | %% literal match for a single code point, a guarded match for a range.
126 | gen_single_clause({R0, undefined}) ->
127 |   io_lib:format("(~w) ->", [R0]);
128 | gen_single_clause({R0, R1}) ->
129 |   io_lib:format("(CP) when ~w =< CP, CP =< ~w ->", [R0,R1]).
130 | 
131 | %% Puts every single code point entry in front of the range entries; the
132 | %% relative order inside each group is kept.
133 | optimize_ranges(Rs0) ->
134 |   IsSingle = fun
135 |                ({{N, undefined}, _}) when is_integer(N) -> true;
136 |                (_) -> false
137 |              end,
138 |   {Singles, Rs} = lists:partition(IsSingle, Rs0),
139 |   Singles ++ Rs.
140 | 
141 | 
142 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
143 | 
144 | %% Parses a hexadecimal string, ignoring surrounding whitespace. The empty
145 | %% string is passed through as [].
146 | hex_to_int([]) -> [];
147 | hex_to_int(HexStr) ->
148 |   list_to_integer(string:trim(HexStr, both), 16).
149 | 
150 | %% Lower-cases the trimmed string and converts it to an atom.
151 | to_atom(Str) ->
152 |   list_to_atom(string:lowercase(string:trim(Str, both))).
153 | 
154 | %% Folds Fun over the lines of the open file Fd, skipping lines that start
155 | %% with "#" and lines that consist of a newline only.
156 | foldl(Fun, Acc, Fd) ->
157 |   Get = fun() -> file:read_line(Fd) end,
158 |   foldl_1(Fun, Acc, Get).
159 | 
160 | %% Fun may return {done, Acc} to stop before the end of the file.
161 | foldl_1(_Fun, {done, Acc}, _Get) -> Acc;
162 | foldl_1(Fun, Acc, Get) ->
163 |   case Get() of
164 |     eof -> Acc;
165 |     {ok, "#" ++ _} -> %% Ignore comments
166 |       foldl_1(Fun, Acc, Get);
167 |     {ok, "\n"} -> %% Ignore empty lines
168 |       foldl_1(Fun, Acc, Get);
169 |     {ok, Line} ->
170 |       foldl_1(Fun, Fun(Line, Acc), Get)
171 |   end.
172 | 
173 | 
174 | %% Splits S at every occurrence of the single separator character C.
175 | %% Differs from string:tokens, it returns empty string as token between
176 | %% two delimiters. A separator at the very end of S also yields a trailing
177 | %% empty token, whereas one at the very start yields no leading one.
178 | tokens(S, [C]) ->
179 |   tokens(lists:reverse(S), C, []).
180 | 
181 | %% Works on the reversed input, at a token boundary: a separator met here
182 | %% closes an empty token, any other character opens a new token.
183 | tokens([Sep|S], Sep, Toks) ->
184 |   tokens(S, Sep, [[]|Toks]);
185 | tokens([C|S], Sep, Toks) ->
186 |   tokens_2(S, Sep, Toks, [C]);
187 | tokens([], _, Toks) ->
188 |   Toks.
189 | 
190 | %% Collects the characters of the current token; prepending the reversed
191 | %% input restores the original character order.
192 | tokens_2([Sep|S], Sep, Toks, Tok) ->
193 |   tokens(S, Sep, [Tok|Toks]);
194 | tokens_2([C|S], Sep, Toks, Tok) ->
195 |   tokens_2(S, Sep, Toks, [C|Tok]);
196 | tokens_2([], _Sep, Toks, Tok) ->
197 |   [Tok|Toks].
198 | 


--------------------------------------------------------------------------------
/uc_spec/gen_idna_table_mod.escript:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env escript
  2 | %% -*- erlang -*-
  3 | %%! +A0
  4 | 
  5 | -mode(compile).
  6 | 
  7 | -define(MOD, "idna_table").
  8 | 
  9 | -export([main/1]).
 10 | 
 11 | -ifdef('OTP_RELEASE').
 12 | -define(trim(Str), string:trim(Str, both)).
 13 | -define(lexemes(Str, Pat), string:lexemes(Str, Pat)).
 14 | -else.
 15 | -define(trim(Str), string:strip(Str, both)).
 16 | -define(lexemes(Str, Pat), string:strip(string:tokens(Str, Pat), both)).
 17 | -endif.
 18 | 
 19 | 
 20 | main(_) ->
 21 |   {ok, IM} = file:open("../uc_spec/idna-table.txt", [read, raw, {read_ahead, 1000000}]),
 22 |   Data = foldl(fun parse_idna_table/2, [], IM),
 23 |   file:close(IM),
 24 | 
 25 |   %% Make module
 26 |   OutputPath = filename:join(["..", "src", ?MOD++".erl"]),
 27 |   {ok, Out} = file:open(OutputPath, [write]),
 28 |   gen_file(Out, Data),
 29 |   ok = file:close(Out),
 30 |   ok.
 31 | 
 32 | 
 33 | parse_idna_table(Line0, Acc) ->
 34 |   [Line|_Comments] = tokens(Line0, "#"),
 35 |   [CodePoints, Status] = tokens(Line, ";"),
 36 |   [{to_range(CodePoints), to_atom(Status)} | Acc].
 37 | 
 38 | 
 39 | gen_file(Fd, Data) ->
 40 |   gen_header(Fd),
 41 |   gen_disallowed_p(Fd),
 42 |   gen_contextj_p(Fd),
 43 |   gen_contexto_p(Fd),
 44 |   gen_unassigned_p(Fd),
 45 |   gen_valid_p(Fd),
 46 |   gen_lookup(Fd, Data),
 47 |   ok.
 48 | 
 49 | 
 50 | gen_header(Fd) ->
 51 |   io:put_chars(Fd, "%%\n%% this file is generated do not modify\n"),
 52 |   io:put_chars(Fd, "%% see ../uc_spec/gen_idna_table.escript\n\n"),
 53 |   io:put_chars(Fd, "-module(" ++ ?MOD ++").\n"),
 54 |   io:put_chars(Fd, "-compile(compressed).\n"),
 55 |   io:put_chars(Fd, "-export([lookup/1]).\n"),
 56 |   io:put_chars(Fd, "-export([disallowed_p/1, contextj_p/1, contexto_p/1, unassigned_p/1, valid_p/1]).\n"),
 57 |   ok.
 58 | 
 59 | gen_disallowed_p(Fd) ->
 60 |   io:put_chars(Fd, "disallowed_p(CP) -> lookup(CP) == 'DISALLOWED'.\n").
 61 | 
 62 | gen_contextj_p(Fd) ->
 63 |   io:put_chars(Fd, "contextj_p(CP) -> lookup(CP) == 'CONTEXTJ'.\n").
 64 | 
 65 | gen_contexto_p(Fd) ->
 66 |   io:put_chars(Fd, "contexto_p(CP) -> lookup(CP) == 'CONTEXTO'.\n").
 67 | 
 68 | gen_unassigned_p(Fd) ->
 69 |   io:put_chars(Fd, "unassigned_p(CP) -> lookup(CP) == 'UNASSIGNED'.\n").
 70 | 
 71 | gen_valid_p(Fd) ->
 72 |   io:put_chars(Fd, "valid_p(CP) -> lookup(CP) == 'PVALID'.\n").
 73 | 
 74 | gen_lookup(Fd, Data) ->
 75 |   lists:foreach(fun({Cp, Class}) ->
 76 |     io:format(Fd, "lookup~s ~p;~n", [gen_single_clause(Cp), Class])
 77 |                 end,
 78 |     optimize_ranges(lists:sort(Data))),
 79 |   io:put_chars(Fd, "lookup(_) -> 'UNASSIGNED'."),
 80 |   ok.
 81 | 
 82 | gen_single_clause({R0, undefined}) ->
 83 |   io_lib:format("(~w) ->", [R0]);
 84 | gen_single_clause({R0, R1}) ->
 85 |   io_lib:format("(CP) when ~w =< CP, CP =< ~w ->", [R0,R1]).
 86 | 
 87 | optimize_ranges(Rs0) ->
 88 |   PF = fun
 89 |          ({{N, undefined}, _}) when is_integer(N) -> true;
 90 |          (_) -> false
 91 |        end,
 92 | 
 93 |   {Singles, Rs} = lists:partition(PF, Rs0),
 94 |   Singles ++ Rs.
 95 | 
 96 | 
 97 | to_range(CodePoints0) ->
 98 |   case tokens(CodePoints0, ".") of
 99 |     [CodePoint] ->
100 |       {hex_to_int(CodePoint), undefined};
101 |     [CodePoint1, "", CodePoint2] ->
102 |       {hex_to_int(CodePoint1), hex_to_int(CodePoint2)}
103 |   end.
104 | 
105 | hex_to_int([]) -> [];
106 | hex_to_int(HexStr) ->
107 |   list_to_integer(?trim(HexStr), 16).
108 | 
109 | to_atom(Str) ->
110 |   list_to_atom(?trim(Str)).
111 | 
112 | foldl(Fun, Acc, Fd) ->
113 |   Get = fun() -> file:read_line(Fd) end,
114 |   foldl_1(Fun, Acc, Get).
115 | 
%% Line-by-line fold driven by the zero-arity reader `Get`.
%%
%% `Get()` must return `eof` or `{ok, Line}`. Lines starting with "#" and
%% lines consisting of just "\n" are skipped; every other line is passed to
%% `Fun(Line, Acc)`. The fold ends at `eof`, or as soon as the accumulator has
%% the form `{done, Result}`, in which case `Result` is returned.
foldl_1(_Fun, {done, Result}, _Get) ->
  Result;
foldl_1(Fun, Acc0, Get) ->
  case Get() of
    eof ->
      Acc0;
    {ok, Line} ->
      Acc1 =
        case is_skipped_line(Line) of
          true -> Acc0;
          false -> Fun(Line, Acc0)
        end,
      foldl_1(Fun, Acc1, Get)
  end.

%% True for comment lines and empty lines, which the fold ignores.
is_skipped_line("#" ++ _) -> true;
is_skipped_line("\n") -> true;
is_skipped_line(_) -> false.
127 | 
128 | 
129 | 
%% Splits `S` on the single separator character given as a one-element string.
%%
%% Unlike string:tokens/2, an empty field between two adjacent separators (or
%% after a trailing separator) is returned as an empty string. One empty field
%% at the very start of the input is not reported, so ";a" gives ["a"] and the
%% empty string gives [].
tokens(S, [C]) ->
  drop_leading_empty(split_fields(S, C)).

%% Splits on every occurrence of `Sep`, keeping all empty fields.
split_fields(Chars, Sep) ->
  case lists:splitwith(fun(Ch) -> Ch =/= Sep end, Chars) of
    {Field, []} ->
      [Field];
    {Field, [_Sep | Tail]} ->
      [Field | split_fields(Tail, Sep)]
  end.

%% Removes the first field when it is empty.
drop_leading_empty([[] | Fields]) -> Fields;
drop_leading_empty(Fields) -> Fields.
147 | 


--------------------------------------------------------------------------------
/uc_spec/gen_idnadata_mod.escript:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env escript
  2 | %% -*- erlang -*-
  3 | %%! +A0
  4 | 
  5 | -mode(compile).
  6 | 
  7 | -define(MOD, "idna_data").
  8 | 
  9 | -export([main/1]).
 10 | 
%% String helper macros with two implementations selected at compile time.
%% When the `OTP_RELEASE` macro is defined they expand to string:chomp/1,
%% string:trim/2, string:lexemes/2 and string:lowercase/1; otherwise they fall
%% back to string:strip and string:tokens based equivalents.
%% NOTE(review): `OTP_RELEASE` is presumably predefined only by OTP 21 and
%% later - confirm against the preprocessor documentation.
%% `?lexemes` is not referenced by any function in this script.
-ifdef('OTP_RELEASE').
-define(chomp(Str), string:chomp(Str)).
-define(trim(Str, Dir), string:trim(Str, Dir)).
-define(lexemes(Str, Pat), string:lexemes(Str, Pat)).
-define(lower(C), string:lowercase(C)).
-else.
%% Fallback definitions: `?chomp` strips trailing "\n" characters only and
%% `?trim` strips blanks in the requested direction.
-define(chomp(Str), string:strip(Str, right, $\n)).
-define(trim(Str, Dir), string:strip(Str, Dir)).
-define(lexemes(Str, Pat), string:strip(string:tokens(Str, Pat), both)).
-define(lower(C), string:to_lower(C)).
-endif.
 22 | 
%% Default `Bidi_Class` for unassigned codepoints.
%%
%% Each entry is `{{First, Last}, Class}` with inclusive bounds.
%% gen_bidirectional/1 turns every entry into one guarded clause of the
%% generated `bidirectional_1/1`; code points outside all of these ranges get
%% the class "L" from its catch-all clause.
%%
%% Ref: <https://www.unicode.org/Public/UNIDATA/extracted/DerivedBidiClass.txt>
-define(BIDI_CLASS_DEFAULTS, [
  %% "AL": Arabic, Syriac, and Thaana blocks, among others.
  {{16#0600, 16#07BF}, "AL"},
  {{16#0860, 16#086F}, "AL"},
  {{16#08A0, 16#08FF}, "AL"},
  {{16#FB50, 16#FDCF}, "AL"},
  {{16#FDF0, 16#FDFF}, "AL"},
  {{16#FE70, 16#FEFF}, "AL"},
  {{16#00010D00, 16#00010D3F}, "AL"},
  {{16#00010F30, 16#00010F6F}, "AL"},
  {{16#0001EC70, 16#0001ECBF}, "AL"},
  {{16#0001EE00, 16#0001EEFF}, "AL"},
  %% "R": Hebrew, NKo, and Phoenician blocks, among others.
  {{16#0590, 16#05FF}, "R"},
  {{16#07C0, 16#085F}, "R"},
  {{16#0870, 16#089F}, "R"},
  {{16#FB1D, 16#FB4F}, "R"},
  {{16#00010800, 16#00010CFF}, "R"},
  {{16#00010D40, 16#00010F2F}, "R"},
  {{16#00010F70, 16#00010FFF}, "R"},
  {{16#0001E800, 16#0001EC6F}, "R"},
  {{16#0001ECC0, 16#0001EDFF}, "R"},
  {{16#0001EF00, 16#0001EFFF}, "R"},
  %% "ET": Currency Symbols block.
  {{16#20A0, 16#20CF}, "ET"}
]).
 52 | 
 53 | main(_) ->
 54 |   {ok, IM} = file:open("../uc_spec/UnicodeData.txt", [read, raw, {read_ahead, 1000000}]),
 55 |   Data = foldl(fun parse_unicode_data/2, [], IM),
 56 |   file:close(IM),
 57 | 
 58 |   {ok, AS} = file:open("../uc_spec/ArabicShaping.txt", [read, raw, {read_ahead, 1000000}]),
 59 |   JoiningTypes = foldl(fun parse_as/2, [], AS),
 60 |   ok = file:close(AS),
 61 | 
 62 |   {ok, ScriptsF} = file:open("../uc_spec/Scripts.txt", [read, raw, {read_ahead, 1000000}]),
 63 |   Scripts = foldl(fun parse_scripts/2, [], ScriptsF),
 64 |   ok = file:close(ScriptsF),
 65 | 
 66 |   %% Make module
 67 |   OutputPath = filename:join(["..", "src", ?MOD++".erl"]),
 68 |   {ok, Out} = file:open(OutputPath, [write]),
 69 |   gen_file(Out, Data, JoiningTypes, Scripts),
 70 |   ok = file:close(Out),
 71 |   ok.
 72 | 
 73 | gen_file(Fd, Data, JoiningTypes, Scripts) ->
 74 |   gen_header(Fd),
 75 |   gen_bidirectional(Fd),
 76 |   gen_lookup(Fd, Data),
 77 |   gen_joining_types(Fd, JoiningTypes),
 78 |   gen_scripts_types(Fd, Scripts),
 79 |   ok.
 80 | 
 81 | 
 82 | gen_header(Fd) ->
 83 |   io:put_chars(Fd, "%%\n%% this file is generated do not modify\n"),
 84 |   io:put_chars(Fd, "%% see ../uc_spec/gen_idnadata_mod.escript\n\n"),
 85 |   io:put_chars(Fd, "-module(" ++ ?MOD ++").\n"),
 86 |   io:put_chars(Fd, "-compile(compressed).\n"),
 87 |   io:put_chars(Fd, "-export([lookup/1, joining_types/1, scripts/1]).\n"),
 88 |   io:put_chars(Fd, "-export([bidirectional/1]).\n"),
 89 |   ok.
 90 | 
 91 | gen_lookup(Fd, Data) ->
 92 |   lists:foreach(
 93 |     fun({Cp,Tp}) ->
 94 |       io:format(Fd, "lookup(~w) -> ~p;~n", [Cp, Tp])
 95 |     end,
 96 |     lists:sort(Data)
 97 |   ),
 98 |   io:put_chars(Fd, "lookup(_) -> false.\n\n").
 99 | 
%% Emits the generated `bidirectional/1` and `bidirectional_1/1` to `Fd`.
%%
%% The generated `bidirectional/1` returns the second element of the tuple
%% found by `lookup/1`; when `lookup/1` returns `false` it defers to
%% `bidirectional_1/1`, which has one guarded clause per entry of
%% `?BIDI_CLASS_DEFAULTS` and answers "L" for everything else.
gen_bidirectional(Fd) ->
  io:put_chars(Fd, [
    "bidirectional(CP) ->\n",
    "  case lookup(CP) of \n",
    "    {_, C} -> C;\n",
    "    false -> bidirectional_1(CP)\n",
    "  end.\n\n"
  ]),
  EmitDefault = fun({Range, Class}) ->
    io:format(Fd, "bidirectional_1~s ~p;~n", [gen_single_clause(Range), Class])
  end,
  lists:foreach(EmitDefault, lists:sort(?BIDI_CLASS_DEFAULTS)),
  io:put_chars(Fd, "bidirectional_1(_) -> \"L\".\n\n").
113 | 
%% Emits the generated `joining_types/1` function to `Fd`.
%%
%% `JoiningTypes` is a list of `{CodePoint, JoiningType}` pairs; each sorted
%% pair becomes one literal clause returning the trimmed joining type string.
%% Code points without an entry map to `undefined`.
gen_joining_types(Fd, JoiningTypes) ->
  Sorted = lists:sort(JoiningTypes),
  EmitClause = fun({CodePoint, JoiningType}) ->
    io:format(Fd, "joining_types(~w) -> ~p;~n", [CodePoint, ?trim(JoiningType, both)])
  end,
  lists:foreach(EmitClause, Sorted),
  io:put_chars(Fd, "joining_types(_) -> undefined.\n\n").
122 | 
%% Emits the generated `scripts/1` function to `Fd`.
%%
%% `Scripts` is a list of `{Range, ScriptName}` pairs. They are sorted,
%% reordered by optimize_scripts_ranges/1 and written as one clause each,
%% returning the trimmed, lower-cased script name. Code points that match no
%% clause map to `false`.
gen_scripts_types(Fd, Scripts) ->
  Clauses = optimize_scripts_ranges(lists:sort(Scripts)),
  EmitClause = fun({Range, ScriptName}) ->
    Clause = gen_single_clause(Range),
    io:format(Fd, "scripts~s ~p;~n", [Clause, ?lower(?trim(ScriptName, both))])
  end,
  lists:foreach(EmitClause, Clauses),
  io:put_chars(Fd, "scripts(_) -> false.\n\n").
131 | 
%% Reorders `{Range, Script}` entries so that every single-code-point entry
%% (`{{N, undefined}, _}` with integer `N`) comes before every other entry.
%% The relative order inside each of the two groups is kept.
optimize_scripts_ranges(Rs0) ->
  Split = fun
            ({{N, undefined}, _} = Single, {Singles, Ranges}) when is_integer(N) ->
              {[Single | Singles], Ranges};
            (Range, {Singles, Ranges}) ->
              {Singles, [Range | Ranges]}
          end,
  %% foldr keeps the input order inside both groups.
  {SingleEntries, RangeEntries} = lists:foldr(Split, {[], []}, Rs0),
  SingleEntries ++ RangeEntries.
140 | 
141 | 
%% Renders the argument list, optional guard and arrow of one generated clause.
%%
%% `{CodePoint, undefined}` yields a literal match such as "(65) ->";
%% `{First, Last}` yields a guarded clause matching the inclusive range.
%% Returns io_lib chardata (not necessarily a flat string).
gen_single_clause({CodePoint, undefined}) ->
  io_lib:fwrite("(~w) ->", [CodePoint]);
gen_single_clause({First, Last}) ->
  io_lib:fwrite("(CP) when ~w =< CP, CP =< ~w ->", [First, Last]).
146 | 
147 | 
148 | 
%% Fold step for UnicodeData.txt.
%%
%% Splits the line on ";" and prepends
%% `{CodePoint, {Category, BidiClass}}` to `Acc`, taking the code point from
%% the first field, the category from the third and the bidi class from the
%% fifth. A line with fewer than five fields crashes with a badmatch.
parse_unicode_data(Line0, Acc) ->
  Fields = tokens(?chomp(Line0), ";"),
  [CodePoint, _Name, Cat, _Class, BiDi | _] = Fields,
  Entry = {hex_to_int(CodePoint), {?trim(Cat, both), ?trim(BiDi, both)}},
  [Entry | Acc].
153 | 
%% Fold step for ArabicShaping.txt.
%%
%% Splits the line on ";" and prepends `{CodePoint, JoiningType}` to `Acc`,
%% using the first field as the code point and the trimmed third field as the
%% joining type. Lines with fewer than three fields leave `Acc` unchanged.
parse_as(Line0, Acc) ->
  Fields = tokens(?chomp(Line0), ";"),
  case Fields of
    [CodePoint, _Skipped, JoiningType | _] ->
      Entry = {hex_to_int(CodePoint), ?trim(JoiningType, both)},
      [Entry | Acc];
    _ ->
      Acc
  end.
162 | 
%% Fold step for Scripts.txt.
%%
%% Drops everything from the first "#" on, splits the rest into the code point
%% field and the script name, and prepends `{Range, Script}` to `Acc` when the
%% trimmed script name is one of Greek, Han, Hebrew, Hiragana or Katakana.
%% Entries for any other script are ignored.
parse_scripts(Line0, Acc) ->
  [Line | _Comments] = tokens(Line0, "#"),
  [CodePoints, Script0] = tokens(Line, ";"),
  Script = ?trim(Script0, both),
  Wanted = ["Greek", "Han", "Hebrew", "Hiragana", "Katakana"],
  case lists:member(Script, Wanted) of
    false ->
      Acc;
    true ->
      Entry = {to_range(CodePoints), Script},
      [Entry | Acc]
  end.
173 | 
174 | 
%% Parses a code point field into `{First, Last}`.
%%
%% A single hexadecimal value gives `{Value, undefined}`; a "First..Last"
%% field gives both bounds. tokens/2 reports the empty string between the two
%% dots, which is what the second pattern relies on. Any other shape crashes
%% with a case_clause error.
to_range(CodePoints0) ->
  Fields = tokens(CodePoints0, "."),
  case Fields of
    [Single] ->
      {hex_to_int(Single), undefined};
    [First, [], Last] ->
      {hex_to_int(First), hex_to_int(Last)}
  end.
182 | 
%% Converts a hexadecimal string to an integer after trimming blanks on both
%% sides. The empty string is passed through as `[]`.
hex_to_int([]) ->
  [];
hex_to_int(HexStr) ->
  Digits = ?trim(HexStr, both),
  list_to_integer(Digits, 16).
186 | 
187 | 
%% Folds `Fun(Line, Acc)` over the lines of the open file `Fd`, reading them
%% one at a time with file:read_line/1. Filtering and termination rules are
%% those of foldl_1/3.
foldl(Fun, Acc, Fd) ->
  ReadLine = fun() -> file:read_line(Fd) end,
  foldl_1(Fun, Acc, ReadLine).
191 | 
%% Line-by-line fold driven by the zero-arity reader `Get`.
%%
%% `Get()` must return `eof` or `{ok, Line}`. Lines starting with "#" and
%% lines consisting of just "\n" are skipped; every other line is passed to
%% `Fun(Line, Acc)`. The fold ends at `eof`, or as soon as the accumulator has
%% the form `{done, Final}`, in which case `Final` is returned.
foldl_1(_Fun, {done, Final}, _Get) ->
  Final;
foldl_1(Fun, Acc, Get) ->
  case Get() of
    eof ->
      Acc;
    %% Comment or empty line: keep the accumulator as it is.
    {ok, Line} when hd(Line) =:= $#; Line =:= "\n" ->
      foldl_1(Fun, Acc, Get);
    {ok, Line} ->
      foldl_1(Fun, Fun(Line, Acc), Get)
  end.
203 | 
204 | 
205 | 
%% Splits `S` on the single separator character given as a one-element string.
%%
%% Unlike string:tokens/2, an empty field between two adjacent separators (or
%% after a trailing separator) is returned as an empty string. One empty field
%% at the very start of the input is not reported, so ";a" gives ["a"] and the
%% empty string gives [].
tokens(S, [C]) ->
  %% Walk the input from the right: a separator opens a new (empty) field in
  %% front, any other character is pushed onto the field currently in front.
  Collect = fun
              (Ch, Acc) when Ch =:= C -> [[] | Acc];
              (Ch, [Current | Done]) -> [[Ch | Current] | Done]
            end,
  case lists:foldr(Collect, [[]], S) of
    [[] | Rest] -> Rest;
    Fields -> Fields
  end.
223 | 


--------------------------------------------------------------------------------