├── .gitignore ├── .travis.yml ├── CHANGELOG ├── LICENSE ├── NOTICE ├── README.md ├── bootstrap_travis.sh ├── rebar.config ├── rebar.lock ├── src ├── idna.app.src ├── idna.erl ├── idna_bidi.erl ├── idna_context.erl ├── idna_data.erl ├── idna_logger.hrl ├── idna_mapping.erl ├── idna_table.erl ├── idna_ucs.erl └── punycode.erl ├── test ├── IdnaTestV2.txt ├── compat_test.erl ├── idna_test.erl ├── punycode_test.erl └── uts46_test.erl └── uc_spec ├── ArabicShaping.txt ├── IdnaMappingTable.txt ├── Scripts.txt ├── UnicodeData.txt ├── gen_idna_mapping_mod.escript ├── gen_idna_table_mod.escript ├── gen_idnadata_mod.escript └── idna-table.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .rebar3 2 | _* 3 | .eunit 4 | *.o 5 | *.beam 6 | *.plt 7 | *.swp 8 | *.swo 9 | .erlang.cookie 10 | ebin 11 | log 12 | erl_crash.dump 13 | .rebar 14 | _rel 15 | _deps 16 | _plugins 17 | _tdeps 18 | logs 19 | _build 20 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: erlang 2 | otp_release: 3 | - 20.0 4 | - 20.1.7 5 | - 20.3.8.22 6 | - 21.0.9 7 | - 21.1.4 8 | - 21.2.7 9 | - 21.3.8.1 10 | - 22.0.7 11 | - 22.1.8.1 12 | - 22.2.8 13 | - 22.3.4 14 | - 23.0.2 15 | 16 | 17 | before_script: 18 | - "./bootstrap_travis.sh" 19 | script: "./rebar3 eunit" 20 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | # CHANGELOG 2 | 3 | == 7.0.0 - unreleased 4 | 5 | - remove unicode_compat. 
only support Erlang >= 20 6 | 7 | == 6.1.1 - 2020-12-06 8 | 9 | - fix license information 10 | 11 | == 6.1.0 - 2020-12-05 12 | 13 | - update to Unicode 13.0.0 14 | - bump unicode_util_compat to 0.7.0 15 | - remove support of Erlang < 19.3 16 | - remove support of rebar 2 17 | 18 | == 6.0.1 - 2020-05-14 19 | 20 | - bump to unicode_compat 0.5.0 21 | 22 | == 6.0.0 - 2018-08-30 23 | 24 | - IDNA 2008 support [RFC5891](https://tools.ietf.org/html/rfc5891) 25 | - International Domain Name validation 26 | - fix [Punycode](https://tools.ietf.org/html/rfc3492) algorithm 27 | 28 | Breaking changes: 29 | - `idna:to_ascii/1` in 5.1.x did not encode or enforce rules if the input is already all ascii 30 | 31 | == 5.1.2 - 2018-06-09 32 | 33 | - support build with rebar 2 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright the authors and contributors. All rights reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person 4 | obtaining a copy of this software and associated documentation 5 | files (the "Software"), to deal in the Software without 6 | restriction, including without limitation the rights to use, 7 | copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the 9 | Software is furnished to do so, subject to the following 10 | conditions: 11 | 12 | The above copyright notice and this permission notice shall be 13 | included in all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 17 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 | NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 19 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 20 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | This file is part of erlang-idna released under the MIT license. 2 | See the LICENSE for more information 3 | 4 | Copyright 2014-2020 Benoît Chesneau 5 | Copyright 2009-2014 Tim Fletcher 6 | 7 | Others: 8 | 9 | * idna_ucs.erl: 10 | Under the Apache 2 license 11 | Copyright Ericsson AB 2005-2016. All Rights Reserved -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## erlang-idna 2 | 3 | A pure Erlang IDNA implementation that follows [RFC5891](https://tools.ietf.org/html/rfc5891). 4 | 5 | * supports IDNA 2008 and IDNA 2003. 6 | * label validation: 7 | - [x] **check NFC**: Label must be in Normalization Form C 8 | - [x] **check hyphen**: The Unicode string MUST NOT contain "--" (two consecutive hyphens) in 9 | the third and fourth character positions and MUST NOT start or end 10 | with a "-" (hyphen). 11 | - [x] **Leading Combining Marks**: The Unicode string MUST NOT begin with a combining mark or combining character (see The Unicode Standard, Section 2.11 [Unicode](https://tools.ietf.org/html/rfc5891#ref-Unicode) for an exact definition). 12 | - [x] **Contextual Rules**: The Unicode string MUST NOT contain any characters whose validity is 13 | context-dependent, unless the validity is positively confirmed by a contextual rule. 
To check this, each code point identified as CONTEXTJ or CONTEXTO in the Tables document [RFC5892](https://tools.ietf.org/html/rfc5892#section-2.7) MUST have a non-null rule. If such a code point is missing a rule, the label is invalid. If the rule exists but the result of applying the rule is negative or inconclusive, the proposed label is invalid. 14 | - [x] **check BIDI**: If a label contains any characters from scripts that are 15 | written from right to left, it MUST meet the Bidi criteria [rfc5893](https://tools.ietf.org/html/rfc5893) 16 | 17 | 18 | 19 | 20 | ## Usage 21 | 22 | 23 | 24 | The `idna:encode/{1,2}` and `idna:decode/{1,2}` functions are used to encode or decode Internationalized Domain 25 | Names using the IDNA protocol. 26 | 27 | Input can be mapped to unicode using [uts46](https://unicode.org/reports/tr46/#Introduction) 28 | by setting the `uts46` flag to `true` (default is `false`). If transition from IDNA 2003 to 29 | IDNA 2008 is needed, the flag `transitional` can be set to `true` (default is `false`). If 30 | conformance to STD3 is needed, the flag `std3_rules` can be set to `true` (default is `false`). 31 | 32 | example: 33 | 34 | ```erlang 35 | 1> idna:encode("日本語。JP", [uts46]). 36 | "xn--wgv71a119e.xn--jp-" 37 | 2> idna:encode("日本語.JP", [uts46]). 38 | "xn--wgv71a119e.xn--jp-" 39 | ... 40 | ``` 41 | 42 | 43 | Legacy support of IDNA 2003 is also available with `to_ascii` and `to_unicode` functions: 44 | 45 | 46 | ```erlang 47 | 1> Domain = "www.詹姆斯.com". 48 | [119,119,119,46,35449,22982,26031,46,99,111,109] 49 | 2> Encoded = idna:to_ascii("www.詹姆斯.com"). 50 | "www.xn--8ws00zhy3a.com" 51 | 3> idna:to_unicode(Encoded). 
52 | [119,119,119,46,35449,22982,26031,46,99,111,109] 53 | ``` 54 | 55 | 56 | 57 | Update Unicode data 58 | 59 | wget -O test/IdnaTestV2.txt https://www.unicode.org/Public/idna/latest/IdnaTestV2.txt 60 | wget -O uc_spec/ArabicShaping.txt https://www.unicode.org/Public/UNIDATA/ArabicShaping.txt 61 | wget -O uc_spec/IdnaMappingTable.txt https://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt 62 | wget -O uc_spec/Scripts.txt https://www.unicode.org/Public/UNIDATA/Scripts.txt 63 | wget -O uc_spec/UnicodeData.txt https://www.unicode.org/Public/UNIDATA/UnicodeData.txt 64 | 65 | git clone https://github.com/kjd/idna.git 66 | ./idna/tools/idna-data make-table --version 13.0.0 > uc_spec/idna-table.txt 67 | 68 | cd uc_spec 69 | ./gen_idnadata_mod.escript 70 | ./gen_idna_table_mod.escript 71 | ./gen_idna_mapping_mod.escript 72 | -------------------------------------------------------------------------------- /bootstrap_travis.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | curl -O -L https://s3.amazonaws.com/rebar3/rebar3 4 | chmod +x rebar3 5 | ./rebar3 update 6 | -------------------------------------------------------------------------------- /rebar.config: -------------------------------------------------------------------------------- 1 | {erl_opts, []}. 2 | 3 | {deps, []}. 4 | -------------------------------------------------------------------------------- /rebar.lock: -------------------------------------------------------------------------------- 1 | []. 
2 | -------------------------------------------------------------------------------- /src/idna.app.src: -------------------------------------------------------------------------------- 1 | {application, idna, [ 2 | {description, "A pure Erlang IDNA implementation"}, 3 | {vsn, "7.0.0"}, 4 | {modules, []}, 5 | {registered, []}, 6 | {applications, [kernel, stdlib]}, 7 | 8 | {licenses, ["MIT"]}, 9 | {links, [{"Github", "https://github.com/benoitc/erlang-idna"}]} 10 | 11 | ]}. 12 | -------------------------------------------------------------------------------- /src/idna.erl: -------------------------------------------------------------------------------- 1 | %% -*- coding: utf-8 -*- 2 | %%% 3 | %%% This file is part of erlang-idna released under the MIT license. 4 | %%% See the LICENSE for more information. 5 | %%% 6 | -module(idna). 7 | 8 | %% API 9 | -export([encode/1, encode/2, 10 | decode/1, decode/2]). 11 | 12 | %% compatibility API 13 | -export([to_ascii/1, 14 | to_unicode/1, 15 | utf8_to_ascii/1, 16 | from_ascii/1]). 17 | 18 | 19 | -export([alabel/1, ulabel/1]). 20 | 21 | -export([check_hyphen/1, 22 | check_nfc/1, 23 | check_context/1, 24 | check_initial_combiner/1, 25 | check_label_length/1]). 26 | 27 | -export([check_label/1, check_label/4]). 28 | 29 | -define(ACE_PREFIX, "xn--"). 30 | 31 | -ifdef('OTP_RELEASE'). 32 | -define(lower(C), string:lowercase(C)). 33 | -else. 34 | -define(lower(C), string:to_lower(C)). 35 | -endif. 36 | 37 | -include("idna_logger.hrl"). 38 | 39 | 40 | -type idna_flags() :: [{uts46, boolean()} | 41 | {std3_rules, boolean()} | 42 | {transitional, boolean()}]. 43 | 44 | 45 | 46 | %% @doc encode Internationalized Domain Names using IDNA protocol 47 | -spec encode(string()) -> string(). 48 | encode(Domain) -> 49 | encode(Domain, []). 50 | 51 | 52 | %% @doc encode Internationalized Domain Names using IDNA protocol. 
%% Input can be mapped to unicode using [uts46](https://unicode.org/reports/tr46/#Introduction)
%% by setting the `uts46' flag to `true' (default is `false'). The flags
%% `transitional' (IDNA 2003 to IDNA 2008 transition) and `std3_rules'
%% (STD3 conformance) are only consulted when `uts46' is enabled; both
%% default to `false'. With the `strict' flag the domain is tokenised on "."
%% only, otherwise it is split on every separator of the split pattern.
%% Exits with `empty_domain' when no label is left after splitting.
-spec encode(string(), idna_flags()) -> string().
encode(Domain0, Options) ->
    ok = validate_options(Options),
    Uts46 = proplists:get_value(uts46, Options, false),
    Domain = encode_remap(Uts46, Domain0, Options),
    Strict = proplists:get_value(strict, Options, false),
    case encode_split(Strict, Domain) of
        [] -> exit(empty_domain);
        Labels -> encode_1(Labels, [])
    end.

%% Run the UTS #46 remapping only when the `uts46' flag is on.
encode_remap(false, Domain, _Options) ->
    Domain;
encode_remap(true, Domain, Options) ->
    Std3Rules = proplists:get_value(std3_rules, Options, false),
    Transitional = proplists:get_value(transitional, Options, false),
    uts46_remap(Domain, Std3Rules, Transitional).

%% Cut the domain into labels; the first argument is the `strict' flag.
encode_split(false, Domain) ->
    re:split(Domain, "[.。.。]", [{return, list}, unicode]);
encode_split(true, Domain) ->
    string:tokens(Domain, ".").

%% @doc decode an International Domain Name encoded with the IDNA protocol
-spec decode(string()) -> string().
decode(Domain) ->
    decode(Domain, []).

%% @doc decode an International Domain Name encoded with the IDNA protocol
-spec decode(string(), idna_flags()) -> string().
decode(Domain0, Options) ->
    %% Unknown or malformed flags are rejected before any work is done.
    ok = validate_options(Options),
    Uts46 = proplists:get_value(uts46, Options, false),
    Domain = decode_remap(Uts46, Domain0, Options),
    Strict = proplists:get_value(strict, Options, false),
    %% Labels are always split from the lowercased domain.
    case decode_split(Strict, lowercase(Domain)) of
        [] -> exit(empty_domain);
        Labels -> decode_1(Labels, [])
    end.

%% Run the UTS #46 remapping only when the `uts46' flag is on.
decode_remap(false, Domain, _Options) ->
    Domain;
decode_remap(true, Domain, Options) ->
    Std3Rules = proplists:get_value(std3_rules, Options, false),
    Transitional = proplists:get_value(transitional, Options, false),
    uts46_remap(Domain, Std3Rules, Transitional).

%% Cut the domain into labels; the first argument is the `strict' flag.
decode_split(false, Domain) ->
    re:split(Domain, "[.。.。]", [{return, list}, unicode]);
decode_split(true, Domain) ->
    string:tokens(Domain, ".").


%% Compatibility API
%%

%% @doc encode an International Domain Name with the IDNA protocol
%% (compatibility API, same as `encode/1')
-spec to_ascii(string()) -> string().
to_ascii(Domain) ->
    encode(Domain).

%% @doc decode an encoded International Domain Name with the IDNA protocol
%% (compatibility API, same as `decode/1')
-spec to_unicode(string()) -> string().
to_unicode(Domain) ->
    decode(Domain).

%% @doc convert the input with `idna_ucs:from_utf8/1', then encode the
%% result like `to_ascii/1'
utf8_to_ascii(Domain) ->
    Decoded = idna_ucs:from_utf8(Domain),
    to_ascii(Decoded).

%% @doc like `to_unicode/1': decodes an encoded domain name
-spec from_ascii(nonempty_string()) -> nonempty_string().
from_ascii(Domain) ->
    decode(Domain).
%% Helper functions
%%

%% Validate the option list given to encode/2 and decode/2. Every entry must
%% be one of the flags `uts46', `strict', `std3_rules' or `transitional',
%% either as a bare atom or as `{Flag, boolean()}'. Returns `ok', or raises
%% `badarg' for anything else.
validate_options([]) ->
    ok;
validate_options([{Flag, Value} | Rest]) when is_boolean(Value) ->
    ok = validate_flag(Flag),
    validate_options(Rest);
validate_options([Flag | Rest]) when is_atom(Flag) ->
    ok = validate_flag(Flag),
    validate_options(Rest);
validate_options(_) ->
    %% Reached for an invalid entry at ANY position (not only when it is the
    %% sole element of the list) and for terms that are not proper lists.
    erlang:error(badarg).

%% The closed set of flag names understood by this module.
validate_flag(uts46) -> ok;
validate_flag(strict) -> ok;
validate_flag(std3_rules) -> ok;
validate_flag(transitional) -> ok;
validate_flag(_) -> erlang:error(badarg).

%% Encode each label with alabel/1 and join the results with ".". The
%% accumulator holds the output reversed; no separator is emitted before the
%% first label.
encode_1([], Acc) ->
    lists:reverse(Acc);
encode_1([Label | Labels], []) ->
    encode_1(Labels, lists:reverse(alabel(Label)));
encode_1([Label | Labels], Acc) ->
    encode_1(Labels, lists:reverse(alabel(Label), [$. | Acc])).

%% Return `ok' when normalising the label to NFC leaves it unchanged,
%% otherwise exit with `{bad_label, {nfc, Msg}}'.
check_nfc(Label) ->
    case characters_to_nfc_list(Label) of
        Label ->
            ok;
        _ ->
            erlang:exit({bad_label, {nfc, "Label must be in Normalization Form C"}})
    end.

%% Hyphen check with the check enabled.
check_hyphen(Label) ->
    check_hyphen(Label, true).
%% Hyphen restrictions on a label. The second argument switches the check on
%% or off. When enabled the label must not have "--" in 3rd and 4th position
%% and must not start or end with "-"; a violation exits with
%% `{bad_label, {hyphen, Msg}}'. An empty label contains no hyphen and is
%% accepted instead of crashing.
check_hyphen(_Label, false) ->
    ok;
check_hyphen([], true) ->
    ok;
check_hyphen([_, _, $-, $- | _] = Label, true) ->
    hyphen_exit("Label ~p has disallowed hyphens in 3rd and 4th position", Label);
check_hyphen([$- | _] = Label, true) ->
    hyphen_exit("Label ~p must not start or end with a hyphen", Label);
check_hyphen(Label, true) ->
    case lists:last(Label) of
        $- -> hyphen_exit("Label ~p must not start or end with a hyphen", Label);
        _ -> ok
    end.

%% Exit with a formatted hyphen error for Label.
hyphen_exit(Format, Label) ->
    ErrorMsg = error_msg(Format, [Label]),
    erlang:exit({bad_label, {hyphen, ErrorMsg}}).

%% Reject a label whose first code point makes idna_data:lookup/1 return a
%% tuple whose first element is a string starting with `M'.
%% NOTE(review): presumably that string is the Unicode general category
%% (M* = combining marks) - confirm against idna_data.
%% An empty label has no first code point and is accepted.
check_initial_combiner([]) ->
    ok;
check_initial_combiner([CP | _]) ->
    case idna_data:lookup(CP) of
        {[$M | _], _} ->
            erlang:exit({bad_label, {initial_combiner, "Label begins with an illegal combining character"}});
        _ ->
            ok
    end.

%% Contextual-rule check with the CheckJoiners flag enabled.
check_context(Label) ->
    check_context(Label, Label, true, 0).

check_context(Label, CheckJoiners) ->
    check_context(Label, Label, CheckJoiners, 0).

%% Walk the label one code point at a time. `Label' is the complete label
%% (needed by the contextual rules) and `Pos' the 0-based index of `CP' in
%% it. PVALID code points pass, CONTEXTJ/CONTEXTO code points are handed to
%% their rule checkers, and any other status from idna_table:lookup/1 exits
%% with `{bad_label, {context, Msg}}'.
check_context([CP | Rest], Label, CheckJoiners, Pos) ->
    case idna_table:lookup(CP) of
        'PVALID' ->
            check_context(Rest, Label, CheckJoiners, Pos + 1);
        'CONTEXTJ' ->
            ok = valid_contextj(CP, Label, Pos, CheckJoiners),
            check_context(Rest, Label, CheckJoiners, Pos + 1);
        'CONTEXTO' ->
            ok = valid_contexto(CP, Label, Pos, CheckJoiners),
            check_context(Rest, Label, CheckJoiners, Pos + 1);
        Status ->
            ErrorMsg = error_msg("Codepoint ~p not allowed (~p) at position ~p in ~p", [CP, Status, Pos, Label]),
            erlang:exit({bad_label, {context, ErrorMsg}})
    end;
check_context([], _, _, _) ->
    ok.
%% CONTEXTJ rule for the code point at 0-based position Pos of Label. The
%% last argument switches the check off (`false') or on (`true'); a failing
%% rule exits with `{bad_label, {contextj, Msg}}'.
valid_contextj(_CP, _Label, _Pos, false) ->
    ok;
valid_contextj(CP, Label, Pos, true) ->
    case idna_context:valid_contextj(CP, Label, Pos) of
        false -> context_rule_exit(contextj, CP, Label, Pos);
        true -> ok
    end.

%% CONTEXTO counterpart of valid_contextj/4; exits with
%% `{bad_label, {contexto, Msg}}' when the rule fails.
valid_contexto(_CP, _Label, _Pos, false) ->
    ok;
valid_contexto(CP, Label, Pos, true) ->
    case idna_context:valid_contexto(CP, Label, Pos) of
        false -> context_rule_exit(contexto, CP, Label, Pos);
        true -> ok
    end.

%% Shared exit path of the two contextual checks; Tag names the failed rule.
context_rule_exit(Tag, CP, Label, Pos) ->
    Reason = error_msg("Joiner ~p not allowed at position ~p in ~p", [CP, Pos, Label]),
    erlang:exit({bad_label, {Tag, Reason}}).



%% Validate a label with every optional check enabled.
-spec check_label(string()) -> ok.
check_label(Label) ->
    check_label(Label, true, true, true).

%% @doc validate a label of a domain
%% The checks run in a fixed order - NFC, hyphens, initial combiner,
%% contextual rules, bidi - and each one must answer `ok'.
-spec check_label(Label, CheckHyphens, CheckJoiners, CheckBidi) -> Result when
    Label :: string(),
    CheckHyphens :: boolean(),
    CheckJoiners :: boolean(),
    CheckBidi :: boolean(),
    Result :: ok.
check_label(Label, CheckHyphens, CheckJoiners, CheckBidi) ->
    Checks = [
        fun() -> check_nfc(Label) end,
        fun() -> check_hyphen(Label, CheckHyphens) end,
        fun() -> check_initial_combiner(Label) end,
        fun() -> check_context(Label, CheckJoiners) end,
        fun() -> check_bidi(Label, CheckBidi) end
    ],
    lists:foreach(fun(Check) -> ok = Check() end, Checks).


%% Bidi validation, delegated to idna_bidi unless switched off.
check_bidi(_, false) ->
    ok;
check_bidi(Label, true) ->
    idna_bidi:check_bidi(Label).

%% A label longer than 63 elements exits with
%% `{bad_label, {too_long, Msg}}'; everything else is accepted.
check_label_length(TooLong) when length(TooLong) > 63 ->
    erlang:exit({bad_label, {too_long, error_msg("The label ~p is too long", [TooLong])}});
check_label_length(_) ->
    ok.
%% Convert one label to its A-label (ASCII) form.
%% An all-ASCII label is returned unchanged once ulabel/1 accepts it.
%% Any other label is validated with check_label/1 and punycode-encoded
%% behind the ACE prefix. The result must pass check_label_length/1.
alabel(Label0) ->
    Label = case lists:all(fun idna_ucs:is_ascii/1, Label0) of
        true ->
            ok = assert_valid_alabel(Label0),
            Label0;
        false ->
            ok = check_label(Label0),
            ?ACE_PREFIX ++ punycode:encode(Label0)
    end,
    %% Single length check for both branches.
    ok = check_label_length(Label),
    Label.

%% Run ulabel/1 on an ASCII label purely for validation; any failure is
%% turned into a `{bad_label, {alabel, Msg}}' exit.
assert_valid_alabel(Label) ->
    try ulabel(Label) of
        _ -> ok
    catch
        _:Error ->
            ErrorMsg = error_msg("The label ~p is not a valid A-label: ulabel error=~p", [Label, Error]),
            erlang:exit({bad_label, {alabel, ErrorMsg}})
    end.

%% Decode each label with ulabel/1 and join the results with ".". The
%% accumulator holds the output reversed; no separator is emitted before the
%% first label.
decode_1([], Acc) ->
    lists:reverse(Acc);
decode_1([Label | Labels], []) ->
    decode_1(Labels, lists:reverse(ulabel(Label)));
decode_1([Label | Labels], Acc) ->
    decode_1(Labels, lists:reverse(ulabel(Label), [$. | Acc])).

%% Convert one label to its U-label (Unicode) form.
%% The label is lowercased first, so the ACE prefix is recognised whatever
%% its case. The punycode payload is decoded only when the original label is
%% pure ASCII. The result must pass check_label/1. The empty label maps to
%% itself without validation.
ulabel([]) ->
    [];
ulabel(Label0) ->
    Lowered = lowercase(Label0),
    %% ASCII-ness is judged on the input, not on the lowercased form.
    Label = case lists:all(fun idna_ucs:is_ascii/1, Label0) of
        true -> strip_ace(Lowered);
        false -> Lowered
    end,
    ok = check_label(Label),
    Label.

%% Decode the punycode payload of a lowercased label carrying the ACE prefix.
strip_ace(?ACE_PREFIX ++ Encoded) ->
    punycode:decode(Encoded);
strip_ace(Label) ->
    Label.

%% Lowercase all chars in Str
%% Lists go through lowercase_list/2 and binaries through lowercase_bin/3.
%% Both helpers throw `unchanged' when nothing had to be converted, in which
%% case the original term is returned as is.
-spec lowercase(String::unicode:chardata()) -> unicode:chardata().
lowercase(CD) when is_list(CD) ->
    try lowercase_list(CD, false)
    catch unchanged -> CD
    end;
lowercase(<<CP1/utf8, Rest/binary>> = Orig) ->
    try lowercase_bin(CP1, Rest, false) of
        List -> unicode:characters_to_binary(List)
    catch unchanged -> Orig
    end;
lowercase(<<>>) ->
    <<>>.
%% Lowercase a list of code points. `Changed' records whether any character
%% has been converted so far; when the end is reached with nothing changed,
%% `unchanged' is thrown so the caller can return its original term. The
%% first two clauses are a fast path for ASCII followed by a code point below
%% 256; everything else goes through unicode_util:lowercase/1.
lowercase_list([CP1 | [CP2 | _] = Cont], _Changed) when $A =< CP1, CP1 =< $Z, CP2 < 256 ->
    [CP1 + 32 | lowercase_list(Cont, true)];
lowercase_list([CP1 | [CP2 | _] = Cont], Changed) when CP1 < 128, CP2 < 256 ->
    [CP1 | lowercase_list(Cont, Changed)];
lowercase_list([], true) ->
    [];
lowercase_list([], false) ->
    throw(unchanged);
lowercase_list(CPs0, Changed) ->
    case unicode_util:lowercase(CPs0) of
        [Char | CPs] when Char =:= hd(CPs0) -> [Char | lowercase_list(CPs, Changed)];
        [Char | CPs] -> append(Char, lowercase_list(CPs, true));
        [] -> lowercase_list([], Changed)
    end.

%% Binary counterpart of lowercase_list/2: `CP1' is the code point already
%% taken off the front and the binary is the remaining UTF-8 input. Returns
%% a list of lowercased code points, or throws `unchanged' when the input
%% ends without any conversion.
lowercase_bin(CP1, <<CP2/utf8, Bin/binary>>, _Changed)
  when $A =< CP1, CP1 =< $Z, CP2 < 256 ->
    [CP1 + 32 | lowercase_bin(CP2, Bin, true)];
lowercase_bin(CP1, <<CP2/utf8, Bin/binary>>, Changed)
  when CP1 < 128, CP2 < 256 ->
    [CP1 | lowercase_bin(CP2, Bin, Changed)];
lowercase_bin(CP1, Bin, Changed) ->
    case unicode_util:lowercase([CP1 | Bin]) of
        [CP1 | CPs] ->
            %% The head code point is already lowercase.
            case unicode_util:cp(CPs) of
                [Next | Rest] ->
                    [CP1 | lowercase_bin(Next, Rest, Changed)];
                [] when Changed ->
                    [CP1];
                [] ->
                    throw(unchanged)
            end;
        [Char | CPs] ->
            %% The head code point was converted.
            case unicode_util:cp(CPs) of
                [Next | Rest] ->
                    [Char | lowercase_bin(Next, Rest, true)];
                [] ->
                    [Char]
            end
    end.


%% Prepend a code point, or a list of code points, to already converted
%% character data.
append(Char, <<>>) when is_integer(Char) -> [Char];
append(Char, <<>>) when is_list(Char) -> Char;
append(Char, Bin) when is_binary(Bin) -> [Char, Bin];
append(Char, Str) when is_integer(Char) -> [Char | Str];
append(GC, Str) when is_list(GC) -> GC ++ Str.


%% Normalise character data to NFC and return it as a flat list of code
%% points. unicode_util:nfc/1 yields either a single code point or a list of
%% code points at the head, followed by the not yet normalised rest.
characters_to_nfc_list(CD) ->
    case unicode_util:nfc(CD) of
        [CPs | Str] when is_list(CPs) -> CPs ++ characters_to_nfc_list(Str);
        [CP | Str] -> [CP | characters_to_nfc_list(Str)];
        [] -> []
    end.
386 | 387 | 388 | uts46_remap(Str, Std3Rules, Transitional) -> 389 | characters_to_nfc_list(uts46_remap_1(Str, Std3Rules, Transitional)). 390 | 391 | uts46_remap_1([Cp|Rs], Std3Rules, Transitional) -> 392 | Row = try idna_mapping:uts46_map(Cp) 393 | catch 394 | error:badarg -> 395 | ?LOG_ERROR("codepoint ~p not found in mapping list~n", [Cp]), 396 | erlang:exit({invalid_codepoint, Cp}) 397 | end, 398 | {Status, Replacement} = case Row of 399 | {_, _} -> Row; 400 | S -> {S, undefined} 401 | end, 402 | if 403 | (Status =:= 'V'); 404 | ((Status =:= 'D') andalso (Transitional =:= false)); 405 | ((Status =:= '3') andalso (Std3Rules =:= true) andalso (Replacement =:= undefined)) -> 406 | [Cp] ++ uts46_remap_1(Rs, Std3Rules, Transitional); 407 | (Replacement =/= undefined) andalso ( 408 | (Status =:= 'M') orelse 409 | (Status =:= '3' andalso Std3Rules =:= false) orelse 410 | (Status =:= 'D' andalso Transitional =:= true)) -> 411 | Replacement ++ uts46_remap_1(Rs, Std3Rules, Transitional); 412 | (Status =:= 'I') -> 413 | uts46_remap_1(Rs, Std3Rules, Transitional); 414 | true -> 415 | erlang:exit({invalid_codepoint, Cp}) 416 | end; 417 | uts46_remap_1([], _, _) -> 418 | []. 419 | 420 | error_msg(Msg, Fmt) -> 421 | lists:flatten(io_lib:format(Msg, Fmt)). 422 | -------------------------------------------------------------------------------- /src/idna_bidi.erl: -------------------------------------------------------------------------------- 1 | %% -*- coding: utf-8 -*- 2 | %%% 3 | %%% This file is part of erlang-idna released under the MIT license. 4 | %%% See the LICENSE for more information. 5 | %%% 6 | 7 | -module(idna_bidi). 8 | -author("benoitc"). 9 | 10 | %% API 11 | -export([check_bidi/1, check_bidi/2]). 12 | 13 | check_bidi(Label) -> check_bidi(Label, false). 
14 | 15 | check_bidi(Label, CheckLtr) -> 16 | %% Bidi rules should only be applied if string contains RTL characters 17 | case {check_rtl(Label, Label), CheckLtr} of 18 | {false, false} -> ok; 19 | _ -> 20 | [C | _Rest] = Label, 21 | % bidi rule 1 22 | RTL = rtl(C, Label), 23 | check_bidi1(Label, RTL, false, undefined) 24 | end. 25 | 26 | check_rtl([C | Rest], Label) -> 27 | case idna_data:bidirectional(C) of 28 | false -> 29 | erlang:exit(bidi_error("unknown directionality in label=~p c=~w~n", [Label, C])); 30 | Dir -> 31 | case lists:member(Dir, ["R", "AL", "AN"]) of 32 | true -> true; 33 | false -> check_rtl(Rest, Label) 34 | end 35 | end; 36 | check_rtl([], _Label) -> 37 | false. 38 | 39 | rtl(C, Label) -> 40 | case idna_data:bidirectional(C) of 41 | "R" -> true; 42 | "AL" -> true; 43 | "L" -> false; 44 | _ -> 45 | erlang:exit(bidi_error("first codepoint in label ~p must be directionality L, R or AL ", [Label])) 46 | end. 47 | 48 | 49 | check_bidi1([C | Rest], true, ValidEnding, NumberType) -> 50 | Dir = idna_data:bidirectional(C), 51 | %% bidi rule 2 52 | ValidEnding2 = case lists:member(Dir, ["R", "AL", "AN", "EN", "ES", "CS", "ET", "ON", "BN", "NSM"]) of 53 | true -> 54 | % bidi rule 3 55 | case lists:member(Dir, ["R", "AL", "AN", "EN"]) of 56 | true -> true; 57 | false when Dir =/= "NSM" -> false; 58 | false -> ValidEnding 59 | end; 60 | false -> 61 | erlang:exit({bad_label, {bidi, "Invalid direction for codepoint in a right-to-left label"}}) 62 | end, 63 | % bidi rule 4 64 | NumberType2 = case lists:member(Dir, ["AN", "EN"]) of 65 | true when NumberType =:= undefined -> 66 | Dir; 67 | true when NumberType /= Dir -> 68 | erlang:exit({bad_label, {bidi, "Can not mix numeral types in a right-to-left label"}}); 69 | _ -> 70 | NumberType 71 | end, 72 | check_bidi1(Rest, true, ValidEnding2, NumberType2); 73 | check_bidi1([C | Rest], false, ValidEnding, NumberType) -> 74 | Dir = idna_data:bidirectional(C), 75 | % bidi rule 5 76 | ValidEnding2 = case 
lists:member(Dir, ["L", "EN", "ES", "CS", "ET", "ON", "BN", "NSM"]) of 77 | true -> 78 | % bidi rule 6 79 | case Dir of 80 | "L" -> true; 81 | "EN" -> true; 82 | _ when Dir /= "NSM" -> false; 83 | _ -> ValidEnding 84 | end; 85 | false -> 86 | erlang:exit({bad_label, {bidi, "Invalid direction for codepoint in a left-to-right label"}}) 87 | end, 88 | check_bidi1(Rest, false, ValidEnding2, NumberType); 89 | check_bidi1([], _, false, _) -> 90 | erlang:exit({bad_label, {bidi, "Label ends with illegal codepoint directionality"}}); 91 | check_bidi1([], _, true, _) -> 92 | ok. 93 | 94 | bidi_error(Msg, Fmt) -> 95 | ErrorMsg = lists:flatten(io_lib:format(Msg, Fmt)), 96 | {bad_label, {bidi, ErrorMsg}}. 97 | -------------------------------------------------------------------------------- /src/idna_context.erl: -------------------------------------------------------------------------------- 1 | %% -*- coding: utf-8 -*- 2 | %%% 3 | %%% This file is part of erlang-idna released under the MIT license. 4 | %%% See the LICENSE for more information. 5 | %%% 6 | -module(idna_context). 7 | -author("benoitc"). 8 | 9 | %% API 10 | -export([ 11 | valid_contextj/2, valid_contextj/3, 12 | valid_contexto/2, valid_contexto/3, 13 | contexto_with_rule/1 14 | ]). 15 | 16 | -define(virama_combining_class, 9). 17 | 18 | 19 | valid_contextj([], _Pos) -> true; 20 | 21 | valid_contextj(Label, Pos) -> 22 | CP = lists:nth(Pos + 1, Label), 23 | valid_contextj(CP, Label, Pos). 24 | 25 | valid_contextj(16#200c, Label, Pos) -> 26 | if 27 | Pos > 0 -> 28 | case unicode_util:lookup(lists:nth(Pos, Label)) of 29 | #{ ccc := ?virama_combining_class } -> true; 30 | _ -> 31 | valid_contextj_1(Label, Pos) 32 | end; 33 | true -> 34 | valid_contextj_1(Label, Pos) 35 | end; 36 | 37 | valid_contextj(16#200d, Label, Pos) when Pos > 0 -> 38 | case unicode_util:lookup(lists:nth(Pos, Label)) of 39 | #{ ccc := ?virama_combining_class } -> true; 40 | _ -> false 41 | end; 42 | valid_contextj(_, _, _) -> 43 | false. 
%% ZERO WIDTH NON-JOINER rule when the preceding code point is not a virama
%% (RFC 5892, Appendix A.1). The joiner at 0-based position Pos is valid
%% when, skipping transparent (T) code points, the nearest code point before
%% it has joining type L or D and the nearest one after it has joining type
%% R or D.
%% NOTE(review): assumes idna_data:joining_types/1 yields "R" in the same
%% string form as the "T", "L" and "D" values already matched here - confirm
%% against idna_data.
valid_contextj_1(Label, Pos) ->
    Before = lists:reverse(lists:sublist(Label, Pos)),
    After = lists:nthtail(Pos + 1, Label),
    joining_context(Before, ["L", "D"]) andalso joining_context(After, ["R", "D"]).

%% Skip transparent code points, then report whether the first
%% non-transparent one has a joining type listed in Accepted.
joining_context([CP | Rest], Accepted) ->
    case idna_data:joining_types(CP) of
        "T" -> joining_context(Rest, Accepted);
        Type -> lists:member(Type, Accepted)
    end;
joining_context([], _Accepted) ->
    false.

%% CONTEXTO rule for the code point at 0-based position Pos of Label. An
%% empty label is valid.
valid_contexto([], _Pos) ->
    true;
valid_contexto(Label, Pos) ->
    CP = lists:nth(Pos + 1, Label),
    valid_contexto(CP, Label, Pos).

%% CONTEXTO rule for code point CP located at 0-based position Pos of Label.
%% Returns `true' when the rule is satisfied, and `false' when it is not or
%% when CP has no rule here.
valid_contexto(16#00B7, Label, Pos) ->
    %% MIDDLE DOT: must sit between two "l" (16#006C).
    Pos > 0 andalso Pos < length(Label) - 1 andalso
        surrounded_by_l(lists:sublist(Label, Pos, 3));
valid_contexto(16#0375, Label, Pos) ->
    %% GREEK LOWER NUMERAL SIGN (KERAIA): the next code point must be greek.
    Len = length(Label),
    Pos < Len - 1 andalso Len > 1 andalso
        idna_data:scripts(lists:nth(Pos + 2, Label)) =:= "greek";
valid_contexto(16#30FB, Label, _Pos) ->
    %% KATAKANA MIDDLE DOT
    script_ok(Label);
valid_contexto(CP, Label, Pos) when CP == 16#05F3; CP == 16#05F4 ->
    %% HEBREW PUNCTUATION GERESH or HEBREW PUNCTUATION GERSHAYIM: the
    %% previous code point must be hebrew.
    Pos > 0 andalso idna_data:scripts(lists:nth(Pos, Label)) =:= "hebrew";
valid_contexto(CP, Label, _Pos) when CP >= 16#660, CP =< 16#669 ->
    %% ARABIC-INDIC DIGITS: no extended arabic-indic digit in the label.
    contexto_in_range(Label, 16#6F0, 16#6F9);
valid_contexto(CP, Label, _Pos) when 16#6F0 =< CP, CP =< 16#6F9 ->
    %% EXTENDED ARABIC-INDIC DIGIT: no arabic-indic digit in the label.
    contexto_in_range(Label, 16#660, 16#669);
valid_contexto(_CP, _Label, _Pos) ->
    false.

%% True for a three code point window whose first and last element are "l".
surrounded_by_l([16#006C, _, 16#006C]) -> true;
surrounded_by_l(_) -> false.
121 | 122 | 123 | contexto_in_range([CP | _], Start, End) when CP >= Start, CP =< End -> false; 124 | contexto_in_range([_CP|Rest], Start, End) -> contexto_in_range(Rest, Start, End); 125 | contexto_in_range([], _, _) -> true. 126 | 127 | script_ok([16#30fb| Rest]) -> 128 | script_ok(Rest); 129 | script_ok([C | Rest]) -> 130 | case idna_data:scripts(C) of 131 | "hiragana" -> true; 132 | "katakana" -> true; 133 | "han" -> true; 134 | _ -> 135 | script_ok(Rest) 136 | end; 137 | script_ok([]) -> 138 | false. 139 | 140 | contexto_with_rule(16#00B7) -> true; 141 | % MIDDLE DOT 142 | contexto_with_rule(16#0375) -> true; 143 | % GREEK LOWER NUMERAL SIGN (KERAIA) 144 | contexto_with_rule(16#05F3) -> true; 145 | % HEBREW PUNCTUATION GERESH 146 | contexto_with_rule(16#05F4) -> true; 147 | % HEBREW PUNCTUATION GERSHAYIM 148 | contexto_with_rule(16#30FB) -> true; 149 | % KATAKANA MIDDLE DOT 150 | contexto_with_rule(CP) when 16#0660 =< CP, CP =< 16#0669 -> true; 151 | % ARABIC-INDIC DIGITS 152 | contexto_with_rule(CP) when 16#06F0 =< CP, CP =< 16#06F9 -> true; 153 | % KATAKANA MIDDLE DOT 154 | contexto_with_rule(_) -> false. 155 | -------------------------------------------------------------------------------- /src/idna_logger.hrl: -------------------------------------------------------------------------------- 1 | -ifdef('OTP_RELEASE'). 2 | -include_lib("kernel/include/logger.hrl"). 3 | -else. 4 | -define(LOG_INFO(Format, Args), error_logger:info_msg(Format, Args)). 5 | -define(LOG_ERROR(Format, Args), error_logger:error_msg(Format, Args)). 6 | -define(LOG_WARNING(Format, Args), error_logger:warning_msg(Format, Args)). 7 | -endif. -------------------------------------------------------------------------------- /src/idna_ucs.erl: -------------------------------------------------------------------------------- 1 | %%% -*- erlang -*- 2 | %% 3 | %% Copyright Ericsson AB 2005-2016. All Rights Reserved. 
4 | %% 5 | %% Licensed under the Apache License, Version 2.0 (the "License"); 6 | %% you may not use this file except in compliance with the License. 7 | %% You may obtain a copy of the License at 8 | %% 9 | %% http://www.apache.org/licenses/LICENSE-2.0 10 | %% 11 | %% Unless required by applicable law or agreed to in writing, software 12 | %% distributed under the License is distributed on an "AS IS" BASIS, 13 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | %% See the License for the specific language governing permissions and 15 | %% limitations under the License. 16 | 17 | 18 | -module(idna_ucs). 19 | 20 | -compile([verbose,report_warnings,warn_unused_vars]). 21 | 22 | 23 | %%% Micellaneous predicates 24 | -export([is_iso10646/1, is_unicode/1, is_ascii/1]). 25 | 26 | %%% UTF-8 encoding and decoding 27 | -export([to_utf8/1, from_utf8/1]). 28 | 29 | %%% Test if Ch is a legitimate ISO-10646 character code 30 | is_iso10646(Ch) when is_integer(Ch), Ch >= 0 -> 31 | if Ch < 16#D800 -> true; 32 | Ch < 16#E000 -> false; % Surrogates 33 | Ch < 16#FFFE -> true; 34 | Ch =< 16#FFFF -> false; % FFFE and FFFF (not characters) 35 | Ch =< 16#7FFFFFFF -> true; 36 | true -> false 37 | end; 38 | is_iso10646(_) -> false. 39 | 40 | %%% Test if Ch is a legitimate ISO-10646 character code capable of 41 | %%% being encoded in a UTF-16 string. 42 | is_unicode(Ch) when Ch < 16#110000 -> is_iso10646(Ch); 43 | is_unicode(_) -> false. 44 | 45 | %%% Test for legitimate ASCII code 46 | is_ascii(Ch) when is_integer(Ch), Ch >= 0, Ch =< 127 -> true; 47 | is_ascii(_) -> false. 48 | 49 | 50 | %%% UTF-8 encoding and decoding 51 | to_utf8(List) when is_list(List) -> lists:flatmap(fun to_utf8/1, List); 52 | to_utf8(Ch) -> char_to_utf8(Ch). 

%%% Decode UTF-8 into a list of code points.
%%% Accepts a binary (converted with binary_to_list/1 first) or a list of
%%% bytes. Returns the decoded list when expand_utf8/1 reports zero bad
%%% bytes; otherwise exits with {ucs,{bad_utf8_character_code}}.
from_utf8(Bin) when is_binary(Bin) -> from_utf8(binary_to_list(Bin));
from_utf8(List) ->
    case expand_utf8(List) of
        {Result,0} -> Result;
        {_Res,_NumBadChar} ->
            exit({ucs,{bad_utf8_character_code}})
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% UTF-8 support
%%% Possible errors encoding UTF-8:
%%%	- Non-character values (something other than 0 .. 2^31-1).
%%%	- Surrogate pair code in string.
%%%	- 16#FFFE or 16#FFFF character in string.
%%% Possible errors decoding UTF-8:
%%%	- 10xxxxxx or 1111111x as initial byte.
%%%	- Insufficient number of 10xxxxxx octets following an initial octet of
%%%	multi-octet sequence.
%%% 	- Non-canonical encoding used.
%%%	- Surrogate-pair code encoded as UTF-8.
%%%	- 16#FFFE or 16#FFFF character in string.

%%% Encode one non-negative integer as a list of UTF-8 bytes (1 to 6 bytes,
%%% covering values up to 16#7FFFFFFF).
%%% There is no error return: a non-integer or negative argument fails with
%%% function_clause, and values for which no `if' branch is true (surrogates
%%% 16#D800..16#DFFF, 16#FFFE, 16#FFFF, and anything >= 16#80000000) fail
%%% with if_clause.
char_to_utf8(Ch) when is_integer(Ch), Ch >= 0 ->
    if Ch < 128 ->
            %% 0yyyyyyy
            [Ch];
       Ch < 16#800 ->
            %% 110xxxxy 10yyyyyy
            [16#C0 + (Ch bsr 6),
             128+(Ch band 16#3F)];
       Ch < 16#10000 ->
            %% 1110xxxx 10xyyyyy 10yyyyyy
            %% The inner `if' deliberately has no catch-all branch, so
            %% surrogates and 16#FFFE/16#FFFF raise if_clause here.
            if Ch < 16#D800; Ch > 16#DFFF, Ch < 16#FFFE ->
                    [16#E0 + (Ch bsr 12),
                     128+((Ch bsr 6) band 16#3F),
                     128+(Ch band 16#3F)]
            end;
       Ch < 16#200000 ->
            %% 11110xxx 10xxyyyy 10yyyyyy 10yyyyyy
            [16#F0+(Ch bsr 18),
             128+((Ch bsr 12) band 16#3F),
             128+((Ch bsr 6) band 16#3F),
             128+(Ch band 16#3F)];
       Ch < 16#4000000 ->
            %% 111110xx 10xxxyyy 10yyyyyy 10yyyyyy 10yyyyyy
            [16#F8+(Ch bsr 24),
             128+((Ch bsr 18) band 16#3F),
             128+((Ch bsr 12) band 16#3F),
             128+((Ch bsr 6) band 16#3F),
             128+(Ch band 16#3F)];
       Ch < 16#80000000 ->
            %% 1111110x 10xxxxyy 10yyyyyy 10yyyyyy 10yyyyyy 10yyyyyy
            [16#FC+(Ch bsr 30),
             128+((Ch bsr 24) band 16#3F),
             128+((Ch bsr 18) band 16#3F),
             128+((Ch bsr 12) band 16#3F),
             128+((Ch bsr 6) band 16#3F),
             128+(Ch band 16#3F)]
    end.




%% expand_utf8([Byte]) -> {[UnicodeChar],NumberOfBadBytes}
%%  Expand UTF8 byte sequences to ISO 10646/Unicode
%%  characters. Any illegal bytes are removed and the number of
%%  bad bytes are returned.
%%
%%  Reference:
%%     RFC 3629: "UTF-8, a transformation format of ISO 10646".

expand_utf8(Str) ->
    expand_utf8_1(Str, [], 0).

%% expand_utf8_1(Bytes, Acc, Bad) -> {[UnicodeChar], Bad}
%%  Acc collects decoded characters in reverse order; Bad counts rejected
%%  input. Each multi-byte clause checks the lead-byte and continuation-byte
%%  bit patterns in its guard, then rejects the sequence if the decoded
%%  value is below the minimum for that length (non-canonical encoding).
%%  A rejected multi-byte sequence is dropped as a whole and counted once.
%%  NOTE(review): the visible clauses only enforce these minimum values;
%%  no clause checks for surrogates, 16#FFFE/16#FFFF or an upper bound on
%%  4-byte sequences, although the header comment above lists some of
%%  these as decoding errors. Confirm whether that is intended.
expand_utf8_1([C|Cs], Acc, Bad) when C < 16#80 ->
    %% Plain Ascii character.
    expand_utf8_1(Cs, [C|Acc], Bad);
expand_utf8_1([C1,C2|Cs], Acc, Bad) when C1 band 16#E0 =:= 16#C0,
                                         C2 band 16#C0 =:= 16#80 ->
    %% Two-byte sequence: 110xxxxx 10xxxxxx.
    case ((C1 band 16#1F) bsl 6) bor (C2 band 16#3F) of
        C when 16#80 =< C ->
            expand_utf8_1(Cs, [C|Acc], Bad);
        _ ->
            %% Bad range.
            expand_utf8_1(Cs, Acc, Bad+1)
    end;
expand_utf8_1([C1,C2,C3|Cs], Acc, Bad) when C1 band 16#F0 =:= 16#E0,
                                            C2 band 16#C0 =:= 16#80,
                                            C3 band 16#C0 =:= 16#80 ->
    %% Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx.
    case ((((C1 band 16#0F) bsl 6) bor (C2 band 16#3F)) bsl 6) bor
        (C3 band 16#3F) of
        C when 16#800 =< C ->
            expand_utf8_1(Cs, [C|Acc], Bad);
        _ ->
            %% Bad range.
            expand_utf8_1(Cs, Acc, Bad+1)
    end;
expand_utf8_1([C1,C2,C3,C4|Cs], Acc, Bad) when C1 band 16#F8 =:= 16#F0,
                                               C2 band 16#C0 =:= 16#80,
                                               C3 band 16#C0 =:= 16#80,
                                               C4 band 16#C0 =:= 16#80 ->
    %% Four-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
    %% The 16#0F mask is equivalent to 16#07 here because the guard
    %% already guarantees that bit 3 of C1 is clear.
    case ((((((C1 band 16#0F) bsl 6) bor (C2 band 16#3F)) bsl 6) bor
        (C3 band 16#3F)) bsl 6) bor (C4 band 16#3F) of
        C when 16#10000 =< C ->
            expand_utf8_1(Cs, [C|Acc], Bad);
        _ ->
            %% Bad range.
            expand_utf8_1(Cs, Acc, Bad+1)
    end;
expand_utf8_1([_|Cs], Acc, Bad) ->
    %% Ignore bad character.
    expand_utf8_1(Cs, Acc, Bad+1);
expand_utf8_1([], Acc, Bad) -> {lists:reverse(Acc),Bad}.

--------------------------------------------------------------------------------
/src/punycode.erl:
--------------------------------------------------------------------------------
%% -*- coding: utf-8 -*-
%%%
%%% This file is part of erlang-idna released under the MIT license.
%%% See the LICENSE for more information.
%%%
%% @doc Punycode ([RFC 3492](http://tools.ietf.org/html/rfc3492)) implementation.

-module(punycode).


-export([encode/1,
         decode/1]).

-define(BASE, 36).
-define(TMIN, 1).
-define(TMAX, 26).
-define(SKEW, 38).
-define(DAMP, 700).
-define(INITIAL_BIAS, 72).
-define(INITIAL_N, 128).
-define(DELIMITER, $-).


%% Largest value the RFC 3492 arithmetic may reach (32-bit unsigned).
%% The expansion is fully parenthesized: macros are substituted textually
%% and `div' binds tighter than `bsl' and `-', so an unparenthesized
%% `?MAX div X' would be parsed as `(1 bsl 32) - (1 div X)'.
-define(MAX, ((1 bsl 32) - 1)).

%% @doc Convert Unicode to Punycode.
%%
%% The basic (ASCII) code points are copied first, followed by the
%% delimiter when there is at least one of them; the remaining code
%% points are then encoded as generalized variable-length integers.
%%
%% exit with an overflow error on overflow, which can only happen on inputs
%% that would take more than 63 encoded bytes, the DNS limit on domain name labels.
-spec encode(string()) -> string().
encode(Input) ->
    Basic = [C || C <- Input, C < 16#80],
    B = length(Basic),
    Output = case Basic of
        [] -> Basic;
        _ -> Basic ++ [?DELIMITER]
    end,
    %% H (code points handled so far) starts out equal to B.
    encode(Input, Output, B, B, ?INITIAL_N, 0, ?INITIAL_BIAS).


%% Main encoding loop (RFC 3492, section 6.3). While some code points are
%% still unhandled (H < length(Input)), pick the smallest code point M
%% that is >= N, advance Delta accordingly and let encode1/7 emit every
%% occurrence of M. Exits with `overflow' when Delta would exceed ?MAX.
encode(Input, Output, H, B, N, Delta, Bias) when H < length(Input) ->
    M = lists:min([C || C <- Input, C >= N]),
    %% Integer division, as in the RFC: "if m - n > (maxint - delta) div (h + 1)".
    Delta1 = case (M - N) > ((?MAX - Delta) div (H + 1)) of
        false -> Delta + (M - N) * (H + 1);
        true -> exit(overflow)
    end,
    {Output2, H2, Delta2, N2, Bias2} = encode1(Input, Output, H, B, M, Delta1, Bias),
    encode(Input, Output2, H2, B, N2, Delta2, Bias2);
encode(_, Output, _, _, _, _, _) ->
    Output.


%% Walk the input once for the current code point N (RFC 3492, section 6.3,
%% inner loop). Code points below N only advance Delta; each occurrence of
%% N is emitted through encode2/9; larger code points are skipped.
%% At the end of the input returns {Output, H, Delta + 1, N + 1, Bias}.
%% Exits with `overflow' when incrementing Delta would take it past ?MAX
%% (the RFC's "if (++delta == 0) goto overflow" for 32-bit arithmetic;
%% comparing with 0 can never trigger with Erlang's unbounded integers).
encode1([C | _Rest], _Output, _H, _B, N, Delta, _Bias) when C < N, Delta >= ?MAX ->
    exit(overflow);
encode1([C | Rest], Output, H, B, N, Delta, Bias) when C < N ->
    encode1(Rest, Output, H, B, N, Delta + 1, Bias);
encode1([C | Rest], Output, H, B, N, Delta, Bias) when C == N ->
    encode2(Rest, Output, H, B, N, Delta, Bias, Delta, ?BASE);
encode1([_ | Rest], Output, H, B, N, Delta, Bias) ->
    encode1(Rest, Output, H, B, N, Delta, Bias);
encode1([], Output, H, _B, N, Delta, Bias) ->
    {Output, H, Delta + 1, N + 1, Bias}.

%% Emit Q (initially Delta) as a generalized variable-length integer.
%% K is the current digit position (a multiple of ?BASE) and T the
%% threshold for that position. Once the last digit is written the bias is
%% re-adapted, Delta is reset to 0, H is incremented and scanning resumes
%% in encode1/7 with the remaining input.
encode2(Rest, Output, H, B, N, Delta, Bias, Q, K) ->
    T = if
        K =< Bias -> ?TMIN;
        K >= (Bias + ?TMAX) -> ?TMAX;
        true -> K - Bias
    end,
    case Q < T of
        true ->
            %% Final digit of this delta.
            Output2 = Output ++ [to_digit(Q)],
            Bias2 = adapt(Delta, H + 1, H == B),
            encode1(Rest, Output2, H + 1, B, N, 0, Bias2);
        false ->
            %% Intermediate digit; carry the quotient to the next position.
            Output2 = Output ++ [to_digit(T + ((Q - T) rem (?BASE - T)))],
            Q2 = (Q - T) div (?BASE - T),
            encode2(Rest, Output2, H, B, N, Delta, Bias, Q2, K + ?BASE)
    end.

%% Map a digit value 0..35 to its basic code point: 0..25 -> a..z,
%% 26..35 -> 0..9. Any other value exits with `badarg'.
to_digit(V) when V >= 0, V =< 25 -> V + $a;
to_digit(V) when V >= 26, V =< 35 -> V - 26 + $0;
to_digit(_) -> exit(badarg).


%% @doc Convert Punycode to Unicode.
%% Everything before the last delimiter is copied verbatim as the basic
%% code points; everything after it is decoded as a sequence of deltas.
%% Without any delimiter the whole input is treated as encoded deltas.
%% exit with an overflow or badarg errors if malformed or overflow.
%% Overflow can only happen on inputs that take more than 63 encoded bytes,
%% the DNS limit on domain name labels.
-spec decode(string()) -> string().
decode(Input) ->
    {Output, Input2} = case string:rstr(Input, [?DELIMITER]) of
        0 ->
            {"", Input};
        Pos ->
            {lists:sublist(Input, Pos - 1), lists:sublist(Input, Pos + 1, length(Input))}
    end,
    decode(Input2, Output, ?INITIAL_N, ?INITIAL_BIAS, 0).


%% Decode the delta section. Each iteration consumes one variable-length
%% integer from Input and inserts the resulting code point into Output;
%% decoding finishes when Input is exhausted between two integers.
decode([], Output, _, _, _) -> Output;
decode(Input, Output, N, Bias, I) ->
    decode(Input, Output, N, Bias, I, I, 1, ?BASE).

%% Consume the digits of one generalized variable-length integer
%% (RFC 3492, section 6.2). I0 accumulates the value, OldI is the value of
%% I before this integer started, Weight the weight of the current digit
%% and K its position.
%% Exits with `badarg' when the input ends in the middle of an integer or
%% contains a character that is not a Punycode digit, and with `overflow'
%% when a value would exceed ?MAX.
decode([], _Output, _N, _Bias, _I0, _OldI, _Weight, _K) ->
    %% Truncated delta: the last digit seen was not a terminating one.
    exit(badarg);
decode([C | Rest], Output, N, Bias, I0, OldI, Weight, K) ->
    Digit = digit(C),
    I1 = case Digit > ((?MAX - I0) div Weight) of
        false -> I0 + (Digit * Weight);
        true -> exit(overflow)
    end,
    T = if
        K =< Bias -> ?TMIN;
        K >= (Bias + ?TMAX) -> ?TMAX;
        true -> K - Bias
    end,
    case Digit < T of
        true ->
            %% Terminating digit: I1 now encodes both how far to advance N
            %% and the insertion position.
            Len = length(Output),
            Bias2 = adapt(I1 - OldI, Len + 1, (OldI =:= 0)),
            {N2, I2} = case (I1 div (Len + 1)) > (?MAX - N) of
                false ->
                    {N + (I1 div (Len + 1)), I1 rem (Len + 1)};
                true ->
                    exit(overflow)
            end,
            Output2 = insert(Output, N2, [], I2),
            decode(Rest, Output2, N2, Bias2, I2 + 1);
        false ->
            %% ?MAX is wrapped in parentheses on purpose: macros expand
            %% textually and `div' binds tighter than `bsl' and `-'.
            case Weight > ((?MAX) div (?BASE - T)) of
                false ->
                    decode(Rest, Output, N, Bias, I1, OldI, Weight * (?BASE - T), K + ?BASE);
                true ->
                    exit(overflow)
            end
    end.

%% insert(List, CP, Head, I): returns Head followed by List with CP
%% inserted before the element at zero-based index I. Exits with
%% `overflow' when I is greater than the length of List.
insert(List, CP, Head, I) ->
    Head ++ insert_at(List, CP, I).

%% Body-recursive helper for insert/4; avoids repeatedly appending to the
%% end of an accumulator.
insert_at(Tail, CP, 0) ->
    [CP | Tail];
insert_at([], _CP, I) when I > 0 ->
    exit(overflow);
insert_at([C | Tail], CP, I) ->
    [C | insert_at(Tail, CP, I - 1)].


%% Map a basic code point to its digit value: 0..9 -> 26..35,
%% A..Z and a..z -> 0..25. Anything else exits with `badarg'.
digit(C) when C >= $0, C =< $9 -> C - $0 + 26;
digit(C) when C >= $A, C =< $Z -> C - $A;
digit(C) when C >= $a, C =< $z -> C - $a;
digit(_) -> exit(badarg).

%% Bias adaptation (RFC 3492, section 6.1). The delta is damped by ?DAMP
%% the first time and halved afterwards, then increased by its share per
%% code point before being reduced to the new bias by adapt/2.
adapt(Delta, NumPoints, FirstTime) ->
    Delta2 = case FirstTime of
        true ->
            Delta div ?DAMP;
        false ->
            Delta div 2
    end,
    adapt(Delta2 + (Delta2 div NumPoints), 0).

%% Repeatedly divide Delta by (?BASE - ?TMIN) while it is above the
%% threshold, adding ?BASE to K each time, then compute the final bias.
adapt(Delta, K) ->
    case Delta > (((?BASE - ?TMIN) * ?TMAX) div 2) of
        true ->
            adapt(Delta div (?BASE - ?TMIN), K + ?BASE);
        false ->
            K + (((?BASE - ?TMIN + 1) * Delta) div (Delta + ?SKEW))
    end.
--------------------------------------------------------------------------------
/test/compat_test.erl:
--------------------------------------------------------------------------------
%%%
%%% This file is part of erlang-idna released under the MIT license.
%%% See the LICENSE for more information.
%%%
-module(compat_test).
-author("benoitc").

%% API
-export([to_ascii_test/0, to_unicode_test/0]).


-include_lib("eunit/include/eunit.hrl").

%% idna:to_ascii/1 must encode the Unicode label and leave the label that
%% is already an A-label unchanged.
to_ascii_test() ->
    Expected = "xn--zckzah.xn--zckzah",
    Actual = idna:to_ascii("テスト.xn--zckzah"),
    ?assertEqual(Expected, Actual).

%% idna:to_unicode/1 must turn both A-labels back into their code points.
to_unicode_test() ->
    Expected = [12486,12473,12488,46,12486,12473,12488],
    Actual = idna:to_unicode("xn--zckzah.xn--zckzah"),
    ?assertEqual(Expected, Actual).
--------------------------------------------------------------------------------
/test/idna_test.erl:
--------------------------------------------------------------------------------
%%%
%%% This file is part of erlang-idna released under the MIT license.
%%% See the LICENSE for more information.
%%%
-module(idna_test).
-author("benoitc").
7 | 8 | 9 | -define(tld_strings, [ 10 | {[16#6d4b,16#8bd5], "xn--0zwm56d"}, 11 | {[16#092a,16#0930,16#0940,16#0915,16#094d,16#0937,16#093e], "xn--11b5bs3a9aj6g"}, 12 | {[16#d55c,16#ad6d], "xn--3e0b707e"}, 13 | {[16#09ad,16#09be,16#09b0,16#09a4], "xn--45brj9c"}, 14 | {[16#09ac,16#09be,16#0982,16#09b2,16#09be], "xn--54b7fta0cc"}, 15 | {[16#0438,16#0441,16#043f,16#044b,16#0442,16#0430,16#043d,16#0438,16#0435], "xn--80akhbyknj4f"}, 16 | {[16#0441,16#0440,16#0431], "xn--90a3ac"}, 17 | {[16#d14c,16#c2a4,16#d2b8], "xn--9t4b11yi5a"}, 18 | {[16#0b9a,16#0bbf,16#0b99,16#0bcd,16#0b95,16#0baa,16#0bcd,16#0baa,16#0bc2,16#0bb0,16#0bcd], "xn--clchc0ea0b2g2a9gcd"}, 19 | {[16#05d8,16#05e2,16#05e1,16#05d8], "xn--deba0ad"}, 20 | {[16#4e2d,16#56fd], "xn--fiqs8s"}, 21 | {[16#4e2d,16#570b], "xn--fiqz9s"}, 22 | {[16#0c2d,16#0c3e,16#0c30,16#0c24,16#0c4d], "xn--fpcrj9c3d"}, 23 | {[16#0dbd,16#0d82,16#0d9a,16#0dcf], "xn--fzc2c9e2c"}, 24 | {[16#6e2c,16#8a66], "xn--g6w251d"}, 25 | {[16#0aad,16#0abe,16#0ab0,16#0aa4], "xn--gecrj9c"}, 26 | {[16#092d,16#093e,16#0930,16#0924], "xn--h2brj9c"}, 27 | {[16#0622,16#0632,16#0645,16#0627,16#06cc,16#0634,16#06cc], "xn--hgbk6aj7f53bba"}, 28 | {[16#0baa,16#0bb0,16#0bbf,16#0b9f,16#0bcd,16#0b9a,16#0bc8], "xn--hlcj6aya9esc7a"}, 29 | {[16#0443,16#043a,16#0440], "xn--j1amh"}, 30 | {[16#9999,16#6e2f], "xn--j6w193g"}, 31 | {[16#03b4,16#03bf,16#03ba,16#03b9,16#03bc,16#03ae], "xn--jxalpdlp"}, 32 | {[16#0625,16#062e,16#062a,16#0628,16#0627,16#0631], "xn--kgbechtv"}, 33 | {[16#53f0,16#6e7e], "xn--kprw13d"}, 34 | {[16#53f0,16#7063], "xn--kpry57d"}, 35 | {[16#0627,16#0644,16#062c,16#0632,16#0627,16#0626,16#0631], "xn--lgbbat1ad8j"}, 36 | {[16#0639,16#0645,16#0627,16#0646], "xn--mgb9awbf"}, 37 | {[16#0627,16#06cc,16#0631,16#0627,16#0646], "xn--mgba3a4f16a"}, 38 | {[16#0627,16#0645,16#0627,16#0631,16#0627,16#062a], "xn--mgbaam7a8h"}, 39 | {[16#067e,16#0627,16#06a9,16#0633,16#062a,16#0627,16#0646], "xn--mgbai9azgqp6j"}, 40 | {[16#0627,16#0644,16#0627,16#0631,16#062f,16#0646], 
"xn--mgbayh7gpa"}, 41 | {[16#0628,16#06be,16#0627,16#0631,16#062a], "xn--mgbbh1a71e"}, 42 | {[16#0627,16#0644,16#0645,16#063a,16#0631,16#0628], "xn--mgbc0a9azcg"}, 43 | {[16#0627,16#0644,16#0633,16#0639,16#0648,16#062f,16#064a,16#0629], "xn--mgberp4a5d4ar"}, 44 | {[16#10d2,16#10d4], "xn--node"}, 45 | {[16#0e44,16#0e17,16#0e22], "xn--o3cw4h"}, 46 | {[16#0633,16#0648,16#0631,16#064a,16#0629], "xn--ogbpf8fl"}, 47 | {[16#0440,16#0444], "xn--p1ai"}, 48 | {[16#062a,16#0648,16#0646,16#0633], "xn--pgbs0dh"}, 49 | {[16#0a2d,16#0a3e,16#0a30,16#0a24], "xn--s9brj9c"}, 50 | {[16#0645,16#0635,16#0631], "xn--wgbh1c"}, 51 | {[16#0642,16#0637,16#0631], "xn--wgbl6a"}, 52 | {[16#0b87,16#0bb2,16#0b99,16#0bcd,16#0b95,16#0bc8], "xn--xkc2al3hye2a"}, 53 | {[16#0b87,16#0ba8,16#0bcd,16#0ba4,16#0bbf,16#0baf,16#0bbe], "xn--xkc2dl3a5ee0h"}, 54 | {[16#65b0,16#52a0,16#5761], "xn--yfro4i67o"}, 55 | {[16#0641,16#0644,16#0633,16#0637,16#064a,16#0646], "xn--ygbi2ammx"}, 56 | {[16#30c6,16#30b9,16#30c8], "xn--zckzah"}, 57 | {[16#049b,16#0430,16#0437], "xn--80ao21a"}, 58 | {[16#0645,16#0644,16#064a,16#0633,16#064a,16#0627], "xn--mgbx4cd0ab"}, 59 | {[16#043c,16#043e,16#043d], "xn--l1acc"}, 60 | {[16#0633,16#0648,16#062f,16#0627,16#0646], "xn--mgbpl2fh"} 61 | ]). 62 | 63 | -include_lib("eunit/include/eunit.hrl"). 64 | 65 | alabels_test() -> 66 | lists:foreach( 67 | fun({ULabel, ALabel}) -> 68 | ?assertEqual(ALabel, idna:alabel(ULabel)) 69 | end, 70 | ?tld_strings 71 | ). 72 | 73 | ulabels_test() -> 74 | lists:foreach( 75 | fun({ULabel, ALabel}) -> 76 | ?assertEqual(ULabel, idna:ulabel(ALabel)) 77 | end, 78 | ?tld_strings 79 | ). 80 | 81 | check_label_length_test() -> 82 | ?assertEqual(ok, idna:check_label_length([$a || _ <- lists:seq(1, 63)])), 83 | ?assertExit({bad_label, {too_long, _Error}}, idna:check_label_length([$a || _ <- lists:seq(1, 64)])), 84 | ?assertExit({bad_label, {too_long, _Error}}, idna:encode([$a || _ <- lists:seq(1, 64)])). 


%% Exercises idna_bidi:check_bidi/1,2 with one representative code point
%% per bidi class, grouped by the RFC 5893 rule being checked.
check_bidi_test() ->
    L = [16#0061],
    R = [16#05d0],
    AL = [16#0627],
    AN = [16#0660],
    EN = [16#0030],
    ES = [16#002d],
    CS = [16#002c],
    ET = [16#0024],
    ON = [16#0021],
    BN = [16#200c],
    NSM = [16#0610],
    WS = [16#0020],

    Accept = fun(Label) -> ?assertEqual(ok, idna_bidi:check_bidi(Label)) end,
    Reject = fun(Label) ->
        ?assertExit({bad_label, {bidi, _}}, idna_bidi:check_bidi(Label))
    end,
    AcceptStrict = fun(Label) -> ?assertEqual(ok, idna_bidi:check_bidi(Label, true)) end,
    RejectStrict = fun(Label) ->
        ?assertExit({bad_label, {bidi, _}}, idna_bidi:check_bidi(Label, true))
    end,

    %% RFC 5893 Rule 1
    lists:foreach(Accept, [L, R, AL]),
    Reject(AN),

    %% RFC 5893 Rule 2
    lists:foreach(Accept, [
        R ++ AL,
        R ++ AN,
        R ++ EN,
        R ++ ES ++ AL,
        R ++ CS ++ AL,
        R ++ ET ++ AL,
        R ++ ON ++ AL,
        R ++ BN ++ AL,
        R ++ NSM
    ]),
    lists:foreach(Reject, [R ++ L, R ++ WS]),

    %% RFC 5893 Rule 3
    lists:foreach(Accept, [
        R ++ AL,
        R ++ EN,
        R ++ AN,
        R ++ NSM,
        R ++ NSM ++ NSM
    ]),
    Reject(R ++ ON),

    %% RFC 5893 Rule 4
    lists:foreach(Accept, [R ++ EN, R ++ AN]),
    lists:foreach(Reject, [R ++ EN ++ AN, R ++ AN ++ EN]),

    %% RFC 5893 Rule 5
    lists:foreach(AcceptStrict, [
        L ++ EN,
        L ++ ES ++ L,
        L ++ CS ++ L,
        L ++ ET ++ L,
        L ++ ON ++ L,
        L ++ BN ++ L,
        L ++ NSM
    ]),

    %% RFC 5893 Rule 6
    lists:foreach(AcceptStrict, [
        L ++ L,
        L ++ EN,
        L ++ EN ++ NSM,
        L ++ EN ++ NSM ++ NSM
    ]),
    RejectStrict(L ++ CS).

%% A combining mark is accepted after a base letter but not as the first
%% code point of a label.
check_initial_combiner_test() ->
    Mark = [16#0300],
    Base = [16#0061],

    ?assertEqual(ok, idna:check_initial_combiner(Base)),
    ?assertEqual(ok, idna:check_initial_combiner(Base ++ Mark)),
    ?assertExit({bad_label, {initial_combiner, _}}, idna:check_initial_combiner(Mark ++ Base)).

%% Hyphens are accepted inside a label; the labels "aa--", "a-" and "-a"
%% must be rejected.
check_hyphen_test() ->
    lists:foreach(
        fun(Label) -> ?assertEqual(ok, idna:check_hyphen(Label)) end,
        ["abc", "a--b"]
    ),
    lists:foreach(
        fun(Label) -> ?assertExit({bad_label, {hyphen, _}}, idna:check_hyphen(Label)) end,
        ["aa--", "a-", "-a"]
    ).


%% CONTEXTJ rules: both joiners are only valid directly after a virama.
valid_contextj_test() ->
    Virama = [16#094d],
    Latin = [16#0061],

    lists:foreach(
        fun(Joiner) ->
            ?assertEqual(false, idna_context:valid_contextj(Joiner, 0)),
            ?assertEqual(false, idna_context:valid_contextj(Latin ++ Joiner, 1)),
            ?assertEqual(true, idna_context:valid_contextj(Virama ++ Joiner, 1))
        end,
        [
            [16#200c],  % RFC 5892 Appendix A.1 (Zero Width Non-Joiner)
            [16#200d]   % RFC 5892 Appendix A.2 (Zero Width Joiner)
        ]
    ).


%% Exercises idna_context:valid_contexto/2 for each CONTEXTO rule of
%% RFC 5892, Appendix A.3 to A.9. The second argument is the zero-based
%% position of the code point under test within the label.
valid_contexto_test() ->
    Latin = [16#0061],
    Latin_l = [16#006c],
    Greek = [16#03b1],
    Hebrew = [16#05d0],
    Katakana = [16#30a1],
    Hiragana = [16#3041],
    Han = [16#6f22],
    Arabic_digit = [16#0660],
    Ext_arabic_digit = [16#06f0],

    % RFC 5892 Rule A.3 (Middle Dot)
    Latin_middle_dot = [16#00b7],
    true = idna_context:valid_contexto(Latin_l ++ Latin_middle_dot ++ Latin_l, 1),
    false = idna_context:valid_contexto(Latin_middle_dot ++ Latin_l, 1),
    false = idna_context:valid_contexto(Latin_l ++ Latin_middle_dot, 0),
    false = idna_context:valid_contexto(Latin_middle_dot, 0),
    false = idna_context:valid_contexto(Latin_l ++ Latin_middle_dot ++ Latin, 1),
    true = idna_context:valid_contexto("ru" ++ Latin_l ++ Latin_middle_dot ++ Latin_l ++ "z", 3),
    false = idna_context:valid_contexto("ru" ++ Latin ++ Latin_middle_dot ++ Latin_l ++ "z", 3),

    % RFC 5892 Rule A.4 (Greek Lower Numeral Sign)
    Glns = [16#0375],
    true = idna_context:valid_contexto(Glns ++ Greek, 0),
    false = idna_context:valid_contexto(Glns ++ Latin, 0),
    false = idna_context:valid_contexto(Glns, 0),
    false = idna_context:valid_contexto(Greek ++ Glns, 1),

    % RFC 5892 Rule A.5 (Hebrew Punctuation Geresh)
    Geresh = [16#05f3],
    true = idna_context:valid_contexto(Hebrew ++ Geresh, 1),
    false = idna_context:valid_contexto(Latin ++ Geresh, 1),

    % RFC 5892 Rule A.6 (Hebrew Punctuation Gershayim)
    Gershayim = [16#05f4],
    true = idna_context:valid_contexto(Hebrew ++ Gershayim, 1),
    false = idna_context:valid_contexto(Latin ++ Gershayim, 1),

    % RFC 5892 Rule A.7 (Katakana Middle Dot)
    Ja_middle_dot = [16#30fb],
    true = idna_context:valid_contexto(Katakana ++ Ja_middle_dot ++ Katakana, 1),
    true = idna_context:valid_contexto(Hiragana ++ Ja_middle_dot ++ Hiragana, 1),
    true = idna_context:valid_contexto(Han ++ Ja_middle_dot ++ Han, 1),
    true = idna_context:valid_contexto(Han ++ Ja_middle_dot ++ Latin, 1),
    true = idna_context:valid_contexto([16#6f22, 16#30fb, 16#5b57], 1),
    false = idna_context:valid_contexto([16#0061, 16#30fb, 16#0061], 1),

    % RFC 5892 Rule A.8 (Arabic-Indic Digits)
    % An Arabic-Indic digit is valid only when the label contains no
    % Extended Arabic-Indic digit (mirror image of the A.9 checks below).
    true = idna_context:valid_contexto(Arabic_digit ++ Arabic_digit, 0),
    false = idna_context:valid_contexto(Arabic_digit ++ Ext_arabic_digit, 0),

    % RFC 5892 Rule A.9 (Extended Arabic-Indic Digits)
    true = idna_context:valid_contexto(Ext_arabic_digit ++ Ext_arabic_digit, 0),
    false = idna_context:valid_contexto(Ext_arabic_digit ++ Arabic_digit, 0).

%% idna:encode/1,2 on whole domain names: A-labels and plain ASCII labels
%% pass through, Unicode labels are converted, and invalid or over-long
%% labels exit with {bad_label, _}.
encode_test() ->
    ?assertEqual("xn--zckzah.xn--zckzah", idna:encode("xn--zckzah.xn--zckzah")),
    ?assertEqual("xn--zckzah.xn--zckzah", idna:encode([16#30c6,16#30b9,16#30c8, $., 16#30c6, 16#30b9, 16#30c8])),
    ?assertEqual("abc.abc", idna:encode("abc.abc")),
    ?assertEqual("xn--zckzah.abc", idna:encode("xn--zckzah.abc")),
    ?assertEqual("xn--zckzah.abc", idna:encode([16#30c6, 16#30b9, 16#30c8, $., $a, $b, $c])),
    ?assertEqual(
        "xn---------90gglbagaar.aa",
        idna:encode([16#0521,16#0525,16#0523,$-,16#0523,16#0523,$-,$-,$-,$-,$-,16#0521,16#0523,16#0523,16#0523|".aa"])
    ),
    ?assertExit(
        {bad_label, {_, _}},
        idna:encode(
            [16#0521,16#0524,16#0523,$-,16#0523,16#0523,$-,$-,$-,$-,$-,16#0521,16#0523,16#0523,16#0523|".aa"],
            [{uts46, false}]
        )
    ),
    % 63 characters is the longest label accepted; 64 must be rejected.
    ?assertEqual([$a ||_ <- lists:seq(1, 63)], idna:encode([$a ||_ <- lists:seq(1, 63)])),
    ?assertExit({bad_label, {_, _}}, idna:encode([$a ||_ <- lists:seq(1, 64)])).


%% idna:decode/1 on whole domain names: A-labels are converted to Unicode,
%% Unicode and plain ASCII labels pass through, and the two invalid
%% A-labels at the end must exit with {bad_label, _}.
decode_test() ->
    Label = [16#30c6, 16#30b9, 16#30c8],
    Domain = Label ++ [$. | Label],

    ?assertEqual(Domain, idna:decode("xn--zckzah.xn--zckzah")),
    ?assertEqual(Domain, idna:decode(Label ++ ".xn--zckzah")),
    ?assertEqual(Domain, idna:decode(Domain)),
    ?assertEqual("abc.abc", idna:decode("abc.abc")),
    ?assertEqual(
        [16#0521,16#0525,16#0523,$-,16#0523,16#0523,$-,$-,$-,$-,$-,16#0521,16#0523,16#0523,16#0523|".aa"],
        idna:decode("xn---------90gglbagaar.aa")
    ),
    lists:foreach(
        fun(Invalid) -> ?assertExit({bad_label, {_, _}}, idna:decode(Invalid)) end,
        ["XN---------90GGLBAGAAC.AA", "xn---------90gglbagaac.aa"]
    ).
--------------------------------------------------------------------------------
/test/punycode_test.erl:
--------------------------------------------------------------------------------
%%%
%%% This file is part of erlang-idna released under the MIT license.
%%% See the LICENSE for more information.
%%%
-module(punycode_test).
-author("benoitc").

-export([punicode_encode_test/0, punicode_decode_test/0]).

-include_lib("eunit/include/eunit.hrl").
11 | 12 | data() -> 13 | [ 14 | {"(A) Arabic (Egyptian)", 15 | [16#0644, 16#064A, 16#0647, 16#0645, 16#0627, 16#0628, 16#062A, 16#0643, 16 | 16#0644, 16#0645, 16#0648, 16#0634, 16#0639, 16#0631, 16#0628, 16#064A, 17 | 16#061F], 18 | "egbpdaj6bu4bxfgehfvwxn" 19 | }, 20 | 21 | { 22 | "(B) Chinese (simplified)", 23 | [16#4ED6, 16#4EEC, 16#4E3A, 16#4EC0, 16#4E48, 16#4E0D, 16#8BF4, 16#4E2D, 24 | 16#6587], 25 | "ihqwcrb4cv8a8dqg056pqjye" 26 | 27 | }, 28 | { 29 | "(C) Chinese (traditional)", 30 | [16#4ED6, 16#5011, 16#7232, 16#4EC0, 16#9EBD, 16#4E0D, 16#8AAA, 16#4E2D, 31 | 16#6587], 32 | "ihqwctvzc91f659drss3x8bo0yb" 33 | }, 34 | { 35 | "(D) Czech: Proprostnemluvesky", 36 | [16#0050, 16#0072, 16#006F, 16#010D, 16#0070, 16#0072, 16#006F, 16#0073, 37 | 16#0074, 16#011B, 16#006E, 16#0065, 16#006D, 16#006C, 16#0075, 16#0076, 38 | 16#00ED, 16#010D, 16#0065, 16#0073, 16#006B, 16#0079], 39 | "Proprostnemluvesky-uyb24dma41a" 40 | }, 41 | { 42 | "(E) Hebrew:", 43 | [16#05DC, 16#05DE, 16#05D4, 16#05D4, 16#05DD, 16#05E4, 16#05E9, 16#05D5, 44 | 16#05D8, 16#05DC, 16#05D0, 16#05DE, 16#05D3, 16#05D1, 16#05E8, 16#05D9, 45 | 16#05DD, 16#05E2, 16#05D1, 16#05E8, 16#05D9, 16#05EA], 46 | "4dbcagdahymbxekheh6e0a7fei0b" 47 | }, 48 | { 49 | "(F) Hindi (Devanagari):", 50 | [16#092F, 16#0939, 16#0932, 16#094B, 16#0917, 16#0939, 16#093F, 16#0928, 51 | 16#094D, 16#0926, 16#0940, 16#0915, 16#094D, 16#092F, 16#094B, 16#0902, 52 | 16#0928, 16#0939, 16#0940, 16#0902, 16#092C, 16#094B, 16#0932, 16#0938, 53 | 16#0915, 16#0924, 16#0947, 16#0939, 16#0948, 16#0902], 54 | "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd" 55 | }, 56 | { 57 | "(G) Japanese (kanji and hiragana):", 58 | [16#306A, 16#305C, 16#307F, 16#3093, 16#306A, 16#65E5, 16#672C, 16#8A9E, 59 | 16#3092, 16#8A71, 16#3057, 16#3066, 16#304F, 16#308C, 16#306A, 16#3044, 60 | 16#306E, 16#304B], 61 | "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa" 62 | }, 63 | { 64 | "(H) Korean (Hangul syllables):", 65 | [ 16#C138, 16#ACC4, 16#C758, 16#BAA8, 16#B4E0, 16#C0AC, 
16#B78C, 16#B4E4, 66 | 16#C774, 16#D55C, 16#AD6D, 16#C5B4, 16#B97C, 16#C774, 16#D574, 16#D55C, 67 | 16#B2E4, 16#BA74, 16#C5BC, 16#B9C8, 16#B098, 16#C88B, 16#C744, 16#AE4C], 68 | "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5jpsd879ccm6fea98c"}, 69 | { 70 | "(I) Russian (Cyrillic):", 71 | [16#043F, 16#043E, 16#0447, 16#0435, 16#043C, 16#0443, 16#0436, 16#0435, 72 | 16#043E, 16#043D, 16#0438, 16#043D, 16#0435, 16#0433, 16#043E, 16#0432, 73 | 16#043E, 16#0440, 16#044F, 16#0442, 16#043F, 16#043E, 16#0440, 16#0443, 74 | 16#0441, 16#0441, 16#043A, 16#0438], 75 | "b1abfaaepdrnnbgefbadotcwatmq2g4l" 76 | }, 77 | { 78 | "(J) Spanish: PorqunopuedensimplementehablarenEspaol", 79 | [16#0050, 16#006F, 16#0072, 16#0071, 16#0075, 16#00E9, 16#006E, 16#006F, 80 | 16#0070, 16#0075, 16#0065, 16#0064, 16#0065, 16#006E, 16#0073, 16#0069, 81 | 16#006D, 16#0070, 16#006C, 16#0065, 16#006D, 16#0065, 16#006E, 16#0074, 82 | 16#0065, 16#0068, 16#0061, 16#0062, 16#006C, 16#0061, 16#0072, 16#0065, 83 | 16#006E, 16#0045, 16#0073, 16#0070, 16#0061, 16#00F1, 16#006F, 16#006C], 84 | "PorqunopuedensimplementehablarenEspaol-fmd56a" 85 | }, 86 | { 87 | "(K) Vietnamese:", 88 | [16#0054, 16#1EA1, 16#0069, 16#0073, 16#0061, 16#006F, 16#0068, 16#1ECD, 89 | 16#006B, 16#0068, 16#00F4, 16#006E, 16#0067, 16#0074, 16#0068, 16#1EC3, 90 | 16#0063, 16#0068, 16#1EC9, 16#006E, 16#00F3, 16#0069, 16#0074, 16#0069, 91 | 16#1EBF, 16#006E, 16#0067, 16#0056, 16#0069, 16#1EC7, 16#0074], 92 | "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g" 93 | }, 94 | { 95 | "(L) 3B", 96 | [16#0033, 16#5E74, 16#0042, 16#7D44, 16#91D1, 16#516B, 16#5148, 16#751F], 97 | "3B-ww4c5e180e575a65lsy2b" 98 | }, 99 | { 100 | "(M) -with-SUPER-MONKEYS", 101 | [16#5B89, 16#5BA4, 16#5948, 16#7F8E, 16#6075, 16#002D, 16#0077, 16#0069, 102 | 16#0074, 16#0068, 16#002D, 16#0053, 16#0055, 16#0050, 16#0045, 16#0052, 103 | 16#002D, 16#004D, 16#004F, 16#004E, 16#004B, 16#0045, 16#0059, 16#0053], 104 | "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n" 105 | }, 106 
| { 107 | "(N) Hello-Another-Way-", 108 | [16#0048, 16#0065, 16#006C, 16#006C, 16#006F, 16#002D, 16#0041, 16#006E, 109 | 16#006F, 16#0074, 16#0068, 16#0065, 16#0072, 16#002D, 16#0057, 16#0061, 110 | 16#0079, 16#002D, 16#305D, 16#308C, 16#305E, 16#308C, 16#306E, 16#5834, 111 | 16#6240], 112 | "Hello-Another-Way--fc4qua05auwb3674vfr0b" 113 | }, 114 | { 115 | "(O) 2", 116 | [16#3072, 16#3068, 16#3064, 16#5C4B, 16#6839, 16#306E, 16#4E0B, 16#0032], 117 | "2-u9tlzr9756bt3uc0v" 118 | }, 119 | { 120 | "(P) MajiKoi5", 121 | [16#004D, 16#0061, 16#006A, 16#0069, 16#3067, 16#004B, 16#006F, 16#0069, 122 | 16#3059, 16#308B, 16#0035, 16#79D2, 16#524D], 123 | "MajiKoi5-783gue6qz075azm5e" 124 | }, 125 | { 126 | "(Q) de", 127 | [16#30D1, 16#30D5, 16#30A3, 16#30FC, 16#0064, 16#0065, 16#30EB, 16#30F3, 16#30D0], 128 | "de-jg4avhby1noc0d" 129 | }, 130 | { 131 | "(R) ", 132 | [16#305D, 16#306E, 16#30B9, 16#30D4, 16#30FC, 16#30C9, 16#3067], 133 | "d9juau41awczczp" 134 | }, 135 | { 136 | "(S) -> $1.00 <-", 137 | [16#002D, 16#003E, 16#0020, 16#0024, 16#0031, 16#002E, 16#0030, 16#0030, 138 | 16#0020, 16#003C, 16#002D], 139 | "-> $1.00 <--" 140 | } 141 | ]. 142 | 143 | punicode_encode_test() -> 144 | lists:foreach( 145 | fun({_Descr, Input, Expect}) -> 146 | ?assertEqual(Expect, punycode:encode(Input)) 147 | end, 148 | data() 149 | ). 150 | 151 | punicode_decode_test() -> 152 | lists:foreach( 153 | fun({_Descr, Expect, Input}) -> 154 | ?assertEqual(Expect, punycode:decode(Input)) 155 | end, 156 | data() 157 | ). 158 | -------------------------------------------------------------------------------- /test/uts46_test.erl: -------------------------------------------------------------------------------- 1 | %% -*- coding: utf-8 -*- 2 | %%% 3 | %%% This file is part of erlang-idna released under the MIT license. 4 | %%% See the LICENSE for more information. 5 | %%% 6 | -module(uts46_test). 7 | -author("benoitc"). 8 | 9 | -ifdef('OTP_RELEASE'). 10 | -define(chomp(Str), string:chomp(Str)). 
-define(trim(Str, Dir), string:trim(Str, Dir)).
-define(trim(Str), string:trim(Str, both)).
-define(lexemes(Str, Pat), string:lexemes(Str, Pat)).
-else.
-define(chomp(Str), string:strip(Str, right, $\n)).
-define(trim(Str, Dir), string:strip(Str, Dir)).
-define(trim(Str), string:strip(Str, both)).
-define(lexemes(Str, Pat), string:strip(string:tokens(Str, Pat), both)).
-endif.

-include_lib("eunit/include/eunit.hrl").

%% Rows whose source or expected ASCII form appears here are not checked.
-define(SKIP_TESTS, [
  [], "xn--r97c.", "𐋷.", "xn--pw9c.xn--fjb8658k", "0.xn--qny", "0.甯",

  "xn--hwe.xn--ss-ci1ub261a",
  "ss.xn--lgd921mvv0m",
  "xn--4xa.xn--1-gocmu97674d.",
  "xn--ghb2gxqia",
  "xn--4xa203s.xn--epb",
  "xn--ghb2g3qq34f",
  [49,121369,11798,46],



  [56,52,119355,46,66293,9959],
  [57,38529,11246,46]


]).

%% Runs every row of IdnaTestV2.txt through idna:decode/2 and
%% idna:encode/2, skipping the rows listed in ?SKIP_TESTS.
uts46_conformance_test() ->
    lists:foreach(fun check_row/1, load_file()).

%% Checks one parsed row: toUnicode first, then non-transitional and
%% transitional toASCII. A check is only made when the row expects success
%% (empty status list) and provides an expected value.
check_row({Source, ToUnicode, ToUnicodeStatus, ToAsciiN, ToAsciiNStatus, ToAsciiT, ToAsciiTStatus}) ->
    Skipped = lists:any(
        fun(Field) -> lists:member(Field, ?SKIP_TESTS) end,
        [Source, ToAsciiN, ToAsciiT]
    ),
    case Skipped of
        true ->
            ok;
        false ->
            check_to_unicode(Source, ToUnicode, ToUnicodeStatus),
            check_to_ascii_n(Source, ToUnicode, ToAsciiN, ToAsciiNStatus),
            check_to_ascii_t(Source, ToAsciiT, ToAsciiTStatus)
    end.

check_to_unicode(Source, ToUnicode, Status) when Status == [], ToUnicode /= "" ->
    ?assertEqual(ToUnicode, idna:decode(Source, [uts46, {std3_rules, true}]));
check_to_unicode(_Source, _ToUnicode, _Status) ->
    ok.

check_to_ascii_n(Source, ToUnicode, ToAsciiN, Status)
        when ToUnicode /= [], ToAsciiN /= "", Status == [] ->
    ?assertEqual(ToAsciiN, idna:encode(Source, [uts46, {transitional, false}]));
check_to_ascii_n(_Source, _ToUnicode, _ToAsciiN, _Status) ->
    ok.

check_to_ascii_t(Source, ToAsciiT, Status) when ToAsciiT /= "", Status == [] ->
    ?assertEqual(ToAsciiT, idna:encode(Source, [uts46, {transitional, true}]));
check_to_ascii_t(_Source, _ToAsciiT, _Status) ->
    ok.

%% Reads test/IdnaTestV2.txt (located relative to the directory holding
%% this module's beam file) and returns the parsed rows, sorted.
load_file() ->
    BeamDir = filename:dirname(code:which(?MODULE)),
    Name = filename:join([filename:dirname(BeamDir), "test", "IdnaTestV2.txt"]),
    {ok, Tests} = file:open(Name, [read, {encoding, utf8}, {read_ahead, 1000000}]),
    Data = foldl(fun parse_tests/2, [], Tests),
    file:close(Tests),
    lists:sort(Data).


%% Parses one data line into a 7-tuple
%% {Source, ToUnicode, ToUnicodeStatus, ToAsciiN, ToAsciiNStatus,
%%  ToAsciiT, ToAsciiTStatus} and prepends it to Acc.
%% Everything from the first "#" on is discarded. A row with six or fewer
%% fields gets one empty field appended. An empty toASCII status falls
%% back to the toUnicode status.
parse_tests(Line0, Acc) ->
    [Line | _Comments] = tokens(?chomp(Line0), "#"),
    Fields = case tokens(Line, ";") of
        Row when length(Row) > 6 -> Row;
        Row -> Row ++ [""]
    end,
    [Source, ToUnicode, ToUnicodeStatusStr,
     ToAsciiN, ToAsciiNStatusStr, ToAsciiT, ToAsciiTStatusStr] = Fields,
    ToUnicodeStatus = parse_status(?trim(ToUnicodeStatusStr)),
    OrUnicodeStatus = fun([]) -> ToUnicodeStatus;
                         (Status) -> Status
                      end,
    ToAsciiNStatus = OrUnicodeStatus(parse_status(?trim(ToAsciiNStatusStr))),
    ToAsciiTStatus = OrUnicodeStatus(parse_status(?trim(ToAsciiTStatusStr))),
    Parsed = {parse_unicode(Source),
              parse_unicode(ToUnicode), ToUnicodeStatus,
              parse_unicode(ToAsciiN), ToAsciiNStatus,
              parse_unicode(ToAsciiT), ToAsciiTStatus},
    [Parsed | Acc].


%% Strips surrounding whitespace from a field.
parse_unicode(S) ->
    ?trim(S, both).

%% Returns S unchanged when every element satisfies idna_ucs:is_unicode/1;
%% otherwise treats S as UTF-8 bytes and decodes it.
to_unicode(S) ->
    AllUnicode = lists:all(fun idna_ucs:is_unicode/1, S),
    if
        AllUnicode -> S;
        true -> idna_ucs:from_utf8(S)
    end.

%% Converts a status column into a list of status-code strings.
%% A blank column, a single space and "[]" all mean "no codes". Otherwise the
%% text between "[" and "]" is split on commas; the pieces are not trimmed.
parse_status(Blank) when Blank =:= " "; Blank =:= ""; Blank =:= "[]" ->
    [];
parse_status([$[ | Rest]) ->
    [Codes] = ?lexemes(Rest, "]"),
    ?lexemes(Codes, ",").


%% Folds Fun(Line, Acc) over the lines read from the io device Fd.
foldl(Fun, Acc, Fd) ->
    foldl_1(Fun, Acc, fun() -> io:get_line(Fd, "") end).

%% Driver loop: stops as soon as the accumulator is {done, Result}, otherwise
%% fetches the next line and dispatches on it.
foldl_1(_Fun, {done, Result}, _Next) ->
    Result;
foldl_1(Fun, Acc, Next) ->
    fold_line(Next(), Fun, Acc, Next).

%% End of input returns the accumulator; lines starting with "#" and lines
%% consisting of a single newline are skipped; every other line goes to Fun.
fold_line(eof, _Fun, Acc, _Next) ->
    Acc;
fold_line([$# | _], Fun, Acc, Next) ->
    foldl_1(Fun, Acc, Next);
fold_line("\n", Fun, Acc, Next) ->
    foldl_1(Fun, Acc, Next);
fold_line(Line, Fun, Acc, Next) ->
    foldl_1(Fun, Fun(Line, Acc), Next).

%% Splits S on a single separator character. Unlike string:tokens/2, empty
%% fields between and after separators are kept. Only an empty field at the
%% very start of S is dropped, so "" gives [] and ";a" gives ["a"].
tokens(S, [Sep]) ->
    case split_fields(S, Sep) of
        [[] | Fields] -> Fields;
        Fields -> Fields
    end.

%% Plain split on Sep that keeps every field, including empty ones.
split_fields(S, Sep) ->
    case lists:splitwith(fun(Ch) -> Ch =/= Sep end, S) of
        {Field, []} -> [Field];
        {Field, [_Sep | Rest]} -> [Field | split_fields(Rest, Sep)]
    end.
-------------------------------------------------------------------------------- /uc_spec/ArabicShaping.txt: -------------------------------------------------------------------------------- 1 | # ArabicShaping-13.0.0.txt 2 | # Date: 2020-01-31, 23:55:00 GMT [KW, RP] 3 | # © 2020 Unicode®, Inc. 4 | # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. 5 | # For terms of use, see http://www.unicode.org/terms_of_use.html 6 | # 7 | # This file is a normative contributory data file in the 8 | # Unicode Character Database.
9 | # 10 | # This file defines the Joining_Type and Joining_Group property 11 | # values for Arabic, Syriac, N'Ko, Mandaic, and Manichaean positional 12 | # shaping, repeating in machine readable form the information 13 | # exemplified in Tables 9-3, 9-8, 9-9, 9-10, 9-14, 9-15, 9-16, 9-19, 14 | # 9-20, 10-4, 10-5, 10-6, 10-7, and 19-5 of The Unicode Standard core 15 | # specification. This file also defines Joining_Type values for 16 | # Mongolian, Phags-pa, Psalter Pahlavi, Sogdian, Chorasmian, and Adlam positional shaping, 17 | # and Joining_Type and Joining_Group values for Hanifi Rohingya positional shaping, 18 | # which are not listed in tables in the standard. 19 | # 20 | # See Sections 9.2, 9.3, 9.5, 10.5, 10.6, 13.4, 14.3, 14.10, 16.14, 19.4, and 19.9 21 | # of The Unicode Standard core specification for more information. 22 | # 23 | # Each line contains four fields, separated by a semicolon. 24 | # 25 | # Field 0: the code point, in 4-digit hexadecimal 26 | # form, of a character. 27 | # 28 | # Field 1: gives a short schematic name for that character. 29 | # The schematic name is descriptive of the shape, based as 30 | # consistently as possible on a name for the skeleton and 31 | # then the diacritic marks applied to the skeleton, if any. 32 | # Note that this schematic name is considered a comment, 33 | # and does not constitute a formal property value. 34 | # 35 | # Field 2: defines the joining type (property name: Joining_Type) 36 | # R Right_Joining 37 | # L Left_Joining 38 | # D Dual_Joining 39 | # C Join_Causing 40 | # U Non_Joining 41 | # T Transparent 42 | # 43 | # See Section 9.2, Arabic for more information on these joining types. 44 | # Note that for cursive joining scripts which are typically rendered 45 | # top-to-bottom, rather than right-to-left, Joining_Type=L conventionally 46 | # refers to bottom joining, and Joining_Type=R conventionally refers 47 | # to top joining. 
See Section 14.3, Phags-pa for more information on the 48 | # interpretation of joining types in vertical layout. 49 | # 50 | # Field 3: defines the joining group (property name: Joining_Group) 51 | # 52 | # The values of the joining group are based schematically on character 53 | # names. Where a schematic character name consists of two or more parts 54 | # separated by spaces, the formal Joining_Group property value, as specified in 55 | # PropertyValueAliases.txt, consists of the same name parts joined by 56 | # underscores. Hence, the entry: 57 | # 58 | # 0629; TEH MARBUTA; R; TEH MARBUTA 59 | # 60 | # corresponds to [Joining_Group = Teh_Marbuta]. 61 | # 62 | # Note: The property value now designated [Joining_Group = Teh_Marbuta_Goal] 63 | # used to apply to both of the following characters 64 | # in earlier versions of the standard: 65 | # 66 | # U+06C2 ARABIC LETTER HEH GOAL WITH HAMZA ABOVE 67 | # U+06C3 ARABIC LETTER TEH MARBUTA GOAL 68 | # 69 | # However, it currently applies only to U+06C3, and *not* to U+06C2. 70 | # To avoid destabilizing existing Joining_Group property aliases, the 71 | # prior Joining_Group value for U+06C3 (Hamza_On_Heh_Goal) has been 72 | # retained as a property value alias, despite the fact that it 73 | # no longer applies to its namesake character, U+06C2. 74 | # See PropertyValueAliases.txt. 75 | # 76 | # When other cursive scripts are added to the Unicode Standard in the 77 | # future, the joining group value of all its letters will default to 78 | # jg=No_Joining_Group in this data file. Other, more specific 79 | # joining group values will be defined only if an explicit proposal 80 | # to define those values exactly has been approved by the UTC. This 81 | # is the convention exemplified by the N'Ko, Mandaic, Mongolian, 82 | # Phags-pa, Psalter Pahlavi, Sogdian, Chorasmian, and Adlam scripts. 
83 | # Only the Arabic, Manichaean, and Syriac scripts currently have 84 | # explicit joining group values defined for all characters, including 85 | # those which have only a single character in a particular Joining_Group 86 | # class. Hanifi Rohingya has explicit Joining_Group values assigned only for 87 | # the few characters which share a particular Joining_Group class, but 88 | # assigns jg=No_Joining_Group to all the singletons. 89 | # 90 | # Note: Code points that are not explicitly listed in this file are 91 | # either of joining type T or U: 92 | # 93 | # - Those that are not explicitly listed and that are of General Category Mn, Me, or Cf 94 | # have joining type T. 95 | # - All others not explicitly listed have joining type U. 96 | # 97 | # For an explicit listing of all characters of joining type T, see 98 | # the derived property file DerivedJoiningType.txt. 99 | # 100 | # ############################################################# 101 | 102 | # Unicode; Schematic Name; Joining Type; Joining Group 103 | 104 | # Arabic Characters 105 | 106 | 0600; ARABIC NUMBER SIGN; U; No_Joining_Group 107 | 0601; ARABIC SIGN SANAH; U; No_Joining_Group 108 | 0602; ARABIC FOOTNOTE MARKER; U; No_Joining_Group 109 | 0603; ARABIC SIGN SAFHA; U; No_Joining_Group 110 | 0604; ARABIC SIGN SAMVAT; U; No_Joining_Group 111 | 0605; ARABIC NUMBER MARK ABOVE; U; No_Joining_Group 112 | 0608; ARABIC RAY; U; No_Joining_Group 113 | 060B; AFGHANI SIGN; U; No_Joining_Group 114 | 0620; DOTLESS YEH WITH SEPARATE RING BELOW; D; YEH 115 | 0621; HAMZA; U; No_Joining_Group 116 | 0622; ALEF WITH MADDA ABOVE; R; ALEF 117 | 0623; ALEF WITH HAMZA ABOVE; R; ALEF 118 | 0624; WAW WITH HAMZA ABOVE; R; WAW 119 | 0625; ALEF WITH HAMZA BELOW; R; ALEF 120 | 0626; DOTLESS YEH WITH HAMZA ABOVE; D; YEH 121 | 0627; ALEF; R; ALEF 122 | 0628; BEH; D; BEH 123 | 0629; TEH MARBUTA; R; TEH MARBUTA 124 | 062A; DOTLESS BEH WITH 2 DOTS ABOVE; D; BEH 125 | 062B; DOTLESS BEH WITH 3 DOTS ABOVE; D; BEH 126 | 062C; HAH 
WITH DOT BELOW; D; HAH 127 | 062D; HAH; D; HAH 128 | 062E; HAH WITH DOT ABOVE; D; HAH 129 | 062F; DAL; R; DAL 130 | 0630; DAL WITH DOT ABOVE; R; DAL 131 | 0631; REH; R; REH 132 | 0632; REH WITH DOT ABOVE; R; REH 133 | 0633; SEEN; D; SEEN 134 | 0634; SEEN WITH 3 DOTS ABOVE; D; SEEN 135 | 0635; SAD; D; SAD 136 | 0636; SAD WITH DOT ABOVE; D; SAD 137 | 0637; TAH; D; TAH 138 | 0638; TAH WITH DOT ABOVE; D; TAH 139 | 0639; AIN; D; AIN 140 | 063A; AIN WITH DOT ABOVE; D; AIN 141 | 063B; KEHEH WITH 2 DOTS ABOVE; D; GAF 142 | 063C; KEHEH WITH 3 DOTS BELOW; D; GAF 143 | 063D; FARSI YEH WITH INVERTED V ABOVE; D; FARSI YEH 144 | 063E; FARSI YEH WITH 2 DOTS ABOVE; D; FARSI YEH 145 | 063F; FARSI YEH WITH 3 DOTS ABOVE; D; FARSI YEH 146 | 0640; TATWEEL; C; No_Joining_Group 147 | 0641; FEH; D; FEH 148 | 0642; QAF; D; QAF 149 | 0643; KAF; D; KAF 150 | 0644; LAM; D; LAM 151 | 0645; MEEM; D; MEEM 152 | 0646; NOON; D; NOON 153 | 0647; HEH; D; HEH 154 | 0648; WAW; R; WAW 155 | 0649; DOTLESS YEH; D; YEH 156 | 064A; YEH; D; YEH 157 | 066E; DOTLESS BEH; D; BEH 158 | 066F; DOTLESS QAF; D; QAF 159 | 0671; ALEF WITH WASLA ABOVE; R; ALEF 160 | 0672; ALEF WITH WAVY HAMZA ABOVE; R; ALEF 161 | 0673; ALEF WITH WAVY HAMZA BELOW; R; ALEF 162 | 0674; HIGH HAMZA; U; No_Joining_Group 163 | 0675; HIGH HAMZA ALEF; R; ALEF 164 | 0676; HIGH HAMZA WAW; R; WAW 165 | 0677; HIGH HAMZA WAW WITH DAMMA ABOVE; R; WAW 166 | 0678; HIGH HAMZA DOTLESS YEH; D; YEH 167 | 0679; DOTLESS BEH WITH TAH ABOVE; D; BEH 168 | 067A; DOTLESS BEH WITH VERTICAL 2 DOTS ABOVE; D; BEH 169 | 067B; DOTLESS BEH WITH VERTICAL 2 DOTS BELOW; D; BEH 170 | 067C; DOTLESS BEH WITH ATTACHED RING BELOW AND 2 DOTS ABOVE; D; BEH 171 | 067D; DOTLESS BEH WITH INVERTED 3 DOTS ABOVE; D; BEH 172 | 067E; DOTLESS BEH WITH 3 DOTS BELOW; D; BEH 173 | 067F; DOTLESS BEH WITH 4 DOTS ABOVE; D; BEH 174 | 0680; DOTLESS BEH WITH 4 DOTS BELOW; D; BEH 175 | 0681; HAH WITH HAMZA ABOVE; D; HAH 176 | 0682; HAH WITH VERTICAL 2 DOTS ABOVE; D; HAH 177 | 0683; HAH WITH 2 DOTS 
BELOW; D; HAH 178 | 0684; HAH WITH VERTICAL 2 DOTS BELOW; D; HAH 179 | 0685; HAH WITH 3 DOTS ABOVE; D; HAH 180 | 0686; HAH WITH 3 DOTS BELOW; D; HAH 181 | 0687; HAH WITH 4 DOTS BELOW; D; HAH 182 | 0688; DAL WITH TAH ABOVE; R; DAL 183 | 0689; DAL WITH ATTACHED RING BELOW; R; DAL 184 | 068A; DAL WITH DOT BELOW; R; DAL 185 | 068B; DAL WITH DOT BELOW AND TAH ABOVE; R; DAL 186 | 068C; DAL WITH 2 DOTS ABOVE; R; DAL 187 | 068D; DAL WITH 2 DOTS BELOW; R; DAL 188 | 068E; DAL WITH 3 DOTS ABOVE; R; DAL 189 | 068F; DAL WITH INVERTED 3 DOTS ABOVE; R; DAL 190 | 0690; DAL WITH 4 DOTS ABOVE; R; DAL 191 | 0691; REH WITH TAH ABOVE; R; REH 192 | 0692; REH WITH V ABOVE; R; REH 193 | 0693; REH WITH ATTACHED RING BELOW; R; REH 194 | 0694; REH WITH DOT BELOW; R; REH 195 | 0695; REH WITH V BELOW; R; REH 196 | 0696; REH WITH DOT BELOW AND DOT WITHIN; R; REH 197 | 0697; REH WITH 2 DOTS ABOVE; R; REH 198 | 0698; REH WITH 3 DOTS ABOVE; R; REH 199 | 0699; REH WITH 4 DOTS ABOVE; R; REH 200 | 069A; SEEN WITH DOT BELOW AND DOT ABOVE; D; SEEN 201 | 069B; SEEN WITH 3 DOTS BELOW; D; SEEN 202 | 069C; SEEN WITH 3 DOTS BELOW AND 3 DOTS ABOVE; D; SEEN 203 | 069D; SAD WITH 2 DOTS BELOW; D; SAD 204 | 069E; SAD WITH 3 DOTS ABOVE; D; SAD 205 | 069F; TAH WITH 3 DOTS ABOVE; D; TAH 206 | 06A0; AIN WITH 3 DOTS ABOVE; D; AIN 207 | 06A1; DOTLESS FEH; D; FEH 208 | 06A2; DOTLESS FEH WITH DOT BELOW; D; FEH 209 | 06A3; FEH WITH DOT BELOW; D; FEH 210 | 06A4; DOTLESS FEH WITH 3 DOTS ABOVE; D; FEH 211 | 06A5; DOTLESS FEH WITH 3 DOTS BELOW; D; FEH 212 | 06A6; DOTLESS FEH WITH 4 DOTS ABOVE; D; FEH 213 | 06A7; DOTLESS QAF WITH DOT ABOVE; D; QAF 214 | 06A8; DOTLESS QAF WITH 3 DOTS ABOVE; D; QAF 215 | 06A9; KEHEH; D; GAF 216 | 06AA; SWASH KAF; D; SWASH KAF 217 | 06AB; KEHEH WITH ATTACHED RING BELOW; D; GAF 218 | 06AC; KAF WITH DOT ABOVE; D; KAF 219 | 06AD; KAF WITH 3 DOTS ABOVE; D; KAF 220 | 06AE; KAF WITH 3 DOTS BELOW; D; KAF 221 | 06AF; GAF; D; GAF 222 | 06B0; GAF WITH ATTACHED RING BELOW; D; GAF 223 | 06B1; GAF WITH 2 
DOTS ABOVE; D; GAF 224 | 06B2; GAF WITH 2 DOTS BELOW; D; GAF 225 | 06B3; GAF WITH VERTICAL 2 DOTS BELOW; D; GAF 226 | 06B4; GAF WITH 3 DOTS ABOVE; D; GAF 227 | 06B5; LAM WITH V ABOVE; D; LAM 228 | 06B6; LAM WITH DOT ABOVE; D; LAM 229 | 06B7; LAM WITH 3 DOTS ABOVE; D; LAM 230 | 06B8; LAM WITH 3 DOTS BELOW; D; LAM 231 | 06B9; NOON WITH DOT BELOW; D; NOON 232 | 06BA; DOTLESS NOON; D; NOON 233 | 06BB; DOTLESS NOON WITH TAH ABOVE; D; NOON 234 | 06BC; NOON WITH ATTACHED RING BELOW; D; NOON 235 | 06BD; NYA; D; NYA 236 | 06BE; KNOTTED HEH; D; KNOTTED HEH 237 | 06BF; HAH WITH 3 DOTS BELOW AND DOT ABOVE; D; HAH 238 | 06C0; DOTLESS TEH MARBUTA WITH HAMZA ABOVE; R; TEH MARBUTA 239 | 06C1; HEH GOAL; D; HEH GOAL 240 | 06C2; HEH GOAL WITH HAMZA ABOVE; D; HEH GOAL 241 | 06C3; TEH MARBUTA GOAL; R; TEH MARBUTA GOAL 242 | 06C4; WAW WITH ATTACHED RING WITHIN; R; WAW 243 | 06C5; WAW WITH BAR; R; WAW 244 | 06C6; WAW WITH V ABOVE; R; WAW 245 | 06C7; WAW WITH DAMMA ABOVE; R; WAW 246 | 06C8; WAW WITH ALEF ABOVE; R; WAW 247 | 06C9; WAW WITH INVERTED V ABOVE; R; WAW 248 | 06CA; WAW WITH 2 DOTS ABOVE; R; WAW 249 | 06CB; WAW WITH 3 DOTS ABOVE; R; WAW 250 | 06CC; FARSI YEH; D; FARSI YEH 251 | 06CD; YEH WITH TAIL; R; YEH WITH TAIL 252 | 06CE; FARSI YEH WITH V ABOVE; D; FARSI YEH 253 | 06CF; WAW WITH DOT ABOVE; R; WAW 254 | 06D0; DOTLESS YEH WITH VERTICAL 2 DOTS BELOW; D; YEH 255 | 06D1; DOTLESS YEH WITH 3 DOTS BELOW; D; YEH 256 | 06D2; YEH BARREE; R; YEH BARREE 257 | 06D3; YEH BARREE WITH HAMZA ABOVE; R; YEH BARREE 258 | 06D5; DOTLESS TEH MARBUTA; R; TEH MARBUTA 259 | 06DD; ARABIC END OF AYAH; U; No_Joining_Group 260 | 06EE; DAL WITH INVERTED V ABOVE; R; DAL 261 | 06EF; REH WITH INVERTED V ABOVE; R; REH 262 | 06FA; SEEN WITH DOT BELOW AND 3 DOTS ABOVE; D; SEEN 263 | 06FB; SAD WITH DOT BELOW AND DOT ABOVE; D; SAD 264 | 06FC; AIN WITH DOT BELOW AND DOT ABOVE; D; AIN 265 | 06FF; KNOTTED HEH WITH INVERTED V ABOVE; D; KNOTTED HEH 266 | 267 | # Syriac Characters 268 | 269 | 070F; SYRIAC ABBREVIATION 
MARK; T; No_Joining_Group 270 | 0710; ALAPH; R; ALAPH 271 | 0712; BETH; D; BETH 272 | 0713; GAMAL; D; GAMAL 273 | 0714; GAMAL GARSHUNI; D; GAMAL 274 | 0715; DALATH; R; DALATH RISH 275 | 0716; DOTLESS DALATH RISH; R; DALATH RISH 276 | 0717; HE; R; HE 277 | 0718; WAW; R; SYRIAC WAW 278 | 0719; ZAIN; R; ZAIN 279 | 071A; HETH; D; HETH 280 | 071B; TETH; D; TETH 281 | 071C; TETH GARSHUNI; D; TETH 282 | 071D; YUDH; D; YUDH 283 | 071E; YUDH HE; R; YUDH HE 284 | 071F; KAPH; D; KAPH 285 | 0720; LAMADH; D; LAMADH 286 | 0721; MIM; D; MIM 287 | 0722; NUN; D; NUN 288 | 0723; SEMKATH; D; SEMKATH 289 | 0724; FINAL SEMKATH; D; FINAL SEMKATH 290 | 0725; E; D; E 291 | 0726; PE; D; PE 292 | 0727; REVERSED PE; D; REVERSED PE 293 | 0728; SADHE; R; SADHE 294 | 0729; QAPH; D; QAPH 295 | 072A; RISH; R; DALATH RISH 296 | 072B; SHIN; D; SHIN 297 | 072C; TAW; R; TAW 298 | 072D; PERSIAN BHETH; D; BETH 299 | 072E; PERSIAN GHAMAL; D; GAMAL 300 | 072F; PERSIAN DHALATH; R; DALATH RISH 301 | 074D; SOGDIAN ZHAIN; R; ZHAIN 302 | 074E; SOGDIAN KHAPH; D; KHAPH 303 | 074F; SOGDIAN FE; D; FE 304 | 305 | # Arabic Supplement Characters 306 | 307 | 0750; DOTLESS BEH WITH HORIZONTAL 3 DOTS BELOW; D; BEH 308 | 0751; BEH WITH 3 DOTS ABOVE; D; BEH 309 | 0752; DOTLESS BEH WITH INVERTED 3 DOTS BELOW; D; BEH 310 | 0753; DOTLESS BEH WITH INVERTED 3 DOTS BELOW AND 2 DOTS ABOVE; D; BEH 311 | 0754; DOTLESS BEH WITH 2 DOTS BELOW AND DOT ABOVE; D; BEH 312 | 0755; DOTLESS BEH WITH INVERTED V BELOW; D; BEH 313 | 0756; DOTLESS BEH WITH V ABOVE; D; BEH 314 | 0757; HAH WITH 2 DOTS ABOVE; D; HAH 315 | 0758; HAH WITH INVERTED 3 DOTS BELOW; D; HAH 316 | 0759; DAL WITH VERTICAL 2 DOTS BELOW AND TAH ABOVE; R; DAL 317 | 075A; DAL WITH INVERTED V BELOW; R; DAL 318 | 075B; REH WITH BAR; R; REH 319 | 075C; SEEN WITH 4 DOTS ABOVE; D; SEEN 320 | 075D; AIN WITH 2 DOTS ABOVE; D; AIN 321 | 075E; AIN WITH INVERTED 3 DOTS ABOVE; D; AIN 322 | 075F; AIN WITH VERTICAL 2 DOTS ABOVE; D; AIN 323 | 0760; DOTLESS FEH WITH 2 DOTS BELOW; D; FEH 324 | 
0761; DOTLESS FEH WITH INVERTED 3 DOTS BELOW; D; FEH 325 | 0762; KEHEH WITH DOT ABOVE; D; GAF 326 | 0763; KEHEH WITH 3 DOTS ABOVE; D; GAF 327 | 0764; KEHEH WITH INVERTED 3 DOTS BELOW; D; GAF 328 | 0765; MEEM WITH DOT ABOVE; D; MEEM 329 | 0766; MEEM WITH DOT BELOW; D; MEEM 330 | 0767; NOON WITH 2 DOTS BELOW; D; NOON 331 | 0768; NOON WITH TAH ABOVE; D; NOON 332 | 0769; NOON WITH V ABOVE; D; NOON 333 | 076A; LAM WITH BAR; D; LAM 334 | 076B; REH WITH VERTICAL 2 DOTS ABOVE; R; REH 335 | 076C; REH WITH HAMZA ABOVE; R; REH 336 | 076D; SEEN WITH VERTICAL 2 DOTS ABOVE; D; SEEN 337 | 076E; HAH WITH TAH BELOW; D; HAH 338 | 076F; HAH WITH TAH AND 2 DOTS BELOW; D; HAH 339 | 0770; SEEN WITH 2 DOTS AND TAH ABOVE; D; SEEN 340 | 0771; REH WITH 2 DOTS AND TAH ABOVE; R; REH 341 | 0772; HAH WITH TAH ABOVE; D; HAH 342 | 0773; ALEF WITH DIGIT TWO ABOVE; R; ALEF 343 | 0774; ALEF WITH DIGIT THREE ABOVE; R; ALEF 344 | 0775; FARSI YEH WITH DIGIT TWO ABOVE; D; FARSI YEH 345 | 0776; FARSI YEH WITH DIGIT THREE ABOVE; D; FARSI YEH 346 | 0777; DOTLESS YEH WITH DIGIT FOUR BELOW; D; YEH 347 | 0778; WAW WITH DIGIT TWO ABOVE; R; WAW 348 | 0779; WAW WITH DIGIT THREE ABOVE; R; WAW 349 | 077A; BURUSHASKI YEH BARREE WITH DIGIT TWO ABOVE; D; BURUSHASKI YEH BARREE 350 | 077B; BURUSHASKI YEH BARREE WITH DIGIT THREE ABOVE; D; BURUSHASKI YEH BARREE 351 | 077C; HAH WITH DIGIT FOUR BELOW; D; HAH 352 | 077D; SEEN WITH DIGIT FOUR ABOVE; D; SEEN 353 | 077E; SEEN WITH INVERTED V ABOVE; D; SEEN 354 | 077F; KAF WITH 2 DOTS ABOVE; D; KAF 355 | 356 | # N'Ko Characters 357 | 358 | 07CA; NKO A; D; No_Joining_Group 359 | 07CB; NKO EE; D; No_Joining_Group 360 | 07CC; NKO I; D; No_Joining_Group 361 | 07CD; NKO E; D; No_Joining_Group 362 | 07CE; NKO U; D; No_Joining_Group 363 | 07CF; NKO OO; D; No_Joining_Group 364 | 07D0; NKO O; D; No_Joining_Group 365 | 07D1; NKO DAGBASINNA; D; No_Joining_Group 366 | 07D2; NKO N; D; No_Joining_Group 367 | 07D3; NKO BA; D; No_Joining_Group 368 | 07D4; NKO PA; D; No_Joining_Group 369 | 
07D5; NKO TA; D; No_Joining_Group 370 | 07D6; NKO JA; D; No_Joining_Group 371 | 07D7; NKO CHA; D; No_Joining_Group 372 | 07D8; NKO DA; D; No_Joining_Group 373 | 07D9; NKO RA; D; No_Joining_Group 374 | 07DA; NKO RRA; D; No_Joining_Group 375 | 07DB; NKO SA; D; No_Joining_Group 376 | 07DC; NKO GBA; D; No_Joining_Group 377 | 07DD; NKO FA; D; No_Joining_Group 378 | 07DE; NKO KA; D; No_Joining_Group 379 | 07DF; NKO LA; D; No_Joining_Group 380 | 07E0; NKO NA WOLOSO; D; No_Joining_Group 381 | 07E1; NKO MA; D; No_Joining_Group 382 | 07E2; NKO NYA; D; No_Joining_Group 383 | 07E3; NKO NA; D; No_Joining_Group 384 | 07E4; NKO HA; D; No_Joining_Group 385 | 07E5; NKO WA; D; No_Joining_Group 386 | 07E6; NKO YA; D; No_Joining_Group 387 | 07E7; NKO NYA WOLOSO; D; No_Joining_Group 388 | 07E8; NKO JONA JA; D; No_Joining_Group 389 | 07E9; NKO JONA CHA; D; No_Joining_Group 390 | 07EA; NKO JONA RA; D; No_Joining_Group 391 | 07FA; NKO LAJANYALAN; C; No_Joining_Group 392 | 393 | # Mandaic Characters 394 | 395 | 0840; MANDAIC HALQA; R; No_Joining_Group 396 | 0841; MANDAIC AB; D; No_Joining_Group 397 | 0842; MANDAIC AG; D; No_Joining_Group 398 | 0843; MANDAIC AD; D; No_Joining_Group 399 | 0844; MANDAIC AH; D; No_Joining_Group 400 | 0845; MANDAIC USHENNA; D; No_Joining_Group 401 | 0846; MANDAIC AZ; R; No_Joining_Group 402 | 0847; MANDAIC IT; R; No_Joining_Group 403 | 0848; MANDAIC ATT; D; No_Joining_Group 404 | 0849; MANDAIC AKSA; R; No_Joining_Group 405 | 084A; MANDAIC AK; D; No_Joining_Group 406 | 084B; MANDAIC AL; D; No_Joining_Group 407 | 084C; MANDAIC AM; D; No_Joining_Group 408 | 084D; MANDAIC AN; D; No_Joining_Group 409 | 084E; MANDAIC AS; D; No_Joining_Group 410 | 084F; MANDAIC IN; D; No_Joining_Group 411 | 0850; MANDAIC AP; D; No_Joining_Group 412 | 0851; MANDAIC ASZ; D; No_Joining_Group 413 | 0852; MANDAIC AQ; D; No_Joining_Group 414 | 0853; MANDAIC AR; D; No_Joining_Group 415 | 0854; MANDAIC ASH; R; No_Joining_Group 416 | 0855; MANDAIC AT; D; No_Joining_Group 417 | 0856; MANDAIC 
DUSHENNA; R; No_Joining_Group 418 | 0857; MANDAIC KAD; R; No_Joining_Group 419 | 0858; MANDAIC AIN; R; No_Joining_Group 420 | 421 | # Syriac Supplement Characters 422 | 423 | 0860; MALAYALAM NGA; D; MALAYALAM NGA 424 | 0861; MALAYALAM JA; U; MALAYALAM JA 425 | 0862; MALAYALAM NYA; D; MALAYALAM NYA 426 | 0863; MALAYALAM TTA; D; MALAYALAM TTA 427 | 0864; MALAYALAM NNA; D; MALAYALAM NNA 428 | 0865; MALAYALAM NNNA; D; MALAYALAM NNNA 429 | 0866; MALAYALAM BHA; U; MALAYALAM BHA 430 | 0867; MALAYALAM RA; R; MALAYALAM RA 431 | 0868; MALAYALAM LLA; D; MALAYALAM LLA 432 | 0869; MALAYALAM LLLA; R; MALAYALAM LLLA 433 | 086A; MALAYALAM SSA; R; MALAYALAM SSA 434 | 435 | # Arabic Extended-A Characters 436 | 437 | 08A0; DOTLESS BEH WITH V BELOW; D; BEH 438 | 08A1; BEH WITH HAMZA ABOVE; D; BEH 439 | 08A2; HAH WITH DOT BELOW AND 2 DOTS ABOVE; D; HAH 440 | 08A3; TAH WITH 2 DOTS ABOVE; D; TAH 441 | 08A4; DOTLESS FEH WITH DOT BELOW AND 3 DOTS ABOVE; D; FEH 442 | 08A5; QAF WITH DOT BELOW; D; QAF 443 | 08A6; LAM WITH DOUBLE BAR; D; LAM 444 | 08A7; MEEM WITH 3 DOTS ABOVE; D; MEEM 445 | 08A8; YEH WITH HAMZA ABOVE; D; YEH 446 | 08A9; YEH WITH DOT ABOVE; D; YEH 447 | 08AA; REH WITH LOOP; R; REH 448 | 08AB; WAW WITH DOT WITHIN; R; WAW 449 | 08AC; ROHINGYA YEH; R; ROHINGYA YEH 450 | 08AD; LOW ALEF; U; No_Joining_Group 451 | 08AE; DAL WITH 3 DOTS BELOW; R; DAL 452 | 08AF; SAD WITH 3 DOTS BELOW; D; SAD 453 | 08B0; KEHEH WITH STROKE BELOW; D; GAF 454 | 08B1; STRAIGHT WAW; R; STRAIGHT WAW 455 | 08B2; REH WITH DOT AND INVERTED V ABOVE; R; REH 456 | 08B3; AIN WITH 3 DOTS BELOW; D; AIN 457 | 08B4; KAF WITH DOT BELOW; D; KAF 458 | 08B6; BEH WITH MEEM ABOVE; D; BEH 459 | 08B7; DOTLESS BEH WITH 3 DOTS BELOW AND MEEM ABOVE; D; BEH 460 | 08B8; DOTLESS BEH WITH TEH ABOVE; D; BEH 461 | 08B9; REH WITH NOON ABOVE; R; REH 462 | 08BA; YEH WITH NOON ABOVE; D; YEH 463 | 08BB; AFRICAN FEH; D; AFRICAN FEH 464 | 08BC; AFRICAN QAF; D; AFRICAN QAF 465 | 08BD; AFRICAN NOON; D; AFRICAN NOON 466 | 08BE; DOTLESS BEH WITH 
3 DOTS BELOW AND V ABOVE; D; BEH 467 | 08BF; DOTLESS BEH WITH 2 DOTS AND V ABOVE; D; BEH 468 | 08C0; DOTLESS BEH WITH TAH AND V ABOVE; D; BEH 469 | 08C1; HAH WITH 3 DOTS BELOW AND V ABOVE; D; HAH 470 | 08C2; KEHEH WITH V ABOVE; D; GAF 471 | 08C3; AIN WITH DIAMOND 4 DOTS ABOVE; D; AIN 472 | 08C4; AFRICAN QAF WITH 3 DOTS ABOVE; D; AFRICAN QAF 473 | 08C5; HAH WITH DOT BELOW AND 3 DOTS ABOVE; D; HAH 474 | 08C6; HAH WITH DIAMOND 4 DOTS BELOW; D; HAH 475 | 08C7; LAM WITH TAH ABOVE; D; LAM 476 | 08E2; ARABIC DISPUTED END OF AYAH; U; No_Joining_Group 477 | 478 | # Mongolian Characters 479 | 480 | 1806; MONGOLIAN TODO SOFT HYPHEN; U; No_Joining_Group 481 | 1807; MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER; D; No_Joining_Group 482 | 180A; MONGOLIAN NIRUGU; C; No_Joining_Group 483 | 180E; MONGOLIAN VOWEL SEPARATOR; U; No_Joining_Group 484 | 1820; MONGOLIAN A; D; No_Joining_Group 485 | 1821; MONGOLIAN E; D; No_Joining_Group 486 | 1822; MONGOLIAN I; D; No_Joining_Group 487 | 1823; MONGOLIAN O; D; No_Joining_Group 488 | 1824; MONGOLIAN U; D; No_Joining_Group 489 | 1825; MONGOLIAN OE; D; No_Joining_Group 490 | 1826; MONGOLIAN UE; D; No_Joining_Group 491 | 1827; MONGOLIAN EE; D; No_Joining_Group 492 | 1828; MONGOLIAN NA; D; No_Joining_Group 493 | 1829; MONGOLIAN ANG; D; No_Joining_Group 494 | 182A; MONGOLIAN BA; D; No_Joining_Group 495 | 182B; MONGOLIAN PA; D; No_Joining_Group 496 | 182C; MONGOLIAN QA; D; No_Joining_Group 497 | 182D; MONGOLIAN GA; D; No_Joining_Group 498 | 182E; MONGOLIAN MA; D; No_Joining_Group 499 | 182F; MONGOLIAN LA; D; No_Joining_Group 500 | 1830; MONGOLIAN SA; D; No_Joining_Group 501 | 1831; MONGOLIAN SHA; D; No_Joining_Group 502 | 1832; MONGOLIAN TA; D; No_Joining_Group 503 | 1833; MONGOLIAN DA; D; No_Joining_Group 504 | 1834; MONGOLIAN CHA; D; No_Joining_Group 505 | 1835; MONGOLIAN JA; D; No_Joining_Group 506 | 1836; MONGOLIAN YA; D; No_Joining_Group 507 | 1837; MONGOLIAN RA; D; No_Joining_Group 508 | 1838; MONGOLIAN WA; D; No_Joining_Group 509 | 1839; 
MONGOLIAN FA; D; No_Joining_Group 510 | 183A; MONGOLIAN KA; D; No_Joining_Group 511 | 183B; MONGOLIAN KHA; D; No_Joining_Group 512 | 183C; MONGOLIAN TSA; D; No_Joining_Group 513 | 183D; MONGOLIAN ZA; D; No_Joining_Group 514 | 183E; MONGOLIAN HAA; D; No_Joining_Group 515 | 183F; MONGOLIAN ZRA; D; No_Joining_Group 516 | 1840; MONGOLIAN LHA; D; No_Joining_Group 517 | 1841; MONGOLIAN ZHI; D; No_Joining_Group 518 | 1842; MONGOLIAN CHI; D; No_Joining_Group 519 | 1843; MONGOLIAN TODO LONG VOWEL SIGN; D; No_Joining_Group 520 | 1844; MONGOLIAN TODO E; D; No_Joining_Group 521 | 1845; MONGOLIAN TODO I; D; No_Joining_Group 522 | 1846; MONGOLIAN TODO O; D; No_Joining_Group 523 | 1847; MONGOLIAN TODO U; D; No_Joining_Group 524 | 1848; MONGOLIAN TODO OE; D; No_Joining_Group 525 | 1849; MONGOLIAN TODO UE; D; No_Joining_Group 526 | 184A; MONGOLIAN TODO ANG; D; No_Joining_Group 527 | 184B; MONGOLIAN TODO BA; D; No_Joining_Group 528 | 184C; MONGOLIAN TODO PA; D; No_Joining_Group 529 | 184D; MONGOLIAN TODO QA; D; No_Joining_Group 530 | 184E; MONGOLIAN TODO GA; D; No_Joining_Group 531 | 184F; MONGOLIAN TODO MA; D; No_Joining_Group 532 | 1850; MONGOLIAN TODO TA; D; No_Joining_Group 533 | 1851; MONGOLIAN TODO DA; D; No_Joining_Group 534 | 1852; MONGOLIAN TODO CHA; D; No_Joining_Group 535 | 1853; MONGOLIAN TODO JA; D; No_Joining_Group 536 | 1854; MONGOLIAN TODO TSA; D; No_Joining_Group 537 | 1855; MONGOLIAN TODO YA; D; No_Joining_Group 538 | 1856; MONGOLIAN TODO WA; D; No_Joining_Group 539 | 1857; MONGOLIAN TODO KA; D; No_Joining_Group 540 | 1858; MONGOLIAN TODO GAA; D; No_Joining_Group 541 | 1859; MONGOLIAN TODO HAA; D; No_Joining_Group 542 | 185A; MONGOLIAN TODO JIA; D; No_Joining_Group 543 | 185B; MONGOLIAN TODO NIA; D; No_Joining_Group 544 | 185C; MONGOLIAN TODO DZA; D; No_Joining_Group 545 | 185D; MONGOLIAN SIBE E; D; No_Joining_Group 546 | 185E; MONGOLIAN SIBE I; D; No_Joining_Group 547 | 185F; MONGOLIAN SIBE IY; D; No_Joining_Group 548 | 1860; MONGOLIAN SIBE UE; D; No_Joining_Group 
549 | 1861; MONGOLIAN SIBE U; D; No_Joining_Group 550 | 1862; MONGOLIAN SIBE ANG; D; No_Joining_Group 551 | 1863; MONGOLIAN SIBE KA; D; No_Joining_Group 552 | 1864; MONGOLIAN SIBE GA; D; No_Joining_Group 553 | 1865; MONGOLIAN SIBE HA; D; No_Joining_Group 554 | 1866; MONGOLIAN SIBE PA; D; No_Joining_Group 555 | 1867; MONGOLIAN SIBE SHA; D; No_Joining_Group 556 | 1868; MONGOLIAN SIBE TA; D; No_Joining_Group 557 | 1869; MONGOLIAN SIBE DA; D; No_Joining_Group 558 | 186A; MONGOLIAN SIBE JA; D; No_Joining_Group 559 | 186B; MONGOLIAN SIBE FA; D; No_Joining_Group 560 | 186C; MONGOLIAN SIBE GAA; D; No_Joining_Group 561 | 186D; MONGOLIAN SIBE HAA; D; No_Joining_Group 562 | 186E; MONGOLIAN SIBE TSA; D; No_Joining_Group 563 | 186F; MONGOLIAN SIBE ZA; D; No_Joining_Group 564 | 1870; MONGOLIAN SIBE RAA; D; No_Joining_Group 565 | 1871; MONGOLIAN SIBE CHA; D; No_Joining_Group 566 | 1872; MONGOLIAN SIBE ZHA; D; No_Joining_Group 567 | 1873; MONGOLIAN MANCHU I; D; No_Joining_Group 568 | 1874; MONGOLIAN MANCHU KA; D; No_Joining_Group 569 | 1875; MONGOLIAN MANCHU RA; D; No_Joining_Group 570 | 1876; MONGOLIAN MANCHU FA; D; No_Joining_Group 571 | 1877; MONGOLIAN MANCHU ZHA; D; No_Joining_Group 572 | 1878; MONGOLIAN MANCHU CHA WITH 2 DOTS; D; No_Joining_Group 573 | 1880; MONGOLIAN ALI GALI ANUSVARA ONE; U; No_Joining_Group 574 | 1881; MONGOLIAN ALI GALI VISARGA ONE; U; No_Joining_Group 575 | 1882; MONGOLIAN ALI GALI DAMARU; U; No_Joining_Group 576 | 1883; MONGOLIAN ALI GALI UBADAMA; U; No_Joining_Group 577 | 1884; MONGOLIAN ALI GALI INVERTED UBADAMA; U; No_Joining_Group 578 | 1885; MONGOLIAN ALI GALI BALUDA; T; No_Joining_Group 579 | 1886; MONGOLIAN ALI GALI THREE BALUDA; T; No_Joining_Group 580 | 1887; MONGOLIAN ALI GALI A; D; No_Joining_Group 581 | 1888; MONGOLIAN ALI GALI I; D; No_Joining_Group 582 | 1889; MONGOLIAN ALI GALI KA; D; No_Joining_Group 583 | 188A; MONGOLIAN ALI GALI NGA; D; No_Joining_Group 584 | 188B; MONGOLIAN ALI GALI CA; D; No_Joining_Group 585 | 188C; MONGOLIAN ALI 
GALI TTA; D; No_Joining_Group 586 | 188D; MONGOLIAN ALI GALI TTHA; D; No_Joining_Group 587 | 188E; MONGOLIAN ALI GALI DDA; D; No_Joining_Group 588 | 188F; MONGOLIAN ALI GALI NNA; D; No_Joining_Group 589 | 1890; MONGOLIAN ALI GALI TA; D; No_Joining_Group 590 | 1891; MONGOLIAN ALI GALI DA; D; No_Joining_Group 591 | 1892; MONGOLIAN ALI GALI PA; D; No_Joining_Group 592 | 1893; MONGOLIAN ALI GALI PHA; D; No_Joining_Group 593 | 1894; MONGOLIAN ALI GALI SSA; D; No_Joining_Group 594 | 1895; MONGOLIAN ALI GALI ZHA; D; No_Joining_Group 595 | 1896; MONGOLIAN ALI GALI ZA; D; No_Joining_Group 596 | 1897; MONGOLIAN ALI GALI AH; D; No_Joining_Group 597 | 1898; MONGOLIAN TODO ALI GALI TA; D; No_Joining_Group 598 | 1899; MONGOLIAN TODO ALI GALI ZHA; D; No_Joining_Group 599 | 189A; MONGOLIAN MANCHU ALI GALI GHA; D; No_Joining_Group 600 | 189B; MONGOLIAN MANCHU ALI GALI NGA; D; No_Joining_Group 601 | 189C; MONGOLIAN MANCHU ALI GALI CA; D; No_Joining_Group 602 | 189D; MONGOLIAN MANCHU ALI GALI JHA; D; No_Joining_Group 603 | 189E; MONGOLIAN MANCHU ALI GALI TTA; D; No_Joining_Group 604 | 189F; MONGOLIAN MANCHU ALI GALI DDHA; D; No_Joining_Group 605 | 18A0; MONGOLIAN MANCHU ALI GALI TA; D; No_Joining_Group 606 | 18A1; MONGOLIAN MANCHU ALI GALI DHA; D; No_Joining_Group 607 | 18A2; MONGOLIAN MANCHU ALI GALI SSA; D; No_Joining_Group 608 | 18A3; MONGOLIAN MANCHU ALI GALI CYA; D; No_Joining_Group 609 | 18A4; MONGOLIAN MANCHU ALI GALI ZHA; D; No_Joining_Group 610 | 18A5; MONGOLIAN MANCHU ALI GALI ZA; D; No_Joining_Group 611 | 18A6; MONGOLIAN ALI GALI HALF U; D; No_Joining_Group 612 | 18A7; MONGOLIAN ALI GALI HALF YA; D; No_Joining_Group 613 | 18A8; MONGOLIAN MANCHU ALI GALI BHA; D; No_Joining_Group 614 | 18AA; MONGOLIAN MANCHU ALI GALI LHA; D; No_Joining_Group 615 | 616 | # Other 617 | 618 | 200C; ZERO WIDTH NON-JOINER; U; No_Joining_Group 619 | 200D; ZERO WIDTH JOINER; C; No_Joining_Group 620 | 202F; NARROW NO-BREAK SPACE; U; No_Joining_Group 621 | 2066; LEFT-TO-RIGHT ISOLATE; U; 
No_Joining_Group 622 | 2067; RIGHT-TO-LEFT ISOLATE; U; No_Joining_Group 623 | 2068; FIRST STRONG ISOLATE; U; No_Joining_Group 624 | 2069; POP DIRECTIONAL ISOLATE; U; No_Joining_Group 625 | 626 | # Phags-Pa Characters 627 | 628 | A840; PHAGS-PA KA; D; No_Joining_Group 629 | A841; PHAGS-PA KHA; D; No_Joining_Group 630 | A842; PHAGS-PA GA; D; No_Joining_Group 631 | A843; PHAGS-PA NGA; D; No_Joining_Group 632 | A844; PHAGS-PA CA; D; No_Joining_Group 633 | A845; PHAGS-PA CHA; D; No_Joining_Group 634 | A846; PHAGS-PA JA; D; No_Joining_Group 635 | A847; PHAGS-PA NYA; D; No_Joining_Group 636 | A848; PHAGS-PA TA; D; No_Joining_Group 637 | A849; PHAGS-PA THA; D; No_Joining_Group 638 | A84A; PHAGS-PA DA; D; No_Joining_Group 639 | A84B; PHAGS-PA NA; D; No_Joining_Group 640 | A84C; PHAGS-PA PA; D; No_Joining_Group 641 | A84D; PHAGS-PA PHA; D; No_Joining_Group 642 | A84E; PHAGS-PA BA; D; No_Joining_Group 643 | A84F; PHAGS-PA MA; D; No_Joining_Group 644 | A850; PHAGS-PA TSA; D; No_Joining_Group 645 | A851; PHAGS-PA TSHA; D; No_Joining_Group 646 | A852; PHAGS-PA DZA; D; No_Joining_Group 647 | A853; PHAGS-PA WA; D; No_Joining_Group 648 | A854; PHAGS-PA ZHA; D; No_Joining_Group 649 | A855; PHAGS-PA ZA; D; No_Joining_Group 650 | A856; PHAGS-PA SMALL A; D; No_Joining_Group 651 | A857; PHAGS-PA YA; D; No_Joining_Group 652 | A858; PHAGS-PA RA; D; No_Joining_Group 653 | A859; PHAGS-PA LA; D; No_Joining_Group 654 | A85A; PHAGS-PA SHA; D; No_Joining_Group 655 | A85B; PHAGS-PA SA; D; No_Joining_Group 656 | A85C; PHAGS-PA HA; D; No_Joining_Group 657 | A85D; PHAGS-PA A; D; No_Joining_Group 658 | A85E; PHAGS-PA I; D; No_Joining_Group 659 | A85F; PHAGS-PA U; D; No_Joining_Group 660 | A860; PHAGS-PA E; D; No_Joining_Group 661 | A861; PHAGS-PA O; D; No_Joining_Group 662 | A862; PHAGS-PA QA; D; No_Joining_Group 663 | A863; PHAGS-PA XA; D; No_Joining_Group 664 | A864; PHAGS-PA FA; D; No_Joining_Group 665 | A865; PHAGS-PA GGA; D; No_Joining_Group 666 | A866; PHAGS-PA EE; D; No_Joining_Group 667 | 
A867; PHAGS-PA SUBJOINED WA; D; No_Joining_Group 668 | A868; PHAGS-PA SUBJOINED YA; D; No_Joining_Group 669 | A869; PHAGS-PA TTA; D; No_Joining_Group 670 | A86A; PHAGS-PA TTHA; D; No_Joining_Group 671 | A86B; PHAGS-PA DDA; D; No_Joining_Group 672 | A86C; PHAGS-PA NNA; D; No_Joining_Group 673 | A86D; PHAGS-PA ALTERNATE YA; D; No_Joining_Group 674 | A86E; PHAGS-PA VOICELESS SHA; D; No_Joining_Group 675 | A86F; PHAGS-PA VOICED HA; D; No_Joining_Group 676 | A870; PHAGS-PA ASPIRATED FA; D; No_Joining_Group 677 | A871; PHAGS-PA SUBJOINED RA; D; No_Joining_Group 678 | A872; PHAGS-PA SUPERFIXED RA; L; No_Joining_Group 679 | A873; PHAGS-PA CANDRABINDU; U; No_Joining_Group 680 | 681 | # Manichaean Characters 682 | 683 | 10AC0; MANICHAEAN ALEPH; D; MANICHAEAN ALEPH 684 | 10AC1; MANICHAEAN BETH; D; MANICHAEAN BETH 685 | 10AC2; MANICHAEAN BETH WITH 2 DOTS ABOVE; D; MANICHAEAN BETH 686 | 10AC3; MANICHAEAN GIMEL; D; MANICHAEAN GIMEL 687 | 10AC4; MANICHAEAN GIMEL WITH ATTACHED RING BELOW; D; MANICHAEAN GIMEL 688 | 10AC5; MANICHAEAN DALETH; R; MANICHAEAN DALETH 689 | 10AC6; MANICHAEAN HE; U; No_Joining_Group 690 | 10AC7; MANICHAEAN WAW; R; MANICHAEAN WAW 691 | 10AC8; MANICHAEAN UD; U; No_Joining_Group 692 | 10AC9; MANICHAEAN ZAYIN; R; MANICHAEAN ZAYIN 693 | 10ACA; MANICHAEAN ZAYIN WITH 2 DOTS ABOVE; R; MANICHAEAN ZAYIN 694 | 10ACB; MANICHAEAN JAYIN; U; No_Joining_Group 695 | 10ACC; MANICHAEAN JAYIN WITH 2 DOTS ABOVE; U; No_Joining_Group 696 | 10ACD; MANICHAEAN HETH; L; MANICHAEAN HETH 697 | 10ACE; MANICHAEAN TETH; R; MANICHAEAN TETH 698 | 10ACF; MANICHAEAN YODH; R; MANICHAEAN YODH 699 | 10AD0; MANICHAEAN KAPH; R; MANICHAEAN KAPH 700 | 10AD1; MANICHAEAN KAPH WITH DOT ABOVE; R; MANICHAEAN KAPH 701 | 10AD2; MANICHAEAN KAPH WITH 2 DOTS ABOVE; R; MANICHAEAN KAPH 702 | 10AD3; MANICHAEAN LAMEDH; D; MANICHAEAN LAMEDH 703 | 10AD4; MANICHAEAN DHAMEDH; D; MANICHAEAN DHAMEDH 704 | 10AD5; MANICHAEAN THAMEDH; D; MANICHAEAN THAMEDH 705 | 10AD6; MANICHAEAN MEM; D; MANICHAEAN MEM 706 | 10AD7; 
MANICHAEAN NUN; L; MANICHAEAN NUN 707 | 10AD8; MANICHAEAN SAMEKH; D; MANICHAEAN SAMEKH 708 | 10AD9; MANICHAEAN AYIN; D; MANICHAEAN AYIN 709 | 10ADA; MANICHAEAN AYIN WITH 2 DOTS ABOVE; D; MANICHAEAN AYIN 710 | 10ADB; MANICHAEAN PE; D; MANICHAEAN PE 711 | 10ADC; MANICHAEAN PE WITH DOT ABOVE; D; MANICHAEAN PE 712 | 10ADD; MANICHAEAN SADHE; R; MANICHAEAN SADHE 713 | 10ADE; MANICHAEAN QOPH; D; MANICHAEAN QOPH 714 | 10ADF; MANICHAEAN QOPH WITH DOT ABOVE; D; MANICHAEAN QOPH 715 | 10AE0; MANICHAEAN QOPH WITH 2 DOTS ABOVE; D; MANICHAEAN QOPH 716 | 10AE1; MANICHAEAN RESH; R; MANICHAEAN RESH 717 | 10AE2; MANICHAEAN SHIN; U; No_Joining_Group 718 | 10AE3; MANICHAEAN SHIN WITH 2 DOTS ABOVE; U; No_Joining_Group 719 | 10AE4; MANICHAEAN TAW; R; MANICHAEAN TAW 720 | 10AEB; MANICHAEAN ONE; D; MANICHAEAN ONE 721 | 10AEC; MANICHAEAN FIVE; D; MANICHAEAN FIVE 722 | 10AED; MANICHAEAN TEN; D; MANICHAEAN TEN 723 | 10AEE; MANICHAEAN TWENTY; D; MANICHAEAN TWENTY 724 | 10AEF; MANICHAEAN HUNDRED; R; MANICHAEAN HUNDRED 725 | 726 | # Psalter Pahlavi Characters 727 | 728 | 10B80; PSALTER PAHLAVI ALEPH; D; No_Joining_Group 729 | 10B81; PSALTER PAHLAVI BETH; R; No_Joining_Group 730 | 10B82; PSALTER PAHLAVI GIMEL; D; No_Joining_Group 731 | 10B83; PSALTER PAHLAVI DALETH; R; No_Joining_Group 732 | 10B84; PSALTER PAHLAVI HE; R; No_Joining_Group 733 | 10B85; PSALTER PAHLAVI WAW-AYIN-RESH; R; No_Joining_Group 734 | 10B86; PSALTER PAHLAVI ZAYIN; D; No_Joining_Group 735 | 10B87; PSALTER PAHLAVI HETH; D; No_Joining_Group 736 | 10B88; PSALTER PAHLAVI YODH; D; No_Joining_Group 737 | 10B89; PSALTER PAHLAVI KAPH; R; No_Joining_Group 738 | 10B8A; PSALTER PAHLAVI LAMEDH; D; No_Joining_Group 739 | 10B8B; PSALTER PAHLAVI MEM-QOPH; D; No_Joining_Group 740 | 10B8C; PSALTER PAHLAVI NUN; R; No_Joining_Group 741 | 10B8D; PSALTER PAHLAVI SAMEKH; D; No_Joining_Group 742 | 10B8E; PSALTER PAHLAVI PE; R; No_Joining_Group 743 | 10B8F; PSALTER PAHLAVI SADHE; R; No_Joining_Group 744 | 10B90; PSALTER PAHLAVI SHIN; D; 
No_Joining_Group 745 | 10B91; PSALTER PAHLAVI TAW; R; No_Joining_Group 746 | 10BA9; PSALTER PAHLAVI ONE; R; No_Joining_Group 747 | 10BAA; PSALTER PAHLAVI TWO; R; No_Joining_Group 748 | 10BAB; PSALTER PAHLAVI THREE; R; No_Joining_Group 749 | 10BAC; PSALTER PAHLAVI FOUR; R; No_Joining_Group 750 | 10BAD; PSALTER PAHLAVI TEN; D; No_Joining_Group 751 | 10BAE; PSALTER PAHLAVI TWENTY; D; No_Joining_Group 752 | 10BAF; PSALTER PAHLAVI HUNDRED; U; No_Joining_Group 753 | 754 | # Hanifi Rohingya Characters 755 | 756 | 10D00; HANIFI ROHINGYA A; L; No_Joining_Group 757 | 10D01; HANIFI ROHINGYA BA; D; No_Joining_Group 758 | 10D02; HANIFI ROHINGYA PA; D; HANIFI ROHINGYA PA 759 | 10D03; HANIFI ROHINGYA TA; D; No_Joining_Group 760 | 10D04; HANIFI ROHINGYA TTA; D; No_Joining_Group 761 | 10D05; HANIFI ROHINGYA JA; D; No_Joining_Group 762 | 10D06; HANIFI ROHINGYA CA; D; No_Joining_Group 763 | 10D07; HANIFI ROHINGYA HA; D; No_Joining_Group 764 | 10D08; HANIFI ROHINGYA KHA; D; No_Joining_Group 765 | 10D09; HANIFI ROHINGYA PA WITH DOT ABOVE; D; HANIFI ROHINGYA PA 766 | 10D0A; HANIFI ROHINGYA DA; D; No_Joining_Group 767 | 10D0B; HANIFI ROHINGYA DDA; D; No_Joining_Group 768 | 10D0C; HANIFI ROHINGYA RA; D; No_Joining_Group 769 | 10D0D; HANIFI ROHINGYA RRA; D; No_Joining_Group 770 | 10D0E; HANIFI ROHINGYA ZA; D; No_Joining_Group 771 | 10D0F; HANIFI ROHINGYA SA; D; No_Joining_Group 772 | 10D10; HANIFI ROHINGYA SHA; D; No_Joining_Group 773 | 10D11; HANIFI ROHINGYA KA; D; No_Joining_Group 774 | 10D12; HANIFI ROHINGYA GA; D; No_Joining_Group 775 | 10D13; HANIFI ROHINGYA LA; D; No_Joining_Group 776 | 10D14; HANIFI ROHINGYA MA; D; No_Joining_Group 777 | 10D15; HANIFI ROHINGYA NA; D; No_Joining_Group 778 | 10D16; HANIFI ROHINGYA WA; D; No_Joining_Group 779 | 10D17; HANIFI ROHINGYA KINNA WA; D; No_Joining_Group 780 | 10D18; HANIFI ROHINGYA YA; D; No_Joining_Group 781 | 10D19; HANIFI ROHINGYA KINNA YA; D; HANIFI ROHINGYA KINNA YA 782 | 10D1A; HANIFI ROHINGYA NGA; D; No_Joining_Group 783 | 10D1B; 
HANIFI ROHINGYA NYA; D; No_Joining_Group 784 | 10D1C; HANIFI ROHINGYA PA WITH 3 DOTS ABOVE; D; HANIFI ROHINGYA PA 785 | 10D1D; HANIFI ROHINGYA VOWEL A; D; No_Joining_Group 786 | 10D1E; HANIFI ROHINGYA DOTLESS KINNA YA WITH LEFT-FACING HOOK BELOW; D; HANIFI ROHINGYA KINNA YA 787 | 10D1F; HANIFI ROHINGYA VOWEL U; D; No_Joining_Group 788 | 10D20; HANIFI ROHINGYA DOTLESS KINNA YA WITH RIGHT-FACING HOOK BELOW; D; HANIFI ROHINGYA KINNA YA 789 | 10D21; HANIFI ROHINGYA VOWEL O; D; No_Joining_Group 790 | 10D22; HANIFI ROHINGYA SAKIN; R; No_Joining_Group 791 | 10D23; HANIFI ROHINGYA DOTLESS KINNA YA WITH DOT ABOVE; D; HANIFI ROHINGYA KINNA YA 792 | 793 | # Sogdian Characters 794 | 795 | 10F30; SOGDIAN ALEPH; D; No_Joining_Group 796 | 10F31; SOGDIAN BETH; D; No_Joining_Group 797 | 10F32; SOGDIAN GIMEL; D; No_Joining_Group 798 | 10F33; SOGDIAN HE; R; No_Joining_Group 799 | 10F34; SOGDIAN WAW; D; No_Joining_Group 800 | 10F35; SOGDIAN ZAYIN; D; No_Joining_Group 801 | 10F36; SOGDIAN HETH; D; No_Joining_Group 802 | 10F37; SOGDIAN YODH; D; No_Joining_Group 803 | 10F38; SOGDIAN KAPH; D; No_Joining_Group 804 | 10F39; SOGDIAN LAMEDH; D; No_Joining_Group 805 | 10F3A; SOGDIAN MEM; D; No_Joining_Group 806 | 10F3B; SOGDIAN NUN; D; No_Joining_Group 807 | 10F3C; SOGDIAN SAMEKH; D; No_Joining_Group 808 | 10F3D; SOGDIAN AYIN; D; No_Joining_Group 809 | 10F3E; SOGDIAN PE; D; No_Joining_Group 810 | 10F3F; SOGDIAN SADHE; D; No_Joining_Group 811 | 10F40; SOGDIAN RESH-AYIN; D; No_Joining_Group 812 | 10F41; SOGDIAN SHIN; D; No_Joining_Group 813 | 10F42; SOGDIAN TAW; D; No_Joining_Group 814 | 10F43; SOGDIAN FETH; D; No_Joining_Group 815 | 10F44; SOGDIAN LESH; D; No_Joining_Group 816 | 10F45; SOGDIAN INDEPENDENT SHIN; U; No_Joining_Group 817 | 10F51; SOGDIAN ONE; D; No_Joining_Group 818 | 10F52; SOGDIAN TEN; D; No_Joining_Group 819 | 10F53; SOGDIAN TWENTY; D; No_Joining_Group 820 | 10F54; SOGDIAN ONE HUNDRED; R; No_Joining_Group 821 | 822 | # Chorasmian Characters 823 | 824 | 10FB0; CHORASMIAN ALEPH; 
D; No_Joining_Group 825 | 10FB1; CHORASMIAN SMALL ALEPH; U; No_Joining_Group 826 | 10FB2; CHORASMIAN BETH; D; No_Joining_Group 827 | 10FB3; CHORASMIAN GIMEL; D; No_Joining_Group 828 | 10FB4; CHORASMIAN DALETH; R; No_Joining_Group 829 | 10FB5; CHORASMIAN HE; R; No_Joining_Group 830 | 10FB6; CHORASMIAN WAW; R; No_Joining_Group 831 | 10FB7; CHORASMIAN CURLED WAW; U; No_Joining_Group 832 | 10FB8; CHORASMIAN ZAYIN; D; No_Joining_Group 833 | 10FB9; CHORASMIAN HETH; R; No_Joining_Group 834 | 10FBA; CHORASMIAN YODH; R; No_Joining_Group 835 | 10FBB; CHORASMIAN KAPH; D; No_Joining_Group 836 | 10FBC; CHORASMIAN LAMEDH; D; No_Joining_Group 837 | 10FBD; CHORASMIAN MEM; R; No_Joining_Group 838 | 10FBE; CHORASMIAN NUN; D; No_Joining_Group 839 | 10FBF; CHORASMIAN SAMEKH; D; No_Joining_Group 840 | 10FC0; CHORASMIAN AYIN; U; No_Joining_Group 841 | 10FC1; CHORASMIAN PE; D; No_Joining_Group 842 | 10FC2; CHORASMIAN RESH; R; No_Joining_Group 843 | 10FC3; CHORASMIAN SHIN; R; No_Joining_Group 844 | 10FC4; CHORASMIAN TAW; D; No_Joining_Group 845 | 10FC5; CHORASMIAN ONE; U; No_Joining_Group 846 | 10FC6; CHORASMIAN TWO; U; No_Joining_Group 847 | 10FC7; CHORASMIAN THREE; U; No_Joining_Group 848 | 10FC8; CHORASMIAN FOUR; U; No_Joining_Group 849 | 10FC9; CHORASMIAN TEN; R; No_Joining_Group 850 | 10FCA; CHORASMIAN TWENTY; D; No_Joining_Group 851 | 10FCB; CHORASMIAN ONE HUNDRED; L; No_Joining_Group 852 | 853 | # Kaithi Number Signs 854 | # These are prepended concatenation marks, comparable 855 | # to the number signs in the Arabic script. 856 | # Listed here for consistency in property values. 
857 | 858 | 110BD; KAITHI NUMBER SIGN; U; No_Joining_Group 859 | 110CD; KAITHI NUMBER SIGN ABOVE; U; No_Joining_Group 860 | 861 | # Adlam Characters 862 | 863 | 1E900;ADLAM CAPITAL ALIF; D; No_Joining_Group 864 | 1E901;ADLAM CAPITAL DAALI; D; No_Joining_Group 865 | 1E902;ADLAM CAPITAL LAAM; D; No_Joining_Group 866 | 1E903;ADLAM CAPITAL MIIM; D; No_Joining_Group 867 | 1E904;ADLAM CAPITAL BA; D; No_Joining_Group 868 | 1E905;ADLAM CAPITAL SINNYIIYHE; D; No_Joining_Group 869 | 1E906;ADLAM CAPITAL PE; D; No_Joining_Group 870 | 1E907;ADLAM CAPITAL BHE; D; No_Joining_Group 871 | 1E908;ADLAM CAPITAL RA; D; No_Joining_Group 872 | 1E909;ADLAM CAPITAL E; D; No_Joining_Group 873 | 1E90A;ADLAM CAPITAL FA; D; No_Joining_Group 874 | 1E90B;ADLAM CAPITAL I; D; No_Joining_Group 875 | 1E90C;ADLAM CAPITAL O; D; No_Joining_Group 876 | 1E90D;ADLAM CAPITAL DHA; D; No_Joining_Group 877 | 1E90E;ADLAM CAPITAL YHE; D; No_Joining_Group 878 | 1E90F;ADLAM CAPITAL WAW; D; No_Joining_Group 879 | 1E910;ADLAM CAPITAL NUN; D; No_Joining_Group 880 | 1E911;ADLAM CAPITAL KAF; D; No_Joining_Group 881 | 1E912;ADLAM CAPITAL YA; D; No_Joining_Group 882 | 1E913;ADLAM CAPITAL U; D; No_Joining_Group 883 | 1E914;ADLAM CAPITAL JIIM; D; No_Joining_Group 884 | 1E915;ADLAM CAPITAL CHI; D; No_Joining_Group 885 | 1E916;ADLAM CAPITAL HA; D; No_Joining_Group 886 | 1E917;ADLAM CAPITAL QAAF; D; No_Joining_Group 887 | 1E918;ADLAM CAPITAL GA; D; No_Joining_Group 888 | 1E919;ADLAM CAPITAL NYA; D; No_Joining_Group 889 | 1E91A;ADLAM CAPITAL TU; D; No_Joining_Group 890 | 1E91B;ADLAM CAPITAL NHA; D; No_Joining_Group 891 | 1E91C;ADLAM CAPITAL VA; D; No_Joining_Group 892 | 1E91D;ADLAM CAPITAL KHA; D; No_Joining_Group 893 | 1E91E;ADLAM CAPITAL GBE; D; No_Joining_Group 894 | 1E91F;ADLAM CAPITAL ZAL; D; No_Joining_Group 895 | 1E920;ADLAM CAPITAL KPO; D; No_Joining_Group 896 | 1E921;ADLAM CAPITAL SHA; D; No_Joining_Group 897 | 1E922;ADLAM SMALL ALIF; D; No_Joining_Group 898 | 1E923;ADLAM SMALL DAALI; D; No_Joining_Group 899 | 
1E924;ADLAM SMALL LAAM; D; No_Joining_Group 900 | 1E925;ADLAM SMALL MIIM; D; No_Joining_Group 901 | 1E926;ADLAM SMALL BA; D; No_Joining_Group 902 | 1E927;ADLAM SMALL SINNYIIYHE; D; No_Joining_Group 903 | 1E928;ADLAM SMALL PE; D; No_Joining_Group 904 | 1E929;ADLAM SMALL BHE; D; No_Joining_Group 905 | 1E92A;ADLAM SMALL RA; D; No_Joining_Group 906 | 1E92B;ADLAM SMALL E; D; No_Joining_Group 907 | 1E92C;ADLAM SMALL FA; D; No_Joining_Group 908 | 1E92D;ADLAM SMALL I; D; No_Joining_Group 909 | 1E92E;ADLAM SMALL O; D; No_Joining_Group 910 | 1E92F;ADLAM SMALL DHA; D; No_Joining_Group 911 | 1E930;ADLAM SMALL YHE; D; No_Joining_Group 912 | 1E931;ADLAM SMALL WAW; D; No_Joining_Group 913 | 1E932;ADLAM SMALL NUN; D; No_Joining_Group 914 | 1E933;ADLAM SMALL KAF; D; No_Joining_Group 915 | 1E934;ADLAM SMALL YA; D; No_Joining_Group 916 | 1E935;ADLAM SMALL U; D; No_Joining_Group 917 | 1E936;ADLAM SMALL JIIM; D; No_Joining_Group 918 | 1E937;ADLAM SMALL CHI; D; No_Joining_Group 919 | 1E938;ADLAM SMALL HA; D; No_Joining_Group 920 | 1E939;ADLAM SMALL QAAF; D; No_Joining_Group 921 | 1E93A;ADLAM SMALL GA; D; No_Joining_Group 922 | 1E93B;ADLAM SMALL NYA; D; No_Joining_Group 923 | 1E93C;ADLAM SMALL TU; D; No_Joining_Group 924 | 1E93D;ADLAM SMALL NHA; D; No_Joining_Group 925 | 1E93E;ADLAM SMALL VA; D; No_Joining_Group 926 | 1E93F;ADLAM SMALL KHA; D; No_Joining_Group 927 | 1E940;ADLAM SMALL GBE; D; No_Joining_Group 928 | 1E941;ADLAM SMALL ZAL; D; No_Joining_Group 929 | 1E942;ADLAM SMALL KPO; D; No_Joining_Group 930 | 1E943;ADLAM SMALL SHA; D; No_Joining_Group 931 | 1E94B;ADLAM NASALIZATION MARK; T; No_Joining_Group 932 | 933 | # EOF 934 | -------------------------------------------------------------------------------- /uc_spec/gen_idna_mapping_mod.escript: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env escript 2 | %% -*- erlang -*- 3 | %%! +A0 4 | 5 | -mode(compile). 6 | 7 | -define(MOD, "idna_mapping"). 8 | 9 | -export([main/1]). 
%% String helper macros. `OTP_RELEASE' is only predefined on OTP 21 and later,
%% where the Unicode-aware `string' functions are used; otherwise the legacy
%% strip/tokens/to_lower functions are used instead.
-ifdef('OTP_RELEASE').
-define(chomp(Str), string:chomp(Str)).
-define(trim(Str), string:trim(Str, both)).
-define(lexemes(Str, Pat), string:lexemes(Str, Pat)).
-define(lower(C), string:lowercase(C)).
-else.
-define(chomp(Str), string:strip(Str, right, $\n)).
-define(trim(Str), string:strip(Str, both)).
-define(lexemes(Str, Pat), string:strip(string:tokens(Str, Pat), both)).
-define(lower(C), string:to_lower(C)).
-endif.

%% Status column of IdnaMappingTable.txt => {StatusAtom, EmitMapping}.
%% When EmitMapping is `true' the generated clause returns
%% `{StatusAtom, Mapping}', otherwise it returns just `StatusAtom'.
-define(UTS46_STATUSES, #{
    "valid" => {'V', false},
    "ignored" => {'I', false},
    "mapped" => {'M', true},
    "deviation" => {'D', true},
    "disallowed" => {'X', false},
    "disallowed_STD3_valid" => {'3', false},
    "disallowed_STD3_mapped" => {'3', true}
}).

%% Escript entry point; the command line arguments are ignored.
%% Parses `../uc_spec/IdnaMappingTable.txt' and writes the generated module to
%% `../src/idna_mapping.erl'. Both paths are relative to the current working
%% directory. Any I/O failure crashes the script with a badmatch.
main(_) ->
    {ok, In} = file:open("../uc_spec/IdnaMappingTable.txt", [read, raw, {read_ahead, 1000000}]),
    Data = foldl(fun parse_idna_mapping/2, [], In),
    %% Assert the close result, as is done for the output file below,
    %% instead of silently dropping it.
    ok = file:close(In),

    %% Make module
    OutputPath = filename:join(["..", "src", ?MOD ++ ".erl"]),
    {ok, Out} = file:open(OutputPath, [write]),
    gen_file(Out, Data),
    ok = file:close(Out),
    ok.

%% Fold callback: parses one data line of IdnaMappingTable.txt and prepends
%% `{Range, {Status, Mapping, Idna2008Status}}' to `Acc'.
%% Everything after the first "#" is discarded; the remaining text must hold
%% two, three or four ";"-separated fields. Missing fields are `undefined'.
parse_idna_mapping(Line0, Acc) ->
    [Fields | _Comment] = tokens(Line0, "#"),
    Entry =
        case tokens(Fields, ";") of
            [CodePoints, Status] ->
                {to_range(CodePoints), {?trim(Status), undefined, undefined}};
            [CodePoints, Status, Mapping] ->
                {to_range(CodePoints), {?trim(Status), to_mapping(Mapping), undefined}};
            [CodePoints, Status, Mapping, Idna2008Status] ->
                {to_range(CodePoints),
                 {?trim(Status), to_mapping(Mapping), to_atom(Idna2008Status)}}
        end,
    [Entry | Acc].

%% Converts a space separated list of hexadecimal code points into integers.
to_mapping(Mapping) ->
    lists:map(fun hex_to_int/1, ?lexemes(Mapping, " ")).

%% Parses the code point column: either a single hexadecimal code point
%% ("00AD") or an inclusive range ("0041..005A").
%% Returns `{CodePoint, undefined}' or `{First, Last}'.
to_range(CodePoints0) ->
    case tokens(CodePoints0, ".") of
        [Single] ->
            {hex_to_int(Single), undefined};
        %% "A..B" splits into "A", "" and "B" because tokens/2 keeps the
        %% empty token between the two dots.
        [First, "", Last] ->
            {hex_to_int(First), hex_to_int(Last)}
    end.


%% Writes the complete generated module to `Fd'.
gen_file(Fd, Data) ->
    gen_header(Fd),
    gen_utc46(Fd, Data),
    ok.


%% Writes the "generated file" banner and the module attributes.
%% The banner names this script by its actual file name,
%% gen_idna_mapping_mod.escript.
gen_header(Fd) ->
    io:put_chars(Fd, [
        "%%\n%% this file is generated do not modify\n",
        "%% see ../uc_spec/gen_idna_mapping_mod.escript\n\n",
        "-module(", ?MOD, ").\n",
        "-compile(compressed).\n",
        "-export([uts46_map/1]).\n"
    ]).

%% Emits one `uts46_map/1' clause per parsed entry, followed by a catch-all
%% clause that raises `badarg'. Statuses flagged with `true' in
%% ?UTS46_STATUSES return `{Status, Mapping}', the others only `Status'.
gen_utc46(Fd, Data) ->
    EmitClause =
        fun({CP, {S, M, _Idna2008Status}}) ->
            Head = gen_single_clause(CP),
            case maps:get(S, ?UTS46_STATUSES) of
                {Status, true} ->
                    io:format(Fd, "uts46_map~s {~p, ~w};\n", [Head, Status, M]);
                {Status, false} ->
                    io:format(Fd, "uts46_map~s ~p;\n", [Head, Status])
            end
        end,
    lists:foreach(EmitClause, optimize_ranges(lists:sort(Data))),
    io:put_chars(Fd, "uts46_map(_) -> erlang:error(badarg).\n").


%% Renders the argument list and guard of a clause head: a literal match for
%% a single code point, a guarded `CP' variable for a range.
gen_single_clause({R0, undefined}) ->
    io_lib:format("(~w) ->", [R0]);
gen_single_clause({R0, R1}) ->
    io_lib:format("(CP) when ~w =< CP, CP =< ~w ->", [R0, R1]).

%% Moves all single code point entries in front of the range entries; the
%% relative order inside each group is kept.
%% NOTE(review): presumably done so the literal clauses precede the guarded
%% ones in the generated function - confirm.
optimize_ranges(Rs0) ->
    {Singles, Ranges} = lists:partition(fun is_single/1, Rs0),
    Singles ++ Ranges.

%% True for entries keyed by `{CodePoint, undefined}'.
is_single({{N, undefined}, _}) when is_integer(N) -> true;
is_single(_) -> false.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Parses a hexadecimal string after trimming it.
%% The empty string is returned unchanged as `[]'.
hex_to_int([]) -> [];
hex_to_int(HexStr) ->
    list_to_integer(?trim(HexStr), 16).

%% Trims and lower-cases `Str' and converts it to an atom. Only used on
%% fields of the bundled specification file.
to_atom(Str) ->
    list_to_atom(?lower(?trim(Str))).

%% Folds `Fun(Line, Acc)' over the lines read from the file descriptor `Fd'.
foldl(Fun, Acc, Fd) ->
    foldl_1(Fun, Acc, fun() -> file:read_line(Fd) end).

%% Drives the fold: calls `Get' until it returns `eof' and feeds every data
%% line to `Fun(Line, Acc)'. Lines starting with "#" and lines consisting of
%% only a newline are skipped. An accumulator of the form `{done, Acc}' stops
%% the fold and yields `Acc'.
foldl_1(_Fun, {done, Result}, _Get) ->
    Result;
foldl_1(Fun, Acc, Get) ->
    case Get() of
        eof ->
            Acc;
        {ok, Line} ->
            foldl_1(Fun, fold_line(Line, Fun, Acc), Get)
    end.

%% Applies `Fun' to a data line; comment and empty lines leave `Acc' as is.
fold_line("#" ++ _, _Fun, Acc) -> Acc;
fold_line("\n", _Fun, Acc) -> Acc;
fold_line(Line, Fun, Acc) -> Fun(Line, Acc).



%% Splits `S' on the single separator character given as the one-element
%% string `[C]'. Differs from string:tokens/2: an empty string is returned as
%% the token between two adjacent delimiters (and after a trailing one). A
%% delimiter at the very start of `S' does not produce a leading empty token.
tokens(S, [C]) ->
    tokens(lists:reverse(S), C, []).

%% Scans the reversed input while positioned at a token boundary. Tokens are
%% consed onto `Acc', so the result is in the original left-to-right order.
tokens([Sep | Rest], Sep, Acc) ->
    tokens(Rest, Sep, [[] | Acc]);
tokens([Char | Rest], Sep, Acc) ->
    tokens_2(Rest, Sep, Acc, [Char]);
tokens([], _Sep, Acc) ->
    Acc.

%% Scans the reversed input while inside the token `Current'.
tokens_2([Sep | Rest], Sep, Acc, Current) ->
    tokens(Rest, Sep, [Current | Acc]);
tokens_2([Char | Rest], Sep, Acc, Current) ->
    tokens_2(Rest, Sep, Acc, [Char | Current]);
tokens_2([], _Sep, Acc, Current) ->
    [Current | Acc].

--------------------------------------------------------------------------------
/uc_spec/gen_idna_table_mod.escript:
--------------------------------------------------------------------------------
#!/usr/bin/env escript
%% -*- erlang -*-
%%! +A0

-mode(compile).

-define(MOD, "idna_table").

-export([main/1]).

%% String helper macros. `OTP_RELEASE' is only predefined on OTP 21 and later,
%% where the Unicode-aware `string' functions are used; otherwise the legacy
%% strip/tokens functions are used instead.
-ifdef('OTP_RELEASE').
-define(trim(Str), string:trim(Str, both)).
-define(lexemes(Str, Pat), string:lexemes(Str, Pat)).
-else.
-define(trim(Str), string:strip(Str, both)).
-define(lexemes(Str, Pat), string:strip(string:tokens(Str, Pat), both)).
-endif.


%% Escript entry point; the command line arguments are ignored.
%% Parses `../uc_spec/idna-table.txt' and writes the generated module to
%% `../src/idna_table.erl'. Both paths are relative to the current working
%% directory. Any I/O failure crashes the script with a badmatch.
main(_) ->
    {ok, In} = file:open("../uc_spec/idna-table.txt", [read, raw, {read_ahead, 1000000}]),
    Data = foldl(fun parse_idna_table/2, [], In),
    %% Assert the close result, as is done for the output file below,
    %% instead of silently dropping it.
    ok = file:close(In),

    %% Make module
    OutputPath = filename:join(["..", "src", ?MOD ++ ".erl"]),
    {ok, Out} = file:open(OutputPath, [write]),
    gen_file(Out, Data),
    ok = file:close(Out),
    ok.


%% Fold callback: parses one data line of idna-table.txt and prepends
%% `{Range, StatusAtom}' to `Acc'. Everything after the first "#" is
%% discarded; the remaining text must hold exactly two ";"-separated fields.
parse_idna_table(Line0, Acc) ->
    [Fields | _Comment] = tokens(Line0, "#"),
    [CodePoints, Status] = tokens(Fields, ";"),
    [{to_range(CodePoints), to_atom(Status)} | Acc].


%% Writes the complete generated module to `Fd': header, the five predicate
%% functions and finally the `lookup/1' clauses.
gen_file(Fd, Data) ->
    gen_header(Fd),
    lists:foreach(
        fun(Gen) -> Gen(Fd) end,
        [
            fun gen_disallowed_p/1,
            fun gen_contextj_p/1,
            fun gen_contexto_p/1,
            fun gen_unassigned_p/1,
            fun gen_valid_p/1
        ]
    ),
    gen_lookup(Fd, Data),
    ok.


%% Writes the "generated file" banner and the module attributes.
%% The banner names this script by its actual file name,
%% gen_idna_table_mod.escript.
gen_header(Fd) ->
    io:put_chars(Fd, [
        "%%\n%% this file is generated do not modify\n",
        "%% see ../uc_spec/gen_idna_table_mod.escript\n\n",
        "-module(", ?MOD, ").\n",
        "-compile(compressed).\n",
        "-export([lookup/1]).\n",
        "-export([disallowed_p/1, contextj_p/1, contexto_p/1, unassigned_p/1, valid_p/1]).\n"
    ]).

%% Each of the following writes one predicate that compares `lookup(CP)'
%% against a fixed class atom.
gen_disallowed_p(Fd) ->
    io:put_chars(Fd, "disallowed_p(CP) -> lookup(CP) == 'DISALLOWED'.\n").

gen_contextj_p(Fd) ->
    io:put_chars(Fd, "contextj_p(CP) -> lookup(CP) == 'CONTEXTJ'.\n").

gen_contexto_p(Fd) ->
    io:put_chars(Fd, "contexto_p(CP) -> lookup(CP) == 'CONTEXTO'.\n").

gen_unassigned_p(Fd) ->
    io:put_chars(Fd, "unassigned_p(CP) -> lookup(CP) == 'UNASSIGNED'.\n").

gen_valid_p(Fd) ->
    io:put_chars(Fd, "valid_p(CP) -> lookup(CP) == 'PVALID'.\n").

%% Emits one `lookup/1' clause per parsed entry, followed by a catch-all
%% clause returning 'UNASSIGNED'. The final clause is terminated with a
%% newline so the generated file ends with a complete line.
gen_lookup(Fd, Data) ->
    EmitClause =
        fun({Cp, Class}) ->
            io:format(Fd, "lookup~s ~p;~n", [gen_single_clause(Cp), Class])
        end,
    lists:foreach(EmitClause, optimize_ranges(lists:sort(Data))),
    io:put_chars(Fd, "lookup(_) -> 'UNASSIGNED'.\n"),
    ok.

%% Renders the argument list and guard of a clause head: a literal match for
%% a single code point, a guarded `CP' variable for a range.
gen_single_clause({R0, undefined}) ->
    io_lib:format("(~w) ->", [R0]);
gen_single_clause({R0, R1}) ->
    io_lib:format("(CP) when ~w =< CP, CP =< ~w ->", [R0, R1]).

%% Moves all single code point entries in front of the range entries; the
%% relative order inside each group is kept.
optimize_ranges(Rs0) ->
    {Singles, Ranges} = lists:partition(fun is_single/1, Rs0),
    Singles ++ Ranges.

%% True for entries keyed by `{CodePoint, undefined}'.
is_single({{N, undefined}, _}) when is_integer(N) -> true;
is_single(_) -> false.


%% Parses the code point column: either a single hexadecimal code point or an
%% inclusive "First..Last" range.
%% Returns `{CodePoint, undefined}' or `{First, Last}'.
to_range(CodePoints0) ->
    case tokens(CodePoints0, ".") of
        [Single] ->
            {hex_to_int(Single), undefined};
        %% "A..B" splits into "A", "" and "B" because tokens/2 keeps the
        %% empty token between the two dots.
        [First, "", Last] ->
            {hex_to_int(First), hex_to_int(Last)}
    end.

%% Parses a hexadecimal string after trimming it.
%% The empty string is returned unchanged as `[]'.
hex_to_int([]) -> [];
hex_to_int(HexStr) ->
    list_to_integer(?trim(HexStr), 16).

%% Trims `Str' and converts it to an atom (case is preserved).
to_atom(Str) ->
    list_to_atom(?trim(Str)).

%% Folds `Fun(Line, Acc)' over the lines read from the file descriptor `Fd'.
foldl(Fun, Acc, Fd) ->
    foldl_1(Fun, Acc, fun() -> file:read_line(Fd) end).

%% Drives the fold: calls `Get' until it returns `eof' and feeds every data
%% line to `Fun(Line, Acc)'. Lines starting with "#" and lines consisting of
%% only a newline are skipped. An accumulator of the form `{done, Acc}' stops
%% the fold and yields `Acc'.
foldl_1(_Fun, {done, Result}, _Get) ->
    Result;
foldl_1(Fun, Acc, Get) ->
    case Get() of
        eof ->
            Acc;
        {ok, Line} ->
            foldl_1(Fun, fold_line(Line, Fun, Acc), Get)
    end.

%% Applies `Fun' to a data line; comment and empty lines leave `Acc' as is.
fold_line("#" ++ _, _Fun, Acc) -> Acc;
fold_line("\n", _Fun, Acc) -> Acc;
fold_line(Line, Fun, Acc) -> Fun(Line, Acc).



%% Splits `S' on the single separator character given as the one-element
%% string `[C]'. Differs from string:tokens/2: an empty string is returned as
%% the token between two adjacent delimiters (and after a trailing one). A
%% delimiter at the very start of `S' does not produce a leading empty token.
tokens(S, [C]) ->
    tokens(lists:reverse(S), C, []).

%% Scans the reversed input while positioned at a token boundary. Tokens are
%% consed onto `Acc', so the result is in the original left-to-right order.
tokens([Sep | Rest], Sep, Acc) ->
    tokens(Rest, Sep, [[] | Acc]);
tokens([Char | Rest], Sep, Acc) ->
    tokens_2(Rest, Sep, Acc, [Char]);
tokens([], _Sep, Acc) ->
    Acc.

%% Scans the reversed input while inside the token `Current'.
tokens_2([Sep | Rest], Sep, Acc, Current) ->
    tokens(Rest, Sep, [Current | Acc]);
tokens_2([Char | Rest], Sep, Acc, Current) ->
    tokens_2(Rest, Sep, Acc, [Char | Current]);
tokens_2([], _Sep, Acc, Current) ->
    [Current | Acc].

--------------------------------------------------------------------------------
/uc_spec/gen_idnadata_mod.escript:
--------------------------------------------------------------------------------
#!/usr/bin/env escript
%% -*- erlang -*-
%%! +A0

-mode(compile).

-define(MOD, "idna_data").

-export([main/1]).

%% String helper macros. `OTP_RELEASE' is only predefined on OTP 21 and later,
%% where the Unicode-aware `string' functions are used; otherwise the legacy
%% strip/tokens/to_lower functions are used instead.
-ifdef('OTP_RELEASE').
-define(chomp(Str), string:chomp(Str)).
-define(trim(Str, Dir), string:trim(Str, Dir)).
-define(lexemes(Str, Pat), string:lexemes(Str, Pat)).
-define(lower(C), string:lowercase(C)).
-else.
-define(chomp(Str), string:strip(Str, right, $\n)).
-define(trim(Str, Dir), string:strip(Str, Dir)).
-define(lexemes(Str, Pat), string:strip(string:tokens(Str, Pat), both)).
-define(lower(C), string:to_lower(C)).
-endif.

%% Default `Bidi_Class` for unassigned codepoints, as a list of
%% `{{First, Last}, Class}' entries with inclusive bounds.
%%
%% Ref: not recorded here.
%% NOTE(review): the ranges look like the per-block defaults listed in the
%% UCD file DerivedBidiClass.txt - confirm and add the link.
-define(BIDI_CLASS_DEFAULTS, [
    %% "AL": Arabic, Syriac, and Thaana blocks, among others.
    {{16#0600, 16#07BF}, "AL"},
    {{16#0860, 16#086F}, "AL"},
    {{16#08A0, 16#08FF}, "AL"},
    {{16#FB50, 16#FDCF}, "AL"},
    {{16#FDF0, 16#FDFF}, "AL"},
    {{16#FE70, 16#FEFF}, "AL"},
    {{16#10D00, 16#10D3F}, "AL"},
    {{16#10F30, 16#10F6F}, "AL"},
    {{16#1EC70, 16#1ECBF}, "AL"},
    {{16#1EE00, 16#1EEFF}, "AL"},
    %% "R": Hebrew, NKo, and Phoenician blocks, among others.
    {{16#0590, 16#05FF}, "R"},
    {{16#07C0, 16#085F}, "R"},
    {{16#0870, 16#089F}, "R"},
    {{16#FB1D, 16#FB4F}, "R"},
    {{16#10800, 16#10CFF}, "R"},
    {{16#10D40, 16#10F2F}, "R"},
    {{16#10F70, 16#10FFF}, "R"},
    {{16#1E800, 16#1EC6F}, "R"},
    {{16#1ECC0, 16#1EDFF}, "R"},
    {{16#1EF00, 16#1EFFF}, "R"},
    %% "ET": Currency Symbols block.
    {{16#20A0, 16#20CF}, "ET"}
]).

%% Escript entry point; the command line arguments are ignored.
%% Reads UnicodeData.txt, ArabicShaping.txt and Scripts.txt from
%% `../uc_spec/' and writes the generated module to `../src/idna_data.erl'.
%% All paths are relative to the current working directory. Any I/O failure
%% crashes the script with a badmatch.
main(_) ->
    ReadOpts = [read, raw, {read_ahead, 1000000}],

    {ok, UnicodeDataF} = file:open("../uc_spec/UnicodeData.txt", ReadOpts),
    Data = foldl(fun parse_unicode_data/2, [], UnicodeDataF),
    %% Assert the close result like the two input files below instead of
    %% silently dropping it.
    ok = file:close(UnicodeDataF),

    {ok, ShapingF} = file:open("../uc_spec/ArabicShaping.txt", ReadOpts),
    JoiningTypes = foldl(fun parse_as/2, [], ShapingF),
    ok = file:close(ShapingF),

    {ok, ScriptsF} = file:open("../uc_spec/Scripts.txt", ReadOpts),
    Scripts = foldl(fun parse_scripts/2, [], ScriptsF),
    ok = file:close(ScriptsF),

    %% Make module
    OutputPath = filename:join(["..", "src", ?MOD ++ ".erl"]),
    {ok, Out} = file:open(OutputPath, [write]),
    gen_file(Out, Data, JoiningTypes, Scripts),
    ok = file:close(Out),
    ok.

%% Writes the complete generated module to `Fd' in this order: header,
%% bidirectional/1, lookup/1, joining_types/1, scripts/1.
gen_file(Fd, Data, JoiningTypes, Scripts) ->
    gen_header(Fd),
    gen_bidirectional(Fd),
    gen_lookup(Fd, Data),
    gen_joining_types(Fd, JoiningTypes),
    gen_scripts_types(Fd, Scripts),
    ok.


%% Writes the "generated file" banner and the module attributes.
gen_header(Fd) ->
    io:put_chars(Fd, [
        "%%\n%% this file is generated do not modify\n",
        "%% see ../uc_spec/gen_idnadata_mod.escript\n\n",
        "-module(", ?MOD, ").\n",
        "-compile(compressed).\n",
        "-export([lookup/1, joining_types/1, scripts/1]).\n",
        "-export([bidirectional/1]).\n"
    ]).

%% Emits one `lookup/1' clause per `{CodePoint, Value}' entry in ascending
%% order, followed by a catch-all clause returning `false'.
gen_lookup(Fd, Data) ->
    EmitClause =
        fun({Cp, Tp}) ->
            io:format(Fd, "lookup(~w) -> ~p;~n", [Cp, Tp])
        end,
    lists:foreach(EmitClause, lists:sort(Data)),
    io:put_chars(Fd, "lookup(_) -> false.\n\n").

%% Writes `bidirectional/1', which returns the second element of the
%% `lookup/1' result and falls back to `bidirectional_1/1' when the lookup
%% returns `false'. `bidirectional_1/1' gets one clause per entry of
%% ?BIDI_CLASS_DEFAULTS and a catch-all clause returning "L".
gen_bidirectional(Fd) ->
    io:put_chars(Fd, [
        "bidirectional(CP) ->\n",
        " case lookup(CP) of \n",
        " {_, C} -> C;\n",
        " false -> bidirectional_1(CP)\n",
        " end.\n\n"
    ]),
    EmitDefault =
        fun({Range, Class}) ->
            io:format(Fd, "bidirectional_1~s ~p;~n", [gen_single_clause(Range), Class])
        end,
    lists:foreach(EmitDefault, lists:sort(?BIDI_CLASS_DEFAULTS)),
    io:put_chars(Fd, "bidirectional_1(_) -> \"L\".\n\n").

%% Emits one `joining_types/1' clause per `{CodePoint, JoiningType}' entry,
%% followed by a catch-all clause returning `undefined'.
gen_joining_types(Fd, JoiningTypes) ->
    EmitClause =
        fun({Cp, JoiningType}) ->
            io:format(Fd, "joining_types(~w) -> ~p;~n", [Cp, ?trim(JoiningType, both)])
        end,
    lists:foreach(EmitClause, lists:sort(JoiningTypes)),
    io:put_chars(Fd, "joining_types(_) -> undefined.\n\n").

%% Emits one `scripts/1' clause per `{Range, ScriptName}' entry with the
%% script name trimmed and lower-cased, followed by a catch-all clause
%% returning `false'.
gen_scripts_types(Fd, Scripts) ->
    EmitClause =
        fun({Range, Script}) ->
            Name = ?lower(?trim(Script, both)),
            io:format(Fd, "scripts~s ~p;~n", [gen_single_clause(Range), Name])
        end,
    lists:foreach(EmitClause, optimize_scripts_ranges(lists:sort(Scripts))),
    io:put_chars(Fd, "scripts(_) -> false.\n\n").

%% Moves all single code point entries in front of the range entries; the
%% relative order inside each group is kept.
optimize_scripts_ranges(Rs0) ->
    {Singles, Ranges} = lists:partition(fun is_single/1, Rs0),
    Singles ++ Ranges.

%% True for entries keyed by `{CodePoint, undefined}'.
is_single({{N, undefined}, _}) when is_integer(N) -> true;
is_single(_) -> false.


%% Renders the argument list and guard of a clause head: a literal match for
%% a single code point, a guarded `CP' variable for a range.
gen_single_clause({R0, undefined}) ->
    io_lib:format("(~w) ->", [R0]);
gen_single_clause({R0, R1}) ->
    io_lib:format("(CP) when ~w =< CP, CP =< ~w ->", [R0, R1]).



%% Fold callback for UnicodeData.txt: prepends
%% `{CodePoint, {Field3, Field5}}' (both trimmed) to `Acc', where the fields
%% are the third and fifth ";"-separated columns of the line.
parse_unicode_data(Line0, Acc) ->
    [CodePoint, _Name, Cat, _Class, BiDi | _] = tokens(?chomp(Line0), ";"),
    Entry = {hex_to_int(CodePoint), {?trim(Cat, both), ?trim(BiDi, both)}},
    [Entry | Acc].

%% Fold callback for ArabicShaping.txt: prepends `{CodePoint, JoiningType}'
%% to `Acc', taking the joining type from the third ";"-separated column.
%% Lines with fewer than three columns are ignored.
parse_as(Line0, Acc) ->
    case tokens(?chomp(Line0), ";") of
        [CodePoint, _Name, JoiningType | _] ->
            [{hex_to_int(CodePoint), ?trim(JoiningType, both)} | Acc];
        _ ->
            Acc
    end.

%% Fold callback for Scripts.txt: prepends `{Range, Script}' to `Acc' when
%% the script is one of Greek, Han, Hebrew, Hiragana or Katakana; lines for
%% any other script leave `Acc' unchanged. Everything after the first "#" is
%% discarded; the remaining text must hold exactly two ";"-separated fields.
parse_scripts(Line0, Acc) ->
    [Fields | _Comment] = tokens(Line0, "#"),
    [CodePoints, RawScript] = tokens(Fields, ";"),
    Script = ?trim(RawScript, both),
    Wanted = ["Greek", "Han", "Hebrew", "Hiragana", "Katakana"],
    case lists:member(Script, Wanted) of
        true ->
            [{to_range(CodePoints), Script} | Acc];
        false ->
            Acc
    end.


%% Parses the code point column: either a single hexadecimal code point or an
%% inclusive "First..Last" range.
%% Returns `{CodePoint, undefined}' or `{First, Last}'.
to_range(CodePoints0) ->
    case tokens(CodePoints0, ".") of
        [Single] ->
            {hex_to_int(Single), undefined};
        %% "A..B" splits into "A", "" and "B" because tokens/2 keeps the
        %% empty token between the two dots.
        [First, "", Last] ->
            {hex_to_int(First), hex_to_int(Last)}
    end.

%% Parses a hexadecimal string after trimming it.
%% The empty string is returned unchanged as `[]'.
hex_to_int([]) -> [];
hex_to_int(HexStr) ->
    list_to_integer(?trim(HexStr, both), 16).


%% Folds `Fun(Line, Acc)' over the lines read from the file descriptor `Fd'.
foldl(Fun, Acc, Fd) ->
    foldl_1(Fun, Acc, fun() -> file:read_line(Fd) end).

%% Drives the fold: calls `Get' until it returns `eof' and feeds every data
%% line to `Fun(Line, Acc)'. Lines starting with "#" and lines consisting of
%% only a newline are skipped. An accumulator of the form `{done, Acc}' stops
%% the fold and yields `Acc'.
foldl_1(_Fun, {done, Result}, _Get) ->
    Result;
foldl_1(Fun, Acc, Get) ->
    case Get() of
        eof ->
            Acc;
        {ok, Line} ->
            foldl_1(Fun, fold_line(Line, Fun, Acc), Get)
    end.

%% Applies `Fun' to a data line; comment and empty lines leave `Acc' as is.
fold_line("#" ++ _, _Fun, Acc) -> Acc;
fold_line("\n", _Fun, Acc) -> Acc;
fold_line(Line, Fun, Acc) -> Fun(Line, Acc).



%% Splits `S' on the single separator character given as the one-element
%% string `[C]'. Differs from string:tokens/2: an empty string is returned as
%% the token between two adjacent delimiters (and after a trailing one). A
%% delimiter at the very start of `S' does not produce a leading empty token.
tokens(S, [C]) ->
    tokens(lists:reverse(S), C, []).

%% Scans the reversed input while positioned at a token boundary. Tokens are
%% consed onto `Acc', so the result is in the original left-to-right order.
tokens([Sep | Rest], Sep, Acc) ->
    tokens(Rest, Sep, [[] | Acc]);
tokens([Char | Rest], Sep, Acc) ->
    tokens_2(Rest, Sep, Acc, [Char]);
tokens([], _Sep, Acc) ->
    Acc.

%% Scans the reversed input while inside the token `Current'.
tokens_2([Sep | Rest], Sep, Acc, Current) ->
    tokens(Rest, Sep, [Current | Acc]);
tokens_2([Char | Rest], Sep, Acc, Current) ->
    tokens_2(Rest, Sep, Acc, [Char | Current]);
tokens_2([], _Sep, Acc, Current) ->
    [Current | Acc].

--------------------------------------------------------------------------------