├── .gitignore ├── COPYING ├── README.md ├── VERSION.txt ├── bench ├── mbench.lua └── ubench.lua ├── build_config.lua ├── build_east_asian_width.lua ├── build_general_category.lua ├── build_is_white_space.lua ├── docs ├── 10.0.0 │ └── ucd │ │ ├── EastAsianWidth.txt │ │ ├── PropList.txt │ │ └── UnicodeData.txt ├── 11.0.0 │ └── ucd │ │ ├── EastAsianWidth.txt │ │ ├── PropList.txt │ │ └── UnicodeData.txt ├── 12.0.0 │ └── ucd │ │ ├── EastAsianWidth.txt │ │ ├── PropList.txt │ │ └── UnicodeData.txt ├── 12.1.0 │ └── ucd │ │ ├── EastAsianWidth.txt │ │ ├── PropList.txt │ │ └── UnicodeData.txt ├── 13.0.0 │ └── ucd │ │ ├── EastAsianWidth.txt │ │ ├── PropList.txt │ │ └── UnicodeData.txt ├── 14.0.0 │ └── ucd │ │ ├── EastAsianWidth.txt │ │ ├── PropList.txt │ │ └── UnicodeData.txt ├── 15.0.0 │ └── ucd │ │ ├── EastAsianWidth.txt │ │ ├── PropList.txt │ │ └── UnicodeData.txt ├── 16.0.0 │ └── ucd │ │ ├── EastAsianWidth.txt │ │ ├── PropList.txt │ │ └── UnicodeData.txt ├── _config.yml └── index.html ├── dromozoa-markdown-table ├── dromozoa-utf8-1.19-1.rockspec ├── dromozoa ├── ucd.lua ├── ucd │ ├── builder.lua │ ├── east_asian_width.lua │ ├── general_category.lua │ └── is_white_space.lua ├── utf16.lua ├── utf16 │ └── decode_surrogate_pair.lua ├── utf8.lua └── utf8 │ ├── check_integer.lua │ ├── check_string.lua │ ├── count.lua │ ├── count_table.lua │ ├── decode.lua │ ├── decode_each.lua │ ├── decode_table.lua │ ├── encode.lua │ ├── encode53.lua │ ├── encode_error.lua │ ├── encode_table.lua │ ├── offset.lua │ ├── offset_table.lua │ └── pure.lua ├── test.sh └── test ├── icu4j ├── pom.xml ├── run.sh └── src │ ├── main │ └── java │ │ └── com │ │ └── dromozoa │ │ └── utf8 │ │ └── Application.java │ └── test │ └── java │ └── com │ └── dromozoa │ └── utf8 │ └── ApplicationTest.java ├── table01.exp ├── table01.md ├── table02.exp ├── table02.md ├── table03.exp ├── table03.md ├── table04.exp ├── table04.md ├── test.exp ├── test.lua ├── test_decode_surrogate_pair.lua ├── test_east_asian_width.lua 
├── test_east_asian_width.txt ├── test_general_category.lua ├── test_general_category.txt ├── test_is_white_space.lua ├── test_is_white_space.txt └── test_ucd_builder.lua /.gitignore: -------------------------------------------------------------------------------- 1 | .*.swp 2 | /out 3 | /test/icu4j/target 4 | /ubench_result* 5 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 
28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 
62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 
102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 
133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. 
You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 
196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 
229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 
256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 
287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 
317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. 
If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 
386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 
421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | <one line to give the program's name and a brief idea of what it does.> 635 | Copyright (C) <year> <name of author> 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see <https://www.gnu.org/licenses/>. 649 | 650 | Also add information on how to contact you by electronic and paper mail.
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | <program> Copyright (C) <year> <name of author> 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | <https://www.gnu.org/licenses/>. 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | <https://www.gnu.org/licenses/why-not-lgpl.html>. 675 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dromozoa-utf8 2 | 3 | Lua 5.3 compatible pure-Lua UTF-8 implementation. 4 | Currently, the optional `lax` argument introduced in Lua 5.4 is not supported.
5 | 6 | ## v1.19 7 | 8 | * Unicode 16.0 9 | 10 | ## v1.18 11 | 12 | * Unicode 15.0 13 | * old luarocks support 14 | 15 | ## v1.17 16 | 17 | * maintenance release 18 | 19 | ## v1.16 20 | 21 | * Unicode 13.0 22 | * Lua 5.4 like error messages 23 | 24 | ## v1.15 25 | 26 | * performance improvement 27 | 28 | ## v1.14 29 | 30 | * Unicode 12.1 31 | 32 | ## v1.13 33 | 34 | * Unicode 12.0 35 | 36 | ## v1.12 37 | 38 | * Unicode 11.0 39 | 40 | ## v1.11 41 | 42 | * maintenance release 43 | 44 | ## v1.10 45 | 46 | * new function `dromozoa.ucd.general_category` 47 | 48 | ## v1.9 49 | 50 | * new module `dromozoa.ucd` 51 | * new module `dromozoa.utf16` 52 | 53 | ## v1.8 54 | 55 | * new function `dromozoa.ucd.is_white_space` 56 | 57 | ## v1.7 58 | 59 | * maintenance release 60 | 61 | ## v1.6 62 | 63 | * new utility `dromozoa-markdown-table` 64 | * new module `dromozoa.ucd.builder` 65 | 66 | ## v1.5 67 | 68 | * new function `dromozoa.ucd.east_asian_width` 69 | * new function `dromozoa.utf16.decode_surrogate_pair` 70 | 71 | ## v1.4 72 | 73 | ### Features 74 | 75 | * table-based performance improvement 76 | * almost compatible argument check 77 | * strict UTF-8 encoding check (do not accept CESU-8) 78 | 79 | ### Performance Improvement 80 | 81 | | Function | Improvement Ratio | 82 | |--------------------------|------------------:| 83 | | `utf8.char` | 9.9 | 84 | | `utf8.codes` | 1.8 | 85 | | `utf8.codepoint` | 1.8 | 86 | | `utf8.len` | 1.5 | 87 | | `utf8.offset` (positive) | 3.3 | 88 | | `utf8.offset` (negative) | 2.3 | 89 | 90 | ### Memory Usage 91 | 92 | | Version | ILP32 | LP64/LLP64 | 93 | |---------|-------:|-----------:| 94 | | v1.3 | 9KiB | 10KiB | 95 | | v1.4 | 232KiB | 274KiB | 96 | -------------------------------------------------------------------------------- /VERSION.txt: -------------------------------------------------------------------------------- 1 | 1.19 2 | -------------------------------------------------------------------------------- /bench/mbench.lua: 
-------------------------------------------------------------------------------- 1 | -- Copyright (C) 2017,2023 Tomoyuki Fujimori 2 | -- 3 | -- This file is part of dromozoa-utf8. 4 | -- 5 | -- dromozoa-utf8 is free software: you can redistribute it and/or modify 6 | -- it under the terms of the GNU General Public License as published by 7 | -- the Free Software Foundation, either version 3 of the License, or 8 | -- (at your option) any later version. 9 | -- 10 | -- dromozoa-utf8 is distributed in the hope that it will be useful, 11 | -- but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | -- GNU General Public License for more details. 14 | -- 15 | -- You should have received a copy of the GNU General Public License 16 | -- along with dromozoa-utf8. If not, see . 17 | 18 | collectgarbage() 19 | collectgarbage() 20 | local c1 = collectgarbage "count" 21 | 22 | local pure = require "dromozoa.utf8.pure" 23 | 24 | collectgarbage() 25 | collectgarbage() 26 | local c2 = collectgarbage "count" 27 | 28 | print((c2 - c1) * 1024) 29 | -------------------------------------------------------------------------------- /bench/ubench.lua: -------------------------------------------------------------------------------- 1 | -- Copyright (C) 2017,2018,2023 Tomoyuki Fujimori 2 | -- 3 | -- This file is part of dromozoa-utf8. 4 | -- 5 | -- dromozoa-utf8 is free software: you can redistribute it and/or modify 6 | -- it under the terms of the GNU General Public License as published by 7 | -- the Free Software Foundation, either version 3 of the License, or 8 | -- (at your option) any later version. 9 | -- 10 | -- dromozoa-utf8 is distributed in the hope that it will be useful, 11 | -- but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | -- GNU General Public License for more details. 
14 | -- 15 | -- You should have received a copy of the GNU General Public License 16 | -- along with dromozoa-utf8. If not, see . 17 | 18 | local pure = require "dromozoa.utf8.pure" 19 | 20 | local unpack = table.unpack or unpack 21 | 22 | local utf8_char = table.concat { 23 | string.char(0x41, 0xE2, 0x89, 0xA2, 0xCE, 0x91, 0x2E); 24 | string.char(0xED, 0x95, 0x9C, 0xEA, 0xB5, 0xAD, 0xEC, 0x96, 0xB4); 25 | string.char(0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC, 0xE8, 0xAA, 0x9E); 26 | string.char(0xEF, 0xBB, 0xBF, 0xF0, 0xA3, 0x8E, 0xB4); 27 | } 28 | 29 | local codepoint_source = { 30 | 0x0041, 0x2262, 0x0391, 0x002E, 31 | 0xD55C, 0xAD6D, 0xC5B4, 32 | 0x65E5, 0x672C, 0x8A9E, 33 | 0xFEFF, 0x0233B4, 34 | } 35 | 36 | local n = 5 37 | utf8_char = utf8_char:rep(n) 38 | 39 | local codepoint = {} 40 | for _ = 1, n do 41 | for i = 1, #codepoint_source do 42 | codepoint[#codepoint + 1] = codepoint_source[i] 43 | end 44 | end 45 | 46 | local function run_char(x, f, ...) 47 | local result = f(...) 48 | return x + #result, f, ... 49 | end 50 | 51 | local function run_each(x, f, ...) 52 | for p, c in f(...) do 53 | x = x + p 54 | end 55 | return x, f, ... 56 | end 57 | 58 | local function run_codepoint(x, f, ...) 59 | local result = f(...) 60 | return x + result, f, ... 61 | end 62 | 63 | local function run_count(x, f, ...) 64 | local result = f(...) 65 | return x + result, f, ... 66 | end 67 | 68 | local function run_offset(x, f, ...) 69 | local result = f(...) 70 | return x + result, f, ... 71 | end 72 | 73 | local function setup(benchmarks, module, prefix) 74 | benchmarks[#benchmarks + 1] = { prefix .. ".char", run_char, 0, module.char, unpack(codepoint) } 75 | benchmarks[#benchmarks + 1] = { prefix .. ".codes", run_each, 0, module.codes, utf8_char } 76 | benchmarks[#benchmarks + 1] = { prefix .. ".codepoint", run_codepoint, 0, module.codepoint, utf8_char, 1, #utf8_char } 77 | benchmarks[#benchmarks + 1] = { prefix .. 
".len", run_count, 0, module.len, utf8_char:rep(2) } 78 | benchmarks[#benchmarks + 1] = { prefix .. ".offset.P", run_offset, 0, module.offset, utf8_char, #codepoint } 79 | benchmarks[#benchmarks + 1] = { prefix .. ".offset.M", run_offset, 0, module.offset, utf8_char, -#codepoint } 80 | end 81 | 82 | local benchmarks = {} 83 | setup(benchmarks, pure, "pure") 84 | if utf8 then 85 | setup(benchmarks, utf8, "utf8") 86 | end 87 | return benchmarks 88 | -------------------------------------------------------------------------------- /build_config.lua: -------------------------------------------------------------------------------- 1 | -- Copyright (C) 2019,2020,2023,2024 Tomoyuki Fujimori 2 | -- 3 | -- This file is part of dromozoa-utf8. 4 | -- 5 | -- dromozoa-utf8 is free software: you can redistribute it and/or modify 6 | -- it under the terms of the GNU General Public License as published by 7 | -- the Free Software Foundation, either version 3 of the License, or 8 | -- (at your option) any later version. 9 | -- 10 | -- dromozoa-utf8 is distributed in the hope that it will be useful, 11 | -- but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | -- GNU General Public License for more details. 14 | -- 15 | -- You should have received a copy of the GNU General Public License 16 | -- along with dromozoa-utf8. If not, see . 17 | 18 | return { 19 | ucd_version = "16.0.0"; 20 | } 21 | -------------------------------------------------------------------------------- /build_east_asian_width.lua: -------------------------------------------------------------------------------- 1 | -- Copyright (C) 2017-2019,2023,2024 Tomoyuki Fujimori 2 | -- 3 | -- This file is part of dromozoa-utf8. 
4 | -- 5 | -- dromozoa-utf8 is free software: you can redistribute it and/or modify 6 | -- it under the terms of the GNU General Public License as published by 7 | -- the Free Software Foundation, either version 3 of the License, or 8 | -- (at your option) any later version. 9 | -- 10 | -- dromozoa-utf8 is distributed in the hope that it will be useful, 11 | -- but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | -- GNU General Public License for more details. 14 | -- 15 | -- You should have received a copy of the GNU General Public License 16 | -- along with dromozoa-utf8. If not, see . 17 | 18 | local builder = require "dromozoa.ucd.builder" 19 | local build_config = require "build_config" 20 | 21 | local unpack = table.unpack or unpack 22 | 23 | local source_filename = "docs/" .. build_config.ucd_version .. "/ucd/EastAsianWidth.txt" 24 | local result_filename = "dromozoa/ucd/east_asian_width.lua" 25 | 26 | local properties = { 27 | ["N"] = true; -- neutral 28 | ["Na"] = true; -- narrow 29 | ["H"] = true; -- halfwidth 30 | ["A"] = true; -- ambiguous 31 | ["W"] = true; -- wide 32 | ["F"] = true; -- fullwidth 33 | } 34 | 35 | local _ = builder "N" 36 | local prev -- last code point of the previously accepted range; declared local so it no longer leaks into the global table 37 | for line in io.lines(source_filename) do 38 | local first, last, property = line:match "^(%x+)%.%.(%x+)%s*;%s*(%a+)" 39 | if not first then 40 | first, property = line:match "^(%x+)%s*;%s*(%a+)" 41 | last = first 42 | end 43 | if first then 44 | local first = tonumber(first, 16) 45 | local last = tonumber(last, 16) 46 | assert(first <= last) 47 | assert(not prev or prev < first) 48 | assert(properties[property]) 49 | _:range(first, last, property) 50 | prev = last 51 | end 52 | end 53 | 54 | local data = _:build() 55 | local out = assert(io.open(result_filename, "w")) 56 | _.compile(out, data):close() 57 | -------------------------------------------------------------------------------- /build_general_category.lua:
-------------------------------------------------------------------------------- 1 | -- Copyright (C) 2018,2019,2023 Tomoyuki Fujimori 2 | -- 3 | -- This file is part of dromozoa-utf8. 4 | -- 5 | -- dromozoa-utf8 is free software: you can redistribute it and/or modify 6 | -- it under the terms of the GNU General Public License as published by 7 | -- the Free Software Foundation, either version 3 of the License, or 8 | -- (at your option) any later version. 9 | -- 10 | -- dromozoa-utf8 is distributed in the hope that it will be useful, 11 | -- but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | -- GNU General Public License for more details. 14 | -- 15 | -- You should have received a copy of the GNU General Public License 16 | -- along with dromozoa-utf8. If not, see . 17 | 18 | local builder = require "dromozoa.ucd.builder" 19 | local build_config = require "build_config" 20 | 21 | local unpack = table.unpack or unpack 22 | 23 | local source_filename = "docs/" .. build_config.ucd_version .. 
"/ucd/UnicodeData.txt" 24 | local result_filename = "dromozoa/ucd/general_category.lua" 25 | 26 | local _ = builder "Cn" 27 | 28 | local prev_code 29 | local prev_property 30 | 31 | for line in io.lines(source_filename) do 32 | local code, name, property = assert(line:match "^(%x+);(.-);(.-);") 33 | code = tonumber(code, 16) 34 | if name:find ", First>$" then 35 | prev_code = code 36 | prev_property = property 37 | else 38 | if name:find ", Last>$" then 39 | assert(prev_code < code) 40 | assert(prev_property == property) 41 | _:range(prev_code, code, property) 42 | prev_code = nil 43 | prev_property = nil 44 | else 45 | assert(not prev_code) 46 | assert(not prev_property) 47 | _:range(code, code, property) 48 | end 49 | end 50 | end 51 | 52 | local data = _:build() 53 | local out = assert(io.open(result_filename, "w")) 54 | _.compile(out, data):close() 55 | -------------------------------------------------------------------------------- /build_is_white_space.lua: -------------------------------------------------------------------------------- 1 | -- Copyright (C) 2018,2019,2023 Tomoyuki Fujimori 2 | -- 3 | -- This file is part of dromozoa-utf8. 4 | -- 5 | -- dromozoa-utf8 is free software: you can redistribute it and/or modify 6 | -- it under the terms of the GNU General Public License as published by 7 | -- the Free Software Foundation, either version 3 of the License, or 8 | -- (at your option) any later version. 9 | -- 10 | -- dromozoa-utf8 is distributed in the hope that it will be useful, 11 | -- but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | -- GNU General Public License for more details. 14 | -- 15 | -- You should have received a copy of the GNU General Public License 16 | -- along with dromozoa-utf8. If not, see . 
17 | 18 | local builder = require "dromozoa.ucd.builder" 19 | local build_config = require "build_config" 20 | 21 | local unpack = table.unpack or unpack 22 | 23 | local source_filename = "docs/" .. build_config.ucd_version .. "/ucd/PropList.txt" 24 | local result_filename = "dromozoa/ucd/is_white_space.lua" 25 | 26 | local _ = builder(false) 27 | local prev -- last code point of the previously accepted White_Space range; declared local so it no longer leaks into the global table 28 | for line in io.lines(source_filename) do 29 | local first, last, property = line:match "^(%x+)%.%.(%x+)%s*;%s*([%w_]+)" 30 | if not first then 31 | first, property = line:match "^(%x+)%s*;%s*([%w_]+)" 32 | last = first 33 | end 34 | if first and property == "White_Space" then 35 | local first = tonumber(first, 16) 36 | local last = tonumber(last, 16) 37 | assert(first <= last) 38 | assert(not prev or prev < first) 39 | _:range(first, last, true) 40 | prev = last 41 | end 42 | end 43 | 44 | local data = _:build() 45 | local out = assert(io.open(result_filename, "w")) 46 | _.compile(out, data):close() 47 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | dromozoa-utf8 7 | 8 | 9 | 23 | 24 |
25 | 26 |

dromozoa-utf8

27 | 28 |

リンク

29 | 30 | 51 | 52 |

UTF-8デコード

53 | 54 |

55 | RFC 3629で与えられているABNFを下記に示す。冗長なエンコードと代用符号位置のエンコードが除外されている。 56 |

57 | 58 |
UTF8-octets = *( UTF8-char )
 59 | UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
 60 | UTF8-1      = %x00-7F
 61 | UTF8-2      = %xC2-DF UTF8-tail
 62 | UTF8-3      = %xE0 %xA0-BF UTF8-tail
 63 |             / %xE1-EC 2( UTF8-tail )
 64 |             / %xED %x80-9F UTF8-tail
 65 |             / %xEE-EF 2( UTF8-tail )
 66 | UTF8-4      = %xF0 %x90-BF 2( UTF8-tail )
 67 |             / %xF1-F3 3( UTF8-tail )
 68 |             / %xF4 %x80-8F 2( UTF8-tail )
 69 | UTF8-tail   = %x80-BF
70 | 71 |

72 | 展開して正規表現で書きくだす。 73 |

74 | 75 |
UTF8-1  = [\x00-\x7F]
 76 | UTF8-2  = [\xC2-\xDF] [\x80-\xBF]
 77 | UTF8-3a = [\xE0-\xE0] [\xA0-\xBF] [\x80-\xBF]
 78 | UTF8-3b = [\xE1-\xEC] [\x80-\xBF] [\x80-\xBF]
 79 | UTF8-3c = [\xED-\xED] [\x80-\x9F] [\x80-\xBF]
 80 | UTF8-3d = [\xEE-\xEF] [\x80-\xBF] [\x80-\xBF]
 81 | UTF8-4a = [\xF0-\xF0] [\x90-\xBF] [\x80-\xBF] [\x80-\xBF]
 82 | UTF8-4b = [\xF1-\xF3] [\x80-\xBF] [\x80-\xBF] [\x80-\xBF]
 83 | UTF8-4c = [\xF4-\xF4] [\x80-\x8F] [\x80-\xBF] [\x80-\xBF]
84 | 85 |

86 | 第1バイトだけに着目し、妥当でない領域も考慮すると、 87 |

88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 |
第1バイト長さ
00..7F1
80..C1
C2..DF2
E03
E1..EC3
ED3
EE..EF3
F04
F1..F34
F44
F5..FF
103 | 104 |

105 | 第2バイト以降の範囲は6種類である。演算後の値をテーブルとして保持する場合、第2バイト以降の範囲は7種類に分けられる(次表A,B,B2,B3,C,C2,C3)。 106 |

107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 |
第1バイト第2バイト第3バイト第4バイト
00..7F
C2..DF80..BF (A)
E0A0..BF (B2)80..BF (A)
E1..EC80..BF (B)80..BF (A)
ED80..9F (B3)80..BF (A)
EE..EF80..BF (B)80..BF (A)
F090..BF (C2)80..BF (B)80..BF (A)
F1..F380..BF (C)80..BF (B)80..BF (A)
F480..8F (C3)80..BF (B)80..BF (A)
120 | 121 |

UTF-8エンコード

122 | 123 |

124 | 1112064種類の整数を引数としてUTF-8バイト列を返すような関数であり、メモリ量を考えなければ単純な表で実装が可能である。 125 |

126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 |
開始符号位置終了符号位置長さ個数個数(16進数)
0000007F11280x0080
008007FF219200x0780
0800D7FF3532480xD000
E000FFFF381920x2000
01000010FFFF410485760x100000
135 | 136 |

137 | 実験によれば、U+0001..U+07FFまでの区間を表にすると、消費メモリ量と速度のバランスがよいようだった。U+0800以降については、上位12bitで表を作ることで消費メモリ量と速度のバランスをとる。 138 |

139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 |
長さ符号位置6bit分割上位12bit
2080000 20 000020
2D8000D 20 000360
2DFFF0D 3F 3F037F
2FFFF0F 3F 3F03FF
301000000 10 00 000010
310FFFF04 0F 3F 3F010F
149 | 150 |

Luaと演算

151 | 152 |
    153 |
  • Lua 5.3のビット演算は演算子実装なので関数呼び出しよりも速い。
  • 154 |
  • LuaJITは最適化を考慮する必要がある。
  • 155 |
156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 |
バージョン実装
Lua 5.1
LuaJIT関数 (bit)
Lua 5.2関数 (bit32)
Lua 5.3演算子
164 | 165 |

文字数のカウント

166 | 167 |

168 | 任意のバイトは、UTF-8の有効な第1〜4バイトであるか不正なバイトである。状態\(s\)と4バイトを引数として、状態を返すような関数\(f(s,a,b,c,d)\)を考え、有限状態機械で表現する。有限状態機械は最小化しないほうが高速かもしれない。 169 |

170 | 171 |

オフセット

172 | 173 |

174 | Lua 5.3のutf8.offsetは、\(n\)文字めの位置をバイト単位で返す関数である。実装は、全バイトにアクセスして末尾バイト(80..BF)かどうかを判定している。仕様が妥当なUTF-8文字列であることを仮定するので、バイト列の先頭からオフセットを探索する場合、先頭バイトだけにアクセスする実装が可能である。バイト列の末尾からオフセットを探索する場合、先頭バイトだけに着目して実装することはできないので、文字数のカウントと同様のテクニックを用いて実装する。 175 |

176 | 177 |
178 | 185 | 186 | 187 | 192 | 193 | 194 | -------------------------------------------------------------------------------- /dromozoa-markdown-table: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env lua 2 | 3 | -- Copyright (C) 2017,2018,2023 Tomoyuki Fujimori 4 | -- 5 | -- This file is part of dromozoa-utf8. 6 | -- 7 | -- dromozoa-utf8 is free software: you can redistribute it and/or modify 8 | -- it under the terms of the GNU General Public License as published by 9 | -- the Free Software Foundation, either version 3 of the License, or 10 | -- (at your option) any later version. 11 | -- 12 | -- dromozoa-utf8 is distributed in the hope that it will be useful, 13 | -- but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | -- GNU General Public License for more details. 16 | -- 17 | -- You should have received a copy of the GNU General Public License 18 | -- along with dromozoa-utf8. If not, see . 19 | 20 | local utf8 = require "dromozoa.utf8" 21 | local east_asian_width = require "dromozoa.ucd.east_asian_width" 22 | 23 | local ambiguous_width = ... 24 | 25 | ambiguous_width = tonumber(ambiguous_width) 26 | if not ambiguous_width then 27 | ambiguous_width = 2 28 | end 29 | 30 | local widths = { 31 | ["N"] = 1; -- neutral 32 | ["Na"] = 1; -- narrow 33 | ["H"] = 1; -- halfwidth 34 | ["A"] = ambiguous_width; -- ambiguous 35 | ["W"] = 2; -- wide 36 | ["F"] = 2; -- fullwidth 37 | } 38 | 39 | local function width(s) 40 | local width = 0 41 | for _, c in utf8.codes(s) do 42 | width = width + widths[east_asian_width(c)] 43 | end 44 | return width 45 | end 46 | 47 | local data = {} 48 | local n = 0 49 | 50 | for line in io.lines() do 51 | local items = {} 52 | local m = 0 53 | for item in ("|" .. 
line:match "^%s*|?(.-)|?%s*$"):gmatch "|([^|]*)" do 54 | m = m + 1 55 | items[m] = item:match "^%s*(.-)%s*$" 56 | end 57 | n = n + 1 58 | data[n] = items 59 | end 60 | 61 | local alignments = {} 62 | local max_widths = {} 63 | 64 | for i = 1, n do 65 | local items = data[i] 66 | if i == 2 then 67 | for j = 1, #items do 68 | local item = items[j] 69 | if item:find ":$" then 70 | if item:find "^:." then 71 | alignments[j] = "center" 72 | else 73 | alignments[j] = "right" 74 | end 75 | else 76 | alignments[j] = "normal" 77 | end 78 | end 79 | else 80 | for j = 1, #items do 81 | local item = items[j] 82 | local m = max_widths[j] 83 | local n = width(item) 84 | if m == nil or m < n then 85 | max_widths[j] = n 86 | end 87 | end 88 | end 89 | end 90 | 91 | local m = #alignments 92 | local n = #max_widths 93 | if m < n then 94 | for i = m + 1, n do 95 | alignments[i] = "normal" 96 | end 97 | else 98 | for i = n + 1, m do 99 | max_widths[i] = 0 100 | end 101 | n = m 102 | end 103 | 104 | for i = 1, #data do 105 | local items = data[i] 106 | if #items == 0 then 107 | io.write "\n" 108 | elseif i == 2 then 109 | for j = 1, n do 110 | local alignment = alignments[j] 111 | local max_width = max_widths[j] 112 | io.write "|" 113 | if alignment == "center" then 114 | io.write ":" 115 | else 116 | io.write "-" 117 | end 118 | io.write(("-"):rep(max_width)) 119 | if alignment == "normal" then 120 | io.write "-" 121 | else 122 | io.write ":" 123 | end 124 | end 125 | io.write "|\n" 126 | else 127 | for j = 1, n do 128 | local alignment = alignments[j] 129 | local max_width = max_widths[j] 130 | io.write "|" 131 | local item = items[j] 132 | if not item then 133 | item = "" 134 | end 135 | local w = max_width - width(item) + 2 136 | local w1 137 | if alignment == "normal" then 138 | w1 = 1 139 | elseif alignment == "center" then 140 | w1 = (w - w % 2) / 2 141 | elseif alignment == "right" then 142 | w1 = w - 1 143 | end 144 | local w2 = w - w1 145 | io.write((" "):rep(w1), item, (" 
"):rep(w2)) 146 | end 147 | io.write "|\n" 148 | end 149 | end 150 | -------------------------------------------------------------------------------- /dromozoa-utf8-1.19-1.rockspec: -------------------------------------------------------------------------------- 1 | package = "dromozoa-utf8" 2 | version = "1.19-1" 3 | source = { 4 | url = "https://github.com/dromozoa/dromozoa-utf8/archive/v1.19.tar.gz"; 5 | file = "dromozoa-utf8-1.19.tar.gz"; 6 | } 7 | description = { 8 | summary = "Lua 5.3 compatible pure-Lua UTF-8 implementation"; 9 | license = "GPL-3"; 10 | homepage = "https://github.com/dromozoa/dromozoa-utf8/"; 11 | maintainer = "Tomoyuki Fujimori "; 12 | } 13 | build = { 14 | type = "builtin"; 15 | modules = { 16 | ["dromozoa.ucd"] = "dromozoa/ucd.lua"; 17 | ["dromozoa.ucd.builder"] = "dromozoa/ucd/builder.lua"; 18 | ["dromozoa.ucd.east_asian_width"] = "dromozoa/ucd/east_asian_width.lua"; 19 | ["dromozoa.ucd.general_category"] = "dromozoa/ucd/general_category.lua"; 20 | ["dromozoa.ucd.is_white_space"] = "dromozoa/ucd/is_white_space.lua"; 21 | ["dromozoa.utf16"] = "dromozoa/utf16.lua"; 22 | ["dromozoa.utf16.decode_surrogate_pair"] = "dromozoa/utf16/decode_surrogate_pair.lua"; 23 | ["dromozoa.utf8"] = "dromozoa/utf8.lua"; 24 | ["dromozoa.utf8.check_integer"] = "dromozoa/utf8/check_integer.lua"; 25 | ["dromozoa.utf8.check_string"] = "dromozoa/utf8/check_string.lua"; 26 | ["dromozoa.utf8.count"] = "dromozoa/utf8/count.lua"; 27 | ["dromozoa.utf8.count_table"] = "dromozoa/utf8/count_table.lua"; 28 | ["dromozoa.utf8.decode"] = "dromozoa/utf8/decode.lua"; 29 | ["dromozoa.utf8.decode_each"] = "dromozoa/utf8/decode_each.lua"; 30 | ["dromozoa.utf8.decode_table"] = "dromozoa/utf8/decode_table.lua"; 31 | ["dromozoa.utf8.encode"] = "dromozoa/utf8/encode.lua"; 32 | ["dromozoa.utf8.encode53"] = "dromozoa/utf8/encode53.lua"; 33 | ["dromozoa.utf8.encode_error"] = "dromozoa/utf8/encode_error.lua"; 34 | ["dromozoa.utf8.encode_table"] = "dromozoa/utf8/encode_table.lua"; 35 | 
["dromozoa.utf8.offset"] = "dromozoa/utf8/offset.lua"; 36 | ["dromozoa.utf8.offset_table"] = "dromozoa/utf8/offset_table.lua"; 37 | ["dromozoa.utf8.pure"] = "dromozoa/utf8/pure.lua"; 38 | }; 39 | install = { 40 | bin = { 41 | ["dromozoa-markdown-table"] = "dromozoa-markdown-table"; 42 | }; 43 | }; 44 | } 45 | -------------------------------------------------------------------------------- /dromozoa/ucd.lua: -------------------------------------------------------------------------------- 1 | -- Copyright (C) 2018,2023 Tomoyuki Fujimori 2 | -- 3 | -- This file is part of dromozoa-utf8. 4 | -- 5 | -- dromozoa-utf8 is free software: you can redistribute it and/or modify 6 | -- it under the terms of the GNU General Public License as published by 7 | -- the Free Software Foundation, either version 3 of the License, or 8 | -- (at your option) any later version. 9 | -- 10 | -- dromozoa-utf8 is distributed in the hope that it will be useful, 11 | -- but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | -- GNU General Public License for more details. 14 | -- 15 | -- You should have received a copy of the GNU General Public License 16 | -- along with dromozoa-utf8. If not, see . 17 | 18 | return { 19 | east_asian_width = require "dromozoa.ucd.east_asian_width"; 20 | general_category = require "dromozoa.ucd.general_category"; 21 | is_white_space = require "dromozoa.ucd.is_white_space"; 22 | } 23 | -------------------------------------------------------------------------------- /dromozoa/ucd/builder.lua: -------------------------------------------------------------------------------- 1 | -- Copyright (C) 2017,2018,2023 Tomoyuki Fujimori 2 | -- 3 | -- This file is part of dromozoa-utf8. 
--
-- dromozoa-utf8 is free software: you can redistribute it and/or modify
-- it under the terms of the GNU General Public License as published by
-- the Free Software Foundation, either version 3 of the License, or
-- (at your option) any later version.
--
-- dromozoa-utf8 is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with dromozoa-utf8.  If not, see <http://www.gnu.org/licenses/>.

-- Render a scalar as Lua source text.
-- Accepts nil, number, string and boolean; raises an error for any other type.
local function quote(v)
  local t = type(v)
  if t == "nil" then
    return "nil"
  elseif t == "number" then
    -- 17 significant digits round-trip any double.
    return ("%.17g"):format(v)
  elseif t == "string" then
    return ("%q"):format(v)
  elseif t == "boolean" then
    return v and "true" or "false"
  else
    error("nil/number/string/boolean expected, got " .. t)
  end
end

-- Emit the Lua source of the decision tree rooted at slot i.
--
-- out         object with a write(self, ...) method
-- tree_class  slot -> "node" | "leaf" (children of slot i are 2*i and 2*i+1)
-- tree_value  slot -> split point (for a node) or result value (for a leaf)
-- i           slot to compile
-- depth       nesting depth, used for indentation only
--
-- NOTE(review): the indent unit (two spaces) is reconstructed; the flattened
-- listing this was recovered from collapsed whitespace. It only affects the
-- layout of the generated text, not its behavior -- confirm against the
-- checked-in generated files.
local function compile(out, tree_class, tree_value, i, depth)
  local u = tree_value[i]
  local indent = ("  "):rep(depth)

  -- A map with a single range has a leaf as its root: there is nothing to
  -- compare against, so return the value directly.
  if tree_class[i] == "leaf" then
    out:write(indent, "return ", quote(u), "\n")
    return
  end

  local j = i * 2
  local k = j + 1
  local child_depth = depth + 1

  if tree_class[k] == "node" then
    -- Both children are interior nodes.
    out:write(indent, "if c < ", quote(u), " then\n")
    compile(out, tree_class, tree_value, j, child_depth)
    out:write(indent, "else\n")
    compile(out, tree_class, tree_value, k, child_depth)
    out:write(indent, "end\n")
  elseif tree_class[j] == "node" then
    -- Left child is an interior node, right child is a leaf.
    local w = tree_value[k]
    out:write(indent, "if c < ", quote(u), " then\n")
    compile(out, tree_class, tree_value, j, child_depth)
    out:write(indent, "else\n")
    out:write(indent, "  return ", quote(w), "\n")
    out:write(indent, "end\n")
  else
    -- Both children are leaves.
    local v = tree_value[j]
    local w = tree_value[k]
    if type(v) == "boolean" and type(w) == "boolean" then
      -- The two leaves hold adjacent ranges, whose values differ, so the
      -- comparison itself is the result.
      if v then
        out:write(indent, "return c < ", quote(u), "\n")
      else
        out:write(indent, "return c >= ", quote(u), "\n")
      end
    else
      out:write(indent, "if c < ", quote(u), " then return ", quote(v), " else return ", quote(w), " end\n")
    end
  end
end

local class = {}
local metatable = { __index = class }

-- Assign value to every code point in [first, last] (inclusive).
function class:range(first, last, value)
  local map = self.map
  for i = first, last do
    map[i] = value
  end
end

-- Collapse the per-code-point map into ranges and a binary decision tree.
--
-- Returns a table:
--   range.first[r]  first code point of range r
--   range.value[r]  value shared by every code point of range r
--   tree.class[s]   "node" or "leaf" for tree slot s
--   tree.value[s]   split point (node) or result value (leaf)
function class:build()
  local map = self.map
  local range_first = {}
  local range_value = {}
  local n = 0

  for i = 0, 0x10FFFF do
    local value = map[i]
    -- The first range always starts at code point 0, even when its value is
    -- nil; afterwards a new range starts whenever the value changes.
    if n == 0 or value ~= range_value[n] then
      n = n + 1
      range_first[n] = i
      range_value[n] = value
    end
  end

  -- n ranges are separated by m = n - 1 boundaries; boundary b is the first
  -- code point of range b + 1.
  local m = n - 1
  local indice = {}
  for i = 1, m do
    indice[i] = i
  end

  local tree_class = {}
  local tree_value = {}

  -- Fill the tree level by level, deepest level first. On each level every
  -- other remaining boundary is taken (table.remove shifts the list down and
  -- k still advances), and only slots up to m become interior nodes.
  local height = math.ceil(math.log(n) / math.log(2))
  for i = height, 0, -1 do
    local j = 2^i
    local k = 1
    local index = indice[k]
    while index and j <= m do
      tree_class[j] = "node"
      tree_value[j] = range_first[index + 1]
      table.remove(indice, k)
      j = j + 1
      k = k + 1
      index = indice[k]
    end
  end

  -- Drop each range into the first free slot reached by following the splits.
  for i = 1, n do
    local first = range_first[i]
    local value = range_value[i]
    local j = 1
    while tree_class[j] == "node" do
      if first < tree_value[j] then
        j = j * 2
      else
        j = j * 2 + 1
      end
    end
    tree_class[j] = "leaf"
    tree_value[j] = value
  end

  return {
    range = {
      first = range_first;
      value = range_value;
    };
    tree = {
      class = tree_class;
      value = tree_value;
    };
  }
end

-- Write a Lua chunk to out that returns a lookup function for data, where
-- data is the table produced by class:build(). The generated function coerces
-- its argument with `c + 0` and walks the decision tree.
-- Returns out.
function class.compile(out, data)
  local tree = data.tree
  out:write [[
return function (c)
  c = c + 0
]]
  compile(out, tree.class, tree.value, 1, 1)
  out:write "end\n"
  return out
end

-- Calling the module creates a builder whose map holds `value` for every code
-- point from 0 to 0x10FFFF.
return setmetatable(class, {
  __call = function (_, value)
    local map = {}
    for i = 0, 0x10FFFF do
      map[i] = value
    end
    return setmetatable({ map = map }, metatable)
  end;
})
| end 23 | else 24 | if c < 173 then 25 | if c < 171 then 26 | if c < 170 then return "N" else return "A" end 27 | else 28 | if c < 172 then return "N" else return "Na" end 29 | end 30 | else 31 | if c < 176 then 32 | if c < 175 then return "A" else return "Na" end 33 | else 34 | if c < 181 then return "A" else return "N" end 35 | end 36 | end 37 | end 38 | else 39 | if c < 215 then 40 | if c < 198 then 41 | if c < 188 then 42 | if c < 187 then return "A" else return "N" end 43 | else 44 | if c < 192 then return "A" else return "N" end 45 | end 46 | else 47 | if c < 208 then 48 | if c < 199 then return "A" else return "N" end 49 | else 50 | if c < 209 then return "A" else return "N" end 51 | end 52 | end 53 | else 54 | if c < 230 then 55 | if c < 222 then 56 | if c < 217 then return "A" else return "N" end 57 | else 58 | if c < 226 then return "A" else return "N" end 59 | end 60 | else 61 | if c < 232 then 62 | if c < 231 then return "A" else return "N" end 63 | else 64 | if c < 235 then return "A" else return "N" end 65 | end 66 | end 67 | end 68 | end 69 | else 70 | if c < 275 then 71 | if c < 252 then 72 | if c < 242 then 73 | if c < 240 then 74 | if c < 238 then return "A" else return "N" end 75 | else 76 | if c < 241 then return "A" else return "N" end 77 | end 78 | else 79 | if c < 247 then 80 | if c < 244 then return "A" else return "N" end 81 | else 82 | if c < 251 then return "A" else return "N" end 83 | end 84 | end 85 | else 86 | if c < 257 then 87 | if c < 254 then 88 | if c < 253 then return "A" else return "N" end 89 | else 90 | if c < 255 then return "A" else return "N" end 91 | end 92 | else 93 | if c < 273 then 94 | if c < 258 then return "A" else return "N" end 95 | else 96 | if c < 274 then return "A" else return "N" end 97 | end 98 | end 99 | end 100 | else 101 | if c < 305 then 102 | if c < 294 then 103 | if c < 283 then 104 | if c < 276 then return "A" else return "N" end 105 | else 106 | if c < 284 then return "A" else return "N" end 107 | 
end 108 | else 109 | if c < 299 then 110 | if c < 296 then return "A" else return "N" end 111 | else 112 | if c < 300 then return "A" else return "N" end 113 | end 114 | end 115 | else 116 | if c < 319 then 117 | if c < 312 then 118 | if c < 308 then return "A" else return "N" end 119 | else 120 | if c < 313 then return "A" else return "N" end 121 | end 122 | else 123 | if c < 324 then 124 | if c < 323 then return "A" else return "N" end 125 | else 126 | if c < 325 then return "A" else return "N" end 127 | end 128 | end 129 | end 130 | end 131 | end 132 | else 133 | if c < 711 then 134 | if c < 468 then 135 | if c < 363 then 136 | if c < 338 then 137 | if c < 333 then 138 | if c < 332 then return "A" else return "N" end 139 | else 140 | if c < 334 then return "A" else return "N" end 141 | end 142 | else 143 | if c < 358 then 144 | if c < 340 then return "A" else return "N" end 145 | else 146 | if c < 360 then return "A" else return "N" end 147 | end 148 | end 149 | else 150 | if c < 464 then 151 | if c < 462 then 152 | if c < 364 then return "A" else return "N" end 153 | else 154 | if c < 463 then return "A" else return "N" end 155 | end 156 | else 157 | if c < 466 then 158 | if c < 465 then return "A" else return "N" end 159 | else 160 | if c < 467 then return "A" else return "N" end 161 | end 162 | end 163 | end 164 | else 165 | if c < 476 then 166 | if c < 472 then 167 | if c < 470 then 168 | if c < 469 then return "A" else return "N" end 169 | else 170 | if c < 471 then return "A" else return "N" end 171 | end 172 | else 173 | if c < 474 then 174 | if c < 473 then return "A" else return "N" end 175 | else 176 | if c < 475 then return "A" else return "N" end 177 | end 178 | end 179 | else 180 | if c < 609 then 181 | if c < 593 then 182 | if c < 477 then return "A" else return "N" end 183 | else 184 | if c < 594 then return "A" else return "N" end 185 | end 186 | else 187 | if c < 708 then 188 | if c < 610 then return "A" else return "N" end 189 | else 190 | if c 
< 709 then return "A" else return "N" end 191 | end 192 | end 193 | end 194 | end 195 | else 196 | if c < 913 then 197 | if c < 728 then 198 | if c < 717 then 199 | if c < 713 then 200 | if c < 712 then return "A" else return "N" end 201 | else 202 | if c < 716 then return "A" else return "N" end 203 | end 204 | else 205 | if c < 720 then 206 | if c < 718 then return "A" else return "N" end 207 | else 208 | if c < 721 then return "A" else return "N" end 209 | end 210 | end 211 | else 212 | if c < 735 then 213 | if c < 733 then 214 | if c < 732 then return "A" else return "N" end 215 | else 216 | if c < 734 then return "A" else return "N" end 217 | end 218 | else 219 | if c < 768 then 220 | if c < 736 then return "A" else return "N" end 221 | else 222 | if c < 880 then return "A" else return "N" end 223 | end 224 | end 225 | end 226 | else 227 | if c < 1025 then 228 | if c < 945 then 229 | if c < 931 then 230 | if c < 930 then return "A" else return "N" end 231 | else 232 | if c < 938 then return "A" else return "N" end 233 | end 234 | else 235 | if c < 963 then 236 | if c < 962 then return "A" else return "N" end 237 | else 238 | if c < 970 then return "A" else return "N" end 239 | end 240 | end 241 | else 242 | if c < 1105 then 243 | if c < 1040 then 244 | if c < 1026 then return "A" else return "N" end 245 | else 246 | if c < 1104 then return "A" else return "N" end 247 | end 248 | else 249 | if c < 4352 then 250 | if c < 1106 then return "A" else return "N" end 251 | else 252 | if c < 4448 then return "W" else return "N" end 253 | end 254 | end 255 | end 256 | end 257 | end 258 | end 259 | else 260 | if c < 8585 then 261 | if c < 8451 then 262 | if c < 8245 then 263 | if c < 8224 then 264 | if c < 8216 then 265 | if c < 8211 then 266 | if c < 8209 then return "A" else return "N" end 267 | else 268 | if c < 8215 then return "A" else return "N" end 269 | end 270 | else 271 | if c < 8220 then 272 | if c < 8218 then return "A" else return "N" end 273 | else 274 | if 
c < 8222 then return "A" else return "N" end 275 | end 276 | end 277 | else 278 | if c < 8240 then 279 | if c < 8228 then 280 | if c < 8227 then return "A" else return "N" end 281 | else 282 | if c < 8232 then return "A" else return "N" end 283 | end 284 | else 285 | if c < 8242 then 286 | if c < 8241 then return "A" else return "N" end 287 | else 288 | if c < 8244 then return "A" else return "N" end 289 | end 290 | end 291 | end 292 | else 293 | if c < 8319 then 294 | if c < 8254 then 295 | if c < 8251 then 296 | if c < 8246 then return "A" else return "N" end 297 | else 298 | if c < 8252 then return "A" else return "N" end 299 | end 300 | else 301 | if c < 8308 then 302 | if c < 8255 then return "A" else return "N" end 303 | else 304 | if c < 8309 then return "A" else return "N" end 305 | end 306 | end 307 | else 308 | if c < 8361 then 309 | if c < 8321 then 310 | if c < 8320 then return "A" else return "N" end 311 | else 312 | if c < 8325 then return "A" else return "N" end 313 | end 314 | else 315 | if c < 8364 then 316 | if c < 8362 then return "H" else return "N" end 317 | else 318 | if c < 8365 then return "A" else return "N" end 319 | end 320 | end 321 | end 322 | end 323 | else 324 | if c < 8531 then 325 | if c < 8470 then 326 | if c < 8457 then 327 | if c < 8453 then 328 | if c < 8452 then return "A" else return "N" end 329 | else 330 | if c < 8454 then return "A" else return "N" end 331 | end 332 | else 333 | if c < 8467 then 334 | if c < 8458 then return "A" else return "N" end 335 | else 336 | if c < 8468 then return "A" else return "N" end 337 | end 338 | end 339 | else 340 | if c < 8486 then 341 | if c < 8481 then 342 | if c < 8471 then return "A" else return "N" end 343 | else 344 | if c < 8483 then return "A" else return "N" end 345 | end 346 | else 347 | if c < 8491 then 348 | if c < 8487 then return "A" else return "N" end 349 | else 350 | if c < 8492 then return "A" else return "N" end 351 | end 352 | end 353 | end 354 | else 355 | if c < 8544 
then 356 | if c < 8539 then 357 | if c < 8533 then return "A" else return "N" end 358 | else 359 | if c < 8543 then return "A" else return "N" end 360 | end 361 | else 362 | if c < 8560 then 363 | if c < 8556 then return "A" else return "N" end 364 | else 365 | if c < 8570 then return "A" else return "N" end 366 | end 367 | end 368 | end 369 | end 370 | else 371 | if c < 8711 then 372 | if c < 8660 then 373 | if c < 8632 then 374 | if c < 8592 then 375 | if c < 8586 then return "A" else return "N" end 376 | else 377 | if c < 8602 then return "A" else return "N" end 378 | end 379 | else 380 | if c < 8658 then 381 | if c < 8634 then return "A" else return "N" end 382 | else 383 | if c < 8659 then return "A" else return "N" end 384 | end 385 | end 386 | else 387 | if c < 8704 then 388 | if c < 8679 then 389 | if c < 8661 then return "A" else return "N" end 390 | else 391 | if c < 8680 then return "A" else return "N" end 392 | end 393 | else 394 | if c < 8706 then 395 | if c < 8705 then return "A" else return "N" end 396 | else 397 | if c < 8708 then return "A" else return "N" end 398 | end 399 | end 400 | end 401 | else 402 | if c < 8725 then 403 | if c < 8719 then 404 | if c < 8715 then 405 | if c < 8713 then return "A" else return "N" end 406 | else 407 | if c < 8716 then return "A" else return "N" end 408 | end 409 | else 410 | if c < 8721 then 411 | if c < 8720 then return "A" else return "N" end 412 | else 413 | if c < 8722 then return "A" else return "N" end 414 | end 415 | end 416 | else 417 | if c < 8733 then 418 | if c < 8730 then 419 | if c < 8726 then return "A" else return "N" end 420 | else 421 | if c < 8731 then return "A" else return "N" end 422 | end 423 | else 424 | if c < 8739 then 425 | if c < 8737 then return "A" else return "N" end 426 | else 427 | if c < 8740 then return "A" else return "N" end 428 | end 429 | end 430 | end 431 | end 432 | end 433 | end 434 | else 435 | if c < 9654 then 436 | if c < 8869 then 437 | if c < 8800 then 438 | if c < 
8764 then 439 | if c < 8750 then 440 | if c < 8743 then 441 | if c < 8742 then return "A" else return "N" end 442 | else 443 | if c < 8749 then return "A" else return "N" end 444 | end 445 | else 446 | if c < 8756 then 447 | if c < 8751 then return "A" else return "N" end 448 | else 449 | if c < 8760 then return "A" else return "N" end 450 | end 451 | end 452 | else 453 | if c < 8780 then 454 | if c < 8776 then 455 | if c < 8766 then return "A" else return "N" end 456 | else 457 | if c < 8777 then return "A" else return "N" end 458 | end 459 | else 460 | if c < 8786 then 461 | if c < 8781 then return "A" else return "N" end 462 | else 463 | if c < 8787 then return "A" else return "N" end 464 | end 465 | end 466 | end 467 | else 468 | if c < 8834 then 469 | if c < 8810 then 470 | if c < 8804 then 471 | if c < 8802 then return "A" else return "N" end 472 | else 473 | if c < 8808 then return "A" else return "N" end 474 | end 475 | else 476 | if c < 8814 then 477 | if c < 8812 then return "A" else return "N" end 478 | else 479 | if c < 8816 then return "A" else return "N" end 480 | end 481 | end 482 | else 483 | if c < 8853 then 484 | if c < 8838 then 485 | if c < 8836 then return "A" else return "N" end 486 | else 487 | if c < 8840 then return "A" else return "N" end 488 | end 489 | else 490 | if c < 8857 then 491 | if c < 8854 then return "A" else return "N" end 492 | else 493 | if c < 8858 then return "A" else return "N" end 494 | end 495 | end 496 | end 497 | end 498 | else 499 | if c < 9312 then 500 | if c < 9001 then 501 | if c < 8978 then 502 | if c < 8895 then 503 | if c < 8870 then return "A" else return "N" end 504 | else 505 | if c < 8896 then return "A" else return "N" end 506 | end 507 | else 508 | if c < 8986 then 509 | if c < 8979 then return "A" else return "N" end 510 | else 511 | if c < 8988 then return "W" else return "N" end 512 | end 513 | end 514 | else 515 | if c < 9200 then 516 | if c < 9193 then 517 | if c < 9003 then return "W" else return "N" 
end 518 | else 519 | if c < 9197 then return "W" else return "N" end 520 | end 521 | else 522 | if c < 9203 then 523 | if c < 9201 then return "W" else return "N" end 524 | else 525 | if c < 9204 then return "W" else return "N" end 526 | end 527 | end 528 | end 529 | else 530 | if c < 9618 then 531 | if c < 9552 then 532 | if c < 9451 then 533 | if c < 9450 then return "A" else return "N" end 534 | else 535 | if c < 9548 then return "A" else return "N" end 536 | end 537 | else 538 | if c < 9600 then 539 | if c < 9588 then return "A" else return "N" end 540 | else 541 | if c < 9616 then return "A" else return "N" end 542 | end 543 | end 544 | else 545 | if c < 9635 then 546 | if c < 9632 then 547 | if c < 9622 then return "A" else return "N" end 548 | else 549 | if c < 9634 then return "A" else return "N" end 550 | end 551 | else 552 | if c < 9650 then 553 | if c < 9642 then return "A" else return "N" end 554 | else 555 | if c < 9652 then return "A" else return "N" end 556 | end 557 | end 558 | end 559 | end 560 | end 561 | else 562 | if c < 9792 then 563 | if c < 9725 then 564 | if c < 9675 then 565 | if c < 9664 then 566 | if c < 9660 then 567 | if c < 9656 then return "A" else return "N" end 568 | else 569 | if c < 9662 then return "A" else return "N" end 570 | end 571 | else 572 | if c < 9670 then 573 | if c < 9666 then return "A" else return "N" end 574 | else 575 | if c < 9673 then return "A" else return "N" end 576 | end 577 | end 578 | else 579 | if c < 9698 then 580 | if c < 9678 then 581 | if c < 9676 then return "A" else return "N" end 582 | else 583 | if c < 9682 then return "A" else return "N" end 584 | end 585 | else 586 | if c < 9711 then 587 | if c < 9702 then return "A" else return "N" end 588 | else 589 | if c < 9712 then return "A" else return "N" end 590 | end 591 | end 592 | end 593 | else 594 | if c < 9748 then 595 | if c < 9737 then 596 | if c < 9733 then 597 | if c < 9727 then return "W" else return "N" end 598 | else 599 | if c < 9735 then 
return "A" else return "N" end 600 | end 601 | else 602 | if c < 9742 then 603 | if c < 9738 then return "A" else return "N" end 604 | else 605 | if c < 9744 then return "A" else return "N" end 606 | end 607 | end 608 | else 609 | if c < 9758 then 610 | if c < 9756 then 611 | if c < 9750 then return "W" else return "N" end 612 | else 613 | if c < 9757 then return "A" else return "N" end 614 | end 615 | else 616 | if c < 9776 then 617 | if c < 9759 then return "A" else return "N" end 618 | else 619 | if c < 9784 then return "W" else return "N" end 620 | end 621 | end 622 | end 623 | end 624 | else 625 | if c < 9855 then 626 | if c < 9827 then 627 | if c < 9800 then 628 | if c < 9794 then 629 | if c < 9793 then return "A" else return "N" end 630 | else 631 | if c < 9795 then return "A" else return "N" end 632 | end 633 | else 634 | if c < 9824 then 635 | if c < 9812 then return "W" else return "N" end 636 | else 637 | if c < 9826 then return "A" else return "N" end 638 | end 639 | end 640 | else 641 | if c < 9836 then 642 | if c < 9831 then 643 | if c < 9830 then return "A" else return "N" end 644 | else 645 | if c < 9835 then return "A" else return "N" end 646 | end 647 | else 648 | if c < 9839 then 649 | if c < 9838 then return "A" else return "N" end 650 | else 651 | if c < 9840 then return "A" else return "N" end 652 | end 653 | end 654 | end 655 | else 656 | if c < 9889 then 657 | if c < 9875 then 658 | if c < 9866 then 659 | if c < 9856 then return "W" else return "N" end 660 | else 661 | if c < 9872 then return "W" else return "N" end 662 | end 663 | else 664 | if c < 9886 then 665 | if c < 9876 then return "W" else return "N" end 666 | else 667 | if c < 9888 then return "A" else return "N" end 668 | end 669 | end 670 | else 671 | if c < 9917 then 672 | if c < 9898 then 673 | if c < 9890 then return "W" else return "N" end 674 | else 675 | if c < 9900 then return "W" else return "N" end 676 | end 677 | else 678 | if c < 9920 then 679 | if c < 9919 then return 
"W" else return "A" end 680 | else 681 | if c < 9924 then return "N" else return "W" end 682 | end 683 | end 684 | end 685 | end 686 | end 687 | end 688 | end 689 | else 690 | if c < 110581 then 691 | if c < 12351 then 692 | if c < 10067 then 693 | if c < 9979 then 694 | if c < 9960 then 695 | if c < 9941 then 696 | if c < 9935 then 697 | if c < 9934 then return "A" else return "W" end 698 | else 699 | if c < 9940 then return "A" else return "W" end 700 | end 701 | else 702 | if c < 9955 then 703 | if c < 9954 then return "A" else return "N" end 704 | else 705 | if c < 9956 then return "A" else return "N" end 706 | end 707 | end 708 | else 709 | if c < 9972 then 710 | if c < 9963 then 711 | if c < 9962 then return "A" else return "W" end 712 | else 713 | if c < 9970 then return "A" else return "W" end 714 | end 715 | else 716 | if c < 9974 then 717 | if c < 9973 then return "A" else return "W" end 718 | else 719 | if c < 9978 then return "A" else return "W" end 720 | end 721 | end 722 | end 723 | else 724 | if c < 10024 then 725 | if c < 9989 then 726 | if c < 9982 then 727 | if c < 9981 then return "A" else return "W" end 728 | else 729 | if c < 9984 then return "A" else return "N" end 730 | end 731 | else 732 | if c < 9994 then 733 | if c < 9990 then return "W" else return "N" end 734 | else 735 | if c < 9996 then return "W" else return "N" end 736 | end 737 | end 738 | else 739 | if c < 10060 then 740 | if c < 10045 then 741 | if c < 10025 then return "W" else return "N" end 742 | else 743 | if c < 10046 then return "A" else return "N" end 744 | end 745 | else 746 | if c < 10062 then 747 | if c < 10061 then return "W" else return "N" end 748 | else 749 | if c < 10063 then return "W" else return "N" end 750 | end 751 | end 752 | end 753 | end 754 | else 755 | if c < 11035 then 756 | if c < 10160 then 757 | if c < 10102 then 758 | if c < 10071 then 759 | if c < 10070 then return "W" else return "N" end 760 | else 761 | if c < 10072 then return "W" else return "N" 
end 762 | end 763 | else 764 | if c < 10133 then 765 | if c < 10112 then return "A" else return "N" end 766 | else 767 | if c < 10136 then return "W" else return "N" end 768 | end 769 | end 770 | else 771 | if c < 10214 then 772 | if c < 10175 then 773 | if c < 10161 then return "W" else return "N" end 774 | else 775 | if c < 10176 then return "W" else return "N" end 776 | end 777 | else 778 | if c < 10629 then 779 | if c < 10222 then return "Na" else return "N" end 780 | else 781 | if c < 10631 then return "Na" else return "N" end 782 | end 783 | end 784 | end 785 | else 786 | if c < 11930 then 787 | if c < 11093 then 788 | if c < 11088 then 789 | if c < 11037 then return "W" else return "N" end 790 | else 791 | if c < 11089 then return "W" else return "N" end 792 | end 793 | else 794 | if c < 11098 then 795 | if c < 11094 then return "W" else return "A" end 796 | else 797 | if c < 11904 then return "N" else return "W" end 798 | end 799 | end 800 | else 801 | if c < 12246 then 802 | if c < 12020 then 803 | if c < 11931 then return "N" else return "W" end 804 | else 805 | if c < 12032 then return "N" else return "W" end 806 | end 807 | else 808 | if c < 12288 then 809 | if c < 12272 then return "N" else return "W" end 810 | else 811 | if c < 12289 then return "F" else return "W" end 812 | end 813 | end 814 | end 815 | end 816 | end 817 | else 818 | if c < 65127 then 819 | if c < 42125 then 820 | if c < 12687 then 821 | if c < 12544 then 822 | if c < 12439 then 823 | if c < 12353 then return "N" else return "W" end 824 | else 825 | if c < 12441 then return "N" else return "W" end 826 | end 827 | else 828 | if c < 12592 then 829 | if c < 12549 then return "N" else return "W" end 830 | else 831 | if c < 12593 then return "N" else return "W" end 832 | end 833 | end 834 | else 835 | if c < 12831 then 836 | if c < 12774 then 837 | if c < 12688 then return "N" else return "W" end 838 | else 839 | if c < 12783 then return "N" else return "W" end 840 | end 841 | else 842 | 
if c < 12872 then 843 | if c < 12832 then return "N" else return "W" end 844 | else 845 | if c < 12880 then return "A" else return "W" end 846 | end 847 | end 848 | end 849 | else 850 | if c < 63744 then 851 | if c < 43389 then 852 | if c < 42183 then 853 | if c < 42128 then return "N" else return "W" end 854 | else 855 | if c < 43360 then return "N" else return "W" end 856 | end 857 | else 858 | if c < 55204 then 859 | if c < 44032 then return "N" else return "W" end 860 | else 861 | if c < 57344 then return "N" else return "A" end 862 | end 863 | end 864 | else 865 | if c < 65050 then 866 | if c < 65024 then 867 | if c < 64256 then return "W" else return "N" end 868 | else 869 | if c < 65040 then return "A" else return "W" end 870 | end 871 | else 872 | if c < 65107 then 873 | if c < 65072 then return "N" else return "W" end 874 | else 875 | if c < 65108 then return "N" else return "W" end 876 | end 877 | end 878 | end 879 | end 880 | else 881 | if c < 65512 then 882 | if c < 65482 then 883 | if c < 65377 then 884 | if c < 65132 then 885 | if c < 65128 then return "N" else return "W" end 886 | else 887 | if c < 65281 then return "N" else return "F" end 888 | end 889 | else 890 | if c < 65474 then 891 | if c < 65471 then return "H" else return "N" end 892 | else 893 | if c < 65480 then return "H" else return "N" end 894 | end 895 | end 896 | else 897 | if c < 65498 then 898 | if c < 65490 then 899 | if c < 65488 then return "H" else return "N" end 900 | else 901 | if c < 65496 then return "H" else return "N" end 902 | end 903 | else 904 | if c < 65504 then 905 | if c < 65501 then return "H" else return "N" end 906 | else 907 | if c < 65511 then return "F" else return "N" end 908 | end 909 | end 910 | end 911 | else 912 | if c < 94208 then 913 | if c < 94176 then 914 | if c < 65533 then 915 | if c < 65519 then return "H" else return "N" end 916 | else 917 | if c < 65534 then return "A" else return "N" end 918 | end 919 | else 920 | if c < 94192 then 921 | if c < 
94181 then return "W" else return "N" end 922 | else 923 | if c < 94194 then return "W" else return "N" end 924 | end 925 | end 926 | else 927 | if c < 101631 then 928 | if c < 100352 then 929 | if c < 100344 then return "W" else return "N" end 930 | else 931 | if c < 101590 then return "W" else return "N" end 932 | end 933 | else 934 | if c < 110576 then 935 | if c < 101641 then return "W" else return "N" end 936 | else 937 | if c < 110580 then return "W" else return "N" end 938 | end 939 | end 940 | end 941 | end 942 | end 943 | end 944 | else 945 | if c < 128064 then 946 | if c < 127375 then 947 | if c < 119552 then 948 | if c < 110928 then 949 | if c < 110592 then 950 | if c < 110589 then 951 | if c < 110588 then return "W" else return "N" end 952 | else 953 | if c < 110591 then return "W" else return "N" end 954 | end 955 | else 956 | if c < 110898 then 957 | if c < 110883 then return "W" else return "N" end 958 | else 959 | if c < 110899 then return "W" else return "N" end 960 | end 961 | end 962 | else 963 | if c < 110948 then 964 | if c < 110933 then 965 | if c < 110931 then return "W" else return "N" end 966 | else 967 | if c < 110934 then return "W" else return "N" end 968 | end 969 | else 970 | if c < 110960 then 971 | if c < 110952 then return "W" else return "N" end 972 | else 973 | if c < 111356 then return "W" else return "N" end 974 | end 975 | end 976 | end 977 | else 978 | if c < 127232 then 979 | if c < 126980 then 980 | if c < 119648 then 981 | if c < 119639 then return "W" else return "N" end 982 | else 983 | if c < 119671 then return "W" else return "N" end 984 | end 985 | else 986 | if c < 127183 then 987 | if c < 126981 then return "W" else return "N" end 988 | else 989 | if c < 127184 then return "W" else return "N" end 990 | end 991 | end 992 | else 993 | if c < 127280 then 994 | if c < 127248 then 995 | if c < 127243 then return "A" else return "N" end 996 | else 997 | if c < 127278 then return "A" else return "N" end 998 | end 999 | else 
1000 | if c < 127344 then 1001 | if c < 127338 then return "A" else return "N" end 1002 | else 1003 | if c < 127374 then return "A" else return "W" end 1004 | end 1005 | end 1006 | end 1007 | end 1008 | else 1009 | if c < 127789 then 1010 | if c < 127552 then 1011 | if c < 127488 then 1012 | if c < 127387 then 1013 | if c < 127377 then return "A" else return "W" end 1014 | else 1015 | if c < 127405 then return "A" else return "N" end 1016 | end 1017 | else 1018 | if c < 127504 then 1019 | if c < 127491 then return "W" else return "N" end 1020 | else 1021 | if c < 127548 then return "W" else return "N" end 1022 | end 1023 | end 1024 | else 1025 | if c < 127584 then 1026 | if c < 127568 then 1027 | if c < 127561 then return "W" else return "N" end 1028 | else 1029 | if c < 127570 then return "W" else return "N" end 1030 | end 1031 | else 1032 | if c < 127744 then 1033 | if c < 127590 then return "W" else return "N" end 1034 | else 1035 | if c < 127777 then return "W" else return "N" end 1036 | end 1037 | end 1038 | end 1039 | else 1040 | if c < 127951 then 1041 | if c < 127870 then 1042 | if c < 127799 then 1043 | if c < 127798 then return "W" else return "N" end 1044 | else 1045 | if c < 127869 then return "W" else return "N" end 1046 | end 1047 | else 1048 | if c < 127904 then 1049 | if c < 127892 then return "W" else return "N" end 1050 | else 1051 | if c < 127947 then return "W" else return "N" end 1052 | end 1053 | end 1054 | else 1055 | if c < 127988 then 1056 | if c < 127968 then 1057 | if c < 127956 then return "W" else return "N" end 1058 | else 1059 | if c < 127985 then return "W" else return "N" end 1060 | end 1061 | else 1062 | if c < 127992 then 1063 | if c < 127989 then return "W" else return "N" end 1064 | else 1065 | if c < 128063 then return "W" else return "N" end 1066 | end 1067 | end 1068 | end 1069 | end 1070 | end 1071 | else 1072 | if c < 128992 then 1073 | if c < 128507 then 1074 | if c < 128336 then 1075 | if c < 128255 then 1076 | if c < 
128066 then 1077 | if c < 128065 then return "W" else return "N" end 1078 | else 1079 | if c < 128253 then return "W" else return "N" end 1080 | end 1081 | else 1082 | if c < 128331 then 1083 | if c < 128318 then return "W" else return "N" end 1084 | else 1085 | if c < 128335 then return "W" else return "N" end 1086 | end 1087 | end 1088 | else 1089 | if c < 128405 then 1090 | if c < 128378 then 1091 | if c < 128360 then return "W" else return "N" end 1092 | else 1093 | if c < 128379 then return "W" else return "N" end 1094 | end 1095 | else 1096 | if c < 128420 then 1097 | if c < 128407 then return "W" else return "N" end 1098 | else 1099 | if c < 128421 then return "W" else return "N" end 1100 | end 1101 | end 1102 | end 1103 | else 1104 | if c < 128725 then 1105 | if c < 128716 then 1106 | if c < 128640 then 1107 | if c < 128592 then return "W" else return "N" end 1108 | else 1109 | if c < 128710 then return "W" else return "N" end 1110 | end 1111 | else 1112 | if c < 128720 then 1113 | if c < 128717 then return "W" else return "N" end 1114 | else 1115 | if c < 128723 then return "W" else return "N" end 1116 | end 1117 | end 1118 | else 1119 | if c < 128747 then 1120 | if c < 128732 then 1121 | if c < 128728 then return "W" else return "N" end 1122 | else 1123 | if c < 128736 then return "W" else return "N" end 1124 | end 1125 | else 1126 | if c < 128756 then 1127 | if c < 128749 then return "W" else return "N" end 1128 | else 1129 | if c < 128765 then return "W" else return "N" end 1130 | end 1131 | end 1132 | end 1133 | end 1134 | else 1135 | if c < 129742 then 1136 | if c < 129351 then 1137 | if c < 129292 then 1138 | if c < 129008 then 1139 | if c < 129004 then return "W" else return "N" end 1140 | else 1141 | if c < 129009 then return "W" else return "N" end 1142 | end 1143 | else 1144 | if c < 129340 then 1145 | if c < 129339 then return "W" else return "N" end 1146 | else 1147 | if c < 129350 then return "W" else return "N" end 1148 | end 1149 | end 1150 
| else 1151 | if c < 129664 then 1152 | if c < 129648 then 1153 | if c < 129536 then return "W" else return "N" end 1154 | else 1155 | if c < 129661 then return "W" else return "N" end 1156 | end 1157 | else 1158 | if c < 129679 then 1159 | if c < 129674 then return "W" else return "N" end 1160 | else 1161 | if c < 129735 then return "W" else return "N" end 1162 | end 1163 | end 1164 | end 1165 | else 1166 | if c < 196608 then 1167 | if c < 129776 then 1168 | if c < 129759 then 1169 | if c < 129757 then return "W" else return "N" end 1170 | else 1171 | if c < 129770 then return "W" else return "N" end 1172 | end 1173 | else 1174 | if c < 131072 then 1175 | if c < 129785 then return "W" else return "N" end 1176 | else 1177 | if c < 196606 then return "W" else return "N" end 1178 | end 1179 | end 1180 | else 1181 | if c < 983040 then 1182 | if c < 917760 then 1183 | if c < 262142 then return "W" else return "N" end 1184 | else 1185 | if c < 918000 then return "A" else return "N" end 1186 | end 1187 | else 1188 | if c < 1048576 then 1189 | if c < 1048574 then return "A" else return "N" end 1190 | else 1191 | if c < 1114110 then return "A" else return "N" end 1192 | end 1193 | end 1194 | end 1195 | end 1196 | end 1197 | end 1198 | end 1199 | end 1200 | end 1201 | -------------------------------------------------------------------------------- /dromozoa/ucd/is_white_space.lua: -------------------------------------------------------------------------------- 1 | return function (c) 2 | c = c + 0 3 | if c < 8232 then 4 | if c < 161 then 5 | if c < 33 then 6 | if c < 14 then 7 | return c >= 9 8 | else 9 | return c >= 32 10 | end 11 | else 12 | if c < 134 then 13 | return c >= 133 14 | else 15 | return c >= 160 16 | end 17 | end 18 | else 19 | if c < 8192 then 20 | if c < 5761 then 21 | return c >= 5760 22 | else 23 | return false 24 | end 25 | else 26 | return c < 8203 27 | end 28 | end 29 | else 30 | if c < 8287 then 31 | if c < 8239 then 32 | return c < 8234 33 | else 34 
| return c < 8240 35 | end 36 | else 37 | if c < 12288 then 38 | return c < 8288 39 | else 40 | return c < 12289 41 | end 42 | end 43 | end 44 | end 45 | -------------------------------------------------------------------------------- /dromozoa/utf16.lua: -------------------------------------------------------------------------------- 1 | -- Copyright (C) 2018,2023 Tomoyuki Fujimori 2 | -- 3 | -- This file is part of dromozoa-utf8. 4 | -- 5 | -- dromozoa-utf8 is free software: you can redistribute it and/or modify 6 | -- it under the terms of the GNU General Public License as published by 7 | -- the Free Software Foundation, either version 3 of the License, or 8 | -- (at your option) any later version. 9 | -- 10 | -- dromozoa-utf8 is distributed in the hope that it will be useful, 11 | -- but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | -- GNU General Public License for more details. 14 | -- 15 | -- You should have received a copy of the GNU General Public License 16 | -- along with dromozoa-utf8. If not, see . 17 | 18 | return { 19 | decode_surrogate_pair = require "dromozoa.utf16.decode_surrogate_pair"; 20 | } 21 | -------------------------------------------------------------------------------- /dromozoa/utf16/decode_surrogate_pair.lua: -------------------------------------------------------------------------------- 1 | -- Copyright (C) 2017,2023 Tomoyuki Fujimori 2 | -- 3 | -- This file is part of dromozoa-utf8. 4 | -- 5 | -- dromozoa-utf8 is free software: you can redistribute it and/or modify 6 | -- it under the terms of the GNU General Public License as published by 7 | -- the Free Software Foundation, either version 3 of the License, or 8 | -- (at your option) any later version. 
local error = error

-- Boundaries of the UTF-16 surrogate areas.
local HIGH_FIRST, HIGH_LAST = 0xD800, 0xDBFF
local LOW_FIRST, LOW_LAST = 0xDC00, 0xDFFF

-- Combines a UTF-16 surrogate pair into the code point it encodes.
--
-- a: high (leading) surrogate, must lie in 0xD800..0xDBFF
-- b: low (trailing) surrogate, must lie in 0xDC00..0xDFFF
--
-- Returns the supplementary-plane code point (0x010000..0x10FFFF).
-- Raises an error when either argument is outside its surrogate area; the
-- first argument is validated before the second.
return function (a, b)
  if a < HIGH_FIRST or HIGH_LAST < a then
    error("bad argument #1 (value out of high surrogate area)")
  end
  if b < LOW_FIRST or LOW_LAST < b then
    error("bad argument #2 (value out of low surrogate area)")
  end

  -- Each surrogate contributes 10 bits; the result is offset by 0x010000.
  local high_bits = (a - HIGH_FIRST) * 0x0400
  local low_bits = b - LOW_FIRST
  return high_bits + low_bits + 0x010000
end
17 | 18 | local utf8 = utf8 19 | 20 | if utf8 then 21 | return utf8 22 | else 23 | return require "dromozoa.utf8.pure" 24 | end 25 | -------------------------------------------------------------------------------- /dromozoa/utf8/check_integer.lua: -------------------------------------------------------------------------------- 1 | -- Copyright (C) 2017,2023 Tomoyuki Fujimori 2 | -- 3 | -- This file is part of dromozoa-utf8. 4 | -- 5 | -- dromozoa-utf8 is free software: you can redistribute it and/or modify 6 | -- it under the terms of the GNU General Public License as published by 7 | -- the Free Software Foundation, either version 3 of the License, or 8 | -- (at your option) any later version. 9 | -- 10 | -- dromozoa-utf8 is distributed in the hope that it will be useful, 11 | -- but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | -- GNU General Public License for more details. 14 | -- 15 | -- You should have received a copy of the GNU General Public License 16 | -- along with dromozoa-utf8. If not, see . 17 | 18 | local error = error 19 | local tonumber = tonumber 20 | local type = type 21 | 22 | return function (v, i) 23 | local t = type(v) 24 | if t ~= "number" then 25 | if t == "string" then 26 | v = tonumber(v) 27 | if not v then 28 | error("bad argument #" .. i .. " (number expected, got " .. t .. ")") 29 | end 30 | else 31 | error("bad argument #" .. i .. " (number expected, got " .. t .. ")") 32 | end 33 | end 34 | if v % 1 ~= 0 then 35 | error("bad argument #" .. i .. " (number has no integer representation)") 36 | end 37 | return v 38 | end 39 | -------------------------------------------------------------------------------- /dromozoa/utf8/check_string.lua: -------------------------------------------------------------------------------- 1 | -- Copyright (C) 2017,2023 Tomoyuki Fujimori 2 | -- 3 | -- This file is part of dromozoa-utf8. 
local error = error
local tostring = tostring
local type = type

-- Validates that argument number `i` can be used as a string.
--
-- v: the value to check
-- i: argument position, used only in the error message
--
-- Returns `v` unchanged when it is a string, or `tostring(v)` when it is a
-- number. Any other type raises a "bad argument" error naming the actual type.
return function (v, i)
  local kind = type(v)
  if kind == "string" then
    return v
  end
  if kind == "number" then
    return tostring(v)
  end
  error("bad argument #" .. i .. " (string expected, got " .. kind .. ")")
end
local check_integer = require "dromozoa.utf8.check_integer"
local check_string = require "dromozoa.utf8.check_string"
local counter_table = require "dromozoa.utf8.count_table"

-- Cached like in the sibling modules instead of going through the global
-- table on every failure.
local error = error
local byte = string.byte

local S = counter_table.S
local E = counter_table.E

-- Counts the UTF-8 characters that start in s[i..j].
--
-- s: string (numbers are converted by check_string)
-- i: first byte position, default 1; negative values count from the end
-- j: last byte position, default #s; negative values count from the end
--
-- Returns the number of characters, or `nil, position` where `position` is
-- the byte that drove the state machine into the error state E.
-- Raises an error when i or j is out of bounds.
--
-- The bytes are fed through the state tables from count_table: state S means
-- "at a character boundary", so a character is counted each time a byte is
-- about to be read while the machine is in S. The state reached after the
-- last byte is not checked, so a sequence that is still incomplete at j is
-- not reported as invalid.
return function (s, i, j)
  s = check_string(s, 1)

  local n = #s
  local m = n + 1

  if i == nil then
    i = 1
  else
    i = check_integer(i, 2)
    if i < 0 then
      i = i + m
    end
  end

  if j == nil then
    j = n
  else
    j = check_integer(j, 3)
    if j < 0 then
      j = j + m
    end
  end

  if i < 1 or m < i then
    error "bad argument #2 (initial position out of bounds)"
  end
  if n < j then
    error "bad argument #3 (final position out of bounds)"
  end
  if j < i then
    return 0
  end

  local s1 = S
  local count = 0

  -- Main loop: consume four bytes per iteration. `last` is the position of
  -- the fourth byte of the current group.
  for last = i + 3, j, 4 do
    if s1 == S then count = count + 1 end
    local a, b, c, d = byte(s, last - 3, last)
    local s2 = s1[a]
    local s3 = s2[b]
    local s4 = s3[c]
    s1 = s4[d]
    if s1 == E then
      -- E is absorbing, so the first state equal to E identifies the byte
      -- that caused the failure.
      if s2 == E then return nil, last - 3 end
      if s3 == E then return nil, last - 2 end
      if s4 == E then return nil, last - 1 end
      return nil, last
    end
    if s2 == S then count = count + 1 end
    if s3 == S then count = count + 1 end
    if s4 == S then count = count + 1 end
  end

  -- Tail: the one to three bytes left over after the groups of four.
  local stop = j + 1
  local tail = stop - (stop - i) % 4
  if tail < stop then
    if s1 == S then count = count + 1 end
    local a, b, c = byte(s, tail, j)
    if c then
      local s2 = s1[a]
      local s3 = s2[b]
      local s4 = s3[c]
      if s4 == E then
        if s2 == E then return nil, j - 2 end
        if s3 == E then return nil, j - 1 end
        return nil, j
      end
      if s2 == S then count = count + 1 end
      if s3 == S then count = count + 1 end
    elseif b then
      local s2 = s1[a]
      local s3 = s2[b]
      if s3 == E then
        if s2 == E then return nil, j - 1 end
        return nil, j
      end
      if s2 == S then count = count + 1 end
    else
      local s2 = s1[a]
      if s2 == E then return nil, j end
    end
  end

  return count
end
-- State tables for the UTF-8 validating state machine used by count.lua.
-- Every state is a table indexed by byte value (0x00..0xFF) whose entries are
-- the next state. The names give the number of bytes still expected (B = one,
-- C = two, D = three) and the range accepted for the next byte.

local A = {}        -- start state: at a character boundary
local B_80_BF = {}  -- one continuation byte left
local C_80_BF = {}  -- two left, next in 0x80..0xBF
local C_80_9F = {}  -- two left, next in 0x80..0x9F (after lead byte 0xED)
local C_A0_BF = {}  -- two left, next in 0xA0..0xBF (after lead byte 0xE0)
local D_80_BF = {}  -- three left, next in 0x80..0xBF
local D_80_8F = {}  -- three left, next in 0x80..0x8F (after lead byte 0xF4)
local D_90_BF = {}  -- three left, next in 0x90..0xBF (after lead byte 0xF0)
local E = {}        -- error state; every byte leads back to E

-- Sets t[first..last] to `state`.
local function fill(t, first, last, state)
  for i = first, last do
    t[i] = state
  end
end

-- Start by sending every byte of every state to E, then open up the
-- transitions that are actually valid.
local states = { A, B_80_BF, C_80_BF, C_80_9F, C_A0_BF, D_80_BF, D_80_8F, D_90_BF, E }
for _, state in ipairs(states) do
  fill(state, 0x00, 0xFF, E)
end

-- Lead bytes.
fill(A, 0x00, 0x7F, A)
fill(A, 0xC2, 0xDF, B_80_BF)
fill(A, 0xE0, 0xEF, C_80_BF)
A[0xE0] = C_A0_BF
A[0xED] = C_80_9F
fill(A, 0xF0, 0xF4, D_80_BF)
A[0xF0] = D_90_BF
A[0xF4] = D_80_8F

-- Continuation bytes.
fill(B_80_BF, 0x80, 0xBF, A)
fill(C_80_BF, 0x80, 0xBF, B_80_BF)
fill(C_80_9F, 0x80, 0x9F, B_80_BF)
fill(C_A0_BF, 0xA0, 0xBF, B_80_BF)
fill(D_80_BF, 0x80, 0xBF, C_80_BF)
fill(D_80_8F, 0x80, 0x8F, C_80_BF)
fill(D_90_BF, 0x90, 0xBF, C_80_BF)

return {
  S = A;
  E = E;
}
local check_integer = require "dromozoa.utf8.check_integer"
local check_string = require "dromozoa.utf8.check_string"
local decode_table = require "dromozoa.utf8.decode_table"

local error = error
local byte = string.byte
local unpack = table.unpack or unpack

local A = decode_table.A
local B = decode_table.B
local TA = decode_table.TA
local TB = decode_table.TB

-- Normalizes an optional position argument: nil becomes `fallback`, anything
-- else is validated as integer argument number `argn`, and negative values
-- are made relative to the end of the string by adding `wrap` (#s + 1).
local function resolve(pos, fallback, argn, wrap)
  if pos == nil then
    return fallback
  end
  pos = check_integer(pos, argn)
  if pos < 0 then
    return pos + wrap
  end
  return pos
end

-- Returns the code points of all characters that start in s[i..j].
--
-- s: string (numbers are converted by check_string)
-- i: first byte position, default 1; negative values count from the end
-- j: last byte position, default i; negative values count from the end
--
-- Raises "out of bounds" errors for i < 1 or j > #s, and "invalid UTF-8 code"
-- for a byte that cannot start a character. Continuation bytes are looked up
-- in the tables from decode_table; an entry that is false or missing makes
-- the addition below fail with a Lua arithmetic error.
return function (s, i, j)
  s = check_string(s, 1)

  local n = #s
  i = resolve(i, 1, 2, n + 1)
  j = resolve(j, i, 3, n + 1)

  if i < 1 then
    error "bad argument #2 (out of bounds)"
  end
  if n < j then
    error "bad argument #3 (out of bounds)"
  end
  if j < i then
    return
  end

  if i == j then
    -- Fast path: a single character, no intermediate tables.
    local a, b, c, d = byte(s, i, i + 3)
    local v = A[a]
    if not v then
      if a then
        error "invalid UTF-8 code"
      end
      return
    end
    if a <= 0x7F then
      return v
    elseif a <= 0xDF then
      local b = TA[b]
      return v + b
    elseif a <= 0xEF then
      local b = B[a][b]
      local c = TA[c]
      return v + b + c
    else
      local b = B[a][b]
      local c = TB[c]
      local d = TA[d]
      return v + b + c + d
    end
  end

  -- General path: read three extra bytes so that a character starting at j
  -- can be decoded completely.
  local bytes = { byte(s, i, j + 3) }
  local last = j - i + 1
  local pos = 1
  local codes = {}
  local count = 0
  while pos <= last do
    count = count + 1
    local a = bytes[pos]
    local v = A[a]
    if not v then
      if a then
        error "invalid UTF-8 code"
      end
    elseif a <= 0x7F then
      codes[count] = v
      pos = pos + 1
    elseif a <= 0xDF then
      codes[count] = v + TA[bytes[pos + 1]]
      pos = pos + 2
    elseif a <= 0xEF then
      codes[count] = v + B[a][bytes[pos + 1]] + TA[bytes[pos + 2]]
      pos = pos + 3
    else
      codes[count] = v + B[a][bytes[pos + 1]] + TB[bytes[pos + 2]] + TA[bytes[pos + 3]]
      pos = pos + 4
    end
  end
  return unpack(codes, 1, count)
end
local check_string = require "dromozoa.utf8.check_string"
local decode_table = require "dromozoa.utf8.decode_table"

local error = error
local byte = string.byte

local A = decode_table.A
local B = decode_table.B
local TA = decode_table.TA
local TB = decode_table.TB

-- Returns an iterator over the characters of `s`.
--
-- s: string (numbers are converted by check_string)
--
-- Each call of the iterator returns `position, code_point` for the next
-- character and returns nothing once the end of the string is reached.
-- A byte that cannot start a character raises "invalid UTF-8 code".
-- Continuation bytes are looked up in the tables from decode_table; an entry
-- that is false or missing makes the addition fail with a Lua arithmetic
-- error.
return function (s)
  s = check_string(s, 1)

  -- Position of the next character to decode; advanced by the iterator.
  local i = 1
  -- All bytes of the string, fetched once up front.
  local source = { byte(s, i, #s) }

  return function ()
    local j = i
    local a = source[j]
    local v = A[a]
    if v then
      -- Every arm returns, so nothing after this chain can run. The original
      -- had an unreachable error call here, which has been removed.
      if a <= 0x7F then
        i = j + 1
        return j, v
      elseif a <= 0xDF then
        i = j + 2
        return j, v + TA[source[j + 1]]
      elseif a <= 0xEF then
        i = j + 3
        return j, v + B[a][source[j + 1]] + TA[source[j + 2]]
      else
        i = j + 4
        return j, v + B[a][source[j + 1]] + TB[source[j + 2]] + TA[source[j + 3]]
      end
    elseif a then
      error "invalid UTF-8 code"
    end
    -- a == nil: past the last byte, end of iteration.
  end
end
-- Lookup tables for the UTF-8 decoder (decode.lua, decode_each.lua).
--
-- A[lead]  : value contributed by a lead byte, or false for a byte that
--            cannot start a character. Only 0x00..0xF4 are present.
-- B[lead]  : table to use for the byte that follows `lead`, or false.
-- T*_x_y   : value contributed by a continuation byte; entries are false for
--            0x00..0x7F and for continuation bytes outside x..y. Bytes above
--            0xBF have no entry.

-- Builds a continuation-byte table: bytes in first..last map to their low six
-- bits multiplied by `scale`; every other byte up to 0xBF maps to false.
local function continuation(scale, first, last)
  local t = {}
  for i = 0x00, 0xBF do
    if first <= i and i <= last then
      t[i] = i % 0x40 * scale
    else
      t[i] = false
    end
  end
  return t
end

local TA_80_BF = continuation(0x01, 0x80, 0xBF)
local TB_80_BF = continuation(0x40, 0x80, 0xBF)
local TB_A0_BF = continuation(0x40, 0xA0, 0xBF)
local TB_80_9F = continuation(0x40, 0x80, 0x9F)
local TC_80_BF = continuation(0x1000, 0x80, 0xBF)
local TC_80_8F = continuation(0x1000, 0x80, 0x8F)
local TC_90_BF = continuation(0x1000, 0x90, 0xBF)

local A = {}
local B = {}

-- Single-byte characters.
for i = 0x00, 0x7F do
  A[i] = i
  B[i] = false
end

-- Continuation bytes and the lead bytes 0xC0 and 0xC1 cannot start a
-- character.
for i = 0x80, 0xC1 do
  A[i] = false
  B[i] = false
end

-- Two-byte lead bytes.
for i = 0xC2, 0xDF do
  A[i] = i % 0x20 * 0x40
  B[i] = TA_80_BF
end

-- Three-byte lead bytes; 0xE0 and 0xED restrict the second byte.
for i = 0xE0, 0xEF do
  A[i] = i % 0x10 * 0x1000
  B[i] = TB_80_BF
end
B[0xE0] = TB_A0_BF
B[0xED] = TB_80_9F

-- Four-byte lead bytes; 0xF0 and 0xF4 restrict the second byte.
for i = 0xF0, 0xF4 do
  A[i] = i % 0x08 * 0x40000
  B[i] = TC_80_BF
end
B[0xF0] = TC_90_BF
B[0xF4] = TC_80_8F

return {
  A = A;
  B = B;
  TA = TA_80_BF;
  TB = TB_80_BF;
  TC = TC_80_BF;
}
-- Lua 5.3 and later have bitwise operators; use the variant built on them.
if _VERSION >= "Lua 5.3" then
  return require "dromozoa.utf8.encode53"
end

local encode_error = require "dromozoa.utf8.encode_error"
local encode_table = require "dromozoa.utf8.encode_table"

local select = select
local concat = table.concat

local A = encode_table.A
local B = encode_table.B
local C = encode_table.C
local T = encode_table.T

-- Encodes the given code points as UTF-8 and returns the concatenation of
-- the resulting byte sequences (an empty string when called without
-- arguments).
--
-- The lead bytes come from the tables in encode_table (A: one- and two-byte
-- sequences, B: first two bytes of a three-byte sequence, C: first two bytes
-- of a four-byte sequence) and the trailing bytes from T. A value without a
-- table entry, or above 0x10FFFF, is passed to encode_error, which raises a
-- "bad argument" error for that argument position.
return function (...)
  local n = select("#", ...)
  if n == 1 then
    -- Single argument: no intermediate table.
    local a = ... + 0
    local encoded
    if a <= 0x07FF then
      encoded = A[a]
    elseif a <= 0xFFFF then
      local low = a % 0x40
      local lead = B[(a - low) / 0x40]
      if lead then
        encoded = lead .. T[low]
      end
    elseif a <= 0x10FFFF then
      local low = a % 0x40
      local rest = (a - low) / 0x40
      local mid = rest % 0x40
      local lead = C[(rest - mid) / 0x40]
      if lead then
        encoded = lead .. T[mid] .. T[low]
      end
    end
    if encoded then
      return encoded
    end
    encode_error(..., 1)
  else
    -- Several arguments: encode each one in place, then join.
    local parts = {...}
    for i = 1, n do
      local a = parts[i] + 0
      local encoded
      if a <= 0x07FF then
        encoded = A[a]
      elseif a <= 0xFFFF then
        local low = a % 0x40
        local lead = B[(a - low) / 0x40]
        if lead then
          encoded = lead .. T[low]
        end
      elseif a <= 0x10FFFF then
        local low = a % 0x40
        local rest = (a - low) / 0x40
        local mid = rest % 0x40
        local lead = C[(rest - mid) / 0x40]
        if lead then
          encoded = lead .. T[mid] .. T[low]
        end
      end
      if encoded then
        parts[i] = encoded
      else
        encode_error(parts[i], i)
      end
    end
    return concat(parts)
  end
end
local encode_error = require "dromozoa.utf8.encode_error"
local encode_table = require "dromozoa.utf8.encode_table"

local select = select
local concat = table.concat

local A = encode_table.A
local B = encode_table.B
local C = encode_table.C
local T = encode_table.T

-- Variant of encode.lua that splits the code point with bitwise operators.
--
-- Encodes the given code points as UTF-8 and returns the concatenation of
-- the resulting byte sequences (an empty string when called without
-- arguments).
--
-- The lead bytes come from the tables in encode_table (A: one- and two-byte
-- sequences, B: first two bytes of a three-byte sequence, C: first two bytes
-- of a four-byte sequence) and the trailing bytes from T. A value without a
-- table entry, or above 0x10FFFF, is passed to encode_error, which raises a
-- "bad argument" error for that argument position.
return function (...)
  local n = select("#", ...)
  if n == 1 then
    -- Single argument: no intermediate table.
    local a = ... + 0
    local encoded
    if a <= 0x07FF then
      encoded = A[a]
    elseif a <= 0xFFFF then
      local low = a & 0x3F
      local lead = B[a >> 6]
      if lead then
        encoded = lead .. T[low]
      end
    elseif a <= 0x10FFFF then
      local low = a & 0x3F
      local rest = a >> 6
      local mid = rest & 0x3F
      local lead = C[rest >> 6]
      if lead then
        encoded = lead .. T[mid] .. T[low]
      end
    end
    if encoded then
      return encoded
    end
    encode_error(..., 1)
  else
    -- Several arguments: encode each one in place, then join.
    local parts = {...}
    for i = 1, n do
      local a = parts[i] + 0
      local encoded
      if a <= 0x07FF then
        encoded = A[a]
      elseif a <= 0xFFFF then
        local low = a & 0x3F
        local lead = B[a >> 6]
        if lead then
          encoded = lead .. T[low]
        end
      elseif a <= 0x10FFFF then
        local low = a & 0x3F
        local rest = a >> 6
        local mid = rest & 0x3F
        local lead = C[rest >> 6]
        if lead then
          encoded = lead .. T[mid] .. T[low]
        end
      end
      if encoded then
        parts[i] = encoded
      else
        encode_error(parts[i], i)
      end
    end
    return concat(parts)
  end
end
local check_integer = require "dromozoa.utf8.check_integer"

local error = error

-- Raises the error for a value that the encoder could not encode.
--
-- v: the offending argument
-- i: its argument position
--
-- check_integer runs first, so a value that is not an integer is reported by
-- it; anything that passes is reported as out of range. Never returns
-- normally.
return function (v, i)
  check_integer(v, i)
  local message = "bad argument #" .. i .. " (value out of range)"
  error(message)
end
local char = string.char

-- Lookup tables for the UTF-8 encoder (encode.lua, encode53.lua).
--
-- A[c]       : complete encoding of code point c, 0x0000..0x07FF
-- B[c / 64]  : first two bytes of a three-byte sequence, or false where no
--              valid sequence exists (indices below 0x20 and 0x360..0x37F)
-- C[c / 4096]: first two bytes of a four-byte sequence, or false for
--              indices below 0x10
-- T[c % 64]  : a single continuation byte

-- Returns the two-byte string whose first byte is `prefix` plus the bits of
-- `i` above the low six, and whose second byte is a continuation byte
-- carrying the low six bits.
local function two_bytes(prefix, i)
  local low = i % 0x40
  local high = (i - low) / 0x40
  return char(high + prefix, low + 0x80)
end

local A = {}
for i = 0x0000, 0x007F do
  A[i] = char(i)
end
for i = 0x0080, 0x07FF do
  A[i] = two_bytes(0xC0, i)
end

local B = {}
for i = 0x0000, 0x03FF do
  if i < 0x0020 or (0x0360 <= i and i <= 0x037F) then
    B[i] = false
  else
    B[i] = two_bytes(0xE0, i)
  end
end

local C = {}
for i = 0x0000, 0x010F do
  if i < 0x0010 then
    C[i] = false
  else
    C[i] = two_bytes(0xF0, i)
  end
end

local T = {}
for i = 0x0000, 0x003F do
  T[i] = char(i + 0x80)
end

return {
  A = A;
  B = B;
  C = C;
  T = T;
}
local check_integer = require "dromozoa.utf8.check_integer"
local check_string = require "dromozoa.utf8.check_string"
local offset_table = require "dromozoa.utf8.offset_table"

local error = error
local byte = string.byte

-- H[b] is the sequence length announced by byte b when b may start a
-- character and false otherwise; T[b] is true for continuation bytes.
local H = offset_table.H
local T = offset_table.T

-- Pure-Lua counterpart of utf8.offset(s, n, i).
--
-- s: the subject string (validated by check_string).
-- n: which character to locate relative to i (validated by check_integer):
--    0 finds the start of the character containing byte i, a positive n
--    counts characters forward from i, a negative n counts backwards.
-- i: starting byte position; defaults to #s + 1 when n is negative and to 1
--    otherwise. A negative i is taken relative to the end of the string.
--
-- Returns the byte position found, or nothing when there is no such
-- character. Raises an error when i falls outside 1..#s + 1, or when n is
-- non-zero and byte i is a continuation byte.
return function (s, n, i)
  s = check_string(s, 1)
  n = check_integer(n, 2)

  -- One past the last byte; a valid value for i.
  local m = #s + 1

  if i == nil then
    if n < 0 then
      i = m
    else
      i = 1
    end
  else
    i = check_integer(i, 3)
    if i < 0 then
      i = i + m
    end
  end

  if i < 1 or m < i then
    error "bad argument #3 (position out of bounds)"
  end

  if n == 0 then
    -- Look at byte i and at most three bytes before it, nearest first, and
    -- return the position of the first one that can start a character.
    if i == m then
      return i
    elseif i > 3 then
      local j = i - 3
      local a, b, c, d = byte(s, j, i)
      if H[d] then return i end
      if H[c] then return i - 1 end
      if H[b] then return i - 2 end
      if H[a] then return j end
    else
      -- Fewer than four bytes are available; the missing ones are nil and
      -- H[nil] is nil, so those tests simply fail.
      local a, b, c = byte(s, 1, i)
      if H[c] then return 3 end
      if H[b] then return 2 end
      if H[a] then return 1 end
    end
  elseif n < 0 then
    local a = byte(s, i)
    if T[a] then
      error "initial position is a continuation byte"
    end
    i = i - 1
    -- Scan backwards four bytes at a time. The loop variable shadows i, so
    -- the outer i still holds the scan start after the loop.
    for i = i, 4, -4 do
      local j = i - 3
      local a, b, c, d = byte(s, j, i)
      if H[d] then n = n + 1 if n == 0 then return i end end
      if H[c] then n = n + 1 if n == 0 then return i - 1 end end
      if H[b] then n = n + 1 if n == 0 then return i - 2 end end
      if H[a] then n = n + 1 if n == 0 then return j end end
    end
    -- The loop consumed whole groups of four, leaving the first i % 4 bytes.
    local a, b, c = byte(s, 1, i % 4)
    if H[c] then n = n + 1 if n == 0 then return 3 end end
    if H[b] then n = n + 1 if n == 0 then return 2 end end
    if H[a] then n = n + 1 if n == 0 then return 1 end end
  else
    local a = byte(s, i)
    if T[a] then
      error "initial position is a continuation byte"
    end
    if n == 1 then
      return i
    end
    -- Skip n - 2 characters here by adding each lead byte's announced
    -- length; the final skip happens after the loop. The loop variable
    -- shadows n and is used only as a counter.
    for n = n, 3, -1 do
      local x = H[a]
      if not x then
        return
      else
        i = i + x
        a = byte(s, i)
      end
    end
    local x = H[a]
    if not x then
      return
    else
      return i + x
    end
  end
end

--------------------------------------------------------------------------------
/dromozoa/utf8/offset_table.lua:
--------------------------------------------------------------------------------
-- Copyright (C) 2017,2023 Tomoyuki Fujimori
--
-- This file is part of dromozoa-utf8.
--
-- dromozoa-utf8 is free software: you can redistribute it and/or modify
-- it under the terms of the GNU General Public License as published by
-- the Free Software Foundation, either version 3 of the License, or
-- (at your option) any later version.
--
-- dromozoa-utf8 is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with dromozoa-utf8. If not, see .

-- Byte classification tables, indexed by byte value 0x00..0xFF.
--   H[b]: 1 for 0x00..0x7F, 2 for 0xC2..0xDF, 3 for 0xE0..0xEF,
--         4 for 0xF0..0xF4, false for every other byte.
--   T[b]: true for 0x80..0xBF, false otherwise.
local H = {}
local T = {}

for i = 0x00, 0xFF do
  if i <= 0x7F then
    H[i] = 1
  elseif i <= 0xC1 then
    -- 0x80..0xBF are continuation bytes; 0xC0 and 0xC1 are rejected too.
    H[i] = false
  elseif i <= 0xDF then
    H[i] = 2
  elseif i <= 0xEF then
    H[i] = 3
  elseif i <= 0xF4 then
    H[i] = 4
  else
    H[i] = false
  end
  T[i] = 0x80 <= i and i <= 0xBF
end

return {
  H = H;
  T = T;
}

--------------------------------------------------------------------------------
/dromozoa/utf8/pure.lua:
--------------------------------------------------------------------------------
-- Copyright (C) 2015,2017,2019,2020,2023 Tomoyuki Fujimori
--
-- This file is part of dromozoa-utf8.
--
-- dromozoa-utf8 is free software: you can redistribute it and/or modify
-- it under the terms of the GNU General Public License as published by
-- the Free Software Foundation, either version 3 of the License, or
-- (at your option) any later version.
--
-- dromozoa-utf8 is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with dromozoa-utf8. If not, see .

-- Pure-Lua implementations exposed under the field names of the standard
-- utf8 library. The modules are loaded in the order the fields are listed.
local encode = require "dromozoa.utf8.encode"
local decode_each = require "dromozoa.utf8.decode_each"
local decode = require "dromozoa.utf8.decode"
local count = require "dromozoa.utf8.count"
local offset = require "dromozoa.utf8.offset"

return {
  char = encode;
  charpattern = "[\000-\127\194-\253][\128-\191]*";
  codes = decode_each;
  codepoint = decode;
  len = count;
  offset = offset;
}

--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
#! /bin/sh -e

# Copyright (C) 2017-2019,2023 Tomoyuki Fujimori
#
# This file is part of dromozoa-utf8.
#
# dromozoa-utf8 is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# dromozoa-utf8 is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with dromozoa-utf8. If not, see .
LUA_PATH="?.lua;;"
export LUA_PATH

# The script's arguments, when given, are the interpreter command line;
# without arguments plain `lua` is used.
if [ $# -eq 0 ]
then
  set -- lua
fi

# Run every Lua test script with the chosen interpreter.
for script in test/test*.lua
do
  "$@" "$script"
done

mkdir -p out

# Format each markdown fixture, then format the result again; both outputs
# must equal the expected file.
for source in test/table*.md
do
  name=${source#test/}
  name=${name%.md}
  "$@" dromozoa-markdown-table <"$source" >"out/$name-01.md"
  "$@" dromozoa-markdown-table <"out/$name-01.md" >"out/$name-02.md"
  diff -u "test/$name.exp" "out/$name-01.md"
  diff -u "test/$name.exp" "out/$name-02.md"
done

rm -f -r out test.exp

--------------------------------------------------------------------------------
/test/icu4j/pom.xml:
--------------------------------------------------------------------------------
1 | 19 | 20 | 4.0.0 21 | 22 | com.dromozoa.utf8 23 | icu4j 24 | 1.0 25 | jar 26 | 27 | 28 | UTF-8 29 | 1.8 30 | 1.8 31 | 32 | 33 | 34 | 35 | 36 | org.apache.maven.plugins 37 | maven-assembly-plugin 38 | 3.3.0 39 | 40 | 41 | jar-with-dependencies 42 | 43 | 44 | 45 | com.dromozoa.utf8.Application 46 | 47 | 48 | 49 | 50 | 51 | package 52 | 53 | single 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | com.ibm.icu 64 | icu4j 65 | 76.1 66 | 67 | 68 | org.junit.jupiter 69 | junit-jupiter-api 70 | 5.11.3 71 | test 72 | 73 | 74 | 75 |
--------------------------------------------------------------------------------
/test/icu4j/run.sh:
--------------------------------------------------------------------------------
#! /bin/sh -e

# Copyright (C) 2018,2023 Tomoyuki Fujimori
#
# This file is part of dromozoa-utf8.
6 | # 7 | # dromozoa-utf8 is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # dromozoa-utf8 is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU General Public License 18 | # along with dromozoa-utf8. If not, see . 19 | 20 | jar=target/icu4j-1.0-jar-with-dependencies.jar 21 | 22 | java -jar "$jar" EAST_ASIAN_WIDTH >../test_east_asian_width.txt 23 | java -jar "$jar" WHITE_SPACE >../test_is_white_space.txt 24 | java -jar "$jar" GENERAL_CATEGORY >../test_general_category.txt 25 | -------------------------------------------------------------------------------- /test/icu4j/src/main/java/com/dromozoa/utf8/Application.java: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2017,2018,2023 Tomoyuki Fujimori 2 | // 3 | // This file is part of dromozoa-utf8. 4 | // 5 | // dromozoa-utf8 is free software: you can redistribute it and/or modify 6 | // it under the terms of the GNU General Public License as published by 7 | // the Free Software Foundation, either version 3 of the License, or 8 | // (at your option) any later version. 9 | // 10 | // dromozoa-utf8 is distributed in the hope that it will be useful, 11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | // GNU General Public License for more details. 14 | // 15 | // You should have received a copy of the GNU General Public License 16 | // along with dromozoa-utf8. If not, see . 
package com.dromozoa.utf8;

import java.util.*;
import com.ibm.icu.lang.*;

/**
 * Prints, for one Unicode property, the runs of consecutive code points that
 * share the same property value. Each run is written to standard output as
 * {@code first<TAB>last<TAB>value}, with the code points in decimal.
 *
 * <p>Arguments, all optional: the property name ({@code EAST_ASIAN_WIDTH},
 * {@code WHITE_SPACE} or {@code GENERAL_CATEGORY}; default
 * {@code EAST_ASIAN_WIDTH}), the first code point in hexadecimal (default 0)
 * and the last code point in hexadecimal (default 10FFFF).
 */
public class Application {
  /** A run of consecutive code points sharing one property value. */
  private static class Range {
    public final int first;
    public int last;
    public final String property;

    public Range(int codePoint, String property) {
      first = codePoint;
      last = codePoint;
      this.property = property;
    }
  }

  /**
   * Returns the short name of the value that the property {@code name} has
   * for {@code codePoint}.
   *
   * @throws RuntimeException if ICU reports a value this method does not map
   * @throws IllegalArgumentException if {@code name} is not a supported
   *     property name
   */
  private static String getPropertyString(int codePoint, String name) {
    if (name.equals("EAST_ASIAN_WIDTH")) {
      int width = UCharacter.getIntPropertyValue(codePoint, UProperty.EAST_ASIAN_WIDTH);
      switch (width) {
        case UCharacter.EastAsianWidth.AMBIGUOUS:
          return "A";
        case UCharacter.EastAsianWidth.FULLWIDTH:
          return "F";
        case UCharacter.EastAsianWidth.HALFWIDTH:
          return "H";
        case UCharacter.EastAsianWidth.NEUTRAL:
          return "N";
        case UCharacter.EastAsianWidth.NARROW:
          return "Na";
        case UCharacter.EastAsianWidth.WIDE:
          return "W";
      }
      throw new RuntimeException(
          "unmapped East Asian Width value " + width + " for code point " + codePoint);
    } else if (name.equals("WHITE_SPACE")) {
      return Boolean.toString(UCharacter.hasBinaryProperty(codePoint, UProperty.WHITE_SPACE));
    } else if (name.equals("GENERAL_CATEGORY")) {
      int type = UCharacter.getType(codePoint);
      switch (type) {
        case UCharacterCategory.UNASSIGNED:
          return "Cn";
        case UCharacterCategory.UPPERCASE_LETTER:
          return "Lu";
        case UCharacterCategory.LOWERCASE_LETTER:
          return "Ll";
        case UCharacterCategory.TITLECASE_LETTER:
          return "Lt";
        case UCharacterCategory.MODIFIER_LETTER:
          return "Lm";
        case UCharacterCategory.OTHER_LETTER:
          return "Lo";
        case UCharacterCategory.NON_SPACING_MARK:
          return "Mn";
        case UCharacterCategory.ENCLOSING_MARK:
          return "Me";
        case UCharacterCategory.COMBINING_SPACING_MARK:
          return "Mc";
        case UCharacterCategory.DECIMAL_DIGIT_NUMBER:
          return "Nd";
        case UCharacterCategory.LETTER_NUMBER:
          return "Nl";
        case UCharacterCategory.OTHER_NUMBER:
          return "No";
        case UCharacterCategory.SPACE_SEPARATOR:
          return "Zs";
        case UCharacterCategory.LINE_SEPARATOR:
          return "Zl";
        case UCharacterCategory.PARAGRAPH_SEPARATOR:
          return "Zp";
        case UCharacterCategory.CONTROL:
          return "Cc";
        case UCharacterCategory.FORMAT:
          return "Cf";
        case UCharacterCategory.PRIVATE_USE:
          return "Co";
        case UCharacterCategory.SURROGATE:
          return "Cs";
        case UCharacterCategory.DASH_PUNCTUATION:
          return "Pd";
        case UCharacterCategory.START_PUNCTUATION:
          return "Ps";
        case UCharacterCategory.END_PUNCTUATION:
          return "Pe";
        case UCharacterCategory.CONNECTOR_PUNCTUATION:
          return "Pc";
        case UCharacterCategory.OTHER_PUNCTUATION:
          return "Po";
        case UCharacterCategory.MATH_SYMBOL:
          return "Sm";
        case UCharacterCategory.CURRENCY_SYMBOL:
          return "Sc";
        case UCharacterCategory.MODIFIER_SYMBOL:
          return "Sk";
        case UCharacterCategory.OTHER_SYMBOL:
          return "So";
        case UCharacterCategory.INITIAL_PUNCTUATION:
          return "Pi";
        case UCharacterCategory.FINAL_PUNCTUATION:
          return "Pf";
      }
      System.err.println("invalid code " + codePoint + " " + type);
      throw new RuntimeException(
          "unmapped general category " + type + " for code point " + codePoint);
    }
    throw new IllegalArgumentException("unknown property name: " + name);
  }

  public static void main(String[] args) {
    String name = "EAST_ASIAN_WIDTH";
    int codePointFirst = 0;
    int codePointLast = 0x10FFFF;

    if (args != null) {
      if (args.length > 0) {
        name = args[0];
      }
      if (args.length > 1) {
        codePointFirst = Integer.parseInt(args[1], 16);
      }
      if (args.length > 2) {
        codePointLast = Integer.parseInt(args[2], 16);
      }
    }

    // The element type is required: the loop below iterates as Range.
    List<Range> ranges = new ArrayList<>();
    for (int codePoint = codePointFirst; codePoint <= codePointLast; ++codePoint) {
      String property = getPropertyString(codePoint, name);
      if (!ranges.isEmpty()) {
        Range range = ranges.get(ranges.size() - 1);
        // Compare by value; reference equality only holds while every value
        // happens to be the same String instance.
        if (range.property.equals(property)) {
          if (range.last != codePoint - 1) {
            throw new RuntimeException(
                "run ending at " + range.last + " is not adjacent to code point " + codePoint);
          }
          range.last = codePoint;
          continue;
        }
      }
      ranges.add(new Range(codePoint, property));
    }

    for (Range range : ranges) {
      System.out.println(range.first + "\t" + range.last + "\t" + range.property);
    }
  }
}

--------------------------------------------------------------------------------
/test/icu4j/src/test/java/com/dromozoa/utf8/ApplicationTest.java:
--------------------------------------------------------------------------------
// Copyright (C) 2017,2018,2023 Tomoyuki Fujimori
//
// This file is part of dromozoa-utf8.
//
// dromozoa-utf8 is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// dromozoa-utf8 is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with dromozoa-utf8. If not, see .
17 | 18 | package com.dromozoa.utf8; 19 | 20 | import org.junit.jupiter.api.Test; 21 | 22 | public class ApplicationTest { 23 | @Test 24 | public void testEastAsianWidth() { 25 | String[] args = { "EAST_ASIAN_WIDTH", "3000", "30FF" }; 26 | Application.main(args); 27 | } 28 | 29 | @Test 30 | public void testWhiteSpace() { 31 | String[] args = { "WHITE_SPACE", "2000", "20FF" }; 32 | Application.main(args); 33 | } 34 | 35 | @Test 36 | public void testGeneralCategory() { 37 | String[] args = { "GENERAL_CATEGORY", "2000", "200F" }; 38 | Application.main(args); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /test/table01.exp: -------------------------------------------------------------------------------- 1 | | foo | bar? | baz | 2 | |----:|:----------:|-----| 3 | | 1 | あ | a | 4 | | 2 | あいうえお | b | 5 | | 3 | ■○ | | 6 | -------------------------------------------------------------------------------- /test/table01.md: -------------------------------------------------------------------------------- 1 | | foo | bar? 
|baz| 2 | |-:|:-:|| 3 | | 1 | あ|a| 4 | | 2 | あいうえお | b | 5 | | 3 | ■○ || 6 | -------------------------------------------------------------------------------- /test/table02.exp: -------------------------------------------------------------------------------- 1 | | a | b | 2 | |----:|---| 3 | | foo | | 4 | -------------------------------------------------------------------------------- /test/table02.md: -------------------------------------------------------------------------------- 1 | a|b 2 | : 3 | foo 4 | -------------------------------------------------------------------------------- /test/table03.exp: -------------------------------------------------------------------------------- 1 | | a | | 2 | |:---:|--| 3 | | foo | | 4 | -------------------------------------------------------------------------------- /test/table03.md: -------------------------------------------------------------------------------- 1 | a 2 | :-:|- 3 | foo 4 | -------------------------------------------------------------------------------- /test/table04.exp: -------------------------------------------------------------------------------- 1 | | center | 2 | |:------:| 3 | | a | 4 | | ab | 5 | | abc | 6 | | abcd | 7 | -------------------------------------------------------------------------------- /test/table04.md: -------------------------------------------------------------------------------- 1 | center 2 | :----: 3 | a 4 | ab 5 | abc 6 | abcd 7 | -------------------------------------------------------------------------------- /test/test.exp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dromozoa/dromozoa-utf8/371cbfb161ef261cee0f2b4c9c91d8052881577b/test/test.exp -------------------------------------------------------------------------------- /test/test.lua: -------------------------------------------------------------------------------- 1 | -- Copyright (C) 2017-2020,2023 Tomoyuki Fujimori 2 | -- 3 | -- This file is part of 
-- dromozoa-utf8.
--
-- dromozoa-utf8 is free software: you can redistribute it and/or modify
-- it under the terms of the GNU General Public License as published by
-- the Free Software Foundation, either version 3 of the License, or
-- (at your option) any later version.
--
-- dromozoa-utf8 is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with dromozoa-utf8. If not, see .

-- Compares the pure-Lua implementation with reference results, call by call.
-- On Lua 5.4 the reference is the built-in utf8 library and every reference
-- result is also written to "test.exp". On other versions the reference is
-- read from "test/test.exp" and consumed in call order, so the number and
-- order of check() calls below must not change.

local pure = require "dromozoa.utf8.pure"

local unpack = table.unpack or unpack

local verbose = os.getenv "VERBOSE" == "1"

local pack = table.pack or function (...)
  return { n = select("#", ...), ... }
end

local count = 0
local handle
local expect

if _VERSION == "Lua 5.4" then
  handle = assert(io.open("test.exp", "w"))
else
  expect = assert(loadfile "test/test.exp")()
end

-- Collects every (position, code point) pair produced by module.codes(...)
-- and returns them as a flat list of values.
local function each(module, ...)
  local result = {}
  for p, c in module.codes(...) do
    result[#result + 1] = p
    result[#result + 1] = c
  end
  return unpack(result)
end

-- Invokes module[name] under pcall and returns the packed results, with the
-- pcall status in slot 1. "charpattern" is a plain field and is reported as
-- a successful call; "codes" is driven through each().
local function run(module, name, ...)
  if name == "charpattern" then
    return pack(true, module.charpattern)
  elseif name == "codes" then
    return pack(pcall(each, module, ...))
  else
    return pack(pcall(module[name], ...))
  end
end

-- Serializes v as a Lua expression. Tables must be packed tables carrying a
-- field n; slots 1..n are written, including nil ones.
local function dump(v)
  local t = type(v)
  if t == "nil" then
    return "nil"
  elseif t == "number" then
    return ("%.17g"):format(v)
  elseif t == "string" then
    return ("%q"):format(v)
  elseif t == "boolean" then
    if v then
      return "true"
    else
      return "false"
    end
  elseif t == "table" then
    local n = assert(v.n)
    local result = {}
    -- Iterate up to n, not #v: packed results may contain nil holes, and the
    -- length of a table with holes is unspecified.
    for i = 1, n do
      result[i] = dump(v[i])
    end
    return "{n=" .. n .. "," .. table.concat(result, ",") .. "}"
  end
  error("cannot dump a value of type " .. t)
end

-- Writes to the expectation file when one is being generated.
local function write(...)
  if handle then
    handle:write(...)
  end
end

-- Runs one call against the reference and against the pure implementation
-- and asserts that they agree. Successful calls must return equal values.
-- Failing calls must both fail, with messages matching as described inline.
local function check(name, ...)
  if verbose then
    io.stderr:write(name, " ", dump(pack(...)), "\n")
  end

  local result1
  if expect then
    count = count + 1
    result1 = expect[count]
  else
    result1 = run(utf8, name, ...)
  end
  local result2 = run(pure, name, ...)

  if verbose then
    io.stderr:write(" utf8 ", dump(result1), "\n")
    io.stderr:write(" pure ", dump(result2), "\n")
  end

  if result1[1] then
    assert(result2[1])
    -- Compare over the longer result so that an extra trailing value on
    -- either side is noticed; absent slots compare as nil.
    local n = result1.n
    if n < result2.n then
      n = result2.n
    end
    for i = 2, n do
      assert(result1[i] == result2[i])
    end
  else
    assert(not result2[1])
    local message1 = result1[2]
    local message2 = result2[2]
    local bad_argument, reason = message1:match "(bad argument #%d+) .-%((.*)%)$"
    if bad_argument then
      -- The pure implementation must name the same argument and reason;
      -- failing with an arithmetic error instead is also accepted.
      reason = reason:gsub("expected, got no value$", "expected, got nil")
      if not message2:find(bad_argument, nil, true) or not message2:find(reason, nil, true) then
        assert(message2:find("attempt to perform arithmetic on", nil, true) or message2:find("attempt to add", nil, true))
      end
    else
      local reason = "initial position is a continuation byte"
      if message1:find(reason, nil, true) then
        assert(message2:find(reason, nil, true))
      else
        local reason = "invalid UTF-8 code";
        if message1:find(reason, nil, true) then
          if not message2:find(reason, nil, true) then
            assert(message2:find("attempt to perform arithmetic on", nil, true) or message2:find("attempt to add", nil, true))
          end
        else
          error "unchecked"
        end
      end
    end
  end
  write(" ", dump(result1), ";\n")
end

write "return {\n"

check "charpattern"

check("char", -1)
check("char", 0, -1)
check("char", 0x10FFFF)
-- Lua 5.4 relaxed
-- check("char", 0x110000)
check("char", 0, 0x10FFFF)
-- Lua 5.4 relaxed
-- check("char", 0, 0x110000)
check("char", 0x41, 0x42, 0x43)
check("char", "65", "0x42")
check("char", 65.5)
check("char", true)
check("char", 0x41, nil, 0x43)

check("codes", string.char(0xE2))
check("codes", string.char(0xE2, 0x89))
check("codes", string.char(0xE2, 0x89, 0xA2))
check("codes", string.char(0xE2))
check("codes", string.char(0xE2, 0x00))
check("codes", string.char(0xE2, 0x89, 0x00))
check("codes", string.char(0xE2, 0xFF))
check("codes", string.char(0xE2, 0x89, 0xFF))

-- Argument values of assorted types. The loops below start at index 0,
-- which is absent, so nil is tried in every position as well.
local data = {
  65;
  65.5;
  0;
  1;
  1.5;
  -1;
  -2;
  -4;
  "";
  "foo";
  "1";
  "0x01";
  true;
  false;
  { n = 0 };
}

for i = 0, #data do
  local a = data[i]
  for j = 0, #data do
    local b = data[j]
    for k = 0, #data do
      local c = data[k]
      check("char", a, b, c)
      check("codes", a, b, c)
      check("codepoint", a, b, c)
      check("len", a, b, c)
      check("offset", a, b, c)
    end
  end
end

-- Code point sequences paired with their UTF-8 byte strings.
local data = {
  {
    codepoint = {};
    utf8_char = "";
  };
  {
    codepoint = { 0x0041, 0x2262, 0x0391, 0x002E };
    utf8_char = string.char(0x41, 0xE2, 0x89, 0xA2, 0xCE, 0x91, 0x2E);
  };
  {
    codepoint = { 0xD55C, 0xAD6D, 0xC5B4 };
    utf8_char = string.char(0xED, 0x95, 0x9C, 0xEA, 0xB5, 0xAD, 0xEC, 0x96, 0xB4);
  };
  {
    codepoint = { 0x65E5, 0x672C, 0x8A9E };
    utf8_char = string.char(0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC, 0xE8, 0xAA, 0x9E);
  };
  {
    codepoint = { 0xFEFF, 0x233B4 };
    utf8_char = string.char(0xEF, 0xBB, 0xBF, 0xF0, 0xA3, 0x8E, 0xB4);
  };
  {
    codepoint = {
      0x0041, 0x2262, 0x0391, 0x002E,
      0xD55C, 0xAD6D, 0xC5B4,
      0x65E5, 0x672C, 0x8A9E,
      0xFEFF, 0x0233B4,
    };
    utf8_char = string.char(
      0x41,
      0xE2, 0x89, 0xA2,
      0xCE, 0x91,
      0x2E,
      0xED, 0x95, 0x9C,
      0xEA, 0xB5, 0xAD,
      0xEC, 0x96, 0xB4,
      0xE6, 0x97, 0xA5,
      0xE6, 0x9C, 0xAC,
      0xE8, 0xAA, 0x9E,
      0xEF, 0xBB, 0xBF,
      0xF0, 0xA3, 0x8E, 0xB4);
  };
}

for i = 1, #data do
  local codepoint = data[i].codepoint
  local utf8_char = data[i].utf8_char

  -- Argument ranges reach two past either end, in characters (m) and in
  -- bytes (n), so out-of-range arguments are exercised too.
  local m = #codepoint + 2
  local n = #utf8_char + 2

  check("char", unpack(codepoint))
  check("codes", utf8_char)

  check("codepoint", utf8_char)
  for j = -n, n do
    check("codepoint", utf8_char, j)
    for k = -n, n do
      check("codepoint", utf8_char, j, k)
    end
  end

  check("len", utf8_char)
  for j = -n, n do
    check("len", utf8_char, j)
    for k = -n, n do
      check("len", utf8_char, j, k)
    end
  end

  check("offset", utf8_char)
  for j = -m, m do
    check("offset", utf8_char, j)
    for k = -n, n do
      check("offset", utf8_char, j, k)
    end
  end
end

write "}\n"

if handle then
  handle:close()
end

--------------------------------------------------------------------------------
/test/test_decode_surrogate_pair.lua:
--------------------------------------------------------------------------------
-- Copyright (C) 2017,2018,2023 Tomoyuki Fujimori
--
-- This file is part of dromozoa-utf8.
--
-- dromozoa-utf8 is free software: you can redistribute it and/or modify
-- it under the terms of the GNU General Public License as published by
-- the Free Software Foundation, either version 3 of the License, or
-- (at your option) any later version.
--
-- dromozoa-utf8 is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with dromozoa-utf8. If not, see .
local utf16 = require "dromozoa.utf16"

local decode_surrogate_pair = utf16.decode_surrogate_pair

-- Pairs that must be rejected: { first, second, expected message fragment }.
-- The first four vary the first argument around 0xD800..0xDBFF, the last
-- four vary the second argument around 0xDC00..0xDFFF.
local rejected = {
  { 0x0000, 0x0000, "bad argument #1" };
  { 0xD7FF, 0x0000, "bad argument #1" };
  { 0xDC00, 0x0000, "bad argument #1" };
  { 0xFFFF, 0x0000, "bad argument #1" };

  { 0xD800, 0x0000, "bad argument #2" };
  { 0xD800, 0xDBFF, "bad argument #2" };
  { 0xD800, 0xE000, "bad argument #2" };
  { 0xD800, 0xFFFF, "bad argument #2" };
}

for index = 1, #rejected do
  local case = rejected[index]
  local ok, message = pcall(decode_surrogate_pair, case[1], case[2])
  assert(not ok)
  assert(message:find(case[3], nil, true))
end

-- Pairs that must decode: { first, second, expected code point }.
local accepted = {
  -- U+10000 (D800 DC00)
  { 0xD800, 0xDC00, 0x010000 };
  -- U+1F37A (D83C DF7A) BEER MUG
  { 0xD83C, 0xDF7A, 0x01F37A };
  -- U+1F37B (D83C DF7B) CLINKING BEER MUGS
  { 0xD83C, 0xDF7B, 0x01F37B };
  -- U+100000 (DBC0 DC00)
  { 0xDBC0, 0xDC00, 0x100000 };
  -- U+10FFFF (DBFF DFFF)
  { 0xDBFF, 0xDFFF, 0x10FFFF };
}

for index = 1, #accepted do
  local case = accepted[index]
  assert(decode_surrogate_pair(case[1], case[2]) == case[3])
end

--------------------------------------------------------------------------------
/test/test_east_asian_width.lua:
--------------------------------------------------------------------------------
-- Copyright (C) 2017,2018,2023 Tomoyuki Fujimori
--
-- This file is part of dromozoa-utf8.
--
-- dromozoa-utf8 is free software: you can redistribute it and/or modify
-- it under the terms of the GNU General Public License as published by
-- the Free Software Foundation, either version 3 of the License, or
-- (at your option) any later version.
--
-- dromozoa-utf8 is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-- See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with dromozoa-utf8. If not, see .

-- Checks ucd.east_asian_width against the reference data file. Each line of
-- the file is "first<TAB>last<TAB>value" with decimal code points, and every
-- code point in first..last must have that value.

local ucd = require "dromozoa.ucd"

local filename = "test/test_east_asian_width.txt"
local line_number = 0

for line in io.lines(filename) do
  line_number = line_number + 1
  local first, last, property = line:match "^(%d+)\t(%d+)\t(%a%a?)$"
  -- Fail with the offending location rather than an opaque loop error.
  assert(first, ("%s:%d: malformed line"):format(filename, line_number))
  -- The captures are strings; convert them explicitly rather than relying
  -- on implicit string-to-number coercion of the loop bounds.
  for i = tonumber(first), tonumber(last) do
    assert(ucd.east_asian_width(i) == property)
  end
end

--------------------------------------------------------------------------------
/test/test_east_asian_width.txt:
--------------------------------------------------------------------------------
1 | 0 31 N 2 | 32 126 Na 3 | 127 160 N 4 | 161 161 A 5 | 162 163 Na 6 | 164 164 A 7 | 165 166 Na 8 | 167 168 A 9 | 169 169 N 10 | 170 170 A 11 | 171 171 N 12 | 172 172 Na 13 | 173 174 A 14 | 175 175 Na 15 | 176 180 A 16 | 181 181 N 17 | 182 186 A 18 | 187 187 N 19 | 188 191 A 20 | 192 197 N 21 | 198 198 A 22 | 199 207 N 23 | 208 208 A 24 | 209 214 N 25 | 215 216 A 26 | 217 221 N 27 | 222 225 A 28 | 226 229 N 29 | 230 230 A 30 | 231 231 N 31 | 232 234 A 32 | 235 235 N 33 | 236 237 A 34 | 238 239 N 35 | 240 240 A 36 | 241 241 N 37 | 242 243 A 38 | 244 246 N 39 | 247 250 A 40 | 251 251 N 41 | 252 252 A 42 | 253 253 N 43 | 254 254 A 44 | 255 256 N 45 | 257 257 A 46 | 258 272 N 47 | 273 273 A 48 | 274 274 N 49 | 275 275 A 50 | 276 282 N 51 | 283 283 A 52 | 284 293 N 53 | 294 295 A 54 | 296 298 N 55 | 299 299 A 56 | 300 304 N 57 | 305 307 A 58 | 308 311 N 59 | 312 312 A 60 | 313 318 N 61 | 319 322 A 62 | 323 323 N 63 | 324 324 A 64 | 325 327 N 65 | 328 331 A 66 | 332 332 N 67 | 333 333 A 68 | 334 337 N 69 | 338 339 A 70 | 340 357 N 71 | 358 359 A 72 | 360 362 N 73 | 363 363 A 74 | 364 461 N 75 | 462 462 A 76 | 463 463 N 77 | 464 464 A 78 | 465 465 N 79 | 466 466 A 80 | 467 467 N 81 | 468 468 A 82 | 469 469 N 83 | 470 470 A 84 | 471 471 N 85 | 472 472 A 86 | 473 473 N 87 | 474 474 A 88 | 475 475 N 89 | 476 476 A 90
| 477 592 N 91 | 593 593 A 92 | 594 608 N 93 | 609 609 A 94 | 610 707 N 95 | 708 708 A 96 | 709 710 N 97 | 711 711 A 98 | 712 712 N 99 | 713 715 A 100 | 716 716 N 101 | 717 717 A 102 | 718 719 N 103 | 720 720 A 104 | 721 727 N 105 | 728 731 A 106 | 732 732 N 107 | 733 733 A 108 | 734 734 N 109 | 735 735 A 110 | 736 767 N 111 | 768 879 A 112 | 880 912 N 113 | 913 929 A 114 | 930 930 N 115 | 931 937 A 116 | 938 944 N 117 | 945 961 A 118 | 962 962 N 119 | 963 969 A 120 | 970 1024 N 121 | 1025 1025 A 122 | 1026 1039 N 123 | 1040 1103 A 124 | 1104 1104 N 125 | 1105 1105 A 126 | 1106 4351 N 127 | 4352 4447 W 128 | 4448 8207 N 129 | 8208 8208 A 130 | 8209 8210 N 131 | 8211 8214 A 132 | 8215 8215 N 133 | 8216 8217 A 134 | 8218 8219 N 135 | 8220 8221 A 136 | 8222 8223 N 137 | 8224 8226 A 138 | 8227 8227 N 139 | 8228 8231 A 140 | 8232 8239 N 141 | 8240 8240 A 142 | 8241 8241 N 143 | 8242 8243 A 144 | 8244 8244 N 145 | 8245 8245 A 146 | 8246 8250 N 147 | 8251 8251 A 148 | 8252 8253 N 149 | 8254 8254 A 150 | 8255 8307 N 151 | 8308 8308 A 152 | 8309 8318 N 153 | 8319 8319 A 154 | 8320 8320 N 155 | 8321 8324 A 156 | 8325 8360 N 157 | 8361 8361 H 158 | 8362 8363 N 159 | 8364 8364 A 160 | 8365 8450 N 161 | 8451 8451 A 162 | 8452 8452 N 163 | 8453 8453 A 164 | 8454 8456 N 165 | 8457 8457 A 166 | 8458 8466 N 167 | 8467 8467 A 168 | 8468 8469 N 169 | 8470 8470 A 170 | 8471 8480 N 171 | 8481 8482 A 172 | 8483 8485 N 173 | 8486 8486 A 174 | 8487 8490 N 175 | 8491 8491 A 176 | 8492 8530 N 177 | 8531 8532 A 178 | 8533 8538 N 179 | 8539 8542 A 180 | 8543 8543 N 181 | 8544 8555 A 182 | 8556 8559 N 183 | 8560 8569 A 184 | 8570 8584 N 185 | 8585 8585 A 186 | 8586 8591 N 187 | 8592 8601 A 188 | 8602 8631 N 189 | 8632 8633 A 190 | 8634 8657 N 191 | 8658 8658 A 192 | 8659 8659 N 193 | 8660 8660 A 194 | 8661 8678 N 195 | 8679 8679 A 196 | 8680 8703 N 197 | 8704 8704 A 198 | 8705 8705 N 199 | 8706 8707 A 200 | 8708 8710 N 201 | 8711 8712 A 202 | 8713 8714 N 203 | 8715 8715 A 204 | 8716 8718 N 205 
| 8719 8719 A 206 | 8720 8720 N 207 | 8721 8721 A 208 | 8722 8724 N 209 | 8725 8725 A 210 | 8726 8729 N 211 | 8730 8730 A 212 | 8731 8732 N 213 | 8733 8736 A 214 | 8737 8738 N 215 | 8739 8739 A 216 | 8740 8740 N 217 | 8741 8741 A 218 | 8742 8742 N 219 | 8743 8748 A 220 | 8749 8749 N 221 | 8750 8750 A 222 | 8751 8755 N 223 | 8756 8759 A 224 | 8760 8763 N 225 | 8764 8765 A 226 | 8766 8775 N 227 | 8776 8776 A 228 | 8777 8779 N 229 | 8780 8780 A 230 | 8781 8785 N 231 | 8786 8786 A 232 | 8787 8799 N 233 | 8800 8801 A 234 | 8802 8803 N 235 | 8804 8807 A 236 | 8808 8809 N 237 | 8810 8811 A 238 | 8812 8813 N 239 | 8814 8815 A 240 | 8816 8833 N 241 | 8834 8835 A 242 | 8836 8837 N 243 | 8838 8839 A 244 | 8840 8852 N 245 | 8853 8853 A 246 | 8854 8856 N 247 | 8857 8857 A 248 | 8858 8868 N 249 | 8869 8869 A 250 | 8870 8894 N 251 | 8895 8895 A 252 | 8896 8977 N 253 | 8978 8978 A 254 | 8979 8985 N 255 | 8986 8987 W 256 | 8988 9000 N 257 | 9001 9002 W 258 | 9003 9192 N 259 | 9193 9196 W 260 | 9197 9199 N 261 | 9200 9200 W 262 | 9201 9202 N 263 | 9203 9203 W 264 | 9204 9311 N 265 | 9312 9449 A 266 | 9450 9450 N 267 | 9451 9547 A 268 | 9548 9551 N 269 | 9552 9587 A 270 | 9588 9599 N 271 | 9600 9615 A 272 | 9616 9617 N 273 | 9618 9621 A 274 | 9622 9631 N 275 | 9632 9633 A 276 | 9634 9634 N 277 | 9635 9641 A 278 | 9642 9649 N 279 | 9650 9651 A 280 | 9652 9653 N 281 | 9654 9655 A 282 | 9656 9659 N 283 | 9660 9661 A 284 | 9662 9663 N 285 | 9664 9665 A 286 | 9666 9669 N 287 | 9670 9672 A 288 | 9673 9674 N 289 | 9675 9675 A 290 | 9676 9677 N 291 | 9678 9681 A 292 | 9682 9697 N 293 | 9698 9701 A 294 | 9702 9710 N 295 | 9711 9711 A 296 | 9712 9724 N 297 | 9725 9726 W 298 | 9727 9732 N 299 | 9733 9734 A 300 | 9735 9736 N 301 | 9737 9737 A 302 | 9738 9741 N 303 | 9742 9743 A 304 | 9744 9747 N 305 | 9748 9749 W 306 | 9750 9755 N 307 | 9756 9756 A 308 | 9757 9757 N 309 | 9758 9758 A 310 | 9759 9775 N 311 | 9776 9783 W 312 | 9784 9791 N 313 | 9792 9792 A 314 | 9793 9793 N 315 | 9794 9794 A 316 | 
9795 9799 N 317 | 9800 9811 W 318 | 9812 9823 N 319 | 9824 9825 A 320 | 9826 9826 N 321 | 9827 9829 A 322 | 9830 9830 N 323 | 9831 9834 A 324 | 9835 9835 N 325 | 9836 9837 A 326 | 9838 9838 N 327 | 9839 9839 A 328 | 9840 9854 N 329 | 9855 9855 W 330 | 9856 9865 N 331 | 9866 9871 W 332 | 9872 9874 N 333 | 9875 9875 W 334 | 9876 9885 N 335 | 9886 9887 A 336 | 9888 9888 N 337 | 9889 9889 W 338 | 9890 9897 N 339 | 9898 9899 W 340 | 9900 9916 N 341 | 9917 9918 W 342 | 9919 9919 A 343 | 9920 9923 N 344 | 9924 9925 W 345 | 9926 9933 A 346 | 9934 9934 W 347 | 9935 9939 A 348 | 9940 9940 W 349 | 9941 9953 A 350 | 9954 9954 N 351 | 9955 9955 A 352 | 9956 9959 N 353 | 9960 9961 A 354 | 9962 9962 W 355 | 9963 9969 A 356 | 9970 9971 W 357 | 9972 9972 A 358 | 9973 9973 W 359 | 9974 9977 A 360 | 9978 9978 W 361 | 9979 9980 A 362 | 9981 9981 W 363 | 9982 9983 A 364 | 9984 9988 N 365 | 9989 9989 W 366 | 9990 9993 N 367 | 9994 9995 W 368 | 9996 10023 N 369 | 10024 10024 W 370 | 10025 10044 N 371 | 10045 10045 A 372 | 10046 10059 N 373 | 10060 10060 W 374 | 10061 10061 N 375 | 10062 10062 W 376 | 10063 10066 N 377 | 10067 10069 W 378 | 10070 10070 N 379 | 10071 10071 W 380 | 10072 10101 N 381 | 10102 10111 A 382 | 10112 10132 N 383 | 10133 10135 W 384 | 10136 10159 N 385 | 10160 10160 W 386 | 10161 10174 N 387 | 10175 10175 W 388 | 10176 10213 N 389 | 10214 10221 Na 390 | 10222 10628 N 391 | 10629 10630 Na 392 | 10631 11034 N 393 | 11035 11036 W 394 | 11037 11087 N 395 | 11088 11088 W 396 | 11089 11092 N 397 | 11093 11093 W 398 | 11094 11097 A 399 | 11098 11903 N 400 | 11904 11929 W 401 | 11930 11930 N 402 | 11931 12019 W 403 | 12020 12031 N 404 | 12032 12245 W 405 | 12246 12271 N 406 | 12272 12287 W 407 | 12288 12288 F 408 | 12289 12350 W 409 | 12351 12352 N 410 | 12353 12438 W 411 | 12439 12440 N 412 | 12441 12543 W 413 | 12544 12548 N 414 | 12549 12591 W 415 | 12592 12592 N 416 | 12593 12686 W 417 | 12687 12687 N 418 | 12688 12773 W 419 | 12774 12782 N 420 | 12783 12830 W 421 | 
12831 12831 N 422 | 12832 12871 W 423 | 12872 12879 A 424 | 12880 42124 W 425 | 42125 42127 N 426 | 42128 42182 W 427 | 42183 43359 N 428 | 43360 43388 W 429 | 43389 44031 N 430 | 44032 55203 W 431 | 55204 57343 N 432 | 57344 63743 A 433 | 63744 64255 W 434 | 64256 65023 N 435 | 65024 65039 A 436 | 65040 65049 W 437 | 65050 65071 N 438 | 65072 65106 W 439 | 65107 65107 N 440 | 65108 65126 W 441 | 65127 65127 N 442 | 65128 65131 W 443 | 65132 65280 N 444 | 65281 65376 F 445 | 65377 65470 H 446 | 65471 65473 N 447 | 65474 65479 H 448 | 65480 65481 N 449 | 65482 65487 H 450 | 65488 65489 N 451 | 65490 65495 H 452 | 65496 65497 N 453 | 65498 65500 H 454 | 65501 65503 N 455 | 65504 65510 F 456 | 65511 65511 N 457 | 65512 65518 H 458 | 65519 65532 N 459 | 65533 65533 A 460 | 65534 94175 N 461 | 94176 94180 W 462 | 94181 94191 N 463 | 94192 94193 W 464 | 94194 94207 N 465 | 94208 100343 W 466 | 100344 100351 N 467 | 100352 101589 W 468 | 101590 101630 N 469 | 101631 101640 W 470 | 101641 110575 N 471 | 110576 110579 W 472 | 110580 110580 N 473 | 110581 110587 W 474 | 110588 110588 N 475 | 110589 110590 W 476 | 110591 110591 N 477 | 110592 110882 W 478 | 110883 110897 N 479 | 110898 110898 W 480 | 110899 110927 N 481 | 110928 110930 W 482 | 110931 110932 N 483 | 110933 110933 W 484 | 110934 110947 N 485 | 110948 110951 W 486 | 110952 110959 N 487 | 110960 111355 W 488 | 111356 119551 N 489 | 119552 119638 W 490 | 119639 119647 N 491 | 119648 119670 W 492 | 119671 126979 N 493 | 126980 126980 W 494 | 126981 127182 N 495 | 127183 127183 W 496 | 127184 127231 N 497 | 127232 127242 A 498 | 127243 127247 N 499 | 127248 127277 A 500 | 127278 127279 N 501 | 127280 127337 A 502 | 127338 127343 N 503 | 127344 127373 A 504 | 127374 127374 W 505 | 127375 127376 A 506 | 127377 127386 W 507 | 127387 127404 A 508 | 127405 127487 N 509 | 127488 127490 W 510 | 127491 127503 N 511 | 127504 127547 W 512 | 127548 127551 N 513 | 127552 127560 W 514 | 127561 127567 N 515 | 127568 127569 W 516 
| 127570 127583 N 517 | 127584 127589 W 518 | 127590 127743 N 519 | 127744 127776 W 520 | 127777 127788 N 521 | 127789 127797 W 522 | 127798 127798 N 523 | 127799 127868 W 524 | 127869 127869 N 525 | 127870 127891 W 526 | 127892 127903 N 527 | 127904 127946 W 528 | 127947 127950 N 529 | 127951 127955 W 530 | 127956 127967 N 531 | 127968 127984 W 532 | 127985 127987 N 533 | 127988 127988 W 534 | 127989 127991 N 535 | 127992 128062 W 536 | 128063 128063 N 537 | 128064 128064 W 538 | 128065 128065 N 539 | 128066 128252 W 540 | 128253 128254 N 541 | 128255 128317 W 542 | 128318 128330 N 543 | 128331 128334 W 544 | 128335 128335 N 545 | 128336 128359 W 546 | 128360 128377 N 547 | 128378 128378 W 548 | 128379 128404 N 549 | 128405 128406 W 550 | 128407 128419 N 551 | 128420 128420 W 552 | 128421 128506 N 553 | 128507 128591 W 554 | 128592 128639 N 555 | 128640 128709 W 556 | 128710 128715 N 557 | 128716 128716 W 558 | 128717 128719 N 559 | 128720 128722 W 560 | 128723 128724 N 561 | 128725 128727 W 562 | 128728 128731 N 563 | 128732 128735 W 564 | 128736 128746 N 565 | 128747 128748 W 566 | 128749 128755 N 567 | 128756 128764 W 568 | 128765 128991 N 569 | 128992 129003 W 570 | 129004 129007 N 571 | 129008 129008 W 572 | 129009 129291 N 573 | 129292 129338 W 574 | 129339 129339 N 575 | 129340 129349 W 576 | 129350 129350 N 577 | 129351 129535 W 578 | 129536 129647 N 579 | 129648 129660 W 580 | 129661 129663 N 581 | 129664 129673 W 582 | 129674 129678 N 583 | 129679 129734 W 584 | 129735 129741 N 585 | 129742 129756 W 586 | 129757 129758 N 587 | 129759 129769 W 588 | 129770 129775 N 589 | 129776 129784 W 590 | 129785 131071 N 591 | 131072 196605 W 592 | 196606 196607 N 593 | 196608 262141 W 594 | 262142 917759 N 595 | 917760 917999 A 596 | 918000 983039 N 597 | 983040 1048573 A 598 | 1048574 1048575 N 599 | 1048576 1114109 A 600 | 1114110 1114111 N 601 | -------------------------------------------------------------------------------- /test/test_general_category.lua: 
--------------------------------------------------------------------------------
-- Copyright (C) 2018,2023 Tomoyuki Fujimori
--
-- This file is part of dromozoa-utf8.
--
-- dromozoa-utf8 is free software: you can redistribute it and/or modify
-- it under the terms of the GNU General Public License as published by
-- the Free Software Foundation, either version 3 of the License, or
-- (at your option) any later version.
--
-- dromozoa-utf8 is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with dromozoa-utf8. If not, see <https://www.gnu.org/licenses/>.

-- Data-driven test for ucd.general_category.
--
-- Every line of the fixture must have the form "<first>\t<last>\t<category>"
-- (decimal code points, one- or two-letter category).  For each code point in
-- the inclusive range first..last, ucd.general_category must return exactly
-- <category>.  Any malformed line or mismatch aborts the script with an error.

local ucd = require "dromozoa.ucd"

local path = "test/test_general_category.txt"
local line_number = 0

for line in io.lines(path) do
  line_number = line_number + 1

  local first, last, property = line:match "^(%d+)\t(%d+)\t(%a%a?)$"
  if not first then
    error(("%s:%d: malformed line: %q"):format(path, line_number, line))
  end

  -- The captures are strings.  Convert them explicitly so the loop does not
  -- depend on implicit coercion and the function under test receives integer
  -- code points rather than floats on Lua 5.3+.
  first, last = tonumber(first), tonumber(last)

  for code_point = first, last do
    local actual = ucd.general_category(code_point)
    -- Build the message only on failure; the fixture spans the whole code
    -- space, so formatting on every iteration would be wasteful.
    if actual ~= property then
      error(("%s:%d: general_category(U+%04X) returned %s, expected %s"):format(
        path, line_number, code_point, tostring(actual), property))
    end
  end
end
-------------------------------------------------------------------------------- /test/test_is_white_space.lua: -------------------------------------------------------------------------------- 1 | -- Copyright (C) 2018,2023 Tomoyuki Fujimori 2 | -- 3 | -- This file is part of dromozoa-utf8. 4 | -- 5 | -- dromozoa-utf8 is free software: you can redistribute it and/or modify 6 | -- it under the terms of the GNU General Public License as published by 7 | -- the Free Software Foundation, either version 3 of the License, or 8 | -- (at your option) any later version. 9 | -- 10 | -- dromozoa-utf8 is distributed in the hope that it will be useful, 11 | -- but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | -- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with dromozoa-utf8. If not, see <https://www.gnu.org/licenses/>.

-- Data-driven test for ucd.is_white_space.
--
-- Every line of the fixture must have the form "<first>\t<last>\t<word>",
-- where <word> is "true" or "false".  For each code point in the inclusive
-- range first..last, ucd.is_white_space must return the matching boolean.
-- Any malformed line or mismatch aborts the script with an error.

local ucd = require "dromozoa.ucd"

local path = "test/test_is_white_space.txt"
local line_number = 0

for line in io.lines(path) do
  line_number = line_number + 1

  local first, last, property = line:match "^(%d+)\t(%d+)\t(%a+)$"
  if not first then
    error(("%s:%d: malformed line: %q"):format(path, line_number, line))
  end
  -- Reject anything but the two boolean words; otherwise a typo in the
  -- fixture would silently be treated as "false".
  if property ~= "true" and property ~= "false" then
    error(("%s:%d: expected true or false, got %q"):format(
      path, line_number, property))
  end

  local expected = property == "true"
  -- The captures are strings.  Convert them explicitly so the loop does not
  -- depend on implicit coercion and the function under test receives integer
  -- code points rather than floats on Lua 5.3+.
  first, last = tonumber(first), tonumber(last)

  for code_point = first, last do
    local actual = ucd.is_white_space(code_point)
    -- Build the message only on failure; the fixture spans the whole code
    -- space, so formatting on every iteration would be wasteful.
    if actual ~= expected then
      error(("%s:%d: is_white_space(U+%04X) returned %s, expected %s"):format(
        path, line_number, code_point, tostring(actual), tostring(expected)))
    end
  end
end
-------------------------------------------------------------------------------- /test/test_is_white_space.txt: -------------------------------------------------------------------------------- 1 | 0 8 false 2 | 9 13 true 3 | 14 31 false 4 | 32 32 true 5 | 33 132 false 6 | 133 133 true 7 | 134 159 false 8 | 160 160 true 9 | 161 5759 false 10 | 5760 5760 true 11 | 5761 8191 false 12 | 8192 8202 true 13 | 8203 8231 false 14 | 8232 8233 true 15 | 8234 8238 false 16 | 8239 8239 true 17 | 8240 8286 false 18 | 8287 8287 true 19 | 8288 12287 false 20 | 12288 12288 true 21 | 12289 1114111 false 22 | -------------------------------------------------------------------------------- /test/test_ucd_builder.lua: -------------------------------------------------------------------------------- 1 | -- Copyright (C) 2017,2018,2023 Tomoyuki Fujimori 2 | -- 3 | -- This file is part of dromozoa-utf8. 4 | -- 5 | -- dromozoa-utf8 is free software: you can redistribute it and/or modify 6 | -- it under the terms of the GNU General Public License as published by 7 | -- the Free Software Foundation, either version 3 of the License, or 8 | -- (at your option) any later version. 9 | -- 10 | -- dromozoa-utf8 is distributed in the hope that it will be useful, 11 | -- but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | -- GNU General Public License for more details. 14 | -- 15 | -- You should have received a copy of the GNU General Public License 16 | -- along with dromozoa-utf8.
-- If not, see <https://www.gnu.org/licenses/>.

-- Round-trip test for dromozoa.ucd.builder: build a property table, write the
-- compiled form to a temporary file, load that file back as a Lua chunk, and
-- check the function it returns against a handful of code points.

local utf8 = require "dromozoa.utf8"
local builder = require "dromozoa.ucd.builder"

-- builder(false): presumably `false` is the value for code points outside
-- every registered range (the negative assertions below rely on that).
local ucd_builder = builder(false)
ucd_builder:range(0x30, 0x39, true) -- "0".."9"
ucd_builder:range(0x41, 0x5A, true) -- "A".."Z"
ucd_builder:range(0x61, 0x7A, true) -- "a".."z"
local data = ucd_builder:build()

local tmpname = os.tmpname()

-- Run the write/load steps under pcall so the temporary file is removed even
-- when opening, compiling or loading fails; the error is re-raised afterwards.
local ok, result = pcall(function ()
  local out = assert(io.open(tmpname, "w"))
  -- compile is called as a plain function (no self); its return value is
  -- closed, so it is expected to hand back a file handle.
  ucd_builder.compile(out, data):close()
  return assert(loadfile(tmpname))()
end)
os.remove(tmpname)
if not ok then
  error(result, 0) -- level 0: keep the original message and position as is
end

local f = result

assert(f(utf8.codepoint "0"))
assert(f(utf8.codepoint "A"))
assert(f(utf8.codepoint "a"))
assert(not f(utf8.codepoint " "))
assert(not f(utf8.codepoint "あ"))
--------------------------------------------------------------------------------