├── .gitattributes ├── .gitignore ├── LICENSE.txt ├── README.md ├── api.py ├── bert.py ├── cpp-app ├── CMakeLists.txt ├── app.cpp └── unilib │ ├── tokenizer.cpp │ ├── tokenizer.h │ ├── unicode.cpp │ ├── unicode.h │ ├── uninorms.cpp │ ├── uninorms.h │ ├── unistrip.cpp │ ├── unistrip.h │ ├── utf16.cpp │ ├── utf16.h │ ├── utf8.cpp │ ├── utf8.h │ ├── version.cpp │ └── version.h ├── data ├── test.txt ├── train.txt └── valid.txt ├── img ├── cmake.png ├── curl.png ├── inference.png ├── make.png └── postman.png ├── requirements.txt └── run_ner.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.cpp linguist-detectable=false -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # IPython 79 | profile_default/ 80 | ipython_config.py 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # celery beat schedule file 86 | celerybeat-schedule 87 | 88 | # SageMath parsed files 89 | *.sage.py 90 | 91 | # Environments 92 | .env 93 | .venv 94 | env/ 95 | venv/ 96 | ENV/ 97 | env.bak/ 98 | venv.bak/ 99 | 100 | # Spyder project settings 101 | .spyderproject 102 | .spyproject 103 | 104 | # Rope project settings 105 | .ropeproject 106 | 107 | # mkdocs documentation 108 | /site 109 | 110 | # mypy 111 | .mypy_cache/ 112 | .dmypy.json 113 | dmypy.json 114 | 115 | # Pyre type checker 116 | .pyre/ 117 | out*/ 118 | .vscode/ 119 | out* 120 | libtorch/* 121 | cpp-app/CMakeFiles/* 122 | base/* 123 | cmake* 124 | Makefile 125 | CMakeCache.txt 126 | !cmake.png -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | GNU AFFERO GENERAL PUBLIC LICENSE 2 | Version 3, 19 November 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 
7 | 8 | Preamble 9 | 10 | The GNU Affero General Public License is a free, copyleft license for 11 | software and other kinds of works, specifically designed to ensure 12 | cooperation with the community in the case of network server software. 13 | 14 | The licenses for most software and other practical works are designed 15 | to take away your freedom to share and change the works. By contrast, 16 | our General Public Licenses are intended to guarantee your freedom to 17 | share and change all versions of a program--to make sure it remains free 18 | software for all its users. 19 | 20 | When we speak of free software, we are referring to freedom, not 21 | price. Our General Public Licenses are designed to make sure that you 22 | have the freedom to distribute copies of free software (and charge for 23 | them if you wish), that you receive source code or can get it if you 24 | want it, that you can change the software or use pieces of it in new 25 | free programs, and that you know you can do these things. 26 | 27 | Developers that use our General Public Licenses protect your rights 28 | with two steps: (1) assert copyright on the software, and (2) offer 29 | you this License which gives you legal permission to copy, distribute 30 | and/or modify the software. 31 | 32 | A secondary benefit of defending all users' freedom is that 33 | improvements made in alternate versions of the program, if they 34 | receive widespread use, become available for other developers to 35 | incorporate. Many developers of free software are heartened and 36 | encouraged by the resulting cooperation. However, in the case of 37 | software used on network servers, this result may fail to come about. 38 | The GNU General Public License permits making a modified version and 39 | letting the public access it on a server without ever releasing its 40 | source code to the public. 
41 | 42 | The GNU Affero General Public License is designed specifically to 43 | ensure that, in such cases, the modified source code becomes available 44 | to the community. It requires the operator of a network server to 45 | provide the source code of the modified version running there to the 46 | users of that server. Therefore, public use of a modified version, on 47 | a publicly accessible server, gives the public access to the source 48 | code of the modified version. 49 | 50 | An older license, called the Affero General Public License and 51 | published by Affero, was designed to accomplish similar goals. This is 52 | a different license, not a version of the Affero GPL, but Affero has 53 | released a new version of the Affero GPL which permits relicensing under 54 | this license. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | TERMS AND CONDITIONS 60 | 61 | 0. Definitions. 62 | 63 | "This License" refers to version 3 of the GNU Affero General Public License. 64 | 65 | "Copyright" also means copyright-like laws that apply to other kinds of 66 | works, such as semiconductor masks. 67 | 68 | "The Program" refers to any copyrightable work licensed under this 69 | License. Each licensee is addressed as "you". "Licensees" and 70 | "recipients" may be individuals or organizations. 71 | 72 | To "modify" a work means to copy from or adapt all or part of the work 73 | in a fashion requiring copyright permission, other than the making of an 74 | exact copy. The resulting work is called a "modified version" of the 75 | earlier work or a work "based on" the earlier work. 76 | 77 | A "covered work" means either the unmodified Program or a work based 78 | on the Program. 
79 | 80 | To "propagate" a work means to do anything with it that, without 81 | permission, would make you directly or secondarily liable for 82 | infringement under applicable copyright law, except executing it on a 83 | computer or modifying a private copy. Propagation includes copying, 84 | distribution (with or without modification), making available to the 85 | public, and in some countries other activities as well. 86 | 87 | To "convey" a work means any kind of propagation that enables other 88 | parties to make or receive copies. Mere interaction with a user through 89 | a computer network, with no transfer of a copy, is not conveying. 90 | 91 | An interactive user interface displays "Appropriate Legal Notices" 92 | to the extent that it includes a convenient and prominently visible 93 | feature that (1) displays an appropriate copyright notice, and (2) 94 | tells the user that there is no warranty for the work (except to the 95 | extent that warranties are provided), that licensees may convey the 96 | work under this License, and how to view a copy of this License. If 97 | the interface presents a list of user commands or options, such as a 98 | menu, a prominent item in the list meets this criterion. 99 | 100 | 1. Source Code. 101 | 102 | The "source code" for a work means the preferred form of the work 103 | for making modifications to it. "Object code" means any non-source 104 | form of a work. 105 | 106 | A "Standard Interface" means an interface that either is an official 107 | standard defined by a recognized standards body, or, in the case of 108 | interfaces specified for a particular programming language, one that 109 | is widely used among developers working in that language. 
110 | 111 | The "System Libraries" of an executable work include anything, other 112 | than the work as a whole, that (a) is included in the normal form of 113 | packaging a Major Component, but which is not part of that Major 114 | Component, and (b) serves only to enable use of the work with that 115 | Major Component, or to implement a Standard Interface for which an 116 | implementation is available to the public in source code form. A 117 | "Major Component", in this context, means a major essential component 118 | (kernel, window system, and so on) of the specific operating system 119 | (if any) on which the executable work runs, or a compiler used to 120 | produce the work, or an object code interpreter used to run it. 121 | 122 | The "Corresponding Source" for a work in object code form means all 123 | the source code needed to generate, install, and (for an executable 124 | work) run the object code and to modify the work, including scripts to 125 | control those activities. However, it does not include the work's 126 | System Libraries, or general-purpose tools or generally available free 127 | programs which are used unmodified in performing those activities but 128 | which are not part of the work. For example, Corresponding Source 129 | includes interface definition files associated with source files for 130 | the work, and the source code for shared libraries and dynamically 131 | linked subprograms that the work is specifically designed to require, 132 | such as by intimate data communication or control flow between those 133 | subprograms and other parts of the work. 134 | 135 | The Corresponding Source need not include anything that users 136 | can regenerate automatically from other parts of the Corresponding 137 | Source. 138 | 139 | The Corresponding Source for a work in source code form is that 140 | same work. 141 | 142 | 2. Basic Permissions. 
143 | 144 | All rights granted under this License are granted for the term of 145 | copyright on the Program, and are irrevocable provided the stated 146 | conditions are met. This License explicitly affirms your unlimited 147 | permission to run the unmodified Program. The output from running a 148 | covered work is covered by this License only if the output, given its 149 | content, constitutes a covered work. This License acknowledges your 150 | rights of fair use or other equivalent, as provided by copyright law. 151 | 152 | You may make, run and propagate covered works that you do not 153 | convey, without conditions so long as your license otherwise remains 154 | in force. You may convey covered works to others for the sole purpose 155 | of having them make modifications exclusively for you, or provide you 156 | with facilities for running those works, provided that you comply with 157 | the terms of this License in conveying all material for which you do 158 | not control copyright. Those thus making or running the covered works 159 | for you must do so exclusively on your behalf, under your direction 160 | and control, on terms that prohibit them from making any copies of 161 | your copyrighted material outside their relationship with you. 162 | 163 | Conveying under any other circumstances is permitted solely under 164 | the conditions stated below. Sublicensing is not allowed; section 10 165 | makes it unnecessary. 166 | 167 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 168 | 169 | No covered work shall be deemed part of an effective technological 170 | measure under any applicable law fulfilling obligations under article 171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 172 | similar laws prohibiting or restricting circumvention of such 173 | measures. 
174 | 175 | When you convey a covered work, you waive any legal power to forbid 176 | circumvention of technological measures to the extent such circumvention 177 | is effected by exercising rights under this License with respect to 178 | the covered work, and you disclaim any intention to limit operation or 179 | modification of the work as a means of enforcing, against the work's 180 | users, your or third parties' legal rights to forbid circumvention of 181 | technological measures. 182 | 183 | 4. Conveying Verbatim Copies. 184 | 185 | You may convey verbatim copies of the Program's source code as you 186 | receive it, in any medium, provided that you conspicuously and 187 | appropriately publish on each copy an appropriate copyright notice; 188 | keep intact all notices stating that this License and any 189 | non-permissive terms added in accord with section 7 apply to the code; 190 | keep intact all notices of the absence of any warranty; and give all 191 | recipients a copy of this License along with the Program. 192 | 193 | You may charge any price or no price for each copy that you convey, 194 | and you may offer support or warranty protection for a fee. 195 | 196 | 5. Conveying Modified Source Versions. 197 | 198 | You may convey a work based on the Program, or the modifications to 199 | produce it from the Program, in the form of source code under the 200 | terms of section 4, provided that you also meet all of these conditions: 201 | 202 | a) The work must carry prominent notices stating that you modified 203 | it, and giving a relevant date. 204 | 205 | b) The work must carry prominent notices stating that it is 206 | released under this License and any conditions added under section 207 | 7. This requirement modifies the requirement in section 4 to 208 | "keep intact all notices". 209 | 210 | c) You must license the entire work, as a whole, under this 211 | License to anyone who comes into possession of a copy. 
This 212 | License will therefore apply, along with any applicable section 7 213 | additional terms, to the whole of the work, and all its parts, 214 | regardless of how they are packaged. This License gives no 215 | permission to license the work in any other way, but it does not 216 | invalidate such permission if you have separately received it. 217 | 218 | d) If the work has interactive user interfaces, each must display 219 | Appropriate Legal Notices; however, if the Program has interactive 220 | interfaces that do not display Appropriate Legal Notices, your 221 | work need not make them do so. 222 | 223 | A compilation of a covered work with other separate and independent 224 | works, which are not by their nature extensions of the covered work, 225 | and which are not combined with it such as to form a larger program, 226 | in or on a volume of a storage or distribution medium, is called an 227 | "aggregate" if the compilation and its resulting copyright are not 228 | used to limit the access or legal rights of the compilation's users 229 | beyond what the individual works permit. Inclusion of a covered work 230 | in an aggregate does not cause this License to apply to the other 231 | parts of the aggregate. 232 | 233 | 6. Conveying Non-Source Forms. 234 | 235 | You may convey a covered work in object code form under the terms 236 | of sections 4 and 5, provided that you also convey the 237 | machine-readable Corresponding Source under the terms of this License, 238 | in one of these ways: 239 | 240 | a) Convey the object code in, or embodied in, a physical product 241 | (including a physical distribution medium), accompanied by the 242 | Corresponding Source fixed on a durable physical medium 243 | customarily used for software interchange. 
244 | 245 | b) Convey the object code in, or embodied in, a physical product 246 | (including a physical distribution medium), accompanied by a 247 | written offer, valid for at least three years and valid for as 248 | long as you offer spare parts or customer support for that product 249 | model, to give anyone who possesses the object code either (1) a 250 | copy of the Corresponding Source for all the software in the 251 | product that is covered by this License, on a durable physical 252 | medium customarily used for software interchange, for a price no 253 | more than your reasonable cost of physically performing this 254 | conveying of source, or (2) access to copy the 255 | Corresponding Source from a network server at no charge. 256 | 257 | c) Convey individual copies of the object code with a copy of the 258 | written offer to provide the Corresponding Source. This 259 | alternative is allowed only occasionally and noncommercially, and 260 | only if you received the object code with such an offer, in accord 261 | with subsection 6b. 262 | 263 | d) Convey the object code by offering access from a designated 264 | place (gratis or for a charge), and offer equivalent access to the 265 | Corresponding Source in the same way through the same place at no 266 | further charge. You need not require recipients to copy the 267 | Corresponding Source along with the object code. If the place to 268 | copy the object code is a network server, the Corresponding Source 269 | may be on a different server (operated by you or a third party) 270 | that supports equivalent copying facilities, provided you maintain 271 | clear directions next to the object code saying where to find the 272 | Corresponding Source. Regardless of what server hosts the 273 | Corresponding Source, you remain obligated to ensure that it is 274 | available for as long as needed to satisfy these requirements. 
275 | 276 | e) Convey the object code using peer-to-peer transmission, provided 277 | you inform other peers where the object code and Corresponding 278 | Source of the work are being offered to the general public at no 279 | charge under subsection 6d. 280 | 281 | A separable portion of the object code, whose source code is excluded 282 | from the Corresponding Source as a System Library, need not be 283 | included in conveying the object code work. 284 | 285 | A "User Product" is either (1) a "consumer product", which means any 286 | tangible personal property which is normally used for personal, family, 287 | or household purposes, or (2) anything designed or sold for incorporation 288 | into a dwelling. In determining whether a product is a consumer product, 289 | doubtful cases shall be resolved in favor of coverage. For a particular 290 | product received by a particular user, "normally used" refers to a 291 | typical or common use of that class of product, regardless of the status 292 | of the particular user or of the way in which the particular user 293 | actually uses, or expects or is expected to use, the product. A product 294 | is a consumer product regardless of whether the product has substantial 295 | commercial, industrial or non-consumer uses, unless such uses represent 296 | the only significant mode of use of the product. 297 | 298 | "Installation Information" for a User Product means any methods, 299 | procedures, authorization keys, or other information required to install 300 | and execute modified versions of a covered work in that User Product from 301 | a modified version of its Corresponding Source. The information must 302 | suffice to ensure that the continued functioning of the modified object 303 | code is in no case prevented or interfered with solely because 304 | modification has been made. 
305 | 306 | If you convey an object code work under this section in, or with, or 307 | specifically for use in, a User Product, and the conveying occurs as 308 | part of a transaction in which the right of possession and use of the 309 | User Product is transferred to the recipient in perpetuity or for a 310 | fixed term (regardless of how the transaction is characterized), the 311 | Corresponding Source conveyed under this section must be accompanied 312 | by the Installation Information. But this requirement does not apply 313 | if neither you nor any third party retains the ability to install 314 | modified object code on the User Product (for example, the work has 315 | been installed in ROM). 316 | 317 | The requirement to provide Installation Information does not include a 318 | requirement to continue to provide support service, warranty, or updates 319 | for a work that has been modified or installed by the recipient, or for 320 | the User Product in which it has been modified or installed. Access to a 321 | network may be denied when the modification itself materially and 322 | adversely affects the operation of the network or violates the rules and 323 | protocols for communication across the network. 324 | 325 | Corresponding Source conveyed, and Installation Information provided, 326 | in accord with this section must be in a format that is publicly 327 | documented (and with an implementation available to the public in 328 | source code form), and must require no special password or key for 329 | unpacking, reading or copying. 330 | 331 | 7. Additional Terms. 332 | 333 | "Additional permissions" are terms that supplement the terms of this 334 | License by making exceptions from one or more of its conditions. 335 | Additional permissions that are applicable to the entire Program shall 336 | be treated as though they were included in this License, to the extent 337 | that they are valid under applicable law. 
If additional permissions 338 | apply only to part of the Program, that part may be used separately 339 | under those permissions, but the entire Program remains governed by 340 | this License without regard to the additional permissions. 341 | 342 | When you convey a copy of a covered work, you may at your option 343 | remove any additional permissions from that copy, or from any part of 344 | it. (Additional permissions may be written to require their own 345 | removal in certain cases when you modify the work.) You may place 346 | additional permissions on material, added by you to a covered work, 347 | for which you have or can give appropriate copyright permission. 348 | 349 | Notwithstanding any other provision of this License, for material you 350 | add to a covered work, you may (if authorized by the copyright holders of 351 | that material) supplement the terms of this License with terms: 352 | 353 | a) Disclaiming warranty or limiting liability differently from the 354 | terms of sections 15 and 16 of this License; or 355 | 356 | b) Requiring preservation of specified reasonable legal notices or 357 | author attributions in that material or in the Appropriate Legal 358 | Notices displayed by works containing it; or 359 | 360 | c) Prohibiting misrepresentation of the origin of that material, or 361 | requiring that modified versions of such material be marked in 362 | reasonable ways as different from the original version; or 363 | 364 | d) Limiting the use for publicity purposes of names of licensors or 365 | authors of the material; or 366 | 367 | e) Declining to grant rights under trademark law for use of some 368 | trade names, trademarks, or service marks; or 369 | 370 | f) Requiring indemnification of licensors and authors of that 371 | material by anyone who conveys the material (or modified versions of 372 | it) with contractual assumptions of liability to the recipient, for 373 | any liability that these contractual assumptions directly impose on 
374 | those licensors and authors. 375 | 376 | All other non-permissive additional terms are considered "further 377 | restrictions" within the meaning of section 10. If the Program as you 378 | received it, or any part of it, contains a notice stating that it is 379 | governed by this License along with a term that is a further 380 | restriction, you may remove that term. If a license document contains 381 | a further restriction but permits relicensing or conveying under this 382 | License, you may add to a covered work material governed by the terms 383 | of that license document, provided that the further restriction does 384 | not survive such relicensing or conveying. 385 | 386 | If you add terms to a covered work in accord with this section, you 387 | must place, in the relevant source files, a statement of the 388 | additional terms that apply to those files, or a notice indicating 389 | where to find the applicable terms. 390 | 391 | Additional terms, permissive or non-permissive, may be stated in the 392 | form of a separately written license, or stated as exceptions; 393 | the above requirements apply either way. 394 | 395 | 8. Termination. 396 | 397 | You may not propagate or modify a covered work except as expressly 398 | provided under this License. Any attempt otherwise to propagate or 399 | modify it is void, and will automatically terminate your rights under 400 | this License (including any patent licenses granted under the third 401 | paragraph of section 11). 402 | 403 | However, if you cease all violation of this License, then your 404 | license from a particular copyright holder is reinstated (a) 405 | provisionally, unless and until the copyright holder explicitly and 406 | finally terminates your license, and (b) permanently, if the copyright 407 | holder fails to notify you of the violation by some reasonable means 408 | prior to 60 days after the cessation. 
409 | 410 | Moreover, your license from a particular copyright holder is 411 | reinstated permanently if the copyright holder notifies you of the 412 | violation by some reasonable means, this is the first time you have 413 | received notice of violation of this License (for any work) from that 414 | copyright holder, and you cure the violation prior to 30 days after 415 | your receipt of the notice. 416 | 417 | Termination of your rights under this section does not terminate the 418 | licenses of parties who have received copies or rights from you under 419 | this License. If your rights have been terminated and not permanently 420 | reinstated, you do not qualify to receive new licenses for the same 421 | material under section 10. 422 | 423 | 9. Acceptance Not Required for Having Copies. 424 | 425 | You are not required to accept this License in order to receive or 426 | run a copy of the Program. Ancillary propagation of a covered work 427 | occurring solely as a consequence of using peer-to-peer transmission 428 | to receive a copy likewise does not require acceptance. However, 429 | nothing other than this License grants you permission to propagate or 430 | modify any covered work. These actions infringe copyright if you do 431 | not accept this License. Therefore, by modifying or propagating a 432 | covered work, you indicate your acceptance of this License to do so. 433 | 434 | 10. Automatic Licensing of Downstream Recipients. 435 | 436 | Each time you convey a covered work, the recipient automatically 437 | receives a license from the original licensors, to run, modify and 438 | propagate that work, subject to this License. You are not responsible 439 | for enforcing compliance by third parties with this License. 440 | 441 | An "entity transaction" is a transaction transferring control of an 442 | organization, or substantially all assets of one, or subdividing an 443 | organization, or merging organizations. 
If propagation of a covered 444 | work results from an entity transaction, each party to that 445 | transaction who receives a copy of the work also receives whatever 446 | licenses to the work the party's predecessor in interest had or could 447 | give under the previous paragraph, plus a right to possession of the 448 | Corresponding Source of the work from the predecessor in interest, if 449 | the predecessor has it or can get it with reasonable efforts. 450 | 451 | You may not impose any further restrictions on the exercise of the 452 | rights granted or affirmed under this License. For example, you may 453 | not impose a license fee, royalty, or other charge for exercise of 454 | rights granted under this License, and you may not initiate litigation 455 | (including a cross-claim or counterclaim in a lawsuit) alleging that 456 | any patent claim is infringed by making, using, selling, offering for 457 | sale, or importing the Program or any portion of it. 458 | 459 | 11. Patents. 460 | 461 | A "contributor" is a copyright holder who authorizes use under this 462 | License of the Program or a work on which the Program is based. The 463 | work thus licensed is called the contributor's "contributor version". 464 | 465 | A contributor's "essential patent claims" are all patent claims 466 | owned or controlled by the contributor, whether already acquired or 467 | hereafter acquired, that would be infringed by some manner, permitted 468 | by this License, of making, using, or selling its contributor version, 469 | but do not include claims that would be infringed only as a 470 | consequence of further modification of the contributor version. For 471 | purposes of this definition, "control" includes the right to grant 472 | patent sublicenses in a manner consistent with the requirements of 473 | this License. 
474 | 475 | Each contributor grants you a non-exclusive, worldwide, royalty-free 476 | patent license under the contributor's essential patent claims, to 477 | make, use, sell, offer for sale, import and otherwise run, modify and 478 | propagate the contents of its contributor version. 479 | 480 | In the following three paragraphs, a "patent license" is any express 481 | agreement or commitment, however denominated, not to enforce a patent 482 | (such as an express permission to practice a patent or covenant not to 483 | sue for patent infringement). To "grant" such a patent license to a 484 | party means to make such an agreement or commitment not to enforce a 485 | patent against the party. 486 | 487 | If you convey a covered work, knowingly relying on a patent license, 488 | and the Corresponding Source of the work is not available for anyone 489 | to copy, free of charge and under the terms of this License, through a 490 | publicly available network server or other readily accessible means, 491 | then you must either (1) cause the Corresponding Source to be so 492 | available, or (2) arrange to deprive yourself of the benefit of the 493 | patent license for this particular work, or (3) arrange, in a manner 494 | consistent with the requirements of this License, to extend the patent 495 | license to downstream recipients. "Knowingly relying" means you have 496 | actual knowledge that, but for the patent license, your conveying the 497 | covered work in a country, or your recipient's use of the covered work 498 | in a country, would infringe one or more identifiable patents in that 499 | country that you have reason to believe are valid. 
500 | 501 | If, pursuant to or in connection with a single transaction or 502 | arrangement, you convey, or propagate by procuring conveyance of, a 503 | covered work, and grant a patent license to some of the parties 504 | receiving the covered work authorizing them to use, propagate, modify 505 | or convey a specific copy of the covered work, then the patent license 506 | you grant is automatically extended to all recipients of the covered 507 | work and works based on it. 508 | 509 | A patent license is "discriminatory" if it does not include within 510 | the scope of its coverage, prohibits the exercise of, or is 511 | conditioned on the non-exercise of one or more of the rights that are 512 | specifically granted under this License. You may not convey a covered 513 | work if you are a party to an arrangement with a third party that is 514 | in the business of distributing software, under which you make payment 515 | to the third party based on the extent of your activity of conveying 516 | the work, and under which the third party grants, to any of the 517 | parties who would receive the covered work from you, a discriminatory 518 | patent license (a) in connection with copies of the covered work 519 | conveyed by you (or copies made from those copies), or (b) primarily 520 | for and in connection with specific products or compilations that 521 | contain the covered work, unless you entered into that arrangement, 522 | or that patent license was granted, prior to 28 March 2007. 523 | 524 | Nothing in this License shall be construed as excluding or limiting 525 | any implied license or other defenses to infringement that may 526 | otherwise be available to you under applicable patent law. 527 | 528 | 12. No Surrender of Others' Freedom. 529 | 530 | If conditions are imposed on you (whether by court order, agreement or 531 | otherwise) that contradict the conditions of this License, they do not 532 | excuse you from the conditions of this License. 
If you cannot convey a 533 | covered work so as to satisfy simultaneously your obligations under this 534 | License and any other pertinent obligations, then as a consequence you may 535 | not convey it at all. For example, if you agree to terms that obligate you 536 | to collect a royalty for further conveying from those to whom you convey 537 | the Program, the only way you could satisfy both those terms and this 538 | License would be to refrain entirely from conveying the Program. 539 | 540 | 13. Remote Network Interaction; Use with the GNU General Public License. 541 | 542 | Notwithstanding any other provision of this License, if you modify the 543 | Program, your modified version must prominently offer all users 544 | interacting with it remotely through a computer network (if your version 545 | supports such interaction) an opportunity to receive the Corresponding 546 | Source of your version by providing access to the Corresponding Source 547 | from a network server at no charge, through some standard or customary 548 | means of facilitating copying of software. This Corresponding Source 549 | shall include the Corresponding Source for any work covered by version 3 550 | of the GNU General Public License that is incorporated pursuant to the 551 | following paragraph. 552 | 553 | Notwithstanding any other provision of this License, you have 554 | permission to link or combine any covered work with a work licensed 555 | under version 3 of the GNU General Public License into a single 556 | combined work, and to convey the resulting work. The terms of this 557 | License will continue to apply to the part which is the covered work, 558 | but the work with which it is combined will remain governed by version 559 | 3 of the GNU General Public License. 560 | 561 | 14. Revised Versions of this License. 562 | 563 | The Free Software Foundation may publish revised and/or new versions of 564 | the GNU Affero General Public License from time to time. 
Such new versions 565 | will be similar in spirit to the present version, but may differ in detail to 566 | address new problems or concerns. 567 | 568 | Each version is given a distinguishing version number. If the 569 | Program specifies that a certain numbered version of the GNU Affero General 570 | Public License "or any later version" applies to it, you have the 571 | option of following the terms and conditions either of that numbered 572 | version or of any later version published by the Free Software 573 | Foundation. If the Program does not specify a version number of the 574 | GNU Affero General Public License, you may choose any version ever published 575 | by the Free Software Foundation. 576 | 577 | If the Program specifies that a proxy can decide which future 578 | versions of the GNU Affero General Public License can be used, that proxy's 579 | public statement of acceptance of a version permanently authorizes you 580 | to choose that version for the Program. 581 | 582 | Later license versions may give you additional or different 583 | permissions. However, no additional obligations are imposed on any 584 | author or copyright holder as a result of your choosing to follow a 585 | later version. 586 | 587 | 15. Disclaimer of Warranty. 588 | 589 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 590 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 594 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 595 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 597 | 598 | 16. Limitation of Liability. 
599 | 600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 608 | SUCH DAMAGES. 609 | 610 | 17. Interpretation of Sections 15 and 16. 611 | 612 | If the disclaimer of warranty and limitation of liability provided 613 | above cannot be given local legal effect according to their terms, 614 | reviewing courts shall apply local law that most closely approximates 615 | an absolute waiver of all civil liability in connection with the 616 | Program, unless a warranty or assumption of liability accompanies a 617 | copy of the Program in return for a fee. 618 | 619 | END OF TERMS AND CONDITIONS 620 | 621 | How to Apply These Terms to Your New Programs 622 | 623 | If you develop a new program, and you want it to be of the greatest 624 | possible use to the public, the best way to achieve this is to make it 625 | free software which everyone can redistribute and change under these terms. 626 | 627 | To do so, attach the following notices to the program. It is safest 628 | to attach them to the start of each source file to most effectively 629 | state the exclusion of warranty; and each file should have at least 630 | the "copyright" line and a pointer to where the full notice is found. 
631 | 632 | 633 | Copyright (C) 634 | 635 | This program is free software: you can redistribute it and/or modify 636 | it under the terms of the GNU Affero General Public License as published 637 | by the Free Software Foundation, either version 3 of the License, or 638 | (at your option) any later version. 639 | 640 | This program is distributed in the hope that it will be useful, 641 | but WITHOUT ANY WARRANTY; without even the implied warranty of 642 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 643 | GNU Affero General Public License for more details. 644 | 645 | You should have received a copy of the GNU Affero General Public License 646 | along with this program. If not, see . 647 | 648 | Also add information on how to contact you by electronic and paper mail. 649 | 650 | If your software can interact with users remotely through a computer 651 | network, you should also make sure that it provides a way for users to 652 | get its source. For example, if your program is a web application, its 653 | interface could display a "Source" link that leads users to an archive 654 | of the code. There are many ways you could offer source, and different 655 | solutions will be better for different programs; see section 13 for the 656 | specific requirements. 657 | 658 | You should also get your employer (if you work as a programmer) or school, 659 | if any, to sign a "copyright disclaimer" for the program, if necessary. 660 | For more information on this, and how to apply and follow the GNU AGPL, see 661 | . 662 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BERT NER 2 | 3 | Use google BERT to do CoNLL-2003 NER ! 
4 | 5 | ![new](https://i.imgur.com/OB4Ugp4.png) Train model using Python and Inference using C++ 6 | 7 | [ALBERT-TF2.0](https://github.com/kamalkraj/ALBERT-TF2.0) 8 | 9 | [BERT-NER-TENSORFLOW-2.0](https://github.com/kamalkraj/BERT-NER-TF) 10 | 11 | [BERT-SQuAD](https://github.com/kamalkraj/BERT-SQuAD) 12 | 13 | 14 | # Requirements 15 | 16 | - `python3` 17 | - `pip3 install -r requirements.txt` 18 | 19 | # Run 20 | 21 | `python run_ner.py --data_dir=data/ --bert_model=bert-base-cased --task_name=ner --output_dir=out_base --max_seq_length=128 --do_train --num_train_epochs 5 --do_eval --warmup_proportion=0.1` 22 | 23 | 24 | # Result 25 | 26 | ## BERT-BASE 27 | 28 | ### Validation Data 29 | ``` 30 | precision recall f1-score support 31 | 32 | PER 0.9677 0.9745 0.9711 1842 33 | LOC 0.9654 0.9711 0.9682 1837 34 | MISC 0.8851 0.9111 0.8979 922 35 | ORG 0.9299 0.9292 0.9295 1341 36 | 37 | avg / total 0.9456 0.9534 0.9495 5942 38 | ``` 39 | ### Test Data 40 | ``` 41 | precision recall f1-score support 42 | 43 | PER 0.9635 0.9629 0.9632 1617 44 | ORG 0.8883 0.9097 0.8989 1661 45 | LOC 0.9272 0.9317 0.9294 1668 46 | MISC 0.7689 0.8248 0.7959 702 47 | 48 | avg / total 0.9065 0.9209 0.9135 5648 49 | ``` 50 | ## Pretrained model download from [here](https://1drv.ms/u/s!Auc3VRul9wo5hghurzE47bTRyUeR?e=08seO3) 51 | 52 | ## BERT-LARGE 53 | 54 | ### Validation Data 55 | ``` 56 | precision recall f1-score support 57 | 58 | ORG 0.9288 0.9441 0.9364 1341 59 | LOC 0.9754 0.9728 0.9741 1837 60 | MISC 0.8976 0.9219 0.9096 922 61 | PER 0.9762 0.9799 0.9781 1842 62 | 63 | avg / total 0.9531 0.9606 0.9568 5942 64 | ``` 65 | ### Test Data 66 | ``` 67 | precision recall f1-score support 68 | 69 | LOC 0.9366 0.9293 0.9329 1668 70 | ORG 0.8881 0.9175 0.9026 1661 71 | PER 0.9695 0.9623 0.9659 1617 72 | MISC 0.7787 0.8319 0.8044 702 73 | 74 | avg / total 0.9121 0.9232 0.9174 5648 75 | ``` 76 | ## Pretrained model download from [here](https://1drv.ms/u/s!Auc3VRul9wo5hgr8jwhFD8iPCYp1?e=UsJJ2V) 77 | 
78 | # Inference 79 | 80 | ```python 81 | from bert import Ner 82 | 83 | model = Ner("out_base/") 84 | 85 | output = model.predict("Steve went to Paris") 86 | 87 | print(output) 88 | ''' 89 | [ 90 | { 91 | "confidence": 0.9981840252876282, 92 | "tag": "B-PER", 93 | "word": "Steve" 94 | }, 95 | { 96 | "confidence": 0.9998939037322998, 97 | "tag": "O", 98 | "word": "went" 99 | }, 100 | { 101 | "confidence": 0.999891996383667, 102 | "tag": "O", 103 | "word": "to" 104 | }, 105 | { 106 | "confidence": 0.9991968274116516, 107 | "tag": "B-LOC", 108 | "word": "Paris" 109 | } 110 | ] 111 | ''' 112 | ``` 113 | 114 | # Inference C++ 115 | 116 | ## Pretrained and converted bert-base model download from [here](https://1drv.ms/u/s!Auc3VRul9wo5hgkJjtxZ8FAQGuj2?e=wffJCT) 117 | ### Download libtorch from [here](https://download.pytorch.org/libtorch/cpu/libtorch-shared-with-deps-1.2.0.zip) 118 | 119 | - install `cmake`, tested with `cmake` version `3.10.2` 120 | - unzip downloaded model and `libtorch` in `BERT-NER` 121 | - Compile C++ App 122 | ```bash 123 | cd cpp-app/ 124 | cmake -DCMAKE_PREFIX_PATH=../libtorch 125 | ``` 126 | ![cmake output image](/img/cmake.png) 127 | ```bash 128 | make 129 | ``` 130 | ![make output image](/img/make.png) 131 | 132 | 133 | - Runing APP 134 | ```bash 135 | ./app ../base 136 | ``` 137 | ![inference output image](/img/inference.png) 138 | 139 | NB: Bert-Base C++ model is split in to two parts. 140 | - Bert Feature extractor and NER classifier. 141 | - This is done because `jit trace` don't support `input` depended `for` loop or `if` conditions inside `forword` function of `model`. 
142 | 143 | 144 | 145 | # Deploy REST-API 146 | BERT NER model deployed as rest api 147 | ```bash 148 | python api.py 149 | ``` 150 | API will be live at `0.0.0.0:8000` endpoint `predict` 151 | #### cURL request 152 | ` curl -X POST http://0.0.0.0:8000/predict -H 'Content-Type: application/json' -d '{ "text": "Steve went to Paris" }'` 153 | 154 | Output 155 | ```json 156 | { 157 | "result": [ 158 | { 159 | "confidence": 0.9981840252876282, 160 | "tag": "B-PER", 161 | "word": "Steve" 162 | }, 163 | { 164 | "confidence": 0.9998939037322998, 165 | "tag": "O", 166 | "word": "went" 167 | }, 168 | { 169 | "confidence": 0.999891996383667, 170 | "tag": "O", 171 | "word": "to" 172 | }, 173 | { 174 | "confidence": 0.9991968274116516, 175 | "tag": "B-LOC", 176 | "word": "Paris" 177 | } 178 | ] 179 | } 180 | ``` 181 | #### cURL 182 | ![curl output image](/img/curl.png) 183 | #### Postman 184 | ![postman output image](/img/postman.png) 185 | 186 | ### C++ unicode support 187 | - http://github.com/ufal/unilib 188 | 189 | ### Tensorflow version 190 | 191 | - https://github.com/kyzhouhzau/BERT-NER 192 | -------------------------------------------------------------------------------- /api.py: -------------------------------------------------------------------------------- 1 | from flask import Flask,request,jsonify 2 | from flask_cors import CORS 3 | 4 | from bert import Ner 5 | 6 | app = Flask(__name__) 7 | CORS(app) 8 | 9 | model = Ner("out_!x") 10 | 11 | @app.route("/predict",methods=['POST']) 12 | def predict(): 13 | text = request.json["text"] 14 | try: 15 | out = model.predict(text) 16 | return jsonify({"result":out}) 17 | except Exception as e: 18 | print(e) 19 | return jsonify({"result":"Model Failed"}) 20 | 21 | if __name__ == "__main__": 22 | app.run('0.0.0.0',port=8000) -------------------------------------------------------------------------------- /bert.py: -------------------------------------------------------------------------------- 1 | """BERT NER Inference.""" 2 
| 3 | from __future__ import absolute_import, division, print_function 4 | 5 | import json 6 | import os 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | from nltk import word_tokenize 11 | from pytorch_transformers import (BertConfig, BertForTokenClassification, 12 | BertTokenizer) 13 | 14 | 15 | class BertNer(BertForTokenClassification): 16 | 17 | def forward(self, input_ids, token_type_ids=None, attention_mask=None, valid_ids=None): 18 | sequence_output = self.bert(input_ids, token_type_ids, attention_mask, head_mask=None)[0] 19 | batch_size,max_len,feat_dim = sequence_output.shape 20 | valid_output = torch.zeros(batch_size,max_len,feat_dim,dtype=torch.float32,device='cuda' if torch.cuda.is_available() else 'cpu') 21 | for i in range(batch_size): 22 | jj = -1 23 | for j in range(max_len): 24 | if valid_ids[i][j].item() == 1: 25 | jj += 1 26 | valid_output[i][jj] = sequence_output[i][j] 27 | sequence_output = self.dropout(valid_output) 28 | logits = self.classifier(sequence_output) 29 | return logits 30 | 31 | class Ner: 32 | 33 | def __init__(self,model_dir: str): 34 | self.model , self.tokenizer, self.model_config = self.load_model(model_dir) 35 | self.label_map = self.model_config["label_map"] 36 | self.max_seq_length = self.model_config["max_seq_length"] 37 | self.label_map = {int(k):v for k,v in self.label_map.items()} 38 | self.device = "cuda" if torch.cuda.is_available() else "cpu" 39 | self.model = self.model.to(self.device) 40 | self.model.eval() 41 | 42 | def load_model(self, model_dir: str, model_config: str = "model_config.json"): 43 | model_config = os.path.join(model_dir,model_config) 44 | model_config = json.load(open(model_config)) 45 | model = BertNer.from_pretrained(model_dir) 46 | tokenizer = BertTokenizer.from_pretrained(model_dir, do_lower_case=model_config["do_lower"]) 47 | return model, tokenizer, model_config 48 | 49 | def tokenize(self, text: str): 50 | """ tokenize input""" 51 | words = word_tokenize(text) 52 | tokens = [] 
53 | valid_positions = [] 54 | for i,word in enumerate(words): 55 | token = self.tokenizer.tokenize(word) 56 | tokens.extend(token) 57 | for i in range(len(token)): 58 | if i == 0: 59 | valid_positions.append(1) 60 | else: 61 | valid_positions.append(0) 62 | return tokens, valid_positions 63 | 64 | def preprocess(self, text: str): 65 | """ preprocess """ 66 | tokens, valid_positions = self.tokenize(text) 67 | ## insert "[CLS]" 68 | tokens.insert(0,"[CLS]") 69 | valid_positions.insert(0,1) 70 | ## insert "[SEP]" 71 | tokens.append("[SEP]") 72 | valid_positions.append(1) 73 | segment_ids = [] 74 | for i in range(len(tokens)): 75 | segment_ids.append(0) 76 | input_ids = self.tokenizer.convert_tokens_to_ids(tokens) 77 | input_mask = [1] * len(input_ids) 78 | while len(input_ids) < self.max_seq_length: 79 | input_ids.append(0) 80 | input_mask.append(0) 81 | segment_ids.append(0) 82 | valid_positions.append(0) 83 | return input_ids,input_mask,segment_ids,valid_positions 84 | 85 | def predict(self, text: str): 86 | input_ids,input_mask,segment_ids,valid_ids = self.preprocess(text) 87 | input_ids = torch.tensor([input_ids],dtype=torch.long,device=self.device) 88 | input_mask = torch.tensor([input_mask],dtype=torch.long,device=self.device) 89 | segment_ids = torch.tensor([segment_ids],dtype=torch.long,device=self.device) 90 | valid_ids = torch.tensor([valid_ids],dtype=torch.long,device=self.device) 91 | with torch.no_grad(): 92 | logits = self.model(input_ids, segment_ids, input_mask,valid_ids) 93 | logits = F.softmax(logits,dim=2) 94 | logits_label = torch.argmax(logits,dim=2) 95 | logits_label = logits_label.detach().cpu().numpy().tolist()[0] 96 | 97 | logits_confidence = [values[label].item() for values,label in zip(logits[0],logits_label)] 98 | 99 | logits = [] 100 | pos = 0 101 | for index,mask in enumerate(valid_ids[0]): 102 | if index == 0: 103 | continue 104 | if mask == 1: 105 | logits.append((logits_label[index-pos],logits_confidence[index-pos])) 106 | else: 107 | 
pos += 1 108 | logits.pop() 109 | 110 | labels = [(self.label_map[label],confidence) for label,confidence in logits] 111 | words = word_tokenize(text) 112 | assert len(labels) == len(words) 113 | output = [{"word":word,"tag":label,"confidence":confidence} for word,(label,confidence) in zip(words,labels)] 114 | return output 115 | 116 | -------------------------------------------------------------------------------- /cpp-app/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0 FATAL_ERROR) 2 | project(custom_ops) 3 | 4 | find_package(Torch REQUIRED) 5 | 6 | add_executable(app app.cpp unilib/tokenizer.cpp unilib/unicode.cpp unilib/uninorms.cpp) 7 | target_include_directories(app PUBLIC "unilib") 8 | target_link_libraries(app "${TORCH_LIBRARIES}") 9 | set_property(TARGET app PROPERTY CXX_STANDARD 11) 10 | -------------------------------------------------------------------------------- /cpp-app/app.cpp: -------------------------------------------------------------------------------- 1 | #include // One-stop header. 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "tokenizer.h" 8 | 9 | using namespace std; 10 | 11 | BertTokenizer tokenizer; 12 | BasicTokenizer basictokenizer; 13 | 14 | int max_seq_length = 128; 15 | map label_map; 16 | bool do_lower_case; 17 | 18 | vector rive(const string &str, char delimiter) 19 | { 20 | vector internal; 21 | stringstream ss(str); // Turn the string into a stream. 
22 | string tok; 23 | 24 | while(getline(ss, tok, delimiter)) { 25 | internal.push_back(tok); 26 | } 27 | return internal; 28 | } 29 | 30 | string join(const char* a, char* b) 31 | { 32 | string message; 33 | message.reserve(strlen(a) + 1 + strlen(b)); 34 | message = a; 35 | message += "/"; 36 | message += b; 37 | return message; 38 | } 39 | 40 | torch::jit::script::Module bert; 41 | torch::jit::script::Module ner; 42 | 43 | void load_model(const char* model) 44 | { 45 | bert = torch::jit::load(join(model,"bert_features.zip").c_str()); 46 | ner = torch::jit::load(join(model,"bert_ner.zip").c_str()); 47 | string line; 48 | ifstream config (join(model,"model_config.txt").c_str()); 49 | int i= 0; 50 | if (config.is_open()) 51 | { 52 | while ( getline (config,line) ) 53 | { 54 | //cout << line << '\n'< 1) 68 | { 69 | vector temp = rive(line,' '); 70 | label_map.insert(pair{stoi(temp.at(0)),temp.at(1)}); 71 | } 72 | i++; 73 | } 74 | config.close(); 75 | } 76 | bert.eval(); 77 | ner.eval(); 78 | } 79 | 80 | void tokenize(string text, vector &tokens, vector &valid_positions) 81 | { 82 | vector words = basictokenizer.tokenize(text); 83 | vector token; 84 | vector::iterator itr; 85 | for (itr = words.begin(); itr < words.end(); itr++) 86 | { 87 | token = tokenizer.tokenize(*itr); 88 | tokens.insert(tokens.end(),token.begin(),token.end()); 89 | for (int i = 0; i < token.size(); i++) 90 | { 91 | if(i == 0) 92 | valid_positions.push_back(1); 93 | else 94 | valid_positions.push_back(0); 95 | } 96 | } 97 | } 98 | 99 | void preprocess(string text, vector &input_ids, vector &input_mask, vector &segment_ids, vector &valid_positions) 100 | { 101 | vector tokens; 102 | tokenize(text,tokens,valid_positions); 103 | // insert "[CLS}" 104 | tokens.insert(tokens.begin(),"[CLS]"); 105 | valid_positions.insert(valid_positions.begin(),1.0); 106 | // insert "[SEP]" 107 | tokens.push_back("[SEP]"); 108 | valid_positions.push_back(1.0); 109 | for(int i = 0; i < tokens.size(); i++) 110 | { 111 | 
segment_ids.push_back(0.0); 112 | input_mask.push_back(1.0); 113 | } 114 | input_ids = tokenizer.convert_tokens_to_ids(tokens); 115 | while(input_ids.size() < max_seq_length) 116 | { 117 | input_ids.push_back(0.0); 118 | input_mask.push_back(0.0); 119 | segment_ids.push_back(0.0); 120 | valid_positions.push_back(0.0); 121 | } 122 | } 123 | 124 | vector> predict(string text) 125 | { 126 | vector inputs; 127 | vector input_ids; 128 | vector input_mask; 129 | vector segment_ids; 130 | vector valid_positions; 131 | preprocess(text,input_ids,input_mask,segment_ids,valid_positions); 132 | inputs.push_back(torch::from_blob(input_ids.data(), {1, max_seq_length}).to(torch::kInt64)); 133 | inputs.push_back(torch::from_blob(input_mask.data(), {1, max_seq_length}).to(torch::kInt64)); 134 | inputs.push_back(torch::from_blob(segment_ids.data(), {1, max_seq_length}).to(torch::kInt64)); 135 | torch::Tensor valid_ids = torch::from_blob(valid_positions.data(), {1, max_seq_length}).to(torch::kInt64); 136 | 137 | torch::Tensor features = bert.forward(inputs).toTensor(); 138 | 139 | int batch_size = features.size(0); 140 | int max_len = features.size(1); 141 | int feat_dim = features.size(2); 142 | 143 | auto options = torch::TensorOptions().dtype(torch::kInt64); 144 | torch::Tensor one = torch::ones({1},options); 145 | 146 | 147 | torch::Tensor valid_output = torch::zeros({batch_size, max_len, feat_dim}); 148 | for(int i = 0; i < batch_size; i++) 149 | { 150 | int jj = -1; 151 | for(int j = 0; j < max_len; j++) 152 | { 153 | if (valid_ids[i][j].equal(one[0])) 154 | { 155 | jj++; 156 | valid_output[i][jj] = features[i][j]; 157 | } 158 | } 159 | } 160 | vector features_out; 161 | features_out.push_back(valid_output); 162 | torch::Tensor output = ner.forward(features_out).toTensor().argmax(2); 163 | 164 | vector out(output[0].data(), output[0].data() + output[0].numel()); 165 | 166 | vector logits; 167 | int pos = 0; 168 | for (int i = 1; i < valid_positions.size(); i++) 169 | { 170 | if 
(valid_positions.at(i) == 1.0) 171 | logits.push_back(out.at(i-pos)); 172 | else 173 | pos++; 174 | } 175 | logits.pop_back(); 176 | 177 | vector words = basictokenizer.tokenize(text); 178 | vector> result; 179 | for (int i = 0; i < words.size(); i++) 180 | { 181 | map mp; 182 | mp.insert({words.at(i),label_map[logits.at(i)]}); 183 | result.push_back(mp); 184 | } 185 | return result; 186 | } 187 | 188 | int main(int argc, const char* argv[]) 189 | { 190 | const char* model = argv[1]; 191 | string text; 192 | tokenizer.add_vocab(join(model,"vocab.txt").c_str()); 193 | load_model(model); 194 | while(true) 195 | { 196 | cout<<"\n"<<"Input -> "; 197 | getline(cin,text); 198 | vector> output = predict(text); 199 | for(int i=0; i < output.size(); i++) 200 | { 201 | cout< 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "unicode.h" 10 | #include "uninorms.h" 11 | #include 12 | #include 13 | 14 | #include "tokenizer.h" 15 | 16 | 17 | using namespace std; 18 | using namespace ufal::unilib; 19 | 20 | 21 | map categories = { 22 | {"Lu", unicode::Lu}, {"Ll", unicode::Ll}, {"Lt", unicode::Lt}, {"Lm", unicode::Lm}, {"Lo", unicode::Lo}, 23 | {"Mn", unicode::Mn}, {"Mc", unicode::Mc}, {"Me", unicode::Me}, {"Nd", unicode::Nd}, {"Nl", unicode::Nl}, 24 | {"No", unicode::No}, {"Pc", unicode::Pc}, {"Pd", unicode::Pd}, {"Ps", unicode::Ps}, {"Pe", unicode::Pe}, 25 | {"Pi", unicode::Pi}, {"Pf", unicode::Pf}, {"Po", unicode::Po}, {"Sm", unicode::Sm}, {"Sc", unicode::Sc}, 26 | {"Sk", unicode::Sk}, {"So", unicode::So}, {"Zs", unicode::Zs}, {"Zl", unicode::Zl}, {"Zp", unicode::Zp}, 27 | {"Cc", unicode::Cc}, {"Cf", unicode::Cf}, {"Cs", unicode::Cs}, {"Co", unicode::Co}, {"Cn", unicode::Cn}, 28 | }; 29 | 30 | map categories_rev; 31 | 32 | string ltrim(string str ) { 33 | return regex_replace( str, regex("^\\s+"), string("") ); 34 | } 35 | 36 | string rtrim( string str ) { 37 | return regex_replace( str, regex("\\s+$"), string("") ); 38 | } 
39 | 40 | string trim( string str ) { 41 | return ltrim( rtrim( str ) ); 42 | } 43 | 44 | vector split(const string &str, char delimiter) { 45 | vector internal; 46 | stringstream ss(str); // Turn the string into a stream. 47 | string tok; 48 | 49 | while(getline(ss, tok, delimiter)) { 50 | internal.push_back(tok); 51 | } 52 | return internal; 53 | } 54 | 55 | map read_vocab(const char* filename) 56 | { 57 | map vocab; 58 | int index = 0; 59 | unsigned int line_count = 1; 60 | ifstream fs8(filename); 61 | if (!fs8.is_open()) 62 | { 63 | cout << "Could not open " << filename << endl; 64 | return vocab; 65 | } 66 | string line; 67 | // Read all the lines in the file 68 | while (getline(fs8, line)) 69 | { 70 | // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function) 71 | // string::iterator end_it = utf8::find_invalid(line.begin(), line.end()); 72 | vocab.insert(pair(string(line.begin(), line.end()),index)); 73 | index++; 74 | line_count++; 75 | } 76 | return vocab; 77 | } 78 | 79 | vector whitespace_tokenize(string text) 80 | { 81 | vector result; 82 | char delimeter = ' '; 83 | text = trim(text); 84 | if(text == "") 85 | { 86 | return result; 87 | } 88 | result = split(text,delimeter); 89 | return result; 90 | } 91 | 92 | bool _is_whitespace(char letter) 93 | { 94 | if (letter == ' ' or letter == '\t' or letter == '\n' or letter == '\r') 95 | return true; 96 | long int cat = unicode::category(int(letter)); 97 | if(cat == categories["Zs"]) 98 | return true; 99 | return false; 100 | } 101 | 102 | bool _is_control(char letter) 103 | { 104 | if(letter == '\t' or letter == '\n' or letter == '\r') 105 | return false; 106 | unicode::category_t cat = unicode::category(int(letter)); 107 | string cat_ = categories_rev[cat]; 108 | if(cat_[0] == 'C') 109 | return true; 110 | return false; 111 | } 112 | 113 | bool _is_punctuation(char letter) 114 | { 115 | int cp = int(letter); 116 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 
117 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)) 118 | return true; 119 | unicode::category_t cat = unicode::category(int(letter)); 120 | string cat_ = categories_rev[cat]; 121 | if(cat_[0] == 'P') 122 | return true; 123 | return false; 124 | } 125 | 126 | string BasicTokenizer::_clean_text(string text) 127 | { 128 | string output; 129 | int len = 0; 130 | int len_char_array = text.length(); 131 | char * char_array = new char [text.length()+1]; 132 | strcpy (char_array, text.c_str()); 133 | while(char_array[len] != '\0') 134 | { 135 | int cp = int(char_array[len]); 136 | if (cp == 0 or cp == 0xfffd or _is_control(char_array[len])) 137 | continue; 138 | if (_is_whitespace(char_array[len])) 139 | output = output + " "; 140 | else 141 | output = output + char_array[len]; 142 | ++len; 143 | } 144 | return output; 145 | } 146 | 147 | vector BasicTokenizer::_run_split_on_punc(string text) 148 | { 149 | // vector never_split = {"[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"}; 150 | if (find(never_split_.begin(), never_split_.end(), text) != never_split_.end()) 151 | { 152 | vector temp = {text}; 153 | return temp; 154 | } 155 | int len_char_array = text.length(); 156 | char * char_array = new char [text.length()+1]; 157 | strcpy (char_array, text.c_str()); 158 | int i = 0; 159 | bool start_new_word = true; 160 | vector> output; 161 | while(i < len_char_array) 162 | { 163 | char letter = char_array[i]; 164 | if (_is_punctuation(letter)) 165 | { 166 | vector temp = {letter}; 167 | output.push_back(temp); 168 | start_new_word = true; 169 | } 170 | else 171 | { 172 | if (start_new_word) 173 | { 174 | vector temp_2; 175 | output.push_back(temp_2); 176 | } 177 | start_new_word = false; 178 | output.back().push_back(letter); 179 | } 180 | i += 1; 181 | } 182 | vector final_output; 183 | vector>::iterator ptr; 184 | for (ptr = output.begin(); ptr < output.end(); ptr++) 185 | { 186 | vector out = *ptr; 187 | string word = ""; 188 | vector::iterator itr; 189 | for (itr = 
out.begin(); itr < out.end(); itr++) 190 | { 191 | word = word + *itr; 192 | } 193 | final_output.push_back(word); 194 | } 195 | return final_output; 196 | } 197 | 198 | string BasicTokenizer::_run_strip_accents(string text) 199 | { 200 | wstring_convert, char32_t> conv; 201 | auto temp = conv.from_bytes(text); 202 | auto nfd = [](u32string str) { uninorms::nfd(str); return str; }; 203 | auto text_ = nfd(temp); 204 | string output; 205 | int i = 0; 206 | int len_char_array = text_.length()+1; 207 | char * char_array = new char [text_.length() + 1]; 208 | int j; 209 | for (j = 0; j < len_char_array; j++) 210 | { 211 | char_array[j] = text_[j]; 212 | } 213 | while(i < len_char_array) 214 | { 215 | long int cat = unicode::category(int(char_array[i])); 216 | if(cat == categories["Mn"]) 217 | { 218 | i++; 219 | continue; 220 | } 221 | // if (_is_punctuation(char_array[i])) 222 | // { 223 | // i++; 224 | // continue; 225 | // } 226 | output = output + char_array[i]; 227 | i++; 228 | } 229 | return output; 230 | } 231 | 232 | vector BasicTokenizer::tokenize(string text) 233 | { 234 | text = _clean_text(text); 235 | vector orig_tokens = whitespace_tokenize(text); 236 | vector split_tokens; 237 | vector::iterator itr; 238 | for (itr = orig_tokens.begin(); itr < orig_tokens.end(); itr++) 239 | { 240 | string temp = *itr; 241 | if (do_lower_case_ and not bool(find(never_split_.begin(), never_split_.end(), *itr) != never_split_.end())) 242 | { 243 | transform(temp.begin(), temp.end(), temp.begin(),[](unsigned char c){ return std::tolower(c); });; 244 | temp = _run_strip_accents(temp); 245 | } 246 | vector split = _run_split_on_punc(temp); 247 | split_tokens.insert( split_tokens.end(), split.begin(), split.end() ); 248 | } 249 | string temp_text; 250 | vector::iterator ptr; 251 | for (ptr = split_tokens.begin(); ptr < split_tokens.end(); ptr++) 252 | { 253 | temp_text = temp_text + " "+ *ptr; 254 | } 255 | return whitespace_tokenize(temp_text); 256 | } 257 | 258 | 259 | void 
// ===== cpp-app/unilib/tokenizer.h ========================================
// Declarations for the BERT tokenization pipeline: whitespace splitting,
// basic (punctuation / casing) tokenization, greedy WordPiece sub-word
// tokenization, and the top-level BertTokenizer that ties them together.

#include <string>
#include <vector>
#include <map>
#include <iostream>

using namespace std;

// Split `text` on whitespace; defined earlier in tokenizer.cpp.
vector<string> whitespace_tokenize(string text);
// Load a newline-delimited vocabulary file into a token -> id map;
// defined earlier in tokenizer.cpp.
map<string, int> read_vocab(const char* filename);

// Cleans text and splits it on whitespace/punctuation, optionally
// lower-casing and stripping accents.
class BasicTokenizer
{
  public:
    bool do_lower_case_;
    vector<string> never_split_;  // special tokens that must never be split
    BasicTokenizer(bool do_lower_case = false,
                   vector<string> never_split = {"[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"})
    {
        do_lower_case_ = do_lower_case;
        never_split_ = never_split;
    }
    string _clean_text(string text);
    vector<string> _run_split_on_punc(string text);
    string _run_strip_accents(string text);
    vector<string> tokenize(string text);
};

// Greedy longest-match-first WordPiece tokenizer over a fixed vocabulary.
class WordpieceTokenizer
{
  public:
    map<string, int> vocab_;        // token -> id
    string unk_token_;              // emitted for out-of-vocabulary words
    int max_input_chars_per_word_;  // words longer than this become unk_token_
    WordpieceTokenizer() {}

    WordpieceTokenizer(map<string, int> vocab, string unk_token = "[UNK]",
                       int max_input_chars_per_word = 100)
    {
        vocab_ = vocab;
        unk_token_ = unk_token;
        max_input_chars_per_word_ = max_input_chars_per_word;
    }
    void add_vocab(map<string, int> vocab);
    vector<string> tokenize(string text);
};

// End-to-end BERT tokenizer: BasicTokenizer (optional) then WordPiece.
class BertTokenizer
{
  public:
    map<string, int> vocab;          // token -> id
    map<int, string> ids_to_tokens;  // id -> token (inverse of `vocab`)
    bool do_lower_case_;
    bool do_basic_tokenize_;
    int maxlen_;                     // warn when an encoded sequence exceeds this
    BasicTokenizer basic_tokenizer;
    WordpieceTokenizer wordpiece_tokenizer;
    BertTokenizer() {}
    BertTokenizer(const char* vocab_file, bool do_lower_case = false, int max_len = 512,
                  bool do_basic_tokenize = true,
                  vector<string> never_split = {"[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"})
    {
        vocab = read_vocab(vocab_file);
        for (map<string, int>::iterator i = vocab.begin(); i != vocab.end(); ++i)
            ids_to_tokens[i->second] = i->first;
        do_basic_tokenize_ = do_basic_tokenize;
        do_lower_case_ = do_lower_case;
        wordpiece_tokenizer.add_vocab(vocab);
        maxlen_ = max_len;
    }
    void add_vocab(const char* vocab_file);
    vector<string> tokenize(string text);
    vector<float> convert_tokens_to_ids(vector<string> tokens);
};

// ===== cpp-app/unilib/tokenizer.cpp (continued) ==========================

// (Re)initialise the tokenizer with `vocab`, using the same defaults as the
// defaulted constructor ("[UNK]", 100-char word limit).
void WordpieceTokenizer::add_vocab(map<string, int> vocab)
{
    vocab_ = vocab;
    unk_token_ = "[UNK]";
    max_input_chars_per_word_ = 100;
}

// Greedy longest-match-first WordPiece tokenization of whitespace-split
// `text`.  Continuation pieces are prefixed with "##"; any word that cannot
// be fully decomposed (or exceeds max_input_chars_per_word_) becomes the
// unknown token.
vector<string> WordpieceTokenizer::tokenize(string text)
{
    vector<string> output_tokens;
    vector<string> whitespace_tokens = whitespace_tokenize(text);
    for (vector<string>::iterator ptr = whitespace_tokens.begin();
         ptr != whitespace_tokens.end(); ++ptr)
    {
        const string& token = *ptr;
        int len = (int)token.length();
        if (len > max_input_chars_per_word_)
        {
            output_tokens.push_back(unk_token_);
            continue;
        }
        bool is_bad = false;
        int start = 0;
        vector<string> sub_tokens;
        while (start < len)
        {
            int end = len;
            string cur_substr;
            // Shrink the candidate from the right until it is in the vocab.
            while (start < end)
            {
                // substr() replaces the original char-by-char concatenation
                // and the `new char[]` buffer that was never delete[]d
                // (memory leak) — same result, O(length) per candidate.
                string substr = token.substr(start, end - start);
                if (start > 0)
                    substr = "##" + substr;  // continuation-piece marker
                if (vocab_.count(substr) == 1)
                {
                    cur_substr = substr;
                    break;
                }
                --end;
            }
            if (cur_substr.empty())
            {
                is_bad = true;  // no prefix of the remainder is in the vocab
                break;
            }
            sub_tokens.push_back(cur_substr);
            start = end;
        }
        if (is_bad)
            output_tokens.push_back(unk_token_);
        else
            output_tokens.insert(output_tokens.end(), sub_tokens.begin(), sub_tokens.end());
    }
    return output_tokens;
}

// Load the vocabulary from `vocab_file`, rebuild the id -> token inverse
// map, and reset the pipeline to its default configuration.
void BertTokenizer::add_vocab(const char* vocab_file)
{
    vocab = read_vocab(vocab_file);
    for (map<string, int>::iterator i = vocab.begin(); i != vocab.end(); ++i)
        ids_to_tokens[i->second] = i->first;
    do_basic_tokenize_ = true;
    do_lower_case_ = false;
    wordpiece_tokenizer.add_vocab(vocab);
    maxlen_ = 512;
}

// Tokenize `text`: optionally run the BasicTokenizer first, then WordPiece
// each resulting token; otherwise WordPiece the raw text directly.
vector<string> BertTokenizer::tokenize(string text)
{
    vector<string> split_tokens;
    if (do_basic_tokenize_)
    {
        vector<string> temp_tokens = basic_tokenizer.tokenize(text);
        for (vector<string>::iterator ptr = temp_tokens.begin();
             ptr != temp_tokens.end(); ++ptr)
        {
            vector<string> subtokens = wordpiece_tokenizer.tokenize(*ptr);
            split_tokens.insert(split_tokens.end(), subtokens.begin(), subtokens.end());
        }
    }
    else
    {
        split_tokens = wordpiece_tokenizer.tokenize(text);
    }
    return split_tokens;
}

// Map tokens to their vocabulary ids (as floats, ready to feed a model
// input tensor).  NOTE(review): map::operator[] silently inserts id 0 for
// an out-of-vocabulary token; callers are expected to pass tokenize()
// output so only vocab tokens reach this point.
vector<float> BertTokenizer::convert_tokens_to_ids(vector<string> tokens)
{
    vector<float> ids;
    for (vector<string>::iterator ptr = tokens.begin(); ptr != tokens.end(); ++ptr)
        ids.push_back(float(vocab[*ptr]));
    // Cast fixes the signed/unsigned comparison with maxlen_.
    if (ids.size() > (size_t)maxlen_)
        cout << "Token indices sequence length is longer than the specified maximum";
    return ids;
}

// ===== cpp-app/unilib/unicode.h ==========================================
// This file is part of UniLib <http://github.com/ufal/unilib>.
//
// Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of
// Mathematics and Physics, Charles University in Prague, Czech Republic.
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
// UniLib version: 3.1.2-devel
// Unicode version: 8.0.0

#pragma once

#include <cstdint>
#include <string>

namespace ufal {
namespace unilib {

// Unicode character database queries: general category and simple case
// mappings, backed by two-level lookup tables defined in unicode.cpp.
class unicode {
  // Raw general-category codes as stored per character in category_block.
  enum : uint8_t {
    _Lu = 1, _Ll = 2, _Lt = 3, _Lm = 4, _Lo = 5,
    _Mn = 6, _Mc = 7, _Me = 8,
    _Nd = 9, _Nl = 10, _No = 11,
    _Pc = 12, _Pd = 13, _Ps = 14, _Pe = 15, _Pi = 16, _Pf = 17, _Po = 18,
    _Sm = 19, _Sc = 20, _Sk = 21, _So = 22,
    _Zs = 23, _Zl = 24, _Zp = 25,
    _Cc = 26, _Cf = 27, _Cs = 28, _Co = 29, _Cn = 30
  };

 public:
  // Categories are one-hot bit masks so that unions (L, M, N, ...) can be
  // tested with a single bitwise AND.
  typedef uint32_t category_t;
  enum : category_t {
    Lu = 1 << _Lu, Ll = 1 << _Ll, Lt = 1 << _Lt, Lut = Lu | Lt, LC = Lu | Ll | Lt,
    Lm = 1 << _Lm, Lo = 1 << _Lo, L = Lu | Ll | Lt | Lm | Lo,
    Mn = 1 << _Mn, Mc = 1 << _Mc, Me = 1 << _Me, M = Mn | Mc | Me,
    Nd = 1 << _Nd, Nl = 1 << _Nl, No = 1 << _No, N = Nd | Nl | No,
    Pc = 1 << _Pc, Pd = 1 << _Pd, Ps = 1 << _Ps, Pe = 1 << _Pe, Pi = 1 << _Pi,
    Pf = 1 << _Pf, Po = 1 << _Po, P = Pc | Pd | Ps | Pe | Pi | Pf | Po,
    Sm = 1 << _Sm, Sc = 1 << _Sc, Sk = 1 << _Sk, So = 1 << _So, S = Sm | Sc | Sk | So,
    Zs = 1 << _Zs, Zl = 1 << _Zl, Zp = 1 << _Zp, Z = Zs | Zl | Zp,
    Cc = 1 << _Cc, Cf = 1 << _Cf, Cs = 1 << _Cs, Co = 1 << _Co, Cn = 1 << _Cn,
    C = Cc | Cf | Cs | Co | Cn
  };

  static inline category_t category(char32_t chr);

  static inline char32_t lowercase(char32_t chr);
  static inline char32_t uppercase(char32_t chr);
  static inline char32_t titlecase(char32_t chr);

 private:
  static const char32_t CHARS = 0x110000;   // one past the last Unicode code point
  static const int32_t DEFAULT_CAT = Cn;    // unassigned, for out-of-range input

  // Two-level tables (index -> 256-entry block) defined in unicode.cpp.
  static const uint8_t category_index[CHARS >> 8];
  static const uint8_t category_block[][256];
  static const uint8_t othercase_index[CHARS >> 8];
  static const char32_t othercase_block[][256];

  // Low byte of an othercase_block entry encodes which case form(s) the
  // high bits point at.
  enum othercase_type { LOWER_ONLY = 1, UPPERTITLE_ONLY = 2, LOWER_THEN_UPPER = 3,
                        UPPER_THEN_TITLE = 4, TITLE_THEN_LOWER = 5 };
};

unicode::category_t unicode::category(char32_t chr) {
  if (chr >= CHARS) return DEFAULT_CAT;
  return 1 << category_block[category_index[chr >> 8]][chr & 0xFF];
}

char32_t unicode::lowercase(char32_t chr) {
  if (chr >= CHARS) return chr;
  char32_t entry = othercase_block[othercase_index[chr >> 8]][chr & 0xFF];
  char32_t other = entry >> 8;
  switch (entry & 0xFF) {
    case othercase_type::LOWER_ONLY:
    case othercase_type::LOWER_THEN_UPPER:
      return other;
    case othercase_type::TITLE_THEN_LOWER:
      // `other` is the titlecase form; follow it once more to reach lowercase.
      return othercase_block[othercase_index[other >> 8]][other & 0xFF] >> 8;
    default:
      return chr;  // no distinct lowercase mapping
  }
}

char32_t unicode::uppercase(char32_t chr) {
  if (chr >= CHARS) return chr;
  char32_t entry = othercase_block[othercase_index[chr >> 8]][chr & 0xFF];
  char32_t other = entry >> 8;
  switch (entry & 0xFF) {
    case othercase_type::UPPERTITLE_ONLY:
    case othercase_type::UPPER_THEN_TITLE:
      return other;
    case othercase_type::LOWER_THEN_UPPER:
      // `other` is the lowercase form; follow it once more to reach uppercase.
      return othercase_block[othercase_index[other >> 8]][other & 0xFF] >> 8;
    default:
      return chr;  // no distinct uppercase mapping
  }
}

char32_t unicode::titlecase(char32_t chr) {
  if (chr >= CHARS) return chr;
  char32_t entry = othercase_block[othercase_index[chr >> 8]][chr & 0xFF];
  char32_t other = entry >> 8;
  switch (entry & 0xFF) {
    case othercase_type::UPPERTITLE_ONLY:
    case othercase_type::TITLE_THEN_LOWER:
      return other;
    case othercase_type::UPPER_THEN_TITLE:
      // `other` is the uppercase form; follow it once more to reach titlecase.
      return othercase_block[othercase_index[other >> 8]][other & 0xFF] >> 8;
    default:
      return chr;  // no distinct titlecase mapping
  }
}

}  // namespace unilib
}  // namespace ufal
// ===== cpp-app/unilib/uninorms.h =========================================
// This file is part of UniLib <http://github.com/ufal/unilib>.
//
// Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of
// Mathematics and Physics, Charles University in Prague, Czech Republic.
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
//
// UniLib version: 3.1.2-devel
// Unicode version: 8.0.0

#pragma once

#include <cstdint>
#include <string>

namespace ufal {
namespace unilib {

// In-place Unicode normalization of UTF-32 strings (implementations and
// the generated data tables live in uninorms.cpp).
class uninorms {
 public:
  static void nfc(std::u32string& str);   // canonical decomposition + composition
  static void nfd(std::u32string& str);   // canonical decomposition
  static void nfkc(std::u32string& str);  // compatibility decomposition + composition
  static void nfkd(std::u32string& str);  // compatibility decomposition

 private:
  static void compose(std::u32string& str);
  // `kanonical` selects canonical (true) vs. compatibility decomposition.
  static void decompose(std::u32string& str, bool kanonical);

  static const char32_t CHARS = 0x110000;  // one past the last code point

  // Constants for algorithmic Hangul syllable decomposition/composition.
  struct Hangul {
    static const char32_t SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
    static const char32_t LCount = 19, VCount = 21, TCount = 28,
                          NCount = VCount * TCount, SCount = LCount * NCount;
  };

  // Canonical combining classes (two-level lookup tables).
  static const uint8_t ccc_index[CHARS >> 8];
  static const uint8_t ccc_block[][256];

  // Composition pairs.
  static const uint8_t composition_index[CHARS >> 8];
  static const uint16_t composition_block[][257];
  static const char32_t composition_data[];

  // Decomposition sequences.
  static const uint8_t decomposition_index[CHARS >> 8];
  static const uint16_t decomposition_block[][257];
  static const char32_t decomposition_data[];
};

}  // namespace unilib
}  // namespace ufal
-------------------------------------------------------------------------------- 1 | // This file is part of UniLib . 2 | // 3 | // Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of 4 | // Mathematics and Physics, Charles University in Prague, Czech Republic. 5 | // 6 | // This Source Code Form is subject to the terms of the Mozilla Public 7 | // License, v. 2.0. If a copy of the MPL was not distributed with this 8 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 9 | // 10 | // UniLib version: 3.1.2-devel 11 | // Unicode version: 8.0.0 12 | 13 | #include "unistrip.h" 14 | 15 | namespace ufal { 16 | namespace unilib { 17 | 18 | const char32_t unistrip::CHARS; 19 | 20 | const uint8_t unistrip::combining_mark_index[unistrip::CHARS >> 8] = { 21 | 0,0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,0,0,15,0,0,0,16,17,18,19,20,21,22,0,0,23,0,0,0,0,0,0,0,0,0,0,0,24,25,0,0,26,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,27,0,28,29,30,31,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,32,0,0,33,0,0,34,35,36,0,0,0,0,0,0,37,0,0,0,0,0,38,39,40,41,42,43,44,45,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,46,47,0,0,0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,49,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,50,51,0,0,0,0,0,0,0,52,0,0,0,0,0,0,0,0,0,0,0,0,0,53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 22 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 23 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 24 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,54,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 25 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 26 | }; 27 | 28 | const uint8_t unistrip::combining_mark_block[][32] = { 29 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 30 | {255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 31 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,248,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 32 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,254,255,255,255,255,191,182,0,0,0,0,0,0,0}, 33 | 
{0,0,255,7,0,0,0,0,0,248,255,255,0,0,1,0,0,0,0,0,0,0,0,0,0,0,192,159,159,61,0,0}, 34 | {0,0,2,0,0,0,255,255,255,7,0,0,0,0,0,0,0,0,0,0,192,255,1,0,0,0,0,0,0,248,15,0}, 35 | {0,0,192,251,239,62,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,248,255,255,255}, 36 | {15,0,0,0,0,0,0,220,255,255,254,0,12,0,0,0,14,0,0,0,0,0,0,208,159,57,128,0,12,0,0,0}, 37 | {14,0,0,0,0,0,0,208,135,57,2,0,0,0,35,0,14,0,0,0,0,0,0,208,191,59,0,0,12,0,0,0}, 38 | {14,0,0,0,0,0,0,208,159,57,192,0,12,0,0,0,4,0,0,0,0,0,0,192,199,61,128,0,0,0,0,0}, 39 | {15,0,0,0,0,0,0,192,223,61,96,0,12,0,0,0,14,0,0,0,0,0,0,208,223,61,96,0,12,0,0,0}, 40 | {14,0,0,0,0,0,0,192,223,61,128,0,12,0,0,0,12,0,0,0,0,0,0,0,0,132,95,255,0,0,12,0}, 41 | {0,0,0,0,0,0,242,7,128,127,0,0,0,0,0,0,0,0,0,0,0,0,242,27,0,63,0,0,0,0,0,0}, 42 | {0,0,0,3,0,0,160,194,0,0,0,0,0,0,254,255,223,224,255,254,255,255,255,31,64,0,0,0,0,0,0,0}, 43 | {0,0,0,0,0,248,255,127,0,0,192,195,157,63,30,0,252,191,0,60,0,0,0,0,0,0,0,0,0,0,0,0}, 44 | {0,0,0,0,0,0,0,0,0,0,0,224,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 45 | {0,0,28,0,0,0,28,0,0,0,12,0,0,0,12,0,0,0,0,0,0,0,240,255,255,255,15,32,0,0,0,0}, 46 | {0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0}, 47 | {0,0,0,0,255,15,255,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 48 | {0,0,128,15,0,0,0,0,0,0,224,127,255,255,255,159,0,0,0,0,0,0,255,127,0,0,0,0,0,0,0,0}, 49 | {31,0,0,0,0,0,240,255,31,0,0,0,0,248,15,0,7,0,0,0,254,63,0,0,0,0,0,0,192,255,15,0}, 50 | {0,0,0,0,240,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,247,255,255,33,28,3}, 51 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,63,240}, 52 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,1,0}, 53 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,3,0}, 54 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255}, 55 | {0,0,0,0,0,252,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0}, 56 | 
{0,0,0,0,0,0,0,0,0,0,0,0,0,128,247,63,0,0,0,192,0,0,0,0,0,0,0,0,0,0,3,0}, 57 | {68,8,0,0,248,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,240,255,31,0,0,0,255,255,3,0}, 58 | {0,0,0,0,192,63,0,0,128,255,15,0,0,0,0,0,15,0,0,0,0,0,248,255,1,0,0,0,32,0,0,0}, 59 | {0,0,0,0,0,254,127,0,8,48,0,0,0,0,0,56,0,0,0,0,0,0,157,193,2,0,0,0,0,248,96,0}, 60 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,248,55,0,0}, 61 | {0,0,0,64,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 62 | {255,255,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 63 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,32}, 64 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0}, 65 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,192,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 66 | {110,240,0,0,0,0,0,135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,96,0,0,0}, 67 | {7,0,0,0,0,0,0,255,127,0,0,0,0,0,0,128,7,0,0,0,0,0,255,7,0,0,0,0,0,0,0,0}, 68 | {7,0,0,0,128,255,31,0,0,0,0,0,0,0,8,0,7,0,0,0,0,0,248,255,1,28,0,0,0,0,0,0}, 69 | {0,0,0,0,0,240,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,255,7,0,0}, 70 | {15,0,0,0,0,0,0,208,159,57,128,0,204,31,31,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 71 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,15,0,0,0,0,0,0,0}, 72 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,63,255,1,0,0,48,0,0,0,0}, 73 | {0,0,0,0,0,0,255,255,1,0,0,0,0,0,0,0,0,0,0,0,0,248,255,0,0,0,0,0,0,0,0,0}, 74 | {0,0,0,224,255,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 75 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,31,0}, 76 | {0,0,0,0,0,0,127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 77 | {0,0,0,0,0,0,0,0,0,0,254,255,255,255,255,127,0,128,7,0,0,0,0,0,0,0,0,0,0,0,0,0}, 78 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,96,0,0,0,0,0,0,0,0,0,0,0,0}, 79 | {0,0,0,0,0,0,0,0,0,0,0,0,224,227,7,248,231,15,0,0,0,60,0,0,0,0,0,0,0,0,0,0}, 80 | {0,0,0,0,0,0,0,0,28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 81 | 
{255,255,255,255,255,255,127,248,255,255,255,255,255,31,32,0,16,0,0,248,254,255,0,0,0,0,0,0,0,0,0,0}, 82 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,127,0,0,0,0,0}, 83 | {255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0} 84 | }; 85 | 86 | const uint8_t unistrip::stripped_index[unistrip::CHARS >> 8] = { 87 | 0,1,2,3,4,5,6,5,5,7,8,9,5,5,5,10,11,5,5,5,5,5,5,5,5,5,5,12,5,5,13,14,5,15,16,5,5,5,5,5,5,5,17,5,5,5,5,5,18,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,19,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,20,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,21,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, 88 | 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, 89 | 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, 90 | 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, 91 | 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5 92 | }; 93 | 94 | const uint16_t unistrip::stripped_block[][256] = { 95 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,2,3,3,3,3,4,4,4,4,0,5,6,6,6,6,6,0,0,7,7,7,7,8,0,0,9,9,9,9,9,9,0,10,11,11,11,11,12,12,12,12,0,13,14,14,14,14,14,0,0,15,15,15,15,16,0,16}, 96 | 
{1,9,1,9,1,9,2,10,2,10,2,10,2,10,17,18,0,0,3,11,3,11,3,11,3,11,3,11,19,20,19,20,19,20,19,20,21,22,0,0,4,12,4,12,4,12,4,12,4,0,0,0,23,24,25,26,0,27,28,27,28,27,28,0,0,0,0,5,13,5,13,5,13,0,0,0,6,14,6,14,6,14,0,0,29,30,29,30,29,30,31,32,31,32,31,32,31,32,33,34,33,34,0,0,7,15,7,15,7,15,7,15,7,15,7,15,35,36,8,16,8,37,38,37,38,37,38,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,9,4,12,6,14,7,15,7,15,7,15,7,15,7,15,0,1,9,1,9,39,40,0,0,19,20,25,26,6,14,6,14,41,42,24,0,0,0,19,20,0,0,5,13,1,9,39,40,43,44}, 97 | {1,9,1,9,3,11,3,11,4,12,4,12,6,14,6,14,29,30,29,30,7,15,7,15,31,32,33,34,0,0,21,22,0,0,0,0,0,0,1,9,3,11,6,14,6,14,6,14,6,14,8,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 98 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,45,46,0,47,48,49,0,50,0,51,52,53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,49,51,54,55,56,53,57,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,53,57,58,57,59,0,0,0,0,60,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 99 | 
{61,61,0,62,0,0,0,63,0,0,0,0,64,65,66,0,0,0,0,0,0,0,0,0,0,65,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,67,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,68,68,0,69,0,0,0,70,0,0,0,0,71,67,72,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,73,74,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,75,76,0,0,0,0,0,0,0,0,0,0,0,0,0,77,78,77,78,0,0,61,68,0,0,79,80,75,76,81,82,0,0,65,67,65,67,83,84,0,0,85,86,87,88,66,72,66,72,66,72,89,90,0,0,91,92,0,0,0,0,0,0}, 100 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 101 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,93,93,94,93,95,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,96,0,97,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,98,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 102 | 
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,99,0,0,0,0,0,0,0,100,0,0,101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,102,103,104,105,106,107,108,109,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,110,111,0,112,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 103 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,113,0,0,114,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,115,116,117,0,0,118,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 104 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,119,120,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,121,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 105 | 
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,122,0,0,0,0,0,0,0,0,0,123,0,0,0,0,124,0,0,0,0,125,0,0,0,0,126,0,0,0,0,0,0,0,0,0,0,0,0,127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 106 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 107 | {0,0,0,0,0,0,129,0,130,0,131,0,132,0,133,0,0,0,134,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 108 | 
{1,9,135,136,135,136,135,136,2,10,17,18,17,18,17,18,17,18,17,18,3,11,3,11,3,11,3,11,3,11,137,138,19,20,21,22,21,22,21,22,21,22,21,22,4,12,4,12,25,26,25,26,25,26,27,28,27,28,27,28,27,28,139,140,139,140,139,140,5,13,5,13,5,13,5,13,6,14,6,14,6,14,6,14,141,142,141,142,29,30,29,30,29,30,29,30,31,32,31,32,31,32,31,32,31,32,33,34,33,34,33,34,33,34,7,15,7,15,7,15,7,15,7,15,143,144,143,144,35,36,35,36,35,36,35,36,35,36,145,146,145,146,8,16,37,38,37,38,37,38,22,34,36,16,0,147,0,0,0,0,1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9,3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11,4,12,4,12,6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14,7,15,7,15,7,15,7,15,7,15,7,15,7,15,8,16,8,16,8,16,8,16,0,0,0,0,0,0}, 109 | {54,54,54,54,54,54,54,54,46,46,46,46,46,46,46,46,55,55,55,55,55,55,0,0,47,47,47,47,47,47,0,0,56,56,56,56,56,56,56,56,48,48,48,48,48,48,48,48,53,53,53,53,53,53,53,53,49,49,49,49,49,49,49,49,58,58,58,58,58,58,0,0,50,50,50,50,50,50,0,0,57,57,57,57,57,57,57,57,0,51,0,51,0,51,0,51,59,59,59,59,59,59,59,59,52,52,52,52,52,52,52,52,54,54,55,55,56,56,53,53,58,58,57,57,59,59,0,0,54,54,54,54,54,54,54,54,46,46,46,46,46,46,46,46,56,56,56,56,56,56,56,56,48,48,48,48,48,48,48,48,59,59,59,59,59,59,59,59,52,52,52,52,52,52,52,52,54,54,54,54,54,0,54,54,46,46,46,46,46,0,0,0,0,45,56,56,56,0,56,56,47,47,48,48,48,148,148,148,53,53,53,53,0,0,53,53,49,49,49,49,0,149,149,149,57,57,57,57,150,150,57,57,51,51,51,51,151,45,45,0,0,0,59,59,59,0,59,59,50,50,52,52,52,0,0,0}, 110 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,152,153,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,154,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,155,156,157,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 111 
| {0,0,0,0,158,0,0,0,0,159,0,0,160,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,161,0,162,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,163,0,0,164,0,0,165,0,166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,167,0,168,0,0,0,0,0,0,0,0,0,0,169,170,171,172,173,0,0,174,175,0,0,176,177,0,0,0,0,0,0,178,179,0,0,180,181,0,0,182,183,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,184,185,186,187,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,188,189,190,191,0,0,0,0,0,0,192,193,194,195,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 112 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,196,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 113 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,197,0,198,0,199,0,200,0,201,0,202,0,203,0,204,0,205,0,206,0,207,0,208,0,0,209,0,210,0,211,0,0,0,0,0,0,212,212,0,213,213,0,214,214,0,215,215,0,216,216,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,217,0,0,0,0,0,0,0,0,0,218,0,0,0,0,0,0,0,0,0,0,0,0,0,219,0,220,0,221,0,222,0,223,0,224,0,225,0,226,0,227,0,228,0,229,0,230,0,0,231,0,232,0,233,0,0,0,0,0,0,234,234,0,235,235,0,236,236,0,237,237,0,238,238,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,239,0,0,240,241,242,243,0,0,0,244,0}, 114 | 
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,245,0,246,0,0,0,0,0,0,0,0,0,0,247,247,247,247,248,248,248,249,250,251,252,253,254,0,255,245,256,257,258,0,259,0,260,261,0,262,263,0,264,265,266,247,267,253,249,257,263,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 115 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,268,0,269,0,0,0,0,0,0,0,0,0,0,0,0,0,0,270,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 116 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,271,272,272,272,272,272,272,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,273,274,273,274,273,274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} 117 | }; 118 | 119 | const char32_t unistrip::stripped_data[] = { 120 | 
0,65,67,69,73,78,79,85,89,97,99,101,105,110,111,117,121,68,100,71,103,72,104,74,106,75,107,76,108,82,114,83,115,84,116,87,119,90,122,198,230,439,658,216,248,168,913,917,919,921,927,933,937,953,945,949,951,965,959,969,978,1045,1043,1030,1050,1048,1059,1080,1077,1075,1110,1082,1091,1140,1141,1046,1078,1040,1072,1240,1241,1047,1079,1054,1086,1256,1257,1069,1101,1063,1095,1067,1099,1575,1608,1610,1749,1729,1746,2344,2352,2355,2325,2326,2327,2332,2337,2338,2347,2351,2465,2466,2479,2610,2616,2582,2583,2588,2603,2849,2850,2962,3906,3916,3921,3926,3931,3904,4133,6917,6919,6921,6923,6925,6929,66,98,70,102,77,109,80,112,86,118,88,120,383,8127,8190,961,929,8592,8594,8596,8656,8660,8658,8707,8712,8715,8739,8741,8764,8771,8773,8776,61,8801,8781,60,62,8804,8805,8818,8819,8822,8823,8826,8827,8834,8835,8838,8839,8866,8872,8873,8875,8828,8829,8849,8850,8882,8883,8884,8885,10973,12363,12365,12367,12369,12371,12373,12375,12377,12379,12381,12383,12385,12388,12390,12392,12399,12402,12405,12408,12411,12358,12445,12459,12461,12463,12465,12467,12469,12471,12473,12475,12477,12479,12481,12484,12486,12488,12495,12498,12501,12504,12507,12454,12527,12528,12529,12530,12541,1497,1522,1513,1488,1489,1490,1491,1492,1493,1494,1496,1498,1499,1500,1502,1504,1505,1507,1508,1510,1511,1512,1514,69785,69787,69797,119127,119128,119225,119226 121 | }; 122 | 123 | } // namespace unilib 124 | } // namespace ufal 125 | -------------------------------------------------------------------------------- /cpp-app/unilib/unistrip.h: -------------------------------------------------------------------------------- 1 | // This file is part of UniLib . 2 | // 3 | // Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of 4 | // Mathematics and Physics, Charles University in Prague, Czech Republic. 5 | // 6 | // This Source Code Form is subject to the terms of the Mozilla Public 7 | // License, v. 2.0. 
If a copy of the MPL was not distributed with this 8 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 9 | // 10 | // UniLib version: 3.1.2-devel 11 | // Unicode version: 8.0.0 12 | 13 | #pragma once 14 | 15 | #include 16 | 17 | namespace ufal { 18 | namespace unilib { 19 | 20 | class unistrip { 21 | public: 22 | static inline bool is_combining_mark(char32_t chr); 23 | static inline char32_t strip_combining_marks(char32_t chr); 24 | 25 | private: 26 | static const char32_t CHARS = 0x110000; 27 | 28 | static const uint8_t combining_mark_index[CHARS >> 8]; 29 | static const uint8_t combining_mark_block[][32]; 30 | 31 | static const uint8_t stripped_index[CHARS >> 8]; 32 | static const uint16_t stripped_block[][256]; 33 | static const char32_t stripped_data[]; 34 | }; 35 | 36 | bool unistrip::is_combining_mark(char32_t chr) { 37 | return chr < CHARS && combining_mark_block[combining_mark_index[chr >> 8]][(chr >> 3) & 0x1F] & (uint8_t(1) << (chr & 0x07)); 38 | } 39 | 40 | char32_t unistrip::strip_combining_marks(char32_t chr) { 41 | if (chr >= CHARS) return chr; 42 | uint16_t index = stripped_block[stripped_index[chr >> 8]][chr & 0xFF]; 43 | return index ? stripped_data[index] : chr; 44 | } 45 | 46 | } // namespace unilib 47 | } // namespace ufal 48 | -------------------------------------------------------------------------------- /cpp-app/unilib/utf16.cpp: -------------------------------------------------------------------------------- 1 | // This file is part of UniLib . 2 | // 3 | // Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of 4 | // Mathematics and Physics, Charles University in Prague, Czech Republic. 5 | // 6 | // This Source Code Form is subject to the terms of the Mozilla Public 7 | // License, v. 2.0. If a copy of the MPL was not distributed with this 8 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
9 | // 10 | // UniLib version: 3.1.2-devel 11 | // Unicode version: 8.0.0 12 | 13 | #include "utf16.h" 14 | 15 | namespace ufal { 16 | namespace unilib { 17 | 18 | bool utf16::valid(const char16_t* str) { 19 | for (; *str; str++) 20 | if (*str >= 0xD800 && *str < 0xDC00) { 21 | str++; if (*str < 0xDC00 || *str >= 0xE000) return false; 22 | } else if (*str >= 0xDC00 && *str < 0xE000) return false; 23 | 24 | return true; 25 | } 26 | 27 | bool utf16::valid(const char16_t* str, size_t len) { 28 | for (; len; str++, len--) 29 | if (*str >= 0xD800 && *str < 0xDC00) { 30 | str++; if (!--len || *str < 0xDC00 || *str >= 0xE000) return false; 31 | } else if (*str >= 0xDC00 && *str < 0xE000) return false; 32 | 33 | return true; 34 | } 35 | 36 | void utf16::decode(const char16_t* str, std::u32string& decoded) { 37 | decoded.clear(); 38 | 39 | for (char32_t chr; (chr = decode(str)); ) 40 | decoded.push_back(chr); 41 | } 42 | 43 | void utf16::decode(const char16_t* str, size_t len, std::u32string& decoded) { 44 | decoded.clear(); 45 | 46 | while (len) 47 | decoded.push_back(decode(str, len)); 48 | } 49 | 50 | void utf16::encode(const std::u32string& str, std::u16string& encoded) { 51 | encoded.clear(); 52 | 53 | for (auto&& chr : str) 54 | append(encoded, chr); 55 | } 56 | 57 | const char16_t utf16::REPLACEMENT_CHAR; 58 | 59 | } // namespace unilib 60 | } // namespace ufal 61 | -------------------------------------------------------------------------------- /cpp-app/unilib/utf16.h: -------------------------------------------------------------------------------- 1 | // This file is part of UniLib . 2 | // 3 | // Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of 4 | // Mathematics and Physics, Charles University in Prague, Czech Republic. 5 | // 6 | // This Source Code Form is subject to the terms of the Mozilla Public 7 | // License, v. 2.0. If a copy of the MPL was not distributed with this 8 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
9 | // 10 | // UniLib version: 3.1.2-devel 11 | // Unicode version: 8.0.0 12 | 13 | #pragma once 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | namespace ufal { 21 | namespace unilib { 22 | 23 | class utf16 { 24 | public: 25 | static bool valid(const char16_t* str); 26 | static bool valid(const char16_t* str, size_t len); 27 | static inline bool valid(const std::u16string& str); 28 | 29 | static inline char32_t decode(const char16_t*& str); 30 | static inline char32_t decode(const char16_t*& str, size_t& len); 31 | static inline char32_t first(const char16_t* str); 32 | static inline char32_t first(const char16_t* str, size_t len); 33 | static inline char32_t first(const std::u16string& str); 34 | 35 | static void decode(const char16_t* str, std::u32string& decoded); 36 | static void decode(const char16_t* str, size_t len, std::u32string& decoded); 37 | static inline void decode(const std::u16string& str, std::u32string& decoded); 38 | 39 | class string_decoder { 40 | public: 41 | class iterator; 42 | inline iterator begin(); 43 | inline iterator end(); 44 | private: 45 | inline string_decoder(const char16_t* str); 46 | const char16_t* str; 47 | friend class utf16; 48 | }; 49 | static inline string_decoder decoder(const char16_t* str); 50 | static inline string_decoder decoder(const std::u16string& str); 51 | 52 | class buffer_decoder { 53 | public: 54 | class iterator; 55 | inline iterator begin(); 56 | inline iterator end(); 57 | private: 58 | inline buffer_decoder(const char16_t* str, size_t len); 59 | const char16_t* str; 60 | size_t len; 61 | friend class utf16; 62 | }; 63 | static inline buffer_decoder decoder(const char16_t* str, size_t len); 64 | 65 | static inline void append(char16_t*& str, char32_t chr); 66 | static inline void append(std::u16string& str, char32_t chr); 67 | static void encode(const std::u32string& str, std::u16string& encoded); 68 | 69 | template static void map(F f, const char16_t* str, std::u16string& result); 
70 | template static void map(F f, const char16_t* str, size_t len, std::u16string& result); 71 | template static void map(F f, const std::u16string& str, std::u16string& result); 72 | 73 | private: 74 | static const char16_t REPLACEMENT_CHAR = '?'; 75 | }; 76 | 77 | bool utf16::valid(const std::u16string& str) { 78 | return valid(str.c_str()); 79 | } 80 | 81 | char32_t utf16::decode(const char16_t*& str) { 82 | if (*str < 0xD800 || *str >= 0xE000) return *str++; 83 | if (*str >= 0xDC00) return ++str, REPLACEMENT_CHAR; 84 | char32_t res = 0x10000 + ((*str++ - 0xD800) << 10); 85 | if (*str < 0xDC00 || *str >= 0xE000) return REPLACEMENT_CHAR; 86 | return res + (*str++ - 0xDC00); 87 | } 88 | 89 | char32_t utf16::decode(const char16_t*& str, size_t& len) { 90 | if (!len) return 0; 91 | --len; 92 | if (*str < 0xD800 || *str >= 0xE000) return *str++; 93 | if (!len || *str >= 0xDC00) return ++str, REPLACEMENT_CHAR; 94 | char32_t res = 0x10000 + ((*str++ - 0xD800) << 10); 95 | if (*str < 0xDC00 || *str >= 0xE000) return REPLACEMENT_CHAR; 96 | return res + ((--len, *str++) - 0xDC00); 97 | } 98 | 99 | char32_t utf16::first(const char16_t* str) { 100 | return decode(str); 101 | } 102 | 103 | char32_t utf16::first(const char16_t* str, size_t len) { 104 | return decode(str, len); 105 | } 106 | 107 | char32_t utf16::first(const std::u16string& str) { 108 | return first(str.c_str()); 109 | } 110 | 111 | void utf16::decode(const std::u16string& str, std::u32string& decoded) { 112 | decode(str.c_str(), decoded); 113 | } 114 | 115 | class utf16::string_decoder::iterator : public std::iterator { 116 | public: 117 | iterator(const char16_t* str) : codepoint(0), next(str) { operator++(); } 118 | iterator(const iterator& it) : codepoint(it.codepoint), next(it.next) {} 119 | iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } 120 | iterator operator++(int) { iterator tmp(*this); operator++(); return tmp; } 121 | bool 
operator==(const iterator& other) const { return next == other.next; } 122 | bool operator!=(const iterator& other) const { return next != other.next; } 123 | const char32_t& operator*() { return codepoint; } 124 | private: 125 | char32_t codepoint; 126 | const char16_t* next; 127 | }; 128 | 129 | utf16::string_decoder::string_decoder(const char16_t* str) : str(str) {} 130 | 131 | utf16::string_decoder::iterator utf16::string_decoder::begin() { 132 | return iterator(str); 133 | } 134 | 135 | utf16::string_decoder::iterator utf16::string_decoder::end() { 136 | return iterator(nullptr); 137 | } 138 | 139 | utf16::string_decoder utf16::decoder(const char16_t* str) { 140 | return string_decoder(str); 141 | } 142 | 143 | utf16::string_decoder utf16::decoder(const std::u16string& str) { 144 | return string_decoder(str.c_str()); 145 | } 146 | 147 | class utf16::buffer_decoder::iterator : public std::iterator { 148 | public: 149 | iterator(const char16_t* str, size_t len) : codepoint(0), next(str), len(len) { operator++(); } 150 | iterator(const iterator& it) : codepoint(it.codepoint), next(it.next), len(it.len) {} 151 | iterator& operator++() { if (!len) next = nullptr; if (next) codepoint = decode(next, len); return *this; } 152 | iterator operator++(int) { iterator tmp(*this); operator++(); return tmp; } 153 | bool operator==(const iterator& other) const { return next == other.next; } 154 | bool operator!=(const iterator& other) const { return next != other.next; } 155 | const char32_t& operator*() { return codepoint; } 156 | private: 157 | char32_t codepoint; 158 | const char16_t* next; 159 | size_t len; 160 | }; 161 | 162 | utf16::buffer_decoder::buffer_decoder(const char16_t* str, size_t len) : str(str), len(len) {} 163 | 164 | utf16::buffer_decoder::iterator utf16::buffer_decoder::begin() { 165 | return iterator(str, len); 166 | } 167 | 168 | utf16::buffer_decoder::iterator utf16::buffer_decoder::end() { 169 | return iterator(nullptr, 0); 170 | } 171 | 172 | 
utf16::buffer_decoder utf16::decoder(const char16_t* str, size_t len) { 173 | return buffer_decoder(str, len); 174 | } 175 | 176 | void utf16::append(char16_t*& str, char32_t chr) { 177 | if (chr <= 0xFFFF) *str++ = chr; 178 | else if (chr <= 0x10FFFF) { *str++ = 0xD800 + ((chr - 0x10000) >> 10); *str++ = 0xDC00 + ((chr - 0x10000) & 0x3FF); } 179 | else *str++ = REPLACEMENT_CHAR; 180 | } 181 | 182 | void utf16::append(std::u16string& str, char32_t chr) { 183 | if (chr <= 0xFFFF) str += chr; 184 | else if (chr <= 0x10FFFF) { str += 0xD800 + ((chr - 0x10000) >> 10); str += 0xDC00 + ((chr - 0x10000) & 0x3FF); } 185 | else str += REPLACEMENT_CHAR; 186 | } 187 | 188 | template void utf16::map(F f, const char16_t* str, std::u16string& result) { 189 | result.clear(); 190 | 191 | for (char32_t chr; (chr = decode(str)); ) 192 | append(result, f(chr)); 193 | } 194 | 195 | template void utf16::map(F f, const char16_t* str, size_t len, std::u16string& result) { 196 | result.clear(); 197 | 198 | while (len) 199 | append(result, f(decode(str, len))); 200 | } 201 | 202 | template void utf16::map(F f, const std::u16string& str, std::u16string& result) { 203 | map(f, str.c_str(), result); 204 | } 205 | 206 | } // namespace unilib 207 | } // namespace ufal 208 | -------------------------------------------------------------------------------- /cpp-app/unilib/utf8.cpp: -------------------------------------------------------------------------------- 1 | // This file is part of UniLib . 2 | // 3 | // Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of 4 | // Mathematics and Physics, Charles University in Prague, Czech Republic. 5 | // 6 | // This Source Code Form is subject to the terms of the Mozilla Public 7 | // License, v. 2.0. If a copy of the MPL was not distributed with this 8 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
9 | // 10 | // UniLib version: 3.1.2-devel 11 | // Unicode version: 8.0.0 12 | 13 | #include "utf8.h" 14 | 15 | namespace ufal { 16 | namespace unilib { 17 | 18 | bool utf8::valid(const char* str) { 19 | for (; *str; str++) 20 | if (((unsigned char)*str) >= 0x80) { 21 | if (((unsigned char)*str) < 0xC0) return false; 22 | else if (((unsigned char)*str) < 0xE0) { 23 | str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; 24 | } else if (((unsigned char)*str) < 0xF0) { 25 | str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; 26 | str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; 27 | } else if (((unsigned char)*str) < 0xF8) { 28 | str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; 29 | str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; 30 | str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; 31 | } else return false; 32 | } 33 | return true; 34 | } 35 | 36 | bool utf8::valid(const char* str, size_t len) { 37 | for (; len > 0; str++, len--) 38 | if (((unsigned char)*str) >= 0x80) { 39 | if (((unsigned char)*str) < 0xC0) return false; 40 | else if (((unsigned char)*str) < 0xE0) { 41 | str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; 42 | } else if (((unsigned char)*str) < 0xF0) { 43 | str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; 44 | str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; 45 | } else if (((unsigned char)*str) < 0xF8) { 46 | str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; 47 | str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; 48 | str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned 
char)*str) >= 0xC0) return false; 49 | } else return false; 50 | } 51 | return true; 52 | } 53 | 54 | void utf8::decode(const char* str, std::u32string& decoded) { 55 | decoded.clear(); 56 | 57 | for (char32_t chr; (chr = decode(str)); ) 58 | decoded.push_back(chr); 59 | } 60 | 61 | void utf8::decode(const char* str, size_t len, std::u32string& decoded) { 62 | decoded.clear(); 63 | 64 | while (len) 65 | decoded.push_back(decode(str, len)); 66 | } 67 | 68 | void utf8::encode(const std::u32string& str, std::string& encoded) { 69 | encoded.clear(); 70 | 71 | for (auto&& chr : str) 72 | append(encoded, chr); 73 | } 74 | 75 | const char utf8::REPLACEMENT_CHAR; 76 | 77 | } // namespace unilib 78 | } // namespace ufal 79 | -------------------------------------------------------------------------------- /cpp-app/unilib/utf8.h: -------------------------------------------------------------------------------- 1 | // This file is part of UniLib . 2 | // 3 | // Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of 4 | // Mathematics and Physics, Charles University in Prague, Czech Republic. 5 | // 6 | // This Source Code Form is subject to the terms of the Mozilla Public 7 | // License, v. 2.0. If a copy of the MPL was not distributed with this 8 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
9 | // 10 | // UniLib version: 3.1.2-devel 11 | // Unicode version: 8.0.0 12 | 13 | #pragma once 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | namespace ufal { 21 | namespace unilib { 22 | 23 | class utf8 { 24 | public: 25 | static bool valid(const char* str); 26 | static bool valid(const char* str, size_t len); 27 | static inline bool valid(const std::string& str); 28 | 29 | static inline char32_t decode(const char*& str); 30 | static inline char32_t decode(const char*& str, size_t& len); 31 | static inline char32_t first(const char* str); 32 | static inline char32_t first(const char* str, size_t len); 33 | static inline char32_t first(const std::string& str); 34 | 35 | static void decode(const char* str, std::u32string& decoded); 36 | static void decode(const char* str, size_t len, std::u32string& decoded); 37 | static inline void decode(const std::string& str, std::u32string& decoded); 38 | 39 | class string_decoder { 40 | public: 41 | class iterator; 42 | inline iterator begin(); 43 | inline iterator end(); 44 | private: 45 | inline string_decoder(const char* str); 46 | const char* str; 47 | friend class utf8; 48 | }; 49 | static inline string_decoder decoder(const char* str); 50 | static inline string_decoder decoder(const std::string& str); 51 | 52 | class buffer_decoder { 53 | public: 54 | class iterator; 55 | inline iterator begin(); 56 | inline iterator end(); 57 | private: 58 | inline buffer_decoder(const char* str, size_t len); 59 | const char* str; 60 | size_t len; 61 | friend class utf8; 62 | }; 63 | static inline buffer_decoder decoder(const char* str, size_t len); 64 | 65 | static inline void append(char*& str, char32_t chr); 66 | static inline void append(std::string& str, char32_t chr); 67 | static void encode(const std::u32string& str, std::string& encoded); 68 | 69 | template static void map(F f, const char* str, std::string& result); 70 | template static void map(F f, const char* str, size_t len, std::string& result); 71 
// ---------------------------------------------------------------------------
// NOTE(review): this excerpt opens mid-way through the `utf8` class
// declaration (the class header is above this chunk and not visible here).
// The extraction also stripped angle-bracketed text, e.g. the template
// parameter lists (presumably `template <class F>`) and the arguments of
// `std::iterator<...>` — confirm against the upstream UniLib 3.1.2 sources.
// ---------------------------------------------------------------------------

// Applies f to every decoded code point of str, appending the re-encoded
// result to `result`.
template static void map(F f, const std::string& str, std::string& result);

 private:
  // Code point substituted for any malformed UTF-8 sequence.
  static const char REPLACEMENT_CHAR = '?';
};

// True iff str is valid UTF-8; delegates to the NUL-terminated overload
// (declared above, outside this excerpt).
bool utf8::valid(const std::string& str) {
  return valid(str.c_str());
}

// Decodes one code point from a NUL-terminated buffer, advancing str past
// the bytes consumed.  Lead bytes are classified by value range:
//   < 0x80  one-byte (ASCII)
//   < 0xC0  stray continuation byte (error)
//   < 0xE0  two-byte sequence
//   < 0xF0  three-byte sequence
//   < 0xF8  four-byte sequence
//   else    invalid lead byte (error)
// On error REPLACEMENT_CHAR is returned; a byte that fails the continuation
// check (must be in [0x80, 0xC0)) is left unconsumed, so the next call can
// resynchronize on it.
char32_t utf8::decode(const char*& str) {
  if (((unsigned char)*str) < 0x80) return (unsigned char)*str++;
  else if (((unsigned char)*str) < 0xC0) return ++str, REPLACEMENT_CHAR;
  else if (((unsigned char)*str) < 0xE0) {
    // Two bytes: 5 payload bits from the lead byte, 6 from the trailer.
    char32_t res = (((unsigned char)*str++) & 0x1F) << 6;
    if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
    return res + (((unsigned char)*str++) & 0x3F);
  } else if (((unsigned char)*str) < 0xF0) {
    // Three bytes: 4 + 6 + 6 payload bits.
    char32_t res = (((unsigned char)*str++) & 0x0F) << 12;
    if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
    res += (((unsigned char)*str++) & 0x3F) << 6;
    if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
    return res + (((unsigned char)*str++) & 0x3F);
  } else if (((unsigned char)*str) < 0xF8) {
    // Four bytes: 3 + 6 + 6 + 6 payload bits.
    char32_t res = (((unsigned char)*str++) & 0x07) << 18;
    if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
    res += (((unsigned char)*str++) & 0x3F) << 12;
    if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
    res += (((unsigned char)*str++) & 0x3F) << 6;
    if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
    return res + (((unsigned char)*str++) & 0x3F);
  } else return ++str, REPLACEMENT_CHAR;
}

// Length-bounded variant: decodes one code point from at most len bytes,
// advancing str and decrementing len by the bytes consumed.  Returns 0 when
// len is already exhausted.  len is size_t (unsigned), so each `len <= 0`
// guard is effectively `len == 0`; every trailer byte is consumed via the
// comma expression `(--len, *str++)` to keep the count in step with the
// pointer.  Same lead-byte classification and error handling as above.
char32_t utf8::decode(const char*& str, size_t& len) {
  if (!len) return 0;
  --len;  // account for the lead byte consumed below
  if (((unsigned char)*str) < 0x80) return (unsigned char)*str++;
  else if (((unsigned char)*str) < 0xC0) return ++str, REPLACEMENT_CHAR;
  else if (((unsigned char)*str) < 0xE0) {
    char32_t res = (((unsigned char)*str++) & 0x1F) << 6;
    if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
    return res + ((--len, ((unsigned char)*str++)) & 0x3F);
  } else if (((unsigned char)*str) < 0xF0) {
    char32_t res = (((unsigned char)*str++) & 0x0F) << 12;
    if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
    res += ((--len, ((unsigned char)*str++)) & 0x3F) << 6;
    if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
    return res + ((--len, ((unsigned char)*str++)) & 0x3F);
  } else if (((unsigned char)*str) < 0xF8) {
    char32_t res = (((unsigned char)*str++) & 0x07) << 18;
    if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
    res += ((--len, ((unsigned char)*str++)) & 0x3F) << 12;
    if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
    res += ((--len, ((unsigned char)*str++)) & 0x3F) << 6;
    if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
    return res + ((--len, ((unsigned char)*str++)) & 0x3F);
  } else return ++str, REPLACEMENT_CHAR;
}

// Returns the first code point of a NUL-terminated string without exposing
// the advanced pointer (decode takes the pointer by reference, hence the
// local copy implied by the by-value parameter).
char32_t utf8::first(const char* str) {
  return decode(str);
}

// Returns the first code point of a length-bounded buffer.
char32_t utf8::first(const char* str, size_t len) {
  return decode(str, len);
}

// Returns the first code point of a std::string.
char32_t utf8::first(const std::string& str) {
  return first(str.c_str());
}

// Decodes a whole std::string; delegates to the C-string overload declared
// above (its definition is outside this excerpt).
void utf8::decode(const std::string& str, std::u32string& decoded) {
  decode(str.c_str(), decoded);
}

// Input iterator over the code points of a NUL-terminated string.
// End-of-range is represented by next == nullptr; decoding a NUL code point
// (value 0) terminates iteration, so an embedded '\0' acts as end of input.
class utf8::string_decoder::iterator : public std::iterator {
 public:
  iterator(const char* str) : codepoint(0), next(str) { operator++(); }
  iterator(const iterator& it) : codepoint(it.codepoint), next(it.next) {}
  iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
  iterator operator++(int) { iterator tmp(*this); operator++(); return tmp; }
  // Iterators compare equal iff they point at the same remaining input.
  bool operator==(const iterator& other) const { return next == other.next; }
  bool operator!=(const iterator& other) const { return next != other.next; }
  const char32_t& operator*() { return codepoint; }
 private:
  char32_t codepoint;   // most recently decoded code point
  const char* next;     // position of the next undecoded byte, or nullptr at end
};

utf8::string_decoder::string_decoder(const char* str) : str(str) {}

utf8::string_decoder::iterator utf8::string_decoder::begin() {
  return iterator(str);
}

utf8::string_decoder::iterator utf8::string_decoder::end() {
  return iterator(nullptr);
}

// Factory helpers returning a range object usable in range-based for loops.
utf8::string_decoder utf8::decoder(const char* str) {
  return string_decoder(str);
}

utf8::string_decoder utf8::decoder(const std::string& str) {
  return string_decoder(str.c_str());
}

// Input iterator over the code points of a length-bounded buffer.  Unlike
// string_decoder::iterator it does NOT stop on an embedded NUL — only on len
// reaching zero.  Note the order in operator++: the exhausted-length check
// nulls `next` before any further decode.
class utf8::buffer_decoder::iterator : public std::iterator {
 public:
  iterator(const char* str, size_t len) : codepoint(0), next(str), len(len) { operator++(); }
  iterator(const iterator& it) : codepoint(it.codepoint), next(it.next), len(it.len) {}
  iterator& operator++() { if (!len) next = nullptr; if (next) codepoint = decode(next, len); return *this; }
  iterator operator++(int) { iterator tmp(*this); operator++(); return tmp; }
  bool operator==(const iterator& other) const { return next == other.next; }
  bool operator!=(const iterator& other) const { return next != other.next; }
  const char32_t& operator*() { return codepoint; }
 private:
  char32_t codepoint;   // most recently decoded code point
  const char* next;     // next undecoded byte, or nullptr at end
  size_t len;           // bytes remaining
};

utf8::buffer_decoder::buffer_decoder(const char* str, size_t len) : str(str), len(len) {}

utf8::buffer_decoder::iterator utf8::buffer_decoder::begin() {
  return iterator(str, len);
}

utf8::buffer_decoder::iterator utf8::buffer_decoder::end() {
  return iterator(nullptr, 0);
}

utf8::buffer_decoder utf8::decoder(const char* str, size_t len) {
  return buffer_decoder(str, len);
}

// Encodes chr as UTF-8 into a raw buffer, advancing str past the bytes
// written.  The caller must provide room for up to 4 bytes; code points
// >= 0x200000 (unencodable in 4 bytes) emit REPLACEMENT_CHAR instead.
void utf8::append(char*& str, char32_t chr) {
  if (chr < 0x80) *str++ = chr;
  else if (chr < 0x800) { *str++ = 0xC0 + (chr >> 6); *str++ = 0x80 + (chr & 0x3F); }
  else if (chr < 0x10000) { *str++ = 0xE0 + (chr >> 12); *str++ = 0x80 + ((chr >> 6) & 0x3F); *str++ = 0x80 + (chr & 0x3F); }
  else if (chr < 0x200000) { *str++ = 0xF0 + (chr >> 18); *str++ = 0x80 + ((chr >> 12) & 0x3F); *str++ = 0x80 + ((chr >> 6) & 0x3F); *str++ = 0x80 + (chr & 0x3F); }
  else *str++ = REPLACEMENT_CHAR;
}

// Same encoding, appending to a std::string.
void utf8::append(std::string& str, char32_t chr) {
  if (chr < 0x80) str += chr;
  else if (chr < 0x800) { str += 0xC0 + (chr >> 6); str += 0x80 + (chr & 0x3F); }
  else if (chr < 0x10000) { str += 0xE0 + (chr >> 12); str += 0x80 + ((chr >> 6) & 0x3F); str += 0x80 + (chr & 0x3F); }
  else if (chr < 0x200000) { str += 0xF0 + (chr >> 18); str += 0x80 + ((chr >> 12) & 0x3F); str += 0x80 + ((chr >> 6) & 0x3F); str += 0x80 + (chr & 0x3F); }
  else str += REPLACEMENT_CHAR;
}

// Transforms a NUL-terminated string code point by code point: decodes,
// applies f, re-encodes into result (cleared first).  Stops at NUL.
template void utf8::map(F f, const char* str, std::string& result) {
  result.clear();

  for (char32_t chr; (chr = decode(str)); )
    append(result, f(chr));
}

// Length-bounded map: processes exactly len bytes (embedded NULs included).
template void utf8::map(F f, const char* str, size_t len, std::string& result) {
  result.clear();

  while (len)
    append(result, f(decode(str, len)));
}

// std::string convenience overload; delegates to the C-string version.
template void utf8::map(F f, const std::string& str, std::string& result) {
  map(f, str.c_str(), result);
}

} // namespace unilib
} // namespace ufal
-------------------------------------------------------------------------------- /cpp-app/unilib/version.cpp: -------------------------------------------------------------------------------- 1 | // This file is part of UniLib . 2 | // 3 | // Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of 4 | // Mathematics and Physics, Charles University in Prague, Czech Republic. 5 | // 6 | // This Source Code Form is subject to the terms of the Mozilla Public 7 | // License, v. 2.0. If a copy of the MPL was not distributed with this 8 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 9 | // 10 | // UniLib version: 3.1.2-devel 11 | // Unicode version: 8.0.0 12 | 13 | #include "version.h" 14 | 15 | namespace ufal { 16 | namespace unilib { 17 | 18 | // Returns current version. 19 | version version::current() { 20 | return {3, 1, 2, "devel"}; 21 | } 22 | 23 | } // namespace unilib 24 | } // namespace ufal 25 | -------------------------------------------------------------------------------- /cpp-app/unilib/version.h: -------------------------------------------------------------------------------- 1 | // This file is part of UniLib . 2 | // 3 | // Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of 4 | // Mathematics and Physics, Charles University in Prague, Czech Republic. 5 | // 6 | // This Source Code Form is subject to the terms of the Mozilla Public 7 | // License, v. 2.0. If a copy of the MPL was not distributed with this 8 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 9 | // 10 | // UniLib version: 3.1.2-devel 11 | // Unicode version: 8.0.0 12 | 13 | #pragma once 14 | 15 | #include 16 | 17 | namespace ufal { 18 | namespace unilib { 19 | 20 | struct version { 21 | unsigned major; 22 | unsigned minor; 23 | unsigned patch; 24 | std::string prerelease; 25 | 26 | // Returns current version. 
27 | static version current(); 28 | }; 29 | 30 | } // namespace unilib 31 | } // namespace ufal 32 |
--------------------------------------------------------------------------------
/img/cmake.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kamalkraj/BERT-NER/e5be564156f194f1becb0d82aeaf6e762d9eb9ed/img/cmake.png
--------------------------------------------------------------------------------
/img/curl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kamalkraj/BERT-NER/e5be564156f194f1becb0d82aeaf6e762d9eb9ed/img/curl.png
--------------------------------------------------------------------------------
/img/inference.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kamalkraj/BERT-NER/e5be564156f194f1becb0d82aeaf6e762d9eb9ed/img/inference.png
--------------------------------------------------------------------------------
/img/make.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kamalkraj/BERT-NER/e5be564156f194f1becb0d82aeaf6e762d9eb9ed/img/make.png
--------------------------------------------------------------------------------
/img/postman.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kamalkraj/BERT-NER/e5be564156f194f1becb0d82aeaf6e762d9eb9ed/img/postman.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pytorch-transformers==1.2.0
2 | torch==1.2.0
3 | # metric
4 | seqeval==0.0.5
5 | # training progressbar
6 | tqdm==4.31.1
7 | # tokenization
8 | nltk==3.4.5
9 | # for rest api
10 | Flask==1.1.1
11 | Flask-Cors==3.0.8
12 |
-------------------------------------------------------------------------------- /run_ner.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import argparse 4 | import csv 5 | import json 6 | import logging 7 | import os 8 | import random 9 | import sys 10 | 11 | import numpy as np 12 | import torch 13 | import torch.nn.functional as F 14 | from pytorch_transformers import (WEIGHTS_NAME, AdamW, BertConfig, 15 | BertForTokenClassification, BertTokenizer, 16 | WarmupLinearSchedule) 17 | from torch import nn 18 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, 19 | TensorDataset) 20 | from torch.utils.data.distributed import DistributedSampler 21 | from tqdm import tqdm, trange 22 | 23 | from seqeval.metrics import classification_report 24 | 25 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 26 | datefmt = '%m/%d/%Y %H:%M:%S', 27 | level = logging.INFO) 28 | logger = logging.getLogger(__name__) 29 | 30 | class Ner(BertForTokenClassification): 31 | 32 | def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,valid_ids=None,attention_mask_label=None): 33 | sequence_output = self.bert(input_ids, token_type_ids, attention_mask,head_mask=None)[0] 34 | batch_size,max_len,feat_dim = sequence_output.shape 35 | valid_output = torch.zeros(batch_size,max_len,feat_dim,dtype=torch.float32,device='cuda') 36 | for i in range(batch_size): 37 | jj = -1 38 | for j in range(max_len): 39 | if valid_ids[i][j].item() == 1: 40 | jj += 1 41 | valid_output[i][jj] = sequence_output[i][j] 42 | sequence_output = self.dropout(valid_output) 43 | logits = self.classifier(sequence_output) 44 | 45 | if labels is not None: 46 | loss_fct = nn.CrossEntropyLoss(ignore_index=0) 47 | # Only keep active parts of the loss 48 | #attention_mask_label = None 49 | if attention_mask_label is not None: 50 | 
active_loss = attention_mask_label.view(-1) == 1 51 | active_logits = logits.view(-1, self.num_labels)[active_loss] 52 | active_labels = labels.view(-1)[active_loss] 53 | loss = loss_fct(active_logits, active_labels) 54 | else: 55 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 56 | return loss 57 | else: 58 | return logits 59 | 60 | 61 | class InputExample(object): 62 | """A single training/test example for simple sequence classification.""" 63 | 64 | def __init__(self, guid, text_a, text_b=None, label=None): 65 | """Constructs a InputExample. 66 | 67 | Args: 68 | guid: Unique id for the example. 69 | text_a: string. The untokenized text of the first sequence. For single 70 | sequence tasks, only this sequence must be specified. 71 | text_b: (Optional) string. The untokenized text of the second sequence. 72 | Only must be specified for sequence pair tasks. 73 | label: (Optional) string. The label of the example. This should be 74 | specified for train and dev examples, but not for test examples. 
75 | """ 76 | self.guid = guid 77 | self.text_a = text_a 78 | self.text_b = text_b 79 | self.label = label 80 | 81 | class InputFeatures(object): 82 | """A single set of features of data.""" 83 | 84 | def __init__(self, input_ids, input_mask, segment_ids, label_id, valid_ids=None, label_mask=None): 85 | self.input_ids = input_ids 86 | self.input_mask = input_mask 87 | self.segment_ids = segment_ids 88 | self.label_id = label_id 89 | self.valid_ids = valid_ids 90 | self.label_mask = label_mask 91 | 92 | def readfile(filename): 93 | ''' 94 | read file 95 | ''' 96 | f = open(filename) 97 | data = [] 98 | sentence = [] 99 | label= [] 100 | for line in f: 101 | if len(line)==0 or line.startswith('-DOCSTART') or line[0]=="\n": 102 | if len(sentence) > 0: 103 | data.append((sentence,label)) 104 | sentence = [] 105 | label = [] 106 | continue 107 | splits = line.split(' ') 108 | sentence.append(splits[0]) 109 | label.append(splits[-1][:-1]) 110 | 111 | if len(sentence) >0: 112 | data.append((sentence,label)) 113 | sentence = [] 114 | label = [] 115 | return data 116 | 117 | class DataProcessor(object): 118 | """Base class for data converters for sequence classification data sets.""" 119 | 120 | def get_train_examples(self, data_dir): 121 | """Gets a collection of `InputExample`s for the train set.""" 122 | raise NotImplementedError() 123 | 124 | def get_dev_examples(self, data_dir): 125 | """Gets a collection of `InputExample`s for the dev set.""" 126 | raise NotImplementedError() 127 | 128 | def get_labels(self): 129 | """Gets the list of labels for this data set.""" 130 | raise NotImplementedError() 131 | 132 | @classmethod 133 | def _read_tsv(cls, input_file, quotechar=None): 134 | """Reads a tab separated value file.""" 135 | return readfile(input_file) 136 | 137 | 138 | class NerProcessor(DataProcessor): 139 | """Processor for the CoNLL-2003 data set.""" 140 | 141 | def get_train_examples(self, data_dir): 142 | """See base class.""" 143 | return self._create_examples( 
144 | self._read_tsv(os.path.join(data_dir, "train.txt")), "train") 145 | 146 | def get_dev_examples(self, data_dir): 147 | """See base class.""" 148 | return self._create_examples( 149 | self._read_tsv(os.path.join(data_dir, "valid.txt")), "dev") 150 | 151 | def get_test_examples(self, data_dir): 152 | """See base class.""" 153 | return self._create_examples( 154 | self._read_tsv(os.path.join(data_dir, "test.txt")), "test") 155 | 156 | def get_labels(self): 157 | return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "[CLS]", "[SEP]"] 158 | 159 | def _create_examples(self,lines,set_type): 160 | examples = [] 161 | for i,(sentence,label) in enumerate(lines): 162 | guid = "%s-%s" % (set_type, i) 163 | text_a = ' '.join(sentence) 164 | text_b = None 165 | label = label 166 | examples.append(InputExample(guid=guid,text_a=text_a,text_b=text_b,label=label)) 167 | return examples 168 | 169 | def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer): 170 | """Loads a data file into a list of `InputBatch`s.""" 171 | 172 | label_map = {label : i for i, label in enumerate(label_list,1)} 173 | 174 | features = [] 175 | for (ex_index,example) in enumerate(examples): 176 | textlist = example.text_a.split(' ') 177 | labellist = example.label 178 | tokens = [] 179 | labels = [] 180 | valid = [] 181 | label_mask = [] 182 | for i, word in enumerate(textlist): 183 | token = tokenizer.tokenize(word) 184 | tokens.extend(token) 185 | label_1 = labellist[i] 186 | for m in range(len(token)): 187 | if m == 0: 188 | labels.append(label_1) 189 | valid.append(1) 190 | label_mask.append(1) 191 | else: 192 | valid.append(0) 193 | if len(tokens) >= max_seq_length - 1: 194 | tokens = tokens[0:(max_seq_length - 2)] 195 | labels = labels[0:(max_seq_length - 2)] 196 | valid = valid[0:(max_seq_length - 2)] 197 | label_mask = label_mask[0:(max_seq_length - 2)] 198 | ntokens = [] 199 | segment_ids = [] 200 | label_ids = [] 201 | 
ntokens.append("[CLS]") 202 | segment_ids.append(0) 203 | valid.insert(0,1) 204 | label_mask.insert(0,1) 205 | label_ids.append(label_map["[CLS]"]) 206 | for i, token in enumerate(tokens): 207 | ntokens.append(token) 208 | segment_ids.append(0) 209 | if len(labels) > i: 210 | label_ids.append(label_map[labels[i]]) 211 | ntokens.append("[SEP]") 212 | segment_ids.append(0) 213 | valid.append(1) 214 | label_mask.append(1) 215 | label_ids.append(label_map["[SEP]"]) 216 | input_ids = tokenizer.convert_tokens_to_ids(ntokens) 217 | input_mask = [1] * len(input_ids) 218 | label_mask = [1] * len(label_ids) 219 | while len(input_ids) < max_seq_length: 220 | input_ids.append(0) 221 | input_mask.append(0) 222 | segment_ids.append(0) 223 | label_ids.append(0) 224 | valid.append(1) 225 | label_mask.append(0) 226 | while len(label_ids) < max_seq_length: 227 | label_ids.append(0) 228 | label_mask.append(0) 229 | assert len(input_ids) == max_seq_length 230 | assert len(input_mask) == max_seq_length 231 | assert len(segment_ids) == max_seq_length 232 | assert len(label_ids) == max_seq_length 233 | assert len(valid) == max_seq_length 234 | assert len(label_mask) == max_seq_length 235 | 236 | if ex_index < 5: 237 | logger.info("*** Example ***") 238 | logger.info("guid: %s" % (example.guid)) 239 | logger.info("tokens: %s" % " ".join( 240 | [str(x) for x in tokens])) 241 | logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 242 | logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) 243 | logger.info( 244 | "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) 245 | # logger.info("label: %s (id = %d)" % (example.label, label_ids)) 246 | 247 | features.append( 248 | InputFeatures(input_ids=input_ids, 249 | input_mask=input_mask, 250 | segment_ids=segment_ids, 251 | label_id=label_ids, 252 | valid_ids=valid, 253 | label_mask=label_mask)) 254 | return features 255 | 256 | def main(): 257 | parser = argparse.ArgumentParser() 258 | 259 | ## Required 
parameters 260 | parser.add_argument("--data_dir", 261 | default=None, 262 | type=str, 263 | required=True, 264 | help="The input data dir. Should contain the .tsv files (or other data files) for the task.") 265 | parser.add_argument("--bert_model", default=None, type=str, required=True, 266 | help="Bert pre-trained model selected in the list: bert-base-uncased, " 267 | "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " 268 | "bert-base-multilingual-cased, bert-base-chinese.") 269 | parser.add_argument("--task_name", 270 | default=None, 271 | type=str, 272 | required=True, 273 | help="The name of the task to train.") 274 | parser.add_argument("--output_dir", 275 | default=None, 276 | type=str, 277 | required=True, 278 | help="The output directory where the model predictions and checkpoints will be written.") 279 | 280 | ## Other parameters 281 | parser.add_argument("--cache_dir", 282 | default="", 283 | type=str, 284 | help="Where do you want to store the pre-trained models downloaded from s3") 285 | parser.add_argument("--max_seq_length", 286 | default=128, 287 | type=int, 288 | help="The maximum total input sequence length after WordPiece tokenization. 
\n" 289 | "Sequences longer than this will be truncated, and sequences shorter \n" 290 | "than this will be padded.") 291 | parser.add_argument("--do_train", 292 | action='store_true', 293 | help="Whether to run training.") 294 | parser.add_argument("--do_eval", 295 | action='store_true', 296 | help="Whether to run eval or not.") 297 | parser.add_argument("--eval_on", 298 | default="dev", 299 | help="Whether to run eval on the dev set or test set.") 300 | parser.add_argument("--do_lower_case", 301 | action='store_true', 302 | help="Set this flag if you are using an uncased model.") 303 | parser.add_argument("--train_batch_size", 304 | default=32, 305 | type=int, 306 | help="Total batch size for training.") 307 | parser.add_argument("--eval_batch_size", 308 | default=8, 309 | type=int, 310 | help="Total batch size for eval.") 311 | parser.add_argument("--learning_rate", 312 | default=5e-5, 313 | type=float, 314 | help="The initial learning rate for Adam.") 315 | parser.add_argument("--num_train_epochs", 316 | default=3.0, 317 | type=float, 318 | help="Total number of training epochs to perform.") 319 | parser.add_argument("--warmup_proportion", 320 | default=0.1, 321 | type=float, 322 | help="Proportion of training to perform linear learning rate warmup for. 
" 323 | "E.g., 0.1 = 10%% of training.") 324 | parser.add_argument("--weight_decay", default=0.01, type=float, 325 | help="Weight deay if we apply some.") 326 | parser.add_argument("--adam_epsilon", default=1e-8, type=float, 327 | help="Epsilon for Adam optimizer.") 328 | parser.add_argument("--max_grad_norm", default=1.0, type=float, 329 | help="Max gradient norm.") 330 | parser.add_argument("--no_cuda", 331 | action='store_true', 332 | help="Whether not to use CUDA when available") 333 | parser.add_argument("--local_rank", 334 | type=int, 335 | default=-1, 336 | help="local_rank for distributed training on gpus") 337 | parser.add_argument('--seed', 338 | type=int, 339 | default=42, 340 | help="random seed for initialization") 341 | parser.add_argument('--gradient_accumulation_steps', 342 | type=int, 343 | default=1, 344 | help="Number of updates steps to accumulate before performing a backward/update pass.") 345 | parser.add_argument('--fp16', 346 | action='store_true', 347 | help="Whether to use 16-bit float precision instead of 32-bit") 348 | parser.add_argument('--fp16_opt_level', type=str, default='O1', 349 | help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 350 | "See details at https://nvidia.github.io/apex/amp.html") 351 | parser.add_argument('--loss_scale', 352 | type=float, default=0, 353 | help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" 354 | "0 (default value): dynamic loss scaling.\n" 355 | "Positive power of 2: static loss scaling value.\n") 356 | parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") 357 | parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") 358 | args = parser.parse_args() 359 | 360 | if args.server_ip and args.server_port: 361 | # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script 362 | import ptvsd 363 | print("Waiting for debugger attach") 364 | ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) 365 | ptvsd.wait_for_attach() 366 | 367 | processors = {"ner":NerProcessor} 368 | 369 | if args.local_rank == -1 or args.no_cuda: 370 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 371 | n_gpu = torch.cuda.device_count() 372 | else: 373 | torch.cuda.set_device(args.local_rank) 374 | device = torch.device("cuda", args.local_rank) 375 | n_gpu = 1 376 | # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 377 | torch.distributed.init_process_group(backend='nccl') 378 | logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( 379 | device, n_gpu, bool(args.local_rank != -1), args.fp16)) 380 | 381 | if args.gradient_accumulation_steps < 1: 382 | raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( 383 | args.gradient_accumulation_steps)) 384 | 385 | args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps 386 | 387 | random.seed(args.seed) 388 | np.random.seed(args.seed) 389 | torch.manual_seed(args.seed) 390 | 391 | if not args.do_train and not args.do_eval: 392 | raise ValueError("At least one of `do_train` or `do_eval` must be True.") 393 | 394 | if os.path.exists(args.output_dir) and 
os.listdir(args.output_dir) and args.do_train: 395 | raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) 396 | if not os.path.exists(args.output_dir): 397 | os.makedirs(args.output_dir) 398 | 399 | task_name = args.task_name.lower() 400 | 401 | if task_name not in processors: 402 | raise ValueError("Task not found: %s" % (task_name)) 403 | 404 | processor = processors[task_name]() 405 | label_list = processor.get_labels() 406 | num_labels = len(label_list) + 1 407 | 408 | tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) 409 | 410 | train_examples = None 411 | num_train_optimization_steps = 0 412 | if args.do_train: 413 | train_examples = processor.get_train_examples(args.data_dir) 414 | num_train_optimization_steps = int( 415 | len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs 416 | if args.local_rank != -1: 417 | num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() 418 | 419 | if args.local_rank not in [-1, 0]: 420 | torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab 421 | 422 | # Prepare model 423 | config = BertConfig.from_pretrained(args.bert_model, num_labels=num_labels, finetuning_task=args.task_name) 424 | model = Ner.from_pretrained(args.bert_model, 425 | from_tf = False, 426 | config = config) 427 | 428 | if args.local_rank == 0: 429 | torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab 430 | 431 | model.to(device) 432 | 433 | param_optimizer = list(model.named_parameters()) 434 | no_decay = ['bias','LayerNorm.weight'] 435 | optimizer_grouped_parameters = [ 436 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, 437 | {'params': [p for n, p in param_optimizer if any(nd in n for 
nd in no_decay)], 'weight_decay': 0.0} 438 | ] 439 | warmup_steps = int(args.warmup_proportion * num_train_optimization_steps) 440 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) 441 | scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_optimization_steps) 442 | if args.fp16: 443 | try: 444 | from apex import amp 445 | except ImportError: 446 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") 447 | model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) 448 | 449 | # multi-gpu training (should be after apex fp16 initialization) 450 | if n_gpu > 1: 451 | model = torch.nn.DataParallel(model) 452 | 453 | if args.local_rank != -1: 454 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], 455 | output_device=args.local_rank, 456 | find_unused_parameters=True) 457 | 458 | global_step = 0 459 | nb_tr_steps = 0 460 | tr_loss = 0 461 | label_map = {i : label for i, label in enumerate(label_list,1)} 462 | if args.do_train: 463 | train_features = convert_examples_to_features( 464 | train_examples, label_list, args.max_seq_length, tokenizer) 465 | logger.info("***** Running training *****") 466 | logger.info(" Num examples = %d", len(train_examples)) 467 | logger.info(" Batch size = %d", args.train_batch_size) 468 | logger.info(" Num steps = %d", num_train_optimization_steps) 469 | all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) 470 | all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) 471 | all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) 472 | all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) 473 | all_valid_ids = torch.tensor([f.valid_ids for f in train_features], dtype=torch.long) 474 | all_lmask_ids = torch.tensor([f.label_mask 
for f in train_features], dtype=torch.long) 475 | train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids,all_valid_ids,all_lmask_ids) 476 | if args.local_rank == -1: 477 | train_sampler = RandomSampler(train_data) 478 | else: 479 | train_sampler = DistributedSampler(train_data) 480 | train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) 481 | 482 | model.train() 483 | for _ in trange(int(args.num_train_epochs), desc="Epoch"): 484 | tr_loss = 0 485 | nb_tr_examples, nb_tr_steps = 0, 0 486 | for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): 487 | batch = tuple(t.to(device) for t in batch) 488 | input_ids, input_mask, segment_ids, label_ids, valid_ids,l_mask = batch 489 | loss = model(input_ids, segment_ids, input_mask, label_ids,valid_ids,l_mask) 490 | if n_gpu > 1: 491 | loss = loss.mean() # mean() to average on multi-gpu. 492 | if args.gradient_accumulation_steps > 1: 493 | loss = loss / args.gradient_accumulation_steps 494 | 495 | if args.fp16: 496 | with amp.scale_loss(loss, optimizer) as scaled_loss: 497 | scaled_loss.backward() 498 | torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) 499 | else: 500 | loss.backward() 501 | torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) 502 | 503 | tr_loss += loss.item() 504 | nb_tr_examples += input_ids.size(0) 505 | nb_tr_steps += 1 506 | if (step + 1) % args.gradient_accumulation_steps == 0: 507 | optimizer.step() 508 | scheduler.step() # Update learning rate schedule 509 | model.zero_grad() 510 | global_step += 1 511 | 512 | # Save a trained model and the associated configuration 513 | model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self 514 | model_to_save.save_pretrained(args.output_dir) 515 | tokenizer.save_pretrained(args.output_dir) 516 | label_map = {i : label for i, label in enumerate(label_list,1)} 517 | model_config = 
{"bert_model":args.bert_model,"do_lower":args.do_lower_case,"max_seq_length":args.max_seq_length,"num_labels":len(label_list)+1,"label_map":label_map} 518 | json.dump(model_config,open(os.path.join(args.output_dir,"model_config.json"),"w")) 519 | # Load a trained model and config that you have fine-tuned 520 | else: 521 | # Load a trained model and vocabulary that you have fine-tuned 522 | model = Ner.from_pretrained(args.output_dir) 523 | tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) 524 | 525 | model.to(device) 526 | 527 | if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): 528 | if args.eval_on == "dev": 529 | eval_examples = processor.get_dev_examples(args.data_dir) 530 | elif args.eval_on == "test": 531 | eval_examples = processor.get_test_examples(args.data_dir) 532 | else: 533 | raise ValueError("eval on dev or test set only") 534 | eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) 535 | logger.info("***** Running evaluation *****") 536 | logger.info(" Num examples = %d", len(eval_examples)) 537 | logger.info(" Batch size = %d", args.eval_batch_size) 538 | all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) 539 | all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) 540 | all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) 541 | all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) 542 | all_valid_ids = torch.tensor([f.valid_ids for f in eval_features], dtype=torch.long) 543 | all_lmask_ids = torch.tensor([f.label_mask for f in eval_features], dtype=torch.long) 544 | eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids,all_valid_ids,all_lmask_ids) 545 | # Run prediction for full data 546 | eval_sampler = SequentialSampler(eval_data) 547 | eval_dataloader = 
DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) 548 | model.eval() 549 | eval_loss, eval_accuracy = 0, 0 550 | nb_eval_steps, nb_eval_examples = 0, 0 551 | y_true = [] 552 | y_pred = [] 553 | label_map = {i : label for i, label in enumerate(label_list,1)} 554 | for input_ids, input_mask, segment_ids, label_ids,valid_ids,l_mask in tqdm(eval_dataloader, desc="Evaluating"): 555 | input_ids = input_ids.to(device) 556 | input_mask = input_mask.to(device) 557 | segment_ids = segment_ids.to(device) 558 | valid_ids = valid_ids.to(device) 559 | label_ids = label_ids.to(device) 560 | l_mask = l_mask.to(device) 561 | 562 | with torch.no_grad(): 563 | logits = model(input_ids, segment_ids, input_mask,valid_ids=valid_ids,attention_mask_label=l_mask) 564 | 565 | logits = torch.argmax(F.log_softmax(logits,dim=2),dim=2) 566 | logits = logits.detach().cpu().numpy() 567 | label_ids = label_ids.to('cpu').numpy() 568 | input_mask = input_mask.to('cpu').numpy() 569 | 570 | for i, label in enumerate(label_ids): 571 | temp_1 = [] 572 | temp_2 = [] 573 | for j,m in enumerate(label): 574 | if j == 0: 575 | continue 576 | elif label_ids[i][j] == len(label_map): 577 | y_true.append(temp_1) 578 | y_pred.append(temp_2) 579 | break 580 | else: 581 | temp_1.append(label_map[label_ids[i][j]]) 582 | temp_2.append(label_map[logits[i][j]]) 583 | 584 | report = classification_report(y_true, y_pred,digits=4) 585 | logger.info("\n%s", report) 586 | output_eval_file = os.path.join(args.output_dir, "eval_results.txt") 587 | with open(output_eval_file, "w") as writer: 588 | logger.info("***** Eval results *****") 589 | logger.info("\n%s", report) 590 | writer.write(report) 591 | 592 | 593 | if __name__ == "__main__": 594 | main() 595 | --------------------------------------------------------------------------------