├── .gitignore ├── .travis.yml ├── Doxyfile ├── LICENSE ├── LICENSE.Unicode ├── Makefile ├── README.md ├── data └── ucd │ ├── CaseFolding.txt │ ├── CompositionExclusions.txt │ ├── DerivedCoreProperties.txt │ ├── EastAsianWidth.txt │ ├── NormalizationTest.txt │ ├── PropList.txt │ ├── UnicodeData.txt │ ├── auxiliary │ ├── GraphemeBreakProperty.txt │ ├── GraphemeBreakTest.txt │ ├── WordBreakProperty.txt │ └── WordBreakTest.txt │ └── emoji │ └── emoji-data.txt ├── src ├── array.c ├── char.c ├── encode.c ├── error.c ├── escape.c ├── graph.c ├── graphscan.c ├── normalize.c ├── private │ ├── array.h │ ├── casefold.h │ ├── charwidth.h │ ├── combining.h │ ├── compose.h │ ├── decompose.h │ ├── emojiprop.h │ ├── graphbreak.h │ └── wordbreak.h ├── render.c ├── text.c ├── textassign.c ├── textiter.c ├── textmap.c ├── utf8lite.h └── wordscan.c ├── tests ├── check_charwidth.c ├── check_graphscan.c ├── check_render.c ├── check_text.c ├── check_textmap.c ├── check_unicode.c ├── check_wordscan.c ├── testutil.c ├── testutil.h └── wcwidth9 │ ├── LICENSE │ └── wcwidth9.h ├── utf8lite.xcodeproj └── project.pbxproj └── util ├── compute-typelen.py ├── gen-casefold.py ├── gen-charwidth.py ├── gen-combining.py ├── gen-compose.py ├── gen-decompose.py ├── gen-emojiprop.py ├── gen-graphbreak.py ├── gen-normalization.py ├── gen-wordbreak.py ├── property.py ├── table-graphbreak.c └── unicode_data.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.a 2 | *.o 3 | __pycache__ 4 | a.out 5 | project.xcworkspace 6 | xcuserdata 7 | /doc/html 8 | /tests/check_charwidth 9 | /tests/check_graphscan 10 | /tests/check_render 11 | /tests/check_text 12 | /tests/check_textmap 13 | /tests/check_unicode 14 | /tests/check_wordscan 15 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c 2 | 3 | before_install: 4 | - sudo 
apt-get -qq update 5 | - sudo apt-get install -y check 6 | - export CFLAGS="-coverage -O0" 7 | - export LDFLAGS="-coverage" 8 | 9 | install: 10 | - make 11 | 12 | script: 13 | - make check || exit 1 14 | - gcov -o src src/*.c 15 | 16 | after_success: 17 | - bash <(curl -s https://codecov.io/bash) -X gcov 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 
30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 
62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /LICENSE.Unicode: -------------------------------------------------------------------------------- 1 | UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE 2 | 3 | Unicode Data Files include all data files under the directories 4 | http://www.unicode.org/Public/, http://www.unicode.org/reports/, 5 | http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and 6 | http://www.unicode.org/utility/trac/browser/. 7 | 8 | Unicode Data Files do not include PDF online code charts under the 9 | directory http://www.unicode.org/Public/. 10 | 11 | Software includes any source code published in the Unicode Standard 12 | or under the directories 13 | http://www.unicode.org/Public/, http://www.unicode.org/reports/, 14 | http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and 15 | http://www.unicode.org/utility/trac/browser/. 16 | 17 | NOTICE TO USER: Carefully read the following legal agreement. 
18 | BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S 19 | DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), 20 | YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE 21 | TERMS AND CONDITIONS OF THIS AGREEMENT. 22 | IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE 23 | THE DATA FILES OR SOFTWARE. 24 | 25 | COPYRIGHT AND PERMISSION NOTICE 26 | 27 | Copyright (c) 1991-2017 Unicode, Inc. All rights reserved. 28 | Distributed under the Terms of Use in http://www.unicode.org/copyright.html. 29 | 30 | Permission is hereby granted, free of charge, to any person obtaining 31 | a copy of the Unicode data files and any associated documentation 32 | (the "Data Files") or Unicode software and any associated documentation 33 | (the "Software") to deal in the Data Files or Software 34 | without restriction, including without limitation the rights to use, 35 | copy, modify, merge, publish, distribute, and/or sell copies of 36 | the Data Files or Software, and to permit persons to whom the Data Files 37 | or Software are furnished to do so, provided that either 38 | (a) this copyright and permission notice appear with all copies 39 | of the Data Files or Software, or 40 | (b) this copyright and permission notice appear in associated 41 | Documentation. 42 | 43 | THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF 44 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 45 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 46 | NONINFRINGEMENT OF THIRD PARTY RIGHTS. 47 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS 48 | NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL 49 | DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, 50 | DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER 51 | TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 52 | PERFORMANCE OF THE DATA FILES OR SOFTWARE. 
53 | 54 | Except as contained in this notice, the name of a copyright holder 55 | shall not be used in advertising or otherwise to promote the sale, 56 | use or other dealings in these Data Files or Software without prior 57 | written authorization of the copyright holder. 58 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CC += -std=c99 2 | 3 | CFLAGS += -Wall -Wextra -pedantic -Werror \ 4 | -Wno-cast-qual \ 5 | -Wno-padded \ 6 | -Wno-unused-macros \ 7 | -g 8 | 9 | #LDFLAGS += 10 | LIBS += -lm 11 | AR = ar rcu 12 | RANLIB = ranlib 13 | MKDIR_P = mkdir -p 14 | CURL = curl 15 | 16 | LIB_CFLAGS = \ 17 | -Wno-cast-align \ 18 | -Wno-cast-qual \ 19 | -Wno-float-equal \ 20 | -Wno-missing-prototypes \ 21 | -Wno-sign-conversion \ 22 | -Wno-unreachable-code-break 23 | 24 | TEST_CFLAGS = $(shell pkg-config --cflags check) \ 25 | -Wno-double-promotion \ 26 | -Wno-float-equal \ 27 | -Wno-gnu-zero-variadic-macro-arguments \ 28 | -Wno-missing-prototypes \ 29 | -Wno-missing-variable-declarations \ 30 | -Wno-reserved-id-macro \ 31 | -Wno-strict-prototypes \ 32 | -Wno-used-but-marked-unused 33 | 34 | TEST_LIBS = $(shell pkg-config --libs check) 35 | 36 | UNICODE = http://www.unicode.org/Public/15.1.0 37 | 38 | UTF8LITE_A = libutf8lite.a 39 | LIB_O = src/array.o src/char.o src/encode.o src/error.o src/escape.o \ 40 | src/graph.o src/graphscan.o src/normalize.o src/render.o src/text.o \ 41 | src/textassign.o src/textiter.o src/textmap.o src/wordscan.o 42 | 43 | DATA = data/ucd/emoji/emoji-data.txt \ 44 | data/ucd/CaseFolding.txt \ 45 | data/ucd/CompositionExclusions.txt \ 46 | data/ucd/DerivedCoreProperties.txt \ 47 | data/ucd/EastAsianWidth.txt \ 48 | data/ucd/PropList.txt \ 49 | data/ucd/Scripts.txt \ 50 | data/ucd/UnicodeData.txt \ 51 | data/ucd/auxiliary/GraphemeBreakProperty.txt \ 52 | data/ucd/auxiliary/WordBreakProperty.txt 53 | 54 | TESTS_T = 
tests/check_charwidth tests/check_graphscan tests/check_render \ 55 | tests/check_text tests/check_textmap tests/check_unicode \ 56 | tests/check_wordscan 57 | TESTS_O = tests/check_charwidth.o tests/check_graphscan.o tests/check_render.o \ 58 | tests/check_text.o tests/check_textmap.o tests/check_unicode.o \ 59 | tests/check_wordscan.o tests/testutil.o 60 | 61 | TESTS_DATA = data/ucd/NormalizationTest.txt \ 62 | data/ucd/auxiliary/GraphemeBreakTest.txt \ 63 | data/ucd/auxiliary/WordBreakTest.txt 64 | 65 | ALL_O = $(LIB_O) $(UTF8LITE_O) $(STEMMER_O) 66 | ALL_T = $(UTF8LITE_A) $(UTF8LITE_T) 67 | ALL_A = $(UTF8LITE_A) 68 | 69 | 70 | # Products 71 | 72 | all: $(ALL_T) 73 | 74 | $(UTF8LITE_A): $(LIB_O) $(STEMMER_O) 75 | $(AR) $@ $(LIB_O) $(STEMMER_O) 76 | $(RANLIB) $@ 77 | 78 | $(UTF8LITE_T): $(UTF8LITE_O) $(UTF8LITE_A) 79 | $(CC) -o $@ $(UTF8LITE_O) $(UTF8LITE_A) $(LIBS) $(LDFLAGS) 80 | 81 | 82 | # Data 83 | 84 | data/ucd/emoji/emoji-data.txt: 85 | $(MKDIR_P) data/ucd/emoji 86 | $(CURL) -o $@ $(UNICODE)/ucd/emoji/emoji-data.txt 87 | 88 | data/ucd/CaseFolding.txt: 89 | $(MKDIR_P) data/ucd 90 | $(CURL) -o $@ $(UNICODE)/ucd/CaseFolding.txt 91 | 92 | data/ucd/CompositionExclusions.txt: 93 | $(MKDIR_P) data/ucd 94 | $(CURL) -o $@ $(UNICODE)/ucd/CompositionExclusions.txt 95 | 96 | data/ucd/DerivedCoreProperties.txt: 97 | $(MKDIR_P) data/ucd 98 | $(CURL) -o $@ $(UNICODE)/ucd/DerivedCoreProperties.txt 99 | 100 | data/ucd/EastAsianWidth.txt: 101 | $(MKDIR_P) data/ucd 102 | $(CURL) -o $@ $(UNICODE)/ucd/EastAsianWidth.txt 103 | 104 | data/ucd/PropList.txt: 105 | $(MKDIR_P) data/ucd 106 | $(CURL) -o $@ $(UNICODE)/ucd/PropList.txt 107 | 108 | data/ucd/Scripts.txt: 109 | $(MKDIR_P) data/ucd 110 | $(CURL) -o $@ $(UNICODE)/ucd/Scripts.txt 111 | 112 | data/ucd/NormalizationTest.txt: 113 | $(MKDIR_P) data/ucd 114 | $(CURL) -o $@ $(UNICODE)/ucd/NormalizationTest.txt 115 | 116 | data/ucd/UnicodeData.txt: 117 | $(MKDIR_P) data/ucd 118 | $(CURL) -o $@ $(UNICODE)/ucd/UnicodeData.txt 119 | 
120 | data/ucd/auxiliary/GraphemeBreakProperty.txt: 121 | $(MKDIR_P) data/ucd/auxiliary 122 | $(CURL) -o $@ $(UNICODE)/ucd/auxiliary/GraphemeBreakProperty.txt 123 | 124 | data/ucd/auxiliary/GraphemeBreakTest.txt: 125 | $(MKDIR_P) data/ucd/auxiliary 126 | $(CURL) -o $@ $(UNICODE)/ucd/auxiliary/GraphemeBreakTest.txt 127 | 128 | data/ucd/auxiliary/WordBreakProperty.txt: 129 | $(MKDIR_P) data/ucd/auxiliary 130 | $(CURL) -o $@ $(UNICODE)/ucd/auxiliary/WordBreakProperty.txt 131 | 132 | data/ucd/auxiliary/WordBreakTest.txt: 133 | $(MKDIR_P) data/ucd/auxiliary 134 | $(CURL) -o $@ $(UNICODE)/ucd/auxiliary/WordBreakTest.txt 135 | 136 | # Generated Sources 137 | 138 | src/private/casefold.h: util/gen-casefold.py \ 139 | data/ucd/CaseFolding.txt 140 | $(MKDIR_P) src/private 141 | ./util/gen-casefold.py > $@ 142 | 143 | src/private/charwidth.h: util/gen-charwidth.py util/property.py util/unicode_data.py \ 144 | data/ucd/emoji/emoji-data.txt data/ucd/DerivedCoreProperties.txt \ 145 | data/ucd/EastAsianWidth.txt data/ucd/UnicodeData.txt 146 | $(MKDIR_P) src/private 147 | ./util/gen-charwidth.py > $@ 148 | 149 | src/private/combining.h: util/gen-combining.py util/unicode_data.py \ 150 | data/ucd/UnicodeData.txt 151 | $(MKDIR_P) src/private 152 | ./util/gen-combining.py > $@ 153 | 154 | src/private/compose.h: util/gen-compose.py util/unicode_data.py \ 155 | data/ucd/CompositionExclusions.txt data/ucd/UnicodeData.txt 156 | $(MKDIR_P) src/private 157 | ./util/gen-compose.py > $@ 158 | 159 | src/private/decompose.h: util/gen-decompose.py util/unicode_data.py \ 160 | data/ucd/UnicodeData.txt 161 | $(MKDIR_P) src/private 162 | ./util/gen-decompose.py > $@ 163 | 164 | src/private/emojiprop.h: util/gen-emojiprop.py data/ucd/emoji/emoji-data.txt 165 | $(MKDIR_P) src/private 166 | ./util/gen-emojiprop.py > $@ 167 | 168 | src/private/graphbreak.h: util/gen-graphbreak.py util/gen-graphbreak.py \ 169 | data/ucd/emoji/emoji-data.txt \ 170 | data/ucd/auxiliary/GraphemeBreakProperty.txt 171 | 
$(MKDIR_P) src/private 172 | ./util/gen-graphbreak.py > $@ 173 | 174 | src/private/normalization.h: util/gen-normalization.py \ 175 | data/ucd/DerivedNormalizationProps.txt 176 | $(MKDIR_P) src/private 177 | ./util/gen-normalization.py > $@ 178 | 179 | src/private/wordbreak.h: util/gen-wordbreak.py \ 180 | data/ucd/DerivedCoreProperties.txt \ 181 | data/ucd/PropList.txt \ 182 | data/ucd/auxiliary/WordBreakProperty.txt 183 | $(MKDIR_P) src/private 184 | ./util/gen-wordbreak.py > $@ 185 | 186 | 187 | # Tests 188 | 189 | tests/check_charwidth: tests/check_charwidth.o tests/testutil.o $(UTF8LITE_A) 190 | $(CC) -o $@ $^ $(LIBS) $(TEST_LIBS) $(LDFLAGS) 191 | 192 | tests/check_graphscan: tests/check_graphscan.o tests/testutil.o $(UTF8LITE_A) \ 193 | data/ucd/auxiliary/GraphemeBreakTest.txt 194 | $(CC) -o $@ tests/check_graphscan.o tests/testutil.o $(UTF8LITE_A) \ 195 | $(LIBS) $(TEST_LIBS) $(LDFLAGS) 196 | 197 | tests/check_render: tests/check_render.o tests/testutil.o $(UTF8LITE_A) 198 | $(CC) -o $@ $^ $(LIBS) $(TEST_LIBS) $(LDFLAGS) 199 | 200 | tests/check_text: tests/check_text.o tests/testutil.o $(UTF8LITE_A) 201 | $(CC) -o $@ $^ $(LIBS) $(TEST_LIBS) $(LDFLAGS) 202 | 203 | tests/check_textmap: tests/check_textmap.o tests/testutil.o $(UTF8LITE_A) 204 | $(CC) -o $@ $^ $(LIBS) $(TEST_LIBS) $(LDFLAGS) 205 | 206 | tests/check_unicode: tests/check_unicode.o $(UTF8LITE_A) \ 207 | data/ucd/NormalizationTest.txt 208 | $(CC) -o $@ tests/check_unicode.o $(UTF8LITE_A) \ 209 | $(LIBS) $(TEST_LIBS) $(LDFLAGS) 210 | 211 | tests/check_wordscan: tests/check_wordscan.o tests/testutil.o $(UTF8LITE_A) \ 212 | data/ucd/auxiliary/WordBreakTest.txt 213 | $(CC) -o $@ tests/check_wordscan.o tests/testutil.o $(UTF8LITE_A) \ 214 | $(LIBS) $(TEST_LIBS) $(LDFLAGS) 215 | 216 | 217 | # Special Rules 218 | 219 | check: $(TESTS_T) $(TESTS_T:=.test) 220 | 221 | clean: 222 | $(RM) -r $(ALL_O) $(ALL_T) $(TESTS_O) $(TESTS_T) 223 | 224 | data: $(DATA) $(TESTS_DATA) 225 | 226 | doc: 227 | doxygen 228 | 229 
| %.test: % 230 | 	$< 231 | 232 | src/%.o: src/%.c 233 | 	$(CC) -c $(CFLAGS) $(LIB_CFLAGS) $(CPPFLAGS) $< -o $@ 234 | 235 | tests/%.o: tests/%.c 236 | 	$(CC) -c $(CFLAGS) $(TEST_CFLAGS) $(CPPFLAGS) $< -o $@ 237 | 238 | 239 | .PHONY: all check clean data doc 240 | 241 | src/array.o: src/array.c src/private/array.h src/utf8lite.h 242 | src/char.o: src/char.c src/private/charwidth.h src/utf8lite.h 243 | src/encode.o: src/encode.c src/utf8lite.h 244 | src/error.o: src/error.c src/utf8lite.h 245 | src/escape.o: src/escape.c src/utf8lite.h 246 | src/graph.o: src/graph.c src/utf8lite.h 247 | src/graphscan.o: src/graphscan.c src/private/graphbreak.h src/utf8lite.h 248 | src/normalize.o: src/normalize.c src/private/casefold.h \ 249 | src/private/combining.h src/private/compose.h src/private/decompose.h \ 250 | src/utf8lite.h 251 | src/render.o: src/render.c src/private/array.h src/utf8lite.h 252 | src/text.o: src/text.c src/utf8lite.h 253 | src/textassign.o: src/textassign.c src/utf8lite.h 254 | src/textiter.o: src/textiter.c src/utf8lite.h 255 | src/textmap.o: src/textmap.c src/utf8lite.h 256 | src/wordscan.o: src/wordscan.c src/private/emojiprop.h \ 257 | src/private/wordbreak.h src/utf8lite.h 258 | 259 | tests/check_charwidth.o: tests/check_charwidth.c src/utf8lite.h tests/testutil.h 260 | tests/check_graphscan.o: tests/check_graphscan.c src/utf8lite.h tests/testutil.h 261 | tests/check_render.o: tests/check_render.c src/utf8lite.h tests/testutil.h 262 | tests/check_text.o: tests/check_text.c src/utf8lite.h tests/testutil.h 263 | tests/check_textmap.o: tests/check_textmap.c src/utf8lite.h tests/testutil.h 264 | tests/check_unicode.o: tests/check_unicode.c src/utf8lite.h tests/testutil.h 265 | tests/check_wordscan.o: tests/check_wordscan.c src/utf8lite.h tests/testutil.h 266 | tests/testutil.o: tests/testutil.c src/utf8lite.h tests/testutil.h 267 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | utf8lite (C Library) 2 | ==================== 3 | 4 | [![Build Status](https://api.travis-ci.org/patperry/utf8lite.svg?branch=master)](https://travis-ci.org/patperry/utf8lite) 5 | [![Coverage Status](https://codecov.io/github/patperry/utf8lite/coverage.svg?branch=master)](https://codecov.io/github/patperry/utf8lite?branch=master) 6 | 7 | Lightweight UTF-8 processing. 8 | -------------------------------------------------------------------------------- /data/ucd/CompositionExclusions.txt: -------------------------------------------------------------------------------- 1 | # CompositionExclusions-15.1.0.txt 2 | # Date: 2023-01-05 3 | # © 2023 Unicode®, Inc. 4 | # For terms of use, see https://www.unicode.org/terms_of_use.html 5 | # 6 | # Unicode Character Database 7 | # For documentation, see https://www.unicode.org/reports/tr44/ 8 | # 9 | # This file lists the characters for the Composition Exclusion Table 10 | # defined in UAX #15, Unicode Normalization Forms. 11 | # 12 | # This file is a normative contributory data file in the 13 | # Unicode Character Database. 14 | # 15 | # For more information, see 16 | # https://www.unicode.org/reports/tr15/#Primary_Exclusion_List_Table 17 | # 18 | # For a full derivation of composition exclusions, see the derived property 19 | # Full_Composition_Exclusion in DerivedNormalizationProps.txt 20 | # 21 | 22 | # ================================================ 23 | # (1) Script Specifics 24 | # 25 | # This list of characters cannot be derived from the UnicodeData.txt file. 26 | # 27 | # Included are the following subcategories: 28 | # 29 | # - Many precomposed characters using a nukta diacritic in the Devanagari, 30 | # Bangla/Bengali, Gurmukhi, or Odia/Oriya scripts. 31 | # - Tibetan letters and subjoined letters with decompositions including 32 | # U+0FB7 TIBETAN SUBJOINED LETTER HA or U+0FB5 TIBETAN SUBJOINED LETTER SSA. 
33 | # - Two two-part Tibetan vowel signs involving top and bottom pieces. 34 | # - A large collection of compatibility precomposed characters for Hebrew 35 | # involving dagesh and/or other combining marks. 36 | # 37 | # This list is unlikely to grow. 38 | # 39 | # ================================================ 40 | 41 | 0958 # DEVANAGARI LETTER QA 42 | 0959 # DEVANAGARI LETTER KHHA 43 | 095A # DEVANAGARI LETTER GHHA 44 | 095B # DEVANAGARI LETTER ZA 45 | 095C # DEVANAGARI LETTER DDDHA 46 | 095D # DEVANAGARI LETTER RHA 47 | 095E # DEVANAGARI LETTER FA 48 | 095F # DEVANAGARI LETTER YYA 49 | 09DC # BENGALI LETTER RRA 50 | 09DD # BENGALI LETTER RHA 51 | 09DF # BENGALI LETTER YYA 52 | 0A33 # GURMUKHI LETTER LLA 53 | 0A36 # GURMUKHI LETTER SHA 54 | 0A59 # GURMUKHI LETTER KHHA 55 | 0A5A # GURMUKHI LETTER GHHA 56 | 0A5B # GURMUKHI LETTER ZA 57 | 0A5E # GURMUKHI LETTER FA 58 | 0B5C # ORIYA LETTER RRA 59 | 0B5D # ORIYA LETTER RHA 60 | 0F43 # TIBETAN LETTER GHA 61 | 0F4D # TIBETAN LETTER DDHA 62 | 0F52 # TIBETAN LETTER DHA 63 | 0F57 # TIBETAN LETTER BHA 64 | 0F5C # TIBETAN LETTER DZHA 65 | 0F69 # TIBETAN LETTER KSSA 66 | 0F76 # TIBETAN VOWEL SIGN VOCALIC R 67 | 0F78 # TIBETAN VOWEL SIGN VOCALIC L 68 | 0F93 # TIBETAN SUBJOINED LETTER GHA 69 | 0F9D # TIBETAN SUBJOINED LETTER DDHA 70 | 0FA2 # TIBETAN SUBJOINED LETTER DHA 71 | 0FA7 # TIBETAN SUBJOINED LETTER BHA 72 | 0FAC # TIBETAN SUBJOINED LETTER DZHA 73 | 0FB9 # TIBETAN SUBJOINED LETTER KSSA 74 | FB1D # HEBREW LETTER YOD WITH HIRIQ 75 | FB1F # HEBREW LIGATURE YIDDISH YOD YOD PATAH 76 | FB2A # HEBREW LETTER SHIN WITH SHIN DOT 77 | FB2B # HEBREW LETTER SHIN WITH SIN DOT 78 | FB2C # HEBREW LETTER SHIN WITH DAGESH AND SHIN DOT 79 | FB2D # HEBREW LETTER SHIN WITH DAGESH AND SIN DOT 80 | FB2E # HEBREW LETTER ALEF WITH PATAH 81 | FB2F # HEBREW LETTER ALEF WITH QAMATS 82 | FB30 # HEBREW LETTER ALEF WITH MAPIQ 83 | FB31 # HEBREW LETTER BET WITH DAGESH 84 | FB32 # HEBREW LETTER GIMEL WITH DAGESH 85 | FB33 # HEBREW LETTER DALET WITH 
DAGESH 86 | FB34 # HEBREW LETTER HE WITH MAPIQ 87 | FB35 # HEBREW LETTER VAV WITH DAGESH 88 | FB36 # HEBREW LETTER ZAYIN WITH DAGESH 89 | FB38 # HEBREW LETTER TET WITH DAGESH 90 | FB39 # HEBREW LETTER YOD WITH DAGESH 91 | FB3A # HEBREW LETTER FINAL KAF WITH DAGESH 92 | FB3B # HEBREW LETTER KAF WITH DAGESH 93 | FB3C # HEBREW LETTER LAMED WITH DAGESH 94 | FB3E # HEBREW LETTER MEM WITH DAGESH 95 | FB40 # HEBREW LETTER NUN WITH DAGESH 96 | FB41 # HEBREW LETTER SAMEKH WITH DAGESH 97 | FB43 # HEBREW LETTER FINAL PE WITH DAGESH 98 | FB44 # HEBREW LETTER PE WITH DAGESH 99 | FB46 # HEBREW LETTER TSADI WITH DAGESH 100 | FB47 # HEBREW LETTER QOF WITH DAGESH 101 | FB48 # HEBREW LETTER RESH WITH DAGESH 102 | FB49 # HEBREW LETTER SHIN WITH DAGESH 103 | FB4A # HEBREW LETTER TAV WITH DAGESH 104 | FB4B # HEBREW LETTER VAV WITH HOLAM 105 | FB4C # HEBREW LETTER BET WITH RAFE 106 | FB4D # HEBREW LETTER KAF WITH RAFE 107 | FB4E # HEBREW LETTER PE WITH RAFE 108 | 109 | # Total code points: 67 110 | 111 | # ================================================ 112 | # (2) Post Composition Version precomposed characters 113 | # 114 | # These characters cannot be derived solely from the UnicodeData.txt file 115 | # in this version of Unicode. 116 | # 117 | # Note that characters added to the standard after the 118 | # Composition Version and which have canonical decomposition mappings 119 | # are not automatically added to this list of Post Composition 120 | # Version precomposed characters. 
121 | # ================================================ 122 | 123 | 2ADC # FORKING 124 | 1D15E # MUSICAL SYMBOL HALF NOTE 125 | 1D15F # MUSICAL SYMBOL QUARTER NOTE 126 | 1D160 # MUSICAL SYMBOL EIGHTH NOTE 127 | 1D161 # MUSICAL SYMBOL SIXTEENTH NOTE 128 | 1D162 # MUSICAL SYMBOL THIRTY-SECOND NOTE 129 | 1D163 # MUSICAL SYMBOL SIXTY-FOURTH NOTE 130 | 1D164 # MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE 131 | 1D1BB # MUSICAL SYMBOL MINIMA 132 | 1D1BC # MUSICAL SYMBOL MINIMA BLACK 133 | 1D1BD # MUSICAL SYMBOL SEMIMINIMA WHITE 134 | 1D1BE # MUSICAL SYMBOL SEMIMINIMA BLACK 135 | 1D1BF # MUSICAL SYMBOL FUSA WHITE 136 | 1D1C0 # MUSICAL SYMBOL FUSA BLACK 137 | 138 | # Total code points: 14 139 | 140 | # ================================================ 141 | # (3) Singleton Decompositions 142 | # 143 | # These characters can be derived from the UnicodeData.txt file 144 | # by including all canonically decomposable characters whose 145 | # canonical decomposition consists of a single character. 146 | # 147 | # These characters are simply quoted here for reference. 
148 | # See also Full_Composition_Exclusion in DerivedNormalizationProps.txt 149 | # ================================================ 150 | 151 | # 0340..0341 [2] COMBINING GRAVE TONE MARK..COMBINING ACUTE TONE MARK 152 | # 0343 COMBINING GREEK KORONIS 153 | # 0374 GREEK NUMERAL SIGN 154 | # 037E GREEK QUESTION MARK 155 | # 0387 GREEK ANO TELEIA 156 | # 1F71 GREEK SMALL LETTER ALPHA WITH OXIA 157 | # 1F73 GREEK SMALL LETTER EPSILON WITH OXIA 158 | # 1F75 GREEK SMALL LETTER ETA WITH OXIA 159 | # 1F77 GREEK SMALL LETTER IOTA WITH OXIA 160 | # 1F79 GREEK SMALL LETTER OMICRON WITH OXIA 161 | # 1F7B GREEK SMALL LETTER UPSILON WITH OXIA 162 | # 1F7D GREEK SMALL LETTER OMEGA WITH OXIA 163 | # 1FBB GREEK CAPITAL LETTER ALPHA WITH OXIA 164 | # 1FBE GREEK PROSGEGRAMMENI 165 | # 1FC9 GREEK CAPITAL LETTER EPSILON WITH OXIA 166 | # 1FCB GREEK CAPITAL LETTER ETA WITH OXIA 167 | # 1FD3 GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA 168 | # 1FDB GREEK CAPITAL LETTER IOTA WITH OXIA 169 | # 1FE3 GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA 170 | # 1FEB GREEK CAPITAL LETTER UPSILON WITH OXIA 171 | # 1FEE..1FEF [2] GREEK DIALYTIKA AND OXIA..GREEK VARIA 172 | # 1FF9 GREEK CAPITAL LETTER OMICRON WITH OXIA 173 | # 1FFB GREEK CAPITAL LETTER OMEGA WITH OXIA 174 | # 1FFD GREEK OXIA 175 | # 2000..2001 [2] EN QUAD..EM QUAD 176 | # 2126 OHM SIGN 177 | # 212A..212B [2] KELVIN SIGN..ANGSTROM SIGN 178 | # 2329 LEFT-POINTING ANGLE BRACKET 179 | # 232A RIGHT-POINTING ANGLE BRACKET 180 | # F900..FA0D [270] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA0D 181 | # FA10 CJK COMPATIBILITY IDEOGRAPH-FA10 182 | # FA12 CJK COMPATIBILITY IDEOGRAPH-FA12 183 | # FA15..FA1E [10] CJK COMPATIBILITY IDEOGRAPH-FA15..CJK COMPATIBILITY IDEOGRAPH-FA1E 184 | # FA20 CJK COMPATIBILITY IDEOGRAPH-FA20 185 | # FA22 CJK COMPATIBILITY IDEOGRAPH-FA22 186 | # FA25..FA26 [2] CJK COMPATIBILITY IDEOGRAPH-FA25..CJK COMPATIBILITY IDEOGRAPH-FA26 187 | # FA2A..FA6D [68] CJK COMPATIBILITY IDEOGRAPH-FA2A..CJK 
COMPATIBILITY IDEOGRAPH-FA6D 188 | # FA70..FAD9 [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9 189 | # 2F800..2FA1D [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D 190 | 191 | # Total code points: 1035 192 | 193 | # ================================================ 194 | # (4) Non-Starter Decompositions 195 | # 196 | # These characters can be derived from the UnicodeData.txt file 197 | # by including each expanding canonical decomposition 198 | # (i.e., those which canonically decompose to a sequence 199 | # of characters instead of a single character), such that: 200 | # 201 | # A. The character is not a Starter. 202 | # 203 | # OR (inclusive) 204 | # 205 | # B. The character's canonical decomposition begins 206 | # with a character that is not a Starter. 207 | # 208 | # Note that a "Starter" is any character with a zero combining class. 209 | # 210 | # These characters are simply quoted here for reference. 211 | # See also Full_Composition_Exclusion in DerivedNormalizationProps.txt 212 | # ================================================ 213 | 214 | # 0344 COMBINING GREEK DIALYTIKA TONOS 215 | # 0F73 TIBETAN VOWEL SIGN II 216 | # 0F75 TIBETAN VOWEL SIGN UU 217 | # 0F81 TIBETAN VOWEL SIGN REVERSED II 218 | 219 | # Total code points: 4 220 | 221 | # EOF 222 | -------------------------------------------------------------------------------- /src/array.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include "utf8lite.h" 23 | #include "private/array.h" 24 | 25 | 26 | /* Default initial size for nonempty dynamic arrays. Must be positive. */ 27 | #define UTF8LITE_ARRAY_SIZE_INIT 32 28 | 29 | /* Growth factor for dynamic arrays. Must be greater than 1. 30 | * 31 | * https://en.wikipedia.org/wiki/Dynamic_array#Growth_factor 32 | */ 33 | #define UTF8LITE_ARRAY_GROW 1.618 /* Golden Ratio, (1 + sqrt(5)) / 2 */ 34 | 35 | 36 | int utf8lite_bigarray_size_add(size_t *sizeptr, size_t width, size_t count, 37 | size_t nadd) 38 | { 39 | size_t size = *sizeptr; 40 | size_t size_min; 41 | int err; 42 | double n1; 43 | 44 | if (width == 0) { 45 | return 0; 46 | } 47 | 48 | if (count > (SIZE_MAX - nadd) / width) { 49 | err = UTF8LITE_ERROR_OVERFLOW; 50 | //utf8lite_log(err, "array size (%"PRIu64" + %"PRIu64 51 | // " elements of %"PRIu64" bytes each)" 52 | // " exceeds maximum (%"PRIu64" elements)", 53 | // (uint64_t)count, (uint64_t)nadd, 54 | // (uint64_t)width, (uint64_t)SIZE_MAX); 55 | return err; 56 | } 57 | 58 | size_min = count + nadd; 59 | if (size >= size_min) { 60 | return 0; 61 | } 62 | 63 | assert(UTF8LITE_ARRAY_SIZE_INIT > 0); 64 | assert(UTF8LITE_ARRAY_GROW > 1); 65 | 66 | if (size < UTF8LITE_ARRAY_SIZE_INIT && size_min > 0) { 67 | size = UTF8LITE_ARRAY_SIZE_INIT; 68 | } 69 | 70 | while (size < size_min) { 71 | n1 = UTF8LITE_ARRAY_GROW * size; 72 | if (n1 > SIZE_MAX / width) { 73 | size = SIZE_MAX / width; 74 | } else { 75 | size = 
(size_t)n1; 76 | } 77 | } 78 | 79 | *sizeptr = size; 80 | return 0; 81 | } 82 | 83 | 84 | int utf8lite_array_size_add(int *sizeptr, size_t width, int count, int nadd) 85 | { 86 | size_t size, size_min, size_max; 87 | int err; 88 | 89 | assert(*sizeptr >= 0); 90 | assert(count >= 0); 91 | assert(nadd >= 0); 92 | 93 | if (width == 0) { 94 | return 0; 95 | } 96 | 97 | size = (size_t)*sizeptr; 98 | if ((err = utf8lite_bigarray_size_add(&size, width, (size_t)count, 99 | (size_t)nadd))) { 100 | return err; 101 | } 102 | size_max = (size_t)INT_MAX / width; 103 | if (size > size_max) { 104 | size = size_max; 105 | size_min = (size_t)count + (size_t)nadd; 106 | if (size < size_min) { 107 | err = UTF8LITE_ERROR_OVERFLOW; 108 | //utf8lite_log(err, "array size (%"PRIu64 109 | // " elements of %"PRIu64" bytes each)" 110 | // " exceeds maximum (%"PRIu64" elements)", 111 | // (uint64_t)size_min, (uint64_t)width, 112 | // (uint64_t)size_max); 113 | return err; 114 | } 115 | } 116 | 117 | *sizeptr = (int)size; 118 | return 0; 119 | } 120 | 121 | 122 | int utf8lite_array_grow(void **baseptr, int *sizeptr, size_t width, int count, 123 | int nadd) 124 | { 125 | void *base = *baseptr; 126 | int size = *sizeptr; 127 | int err; 128 | 129 | assert(0 <= count); 130 | assert(count <= size); 131 | assert(width > 0); 132 | 133 | if (nadd <= size - count) { 134 | return 0; 135 | } 136 | 137 | if ((err = utf8lite_array_size_add(&size, width, count, nadd))) { 138 | return err; 139 | } 140 | 141 | if (!(base = realloc(base, ((size_t)size) * width))) { 142 | err = UTF8LITE_ERROR_NOMEM; 143 | return err; 144 | } 145 | 146 | *baseptr = base; 147 | *sizeptr = size; 148 | return 0; 149 | } 150 | 151 | 152 | int utf8lite_bigarray_grow(void **baseptr, size_t *sizeptr, size_t width, 153 | size_t count, size_t nadd) 154 | { 155 | void *base = *baseptr; 156 | size_t size = *sizeptr; 157 | int err; 158 | 159 | assert(count <= size); 160 | assert(width > 0); 161 | 162 | if (nadd <= size - count) { 163 | 
return 0; 164 | } 165 | 166 | if ((err = utf8lite_bigarray_size_add(&size, width, count, nadd))) { 167 | return err; 168 | } 169 | 170 | if (!(base = realloc(base, size * width))) { 171 | err = UTF8LITE_ERROR_NOMEM; 172 | //utf8lite_log(err, "failed allocating array"); 173 | return err; 174 | } 175 | 176 | *baseptr = base; 177 | *sizeptr = size; 178 | return 0; 179 | } 180 | -------------------------------------------------------------------------------- /src/char.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | 18 | #include 19 | #include "private/charwidth.h" 20 | #include "utf8lite.h" 21 | 22 | 23 | int utf8lite_charwidth(int32_t code) 24 | { 25 | int prop = charwidth(code); 26 | switch(prop) { 27 | case CHARWIDTH_NONE: 28 | return UTF8LITE_CHARWIDTH_NONE; 29 | case CHARWIDTH_IGNORABLE: 30 | return UTF8LITE_CHARWIDTH_IGNORABLE; 31 | case CHARWIDTH_MARK: 32 | return UTF8LITE_CHARWIDTH_MARK; 33 | case CHARWIDTH_NARROW: 34 | return UTF8LITE_CHARWIDTH_NARROW; 35 | case CHARWIDTH_AMBIGUOUS: 36 | return UTF8LITE_CHARWIDTH_AMBIGUOUS; 37 | case CHARWIDTH_WIDE: 38 | return UTF8LITE_CHARWIDTH_WIDE; 39 | case CHARWIDTH_EMOJI: 40 | return UTF8LITE_CHARWIDTH_EMOJI; 41 | default: 42 | assert(0 && "internal error: unrecognized charwidth property"); 43 | return prop; 44 | } 45 | } 46 | 47 | 48 | // TODO: use character class lookup table 49 | int utf8lite_isspace(int32_t code) 50 | { 51 | if (code <= 0x7F) { 52 | return (code == 0x20 || (0x09 <= code && code < 0x0E)); 53 | } else if (code <= 0x1FFF) { 54 | switch (code) { 55 | case 0x0085: 56 | case 0x00A0: 57 | case 0x1680: 58 | return 1; 59 | default: 60 | return 0; 61 | } 62 | } else if (code <= 0x200A) { 63 | return 1; 64 | } else if (code <= 0x3000) { 65 | switch (code) { 66 | case 0x2028: 67 | case 0x2029: 68 | case 0x202F: 69 | case 0x205F: 70 | case 0x3000: 71 | return 1; 72 | default: 73 | return 0; 74 | } 75 | } else { 76 | return 0; 77 | } 78 | } 79 | 80 | 81 | int utf8lite_isignorable(int32_t code) 82 | { 83 | int prop = utf8lite_charwidth(code); 84 | return (prop == UTF8LITE_CHARWIDTH_IGNORABLE); 85 | } 86 | -------------------------------------------------------------------------------- /src/encode.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include "utf8lite.h" 19 | 20 | /* 21 | Source: 22 | http://www.unicode.org/versions/Unicode7.0.0/UnicodeStandard-7.0.pdf 23 | page 124, 3.9 "Unicode Encoding Forms", "UTF-8" 24 | 25 | Table 3-7. Well-Formed UTF-8 Byte Sequences 26 | ----------------------------------------------------------------------------- 27 | | Code Points | First Byte | Second Byte | Third Byte | Fourth Byte | 28 | | U+0000..U+007F | 00..7F | | | | 29 | | U+0080..U+07FF | C2..DF | 80..BF | | | 30 | | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 31 | | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | 32 | | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 33 | | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | 34 | | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 35 | | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | 36 | | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 37 | ----------------------------------------------------------------------------- 38 | 39 | (table taken from https://github.com/JulienPalard/is_utf8 ) 40 | */ 41 | 42 | 43 | int utf8lite_scan_utf8(const uint8_t **bufptr, const uint8_t *end, 44 | struct utf8lite_message *msg) 45 | { 46 | const uint8_t *ptr = *bufptr; 47 | uint_fast8_t ch, ch1; 48 | unsigned nc; 49 | int err; 50 | 51 | assert(ptr < end); 52 | 53 | /* First byte 54 | * ---------- 55 | * 56 | * 1-byte sequence: 57 | * 00: 0000 0000 58 | * 7F: 0111 1111 59 | * (ch1 & 0x80 == 0) 60 | * 61 | * Invalid: 62 | * 80: 1000 0000 63 | * BF: 1011 1111 64 | * C0: 1100 0000 65 | * C1: 1100 0001 66 | * 
(ch & 0xF0 == 0x80 || ch == 0xC0 || ch == 0xC1) 67 | * 68 | * 2-byte sequence: 69 | * C2: 1100 0010 70 | * DF: 1101 1111 71 | * (ch & 0xE0 == 0xC0 && ch > 0xC1) 72 | * 73 | * 3-byte sequence 74 | * E0: 1110 0000 75 | * EF: 1110 1111 76 | * (ch & 0xF0 == E0) 77 | * 78 | * 4-byte sequence: 79 | * F0: 1111 0000 80 | * F4: 1111 0100 81 | * (ch & 0xFC == 0xF0 || ch == 0xF4) 82 | */ 83 | 84 | ch1 = *ptr++; 85 | 86 | if ((ch1 & 0x80) == 0) { 87 | goto success; 88 | } else if ((ch1 & 0xC0) == 0x80) { 89 | goto inval_lead; 90 | } else if ((ch1 & 0xE0) == 0xC0) { 91 | if (ch1 == 0xC0 || ch1 == 0xC1) { 92 | goto inval_lead; 93 | } 94 | nc = 1; 95 | } else if ((ch1 & 0xF0) == 0xE0) { 96 | nc = 2; 97 | } else if ((ch1 & 0xFC) == 0xF0 || ch1 == 0xF4) { 98 | nc = 3; 99 | } else { 100 | // expecting bytes in the following ranges: 00..7F C2..F4 101 | goto inval_lead; 102 | } 103 | 104 | // ensure string is long enough 105 | if (ptr + nc > end) { 106 | // expecting another continuation byte 107 | goto inval_incomplete; 108 | } 109 | 110 | /* First Continuation byte 111 | * ----------- 112 | * X + 80..BF: 113 | * 80: 1000 0000 114 | * BF: 1011 1111 115 | * (ch & 0xC0 == 0x80) 116 | * 117 | * E0 + A0..BF: 118 | * A0: 1010 0000 119 | * BF: 1011 1111 120 | * (ch & 0xE0 == 0xA0) 121 | * 122 | * ED + 80..9F: 123 | * 80: 1000 0000 124 | * 9F: 1001 1111 125 | * (ch & 0xE0 == 0x80) 126 | * 127 | * F0 + 90..BF: 128 | * 90: 1001 0000 129 | * BF: 1011 1111 130 | * (ch & 0xF0 == 0x90 || ch & 0xE0 == A0) 131 | * 132 | */ 133 | 134 | // validate the first continuation byte 135 | ch = *ptr++; 136 | switch (ch1) { 137 | case 0xE0: 138 | if ((ch & 0xE0) != 0xA0) { 139 | // expecting a byte between A0 and BF 140 | goto inval_cont; 141 | } 142 | break; 143 | case 0xED: 144 | if ((ch & 0xE0) != 0x80) { 145 | // expecting a byte between A0 and 9F 146 | goto inval_cont; 147 | } 148 | break; 149 | case 0xF0: 150 | if ((ch & 0xE0) != 0xA0 && (ch & 0xF0) != 0x90) { 151 | // expecting a byte between 90 and BF 
152 | goto inval_cont; 153 | } 154 | break; 155 | case 0xF4: 156 | if ((ch & 0xF0) != 0x80) { 157 | // expecting a byte between 80 and 8F 158 | goto inval_cont; 159 | } 160 | default: 161 | if ((ch & 0xC0) != 0x80) { 162 | // expecting a byte between 80 and BF 163 | goto inval_cont; 164 | } 165 | break; 166 | } 167 | nc--; 168 | 169 | // validate the trailing continuation bytes 170 | while (nc-- > 0) { 171 | ch = *ptr++; 172 | if ((ch & 0xC0) != 0x80) { 173 | // expecting a byte between 80 and BF 174 | goto inval_cont; 175 | } 176 | } 177 | 178 | success: 179 | err = 0; 180 | goto out; 181 | 182 | inval_incomplete: 183 | utf8lite_message_set(msg, "not enough continuation bytes" 184 | " after leading byte (0x%02X)", 185 | (unsigned)ch1); 186 | goto error; 187 | 188 | inval_lead: 189 | utf8lite_message_set(msg, "invalid leading byte (0x%02X)", 190 | (unsigned)ch1); 191 | goto error; 192 | 193 | inval_cont: 194 | utf8lite_message_set(msg, "leading byte 0x%02X followed by" 195 | " invalid continuation byte (0x%02X)", 196 | (unsigned)ch1, (unsigned)ch); 197 | goto error; 198 | 199 | error: 200 | ptr--; 201 | err = UTF8LITE_ERROR_INVAL; 202 | out: 203 | *bufptr = ptr; 204 | return err; 205 | } 206 | 207 | 208 | void utf8lite_decode_utf8(const uint8_t **bufptr, int32_t *codeptr) 209 | { 210 | const uint8_t *ptr = *bufptr; 211 | int32_t code; 212 | uint_fast8_t ch; 213 | unsigned nc; 214 | 215 | ch = *ptr++; 216 | if (!(ch & 0x80)) { 217 | code = ch; 218 | nc = 0; 219 | } else if (!(ch & 0x20)) { 220 | code = ch & 0x1F; 221 | nc = 1; 222 | } else if (!(ch & 0x10)) { 223 | code = ch & 0x0F; 224 | nc = 2; 225 | } else { 226 | code = ch & 0x07; 227 | nc = 3; 228 | } 229 | 230 | while (nc-- > 0) { 231 | ch = *ptr++; 232 | code = (code << 6) + (ch & 0x3F); 233 | } 234 | 235 | *bufptr = ptr; 236 | *codeptr = code; 237 | } 238 | 239 | 240 | // http://www.fileformat.info/info/unicode/utf8.htm 241 | void utf8lite_encode_utf8(int32_t code, uint8_t **bufptr) 242 | { 243 | uint8_t 
*ptr = *bufptr; 244 | int32_t x = code; 245 | 246 | if (x <= 0x7F) { 247 | *ptr++ = (uint8_t)x; 248 | } else if (x <= 0x07FF) { 249 | *ptr++ = (uint8_t)(0xC0 | (x >> 6)); 250 | *ptr++ = (uint8_t)(0x80 | (x & 0x3F)); 251 | } else if (x <= 0xFFFF) { 252 | *ptr++ = (uint8_t)(0xE0 | (x >> 12)); 253 | *ptr++ = (uint8_t)(0x80 | ((x >> 6) & 0x3F)); 254 | *ptr++ = (uint8_t)(0x80 | (x & 0x3F)); 255 | } else { 256 | *ptr++ = (uint8_t)(0xF0 | (x >> 18)); 257 | *ptr++ = (uint8_t)(0x80 | ((x >> 12) & 0x3F)); 258 | *ptr++ = (uint8_t)(0x80 | ((x >> 6) & 0x3F)); 259 | *ptr++ = (uint8_t)(0x80 | (x & 0x3F)); 260 | } 261 | 262 | *bufptr = ptr; 263 | } 264 | 265 | 266 | void utf8lite_rencode_utf8(int32_t code, uint8_t **bufptr) 267 | { 268 | uint8_t *ptr = *bufptr; 269 | int32_t x = code; 270 | 271 | if (x <= 0x7F) { 272 | *--ptr = (uint8_t)x; 273 | } else if (x <= 0x07FF) { 274 | *--ptr = (uint8_t)(0x80 | (x & 0x3F)); 275 | *--ptr = (uint8_t)(0xC0 | (x >> 6)); 276 | } else if (x <= 0xFFFF) { 277 | *--ptr = (uint8_t)(0x80 | (x & 0x3F)); 278 | *--ptr = (uint8_t)(0x80 | ((x >> 6) & 0x3F)); 279 | *--ptr = (uint8_t)(0xE0 | (x >> 12)); 280 | } else { 281 | *--ptr = (uint8_t)(0x80 | (x & 0x3F)); 282 | *--ptr = (uint8_t)(0x80 | ((x >> 6) & 0x3F)); 283 | *--ptr = (uint8_t)(0x80 | ((x >> 12) & 0x3F)); 284 | *--ptr = (uint8_t)(0xF0 | (x >> 18)); 285 | } 286 | 287 | *bufptr = ptr; 288 | } 289 | -------------------------------------------------------------------------------- /src/error.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include "utf8lite.h" 22 | 23 | 24 | void utf8lite_message_clear(struct utf8lite_message *msg) 25 | { 26 | if (msg) { 27 | msg->string[0] = '\0'; 28 | } 29 | } 30 | 31 | 32 | void utf8lite_message_set(struct utf8lite_message *msg, 33 | const char *fmt, ...) 34 | { 35 | va_list ap; 36 | 37 | if (msg) { 38 | va_start(ap, fmt); 39 | vsnprintf(msg->string, sizeof(msg->string), fmt, ap); 40 | va_end(ap); 41 | } 42 | } 43 | 44 | 45 | void utf8lite_message_append(struct utf8lite_message *msg, 46 | const char *fmt, ...) 47 | { 48 | size_t n, nmax; 49 | va_list ap; 50 | 51 | if (msg) { 52 | nmax = sizeof(msg->string); 53 | n = strlen(msg->string); 54 | assert(n <= nmax); 55 | 56 | va_start(ap, fmt); 57 | vsnprintf(msg->string + n, nmax - n, fmt, ap); 58 | va_end(ap); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/escape.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include "utf8lite.h" 19 | 20 | /* http://stackoverflow.com/a/11986885 */ 21 | #define hextoi(ch) ((ch > '9') ? (ch &~ 0x20) - 'A' + 10 : (ch - '0')) 22 | 23 | int utf8lite_scan_escape(const uint8_t **bufptr, const uint8_t *end, 24 | struct utf8lite_message *msg) 25 | { 26 | const uint8_t *input = *bufptr; 27 | const uint8_t *ptr = input; 28 | uint_fast8_t ch; 29 | int err; 30 | 31 | if (ptr == end) { 32 | goto error_incomplete; 33 | } 34 | 35 | ch = *ptr++; 36 | 37 | switch (ch) { 38 | case '"': 39 | case '\\': 40 | case '/': 41 | case 'b': 42 | case 'f': 43 | case 'n': 44 | case 'r': 45 | case 't': 46 | break; 47 | case 'u': 48 | if ((err = utf8lite_scan_uescape(&ptr, end, msg))) { 49 | goto out; 50 | } 51 | break; 52 | default: 53 | goto error_inval; 54 | } 55 | 56 | err = 0; 57 | goto out; 58 | 59 | error_incomplete: 60 | err = UTF8LITE_ERROR_INVAL; 61 | utf8lite_message_set(msg, "incomplete escape code (\\)"); 62 | goto out; 63 | 64 | error_inval: 65 | err = UTF8LITE_ERROR_INVAL; 66 | utf8lite_message_set(msg, "invalid escape code (\\%c)", ch); 67 | goto out; 68 | 69 | out: 70 | *bufptr = ptr; 71 | return err; 72 | } 73 | 74 | 75 | int utf8lite_scan_uescape(const uint8_t **bufptr, const uint8_t *end, 76 | struct utf8lite_message *msg) 77 | { 78 | const uint8_t *input = *bufptr; 79 | const uint8_t *ptr = input; 80 | int32_t code, low; 81 | uint_fast8_t ch; 82 | unsigned i; 83 | int err; 84 | 85 | if (ptr + 4 > end) { 86 | goto error_inval_incomplete; 87 | } 88 | 89 | code = 0; 90 | for (i = 
0; i < 4; i++) { 91 | ch = *ptr++; 92 | if (!isxdigit(ch)) { 93 | goto error_inval_hex; 94 | } 95 | code = (code << 4) + hextoi(ch); 96 | } 97 | 98 | if (UTF8LITE_IS_UTF16_HIGH(code)) { 99 | if (ptr + 6 > end || ptr[0] != '\\' || ptr[1] != 'u') { 100 | goto error_inval_nolow; 101 | } 102 | ptr += 2; 103 | input = ptr; 104 | 105 | low = 0; 106 | for (i = 0; i < 4; i++) { 107 | ch = *ptr++; 108 | if (!isxdigit(ch)) { 109 | goto error_inval_hex; 110 | } 111 | low = (low << 4) + hextoi(ch); 112 | } 113 | if (!UTF8LITE_IS_UTF16_LOW(low)) { 114 | ptr -= 6; 115 | goto error_inval_low; 116 | } 117 | } else if (UTF8LITE_IS_UTF16_LOW(code)) { 118 | goto error_inval_nohigh; 119 | } 120 | 121 | err = 0; 122 | goto out; 123 | 124 | error_inval_incomplete: 125 | err = UTF8LITE_ERROR_INVAL; 126 | utf8lite_message_set(msg, "incomplete escape code (\\u%.*s)", 127 | (int)(end - input), input); 128 | goto out; 129 | 130 | error_inval_hex: 131 | err = UTF8LITE_ERROR_INVAL; 132 | utf8lite_message_set(msg, "invalid hex value in escape code (\\u%.*s)", 133 | 4, input); 134 | goto out; 135 | 136 | error_inval_nolow: 137 | err = UTF8LITE_ERROR_INVAL; 138 | utf8lite_message_set(msg, "missing UTF-16 low surrogate" 139 | " after high surrogate escape code (\\u%.*s)", 140 | 4, input); 141 | goto out; 142 | 143 | error_inval_low: 144 | err = UTF8LITE_ERROR_INVAL; 145 | utf8lite_message_set(msg, "invalid UTF-16 low surrogate (\\u%.*s)" 146 | " after high surrogate escape code (\\u%.*s)", 147 | 4, input, 4, input - 6); 148 | goto out; 149 | 150 | error_inval_nohigh: 151 | err = UTF8LITE_ERROR_INVAL; 152 | utf8lite_message_set(msg, "missing UTF-16 high surrogate" 153 | " before low surrogate escape code (\\u%.*s)", 154 | 4, input); 155 | goto out; 156 | 157 | out: 158 | *bufptr = ptr; 159 | return err; 160 | } 161 | 162 | 163 | void utf8lite_decode_uescape(const uint8_t **inputptr, int32_t *codeptr) 164 | { 165 | const uint8_t *ptr = *inputptr; 166 | int32_t code; 167 | uint_fast16_t low; 168 | 
uint_fast8_t ch; 169 | unsigned i; 170 | 171 | code = 0; 172 | for (i = 0; i < 4; i++) { 173 | ch = *ptr++; 174 | code = (code << 4) + hextoi(ch); 175 | } 176 | 177 | if (UTF8LITE_IS_UTF16_HIGH(code)) { 178 | // skip over \u 179 | ptr += 2; 180 | 181 | low = 0; 182 | for (i = 0; i < 4; i++) { 183 | ch = *ptr++; 184 | low = (uint_fast16_t)(low << 4) + hextoi(ch); 185 | } 186 | 187 | code = UTF8LITE_DECODE_UTF16_PAIR(code, low); 188 | } 189 | 190 | *codeptr = code; 191 | *inputptr = ptr; 192 | } 193 | 194 | 195 | void utf8lite_decode_escape(const uint8_t **inputptr, int32_t *codeptr) 196 | { 197 | const uint8_t *ptr = *inputptr; 198 | int32_t code; 199 | 200 | code = *ptr++; 201 | 202 | switch (code) { 203 | case 'b': 204 | code = '\b'; 205 | break; 206 | case 'f': 207 | code = '\f'; 208 | break; 209 | case 'n': 210 | code = '\n'; 211 | break; 212 | case 'r': 213 | code = '\r'; 214 | break; 215 | case 't': 216 | code = '\t'; 217 | break; 218 | case 'u': 219 | *inputptr = ptr; 220 | utf8lite_decode_uescape(inputptr, codeptr); 221 | return; 222 | default: 223 | break; 224 | } 225 | 226 | *inputptr = ptr; 227 | *codeptr = code; 228 | } 229 | -------------------------------------------------------------------------------- /src/graph.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include "utf8lite.h" 19 | 20 | /* 21 | width 22 | graph Ambiguous Emoji Ignorable Narrow Mark None Wide 23 | Control 0 0 3809 0 0 2116 0 24 | CR 0 0 0 0 0 1 0 25 | EBase 0 98 0 0 0 0 0 26 | EBaseGAZ 0 4 0 0 0 0 0 27 | EModifier 0 5 0 0 0 0 0 28 | Extend 0 0 359 26 1514 0 2 29 | GlueAfterZWJ 2 20 0 0 0 0 0 30 | L 0 0 1 0 0 0 124 31 | LF 0 0 0 0 0 1 0 32 | LV 0 0 0 0 0 0 399 33 | LVT 0 0 0 0 0 0 10773 34 | Other 882 996 2 21606 0 971540 99206 35 | Prepend 0 0 0 9 10 0 0 36 | RegionalIndicator 0 26 0 0 0 0 0 37 | SpacingMark 0 0 0 348 0 0 0 38 | T 0 0 0 137 0 0 0 39 | V 0 0 1 94 0 0 0 40 | ZWJ 0 0 1 0 0 0 0 41 | */ 42 | 43 | 44 | static int ascii_width(int32_t ch, int flags); 45 | static int utf8_escape_width(int32_t ch, int flags); 46 | static int utf8_width(int32_t ch, int cw, int flags); 47 | 48 | 49 | int utf8lite_graph_measure(const struct utf8lite_graph *g, 50 | int flags, int *widthptr) 51 | { 52 | struct utf8lite_text_iter it; 53 | int32_t ch; 54 | int err = 0, cw, w, width; 55 | 56 | width = 0; 57 | utf8lite_text_iter_make(&it, &g->text); 58 | 59 | while (utf8lite_text_iter_advance(&it)) { 60 | ch = it.current; 61 | 62 | if (ch <= 0x7F) { 63 | w = ascii_width(ch, flags); 64 | } else if (flags & UTF8LITE_ESCAPE_UTF8) { 65 | w = utf8_escape_width(ch, flags); 66 | } else if ((flags & UTF8LITE_ESCAPE_EXTENDED) 67 | && (ch > 0xFFFF)) { 68 | w = utf8_escape_width(ch, flags); 69 | } else { 70 | cw = utf8lite_charwidth(ch); 71 | if (cw == UTF8LITE_CHARWIDTH_EMOJI) { 72 | width = 2; 73 | goto exit; 74 | } 75 | w = utf8_width(ch, cw, flags); 76 | } 77 | 78 | if (w < 0) { 79 | width = w; 80 | goto exit; 81 | } else if (w > INT_MAX - width) { 82 | width = -1; 83 | err = UTF8LITE_ERROR_OVERFLOW; 84 | goto exit; 85 | } else { 86 | width += w; 87 | } 88 | } 89 | 90 | exit: 91 | if (widthptr) { 92 | *widthptr = width; 93 | } 94 | return err; 95 | } 96 | 97 | 98 | int ascii_width(int32_t ch, int flags) 99 | { 100 | // handle control characters 
101 | if (ch <= 0x1F || ch == 0x7F) { 102 | if (!(flags & UTF8LITE_ESCAPE_CONTROL)) { 103 | return -1; 104 | } 105 | 106 | switch (ch) { 107 | case '\a': 108 | case '\v': 109 | // \u0007, \u000b (JSON) : \a, \b (C) 110 | return (flags & UTF8LITE_ENCODE_JSON) ? 6 : 2; 111 | case '\b': 112 | case '\f': 113 | case '\n': 114 | case '\r': 115 | case '\t': 116 | return 2; 117 | default: 118 | return 6; // \uXXXX 119 | } 120 | } 121 | 122 | // handle printable characters 123 | switch (ch) { 124 | case '\"': 125 | return (flags & UTF8LITE_ESCAPE_DQUOTE) ? 2 : 1; 126 | case '\'': 127 | return (flags & UTF8LITE_ESCAPE_SQUOTE) ? 2 : 1; 128 | case '\\': 129 | if (flags & (UTF8LITE_ESCAPE_CONTROL 130 | | UTF8LITE_ESCAPE_DQUOTE 131 | | UTF8LITE_ESCAPE_SQUOTE 132 | | UTF8LITE_ESCAPE_EXTENDED 133 | | UTF8LITE_ESCAPE_UTF8)) { 134 | return 2; 135 | } else { 136 | return 1; 137 | } 138 | default: 139 | return 1; 140 | } 141 | } 142 | 143 | 144 | int utf8_width(int32_t ch, int cw, int flags) 145 | { 146 | int w = -1; 147 | 148 | switch ((enum utf8lite_charwidth_type)cw) { 149 | case UTF8LITE_CHARWIDTH_NONE: 150 | if (flags & UTF8LITE_ESCAPE_CONTROL) { 151 | w = utf8_escape_width(ch, flags); 152 | } else { 153 | w = -1; 154 | } 155 | break; 156 | 157 | case UTF8LITE_CHARWIDTH_IGNORABLE: 158 | case UTF8LITE_CHARWIDTH_MARK: 159 | w = 0; 160 | break; 161 | 162 | case UTF8LITE_CHARWIDTH_NARROW: 163 | w = 1; 164 | break; 165 | 166 | case UTF8LITE_CHARWIDTH_AMBIGUOUS: 167 | w = (flags & UTF8LITE_ENCODE_AMBIGWIDE) ? 
2 : 1; 168 | break; 169 | 170 | case UTF8LITE_CHARWIDTH_WIDE: 171 | case UTF8LITE_CHARWIDTH_EMOJI: 172 | w = 2; 173 | break; 174 | } 175 | 176 | return w; 177 | } 178 | 179 | 180 | int utf8_escape_width(int32_t ch, int flags) 181 | { 182 | if (ch <= 0xFFFF) { 183 | return 6; 184 | } else if (flags & UTF8LITE_ENCODE_JSON) { 185 | return 12; 186 | } else { 187 | return 10; 188 | } 189 | } 190 | -------------------------------------------------------------------------------- /src/normalize.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include "private/casefold.h" 19 | #include "private/compose.h" 20 | #include "private/combining.h" 21 | #include "private/decompose.h" 22 | #include "utf8lite.h" 23 | 24 | /* From Unicode-8.0 Section 3.12 Conjoining Jamo Behavior */ 25 | #define HANGUL_SBASE 0xAC00 26 | #define HANGUL_LBASE 0x1100 27 | #define HANGUL_VBASE 0x1161 28 | #define HANGUL_TBASE 0x11A7 29 | #define HANGUL_LCOUNT 19 30 | #define HANGUL_VCOUNT 21 31 | #define HANGUL_TCOUNT 28 32 | #define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT) 33 | #define HANTUL_SCOUNT (HANGUL_LCOUNT * HANGUL_NCOUNT) 34 | 35 | 36 | static void hangul_decompose(int32_t code, int32_t **bufp) 37 | { 38 | int32_t *dst = *bufp; 39 | int32_t sindex = code - HANGUL_SBASE; 40 | int32_t lindex = sindex / HANGUL_NCOUNT; 41 | int32_t vindex = (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT; 42 | int32_t tindex = sindex % HANGUL_TCOUNT; 43 | int32_t lpart = HANGUL_LBASE + lindex; 44 | int32_t vpart = HANGUL_VBASE + vindex; 45 | int32_t tpart = HANGUL_TBASE + tindex; 46 | 47 | *dst++ = lpart; 48 | *dst++ = vpart; 49 | if (tindex > 0) { 50 | *dst++ = tpart; 51 | } 52 | 53 | *bufp = dst; 54 | } 55 | 56 | 57 | static int is_hangul_vpart(int32_t code) 58 | { 59 | return (HANGUL_VBASE <= code && code < HANGUL_VBASE + HANGUL_VCOUNT); 60 | } 61 | 62 | 63 | 64 | static int is_hangul_tpart(int32_t code) 65 | { 66 | // strict less-than on lower bound 67 | return (HANGUL_TBASE < code && code < HANGUL_TBASE + HANGUL_TCOUNT); 68 | } 69 | 70 | 71 | static int32_t hangul_compose_lv(int32_t lpart, int32_t vpart) 72 | { 73 | int32_t lindex = lpart - HANGUL_LBASE; 74 | int32_t vindex = vpart - HANGUL_VBASE; 75 | int32_t lvindex = lindex * HANGUL_NCOUNT + vindex * HANGUL_TCOUNT; 76 | int32_t s = HANGUL_SBASE + lvindex; 77 | return s; 78 | } 79 | 80 | 81 | static int32_t hangul_compose_lvt(int32_t lvpart, int32_t tpart) 82 | { 83 | int32_t tindex = tpart - HANGUL_TBASE; 84 | int32_t s = lvpart + tindex; 85 | return s; 86 | 
} 87 | 88 | 89 | static void casefold(int type, int32_t code, int32_t **bufp) 90 | { 91 | const int32_t block_size = CASEFOLD_BLOCK_SIZE; 92 | unsigned i = casefold_stage1[code / block_size]; 93 | struct casefold c = casefold_stage2[i][code % block_size]; 94 | unsigned length = c.length; 95 | const int32_t *src; 96 | int32_t *dst; 97 | 98 | if (length == 0) { 99 | dst = *bufp; 100 | *dst++ = code; 101 | *bufp = dst; 102 | } else if (length == 1) { 103 | utf8lite_map(type, (int32_t)c.data, bufp); 104 | } else { 105 | src = &casefold_mapping[c.data]; 106 | while (length-- > 0) { 107 | utf8lite_map(type, *src, bufp); 108 | src++; 109 | } 110 | } 111 | } 112 | 113 | 114 | 115 | void utf8lite_map(int type, int32_t code, int32_t **bufptr) 116 | { 117 | const int32_t block_size = DECOMPOSITION_BLOCK_SIZE; 118 | unsigned i = decomposition_stage1[code / block_size]; 119 | struct decomposition d = decomposition_stage2[i][code % block_size]; 120 | unsigned length = d.length; 121 | const int32_t *src; 122 | int32_t *dst; 123 | 124 | if (length == 0 || (d.type > 0 && !(type & (1 << (d.type - 1))))) { 125 | if (type & UTF8LITE_CASEFOLD_ALL) { 126 | casefold(type, code, bufptr); 127 | } else { 128 | dst = *bufptr; 129 | *dst++ = code; 130 | *bufptr = dst; 131 | } 132 | } else if (length == 1) { 133 | utf8lite_map(type, d.data, bufptr); 134 | } else if (d.type >= 0) { 135 | src = &decomposition_mapping[d.data]; 136 | while (length-- > 0) { 137 | utf8lite_map(type, *src, bufptr); 138 | src++; 139 | } 140 | } else { 141 | hangul_decompose(code, bufptr); 142 | } 143 | } 144 | 145 | 146 | void utf8lite_order(int32_t *ptr, size_t len) 147 | { 148 | int32_t *end = ptr + len; 149 | int32_t *c_begin, *c_end, *c_tail, *c_ptr; 150 | int32_t code, code_prev; 151 | int32_t cl, cl_prev; 152 | 153 | while (ptr != end) { 154 | c_begin = ptr; 155 | code = *ptr++; 156 | cl = combining_class(code); 157 | 158 | // skip to the next combining mark 159 | if (cl == 0) { 160 | continue; 161 | } 162 | 163 
| // It takes 21 bits to encode a codepoint and 8 bits 164 | // to encode c combining class. 165 | // Mark the start of the combining mark sequence (c_begin) 166 | // encode the combining class in bits 22-29. 167 | *c_begin = code | (cl << UTF8LITE_CODE_BITS); 168 | 169 | // the combining mark sequence ends at the first starter 170 | // (c_end) 171 | c_end = ptr; 172 | while (c_end != end) { 173 | // until we hit a non-starter, encode the combining 174 | // class in the high 8 bits of the code 175 | code = *ptr++; 176 | cl = combining_class(code); 177 | if (cl == 0) { 178 | break; 179 | } 180 | 181 | *c_end = code | (cl << UTF8LITE_CODE_BITS); 182 | c_end++; 183 | } 184 | 185 | // sort the combining marks, using insertion sort (stable) 186 | for (c_tail = c_begin + 1; c_tail != c_end; c_tail++) { 187 | c_ptr = c_tail; 188 | code = *c_ptr; 189 | cl = code & (0xFF << UTF8LITE_CODE_BITS); 190 | 191 | while (c_ptr != c_begin) { 192 | code_prev = c_ptr[-1]; 193 | cl_prev = (code_prev 194 | & (0xFF << UTF8LITE_CODE_BITS)); 195 | 196 | if (cl_prev <= cl) { 197 | break; 198 | } 199 | 200 | // swap with previous item 201 | c_ptr[0] = code_prev; 202 | 203 | // move down 204 | c_ptr--; 205 | } 206 | 207 | // complete the final swap 208 | *c_ptr = code; 209 | } 210 | 211 | // remove the combining mark annotations 212 | while (c_begin != c_end) { 213 | code = *c_begin; 214 | *c_begin = code & (~(0xFF << UTF8LITE_CODE_BITS)); 215 | c_begin++; 216 | } 217 | } 218 | } 219 | 220 | 221 | 222 | 223 | static int has_compose(int32_t code, int *offsetptr, int *lengthptr) 224 | { 225 | const int32_t block_size = COMPOSITION_BLOCK_SIZE; 226 | unsigned i = composition_stage1[code / block_size]; 227 | struct composition c = composition_stage2[i][code % block_size]; 228 | int offset = (int)c.offset; 229 | int length = (int)c.length; 230 | 231 | *offsetptr = offset; 232 | *lengthptr = length; 233 | 234 | return (length > 0 ? 
1 : 0); 235 | } 236 | 237 | 238 | static int code_cmp(const void *x1, const void *x2) 239 | { 240 | int32_t y1 = *(const int32_t *)x1; 241 | int32_t y2 = *(const int32_t *)x2; 242 | 243 | if (y1 < y2) { 244 | return -1; 245 | } else if (y1 > y2) { 246 | return +1; 247 | } else { 248 | return 0; 249 | } 250 | } 251 | 252 | 253 | static int combiner_find(int offset, int length, int32_t code) 254 | { 255 | const int32_t *base = composition_combiner + offset; 256 | const int32_t *ptr; 257 | 258 | // handle empty and singleton case 259 | if (length == 0) { 260 | return -1; 261 | } else if (length == 1) { 262 | return (*base == code) ? 0 : -1; 263 | } 264 | 265 | // handle general case 266 | ptr = bsearch(&code, base, (size_t)length, sizeof(*base), code_cmp); 267 | 268 | if (ptr == NULL) { 269 | return -1; 270 | } else { 271 | return (int)(ptr - base); 272 | } 273 | } 274 | 275 | 276 | static int has_combiner(int32_t left, int offset, int length, int32_t code, 277 | int32_t *primaryptr) 278 | { 279 | int i; 280 | 281 | if (offset < COMPOSITION_HANGUL_LPART) { 282 | i = combiner_find(offset, length, code); 283 | if (i >= 0) { 284 | *primaryptr = composition_primary[offset + i]; 285 | return 1; 286 | } 287 | } else if (offset == COMPOSITION_HANGUL_LPART) { 288 | if (is_hangul_vpart(code)) { 289 | *primaryptr = hangul_compose_lv(left, code); 290 | return 1; 291 | } 292 | } else if (offset == COMPOSITION_HANGUL_LVPART) { 293 | if (is_hangul_tpart(code)) { 294 | *primaryptr = hangul_compose_lvt(left, code); 295 | return 1; 296 | } 297 | } 298 | 299 | return 0; 300 | } 301 | 302 | 303 | void utf8lite_compose(int32_t *ptr, size_t *lenptr) 304 | { 305 | size_t len = *lenptr; 306 | int32_t *begin = ptr; 307 | int32_t *end = begin + len; 308 | int32_t *leftptr, *dst; 309 | int32_t left = 0, code, prim; 310 | uint8_t code_ccc, prev_ccc = 0; 311 | int moff = 0, mlen = 0; 312 | int blocked, has_prev, did_del; 313 | 314 | did_del = 0; 315 | 316 | // find the first combining starter 
(the left code point, L) 317 | leftptr = begin; 318 | while (leftptr != end) { 319 | left = *leftptr; 320 | if (has_compose(left, &moff, &mlen)) { 321 | break; 322 | } 323 | leftptr++; 324 | } 325 | 326 | if (leftptr == end) { 327 | goto out; 328 | } 329 | 330 | ptr = leftptr + 1; 331 | has_prev = 0; 332 | while (ptr != end) { 333 | code = *ptr; 334 | code_ccc = combining_class(code); 335 | 336 | // determine whether the code is blocked 337 | if (has_prev && prev_ccc >= code_ccc) { 338 | blocked = 1; 339 | } else { 340 | blocked = 0; 341 | } 342 | 343 | if (!blocked && has_combiner(left, moff, mlen, code, &prim)) { 344 | // replace L by P 345 | *leftptr = prim; 346 | left = prim; 347 | has_compose(left, &moff, &mlen); 348 | 349 | // delete C 350 | *ptr = UTF8LITE_CODE_NONE; 351 | did_del = 1; 352 | } else if (code_ccc == 0) { 353 | // new leftmost combining starter, L 354 | leftptr = ptr; 355 | left = code; 356 | has_compose(left, &moff, &mlen); 357 | has_prev = 0; 358 | } else { 359 | prev_ccc = code_ccc; 360 | has_prev = 1; 361 | } 362 | ptr++; 363 | } 364 | 365 | // remove the deleted entries 366 | if (did_del) { 367 | ptr = begin; 368 | dst = begin; 369 | while (ptr != end) { 370 | code = *ptr++; 371 | if (code != UTF8LITE_CODE_NONE) { 372 | *dst++ = code; 373 | } 374 | } 375 | len = (size_t)(dst - begin); 376 | } 377 | 378 | out: 379 | *lenptr = len; 380 | } 381 | -------------------------------------------------------------------------------- /src/private/array.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #ifndef UTF8LITE_ARRAY_H 18 | #define UTF8LITE_ARRAY_H 19 | 20 | /** 21 | * \file array.h 22 | * 23 | * Dynamic array, growing to accommodate more elements. 24 | */ 25 | 26 | #include 27 | 28 | /** 29 | * Grow an array to accommodate more elements, possibly re-allocating. 30 | * 31 | * \param baseptr pointer to pointer to first element 32 | * \param sizeptr pointer to the capacity (in elements) of the array 33 | * \param width size of each element 34 | * \param count number of occupied elements 35 | * \param nadd number of elements to append after the `count` occupied 36 | * elements 37 | * 38 | * \returns 0 on success 39 | */ 40 | int utf8lite_array_grow(void **baseptr, int *sizeptr, size_t width, int count, 41 | int nadd); 42 | 43 | /** 44 | * Determine the capacity for an array that needs to grow. 45 | * 46 | * \param sizeptr pointer to the capacity (in elements) of the array 47 | * \param width size of each element 48 | * \param count number of occupied elements 49 | * \param nadd number of elements to append after the `count` occupied 50 | * elements 51 | * 52 | * \returns 0 on success, `UTF8LITE_ERROR_OVERFLOW` on overflow 53 | */ 54 | int utf8lite_array_size_add(int *sizeptr, size_t width, int count, int nadd); 55 | 56 | /** 57 | * Grow an big array to accommodate more elements, possibly re-allocating. 
58 | * 59 | * \param baseptr pointer to pointer to first element 60 | * \param sizeptr pointer to the capacity (in elements) of the array 61 | * \param width size of each element 62 | * \param count number of occupied elements 63 | * \param nadd number of elements to append after the `count` occupied 64 | * elements 65 | * 66 | * \returns 0 on success 67 | */ 68 | int utf8lite_bigarray_grow(void **baseptr, size_t *sizeptr, size_t width, 69 | size_t count, size_t nadd); 70 | 71 | /** 72 | * Determine the capacity for an array that needs to grow. 73 | * 74 | * \param sizeptr pointer to the capacity (in elements) of the array 75 | * \param width size of each element 76 | * \param count number of occupied elements 77 | * \param nadd number of elements to append after the `count` occupied 78 | * elements 79 | * 80 | * \returns 0 on success, `UTF8LITE_ERROR_OVERFLOW` on overflow 81 | */ 82 | int utf8lite_bigarray_size_add(size_t *sizeptr, size_t width, size_t count, 83 | size_t nadd); 84 | 85 | #endif /* UTF8LITE_ARRAY_H */ 86 | -------------------------------------------------------------------------------- /src/text.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include "utf8lite.h" 22 | 23 | 24 | int utf8lite_text_init_copy(struct utf8lite_text *text, 25 | const struct utf8lite_text *other) 26 | { 27 | size_t size = UTF8LITE_TEXT_SIZE(other); 28 | size_t attr = other->attr; 29 | 30 | if (other->ptr) { 31 | if (!(text->ptr = malloc(size + 1))) { 32 | return UTF8LITE_ERROR_NOMEM; 33 | } 34 | 35 | memcpy(text->ptr, other->ptr, size); 36 | text->ptr[size] = '\0'; 37 | } else { 38 | text->ptr = NULL; 39 | } 40 | text->attr = attr; 41 | return 0; 42 | } 43 | 44 | 45 | void utf8lite_text_destroy(struct utf8lite_text *text) 46 | { 47 | free(text->ptr); 48 | } 49 | 50 | 51 | int utf8lite_text_isascii(const struct utf8lite_text *text) 52 | { 53 | struct utf8lite_text_iter it; 54 | 55 | utf8lite_text_iter_make(&it, text); 56 | while (utf8lite_text_iter_advance(&it)) { 57 | if (!UTF8LITE_IS_ASCII(it.current)) { 58 | return 0; 59 | } 60 | } 61 | return 1; 62 | } 63 | 64 | 65 | // Dan Bernstein's djb2 XOR hash: http://www.cse.yorku.ca/~oz/hash.html 66 | #define HASH_SEED 5381 67 | #define HASH_COMBINE(seed, v) (((hash) << 5) + (hash)) ^ ((size_t)(v)) 68 | 69 | 70 | static size_t hash_raw(const struct utf8lite_text *text) 71 | { 72 | const uint8_t *ptr = text->ptr; 73 | const uint8_t *end = ptr + UTF8LITE_TEXT_SIZE(text); 74 | size_t hash = HASH_SEED; 75 | size_t ch; 76 | 77 | while (ptr != end) { 78 | ch = *ptr++; 79 | hash = HASH_COMBINE(hash, ch); 80 | } 81 | 82 | return hash; 83 | } 84 | 85 | 86 | size_t utf8lite_text_hash(const struct utf8lite_text *text) 87 | { 88 | uint8_t buf[4]; 89 | const uint8_t *ptr = text->ptr; 90 | const uint8_t *end = ptr + UTF8LITE_TEXT_SIZE(text); 91 | uint8_t *bufptr, *bufend; 92 | size_t hash = HASH_SEED; 93 | int32_t code; 94 | uint_fast8_t ch; 95 | 96 | if (!UTF8LITE_TEXT_HAS_ESC(text)) { 97 | return hash_raw(text); 98 | } 99 | 100 | while (ptr != end) { 101 | ch = *ptr++; 102 | if (ch == '\\') { 103 | 
utf8lite_decode_escape(&ptr, &code); 104 | 105 | bufptr = buf; 106 | bufend = bufptr; 107 | utf8lite_encode_utf8(code, &bufend); 108 | 109 | while (bufptr != bufend) { 110 | ch = *bufptr++; 111 | hash = HASH_COMBINE(hash, ch); 112 | } 113 | } else { 114 | hash = HASH_COMBINE(hash, ch); 115 | } 116 | } 117 | 118 | return hash; 119 | } 120 | 121 | 122 | int utf8lite_text_equals(const struct utf8lite_text *text1, 123 | const struct utf8lite_text *text2) 124 | { 125 | struct utf8lite_text_iter it1, it2; 126 | size_t n; 127 | 128 | if (text1->attr == text2->attr) { 129 | // same bits and size 130 | n = UTF8LITE_TEXT_SIZE(text1); 131 | return !memcmp(text1->ptr, text2->ptr, n); 132 | } else if (UTF8LITE_TEXT_BITS(text1) == UTF8LITE_TEXT_BITS(text2)) { 133 | // same bits, different size 134 | return 0; 135 | } else { 136 | // different bits or different size 137 | utf8lite_text_iter_make(&it1, text1); 138 | utf8lite_text_iter_make(&it2, text2); 139 | while (utf8lite_text_iter_advance(&it1)) { 140 | utf8lite_text_iter_advance(&it2); 141 | if (it1.current != it2.current) { 142 | return 0; 143 | } 144 | } 145 | return !utf8lite_text_iter_advance(&it2); 146 | } 147 | } 148 | 149 | 150 | static int compare_raw(const struct utf8lite_text *text1, 151 | const struct utf8lite_text *text2) 152 | { 153 | size_t n1 = UTF8LITE_TEXT_SIZE(text1); 154 | size_t n2 = UTF8LITE_TEXT_SIZE(text2); 155 | size_t n = (n1 < n2) ? 
n1 : n2; 156 | int cmp; 157 | 158 | cmp = memcmp(text1->ptr, text2->ptr, n); 159 | if (cmp == 0) { 160 | if (n1 < n2) { 161 | cmp = -1; 162 | } else if (n1 == n2) { 163 | cmp = 0; 164 | } else { 165 | cmp = +1; 166 | } 167 | } 168 | return cmp; 169 | } 170 | 171 | 172 | int utf8lite_text_compare(const struct utf8lite_text *text1, 173 | const struct utf8lite_text *text2) 174 | { 175 | struct utf8lite_text_iter it1, it2; 176 | 177 | if (!UTF8LITE_TEXT_HAS_ESC(text1) && !UTF8LITE_TEXT_HAS_ESC(text2)) { 178 | return compare_raw(text1, text2); 179 | } 180 | 181 | utf8lite_text_iter_make(&it1, text1); 182 | utf8lite_text_iter_make(&it2, text2); 183 | while (utf8lite_text_iter_advance(&it1)) { 184 | utf8lite_text_iter_advance(&it2); 185 | if (it1.current < it2.current) { 186 | return -1; 187 | } else if (it1.current > it2.current) { 188 | return +1; 189 | } 190 | } 191 | 192 | return utf8lite_text_iter_advance(&it2) ? -1 : 0; 193 | } 194 | -------------------------------------------------------------------------------- /src/textassign.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include "utf8lite.h" 22 | 23 | 24 | static int assign_esc(struct utf8lite_text *text, 25 | const uint8_t *ptr, size_t size, 26 | struct utf8lite_message *msg); 27 | static void assign_esc_unsafe(struct utf8lite_text *text, const uint8_t *ptr, 28 | size_t size); 29 | static int assign_raw(struct utf8lite_text *text, const uint8_t *ptr, 30 | size_t size, struct utf8lite_message *msg); 31 | static void assign_raw_unsafe(struct utf8lite_text *text, const uint8_t *ptr, 32 | size_t size); 33 | 34 | static void append_location(struct utf8lite_message *msg, size_t offset); 35 | 36 | 37 | int utf8lite_text_assign(struct utf8lite_text *text, const uint8_t *ptr, 38 | size_t size, int flags, struct utf8lite_message *msg) 39 | { 40 | int err = 0; 41 | 42 | if (size > UTF8LITE_TEXT_SIZE_MAX) { 43 | err = UTF8LITE_ERROR_OVERFLOW; 44 | utf8lite_message_set(msg, "text size (%"PRIu64" bytes)" 45 | " exceeds maximum (%"PRIu64" bytes)", 46 | (uint64_t)size, 47 | (uint64_t)UTF8LITE_TEXT_SIZE_MAX); 48 | } else if (flags & UTF8LITE_TEXT_UNESCAPE) { 49 | if (flags & UTF8LITE_TEXT_VALID) { 50 | assign_esc_unsafe(text, ptr, size); 51 | } else { 52 | err = assign_esc(text, ptr, size, msg); 53 | } 54 | } else { 55 | if (flags & UTF8LITE_TEXT_VALID) { 56 | assign_raw_unsafe(text, ptr, size); 57 | } else { 58 | err = assign_raw(text, ptr, size, msg); 59 | } 60 | } 61 | 62 | if (err) { 63 | text->ptr = NULL; 64 | text->attr = 0; 65 | } 66 | 67 | return err; 68 | } 69 | 70 | 71 | int assign_raw(struct utf8lite_text *text, const uint8_t *ptr, size_t size, 72 | struct utf8lite_message *msg) 73 | { 74 | const uint8_t *input = ptr; 75 | const uint8_t *end = ptr + size; 76 | uint_fast8_t ch; 77 | int err; 78 | 79 | text->ptr = (uint8_t *)ptr; 80 | 81 | while (ptr != end) { 82 | ch = *ptr++; 83 | if (ch & 0x80) { 84 | ptr--; 85 | if ((err = utf8lite_scan_utf8(&ptr, end, msg))) { 86 | goto error; 87 | } 88 | } 89 | } 90 | 91 | 
text->attr = size; 92 | return 0; 93 | 94 | error: 95 | append_location(msg, (size_t)(ptr - input)); 96 | text->ptr = NULL; 97 | text->attr = 0; 98 | return err; 99 | } 100 | 101 | 102 | int assign_esc(struct utf8lite_text *text, const uint8_t *ptr, size_t size, 103 | struct utf8lite_message *msg) 104 | { 105 | const uint8_t *input = ptr; 106 | const uint8_t *end = ptr + size; 107 | size_t attr = 0; 108 | uint_fast8_t ch; 109 | int err; 110 | 111 | text->ptr = (uint8_t *)ptr; 112 | 113 | while (ptr != end) { 114 | ch = *ptr++; 115 | if (ch == '\\') { 116 | attr |= UTF8LITE_TEXT_ESC_BIT; 117 | 118 | if ((err = utf8lite_scan_escape(&ptr, end, msg))) { 119 | goto error; 120 | } 121 | } else if (ch & 0x80) { 122 | ptr--; 123 | if ((err = utf8lite_scan_utf8(&ptr, end, msg))) { 124 | goto error; 125 | } 126 | } 127 | } 128 | 129 | attr |= size; 130 | text->attr = attr; 131 | return 0; 132 | 133 | error: 134 | append_location(msg, (size_t)(ptr - input)); 135 | return err; 136 | } 137 | 138 | 139 | void assign_raw_unsafe(struct utf8lite_text *text, const uint8_t *ptr, 140 | size_t size) 141 | { 142 | const uint8_t *end = ptr + size; 143 | uint_fast8_t ch; 144 | 145 | text->ptr = (uint8_t *)ptr; 146 | 147 | while (ptr != end) { 148 | ch = *ptr++; 149 | if (ch & 0x80) { 150 | ptr += UTF8LITE_UTF8_TAIL_LEN(ch); 151 | } 152 | } 153 | 154 | text->attr = size; 155 | } 156 | 157 | 158 | 159 | void assign_esc_unsafe(struct utf8lite_text *text, const uint8_t *ptr, 160 | size_t size) 161 | { 162 | const uint8_t *end = ptr + size; 163 | size_t attr = 0; 164 | int32_t code; 165 | uint_fast8_t ch; 166 | 167 | text->ptr = (uint8_t *)ptr; 168 | 169 | while (ptr != end) { 170 | ch = *ptr++; 171 | if (ch == '\\') { 172 | attr |= UTF8LITE_TEXT_ESC_BIT; 173 | ch = *ptr++; 174 | 175 | switch (ch) { 176 | case 'u': 177 | utf8lite_decode_uescape(&ptr, &code); 178 | break; 179 | default: 180 | break; 181 | } 182 | } else if (ch & 0x80) { 183 | ptr += UTF8LITE_UTF8_TAIL_LEN(ch); 184 | } 185 | } 
186 | 187 | attr |= size; 188 | text->attr = attr; 189 | } 190 | 191 | 192 | void append_location(struct utf8lite_message *msg, size_t offset) 193 | { 194 | utf8lite_message_append(msg, " at position %"PRIu64, 195 | (uint64_t)(offset + 1)); 196 | } 197 | -------------------------------------------------------------------------------- /src/textiter.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include "utf8lite.h" 19 | 20 | /* http://stackoverflow.com/a/11986885 */ 21 | #define hextoi(ch) ((ch > '9') ? 
(ch &~ 0x20) - 'A' + 10 : (ch - '0'))

static void iter_retreat_escaped(struct utf8lite_text_iter *it,
				 const uint8_t *begin);
static void iter_retreat_raw(struct utf8lite_text_iter *it);

/* Initialize an iterator over `text`, positioned before the first
 * character (current == UTF8LITE_CODE_NONE until the first advance). */
void utf8lite_text_iter_make(struct utf8lite_text_iter *it,
			     const struct utf8lite_text *text)
{
	it->ptr = text->ptr;
	it->end = it->ptr + UTF8LITE_TEXT_SIZE(text);
	it->text_attr = text->attr;
	it->current = UTF8LITE_CODE_NONE;
}


/* Decode the next character into it->current; returns 0 at the end of
 * the text.  Backslash escapes are decoded only when the text's ESC
 * attribute bit is set. */
int utf8lite_text_iter_advance(struct utf8lite_text_iter *it)
{
	const uint8_t *ptr = it->ptr;
	size_t text_attr = it->text_attr;
	int32_t code;

	if (it->ptr == it->end) {
		goto at_end;
	}

	code = *ptr++;

	if (code == '\\' && (text_attr & UTF8LITE_TEXT_ESC_BIT)) {
		// escaped character; decode the escape sequence
		utf8lite_decode_escape(&ptr, &code);
	} else if (code >= 0x80) {
		// multi-byte UTF-8; re-decode from the lead byte
		ptr--;
		utf8lite_decode_utf8(&ptr, &code);
	}

	it->ptr = ptr;
	it->current = code;
	return 1;

at_end:
	it->current = UTF8LITE_CODE_NONE;
	return 0;
}


/* Jump the iterator past the last character. */
void utf8lite_text_iter_skip(struct utf8lite_text_iter *it)
{
	it->ptr = it->end;
	it->current = UTF8LITE_CODE_NONE;
}


/* Step the iterator back one character; returns 0 at the start of the
 * text.  Requires two backward decodes: one to find the start of the
 * current character, one to decode the previous character. */
int utf8lite_text_iter_retreat(struct utf8lite_text_iter *it)
{
	const size_t size = (it->text_attr & UTF8LITE_TEXT_SIZE_MASK);
	const uint8_t *begin = it->end - size;
	const uint8_t *ptr = it->ptr;
	const uint8_t *end = it->end;
	int32_t code = it->current;

	if (ptr == begin) {
		return 0;
	}

	if (it->text_attr & UTF8LITE_TEXT_ESC_BIT) {
		iter_retreat_escaped(it, begin);
	} else {
		iter_retreat_raw(it);
	}

	// we were at the end of the text
	if (code == UTF8LITE_CODE_NONE) {
		it->ptr = end;
		return 1;
	}

	// at this point, it->code == code, and it->ptr is the code start
	ptr = it->ptr;

	if (ptr == begin) {
		// the current character was the first one; nothing before it
		it->current = UTF8LITE_CODE_NONE;
		return 0;
	}

	// read the previous code
	if (it->text_attr & UTF8LITE_TEXT_ESC_BIT) {
		iter_retreat_escaped(it, begin);
	} else {
		iter_retreat_raw(it);
	}

	// now, it->code is the previous code, and it->ptr is the start
	// of the previous code

	// set the pointer to the end of the previous code
	it->ptr = ptr;
	return 1;
}


/* Rewind the iterator to its initial position before the first character. */
void utf8lite_text_iter_reset(struct utf8lite_text_iter *it)
{
	const size_t size = (it->text_attr & UTF8LITE_TEXT_SIZE_MASK);
	const uint8_t *begin = it->end - size;

	it->ptr = begin;
	it->current = UTF8LITE_CODE_NONE;
}


/* Decode the character ending just before it->ptr (no escapes), leaving
 * it->ptr at its first byte and it->current holding its value. */
void iter_retreat_raw(struct utf8lite_text_iter *it)
{
	const uint8_t *ptr = it->ptr;
	int32_t code;

	code = *(--ptr);

	if (code < 0x80) {
		// ASCII: single byte, decoded directly
		it->ptr = (uint8_t *)ptr;
		it->current = code;
	} else {
		// skip over continuation bytes
		do {
			ptr--;
		} while (*ptr < 0xC0);

		it->ptr = (uint8_t *)ptr;

		utf8lite_decode_utf8(&ptr, &it->current);
	}
}


// we are at an escape if we are preceded by an odd number of
// backslash (\) characters
static int at_escape(const uint8_t *begin, const uint8_t *ptr)
{
	int at = 0;
	uint_fast8_t prev;

	while (begin < ptr) {
		prev = *(--ptr);

		if (prev != '\\') {
			goto out;
		}

		// each backslash toggles the "escaped" state
		at = !at;
	}

out:
	return at;
}


/* Decode the (possibly escaped) character ending just before it->ptr,
 * leaving it->ptr at its first byte and it->current holding its value.
 * Handles 2-byte escapes (\n, \t, ...), \uXXXX escapes (including a
 * UTF-16 surrogate pair spelled as two \u escapes), and raw UTF-8. */
void iter_retreat_escaped(struct utf8lite_text_iter *it, const uint8_t *begin)
{
	const uint8_t *ptr = it->ptr;
	int32_t code, unesc, hi;
	int i;

	code = *(--ptr);

	// check for 2-byte escape
	switch (code) {
	case '"':
	case '\\':
	case '/':
		// these unescape to themselves
		unesc = code;
		break;

	case 'b':
		unesc = '\b';
		break;

	case 'f':
		unesc = '\f';
		break;
	case 'n':
		unesc = '\n';
		break;

	case 'r':
		unesc = '\r';
		break;

	case 't':
		unesc = '\t';
		break;

	default:
		unesc = 0;
		break;
	}

	if (unesc) {
		// only a real escape if preceded by an odd run of backslashes
		if (at_escape(begin, ptr)) {
			ptr--;
			code = unesc;
		}
		goto out;
	}

	// check for 6-byte escape (\uXXXX, with `code` its last hex digit)
	if (isxdigit((int)code)) {
		if (!(begin + 4 < ptr && ptr[-4] == 'u'
		      && at_escape(begin, ptr - 4))) {
			goto out;
		}

		// decode the 4 hex digits
		code = 0;
		for (i = 0; i < 4; i++) {
			code = (code << 4) + hextoi(ptr[i - 3]);
		}
		ptr -= 5;

		if (UTF8LITE_IS_UTF16_LOW(code)) {
			// low surrogate: decode the preceding \uXXXX
			// high surrogate and combine the pair
			hi = 0;
			for (i = 0; i < 4; i++) {
				hi = (hi << 4) + hextoi(ptr[i - 4]);
			}

			code = UTF8LITE_DECODE_UTF16_PAIR(hi, code);
			ptr -= 6;
		}

		goto out;
	}

	// check for ascii
	if (code < 0x80) {
		goto out;
	}

	// if we got here, then code is a continuation byte

	// skip over preceding continuation bytes
	do {
		ptr--;
	} while (*ptr < 0xC0);

	// decode the utf-8 value
	it->ptr = (uint8_t *)ptr;
	utf8lite_decode_utf8(&ptr, &it->current);
	return;

out:
	it->ptr = (uint8_t *)ptr;
	it->current = code;
}
--------------------------------------------------------------------------------
/src/textmap.c:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Patrick O. Perry.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include "utf8lite.h" 24 | 25 | 26 | static void utf8lite_textmap_clear_type(struct utf8lite_textmap *map); 27 | static int utf8lite_textmap_set_type(struct utf8lite_textmap *map, int type); 28 | 29 | static int utf8lite_textmap_reserve(struct utf8lite_textmap *map, size_t size); 30 | static int utf8lite_textmap_set_ascii(struct utf8lite_textmap *map, 31 | const struct utf8lite_text *text); 32 | static int utf8lite_textmap_set_utf32(struct utf8lite_textmap *map, 33 | const int32_t *ptr, 34 | const int32_t *end); 35 | 36 | 37 | int utf8lite_textmap_init(struct utf8lite_textmap *map, int type) 38 | { 39 | int err; 40 | 41 | map->text.ptr = NULL; 42 | map->text.attr = 0; 43 | map->codes = NULL; 44 | map->size_max = 0; 45 | 46 | utf8lite_textmap_clear_type(map); 47 | err = utf8lite_textmap_set_type(map, type); 48 | return err; 49 | } 50 | 51 | 52 | void utf8lite_textmap_destroy(struct utf8lite_textmap *map) 53 | { 54 | free(map->codes); 55 | free(map->text.ptr); 56 | } 57 | 58 | 59 | void utf8lite_textmap_clear_type(struct utf8lite_textmap *map) 60 | { 61 | uint_fast8_t ch; 62 | 63 | map->charmap_type = UTF8LITE_DECOMP_NORMAL | UTF8LITE_CASEFOLD_NONE; 64 | 65 | for (ch = 0; ch < 0x80; ch++) { 66 | map->ascii_map[ch] = (int8_t)ch; 67 | } 68 | 69 | map->type = 0; 70 | } 71 | 72 | 73 | int utf8lite_textmap_set_type(struct utf8lite_textmap *map, int type) 74 | { 75 | int_fast8_t ch; 76 | 77 | if (map->type == type) { 78 | return 0; 
79 | } 80 | 81 | utf8lite_textmap_clear_type(map); 82 | 83 | if (type & UTF8LITE_TEXTMAP_CASE) { 84 | for (ch = 'A'; ch <= 'Z'; ch++) { 85 | map->ascii_map[ch] = ch + ('a' - 'A'); 86 | } 87 | 88 | map->charmap_type |= UTF8LITE_CASEFOLD_ALL; 89 | } 90 | 91 | if (type & UTF8LITE_TEXTMAP_COMPAT) { 92 | map->charmap_type = UTF8LITE_DECOMP_ALL; 93 | } 94 | 95 | map->type = type; 96 | 97 | return 0; 98 | } 99 | 100 | 101 | int utf8lite_textmap_reserve(struct utf8lite_textmap *map, size_t size) 102 | { 103 | uint8_t *ptr = map->text.ptr; 104 | int32_t *codes = map->codes; 105 | 106 | if (map->size_max >= size) { 107 | return 0; 108 | } 109 | 110 | if (!(ptr = realloc(ptr, size))) { 111 | return UTF8LITE_ERROR_NOMEM; 112 | } 113 | map->text.ptr = ptr; 114 | 115 | if (size > SIZE_MAX / UTF8LITE_UNICODE_DECOMP_MAX) { 116 | return UTF8LITE_ERROR_OVERFLOW; 117 | } 118 | 119 | if (!(codes = realloc(codes, size * UTF8LITE_UNICODE_DECOMP_MAX))) { 120 | return UTF8LITE_ERROR_NOMEM; 121 | } 122 | map->codes = codes; 123 | 124 | map->size_max = size; 125 | return 0; 126 | } 127 | 128 | 129 | int utf8lite_textmap_set(struct utf8lite_textmap *map, 130 | const struct utf8lite_text *text) 131 | { 132 | struct utf8lite_text_iter it; 133 | size_t size = UTF8LITE_TEXT_SIZE(text); 134 | int32_t *dst; 135 | int err; 136 | 137 | if (utf8lite_text_isascii(text)) { 138 | return utf8lite_textmap_set_ascii(map, text); 139 | } 140 | 141 | // For most inputs, mapping to type reduces or preserves the size. 142 | // However, for U+0390 and U+03B0, case folding triples the size. 143 | // (You can verify this with util/compute-typelen.py) 144 | // 145 | // Add one for a trailing NUL. 
146 | if (size > ((SIZE_MAX - 1) / 3)) { 147 | err = UTF8LITE_ERROR_OVERFLOW; 148 | goto out; 149 | } 150 | 151 | if ((err = utf8lite_textmap_reserve(map, 3 * size + 1))) { 152 | goto out; 153 | } 154 | 155 | dst = map->codes; 156 | utf8lite_text_iter_make(&it, text); 157 | while (utf8lite_text_iter_advance(&it)) { 158 | utf8lite_map(map->charmap_type, it.current, &dst); 159 | } 160 | 161 | size = (size_t)(dst - map->codes); 162 | utf8lite_order(map->codes, size); 163 | utf8lite_compose(map->codes, &size); 164 | 165 | if ((err = utf8lite_textmap_set_utf32(map, map->codes, 166 | map->codes + size))) { 167 | goto out; 168 | } 169 | 170 | out: 171 | return err; 172 | } 173 | 174 | 175 | int utf8lite_textmap_set_utf32(struct utf8lite_textmap *map, const int32_t *ptr, 176 | const int32_t *end) 177 | { 178 | int map_quote = map->type & UTF8LITE_TEXTMAP_QUOTE; 179 | int rm_di = map->type & UTF8LITE_TEXTMAP_RMDI; 180 | uint8_t *dst = map->text.ptr; 181 | int32_t code; 182 | int8_t ch; 183 | 184 | while (ptr != end) { 185 | code = *ptr++; 186 | 187 | if (code <= 0x7F) { 188 | ch = map->ascii_map[code]; 189 | if (ch >= 0) { 190 | *dst++ = (uint8_t)ch; 191 | } 192 | continue; 193 | } else { 194 | switch (code) { 195 | case 0x055A: // ARMENIAN APOSTROPHE 196 | case 0x2018: // LEFT SINGLE QUOTATION MARK 197 | case 0x2019: // RIGHT SINGLE QUOTATION MARK 198 | case 0x201B: // SINGLE HIGH-REVERSED-9 QUOTATION MARK 199 | case 0xFF07: // FULLWIDTH APOSTROPHE 200 | if (map_quote) { 201 | code = '\''; 202 | } 203 | break; 204 | 205 | default: 206 | if (rm_di && utf8lite_isignorable(code)) { 207 | continue; 208 | } 209 | break; 210 | } 211 | } 212 | utf8lite_encode_utf8(code, &dst); 213 | } 214 | 215 | *dst = '\0'; // not necessary, but helps with debugging 216 | map->text.attr = (UTF8LITE_TEXT_SIZE_MASK 217 | & ((size_t)(dst - map->text.ptr))); 218 | return 0; 219 | } 220 | 221 | 222 | int utf8lite_textmap_set_ascii(struct utf8lite_textmap *map, 223 | const struct utf8lite_text *text) 
224 | { 225 | struct utf8lite_text_iter it; 226 | size_t size = UTF8LITE_TEXT_SIZE(text); 227 | int8_t ch; 228 | uint8_t *dst; 229 | int err; 230 | 231 | assert(size < SIZE_MAX); 232 | 233 | if ((err = utf8lite_textmap_reserve(map, size + 1))) { 234 | goto error; 235 | } 236 | 237 | dst = map->text.ptr; 238 | 239 | utf8lite_text_iter_make(&it, text); 240 | while (utf8lite_text_iter_advance(&it)) { 241 | ch = map->ascii_map[it.current]; 242 | if (ch >= 0) { 243 | *dst++ = (uint8_t)ch; 244 | } 245 | } 246 | 247 | *dst = '\0'; // not necessary, but helps with debugging 248 | map->text.attr = (UTF8LITE_TEXT_SIZE_MASK 249 | & ((size_t)(dst - map->text.ptr))); 250 | return 0; 251 | 252 | error: 253 | return err; 254 | } 255 | -------------------------------------------------------------------------------- /src/wordscan.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
 */

/* NOTE(review): the two stdlib #include targets below were lost in
 * extraction; reconstructed from usage (assert, NULL) -- confirm
 * against the upstream file. */
#include <assert.h>
#include <stddef.h>
#include "utf8lite.h"
#include "private/emojiprop.h"
#include "private/wordbreak.h"
// NOTE(review): "utf8lite.h" appears included twice in this file;
// harmless if the header has an include guard, but worth deduplicating.
#include "utf8lite.h"


/* Initialize a word scanner over `text` and position it before the
 * first word.  Call utf8lite_wordscan_advance() to get each word. */
void utf8lite_wordscan_make(struct utf8lite_wordscan *scan,
			    const struct utf8lite_text *text)
{
	utf8lite_text_iter_make(&scan->iter, text);
	utf8lite_wordscan_reset(scan);
}


/* Shift the one-code-point lookahead window forward: the current
 * code/property become the previous ones, the lookahead becomes
 * current, and a new lookahead is decoded (WORD_BREAK_NONE at end of
 * text).  Also records whether the code point being consumed was a
 * ZWJ, for the WB3c emoji check at MaybeBreak. */
#define NEXT() \
	do { \
		follow_zwj = (scan->prop == WORD_BREAK_ZWJ); \
		scan->ptr = scan->iter_ptr; \
		scan->code = scan->iter.current; \
		scan->prop = scan->iter_prop; \
		scan->iter_ptr = scan->iter.ptr; \
		if (utf8lite_text_iter_advance(&scan->iter)) { \
			scan->iter_prop = word_break(scan->iter.current); \
		} else { \
			scan->iter_prop = WORD_BREAK_NONE; \
		} \
	} while (0)

/* WB4: skip over (Extend | Format | ZWJ)* so they attach to the
 * preceding character and never start a new word segment. */
#define EXTEND() \
	do { \
		while (scan->prop == WORD_BREAK_EXTEND \
				|| scan->prop == WORD_BREAK_FORMAT \
				|| scan->prop == WORD_BREAK_ZWJ) { \
			NEXT(); \
		} \
	} while (0)


/* Rewind the scanner to the start of the text, priming the current
 * code point and the one-code-point lookahead. */
void utf8lite_wordscan_reset(struct utf8lite_wordscan *scan)
{
	scan->current.ptr = NULL;
	// keep the text's flag bits, clear the size bits
	scan->current.attr = scan->iter.text_attr & ~UTF8LITE_TEXT_SIZE_MASK;

	utf8lite_text_iter_reset(&scan->iter);
	scan->ptr = scan->iter.ptr;

	if (utf8lite_text_iter_advance(&scan->iter)) {
		scan->code = scan->iter.current;
		scan->prop = word_break(scan->code);

		scan->iter_ptr = scan->iter.ptr;
		if (utf8lite_text_iter_advance(&scan->iter)) {
			scan->iter_prop = word_break(scan->iter.current);
		} else {
			scan->iter_prop = WORD_BREAK_NONE;
		}
	} else {
		scan->code = 0;
		scan->prop = WORD_BREAK_NONE;
		scan->iter_ptr = NULL;
		scan->iter_prop = WORD_BREAK_NONE;
	}
}


/* Peek past any (Extend | Format | ZWJ)* run following the lookahead
 * and return the first "significant" word-break property, or
 * WORD_BREAK_NONE at end of text.  Used for the two-character
 * lookahead in rules WB6/WB7b/WB12.  Does not move the scanner. */
static int next_signif_prop(const struct utf8lite_wordscan *scan)
{
	struct utf8lite_text_iter iter;
	int prop;

	switch (scan->iter_prop) {
	case WORD_BREAK_EXTEND:
	case WORD_BREAK_FORMAT:
	case WORD_BREAK_ZWJ:
		break;
	default:
		return scan->iter_prop;
	}

	// copy the iterator so the scan itself stays in place
	iter = scan->iter;
	while (utf8lite_text_iter_advance(&iter)) {
		prop = word_break(iter.current);
		switch (prop) {
		case WORD_BREAK_EXTEND:
		case WORD_BREAK_FORMAT:
		case WORD_BREAK_ZWJ:
			break;
		default:
			return prop;
		}
	}
	return WORD_BREAK_NONE;
}


/* Advance to the next word boundary (UAX #29 word segmentation,
 * implemented as a goto-based state machine with one label per
 * word-break property class).  On return scan->current holds the word;
 * returns 0 at end of text, nonzero otherwise. */
int utf8lite_wordscan_advance(struct utf8lite_wordscan *scan)
{
	int follow_zwj = 0;
	scan->current.ptr = (uint8_t *)scan->ptr;
	scan->current.attr &= ~UTF8LITE_TEXT_SIZE_MASK;

Start:
	// dispatch on the first code point's property
	switch ((enum word_break_prop)scan->prop) {
	case WORD_BREAK_NONE:
		// Break at the start and end of text unless the text is empty
		// WB2: Any + eot
		goto Break;

	case WORD_BREAK_CR:
		NEXT();
		goto CR;

	case WORD_BREAK_NEWLINE:
	case WORD_BREAK_LF:
		NEXT();
		goto Newline;

	case WORD_BREAK_WSEGSPACE:
		NEXT();
		goto WSegSpace;

	case WORD_BREAK_ALETTER:
		NEXT();
		goto ALetter;

	case WORD_BREAK_NUMERIC:
		NEXT();
		goto Numeric;

	case WORD_BREAK_EXTENDNUMLET:
		NEXT();
		goto ExtendNumLet;

	case WORD_BREAK_HEBREW_LETTER:
		NEXT();
		goto Hebrew_Letter;

	case WORD_BREAK_KATAKANA:
		NEXT();
		goto Katakana;

	case WORD_BREAK_REGIONAL_INDICATOR:
		NEXT();
		goto Regional_Indicator;

	case WORD_BREAK_DOUBLE_QUOTE:
	case WORD_BREAK_MIDLETTER:
	case WORD_BREAK_MIDNUM:
	case WORD_BREAK_MIDNUMLET:
	case WORD_BREAK_SINGLE_QUOTE:
	case WORD_BREAK_EXTEND: // marks
	case WORD_BREAK_FORMAT: // Cf format controls
	case WORD_BREAK_ZWJ:
	case WORD_BREAK_OTHER:
		NEXT();
		goto Any;
	}

	assert(0 && "Unhandled word break property");
	return 0;

CR:
	if (scan->prop == WORD_BREAK_LF) {
		// Do not break within CRLF
		// WB3: CR * LF
		NEXT();
	}

Newline:
	// Otherwise break after Newlines
	// WB3a: (Newline | CR | LF) +
	goto Break;


WSegSpace:
	// WB3d: Keep horizontal whitespace together.
	if (scan->prop == WORD_BREAK_WSEGSPACE) {
		NEXT();
		goto WSegSpace;
	}
	EXTEND();
	goto MaybeBreak;

ALetter:
	EXTEND();

	switch (scan->prop) {
	case WORD_BREAK_ALETTER:
		// Do not break between most letters
		// WB5: AHLetter * AHLetter
		NEXT();
		goto ALetter;

	case WORD_BREAK_HEBREW_LETTER:
		// WB5: AHLetter * AHLetter
		NEXT();
		goto Hebrew_Letter;

	case WORD_BREAK_MIDLETTER:
	case WORD_BREAK_MIDNUMLET:
	case WORD_BREAK_SINGLE_QUOTE:
		// Do not break across certain punctuation

		// WB6: AHLetter * (MidLetter | MidNumLetQ) AHLetter

		switch (next_signif_prop(scan)) {
		case WORD_BREAK_ALETTER:
			// WB7: AHLetter (MidLetter | MidNumLetQ) * AHLetter
			NEXT();
			EXTEND();
			NEXT();
			goto ALetter;
		case WORD_BREAK_HEBREW_LETTER:
			// WB7: AHLetter (MidLetter | MidNumLetQ) * AHLetter
			NEXT();
			EXTEND();
			NEXT();
			goto Hebrew_Letter;
		default:
			goto MaybeBreak;
		}

	case WORD_BREAK_NUMERIC:
		// Do not break within sequences of digits, or digits
		// adjacent to letters (“3a”, or “A3”).
		// WB9: AHLetter * Numeric
		NEXT();
		goto Numeric;

	case WORD_BREAK_EXTENDNUMLET:
		// Do not break from extenders
		// WB13a: AHLetter * ExtendNumLet
		NEXT();
		goto ExtendNumLet;

	default:
		goto MaybeBreak;
	}

Hebrew_Letter:
	EXTEND();

	switch (scan->prop) {
	case WORD_BREAK_ALETTER:
		// Do not break between most letters
		// WB5: AHLetter * AHLetter
		NEXT();
		goto ALetter;

	case WORD_BREAK_HEBREW_LETTER:
		// WB5: AHLetter * AHLetter
		NEXT();
		goto Hebrew_Letter;

	case WORD_BREAK_MIDLETTER:
	case WORD_BREAK_MIDNUMLET:
	case WORD_BREAK_SINGLE_QUOTE:
		// Do not break across certain punctuation

		// WB6: AHLetter * (MidLetter | MidNumLetQ) * AHLetter
		switch (next_signif_prop(scan)) {
		case WORD_BREAK_HEBREW_LETTER:
			// WB7:
			// AHLetter (MidLetter | MidNumLetQ) * AHLetter
			NEXT();
			EXTEND();
			NEXT();
			goto Hebrew_Letter;
		case WORD_BREAK_ALETTER:
			// WB7:
			// AHLetter (MidLetter | MidNumLetQ) * AHLetter
			NEXT();
			EXTEND();
			NEXT();
			goto ALetter;
		default:
			break;
		}

		// WB7a: Hebrew_Letter * Single_Quote
		if (scan->prop == WORD_BREAK_SINGLE_QUOTE) {
			NEXT();
			goto Any;
		}

		goto MaybeBreak;


	case WORD_BREAK_DOUBLE_QUOTE:
		// WB7b: Hebrew_Letter * Double_Quote Hebrew_Letter
		switch (next_signif_prop(scan)) {
		case WORD_BREAK_HEBREW_LETTER:
			// WB7c:
			// Hebrew_Letter Double_Quote * Hebrew_Letter
			NEXT();
			EXTEND();
			NEXT();
			goto Hebrew_Letter;
		default:
			goto MaybeBreak;
		}

	case WORD_BREAK_NUMERIC:
		// WB9: AHLetter * Numeric
		NEXT();
		goto Numeric;

	case WORD_BREAK_EXTENDNUMLET:
		// WB13a: AHLetter * ExtendNumLet
		NEXT();
		goto ExtendNumLet;

	default:
		goto MaybeBreak;
	}

Numeric:
	EXTEND();

	switch (scan->prop) {
	case WORD_BREAK_NUMERIC:
		// WB8: Numeric * Numeric
		NEXT();
		goto Numeric;

	case WORD_BREAK_MIDNUMLET:
	case WORD_BREAK_SINGLE_QUOTE:
	case WORD_BREAK_MIDNUM:
		// WB12: Numeric * (MidNum | MidNumLetQ) Numeric
		if (next_signif_prop(scan) == WORD_BREAK_NUMERIC) {
			// WB11: Numeric (MidNum|MidNumLeqQ) * Numeric
			NEXT();
			EXTEND();
			NEXT();
			goto Numeric;
		}
		goto MaybeBreak;

	case WORD_BREAK_EXTENDNUMLET:
		// WB13a: Numeric * ExtendNumLet
		NEXT();
		goto ExtendNumLet;

	case WORD_BREAK_ALETTER:
		// WB10: Numeric * AHLetter
		NEXT();
		goto ALetter;

	case WORD_BREAK_HEBREW_LETTER:
		// WB10: Numeric * AHLetter
		NEXT();
		goto Hebrew_Letter;

	default:
		goto MaybeBreak;
	}

Katakana:
	EXTEND();

	switch (scan->prop) {
	case WORD_BREAK_KATAKANA:
		// WB13: Katakana * Katakana
		NEXT();
		goto Katakana;

	case WORD_BREAK_EXTENDNUMLET:
		// WB13a: Katakana * ExtendNumLet
		NEXT();
		goto ExtendNumLet;

	default:
		goto MaybeBreak;
	}

ExtendNumLet:
	EXTEND();

	switch (scan->prop) {
	case WORD_BREAK_ALETTER:
		// WB13b: ExtendNumLet * AHLetter
		NEXT();
		goto ALetter;

	case WORD_BREAK_NUMERIC:
		// WB13b: ExtendNumLet * Numeric
		NEXT();
		goto Numeric;

	case WORD_BREAK_EXTENDNUMLET:
		// WB13a: ExtendNumLet * ExtendNumLet
		NEXT();
		goto ExtendNumLet;

	case WORD_BREAK_HEBREW_LETTER:
		// WB13b: ExtendNumLet * AHLetter
		NEXT();
		goto Hebrew_Letter;

	case WORD_BREAK_KATAKANA:
		// WB13c: ExtendNumLet * Katakana
		NEXT();
		goto Katakana;

	default:
		goto MaybeBreak;
	}

Regional_Indicator:
	EXTEND();

	//fprintf(stderr, "Regional_Indicator: code = U+%04X\n", code);

	// Do not break within emoji flag sequences. That is, do not break
	// between regional indicator (RI) symbols if there is an odd number
	// of RI characters before the break point

	switch (scan->prop) {
	case WORD_BREAK_REGIONAL_INDICATOR:
		// WB15/16: [^RI] RI * RI
		// consume the pair, then stop: RI symbols pair up
		NEXT();
		EXTEND();
		goto MaybeBreak;

	default:
		// WB15/16: [^RI] RI * RI
		goto MaybeBreak;
	}

Any:
	EXTEND();
	goto MaybeBreak;

MaybeBreak:
	// WB3c: Do not break within emoji zwj sequences.
	// (follow_zwj is set by NEXT() when the last consumed code point
	// was a ZWJ; restart the machine on the pictographic that follows)
	if (follow_zwj && (emoji_prop(scan->code)
			   & EMOJI_PROP_EXTENDED_PICTOGRAPHIC)) {
		NEXT();
		goto Start;
	}
	goto Break;

Break:
	// record the word's size; an empty word signals end of text
	scan->current.attr |= (size_t)(scan->ptr - scan->current.ptr);
	return (scan->ptr == scan->current.ptr) ? 0 : 1;
}
-------------------------------------------------------------------------------- /tests/check_charwidth.c: --------------------------------------------------------------------------------
/*
 * Copyright 2017 Patrick O. Perry.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* NOTE(review): the three stdlib/framework #include targets below were
 * lost in extraction; reconstructed from usage (printf, EXIT_SUCCESS,
 * the Check framework macros) -- confirm against the upstream file. */
#include <stdio.h>
#include <stdlib.h>
#include <check.h>
#include "../src/utf8lite.h"
#include "wcwidth9/wcwidth9.h"
#include "testutil.h"

/* one expected-width table entry */
struct code_width {
	int32_t code;	// Unicode code point
	int width;	// expected UTF8LITE_CHARWIDTH_* class
};

/* Spot-check a few hand-picked code points against their expected
 * width classes, reporting every mismatch before failing. */
START_TEST(test_examples)
{
	struct code_width tests[] = {
		// Examples from https://github.com/patperry/r-utf8/issues/9
		{.code = 0x2139, .width = UTF8LITE_CHARWIDTH_NARROW},
		{.code = 0x2600, .width = UTF8LITE_CHARWIDTH_NARROW},
		{.code = 0x2728, .width = UTF8LITE_CHARWIDTH_EMOJI}
	};
	struct code_width *t;
	int i, n, ok, prop, prop0, nfail;

	n = (int)(sizeof(tests) / sizeof(tests[0]));
	nfail = 0;
	for (i = 0; i < n; i++) {
		t = &tests[i];
		prop0 = t->width;
		prop = utf8lite_charwidth(t->code);
		ok = (prop == prop0);

		if (!ok) {
			nfail++;
			printf("U+%04X expected: %d got: %d\n", t->code,
			       prop0, prop);
		}
	}

	ck_assert(nfail == 0);
}
END_TEST

/*
 * This check is kind of meaningless. wcwidth9 has Unicode 9.0.0, gives
 * different behavior for lots of characters.
 */
/* Compare every code point's width class against wcwidth9, with an
 * explicit compatibility mapping (and a few carve-outs for code points
 * whose classification changed in later Unicode versions). */
START_TEST(test_wcwidth9)
{
	int prop, prop0, ok, nfail;
	int32_t code;

	nfail = 0;
	for (code = 0; code <= UTF8LITE_CODE_MAX; code++) {
		// wcwidth9 only covers up to U+10FFFD
		prop0 = (code < 0x10FFFE) ? wcwidth9(code) : -3;
		prop = utf8lite_charwidth(code);

		if (code == 0x1F93B || code == 0x1F946) {
			// These characters changed from East Asian Wide
			// to Narrow in Unicode 13.0
			ok = prop == UTF8LITE_CHARWIDTH_NARROW;
			goto Check;
		} else if (0x1F1E6 <= code && code <= 0x1F1FF) {
			// regional indicators
			ok = prop == UTF8LITE_CHARWIDTH_NARROW;
			goto Check;
		}

		// map each utf8lite class to the wcwidth9 values it may
		// legitimately correspond to
		switch (prop) {
		case UTF8LITE_CHARWIDTH_NONE:
			ok = prop0 == -1 || prop0 == -3 || prop0 == 1;
			break;

		case UTF8LITE_CHARWIDTH_IGNORABLE:
			ok = prop0 == -1 || prop0 >= 1;
			break;

		case UTF8LITE_CHARWIDTH_MARK:
			ok = prop0 == -1 || prop0 == 1;
			break;

		case UTF8LITE_CHARWIDTH_NARROW:
			ok = prop0 == 1 || prop0 == -1 || code > 0xFFFF;
			break;

		case UTF8LITE_CHARWIDTH_AMBIGUOUS:
			ok = prop0 == -2;
			break;

		case UTF8LITE_CHARWIDTH_WIDE:
			ok = prop0 == 2 || prop0 == -1;
			break;

		case UTF8LITE_CHARWIDTH_EMOJI:
			ok = prop0 == 2 || prop0 == 1 || prop0 == -1 || prop0 == -2;
			break;

		default:
			ok = 0;
			break;
		}
Check:
		if (!ok) {
			nfail++;
			printf("U+%04X wcwidth9: %d utf8lite: %d\n", code, prop0, prop);
		}
	}

	ck_assert(nfail == 0);
}
END_TEST


/* Assemble the charwidth test suite for the Check runner. */
Suite *charwidth_suite(void)
{
	Suite *s;
	TCase *tc;

	s = suite_create("charwidth");
	tc = tcase_create("core");
	tcase_add_test(tc, test_examples);
	suite_add_tcase(s, tc);

	tc = tcase_create("wcwidth9");
	tcase_add_test(tc, test_wcwidth9);
	suite_add_tcase(s, tc);

	return s;
}


/* Run the suite; exit status reflects test failures. */
int main(void)
{
	int number_failed;
	Suite *s;
	SRunner *sr;

	s = charwidth_suite();
	sr = srunner_create(s);

	srunner_run_all(sr, CK_NORMAL);
	number_failed = srunner_ntests_failed(sr);
	srunner_free(sr);

	return (number_failed == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
}
-------------------------------------------------------------------------------- /tests/check_graphscan.c: --------------------------------------------------------------------------------
/*
 * Copyright 2017 Patrick O. Perry.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* NOTE(review): the three stdlib/framework #include targets below were
 * lost in extraction; reconstructed from usage (fopen/fscanf, alloc,
 * the Check framework macros) -- confirm against the upstream file. */
#include <stdio.h>
#include <stdlib.h>
#include <check.h>
#include "../src/utf8lite.h"
#include "testutil.h"

// path to the Unicode grapheme-break conformance data, relative to the
// project root (setup_unicode also tries one directory up)
#define GRAPH_BREAK_TEST "data/ucd/auxiliary/GraphemeBreakTest.txt"
// shared scanner under test, re-initialized by start()
struct utf8lite_graphscan scan;


/* per-test fixture setup: delegate to the shared testutil setup */
void setup_scan(void)
{
	setup();
}


/* per-test fixture teardown */
void teardown_scan(void)
{
	teardown();
}


/* Begin scanning `text` with the shared scanner. */
void start(const struct utf8lite_text *text)
{
	utf8lite_graphscan_make(&scan, text);
}


/* Advance and return a copy of the current grapheme, or NULL at the
 * end of the text.  The copy lives in testutil's arena (alloc). */
const struct utf8lite_text *next(void)
{
	struct utf8lite_text *graph;
	if (!utf8lite_graphscan_advance(&scan)) {
		return NULL;
	}
	graph = alloc(sizeof(*graph));
	*graph = scan.current.text;
	return graph;
}


/* Retreat and return a copy of the current grapheme, or NULL at the
 * start of the text. */
const struct utf8lite_text *prev(void)
{
	struct utf8lite_text *graph;
	if (!utf8lite_graphscan_retreat(&scan)) {
		return NULL;
	}
	graph = alloc(sizeof(*graph));
	*graph = scan.current.text;
	return graph;
}


/* Empty text yields no graphemes in either direction. */
START_TEST(test_empty)
{
	start(S(""));
	ck_assert(next() == NULL);
	ck_assert(next() == NULL);
	ck_assert(prev() == NULL);
	ck_assert(prev() == NULL);
}
END_TEST


/* A one-character text is a single grapheme; advancing and retreating
 * around it behaves consistently at both boundaries. */
START_TEST(test_single)
{
	start(S("x"));
	ck_assert(prev() == NULL);
	assert_text_eq(next(), S("x"));
	ck_assert(prev() == NULL);
	assert_text_eq(next(), S("x"));
	ck_assert(next() == NULL);
	ck_assert(next() == NULL);
	assert_text_eq(prev(), S("x"));
	ck_assert(prev() == NULL);
	ck_assert(prev() == NULL);
}
END_TEST


/* An emoji plus skin-tone modifier forms one grapheme (GB9). */
START_TEST(test_emoji_modifier)
{
	// This is an Extended_Pictographic followed by Extend
	start(JS("\\uD83D\\uDE0A\\uD83C\\uDFFB")); // U+1F60A U+1F3FB
	assert_text_eq(next(), JS("\\uD83D\\uDE0A\\uD83C\\uDFFB"));
	ck_assert(next() == NULL);
}
END_TEST


/* An emoji ZWJ sequence (woman-heart-kiss-woman) is one grapheme
 * (GB11). */
START_TEST(test_emoji_zwj_sequence)
{
	// \U0001F469\u200D\u2764\uFE0F\u200D\U0001F48B\u200D\U0001F469
	start(JS("\\ud83d\\udc69\\u200d\\u2764\\ufe0f\\u200d\\ud83d\\udc8b\\u200d\\ud83d\\udc69"));

	assert_text_eq(next(), JS("\\ud83d\\udc69\\u200d\\u2764\\ufe0f\\u200d\\ud83d\\udc8b\\u200d\\ud83d\\udc69"));

	ck_assert(next() == NULL);
}
END_TEST

// Check that isolated codepoints are single graphemes.
START_TEST(test_isolated)
{
	uint8_t buf[4];
	uint8_t *end;
	int32_t code;
	struct utf8lite_text text;

	for (code = 1; code <= 0x1FFF; code++) {
		if (!UTF8LITE_IS_UNICODE(code))
			continue;
		end = buf;
		utf8lite_encode_utf8(code, &end);
		utf8lite_text_assign(&text, buf, end - buf, 0, NULL);

		start(&text);
		assert_text_eq(next(), &text);
		ck_assert(next() == NULL);
	}
}
END_TEST

// Unicode Grapheme Break Test
// http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
/* One parsed line of GraphemeBreakTest.txt: the code points, whether a
 * break is allowed before each, and the resulting grapheme spans
 * (break_begin/break_end pairs) inside the UTF-8 buffer. */
struct unitest {
	char comment[4096];	// trailing "# ..." text from the data file
	unsigned line;		// 1-based line number in the data file
	int is_ascii;		// all code points <= U+007F

	struct utf8lite_text text;	// the encoded test string
	uint8_t buf[4096];		// backing storage for text

	int32_t code[256];		// code points, in order
	int can_break_before[256];	// 1 if a break is allowed before code[i]
	uint8_t *code_end[256];		// end of each code point's encoding
	unsigned ncode;

	uint8_t *break_begin[256];	// start of each expected grapheme
	uint8_t *break_end[256];	// end of each expected grapheme
	unsigned nbreak;

};

struct unitest unitests[4096];
unsigned nunitest;

/* Re-serialize a parsed test in the data file's "÷ XXXX × YYYY ÷"
 * notation (UTF-8 division/multiplication signs); debugging aid. */
void write_unitest(FILE *stream, const struct unitest *test)
{
	unsigned i, n = test->ncode;

	for (i = 0; i < n; i++) {
		fprintf(stream, "%s %04X ",
			(test->can_break_before[i]) ? "\xC3\xB7" : "\xC3\x97",
			test->code[i]);
	}
	fprintf(stream, "\xC3\xB7 %s\n", test->comment);
}

/* Fixture: parse GraphemeBreakTest.txt into the unitests[] array.
 * The parser is a byte-level state machine keyed on '#' (comment),
 * '\n' (end of test), and 0xC3 (first byte of the UTF-8 ÷ / × marks).
 * NOTE: when a line's trailing ÷ has no code point after it, the
 * fscanf below fails and the `else` branch closes the final grapheme
 * and decrements nbreak -- so break_end[nbreak] (one past the last
 * kept entry) ends up pointing at the end of the text.  The backward
 * test relies on that. */
void setup_unicode(void)
{
	struct unitest *test;
	FILE *file;
	unsigned code, line, nbreak, ncode;
	uint8_t *dst;
	char *comment;
	int ch, is_ascii;

	setup_scan();
	file = fopen(GRAPH_BREAK_TEST, "r");
	if (!file) {
		// also try from the tests/ directory
		file = fopen("../"GRAPH_BREAK_TEST, "r");
	}

	nunitest = 0;
	test = &unitests[0];

	line = 1;
	ncode = 0;
	nbreak = 0;
	is_ascii = 1;
	test->text.ptr = &test->buf[0];
	dst = test->text.ptr;

	ck_assert_msg(file != NULL, "file '"GRAPH_BREAK_TEST"' not found");
	while ((ch = fgetc(file)) != EOF) {
		switch (ch) {
		case '#':
			// capture the comment through end of line
			comment = &test->comment[0];
			do {
				*comment++ = (char)ch;
				ch = fgetc(file);
			} while (ch != EOF && ch != '\n');
			*comment = '\0';

			if (ch == EOF) {
				goto eof;
			}
			/* fallthrough */
		case '\n':
			*dst = '\0';

			test->line = line;
			test->is_ascii = is_ascii;
			test->text.attr = (size_t)(dst - test->text.ptr);

			// finalize the test only if it had code points
			if (ncode > 0) {
				test->ncode = ncode;
				test->nbreak = nbreak;
				ncode = 0;
				nbreak = 0;
				is_ascii = 1;
				nunitest++;
				test = &unitests[nunitest];
				test->text.ptr = &test->buf[0];
				test->comment[0] = '\0';
				dst = test->text.ptr;
			}
			line++;
			break;

		case 0xC3:
			ch = fgetc(file);
			if (ch == EOF) {
				goto eof;
			} else if (ch == 0x97) {
				// MULTIPLICATON SIGN (U+00D7) 0xC3 0x97
				test->can_break_before[ncode] = 0;
			} else if (ch == 0xB7) {
				// DIVISION SIGN (U+00F7) 0xC3 0xB7
				test->can_break_before[ncode] = 1;
			} else {
				goto inval;
			}

			if (test->can_break_before[ncode]) {
				// a break both opens a new grapheme and
				// closes the previous one
				test->break_begin[nbreak] = dst;
				if (nbreak > 0) {
					test->break_end[nbreak - 1] = dst;
				}
				nbreak++;
			}

			if (fscanf(file, "%x", &code)) {
				test->code[ncode] = (int32_t)code;
				if (code > 0x7F) {
					is_ascii = 0;
				}
				utf8lite_encode_utf8((int32_t)code, &dst);
				test->code_end[ncode] = dst;
				ncode++;
			} else {
				// trailing ÷ with no code point: close
				// the last grapheme and drop the extra
				// break slot (see function comment)
				test->break_end[nbreak - 1] = dst;
				nbreak--;
			}
			break;
		}

	}
eof:
	fclose(file);
	return;
inval:
	fprintf(stderr, "invalid character on line %d\n", line);
	fclose(file);
}


/* Fixture teardown: release testutil resources. */
void teardown_unicode(void)
{
	teardown_scan();
}


/* Run every conformance test forward, checking each grapheme's span. */
START_TEST(test_unicode_forward)
{
	struct unitest *test;
	unsigned i, j;

	for (i = 0; i < nunitest; i++) {
		test = &unitests[i];

		//fprintf(stderr, "[%u]: ", i);
		//write_unitest(stderr, test);
		utf8lite_graphscan_make(&scan, &test->text);

		for (j = 0; j < test->nbreak; j++) {
			//fprintf(stderr, "Break %u\n", j);
			ck_assert(utf8lite_graphscan_advance(&scan));
			ck_assert(scan.current.text.ptr
				  == test->break_begin[j]);
			ck_assert(scan.current.text.ptr
				  + UTF8LITE_TEXT_SIZE(&scan.current.text)
				  == test->break_end[j]);
		}
		ck_assert(!utf8lite_graphscan_advance(&scan));
	}
}
END_TEST


/* Skip to the end, then run every conformance test backward.
 * break_end[test->nbreak] is the end-of-text sentinel left behind by
 * the parser (see setup_unicode). */
START_TEST(test_unicode_backward)
{
	struct unitest *test;
	unsigned i, j;

	for (i = 0; i < nunitest; i++) {
		test = &unitests[i];

		//fprintf(stderr, "[%u]: ", i);
		//write_unitest(stderr, test);
		utf8lite_graphscan_make(&scan, &test->text);
		utf8lite_graphscan_skip(&scan);
		ck_assert(scan.current.text.ptr
			  == test->break_end[test->nbreak]);
		ck_assert(scan.current.text.attr == 0);

		j = test->nbreak;
		while (j-- > 0) {
			//fprintf(stderr, "Break %u\n", j);
			ck_assert(utf8lite_graphscan_retreat(&scan));
			ck_assert(scan.current.text.ptr
				  == test->break_begin[j]);
			ck_assert(scan.current.text.ptr
				  + UTF8LITE_TEXT_SIZE(&scan.current.text)
				  == test->break_end[j]);
		}
		//fprintf(stderr, "Start\n");
		ck_assert(!utf8lite_graphscan_retreat(&scan));
		ck_assert(!utf8lite_graphscan_retreat(&scan));
	}
}
END_TEST


/* Assemble the graphscan test suite for the Check runner. */
Suite *graphscan_suite(void)
{
	Suite *s;
	TCase *tc;

	s = suite_create("graphscan");
	tc = tcase_create("core");
	tcase_add_checked_fixture(tc, setup_scan, teardown_scan);
	tcase_add_test(tc, test_empty);
	tcase_add_test(tc, test_single);
	tcase_add_test(tc, test_emoji_modifier);
	tcase_add_test(tc, test_emoji_zwj_sequence);
	tcase_add_test(tc, test_isolated);
	suite_add_tcase(s, tc);

	tc = tcase_create("Unicode GraphemeBreakTest.txt");
	tcase_add_checked_fixture(tc, setup_unicode, teardown_unicode);
	tcase_add_test(tc, test_unicode_forward);
	tcase_add_test(tc, test_unicode_backward);
	suite_add_tcase(s, tc);

	return s;
}


/* Run the suite; exit status reflects test failures. */
int main(void)
{
	int number_failed;
	Suite *s;
	SRunner *sr;

	s = graphscan_suite();
	sr = srunner_create(s);

	srunner_run_all(sr, CK_NORMAL);
	number_failed = srunner_ntests_failed(sr);
	srunner_free(sr);

	return (number_failed == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
}
-------------------------------------------------------------------------------- /tests/check_textmap.c: --------------------------------------------------------------------------------
/*
 * Copyright 2017 Patrick O. Perry.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* NOTE(review): the three stdlib/framework #include targets below were
 * lost in extraction; reconstructed from usage (sprintf, memcpy, the
 * Check framework macros) -- confirm against the upstream file. */
#include <stdio.h>
#include <string.h>
#include <check.h>
#include "../src/utf8lite.h"
#include "testutil.h"


// short aliases for the textmap type flags under test
#define TEXTMAP_CASE UTF8LITE_TEXTMAP_CASE
#define TEXTMAP_COMPAT UTF8LITE_TEXTMAP_COMPAT
#define TEXTMAP_QUOTE UTF8LITE_TEXTMAP_QUOTE
#define TEXTMAP_RMDI UTF8LITE_TEXTMAP_RMDI


/* Run `text` through a textmap configured with `flags` and return a
 * NUL-terminated copy of the result, allocated in testutil's arena. */
struct utf8lite_text *get_map(const struct utf8lite_text *text, int flags)
{
	struct utf8lite_text *val;
	struct utf8lite_textmap map;
	size_t size;

	ck_assert(!utf8lite_textmap_init(&map, flags));
	ck_assert(!utf8lite_textmap_set(&map, text));

	size = UTF8LITE_TEXT_SIZE(&map.text);
	val = alloc(sizeof(*val));

	// copy out before destroying the map, which owns map.text.ptr
	val->ptr = alloc(size + 1);
	memcpy(val->ptr, map.text.ptr, size);
	val->ptr[size] = '\0';
	val->attr = map.text.attr;

	utf8lite_textmap_destroy(&map);
	return val;
}


/* Convenience wrapper: map with case folding only. */
struct utf8lite_text *casefold(const struct utf8lite_text *text)
{
	return get_map(text, TEXTMAP_CASE);
}


/* With no flags, mapping is the identity for plain and JSON-escaped
 * input alike. */
START_TEST(test_map_basic)
{
	assert_text_eq(get_map(S("hello"), 0), S("hello"));
	assert_text_eq(get_map(S("world"), 0), JS("world"));
	assert_text_eq(get_map(JS("foo"), 0), S("foo"));
}
END_TEST


/* Escapes: raw backslash sequences pass through untouched; decoded
 * JSON escapes and QUOTE normalization behave as configured. */
START_TEST(test_map_esc)
{
	// backslash
	assert_text_eq(get_map(S("\\"), 0), S("\\"));
	assert_text_eq(get_map(JS("\\\\"), 0), S("\\"));
	assert_text_eq(get_map(JS("\\u005C"), 0), S("\\"));
	assert_text_eq(get_map(S("\\\\"), 0), S("\\\\"));
	assert_text_eq(get_map(S("\\u005C"), TEXTMAP_CASE), S("\\u005c"));

	// quote (')
	assert_text_eq(get_map(S("'"), TEXTMAP_QUOTE), S("'"));
	assert_text_eq(get_map(JS("'"), TEXTMAP_QUOTE), S("'"));
	assert_text_eq(get_map(S("\""), TEXTMAP_QUOTE), S("\""));
	assert_text_eq(get_map(JS("\\\""), TEXTMAP_QUOTE), S("\""));
	assert_text_eq(get_map(JS("\\u2019"), TEXTMAP_QUOTE), S("\'"));
	//assert_text_eq(get_map(JS("\\u201c"), TEXTMAP_QUOTE), S("\""));
	assert_text_eq(get_map(S("\\\'"), TEXTMAP_QUOTE), S("\\\'"));
	assert_text_eq(get_map(S("\\u2019"), TEXTMAP_QUOTE), S("\\u2019"));
}
END_TEST


/* C0 control characters (except the whitespace range U+0009..U+000D)
 * and DEL survive the mapping unchanged. */
START_TEST(test_keep_control_ascii)
{
	const struct utf8lite_text *js, *t;
	char str[256];
	uint8_t i;

	assert_text_eq(get_map(S("\a"), 0), S("\a"));
	assert_text_eq(get_map(S("\b"), 0), S("\b"));

	// C0
	for (i = 1; i < 0x20; i++) {
		if (0x09 <= i && i <= 0x0D) {
			continue;
		}
		str[0] = (char)i; str[1] = '\0';
		t = S(str);
		assert_text_eq(get_map(t, 0), t);

		sprintf(str, "\\u%04X", i);
		js = JS(str);
		assert_text_eq(get_map(js, 0), t);
	}

	// delete
	assert_text_eq(get_map(S("\x7F"), 0), S("\x7F"));
	assert_text_eq(get_map(JS("\\u007F"), 0), S("\x7F"));
}
END_TEST


/* C1 control characters (except U+0085 NEL, which is whitespace)
 * survive the mapping unchanged. */
START_TEST(test_keep_control_utf8)
{
	const struct utf8lite_text *t, *js;
	uint8_t str[256];
	uint8_t i;

	// C1
	for (i = 0x80; i < 0xA0; i++) {
		if (i == 0x85) {
			continue;
		}

		// 0xC2 is the UTF-8 lead byte for U+0080..U+00BF
		str[0] = 0xC2; str[1] = i; str[2] = '\0';
		t = S((char *)str);
		assert_text_eq(get_map(t, 0), t);

		sprintf((char *)str, "\\u%04X", i);
		js = JS((char *)str);
		assert_text_eq(get_map(js, 0), t);
	}
}
END_TEST


/* ASCII whitespace passes through the identity mapping unchanged. */
START_TEST(test_keep_ws_ascii)
{
	assert_text_eq(get_map(S("\t"), 0), S("\t"));
	assert_text_eq(get_map(S("\n"), 0), S("\n"));
	assert_text_eq(get_map(S("\v"), 0), S("\v"));
	assert_text_eq(get_map(S("\f"), 0), S("\f"));
	assert_text_eq(get_map(S("\r"), 0), S("\r"));
	assert_text_eq(get_map(S(" "), 0), S(" "));
}
END_TEST


/* Under COMPAT decomposition, every Unicode whitespace code point maps
 * to its expected compatibility form (most become a plain space; a few
 * line/paragraph separators and NEL map to themselves). */
START_TEST(test_keep_ws_utf8)
{
	const struct utf8lite_text *t, *js, *text;
	uint8_t str[256];
	uint8_t *buf;
	unsigned ws[] = { 0x0009, 0x000A, 0x000B, 0x000C, 0x000D,
			  0x0020, 0x0085, 0x00A0, 0x1680, 0x2000,
			  0x2001, 0x2002, 0x2003, 0x2004, 0x2005,
			  0x2006, 0x2007, 0x2008, 0x2009, 0x200A,
			  0x2028, 0x2029, 0x202F, 0x205F, 0x3000 };
	int i, n = sizeof(ws) / sizeof(ws[0]);

	for (i = 0; i < n; i++) {
		//fprintf(stderr, "i = %d; ws = U+%04X\n", i, ws[i]);

		// pick the expected compatibility mapping
		switch (ws[i]) {
		case 0x0009:
			text = S("\t");
			break;
		case 0x000A:
			text = S("\n");
			break;
		case 0x000B:
			text = S("\v");
			break;
		case 0x000C:
			text = S("\f");
			break;
		case 0x000D:
			text = S("\r");
			break;
		case 0x0085: // NEXT LINE (NEL)
			text = S("\xC2\x85");
			break;
		case 0x1680: // OGHAM SPACE MARK
			text = S("\xE1\x9A\x80");
			break;
		case 0x2028: // LINE SEPARATOR
			text = S("\xE2\x80\xA8");
			break;
		case 0x2029: // PARAGRAPH SEPARATOR
			text = S("\xE2\x80\xA9");
			break;
		default:
			text = S(" ");
			break;
		}

		buf = str;
		utf8lite_encode_utf8(ws[i], &buf);
		*buf = '\0';
		t = S((char *)str);
		assert_text_eq(get_map(t, TEXTMAP_COMPAT), text);

		sprintf((char *)str, "\\u%04x", ws[i]);
		js = JS((char *)str);
		assert_text_eq(get_map(js, TEXTMAP_COMPAT), text);
	}
}
END_TEST


// removed the following features from textmap, no need to test
#if 0

/*
 * Control Characters (Cc)
 * -----------------------
 *
 * U+0000..U+001F (C0)
 * U+007F (delete)
 * U+0080..U+009F (C1)
 *
 * Source: UnicodeStandard-8.0, Sec. 23.1, p. 808.
 */

// NOTE: the TYPE_RMCC / TYPE_RMWS flags tested below no longer exist;
// this whole region is compiled out by the enclosing #if 0.
START_TEST(test_rm_control_ascii)
{
	char str[256];
	uint8_t i;

	assert_text_eq(get_map(S("\a"), TYPE_RMCC), S(""));
	assert_text_eq(get_map(S("\b"), TYPE_RMCC), S(""));
	assert_text_eq(get_map(S("\t"), TYPE_RMCC), S("\t"));
	assert_text_eq(get_map(S("\n"), TYPE_RMCC), S("\n"));
	assert_text_eq(get_map(S("\v"), TYPE_RMCC), S("\v"));
	assert_text_eq(get_map(S("\f"), TYPE_RMCC), S("\f"));
	assert_text_eq(get_map(S("\r"), TYPE_RMCC), S("\r"));

	// C0
	for (i = 1; i < 0x20; i++) {
		if (0x09 <= i && i <= 0x0D) {
			continue;
		}

		str[0] = (char)i; str[1] = '\0';
		assert_text_eq(get_map(S(str), TYPE_RMCC), S(""));

		sprintf(str, "\\u%04X", i);
		assert_text_eq(get_map(JS(str), TYPE_RMCC), S(""));
	}

	// delete
	assert_text_eq(get_map(S("\x7F"), TYPE_RMCC), S(""));
	assert_text_eq(get_map(JS("\\u007F"), TYPE_RMCC), S(""));
}
END_TEST


START_TEST(test_rm_control_utf8)
{
	uint8_t str[256];
	uint8_t i;

	// C1: JSON
	for (i = 0x80; i < 0xA0; i++) {
		if (i == 0x85) {
			continue;
		}

		str[0] = 0xC2; str[1] = i; str[2] = '\0';
		assert_text_eq(get_map(S((char *)str), TYPE_RMCC), S(""));

		sprintf((char *)str, "\\u%04X", i);
		assert_text_eq(get_map(JS((char *)str), TYPE_RMCC), S(""));
	}
}
END_TEST


START_TEST(test_rm_ws_ascii)
{
	assert_text_eq(get_map(S("\t"), TYPE_RMWS), S(""));
	assert_text_eq(get_map(S("\n"), TYPE_RMWS), S(""));
	assert_text_eq(get_map(S("\v"), TYPE_RMWS), S(""));
	assert_text_eq(get_map(S("\f"), TYPE_RMWS), S(""));
	assert_text_eq(get_map(S("\r"), TYPE_RMWS), S(""));
	assert_text_eq(get_map(S(" "), TYPE_RMWS), S(""));
}
END_TEST


START_TEST(test_rm_ws_utf8)
{
	const struct utf8lite_text *t, *js;
	uint8_t str[256];
	uint8_t *buf;
	uint32_t ws[] = { 0x0009, 0x000A, 0x000B, 0x000C, 0x000D,
			  0x0020, 0x0085, 0x00A0, 0x1680, 0x2000,
			  0x2001, 0x2002, 0x2003, 0x2004, 0x2005,
			  0x2006, 0x2007, 0x2008, 0x2009, 0x200A,
			  0x2028, 0x2029, 0x202F, 0x205F, 0x3000 };
	int i, n = sizeof(ws) / sizeof(ws[0]);

	for (i = 0; i < n; i++) {
		buf = str;
		utf8lite_encode_utf8(ws[i], &buf);
		*buf = '\0';
		t = S((char *)str);
		assert_text_eq(get_map(t, TYPE_RMWS), S(""));

		sprintf((char *)str, "\\u%04x", ws[i]);
		js = JS((char *)str);
		assert_text_eq(get_map(js, TYPE_RMWS), S(""));
	}
}
END_TEST

#endif

/*
 * Case folding of the full ASCII range: letters fold to lowercase,
 * everything else maps to itself.
 */
START_TEST(test_casefold_ascii)
{
	const struct utf8lite_text *text;
	uint8_t buf[2] = { 0, 0 };
	uint8_t i;

	assert_text_eq(casefold(S("UPPER CASE")), S("upper case"));
	assert_text_eq(casefold(S("lower case")), S("lower case"));
	assert_text_eq(casefold(S("mIxEd CaSe")), S("mixed case"));

	// every non-letter ASCII character folds to itself
	for (i = 0x01; i < 'A'; i++) {
		buf[0] = i;
		text = S((char *)buf);
		assert_text_eq(casefold(text), text);
	}
	for (i = 'Z' + 1; i < 0x7F; i++) {
		buf[0] = i;
		text = S((char *)buf);
		assert_text_eq(casefold(text), text);
	}

	// upper
	assert_text_eq(casefold(S("ABCDEFGHIJKLMNOPQRSTUVWXYZ")),
		       S("abcdefghijklmnopqrstuvwxyz"));

	// lower
	assert_text_eq(casefold(S("abcdefghijklmnopqrstuvwxyz")),
		       S("abcdefghijklmnopqrstuvwxyz"));

	// digit
	assert_text_eq(casefold(S("0123456789")), S("0123456789"));

	// punct
	assert_text_eq(casefold(S("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~")),
		       S("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"));

	// space
	assert_text_eq(casefold(S("\t\n\v\f\r ")), S("\t\n\v\f\r "));
}
END_TEST


/*
 * Both eszett forms case fold to "ss" under the full case folding.
 */
START_TEST(test_casefold_utf8)
{
	assert_text_eq(casefold(JS("\u1e9e")), JS("ss")); // capital eszett
	assert_text_eq(casefold(JS("\u00df")), JS("ss")); // lowercase eszett
}
END_TEST


// removed this feature
#if 0

START_TEST(test_fold_dash)
{
	assert_text_eq(get_map(S("-"), TYPE_DASHFOLD), S("-"));
	assert_text_eq(get_map(JS("\\u058A"), TYPE_DASHFOLD), S("-"));
	assert_text_eq(get_map(JS("\\u2212"), TYPE_DASHFOLD), S("-"));
	assert_text_eq(get_map(JS("\\u2E3A"), TYPE_DASHFOLD), S("-"));
	assert_text_eq(get_map(JS("\\u2E3B"), TYPE_DASHFOLD), S("-"));
	assert_text_eq(get_map(JS("\\uFF0D"), TYPE_DASHFOLD), S("-"));
}
END_TEST


START_TEST(test_nofold_dash)
{
	assert_text_eq(get_map(S("-"), 0), S("-"));
	assert_text_eq(get_map(JS("\\u058A"), 0), S("\xD6\x8A"));
	assert_text_eq(get_map(JS("\\u2212"), 0), S("\xE2\x88\x92"));
	assert_text_eq(get_map(JS("\\u2E3A"), 0), S("\xE2\xB8\xBA"));
	assert_text_eq(get_map(JS("\\u2E3B"), 0), S("\xE2\xB8\xBB"));
	assert_text_eq(get_map(JS("\\uFF0D"), 0), S("\xEF\xBC\x8D"));
}
END_TEST

#endif


/*
 * With TEXTMAP_QUOTE, curly single quotes (U+2018, U+2019) map to the
 * ASCII apostrophe; ASCII quotes are left alone.
 */
START_TEST(test_map_quote)
{
	assert_text_eq(get_map(S("'"), TEXTMAP_QUOTE), S("'"));
	assert_text_eq(get_map(S("\""), TEXTMAP_QUOTE), S("\""));
	assert_text_eq(get_map(JS("\\u2018"), TEXTMAP_QUOTE), S("'"));
	assert_text_eq(get_map(JS("\\u2019"), TEXTMAP_QUOTE), S("'"));
	//assert_text_eq(get_map(JS("\\u201C"), TEXTMAP_QUOTE), S("\""));
	//assert_text_eq(get_map(JS("\\u201D"), TEXTMAP_QUOTE), S("\""));
}
END_TEST


/* Without the flag, the curly quotes keep their UTF-8 encodings. */
START_TEST(test_nomap_quote)
{
	assert_text_eq(get_map(S("'"), 0), S("'"));
assert_text_eq(get_map(S("\""), 0), S("\"")); 420 | assert_text_eq(get_map(JS("\\u2018"), 0), S("\xE2\x80\x98")); 421 | assert_text_eq(get_map(JS("\\u2019"), 0), S("\xE2\x80\x99")); 422 | assert_text_eq(get_map(JS("\\u201A"), 0), S("\xE2\x80\x9A")); 423 | assert_text_eq(get_map(JS("\\u201F"), 0), S("\xE2\x80\x9F")); 424 | } 425 | END_TEST 426 | 427 | 428 | Suite *textmap_suite(void) 429 | { 430 | Suite *s; 431 | TCase *tc; 432 | 433 | s = suite_create("textmap"); 434 | tc = tcase_create("normalize"); 435 | tcase_add_checked_fixture(tc, setup, teardown); 436 | tcase_add_test(tc, test_map_basic); 437 | tcase_add_test(tc, test_map_esc); 438 | // tcase_add_test(tc, test_rm_control_ascii); 439 | tcase_add_test(tc, test_keep_control_ascii); 440 | // tcase_add_test(tc, test_rm_control_utf8); 441 | tcase_add_test(tc, test_keep_control_utf8); 442 | // tcase_add_test(tc, test_rm_ws_ascii); 443 | tcase_add_test(tc, test_keep_ws_ascii); 444 | // tcase_add_test(tc, test_rm_ws_utf8); 445 | tcase_add_test(tc, test_keep_ws_utf8); 446 | tcase_add_test(tc, test_casefold_ascii); 447 | tcase_add_test(tc, test_casefold_utf8); 448 | // tcase_add_test(tc, test_fold_dash); 449 | // tcase_add_test(tc, test_nofold_dash); 450 | tcase_add_test(tc, test_map_quote); 451 | tcase_add_test(tc, test_nomap_quote); 452 | suite_add_tcase(s, tc); 453 | 454 | return s; 455 | } 456 | 457 | 458 | int main(void) 459 | { 460 | int number_failed; 461 | Suite *s; 462 | SRunner *sr; 463 | 464 | s = textmap_suite(); 465 | sr = srunner_create(s); 466 | 467 | srunner_run_all(sr, CK_NORMAL); 468 | number_failed = srunner_ntests_failed(sr); 469 | srunner_free(sr); 470 | 471 | return (number_failed == 0) ? EXIT_SUCCESS : EXIT_FAILURE; 472 | } 473 | -------------------------------------------------------------------------------- /tests/check_wordscan.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include "../src/utf8lite.h" 21 | #include "testutil.h" 22 | 23 | #define WORD_BREAK_TEST "data/ucd/auxiliary/WordBreakTest.txt" 24 | struct utf8lite_wordscan scan; 25 | 26 | 27 | void setup_scan(void) 28 | { 29 | setup(); 30 | } 31 | 32 | 33 | void teardown_scan(void) 34 | { 35 | teardown(); 36 | } 37 | 38 | 39 | void start(const struct utf8lite_text *text) 40 | { 41 | utf8lite_wordscan_make(&scan, text); 42 | } 43 | 44 | 45 | const struct utf8lite_text *next(void) 46 | { 47 | struct utf8lite_text *word; 48 | if (!utf8lite_wordscan_advance(&scan)) { 49 | return NULL; 50 | } 51 | word = alloc(sizeof(*word)); 52 | *word = scan.current; 53 | return word; 54 | } 55 | 56 | 57 | START_TEST(test_figure1) 58 | { 59 | // Test Figure 1 from http://www.unicode.org/reports/tr29/ 60 | start(S("The quick (\"brown\") fox can't jump 32.3 feet, right?")); 61 | assert_text_eq(next(), S("The")); 62 | assert_text_eq(next(), S(" ")); 63 | assert_text_eq(next(), S("quick")); 64 | assert_text_eq(next(), S(" ")); 65 | assert_text_eq(next(), S("(")); 66 | assert_text_eq(next(), S("\"")); 67 | assert_text_eq(next(), S("brown")); 68 | assert_text_eq(next(), S("\"")); 69 | assert_text_eq(next(), S(")")); 70 | assert_text_eq(next(), S(" ")); 71 | assert_text_eq(next(), S("fox")); 72 | assert_text_eq(next(), S(" ")); 73 | assert_text_eq(next(), 
		       S("can't"));
	assert_text_eq(next(), S(" "));
	assert_text_eq(next(), S("jump"));
	assert_text_eq(next(), S(" "));
	assert_text_eq(next(), S("32.3"));
	assert_text_eq(next(), S(" "));
	assert_text_eq(next(), S("feet"));
	assert_text_eq(next(), S(","));
	assert_text_eq(next(), S(" "));
	assert_text_eq(next(), S("right"));
	assert_text_eq(next(), S("?"));
	ck_assert(next() == NULL);
}
END_TEST

/* Quotation marks break into their own single-character words. */
START_TEST(test_quote)
{
	start(S("both 'single' and \"double\"."));
	assert_text_eq(next(), S("both"));
	assert_text_eq(next(), S(" "));
	assert_text_eq(next(), S("'"));
	assert_text_eq(next(), S("single"));
	assert_text_eq(next(), S("'"));
	assert_text_eq(next(), S(" "));
	assert_text_eq(next(), S("and"));
	assert_text_eq(next(), S(" "));
	assert_text_eq(next(), S("\""));
	assert_text_eq(next(), S("double"));
	assert_text_eq(next(), S("\""));
	assert_text_eq(next(), S("."));
	ck_assert(next() == NULL);
}
END_TEST


/*
 * Underscore and U+202F join with adjacent word characters rather
 * than breaking (per this test's name, they act as ExtendNumLet).
 */
START_TEST(test_extendnumlet)
{
	start(S("_"));
	assert_text_eq(next(), S("_"));

	start(S("__"));
	assert_text_eq(next(), S("__"));

	start(S("___"));
	assert_text_eq(next(), S("___"));

	start(JS("\\u202f"));
	assert_text_eq(next(), JS("\\u202f"));

	start(JS("\\u202f\\u202f"));
	assert_text_eq(next(), JS("\\u202f\\u202f"));

	start(JS("\\u202f_"));
	assert_text_eq(next(), JS("\\u202f_"));

	start(S("_1"));
	assert_text_eq(next(), S("_1"));

	start(S("__1"));
	assert_text_eq(next(), S("__1"));

	start(S("_A"));
	assert_text_eq(next(), S("_A"));

	start(S("__A"));
	assert_text_eq(next(), S("__A"));
}
END_TEST


// Unicode Word Break Test
// http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakTest.txt
struct
unitest { 146 | char comment[1024]; 147 | unsigned line; 148 | int is_ascii; 149 | 150 | struct utf8lite_text text; 151 | uint8_t buf[1024]; 152 | 153 | int32_t code[256]; 154 | int can_break_before[256]; 155 | uint8_t *code_end[256]; 156 | unsigned ncode; 157 | 158 | uint8_t *break_begin[256]; 159 | uint8_t *break_end[256]; 160 | unsigned nbreak; 161 | 162 | }; 163 | 164 | struct unitest unitests[4096]; 165 | unsigned nunitest; 166 | 167 | void write_unitest(FILE *stream, const struct unitest *test) 168 | { 169 | unsigned i, n = test->ncode; 170 | 171 | for (i = 0; i < n; i++) { 172 | fprintf(stream, "%s %04X ", 173 | (test->can_break_before[i]) ? "\xC3\xB7" : "\xC3\x97", 174 | test->code[i]); 175 | } 176 | fprintf(stream, "\xC3\xB7 %s\n", test->comment); 177 | } 178 | 179 | void setup_unicode(void) 180 | { 181 | struct unitest *test; 182 | FILE *file; 183 | unsigned code, line, nbreak, ncode; 184 | uint8_t *dst; 185 | char *comment; 186 | int ch, is_ascii; 187 | 188 | setup_scan(); 189 | file = fopen(WORD_BREAK_TEST, "r"); 190 | if (!file) { 191 | file = fopen("../"WORD_BREAK_TEST, "r"); 192 | } 193 | 194 | nunitest = 0; 195 | test = &unitests[0]; 196 | 197 | line = 1; 198 | ncode = 0; 199 | nbreak = 0; 200 | is_ascii = 1; 201 | test->text.ptr = &test->buf[0]; 202 | dst = test->text.ptr; 203 | 204 | ck_assert_msg(file != NULL, "file '"WORD_BREAK_TEST"' not found"); 205 | while ((ch = fgetc(file)) != EOF) { 206 | switch (ch) { 207 | case '#': 208 | comment = &test->comment[0]; 209 | do { 210 | *comment++ = (char)ch; 211 | ch = fgetc(file); 212 | } while (ch != EOF && ch != '\n'); 213 | *comment = '\0'; 214 | 215 | if (ch == EOF) { 216 | goto eof; 217 | } 218 | /* fallthrough */ 219 | case '\n': 220 | *dst = '\0'; 221 | 222 | test->line = line; 223 | test->is_ascii = is_ascii; 224 | test->text.attr = (size_t)(dst - test->text.ptr); 225 | 226 | if (ncode > 0) { 227 | test->ncode = ncode; 228 | test->nbreak = nbreak; 229 | ncode = 0; 230 | nbreak = 0; 231 | is_ascii 
= 1; 232 | nunitest++; 233 | test = &unitests[nunitest]; 234 | comment = &test->comment[0]; 235 | test->text.ptr = &test->buf[0]; 236 | test->comment[0] = '\0'; 237 | dst = test->text.ptr; 238 | } 239 | line++; 240 | break; 241 | 242 | case 0xC3: 243 | ch = fgetc(file); 244 | if (ch == EOF) { 245 | goto eof; 246 | } else if (ch == 0x97) { 247 | // MULTIPLICATON SIGN (U+00D7) 0xC3 0x97 248 | test->can_break_before[ncode] = 0; 249 | } else if (ch == 0xB7) { 250 | // DIVISION SIGN (U+00F7) 0xC3 0xB7 251 | test->can_break_before[ncode] = 1; 252 | } else { 253 | goto inval; 254 | } 255 | 256 | if (test->can_break_before[ncode]) { 257 | test->break_begin[nbreak] = dst; 258 | if (nbreak > 0) { 259 | test->break_end[nbreak - 1] = dst; 260 | } 261 | nbreak++; 262 | } 263 | 264 | if (fscanf(file, "%x", &code)) { 265 | test->code[ncode] = (int32_t)code; 266 | if (code > 0x7F) { 267 | is_ascii = 0; 268 | } 269 | utf8lite_encode_utf8((int32_t)code, &dst); 270 | test->code_end[ncode] = dst; 271 | ncode++; 272 | } else { 273 | test->break_end[nbreak - 1] = dst; 274 | nbreak--; 275 | } 276 | break; 277 | } 278 | 279 | } 280 | eof: 281 | return; 282 | inval: 283 | fprintf(stderr, "invalid character on line %d\n", line); 284 | 285 | fclose(file); 286 | } 287 | 288 | void teardown_unicode(void) 289 | { 290 | teardown_scan(); 291 | } 292 | 293 | START_TEST(test_unicode) 294 | { 295 | struct unitest *test; 296 | unsigned i, j; 297 | 298 | for (i = 0; i < nunitest; i++) { 299 | test = &unitests[i]; 300 | 301 | //write_unitest(stderr, test); 302 | utf8lite_wordscan_make(&scan, &test->text); 303 | 304 | for (j = 0; j < test->nbreak; j++) { 305 | //fprintf(stderr, "Break %u\n", j); 306 | ck_assert(utf8lite_wordscan_advance(&scan)); 307 | ck_assert(scan.current.ptr == test->break_begin[j]); 308 | ck_assert(scan.current.ptr 309 | + UTF8LITE_TEXT_SIZE(&scan.current) 310 | == test->break_end[j]); 311 | } 312 | ck_assert(!utf8lite_wordscan_advance(&scan)); 313 | } 314 | } 315 | END_TEST 316 | 
317 | Suite *wordscan_suite(void) 318 | { 319 | Suite *s; 320 | TCase *tc; 321 | 322 | s = suite_create("wordscan"); 323 | tc = tcase_create("core"); 324 | tcase_add_checked_fixture(tc, setup_scan, teardown_scan); 325 | tcase_add_test(tc, test_figure1); 326 | tcase_add_test(tc, test_quote); 327 | tcase_add_test(tc, test_extendnumlet); 328 | suite_add_tcase(s, tc); 329 | 330 | tc = tcase_create("Unicode WordBreakTest.txt"); 331 | tcase_add_checked_fixture(tc, setup_unicode, teardown_unicode); 332 | tcase_add_test(tc, test_unicode); 333 | suite_add_tcase(s, tc); 334 | 335 | return s; 336 | } 337 | 338 | 339 | int main(void) 340 | { 341 | int number_failed; 342 | Suite *s; 343 | SRunner *sr; 344 | 345 | s = wordscan_suite(); 346 | sr = srunner_create(s); 347 | 348 | srunner_run_all(sr, CK_NORMAL); 349 | number_failed = srunner_ntests_failed(sr); 350 | srunner_free(sr); 351 | 352 | return (number_failed == 0) ? EXIT_SUCCESS : EXIT_FAILURE; 353 | } 354 | -------------------------------------------------------------------------------- /tests/testutil.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include "../src/utf8lite.h" 22 | #include "testutil.h" 23 | 24 | static struct utf8lite_text *mktext(const char *str, int flags); 25 | 26 | 27 | static void **allocs; 28 | static int nalloc; 29 | 30 | 31 | void setup(void) 32 | { 33 | allocs = NULL; 34 | nalloc = 0; 35 | } 36 | 37 | 38 | void teardown(void) 39 | { 40 | while (nalloc-- > 0) { 41 | free(allocs[nalloc]); 42 | } 43 | free(allocs); 44 | } 45 | 46 | 47 | void *alloc(size_t size) 48 | { 49 | void *ptr; 50 | 51 | allocs = realloc(allocs, (size_t)(nalloc + 1) * sizeof(*allocs)); 52 | ck_assert(allocs != NULL); 53 | 54 | ptr = malloc(size); 55 | ck_assert(ptr != NULL || size == 0); 56 | 57 | allocs[nalloc] = ptr; 58 | nalloc++; 59 | 60 | return ptr; 61 | } 62 | 63 | 64 | struct utf8lite_text *JS(const char *str) 65 | { 66 | return mktext(str, UTF8LITE_TEXT_UNESCAPE); 67 | } 68 | 69 | 70 | struct utf8lite_text *S(const char *str) 71 | { 72 | return mktext(str, 0); 73 | } 74 | 75 | 76 | struct utf8lite_text *mktext(const char *str, int flags) 77 | { 78 | struct utf8lite_text *text = alloc(sizeof(*text)); 79 | struct utf8lite_text text2; 80 | size_t size = strlen(str); 81 | uint8_t *ptr = alloc(size + 1); 82 | int err; 83 | 84 | memcpy(ptr, str, size + 1); 85 | err = utf8lite_text_assign(text, ptr, size, flags, NULL); 86 | ck_assert(!err); 87 | 88 | ck_assert(!utf8lite_text_assign(&text2, ptr, size, 89 | flags | UTF8LITE_TEXT_VALID, NULL)); 90 | ck_assert(text->ptr == text2.ptr); 91 | ck_assert_uint_eq(text->attr, text2.attr); 92 | 93 | return text; 94 | } 95 | -------------------------------------------------------------------------------- /tests/testutil.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TESTUTIL_H
#define TESTUTIL_H

/* NOTE(review): the angle-bracket header names below were lost in
 * extraction — confirm against the upstream sources.  The macros in
 * this header use size_t, intmax_t/uintmax_t, strcmp, and
 * ck_assert_msg, so <stddef.h>/<stdint.h>/<string.h>/<check.h> are
 * the likely candidates. */
#include
#include
#include

struct utf8lite_text;

/**
 * This macro is broken on the old version of check (0.9.8) that Travis CI
 * uses, so we re-define it.
 */
#ifdef ck_assert_int_eq
# undef ck_assert_int_eq
#endif
#define ck_assert_int_eq(X, Y) do { \
	intmax_t _ck_x = (X); \
	intmax_t _ck_y = (Y); \
	ck_assert_msg(_ck_x == _ck_y, \
		"Assertion '%s' failed: %s == %jd, %s == %jd", \
		#X " == " #Y, #X, _ck_x, #Y, _ck_y); \
} while (0)


/**
 * This macro doesn't exist on check version 0.9.8.
44 | */ 45 | #ifdef ck_assert_uint_eq 46 | # undef ck_assert_uint_eq 47 | #endif 48 | #define ck_assert_uint_eq(X, Y) do { \ 49 | uintmax_t _ck_x = (X); \ 50 | uintmax_t _ck_y = (Y); \ 51 | ck_assert_msg(_ck_x == _ck_y, \ 52 | "Assertion '%s' failed: %s == %ju, %s == %ju", \ 53 | #X " == " #Y, #X, _ck_x, #Y, _ck_y); \ 54 | } while (0) 55 | 56 | 57 | /** 58 | * Broken on check (0.9.8) 59 | */ 60 | #ifdef ck_assert_str_eq 61 | # undef ck_assert_str_eq 62 | #endif 63 | #define ck_assert_str_eq(X, Y) do { \ 64 | const char* _ck_x = (X); \ 65 | const char* _ck_y = (Y); \ 66 | const char* _ck_x_s; \ 67 | const char* _ck_y_s; \ 68 | const char* _ck_x_q; \ 69 | const char* _ck_y_q; \ 70 | if (_ck_x != NULL) { \ 71 | _ck_x_q = "\""; \ 72 | _ck_x_s = _ck_x; \ 73 | } else { \ 74 | _ck_x_q = ""; \ 75 | _ck_x_s = "(null)"; \ 76 | } \ 77 | if (_ck_y != NULL) { \ 78 | _ck_y_q = "\""; \ 79 | _ck_y_s = _ck_y; \ 80 | } else { \ 81 | _ck_y_q = ""; \ 82 | _ck_y_s = "(null)"; \ 83 | } \ 84 | ck_assert_msg( \ 85 | ((_ck_x != NULL) && (_ck_y != NULL) \ 86 | && (0 == strcmp(_ck_y, _ck_x))), \ 87 | "Assertion '%s' failed: %s == %s%s%s, %s == %s%s%s", \ 88 | #X" == "#Y, \ 89 | #X, _ck_x_q, _ck_x_s, _ck_x_q, \ 90 | #Y, _ck_y_q, _ck_y_s, _ck_y_q); \ 91 | } while (0) 92 | 93 | 94 | 95 | #define assert_text_eq(X, Y) do { \ 96 | const struct utf8lite_text * _ck_x = (X); \ 97 | const struct utf8lite_text * _ck_y = (Y); \ 98 | ck_assert_msg(utf8lite_text_equals(_ck_y, _ck_x), \ 99 | "Assertion '%s == %s' failed: %s == \"%.*s\" (0x%zx)," \ 100 | " %s==\"%.*s\" (0x%zx)", \ 101 | #X, #Y, \ 102 | #X, (int)UTF8LITE_TEXT_SIZE(_ck_x), _ck_x->ptr, _ck_x->attr, \ 103 | #Y, (int)UTF8LITE_TEXT_SIZE(_ck_y), _ck_y->ptr, _ck_y->attr); \ 104 | } while (0) 105 | 106 | 107 | #define assert_text_ne(X, Y) do { \ 108 | const struct utf8lite_text * _ck_x = (X); \ 109 | const struct utf8lite_text * _ck_y = (Y); \ 110 | ck_assert_msg(!utf8lite_text_equals(_ck_y, _ck_x), \ 111 | "Assertion '%s != %s' failed: %s == \"%s\" 
(0x%zx)," \ 112 | " %s==\"%s\" (0x%zx)", \ 113 | #X, #Y, \ 114 | #X, (int)UTF8LITE_TEXT_SIZE(_ck_x), _ck_x->ptr, _ck_x->attr, \ 115 | #Y, (int)UTF8LITE_TEXT_SIZE(_ck_y), _ck_y->ptr, _ck_y->attr); \ 116 | } while (0) 117 | 118 | 119 | /** 120 | * Common test framework set up. 121 | */ 122 | void setup(void); 123 | 124 | /** 125 | * Common test framework tear down. 126 | */ 127 | void teardown(void); 128 | 129 | /** 130 | * Allocate memory. 131 | */ 132 | void *alloc(size_t size); 133 | 134 | /** 135 | * Allocate a text object, interpreting JSON-style escape codes. 136 | */ 137 | struct utf8lite_text *JS(const char *str); 138 | 139 | /** 140 | * Cast a raw string as a text object, ignoring escape codes. 141 | */ 142 | struct utf8lite_text *S(const char *str); 143 | 144 | #endif /* TESTUTIL_H */ 145 | -------------------------------------------------------------------------------- /util/compute-typelen.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import math 4 | import re 5 | 6 | CASE_FOLDING = 'data/ucd/CaseFolding.txt' 7 | UNICODE_MAX = 0x10FFFF 8 | 9 | # Parse CaseFolding.txt 10 | 11 | try: 12 | file = open(CASE_FOLDING, 'r') 13 | except FileNotFoundError: 14 | file = open('../' + CASE_FOLDING, 'r') 15 | 16 | def utf8_len(code): 17 | if code <= 0x7f: 18 | return 1 19 | elif code <= 0x07FF: 20 | return 2 21 | elif code <= 0xFFFF: 22 | return 3 23 | else: 24 | return 4 25 | 26 | with file: 27 | for line in file: 28 | if line[0] != '#' and line[0] != '\n': 29 | fields = line.split(';') 30 | code = int(fields[0], 16) 31 | status = fields[1].strip(); 32 | 33 | if status == 'C' or status == 'F': 34 | l0 = utf8_len(code) 35 | 36 | mapping = [int(x,16) for x in fields[2].split()] 37 | l1 = sum([utf8_len(m) for m in mapping]) 38 | 39 | ratio = l1 / l0 40 | if ratio >= 3: 41 | print('U+{:04X}'.format(code), mapping, 'ratio: ', ratio) 42 | 43 | 
# ------------------------------------------------------------------------
# /util/gen-casefold.py
# ------------------------------------------------------------------------
#!/usr/bin/env python3

# Copyright 2017 Patrick O. Perry.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Generate the casefold.h two-stage case-folding lookup table.

Refactored so the parsing and table construction live in pure
functions behind an ``if __name__ == '__main__'`` guard: the original
opened the data file and did all of its work at module level, which
made the script impossible to import (or test) without the Unicode
data files present.  The generated output on stdout is unchanged.
"""

import math
import re

CASE_FOLDING = 'data/ucd/CaseFolding.txt'
UNICODE_MAX = 0x10FFFF


def parse_case_folding(path=CASE_FOLDING):
    """Parse CaseFolding.txt into ``(casefold, casefold_map)``.

    ``casefold[code]`` is ``None`` when the code point has no fold,
    ``(1, mapped)`` for a single-code-point fold, or
    ``(n, offset)`` with ``n > 1`` pointing at ``n`` entries in
    ``casefold_map`` for a multi-code-point fold.
    """
    try:
        file = open(path, 'r')
    except FileNotFoundError:
        # allow running from either the project root or util/
        file = open('../' + path, 'r')

    casefold = []
    casefold_map = []

    with file:
        for line in file:
            if line[0] != '#' and line[0] != '\n':
                fields = line.split(';')
                code = int(fields[0], 16)
                status = fields[1].strip()

                # C (common) + F (full) make up the full case folding;
                # S (simple) and T (Turkic) entries are excluded
                if status == 'C' or status == 'F':
                    while code > len(casefold):
                        casefold.append(None)
                    assert code == len(casefold)

                    mapping = [int(x, 16) for x in fields[2].split()]
                    n = len(mapping)
                    if n > 1:
                        casefold.append((n, len(casefold_map)))
                        casefold_map.extend(mapping)
                    else:
                        casefold.append((1, mapping[0]))

    while len(casefold) <= UNICODE_MAX:
        casefold.append(None)

    return casefold, casefold_map


def compute_tables(casefold, block_size):
    """Split `casefold` into a two-stage table with the given block size.

    Returns ``(stage1, stage2)`` where ``stage1[i]`` indexes the
    deduplicated block ``stage2[stage1[i]]`` covering code points
    ``i*block_size .. (i+1)*block_size - 1``.
    """
    nblock = (UNICODE_MAX + 1) // block_size
    stage1 = [None] * nblock
    stage2 = []
    stage2_dict = {}
    for i in range(nblock):
        begin = i * block_size
        end = begin + block_size
        block = tuple(casefold[begin:end])
        if block in stage2_dict:
            j = stage2_dict[block]
        else:
            j = len(stage2)
            stage2_dict[block] = j
            stage2.append(block)
        stage1[i] = j
    return (stage1, stage2)


def stage1_item_size(nstage2):
    """Size in bytes (1, 2, 4, or 8) of a stage-1 entry wide enough to
    index `nstage2` stage-2 blocks."""
    nbyte = max(1, math.ceil(math.log(nstage2, 2) / 8))
    size = 2**math.ceil(math.log(nbyte, 2))
    return size


def choose_block_size(casefold, page_size=4096):
    """Return the power-of-two block size (2..65536) minimizing the
    total page-rounded size of both stages."""
    best_block_size = 1
    smallest_size = UNICODE_MAX + 1

    for i in range(1, 17):
        block_size = 2**i
        stage1, stage2 = compute_tables(casefold, block_size)

        nbyte1 = len(stage1) * stage1_item_size(len(stage2))
        nbyte2 = len(stage2) * block_size

        # round each table up to a whole number of pages
        nbyte1 = math.ceil(nbyte1 / page_size) * page_size
        nbyte2 = math.ceil(nbyte2 / page_size) * page_size
        nbyte = nbyte1 + nbyte2

        if nbyte < smallest_size:
            smallest_size = nbyte
            best_block_size = block_size

    return best_block_size


def main():
    """Parse the data file and write casefold.h to stdout."""
    casefold, casefold_map = parse_case_folding()
    block_size = choose_block_size(casefold)
    stage1, stage2 = compute_tables(casefold, block_size)

    # pick the narrowest C integer type for the stage-1 entries
    type1_size = stage1_item_size(len(stage2))
    if type1_size == 1:
        type1 = 'uint8_t'
    elif type1_size == 2:
        type1 = 'uint16_t'
    elif type1_size == 4:
        type1 = 'uint32_t'
    else:
        type1 = 'uint64_t'

    # Write casefold.h to stdout

    print("/* This file is automatically generated. DO NOT EDIT!")
    print(" Instead, edit gen-casefold.py and re-run. */")
    print("")
    print("/*")
    print(" * Case folding properties.")
    print(" *")
    print(" * Defined in UAX #44 \"Unicode Character Database\"")
    print(" *")
    print(" * http://www.unicode.org/reports/tr44/")
    print(" *")
    print(" * Section 5.6, Case and Case Mapping")
    print(" *")
    print(" *")
    print(" * We use a two-stage lookup strategy as described at")
    print(" *")
    print(" * http://www.strchr.com/multi-stage_tables")
    print(" *")
    print(" */")
    print("")
    print("#ifndef UNICODE_CASEFOLD_H")
    print("#define UNICODE_CASEFOLD_H")
    print("")
    print("#include ")
    print("")
    print("/* casefold")
    print(" * --------")
    print(" * length: the length (in codepoints) of the case fold mapping,")
    print(" * or 0 if there is no case fold")
    print(" *")
    print(" * data: the mapped-to codepoint (length = 1), or")
    print(" * an index into the `casefold_mapping` array, pointing")
    print(" * to the first codepoint in the mapping (length > 1)")
    print(" */")
    print("struct casefold {")
    print("\tunsigned length : 8;")
    print("\tunsigned data : 24;")
    print("};")
    print("")
    print("#define CASEFOLD_BLOCK_SIZE", block_size)
    print("")
    print("static const " + type1 + " casefold_stage1[] = {")
    for i in range(len(stage1) - 1):
        if i % 16 == 0:
            print("/* U+{:04X} */".format(i * block_size), end="")
        print("{0: >3},".format(stage1[i]), end="")
        if i % 16 == 15:
            print("")
    print("{0: >3}".format(stage1[len(stage1) - 1]))
    print("};")
    print("")
    print("static const struct casefold casefold_stage2[][" +
          str(block_size) + "] = {")
    for i in range(0, len(stage2)):
        print(" /* block " + str(i) + " */")
        print(" {", end="")
        for j in range(block_size):
            val = stage2[i][j]
            if val is None:
                print("{0,0}", end="")
            else:
                print("{{{0},0x{1:05X}}}".format(val[0], val[1]), end="")

            #print("{0: >3}".format(prop_vals[stage2[i][j]]), end="")
            if j + 1 == block_size:
                print("\n }", end="")
            else:
                print(",", end="")
            if j % 5 == 4:
                print("\n ", end="")
        if i + 1 != len(stage2):
            print(",\n")
        else:
            print("")
    print("};")
    print("")
    print("static const int32_t casefold_mapping[] = {")
    for i in range(len(casefold_map) - 1):
        if i % 8 == 0:
            print("/* 0x{:04X} */ ".format(i), end="")
        print("0x{0:04X},".format(casefold_map[i]), end="")
        if i % 8 == 7:
            print("")
    print("0x{0:04X}".format(casefold_map[len(casefold_map) - 1]))
    print("};")
    print("")
    print("#endif /* UNICODE_CASEFOLD_H */")


if __name__ == '__main__':
    main()
# ------------------------------------------------------------------------
# /util/gen-charwidth.py  (continues in the next chunk)
# ------------------------------------------------------------------------
#!/usr/bin/env python3

# Copyright 2017 Patrick O. Perry.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
16 | 17 | import math 18 | 19 | try: 20 | import property 21 | import unicode_data 22 | except ModuleNotFoundError: 23 | from util import property 24 | from util import unicode_data 25 | 26 | 27 | DERIVED_CORE_PROPERTIES = "data/ucd/DerivedCoreProperties.txt" 28 | EAST_ASIAN_WIDTH = "data/ucd/EastAsianWidth.txt" 29 | EMOJI_DATA = "data/ucd/emoji/emoji-data.txt" 30 | 31 | 32 | east_asian_width = property.read(EAST_ASIAN_WIDTH) 33 | # A: Ambiguous (can be narrow or wide depending on context; treat as wide) 34 | # F: Fullwidth (always wide) 35 | # H: Halfwidth (always narrow) 36 | # Na: Narrow (always narrow) 37 | # N: Neutral (non-East Asian; treat as single) 38 | # W: Wide (always wide) 39 | 40 | 41 | emoji_props = property.read(EMOJI_DATA, sets=True) 42 | 43 | # https://www.unicode.org/reports/tr51/#def_basic_emoji_set 44 | emoji = ((emoji_props['Emoji'] - emoji_props['Emoji_Component']) 45 | & emoji_props['Emoji_Presentation']) 46 | 47 | # Treat ignorables as invisible 48 | derived_core_properties = property.read(DERIVED_CORE_PROPERTIES, sets=True) 49 | default_ignorable = derived_core_properties['Default_Ignorable_Code_Point'] 50 | 51 | 52 | # unassigned: not assigned, other, surrogate 53 | none_cats = set(['Cc', 'Cn', 'Co', 'Cs', 'Zl', 'Zp']) 54 | mark_cats = set(['Cf', 'Me', 'Mn']) 55 | none = set([0xFFF9, 0xFFFA, 0xFFFB]) # interlinear annotation markers 56 | mark = set() 57 | for code in range(len(unicode_data.uchars)): 58 | u = unicode_data.uchars[code] 59 | if code in none or code in mark: 60 | pass 61 | elif u is None or u.category in none_cats: 62 | none.add(code) 63 | elif u.category in mark_cats: 64 | mark.add(code) 65 | 66 | 67 | code_props = [None] * len(east_asian_width) 68 | for code in range(len(code_props)): 69 | eaw = east_asian_width[code] 70 | if code in default_ignorable: # default ingorable overrides 71 | code_props[code] = 'Ignorable' 72 | elif code in emoji: # emoji overrides 73 | code_props[code] = 'Emoji' 74 | elif code in mark: # mark 
overrides 75 | code_props[code] = 'Mark' 76 | elif code in none: # none overrides 77 | code_props[code] = 'None' 78 | elif eaw == 'F' or eaw == 'W': 79 | code_props[code] = 'Wide' 80 | elif eaw == 'H' or eaw == 'Na' or eaw == 'N': 81 | code_props[code] = 'Narrow' 82 | elif eaw == 'A': 83 | code_props[code] = 'Ambiguous' 84 | else: 85 | code_props[code] = 'Narrow' # default to narrow 86 | 87 | 88 | prop_names = [ 89 | 'None', 'Ignorable', 'Mark', 'Narrow', 'Ambiguous', 'Wide', 'Emoji' 90 | ] 91 | prop_vals = {} 92 | for p in prop_names: 93 | prop_vals[p] = len(prop_vals) 94 | 95 | 96 | def compute_tables(block_size): 97 | nblock = len(code_props) // block_size 98 | stage1 = [None] * nblock 99 | stage2 = [] 100 | stage2_dict = {} 101 | for i in range(nblock): 102 | begin = i * block_size 103 | end = begin + block_size 104 | block = tuple(code_props[begin:end]) 105 | if block in stage2_dict: 106 | j = stage2_dict[block] 107 | else: 108 | j = len(stage2) 109 | stage2_dict[block] = j 110 | stage2.append(block) 111 | stage1[i] = j 112 | return (stage1,stage2) 113 | 114 | 115 | def stage1_item_size(nstage2): 116 | nbyte = math.ceil(math.log(nstage2, 2) / 8) 117 | size = 2**math.ceil(math.log(nbyte, 2)) 118 | return size 119 | 120 | 121 | page_size = 4096 122 | block_size = 256 123 | 124 | nbytes = {} 125 | 126 | best_block_size = 1 127 | smallest_size = len(code_props) 128 | 129 | for i in range(1,17): 130 | block_size = 2**i 131 | stage1,stage2 = compute_tables(block_size) 132 | 133 | nbyte1 = len(stage1) * stage1_item_size(len(stage2)) 134 | nbyte2 = len(stage2) * block_size 135 | 136 | nbyte1 = math.ceil(nbyte1 / page_size) * page_size 137 | nbyte2 = math.ceil(nbyte2 / page_size) * page_size 138 | nbyte = nbyte1 + nbyte2 139 | nbytes[block_size] = nbyte 140 | 141 | if nbyte < smallest_size: 142 | smallest_size = nbyte 143 | best_block_size = block_size 144 | 145 | 146 | block_size = best_block_size 147 | stage1,stage2 = compute_tables(block_size) 148 | 149 | type1_size 
= stage1_item_size(len(stage2)) 150 | 151 | if type1_size == 1: 152 | type1 = 'uint8_t' 153 | elif type1_size == 2: 154 | type1 = 'uint16_t' 155 | elif type1_size == 4: 156 | type1 = 'uint32_t' 157 | else: 158 | type1 = 'uint64_t' 159 | 160 | type2 = 'int8_t' 161 | 162 | 163 | # Write chardwidth.h to stdout 164 | 165 | print("/* This file is automatically generated. DO NOT EDIT!") 166 | print(" Instead, edit gen-charwidth.py and re-run. */") 167 | print("") 168 | print("/*") 169 | print(" * Unicode East_Asian_Width property values.") 170 | print(" *") 171 | print(" * Defined in UAX #11 \"East Asian Width\"") 172 | print(" *") 173 | print(" * http://www.unicode.org/reports/tr11/") 174 | print(" *") 175 | print(" * We use the two-stage lookup strategy described at") 176 | print(" *") 177 | print(" * http://www.strchr.com/multi-stage_tables") 178 | print(" *") 179 | print(" */") 180 | print("") 181 | print("#ifndef CHARWIDTH_H") 182 | print("#define CHARWIDTH_H") 183 | print("") 184 | print("#include ") 185 | print("") 186 | print("enum charwidth_prop {") 187 | first = True 188 | for prop in prop_names: 189 | if not first: 190 | print(",\n", end="") 191 | else: 192 | first = False 193 | print("\tCHARWIDTH_" + prop.upper() + " = " + str(prop_vals[prop]), end="") 194 | print("\n};") 195 | print("") 196 | print("static const " + type1 + " charwidth_stage1[] = {") 197 | for i in range(len(stage1) - 1): 198 | if i % 16 == 0: 199 | print("/* U+{:04X} */".format(i * block_size), end="") 200 | print("{0: >3},".format(stage1[i]), end="") 201 | if i % 16 == 15: 202 | print("") 203 | print("{0: >3}".format(stage1[len(stage1) - 1])) 204 | print("};") 205 | print("") 206 | print("static const " + type2 + " charwidth_stage2[][" + 207 | str(block_size) + "] = {") 208 | for i in range(len(stage2)): 209 | print(" /* block " + str(i) + " */") 210 | print(" {", end="") 211 | for j in range(block_size): 212 | print("{0: >3}".format(prop_vals[stage2[i][j]]), end="") 213 | if j + 1 == 
block_size: 214 | print("\n }", end="") 215 | else: 216 | print(",", end="") 217 | if j % 16 == 15: 218 | print("\n ", end="") 219 | if i + 1 != len(stage2): 220 | print(",\n") 221 | else: 222 | print("") 223 | print("};") 224 | 225 | print("") 226 | print("static int charwidth(int32_t code)") 227 | print("{") 228 | print("\tconst int32_t block_size = " + str(block_size) + ";") 229 | print("\t" + type1 + " i = charwidth_stage1[code / block_size];") 230 | print("\treturn charwidth_stage2[i][code % block_size];") 231 | print("}") 232 | print("") 233 | print("#endif /* CHARWIDTH_H */") 234 | -------------------------------------------------------------------------------- /util/gen-combining.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2017 Patrick O. Perry. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | import math 18 | 19 | try: 20 | import unicode_data 21 | except ModuleNotFoundError: 22 | from util import unicode_data 23 | 24 | 25 | combin_vals = set([0]) 26 | combin = [] 27 | 28 | for code in range(len(unicode_data.uchars)): 29 | u = unicode_data.uchars[code] 30 | 31 | if u is None or u.ccc is None: 32 | combin.append(0) 33 | else: 34 | ccc = u.ccc 35 | combin_vals.add(ccc) 36 | combin.append(ccc) 37 | 38 | 39 | def compute_tables(block_size): 40 | nblock = len(combin) // block_size 41 | stage1 = [None] * nblock 42 | stage2 = [] 43 | stage2_dict = {} 44 | for i in range(nblock): 45 | begin = i * block_size 46 | end = begin + block_size 47 | block = tuple(combin[begin:end]) 48 | if block in stage2_dict: 49 | j = stage2_dict[block] 50 | else: 51 | j = len(stage2) 52 | stage2_dict[block] = j 53 | stage2.append(block) 54 | stage1[i] = j 55 | return (stage1,stage2) 56 | 57 | 58 | def stage1_item_size(nstage2): 59 | nbyte = math.ceil(math.log(nstage2, 2) / 8) 60 | size = 2**math.ceil(math.log(nbyte, 2)) 61 | return size 62 | 63 | page_size = 4096 64 | block_size = 256 65 | 66 | nbytes = {} 67 | 68 | best_block_size = 1 69 | smallest_size = len(combin) 70 | 71 | for i in range(1,17): 72 | block_size = 2**i 73 | stage1,stage2 = compute_tables(block_size) 74 | 75 | nbyte1 = len(stage1) * stage1_item_size(len(stage2)) 76 | nbyte2 = len(stage2) * block_size 77 | 78 | nbyte1 = math.ceil(nbyte1 / page_size) * page_size 79 | nbyte2 = math.ceil(nbyte2 / page_size) * page_size 80 | nbyte = nbyte1 + nbyte2 81 | nbytes[block_size] = nbyte 82 | 83 | if nbyte < smallest_size: 84 | smallest_size = nbyte 85 | best_block_size = block_size 86 | 87 | 88 | block_size = best_block_size 89 | stage1,stage2 = compute_tables(block_size) 90 | 91 | type1_size = stage1_item_size(len(stage2)) 92 | if type1_size == 1: 93 | type1 = 'uint8_t' 94 | elif type1_size == 2: 95 | type1 = 'uint16_t' 96 | elif type1_size == 4: 97 | type1 = 'uint32_t' 98 | else: 99 | type1 = 'uint64_t' 100 | 101 
| type2 = 'uint8_t' 102 | 103 | 104 | # Write combining.h to stdout 105 | 106 | print("/* This file is automatically generated. DO NOT EDIT!") 107 | print(" Instead, edit gen-combining.py and re-run. */") 108 | print("") 109 | print("/*") 110 | print(" * Canonical_Combining_Class property values.") 111 | print(" *") 112 | print(" * Defined in UAX #44 \"Unicode Character Database\"") 113 | print(" *") 114 | print(" * http://www.unicode.org/reports/tr44/") 115 | print(" *") 116 | print(" * Section 5.7.4, Table 15.") 117 | print(" *") 118 | print(" *") 119 | print(" * We use the two-stage lookup strategy described at") 120 | print(" *") 121 | print(" * http://www.strchr.com/multi-stage_tables") 122 | print(" *") 123 | print(" */") 124 | print("") 125 | print("#ifndef UNICODE_COMBINING_H") 126 | print("#define UNICODE_COMBINING_H") 127 | print("") 128 | print("#include ") 129 | print("") 130 | print("static const " + type1 + " combining_class_stage1[] = {") 131 | for i in range(len(stage1) - 1): 132 | if i % 16 == 0: 133 | print("/* U+{:04X} */".format(i * block_size), end="") 134 | print("{0: >3},".format(stage1[i]), end="") 135 | if i % 16 == 15: 136 | print("") 137 | print("{0: >3}".format(stage1[len(stage1) - 1])) 138 | print("};") 139 | print("") 140 | print("static const " + type2 + " combining_class_stage2[][" + 141 | str(block_size) + "] = {") 142 | for i in range(len(stage2)): 143 | print(" /* block " + str(i) + " */") 144 | print(" {", end="") 145 | for j in range(block_size): 146 | print("{0: >3}".format(stage2[i][j]), end="") 147 | if j + 1 == block_size: 148 | print("\n }", end="") 149 | else: 150 | print(",", end="") 151 | if j % 16 == 15: 152 | print("\n ", end="") 153 | if i + 1 != len(stage2): 154 | print(",\n") 155 | else: 156 | print("") 157 | print("};") 158 | print("") 159 | print("static uint8_t combining_class(int32_t code)") 160 | print("{") 161 | print("\tconst int32_t block_size = " + str(block_size) + ";") 162 | print("\t" + type1 + " i = 
combining_class_stage1[code / block_size];") 163 | print("\treturn combining_class_stage2[i][code % block_size];") 164 | print("}") 165 | print("") 166 | print("#endif /* UNICODE_COMBINING_H */") 167 | 168 | -------------------------------------------------------------------------------- /util/gen-compose.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2017 Patrick O. Perry. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | import math 18 | import re 19 | 20 | try: 21 | import unicode_data 22 | except ModuleNotFoundError: 23 | from util import unicode_data 24 | 25 | 26 | EXCLUSIONS = 'data/ucd/CompositionExclusions.txt' 27 | 28 | # get the length-2 decomposition maps (excluding hangul and compatibility maps) 29 | 30 | decomp_map = {} 31 | starter = [None] * len(unicode_data.uchars) 32 | 33 | for code in range(len(unicode_data.uchars)): 34 | u = unicode_data.uchars[code] 35 | if u is None: 36 | continue 37 | 38 | ccc = u.ccc 39 | if ccc is None or ccc == 0: 40 | starter[code] = True 41 | else: 42 | starter[code] = False 43 | 44 | d = u.decomp 45 | if d is not None and d.type is None: 46 | if len(d.map) == 2: 47 | decomp_map[code] = tuple(d.map) 48 | 49 | 50 | # exclude non-starter decomposiitons 51 | 52 | decomp_map2 = {} 53 | for p,d in decomp_map.items(): 54 | if starter[p] and starter[d[0]]: 55 | decomp_map2[p] = d 56 | decomp_map = decomp_map2 57 | 58 | 59 | # exclude composition exclusions 60 | 61 | try: 62 | file = open(EXCLUSIONS, 'r') 63 | except FileNotFoundError: 64 | file = open('../' + EXCLUSIONS, 'r') 65 | 66 | with file: 67 | for line in file: 68 | fields = line.partition('#') 69 | code = fields[0].strip() 70 | if len(code) > 0: 71 | code = int(code, 16) 72 | if code in decomp_map: 73 | del decomp_map[code] 74 | 75 | #print('primary\tletter\tcode') 76 | #for p,d in decomp_map.items(): 77 | # print(p, '\t', d[0], '\t', d[1], sep='') 78 | 79 | # construct table l : [(c,p)] 80 | compose_map = {} 81 | for p,d in decomp_map.items(): 82 | l = d[0] 83 | c = d[1] 84 | if l not in compose_map: 85 | compose_map[l] = [] 86 | compose_map[l].append((c, p)) 87 | 88 | compose = [] 89 | combiner = [] 90 | primary = [] 91 | off = 0 92 | for code in range(len(unicode_data.uchars)): 93 | if code in compose_map: 94 | maps = compose_map[code] 95 | maps.sort() 96 | compose.append((off, len(maps))) 97 | combiner.extend([c for (c,p) in maps]) 98 | primary.extend([p for (c,p) in maps]) 
99 | off += len(maps) 100 | else: 101 | compose.append((0,0)) 102 | 103 | # Hangul 104 | hangul_lpart = off 105 | hangul_lvpart = off + 1 106 | 107 | for code in range(0x1100, 0x1113): 108 | compose[code] = (hangul_lpart, 1) 109 | 110 | for code in range(0xAC00, 0xD7A4): 111 | if (code - 0xAC00) % 28 == 0: 112 | compose[code] = (hangul_lvpart, 1) 113 | 114 | 115 | def compute_tables(block_size): 116 | nblock = len(compose) // block_size 117 | stage1 = [None] * nblock 118 | stage2 = [] 119 | stage2_dict = {} 120 | for i in range(nblock): 121 | begin = i * block_size 122 | end = begin + block_size 123 | block = tuple(compose[begin:end]) 124 | if block in stage2_dict: 125 | j = stage2_dict[block] 126 | else: 127 | j = len(stage2) 128 | stage2_dict[block] = j 129 | stage2.append(block) 130 | stage1[i] = j 131 | return (stage1,stage2) 132 | 133 | 134 | def stage1_item_size(nstage2): 135 | nbyte = math.ceil(math.log(nstage2, 2) / 8) 136 | size = 2**math.ceil(math.log(nbyte, 2)) 137 | return size 138 | 139 | page_size = 4096 140 | block_size = 256 141 | 142 | nbytes = {} 143 | 144 | best_block_size = 1 145 | smallest_size = len(compose) 146 | 147 | for i in range(1,17): 148 | block_size = 2**i 149 | stage1,stage2 = compute_tables(block_size) 150 | 151 | nbyte1 = len(stage1) * stage1_item_size(len(stage2)) 152 | nbyte2 = len(stage2) * block_size 153 | 154 | nbyte1 = math.ceil(nbyte1 / page_size) * page_size 155 | nbyte2 = math.ceil(nbyte2 / page_size) * page_size 156 | nbyte = nbyte1 + nbyte2 157 | nbytes[block_size] = nbyte 158 | 159 | if nbyte < smallest_size: 160 | smallest_size = nbyte 161 | best_block_size = block_size 162 | 163 | 164 | block_size = best_block_size 165 | stage1,stage2 = compute_tables(block_size) 166 | 167 | type1_size = stage1_item_size(len(stage2)) 168 | if type1_size == 1: 169 | type1 = 'uint8_t' 170 | elif type1_size == 2: 171 | type1 = 'uint16_t' 172 | elif type1_size == 4: 173 | type1 = 'uint32_t' 174 | else: 175 | type1 = 'uint64_t' 176 | 177 | 
type2 = 'struct composition' 178 | 179 | 180 | # Write compose.h to stdout 181 | 182 | 183 | print("/* This file is automatically generated. DO NOT EDIT!") 184 | print(" Instead, edit gen-compose.py and re-run. */") 185 | print("") 186 | print("/*") 187 | print(" * Unicode primary composites.") 188 | print(" *") 189 | print(" * Defined in Unicode Sec 3.11 \"Normalization Forms\"") 190 | print(" *") 191 | print(" * We use the two-stage lookup strategy described at") 192 | print(" *") 193 | print(" * http://www.strchr.com/multi-stage_tables") 194 | print(" *") 195 | print(" */") 196 | print("") 197 | print("#ifndef UNICODE_COMPOSE_H") 198 | print("#define UNICODE_COMPOSE_H") 199 | print("") 200 | print("#include ") 201 | print("") 202 | print("/* composition") 203 | print(" * -----------") 204 | print(" * offset: the offset into the primary and combiner arrays,") 205 | print(" * or 0 if there are no compositions") 206 | print(" * length: the number of compositions for the codepont") 207 | print(" */") 208 | print("struct composition {") 209 | print("\tunsigned offset : 11;") 210 | print("\tunsigned length : 5;") 211 | print("};") 212 | print("") 213 | print("#define COMPOSITION_BLOCK_SIZE", block_size) 214 | print("") 215 | print("#define COMPOSITION_HANGUL_LPART", hangul_lpart) 216 | print("") 217 | print("#define COMPOSITION_HANGUL_LVPART", hangul_lvpart) 218 | print("") 219 | print("static const " + type1 + " composition_stage1[] = {") 220 | for i in range(len(stage1) - 1): 221 | if i % 16 == 0: 222 | print("/* U+{:04X} */".format(i * block_size), end="") 223 | print("{0: >3},".format(stage1[i]), end="") 224 | if i % 16 == 15: 225 | print("") 226 | print("{0: >3}".format(stage1[len(stage1) - 1])) 227 | print("};") 228 | print("") 229 | print("static const " + type2 + " composition_stage2[][" + 230 | str(block_size) + "] = {") 231 | for i in range(len(stage2)): 232 | print(" /* block " + str(i) + " */") 233 | print(" {", end="") 234 | for j in range(block_size): 
235 | print("{{{0: >3}".format(stage2[i][j][0]), end="") 236 | print(",{0: >2}}}".format(stage2[i][j][1]), end="") 237 | if j + 1 == block_size: 238 | print("\n }", end="") 239 | else: 240 | print(",", end="") 241 | if j % 7 == 6: 242 | print("\n ", end="") 243 | else: 244 | print(" ", end="") 245 | if i + 1 != len(stage2): 246 | print(",\n") 247 | else: 248 | print("") 249 | print("};") 250 | print("") 251 | print("static const int32_t composition_combiner[] = {") 252 | for i in range(len(combiner) - 1): 253 | if i % 8 == 0: 254 | print("/* {0: >3} */ ".format(i), end="") 255 | print("0x{0:04X},".format(combiner[i]), end="") 256 | if i % 8 == 7: 257 | print("") 258 | print("0x{0:04X}".format(combiner[len(combiner) - 1])) 259 | print("};") 260 | print("") 261 | print("static const int32_t composition_primary[] = {") 262 | for i in range(len(primary) - 1): 263 | if i % 8 == 0: 264 | print("/* {0: >3} */ ".format(i), end="") 265 | print("0x{0:04X},".format(primary[i]), end="") 266 | if i % 8 == 7: 267 | print("") 268 | print("0x{0:04X}".format(primary[len(primary) - 1])) 269 | print("};") 270 | print("") 271 | print("#endif /* UNICODE_COMPOSE_H */") 272 | -------------------------------------------------------------------------------- /util/gen-decompose.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2017 Patrick O. Perry. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import operator 18 | import math 19 | 20 | try: 21 | import unicode_data 22 | except ModuleNotFoundError: 23 | from util import unicode_data 24 | 25 | decomp_vals = unicode_data.decomp_vals 26 | decomp_map = unicode_data.decomp_map 27 | decomp = unicode_data.decomp 28 | 29 | def compute_tables(block_size): 30 | nblock = len(decomp) // block_size 31 | stage1 = [None] * nblock 32 | stage2 = [] 33 | stage2_dict = {} 34 | for i in range(nblock): 35 | begin = i * block_size 36 | end = begin + block_size 37 | block = tuple(decomp[begin:end]) 38 | if block in stage2_dict: 39 | j = stage2_dict[block] 40 | else: 41 | j = len(stage2) 42 | stage2_dict[block] = j 43 | stage2.append(block) 44 | stage1[i] = j 45 | return (stage1,stage2) 46 | 47 | 48 | def stage1_item_size(nstage2): 49 | nbyte = math.ceil(math.log(nstage2, 2) / 8) 50 | size = 2**math.ceil(math.log(nbyte, 2)) 51 | return size 52 | 53 | page_size = 4096 54 | block_size = 256 55 | 56 | nbytes = {} 57 | 58 | best_block_size = 1 59 | smallest_size = len(decomp) 60 | 61 | for i in range(1,17): 62 | block_size = 2**i 63 | stage1,stage2 = compute_tables(block_size) 64 | 65 | nbyte1 = len(stage1) * stage1_item_size(len(stage2)) 66 | nbyte2 = len(stage2) * block_size 67 | 68 | nbyte1 = math.ceil(nbyte1 / page_size) * page_size 69 | nbyte2 = math.ceil(nbyte2 / page_size) * page_size 70 | nbyte = nbyte1 + nbyte2 71 | nbytes[block_size] = nbyte 72 | 73 | if nbyte < smallest_size: 74 | smallest_size = nbyte 75 | best_block_size = block_size 76 | 77 | 78 | block_size = best_block_size 79 | stage1,stage2 = compute_tables(block_size) 80 | 81 | type1_size = stage1_item_size(len(stage2)) 82 | if type1_size == 1: 83 | type1 = 'uint8_t' 84 | elif type1_size == 2: 85 | type1 = 'uint16_t' 86 | elif type1_size == 4: 87 | type1 = 'uint32_t' 88 | else: 89 | type1 = 'uint64_t' 90 | 91 | # Write decompose.h to stdout 92 
| 93 | print("/* This file is automatically generated. DO NOT EDIT!") 94 | print(" Instead, edit gen-decompose.py and re-run. */") 95 | print("") 96 | print("/*") 97 | print(" * Decomposition mappings.") 98 | print(" *") 99 | print(" * Defined in UAX #44 \"Unicode Character Database\"") 100 | print(" *") 101 | print(" * http://www.unicode.org/reports/tr44/") 102 | print(" *") 103 | print(" * Section 5.7.3, Table 14.") 104 | print(" *") 105 | print(" *") 106 | print(" * We use a two-stage lookup strategy as described at") 107 | print(" *") 108 | print(" * http://www.strchr.com/multi-stage_tables") 109 | print(" *") 110 | print(" */") 111 | print("") 112 | print("#ifndef UNICODE_DECOMPOSE_H") 113 | print("#define UNICODE_DECOMPOSE_H") 114 | print("") 115 | print("#include ") 116 | print("") 117 | print("/* decomposition_type") 118 | print(" * ------------------") 119 | print(" * compatibility decompositions have decomposition_type != 0") 120 | print(" */") 121 | print("enum decomposition_type {", end="") 122 | first = True 123 | for k,v in sorted(decomp_vals.items(), key=operator.itemgetter(1)): 124 | if not first: 125 | print(",", end="") 126 | print("\n\tDECOMPOSITION_" + k.upper() + " = " + str(v), end="") 127 | first = False 128 | print("\n};") 129 | print("") 130 | print("/* decomposition") 131 | print(" * -------------") 132 | print(" * type: the decomposition_type") 133 | print(" *") 134 | print(" * length: the length (in codepoints) of the decomposition mapping,") 135 | print(" * or 0 if there is no decomposition") 136 | print(" *") 137 | print(" * data: the mapped-to codepoint (length = 1), or") 138 | print(" * an index into the `decomposition_mapping` array, pointing") 139 | print(" * to the first codepoint in the mapping (length > 1)") 140 | print(" */") 141 | print("struct decomposition {") 142 | print("\tint type : 6;") 143 | print("\tunsigned length : 5;") 144 | print("\tunsigned data : 21;") 145 | print("};") 146 | print("") 147 | print("#define 
DECOMPOSITION_BLOCK_SIZE", block_size) 148 | print("") 149 | print("static const " + type1 + " decomposition_stage1[] = {") 150 | for i in range(len(stage1) - 1): 151 | if i % 16 == 0: 152 | print("/* U+{:04X} */".format(i * block_size), end="") 153 | print("{0: >3},".format(stage1[i]), end="") 154 | if i % 16 == 15: 155 | print("") 156 | print("{0: >3}".format(stage1[len(stage1) - 1])) 157 | print("};") 158 | print("") 159 | print("static const struct decomposition decomposition_stage2[][" + 160 | str(block_size) + "] = {") 161 | for i in range(0,len(stage2)): 162 | print(" /* block " + str(i) + " */") 163 | print(" {", end="") 164 | for j in range(block_size): 165 | val = stage2[i][j] 166 | if val is None: 167 | print("{0,0,0}", end="") 168 | else: 169 | if val[0] is None: 170 | t = 0 171 | else: 172 | t = decomp_vals[val[0]] 173 | print("{{{0},{1},0x{2:05X}}}".format(t, val[1], val[2]), end="") 174 | 175 | #print("{0: >3}".format(prop_vals[stage2[i][j]]), end="") 176 | if j + 1 == block_size: 177 | print("\n }", end="") 178 | else: 179 | print(",", end="") 180 | if j % 5 == 4: 181 | print("\n ", end="") 182 | if i + 1 != len(stage2): 183 | print(",\n") 184 | else: 185 | print("") 186 | print("};") 187 | print("") 188 | print("static const int32_t decomposition_mapping[] = {") 189 | for i in range(len(decomp_map) - 1): 190 | if i % 8 == 0: 191 | print("/* 0x{:04X} */ ".format(i), end="") 192 | print("0x{0:04X},".format(decomp_map[i]), end="") 193 | if i % 8 == 7: 194 | print("") 195 | print("0x{0:04X}".format(decomp_map[len(decomp_map) - 1])) 196 | print("};") 197 | print("") 198 | print("#endif /* UNICODE_DECOMPOSE_H */") 199 | -------------------------------------------------------------------------------- /util/gen-emojiprop.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2017 Patrick O. Perry. 
4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import math 18 | 19 | try: 20 | import property 21 | import unicode_data 22 | except ModuleNotFoundError: 23 | from util import property 24 | from util import unicode_data 25 | 26 | EMOJI_DATA = "data/ucd/emoji/emoji-data.txt" 27 | 28 | emoji_props = property.read(EMOJI_DATA, sets=True) 29 | 30 | 31 | props = [0] * len(unicode_data.uchars) 32 | for i, (key, values) in enumerate(emoji_props.items()): 33 | bit = 0x1 << i 34 | for value in values: 35 | props[value] |= bit 36 | 37 | 38 | def compute_tables(block_size): 39 | nblock = len(props) // block_size 40 | stage1 = [None] * nblock 41 | stage2 = [] 42 | stage2_dict = {} 43 | for i in range(nblock): 44 | begin = i * block_size 45 | end = begin + block_size 46 | block = tuple(props[begin:end]) 47 | if block in stage2_dict: 48 | j = stage2_dict[block] 49 | else: 50 | j = len(stage2) 51 | stage2_dict[block] = j 52 | stage2.append(block) 53 | stage1[i] = j 54 | return (stage1,stage2) 55 | 56 | 57 | def stage1_item_size(nstage2): 58 | nbyte = math.ceil(math.log(nstage2, 2) / 8) 59 | size = 2**math.ceil(math.log(nbyte, 2)) 60 | return size 61 | 62 | page_size = 4096 63 | block_size = 256 64 | 65 | nbytes = {} 66 | 67 | best_block_size = 1 68 | smallest_size = len(props) 69 | 70 | for i in range(1,17): 71 | block_size = 2**i 72 | stage1,stage2 = compute_tables(block_size) 73 | 74 | nbyte1 = len(stage1) * 
stage1_item_size(len(stage2)) 75 | nbyte2 = len(stage2) * block_size 76 | 77 | nbyte1 = math.ceil(nbyte1 / page_size) * page_size 78 | nbyte2 = math.ceil(nbyte2 / page_size) * page_size 79 | nbyte = nbyte1 + nbyte2 80 | nbytes[block_size] = nbyte 81 | 82 | if nbyte < smallest_size: 83 | smallest_size = nbyte 84 | best_block_size = block_size 85 | 86 | 87 | block_size = best_block_size 88 | stage1, stage2 = compute_tables(block_size) 89 | 90 | type1_size = stage1_item_size(len(stage2)) 91 | if type1_size == 1: 92 | type1 = 'uint8_t' 93 | elif type1_size == 2: 94 | type1 = 'uint16_t' 95 | elif type1_size == 4: 96 | type1 = 'uint32_t' 97 | else: 98 | type1 = 'uint64_t' 99 | 100 | assert len(emoji_props) <= 8 101 | type2 = 'uint8_t' 102 | 103 | print("/* This file is automatically generated. DO NOT EDIT!") 104 | print(" Instead, edit gen-emoji.py and re-run. */") 105 | print("") 106 | print("/*") 107 | print(" * Unicode Emoji property values.") 108 | print(" *") 109 | print(" * We use the two-stage lookup strategy described at") 110 | print(" *") 111 | print(" * http://www.strchr.com/multi-stage_tables") 112 | print(" *") 113 | print(" */") 114 | print("") 115 | print("#ifndef UNICODE_EMOJIPROP_H") 116 | print("#define UNICODE_EMOJIPROP_H") 117 | print("") 118 | print("#include ") 119 | print("") 120 | print("enum emoji_prop_type {") 121 | print("\tEMOJI_PROP_NONE = 0", end="") 122 | for i, name in enumerate(emoji_props.keys()): 123 | print(",\n\tEMOJI_PROP_" + name.upper() + " = (1 << " + str(i) + ")", 124 | end="") 125 | print("\n};") 126 | print("") 127 | print("static const " + type1 + " emoji_prop_stage1[] = {") 128 | for i in range(len(stage1) - 1): 129 | if i % 16 == 0: 130 | print("/* U+{:04X} */".format(i * block_size), end="") 131 | print("{0: >3},".format(stage1[i]), end="") 132 | if i % 16 == 15: 133 | print("") 134 | print("{0: >3}".format(stage1[len(stage1) - 1])) 135 | print("};") 136 | print("") 137 | print("static const " + type2 + " 
emoji_prop_stage2[][" + 138 | str(block_size) + "] = {") 139 | for i in range(len(stage2)): 140 | print(" /* block " + str(i) + " */") 141 | print(" {", end="") 142 | for j in range(block_size): 143 | print("{0: >3}".format(stage2[i][j]), end="") 144 | if j + 1 == block_size: 145 | print("\n }", end="") 146 | else: 147 | print(",", end="") 148 | if j % 16 == 15: 149 | print("\n ", end="") 150 | if i + 1 != len(stage2): 151 | print(",\n") 152 | else: 153 | print("") 154 | print("};") 155 | 156 | print("") 157 | print("static int emoji_prop(int32_t code)") 158 | print("{") 159 | print("\tconst int32_t block_size = " + str(block_size) + ";") 160 | print("\t" + type1 + " i = emoji_prop_stage1[code / block_size];") 161 | print("\treturn emoji_prop_stage2[i][code % block_size];") 162 | print("}") 163 | print("") 164 | print("#endif /* UNICODE_EMOJIPROP_H */") 165 | -------------------------------------------------------------------------------- /util/gen-graphbreak.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2017 Patrick O. Perry. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | import math 18 | 19 | try: 20 | import property 21 | except ModuleNotFoundError: 22 | from util import property 23 | 24 | DERIVED_CORE_PROPERTIES = "data/ucd/DerivedCoreProperties.txt" 25 | EMOJI_DATA = "data/ucd/emoji/emoji-data.txt" 26 | GRAPHEME_BREAK_PROPERTY = "data/ucd/auxiliary/GraphemeBreakProperty.txt" 27 | 28 | code_props = property.read(GRAPHEME_BREAK_PROPERTY) 29 | emoji_props = property.read(EMOJI_DATA, sets=True) 30 | 31 | derived_core_properties = property.read(DERIVED_CORE_PROPERTIES, sets=True) 32 | #incb_consonant = derived_core_properties['Indic_Conjunct_Break=Consonant'] 33 | #incb_extend = derived_core_properties['Indic_Conjunct_Break=Extend'] 34 | 35 | for i in range(len(code_props)): 36 | if code_props[i] is None: 37 | code_props[i] = 'Other' 38 | elif code_props[i] == 'Extend': 39 | code_props[i] = 'Extend_Other' 40 | 41 | for i in emoji_props['Extended_Pictographic']: 42 | assert code_props[i] == 'Other' 43 | code_props[i] = 'Extended_Pictographic' 44 | 45 | for p,v in ( 46 | ('InCB=Linker', 'Extend_InCB_Linker'), 47 | ('InCB=Extend', 'Extend_InCB_Extend'), 48 | ('InCB=Consonant', 'InCB_Consonant'), 49 | ): 50 | for i in derived_core_properties[p]: 51 | if code_props[i] == 'ZWJ': 52 | assert p == 'InCB=Extend' 53 | continue 54 | elif p in ('InCB=Extend', 'InCB=Linker'): 55 | assert code_props[i] == 'Extend_Other' 56 | else: 57 | assert code_props[i] == 'Other' 58 | code_props[i] = v 59 | 60 | prop_names = set(code_props) 61 | prop_names.remove('Other') 62 | 63 | 64 | prop_vals = {} 65 | prop_vals['Other'] = 0; 66 | 67 | for p in sorted(prop_names): 68 | prop_vals[p] = len(prop_vals) 69 | 70 | 71 | def compute_tables(block_size): 72 | nblock = len(code_props) // block_size 73 | stage1 = [None] * nblock 74 | stage2 = [] 75 | stage2_dict = {} 76 | for i in range(nblock): 77 | begin = i * block_size 78 | end = begin + block_size 79 | block = tuple(code_props[begin:end]) 80 | if block in stage2_dict: 81 | j = stage2_dict[block] 82 | 
else: 83 | j = len(stage2) 84 | stage2_dict[block] = j 85 | stage2.append(block) 86 | stage1[i] = j 87 | return (stage1,stage2) 88 | 89 | 90 | def stage1_item_size(nstage2): 91 | nbyte = math.ceil(math.log(nstage2, 2) / 8) 92 | size = 2**math.ceil(math.log(nbyte, 2)) 93 | return size 94 | 95 | page_size = 4096 96 | block_size = 256 97 | 98 | nbytes = {} 99 | 100 | best_block_size = 1 101 | smallest_size = len(code_props) 102 | 103 | for i in range(1,17): 104 | block_size = 2**i 105 | stage1,stage2 = compute_tables(block_size) 106 | 107 | nbyte1 = len(stage1) * stage1_item_size(len(stage2)) 108 | nbyte2 = len(stage2) * block_size 109 | 110 | nbyte1 = math.ceil(nbyte1 / page_size) * page_size 111 | nbyte2 = math.ceil(nbyte2 / page_size) * page_size 112 | nbyte = nbyte1 + nbyte2 113 | nbytes[block_size] = nbyte 114 | 115 | if nbyte < smallest_size: 116 | smallest_size = nbyte 117 | best_block_size = block_size 118 | 119 | 120 | block_size = best_block_size 121 | stage1,stage2 = compute_tables(block_size) 122 | 123 | type1_size = stage1_item_size(len(stage2)) 124 | 125 | if type1_size == 1: 126 | type1 = 'uint8_t' 127 | elif type1_size == 2: 128 | type1 = 'uint16_t' 129 | elif type1_size == 4: 130 | type1 = 'uint32_t' 131 | else: 132 | type1 = 'uint64_t' 133 | 134 | type2 = 'int8_t' 135 | 136 | 137 | 138 | # Write graphbreak.h to stdout 139 | 140 | print("/* This file is automatically generated. DO NOT EDIT!") 141 | print(" Instead, edit gen-graphbreak.py and re-run. 
*/") 142 | print("") 143 | print("/*") 144 | print(" * Unicode Grapheme_Break property values.") 145 | print(" *") 146 | print(" * Defined in UAX #29 \"Unicode Text Segmentation\"") 147 | print(" *") 148 | print(" * http://www.unicode.org/reports/tr29/") 149 | print(" *") 150 | print(" * Section 4.1, Table 3.") 151 | print(" *") 152 | print(" *") 153 | print(" * We use the two-stage lookup strategy described at") 154 | print(" *") 155 | print(" * http://www.strchr.com/multi-stage_tables") 156 | print(" *") 157 | print(" */") 158 | print("") 159 | print("#ifndef UNICODE_GRAPHBREAK_H") 160 | print("#define UNICODE_GRAPHBREAK_H") 161 | print("") 162 | print("#include ") 163 | print("") 164 | print("enum graph_break_prop {") 165 | print("\tGRAPH_BREAK_OTHER = 0", end="") 166 | for prop in sorted(prop_names): 167 | print(",\n\tGRAPH_BREAK_" + prop.upper() + " = " + str(prop_vals[prop]), 168 | end="") 169 | print("\n};") 170 | print("") 171 | print("static const " + type1 + " graph_break_stage1[] = {") 172 | for i in range(len(stage1) - 1): 173 | if i % 16 == 0: 174 | print("/* U+{:04X} */".format(i * block_size), end="") 175 | print("{0: >3},".format(stage1[i]), end="") 176 | if i % 16 == 15: 177 | print("") 178 | print("{0: >3}".format(stage1[len(stage1) - 1])) 179 | print("};") 180 | print("") 181 | print("static const " + type2 + " graph_break_stage2[][" + 182 | str(block_size) + "] = {") 183 | #for i in range(len(stage2)): 184 | for i in range(0,len(stage2)): 185 | print(" /* block " + str(i) + " */") 186 | print(" {", end="") 187 | for j in range(block_size): 188 | print("{0: >3}".format(prop_vals[stage2[i][j]]), end="") 189 | if j + 1 == block_size: 190 | print("\n }", end="") 191 | else: 192 | print(",", end="") 193 | if j % 16 == 15: 194 | print("\n ", end="") 195 | if i + 1 != len(stage2): 196 | print(",\n") 197 | else: 198 | print("") 199 | print("};") 200 | 201 | print("") 202 | print("static int graph_break(int32_t code)") 203 | print("{") 204 | 
print("\tconst int32_t block_size = " + str(block_size) + ";") 205 | print("\t" + type1 + " i = graph_break_stage1[code / block_size];") 206 | print("\treturn graph_break_stage2[i][code % block_size];") 207 | print("}") 208 | print("") 209 | print("#endif /* UNICODE_GRAPHBREAK_H */") 210 | -------------------------------------------------------------------------------- /util/gen-normalization.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2017 Patrick O. Perry. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import math 18 | import re 19 | 20 | PATTERN = re.compile(r"""^([0-9A-Fa-f]+) # (first code) 21 | (\.\.([0-9A-Fa-f]+))? # (.. last code)? 
22 | \s* 23 | ; # ; 24 | \s* 25 | (\w+) # (property name) 26 | \s* 27 | ; # ; 28 | \s* 29 | (\w+) # (property value) 30 | \s* 31 | (\#.*)?$ # (# comment)?""", re.X) 32 | 33 | UNICODE_MAX = 0x10FFFF 34 | 35 | DERIVED_NORMALIZATION_PROPS = "data/ucd/DerivedNormalizationProps.txt" 36 | try: 37 | infile = open(DERIVED_NORMALIZATION_PROPS) 38 | except FileNotFoundError: 39 | infile = open("../" + DERIVED_NORMALIZATION_PROPS, "r") 40 | 41 | code_props = ['Yes'] * (UNICODE_MAX + 1) 42 | 43 | for line in infile: 44 | m = PATTERN.match(line) 45 | if m is None or m.group(4) != 'NFC_QC': 46 | continue 47 | begin = int(m.group(1), 16) 48 | if m.group(3) is None: 49 | end = begin + 1 50 | else: 51 | end = int(m.group(3), 16) + 1 52 | prop = m.group(5) 53 | if prop == 'M': 54 | prop = 'Maybe' 55 | elif prop == 'N': 56 | prop = 'No' 57 | elif prop == 'Y': 58 | prop = 'Yes' 59 | for code in range(begin, end): 60 | code_props[code] = prop 61 | #print(line, end = "") 62 | #print('[', '{:04X}'.format(begin), ',', '{:04X}'.format(end), '): ', 63 | # prop, sep = '') 64 | infile.close() 65 | 66 | 67 | prop_names = set(code_props) 68 | 69 | prop_vals = {} 70 | prop_vals['No'] = 0 71 | prop_vals['Yes'] = 1 72 | prop_vals['Maybe'] = -1 73 | 74 | def compute_tables(block_size): 75 | nblock = len(code_props) // block_size 76 | stage1 = [None] * nblock 77 | stage2 = [] 78 | stage2_dict = {} 79 | for i in range(nblock): 80 | begin = i * block_size 81 | end = begin + block_size 82 | block = tuple(code_props[begin:end]) 83 | if block in stage2_dict: 84 | j = stage2_dict[block] 85 | else: 86 | j = len(stage2) 87 | stage2_dict[block] = j 88 | stage2.append(block) 89 | stage1[i] = j 90 | return (stage1,stage2) 91 | 92 | def stage1_item_size(nstage2): 93 | nbyte = math.ceil(math.log(nstage2, 2) / 8) 94 | size = 2**math.ceil(math.log(nbyte, 2)) 95 | return size 96 | 97 | page_size = 4096 98 | block_size = 256 99 | 100 | nbytes = {} 101 | 102 | best_block_size = 1 103 | smallest_size = len(code_props) 
104 | 105 | for i in range(1,17): 106 | block_size = 2**i 107 | stage1,stage2 = compute_tables(block_size) 108 | # 109 | nbyte1 = len(stage1) * stage1_item_size(len(stage2)) 110 | nbyte2 = len(stage2) * block_size 111 | # 112 | nbyte1 = math.ceil(nbyte1 / page_size) * page_size 113 | nbyte2 = math.ceil(nbyte2 / page_size) * page_size 114 | nbyte = nbyte1 + nbyte2 115 | nbytes[block_size] = nbyte 116 | # 117 | if nbyte < smallest_size: 118 | smallest_size = nbyte 119 | best_block_size = block_size 120 | 121 | 122 | block_size = best_block_size 123 | stage1,stage2 = compute_tables(block_size) 124 | 125 | type1_size = stage1_item_size(len(stage2)) 126 | 127 | if type1_size == 1: 128 | type1 = 'uint8_t' 129 | elif type1_size == 2: 130 | type1 = 'uint16_t' 131 | elif type1_size == 4: 132 | type1 = 'uint32_t' 133 | else: 134 | type1 = 'uint64_t' 135 | 136 | type2 = 'int8_t' 137 | 138 | 139 | 140 | # Write normalizationprop.h to stdout 141 | 142 | print("/* This file is automatically generated. DO NOT EDIT!") 143 | print(" Instead, edit gen-normalization.py and re-run. 
*/") 144 | print("") 145 | print("/*") 146 | print(" * Unicode NFC_QC property values.") 147 | print(" *") 148 | print(" * Defined in UAX #15 \"Unicode Normalization Forms\"") 149 | print(" *") 150 | print(" * http://www.unicode.org/reports/tr15/") 151 | print(" *") 152 | print(" * Section 9, \"Detecting Normalization Forms.\"") 153 | print(" *") 154 | print(" *") 155 | print(" * We use the two-stage lookup strategy described at") 156 | print(" *") 157 | print(" * http://www.strchr.com/multi-stage_tables") 158 | print(" *") 159 | print(" */") 160 | print("") 161 | print("#ifndef NORMALIZATIONPROP_H") 162 | print("#define NORMALIZATIONPROP_H") 163 | print("") 164 | print("#include ") 165 | print("") 166 | print("enum nfc_qc_prop {") 167 | for i in range(len(prop_names)): 168 | prop = sorted(prop_names)[i] 169 | if i > 0: 170 | print(",") 171 | print("\tNFC_QC_" + prop.upper() + " = " + str(prop_vals[prop]), end="") 172 | print("\n};") 173 | print("") 174 | print("static const " + type1 + " nfc_qc_stage1[] = {") 175 | for i in range(len(stage1) - 1): 176 | if i % 16 == 0: 177 | print("/* U+{:04X} */".format(i * block_size), end="") 178 | print("{0: >3},".format(stage1[i]), end="") 179 | if i % 16 == 15: 180 | print("") 181 | print("{0: >3}".format(stage1[len(stage1) - 1])) 182 | print("};") 183 | print("") 184 | print("static const " + type2 + " nfc_qc_stage2[][" + 185 | str(block_size) + "] = {") 186 | for i in range(len(stage2)): 187 | print(" /* block " + str(i) + " */") 188 | print(" {", end="") 189 | for j in range(block_size): 190 | print("{0: >3}".format(prop_vals[stage2[i][j]]), end="") 191 | if j + 1 == block_size: 192 | print("\n }", end="") 193 | else: 194 | print(",", end="") 195 | if j % 16 == 15: 196 | print("\n ", end="") 197 | if i + 1 != len(stage2): 198 | print(",\n") 199 | else: 200 | print("") 201 | print("};") 202 | 203 | print("") 204 | print("static int nfc_qc(uint32_t code)") 205 | print("{") 206 | print("\tconst uint32_t block_size = " + 
str(block_size) + ";") 207 | print("\t" + type1 + " i = nfc_qc_stage1[code / block_size];") 208 | print("\treturn nfc_qc_stage2[i][code % block_size];") 209 | print("}") 210 | print("") 211 | print("#endif /* NORMALIZATIONPROP_H */") 212 | -------------------------------------------------------------------------------- /util/gen-wordbreak.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2016 Patrick O. Perry. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
import math

try:
    import property
    import unicode_data
except ModuleNotFoundError:
    from util import property
    from util import unicode_data


WORD_BREAK_PROPERTY = "data/ucd/auxiliary/WordBreakProperty.txt"
code_props = property.read(WORD_BREAK_PROPERTY)

# Collect the distinct Word_Break values; unassigned codes become 'Other'.
prop_names = set(code_props)
prop_names.remove(None)

prop_names.add('Other')
for code in range(len(code_props)):
    if code_props[code] is None:
        code_props[code] = 'Other'

# 'None' gets value 0; the real property values follow in sorted order.
prop_vals = {'None': 0}
for p in sorted(prop_names):
    prop_vals[p] = len(prop_vals)


def compute_tables(block_size):
    """Split code_props into a two-stage lookup table with stage-2 blocks
    of the given size; identical blocks are stored once and shared."""
    nblock = len(code_props) // block_size
    stage1 = []
    stage2 = []
    seen = {}
    for blk in range(nblock):
        lo = blk * block_size
        block = tuple(code_props[lo:lo + block_size])
        index = seen.get(block)
        if index is None:
            index = len(stage2)
            seen[block] = index
            stage2.append(block)
        stage1.append(index)
    return (stage1, stage2)


def stage1_item_size(nstage2):
    """Smallest power-of-two byte width that can index nstage2 blocks."""
    nbyte = math.ceil(math.log(nstage2, 2) / 8)
    return 2**math.ceil(math.log(nbyte, 2))

page_size = 4096
block_size = 256

nbytes = {}

# Try every power-of-two block size and keep the one whose page-rounded
# stage-1 + stage-2 footprint is smallest.
best_block_size = 1
smallest_size = len(code_props)

for i in range(1, 17):
    block_size = 2**i
    stage1, stage2 = compute_tables(block_size)

    nbyte1 = len(stage1) * stage1_item_size(len(stage2))
    nbyte2 = len(stage2) * block_size

    nbyte1 = math.ceil(nbyte1 / page_size) * page_size
    nbyte2 = math.ceil(nbyte2 / page_size) * page_size
    nbyte = nbyte1 + nbyte2
    nbytes[block_size] = nbyte

    if nbyte < smallest_size:
        smallest_size = nbyte
        best_block_size = block_size


block_size = best_block_size
stage1, stage2 = compute_tables(block_size)
96 | type1_size = stage1_item_size(len(stage2)) 97 | 98 | if type1_size == 1: 99 | type1 = 'uint8_t' 100 | elif type1_size == 2: 101 | type1 = 'uint16_t' 102 | elif type1_size == 4: 103 | type1 = 'uint32_t' 104 | else: 105 | type1 = 'uint64_t' 106 | 107 | type2 = 'int8_t' 108 | 109 | 110 | # Write wordbreakprop.h to stdout 111 | 112 | print("/* This file is automatically generated. DO NOT EDIT!") 113 | print(" Instead, edit gen-wordbreak.py and re-run. */") 114 | print("") 115 | print("/*") 116 | print(" * Unicode Word_Break property values.") 117 | print(" *") 118 | print(" * Defined in UAX #29 \"Unicode Text Segmentation\"") 119 | print(" *") 120 | print(" * http://www.unicode.org/reports/tr29/") 121 | print(" *") 122 | print(" * Section 4.1, Table 3.") 123 | print(" *") 124 | print(" *") 125 | print(" * We use the two-stage lookup strategy described at") 126 | print(" *") 127 | print(" * http://www.strchr.com/multi-stage_tables") 128 | print(" *") 129 | print(" */") 130 | print("") 131 | print("#ifndef WORDBREAKPROP_H") 132 | print("#define WORDBREAKPROP_H") 133 | print("") 134 | print("#include ") 135 | print("") 136 | print("enum word_break_prop {") 137 | print("\tWORD_BREAK_NONE = 0", end="") 138 | for prop in sorted(prop_names): 139 | print(",\n\tWORD_BREAK_" + prop.upper() + " = " + str(prop_vals[prop]), 140 | end="") 141 | print("\n};") 142 | print("") 143 | print("static const " + type1 + " word_break_stage1[] = {") 144 | for i in range(len(stage1) - 1): 145 | if i % 16 == 0: 146 | print("/* U+{:04X} */".format(i * block_size), end="") 147 | print("{0: >3},".format(stage1[i]), end="") 148 | if i % 16 == 15: 149 | print("") 150 | print("{0: >3}".format(stage1[len(stage1) - 1])) 151 | print("};") 152 | print("") 153 | print("static const " + type2 + " word_break_stage2[][" + 154 | str(block_size) + "] = {") 155 | #for i in range(len(stage2)): 156 | for i in range(0,len(stage2)): 157 | print(" /* block " + str(i) + " */") 158 | print(" {", end="") 159 | for 
j in range(block_size): 160 | print("{0: >3}".format(prop_vals[stage2[i][j]]), end="") 161 | if j + 1 == block_size: 162 | print("\n }", end="") 163 | else: 164 | print(",", end="") 165 | if j % 16 == 15: 166 | print("\n ", end="") 167 | if i + 1 != len(stage2): 168 | print(",\n") 169 | else: 170 | print("") 171 | print("};") 172 | 173 | print("") 174 | print("static int word_break(int32_t code)") 175 | print("{") 176 | print("\tconst int32_t block_size = " + str(block_size) + ";") 177 | print("\t" + type1 + " i = word_break_stage1[code / block_size];") 178 | print("\treturn word_break_stage2[i][code % block_size];") 179 | print("}") 180 | print("") 181 | print("#endif /* WORDBREAKPROP_H */") 182 | -------------------------------------------------------------------------------- /util/property.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | 4 | PATTERN = re.compile(r"""^([0-9A-Fa-f]+) # (first code) 5 | (\.\.([0-9A-Fa-f]+))? # (.. last code)? 6 | \s* 7 | ; # ; 8 | \s* 9 | (\w+) # (prop name) 10 | \s* 11 | (;\s*(\w+))? 
# (; (prop value)) 12 | \s* 13 | (\#.*)?$ # (# comment)?""", re.X) 14 | 15 | FIRST_CODE = 1 16 | LAST_CODE = 3 17 | PROP_NAME = 4 18 | PROP_VALUE = 6 19 | 20 | UNICODE_MAX = 0x10FFFF 21 | 22 | 23 | def read(filename, sets=False): 24 | try: 25 | file = open(filename, "r") 26 | except FileNotFoundError: 27 | file = open("../" + filename, "r") 28 | 29 | code_props = [None] * (UNICODE_MAX + 1) 30 | prop_names = set() 31 | properties = {} 32 | 33 | with file: 34 | for line in file: 35 | line = line.split("#")[0] # remove comment 36 | m = PATTERN.match(line) 37 | if m: 38 | first = int(m.group(FIRST_CODE), 16) 39 | if m.group(LAST_CODE): 40 | last = int(m.group(LAST_CODE), 16) 41 | else: 42 | last = first 43 | name = m.group(PROP_NAME) 44 | val = m.group(PROP_VALUE) 45 | if val != None: 46 | name = name + '=' + val 47 | if not name in properties: 48 | properties[name] = set() 49 | prop = properties[name] 50 | for u in range(first, last + 1): 51 | if not sets: 52 | assert code_props[u] is None 53 | code_props[u] = name 54 | prop.add(u) 55 | prop_names.add(name) 56 | if sets: 57 | return properties 58 | else: 59 | return code_props 60 | -------------------------------------------------------------------------------- /util/table-graphbreak.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "../src/private/charwidth.h" 5 | #include "../src/private/graphbreak.h" 6 | #include "../src/utf8lite.h" 7 | 8 | const char *str_charwidth(enum charwidth_prop prop) 9 | { 10 | switch (prop) { 11 | case CHARWIDTH_OTHER: 12 | return "Other"; 13 | case CHARWIDTH_EMOJI: 14 | return "Emoji"; 15 | case CHARWIDTH_AMBIGUOUS: 16 | return "Ambiguous"; 17 | case CHARWIDTH_IGNORABLE: 18 | return "Ignorable"; 19 | case CHARWIDTH_NONE: 20 | return "None"; 21 | case CHARWIDTH_NARROW: 22 | return "Narrow"; 23 | case CHARWIDTH_WIDE: 24 | return "Wide"; 25 | default: 26 | assert(0 && "Unrecognized charwidth property"); 27 
| } 28 | } 29 | 30 | const char *str_graph_break(enum graph_break_prop prop) 31 | { 32 | switch (prop) { 33 | case GRAPH_BREAK_OTHER: 34 | return "None"; 35 | case GRAPH_BREAK_CR: 36 | return "CR"; 37 | case GRAPH_BREAK_CONTROL: 38 | return "Control"; 39 | case GRAPH_BREAK_E_BASE: 40 | return "EBase"; 41 | case GRAPH_BREAK_E_BASE_GAZ: 42 | return "EBaseGAZ"; 43 | case GRAPH_BREAK_E_MODIFIER: 44 | return "EModifier"; 45 | case GRAPH_BREAK_EXTEND: 46 | return "Extend"; 47 | case GRAPH_BREAK_GLUE_AFTER_ZWJ: 48 | return "GlueAfterZWJ"; 49 | case GRAPH_BREAK_L: 50 | return "L"; 51 | case GRAPH_BREAK_LF: 52 | return "LF"; 53 | case GRAPH_BREAK_LV: 54 | return "LV"; 55 | case GRAPH_BREAK_LVT: 56 | return "LVT"; 57 | case GRAPH_BREAK_PREPEND: 58 | return "Prepend"; 59 | case GRAPH_BREAK_REGIONAL_INDICATOR: 60 | return "RegionalIndicator"; 61 | case GRAPH_BREAK_SPACINGMARK: 62 | return "SpacingMark"; 63 | case GRAPH_BREAK_T: 64 | return "T"; 65 | case GRAPH_BREAK_V: 66 | return "V"; 67 | case GRAPH_BREAK_ZWJ: 68 | return "ZWJ"; 69 | default: 70 | assert(0 && "Unrecognized graph break property"); 71 | } 72 | } 73 | 74 | int main(int argc, const char **argv) 75 | { 76 | int32_t i, n = UTF8LITE_CODE_MAX; 77 | int cw, gb; 78 | 79 | printf("code,width,graph\n"); 80 | for (i = 0; i <= n; i++) { 81 | cw = charwidth(i); 82 | gb = graph_break(i); 83 | printf("U+%04"PRIX32",%s,%s\n", (uint32_t)i, 84 | str_charwidth(cw), str_graph_break(gb)); 85 | } 86 | 87 | return 0; 88 | } 89 | -------------------------------------------------------------------------------- /util/unicode_data.py: -------------------------------------------------------------------------------- 1 | # unicode_data 2 | 3 | import collections 4 | import re 5 | 6 | 7 | UNICODE_DATA = 'data/ucd/UnicodeData.txt' 8 | UNICODE_MAX = 0x10FFFF 9 | 10 | try: 11 | unicode_data = open(UNICODE_DATA, "r") 12 | except FileNotFoundError: 13 | unicode_data = open("../" + UNICODE_DATA, "r") 14 | 15 | field_names = [ 16 | 'name', # Name 
17 | 'category', # General_Category 18 | 'ccc', # Canonical_Combining_Class 19 | 'bidi', # Bidi_Class 20 | 'decomp', # Decomposition_Type, Decomposition_Mapping 21 | 'decimal', # Numeric_Value (Numeric_Type = Decimal) 22 | 'digit', # Numeric_Value (Numeric_Type = Decimal, Digit) 23 | 'numeric', # Numeric_Value (Numeric_Type = Decimal, Digit, Numeric) 24 | 'mirrored', # Bidi_Mirrored 25 | 'old_name', # Unicode_1_Name 26 | 'comment', # ISO_Comment 27 | 'ucase', # Simple_Uppercase_Mapping 28 | 'lcase', # Simple_Lowercase_Mapping 29 | 'tcase' # Simple_Titlecase_Mapping 30 | ] 31 | 32 | UChar = collections.namedtuple('UChar', field_names) 33 | ids = UChar._make(range(1, len(field_names) + 1)) 34 | 35 | Decomp = collections.namedtuple('Decomp', ['type', 'map']) 36 | 37 | DECOMP_PATTERN = re.compile(r"""^(<(\w+)>)?\s* # decomposition type 38 | ((\s*[0-9A-Fa-f]+)+) # decomposition mapping 39 | \s*$""", re.X) 40 | RANGE_PATTERN = re.compile(r"""^<([^,]+),\s* # range name 41 | (First|Last) # first or last 42 | >$""", re.X) 43 | 44 | def parse_decomp(code, field): 45 | if field != '': 46 | m = DECOMP_PATTERN.match(field) 47 | assert m 48 | d_type = m.group(2) 49 | d_map = tuple([int(x, 16) for x in m.group(3).split()]) 50 | return Decomp(type=d_type, map=d_map) 51 | 52 | elif code in range(0xAC00, 0xD7A4): 53 | return Decomp(type='hangul', map=None) 54 | else: 55 | return None 56 | 57 | 58 | def parse_code(field): 59 | if field != '': 60 | return int(field, 16) 61 | else: 62 | return None 63 | 64 | def parse_int(field): 65 | if field != '': 66 | return int(field) 67 | else: 68 | return None 69 | 70 | def parse_str(field): 71 | if field == '': 72 | return None 73 | else: 74 | return field 75 | 76 | 77 | uchars = [None] * (UNICODE_MAX + 1) 78 | 79 | with unicode_data: 80 | for line in unicode_data: 81 | fields = line.strip().split(';') 82 | code = int(fields[0], 16) 83 | uchars[code] = UChar(name = fields[ids.name], 84 | category = parse_str(fields[ids.category]), 85 | ccc = 
parse_int(fields[ids.ccc]), 86 | bidi = fields[ids.bidi], 87 | decomp = parse_decomp(code, fields[ids.decomp]), 88 | decimal = fields[ids.decimal], 89 | digit = fields[ids.digit], 90 | numeric = fields[ids.numeric], 91 | mirrored = fields[ids.mirrored], 92 | old_name = fields[ids.old_name], 93 | comment = fields[ids.comment], 94 | ucase = parse_code(fields[ids.ucase]), 95 | lcase = parse_code(fields[ids.lcase]), 96 | tcase = parse_code(fields[ids.tcase])) 97 | 98 | 99 | utype = None 100 | 101 | for code in range(len(uchars)): 102 | u = uchars[code] 103 | if u is None: 104 | uchars[code] = utype 105 | else: 106 | m = RANGE_PATTERN.match(u.name) 107 | if m: 108 | if m.group(2) == 'First': 109 | utype = u._replace(name = '<' + m.group(1) + '>') 110 | else: 111 | utype = None 112 | 113 | 114 | 115 | decomp_vals = { 116 | 'hangul': -1, 'none': 0, 117 | 'font': 1, 'noBreak': 2, 'initial': 3, 'medial': 4, 'final': 5, 118 | 'isolated': 6, 'circle': 7, 'super': 8, 'sub': 9, 'vertical': 10, 119 | 'wide': 11, 'narrow': 12, 'small': 13, 'square': 14, 'fraction': 15, 120 | 'compat': 16 } 121 | 122 | decomp_map = [] 123 | decomp = [] 124 | 125 | for code in range(len(uchars)): 126 | u = uchars[code] 127 | 128 | if u is None or u.decomp is None: 129 | decomp.append(None) 130 | continue 131 | 132 | d = u.decomp 133 | if d.map is not None: 134 | d_len = len(d.map) 135 | 136 | if d_len > 1: 137 | d_data = len(decomp_map) 138 | decomp_map.extend(d.map) 139 | else: 140 | d_data = d.map[0] 141 | 142 | decomp.append((d.type, d_len, d_data)) 143 | 144 | elif d.type == 'hangul': 145 | decomp.append(('hangul', 2, 0)) 146 | 147 | else: 148 | decomp.append(None) 149 | 150 | 151 | # From Unicode-8.0 Section 3.12 Conjoining Jamo Behavior 152 | HANGUL_SBASE = 0xAC00 153 | HANGUL_LBASE = 0x1100 154 | HANGUL_VBASE = 0x1161 155 | HANGUL_TBASE = 0x11A7 156 | HANGUL_LCOUNT = 19 157 | HANGUL_VCOUNT = 21 158 | HANGUL_TCOUNT = 28 159 | HANGUL_NCOUNT = (HANGUL_VCOUNT * HANGUL_TCOUNT) 160 | HANTUL_SCOUNT 
= (HANGUL_LCOUNT * HANGUL_NCOUNT) 161 | 162 | 163 | def hangul_decompose(code): 164 | sindex = code - HANGUL_SBASE 165 | lindex = sindex // HANGUL_NCOUNT 166 | vindex = (sindex % HANGUL_NCOUNT) // HANGUL_TCOUNT 167 | tindex = sindex % HANGUL_TCOUNT 168 | lpart = HANGUL_LBASE + lindex 169 | vpart = HANGUL_VBASE + vindex; 170 | tpart = HANGUL_TBASE + tindex; 171 | if tindex > 0: 172 | return (lpart, vpart, tpart) 173 | else: 174 | return (lpart, vpart) 175 | 176 | 177 | # get a character's decomposition if one exists, as a tuple 178 | def decompose(code, compat=True): 179 | dc = decomp[code] 180 | if dc is None: 181 | return None 182 | t = dc[0] 183 | l = dc[1] 184 | if not compat and t is not None and t != 'hangul': 185 | return None 186 | if l == 1: 187 | return (dc[2],) 188 | elif dc[0] != 'hangul': 189 | o = dc[2] 190 | return tuple(decomp_map[o:o + l]) 191 | else: 192 | return hangul_decompose(code) 193 | --------------------------------------------------------------------------------