├── .gitignore ├── .travis.yml ├── Doxyfile ├── LICENSE ├── LICENSE.Unicode ├── Makefile ├── README.md ├── data └── ucd │ ├── CaseFolding.txt │ ├── CompositionExclusions.txt │ ├── DerivedCoreProperties.txt │ ├── EastAsianWidth.txt │ ├── NormalizationTest.txt │ ├── PropList.txt │ ├── UnicodeData.txt │ ├── auxiliary │ ├── GraphemeBreakProperty.txt │ ├── GraphemeBreakTest.txt │ ├── WordBreakProperty.txt │ └── WordBreakTest.txt │ └── emoji │ └── emoji-data.txt ├── src ├── array.c ├── char.c ├── encode.c ├── error.c ├── escape.c ├── graph.c ├── graphscan.c ├── normalize.c ├── private │ ├── array.h │ ├── casefold.h │ ├── charwidth.h │ ├── combining.h │ ├── compose.h │ ├── decompose.h │ ├── emojiprop.h │ ├── graphbreak.h │ └── wordbreak.h ├── render.c ├── text.c ├── textassign.c ├── textiter.c ├── textmap.c ├── utf8lite.h └── wordscan.c ├── tests ├── check_charwidth.c ├── check_graphscan.c ├── check_render.c ├── check_text.c ├── check_textmap.c ├── check_unicode.c ├── check_wordscan.c ├── testutil.c ├── testutil.h └── wcwidth9 │ ├── LICENSE │ └── wcwidth9.h ├── utf8lite.xcodeproj └── project.pbxproj └── util ├── compute-typelen.py ├── gen-casefold.py ├── gen-charwidth.py ├── gen-combining.py ├── gen-compose.py ├── gen-decompose.py ├── gen-emojiprop.py ├── gen-graphbreak.py ├── gen-normalization.py ├── gen-wordbreak.py ├── property.py ├── table-graphbreak.c └── unicode_data.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.a 2 | *.o 3 | __pycache__ 4 | a.out 5 | project.xcworkspace 6 | xcuserdata 7 | /doc/html 8 | /tests/check_charwidth 9 | /tests/check_graphscan 10 | /tests/check_render 11 | /tests/check_text 12 | /tests/check_textmap 13 | /tests/check_unicode 14 | /tests/check_wordscan 15 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c 2 | 3 | before_install: 4 | - sudo 
apt-get -qq update 5 | - sudo apt-get install -y check 6 | - export CFLAGS="-coverage -O0" 7 | - export LDFLAGS="-coverage" 8 | 9 | install: 10 | - make 11 | 12 | script: 13 | - make check || exit 1 14 | - gcov -o src src/*.c 15 | 16 | after_success: 17 | - bash <(curl -s https://codecov.io/bash) -X gcov 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 
30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 
62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /LICENSE.Unicode: -------------------------------------------------------------------------------- 1 | UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE 2 | 3 | Unicode Data Files include all data files under the directories 4 | http://www.unicode.org/Public/, http://www.unicode.org/reports/, 5 | http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and 6 | http://www.unicode.org/utility/trac/browser/. 7 | 8 | Unicode Data Files do not include PDF online code charts under the 9 | directory http://www.unicode.org/Public/. 10 | 11 | Software includes any source code published in the Unicode Standard 12 | or under the directories 13 | http://www.unicode.org/Public/, http://www.unicode.org/reports/, 14 | http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and 15 | http://www.unicode.org/utility/trac/browser/. 16 | 17 | NOTICE TO USER: Carefully read the following legal agreement. 
18 | BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S 19 | DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), 20 | YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE 21 | TERMS AND CONDITIONS OF THIS AGREEMENT. 22 | IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE 23 | THE DATA FILES OR SOFTWARE. 24 | 25 | COPYRIGHT AND PERMISSION NOTICE 26 | 27 | Copyright (c) 1991-2017 Unicode, Inc. All rights reserved. 28 | Distributed under the Terms of Use in http://www.unicode.org/copyright.html. 29 | 30 | Permission is hereby granted, free of charge, to any person obtaining 31 | a copy of the Unicode data files and any associated documentation 32 | (the "Data Files") or Unicode software and any associated documentation 33 | (the "Software") to deal in the Data Files or Software 34 | without restriction, including without limitation the rights to use, 35 | copy, modify, merge, publish, distribute, and/or sell copies of 36 | the Data Files or Software, and to permit persons to whom the Data Files 37 | or Software are furnished to do so, provided that either 38 | (a) this copyright and permission notice appear with all copies 39 | of the Data Files or Software, or 40 | (b) this copyright and permission notice appear in associated 41 | Documentation. 42 | 43 | THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF 44 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 45 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 46 | NONINFRINGEMENT OF THIRD PARTY RIGHTS. 47 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS 48 | NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL 49 | DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, 50 | DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER 51 | TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 52 | PERFORMANCE OF THE DATA FILES OR SOFTWARE. 
53 | 54 | Except as contained in this notice, the name of a copyright holder 55 | shall not be used in advertising or otherwise to promote the sale, 56 | use or other dealings in these Data Files or Software without prior 57 | written authorization of the copyright holder. 58 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CC += -std=c99 2 | 3 | CFLAGS += -Wall -Wextra -pedantic -Werror \ 4 | -Wno-cast-qual \ 5 | -Wno-padded \ 6 | -Wno-unused-macros \ 7 | -g 8 | 9 | #LDFLAGS += 10 | LIBS += -lm 11 | AR = ar rcu 12 | RANLIB = ranlib 13 | MKDIR_P = mkdir -p 14 | CURL = curl 15 | 16 | LIB_CFLAGS = \ 17 | -Wno-cast-align \ 18 | -Wno-cast-qual \ 19 | -Wno-float-equal \ 20 | -Wno-missing-prototypes \ 21 | -Wno-sign-conversion \ 22 | -Wno-unreachable-code-break 23 | 24 | TEST_CFLAGS = $(shell pkg-config --cflags check) \ 25 | -Wno-double-promotion \ 26 | -Wno-float-equal \ 27 | -Wno-gnu-zero-variadic-macro-arguments \ 28 | -Wno-missing-prototypes \ 29 | -Wno-missing-variable-declarations \ 30 | -Wno-reserved-id-macro \ 31 | -Wno-strict-prototypes \ 32 | -Wno-used-but-marked-unused 33 | 34 | TEST_LIBS = $(shell pkg-config --libs check) 35 | 36 | UNICODE = http://www.unicode.org/Public/15.1.0 37 | 38 | UTF8LITE_A = libutf8lite.a 39 | LIB_O = src/array.o src/char.o src/encode.o src/error.o src/escape.o \ 40 | src/graph.o src/graphscan.o src/normalize.o src/render.o src/text.o \ 41 | src/textassign.o src/textiter.o src/textmap.o src/wordscan.o 42 | 43 | DATA = data/ucd/emoji/emoji-data.txt \ 44 | data/ucd/CaseFolding.txt \ 45 | data/ucd/CompositionExclusions.txt \ 46 | data/ucd/DerivedCoreProperties.txt \ 47 | data/ucd/EastAsianWidth.txt \ 48 | data/ucd/PropList.txt \ 49 | data/ucd/Scripts.txt \ 50 | data/ucd/UnicodeData.txt \ 51 | data/ucd/auxiliary/GraphemeBreakProperty.txt \ 52 | data/ucd/auxiliary/WordBreakProperty.txt 53 | 54 | TESTS_T = 
tests/check_charwidth tests/check_graphscan tests/check_render \ 55 | tests/check_text tests/check_textmap tests/check_unicode \ 56 | tests/check_wordscan 57 | TESTS_O = tests/check_charwidth.o tests/check_graphscan.o tests/check_render.o \ 58 | tests/check_text.o tests/check_textmap.o tests/check_unicode.o \ 59 | tests/check_wordscan.o tests/testutil.o 60 | 61 | TESTS_DATA = data/ucd/NormalizationTest.txt \ 62 | data/ucd/auxiliary/GraphemeBreakTest.txt \ 63 | data/ucd/auxiliary/WordBreakTest.txt 64 | 65 | ALL_O = $(LIB_O) $(UTF8LITE_O) $(STEMMER_O) 66 | ALL_T = $(UTF8LITE_A) $(UTF8LITE_T) 67 | ALL_A = $(UTF8LITE_A) 68 | 69 | 70 | # Products 71 | 72 | all: $(ALL_T) 73 | 74 | $(UTF8LITE_A): $(LIB_O) $(STEMMER_O) 75 | $(AR) $@ $(LIB_O) $(STEMMER_O) 76 | $(RANLIB) $@ 77 | 78 | $(UTF8LITE_T): $(UTF8LITE_O) $(UTF8LITE_A) 79 | $(CC) -o $@ $(UTF8LITE_O) $(UTF8LITE_A) $(LIBS) $(LDFLAGS) 80 | 81 | 82 | # Data 83 | 84 | data/ucd/emoji/emoji-data.txt: 85 | $(MKDIR_P) data/ucd/emoji 86 | $(CURL) -o $@ $(UNICODE)/ucd/emoji/emoji-data.txt 87 | 88 | data/ucd/CaseFolding.txt: 89 | $(MKDIR_P) data/ucd 90 | $(CURL) -o $@ $(UNICODE)/ucd/CaseFolding.txt 91 | 92 | data/ucd/CompositionExclusions.txt: 93 | $(MKDIR_P) data/ucd 94 | $(CURL) -o $@ $(UNICODE)/ucd/CompositionExclusions.txt 95 | 96 | data/ucd/DerivedCoreProperties.txt: 97 | $(MKDIR_P) data/ucd 98 | $(CURL) -o $@ $(UNICODE)/ucd/DerivedCoreProperties.txt 99 | 100 | data/ucd/EastAsianWidth.txt: 101 | $(MKDIR_P) data/ucd 102 | $(CURL) -o $@ $(UNICODE)/ucd/EastAsianWidth.txt 103 | 104 | data/ucd/PropList.txt: 105 | $(MKDIR_P) data/ucd 106 | $(CURL) -o $@ $(UNICODE)/ucd/PropList.txt 107 | 108 | data/ucd/Scripts.txt: 109 | $(MKDIR_P) data/ucd 110 | $(CURL) -o $@ $(UNICODE)/ucd/Scripts.txt 111 | 112 | data/ucd/NormalizationTest.txt: 113 | $(MKDIR_P) data/ucd 114 | $(CURL) -o $@ $(UNICODE)/ucd/NormalizationTest.txt 115 | 116 | data/ucd/UnicodeData.txt: 117 | $(MKDIR_P) data/ucd 118 | $(CURL) -o $@ $(UNICODE)/ucd/UnicodeData.txt 119 | 
120 | data/ucd/auxiliary/GraphemeBreakProperty.txt: 121 | $(MKDIR_P) data/ucd/auxiliary 122 | $(CURL) -o $@ $(UNICODE)/ucd/auxiliary/GraphemeBreakProperty.txt 123 | 124 | data/ucd/auxiliary/GraphemeBreakTest.txt: 125 | $(MKDIR_P) data/ucd/auxiliary 126 | $(CURL) -o $@ $(UNICODE)/ucd/auxiliary/GraphemeBreakTest.txt 127 | 128 | data/ucd/auxiliary/WordBreakProperty.txt: 129 | $(MKDIR_P) data/ucd/auxiliary 130 | $(CURL) -o $@ $(UNICODE)/ucd/auxiliary/WordBreakProperty.txt 131 | 132 | data/ucd/auxiliary/WordBreakTest.txt: 133 | $(MKDIR_P) data/ucd/auxiliary 134 | $(CURL) -o $@ $(UNICODE)/ucd/auxiliary/WordBreakTest.txt 135 | 136 | # Generated Sources 137 | 138 | src/private/casefold.h: util/gen-casefold.py \ 139 | data/ucd/CaseFolding.txt 140 | $(MKDIR_P) src/private 141 | ./util/gen-casefold.py > $@ 142 | 143 | src/private/charwidth.h: util/gen-charwidth.py util/property.py util/unicode_data.py \ 144 | data/ucd/emoji/emoji-data.txt data/ucd/DerivedCoreProperties.txt \ 145 | data/ucd/EastAsianWidth.txt data/ucd/UnicodeData.txt 146 | $(MKDIR_P) src/private 147 | ./util/gen-charwidth.py > $@ 148 | 149 | src/private/combining.h: util/gen-combining.py util/unicode_data.py \ 150 | data/ucd/UnicodeData.txt 151 | $(MKDIR_P) src/private 152 | ./util/gen-combining.py > $@ 153 | 154 | src/private/compose.h: util/gen-compose.py util/unicode_data.py \ 155 | data/ucd/CompositionExclusions.txt data/ucd/UnicodeData.txt 156 | $(MKDIR_P) src/private 157 | ./util/gen-compose.py > $@ 158 | 159 | src/private/decompose.h: util/gen-decompose.py util/unicode_data.py \ 160 | data/ucd/UnicodeData.txt 161 | $(MKDIR_P) src/private 162 | ./util/gen-decompose.py > $@ 163 | 164 | src/private/emojiprop.h: util/gen-emojiprop.py data/ucd/emoji/emoji-data.txt 165 | $(MKDIR_P) src/private 166 | ./util/gen-emojiprop.py > $@ 167 | 168 | src/private/graphbreak.h: util/gen-graphbreak.py util/gen-graphbreak.py \ 169 | data/ucd/emoji/emoji-data.txt \ 170 | data/ucd/auxiliary/GraphemeBreakProperty.txt 171 | 
$(MKDIR_P) src/private 172 | ./util/gen-graphbreak.py > $@ 173 | 174 | src/private/normalization.h: util/gen-normalization.py \ 175 | data/ucd/DerivedNormalizationProps.txt 176 | $(MKDIR_P) src/private 177 | ./util/gen-normalization.py > $@ 178 | 179 | src/private/wordbreak.h: util/gen-wordbreak.py \ 180 | data/ucd/DerivedCoreProperties.txt \ 181 | data/ucd/PropList.txt \ 182 | data/ucd/auxiliary/WordBreakProperty.txt 183 | $(MKDIR_P) src/private 184 | ./util/gen-wordbreak.py > $@ 185 | 186 | 187 | # Tests 188 | 189 | tests/check_charwidth: tests/check_charwidth.o tests/testutil.o $(UTF8LITE_A) 190 | $(CC) -o $@ $^ $(LIBS) $(TEST_LIBS) $(LDFLAGS) 191 | 192 | tests/check_graphscan: tests/check_graphscan.o tests/testutil.o $(UTF8LITE_A) \ 193 | data/ucd/auxiliary/GraphemeBreakTest.txt 194 | $(CC) -o $@ tests/check_graphscan.o tests/testutil.o $(UTF8LITE_A) \ 195 | $(LIBS) $(TEST_LIBS) $(LDFLAGS) 196 | 197 | tests/check_render: tests/check_render.o tests/testutil.o $(UTF8LITE_A) 198 | $(CC) -o $@ $^ $(LIBS) $(TEST_LIBS) $(LDFLAGS) 199 | 200 | tests/check_text: tests/check_text.o tests/testutil.o $(UTF8LITE_A) 201 | $(CC) -o $@ $^ $(LIBS) $(TEST_LIBS) $(LDFLAGS) 202 | 203 | tests/check_textmap: tests/check_textmap.o tests/testutil.o $(UTF8LITE_A) 204 | $(CC) -o $@ $^ $(LIBS) $(TEST_LIBS) $(LDFLAGS) 205 | 206 | tests/check_unicode: tests/check_unicode.o $(UTF8LITE_A) \ 207 | data/ucd/NormalizationTest.txt 208 | $(CC) -o $@ tests/check_unicode.o $(UTF8LITE_A) \ 209 | $(LIBS) $(TEST_LIBS) $(LDFLAGS) 210 | 211 | tests/check_wordscan: tests/check_wordscan.o tests/testutil.o $(UTF8LITE_A) \ 212 | data/ucd/auxiliary/WordBreakTest.txt 213 | $(CC) -o $@ tests/check_wordscan.o tests/testutil.o $(UTF8LITE_A) \ 214 | $(LIBS) $(TEST_LIBS) $(LDFLAGS) 215 | 216 | 217 | # Special Rules 218 | 219 | check: $(TESTS_T) $(TESTS_T:=.test) 220 | 221 | clean: 222 | $(RM) -r $(ALL_O) $(ALL_T) $(TESTS_O) $(TESTS_T) 223 | 224 | data: $(DATA) $(TESTS_DATA) 225 | 226 | doc: 227 | doxygen 228 | 229 
| %.test: % 230 | 	$< 231 | 232 | src/%.o: src/%.c 233 | 	$(CC) -c $(CFLAGS) $(LIB_CFLAGS) $(CPPFLAGS) $< -o $@ 234 | 235 | tests/%.o: tests/%.c 236 | 	$(CC) -c $(CFLAGS) $(TEST_CFLAGS) $(CPPFLAGS) $< -o $@ 237 | 238 | 239 | .PHONY: all check clean data doc 240 | 241 | src/array.o: src/array.c src/private/array.h src/utf8lite.h 242 | src/char.o: src/char.c src/private/charwidth.h src/utf8lite.h 243 | src/encode.o: src/encode.c src/utf8lite.h 244 | src/error.o: src/error.c src/utf8lite.h 245 | src/escape.o: src/escape.c src/utf8lite.h 246 | src/graph.o: src/graph.c src/utf8lite.h 247 | src/graphscan.o: src/graphscan.c src/private/graphbreak.h src/utf8lite.h 248 | src/normalize.o: src/normalize.c src/private/casefold.h \ 249 | src/private/combining.h src/private/compose.h src/private/decompose.h \ 250 | src/utf8lite.h 251 | src/render.o: src/render.c src/private/array.h src/utf8lite.h 252 | src/text.o: src/text.c src/utf8lite.h 253 | src/textassign.o: src/textassign.c src/utf8lite.h 254 | src/textiter.o: src/textiter.c src/utf8lite.h 255 | src/textmap.o: src/textmap.c src/utf8lite.h 256 | src/wordscan.o: src/wordscan.c src/private/emojiprop.h \ 257 | src/private/wordbreak.h src/utf8lite.h 258 | 259 | tests/check_charwidth.o: tests/check_charwidth.c src/utf8lite.h tests/testutil.h 260 | tests/check_graphscan.o: tests/check_graphscan.c src/utf8lite.h tests/testutil.h 261 | tests/check_render.o: tests/check_render.c src/utf8lite.h tests/testutil.h 262 | tests/check_text.o: tests/check_text.c src/utf8lite.h tests/testutil.h 263 | tests/check_textmap.o: tests/check_textmap.c src/utf8lite.h tests/testutil.h 264 | tests/check_unicode.o: tests/check_unicode.c src/utf8lite.h tests/testutil.h 265 | tests/check_wordscan.o: tests/check_wordscan.c src/utf8lite.h tests/testutil.h 266 | tests/testutil.o: tests/testutil.c src/utf8lite.h tests/testutil.h 267 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | utf8lite (C Library) 2 | ==================== 3 | 4 | [![Build Status](https://api.travis-ci.org/patperry/utf8lite.svg?branch=master)](https://travis-ci.org/patperry/utf8lite) 5 | [![Coverage Status](https://codecov.io/github/patperry/utf8lite/coverage.svg?branch=master)](https://codecov.io/github/patperry/utf8lite?branch=master) 6 | 7 | Lightweight UTF-8 processing. 8 | -------------------------------------------------------------------------------- /data/ucd/CompositionExclusions.txt: -------------------------------------------------------------------------------- 1 | # CompositionExclusions-15.1.0.txt 2 | # Date: 2023-01-05 3 | # © 2023 Unicode®, Inc. 4 | # For terms of use, see https://www.unicode.org/terms_of_use.html 5 | # 6 | # Unicode Character Database 7 | # For documentation, see https://www.unicode.org/reports/tr44/ 8 | # 9 | # This file lists the characters for the Composition Exclusion Table 10 | # defined in UAX #15, Unicode Normalization Forms. 11 | # 12 | # This file is a normative contributory data file in the 13 | # Unicode Character Database. 14 | # 15 | # For more information, see 16 | # https://www.unicode.org/reports/tr15/#Primary_Exclusion_List_Table 17 | # 18 | # For a full derivation of composition exclusions, see the derived property 19 | # Full_Composition_Exclusion in DerivedNormalizationProps.txt 20 | # 21 | 22 | # ================================================ 23 | # (1) Script Specifics 24 | # 25 | # This list of characters cannot be derived from the UnicodeData.txt file. 26 | # 27 | # Included are the following subcategories: 28 | # 29 | # - Many precomposed characters using a nukta diacritic in the Devanagari, 30 | # Bangla/Bengali, Gurmukhi, or Odia/Oriya scripts. 31 | # - Tibetan letters and subjoined letters with decompositions including 32 | # U+0FB7 TIBETAN SUBJOINED LETTER HA or U+0FB5 TIBETAN SUBJOINED LETTER SSA. 
33 | # - Two two-part Tibetan vowel signs involving top and bottom pieces. 34 | # - A large collection of compatibility precomposed characters for Hebrew 35 | # involving dagesh and/or other combining marks. 36 | # 37 | # This list is unlikely to grow. 38 | # 39 | # ================================================ 40 | 41 | 0958 # DEVANAGARI LETTER QA 42 | 0959 # DEVANAGARI LETTER KHHA 43 | 095A # DEVANAGARI LETTER GHHA 44 | 095B # DEVANAGARI LETTER ZA 45 | 095C # DEVANAGARI LETTER DDDHA 46 | 095D # DEVANAGARI LETTER RHA 47 | 095E # DEVANAGARI LETTER FA 48 | 095F # DEVANAGARI LETTER YYA 49 | 09DC # BENGALI LETTER RRA 50 | 09DD # BENGALI LETTER RHA 51 | 09DF # BENGALI LETTER YYA 52 | 0A33 # GURMUKHI LETTER LLA 53 | 0A36 # GURMUKHI LETTER SHA 54 | 0A59 # GURMUKHI LETTER KHHA 55 | 0A5A # GURMUKHI LETTER GHHA 56 | 0A5B # GURMUKHI LETTER ZA 57 | 0A5E # GURMUKHI LETTER FA 58 | 0B5C # ORIYA LETTER RRA 59 | 0B5D # ORIYA LETTER RHA 60 | 0F43 # TIBETAN LETTER GHA 61 | 0F4D # TIBETAN LETTER DDHA 62 | 0F52 # TIBETAN LETTER DHA 63 | 0F57 # TIBETAN LETTER BHA 64 | 0F5C # TIBETAN LETTER DZHA 65 | 0F69 # TIBETAN LETTER KSSA 66 | 0F76 # TIBETAN VOWEL SIGN VOCALIC R 67 | 0F78 # TIBETAN VOWEL SIGN VOCALIC L 68 | 0F93 # TIBETAN SUBJOINED LETTER GHA 69 | 0F9D # TIBETAN SUBJOINED LETTER DDHA 70 | 0FA2 # TIBETAN SUBJOINED LETTER DHA 71 | 0FA7 # TIBETAN SUBJOINED LETTER BHA 72 | 0FAC # TIBETAN SUBJOINED LETTER DZHA 73 | 0FB9 # TIBETAN SUBJOINED LETTER KSSA 74 | FB1D # HEBREW LETTER YOD WITH HIRIQ 75 | FB1F # HEBREW LIGATURE YIDDISH YOD YOD PATAH 76 | FB2A # HEBREW LETTER SHIN WITH SHIN DOT 77 | FB2B # HEBREW LETTER SHIN WITH SIN DOT 78 | FB2C # HEBREW LETTER SHIN WITH DAGESH AND SHIN DOT 79 | FB2D # HEBREW LETTER SHIN WITH DAGESH AND SIN DOT 80 | FB2E # HEBREW LETTER ALEF WITH PATAH 81 | FB2F # HEBREW LETTER ALEF WITH QAMATS 82 | FB30 # HEBREW LETTER ALEF WITH MAPIQ 83 | FB31 # HEBREW LETTER BET WITH DAGESH 84 | FB32 # HEBREW LETTER GIMEL WITH DAGESH 85 | FB33 # HEBREW LETTER DALET WITH 
DAGESH 86 | FB34 # HEBREW LETTER HE WITH MAPIQ 87 | FB35 # HEBREW LETTER VAV WITH DAGESH 88 | FB36 # HEBREW LETTER ZAYIN WITH DAGESH 89 | FB38 # HEBREW LETTER TET WITH DAGESH 90 | FB39 # HEBREW LETTER YOD WITH DAGESH 91 | FB3A # HEBREW LETTER FINAL KAF WITH DAGESH 92 | FB3B # HEBREW LETTER KAF WITH DAGESH 93 | FB3C # HEBREW LETTER LAMED WITH DAGESH 94 | FB3E # HEBREW LETTER MEM WITH DAGESH 95 | FB40 # HEBREW LETTER NUN WITH DAGESH 96 | FB41 # HEBREW LETTER SAMEKH WITH DAGESH 97 | FB43 # HEBREW LETTER FINAL PE WITH DAGESH 98 | FB44 # HEBREW LETTER PE WITH DAGESH 99 | FB46 # HEBREW LETTER TSADI WITH DAGESH 100 | FB47 # HEBREW LETTER QOF WITH DAGESH 101 | FB48 # HEBREW LETTER RESH WITH DAGESH 102 | FB49 # HEBREW LETTER SHIN WITH DAGESH 103 | FB4A # HEBREW LETTER TAV WITH DAGESH 104 | FB4B # HEBREW LETTER VAV WITH HOLAM 105 | FB4C # HEBREW LETTER BET WITH RAFE 106 | FB4D # HEBREW LETTER KAF WITH RAFE 107 | FB4E # HEBREW LETTER PE WITH RAFE 108 | 109 | # Total code points: 67 110 | 111 | # ================================================ 112 | # (2) Post Composition Version precomposed characters 113 | # 114 | # These characters cannot be derived solely from the UnicodeData.txt file 115 | # in this version of Unicode. 116 | # 117 | # Note that characters added to the standard after the 118 | # Composition Version and which have canonical decomposition mappings 119 | # are not automatically added to this list of Post Composition 120 | # Version precomposed characters. 
121 | # ================================================ 122 | 123 | 2ADC # FORKING 124 | 1D15E # MUSICAL SYMBOL HALF NOTE 125 | 1D15F # MUSICAL SYMBOL QUARTER NOTE 126 | 1D160 # MUSICAL SYMBOL EIGHTH NOTE 127 | 1D161 # MUSICAL SYMBOL SIXTEENTH NOTE 128 | 1D162 # MUSICAL SYMBOL THIRTY-SECOND NOTE 129 | 1D163 # MUSICAL SYMBOL SIXTY-FOURTH NOTE 130 | 1D164 # MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE 131 | 1D1BB # MUSICAL SYMBOL MINIMA 132 | 1D1BC # MUSICAL SYMBOL MINIMA BLACK 133 | 1D1BD # MUSICAL SYMBOL SEMIMINIMA WHITE 134 | 1D1BE # MUSICAL SYMBOL SEMIMINIMA BLACK 135 | 1D1BF # MUSICAL SYMBOL FUSA WHITE 136 | 1D1C0 # MUSICAL SYMBOL FUSA BLACK 137 | 138 | # Total code points: 14 139 | 140 | # ================================================ 141 | # (3) Singleton Decompositions 142 | # 143 | # These characters can be derived from the UnicodeData.txt file 144 | # by including all canonically decomposable characters whose 145 | # canonical decomposition consists of a single character. 146 | # 147 | # These characters are simply quoted here for reference. 
148 | # See also Full_Composition_Exclusion in DerivedNormalizationProps.txt 149 | # ================================================ 150 | 151 | # 0340..0341 [2] COMBINING GRAVE TONE MARK..COMBINING ACUTE TONE MARK 152 | # 0343 COMBINING GREEK KORONIS 153 | # 0374 GREEK NUMERAL SIGN 154 | # 037E GREEK QUESTION MARK 155 | # 0387 GREEK ANO TELEIA 156 | # 1F71 GREEK SMALL LETTER ALPHA WITH OXIA 157 | # 1F73 GREEK SMALL LETTER EPSILON WITH OXIA 158 | # 1F75 GREEK SMALL LETTER ETA WITH OXIA 159 | # 1F77 GREEK SMALL LETTER IOTA WITH OXIA 160 | # 1F79 GREEK SMALL LETTER OMICRON WITH OXIA 161 | # 1F7B GREEK SMALL LETTER UPSILON WITH OXIA 162 | # 1F7D GREEK SMALL LETTER OMEGA WITH OXIA 163 | # 1FBB GREEK CAPITAL LETTER ALPHA WITH OXIA 164 | # 1FBE GREEK PROSGEGRAMMENI 165 | # 1FC9 GREEK CAPITAL LETTER EPSILON WITH OXIA 166 | # 1FCB GREEK CAPITAL LETTER ETA WITH OXIA 167 | # 1FD3 GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA 168 | # 1FDB GREEK CAPITAL LETTER IOTA WITH OXIA 169 | # 1FE3 GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA 170 | # 1FEB GREEK CAPITAL LETTER UPSILON WITH OXIA 171 | # 1FEE..1FEF [2] GREEK DIALYTIKA AND OXIA..GREEK VARIA 172 | # 1FF9 GREEK CAPITAL LETTER OMICRON WITH OXIA 173 | # 1FFB GREEK CAPITAL LETTER OMEGA WITH OXIA 174 | # 1FFD GREEK OXIA 175 | # 2000..2001 [2] EN QUAD..EM QUAD 176 | # 2126 OHM SIGN 177 | # 212A..212B [2] KELVIN SIGN..ANGSTROM SIGN 178 | # 2329 LEFT-POINTING ANGLE BRACKET 179 | # 232A RIGHT-POINTING ANGLE BRACKET 180 | # F900..FA0D [270] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA0D 181 | # FA10 CJK COMPATIBILITY IDEOGRAPH-FA10 182 | # FA12 CJK COMPATIBILITY IDEOGRAPH-FA12 183 | # FA15..FA1E [10] CJK COMPATIBILITY IDEOGRAPH-FA15..CJK COMPATIBILITY IDEOGRAPH-FA1E 184 | # FA20 CJK COMPATIBILITY IDEOGRAPH-FA20 185 | # FA22 CJK COMPATIBILITY IDEOGRAPH-FA22 186 | # FA25..FA26 [2] CJK COMPATIBILITY IDEOGRAPH-FA25..CJK COMPATIBILITY IDEOGRAPH-FA26 187 | # FA2A..FA6D [68] CJK COMPATIBILITY IDEOGRAPH-FA2A..CJK 
COMPATIBILITY IDEOGRAPH-FA6D 188 | # FA70..FAD9 [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9 189 | # 2F800..2FA1D [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D 190 | 191 | # Total code points: 1035 192 | 193 | # ================================================ 194 | # (4) Non-Starter Decompositions 195 | # 196 | # These characters can be derived from the UnicodeData.txt file 197 | # by including each expanding canonical decomposition 198 | # (i.e., those which canonically decompose to a sequence 199 | # of characters instead of a single character), such that: 200 | # 201 | # A. The character is not a Starter. 202 | # 203 | # OR (inclusive) 204 | # 205 | # B. The character's canonical decomposition begins 206 | # with a character that is not a Starter. 207 | # 208 | # Note that a "Starter" is any character with a zero combining class. 209 | # 210 | # These characters are simply quoted here for reference. 211 | # See also Full_Composition_Exclusion in DerivedNormalizationProps.txt 212 | # ================================================ 213 | 214 | # 0344 COMBINING GREEK DIALYTIKA TONOS 215 | # 0F73 TIBETAN VOWEL SIGN II 216 | # 0F75 TIBETAN VOWEL SIGN UU 217 | # 0F81 TIBETAN VOWEL SIGN REVERSED II 218 | 219 | # Total code points: 4 220 | 221 | # EOF 222 | -------------------------------------------------------------------------------- /src/array.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include "utf8lite.h" 23 | #include "private/array.h" 24 | 25 | 26 | /* Default initial size for nonempty dynamic arrays. Must be positive. */ 27 | #define UTF8LITE_ARRAY_SIZE_INIT 32 28 | 29 | /* Growth factor for dynamic arrays. Must be greater than 1. 30 | * 31 | * https://en.wikipedia.org/wiki/Dynamic_array#Growth_factor 32 | */ 33 | #define UTF8LITE_ARRAY_GROW 1.618 /* Golden Ratio, (1 + sqrt(5)) / 2 */ 34 | 35 | 36 | int utf8lite_bigarray_size_add(size_t *sizeptr, size_t width, size_t count, 37 | size_t nadd) 38 | { 39 | size_t size = *sizeptr; 40 | size_t size_min; 41 | int err; 42 | double n1; 43 | 44 | if (width == 0) { 45 | return 0; 46 | } 47 | 48 | if (count > (SIZE_MAX - nadd) / width) { 49 | err = UTF8LITE_ERROR_OVERFLOW; 50 | //utf8lite_log(err, "array size (%"PRIu64" + %"PRIu64 51 | // " elements of %"PRIu64" bytes each)" 52 | // " exceeds maximum (%"PRIu64" elements)", 53 | // (uint64_t)count, (uint64_t)nadd, 54 | // (uint64_t)width, (uint64_t)SIZE_MAX); 55 | return err; 56 | } 57 | 58 | size_min = count + nadd; 59 | if (size >= size_min) { 60 | return 0; 61 | } 62 | 63 | assert(UTF8LITE_ARRAY_SIZE_INIT > 0); 64 | assert(UTF8LITE_ARRAY_GROW > 1); 65 | 66 | if (size < UTF8LITE_ARRAY_SIZE_INIT && size_min > 0) { 67 | size = UTF8LITE_ARRAY_SIZE_INIT; 68 | } 69 | 70 | while (size < size_min) { 71 | n1 = UTF8LITE_ARRAY_GROW * size; 72 | if (n1 > SIZE_MAX / width) { 73 | size = SIZE_MAX / width; 74 | } else { 75 | size = 
(size_t)n1; 76 | } 77 | } 78 | 79 | *sizeptr = size; 80 | return 0; 81 | } 82 | 83 | 84 | int utf8lite_array_size_add(int *sizeptr, size_t width, int count, int nadd) 85 | { 86 | size_t size, size_min, size_max; 87 | int err; 88 | 89 | assert(*sizeptr >= 0); 90 | assert(count >= 0); 91 | assert(nadd >= 0); 92 | 93 | if (width == 0) { 94 | return 0; 95 | } 96 | 97 | size = (size_t)*sizeptr; 98 | if ((err = utf8lite_bigarray_size_add(&size, width, (size_t)count, 99 | (size_t)nadd))) { 100 | return err; 101 | } 102 | size_max = (size_t)INT_MAX / width; 103 | if (size > size_max) { 104 | size = size_max; 105 | size_min = (size_t)count + (size_t)nadd; 106 | if (size < size_min) { 107 | err = UTF8LITE_ERROR_OVERFLOW; 108 | //utf8lite_log(err, "array size (%"PRIu64 109 | // " elements of %"PRIu64" bytes each)" 110 | // " exceeds maximum (%"PRIu64" elements)", 111 | // (uint64_t)size_min, (uint64_t)width, 112 | // (uint64_t)size_max); 113 | return err; 114 | } 115 | } 116 | 117 | *sizeptr = (int)size; 118 | return 0; 119 | } 120 | 121 | 122 | int utf8lite_array_grow(void **baseptr, int *sizeptr, size_t width, int count, 123 | int nadd) 124 | { 125 | void *base = *baseptr; 126 | int size = *sizeptr; 127 | int err; 128 | 129 | assert(0 <= count); 130 | assert(count <= size); 131 | assert(width > 0); 132 | 133 | if (nadd <= size - count) { 134 | return 0; 135 | } 136 | 137 | if ((err = utf8lite_array_size_add(&size, width, count, nadd))) { 138 | return err; 139 | } 140 | 141 | if (!(base = realloc(base, ((size_t)size) * width))) { 142 | err = UTF8LITE_ERROR_NOMEM; 143 | return err; 144 | } 145 | 146 | *baseptr = base; 147 | *sizeptr = size; 148 | return 0; 149 | } 150 | 151 | 152 | int utf8lite_bigarray_grow(void **baseptr, size_t *sizeptr, size_t width, 153 | size_t count, size_t nadd) 154 | { 155 | void *base = *baseptr; 156 | size_t size = *sizeptr; 157 | int err; 158 | 159 | assert(count <= size); 160 | assert(width > 0); 161 | 162 | if (nadd <= size - count) { 163 | 
return 0; 164 | } 165 | 166 | if ((err = utf8lite_bigarray_size_add(&size, width, count, nadd))) { 167 | return err; 168 | } 169 | 170 | if (!(base = realloc(base, size * width))) { 171 | err = UTF8LITE_ERROR_NOMEM; 172 | //utf8lite_log(err, "failed allocating array"); 173 | return err; 174 | } 175 | 176 | *baseptr = base; 177 | *sizeptr = size; 178 | return 0; 179 | } 180 | -------------------------------------------------------------------------------- /src/char.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | 18 | #include 19 | #include "private/charwidth.h" 20 | #include "utf8lite.h" 21 | 22 | 23 | int utf8lite_charwidth(int32_t code) 24 | { 25 | int prop = charwidth(code); 26 | switch(prop) { 27 | case CHARWIDTH_NONE: 28 | return UTF8LITE_CHARWIDTH_NONE; 29 | case CHARWIDTH_IGNORABLE: 30 | return UTF8LITE_CHARWIDTH_IGNORABLE; 31 | case CHARWIDTH_MARK: 32 | return UTF8LITE_CHARWIDTH_MARK; 33 | case CHARWIDTH_NARROW: 34 | return UTF8LITE_CHARWIDTH_NARROW; 35 | case CHARWIDTH_AMBIGUOUS: 36 | return UTF8LITE_CHARWIDTH_AMBIGUOUS; 37 | case CHARWIDTH_WIDE: 38 | return UTF8LITE_CHARWIDTH_WIDE; 39 | case CHARWIDTH_EMOJI: 40 | return UTF8LITE_CHARWIDTH_EMOJI; 41 | default: 42 | assert(0 && "internal error: unrecognized charwidth property"); 43 | return prop; 44 | } 45 | } 46 | 47 | 48 | // TODO: use character class lookup table 49 | int utf8lite_isspace(int32_t code) 50 | { 51 | if (code <= 0x7F) { 52 | return (code == 0x20 || (0x09 <= code && code < 0x0E)); 53 | } else if (code <= 0x1FFF) { 54 | switch (code) { 55 | case 0x0085: 56 | case 0x00A0: 57 | case 0x1680: 58 | return 1; 59 | default: 60 | return 0; 61 | } 62 | } else if (code <= 0x200A) { 63 | return 1; 64 | } else if (code <= 0x3000) { 65 | switch (code) { 66 | case 0x2028: 67 | case 0x2029: 68 | case 0x202F: 69 | case 0x205F: 70 | case 0x3000: 71 | return 1; 72 | default: 73 | return 0; 74 | } 75 | } else { 76 | return 0; 77 | } 78 | } 79 | 80 | 81 | int utf8lite_isignorable(int32_t code) 82 | { 83 | int prop = utf8lite_charwidth(code); 84 | return (prop == UTF8LITE_CHARWIDTH_IGNORABLE); 85 | } 86 | -------------------------------------------------------------------------------- /src/encode.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include "utf8lite.h" 19 | 20 | /* 21 | Source: 22 | http://www.unicode.org/versions/Unicode7.0.0/UnicodeStandard-7.0.pdf 23 | page 124, 3.9 "Unicode Encoding Forms", "UTF-8" 24 | 25 | Table 3-7. Well-Formed UTF-8 Byte Sequences 26 | ----------------------------------------------------------------------------- 27 | | Code Points | First Byte | Second Byte | Third Byte | Fourth Byte | 28 | | U+0000..U+007F | 00..7F | | | | 29 | | U+0080..U+07FF | C2..DF | 80..BF | | | 30 | | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 31 | | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | 32 | | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 33 | | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | 34 | | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 35 | | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | 36 | | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 37 | ----------------------------------------------------------------------------- 38 | 39 | (table taken from https://github.com/JulienPalard/is_utf8 ) 40 | */ 41 | 42 | 43 | int utf8lite_scan_utf8(const uint8_t **bufptr, const uint8_t *end, 44 | struct utf8lite_message *msg) 45 | { 46 | const uint8_t *ptr = *bufptr; 47 | uint_fast8_t ch, ch1; 48 | unsigned nc; 49 | int err; 50 | 51 | assert(ptr < end); 52 | 53 | /* First byte 54 | * ---------- 55 | * 56 | * 1-byte sequence: 57 | * 00: 0000 0000 58 | * 7F: 0111 1111 59 | * (ch1 & 0x80 == 0) 60 | * 61 | * Invalid: 62 | * 80: 1000 0000 63 | * BF: 1011 1111 64 | * C0: 1100 0000 65 | * C1: 1100 0001 66 | * 
(ch & 0xF0 == 0x80 || ch == 0xC0 || ch == 0xC1) 67 | * 68 | * 2-byte sequence: 69 | * C2: 1100 0010 70 | * DF: 1101 1111 71 | * (ch & 0xE0 == 0xC0 && ch > 0xC1) 72 | * 73 | * 3-byte sequence 74 | * E0: 1110 0000 75 | * EF: 1110 1111 76 | * (ch & 0xF0 == E0) 77 | * 78 | * 4-byte sequence: 79 | * F0: 1111 0000 80 | * F4: 1111 0100 81 | * (ch & 0xFC == 0xF0 || ch == 0xF4) 82 | */ 83 | 84 | ch1 = *ptr++; 85 | 86 | if ((ch1 & 0x80) == 0) { 87 | goto success; 88 | } else if ((ch1 & 0xC0) == 0x80) { 89 | goto inval_lead; 90 | } else if ((ch1 & 0xE0) == 0xC0) { 91 | if (ch1 == 0xC0 || ch1 == 0xC1) { 92 | goto inval_lead; 93 | } 94 | nc = 1; 95 | } else if ((ch1 & 0xF0) == 0xE0) { 96 | nc = 2; 97 | } else if ((ch1 & 0xFC) == 0xF0 || ch1 == 0xF4) { 98 | nc = 3; 99 | } else { 100 | // expecting bytes in the following ranges: 00..7F C2..F4 101 | goto inval_lead; 102 | } 103 | 104 | // ensure string is long enough 105 | if (ptr + nc > end) { 106 | // expecting another continuation byte 107 | goto inval_incomplete; 108 | } 109 | 110 | /* First Continuation byte 111 | * ----------- 112 | * X + 80..BF: 113 | * 80: 1000 0000 114 | * BF: 1011 1111 115 | * (ch & 0xC0 == 0x80) 116 | * 117 | * E0 + A0..BF: 118 | * A0: 1010 0000 119 | * BF: 1011 1111 120 | * (ch & 0xE0 == 0xA0) 121 | * 122 | * ED + 80..9F: 123 | * 80: 1000 0000 124 | * 9F: 1001 1111 125 | * (ch & 0xE0 == 0x80) 126 | * 127 | * F0 + 90..BF: 128 | * 90: 1001 0000 129 | * BF: 1011 1111 130 | * (ch & 0xF0 == 0x90 || ch & 0xE0 == A0) 131 | * 132 | */ 133 | 134 | // validate the first continuation byte 135 | ch = *ptr++; 136 | switch (ch1) { 137 | case 0xE0: 138 | if ((ch & 0xE0) != 0xA0) { 139 | // expecting a byte between A0 and BF 140 | goto inval_cont; 141 | } 142 | break; 143 | case 0xED: 144 | if ((ch & 0xE0) != 0x80) { 145 | // expecting a byte between A0 and 9F 146 | goto inval_cont; 147 | } 148 | break; 149 | case 0xF0: 150 | if ((ch & 0xE0) != 0xA0 && (ch & 0xF0) != 0x90) { 151 | // expecting a byte between 90 and BF 
152 | goto inval_cont; 153 | } 154 | break; 155 | case 0xF4: 156 | if ((ch & 0xF0) != 0x80) { 157 | // expecting a byte between 80 and 8F 158 | goto inval_cont; 159 | } 160 | default: 161 | if ((ch & 0xC0) != 0x80) { 162 | // expecting a byte between 80 and BF 163 | goto inval_cont; 164 | } 165 | break; 166 | } 167 | nc--; 168 | 169 | // validate the trailing continuation bytes 170 | while (nc-- > 0) { 171 | ch = *ptr++; 172 | if ((ch & 0xC0) != 0x80) { 173 | // expecting a byte between 80 and BF 174 | goto inval_cont; 175 | } 176 | } 177 | 178 | success: 179 | err = 0; 180 | goto out; 181 | 182 | inval_incomplete: 183 | utf8lite_message_set(msg, "not enough continuation bytes" 184 | " after leading byte (0x%02X)", 185 | (unsigned)ch1); 186 | goto error; 187 | 188 | inval_lead: 189 | utf8lite_message_set(msg, "invalid leading byte (0x%02X)", 190 | (unsigned)ch1); 191 | goto error; 192 | 193 | inval_cont: 194 | utf8lite_message_set(msg, "leading byte 0x%02X followed by" 195 | " invalid continuation byte (0x%02X)", 196 | (unsigned)ch1, (unsigned)ch); 197 | goto error; 198 | 199 | error: 200 | ptr--; 201 | err = UTF8LITE_ERROR_INVAL; 202 | out: 203 | *bufptr = ptr; 204 | return err; 205 | } 206 | 207 | 208 | void utf8lite_decode_utf8(const uint8_t **bufptr, int32_t *codeptr) 209 | { 210 | const uint8_t *ptr = *bufptr; 211 | int32_t code; 212 | uint_fast8_t ch; 213 | unsigned nc; 214 | 215 | ch = *ptr++; 216 | if (!(ch & 0x80)) { 217 | code = ch; 218 | nc = 0; 219 | } else if (!(ch & 0x20)) { 220 | code = ch & 0x1F; 221 | nc = 1; 222 | } else if (!(ch & 0x10)) { 223 | code = ch & 0x0F; 224 | nc = 2; 225 | } else { 226 | code = ch & 0x07; 227 | nc = 3; 228 | } 229 | 230 | while (nc-- > 0) { 231 | ch = *ptr++; 232 | code = (code << 6) + (ch & 0x3F); 233 | } 234 | 235 | *bufptr = ptr; 236 | *codeptr = code; 237 | } 238 | 239 | 240 | // http://www.fileformat.info/info/unicode/utf8.htm 241 | void utf8lite_encode_utf8(int32_t code, uint8_t **bufptr) 242 | { 243 | uint8_t 
*ptr = *bufptr; 244 | int32_t x = code; 245 | 246 | if (x <= 0x7F) { 247 | *ptr++ = (uint8_t)x; 248 | } else if (x <= 0x07FF) { 249 | *ptr++ = (uint8_t)(0xC0 | (x >> 6)); 250 | *ptr++ = (uint8_t)(0x80 | (x & 0x3F)); 251 | } else if (x <= 0xFFFF) { 252 | *ptr++ = (uint8_t)(0xE0 | (x >> 12)); 253 | *ptr++ = (uint8_t)(0x80 | ((x >> 6) & 0x3F)); 254 | *ptr++ = (uint8_t)(0x80 | (x & 0x3F)); 255 | } else { 256 | *ptr++ = (uint8_t)(0xF0 | (x >> 18)); 257 | *ptr++ = (uint8_t)(0x80 | ((x >> 12) & 0x3F)); 258 | *ptr++ = (uint8_t)(0x80 | ((x >> 6) & 0x3F)); 259 | *ptr++ = (uint8_t)(0x80 | (x & 0x3F)); 260 | } 261 | 262 | *bufptr = ptr; 263 | } 264 | 265 | 266 | void utf8lite_rencode_utf8(int32_t code, uint8_t **bufptr) 267 | { 268 | uint8_t *ptr = *bufptr; 269 | int32_t x = code; 270 | 271 | if (x <= 0x7F) { 272 | *--ptr = (uint8_t)x; 273 | } else if (x <= 0x07FF) { 274 | *--ptr = (uint8_t)(0x80 | (x & 0x3F)); 275 | *--ptr = (uint8_t)(0xC0 | (x >> 6)); 276 | } else if (x <= 0xFFFF) { 277 | *--ptr = (uint8_t)(0x80 | (x & 0x3F)); 278 | *--ptr = (uint8_t)(0x80 | ((x >> 6) & 0x3F)); 279 | *--ptr = (uint8_t)(0xE0 | (x >> 12)); 280 | } else { 281 | *--ptr = (uint8_t)(0x80 | (x & 0x3F)); 282 | *--ptr = (uint8_t)(0x80 | ((x >> 6) & 0x3F)); 283 | *--ptr = (uint8_t)(0x80 | ((x >> 12) & 0x3F)); 284 | *--ptr = (uint8_t)(0xF0 | (x >> 18)); 285 | } 286 | 287 | *bufptr = ptr; 288 | } 289 | -------------------------------------------------------------------------------- /src/error.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include "utf8lite.h" 22 | 23 | 24 | void utf8lite_message_clear(struct utf8lite_message *msg) 25 | { 26 | if (msg) { 27 | msg->string[0] = '\0'; 28 | } 29 | } 30 | 31 | 32 | void utf8lite_message_set(struct utf8lite_message *msg, 33 | const char *fmt, ...) 34 | { 35 | va_list ap; 36 | 37 | if (msg) { 38 | va_start(ap, fmt); 39 | vsnprintf(msg->string, sizeof(msg->string), fmt, ap); 40 | va_end(ap); 41 | } 42 | } 43 | 44 | 45 | void utf8lite_message_append(struct utf8lite_message *msg, 46 | const char *fmt, ...) 47 | { 48 | size_t n, nmax; 49 | va_list ap; 50 | 51 | if (msg) { 52 | nmax = sizeof(msg->string); 53 | n = strlen(msg->string); 54 | assert(n <= nmax); 55 | 56 | va_start(ap, fmt); 57 | vsnprintf(msg->string + n, nmax - n, fmt, ap); 58 | va_end(ap); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/escape.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include "utf8lite.h" 19 | 20 | /* http://stackoverflow.com/a/11986885 */ 21 | #define hextoi(ch) ((ch > '9') ? (ch &~ 0x20) - 'A' + 10 : (ch - '0')) 22 | 23 | int utf8lite_scan_escape(const uint8_t **bufptr, const uint8_t *end, 24 | struct utf8lite_message *msg) 25 | { 26 | const uint8_t *input = *bufptr; 27 | const uint8_t *ptr = input; 28 | uint_fast8_t ch; 29 | int err; 30 | 31 | if (ptr == end) { 32 | goto error_incomplete; 33 | } 34 | 35 | ch = *ptr++; 36 | 37 | switch (ch) { 38 | case '"': 39 | case '\\': 40 | case '/': 41 | case 'b': 42 | case 'f': 43 | case 'n': 44 | case 'r': 45 | case 't': 46 | break; 47 | case 'u': 48 | if ((err = utf8lite_scan_uescape(&ptr, end, msg))) { 49 | goto out; 50 | } 51 | break; 52 | default: 53 | goto error_inval; 54 | } 55 | 56 | err = 0; 57 | goto out; 58 | 59 | error_incomplete: 60 | err = UTF8LITE_ERROR_INVAL; 61 | utf8lite_message_set(msg, "incomplete escape code (\\)"); 62 | goto out; 63 | 64 | error_inval: 65 | err = UTF8LITE_ERROR_INVAL; 66 | utf8lite_message_set(msg, "invalid escape code (\\%c)", ch); 67 | goto out; 68 | 69 | out: 70 | *bufptr = ptr; 71 | return err; 72 | } 73 | 74 | 75 | int utf8lite_scan_uescape(const uint8_t **bufptr, const uint8_t *end, 76 | struct utf8lite_message *msg) 77 | { 78 | const uint8_t *input = *bufptr; 79 | const uint8_t *ptr = input; 80 | int32_t code, low; 81 | uint_fast8_t ch; 82 | unsigned i; 83 | int err; 84 | 85 | if (ptr + 4 > end) { 86 | goto error_inval_incomplete; 87 | } 88 | 89 | code = 0; 90 | for (i = 
0; i < 4; i++) { 91 | ch = *ptr++; 92 | if (!isxdigit(ch)) { 93 | goto error_inval_hex; 94 | } 95 | code = (code << 4) + hextoi(ch); 96 | } 97 | 98 | if (UTF8LITE_IS_UTF16_HIGH(code)) { 99 | if (ptr + 6 > end || ptr[0] != '\\' || ptr[1] != 'u') { 100 | goto error_inval_nolow; 101 | } 102 | ptr += 2; 103 | input = ptr; 104 | 105 | low = 0; 106 | for (i = 0; i < 4; i++) { 107 | ch = *ptr++; 108 | if (!isxdigit(ch)) { 109 | goto error_inval_hex; 110 | } 111 | low = (low << 4) + hextoi(ch); 112 | } 113 | if (!UTF8LITE_IS_UTF16_LOW(low)) { 114 | ptr -= 6; 115 | goto error_inval_low; 116 | } 117 | } else if (UTF8LITE_IS_UTF16_LOW(code)) { 118 | goto error_inval_nohigh; 119 | } 120 | 121 | err = 0; 122 | goto out; 123 | 124 | error_inval_incomplete: 125 | err = UTF8LITE_ERROR_INVAL; 126 | utf8lite_message_set(msg, "incomplete escape code (\\u%.*s)", 127 | (int)(end - input), input); 128 | goto out; 129 | 130 | error_inval_hex: 131 | err = UTF8LITE_ERROR_INVAL; 132 | utf8lite_message_set(msg, "invalid hex value in escape code (\\u%.*s)", 133 | 4, input); 134 | goto out; 135 | 136 | error_inval_nolow: 137 | err = UTF8LITE_ERROR_INVAL; 138 | utf8lite_message_set(msg, "missing UTF-16 low surrogate" 139 | " after high surrogate escape code (\\u%.*s)", 140 | 4, input); 141 | goto out; 142 | 143 | error_inval_low: 144 | err = UTF8LITE_ERROR_INVAL; 145 | utf8lite_message_set(msg, "invalid UTF-16 low surrogate (\\u%.*s)" 146 | " after high surrogate escape code (\\u%.*s)", 147 | 4, input, 4, input - 6); 148 | goto out; 149 | 150 | error_inval_nohigh: 151 | err = UTF8LITE_ERROR_INVAL; 152 | utf8lite_message_set(msg, "missing UTF-16 high surrogate" 153 | " before low surrogate escape code (\\u%.*s)", 154 | 4, input); 155 | goto out; 156 | 157 | out: 158 | *bufptr = ptr; 159 | return err; 160 | } 161 | 162 | 163 | void utf8lite_decode_uescape(const uint8_t **inputptr, int32_t *codeptr) 164 | { 165 | const uint8_t *ptr = *inputptr; 166 | int32_t code; 167 | uint_fast16_t low; 168 | 
uint_fast8_t ch; 169 | unsigned i; 170 | 171 | code = 0; 172 | for (i = 0; i < 4; i++) { 173 | ch = *ptr++; 174 | code = (code << 4) + hextoi(ch); 175 | } 176 | 177 | if (UTF8LITE_IS_UTF16_HIGH(code)) { 178 | // skip over \u 179 | ptr += 2; 180 | 181 | low = 0; 182 | for (i = 0; i < 4; i++) { 183 | ch = *ptr++; 184 | low = (uint_fast16_t)(low << 4) + hextoi(ch); 185 | } 186 | 187 | code = UTF8LITE_DECODE_UTF16_PAIR(code, low); 188 | } 189 | 190 | *codeptr = code; 191 | *inputptr = ptr; 192 | } 193 | 194 | 195 | void utf8lite_decode_escape(const uint8_t **inputptr, int32_t *codeptr) 196 | { 197 | const uint8_t *ptr = *inputptr; 198 | int32_t code; 199 | 200 | code = *ptr++; 201 | 202 | switch (code) { 203 | case 'b': 204 | code = '\b'; 205 | break; 206 | case 'f': 207 | code = '\f'; 208 | break; 209 | case 'n': 210 | code = '\n'; 211 | break; 212 | case 'r': 213 | code = '\r'; 214 | break; 215 | case 't': 216 | code = '\t'; 217 | break; 218 | case 'u': 219 | *inputptr = ptr; 220 | utf8lite_decode_uescape(inputptr, codeptr); 221 | return; 222 | default: 223 | break; 224 | } 225 | 226 | *inputptr = ptr; 227 | *codeptr = code; 228 | } 229 | -------------------------------------------------------------------------------- /src/graph.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include "utf8lite.h" 19 | 20 | /* 21 | width 22 | graph Ambiguous Emoji Ignorable Narrow Mark None Wide 23 | Control 0 0 3809 0 0 2116 0 24 | CR 0 0 0 0 0 1 0 25 | EBase 0 98 0 0 0 0 0 26 | EBaseGAZ 0 4 0 0 0 0 0 27 | EModifier 0 5 0 0 0 0 0 28 | Extend 0 0 359 26 1514 0 2 29 | GlueAfterZWJ 2 20 0 0 0 0 0 30 | L 0 0 1 0 0 0 124 31 | LF 0 0 0 0 0 1 0 32 | LV 0 0 0 0 0 0 399 33 | LVT 0 0 0 0 0 0 10773 34 | Other 882 996 2 21606 0 971540 99206 35 | Prepend 0 0 0 9 10 0 0 36 | RegionalIndicator 0 26 0 0 0 0 0 37 | SpacingMark 0 0 0 348 0 0 0 38 | T 0 0 0 137 0 0 0 39 | V 0 0 1 94 0 0 0 40 | ZWJ 0 0 1 0 0 0 0 41 | */ 42 | 43 | 44 | static int ascii_width(int32_t ch, int flags); 45 | static int utf8_escape_width(int32_t ch, int flags); 46 | static int utf8_width(int32_t ch, int cw, int flags); 47 | 48 | 49 | int utf8lite_graph_measure(const struct utf8lite_graph *g, 50 | int flags, int *widthptr) 51 | { 52 | struct utf8lite_text_iter it; 53 | int32_t ch; 54 | int err = 0, cw, w, width; 55 | 56 | width = 0; 57 | utf8lite_text_iter_make(&it, &g->text); 58 | 59 | while (utf8lite_text_iter_advance(&it)) { 60 | ch = it.current; 61 | 62 | if (ch <= 0x7F) { 63 | w = ascii_width(ch, flags); 64 | } else if (flags & UTF8LITE_ESCAPE_UTF8) { 65 | w = utf8_escape_width(ch, flags); 66 | } else if ((flags & UTF8LITE_ESCAPE_EXTENDED) 67 | && (ch > 0xFFFF)) { 68 | w = utf8_escape_width(ch, flags); 69 | } else { 70 | cw = utf8lite_charwidth(ch); 71 | if (cw == UTF8LITE_CHARWIDTH_EMOJI) { 72 | width = 2; 73 | goto exit; 74 | } 75 | w = utf8_width(ch, cw, flags); 76 | } 77 | 78 | if (w < 0) { 79 | width = w; 80 | goto exit; 81 | } else if (w > INT_MAX - width) { 82 | width = -1; 83 | err = UTF8LITE_ERROR_OVERFLOW; 84 | goto exit; 85 | } else { 86 | width += w; 87 | } 88 | } 89 | 90 | exit: 91 | if (widthptr) { 92 | *widthptr = width; 93 | } 94 | return err; 95 | } 96 | 97 | 98 | int ascii_width(int32_t ch, int flags) 99 | { 100 | // handle control characters 
101 | if (ch <= 0x1F || ch == 0x7F) { 102 | if (!(flags & UTF8LITE_ESCAPE_CONTROL)) { 103 | return -1; 104 | } 105 | 106 | switch (ch) { 107 | case '\a': 108 | case '\v': 109 | // \u0007, \u000b (JSON) : \a, \b (C) 110 | return (flags & UTF8LITE_ENCODE_JSON) ? 6 : 2; 111 | case '\b': 112 | case '\f': 113 | case '\n': 114 | case '\r': 115 | case '\t': 116 | return 2; 117 | default: 118 | return 6; // \uXXXX 119 | } 120 | } 121 | 122 | // handle printable characters 123 | switch (ch) { 124 | case '\"': 125 | return (flags & UTF8LITE_ESCAPE_DQUOTE) ? 2 : 1; 126 | case '\'': 127 | return (flags & UTF8LITE_ESCAPE_SQUOTE) ? 2 : 1; 128 | case '\\': 129 | if (flags & (UTF8LITE_ESCAPE_CONTROL 130 | | UTF8LITE_ESCAPE_DQUOTE 131 | | UTF8LITE_ESCAPE_SQUOTE 132 | | UTF8LITE_ESCAPE_EXTENDED 133 | | UTF8LITE_ESCAPE_UTF8)) { 134 | return 2; 135 | } else { 136 | return 1; 137 | } 138 | default: 139 | return 1; 140 | } 141 | } 142 | 143 | 144 | int utf8_width(int32_t ch, int cw, int flags) 145 | { 146 | int w = -1; 147 | 148 | switch ((enum utf8lite_charwidth_type)cw) { 149 | case UTF8LITE_CHARWIDTH_NONE: 150 | if (flags & UTF8LITE_ESCAPE_CONTROL) { 151 | w = utf8_escape_width(ch, flags); 152 | } else { 153 | w = -1; 154 | } 155 | break; 156 | 157 | case UTF8LITE_CHARWIDTH_IGNORABLE: 158 | case UTF8LITE_CHARWIDTH_MARK: 159 | w = 0; 160 | break; 161 | 162 | case UTF8LITE_CHARWIDTH_NARROW: 163 | w = 1; 164 | break; 165 | 166 | case UTF8LITE_CHARWIDTH_AMBIGUOUS: 167 | w = (flags & UTF8LITE_ENCODE_AMBIGWIDE) ? 
2 : 1; 168 | break; 169 | 170 | case UTF8LITE_CHARWIDTH_WIDE: 171 | case UTF8LITE_CHARWIDTH_EMOJI: 172 | w = 2; 173 | break; 174 | } 175 | 176 | return w; 177 | } 178 | 179 | 180 | int utf8_escape_width(int32_t ch, int flags) 181 | { 182 | if (ch <= 0xFFFF) { 183 | return 6; 184 | } else if (flags & UTF8LITE_ENCODE_JSON) { 185 | return 12; 186 | } else { 187 | return 10; 188 | } 189 | } 190 | -------------------------------------------------------------------------------- /src/normalize.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include "private/casefold.h" 19 | #include "private/compose.h" 20 | #include "private/combining.h" 21 | #include "private/decompose.h" 22 | #include "utf8lite.h" 23 | 24 | /* From Unicode-8.0 Section 3.12 Conjoining Jamo Behavior */ 25 | #define HANGUL_SBASE 0xAC00 26 | #define HANGUL_LBASE 0x1100 27 | #define HANGUL_VBASE 0x1161 28 | #define HANGUL_TBASE 0x11A7 29 | #define HANGUL_LCOUNT 19 30 | #define HANGUL_VCOUNT 21 31 | #define HANGUL_TCOUNT 28 32 | #define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT) 33 | #define HANTUL_SCOUNT (HANGUL_LCOUNT * HANGUL_NCOUNT) 34 | 35 | 36 | static void hangul_decompose(int32_t code, int32_t **bufp) 37 | { 38 | int32_t *dst = *bufp; 39 | int32_t sindex = code - HANGUL_SBASE; 40 | int32_t lindex = sindex / HANGUL_NCOUNT; 41 | int32_t vindex = (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT; 42 | int32_t tindex = sindex % HANGUL_TCOUNT; 43 | int32_t lpart = HANGUL_LBASE + lindex; 44 | int32_t vpart = HANGUL_VBASE + vindex; 45 | int32_t tpart = HANGUL_TBASE + tindex; 46 | 47 | *dst++ = lpart; 48 | *dst++ = vpart; 49 | if (tindex > 0) { 50 | *dst++ = tpart; 51 | } 52 | 53 | *bufp = dst; 54 | } 55 | 56 | 57 | static int is_hangul_vpart(int32_t code) 58 | { 59 | return (HANGUL_VBASE <= code && code < HANGUL_VBASE + HANGUL_VCOUNT); 60 | } 61 | 62 | 63 | 64 | static int is_hangul_tpart(int32_t code) 65 | { 66 | // strict less-than on lower bound 67 | return (HANGUL_TBASE < code && code < HANGUL_TBASE + HANGUL_TCOUNT); 68 | } 69 | 70 | 71 | static int32_t hangul_compose_lv(int32_t lpart, int32_t vpart) 72 | { 73 | int32_t lindex = lpart - HANGUL_LBASE; 74 | int32_t vindex = vpart - HANGUL_VBASE; 75 | int32_t lvindex = lindex * HANGUL_NCOUNT + vindex * HANGUL_TCOUNT; 76 | int32_t s = HANGUL_SBASE + lvindex; 77 | return s; 78 | } 79 | 80 | 81 | static int32_t hangul_compose_lvt(int32_t lvpart, int32_t tpart) 82 | { 83 | int32_t tindex = tpart - HANGUL_TBASE; 84 | int32_t s = lvpart + tindex; 85 | return s; 86 | 
} 87 | 88 | 89 | static void casefold(int type, int32_t code, int32_t **bufp) 90 | { 91 | const int32_t block_size = CASEFOLD_BLOCK_SIZE; 92 | unsigned i = casefold_stage1[code / block_size]; 93 | struct casefold c = casefold_stage2[i][code % block_size]; 94 | unsigned length = c.length; 95 | const int32_t *src; 96 | int32_t *dst; 97 | 98 | if (length == 0) { 99 | dst = *bufp; 100 | *dst++ = code; 101 | *bufp = dst; 102 | } else if (length == 1) { 103 | utf8lite_map(type, (int32_t)c.data, bufp); 104 | } else { 105 | src = &casefold_mapping[c.data]; 106 | while (length-- > 0) { 107 | utf8lite_map(type, *src, bufp); 108 | src++; 109 | } 110 | } 111 | } 112 | 113 | 114 | 115 | void utf8lite_map(int type, int32_t code, int32_t **bufptr) 116 | { 117 | const int32_t block_size = DECOMPOSITION_BLOCK_SIZE; 118 | unsigned i = decomposition_stage1[code / block_size]; 119 | struct decomposition d = decomposition_stage2[i][code % block_size]; 120 | unsigned length = d.length; 121 | const int32_t *src; 122 | int32_t *dst; 123 | 124 | if (length == 0 || (d.type > 0 && !(type & (1 << (d.type - 1))))) { 125 | if (type & UTF8LITE_CASEFOLD_ALL) { 126 | casefold(type, code, bufptr); 127 | } else { 128 | dst = *bufptr; 129 | *dst++ = code; 130 | *bufptr = dst; 131 | } 132 | } else if (length == 1) { 133 | utf8lite_map(type, d.data, bufptr); 134 | } else if (d.type >= 0) { 135 | src = &decomposition_mapping[d.data]; 136 | while (length-- > 0) { 137 | utf8lite_map(type, *src, bufptr); 138 | src++; 139 | } 140 | } else { 141 | hangul_decompose(code, bufptr); 142 | } 143 | } 144 | 145 | 146 | void utf8lite_order(int32_t *ptr, size_t len) 147 | { 148 | int32_t *end = ptr + len; 149 | int32_t *c_begin, *c_end, *c_tail, *c_ptr; 150 | int32_t code, code_prev; 151 | int32_t cl, cl_prev; 152 | 153 | while (ptr != end) { 154 | c_begin = ptr; 155 | code = *ptr++; 156 | cl = combining_class(code); 157 | 158 | // skip to the next combining mark 159 | if (cl == 0) { 160 | continue; 161 | } 162 | 163 
| // It takes 21 bits to encode a codepoint and 8 bits 164 | // to encode c combining class. 165 | // Mark the start of the combining mark sequence (c_begin) 166 | // encode the combining class in bits 22-29. 167 | *c_begin = code | (cl << UTF8LITE_CODE_BITS); 168 | 169 | // the combining mark sequence ends at the first starter 170 | // (c_end) 171 | c_end = ptr; 172 | while (c_end != end) { 173 | // until we hit a non-starter, encode the combining 174 | // class in the high 8 bits of the code 175 | code = *ptr++; 176 | cl = combining_class(code); 177 | if (cl == 0) { 178 | break; 179 | } 180 | 181 | *c_end = code | (cl << UTF8LITE_CODE_BITS); 182 | c_end++; 183 | } 184 | 185 | // sort the combining marks, using insertion sort (stable) 186 | for (c_tail = c_begin + 1; c_tail != c_end; c_tail++) { 187 | c_ptr = c_tail; 188 | code = *c_ptr; 189 | cl = code & (0xFF << UTF8LITE_CODE_BITS); 190 | 191 | while (c_ptr != c_begin) { 192 | code_prev = c_ptr[-1]; 193 | cl_prev = (code_prev 194 | & (0xFF << UTF8LITE_CODE_BITS)); 195 | 196 | if (cl_prev <= cl) { 197 | break; 198 | } 199 | 200 | // swap with previous item 201 | c_ptr[0] = code_prev; 202 | 203 | // move down 204 | c_ptr--; 205 | } 206 | 207 | // complete the final swap 208 | *c_ptr = code; 209 | } 210 | 211 | // remove the combining mark annotations 212 | while (c_begin != c_end) { 213 | code = *c_begin; 214 | *c_begin = code & (~(0xFF << UTF8LITE_CODE_BITS)); 215 | c_begin++; 216 | } 217 | } 218 | } 219 | 220 | 221 | 222 | 223 | static int has_compose(int32_t code, int *offsetptr, int *lengthptr) 224 | { 225 | const int32_t block_size = COMPOSITION_BLOCK_SIZE; 226 | unsigned i = composition_stage1[code / block_size]; 227 | struct composition c = composition_stage2[i][code % block_size]; 228 | int offset = (int)c.offset; 229 | int length = (int)c.length; 230 | 231 | *offsetptr = offset; 232 | *lengthptr = length; 233 | 234 | return (length > 0 ? 
1 : 0); 235 | } 236 | 237 | 238 | static int code_cmp(const void *x1, const void *x2) 239 | { 240 | int32_t y1 = *(const int32_t *)x1; 241 | int32_t y2 = *(const int32_t *)x2; 242 | 243 | if (y1 < y2) { 244 | return -1; 245 | } else if (y1 > y2) { 246 | return +1; 247 | } else { 248 | return 0; 249 | } 250 | } 251 | 252 | 253 | static int combiner_find(int offset, int length, int32_t code) 254 | { 255 | const int32_t *base = composition_combiner + offset; 256 | const int32_t *ptr; 257 | 258 | // handle empty and singleton case 259 | if (length == 0) { 260 | return -1; 261 | } else if (length == 1) { 262 | return (*base == code) ? 0 : -1; 263 | } 264 | 265 | // handle general case 266 | ptr = bsearch(&code, base, (size_t)length, sizeof(*base), code_cmp); 267 | 268 | if (ptr == NULL) { 269 | return -1; 270 | } else { 271 | return (int)(ptr - base); 272 | } 273 | } 274 | 275 | 276 | static int has_combiner(int32_t left, int offset, int length, int32_t code, 277 | int32_t *primaryptr) 278 | { 279 | int i; 280 | 281 | if (offset < COMPOSITION_HANGUL_LPART) { 282 | i = combiner_find(offset, length, code); 283 | if (i >= 0) { 284 | *primaryptr = composition_primary[offset + i]; 285 | return 1; 286 | } 287 | } else if (offset == COMPOSITION_HANGUL_LPART) { 288 | if (is_hangul_vpart(code)) { 289 | *primaryptr = hangul_compose_lv(left, code); 290 | return 1; 291 | } 292 | } else if (offset == COMPOSITION_HANGUL_LVPART) { 293 | if (is_hangul_tpart(code)) { 294 | *primaryptr = hangul_compose_lvt(left, code); 295 | return 1; 296 | } 297 | } 298 | 299 | return 0; 300 | } 301 | 302 | 303 | void utf8lite_compose(int32_t *ptr, size_t *lenptr) 304 | { 305 | size_t len = *lenptr; 306 | int32_t *begin = ptr; 307 | int32_t *end = begin + len; 308 | int32_t *leftptr, *dst; 309 | int32_t left = 0, code, prim; 310 | uint8_t code_ccc, prev_ccc = 0; 311 | int moff = 0, mlen = 0; 312 | int blocked, has_prev, did_del; 313 | 314 | did_del = 0; 315 | 316 | // find the first combining starter 
(the left code point, L) 317 | leftptr = begin; 318 | while (leftptr != end) { 319 | left = *leftptr; 320 | if (has_compose(left, &moff, &mlen)) { 321 | break; 322 | } 323 | leftptr++; 324 | } 325 | 326 | if (leftptr == end) { 327 | goto out; 328 | } 329 | 330 | ptr = leftptr + 1; 331 | has_prev = 0; 332 | while (ptr != end) { 333 | code = *ptr; 334 | code_ccc = combining_class(code); 335 | 336 | // determine whether the code is blocked 337 | if (has_prev && prev_ccc >= code_ccc) { 338 | blocked = 1; 339 | } else { 340 | blocked = 0; 341 | } 342 | 343 | if (!blocked && has_combiner(left, moff, mlen, code, &prim)) { 344 | // replace L by P 345 | *leftptr = prim; 346 | left = prim; 347 | has_compose(left, &moff, &mlen); 348 | 349 | // delete C 350 | *ptr = UTF8LITE_CODE_NONE; 351 | did_del = 1; 352 | } else if (code_ccc == 0) { 353 | // new leftmost combining starter, L 354 | leftptr = ptr; 355 | left = code; 356 | has_compose(left, &moff, &mlen); 357 | has_prev = 0; 358 | } else { 359 | prev_ccc = code_ccc; 360 | has_prev = 1; 361 | } 362 | ptr++; 363 | } 364 | 365 | // remove the deleted entries 366 | if (did_del) { 367 | ptr = begin; 368 | dst = begin; 369 | while (ptr != end) { 370 | code = *ptr++; 371 | if (code != UTF8LITE_CODE_NONE) { 372 | *dst++ = code; 373 | } 374 | } 375 | len = (size_t)(dst - begin); 376 | } 377 | 378 | out: 379 | *lenptr = len; 380 | } 381 | -------------------------------------------------------------------------------- /src/private/array.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #ifndef UTF8LITE_ARRAY_H 18 | #define UTF8LITE_ARRAY_H 19 | 20 | /** 21 | * \file array.h 22 | * 23 | * Dynamic array, growing to accommodate more elements. 24 | */ 25 | 26 | #include 27 | 28 | /** 29 | * Grow an array to accommodate more elements, possibly re-allocating. 30 | * 31 | * \param baseptr pointer to pointer to first element 32 | * \param sizeptr pointer to the capacity (in elements) of the array 33 | * \param width size of each element 34 | * \param count number of occupied elements 35 | * \param nadd number of elements to append after the `count` occupied 36 | * elements 37 | * 38 | * \returns 0 on success 39 | */ 40 | int utf8lite_array_grow(void **baseptr, int *sizeptr, size_t width, int count, 41 | int nadd); 42 | 43 | /** 44 | * Determine the capacity for an array that needs to grow. 45 | * 46 | * \param sizeptr pointer to the capacity (in elements) of the array 47 | * \param width size of each element 48 | * \param count number of occupied elements 49 | * \param nadd number of elements to append after the `count` occupied 50 | * elements 51 | * 52 | * \returns 0 on success, `UTF8LITE_ERROR_OVERFLOW` on overflow 53 | */ 54 | int utf8lite_array_size_add(int *sizeptr, size_t width, int count, int nadd); 55 | 56 | /** 57 | * Grow an big array to accommodate more elements, possibly re-allocating. 
58 | * 59 | * \param baseptr pointer to pointer to first element 60 | * \param sizeptr pointer to the capacity (in elements) of the array 61 | * \param width size of each element 62 | * \param count number of occupied elements 63 | * \param nadd number of elements to append after the `count` occupied 64 | * elements 65 | * 66 | * \returns 0 on success 67 | */ 68 | int utf8lite_bigarray_grow(void **baseptr, size_t *sizeptr, size_t width, 69 | size_t count, size_t nadd); 70 | 71 | /** 72 | * Determine the capacity for an array that needs to grow. 73 | * 74 | * \param sizeptr pointer to the capacity (in elements) of the array 75 | * \param width size of each element 76 | * \param count number of occupied elements 77 | * \param nadd number of elements to append after the `count` occupied 78 | * elements 79 | * 80 | * \returns 0 on success, `UTF8LITE_ERROR_OVERFLOW` on overflow 81 | */ 82 | int utf8lite_bigarray_size_add(size_t *sizeptr, size_t width, size_t count, 83 | size_t nadd); 84 | 85 | #endif /* UTF8LITE_ARRAY_H */ 86 | -------------------------------------------------------------------------------- /src/text.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include "utf8lite.h" 22 | 23 | 24 | int utf8lite_text_init_copy(struct utf8lite_text *text, 25 | const struct utf8lite_text *other) 26 | { 27 | size_t size = UTF8LITE_TEXT_SIZE(other); 28 | size_t attr = other->attr; 29 | 30 | if (other->ptr) { 31 | if (!(text->ptr = malloc(size + 1))) { 32 | return UTF8LITE_ERROR_NOMEM; 33 | } 34 | 35 | memcpy(text->ptr, other->ptr, size); 36 | text->ptr[size] = '\0'; 37 | } else { 38 | text->ptr = NULL; 39 | } 40 | text->attr = attr; 41 | return 0; 42 | } 43 | 44 | 45 | void utf8lite_text_destroy(struct utf8lite_text *text) 46 | { 47 | free(text->ptr); 48 | } 49 | 50 | 51 | int utf8lite_text_isascii(const struct utf8lite_text *text) 52 | { 53 | struct utf8lite_text_iter it; 54 | 55 | utf8lite_text_iter_make(&it, text); 56 | while (utf8lite_text_iter_advance(&it)) { 57 | if (!UTF8LITE_IS_ASCII(it.current)) { 58 | return 0; 59 | } 60 | } 61 | return 1; 62 | } 63 | 64 | 65 | // Dan Bernstein's djb2 XOR hash: http://www.cse.yorku.ca/~oz/hash.html 66 | #define HASH_SEED 5381 67 | #define HASH_COMBINE(seed, v) (((hash) << 5) + (hash)) ^ ((size_t)(v)) 68 | 69 | 70 | static size_t hash_raw(const struct utf8lite_text *text) 71 | { 72 | const uint8_t *ptr = text->ptr; 73 | const uint8_t *end = ptr + UTF8LITE_TEXT_SIZE(text); 74 | size_t hash = HASH_SEED; 75 | size_t ch; 76 | 77 | while (ptr != end) { 78 | ch = *ptr++; 79 | hash = HASH_COMBINE(hash, ch); 80 | } 81 | 82 | return hash; 83 | } 84 | 85 | 86 | size_t utf8lite_text_hash(const struct utf8lite_text *text) 87 | { 88 | uint8_t buf[4]; 89 | const uint8_t *ptr = text->ptr; 90 | const uint8_t *end = ptr + UTF8LITE_TEXT_SIZE(text); 91 | uint8_t *bufptr, *bufend; 92 | size_t hash = HASH_SEED; 93 | int32_t code; 94 | uint_fast8_t ch; 95 | 96 | if (!UTF8LITE_TEXT_HAS_ESC(text)) { 97 | return hash_raw(text); 98 | } 99 | 100 | while (ptr != end) { 101 | ch = *ptr++; 102 | if (ch == '\\') { 103 | 
utf8lite_decode_escape(&ptr, &code); 104 | 105 | bufptr = buf; 106 | bufend = bufptr; 107 | utf8lite_encode_utf8(code, &bufend); 108 | 109 | while (bufptr != bufend) { 110 | ch = *bufptr++; 111 | hash = HASH_COMBINE(hash, ch); 112 | } 113 | } else { 114 | hash = HASH_COMBINE(hash, ch); 115 | } 116 | } 117 | 118 | return hash; 119 | } 120 | 121 | 122 | int utf8lite_text_equals(const struct utf8lite_text *text1, 123 | const struct utf8lite_text *text2) 124 | { 125 | struct utf8lite_text_iter it1, it2; 126 | size_t n; 127 | 128 | if (text1->attr == text2->attr) { 129 | // same bits and size 130 | n = UTF8LITE_TEXT_SIZE(text1); 131 | return !memcmp(text1->ptr, text2->ptr, n); 132 | } else if (UTF8LITE_TEXT_BITS(text1) == UTF8LITE_TEXT_BITS(text2)) { 133 | // same bits, different size 134 | return 0; 135 | } else { 136 | // different bits or different size 137 | utf8lite_text_iter_make(&it1, text1); 138 | utf8lite_text_iter_make(&it2, text2); 139 | while (utf8lite_text_iter_advance(&it1)) { 140 | utf8lite_text_iter_advance(&it2); 141 | if (it1.current != it2.current) { 142 | return 0; 143 | } 144 | } 145 | return !utf8lite_text_iter_advance(&it2); 146 | } 147 | } 148 | 149 | 150 | static int compare_raw(const struct utf8lite_text *text1, 151 | const struct utf8lite_text *text2) 152 | { 153 | size_t n1 = UTF8LITE_TEXT_SIZE(text1); 154 | size_t n2 = UTF8LITE_TEXT_SIZE(text2); 155 | size_t n = (n1 < n2) ? 
n1 : n2; 156 | int cmp; 157 | 158 | cmp = memcmp(text1->ptr, text2->ptr, n); 159 | if (cmp == 0) { 160 | if (n1 < n2) { 161 | cmp = -1; 162 | } else if (n1 == n2) { 163 | cmp = 0; 164 | } else { 165 | cmp = +1; 166 | } 167 | } 168 | return cmp; 169 | } 170 | 171 | 172 | int utf8lite_text_compare(const struct utf8lite_text *text1, 173 | const struct utf8lite_text *text2) 174 | { 175 | struct utf8lite_text_iter it1, it2; 176 | 177 | if (!UTF8LITE_TEXT_HAS_ESC(text1) && !UTF8LITE_TEXT_HAS_ESC(text2)) { 178 | return compare_raw(text1, text2); 179 | } 180 | 181 | utf8lite_text_iter_make(&it1, text1); 182 | utf8lite_text_iter_make(&it2, text2); 183 | while (utf8lite_text_iter_advance(&it1)) { 184 | utf8lite_text_iter_advance(&it2); 185 | if (it1.current < it2.current) { 186 | return -1; 187 | } else if (it1.current > it2.current) { 188 | return +1; 189 | } 190 | } 191 | 192 | return utf8lite_text_iter_advance(&it2) ? -1 : 0; 193 | } 194 | -------------------------------------------------------------------------------- /src/textassign.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include "utf8lite.h" 22 | 23 | 24 | static int assign_esc(struct utf8lite_text *text, 25 | const uint8_t *ptr, size_t size, 26 | struct utf8lite_message *msg); 27 | static void assign_esc_unsafe(struct utf8lite_text *text, const uint8_t *ptr, 28 | size_t size); 29 | static int assign_raw(struct utf8lite_text *text, const uint8_t *ptr, 30 | size_t size, struct utf8lite_message *msg); 31 | static void assign_raw_unsafe(struct utf8lite_text *text, const uint8_t *ptr, 32 | size_t size); 33 | 34 | static void append_location(struct utf8lite_message *msg, size_t offset); 35 | 36 | 37 | int utf8lite_text_assign(struct utf8lite_text *text, const uint8_t *ptr, 38 | size_t size, int flags, struct utf8lite_message *msg) 39 | { 40 | int err = 0; 41 | 42 | if (size > UTF8LITE_TEXT_SIZE_MAX) { 43 | err = UTF8LITE_ERROR_OVERFLOW; 44 | utf8lite_message_set(msg, "text size (%"PRIu64" bytes)" 45 | " exceeds maximum (%"PRIu64" bytes)", 46 | (uint64_t)size, 47 | (uint64_t)UTF8LITE_TEXT_SIZE_MAX); 48 | } else if (flags & UTF8LITE_TEXT_UNESCAPE) { 49 | if (flags & UTF8LITE_TEXT_VALID) { 50 | assign_esc_unsafe(text, ptr, size); 51 | } else { 52 | err = assign_esc(text, ptr, size, msg); 53 | } 54 | } else { 55 | if (flags & UTF8LITE_TEXT_VALID) { 56 | assign_raw_unsafe(text, ptr, size); 57 | } else { 58 | err = assign_raw(text, ptr, size, msg); 59 | } 60 | } 61 | 62 | if (err) { 63 | text->ptr = NULL; 64 | text->attr = 0; 65 | } 66 | 67 | return err; 68 | } 69 | 70 | 71 | int assign_raw(struct utf8lite_text *text, const uint8_t *ptr, size_t size, 72 | struct utf8lite_message *msg) 73 | { 74 | const uint8_t *input = ptr; 75 | const uint8_t *end = ptr + size; 76 | uint_fast8_t ch; 77 | int err; 78 | 79 | text->ptr = (uint8_t *)ptr; 80 | 81 | while (ptr != end) { 82 | ch = *ptr++; 83 | if (ch & 0x80) { 84 | ptr--; 85 | if ((err = utf8lite_scan_utf8(&ptr, end, msg))) { 86 | goto error; 87 | } 88 | } 89 | } 90 | 91 | 
text->attr = size; 92 | return 0; 93 | 94 | error: 95 | append_location(msg, (size_t)(ptr - input)); 96 | text->ptr = NULL; 97 | text->attr = 0; 98 | return err; 99 | } 100 | 101 | 102 | int assign_esc(struct utf8lite_text *text, const uint8_t *ptr, size_t size, 103 | struct utf8lite_message *msg) 104 | { 105 | const uint8_t *input = ptr; 106 | const uint8_t *end = ptr + size; 107 | size_t attr = 0; 108 | uint_fast8_t ch; 109 | int err; 110 | 111 | text->ptr = (uint8_t *)ptr; 112 | 113 | while (ptr != end) { 114 | ch = *ptr++; 115 | if (ch == '\\') { 116 | attr |= UTF8LITE_TEXT_ESC_BIT; 117 | 118 | if ((err = utf8lite_scan_escape(&ptr, end, msg))) { 119 | goto error; 120 | } 121 | } else if (ch & 0x80) { 122 | ptr--; 123 | if ((err = utf8lite_scan_utf8(&ptr, end, msg))) { 124 | goto error; 125 | } 126 | } 127 | } 128 | 129 | attr |= size; 130 | text->attr = attr; 131 | return 0; 132 | 133 | error: 134 | append_location(msg, (size_t)(ptr - input)); 135 | return err; 136 | } 137 | 138 | 139 | void assign_raw_unsafe(struct utf8lite_text *text, const uint8_t *ptr, 140 | size_t size) 141 | { 142 | const uint8_t *end = ptr + size; 143 | uint_fast8_t ch; 144 | 145 | text->ptr = (uint8_t *)ptr; 146 | 147 | while (ptr != end) { 148 | ch = *ptr++; 149 | if (ch & 0x80) { 150 | ptr += UTF8LITE_UTF8_TAIL_LEN(ch); 151 | } 152 | } 153 | 154 | text->attr = size; 155 | } 156 | 157 | 158 | 159 | void assign_esc_unsafe(struct utf8lite_text *text, const uint8_t *ptr, 160 | size_t size) 161 | { 162 | const uint8_t *end = ptr + size; 163 | size_t attr = 0; 164 | int32_t code; 165 | uint_fast8_t ch; 166 | 167 | text->ptr = (uint8_t *)ptr; 168 | 169 | while (ptr != end) { 170 | ch = *ptr++; 171 | if (ch == '\\') { 172 | attr |= UTF8LITE_TEXT_ESC_BIT; 173 | ch = *ptr++; 174 | 175 | switch (ch) { 176 | case 'u': 177 | utf8lite_decode_uescape(&ptr, &code); 178 | break; 179 | default: 180 | break; 181 | } 182 | } else if (ch & 0x80) { 183 | ptr += UTF8LITE_UTF8_TAIL_LEN(ch); 184 | } 185 | } 
186 | 187 | attr |= size; 188 | text->attr = attr; 189 | } 190 | 191 | 192 | void append_location(struct utf8lite_message *msg, size_t offset) 193 | { 194 | utf8lite_message_append(msg, " at position %"PRIu64, 195 | (uint64_t)(offset + 1)); 196 | } 197 | -------------------------------------------------------------------------------- /src/textiter.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include "utf8lite.h" 19 | 20 | /* http://stackoverflow.com/a/11986885 */ 21 | #define hextoi(ch) ((ch > '9') ? 
(ch &~ 0x20) - 'A' + 10 : (ch - '0'))

static void iter_retreat_escaped(struct utf8lite_text_iter *it,
				 const uint8_t *begin);
static void iter_retreat_raw(struct utf8lite_text_iter *it);

/* Initialize an iterator over `text`, positioned before the first
 * character (current == UTF8LITE_CODE_NONE until the first advance). */
void utf8lite_text_iter_make(struct utf8lite_text_iter *it,
			     const struct utf8lite_text *text)
{
	it->ptr = text->ptr;
	it->end = it->ptr + UTF8LITE_TEXT_SIZE(text);
	it->text_attr = text->attr;
	it->current = UTF8LITE_CODE_NONE;
}


/* Decode the next character into it->current; returns 0 at the end of
 * the text.  Backslash escapes are decoded only when the text's ESC
 * attribute bit is set. */
int utf8lite_text_iter_advance(struct utf8lite_text_iter *it)
{
	const uint8_t *ptr = it->ptr;
	size_t text_attr = it->text_attr;
	int32_t code;

	if (it->ptr == it->end) {
		goto at_end;
	}

	code = *ptr++;

	if (code == '\\' && (text_attr & UTF8LITE_TEXT_ESC_BIT)) {
		// escaped character; decode the escape sequence
		utf8lite_decode_escape(&ptr, &code);
	} else if (code >= 0x80) {
		// multi-byte UTF-8; re-decode from the lead byte
		ptr--;
		utf8lite_decode_utf8(&ptr, &code);
	}

	it->ptr = ptr;
	it->current = code;
	return 1;

at_end:
	it->current = UTF8LITE_CODE_NONE;
	return 0;
}


/* Jump the iterator past the last character. */
void utf8lite_text_iter_skip(struct utf8lite_text_iter *it)
{
	it->ptr = it->end;
	it->current = UTF8LITE_CODE_NONE;
}


/* Step the iterator back one character; returns 0 at the start of the
 * text.  Requires two backward decodes: one to find the start of the
 * current character, one to decode the previous character. */
int utf8lite_text_iter_retreat(struct utf8lite_text_iter *it)
{
	const size_t size = (it->text_attr & UTF8LITE_TEXT_SIZE_MASK);
	const uint8_t *begin = it->end - size;
	const uint8_t *ptr = it->ptr;
	const uint8_t *end = it->end;
	int32_t code = it->current;

	if (ptr == begin) {
		return 0;
	}

	if (it->text_attr & UTF8LITE_TEXT_ESC_BIT) {
		iter_retreat_escaped(it, begin);
	} else {
		iter_retreat_raw(it);
	}

	// we were at the end of the text
	if (code == UTF8LITE_CODE_NONE) {
		it->ptr = end;
		return 1;
	}

	// at this point, it->code == code, and it->ptr is the code start
	ptr = it->ptr;

	if (ptr == begin) {
		// the current character was the first one; nothing before it
		it->current = UTF8LITE_CODE_NONE;
		return 0;
	}

	// read the previous code
	if (it->text_attr & UTF8LITE_TEXT_ESC_BIT) {
		iter_retreat_escaped(it, begin);
	} else {
		iter_retreat_raw(it);
	}

	// now, it->code is the previous code, and it->ptr is the start
	// of the previous code

	// set the pointer to the end of the previous code
	it->ptr = ptr;
	return 1;
}


/* Rewind the iterator to its initial position before the first character. */
void utf8lite_text_iter_reset(struct utf8lite_text_iter *it)
{
	const size_t size = (it->text_attr & UTF8LITE_TEXT_SIZE_MASK);
	const uint8_t *begin = it->end - size;

	it->ptr = begin;
	it->current = UTF8LITE_CODE_NONE;
}


/* Decode the character ending just before it->ptr (no escapes), leaving
 * it->ptr at its first byte and it->current holding its value. */
void iter_retreat_raw(struct utf8lite_text_iter *it)
{
	const uint8_t *ptr = it->ptr;
	int32_t code;

	code = *(--ptr);

	if (code < 0x80) {
		// ASCII: single byte, decoded directly
		it->ptr = (uint8_t *)ptr;
		it->current = code;
	} else {
		// skip over continuation bytes
		do {
			ptr--;
		} while (*ptr < 0xC0);

		it->ptr = (uint8_t *)ptr;

		utf8lite_decode_utf8(&ptr, &it->current);
	}
}


// we are at an escape if we are preceded by an odd number of
// backslash (\) characters
static int at_escape(const uint8_t *begin, const uint8_t *ptr)
{
	int at = 0;
	uint_fast8_t prev;

	while (begin < ptr) {
		prev = *(--ptr);

		if (prev != '\\') {
			goto out;
		}

		// each backslash toggles the "escaped" state
		at = !at;
	}

out:
	return at;
}


/* Decode the (possibly escaped) character ending just before it->ptr,
 * leaving it->ptr at its first byte and it->current holding its value.
 * Handles 2-byte escapes (\n, \t, ...), \uXXXX escapes (including a
 * UTF-16 surrogate pair spelled as two \u escapes), and raw UTF-8. */
void iter_retreat_escaped(struct utf8lite_text_iter *it, const uint8_t *begin)
{
	const uint8_t *ptr = it->ptr;
	int32_t code, unesc, hi;
	int i;

	code = *(--ptr);

	// check for 2-byte escape
	switch (code) {
	case '"':
	case '\\':
	case '/':
		// these unescape to themselves
		unesc = code;
		break;

	case 'b':
		unesc = '\b';
		break;

	case 'f':
		unesc = '\f';
		break;
	case 'n':
		unesc = '\n';
		break;

	case 'r':
		unesc = '\r';
		break;

	case 't':
		unesc = '\t';
		break;

	default:
		unesc = 0;
		break;
	}

	if (unesc) {
		// only a real escape if preceded by an odd run of backslashes
		if (at_escape(begin, ptr)) {
			ptr--;
			code = unesc;
		}
		goto out;
	}

	// check for 6-byte escape (\uXXXX, with `code` its last hex digit)
	if (isxdigit((int)code)) {
		if (!(begin + 4 < ptr && ptr[-4] == 'u'
		      && at_escape(begin, ptr - 4))) {
			goto out;
		}

		// decode the 4 hex digits
		code = 0;
		for (i = 0; i < 4; i++) {
			code = (code << 4) + hextoi(ptr[i - 3]);
		}
		ptr -= 5;

		if (UTF8LITE_IS_UTF16_LOW(code)) {
			// low surrogate: decode the preceding \uXXXX
			// high surrogate and combine the pair
			hi = 0;
			for (i = 0; i < 4; i++) {
				hi = (hi << 4) + hextoi(ptr[i - 4]);
			}

			code = UTF8LITE_DECODE_UTF16_PAIR(hi, code);
			ptr -= 6;
		}

		goto out;
	}

	// check for ascii
	if (code < 0x80) {
		goto out;
	}

	// if we got here, then code is a continuation byte

	// skip over preceding continuation bytes
	do {
		ptr--;
	} while (*ptr < 0xC0);

	// decode the utf-8 value
	it->ptr = (uint8_t *)ptr;
	utf8lite_decode_utf8(&ptr, &it->current);
	return;

out:
	it->ptr = (uint8_t *)ptr;
	it->current = code;
}
--------------------------------------------------------------------------------
/src/textmap.c:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Patrick O. Perry.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include "utf8lite.h" 24 | 25 | 26 | static void utf8lite_textmap_clear_type(struct utf8lite_textmap *map); 27 | static int utf8lite_textmap_set_type(struct utf8lite_textmap *map, int type); 28 | 29 | static int utf8lite_textmap_reserve(struct utf8lite_textmap *map, size_t size); 30 | static int utf8lite_textmap_set_ascii(struct utf8lite_textmap *map, 31 | const struct utf8lite_text *text); 32 | static int utf8lite_textmap_set_utf32(struct utf8lite_textmap *map, 33 | const int32_t *ptr, 34 | const int32_t *end); 35 | 36 | 37 | int utf8lite_textmap_init(struct utf8lite_textmap *map, int type) 38 | { 39 | int err; 40 | 41 | map->text.ptr = NULL; 42 | map->text.attr = 0; 43 | map->codes = NULL; 44 | map->size_max = 0; 45 | 46 | utf8lite_textmap_clear_type(map); 47 | err = utf8lite_textmap_set_type(map, type); 48 | return err; 49 | } 50 | 51 | 52 | void utf8lite_textmap_destroy(struct utf8lite_textmap *map) 53 | { 54 | free(map->codes); 55 | free(map->text.ptr); 56 | } 57 | 58 | 59 | void utf8lite_textmap_clear_type(struct utf8lite_textmap *map) 60 | { 61 | uint_fast8_t ch; 62 | 63 | map->charmap_type = UTF8LITE_DECOMP_NORMAL | UTF8LITE_CASEFOLD_NONE; 64 | 65 | for (ch = 0; ch < 0x80; ch++) { 66 | map->ascii_map[ch] = (int8_t)ch; 67 | } 68 | 69 | map->type = 0; 70 | } 71 | 72 | 73 | int utf8lite_textmap_set_type(struct utf8lite_textmap *map, int type) 74 | { 75 | int_fast8_t ch; 76 | 77 | if (map->type == type) { 78 | return 0; 
79 | } 80 | 81 | utf8lite_textmap_clear_type(map); 82 | 83 | if (type & UTF8LITE_TEXTMAP_CASE) { 84 | for (ch = 'A'; ch <= 'Z'; ch++) { 85 | map->ascii_map[ch] = ch + ('a' - 'A'); 86 | } 87 | 88 | map->charmap_type |= UTF8LITE_CASEFOLD_ALL; 89 | } 90 | 91 | if (type & UTF8LITE_TEXTMAP_COMPAT) { 92 | map->charmap_type = UTF8LITE_DECOMP_ALL; 93 | } 94 | 95 | map->type = type; 96 | 97 | return 0; 98 | } 99 | 100 | 101 | int utf8lite_textmap_reserve(struct utf8lite_textmap *map, size_t size) 102 | { 103 | uint8_t *ptr = map->text.ptr; 104 | int32_t *codes = map->codes; 105 | 106 | if (map->size_max >= size) { 107 | return 0; 108 | } 109 | 110 | if (!(ptr = realloc(ptr, size))) { 111 | return UTF8LITE_ERROR_NOMEM; 112 | } 113 | map->text.ptr = ptr; 114 | 115 | if (size > SIZE_MAX / UTF8LITE_UNICODE_DECOMP_MAX) { 116 | return UTF8LITE_ERROR_OVERFLOW; 117 | } 118 | 119 | if (!(codes = realloc(codes, size * UTF8LITE_UNICODE_DECOMP_MAX))) { 120 | return UTF8LITE_ERROR_NOMEM; 121 | } 122 | map->codes = codes; 123 | 124 | map->size_max = size; 125 | return 0; 126 | } 127 | 128 | 129 | int utf8lite_textmap_set(struct utf8lite_textmap *map, 130 | const struct utf8lite_text *text) 131 | { 132 | struct utf8lite_text_iter it; 133 | size_t size = UTF8LITE_TEXT_SIZE(text); 134 | int32_t *dst; 135 | int err; 136 | 137 | if (utf8lite_text_isascii(text)) { 138 | return utf8lite_textmap_set_ascii(map, text); 139 | } 140 | 141 | // For most inputs, mapping to type reduces or preserves the size. 142 | // However, for U+0390 and U+03B0, case folding triples the size. 143 | // (You can verify this with util/compute-typelen.py) 144 | // 145 | // Add one for a trailing NUL. 
146 | if (size > ((SIZE_MAX - 1) / 3)) { 147 | err = UTF8LITE_ERROR_OVERFLOW; 148 | goto out; 149 | } 150 | 151 | if ((err = utf8lite_textmap_reserve(map, 3 * size + 1))) { 152 | goto out; 153 | } 154 | 155 | dst = map->codes; 156 | utf8lite_text_iter_make(&it, text); 157 | while (utf8lite_text_iter_advance(&it)) { 158 | utf8lite_map(map->charmap_type, it.current, &dst); 159 | } 160 | 161 | size = (size_t)(dst - map->codes); 162 | utf8lite_order(map->codes, size); 163 | utf8lite_compose(map->codes, &size); 164 | 165 | if ((err = utf8lite_textmap_set_utf32(map, map->codes, 166 | map->codes + size))) { 167 | goto out; 168 | } 169 | 170 | out: 171 | return err; 172 | } 173 | 174 | 175 | int utf8lite_textmap_set_utf32(struct utf8lite_textmap *map, const int32_t *ptr, 176 | const int32_t *end) 177 | { 178 | int map_quote = map->type & UTF8LITE_TEXTMAP_QUOTE; 179 | int rm_di = map->type & UTF8LITE_TEXTMAP_RMDI; 180 | uint8_t *dst = map->text.ptr; 181 | int32_t code; 182 | int8_t ch; 183 | 184 | while (ptr != end) { 185 | code = *ptr++; 186 | 187 | if (code <= 0x7F) { 188 | ch = map->ascii_map[code]; 189 | if (ch >= 0) { 190 | *dst++ = (uint8_t)ch; 191 | } 192 | continue; 193 | } else { 194 | switch (code) { 195 | case 0x055A: // ARMENIAN APOSTROPHE 196 | case 0x2018: // LEFT SINGLE QUOTATION MARK 197 | case 0x2019: // RIGHT SINGLE QUOTATION MARK 198 | case 0x201B: // SINGLE HIGH-REVERSED-9 QUOTATION MARK 199 | case 0xFF07: // FULLWIDTH APOSTROPHE 200 | if (map_quote) { 201 | code = '\''; 202 | } 203 | break; 204 | 205 | default: 206 | if (rm_di && utf8lite_isignorable(code)) { 207 | continue; 208 | } 209 | break; 210 | } 211 | } 212 | utf8lite_encode_utf8(code, &dst); 213 | } 214 | 215 | *dst = '\0'; // not necessary, but helps with debugging 216 | map->text.attr = (UTF8LITE_TEXT_SIZE_MASK 217 | & ((size_t)(dst - map->text.ptr))); 218 | return 0; 219 | } 220 | 221 | 222 | int utf8lite_textmap_set_ascii(struct utf8lite_textmap *map, 223 | const struct utf8lite_text *text) 
224 | { 225 | struct utf8lite_text_iter it; 226 | size_t size = UTF8LITE_TEXT_SIZE(text); 227 | int8_t ch; 228 | uint8_t *dst; 229 | int err; 230 | 231 | assert(size < SIZE_MAX); 232 | 233 | if ((err = utf8lite_textmap_reserve(map, size + 1))) { 234 | goto error; 235 | } 236 | 237 | dst = map->text.ptr; 238 | 239 | utf8lite_text_iter_make(&it, text); 240 | while (utf8lite_text_iter_advance(&it)) { 241 | ch = map->ascii_map[it.current]; 242 | if (ch >= 0) { 243 | *dst++ = (uint8_t)ch; 244 | } 245 | } 246 | 247 | *dst = '\0'; // not necessary, but helps with debugging 248 | map->text.attr = (UTF8LITE_TEXT_SIZE_MASK 249 | & ((size_t)(dst - map->text.ptr))); 250 | return 0; 251 | 252 | error: 253 | return err; 254 | } 255 | -------------------------------------------------------------------------------- /src/wordscan.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
 */

/* NOTE(review): the two stdlib #include targets below were lost in
 * extraction; reconstructed from usage (assert, NULL) -- confirm
 * against the upstream file. */
#include <assert.h>
#include <stddef.h>
#include "utf8lite.h"
#include "private/emojiprop.h"
#include "private/wordbreak.h"
// NOTE(review): "utf8lite.h" appears included twice in this file;
// harmless if the header has an include guard, but worth deduplicating.
#include "utf8lite.h"


/* Initialize a word scanner over `text` and position it before the
 * first word.  Call utf8lite_wordscan_advance() to get each word. */
void utf8lite_wordscan_make(struct utf8lite_wordscan *scan,
			    const struct utf8lite_text *text)
{
	utf8lite_text_iter_make(&scan->iter, text);
	utf8lite_wordscan_reset(scan);
}


/* Shift the one-code-point lookahead window forward: the current
 * code/property become the previous ones, the lookahead becomes
 * current, and a new lookahead is decoded (WORD_BREAK_NONE at end of
 * text).  Also records whether the code point being consumed was a
 * ZWJ, for the WB3c emoji check at MaybeBreak. */
#define NEXT() \
	do { \
		follow_zwj = (scan->prop == WORD_BREAK_ZWJ); \
		scan->ptr = scan->iter_ptr; \
		scan->code = scan->iter.current; \
		scan->prop = scan->iter_prop; \
		scan->iter_ptr = scan->iter.ptr; \
		if (utf8lite_text_iter_advance(&scan->iter)) { \
			scan->iter_prop = word_break(scan->iter.current); \
		} else { \
			scan->iter_prop = WORD_BREAK_NONE; \
		} \
	} while (0)

/* WB4: skip over (Extend | Format | ZWJ)* so they attach to the
 * preceding character and never start a new word segment. */
#define EXTEND() \
	do { \
		while (scan->prop == WORD_BREAK_EXTEND \
				|| scan->prop == WORD_BREAK_FORMAT \
				|| scan->prop == WORD_BREAK_ZWJ) { \
			NEXT(); \
		} \
	} while (0)


/* Rewind the scanner to the start of the text, priming the current
 * code point and the one-code-point lookahead. */
void utf8lite_wordscan_reset(struct utf8lite_wordscan *scan)
{
	scan->current.ptr = NULL;
	// keep the text's flag bits, clear the size bits
	scan->current.attr = scan->iter.text_attr & ~UTF8LITE_TEXT_SIZE_MASK;

	utf8lite_text_iter_reset(&scan->iter);
	scan->ptr = scan->iter.ptr;

	if (utf8lite_text_iter_advance(&scan->iter)) {
		scan->code = scan->iter.current;
		scan->prop = word_break(scan->code);

		scan->iter_ptr = scan->iter.ptr;
		if (utf8lite_text_iter_advance(&scan->iter)) {
			scan->iter_prop = word_break(scan->iter.current);
		} else {
			scan->iter_prop = WORD_BREAK_NONE;
		}
	} else {
		scan->code = 0;
		scan->prop = WORD_BREAK_NONE;
		scan->iter_ptr = NULL;
		scan->iter_prop = WORD_BREAK_NONE;
	}
}


/* Peek past any (Extend | Format | ZWJ)* run following the lookahead
 * and return the first "significant" word-break property, or
 * WORD_BREAK_NONE at end of text.  Used for the two-character
 * lookahead in rules WB6/WB7b/WB12.  Does not move the scanner. */
static int next_signif_prop(const struct utf8lite_wordscan *scan)
{
	struct utf8lite_text_iter iter;
	int prop;

	switch (scan->iter_prop) {
	case WORD_BREAK_EXTEND:
	case WORD_BREAK_FORMAT:
	case WORD_BREAK_ZWJ:
		break;
	default:
		return scan->iter_prop;
	}

	// copy the iterator so the scan itself stays in place
	iter = scan->iter;
	while (utf8lite_text_iter_advance(&iter)) {
		prop = word_break(iter.current);
		switch (prop) {
		case WORD_BREAK_EXTEND:
		case WORD_BREAK_FORMAT:
		case WORD_BREAK_ZWJ:
			break;
		default:
			return prop;
		}
	}
	return WORD_BREAK_NONE;
}


/* Advance to the next word boundary (UAX #29 word segmentation,
 * implemented as a goto-based state machine with one label per
 * word-break property class).  On return scan->current holds the word;
 * returns 0 at end of text, nonzero otherwise. */
int utf8lite_wordscan_advance(struct utf8lite_wordscan *scan)
{
	int follow_zwj = 0;
	scan->current.ptr = (uint8_t *)scan->ptr;
	scan->current.attr &= ~UTF8LITE_TEXT_SIZE_MASK;

Start:
	// dispatch on the first code point's property
	switch ((enum word_break_prop)scan->prop) {
	case WORD_BREAK_NONE:
		// Break at the start and end of text unless the text is empty
		// WB2: Any + eot
		goto Break;

	case WORD_BREAK_CR:
		NEXT();
		goto CR;

	case WORD_BREAK_NEWLINE:
	case WORD_BREAK_LF:
		NEXT();
		goto Newline;

	case WORD_BREAK_WSEGSPACE:
		NEXT();
		goto WSegSpace;

	case WORD_BREAK_ALETTER:
		NEXT();
		goto ALetter;

	case WORD_BREAK_NUMERIC:
		NEXT();
		goto Numeric;

	case WORD_BREAK_EXTENDNUMLET:
		NEXT();
		goto ExtendNumLet;

	case WORD_BREAK_HEBREW_LETTER:
		NEXT();
		goto Hebrew_Letter;

	case WORD_BREAK_KATAKANA:
		NEXT();
		goto Katakana;

	case WORD_BREAK_REGIONAL_INDICATOR:
		NEXT();
		goto Regional_Indicator;

	case WORD_BREAK_DOUBLE_QUOTE:
	case WORD_BREAK_MIDLETTER:
	case WORD_BREAK_MIDNUM:
	case WORD_BREAK_MIDNUMLET:
	case WORD_BREAK_SINGLE_QUOTE:
	case WORD_BREAK_EXTEND: // marks
	case WORD_BREAK_FORMAT: // Cf format controls
	case WORD_BREAK_ZWJ:
	case WORD_BREAK_OTHER:
		NEXT();
		goto Any;
	}

	assert(0 && "Unhandled word break property");
	return 0;

CR:
	if (scan->prop == WORD_BREAK_LF) {
		// Do not break within CRLF
		// WB3: CR * LF
		NEXT();
	}

Newline:
	// Otherwise break after Newlines
	// WB3a: (Newline | CR | LF) +
	goto Break;


WSegSpace:
	// WB3d: Keep horizontal whitespace together.
	if (scan->prop == WORD_BREAK_WSEGSPACE) {
		NEXT();
		goto WSegSpace;
	}
	EXTEND();
	goto MaybeBreak;

ALetter:
	EXTEND();

	switch (scan->prop) {
	case WORD_BREAK_ALETTER:
		// Do not break between most letters
		// WB5: AHLetter * AHLetter
		NEXT();
		goto ALetter;

	case WORD_BREAK_HEBREW_LETTER:
		// WB5: AHLetter * AHLetter
		NEXT();
		goto Hebrew_Letter;

	case WORD_BREAK_MIDLETTER:
	case WORD_BREAK_MIDNUMLET:
	case WORD_BREAK_SINGLE_QUOTE:
		// Do not break across certain punctuation

		// WB6: AHLetter * (MidLetter | MidNumLetQ) AHLetter

		switch (next_signif_prop(scan)) {
		case WORD_BREAK_ALETTER:
			// WB7: AHLetter (MidLetter | MidNumLetQ) * AHLetter
			NEXT();
			EXTEND();
			NEXT();
			goto ALetter;
		case WORD_BREAK_HEBREW_LETTER:
			// WB7: AHLetter (MidLetter | MidNumLetQ) * AHLetter
			NEXT();
			EXTEND();
			NEXT();
			goto Hebrew_Letter;
		default:
			goto MaybeBreak;
		}

	case WORD_BREAK_NUMERIC:
		// Do not break within sequences of digits, or digits
		// adjacent to letters (“3a”, or “A3”).
		// WB9: AHLetter * Numeric
		NEXT();
		goto Numeric;

	case WORD_BREAK_EXTENDNUMLET:
		// Do not break from extenders
		// WB13a: AHLetter * ExtendNumLet
		NEXT();
		goto ExtendNumLet;

	default:
		goto MaybeBreak;
	}

Hebrew_Letter:
	EXTEND();

	switch (scan->prop) {
	case WORD_BREAK_ALETTER:
		// Do not break between most letters
		// WB5: AHLetter * AHLetter
		NEXT();
		goto ALetter;

	case WORD_BREAK_HEBREW_LETTER:
		// WB5: AHLetter * AHLetter
		NEXT();
		goto Hebrew_Letter;

	case WORD_BREAK_MIDLETTER:
	case WORD_BREAK_MIDNUMLET:
	case WORD_BREAK_SINGLE_QUOTE:
		// Do not break across certain punctuation

		// WB6: AHLetter * (MidLetter | MidNumLetQ) * AHLetter
		switch (next_signif_prop(scan)) {
		case WORD_BREAK_HEBREW_LETTER:
			// WB7:
			// AHLetter (MidLetter | MidNumLetQ) * AHLetter
			NEXT();
			EXTEND();
			NEXT();
			goto Hebrew_Letter;
		case WORD_BREAK_ALETTER:
			// WB7:
			// AHLetter (MidLetter | MidNumLetQ) * AHLetter
			NEXT();
			EXTEND();
			NEXT();
			goto ALetter;
		default:
			break;
		}

		// WB7a: Hebrew_Letter * Single_Quote
		if (scan->prop == WORD_BREAK_SINGLE_QUOTE) {
			NEXT();
			goto Any;
		}

		goto MaybeBreak;


	case WORD_BREAK_DOUBLE_QUOTE:
		// WB7b: Hebrew_Letter * Double_Quote Hebrew_Letter
		switch (next_signif_prop(scan)) {
		case WORD_BREAK_HEBREW_LETTER:
			// WB7c:
			// Hebrew_Letter Double_Quote * Hebrew_Letter
			NEXT();
			EXTEND();
			NEXT();
			goto Hebrew_Letter;
		default:
			goto MaybeBreak;
		}

	case WORD_BREAK_NUMERIC:
		// WB9: AHLetter * Numeric
		NEXT();
		goto Numeric;

	case WORD_BREAK_EXTENDNUMLET:
		// WB13a: AHLetter * ExtendNumLet
		NEXT();
		goto ExtendNumLet;

	default:
		goto MaybeBreak;
	}

Numeric:
	EXTEND();

	switch (scan->prop) {
	case WORD_BREAK_NUMERIC:
		// WB8: Numeric * Numeric
		NEXT();
		goto Numeric;

	case WORD_BREAK_MIDNUMLET:
	case WORD_BREAK_SINGLE_QUOTE:
	case WORD_BREAK_MIDNUM:
		// WB12: Numeric * (MidNum | MidNumLetQ) Numeric
		if (next_signif_prop(scan) == WORD_BREAK_NUMERIC) {
			// WB11: Numeric (MidNum|MidNumLeqQ) * Numeric
			NEXT();
			EXTEND();
			NEXT();
			goto Numeric;
		}
		goto MaybeBreak;

	case WORD_BREAK_EXTENDNUMLET:
		// WB13a: Numeric * ExtendNumLet
		NEXT();
		goto ExtendNumLet;

	case WORD_BREAK_ALETTER:
		// WB10: Numeric * AHLetter
		NEXT();
		goto ALetter;

	case WORD_BREAK_HEBREW_LETTER:
		// WB10: Numeric * AHLetter
		NEXT();
		goto Hebrew_Letter;

	default:
		goto MaybeBreak;
	}

Katakana:
	EXTEND();

	switch (scan->prop) {
	case WORD_BREAK_KATAKANA:
		// WB13: Katakana * Katakana
		NEXT();
		goto Katakana;

	case WORD_BREAK_EXTENDNUMLET:
		// WB13a: Katakana * ExtendNumLet
		NEXT();
		goto ExtendNumLet;

	default:
		goto MaybeBreak;
	}

ExtendNumLet:
	EXTEND();

	switch (scan->prop) {
	case WORD_BREAK_ALETTER:
		// WB13b: ExtendNumLet * AHLetter
		NEXT();
		goto ALetter;

	case WORD_BREAK_NUMERIC:
		// WB13b: ExtendNumLet * Numeric
		NEXT();
		goto Numeric;

	case WORD_BREAK_EXTENDNUMLET:
		// WB13a: ExtendNumLet * ExtendNumLet
		NEXT();
		goto ExtendNumLet;

	case WORD_BREAK_HEBREW_LETTER:
		// WB13b: ExtendNumLet * AHLetter
		NEXT();
		goto Hebrew_Letter;

	case WORD_BREAK_KATAKANA:
		// WB13c: ExtendNumLet * Katakana
		NEXT();
		goto Katakana;

	default:
		goto MaybeBreak;
	}

Regional_Indicator:
	EXTEND();

	//fprintf(stderr, "Regional_Indicator: code = U+%04X\n", code);

	// Do not break within emoji flag sequences. That is, do not break
	// between regional indicator (RI) symbols if there is an odd number
	// of RI characters before the break point

	switch (scan->prop) {
	case WORD_BREAK_REGIONAL_INDICATOR:
		// WB15/16: [^RI] RI * RI
		// consume the pair, then stop: RI symbols pair up
		NEXT();
		EXTEND();
		goto MaybeBreak;

	default:
		// WB15/16: [^RI] RI * RI
		goto MaybeBreak;
	}

Any:
	EXTEND();
	goto MaybeBreak;

MaybeBreak:
	// WB3c: Do not break within emoji zwj sequences.
	// (follow_zwj is set by NEXT() when the last consumed code point
	// was a ZWJ; restart the machine on the pictographic that follows)
	if (follow_zwj && (emoji_prop(scan->code)
			   & EMOJI_PROP_EXTENDED_PICTOGRAPHIC)) {
		NEXT();
		goto Start;
	}
	goto Break;

Break:
	// record the word's size; an empty word signals end of text
	scan->current.attr |= (size_t)(scan->ptr - scan->current.ptr);
	return (scan->ptr == scan->current.ptr) ? 0 : 1;
}
-------------------------------------------------------------------------------- /tests/check_charwidth.c: --------------------------------------------------------------------------------
/*
 * Copyright 2017 Patrick O. Perry.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* NOTE(review): the three stdlib/framework #include targets below were
 * lost in extraction; reconstructed from usage (printf, EXIT_SUCCESS,
 * the Check framework macros) -- confirm against the upstream file. */
#include <stdio.h>
#include <stdlib.h>
#include <check.h>
#include "../src/utf8lite.h"
#include "wcwidth9/wcwidth9.h"
#include "testutil.h"

/* one expected-width table entry */
struct code_width {
	int32_t code;	// Unicode code point
	int width;	// expected UTF8LITE_CHARWIDTH_* class
};

/* Spot-check a few hand-picked code points against their expected
 * width classes, reporting every mismatch before failing. */
START_TEST(test_examples)
{
	struct code_width tests[] = {
		// Examples from https://github.com/patperry/r-utf8/issues/9
		{.code = 0x2139, .width = UTF8LITE_CHARWIDTH_NARROW},
		{.code = 0x2600, .width = UTF8LITE_CHARWIDTH_NARROW},
		{.code = 0x2728, .width = UTF8LITE_CHARWIDTH_EMOJI}
	};
	struct code_width *t;
	int i, n, ok, prop, prop0, nfail;

	n = (int)(sizeof(tests) / sizeof(tests[0]));
	nfail = 0;
	for (i = 0; i < n; i++) {
		t = &tests[i];
		prop0 = t->width;
		prop = utf8lite_charwidth(t->code);
		ok = (prop == prop0);

		if (!ok) {
			nfail++;
			printf("U+%04X expected: %d got: %d\n", t->code,
			       prop0, prop);
		}
	}

	ck_assert(nfail == 0);
}
END_TEST

/*
 * This check is kind of meaningless. wcwidth9 has Unicode 9.0.0, gives
 * different behavior for lots of characters.
 */
/* Compare every code point's width class against wcwidth9, with an
 * explicit compatibility mapping (and a few carve-outs for code points
 * whose classification changed in later Unicode versions). */
START_TEST(test_wcwidth9)
{
	int prop, prop0, ok, nfail;
	int32_t code;

	nfail = 0;
	for (code = 0; code <= UTF8LITE_CODE_MAX; code++) {
		// wcwidth9 only covers up to U+10FFFD
		prop0 = (code < 0x10FFFE) ? wcwidth9(code) : -3;
		prop = utf8lite_charwidth(code);

		if (code == 0x1F93B || code == 0x1F946) {
			// These characters changed from East Asian Wide
			// to Narrow in Unicode 13.0
			ok = prop == UTF8LITE_CHARWIDTH_NARROW;
			goto Check;
		} else if (0x1F1E6 <= code && code <= 0x1F1FF) {
			// regional indicators
			ok = prop == UTF8LITE_CHARWIDTH_NARROW;
			goto Check;
		}

		// map each utf8lite class to the wcwidth9 values it may
		// legitimately correspond to
		switch (prop) {
		case UTF8LITE_CHARWIDTH_NONE:
			ok = prop0 == -1 || prop0 == -3 || prop0 == 1;
			break;

		case UTF8LITE_CHARWIDTH_IGNORABLE:
			ok = prop0 == -1 || prop0 >= 1;
			break;

		case UTF8LITE_CHARWIDTH_MARK:
			ok = prop0 == -1 || prop0 == 1;
			break;

		case UTF8LITE_CHARWIDTH_NARROW:
			ok = prop0 == 1 || prop0 == -1 || code > 0xFFFF;
			break;

		case UTF8LITE_CHARWIDTH_AMBIGUOUS:
			ok = prop0 == -2;
			break;

		case UTF8LITE_CHARWIDTH_WIDE:
			ok = prop0 == 2 || prop0 == -1;
			break;

		case UTF8LITE_CHARWIDTH_EMOJI:
			ok = prop0 == 2 || prop0 == 1 || prop0 == -1 || prop0 == -2;
			break;

		default:
			ok = 0;
			break;
		}
Check:
		if (!ok) {
			nfail++;
			printf("U+%04X wcwidth9: %d utf8lite: %d\n", code, prop0, prop);
		}
	}

	ck_assert(nfail == 0);
}
END_TEST


/* Assemble the charwidth test suite for the Check runner. */
Suite *charwidth_suite(void)
{
	Suite *s;
	TCase *tc;

	s = suite_create("charwidth");
	tc = tcase_create("core");
	tcase_add_test(tc, test_examples);
	suite_add_tcase(s, tc);

	tc = tcase_create("wcwidth9");
	tcase_add_test(tc, test_wcwidth9);
	suite_add_tcase(s, tc);

	return s;
}


/* Run the suite; exit status reflects test failures. */
int main(void)
{
	int number_failed;
	Suite *s;
	SRunner *sr;

	s = charwidth_suite();
	sr = srunner_create(s);

	srunner_run_all(sr, CK_NORMAL);
	number_failed = srunner_ntests_failed(sr);
	srunner_free(sr);

	return (number_failed == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
}
-------------------------------------------------------------------------------- /tests/check_graphscan.c: --------------------------------------------------------------------------------
/*
 * Copyright 2017 Patrick O. Perry.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* NOTE(review): the three stdlib/framework #include targets below were
 * lost in extraction; reconstructed from usage (fopen/fscanf, alloc,
 * the Check framework macros) -- confirm against the upstream file. */
#include <stdio.h>
#include <stdlib.h>
#include <check.h>
#include "../src/utf8lite.h"
#include "testutil.h"

// path to the Unicode grapheme-break conformance data, relative to the
// project root (setup_unicode also tries one directory up)
#define GRAPH_BREAK_TEST "data/ucd/auxiliary/GraphemeBreakTest.txt"
// shared scanner under test, re-initialized by start()
struct utf8lite_graphscan scan;


/* per-test fixture setup: delegate to the shared testutil setup */
void setup_scan(void)
{
	setup();
}


/* per-test fixture teardown */
void teardown_scan(void)
{
	teardown();
}


/* Begin scanning `text` with the shared scanner. */
void start(const struct utf8lite_text *text)
{
	utf8lite_graphscan_make(&scan, text);
}


/* Advance and return a copy of the current grapheme, or NULL at the
 * end of the text.  The copy lives in testutil's arena (alloc). */
const struct utf8lite_text *next(void)
{
	struct utf8lite_text *graph;
	if (!utf8lite_graphscan_advance(&scan)) {
		return NULL;
	}
	graph = alloc(sizeof(*graph));
	*graph = scan.current.text;
	return graph;
}


/* Retreat and return a copy of the current grapheme, or NULL at the
 * start of the text. */
const struct utf8lite_text *prev(void)
{
	struct utf8lite_text *graph;
	if (!utf8lite_graphscan_retreat(&scan)) {
		return NULL;
	}
	graph = alloc(sizeof(*graph));
	*graph = scan.current.text;
	return graph;
}


/* Empty text yields no graphemes in either direction. */
START_TEST(test_empty)
{
	start(S(""));
	ck_assert(next() == NULL);
	ck_assert(next() == NULL);
	ck_assert(prev() == NULL);
	ck_assert(prev() == NULL);
}
END_TEST


/* A one-character text is a single grapheme; advancing and retreating
 * around it behaves consistently at both boundaries. */
START_TEST(test_single)
{
	start(S("x"));
	ck_assert(prev() == NULL);
	assert_text_eq(next(), S("x"));
	ck_assert(prev() == NULL);
	assert_text_eq(next(), S("x"));
	ck_assert(next() == NULL);
	ck_assert(next() == NULL);
	assert_text_eq(prev(), S("x"));
	ck_assert(prev() == NULL);
	ck_assert(prev() == NULL);
}
END_TEST


/* An emoji plus skin-tone modifier forms one grapheme (GB9). */
START_TEST(test_emoji_modifier)
{
	// This is an Extended_Pictographic followed by Extend
	start(JS("\\uD83D\\uDE0A\\uD83C\\uDFFB")); // U+1F60A U+1F3FB
	assert_text_eq(next(), JS("\\uD83D\\uDE0A\\uD83C\\uDFFB"));
	ck_assert(next() == NULL);
}
END_TEST


/* An emoji ZWJ sequence (woman-heart-kiss-woman) is one grapheme
 * (GB11). */
START_TEST(test_emoji_zwj_sequence)
{
	// \U0001F469\u200D\u2764\uFE0F\u200D\U0001F48B\u200D\U0001F469
	start(JS("\\ud83d\\udc69\\u200d\\u2764\\ufe0f\\u200d\\ud83d\\udc8b\\u200d\\ud83d\\udc69"));

	assert_text_eq(next(), JS("\\ud83d\\udc69\\u200d\\u2764\\ufe0f\\u200d\\ud83d\\udc8b\\u200d\\ud83d\\udc69"));

	ck_assert(next() == NULL);
}
END_TEST

// Check that isolated codepoints are single graphemes.
START_TEST(test_isolated)
{
	uint8_t buf[4];
	uint8_t *end;
	int32_t code;
	struct utf8lite_text text;

	for (code = 1; code <= 0x1FFF; code++) {
		if (!UTF8LITE_IS_UNICODE(code))
			continue;
		end = buf;
		utf8lite_encode_utf8(code, &end);
		utf8lite_text_assign(&text, buf, end - buf, 0, NULL);

		start(&text);
		assert_text_eq(next(), &text);
		ck_assert(next() == NULL);
	}
}
END_TEST

// Unicode Grapheme Break Test
// http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
/* One parsed line of GraphemeBreakTest.txt: the code points, whether a
 * break is allowed before each, and the resulting grapheme spans
 * (break_begin/break_end pairs) inside the UTF-8 buffer. */
struct unitest {
	char comment[4096];	// trailing "# ..." text from the data file
	unsigned line;		// 1-based line number in the data file
	int is_ascii;		// all code points <= U+007F

	struct utf8lite_text text;	// the encoded test string
	uint8_t buf[4096];		// backing storage for text

	int32_t code[256];		// code points, in order
	int can_break_before[256];	// 1 if a break is allowed before code[i]
	uint8_t *code_end[256];		// end of each code point's encoding
	unsigned ncode;

	uint8_t *break_begin[256];	// start of each expected grapheme
	uint8_t *break_end[256];	// end of each expected grapheme
	unsigned nbreak;

};

struct unitest unitests[4096];
unsigned nunitest;

/* Re-serialize a parsed test in the data file's "÷ XXXX × YYYY ÷"
 * notation (UTF-8 division/multiplication signs); debugging aid. */
void write_unitest(FILE *stream, const struct unitest *test)
{
	unsigned i, n = test->ncode;

	for (i = 0; i < n; i++) {
		fprintf(stream, "%s %04X ",
			(test->can_break_before[i]) ? "\xC3\xB7" : "\xC3\x97",
			test->code[i]);
	}
	fprintf(stream, "\xC3\xB7 %s\n", test->comment);
}

/* Fixture: parse GraphemeBreakTest.txt into the unitests[] array.
 * The parser is a byte-level state machine keyed on '#' (comment),
 * '\n' (end of test), and 0xC3 (first byte of the UTF-8 ÷ / × marks).
 * NOTE: when a line's trailing ÷ has no code point after it, the
 * fscanf below fails and the `else` branch closes the final grapheme
 * and decrements nbreak -- so break_end[nbreak] (one past the last
 * kept entry) ends up pointing at the end of the text.  The backward
 * test relies on that. */
void setup_unicode(void)
{
	struct unitest *test;
	FILE *file;
	unsigned code, line, nbreak, ncode;
	uint8_t *dst;
	char *comment;
	int ch, is_ascii;

	setup_scan();
	file = fopen(GRAPH_BREAK_TEST, "r");
	if (!file) {
		// also try from the tests/ directory
		file = fopen("../"GRAPH_BREAK_TEST, "r");
	}

	nunitest = 0;
	test = &unitests[0];

	line = 1;
	ncode = 0;
	nbreak = 0;
	is_ascii = 1;
	test->text.ptr = &test->buf[0];
	dst = test->text.ptr;

	ck_assert_msg(file != NULL, "file '"GRAPH_BREAK_TEST"' not found");
	while ((ch = fgetc(file)) != EOF) {
		switch (ch) {
		case '#':
			// capture the comment through end of line
			comment = &test->comment[0];
			do {
				*comment++ = (char)ch;
				ch = fgetc(file);
			} while (ch != EOF && ch != '\n');
			*comment = '\0';

			if (ch == EOF) {
				goto eof;
			}
			/* fallthrough */
		case '\n':
			*dst = '\0';

			test->line = line;
			test->is_ascii = is_ascii;
			test->text.attr = (size_t)(dst - test->text.ptr);

			// finalize the test only if it had code points
			if (ncode > 0) {
				test->ncode = ncode;
				test->nbreak = nbreak;
				ncode = 0;
				nbreak = 0;
				is_ascii = 1;
				nunitest++;
				test = &unitests[nunitest];
				test->text.ptr = &test->buf[0];
				test->comment[0] = '\0';
				dst = test->text.ptr;
			}
			line++;
			break;

		case 0xC3:
			ch = fgetc(file);
			if (ch == EOF) {
				goto eof;
			} else if (ch == 0x97) {
				// MULTIPLICATON SIGN (U+00D7) 0xC3 0x97
				test->can_break_before[ncode] = 0;
			} else if (ch == 0xB7) {
				// DIVISION SIGN (U+00F7) 0xC3 0xB7
				test->can_break_before[ncode] = 1;
			} else {
				goto inval;
			}

			if (test->can_break_before[ncode]) {
				// a break both opens a new grapheme and
				// closes the previous one
				test->break_begin[nbreak] = dst;
				if (nbreak > 0) {
					test->break_end[nbreak - 1] = dst;
				}
				nbreak++;
			}

			if (fscanf(file, "%x", &code)) {
				test->code[ncode] = (int32_t)code;
				if (code > 0x7F) {
					is_ascii = 0;
				}
				utf8lite_encode_utf8((int32_t)code, &dst);
				test->code_end[ncode] = dst;
				ncode++;
			} else {
				// trailing ÷ with no code point: close
				// the last grapheme and drop the extra
				// break slot (see function comment)
				test->break_end[nbreak - 1] = dst;
				nbreak--;
			}
			break;
		}

	}
eof:
	fclose(file);
	return;
inval:
	fprintf(stderr, "invalid character on line %d\n", line);
	fclose(file);
}


/* Fixture teardown: release testutil resources. */
void teardown_unicode(void)
{
	teardown_scan();
}


/* Run every conformance test forward, checking each grapheme's span. */
START_TEST(test_unicode_forward)
{
	struct unitest *test;
	unsigned i, j;

	for (i = 0; i < nunitest; i++) {
		test = &unitests[i];

		//fprintf(stderr, "[%u]: ", i);
		//write_unitest(stderr, test);
		utf8lite_graphscan_make(&scan, &test->text);

		for (j = 0; j < test->nbreak; j++) {
			//fprintf(stderr, "Break %u\n", j);
			ck_assert(utf8lite_graphscan_advance(&scan));
			ck_assert(scan.current.text.ptr
				  == test->break_begin[j]);
			ck_assert(scan.current.text.ptr
				  + UTF8LITE_TEXT_SIZE(&scan.current.text)
				  == test->break_end[j]);
		}
		ck_assert(!utf8lite_graphscan_advance(&scan));
	}
}
END_TEST


/* Skip to the end, then run every conformance test backward.
 * break_end[test->nbreak] is the end-of-text sentinel left behind by
 * the parser (see setup_unicode). */
START_TEST(test_unicode_backward)
{
	struct unitest *test;
	unsigned i, j;

	for (i = 0; i < nunitest; i++) {
		test = &unitests[i];

		//fprintf(stderr, "[%u]: ", i);
		//write_unitest(stderr, test);
		utf8lite_graphscan_make(&scan, &test->text);
		utf8lite_graphscan_skip(&scan);
		ck_assert(scan.current.text.ptr
			  == test->break_end[test->nbreak]);
		ck_assert(scan.current.text.attr == 0);

		j = test->nbreak;
		while (j-- > 0) {
			//fprintf(stderr, "Break %u\n", j);
			ck_assert(utf8lite_graphscan_retreat(&scan));
			ck_assert(scan.current.text.ptr
				  == test->break_begin[j]);
			ck_assert(scan.current.text.ptr
				  + UTF8LITE_TEXT_SIZE(&scan.current.text)
				  == test->break_end[j]);
		}
		//fprintf(stderr, "Start\n");
		ck_assert(!utf8lite_graphscan_retreat(&scan));
		ck_assert(!utf8lite_graphscan_retreat(&scan));
	}
}
END_TEST


/* Assemble the graphscan test suite for the Check runner. */
Suite *graphscan_suite(void)
{
	Suite *s;
	TCase *tc;

	s = suite_create("graphscan");
	tc = tcase_create("core");
	tcase_add_checked_fixture(tc, setup_scan, teardown_scan);
	tcase_add_test(tc, test_empty);
	tcase_add_test(tc, test_single);
	tcase_add_test(tc, test_emoji_modifier);
	tcase_add_test(tc, test_emoji_zwj_sequence);
	tcase_add_test(tc, test_isolated);
	suite_add_tcase(s, tc);

	tc = tcase_create("Unicode GraphemeBreakTest.txt");
	tcase_add_checked_fixture(tc, setup_unicode, teardown_unicode);
	tcase_add_test(tc, test_unicode_forward);
	tcase_add_test(tc, test_unicode_backward);
	suite_add_tcase(s, tc);

	return s;
}


/* Run the suite; exit status reflects test failures. */
int main(void)
{
	int number_failed;
	Suite *s;
	SRunner *sr;

	s = graphscan_suite();
	sr = srunner_create(s);

	srunner_run_all(sr, CK_NORMAL);
	number_failed = srunner_ntests_failed(sr);
	srunner_free(sr);

	return (number_failed == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
}
-------------------------------------------------------------------------------- /tests/check_textmap.c: --------------------------------------------------------------------------------
/*
 * Copyright 2017 Patrick O. Perry.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* NOTE(review): the three stdlib/framework #include targets below were
 * lost in extraction; reconstructed from usage (sprintf, memcpy, the
 * Check framework macros) -- confirm against the upstream file. */
#include <stdio.h>
#include <string.h>
#include <check.h>
#include "../src/utf8lite.h"
#include "testutil.h"


// short aliases for the textmap type flags under test
#define TEXTMAP_CASE UTF8LITE_TEXTMAP_CASE
#define TEXTMAP_COMPAT UTF8LITE_TEXTMAP_COMPAT
#define TEXTMAP_QUOTE UTF8LITE_TEXTMAP_QUOTE
#define TEXTMAP_RMDI UTF8LITE_TEXTMAP_RMDI


/* Run `text` through a textmap configured with `flags` and return a
 * NUL-terminated copy of the result, allocated in testutil's arena. */
struct utf8lite_text *get_map(const struct utf8lite_text *text, int flags)
{
	struct utf8lite_text *val;
	struct utf8lite_textmap map;
	size_t size;

	ck_assert(!utf8lite_textmap_init(&map, flags));
	ck_assert(!utf8lite_textmap_set(&map, text));

	size = UTF8LITE_TEXT_SIZE(&map.text);
	val = alloc(sizeof(*val));

	// copy out before destroying the map, which owns map.text.ptr
	val->ptr = alloc(size + 1);
	memcpy(val->ptr, map.text.ptr, size);
	val->ptr[size] = '\0';
	val->attr = map.text.attr;

	utf8lite_textmap_destroy(&map);
	return val;
}


/* Convenience wrapper: map with case folding only. */
struct utf8lite_text *casefold(const struct utf8lite_text *text)
{
	return get_map(text, TEXTMAP_CASE);
}


/* With no flags, mapping is the identity for plain and JSON-escaped
 * input alike. */
START_TEST(test_map_basic)
{
	assert_text_eq(get_map(S("hello"), 0), S("hello"));
	assert_text_eq(get_map(S("world"), 0), JS("world"));
	assert_text_eq(get_map(JS("foo"), 0), S("foo"));
}
END_TEST


/* Escapes: raw backslash sequences pass through untouched; decoded
 * JSON escapes and QUOTE normalization behave as configured. */
START_TEST(test_map_esc)
{
	// backslash
	assert_text_eq(get_map(S("\\"), 0), S("\\"));
	assert_text_eq(get_map(JS("\\\\"), 0), S("\\"));
	assert_text_eq(get_map(JS("\\u005C"), 0), S("\\"));
	assert_text_eq(get_map(S("\\\\"), 0), S("\\\\"));
	assert_text_eq(get_map(S("\\u005C"), TEXTMAP_CASE), S("\\u005c"));

	// quote (')
	assert_text_eq(get_map(S("'"), TEXTMAP_QUOTE), S("'"));
	assert_text_eq(get_map(JS("'"), TEXTMAP_QUOTE), S("'"));
	assert_text_eq(get_map(S("\""), TEXTMAP_QUOTE), S("\""));
	assert_text_eq(get_map(JS("\\\""), TEXTMAP_QUOTE), S("\""));
	assert_text_eq(get_map(JS("\\u2019"), TEXTMAP_QUOTE), S("\'"));
	//assert_text_eq(get_map(JS("\\u201c"), TEXTMAP_QUOTE), S("\""));
	assert_text_eq(get_map(S("\\\'"), TEXTMAP_QUOTE), S("\\\'"));
	assert_text_eq(get_map(S("\\u2019"), TEXTMAP_QUOTE), S("\\u2019"));
}
END_TEST


/* C0 control characters (except the whitespace range U+0009..U+000D)
 * and DEL survive the mapping unchanged. */
START_TEST(test_keep_control_ascii)
{
	const struct utf8lite_text *js, *t;
	char str[256];
	uint8_t i;

	assert_text_eq(get_map(S("\a"), 0), S("\a"));
	assert_text_eq(get_map(S("\b"), 0), S("\b"));

	// C0
	for (i = 1; i < 0x20; i++) {
		if (0x09 <= i && i <= 0x0D) {
			continue;
		}
		str[0] = (char)i; str[1] = '\0';
		t = S(str);
		assert_text_eq(get_map(t, 0), t);

		sprintf(str, "\\u%04X", i);
		js = JS(str);
		assert_text_eq(get_map(js, 0), t);
	}

	// delete
	assert_text_eq(get_map(S("\x7F"), 0), S("\x7F"));
	assert_text_eq(get_map(JS("\\u007F"), 0), S("\x7F"));
}
END_TEST


/* C1 control characters (except U+0085 NEL, which is whitespace)
 * survive the mapping unchanged. */
START_TEST(test_keep_control_utf8)
{
	const struct utf8lite_text *t, *js;
	uint8_t str[256];
	uint8_t i;

	// C1
	for (i = 0x80; i < 0xA0; i++) {
		if (i == 0x85) {
			continue;
		}

		// 0xC2 is the UTF-8 lead byte for U+0080..U+00BF
		str[0] = 0xC2; str[1] = i; str[2] = '\0';
		t = S((char *)str);
		assert_text_eq(get_map(t, 0), t);

		sprintf((char *)str, "\\u%04X", i);
		js = JS((char *)str);
		assert_text_eq(get_map(js, 0), t);
	}
}
END_TEST


/* ASCII whitespace passes through the identity mapping unchanged. */
START_TEST(test_keep_ws_ascii)
{
	assert_text_eq(get_map(S("\t"), 0), S("\t"));
	assert_text_eq(get_map(S("\n"), 0), S("\n"));
	assert_text_eq(get_map(S("\v"), 0), S("\v"));
	assert_text_eq(get_map(S("\f"), 0), S("\f"));
	assert_text_eq(get_map(S("\r"), 0), S("\r"));
	assert_text_eq(get_map(S(" "), 0), S(" "));
}
END_TEST


/* Under COMPAT decomposition, every Unicode whitespace code point maps
 * to its expected compatibility form (most become a plain space; a few
 * line/paragraph separators and NEL map to themselves). */
START_TEST(test_keep_ws_utf8)
{
	const struct utf8lite_text *t, *js, *text;
	uint8_t str[256];
	uint8_t *buf;
	unsigned ws[] = { 0x0009, 0x000A, 0x000B, 0x000C, 0x000D,
			  0x0020, 0x0085, 0x00A0, 0x1680, 0x2000,
			  0x2001, 0x2002, 0x2003, 0x2004, 0x2005,
			  0x2006, 0x2007, 0x2008, 0x2009, 0x200A,
			  0x2028, 0x2029, 0x202F, 0x205F, 0x3000 };
	int i, n = sizeof(ws) / sizeof(ws[0]);

	for (i = 0; i < n; i++) {
		//fprintf(stderr, "i = %d; ws = U+%04X\n", i, ws[i]);

		// pick the expected compatibility mapping
		switch (ws[i]) {
		case 0x0009:
			text = S("\t");
			break;
		case 0x000A:
			text = S("\n");
			break;
		case 0x000B:
			text = S("\v");
			break;
		case 0x000C:
			text = S("\f");
			break;
		case 0x000D:
			text = S("\r");
			break;
		case 0x0085: // NEXT LINE (NEL)
			text = S("\xC2\x85");
			break;
		case 0x1680: // OGHAM SPACE MARK
			text = S("\xE1\x9A\x80");
			break;
		case 0x2028: // LINE SEPARATOR
			text = S("\xE2\x80\xA8");
			break;
		case 0x2029: // PARAGRAPH SEPARATOR
			text = S("\xE2\x80\xA9");
			break;
		default:
			text = S(" ");
			break;
		}

		buf = str;
		utf8lite_encode_utf8(ws[i], &buf);
		*buf = '\0';
		t = S((char *)str);
		assert_text_eq(get_map(t, TEXTMAP_COMPAT), text);

		sprintf((char *)str, "\\u%04x", ws[i]);
		js = JS((char *)str);
		assert_text_eq(get_map(js, TEXTMAP_COMPAT), text);
	}
}
END_TEST


// removed the following features from textmap, no need to test
#if 0

/*
 * Control Characters (Cc)
 * -----------------------
 *
 * U+0000..U+001F (C0)
 * U+007F (delete)
 * U+0080..U+009F (C1)
 *
 * Source: UnicodeStandard-8.0, Sec. 23.1, p. 808.
 */

// NOTE: the TYPE_RMCC / TYPE_RMWS flags tested below no longer exist;
// this whole region is compiled out by the enclosing #if 0.
START_TEST(test_rm_control_ascii)
{
	char str[256];
	uint8_t i;

	assert_text_eq(get_map(S("\a"), TYPE_RMCC), S(""));
	assert_text_eq(get_map(S("\b"), TYPE_RMCC), S(""));
	assert_text_eq(get_map(S("\t"), TYPE_RMCC), S("\t"));
	assert_text_eq(get_map(S("\n"), TYPE_RMCC), S("\n"));
	assert_text_eq(get_map(S("\v"), TYPE_RMCC), S("\v"));
	assert_text_eq(get_map(S("\f"), TYPE_RMCC), S("\f"));
	assert_text_eq(get_map(S("\r"), TYPE_RMCC), S("\r"));

	// C0
	for (i = 1; i < 0x20; i++) {
		if (0x09 <= i && i <= 0x0D) {
			continue;
		}

		str[0] = (char)i; str[1] = '\0';
		assert_text_eq(get_map(S(str), TYPE_RMCC), S(""));

		sprintf(str, "\\u%04X", i);
		assert_text_eq(get_map(JS(str), TYPE_RMCC), S(""));
	}

	// delete
	assert_text_eq(get_map(S("\x7F"), TYPE_RMCC), S(""));
	assert_text_eq(get_map(JS("\\u007F"), TYPE_RMCC), S(""));
}
END_TEST


START_TEST(test_rm_control_utf8)
{
	uint8_t str[256];
	uint8_t i;

	// C1: JSON
	for (i = 0x80; i < 0xA0; i++) {
		if (i == 0x85) {
			continue;
		}

		str[0] = 0xC2; str[1] = i; str[2] = '\0';
		assert_text_eq(get_map(S((char *)str), TYPE_RMCC), S(""));

		sprintf((char *)str, "\\u%04X", i);
		assert_text_eq(get_map(JS((char *)str), TYPE_RMCC), S(""));
	}
}
END_TEST


START_TEST(test_rm_ws_ascii)
{
	assert_text_eq(get_map(S("\t"), TYPE_RMWS), S(""));
	assert_text_eq(get_map(S("\n"), TYPE_RMWS), S(""));
	assert_text_eq(get_map(S("\v"), TYPE_RMWS), S(""));
	assert_text_eq(get_map(S("\f"), TYPE_RMWS), S(""));
	assert_text_eq(get_map(S("\r"), TYPE_RMWS), S(""));
	assert_text_eq(get_map(S(" "), TYPE_RMWS), S(""));
}
END_TEST


START_TEST(test_rm_ws_utf8)
{
	const struct utf8lite_text *t, *js;
	uint8_t str[256];
	uint8_t *buf;
	uint32_t ws[] = { 0x0009, 0x000A, 0x000B, 0x000C, 0x000D,
			  0x0020, 0x0085, 0x00A0, 0x1680, 0x2000,
			  0x2001, 0x2002, 0x2003, 0x2004, 0x2005,
			  0x2006, 0x2007, 0x2008, 0x2009, 0x200A,
			  0x2028, 0x2029, 0x202F, 0x205F, 0x3000 };
	int i, n = sizeof(ws) / sizeof(ws[0]);

	for (i = 0; i < n; i++) {
		buf = str;
		utf8lite_encode_utf8(ws[i], &buf);
		*buf = '\0';
		t = S((char *)str);
		assert_text_eq(get_map(t, TYPE_RMWS), S(""));

		sprintf((char *)str, "\\u%04x", ws[i]);
		js = JS((char *)str);
		assert_text_eq(get_map(js, TYPE_RMWS), S(""));
	}
}
END_TEST

#endif

/*
 * Case folding of the full ASCII range: letters fold to lowercase,
 * everything else maps to itself.
 */
START_TEST(test_casefold_ascii)
{
	const struct utf8lite_text *text;
	uint8_t buf[2] = { 0, 0 };
	uint8_t i;

	assert_text_eq(casefold(S("UPPER CASE")), S("upper case"));
	assert_text_eq(casefold(S("lower case")), S("lower case"));
	assert_text_eq(casefold(S("mIxEd CaSe")), S("mixed case"));

	// every non-letter ASCII character folds to itself
	for (i = 0x01; i < 'A'; i++) {
		buf[0] = i;
		text = S((char *)buf);
		assert_text_eq(casefold(text), text);
	}
	for (i = 'Z' + 1; i < 0x7F; i++) {
		buf[0] = i;
		text = S((char *)buf);
		assert_text_eq(casefold(text), text);
	}

	// upper
	assert_text_eq(casefold(S("ABCDEFGHIJKLMNOPQRSTUVWXYZ")),
		       S("abcdefghijklmnopqrstuvwxyz"));

	// lower
	assert_text_eq(casefold(S("abcdefghijklmnopqrstuvwxyz")),
		       S("abcdefghijklmnopqrstuvwxyz"));

	// digit
	assert_text_eq(casefold(S("0123456789")), S("0123456789"));

	// punct
	assert_text_eq(casefold(S("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~")),
		       S("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"));

	// space
	assert_text_eq(casefold(S("\t\n\v\f\r ")), S("\t\n\v\f\r "));
}
END_TEST


/*
 * Both eszett forms case fold to "ss" under the full case folding.
 */
START_TEST(test_casefold_utf8)
{
	assert_text_eq(casefold(JS("\u1e9e")), JS("ss")); // capital eszett
	assert_text_eq(casefold(JS("\u00df")), JS("ss")); // lowercase eszett
}
END_TEST


// removed this feature
#if 0

START_TEST(test_fold_dash)
{
	assert_text_eq(get_map(S("-"), TYPE_DASHFOLD), S("-"));
	assert_text_eq(get_map(JS("\\u058A"), TYPE_DASHFOLD), S("-"));
	assert_text_eq(get_map(JS("\\u2212"), TYPE_DASHFOLD), S("-"));
	assert_text_eq(get_map(JS("\\u2E3A"), TYPE_DASHFOLD), S("-"));
	assert_text_eq(get_map(JS("\\u2E3B"), TYPE_DASHFOLD), S("-"));
	assert_text_eq(get_map(JS("\\uFF0D"), TYPE_DASHFOLD), S("-"));
}
END_TEST


START_TEST(test_nofold_dash)
{
	assert_text_eq(get_map(S("-"), 0), S("-"));
	assert_text_eq(get_map(JS("\\u058A"), 0), S("\xD6\x8A"));
	assert_text_eq(get_map(JS("\\u2212"), 0), S("\xE2\x88\x92"));
	assert_text_eq(get_map(JS("\\u2E3A"), 0), S("\xE2\xB8\xBA"));
	assert_text_eq(get_map(JS("\\u2E3B"), 0), S("\xE2\xB8\xBB"));
	assert_text_eq(get_map(JS("\\uFF0D"), 0), S("\xEF\xBC\x8D"));
}
END_TEST

#endif


/*
 * With TEXTMAP_QUOTE, curly single quotes (U+2018, U+2019) map to the
 * ASCII apostrophe; ASCII quotes are left alone.
 */
START_TEST(test_map_quote)
{
	assert_text_eq(get_map(S("'"), TEXTMAP_QUOTE), S("'"));
	assert_text_eq(get_map(S("\""), TEXTMAP_QUOTE), S("\""));
	assert_text_eq(get_map(JS("\\u2018"), TEXTMAP_QUOTE), S("'"));
	assert_text_eq(get_map(JS("\\u2019"), TEXTMAP_QUOTE), S("'"));
	//assert_text_eq(get_map(JS("\\u201C"), TEXTMAP_QUOTE), S("\""));
	//assert_text_eq(get_map(JS("\\u201D"), TEXTMAP_QUOTE), S("\""));
}
END_TEST


/* Without the flag, the curly quotes keep their UTF-8 encodings. */
START_TEST(test_nomap_quote)
{
	assert_text_eq(get_map(S("'"), 0), S("'"));
assert_text_eq(get_map(S("\""), 0), S("\"")); 420 | assert_text_eq(get_map(JS("\\u2018"), 0), S("\xE2\x80\x98")); 421 | assert_text_eq(get_map(JS("\\u2019"), 0), S("\xE2\x80\x99")); 422 | assert_text_eq(get_map(JS("\\u201A"), 0), S("\xE2\x80\x9A")); 423 | assert_text_eq(get_map(JS("\\u201F"), 0), S("\xE2\x80\x9F")); 424 | } 425 | END_TEST 426 | 427 | 428 | Suite *textmap_suite(void) 429 | { 430 | Suite *s; 431 | TCase *tc; 432 | 433 | s = suite_create("textmap"); 434 | tc = tcase_create("normalize"); 435 | tcase_add_checked_fixture(tc, setup, teardown); 436 | tcase_add_test(tc, test_map_basic); 437 | tcase_add_test(tc, test_map_esc); 438 | // tcase_add_test(tc, test_rm_control_ascii); 439 | tcase_add_test(tc, test_keep_control_ascii); 440 | // tcase_add_test(tc, test_rm_control_utf8); 441 | tcase_add_test(tc, test_keep_control_utf8); 442 | // tcase_add_test(tc, test_rm_ws_ascii); 443 | tcase_add_test(tc, test_keep_ws_ascii); 444 | // tcase_add_test(tc, test_rm_ws_utf8); 445 | tcase_add_test(tc, test_keep_ws_utf8); 446 | tcase_add_test(tc, test_casefold_ascii); 447 | tcase_add_test(tc, test_casefold_utf8); 448 | // tcase_add_test(tc, test_fold_dash); 449 | // tcase_add_test(tc, test_nofold_dash); 450 | tcase_add_test(tc, test_map_quote); 451 | tcase_add_test(tc, test_nomap_quote); 452 | suite_add_tcase(s, tc); 453 | 454 | return s; 455 | } 456 | 457 | 458 | int main(void) 459 | { 460 | int number_failed; 461 | Suite *s; 462 | SRunner *sr; 463 | 464 | s = textmap_suite(); 465 | sr = srunner_create(s); 466 | 467 | srunner_run_all(sr, CK_NORMAL); 468 | number_failed = srunner_ntests_failed(sr); 469 | srunner_free(sr); 470 | 471 | return (number_failed == 0) ? EXIT_SUCCESS : EXIT_FAILURE; 472 | } 473 | -------------------------------------------------------------------------------- /tests/check_wordscan.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include "../src/utf8lite.h" 21 | #include "testutil.h" 22 | 23 | #define WORD_BREAK_TEST "data/ucd/auxiliary/WordBreakTest.txt" 24 | struct utf8lite_wordscan scan; 25 | 26 | 27 | void setup_scan(void) 28 | { 29 | setup(); 30 | } 31 | 32 | 33 | void teardown_scan(void) 34 | { 35 | teardown(); 36 | } 37 | 38 | 39 | void start(const struct utf8lite_text *text) 40 | { 41 | utf8lite_wordscan_make(&scan, text); 42 | } 43 | 44 | 45 | const struct utf8lite_text *next(void) 46 | { 47 | struct utf8lite_text *word; 48 | if (!utf8lite_wordscan_advance(&scan)) { 49 | return NULL; 50 | } 51 | word = alloc(sizeof(*word)); 52 | *word = scan.current; 53 | return word; 54 | } 55 | 56 | 57 | START_TEST(test_figure1) 58 | { 59 | // Test Figure 1 from http://www.unicode.org/reports/tr29/ 60 | start(S("The quick (\"brown\") fox can't jump 32.3 feet, right?")); 61 | assert_text_eq(next(), S("The")); 62 | assert_text_eq(next(), S(" ")); 63 | assert_text_eq(next(), S("quick")); 64 | assert_text_eq(next(), S(" ")); 65 | assert_text_eq(next(), S("(")); 66 | assert_text_eq(next(), S("\"")); 67 | assert_text_eq(next(), S("brown")); 68 | assert_text_eq(next(), S("\"")); 69 | assert_text_eq(next(), S(")")); 70 | assert_text_eq(next(), S(" ")); 71 | assert_text_eq(next(), S("fox")); 72 | assert_text_eq(next(), S(" ")); 73 | assert_text_eq(next(), 
		       S("can't"));
	assert_text_eq(next(), S(" "));
	assert_text_eq(next(), S("jump"));
	assert_text_eq(next(), S(" "));
	assert_text_eq(next(), S("32.3"));
	assert_text_eq(next(), S(" "));
	assert_text_eq(next(), S("feet"));
	assert_text_eq(next(), S(","));
	assert_text_eq(next(), S(" "));
	assert_text_eq(next(), S("right"));
	assert_text_eq(next(), S("?"));
	ck_assert(next() == NULL);
}
END_TEST

/* Quotation marks break into their own single-character words. */
START_TEST(test_quote)
{
	start(S("both 'single' and \"double\"."));
	assert_text_eq(next(), S("both"));
	assert_text_eq(next(), S(" "));
	assert_text_eq(next(), S("'"));
	assert_text_eq(next(), S("single"));
	assert_text_eq(next(), S("'"));
	assert_text_eq(next(), S(" "));
	assert_text_eq(next(), S("and"));
	assert_text_eq(next(), S(" "));
	assert_text_eq(next(), S("\""));
	assert_text_eq(next(), S("double"));
	assert_text_eq(next(), S("\""));
	assert_text_eq(next(), S("."));
	ck_assert(next() == NULL);
}
END_TEST


/*
 * Underscore and U+202F join with adjacent word characters rather
 * than breaking (per this test's name, they act as ExtendNumLet).
 */
START_TEST(test_extendnumlet)
{
	start(S("_"));
	assert_text_eq(next(), S("_"));

	start(S("__"));
	assert_text_eq(next(), S("__"));

	start(S("___"));
	assert_text_eq(next(), S("___"));

	start(JS("\\u202f"));
	assert_text_eq(next(), JS("\\u202f"));

	start(JS("\\u202f\\u202f"));
	assert_text_eq(next(), JS("\\u202f\\u202f"));

	start(JS("\\u202f_"));
	assert_text_eq(next(), JS("\\u202f_"));

	start(S("_1"));
	assert_text_eq(next(), S("_1"));

	start(S("__1"));
	assert_text_eq(next(), S("__1"));

	start(S("_A"));
	assert_text_eq(next(), S("_A"));

	start(S("__A"));
	assert_text_eq(next(), S("__A"));
}
END_TEST


// Unicode Word Break Test
// http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakTest.txt
struct
unitest { 146 | char comment[1024]; 147 | unsigned line; 148 | int is_ascii; 149 | 150 | struct utf8lite_text text; 151 | uint8_t buf[1024]; 152 | 153 | int32_t code[256]; 154 | int can_break_before[256]; 155 | uint8_t *code_end[256]; 156 | unsigned ncode; 157 | 158 | uint8_t *break_begin[256]; 159 | uint8_t *break_end[256]; 160 | unsigned nbreak; 161 | 162 | }; 163 | 164 | struct unitest unitests[4096]; 165 | unsigned nunitest; 166 | 167 | void write_unitest(FILE *stream, const struct unitest *test) 168 | { 169 | unsigned i, n = test->ncode; 170 | 171 | for (i = 0; i < n; i++) { 172 | fprintf(stream, "%s %04X ", 173 | (test->can_break_before[i]) ? "\xC3\xB7" : "\xC3\x97", 174 | test->code[i]); 175 | } 176 | fprintf(stream, "\xC3\xB7 %s\n", test->comment); 177 | } 178 | 179 | void setup_unicode(void) 180 | { 181 | struct unitest *test; 182 | FILE *file; 183 | unsigned code, line, nbreak, ncode; 184 | uint8_t *dst; 185 | char *comment; 186 | int ch, is_ascii; 187 | 188 | setup_scan(); 189 | file = fopen(WORD_BREAK_TEST, "r"); 190 | if (!file) { 191 | file = fopen("../"WORD_BREAK_TEST, "r"); 192 | } 193 | 194 | nunitest = 0; 195 | test = &unitests[0]; 196 | 197 | line = 1; 198 | ncode = 0; 199 | nbreak = 0; 200 | is_ascii = 1; 201 | test->text.ptr = &test->buf[0]; 202 | dst = test->text.ptr; 203 | 204 | ck_assert_msg(file != NULL, "file '"WORD_BREAK_TEST"' not found"); 205 | while ((ch = fgetc(file)) != EOF) { 206 | switch (ch) { 207 | case '#': 208 | comment = &test->comment[0]; 209 | do { 210 | *comment++ = (char)ch; 211 | ch = fgetc(file); 212 | } while (ch != EOF && ch != '\n'); 213 | *comment = '\0'; 214 | 215 | if (ch == EOF) { 216 | goto eof; 217 | } 218 | /* fallthrough */ 219 | case '\n': 220 | *dst = '\0'; 221 | 222 | test->line = line; 223 | test->is_ascii = is_ascii; 224 | test->text.attr = (size_t)(dst - test->text.ptr); 225 | 226 | if (ncode > 0) { 227 | test->ncode = ncode; 228 | test->nbreak = nbreak; 229 | ncode = 0; 230 | nbreak = 0; 231 | is_ascii 
= 1; 232 | nunitest++; 233 | test = &unitests[nunitest]; 234 | comment = &test->comment[0]; 235 | test->text.ptr = &test->buf[0]; 236 | test->comment[0] = '\0'; 237 | dst = test->text.ptr; 238 | } 239 | line++; 240 | break; 241 | 242 | case 0xC3: 243 | ch = fgetc(file); 244 | if (ch == EOF) { 245 | goto eof; 246 | } else if (ch == 0x97) { 247 | // MULTIPLICATON SIGN (U+00D7) 0xC3 0x97 248 | test->can_break_before[ncode] = 0; 249 | } else if (ch == 0xB7) { 250 | // DIVISION SIGN (U+00F7) 0xC3 0xB7 251 | test->can_break_before[ncode] = 1; 252 | } else { 253 | goto inval; 254 | } 255 | 256 | if (test->can_break_before[ncode]) { 257 | test->break_begin[nbreak] = dst; 258 | if (nbreak > 0) { 259 | test->break_end[nbreak - 1] = dst; 260 | } 261 | nbreak++; 262 | } 263 | 264 | if (fscanf(file, "%x", &code)) { 265 | test->code[ncode] = (int32_t)code; 266 | if (code > 0x7F) { 267 | is_ascii = 0; 268 | } 269 | utf8lite_encode_utf8((int32_t)code, &dst); 270 | test->code_end[ncode] = dst; 271 | ncode++; 272 | } else { 273 | test->break_end[nbreak - 1] = dst; 274 | nbreak--; 275 | } 276 | break; 277 | } 278 | 279 | } 280 | eof: 281 | return; 282 | inval: 283 | fprintf(stderr, "invalid character on line %d\n", line); 284 | 285 | fclose(file); 286 | } 287 | 288 | void teardown_unicode(void) 289 | { 290 | teardown_scan(); 291 | } 292 | 293 | START_TEST(test_unicode) 294 | { 295 | struct unitest *test; 296 | unsigned i, j; 297 | 298 | for (i = 0; i < nunitest; i++) { 299 | test = &unitests[i]; 300 | 301 | //write_unitest(stderr, test); 302 | utf8lite_wordscan_make(&scan, &test->text); 303 | 304 | for (j = 0; j < test->nbreak; j++) { 305 | //fprintf(stderr, "Break %u\n", j); 306 | ck_assert(utf8lite_wordscan_advance(&scan)); 307 | ck_assert(scan.current.ptr == test->break_begin[j]); 308 | ck_assert(scan.current.ptr 309 | + UTF8LITE_TEXT_SIZE(&scan.current) 310 | == test->break_end[j]); 311 | } 312 | ck_assert(!utf8lite_wordscan_advance(&scan)); 313 | } 314 | } 315 | END_TEST 316 | 
317 | Suite *wordscan_suite(void) 318 | { 319 | Suite *s; 320 | TCase *tc; 321 | 322 | s = suite_create("wordscan"); 323 | tc = tcase_create("core"); 324 | tcase_add_checked_fixture(tc, setup_scan, teardown_scan); 325 | tcase_add_test(tc, test_figure1); 326 | tcase_add_test(tc, test_quote); 327 | tcase_add_test(tc, test_extendnumlet); 328 | suite_add_tcase(s, tc); 329 | 330 | tc = tcase_create("Unicode WordBreakTest.txt"); 331 | tcase_add_checked_fixture(tc, setup_unicode, teardown_unicode); 332 | tcase_add_test(tc, test_unicode); 333 | suite_add_tcase(s, tc); 334 | 335 | return s; 336 | } 337 | 338 | 339 | int main(void) 340 | { 341 | int number_failed; 342 | Suite *s; 343 | SRunner *sr; 344 | 345 | s = wordscan_suite(); 346 | sr = srunner_create(s); 347 | 348 | srunner_run_all(sr, CK_NORMAL); 349 | number_failed = srunner_ntests_failed(sr); 350 | srunner_free(sr); 351 | 352 | return (number_failed == 0) ? EXIT_SUCCESS : EXIT_FAILURE; 353 | } 354 | -------------------------------------------------------------------------------- /tests/testutil.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include "../src/utf8lite.h" 22 | #include "testutil.h" 23 | 24 | static struct utf8lite_text *mktext(const char *str, int flags); 25 | 26 | 27 | static void **allocs; 28 | static int nalloc; 29 | 30 | 31 | void setup(void) 32 | { 33 | allocs = NULL; 34 | nalloc = 0; 35 | } 36 | 37 | 38 | void teardown(void) 39 | { 40 | while (nalloc-- > 0) { 41 | free(allocs[nalloc]); 42 | } 43 | free(allocs); 44 | } 45 | 46 | 47 | void *alloc(size_t size) 48 | { 49 | void *ptr; 50 | 51 | allocs = realloc(allocs, (size_t)(nalloc + 1) * sizeof(*allocs)); 52 | ck_assert(allocs != NULL); 53 | 54 | ptr = malloc(size); 55 | ck_assert(ptr != NULL || size == 0); 56 | 57 | allocs[nalloc] = ptr; 58 | nalloc++; 59 | 60 | return ptr; 61 | } 62 | 63 | 64 | struct utf8lite_text *JS(const char *str) 65 | { 66 | return mktext(str, UTF8LITE_TEXT_UNESCAPE); 67 | } 68 | 69 | 70 | struct utf8lite_text *S(const char *str) 71 | { 72 | return mktext(str, 0); 73 | } 74 | 75 | 76 | struct utf8lite_text *mktext(const char *str, int flags) 77 | { 78 | struct utf8lite_text *text = alloc(sizeof(*text)); 79 | struct utf8lite_text text2; 80 | size_t size = strlen(str); 81 | uint8_t *ptr = alloc(size + 1); 82 | int err; 83 | 84 | memcpy(ptr, str, size + 1); 85 | err = utf8lite_text_assign(text, ptr, size, flags, NULL); 86 | ck_assert(!err); 87 | 88 | ck_assert(!utf8lite_text_assign(&text2, ptr, size, 89 | flags | UTF8LITE_TEXT_VALID, NULL)); 90 | ck_assert(text->ptr == text2.ptr); 91 | ck_assert_uint_eq(text->attr, text2.attr); 92 | 93 | return text; 94 | } 95 | -------------------------------------------------------------------------------- /tests/testutil.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TESTUTIL_H
#define TESTUTIL_H

/* NOTE(review): the angle-bracket header names below were lost in
 * extraction — confirm against the upstream sources.  The macros in
 * this header use size_t, intmax_t/uintmax_t, strcmp, and
 * ck_assert_msg, so <stddef.h>/<stdint.h>/<string.h>/<check.h> are
 * the likely candidates. */
#include
#include
#include

struct utf8lite_text;

/**
 * This macro is broken on the old version of check (0.9.8) that Travis CI
 * uses, so we re-define it.
 */
#ifdef ck_assert_int_eq
# undef ck_assert_int_eq
#endif
#define ck_assert_int_eq(X, Y) do { \
	intmax_t _ck_x = (X); \
	intmax_t _ck_y = (Y); \
	ck_assert_msg(_ck_x == _ck_y, \
		"Assertion '%s' failed: %s == %jd, %s == %jd", \
		#X " == " #Y, #X, _ck_x, #Y, _ck_y); \
} while (0)


/**
 * This macro doesn't exist on check version 0.9.8.
44 | */ 45 | #ifdef ck_assert_uint_eq 46 | # undef ck_assert_uint_eq 47 | #endif 48 | #define ck_assert_uint_eq(X, Y) do { \ 49 | uintmax_t _ck_x = (X); \ 50 | uintmax_t _ck_y = (Y); \ 51 | ck_assert_msg(_ck_x == _ck_y, \ 52 | "Assertion '%s' failed: %s == %ju, %s == %ju", \ 53 | #X " == " #Y, #X, _ck_x, #Y, _ck_y); \ 54 | } while (0) 55 | 56 | 57 | /** 58 | * Broken on check (0.9.8) 59 | */ 60 | #ifdef ck_assert_str_eq 61 | # undef ck_assert_str_eq 62 | #endif 63 | #define ck_assert_str_eq(X, Y) do { \ 64 | const char* _ck_x = (X); \ 65 | const char* _ck_y = (Y); \ 66 | const char* _ck_x_s; \ 67 | const char* _ck_y_s; \ 68 | const char* _ck_x_q; \ 69 | const char* _ck_y_q; \ 70 | if (_ck_x != NULL) { \ 71 | _ck_x_q = "\""; \ 72 | _ck_x_s = _ck_x; \ 73 | } else { \ 74 | _ck_x_q = ""; \ 75 | _ck_x_s = "(null)"; \ 76 | } \ 77 | if (_ck_y != NULL) { \ 78 | _ck_y_q = "\""; \ 79 | _ck_y_s = _ck_y; \ 80 | } else { \ 81 | _ck_y_q = ""; \ 82 | _ck_y_s = "(null)"; \ 83 | } \ 84 | ck_assert_msg( \ 85 | ((_ck_x != NULL) && (_ck_y != NULL) \ 86 | && (0 == strcmp(_ck_y, _ck_x))), \ 87 | "Assertion '%s' failed: %s == %s%s%s, %s == %s%s%s", \ 88 | #X" == "#Y, \ 89 | #X, _ck_x_q, _ck_x_s, _ck_x_q, \ 90 | #Y, _ck_y_q, _ck_y_s, _ck_y_q); \ 91 | } while (0) 92 | 93 | 94 | 95 | #define assert_text_eq(X, Y) do { \ 96 | const struct utf8lite_text * _ck_x = (X); \ 97 | const struct utf8lite_text * _ck_y = (Y); \ 98 | ck_assert_msg(utf8lite_text_equals(_ck_y, _ck_x), \ 99 | "Assertion '%s == %s' failed: %s == \"%.*s\" (0x%zx)," \ 100 | " %s==\"%.*s\" (0x%zx)", \ 101 | #X, #Y, \ 102 | #X, (int)UTF8LITE_TEXT_SIZE(_ck_x), _ck_x->ptr, _ck_x->attr, \ 103 | #Y, (int)UTF8LITE_TEXT_SIZE(_ck_y), _ck_y->ptr, _ck_y->attr); \ 104 | } while (0) 105 | 106 | 107 | #define assert_text_ne(X, Y) do { \ 108 | const struct utf8lite_text * _ck_x = (X); \ 109 | const struct utf8lite_text * _ck_y = (Y); \ 110 | ck_assert_msg(!utf8lite_text_equals(_ck_y, _ck_x), \ 111 | "Assertion '%s != %s' failed: %s == \"%s\" 
(0x%zx)," \ 112 | " %s==\"%s\" (0x%zx)", \ 113 | #X, #Y, \ 114 | #X, (int)UTF8LITE_TEXT_SIZE(_ck_x), _ck_x->ptr, _ck_x->attr, \ 115 | #Y, (int)UTF8LITE_TEXT_SIZE(_ck_y), _ck_y->ptr, _ck_y->attr); \ 116 | } while (0) 117 | 118 | 119 | /** 120 | * Common test framework set up. 121 | */ 122 | void setup(void); 123 | 124 | /** 125 | * Common test framework tear down. 126 | */ 127 | void teardown(void); 128 | 129 | /** 130 | * Allocate memory. 131 | */ 132 | void *alloc(size_t size); 133 | 134 | /** 135 | * Allocate a text object, interpreting JSON-style escape codes. 136 | */ 137 | struct utf8lite_text *JS(const char *str); 138 | 139 | /** 140 | * Cast a raw string as a text object, ignoring escape codes. 141 | */ 142 | struct utf8lite_text *S(const char *str); 143 | 144 | #endif /* TESTUTIL_H */ 145 | -------------------------------------------------------------------------------- /util/compute-typelen.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import math 4 | import re 5 | 6 | CASE_FOLDING = 'data/ucd/CaseFolding.txt' 7 | UNICODE_MAX = 0x10FFFF 8 | 9 | # Parse CaseFolding.txt 10 | 11 | try: 12 | file = open(CASE_FOLDING, 'r') 13 | except FileNotFoundError: 14 | file = open('../' + CASE_FOLDING, 'r') 15 | 16 | def utf8_len(code): 17 | if code <= 0x7f: 18 | return 1 19 | elif code <= 0x07FF: 20 | return 2 21 | elif code <= 0xFFFF: 22 | return 3 23 | else: 24 | return 4 25 | 26 | with file: 27 | for line in file: 28 | if line[0] != '#' and line[0] != '\n': 29 | fields = line.split(';') 30 | code = int(fields[0], 16) 31 | status = fields[1].strip(); 32 | 33 | if status == 'C' or status == 'F': 34 | l0 = utf8_len(code) 35 | 36 | mapping = [int(x,16) for x in fields[2].split()] 37 | l1 = sum([utf8_len(m) for m in mapping]) 38 | 39 | ratio = l1 / l0 40 | if ratio >= 3: 41 | print('U+{:04X}'.format(code), mapping, 'ratio: ', ratio) 42 | 43 | 
# ------------------------------------------------------------------------
# /util/gen-casefold.py
# ------------------------------------------------------------------------
#!/usr/bin/env python3

# Copyright 2017 Patrick O. Perry.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Generate the casefold.h two-stage case-folding lookup table.

Refactored so the parsing and table construction live in pure
functions behind an ``if __name__ == '__main__'`` guard: the original
opened the data file and did all of its work at module level, which
made the script impossible to import (or test) without the Unicode
data files present.  The generated output on stdout is unchanged.
"""

import math
import re

CASE_FOLDING = 'data/ucd/CaseFolding.txt'
UNICODE_MAX = 0x10FFFF


def parse_case_folding(path=CASE_FOLDING):
    """Parse CaseFolding.txt into ``(casefold, casefold_map)``.

    ``casefold[code]`` is ``None`` when the code point has no fold,
    ``(1, mapped)`` for a single-code-point fold, or
    ``(n, offset)`` with ``n > 1`` pointing at ``n`` entries in
    ``casefold_map`` for a multi-code-point fold.
    """
    try:
        file = open(path, 'r')
    except FileNotFoundError:
        # allow running from either the project root or util/
        file = open('../' + path, 'r')

    casefold = []
    casefold_map = []

    with file:
        for line in file:
            if line[0] != '#' and line[0] != '\n':
                fields = line.split(';')
                code = int(fields[0], 16)
                status = fields[1].strip()

                # C (common) + F (full) make up the full case folding;
                # S (simple) and T (Turkic) entries are excluded
                if status == 'C' or status == 'F':
                    while code > len(casefold):
                        casefold.append(None)
                    assert code == len(casefold)

                    mapping = [int(x, 16) for x in fields[2].split()]
                    n = len(mapping)
                    if n > 1:
                        casefold.append((n, len(casefold_map)))
                        casefold_map.extend(mapping)
                    else:
                        casefold.append((1, mapping[0]))

    while len(casefold) <= UNICODE_MAX:
        casefold.append(None)

    return casefold, casefold_map


def compute_tables(casefold, block_size):
    """Split `casefold` into a two-stage table with the given block size.

    Returns ``(stage1, stage2)`` where ``stage1[i]`` indexes the
    deduplicated block ``stage2[stage1[i]]`` covering code points
    ``i*block_size .. (i+1)*block_size - 1``.
    """
    nblock = (UNICODE_MAX + 1) // block_size
    stage1 = [None] * nblock
    stage2 = []
    stage2_dict = {}
    for i in range(nblock):
        begin = i * block_size
        end = begin + block_size
        block = tuple(casefold[begin:end])
        if block in stage2_dict:
            j = stage2_dict[block]
        else:
            j = len(stage2)
            stage2_dict[block] = j
            stage2.append(block)
        stage1[i] = j
    return (stage1, stage2)


def stage1_item_size(nstage2):
    """Size in bytes (1, 2, 4, or 8) of a stage-1 entry wide enough to
    index `nstage2` stage-2 blocks."""
    nbyte = max(1, math.ceil(math.log(nstage2, 2) / 8))
    size = 2**math.ceil(math.log(nbyte, 2))
    return size


def choose_block_size(casefold, page_size=4096):
    """Return the power-of-two block size (2..65536) minimizing the
    total page-rounded size of both stages."""
    best_block_size = 1
    smallest_size = UNICODE_MAX + 1

    for i in range(1, 17):
        block_size = 2**i
        stage1, stage2 = compute_tables(casefold, block_size)

        nbyte1 = len(stage1) * stage1_item_size(len(stage2))
        nbyte2 = len(stage2) * block_size

        # round each table up to a whole number of pages
        nbyte1 = math.ceil(nbyte1 / page_size) * page_size
        nbyte2 = math.ceil(nbyte2 / page_size) * page_size
        nbyte = nbyte1 + nbyte2

        if nbyte < smallest_size:
            smallest_size = nbyte
            best_block_size = block_size

    return best_block_size


def main():
    """Parse the data file and write casefold.h to stdout."""
    casefold, casefold_map = parse_case_folding()
    block_size = choose_block_size(casefold)
    stage1, stage2 = compute_tables(casefold, block_size)

    # pick the narrowest C integer type for the stage-1 entries
    type1_size = stage1_item_size(len(stage2))
    if type1_size == 1:
        type1 = 'uint8_t'
    elif type1_size == 2:
        type1 = 'uint16_t'
    elif type1_size == 4:
        type1 = 'uint32_t'
    else:
        type1 = 'uint64_t'

    # Write casefold.h to stdout

    print("/* This file is automatically generated. DO NOT EDIT!")
    print(" Instead, edit gen-casefold.py and re-run. */")
    print("")
    print("/*")
    print(" * Case folding properties.")
    print(" *")
    print(" * Defined in UAX #44 \"Unicode Character Database\"")
    print(" *")
    print(" * http://www.unicode.org/reports/tr44/")
    print(" *")
    print(" * Section 5.6, Case and Case Mapping")
    print(" *")
    print(" *")
    print(" * We use a two-stage lookup strategy as described at")
    print(" *")
    print(" * http://www.strchr.com/multi-stage_tables")
    print(" *")
    print(" */")
    print("")
    print("#ifndef UNICODE_CASEFOLD_H")
    print("#define UNICODE_CASEFOLD_H")
    print("")
    print("#include ")
    print("")
    print("/* casefold")
    print(" * --------")
    print(" * length: the length (in codepoints) of the case fold mapping,")
    print(" * or 0 if there is no case fold")
    print(" *")
    print(" * data: the mapped-to codepoint (length = 1), or")
    print(" * an index into the `casefold_mapping` array, pointing")
    print(" * to the first codepoint in the mapping (length > 1)")
    print(" */")
    print("struct casefold {")
    print("\tunsigned length : 8;")
    print("\tunsigned data : 24;")
    print("};")
    print("")
    print("#define CASEFOLD_BLOCK_SIZE", block_size)
    print("")
    print("static const " + type1 + " casefold_stage1[] = {")
    for i in range(len(stage1) - 1):
        if i % 16 == 0:
            print("/* U+{:04X} */".format(i * block_size), end="")
        print("{0: >3},".format(stage1[i]), end="")
        if i % 16 == 15:
            print("")
    print("{0: >3}".format(stage1[len(stage1) - 1]))
    print("};")
    print("")
    print("static const struct casefold casefold_stage2[][" +
          str(block_size) + "] = {")
    for i in range(0, len(stage2)):
        print(" /* block " + str(i) + " */")
        print(" {", end="")
        for j in range(block_size):
            val = stage2[i][j]
            if val is None:
                print("{0,0}", end="")
            else:
                print("{{{0},0x{1:05X}}}".format(val[0], val[1]), end="")

            #print("{0: >3}".format(prop_vals[stage2[i][j]]), end="")
            if j + 1 == block_size:
                print("\n }", end="")
            else:
                print(",", end="")
            if j % 5 == 4:
                print("\n ", end="")
        if i + 1 != len(stage2):
            print(",\n")
        else:
            print("")
    print("};")
    print("")
    print("static const int32_t casefold_mapping[] = {")
    for i in range(len(casefold_map) - 1):
        if i % 8 == 0:
            print("/* 0x{:04X} */ ".format(i), end="")
        print("0x{0:04X},".format(casefold_map[i]), end="")
        if i % 8 == 7:
            print("")
    print("0x{0:04X}".format(casefold_map[len(casefold_map) - 1]))
    print("};")
    print("")
    print("#endif /* UNICODE_CASEFOLD_H */")


if __name__ == '__main__':
    main()
# ------------------------------------------------------------------------
# /util/gen-charwidth.py  (continues in the next chunk)
# ------------------------------------------------------------------------
#!/usr/bin/env python3

# Copyright 2017 Patrick O. Perry.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
16 | 17 | import math 18 | 19 | try: 20 | import property 21 | import unicode_data 22 | except ModuleNotFoundError: 23 | from util import property 24 | from util import unicode_data 25 | 26 | 27 | DERIVED_CORE_PROPERTIES = "data/ucd/DerivedCoreProperties.txt" 28 | EAST_ASIAN_WIDTH = "data/ucd/EastAsianWidth.txt" 29 | EMOJI_DATA = "data/ucd/emoji/emoji-data.txt" 30 | 31 | 32 | east_asian_width = property.read(EAST_ASIAN_WIDTH) 33 | # A: Ambiguous (can be narrow or wide depending on context; treat as wide) 34 | # F: Fullwidth (always wide) 35 | # H: Halfwidth (always narrow) 36 | # Na: Narrow (always narrow) 37 | # N: Neutral (non-East Asian; treat as single) 38 | # W: Wide (always wide) 39 | 40 | 41 | emoji_props = property.read(EMOJI_DATA, sets=True) 42 | 43 | # https://www.unicode.org/reports/tr51/#def_basic_emoji_set 44 | emoji = ((emoji_props['Emoji'] - emoji_props['Emoji_Component']) 45 | & emoji_props['Emoji_Presentation']) 46 | 47 | # Treat ignorables as invisible 48 | derived_core_properties = property.read(DERIVED_CORE_PROPERTIES, sets=True) 49 | default_ignorable = derived_core_properties['Default_Ignorable_Code_Point'] 50 | 51 | 52 | # unassigned: not assigned, other, surrogate 53 | none_cats = set(['Cc', 'Cn', 'Co', 'Cs', 'Zl', 'Zp']) 54 | mark_cats = set(['Cf', 'Me', 'Mn']) 55 | none = set([0xFFF9, 0xFFFA, 0xFFFB]) # interlinear annotation markers 56 | mark = set() 57 | for code in range(len(unicode_data.uchars)): 58 | u = unicode_data.uchars[code] 59 | if code in none or code in mark: 60 | pass 61 | elif u is None or u.category in none_cats: 62 | none.add(code) 63 | elif u.category in mark_cats: 64 | mark.add(code) 65 | 66 | 67 | code_props = [None] * len(east_asian_width) 68 | for code in range(len(code_props)): 69 | eaw = east_asian_width[code] 70 | if code in default_ignorable: # default ingorable overrides 71 | code_props[code] = 'Ignorable' 72 | elif code in emoji: # emoji overrides 73 | code_props[code] = 'Emoji' 74 | elif code in mark: # mark 
overrides 75 | code_props[code] = 'Mark' 76 | elif code in none: # none overrides 77 | code_props[code] = 'None' 78 | elif eaw == 'F' or eaw == 'W': 79 | code_props[code] = 'Wide' 80 | elif eaw == 'H' or eaw == 'Na' or eaw == 'N': 81 | code_props[code] = 'Narrow' 82 | elif eaw == 'A': 83 | code_props[code] = 'Ambiguous' 84 | else: 85 | code_props[code] = 'Narrow' # default to narrow 86 | 87 | 88 | prop_names = [ 89 | 'None', 'Ignorable', 'Mark', 'Narrow', 'Ambiguous', 'Wide', 'Emoji' 90 | ] 91 | prop_vals = {} 92 | for p in prop_names: 93 | prop_vals[p] = len(prop_vals) 94 | 95 | 96 | def compute_tables(block_size): 97 | nblock = len(code_props) // block_size 98 | stage1 = [None] * nblock 99 | stage2 = [] 100 | stage2_dict = {} 101 | for i in range(nblock): 102 | begin = i * block_size 103 | end = begin + block_size 104 | block = tuple(code_props[begin:end]) 105 | if block in stage2_dict: 106 | j = stage2_dict[block] 107 | else: 108 | j = len(stage2) 109 | stage2_dict[block] = j 110 | stage2.append(block) 111 | stage1[i] = j 112 | return (stage1,stage2) 113 | 114 | 115 | def stage1_item_size(nstage2): 116 | nbyte = math.ceil(math.log(nstage2, 2) / 8) 117 | size = 2**math.ceil(math.log(nbyte, 2)) 118 | return size 119 | 120 | 121 | page_size = 4096 122 | block_size = 256 123 | 124 | nbytes = {} 125 | 126 | best_block_size = 1 127 | smallest_size = len(code_props) 128 | 129 | for i in range(1,17): 130 | block_size = 2**i 131 | stage1,stage2 = compute_tables(block_size) 132 | 133 | nbyte1 = len(stage1) * stage1_item_size(len(stage2)) 134 | nbyte2 = len(stage2) * block_size 135 | 136 | nbyte1 = math.ceil(nbyte1 / page_size) * page_size 137 | nbyte2 = math.ceil(nbyte2 / page_size) * page_size 138 | nbyte = nbyte1 + nbyte2 139 | nbytes[block_size] = nbyte 140 | 141 | if nbyte < smallest_size: 142 | smallest_size = nbyte 143 | best_block_size = block_size 144 | 145 | 146 | block_size = best_block_size 147 | stage1,stage2 = compute_tables(block_size) 148 | 149 | type1_size 
= stage1_item_size(len(stage2)) 150 | 151 | if type1_size == 1: 152 | type1 = 'uint8_t' 153 | elif type1_size == 2: 154 | type1 = 'uint16_t' 155 | elif type1_size == 4: 156 | type1 = 'uint32_t' 157 | else: 158 | type1 = 'uint64_t' 159 | 160 | type2 = 'int8_t' 161 | 162 | 163 | # Write chardwidth.h to stdout 164 | 165 | print("/* This file is automatically generated. DO NOT EDIT!") 166 | print(" Instead, edit gen-charwidth.py and re-run. */") 167 | print("") 168 | print("/*") 169 | print(" * Unicode East_Asian_Width property values.") 170 | print(" *") 171 | print(" * Defined in UAX #11 \"East Asian Width\"") 172 | print(" *") 173 | print(" * http://www.unicode.org/reports/tr11/") 174 | print(" *") 175 | print(" * We use the two-stage lookup strategy described at") 176 | print(" *") 177 | print(" * http://www.strchr.com/multi-stage_tables") 178 | print(" *") 179 | print(" */") 180 | print("") 181 | print("#ifndef CHARWIDTH_H") 182 | print("#define CHARWIDTH_H") 183 | print("") 184 | print("#include ") 185 | print("") 186 | print("enum charwidth_prop {") 187 | first = True 188 | for prop in prop_names: 189 | if not first: 190 | print(",\n", end="") 191 | else: 192 | first = False 193 | print("\tCHARWIDTH_" + prop.upper() + " = " + str(prop_vals[prop]), end="") 194 | print("\n};") 195 | print("") 196 | print("static const " + type1 + " charwidth_stage1[] = {") 197 | for i in range(len(stage1) - 1): 198 | if i % 16 == 0: 199 | print("/* U+{:04X} */".format(i * block_size), end="") 200 | print("{0: >3},".format(stage1[i]), end="") 201 | if i % 16 == 15: 202 | print("") 203 | print("{0: >3}".format(stage1[len(stage1) - 1])) 204 | print("};") 205 | print("") 206 | print("static const " + type2 + " charwidth_stage2[][" + 207 | str(block_size) + "] = {") 208 | for i in range(len(stage2)): 209 | print(" /* block " + str(i) + " */") 210 | print(" {", end="") 211 | for j in range(block_size): 212 | print("{0: >3}".format(prop_vals[stage2[i][j]]), end="") 213 | if j + 1 == 
block_size: 214 | print("\n }", end="") 215 | else: 216 | print(",", end="") 217 | if j % 16 == 15: 218 | print("\n ", end="") 219 | if i + 1 != len(stage2): 220 | print(",\n") 221 | else: 222 | print("") 223 | print("};") 224 | 225 | print("") 226 | print("static int charwidth(int32_t code)") 227 | print("{") 228 | print("\tconst int32_t block_size = " + str(block_size) + ";") 229 | print("\t" + type1 + " i = charwidth_stage1[code / block_size];") 230 | print("\treturn charwidth_stage2[i][code % block_size];") 231 | print("}") 232 | print("") 233 | print("#endif /* CHARWIDTH_H */") 234 | -------------------------------------------------------------------------------- /util/gen-combining.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2017 Patrick O. Perry. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | import math 18 | 19 | try: 20 | import unicode_data 21 | except ModuleNotFoundError: 22 | from util import unicode_data 23 | 24 | 25 | combin_vals = set([0]) 26 | combin = [] 27 | 28 | for code in range(len(unicode_data.uchars)): 29 | u = unicode_data.uchars[code] 30 | 31 | if u is None or u.ccc is None: 32 | combin.append(0) 33 | else: 34 | ccc = u.ccc 35 | combin_vals.add(ccc) 36 | combin.append(ccc) 37 | 38 | 39 | def compute_tables(block_size): 40 | nblock = len(combin) // block_size 41 | stage1 = [None] * nblock 42 | stage2 = [] 43 | stage2_dict = {} 44 | for i in range(nblock): 45 | begin = i * block_size 46 | end = begin + block_size 47 | block = tuple(combin[begin:end]) 48 | if block in stage2_dict: 49 | j = stage2_dict[block] 50 | else: 51 | j = len(stage2) 52 | stage2_dict[block] = j 53 | stage2.append(block) 54 | stage1[i] = j 55 | return (stage1,stage2) 56 | 57 | 58 | def stage1_item_size(nstage2): 59 | nbyte = math.ceil(math.log(nstage2, 2) / 8) 60 | size = 2**math.ceil(math.log(nbyte, 2)) 61 | return size 62 | 63 | page_size = 4096 64 | block_size = 256 65 | 66 | nbytes = {} 67 | 68 | best_block_size = 1 69 | smallest_size = len(combin) 70 | 71 | for i in range(1,17): 72 | block_size = 2**i 73 | stage1,stage2 = compute_tables(block_size) 74 | 75 | nbyte1 = len(stage1) * stage1_item_size(len(stage2)) 76 | nbyte2 = len(stage2) * block_size 77 | 78 | nbyte1 = math.ceil(nbyte1 / page_size) * page_size 79 | nbyte2 = math.ceil(nbyte2 / page_size) * page_size 80 | nbyte = nbyte1 + nbyte2 81 | nbytes[block_size] = nbyte 82 | 83 | if nbyte < smallest_size: 84 | smallest_size = nbyte 85 | best_block_size = block_size 86 | 87 | 88 | block_size = best_block_size 89 | stage1,stage2 = compute_tables(block_size) 90 | 91 | type1_size = stage1_item_size(len(stage2)) 92 | if type1_size == 1: 93 | type1 = 'uint8_t' 94 | elif type1_size == 2: 95 | type1 = 'uint16_t' 96 | elif type1_size == 4: 97 | type1 = 'uint32_t' 98 | else: 99 | type1 = 'uint64_t' 100 | 101 
| type2 = 'uint8_t' 102 | 103 | 104 | # Write combining.h to stdout 105 | 106 | print("/* This file is automatically generated. DO NOT EDIT!") 107 | print(" Instead, edit gen-combining.py and re-run. */") 108 | print("") 109 | print("/*") 110 | print(" * Canonical_Combining_Class property values.") 111 | print(" *") 112 | print(" * Defined in UAX #44 \"Unicode Character Database\"") 113 | print(" *") 114 | print(" * http://www.unicode.org/reports/tr44/") 115 | print(" *") 116 | print(" * Section 5.7.4, Table 15.") 117 | print(" *") 118 | print(" *") 119 | print(" * We use the two-stage lookup strategy described at") 120 | print(" *") 121 | print(" * http://www.strchr.com/multi-stage_tables") 122 | print(" *") 123 | print(" */") 124 | print("") 125 | print("#ifndef UNICODE_COMBINING_H") 126 | print("#define UNICODE_COMBINING_H") 127 | print("") 128 | print("#include ") 129 | print("") 130 | print("static const " + type1 + " combining_class_stage1[] = {") 131 | for i in range(len(stage1) - 1): 132 | if i % 16 == 0: 133 | print("/* U+{:04X} */".format(i * block_size), end="") 134 | print("{0: >3},".format(stage1[i]), end="") 135 | if i % 16 == 15: 136 | print("") 137 | print("{0: >3}".format(stage1[len(stage1) - 1])) 138 | print("};") 139 | print("") 140 | print("static const " + type2 + " combining_class_stage2[][" + 141 | str(block_size) + "] = {") 142 | for i in range(len(stage2)): 143 | print(" /* block " + str(i) + " */") 144 | print(" {", end="") 145 | for j in range(block_size): 146 | print("{0: >3}".format(stage2[i][j]), end="") 147 | if j + 1 == block_size: 148 | print("\n }", end="") 149 | else: 150 | print(",", end="") 151 | if j % 16 == 15: 152 | print("\n ", end="") 153 | if i + 1 != len(stage2): 154 | print(",\n") 155 | else: 156 | print("") 157 | print("};") 158 | print("") 159 | print("static uint8_t combining_class(int32_t code)") 160 | print("{") 161 | print("\tconst int32_t block_size = " + str(block_size) + ";") 162 | print("\t" + type1 + " i = 
combining_class_stage1[code / block_size];") 163 | print("\treturn combining_class_stage2[i][code % block_size];") 164 | print("}") 165 | print("") 166 | print("#endif /* UNICODE_COMBINING_H */") 167 | 168 | -------------------------------------------------------------------------------- /util/gen-compose.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2017 Patrick O. Perry. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | import math 18 | import re 19 | 20 | try: 21 | import unicode_data 22 | except ModuleNotFoundError: 23 | from util import unicode_data 24 | 25 | 26 | EXCLUSIONS = 'data/ucd/CompositionExclusions.txt' 27 | 28 | # get the length-2 decomposition maps (excluding hangul and compatibility maps) 29 | 30 | decomp_map = {} 31 | starter = [None] * len(unicode_data.uchars) 32 | 33 | for code in range(len(unicode_data.uchars)): 34 | u = unicode_data.uchars[code] 35 | if u is None: 36 | continue 37 | 38 | ccc = u.ccc 39 | if ccc is None or ccc == 0: 40 | starter[code] = True 41 | else: 42 | starter[code] = False 43 | 44 | d = u.decomp 45 | if d is not None and d.type is None: 46 | if len(d.map) == 2: 47 | decomp_map[code] = tuple(d.map) 48 | 49 | 50 | # exclude non-starter decomposiitons 51 | 52 | decomp_map2 = {} 53 | for p,d in decomp_map.items(): 54 | if starter[p] and starter[d[0]]: 55 | decomp_map2[p] = d 56 | decomp_map = decomp_map2 57 | 58 | 59 | # exclude composition exclusions 60 | 61 | try: 62 | file = open(EXCLUSIONS, 'r') 63 | except FileNotFoundError: 64 | file = open('../' + EXCLUSIONS, 'r') 65 | 66 | with file: 67 | for line in file: 68 | fields = line.partition('#') 69 | code = fields[0].strip() 70 | if len(code) > 0: 71 | code = int(code, 16) 72 | if code in decomp_map: 73 | del decomp_map[code] 74 | 75 | #print('primary\tletter\tcode') 76 | #for p,d in decomp_map.items(): 77 | # print(p, '\t', d[0], '\t', d[1], sep='') 78 | 79 | # construct table l : [(c,p)] 80 | compose_map = {} 81 | for p,d in decomp_map.items(): 82 | l = d[0] 83 | c = d[1] 84 | if l not in compose_map: 85 | compose_map[l] = [] 86 | compose_map[l].append((c, p)) 87 | 88 | compose = [] 89 | combiner = [] 90 | primary = [] 91 | off = 0 92 | for code in range(len(unicode_data.uchars)): 93 | if code in compose_map: 94 | maps = compose_map[code] 95 | maps.sort() 96 | compose.append((off, len(maps))) 97 | combiner.extend([c for (c,p) in maps]) 98 | primary.extend([p for (c,p) in maps]) 
99 | off += len(maps) 100 | else: 101 | compose.append((0,0)) 102 | 103 | # Hangul 104 | hangul_lpart = off 105 | hangul_lvpart = off + 1 106 | 107 | for code in range(0x1100, 0x1113): 108 | compose[code] = (hangul_lpart, 1) 109 | 110 | for code in range(0xAC00, 0xD7A4): 111 | if (code - 0xAC00) % 28 == 0: 112 | compose[code] = (hangul_lvpart, 1) 113 | 114 | 115 | def compute_tables(block_size): 116 | nblock = len(compose) // block_size 117 | stage1 = [None] * nblock 118 | stage2 = [] 119 | stage2_dict = {} 120 | for i in range(nblock): 121 | begin = i * block_size 122 | end = begin + block_size 123 | block = tuple(compose[begin:end]) 124 | if block in stage2_dict: 125 | j = stage2_dict[block] 126 | else: 127 | j = len(stage2) 128 | stage2_dict[block] = j 129 | stage2.append(block) 130 | stage1[i] = j 131 | return (stage1,stage2) 132 | 133 | 134 | def stage1_item_size(nstage2): 135 | nbyte = math.ceil(math.log(nstage2, 2) / 8) 136 | size = 2**math.ceil(math.log(nbyte, 2)) 137 | return size 138 | 139 | page_size = 4096 140 | block_size = 256 141 | 142 | nbytes = {} 143 | 144 | best_block_size = 1 145 | smallest_size = len(compose) 146 | 147 | for i in range(1,17): 148 | block_size = 2**i 149 | stage1,stage2 = compute_tables(block_size) 150 | 151 | nbyte1 = len(stage1) * stage1_item_size(len(stage2)) 152 | nbyte2 = len(stage2) * block_size 153 | 154 | nbyte1 = math.ceil(nbyte1 / page_size) * page_size 155 | nbyte2 = math.ceil(nbyte2 / page_size) * page_size 156 | nbyte = nbyte1 + nbyte2 157 | nbytes[block_size] = nbyte 158 | 159 | if nbyte < smallest_size: 160 | smallest_size = nbyte 161 | best_block_size = block_size 162 | 163 | 164 | block_size = best_block_size 165 | stage1,stage2 = compute_tables(block_size) 166 | 167 | type1_size = stage1_item_size(len(stage2)) 168 | if type1_size == 1: 169 | type1 = 'uint8_t' 170 | elif type1_size == 2: 171 | type1 = 'uint16_t' 172 | elif type1_size == 4: 173 | type1 = 'uint32_t' 174 | else: 175 | type1 = 'uint64_t' 176 | 177 | 
type2 = 'struct composition' 178 | 179 | 180 | # Write compose.h to stdout 181 | 182 | 183 | print("/* This file is automatically generated. DO NOT EDIT!") 184 | print(" Instead, edit gen-compose.py and re-run. */") 185 | print("") 186 | print("/*") 187 | print(" * Unicode primary composites.") 188 | print(" *") 189 | print(" * Defined in Unicode Sec 3.11 \"Normalization Forms\"") 190 | print(" *") 191 | print(" * We use the two-stage lookup strategy described at") 192 | print(" *") 193 | print(" * http://www.strchr.com/multi-stage_tables") 194 | print(" *") 195 | print(" */") 196 | print("") 197 | print("#ifndef UNICODE_COMPOSE_H") 198 | print("#define UNICODE_COMPOSE_H") 199 | print("") 200 | print("#include ") 201 | print("") 202 | print("/* composition") 203 | print(" * -----------") 204 | print(" * offset: the offset into the primary and combiner arrays,") 205 | print(" * or 0 if there are no compositions") 206 | print(" * length: the number of compositions for the codepont") 207 | print(" */") 208 | print("struct composition {") 209 | print("\tunsigned offset : 11;") 210 | print("\tunsigned length : 5;") 211 | print("};") 212 | print("") 213 | print("#define COMPOSITION_BLOCK_SIZE", block_size) 214 | print("") 215 | print("#define COMPOSITION_HANGUL_LPART", hangul_lpart) 216 | print("") 217 | print("#define COMPOSITION_HANGUL_LVPART", hangul_lvpart) 218 | print("") 219 | print("static const " + type1 + " composition_stage1[] = {") 220 | for i in range(len(stage1) - 1): 221 | if i % 16 == 0: 222 | print("/* U+{:04X} */".format(i * block_size), end="") 223 | print("{0: >3},".format(stage1[i]), end="") 224 | if i % 16 == 15: 225 | print("") 226 | print("{0: >3}".format(stage1[len(stage1) - 1])) 227 | print("};") 228 | print("") 229 | print("static const " + type2 + " composition_stage2[][" + 230 | str(block_size) + "] = {") 231 | for i in range(len(stage2)): 232 | print(" /* block " + str(i) + " */") 233 | print(" {", end="") 234 | for j in range(block_size): 
235 | print("{{{0: >3}".format(stage2[i][j][0]), end="") 236 | print(",{0: >2}}}".format(stage2[i][j][1]), end="") 237 | if j + 1 == block_size: 238 | print("\n }", end="") 239 | else: 240 | print(",", end="") 241 | if j % 7 == 6: 242 | print("\n ", end="") 243 | else: 244 | print(" ", end="") 245 | if i + 1 != len(stage2): 246 | print(",\n") 247 | else: 248 | print("") 249 | print("};") 250 | print("") 251 | print("static const int32_t composition_combiner[] = {") 252 | for i in range(len(combiner) - 1): 253 | if i % 8 == 0: 254 | print("/* {0: >3} */ ".format(i), end="") 255 | print("0x{0:04X},".format(combiner[i]), end="") 256 | if i % 8 == 7: 257 | print("") 258 | print("0x{0:04X}".format(combiner[len(combiner) - 1])) 259 | print("};") 260 | print("") 261 | print("static const int32_t composition_primary[] = {") 262 | for i in range(len(primary) - 1): 263 | if i % 8 == 0: 264 | print("/* {0: >3} */ ".format(i), end="") 265 | print("0x{0:04X},".format(primary[i]), end="") 266 | if i % 8 == 7: 267 | print("") 268 | print("0x{0:04X}".format(primary[len(primary) - 1])) 269 | print("};") 270 | print("") 271 | print("#endif /* UNICODE_COMPOSE_H */") 272 | -------------------------------------------------------------------------------- /util/gen-decompose.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2017 Patrick O. Perry. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import operator 18 | import math 19 | 20 | try: 21 | import unicode_data 22 | except ModuleNotFoundError: 23 | from util import unicode_data 24 | 25 | decomp_vals = unicode_data.decomp_vals 26 | decomp_map = unicode_data.decomp_map 27 | decomp = unicode_data.decomp 28 | 29 | def compute_tables(block_size): 30 | nblock = len(decomp) // block_size 31 | stage1 = [None] * nblock 32 | stage2 = [] 33 | stage2_dict = {} 34 | for i in range(nblock): 35 | begin = i * block_size 36 | end = begin + block_size 37 | block = tuple(decomp[begin:end]) 38 | if block in stage2_dict: 39 | j = stage2_dict[block] 40 | else: 41 | j = len(stage2) 42 | stage2_dict[block] = j 43 | stage2.append(block) 44 | stage1[i] = j 45 | return (stage1,stage2) 46 | 47 | 48 | def stage1_item_size(nstage2): 49 | nbyte = math.ceil(math.log(nstage2, 2) / 8) 50 | size = 2**math.ceil(math.log(nbyte, 2)) 51 | return size 52 | 53 | page_size = 4096 54 | block_size = 256 55 | 56 | nbytes = {} 57 | 58 | best_block_size = 1 59 | smallest_size = len(decomp) 60 | 61 | for i in range(1,17): 62 | block_size = 2**i 63 | stage1,stage2 = compute_tables(block_size) 64 | 65 | nbyte1 = len(stage1) * stage1_item_size(len(stage2)) 66 | nbyte2 = len(stage2) * block_size 67 | 68 | nbyte1 = math.ceil(nbyte1 / page_size) * page_size 69 | nbyte2 = math.ceil(nbyte2 / page_size) * page_size 70 | nbyte = nbyte1 + nbyte2 71 | nbytes[block_size] = nbyte 72 | 73 | if nbyte < smallest_size: 74 | smallest_size = nbyte 75 | best_block_size = block_size 76 | 77 | 78 | block_size = best_block_size 79 | stage1,stage2 = compute_tables(block_size) 80 | 81 | type1_size = stage1_item_size(len(stage2)) 82 | if type1_size == 1: 83 | type1 = 'uint8_t' 84 | elif type1_size == 2: 85 | type1 = 'uint16_t' 86 | elif type1_size == 4: 87 | type1 = 'uint32_t' 88 | else: 89 | type1 = 'uint64_t' 90 | 91 | # Write decompose.h to stdout 92 
| 93 | print("/* This file is automatically generated. DO NOT EDIT!") 94 | print(" Instead, edit gen-decompose.py and re-run. */") 95 | print("") 96 | print("/*") 97 | print(" * Decomposition mappings.") 98 | print(" *") 99 | print(" * Defined in UAX #44 \"Unicode Character Database\"") 100 | print(" *") 101 | print(" * http://www.unicode.org/reports/tr44/") 102 | print(" *") 103 | print(" * Section 5.7.3, Table 14.") 104 | print(" *") 105 | print(" *") 106 | print(" * We use a two-stage lookup strategy as described at") 107 | print(" *") 108 | print(" * http://www.strchr.com/multi-stage_tables") 109 | print(" *") 110 | print(" */") 111 | print("") 112 | print("#ifndef UNICODE_DECOMPOSE_H") 113 | print("#define UNICODE_DECOMPOSE_H") 114 | print("") 115 | print("#include ") 116 | print("") 117 | print("/* decomposition_type") 118 | print(" * ------------------") 119 | print(" * compatibility decompositions have decomposition_type != 0") 120 | print(" */") 121 | print("enum decomposition_type {", end="") 122 | first = True 123 | for k,v in sorted(decomp_vals.items(), key=operator.itemgetter(1)): 124 | if not first: 125 | print(",", end="") 126 | print("\n\tDECOMPOSITION_" + k.upper() + " = " + str(v), end="") 127 | first = False 128 | print("\n};") 129 | print("") 130 | print("/* decomposition") 131 | print(" * -------------") 132 | print(" * type: the decomposition_type") 133 | print(" *") 134 | print(" * length: the length (in codepoints) of the decomposition mapping,") 135 | print(" * or 0 if there is no decomposition") 136 | print(" *") 137 | print(" * data: the mapped-to codepoint (length = 1), or") 138 | print(" * an index into the `decomposition_mapping` array, pointing") 139 | print(" * to the first codepoint in the mapping (length > 1)") 140 | print(" */") 141 | print("struct decomposition {") 142 | print("\tint type : 6;") 143 | print("\tunsigned length : 5;") 144 | print("\tunsigned data : 21;") 145 | print("};") 146 | print("") 147 | print("#define 
DECOMPOSITION_BLOCK_SIZE", block_size) 148 | print("") 149 | print("static const " + type1 + " decomposition_stage1[] = {") 150 | for i in range(len(stage1) - 1): 151 | if i % 16 == 0: 152 | print("/* U+{:04X} */".format(i * block_size), end="") 153 | print("{0: >3},".format(stage1[i]), end="") 154 | if i % 16 == 15: 155 | print("") 156 | print("{0: >3}".format(stage1[len(stage1) - 1])) 157 | print("};") 158 | print("") 159 | print("static const struct decomposition decomposition_stage2[][" + 160 | str(block_size) + "] = {") 161 | for i in range(0,len(stage2)): 162 | print(" /* block " + str(i) + " */") 163 | print(" {", end="") 164 | for j in range(block_size): 165 | val = stage2[i][j] 166 | if val is None: 167 | print("{0,0,0}", end="") 168 | else: 169 | if val[0] is None: 170 | t = 0 171 | else: 172 | t = decomp_vals[val[0]] 173 | print("{{{0},{1},0x{2:05X}}}".format(t, val[1], val[2]), end="") 174 | 175 | #print("{0: >3}".format(prop_vals[stage2[i][j]]), end="") 176 | if j + 1 == block_size: 177 | print("\n }", end="") 178 | else: 179 | print(",", end="") 180 | if j % 5 == 4: 181 | print("\n ", end="") 182 | if i + 1 != len(stage2): 183 | print(",\n") 184 | else: 185 | print("") 186 | print("};") 187 | print("") 188 | print("static const int32_t decomposition_mapping[] = {") 189 | for i in range(len(decomp_map) - 1): 190 | if i % 8 == 0: 191 | print("/* 0x{:04X} */ ".format(i), end="") 192 | print("0x{0:04X},".format(decomp_map[i]), end="") 193 | if i % 8 == 7: 194 | print("") 195 | print("0x{0:04X}".format(decomp_map[len(decomp_map) - 1])) 196 | print("};") 197 | print("") 198 | print("#endif /* UNICODE_DECOMPOSE_H */") 199 | -------------------------------------------------------------------------------- /util/gen-emojiprop.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2017 Patrick O. Perry. 
4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import math 18 | 19 | try: 20 | import property 21 | import unicode_data 22 | except ModuleNotFoundError: 23 | from util import property 24 | from util import unicode_data 25 | 26 | EMOJI_DATA = "data/ucd/emoji/emoji-data.txt" 27 | 28 | emoji_props = property.read(EMOJI_DATA, sets=True) 29 | 30 | 31 | props = [0] * len(unicode_data.uchars) 32 | for i, (key, values) in enumerate(emoji_props.items()): 33 | bit = 0x1 << i 34 | for value in values: 35 | props[value] |= bit 36 | 37 | 38 | def compute_tables(block_size): 39 | nblock = len(props) // block_size 40 | stage1 = [None] * nblock 41 | stage2 = [] 42 | stage2_dict = {} 43 | for i in range(nblock): 44 | begin = i * block_size 45 | end = begin + block_size 46 | block = tuple(props[begin:end]) 47 | if block in stage2_dict: 48 | j = stage2_dict[block] 49 | else: 50 | j = len(stage2) 51 | stage2_dict[block] = j 52 | stage2.append(block) 53 | stage1[i] = j 54 | return (stage1,stage2) 55 | 56 | 57 | def stage1_item_size(nstage2): 58 | nbyte = math.ceil(math.log(nstage2, 2) / 8) 59 | size = 2**math.ceil(math.log(nbyte, 2)) 60 | return size 61 | 62 | page_size = 4096 63 | block_size = 256 64 | 65 | nbytes = {} 66 | 67 | best_block_size = 1 68 | smallest_size = len(props) 69 | 70 | for i in range(1,17): 71 | block_size = 2**i 72 | stage1,stage2 = compute_tables(block_size) 73 | 74 | nbyte1 = len(stage1) * 
stage1_item_size(len(stage2)) 75 | nbyte2 = len(stage2) * block_size 76 | 77 | nbyte1 = math.ceil(nbyte1 / page_size) * page_size 78 | nbyte2 = math.ceil(nbyte2 / page_size) * page_size 79 | nbyte = nbyte1 + nbyte2 80 | nbytes[block_size] = nbyte 81 | 82 | if nbyte < smallest_size: 83 | smallest_size = nbyte 84 | best_block_size = block_size 85 | 86 | 87 | block_size = best_block_size 88 | stage1, stage2 = compute_tables(block_size) 89 | 90 | type1_size = stage1_item_size(len(stage2)) 91 | if type1_size == 1: 92 | type1 = 'uint8_t' 93 | elif type1_size == 2: 94 | type1 = 'uint16_t' 95 | elif type1_size == 4: 96 | type1 = 'uint32_t' 97 | else: 98 | type1 = 'uint64_t' 99 | 100 | assert len(emoji_props) <= 8 101 | type2 = 'uint8_t' 102 | 103 | print("/* This file is automatically generated. DO NOT EDIT!") 104 | print(" Instead, edit gen-emoji.py and re-run. */") 105 | print("") 106 | print("/*") 107 | print(" * Unicode Emoji property values.") 108 | print(" *") 109 | print(" * We use the two-stage lookup strategy described at") 110 | print(" *") 111 | print(" * http://www.strchr.com/multi-stage_tables") 112 | print(" *") 113 | print(" */") 114 | print("") 115 | print("#ifndef UNICODE_EMOJIPROP_H") 116 | print("#define UNICODE_EMOJIPROP_H") 117 | print("") 118 | print("#include ") 119 | print("") 120 | print("enum emoji_prop_type {") 121 | print("\tEMOJI_PROP_NONE = 0", end="") 122 | for i, name in enumerate(emoji_props.keys()): 123 | print(",\n\tEMOJI_PROP_" + name.upper() + " = (1 << " + str(i) + ")", 124 | end="") 125 | print("\n};") 126 | print("") 127 | print("static const " + type1 + " emoji_prop_stage1[] = {") 128 | for i in range(len(stage1) - 1): 129 | if i % 16 == 0: 130 | print("/* U+{:04X} */".format(i * block_size), end="") 131 | print("{0: >3},".format(stage1[i]), end="") 132 | if i % 16 == 15: 133 | print("") 134 | print("{0: >3}".format(stage1[len(stage1) - 1])) 135 | print("};") 136 | print("") 137 | print("static const " + type2 + " 
emoji_prop_stage2[][" + 138 | str(block_size) + "] = {") 139 | for i in range(len(stage2)): 140 | print(" /* block " + str(i) + " */") 141 | print(" {", end="") 142 | for j in range(block_size): 143 | print("{0: >3}".format(stage2[i][j]), end="") 144 | if j + 1 == block_size: 145 | print("\n }", end="") 146 | else: 147 | print(",", end="") 148 | if j % 16 == 15: 149 | print("\n ", end="") 150 | if i + 1 != len(stage2): 151 | print(",\n") 152 | else: 153 | print("") 154 | print("};") 155 | 156 | print("") 157 | print("static int emoji_prop(int32_t code)") 158 | print("{") 159 | print("\tconst int32_t block_size = " + str(block_size) + ";") 160 | print("\t" + type1 + " i = emoji_prop_stage1[code / block_size];") 161 | print("\treturn emoji_prop_stage2[i][code % block_size];") 162 | print("}") 163 | print("") 164 | print("#endif /* UNICODE_EMOJIPROP_H */") 165 | -------------------------------------------------------------------------------- /util/gen-graphbreak.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2017 Patrick O. Perry. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | import math 18 | 19 | try: 20 | import property 21 | except ModuleNotFoundError: 22 | from util import property 23 | 24 | DERIVED_CORE_PROPERTIES = "data/ucd/DerivedCoreProperties.txt" 25 | EMOJI_DATA = "data/ucd/emoji/emoji-data.txt" 26 | GRAPHEME_BREAK_PROPERTY = "data/ucd/auxiliary/GraphemeBreakProperty.txt" 27 | 28 | code_props = property.read(GRAPHEME_BREAK_PROPERTY) 29 | emoji_props = property.read(EMOJI_DATA, sets=True) 30 | 31 | derived_core_properties = property.read(DERIVED_CORE_PROPERTIES, sets=True) 32 | #incb_consonant = derived_core_properties['Indic_Conjunct_Break=Consonant'] 33 | #incb_extend = derived_core_properties['Indic_Conjunct_Break=Extend'] 34 | 35 | for i in range(len(code_props)): 36 | if code_props[i] is None: 37 | code_props[i] = 'Other' 38 | elif code_props[i] == 'Extend': 39 | code_props[i] = 'Extend_Other' 40 | 41 | for i in emoji_props['Extended_Pictographic']: 42 | assert code_props[i] == 'Other' 43 | code_props[i] = 'Extended_Pictographic' 44 | 45 | for p,v in ( 46 | ('InCB=Linker', 'Extend_InCB_Linker'), 47 | ('InCB=Extend', 'Extend_InCB_Extend'), 48 | ('InCB=Consonant', 'InCB_Consonant'), 49 | ): 50 | for i in derived_core_properties[p]: 51 | if code_props[i] == 'ZWJ': 52 | assert p == 'InCB=Extend' 53 | continue 54 | elif p in ('InCB=Extend', 'InCB=Linker'): 55 | assert code_props[i] == 'Extend_Other' 56 | else: 57 | assert code_props[i] == 'Other' 58 | code_props[i] = v 59 | 60 | prop_names = set(code_props) 61 | prop_names.remove('Other') 62 | 63 | 64 | prop_vals = {} 65 | prop_vals['Other'] = 0; 66 | 67 | for p in sorted(prop_names): 68 | prop_vals[p] = len(prop_vals) 69 | 70 | 71 | def compute_tables(block_size): 72 | nblock = len(code_props) // block_size 73 | stage1 = [None] * nblock 74 | stage2 = [] 75 | stage2_dict = {} 76 | for i in range(nblock): 77 | begin = i * block_size 78 | end = begin + block_size 79 | block = tuple(code_props[begin:end]) 80 | if block in stage2_dict: 81 | j = stage2_dict[block] 82 | 
else: 83 | j = len(stage2) 84 | stage2_dict[block] = j 85 | stage2.append(block) 86 | stage1[i] = j 87 | return (stage1,stage2) 88 | 89 | 90 | def stage1_item_size(nstage2): 91 | nbyte = math.ceil(math.log(nstage2, 2) / 8) 92 | size = 2**math.ceil(math.log(nbyte, 2)) 93 | return size 94 | 95 | page_size = 4096 96 | block_size = 256 97 | 98 | nbytes = {} 99 | 100 | best_block_size = 1 101 | smallest_size = len(code_props) 102 | 103 | for i in range(1,17): 104 | block_size = 2**i 105 | stage1,stage2 = compute_tables(block_size) 106 | 107 | nbyte1 = len(stage1) * stage1_item_size(len(stage2)) 108 | nbyte2 = len(stage2) * block_size 109 | 110 | nbyte1 = math.ceil(nbyte1 / page_size) * page_size 111 | nbyte2 = math.ceil(nbyte2 / page_size) * page_size 112 | nbyte = nbyte1 + nbyte2 113 | nbytes[block_size] = nbyte 114 | 115 | if nbyte < smallest_size: 116 | smallest_size = nbyte 117 | best_block_size = block_size 118 | 119 | 120 | block_size = best_block_size 121 | stage1,stage2 = compute_tables(block_size) 122 | 123 | type1_size = stage1_item_size(len(stage2)) 124 | 125 | if type1_size == 1: 126 | type1 = 'uint8_t' 127 | elif type1_size == 2: 128 | type1 = 'uint16_t' 129 | elif type1_size == 4: 130 | type1 = 'uint32_t' 131 | else: 132 | type1 = 'uint64_t' 133 | 134 | type2 = 'int8_t' 135 | 136 | 137 | 138 | # Write graphbreak.h to stdout 139 | 140 | print("/* This file is automatically generated. DO NOT EDIT!") 141 | print(" Instead, edit gen-graphbreak.py and re-run. 
*/") 142 | print("") 143 | print("/*") 144 | print(" * Unicode Grapheme_Break property values.") 145 | print(" *") 146 | print(" * Defined in UAX #29 \"Unicode Text Segmentation\"") 147 | print(" *") 148 | print(" * http://www.unicode.org/reports/tr29/") 149 | print(" *") 150 | print(" * Section 4.1, Table 3.") 151 | print(" *") 152 | print(" *") 153 | print(" * We use the two-stage lookup strategy described at") 154 | print(" *") 155 | print(" * http://www.strchr.com/multi-stage_tables") 156 | print(" *") 157 | print(" */") 158 | print("") 159 | print("#ifndef UNICODE_GRAPHBREAK_H") 160 | print("#define UNICODE_GRAPHBREAK_H") 161 | print("") 162 | print("#include ") 163 | print("") 164 | print("enum graph_break_prop {") 165 | print("\tGRAPH_BREAK_OTHER = 0", end="") 166 | for prop in sorted(prop_names): 167 | print(",\n\tGRAPH_BREAK_" + prop.upper() + " = " + str(prop_vals[prop]), 168 | end="") 169 | print("\n};") 170 | print("") 171 | print("static const " + type1 + " graph_break_stage1[] = {") 172 | for i in range(len(stage1) - 1): 173 | if i % 16 == 0: 174 | print("/* U+{:04X} */".format(i * block_size), end="") 175 | print("{0: >3},".format(stage1[i]), end="") 176 | if i % 16 == 15: 177 | print("") 178 | print("{0: >3}".format(stage1[len(stage1) - 1])) 179 | print("};") 180 | print("") 181 | print("static const " + type2 + " graph_break_stage2[][" + 182 | str(block_size) + "] = {") 183 | #for i in range(len(stage2)): 184 | for i in range(0,len(stage2)): 185 | print(" /* block " + str(i) + " */") 186 | print(" {", end="") 187 | for j in range(block_size): 188 | print("{0: >3}".format(prop_vals[stage2[i][j]]), end="") 189 | if j + 1 == block_size: 190 | print("\n }", end="") 191 | else: 192 | print(",", end="") 193 | if j % 16 == 15: 194 | print("\n ", end="") 195 | if i + 1 != len(stage2): 196 | print(",\n") 197 | else: 198 | print("") 199 | print("};") 200 | 201 | print("") 202 | print("static int graph_break(int32_t code)") 203 | print("{") 204 | 
print("\tconst int32_t block_size = " + str(block_size) + ";") 205 | print("\t" + type1 + " i = graph_break_stage1[code / block_size];") 206 | print("\treturn graph_break_stage2[i][code % block_size];") 207 | print("}") 208 | print("") 209 | print("#endif /* UNICODE_GRAPHBREAK_H */") 210 | -------------------------------------------------------------------------------- /util/gen-normalization.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2017 Patrick O. Perry. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import math 18 | import re 19 | 20 | PATTERN = re.compile(r"""^([0-9A-Fa-f]+) # (first code) 21 | (\.\.([0-9A-Fa-f]+))? # (.. last code)? 
22 | \s* 23 | ; # ; 24 | \s* 25 | (\w+) # (property name) 26 | \s* 27 | ; # ; 28 | \s* 29 | (\w+) # (property value) 30 | \s* 31 | (\#.*)?$ # (# comment)?""", re.X) 32 | 33 | UNICODE_MAX = 0x10FFFF 34 | 35 | DERIVED_NORMALIZATION_PROPS = "data/ucd/DerivedNormalizationProps.txt" 36 | try: 37 | infile = open(DERIVED_NORMALIZATION_PROPS) 38 | except FileNotFoundError: 39 | infile = open("../" + DERIVED_NORMALIZATION_PROPS, "r") 40 | 41 | code_props = ['Yes'] * (UNICODE_MAX + 1) 42 | 43 | for line in infile: 44 | m = PATTERN.match(line) 45 | if m is None or m.group(4) != 'NFC_QC': 46 | continue 47 | begin = int(m.group(1), 16) 48 | if m.group(3) is None: 49 | end = begin + 1 50 | else: 51 | end = int(m.group(3), 16) + 1 52 | prop = m.group(5) 53 | if prop == 'M': 54 | prop = 'Maybe' 55 | elif prop == 'N': 56 | prop = 'No' 57 | elif prop == 'Y': 58 | prop = 'Yes' 59 | for code in range(begin, end): 60 | code_props[code] = prop 61 | #print(line, end = "") 62 | #print('[', '{:04X}'.format(begin), ',', '{:04X}'.format(end), '): ', 63 | # prop, sep = '') 64 | infile.close() 65 | 66 | 67 | prop_names = set(code_props) 68 | 69 | prop_vals = {} 70 | prop_vals['No'] = 0 71 | prop_vals['Yes'] = 1 72 | prop_vals['Maybe'] = -1 73 | 74 | def compute_tables(block_size): 75 | nblock = len(code_props) // block_size 76 | stage1 = [None] * nblock 77 | stage2 = [] 78 | stage2_dict = {} 79 | for i in range(nblock): 80 | begin = i * block_size 81 | end = begin + block_size 82 | block = tuple(code_props[begin:end]) 83 | if block in stage2_dict: 84 | j = stage2_dict[block] 85 | else: 86 | j = len(stage2) 87 | stage2_dict[block] = j 88 | stage2.append(block) 89 | stage1[i] = j 90 | return (stage1,stage2) 91 | 92 | def stage1_item_size(nstage2): 93 | nbyte = math.ceil(math.log(nstage2, 2) / 8) 94 | size = 2**math.ceil(math.log(nbyte, 2)) 95 | return size 96 | 97 | page_size = 4096 98 | block_size = 256 99 | 100 | nbytes = {} 101 | 102 | best_block_size = 1 103 | smallest_size = len(code_props) 
104 | 105 | for i in range(1,17): 106 | block_size = 2**i 107 | stage1,stage2 = compute_tables(block_size) 108 | # 109 | nbyte1 = len(stage1) * stage1_item_size(len(stage2)) 110 | nbyte2 = len(stage2) * block_size 111 | # 112 | nbyte1 = math.ceil(nbyte1 / page_size) * page_size 113 | nbyte2 = math.ceil(nbyte2 / page_size) * page_size 114 | nbyte = nbyte1 + nbyte2 115 | nbytes[block_size] = nbyte 116 | # 117 | if nbyte < smallest_size: 118 | smallest_size = nbyte 119 | best_block_size = block_size 120 | 121 | 122 | block_size = best_block_size 123 | stage1,stage2 = compute_tables(block_size) 124 | 125 | type1_size = stage1_item_size(len(stage2)) 126 | 127 | if type1_size == 1: 128 | type1 = 'uint8_t' 129 | elif type1_size == 2: 130 | type1 = 'uint16_t' 131 | elif type1_size == 4: 132 | type1 = 'uint32_t' 133 | else: 134 | type1 = 'uint64_t' 135 | 136 | type2 = 'int8_t' 137 | 138 | 139 | 140 | # Write normalizationprop.h to stdout 141 | 142 | print("/* This file is automatically generated. DO NOT EDIT!") 143 | print(" Instead, edit gen-normalization.py and re-run. 
*/") 144 | print("") 145 | print("/*") 146 | print(" * Unicode NFC_QC property values.") 147 | print(" *") 148 | print(" * Defined in UAX #15 \"Unicode Normalization Forms\"") 149 | print(" *") 150 | print(" * http://www.unicode.org/reports/tr15/") 151 | print(" *") 152 | print(" * Section 9, \"Detecting Normalization Forms.\"") 153 | print(" *") 154 | print(" *") 155 | print(" * We use the two-stage lookup strategy described at") 156 | print(" *") 157 | print(" * http://www.strchr.com/multi-stage_tables") 158 | print(" *") 159 | print(" */") 160 | print("") 161 | print("#ifndef NORMALIZATIONPROP_H") 162 | print("#define NORMALIZATIONPROP_H") 163 | print("") 164 | print("#include ") 165 | print("") 166 | print("enum nfc_qc_prop {") 167 | for i in range(len(prop_names)): 168 | prop = sorted(prop_names)[i] 169 | if i > 0: 170 | print(",") 171 | print("\tNFC_QC_" + prop.upper() + " = " + str(prop_vals[prop]), end="") 172 | print("\n};") 173 | print("") 174 | print("static const " + type1 + " nfc_qc_stage1[] = {") 175 | for i in range(len(stage1) - 1): 176 | if i % 16 == 0: 177 | print("/* U+{:04X} */".format(i * block_size), end="") 178 | print("{0: >3},".format(stage1[i]), end="") 179 | if i % 16 == 15: 180 | print("") 181 | print("{0: >3}".format(stage1[len(stage1) - 1])) 182 | print("};") 183 | print("") 184 | print("static const " + type2 + " nfc_qc_stage2[][" + 185 | str(block_size) + "] = {") 186 | for i in range(len(stage2)): 187 | print(" /* block " + str(i) + " */") 188 | print(" {", end="") 189 | for j in range(block_size): 190 | print("{0: >3}".format(prop_vals[stage2[i][j]]), end="") 191 | if j + 1 == block_size: 192 | print("\n }", end="") 193 | else: 194 | print(",", end="") 195 | if j % 16 == 15: 196 | print("\n ", end="") 197 | if i + 1 != len(stage2): 198 | print(",\n") 199 | else: 200 | print("") 201 | print("};") 202 | 203 | print("") 204 | print("static int nfc_qc(uint32_t code)") 205 | print("{") 206 | print("\tconst uint32_t block_size = " + 
str(block_size) + ";") 207 | print("\t" + type1 + " i = nfc_qc_stage1[code / block_size];") 208 | print("\treturn nfc_qc_stage2[i][code % block_size];") 209 | print("}") 210 | print("") 211 | print("#endif /* NORMALIZATIONPROP_H */") 212 | -------------------------------------------------------------------------------- /util/gen-wordbreak.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2016 Patrick O. Perry. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
import math

try:
    import property
    import unicode_data
except ModuleNotFoundError:
    from util import property
    from util import unicode_data


WORD_BREAK_PROPERTY = "data/ucd/auxiliary/WordBreakProperty.txt"
code_props = property.read(WORD_BREAK_PROPERTY)

# Collect the distinct Word_Break values; unassigned codes become 'Other'.
prop_names = set(code_props)
prop_names.remove(None)

prop_names.add('Other')
for code in range(len(code_props)):
    if code_props[code] is None:
        code_props[code] = 'Other'

# 'None' gets value 0; the real property values follow in sorted order.
prop_vals = {'None': 0}
for p in sorted(prop_names):
    prop_vals[p] = len(prop_vals)


def compute_tables(block_size):
    """Split code_props into a two-stage lookup table with stage-2 blocks
    of the given size; identical blocks are stored once and shared."""
    nblock = len(code_props) // block_size
    stage1 = []
    stage2 = []
    seen = {}
    for blk in range(nblock):
        lo = blk * block_size
        block = tuple(code_props[lo:lo + block_size])
        index = seen.get(block)
        if index is None:
            index = len(stage2)
            seen[block] = index
            stage2.append(block)
        stage1.append(index)
    return (stage1, stage2)


def stage1_item_size(nstage2):
    """Smallest power-of-two byte width that can index nstage2 blocks."""
    nbyte = math.ceil(math.log(nstage2, 2) / 8)
    return 2**math.ceil(math.log(nbyte, 2))

page_size = 4096
block_size = 256

nbytes = {}

# Try every power-of-two block size and keep the one whose page-rounded
# stage-1 + stage-2 footprint is smallest.
best_block_size = 1
smallest_size = len(code_props)

for i in range(1, 17):
    block_size = 2**i
    stage1, stage2 = compute_tables(block_size)

    nbyte1 = len(stage1) * stage1_item_size(len(stage2))
    nbyte2 = len(stage2) * block_size

    nbyte1 = math.ceil(nbyte1 / page_size) * page_size
    nbyte2 = math.ceil(nbyte2 / page_size) * page_size
    nbyte = nbyte1 + nbyte2
    nbytes[block_size] = nbyte

    if nbyte < smallest_size:
        smallest_size = nbyte
        best_block_size = block_size


block_size = best_block_size
stage1, stage2 = compute_tables(block_size)
96 | type1_size = stage1_item_size(len(stage2)) 97 | 98 | if type1_size == 1: 99 | type1 = 'uint8_t' 100 | elif type1_size == 2: 101 | type1 = 'uint16_t' 102 | elif type1_size == 4: 103 | type1 = 'uint32_t' 104 | else: 105 | type1 = 'uint64_t' 106 | 107 | type2 = 'int8_t' 108 | 109 | 110 | # Write wordbreakprop.h to stdout 111 | 112 | print("/* This file is automatically generated. DO NOT EDIT!") 113 | print(" Instead, edit gen-wordbreak.py and re-run. */") 114 | print("") 115 | print("/*") 116 | print(" * Unicode Word_Break property values.") 117 | print(" *") 118 | print(" * Defined in UAX #29 \"Unicode Text Segmentation\"") 119 | print(" *") 120 | print(" * http://www.unicode.org/reports/tr29/") 121 | print(" *") 122 | print(" * Section 4.1, Table 3.") 123 | print(" *") 124 | print(" *") 125 | print(" * We use the two-stage lookup strategy described at") 126 | print(" *") 127 | print(" * http://www.strchr.com/multi-stage_tables") 128 | print(" *") 129 | print(" */") 130 | print("") 131 | print("#ifndef WORDBREAKPROP_H") 132 | print("#define WORDBREAKPROP_H") 133 | print("") 134 | print("#include ") 135 | print("") 136 | print("enum word_break_prop {") 137 | print("\tWORD_BREAK_NONE = 0", end="") 138 | for prop in sorted(prop_names): 139 | print(",\n\tWORD_BREAK_" + prop.upper() + " = " + str(prop_vals[prop]), 140 | end="") 141 | print("\n};") 142 | print("") 143 | print("static const " + type1 + " word_break_stage1[] = {") 144 | for i in range(len(stage1) - 1): 145 | if i % 16 == 0: 146 | print("/* U+{:04X} */".format(i * block_size), end="") 147 | print("{0: >3},".format(stage1[i]), end="") 148 | if i % 16 == 15: 149 | print("") 150 | print("{0: >3}".format(stage1[len(stage1) - 1])) 151 | print("};") 152 | print("") 153 | print("static const " + type2 + " word_break_stage2[][" + 154 | str(block_size) + "] = {") 155 | #for i in range(len(stage2)): 156 | for i in range(0,len(stage2)): 157 | print(" /* block " + str(i) + " */") 158 | print(" {", end="") 159 | for 
j in range(block_size): 160 | print("{0: >3}".format(prop_vals[stage2[i][j]]), end="") 161 | if j + 1 == block_size: 162 | print("\n }", end="") 163 | else: 164 | print(",", end="") 165 | if j % 16 == 15: 166 | print("\n ", end="") 167 | if i + 1 != len(stage2): 168 | print(",\n") 169 | else: 170 | print("") 171 | print("};") 172 | 173 | print("") 174 | print("static int word_break(int32_t code)") 175 | print("{") 176 | print("\tconst int32_t block_size = " + str(block_size) + ";") 177 | print("\t" + type1 + " i = word_break_stage1[code / block_size];") 178 | print("\treturn word_break_stage2[i][code % block_size];") 179 | print("}") 180 | print("") 181 | print("#endif /* WORDBREAKPROP_H */") 182 | -------------------------------------------------------------------------------- /util/property.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | 4 | PATTERN = re.compile(r"""^([0-9A-Fa-f]+) # (first code) 5 | (\.\.([0-9A-Fa-f]+))? # (.. last code)? 6 | \s* 7 | ; # ; 8 | \s* 9 | (\w+) # (prop name) 10 | \s* 11 | (;\s*(\w+))? 
# (; (prop value)) 12 | \s* 13 | (\#.*)?$ # (# comment)?""", re.X) 14 | 15 | FIRST_CODE = 1 16 | LAST_CODE = 3 17 | PROP_NAME = 4 18 | PROP_VALUE = 6 19 | 20 | UNICODE_MAX = 0x10FFFF 21 | 22 | 23 | def read(filename, sets=False): 24 | try: 25 | file = open(filename, "r") 26 | except FileNotFoundError: 27 | file = open("../" + filename, "r") 28 | 29 | code_props = [None] * (UNICODE_MAX + 1) 30 | prop_names = set() 31 | properties = {} 32 | 33 | with file: 34 | for line in file: 35 | line = line.split("#")[0] # remove comment 36 | m = PATTERN.match(line) 37 | if m: 38 | first = int(m.group(FIRST_CODE), 16) 39 | if m.group(LAST_CODE): 40 | last = int(m.group(LAST_CODE), 16) 41 | else: 42 | last = first 43 | name = m.group(PROP_NAME) 44 | val = m.group(PROP_VALUE) 45 | if val != None: 46 | name = name + '=' + val 47 | if not name in properties: 48 | properties[name] = set() 49 | prop = properties[name] 50 | for u in range(first, last + 1): 51 | if not sets: 52 | assert code_props[u] is None 53 | code_props[u] = name 54 | prop.add(u) 55 | prop_names.add(name) 56 | if sets: 57 | return properties 58 | else: 59 | return code_props 60 | -------------------------------------------------------------------------------- /util/table-graphbreak.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "../src/private/charwidth.h" 5 | #include "../src/private/graphbreak.h" 6 | #include "../src/utf8lite.h" 7 | 8 | const char *str_charwidth(enum charwidth_prop prop) 9 | { 10 | switch (prop) { 11 | case CHARWIDTH_OTHER: 12 | return "Other"; 13 | case CHARWIDTH_EMOJI: 14 | return "Emoji"; 15 | case CHARWIDTH_AMBIGUOUS: 16 | return "Ambiguous"; 17 | case CHARWIDTH_IGNORABLE: 18 | return "Ignorable"; 19 | case CHARWIDTH_NONE: 20 | return "None"; 21 | case CHARWIDTH_NARROW: 22 | return "Narrow"; 23 | case CHARWIDTH_WIDE: 24 | return "Wide"; 25 | default: 26 | assert(0 && "Unrecognized charwidth property"); 27 
| } 28 | } 29 | 30 | const char *str_graph_break(enum graph_break_prop prop) 31 | { 32 | switch (prop) { 33 | case GRAPH_BREAK_OTHER: 34 | return "None"; 35 | case GRAPH_BREAK_CR: 36 | return "CR"; 37 | case GRAPH_BREAK_CONTROL: 38 | return "Control"; 39 | case GRAPH_BREAK_E_BASE: 40 | return "EBase"; 41 | case GRAPH_BREAK_E_BASE_GAZ: 42 | return "EBaseGAZ"; 43 | case GRAPH_BREAK_E_MODIFIER: 44 | return "EModifier"; 45 | case GRAPH_BREAK_EXTEND: 46 | return "Extend"; 47 | case GRAPH_BREAK_GLUE_AFTER_ZWJ: 48 | return "GlueAfterZWJ"; 49 | case GRAPH_BREAK_L: 50 | return "L"; 51 | case GRAPH_BREAK_LF: 52 | return "LF"; 53 | case GRAPH_BREAK_LV: 54 | return "LV"; 55 | case GRAPH_BREAK_LVT: 56 | return "LVT"; 57 | case GRAPH_BREAK_PREPEND: 58 | return "Prepend"; 59 | case GRAPH_BREAK_REGIONAL_INDICATOR: 60 | return "RegionalIndicator"; 61 | case GRAPH_BREAK_SPACINGMARK: 62 | return "SpacingMark"; 63 | case GRAPH_BREAK_T: 64 | return "T"; 65 | case GRAPH_BREAK_V: 66 | return "V"; 67 | case GRAPH_BREAK_ZWJ: 68 | return "ZWJ"; 69 | default: 70 | assert(0 && "Unrecognized graph break property"); 71 | } 72 | } 73 | 74 | int main(int argc, const char **argv) 75 | { 76 | int32_t i, n = UTF8LITE_CODE_MAX; 77 | int cw, gb; 78 | 79 | printf("code,width,graph\n"); 80 | for (i = 0; i <= n; i++) { 81 | cw = charwidth(i); 82 | gb = graph_break(i); 83 | printf("U+%04"PRIX32",%s,%s\n", (uint32_t)i, 84 | str_charwidth(cw), str_graph_break(gb)); 85 | } 86 | 87 | return 0; 88 | } 89 | -------------------------------------------------------------------------------- /util/unicode_data.py: -------------------------------------------------------------------------------- 1 | # unicode_data 2 | 3 | import collections 4 | import re 5 | 6 | 7 | UNICODE_DATA = 'data/ucd/UnicodeData.txt' 8 | UNICODE_MAX = 0x10FFFF 9 | 10 | try: 11 | unicode_data = open(UNICODE_DATA, "r") 12 | except FileNotFoundError: 13 | unicode_data = open("../" + UNICODE_DATA, "r") 14 | 15 | field_names = [ 16 | 'name', # Name 
17 | 'category', # General_Category 18 | 'ccc', # Canonical_Combining_Class 19 | 'bidi', # Bidi_Class 20 | 'decomp', # Decomposition_Type, Decomposition_Mapping 21 | 'decimal', # Numeric_Value (Numeric_Type = Decimal) 22 | 'digit', # Numeric_Value (Numeric_Type = Decimal, Digit) 23 | 'numeric', # Numeric_Value (Numeric_Type = Decimal, Digit, Numeric) 24 | 'mirrored', # Bidi_Mirrored 25 | 'old_name', # Unicode_1_Name 26 | 'comment', # ISO_Comment 27 | 'ucase', # Simple_Uppercase_Mapping 28 | 'lcase', # Simple_Lowercase_Mapping 29 | 'tcase' # Simple_Titlecase_Mapping 30 | ] 31 | 32 | UChar = collections.namedtuple('UChar', field_names) 33 | ids = UChar._make(range(1, len(field_names) + 1)) 34 | 35 | Decomp = collections.namedtuple('Decomp', ['type', 'map']) 36 | 37 | DECOMP_PATTERN = re.compile(r"""^(<(\w+)>)?\s* # decomposition type 38 | ((\s*[0-9A-Fa-f]+)+) # decomposition mapping 39 | \s*$""", re.X) 40 | RANGE_PATTERN = re.compile(r"""^<([^,]+),\s* # range name 41 | (First|Last) # first or last 42 | >$""", re.X) 43 | 44 | def parse_decomp(code, field): 45 | if field != '': 46 | m = DECOMP_PATTERN.match(field) 47 | assert m 48 | d_type = m.group(2) 49 | d_map = tuple([int(x, 16) for x in m.group(3).split()]) 50 | return Decomp(type=d_type, map=d_map) 51 | 52 | elif code in range(0xAC00, 0xD7A4): 53 | return Decomp(type='hangul', map=None) 54 | else: 55 | return None 56 | 57 | 58 | def parse_code(field): 59 | if field != '': 60 | return int(field, 16) 61 | else: 62 | return None 63 | 64 | def parse_int(field): 65 | if field != '': 66 | return int(field) 67 | else: 68 | return None 69 | 70 | def parse_str(field): 71 | if field == '': 72 | return None 73 | else: 74 | return field 75 | 76 | 77 | uchars = [None] * (UNICODE_MAX + 1) 78 | 79 | with unicode_data: 80 | for line in unicode_data: 81 | fields = line.strip().split(';') 82 | code = int(fields[0], 16) 83 | uchars[code] = UChar(name = fields[ids.name], 84 | category = parse_str(fields[ids.category]), 85 | ccc = 
parse_int(fields[ids.ccc]), 86 | bidi = fields[ids.bidi], 87 | decomp = parse_decomp(code, fields[ids.decomp]), 88 | decimal = fields[ids.decimal], 89 | digit = fields[ids.digit], 90 | numeric = fields[ids.numeric], 91 | mirrored = fields[ids.mirrored], 92 | old_name = fields[ids.old_name], 93 | comment = fields[ids.comment], 94 | ucase = parse_code(fields[ids.ucase]), 95 | lcase = parse_code(fields[ids.lcase]), 96 | tcase = parse_code(fields[ids.tcase])) 97 | 98 | 99 | utype = None 100 | 101 | for code in range(len(uchars)): 102 | u = uchars[code] 103 | if u is None: 104 | uchars[code] = utype 105 | else: 106 | m = RANGE_PATTERN.match(u.name) 107 | if m: 108 | if m.group(2) == 'First': 109 | utype = u._replace(name = '<' + m.group(1) + '>') 110 | else: 111 | utype = None 112 | 113 | 114 | 115 | decomp_vals = { 116 | 'hangul': -1, 'none': 0, 117 | 'font': 1, 'noBreak': 2, 'initial': 3, 'medial': 4, 'final': 5, 118 | 'isolated': 6, 'circle': 7, 'super': 8, 'sub': 9, 'vertical': 10, 119 | 'wide': 11, 'narrow': 12, 'small': 13, 'square': 14, 'fraction': 15, 120 | 'compat': 16 } 121 | 122 | decomp_map = [] 123 | decomp = [] 124 | 125 | for code in range(len(uchars)): 126 | u = uchars[code] 127 | 128 | if u is None or u.decomp is None: 129 | decomp.append(None) 130 | continue 131 | 132 | d = u.decomp 133 | if d.map is not None: 134 | d_len = len(d.map) 135 | 136 | if d_len > 1: 137 | d_data = len(decomp_map) 138 | decomp_map.extend(d.map) 139 | else: 140 | d_data = d.map[0] 141 | 142 | decomp.append((d.type, d_len, d_data)) 143 | 144 | elif d.type == 'hangul': 145 | decomp.append(('hangul', 2, 0)) 146 | 147 | else: 148 | decomp.append(None) 149 | 150 | 151 | # From Unicode-8.0 Section 3.12 Conjoining Jamo Behavior 152 | HANGUL_SBASE = 0xAC00 153 | HANGUL_LBASE = 0x1100 154 | HANGUL_VBASE = 0x1161 155 | HANGUL_TBASE = 0x11A7 156 | HANGUL_LCOUNT = 19 157 | HANGUL_VCOUNT = 21 158 | HANGUL_TCOUNT = 28 159 | HANGUL_NCOUNT = (HANGUL_VCOUNT * HANGUL_TCOUNT) 160 | HANTUL_SCOUNT 
= (HANGUL_LCOUNT * HANGUL_NCOUNT) 161 | 162 | 163 | def hangul_decompose(code): 164 | sindex = code - HANGUL_SBASE 165 | lindex = sindex // HANGUL_NCOUNT 166 | vindex = (sindex % HANGUL_NCOUNT) // HANGUL_TCOUNT 167 | tindex = sindex % HANGUL_TCOUNT 168 | lpart = HANGUL_LBASE + lindex 169 | vpart = HANGUL_VBASE + vindex; 170 | tpart = HANGUL_TBASE + tindex; 171 | if tindex > 0: 172 | return (lpart, vpart, tpart) 173 | else: 174 | return (lpart, vpart) 175 | 176 | 177 | # get a character's decomposition if one exists, as a tuple 178 | def decompose(code, compat=True): 179 | dc = decomp[code] 180 | if dc is None: 181 | return None 182 | t = dc[0] 183 | l = dc[1] 184 | if not compat and t is not None and t != 'hangul': 185 | return None 186 | if l == 1: 187 | return (dc[2],) 188 | elif dc[0] != 'hangul': 189 | o = dc[2] 190 | return tuple(decomp_map[o:o + l]) 191 | else: 192 | return hangul_decompose(code) 193 | --------------------------------------------------------------------------------