├── README.md
├── readme_ja.txt
├── readme_en.txt
├── LICENSE
├── license.txt
├── unicode
    ├── readme_ja.txt
    ├── readme_en.txt
    ├── ucfdataout2.cpp
    └── updataout3.cpp
├── misc
    └── conftest.cpp
├── history_ja.txt
└── history_en.txt


/README.md:
--------------------------------------------------------------------------------
 1 | SRELL (**s**td::**r**eg**e**x-**l**ike **l**ibrary) is a regular expression template library for C++ and has native support for UTF-8, UTF-16, and UTF-32.
 2 | 
 3 | **This is an up-to-date mirror!**
 4 | 
 5 | Please read `readme_en.txt` or `readme_ja.txt`.
 6 | 
 7 | If you find a bug, please send a report to the author rather than opening a new issue here. The author's email address can be found at the bottom of the [SRELL](https://www.akenotsuki.com/misc/srell/en) page.
 8 | 
 9 | ---
10 | 


--------------------------------------------------------------------------------
/readme_ja.txt:
--------------------------------------------------------------------------------
 1 | ■使用法
 2 | 
 3 | 次のファイルを同じディレクトリに置き、srell.hppをincludeするだけです。
 4 | ・srell.hpp
 5 | ・srell_ucfdata2.h（case folding用データ）
 6 | ・srell_updata3.h（Unicode property用データ）
 7 | 
 8 | ■付属物
 9 | 以下のディレクトリ内にあるものはおまけのようなものです。
10 | SRELL側からは参照していませんので、削除してしまってもライブラリの動作に
11 | 影響はありません。
12 | 
13 | ・misc
14 |   テストプログラムのソースが入っています。
15 | 
16 | ・single-header
17 |   srell.hppの中にsrell_ucfdata2.hとsrell_updata3.hとを埋め込んで、これ単
18 |   体で使用できるようにしたstandalone版が入っています。
19 | 
20 | ・unicode
21 |   最新のUnicodeデータからsrell_ucfdata2.h及びsrell_updata3.hを作るための
22 |   プログラムのソースが入っています。
23 | 
24 | 


--------------------------------------------------------------------------------
/readme_en.txt:
--------------------------------------------------------------------------------
 1 | How to Use
 2 | 
 3 |   Put the following three files in one directory, and include srell.hpp.
 4 |   1. srell.hpp
 5 |   2. srell_ucfdata2.h (data for case folding)
 6 |   3. srell_updata3.h (data for Unicode properties)
 7 | 
 8 | The files in the following directories are supplements. As SRELL does not use
 9 | them, it is safe to remove them.
10 | 
11 | * misc
12 |   Contains source code files for a conformance test program.
13 | 
14 | * single-header
15 |   Contains a standalone version of srell.hpp into which srell_ucfdata2.h
16 |   and srell_updata3.h have been merged.
17 | 
18 | * unicode
19 |   Contains source code files for programs that generate srell_ucfdata2.h
20 |   and srell_updata3.h from the latest Unicode data text files.
21 | 
22 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 2-Clause License
 2 | 
 3 | Copyright (c) 2012-2024, Nozomu Katoo
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | 1. Redistributions of source code must retain the above copyright notice, this
10 |    list of conditions and the following disclaimer.
11 | 
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 |    this list of conditions and the following disclaimer in the documentation
14 |    and/or other materials provided with the distribution.
15 | 
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 


--------------------------------------------------------------------------------
/license.txt:
--------------------------------------------------------------------------------
 1 | /*****************************************************************************
 2 | **
 3 | **  SRELL (std::regex-like library) version 4.100
 4 | **
 5 | **  Copyright (c) 2012-2025, Nozomu Katoo. All rights reserved.
 6 | **
 7 | **  Redistribution and use in source and binary forms, with or without
 8 | **  modification, are permitted provided that the following conditions are
 9 | **  met:
10 | **
11 | **  1. Redistributions of source code must retain the above copyright notice,
12 | **     this list of conditions and the following disclaimer.
13 | **
14 | **  2. Redistributions in binary form must reproduce the above copyright
15 | **     notice, this list of conditions and the following disclaimer in the
16 | **     documentation and/or other materials provided with the distribution.
17 | **
18 | **  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS
19 | **  IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
20 | **  THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21 | **  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
22 | **  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 | **  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 | **  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25 | **  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26 | **  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27 | **  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 | **  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | **
30 | ******************************************************************************
31 | */
32 | 
33 | 


--------------------------------------------------------------------------------
/unicode/readme_ja.txt:
--------------------------------------------------------------------------------
 1 | ■同梱物について
 2 | 
 3 |   1. ucfdataout2.cpp
 4 |   2. updataout3.cpp
 5 | 
 6 |   これらはISO-646/US-ASCII互換の環境でコンパイル、実行される必要があります。
 7 | 
 8 | ----
 9 | 1. ucfdataout2.cpp
10 | 
11 |   srell_ucfdata2.hの最新版を作成するプログラムのソースファイルです。SRELLは
12 | case-insensitiveな（大文字小文字の違いを無視した）照合を行うために、この
13 | srell_ucfdata2.hを必要とします。
14 | 
15 |   ucfdataout2は、Unicode Consortiumより提供されているCaseFolding.txtというテキ
16 | ストデータからsrell_ucfdata2.hを生成します。
17 | 
18 |   +---------------------------------------------------------------------------
19 |   | CaseFolding.txtとは
20 |   |
21 |   |   Case-insensitiveな照合を行う際には、大文字小文字の違いを吸収するために
22 |   | "case-folding" と呼ばれる処理が行われます。Unicode規格に基づいた
23 |   | case-foldingを行うために、Unicode Consortiumから提供されているのが
24 |   | CaseFolding.txtです。
25 |   |
26 |   |   このデータファイルはUnicode規格がアップデートされるとそれに合わせて
27 |   | アップデートされる可能性があります。
28 |   |
29 |   +---------------------------------------------------------------------------
30 | 
31 |   1-1. 使用方法
32 | 
33 |     1) ucfdataout2.cppをコンパイルします。
34 |     2) 最新版のCaseFolding.txtを次のURLより取得します。
35 |        http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
36 |     3) CaseFolding.txtと、1)で作成したバイナリとを同じフォルダに置いて
37 |        バイナリを実行します。
38 |     4) srell_ucfdata2.hが生成されますので、それをSRELLの置かれているディレクト
39 |        リへと移動させます。
40 | 
41 |   1-2. 互換性
42 | 
43 |     srell_ucfdata2.h:   SRELL 4.030以降。拡張子が異なるだけで中身は
44 |                         srell_ucfdata2.hppに同じ。
45 |     srell_ucfdata2.hpp: SRELL 2.500から4.029まで。
46 |     srell_ucfdata.hpp:  SRELL 2.401まで。
47 | 
48 | ----
49 | 2. updataout3.cpp
50 | 
51 |   srell_updata3.hの最新版を作成するプログラムのソースファイルです。SRELLは
52 | Unicode property escapes（\p{...} と \P{...}）を含む正規表現と文字列との照合
53 | を行うために、このsrell_updata3.hを必要とします。
54 | 
55 |   updataout3は、Unicode Consortiumより提供されている次のテキストデータから
56 | srell_updata3.hを生成します。
57 | 
58 |   ・DerivedCoreProperties.txt
59 |   ・DerivedNormalizationProps.txt
60 |   ・emoji-data.txt
61 |   ・PropertyValueAliases.txt
62 |   ・PropList.txt
63 |   ・ScriptExtensions.txt
64 |   ・Scripts.txt
65 |   ・UnicodeData.txt
66 |   ・emoji-sequences.txt
67 |   ・emoji-zwj-sequences.txt
68 | 
69 |   先述のCaseFolding.txt同様、これらのテキストデータファイルもUnicode規格が
70 | アップデートされるとそれに合わせてアップデートされる可能性があります。
71 | 
72 |   2-1. 使用方法
73 | 
74 |     1) updataout3.cppをコンパイルします。
75 |     2) 前記テキストファイルの最新版を次のURLより取得します。
76 |        a. emoji-data.txt: http://www.unicode.org/Public/UNIDATA/emoji/
77 |        b. emoji-sequences.txt と emoji-zwj-sequences.txt:
78 |           http://www.unicode.org/Public/emoji/(ヴァージョン番号)/
79 |        c. その他: http://www.unicode.org/Public/UNIDATA/
80 |     3) これらのテキストファイルと、1)で作成したバイナリとを同じフォルダに
81 |        置いてバイナリを実行します。
82 |     4) srell_updata3.hが生成されますので、それをSRELLの置かれているディレクト
83 |        リへと移動させます。
84 | 
85 |   2-2. 互換性
86 | 
87 |     srell_updata3.h:   SRELL 4.030以降。
88 |     srell_updata2.hpp: SRELL 4.000から4.029まで。
89 |     srell_updata.hpp:  SRELL 3.018まで。
90 | 
91 | 


--------------------------------------------------------------------------------
/unicode/readme_en.txt:
--------------------------------------------------------------------------------
 1 | Contents of this directory:
 2 | 
 3 |   1. ucfdataout2.cpp
 4 |   2. updataout3.cpp
 5 | 
 6 |   These need to be compiled and run on a system that supports
 7 |   an ISO-646/US-ASCII compatible encoding.
 8 | 
 9 | ----
10 | 1. ucfdataout2.cpp
11 | 
12 |   This is a C++ source file for a program that generates a newer version
13 | of srell_ucfdata2.h, which SRELL includes for case-insensitive matching. It
14 | is generated by ucfdataout2 with CaseFolding.txt provided by the Unicode
15 | Consortium.
16 | 
17 |   +---------------------------------------------------------------------------
18 |   | What is CaseFolding.txt?
19 |   |
20 |   |   It is a data file needed for case-insensitive matching based on the
21 |   | Unicode Standard. Whenever a new version of the Unicode Standard is
22 |   | released, CaseFolding.txt may also be updated accordingly.
23 |   |
24 |   +---------------------------------------------------------------------------
25 | 
26 |   1-1. Usage
27 | 
28 |     1) compile ucfdataout2.cpp,
29 |     2) get the latest version of CaseFolding.txt, which is available at
30 |        http://www.unicode.org/Public/UNIDATA/CaseFolding.txt ,
31 |     3) put CaseFolding.txt and a binary file generated at 1) in the same
32 |        directory and run the binary file,
33 |     4) move the newly generated "srell_ucfdata2.h" to the directory where
34 |        SRELL is located.
35 | 
36 |   1-2. Compatibility
37 | 
38 |     srell_ucfdata2.h:   SRELL 4.030-. The format is the same as
39 |                         srell_ucfdata2.hpp. Only the suffix is changed.
40 |     srell_ucfdata2.hpp: SRELL 2.500-4.029
41 |     srell_ucfdata.hpp:  SRELL -2.401
42 | 
43 | ----
44 | 2. updataout3.cpp
45 | 
46 |   This is a C++ source file for a program that generates a newer version
47 | of srell_updata3.h, which SRELL includes for Unicode property escapes
48 | (\p{...} and \P{...}). It is generated by updataout3 with the following text
49 | files provided by the Unicode Consortium:
50 | 
51 |   * DerivedCoreProperties.txt
52 |   * DerivedNormalizationProps.txt
53 |   * emoji-data.txt
54 |   * PropertyValueAliases.txt
55 |   * PropList.txt
56 |   * ScriptExtensions.txt
57 |   * Scripts.txt
58 |   * UnicodeData.txt
59 |   * emoji-sequences.txt
60 |   * emoji-zwj-sequences.txt
61 | 
62 |   As well as CaseFolding.txt mentioned above, these files may be updated
63 | accordingly whenever a new version of the Unicode Standard is released.
64 | 
65 |   2-1. Usage
66 | 
67 |     1) compile updataout3.cpp,
68 |     2) get the latest versions of the text files mentioned above, which are
69 |        available at:
70 |        a. emoji-data.txt: http://www.unicode.org/Public/UNIDATA/emoji/
71 |        b. emoji-sequences.txt and emoji-zwj-sequences.txt:
72 |           http://www.unicode.org/Public/emoji/(version number)/
73 |        c. others: http://www.unicode.org/Public/UNIDATA/ ,
74 |     3) put the text files and a binary file generated at 1) in the same
75 |        directory and run the binary file,
76 |     4) move the newly generated "srell_updata3.h" to the directory where
77 |        SRELL is located.
78 | 
79 |   2-2. Compatibility
80 | 
81 |     srell_updata3.h:   SRELL 4.030-
82 |     srell_updata2.hpp: SRELL 4.000-4.029
83 |     srell_updata.hpp:  SRELL -3.018
84 | 
85 | 


--------------------------------------------------------------------------------
/unicode/ucfdataout2.cpp:
--------------------------------------------------------------------------------
  1 | //
  2 | //  ucfdataout.cpp: version 2.104 (2024/07/20).
  3 | //
  4 | //  This is a program that generates srell_ucfdata2.h from CaseFolding.txt
  5 | //  provided by the Unicode Consortium. The latest version is available at:
  6 | //  http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
  7 | //
  8 | 
#include <cstdio>
#include <cstdlib>
#include <string>
#include <vector>
#include <map>
#include <algorithm>	//  For std::swap in C++98/03
#include <utility>	//  For std::swap in C++11-
#define SRELL_NO_UNICODE_DATA
#include "../srell.hpp"
 17 | 
 18 | #if defined(_MSC_VER) && _MSC_VER >= 1400
 19 | #pragma warning(disable:4996)
 20 | #endif
 21 | 
 22 | namespace unishared
 23 | {
 24 | template <typename T>
 25 | std::string to_string(T value, int radix = 10, const int precision = 1)
 26 | {
 27 | 	std::string num;
 28 | 
 29 | 	if (radix >= 2 && radix <= 16)
 30 | 	{
 31 | 		typedef typename std::string::size_type size_type;
 32 | 		const bool minus = value < 0 ? (value = 0 - value, true) : false;
 33 | 
 34 | 		for (; value; value /= radix)
 35 | 			num.push_back("0123456789ABCDEF"[value % radix]);
 36 | 
 37 | 		if (precision > 0 && num.size() < static_cast<size_type>(precision))
 38 | 			num.append(static_cast<size_type>(precision) - num.size(), static_cast<char>('0'));
 39 | 
 40 | 		if (minus)
 41 | 			num.push_back(static_cast<char>('-'));
 42 | 
 43 | 		const size_type mid = num.size() / 2;
 44 | 
 45 | 		for (size_type i = 0; i < mid; ++i)
 46 | 			std::swap(num[i], num[num.size() - i - 1]);
 47 | 	}
 48 | 	return num;
 49 | }
 50 | 
 51 | bool read_file(std::string &str, const char *const filename, const char *const dir)
 52 | {
 53 | 	const std::string path(std::string(dir ? dir : "") + filename);
 54 | 	FILE *const fp = std::fopen(path.c_str(), "r");
 55 | 
 56 | 	std::fprintf(stdout, "Reading '%s'... ", path.c_str());
 57 | 
 58 | 	if (fp)
 59 | 	{
 60 | 		static const std::size_t bufsize = 4096;
 61 | 		char *const buffer = static_cast<char *>(std::malloc(bufsize));
 62 | 
 63 | 		if (buffer)
 64 | 		{
 65 | 			for (;;)
 66 | 			{
 67 | 				const std::size_t size = std::fread(buffer, 1, bufsize, fp);
 68 | 
 69 | 				if (!size)
 70 | 					break;
 71 | 
 72 | 				str.append(buffer, size);
 73 | 			}
 74 | 			std::fclose(fp);
 75 | 			std::fputs("done.\n", stdout);
 76 | 			std::free(buffer);
 77 | 			return true;
 78 | 		}
 79 | 	}
 80 | 	std::fputs("failed...\n", stdout);
 81 | 	return false;
 82 | }
 83 | 
 84 | bool write_file(const char *const filename, const std::string &str)
 85 | {
 86 | 	FILE *const fp = std::fopen(filename, "wb");
 87 | 
 88 | 	std::fprintf(stdout, "Writing '%s'... ", filename);
 89 | 
 90 | 	if (fp)
 91 | 	{
 92 | 		const bool success = std::fwrite(str.c_str(), 1, str.size(), fp) == str.size();
 93 | 		std::fclose(fp);
 94 | 		if (success)
 95 | 		{
 96 | 			std::fputs("done.\n", stdout);
 97 | 			return true;
 98 | 		}
 99 | 	}
100 | 	std::fputs("failed...\n", stdout);
101 | 	return false;
102 | }
103 | }
104 | //  namespace unishared
105 | 
//  Command line options of ucfdataout2.
//
//  Recognised options (each may begin with '-' or '/'):
//    i <FILE>        Sets infilename.
//    o <FILE>        Sets outfilename.
//    id <DIRECTORY>  Sets indir.
//    ? or h          Prints the usage and stops parsing.
//
//  The string members point either to string literals or into argv, so argv
//  has to outlive this object.
struct ucf_options
{
	const char *infilename;	//  Input file. Default: "CaseFolding.txt".
	const char *outfilename;	//  Output file. Default: "srell_ucfdata2.h".
	const char *indir;	//  Prefix put before infilename. Default: "".
	int version;	//  Value written as SRELL_UCFDATA_VERSION.
	int errorno;	//  0: no error; 1: usage was printed; -1: unknown
			//  option; -2: an option lacks its argument.

	//  Parses argv[1] to argv[argc - 1]. Errors are reported to stdout and
	//  recorded in errorno; parsing goes on after an error, so that errorno
	//  reflects the last error found.
	ucf_options(const int argc, const char *const *const argv)
		: infilename("CaseFolding.txt")
		, outfilename("srell_ucfdata2.h")
		, indir("")
		, version(201)
		, errorno(0)
	{
		for (int index = 1; index < argc; ++index)
		{
			const char *const arg = argv[index];
			const std::string option((arg[0] == '-' || arg[0] == '/') ? arg + 1 : "");

			if (option == "?" || option == "h")
			{
				std::fputs("Usage: ucfdataout2 [options]\nOptions:\n", stdout);
				std::fputs("  -i <FILE>\t\tRead data from <FILE>.\n", stdout);
				std::fputs("  -id <DIRECTORY>\tAssume that input file exist in <DIRECTORY>.\n\t\t\t<DIRECTORY> must ends with '/' or '\\'.\n", stdout);
				std::fputs("  -o <FILE>\t\tOutput to <FILE>.\n", stdout);
				errorno = 1;
				return;
			}

			//  The member that receives the argument of the option.
			const char **const target
				= option == "i" ? &infilename
				: option == "o" ? &outfilename
				: option == "id" ? &indir
				: NULL;

			if (!target)
			{
				//  Neither an option nor the argument of one.
				std::fprintf(stdout, "[Error] unknown option \"%s\" found.\n", arg);
				errorno = -1;
			}
			else if (index + 1 >= argc)
			{
				//  The option is the last element of argv.
				std::fprintf(stdout, "[Error] no argument for \"%s\" specified.\n", arg);
				errorno = -2;
			}
			else
				*target = argv[++index];
		}
	}
};
179 | //  struct ucf_options
180 | 
181 | class unicode_casefolding
182 | {
183 | public:
184 | 
185 | 	unicode_casefolding()
186 | 		: maxdelta_(0L), maxdelta_cp_(0L), ucf_maxcodepoint_(0L), rev_maxcodepoint_(0L)
187 | 		, ucf_numofsegs_(1U), rev_numofsegs_(1U), numofcps_from_(0U), numofcps_to_(0U)
188 | 		, max_appearance_(0U), nextoffset_(0x100L), rev_charsets_(1, -1L)
189 | 	{
190 | 	}
191 | 
192 | 	int create_ucfdata(std::string &outdata, const ucf_options &opts)
193 | 	{
194 | 		const std::string indent("\t\t\t");
195 | 		int errorno = opts.errorno;
196 | 		std::string buf;
197 | 
198 | 		if (errorno)
199 | 			return errorno;
200 | 
201 | 		if (unishared::read_file(buf, opts.infilename, opts.indir))
202 | 		{
203 | 			static const srell::regex re_line("^.*$", srell::regex::multiline);
204 | 			static const srell::regex re_license("^# (.*)$");
205 | 			static const srell::regex re_cfdata("^\\s*([0-9A-Fa-f]+); ([CS]); ([0-9A-Fa-f]+);\\s*#\\s*(.*)$");
206 | 			static const srell::regex re_comment_or_emptyline("^#.*|^$");
207 | 			srell::cregex_iterator2 iter(buf.c_str(), buf.c_str() + buf.size(), re_line);
208 | 			srell::cmatch match;
209 | 			int colcount = 0;
210 | 
211 | 			for (; !iter.done(); ++iter)
212 | 			{
213 | 				if (iter->length(0))
214 | 				{
215 | 					if (!srell::regex_match((*iter)[0].first, (*iter)[0].second, match, re_license))
216 | 					{
217 | 						outdata.append(1, '\n');
218 | 						break;
219 | 					}
220 | 					outdata += "//  " + match.str(1) + "\n";
221 | 				}
222 | 			}
223 | 
224 | 			outdata += "template <typename T1, typename T2>\nstruct unicode_casefolding\n{\n";
225 | 
226 | 			for (; !iter.done(); ++iter)
227 | 			{
228 | 				if (srell::regex_match((*iter)[0].first, (*iter)[0].second, match, re_cfdata))
229 | 				{
230 | 					const std::string from(match[1].str());
231 | 					const std::string to(match[3].str());
232 | 					const std::string type(match[2].str());
233 | 					const std::string name(match[4].str());
234 | 
235 | 					update(from, to);
236 | 				}
237 | 			}
238 | 
239 | 			if (colcount > 0)
240 | 				outdata.append(1, '\n');
241 | 
242 | 			outdata += "\tstatic const T1 ucf_maxcodepoint = 0x" + unishared::to_string(ucf_maxcodepoint_, 16, 4) + ";\n";
243 | 			outdata += "\tstatic const T2 ucf_deltatablesize = 0x" + unishared::to_string(ucf_numofsegs_ << 8, 16) + ";\n";
244 | 
245 | 			outdata += "\tstatic const T1 rev_maxcodepoint = 0x" + unishared::to_string(rev_maxcodepoint_, 16, 4) + ";\n";
246 | 			outdata += "\tstatic const T2 rev_indextablesize = 0x" + unishared::to_string(rev_numofsegs_ << 8, 16) + ";\n";
247 | 			outdata += "\tstatic const T2 rev_charsettablesize = " + unishared::to_string(numofcps_to_ * 2 + numofcps_from_ + 1) + ";\t//  1 + " + unishared::to_string(numofcps_to_) + " * 2 + " + unishared::to_string(numofcps_from_) + "\n";
248 | 			outdata += "\tstatic const T2 rev_maxset = " + unishared::to_string(maxset()) + ";\n";
249 | 			outdata += "\tstatic const T1 eos = 0;\n";
250 | 
251 | 			outdata += "\n\tstatic const T1 ucf_deltatable[];\n\tstatic const T2 ucf_segmenttable[];\n\tstatic const T2 rev_indextable[];\n\tstatic const T2 rev_segmenttable[];\n\tstatic const T1 rev_charsettable[];\n";
252 | 
253 | 			outdata += "};\ntemplate <typename T1, typename T2>\n\tconst T1 unicode_casefolding<T1, T2>::ucf_maxcodepoint;\ntemplate <typename T1, typename T2>\n\tconst T2 unicode_casefolding<T1, T2>::ucf_deltatablesize;\ntemplate <typename T1, typename T2>\n\tconst T1 unicode_casefolding<T1, T2>::rev_maxcodepoint;\ntemplate <typename T1, typename T2>\n\tconst T2 unicode_casefolding<T1, T2>::rev_indextablesize;\ntemplate <typename T1, typename T2>\n\tconst T2 unicode_casefolding<T1, T2>::rev_charsettablesize;\ntemplate <typename T1, typename T2>\n\tconst T2 unicode_casefolding<T1, T2>::rev_maxset;\ntemplate <typename T1, typename T2>\n\tconst T1 unicode_casefolding<T1, T2>::eos;\n\n";
254 | 			out_v2tables(outdata);
255 | 			outdata += "#define SRELL_UCFDATA_VERSION " + unishared::to_string(static_cast<unsigned int>(opts.version)) + "\n";
256 | 
257 | 			std::fprintf(stdout, "MaxDelta: %+ld (U+%.4lX->U+%.4lX)\n", maxdelta_, maxdelta_cp_, maxdelta_cp_ + maxdelta_);
258 | 		}
259 | 		else
260 | 			errorno = 1;
261 | 
262 | 		return errorno;
263 | 	}
264 | 
265 | private:
266 | 
267 | 	void update(const std::string &from, const std::string &to)
268 | 	{
269 | 		const long cp_from = std::strtol(from.c_str(), NULL, 16);
270 | 		const long cp_to = std::strtol(to.c_str(), NULL, 16);
271 | 		const long delta = cp_to - cp_from;
272 | 		const long segno_from = cp_from >> 8;
273 | 		const long segno_to = cp_to >> 8;
274 | 
275 | 		update_tables(cp_from, cp_to, segno_from);
276 | 
277 | 		++numofcps_from_;
278 | 		if (std::abs(maxdelta_) < std::abs(delta))
279 | 		{
280 | 			maxdelta_cp_ = cp_from;
281 | 			maxdelta_ = delta;
282 | 		}
283 | 
284 | 		if (ucf_maxcodepoint_ < cp_from)
285 | 			ucf_maxcodepoint_ = cp_from;
286 | 
287 | 		if (rev_maxcodepoint_ < cp_to)
288 | 			rev_maxcodepoint_ = cp_to;
289 | 
290 | 		if (rev_maxcodepoint_ < cp_from)
291 | 			rev_maxcodepoint_ = cp_from;
292 | 
293 | 		if (!ucf_countedsegnos.count(segno_from))
294 | 		{
295 | 			ucf_countedsegnos[segno_from] = 1;
296 | 			++ucf_numofsegs_;
297 | 		}
298 | 
299 | 		if (!rev_countedsegnos.count(segno_to))
300 | 		{
301 | 			rev_countedsegnos[segno_to] = 1;
302 | 			++rev_numofsegs_;
303 | 		}
304 | 		if (!rev_countedsegnos.count(segno_from))
305 | 		{
306 | 			rev_countedsegnos[segno_from] = 1;
307 | 			++rev_numofsegs_;
308 | 		}
309 | 
310 | 		if (!cps_counted_as_foldedto.count(cp_to))
311 | 		{
312 | 			cps_counted_as_foldedto[cp_to] = 1;
313 | 			++numofcps_to_;
314 | 		}
315 | 
316 | 		if (appearance_counts_.count(to))
317 | 			++appearance_counts_[to];
318 | 		else
319 | 			appearance_counts_[to] = 1;
320 | 
321 | 		if (max_appearance_ < appearance_counts_[to])
322 | 			max_appearance_ = appearance_counts_[to];
323 | 	}
324 | 
325 | 	unsigned int maxset() const
326 | 	{
327 | 		return max_appearance_ + 1;
328 | 	}
329 | 
330 | 	void out_v2tables(std::string &outdata)
331 | 	{
332 | 		const char *const headers[] = {
333 | 			"template <typename T1, typename T2>\nconst ",
334 | 			" unicode_casefolding<T1, T2>::",
335 | 			"[] =\n{\n"
336 | 		};
337 | 
338 | 		create_revtables();
339 | 		out_lowertable(outdata, headers, "T1", "ucf_deltatable", ucf_deltas_, ucf_segments_);
340 | 		outdata.append(1, '\n');
341 | 		out_uppertable(outdata, headers, "T2", "ucf_segmenttable", ucf_segments_);
342 | 		outdata.append(1, '\n');
343 | 		out_lowertable(outdata, headers, "T2", "rev_indextable", rev_indices_, rev_segments_);
344 | 		outdata.append(1, '\n');
345 | 		out_uppertable(outdata, headers, "T2", "rev_segmenttable", rev_segments_);
346 | 		outdata.append(1, '\n');
347 | 		out_cstable(outdata, headers, "T1", "rev_charsettable", rev_charsets_);
348 | 	}
349 | 
350 | 	//  Updates ucf_segments_, ucf_deltas_, and rev_charsets_.
351 | 	void update_tables(const long cp_from, const long cp_to, const long segno_from)
352 | 	{
353 | 		if (segno_from >= static_cast<long>(ucf_segments_.size()))
354 | 			ucf_segments_.resize(segno_from + 1, 0L);
355 | 
356 | 		long &offset_of_segment = ucf_segments_[segno_from];
357 | 
358 | 		if (offset_of_segment == 0L)
359 | 		{
360 | 			offset_of_segment = nextoffset_;
361 | 			nextoffset_ += 0x100L;
362 | 			ucf_deltas_.resize(nextoffset_, 0L);
363 | 		}
364 | 
365 | 		ucf_deltas_[offset_of_segment + (cp_from & 0xffL)] = cp_to - cp_from;
366 | 
367 | 		for (long index = 0L;; ++index)
368 | 		{
369 | 			if (index == static_cast<long>(rev_charsets_.size()))
370 | 			{
371 | 				rev_charsets_.push_back(cp_to);
372 | 				rev_charsets_.push_back(cp_from);
373 | 				rev_charsets_.push_back(-1L);
374 | 				break;
375 | 			}
376 | 			if (rev_charsets_[index] == cp_to)
377 | 			{
378 | 				for (++index; rev_charsets_[index] != -1L; ++index);
379 | 
380 | 				rev_charsets_.insert(index, 1, cp_from);
381 | 				break;
382 | 			}
383 | 		}
384 | 	}
385 | 
386 | 	//  Creates rev_segments_ and rev_indices_ from rev_charsets_.
387 | 	void create_revtables()
388 | 	{
389 | 		long nextoffset = 0x100L;
390 | 		for (long index = 0L; index < static_cast<long>(rev_charsets_.size()); ++index)
391 | 		{
392 | 			const long bocs = index;	//  Beginning of charset.
393 | 
394 | 			for (; rev_charsets_[index] != -1L; ++index)
395 | 			{
396 | 				const long &u21ch = rev_charsets_[index];
397 | 				const long segno = u21ch >> 8L;
398 | 
399 | 				if (segno >= static_cast<long>(rev_segments_.size()))
400 | 					rev_segments_.resize(segno + 1, 0L);
401 | 
402 | 				long &offset_of_segment = rev_segments_[segno];
403 | 
404 | 				if (offset_of_segment == 0L)
405 | 				{
406 | 					offset_of_segment = nextoffset;
407 | 					nextoffset += 0x100L;
408 | 					rev_indices_.resize(nextoffset, 0L);
409 | 				}
410 | 				rev_indices_[offset_of_segment + (u21ch & 0xffL)] = bocs;
411 | 			}
412 | 		}
413 | 	}
414 | 
415 | 	void out_lowertable(std::string &outdata, const char *const headers[], const char *const type, const char *const funcname, const std::basic_string<long> &table, const std::basic_string<long> &segtable) const
416 | 	{
417 | 		const long end = static_cast<long>(table.size());
418 | 
419 | 		outdata += headers[0];
420 | 		outdata += type;
421 | 		outdata += headers[1];
422 | 		outdata += funcname;
423 | 		outdata += headers[2];
424 | 
425 | 		for (long i = 0L; i < end;)
426 | 		{
427 | 			const long col = i & 15L;
428 | 
429 | 			if ((i & 255L) == 0)
430 | 			{
431 | 				if (i != 0L)
432 | 				{
433 | 					for (long j = 0L; j < static_cast<long>(segtable.size()); ++j)
434 | 					{
435 | 						if (segtable[j] == i)
436 | 						{
437 | 							outdata += "\n\t//  For u+" + unishared::to_string(j, 16, 2) + "xx (" + unishared::to_string(i) + ")\n";
438 | 							break;
439 | 						}
440 | 					}
441 | 				}
442 | 				else
443 | 					outdata += "\t//  For common (0)\n";
444 | 			}
445 | 
446 | 			outdata += col == 0 ? "\t" : (col & 3) == 0 ? "  " : " ";
447 | 			if (table[i] >= 0L)
448 | 				outdata += unishared::to_string(table[i]);
449 | 			else
450 | 				outdata += "static_cast<", outdata += type, outdata += ">(", outdata += unishared::to_string(table[i]) + ")";
451 | 
452 | 			if (++i == end)
453 | 				outdata.append(1, '\n');
454 | 			else if (col == 15L)
455 | 				outdata += ",\n";
456 | 			else
457 | 				outdata.append(1, ',');
458 | 		}
459 | 		outdata += "};\n";
460 | 	}
461 | 
462 | 	void out_uppertable(std::string &outdata, const char *const headers[], const char *const type, const char *const funcname, const std::basic_string<long> &table) const
463 | 	{
464 | 		int end = static_cast<int>(table.size());
465 | 
466 | 		outdata += headers[0];
467 | 		outdata += type;
468 | 		outdata += headers[1];
469 | 		outdata += funcname;
470 | 		outdata += headers[2];
471 | 
472 | 		for (int i = 0; i < end;)
473 | 		{
474 | 			const int col = i & 15;
475 | 
476 | 			outdata += col == 0 ? "\t" : (col & 3) == 0 ? "  " : " ";
477 | 			if (table[i] >= 0)
478 | 				outdata += unishared::to_string(table[i]);
479 | 			else
480 | 				outdata += "static_cast<", outdata += type, outdata += ">(", outdata += unishared::to_string(table[i]) + ")";
481 | 
482 | 			if (++i == end)
483 | 				outdata.append(1, '\n');
484 | 			else if (col == 15)
485 | 				outdata += ",\n";
486 | 			else
487 | 				outdata.append(1, ',');
488 | 		}
489 | 		outdata += "};\n";
490 | 	}
491 | 
492 | 	void out_cstable(std::string &outdata, const char *const headers[], const char *const type, const char *const funcname, const std::basic_string<long> &table) const
493 | 	{
494 | 		int end = static_cast<int>(table.size());
495 | 		bool newline = true;
496 | 		int bos = 0;
497 | 		int prevprintedbos = -1;
498 | 
499 | 		outdata += headers[0];
500 | 		outdata += type;
501 | 		outdata += headers[1];
502 | 		outdata += funcname;
503 | 		outdata += headers[2];
504 | 
505 | 		for (int i = 0; i < end;)
506 | 		{
507 | 			const long val = table[i];
508 | 
509 | 			outdata += newline ? "\t" : " ";
510 | 			newline = false;
511 | 
512 | 			if (val == -1L)
513 | 				outdata += "eos";
514 | 			else
515 | 				outdata += "0x", outdata += unishared::to_string(val, 16, 4);
516 | 
517 | 			if (++i != end)
518 | 				outdata.append(1, ',');
519 | 
520 | 			if (val == -1L)
521 | 			{
522 | 				if (prevprintedbos != bos / 10 || i == end)
523 | 				{
524 | 					outdata += "\t//  ";
525 | 					outdata += unishared::to_string(bos);
526 | 					prevprintedbos = bos / 10;
527 | 				}
528 | 				outdata.append(1, '\n');
529 | 				newline = true;
530 | 				bos = i;
531 | 			}
532 | 		}
533 | 		outdata += "};\n";
534 | 	}
535 | 
536 | 	typedef std::map<long, char> flagset_type;
537 | 
538 | 	long maxdelta_;	//  = 0L;
539 | 	long maxdelta_cp_;	//  = 0L;
540 | 	long ucf_maxcodepoint_;	//  = 0L;	//  The max code point for case-folding.
541 | 	long rev_maxcodepoint_;	//  = 0L;	//  The max code point for reverse lookup.
542 | 	unsigned int ucf_numofsegs_;	//  = 1U;	//  The number of segments in the delta table.
543 | 	unsigned int rev_numofsegs_;	//  = 1U;	//  The number of segments in the table for reverse lookup.
544 | 	unsigned int numofcps_from_;	//  = 0U;	//  The number of code points in "folded from"s.
545 | 	unsigned int numofcps_to_;	//  = 0U;	//  The number of code points in "folded to"s.
546 | 
547 | 	flagset_type ucf_countedsegnos;	//  The set of segment nos marked as "counted" for case-folding.
548 | 	flagset_type rev_countedsegnos;	//  The set of segment nos marked as "counted" for reverse lookup.
549 | 	flagset_type cps_counted_as_foldedto;	//  The set of code points marked as "folded to".
550 | 
551 | 	unsigned int max_appearance_;
552 | 	std::map<std::string, unsigned int> appearance_counts_;
553 | 
554 | 	long nextoffset_;
555 | 	std::basic_string<long> ucf_deltas_;
556 | 	std::basic_string<long> ucf_segments_;
557 | 	std::basic_string<long> rev_indices_;
558 | 	std::basic_string<long> rev_segments_;
559 | 	std::basic_string<long> rev_deltas_;
560 | 	std::basic_string<long> rev_charsets_;
561 | };
562 | //  class unicode_casefolding
563 | 
564 | int main(const int argc, const char *const *const argv)
565 | {
566 | 	ucf_options ucfopts(argc, argv);
567 | 	std::string outdata;
568 | 	unicode_casefolding ucf;
569 | 	int errorno = ucf.create_ucfdata(outdata, ucfopts);
570 | 
571 | 	if (errorno == 0)
572 | 	{
573 | 		if (!unishared::write_file(ucfopts.outfilename, outdata))
574 | 			errorno = 2;
575 | 	}
576 | 	return errorno;
577 | }
578 | 


--------------------------------------------------------------------------------
/misc/conftest.cpp:
--------------------------------------------------------------------------------
  1 | //
  2 | //  Conformance test program for SRELL.
  3 | //  Version 3.007 (2025/09/18)
  4 | //
  5 | //  This needs to be compiled and run on a system that supports
  6 | //  an ISO-646/US-ASCII compatible encoding.
  7 | //
  8 | 
  9 | #include <cstdio>
 10 | #include <cstring>
 11 | #include <string>
 12 | #include <vector>
 13 | 
 14 | #ifdef DUP_CHECK
 15 | #include <map>
 16 | #endif
 17 | 
//  Feature switches for the tests on dedicated Unicode character types.
//  Both start out enabled and may be withdrawn below. They gate the
//  convert_to_utf8c() overloads, the utf8/utf16/utf32 cases of conf_test()
//  and the matching command-line names.
#define SRELL_HAS_U8TYPE
#define SRELL_HAS_U1632TYPE

//  u16char_type / u32char_type: code unit types for the UTF-16/UTF-32 tests.
#ifdef __cpp_unicode_characters
typedef char16_t u16char_type;
typedef char32_t u32char_type;
#else
#define PRE_CPP11
//  Without char16_t/char32_t, substitute integer types are used, except on
//  Clang 19 and later where these tests are switched off instead. (Per the
//  project history, Clang 19 dropped the default std::char_traits definition,
//  so tests on unsupported character types are skipped there.)
#if defined(__clang_major__) && (__clang_major__ >= 19)
#undef SRELL_HAS_U1632TYPE
#else
typedef unsigned short u16char_type;
typedef unsigned long u32char_type;
#endif
#endif

//  char_type: element type of the literals in the test data.
//  RE(x) / STR(x): wrap a pattern / subject literal.
//  STR0(x): same as STR(x) with one extra NUL appended to the literal.
//  u8char_type: code unit type for the UTF-8 tests.
#ifdef __cpp_char8_t
#if defined(_MSC_VER)
//  With MSVC the literals stay plain char literals even though char8_t exists.
typedef char char_type;
#define RE(x) x
#define STR(x) x
#define STR0(x) x "\0"
#else
//  Otherwise the literals get the u8 prefix and are char8_t based.
typedef char8_t char_type;
#define RE(x) u8##x
#define STR(x) u8##x
#define STR0(x) u8##x u8"\0"
#endif
typedef char8_t u8char_type;
#else
#define PRE_CPP20
typedef char char_type;
#define RE(x) x
#define STR(x) x
#define STR0(x) x "\0"
//  Same policy as for the UTF-16/32 types above: a substitute type is used
//  except on Clang 19 and later, where the u8char_type tests are switched off.
#if defined(__clang_major__) && (__clang_major__ >= 19)
#undef SRELL_HAS_U8TYPE
#else
typedef unsigned char u8char_type;
#endif
#endif
 59 | 
 60 | #include "../srell.hpp"
 61 | #include "conftest-data.h"
 62 | 
//  Identifiers for the regex/character type combination a test run uses.
//  The numeric values matter: main() uses them as an index into its table
//  of type names, so the order must stay in sync with that table.
namespace constants
{
	enum utf_type
	{
		unknown, utf8, utf16, utf32, utf8c, utf16or32w, c, w
	};
}
 70 | 
//  Empty tag types used to pick a to_utf() overload at compile time.
//  utf8_tag and utf16_tag have dedicated overloads; utf0_tag and utf32_tag
//  are handled by the generic to_utf() template.
struct utf0_tag {};
struct utf8_tag {};
struct utf16_tag {};
struct utf32_tag {};
 75 | 
//  Works out from WCHAR_MAX which UTF encoding wchar_t is used for:
//    - it can hold 0x10FFFF: SRELL_HAS_UTF32W, SRELL_HAS_UTF1632W == 32;
//    - it can hold 0xFFFF:   SRELL_HAS_UTF16W, SRELL_HAS_UTF1632W == 16;
//    - otherwise compilation is stopped.
//  If WCHAR_MAX is not defined at this point, none of these macros is defined.
#if defined(WCHAR_MAX)
	#if (WCHAR_MAX >= 0x10FFFF)
	#define SRELL_HAS_UTF32W
	#define SRELL_HAS_UTF1632W 32
	#elif (WCHAR_MAX >= 0xFFFF)
	#define SRELL_HAS_UTF16W
	#define SRELL_HAS_UTF1632W 16
	#else
	#error "wchar_t is not capable of UTF-16 or UTF-32."
	#endif
#endif
 87 | 
 88 | unsigned long try_unescaping(const char_type *&p)
 89 | {
 90 | 	const unsigned long failure = 0x110000ul;
 91 | 	unsigned long ucp = 0ul;
 92 | 
 93 | 	if (*++p == 0x7b)	//  '{'
 94 | 	{
 95 | 		const char_type *const begin = ++p;
 96 | 
 97 | 		for (;; ++p)
 98 | 		{
 99 | 			if (*p >= 0x30 && *p <= 0x39)
100 | 				ucp = (ucp << 4) | (*p - 0x30);
101 | 			else
102 | 			{
103 | 				const char ch = *p | 0x20;
104 | 
105 | 				if (ch >= 0x61 && ch <= 0x66)
106 | 					ucp = (ucp << 4) | (ch - 0x61 + 10);
107 | 				else if (*p == 0x7d && p != begin)	//  '}'
108 | 					return ucp;
109 | 				else
110 | 					return failure;
111 | 			}
112 | 		}
113 | 	}
114 | 
115 | 	for (unsigned int ui = 0;; ++p)
116 | 	{
117 | 		if (*p >= 0x30 && *p <= 0x39)
118 | 			ucp = (ucp << 4) | (*p - 0x30);
119 | 		else
120 | 		{
121 | 			const char ch = *p | 0x20;
122 | 
123 | 			if (ch >= 0x61 && ch <= 0x66)
124 | 				ucp = (ucp << 4) | (ch - 0x61 + 10);
125 | 			else
126 | 				return failure;
127 | 		}
128 | 
129 | 		if (++ui == 4)
130 | 			return ucp;
131 | 	}
132 | }
133 | 
134 | template <typename Char32T, typename UtfTag>
135 | std::basic_string<Char32T> to_utf(const char_type *&u8c, const UtfTag)
136 | {
137 | 	std::basic_string<Char32T> out;
138 | 
139 | 	for (; *u8c; ++u8c)
140 | 	{
141 | 		if (*u8c == 0x5c)	//  '\\'
142 | 		{
143 | 			const char_type *prefetch = u8c;
144 | 
145 | 			if (*++prefetch == 0x75)	//  'u'
146 | 			{
147 | 				const unsigned long u32 = try_unescaping(prefetch);
148 | 
149 | 				if (u32 < 0x110000ul)
150 | 				{
151 | 					out.push_back(static_cast<Char32T>(u32));
152 | 
153 | 					u8c = prefetch;
154 | 					continue;
155 | 				}
156 | 			}
157 | 		}
158 | 		out.append(1, *u8c);
159 | 	}
160 | 	return out;
161 | }
162 | 
163 | template <typename Char8T>
164 | std::basic_string<Char8T> to_utf(const char_type *&u8c, const utf8_tag)
165 | {
166 | 	std::basic_string<Char8T> out;
167 | 
168 | 	for (; *u8c; ++u8c)
169 | 	{
170 | 		if (*u8c == 0x5c)	//  '\\'
171 | 		{
172 | 			const char_type *prefetch = u8c;
173 | 
174 | 			if (*++prefetch == 0x75)	//  'u'
175 | 			{
176 | 				const unsigned long u32 = try_unescaping(prefetch);
177 | 
178 | 				if (u32 < 0x110000ul)
179 | 				{
180 | 					if (u32 < 0x80ul)
181 | 					{
182 | 						out.push_back(static_cast<Char8T>(u32));
183 | 					}
184 | 					else if (u32 < 0x800ul)
185 | 					{
186 | 						out.push_back(static_cast<Char8T>(((u32 >> 6) & 0x1f) | 0xc0));
187 | 						out.push_back(static_cast<Char8T>((u32 & 0x3f) | 0x80));
188 | 					}
189 | 					else if (u32 < 0x10000ul)
190 | 					{
191 | 						out.push_back(static_cast<Char8T>(((u32 >> 12) & 0x0f) | 0xe0));
192 | 						out.push_back(static_cast<Char8T>(((u32 >> 6) & 0x3f) | 0x80));
193 | 						out.push_back(static_cast<Char8T>((u32 & 0x3f) | 0x80));
194 | 					}
195 | 					else
196 | 					{
197 | 						out.push_back(static_cast<Char8T>(((u32 >> 18) & 7) | 0xf0));
198 | 						out.push_back(static_cast<Char8T>(((u32 >> 12) & 0x3f) | 0x80));
199 | 						out.push_back(static_cast<Char8T>(((u32 >> 6) & 0x3f) | 0x80));
200 | 						out.push_back(static_cast<Char8T>((u32 & 0x3f) | 0x80));
201 | 					}
202 | 
203 | 					u8c = prefetch;
204 | 					continue;
205 | 				}
206 | 			}
207 | 		}
208 | 		out.append(1, *u8c);
209 | 	}
210 | 	return out;
211 | }
212 | 
213 | template <typename Char16T>
214 | std::basic_string<Char16T> to_utf(const char_type *&u8c, const utf16_tag)
215 | {
216 | 	std::basic_string<Char16T> out;
217 | 
218 | 	for (; *u8c; ++u8c)
219 | 	{
220 | 		if (*u8c == 0x5c)	//  '\\'
221 | 		{
222 | 			const char_type *prefetch = u8c;
223 | 
224 | 			if (*++prefetch == 0x75)	//  'u'
225 | 			{
226 | 				const unsigned long u32 = try_unescaping(prefetch);
227 | 
228 | 				if (u32 < 0x110000ul)
229 | 				{
230 | 					if (u32 < 0x10000ul)
231 | 					{
232 | 						out.push_back(static_cast<Char16T>(u32));
233 | 					}
234 | 					else
235 | 					{
236 | 						out.push_back(static_cast<Char16T>(((u32 - 0x10000) >> 10) | 0xd800));
237 | 						out.push_back(static_cast<Char16T>((u32 & 0x3ff) | 0xdc00));
238 | 					}
239 | 
240 | 					u8c = prefetch;
241 | 					continue;
242 | 				}
243 | 			}
244 | 		}
245 | 		out.append(1, *u8c);
246 | 	}
247 | 	return out;
248 | }
249 | 
//  Encodes one code point as UTF-8 for console output.
//
//  u32:      the code point. Values below 0x20 other than LF (0x0a) are
//            replaced by a space.
//  minsize:  minimum length of the result in bytes; shorter results are
//            padded on the left with spaces.
template <typename CharT>
std::string u32ctou8c(const CharT u32, const std::size_t minsize)
{
	std::string encoded;

	if (u32 >= 0x10000)	//  10000..10FFFF
	{
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable:4333)
#endif
		encoded.push_back(static_cast<char>(((u32 >> 18) & 0x7) | 0xf0));
		encoded.push_back(static_cast<char>(((u32 >> 12) & 0x3f) | 0x80));
		encoded.push_back(static_cast<char>(((u32 >> 6) & 0x3f) | 0x80));
		encoded.push_back(static_cast<char>((u32 & 0x3f) | 0x80));
#ifdef _MSC_VER
#pragma warning(pop)
#endif
	}
	else if (u32 >= 0x800)	//  800..FFFF
	{
		encoded.push_back(static_cast<char>(((u32 >> 12) & 0x0f) | 0xe0));
		encoded.push_back(static_cast<char>(((u32 >> 6) & 0x3f) | 0x80));
		encoded.push_back(static_cast<char>((u32 & 0x3f) | 0x80));
	}
	else if (u32 >= 0x80)	//  80..7FF
	{
		encoded.push_back(static_cast<char>(((u32 >> 6) & 0x1f) | 0xc0));
		encoded.push_back(static_cast<char>((u32 & 0x3f) | 0x80));
	}
	else	//  ..7F
	{
		const bool printable = (u32 >= 0x20 || u32 == 0x0a);

		encoded.push_back(static_cast<char>(printable ? u32 : 0x20));
	}

	const std::size_t length = encoded.size();

	if (length < minsize)
		encoded = std::string(minsize - length, static_cast<char>(0x20)) + encoded;

	return encoded;
}
291 | 
//  Copies a NUL-terminated string into a StringT, converting element by
//  element with push_back(). No encoding conversion is performed.
template <typename StringT, typename CharT>
StringT simple_conv(const CharT *c)
{
	StringT out;

	while (*c)
	{
		out.push_back(*c);
		++c;
	}
	return out;
}
302 | 
//  Converts a UTF-16 string to UTF-8 for console output.
//
//  A lead surrogate followed by a trail surrogate is written as one 4-byte
//  sequence. Every other code unit (including an unpaired surrogate) is
//  passed to u32ctou8c() on its own.
template <typename Char16>
std::string utf16_to_utf8c(const std::basic_string<Char16> &u16)
{
	typedef typename std::basic_string<Char16>::size_type size_type;

	std::string out;
	size_type pos = 0;

	while (pos < u16.size())
	{
		const Char16 unit = u16[pos++];

		if ((unit & 0xfc00) == 0xd800)	//  Lead surrogate.
		{
			//  pos may equal size() here; the string is const, so this reads
			//  the terminating null element, which is not a trail surrogate.
			const Char16 trail = u16[pos];

			if ((trail & 0xfc00) == 0xdc00)
			{
				//  Upper bits of the code point: lead payload + 0x10000 >> 10.
				const Char16 lead = ((unit & 0x3ff) + 0x40);

				//  110110aa aabbbbcc  110111cc ddddeeee
				//  11110aaa 10aabbbb  10ccccdd 10ddeeee
				out.push_back(static_cast<char>(((lead >> 8) & 7) | 0xf0));
				out.push_back(static_cast<char>(((lead >> 2) & 0x3f) | 0x80));
				out.push_back(static_cast<char>(0x80 | ((lead << 4) & 0x30) | ((trail >> 6) & 0xf)));
				out.push_back(static_cast<char>(0x80 | (trail & 0x3f)));

				++pos;	//  The trail surrogate has been consumed too.
				continue;
			}
		}
		out += u32ctou8c(unit, 1);
	}
	return out;
}
335 | 
//  Converts a UTF-32 string to UTF-8 for console output by passing each
//  element to u32ctou8c().
template <typename Char32>
std::string utf32_to_utf8c(const std::basic_string<Char32> &u32)
{
	typedef typename std::basic_string<Char32>::const_iterator const_iterator;

	std::string out;

	for (const_iterator it = u32.begin(); it != u32.end(); ++it)
		out += u32ctou8c(*it, 1);

	return out;
}
346 | 
//  char strings are printed as they are: returns a copy of s.
std::string convert_to_utf8c(const std::string &s)
{
	std::string copy(s);

	return copy;
}
351 | 
352 | std::string convert_to_utf8c(const std::wstring &ws)
353 | {
354 | #if (SRELL_HAS_UTF1632W == 16)
355 | 	return utf16_to_utf8c(ws);
356 | #else
357 | 	return utf32_to_utf8c(ws);
358 | #endif
359 | }
360 | 
#if defined(SRELL_HAS_U8TYPE)
//  UTF-8 code units are copied one to one into a char string.
std::string convert_to_utf8c(const std::basic_string<u8char_type> &u8)
{
	std::string bytes;

	bytes.assign(u8.begin(), u8.end());
	return bytes;
}
#endif
#if defined(SRELL_HAS_U1632TYPE)
//  UTF-16 code units are converted with utf16_to_utf8c().
std::string convert_to_utf8c(const std::basic_string<u16char_type> &u16)
{
	return utf16_to_utf8c<u16char_type>(u16);
}
//  UTF-32 code units are converted with utf32_to_utf8c().
std::string convert_to_utf8c(const std::basic_string<u32char_type> &u32)
{
	return utf32_to_utf8c<u32char_type>(u32);
}
#endif
377 | 
//  Bit flags that select how a test is run. They are set from the upper-case
//  letters and the digit handled by parse_flagstring().
namespace otherflags
{
	typedef unsigned int type;

	static const type none            = 0u;
	static const type regex_match     = 1u << 0;	//  'M'
	static const type three_iterators = 1u << 1;	//  '3'
	static const type global          = 1u << 2;	//  'G'
	static const type matchall        = 1u << 3;	//  'A'
	static const type errortest       = 1u << 4;	//  'E'
	static const type namedgroup      = 1u << 5;	//  'N'
	static const type print_states    = 1u << 6;
}
390 | 
391 | std::string parse_flagstring(
392 | 	srell::regex_constants::syntax_option_type &so,
393 | 	srell::regex_constants::match_flag_type &mf,
394 | 	otherflags::type &of,
395 | 	const char *flags)
396 | {
397 | 	std::string str;
398 | 
399 | 	so = srell::regex_constants::ECMAScript;
400 | 	mf = srell::regex_constants::match_default;
401 | 
402 | 	for (;; ++flags)
403 | 	{
404 | 		switch (*flags)
405 | 		{
406 | 		case 0:
407 | 			return str;
408 | 
409 | 		case 'i':
410 | 			so |= srell::regex_constants::icase;
411 | 			str.push_back(*flags);
412 | 			break;
413 | 		case 'm':
414 | 			so |= srell::regex_constants::multiline;
415 | 			str.push_back(*flags);
416 | 			break;
417 | 		case 's':
418 | 			so |= srell::regex_constants::dotall;
419 | 			str.push_back(*flags);
420 | 			break;
421 | #if !defined(NO_VMODE)
422 | 		case 'v':
423 | 			so |= srell::regex_constants::unicodesets;
424 | 			str.push_back(*flags);
425 | 			break;
426 | #endif
427 | 		case 'y':
428 | 			so |= srell::regex_constants::sticky;
429 | 			str.push_back(*flags);
430 | 			break;
431 | 
432 | 		case 'n':
433 | 			so |= srell::regex_constants::nosubs;
434 | 			str.push_back(*flags);
435 | 			break;
436 | 
437 | 		case '3':	//  regex_search(begin, end, start);
438 | 			of |= otherflags::three_iterators;
439 | 			break;
440 | 
441 | 		case 'M':	//  regex_match()
442 | 			of |= otherflags::regex_match;
443 | 			break;
444 | 
445 | 		case 'G':	//  String.prototype.match()
446 | 			of |= otherflags::global;
447 | 			break;
448 | 
449 | 		case 'A':	//  String.prototype.matchAll()
450 | 			of |= otherflags::matchall;
451 | 			break;
452 | 
453 | 		case 'E':
454 | 			of |= otherflags::errortest;
455 | 			break;
456 | 
457 | 		case 'N':
458 | 			of |= otherflags::namedgroup;
459 | 			break;
460 | 
461 | 		default:
462 | 			std::fprintf(stdout, "[Warning] Unknown flag '%c' found.\n", *flags);
463 | 			break;
464 | 		}
465 | 	}
466 | }
467 | 
468 | template <typename RegexType, typename CharT, typename UtfTag>
469 | bool conf_test(
470 | 	const char_type *str1,
471 | 	const char_type *exp1,
472 | 	const char *const flagstr,
473 | 	const unsigned int num,
474 | 	const char_type *expected1,
475 | 	const unsigned int offset,
476 | 	const unsigned int max)
477 | {
478 | 	typedef RegexType regex_type;
479 | 	typedef CharT char_type2;
480 | 	typedef UtfTag utf_tag;
481 | 	typedef std::basic_string<char_type2> string_type;
482 | 	typedef srell::match_results<const CharT *> mr_type;
483 | //	typedef srell::match_results<const typename string_type::const_iterator> smr_type;
484 | 
485 | 	string_type str(to_utf<char_type2>(str1, utf_tag()));
486 | 	string_type exp(to_utf<char_type2>(exp1, utf_tag()));
487 | 	std::vector<string_type> expected;
488 | 	srell::regex_constants::syntax_option_type so = srell::regex_constants::ECMAScript;
489 | 	srell::regex_constants::match_flag_type mf = srell::regex_constants::match_default;
490 | 	otherflags::type of = otherflags::none;
491 | 
492 | 	const std::string flagstr2 = parse_flagstring(so, mf, of, flagstr);
493 | 	const bool search = (of & otherflags::regex_match) ? false : true;
494 | 	const bool iterator3 = (of & otherflags::three_iterators) ? true : false;
495 | 	const bool global = (of & otherflags::global) ? true : false;
496 | 	const bool matchall = (of & otherflags::matchall) ? true : false;
497 | 	const bool errortest = (of & otherflags::errortest) ? true : false;
498 | 	const bool namedgroup = (of & otherflags::namedgroup) ? true : false;
499 | 
500 | 	for (unsigned int i = 0; i < num; ++i)
501 | 	{
502 | 		const string_type s(to_utf<char_type2>(expected1, utf_tag()));
503 | 
504 | 		expected.push_back(s);
505 | 		++expected1;
506 | 	}
507 | 
508 | 	regex_type re;
509 | 	mr_type mr;
510 | //	smr_type smr;
511 | 	bool b = false;
512 | 	unsigned int num_of_failures = 0;
513 | 
514 | 	const std::string strfc(convert_to_utf8c(str));	//  For Console.
515 | 	const std::string expfc(convert_to_utf8c(exp));
516 | 
517 | #ifdef DUP_CHECK
518 | 	static std::map<std::string, bool> tried;
519 | 	const std::string comb(flagstr + strfc + expfc);
520 | 	if (tried.count(comb))
521 | 		std::fprintf(stdout, "[Warning] /%s/%s.exec(\"%s\") has been checked.\n", expfc.c_str(), flagstr, strfc.c_str());
522 | 	else
523 | 		tried[comb] = true;
524 | #endif
525 | 
526 | 	try
527 | 	{
528 | 		re.assign(exp, so);
529 | 
530 | 		if (errortest)	//  Reaching here means that an exception has not been thrown.
531 | 		{
532 | 			std::fprintf(stdout, "\t/%s/\nResult: Failed (expected %u \"%s\", but no error thrown).\n\n", expfc.c_str(), offset, srell::regex_error(offset).what());
533 | 			return false;
534 | 		}
535 | 
536 | 		const CharT *const begin = str.c_str() + offset;
537 | 		const CharT *const end = str.c_str() + str.size();
538 | 		const CharT *const lblimit = !iterator3 ? begin : str.c_str();
539 | 		string_type matched;
540 | 		string_type msg;
541 | 		string_type gname;
542 | 
543 | 		if (search)
544 | 		{
545 | 			b = srell::regex_search(begin, end, lblimit, mr, re, mf);
546 | 		}
547 | 		else
548 | 		{
549 | 			b = srell::regex_match(begin, end, mr, re, mf);
550 | 		}
551 | 
552 | 		std::fprintf(stdout, "\t/%s/%s.%sch(\"%s\");", expfc.c_str(), flagstr2.c_str(), search ? "sear" : "mat", strfc.c_str());
553 | 
554 | 		if (offset != 0)
555 | 			std::fprintf(stdout, " offset:%u\n", offset);
556 | 		else
557 | 			std::fprintf(stdout, "\n");
558 | 
559 | 		if (max > 1)
560 | 			std::fprintf(stdout, "\t%u times\n", max);
561 | 
562 | 		std::fprintf(stdout, "\t%s.\n", b ? "Found" : "Not Found");
563 | 
564 | 		unsigned int matchcount = 0u;
565 | 
566 | 		for (; mr.size() != 0;)
567 | 		{
568 | 			if (global || matchall)
569 | 				std::fprintf(stdout, "\t#%.2u\n", static_cast<unsigned int>(matchcount / (matchall ? mr.size() : 1)));
570 | 
571 | 			for (srell::cmatch::size_type i = 0; i < mr.size(); ++i)
572 | 			{
573 | 				std::fprintf(stdout, "\tm[%u] = ", static_cast<unsigned int>(i));
574 | 
575 | 				if (namedgroup)
576 | 				{
577 | 					typename mr_type::gnamemap_type::gname_string gntmp0 = mr.lookup_gname_(static_cast<unsigned int>(i));
578 | 
579 | 					gname.clear();
580 | 					if (gntmp0.size())
581 | 					{
582 | 						string_type gntmp1(&gntmp0[0], &gntmp0[gntmp0.size()]);
583 | 
584 | 						if (&mr[i] == &mr[gntmp1])
585 | 							gname = simple_conv<string_type>(" <") + gntmp1 + simple_conv<string_type>(">");
586 | 					}
587 | 				}
588 | 
589 | 				if (mr[i].matched)
590 | 				{
591 | 					matched = mr[i].str() + gname;
592 | 					msg = simple_conv<string_type>("\"") + matched + simple_conv<string_type>("\"") + simple_conv<string_type>(" (%u-%u)");
593 | 				}
594 | 				else
595 | 					msg = matched = simple_conv<string_type>("(undefined)") + gname;
596 | 
597 | 				const std::size_t expno = matchcount++;
598 | 
599 | 				if (expno < expected.size())
600 | 				{
601 | 					if (matched == expected.operator[](expno))
602 | 						msg += simple_conv<string_type>("; OK.");
603 | 					else
604 | 					{
605 | 						msg += simple_conv<string_type>("; failed... (expected: \"") + expected.operator[](expno) + simple_conv<string_type>("\")");
606 | 						++num_of_failures;
607 | 					}
608 | 				}
609 | 				else
610 | 				{
611 | 					msg += simple_conv<string_type>("; failed... (should not match)");
612 | 					++num_of_failures;
613 | 				}
614 | 
615 | 				msg += simple_conv<string_type>("\n");
616 | 				const std::string msgfc(convert_to_utf8c(msg));
617 | 				std::fprintf(stdout, msgfc.c_str(), mr.position(i), mr.length(i));
618 | 				if (global)
619 | 					break;
620 | 			}
621 | 
622 | 			if (global || matchall)
623 | 			{
624 | 				const CharT *begin2 = mr[0].second;
625 | 
626 | 				if (begin2 == mr.prefix().first)
627 | 				{
628 | 					if (matchcount == expected.size())
629 | 						break;
630 | 
631 | 					if (begin2 != end)
632 | 						regex_type::traits_type::utf_traits::codepoint_inc(begin2, end);
633 | 				}
634 | 
635 | 				b = srell::regex_search(begin2, end, lblimit, mr, re, mf);
636 | 			}
637 | 			else
638 | 				break;
639 | 		}
640 | 
641 | 		if (expected.size() != matchcount)
642 | 		{
643 | 			std::fprintf(stdout, "\tm.size() == %u; should be %u.\n", static_cast<unsigned int>(mr.size()), static_cast<unsigned int>(expected.size()));
644 | 			++num_of_failures;
645 | 		}
646 | 
647 | 		std::fprintf(stdout, "Result: %s.\n\n", num_of_failures ? "Failed" : "OK");
648 | 
649 | 		return num_of_failures == 0;
650 | 	}
651 | 	catch (const srell::regex_error &e)
652 | 	{
653 | 		std::fprintf(stdout, "Error (regex_error): %d \"%s\"\n\t/%s/%s;\n", e.code(), e.what(), expfc.c_str(), flagstr2.c_str());
654 | 
655 | 		if (errortest)
656 | 		{
657 | 			if (e.code() == static_cast<srell::regex_constants::error_type>(offset))
658 | 			{
659 | 				std::fprintf(stdout, "Result: OK.\n\n");
660 | 				return true;
661 | 			}
662 | 
663 | 			std::fprintf(stdout, "Result: Failed... (expected: %u \"%s\")\n\n", offset, srell::regex_error(offset).what());
664 | 		}
665 | 		else
666 | 			std::fprintf(stdout, "Result: Failed.\n\n");
667 | 	}
668 | 	catch (const std::exception &e)
669 | 	{
670 | 		std::fprintf(stdout, "Error (std::exception): \"%s\"\nResult: Failed.\n\n", e.what());
671 | 	}
672 | 	return false;
673 | }
674 | 
675 | bool conf_test(
676 | 	const constants::utf_type utf_type,
677 | 	const char_type *const str,
678 | 	const char_type *const exp,
679 | 	const char *const flagstr,
680 | 	const unsigned int num,
681 | 	const char_type *const expected,
682 | 	const unsigned int offset = 0,
683 | 	const unsigned int max = 1)
684 | {
685 | 	switch (utf_type)
686 | 	{
687 | 	case constants::utf8c:
688 | 		return conf_test<srell::u8cregex, char, utf8_tag>(str, exp, flagstr, num, expected, offset, max);
689 | 
690 | #if defined(SRELL_HAS_UTF1632W)
691 | 	case constants::utf16or32w:
692 | 		return conf_test<srell::u1632wregex, wchar_t,
693 | #if (SRELL_HAS_UTF1632W == 16)
694 | 				utf16_tag
695 | #else
696 | 				utf32_tag
697 | #endif
698 | 			>(str, exp, flagstr, num, expected, offset, max);
699 | #endif
700 | 
701 | #if defined(SRELL_HAS_U8TYPE)
702 | 	case constants::utf8:
703 | 		return conf_test<srell::basic_regex<u8char_type, srell::u8regex_traits<u8char_type> >, u8char_type, utf8_tag>(str, exp, flagstr, num, expected, offset, max);
704 | #endif
705 | 
706 | #if defined(SRELL_HAS_U1632TYPE)
707 | 	case constants::utf16:
708 | 		return conf_test<srell::basic_regex<u16char_type, srell::u16regex_traits<u16char_type> >, u16char_type, utf16_tag>(str, exp, flagstr, num, expected, offset, max);
709 | 
710 | 	case constants::utf32:
711 | 		return conf_test<srell::basic_regex<u32char_type>, u32char_type, utf32_tag>(str, exp, flagstr, num, expected, offset, max);
712 | #endif
713 | 
714 | 	case constants::w:
715 | 		return conf_test<srell::wregex, wchar_t, utf0_tag>(str, exp, flagstr, num, expected, offset, max);
716 | 
717 | 	case constants::c:
718 | 	case constants::unknown:
719 | 	default:
720 | 		return conf_test<srell::regex, char, utf0_tag>(str, exp, flagstr, num, expected, offset, max);
721 | 	}
722 | }
723 | 
724 | struct options
725 | {
726 | 	constants::utf_type utype;
727 | 	int errorno;
728 | 
729 | 	options(const int argc, const char *const *const argv)
730 | 		: utype(constants::unknown)
731 | 		, errorno(0)
732 | 	{
733 | 		if (argc >= 2)
734 | 		{
735 | 			const std::size_t len = std::strlen(argv[1]);
736 | 
737 | 			if (len >= 4 && std::memcmp(argv[1], "utf", 3) == 0)
738 | 			{
739 | 				if (argv[1][3] == '8')
740 | 				{
741 | 					if (std::strcmp(argv[1] + 4, "c") == 0)
742 | 						utype = constants::utf8c;
743 | #if defined(SRELL_HAS_U8TYPE)
744 | 					else if (argv[1][4] == 0)
745 | 						utype = constants::utf8;
746 | #endif
747 | 					else
748 | 						goto UNKNOWN_TYPE;
749 | 				}
750 | 				else if (len >= 5 && std::memcmp(argv[1] + 3, "16", 2) == 0)
751 | 				{
752 | #if defined(SRELL_HAS_UTF16W)
753 | 					if (std::strcmp(argv[1] + 5, "w") == 0)
754 | 						utype = constants::utf16or32w;
755 | 					else
756 | #endif
757 | #if defined(SRELL_HAS_U1632TYPE)
758 | 					if (argv[1][5] == 0)
759 | 						utype = constants::utf16;
760 | #endif
761 | 					else
762 | 						goto UNKNOWN_TYPE;
763 | 				}
764 | 				else if (len >= 5 && std::memcmp(argv[1] + 3, "32", 2) == 0)
765 | 				{
766 | #if defined(SRELL_HAS_UTF32W)
767 | 					if (std::strcmp(argv[1] + 5, "w") == 0)
768 | 						utype = constants::utf16or32w;
769 | 					else
770 | #endif
771 | #if defined(SRELL_HAS_U1632TYPE)
772 | 					if (argv[1][5] == 0)
773 | 						utype = constants::utf32;
774 | #endif
775 | 					else
776 | 						goto UNKNOWN_TYPE;
777 | 				}
778 | 				else
779 | 					goto UNKNOWN_TYPE;
780 | 			}
781 | 			else
782 | 				goto UNKNOWN_TYPE;
783 | 
784 | 			return;
785 | 		}
786 | 
787 | 		errorno = -1;
788 | 
789 | 		PRINT_USAGE:
790 | 		std::fputs("Usage: conftest testtype\n", stdout);
791 | #if defined(SRELL_HAS_U8TYPE)
792 | 		std::fputs("    utf8    u8regex\n", stdout);
793 | #endif
794 | #if defined(SRELL_HAS_U1632TYPE)
795 | 		std::fputs("    utf16   u16regex\n", stdout);
796 | 		std::fputs("    utf32   u32regex\n", stdout);
797 | #endif
798 | 		std::fputs("    utf8c   u8cregex (UTF-8 with char)\n", stdout);
799 | #if defined(SRELL_HAS_UTF16W)
800 | 		std::fputs("    utf16w  u1632wregex (UTF-16 with wchar_t)\n", stdout);
801 | #endif
802 | #if defined(SRELL_HAS_UTF32W)
803 | 		std::fputs("    utf32w  u1632wregex (UTF-32 with wchar_t)\n", stdout);
804 | #endif
805 | 		return;
806 | 
807 | 		UNKNOWN_TYPE:
808 | 		std::fprintf(stdout, "[Error] Unknown type \"%s\" specified.\n", argv[1]);
809 | 		errorno = 1;
810 | 		goto PRINT_USAGE;
811 | 	}
812 | };
813 | //  struct options
814 | 
815 | int main(const int argc, const char *const argv[])
816 | {
817 | 	options opts(argc, argv);
818 | //	const unsigned int count = 100000;
819 | 	unsigned int num_of_tests = 0;
820 | 	unsigned int num_of_tests_passed = 0;
821 | //	unsigned int num_of_benches = 0;
822 | //	unsigned int num_of_benches_passed = 0;
823 | 	unsigned int skipped = 0;
824 | 
825 | 	const char_type *re = STR("");
826 | 	const char_type *str = re;
827 | 	const char_type *expected = re;
828 | 
829 | 	const constants::utf_type utype = opts.utype;
830 | 	const bool is_utf8 = utype == constants::utf8 || utype == constants::utf8c;
831 | 
832 | 	if (opts.errorno)
833 | 		return opts.errorno;
834 | 
835 | 	const testdata *t = tests;
836 | 
837 | 	for (;; ++t)
838 | 	{
839 | 		if (t->type == 1)
840 | 			break;
841 | 
842 | 		std::fputs(t->title, stdout);
843 | 		if (t->type == 8 && !is_utf8)
844 | 		{
845 | 			std::fputs("[Info] This test is specific to UTF-8. Skipped...\n\n", stdout);
846 | 			++skipped;
847 | 			continue;
848 | 		}
849 | 
850 | 		if (t->re)
851 | 			re = t->re;
852 | 		if (t->str)
853 | 			str = t->str;
854 | 		if (t->expected)
855 | 			expected = t->expected;
856 | 
857 | 		if (conf_test(utype, str, re, t->flags, t->number, expected, t->offset))
858 | 			++num_of_tests_passed;
859 | 
860 | 		++num_of_tests;
861 | 	}
862 | 
863 | 	const char *typetable[] = {
864 | 		"Unknown", "UTF-8", "UTF-16", "UTF-32", "UTF-8 with char",
865 | #if defined(SRELL_HAS_UTF1632W)
866 | #if (SRELL_HAS_UTF1632W == 16)
867 | 		"UTF-16 with wchar_t",
868 | #else
869 | 		"UTF-32 with wchar_t",
870 | #endif
871 | #else
872 | 		"", "",
873 | #endif
874 | 		//  These two types will pass all tests only when it has width of at least 21 bits.
875 | 		"char", "wchar_t"
876 | 	};
877 | 
878 | 	std::fprintf(stdout, "TestType: %s (%d)\n", typetable[opts.utype], opts.utype);
879 | 
880 | 	std::fprintf(stdout, "Results of tests: %u/%u (%.1lf%%) passed. (%u skipped).\n", num_of_tests_passed, num_of_tests, num_of_tests ? static_cast<double>(num_of_tests_passed) * 100.0 / num_of_tests : -1.0, skipped);
881 | //	std::fprintf(stdout, "Results of benchmarks: %u/%u passed.\n", num_of_benches_passed, num_of_benches);
882 | 
883 | 	return static_cast<int>(num_of_tests - num_of_tests_passed);
884 | }
885 | 


--------------------------------------------------------------------------------
/history_ja.txt:
--------------------------------------------------------------------------------
  1 | 20251010; version 4.100:
  2 |   ・intおよびsize_tのビット幅が32以上であることを求めるように。
  3 |   ・上の要件追加により、一部のコードを簡略化。
  4 |   ・basic_regexのコンストラクタ、assign(), operator=()をstring_view対応
  5 |     に。
  6 |   ・その他細々とした改良：
  7 |     1. CPUキャッシュと相性の良いBoyer-Moore-Horspoolデータに。
  8 |     2. C++11より前のコンパイラにはstd::vectorにdata()メンバがないため、
  9 |        version 4.065以降でregex_token_iteratorがコンパイルできなくなって
 10 |        いました。
 11 |     3. SIMD用の情報とcontiguousではないイテレータ用の情報とが共存できる
 12 |        ように。
 13 | 
 14 | 20250928; version 4.090:
 15 |   ・Unicode property名を引く時のメモリ使用効率を改良。
 16 |   ・bad_allocなしモード（SRELL_NO_THROW 2）対応無期限延期につき、関連コ
 17 |     ードを削除。
 18 | 
 19 | 20250920; version 4.080:
 20 |   ・nosubsフラグを実装。
 21 | 
 22 | 20250914; version 4.070:
 23 |   ・埋込フラグ (?ims-ims) にvとyとを追加。
 24 | 
 25 | 20250910; version 4.069:
 26 |   ・ucfdata2.h, updata3.hをUnicode 17.0.0対応に更新。
 27 |   ・regex_errorのwhat()がエラー名を返すように変更。
 28 |   ・basic_regexのmatch(), search()にstring_viewなどに対応するための
 29 |     overloadを追加。
 30 |   ・misc/conftest.cppを更新。
 31 |     1. GCC 13以降で-Wallをつけてコンパイルした時に警告が出ぬように変更。
 32 |     2. Clang 19以降、std::char_traitsは既定の定義が削除されたことを受け
 33 |        て、char(8|16|32)_tのうち対応していない文字型のテストを代用型で行
 34 |        うのはやめて飛ばすように変更。
 35 |   ・unicode/updataout3.cppを更新。
 36 | 
 37 | 20250814; version 4.068:
 38 |   ・unicode/updataout3.cppを更新。
 39 |     1. Version 4.065でSRELLの内部函数の名前を変更した影響でコンパイルで
 40 |        きなくなっていた問題を修正。
 41 |     2. 文字列リテラルを\xエスケープするか否かを、コンパイル時ではなく実
 42 |        行時に選択できるようオプションを追加。
 43 | 
 44 | 20250608; version 4.067:
 45 |   ・内部のメモリ処理の調整。
 46 |   ・その他改良など。
 47 | 
 48 | 20250518; version 4.066:
 49 |   ・ECMAScriptの仕様に準拠していない振る舞いが3箇所あったので修正。
 50 |     1. "" に対して /()*/ で検索した時、$1がundefined相当にならず "" にな
 51 |        っていた。
 52 |     2. \0の後ろに数字が続いていてもエラーにならなかった。
 53 |     3. 正規表現が\cで終わっていてもエラーにならなかった。
 54 |   ・syntax_option_typeにquietフラグを追加。
 55 |   ・syntax_option_typeに既存のunicodesetsフラグの別名としてvmodeフラグを
 56 |     追加。
 57 |   ・照合用函数内に8箇所あったthrowする箇所を1箇所に統合。
 58 |   ・conftestのexpectedが数ヶ所間違っていたので修正。
 59 |   ・その他細々とした修正など。
 60 | 
 61 | 20250420; version 4.065:
 62 |   ・内部のメモリ処理の改良。
 63 |   ・SRELL_NO_THROW定義時用に、regex_token_iteratorにもecode()メンバ函数
 64 |     を追加。
 65 |   ・照合用函数の改良。
 66 |   ・match_results::format()の改良。
 67 |   ・reinterpret_castの正しくない使い方をしていた箇所を修正。
 68 |   ・SRELL_STRICT_IMPLとSRELL_NO_THROWとを同時に定義できるように改良。
 69 |   ・後方参照番号が正規表現中に出現する括弧の総数を超えていてもエラーにな
 70 |     らず暗黙のうちに無視されていた問題を修正。
 71 |   ・Modifiersが入れ子で使われると外側のフラグ変更が内側のグループに継承
 72 |     されていなかった問題を修正。たとえば (?i:a(?s:a)) は "AA" にマッチし
 73 |     ませんでした。
 74 |   ・std::basic_stringを検索対象とするregex_search()に、検索開始位置を指
 75 |     定できるoverloadを追加。
 76 |   ・上と同じoverloadをbasic_regex.search()にも追加。
 77 |   ・パターンコンパイル時における極端な状況への配慮を追加：
 78 |     ・括弧の総数、内部カウンタを必要とする繰り返しの総数、0幅チェッカの
 79 |       総数、このどれかが32bit値の最大値に達した場合にはerror_complexity
 80 |       をthrowするようにしました。
 81 |     ・入れ子になっているグループや文字クラス（Vフラグモード時のみ）をパ
 82 |       ーズする時、階層の深さの累計がSRELL_MAX_DEPTH（初期値は256）を超え
 83 |       たらスタック溢れ防止のためerror_complexityをthrowするようにしまし
 84 |       た。
 85 |   ・SRELL_NO_ICASEをdefineするとコンパイルできなくなっていた問題を修正。
 86 |   ・独自拡張のうち、3つのイテレータを引数に取り、かつmatch_resultsを引数
 87 |     に取らないregex_search()のoverloadを廃止。
 88 |   ・SRELL_NO_THROW level 2の下準備。
 89 | 
 90 | 20250214; version 4.064:
 91 |   ・SIMD利用可能時にu32regex型のパターンコンパイルを行うとメモリが破壊さ
 92 |     れてしまうバグを修正。
 93 |   ・Version 4.057で誤って削除してしまっていたbasic_regexの独自拡張メンバ
 94 |     match(), search()を復元。ただし引数にmatch_resultsを取るもののみで、
 95 |     取らぬoverloadについてはこのまま廃止します。
 96 |   ・その他細々とした改良など。
 97 | 
 98 | 20241208; version 4.063:
 99 |   ・UTF-16用のSIMDデータを作る時にサロゲートペアを考慮するよう修正。
100 | 
101 | 20241208; version 4.062:
102 |   ・デバッグ用マクロチェックの簡略化。
103 |   ・コードの整理と改良など。
104 | 
105 | 20241204; version 4.061:
106 |   ・簡単なSIMDによるアクセラレーションを導入（x86/x64のみ）。
107 | 
108 | 20241101; version 4.060:
109 |   ・4.030のデータ形式変更以降、srell_updata3.h内のいくつかのoffset値が1
110 |     つずれていた問題を修正。このずれにより、コードポイント値順で最後とな
111 |     るUnicode property値（\p{sc=Zzzz}や\p{space}など）が引けなくなってい
112 |     ました（報告してくださったEugene Levelev氏に感謝します）。
113 |   ・unicode/updataout3.cppを更新。上記問題が修正されたデータを出力するよ
114 |     うに。
115 |   ・4.059で修正した問題の原因となった4.050でのコード変更を取り消し（コー
116 |     ドの簡素化を目的とした変更だったのですが、修正によりコード量が逆に増
117 |     えてしまったため）。
118 | 
119 | 20241016; version 4.059:
120 |   ・4.050以降、引数なしのmatch_results::length(), position(), str()がコ
121 |     ンパイルできなくなっていた問題を修正（報告してくださったWinfried
122 |     Schenke氏に感謝します）。
123 | 
124 | 20241004; version 4.058:
125 |   ・Modifiers (?ims-ims:) を既定の機能に。
126 | 
127 | 20240922; version 4.057:
128 |   ・Version 4.054以降、\p{}/\P{}で無効なUnicode property名や値が指定され
129 |     てもエラーにならなかった問題を修正。
130 |   ・basic_regexからreplace(), split()を削除。
131 |   ・SRELL_CPP*マクロを廃止。
132 |   ・misc/conftest.cppを更新。4.052と同じマスク値バグの修正と、SRELL_CPP*
133 |     マクロ廃止に伴う変更。
134 |   ・unicode/updataout3.cppを更新。
135 | 
136 | 20240911; version 4.056:
137 |   ・ucfdata2.h, updata3.hをUnicode 16.0.0対応に更新。
138 |   ・その他細々とした変更など。
139 | 
140 | 20240904; version 4.055:
141 |   ・複数のコードユニットからなる文字（UTF-8では0080..10FFFF、UTF-16では
142 |     10000..10FFFF）を相当量含むテキストに対する検索が過度に遅くならぬよ
143 |     うUTF-8/UTF-16用のフィルタを調整。
144 |   ・limit_counterの既定値を1 << 24から1 << 21に変更。
145 |   ・regex_traitsの全メンバ函数を不使用につき削除。
146 | 
147 | 20240831; version 4.054:
148 |   ・コードの減量。uモードとvモードとで分離していたパーザの統合。
149 | 
150 | 20240824; version 4.053:
151 |   ・定義済み文字クラスの作成方法を簡略化。
152 |   ・UTF-8用内部イテレータの改良。条件分岐数の削減。
153 | 
154 | 20240818; version 4.052:
155 |   ・utf16_traits中のマスク値の誤りを修正。
156 |   ・その他改良など。
157 | 
158 | 20240816; version 4.051:
159 |   ・4.050で導入した最適化の再実装。メモリ使用量が必要最低限で済む方法に
160 |     変更。
161 |   ・その他改良など。
162 | 
163 | 20240810; version 4.050:
164 |   ・C{n,m}（Cは文字かクラス、n < m != 無限大）用の最適化処理を追加。
165 |   ・その他改良など。
166 | 
167 | 20240720; version 4.049:
168 |   ・syntax_option_typeにstickyフラグを追加。
169 |   ・regex_iterator2にsplit_aptrange()を追加。
170 |   ・ucfdataout2.cpp, updataout3.cppから、データファイルを古い形式で出力
171 |     する機能を削除。
172 | 
173 | 20240714; version 4.048:
174 |   ・指している位置を維持したまま現在位置または直前のコードポイント値を読
175 |     み込むイテレータを廃止。
176 |   ・UTF-8用内部iteratorが最短の表現以外を受け付けぬよう変更。
177 | 
178 | 20240707; version 4.047:
179 |   ・srell::regex（CHAR_BITが8の時のみ）, srell::u8cregex, srell::u8regex
180 |     による検索の速度を改良。
181 | 
182 | 20240613; version 4.046:
183 |   ・コードの減量。リテラルが連続する箇所を最優先に探す仕組みを削除。
184 |   ・その他細々とした改良や修正など。
185 | 
186 | 20240608; version 4.045:
187 |   ・Modifiersを実装。ただし提案がECMAScript仕様書に織り込まれるまでは、
188 |     #define SRELL_ENABLE_MODIFIERS定義時のみ利用可能。
189 |   ・4.043でduplicate named capturing groups対応のための変更を加えた際、
190 |     後方参照の番号が括弧の最大番号を超えていないかのチェックが抜けてしま
191 |     っていたので追加。
192 | 
193 | 20240602; version 4.044:
194 |   ・SRELL_NO_NAMEDCAPTURE用の#if～#endifが抜けていたので追加。
195 |   ・古いほうのstate挿入函数を引退させ、新しいほうに一本化。
196 | 
197 | 20240526; version 4.043:
198 |   ・Duplicate named capturing groups（|で区切られた位置なら既出のグルー
199 |     プ名を重複して使える機能）を実装。
200 | 
201 | 20240524; version 4.042:
202 |   ・*, +用の最適化処理が、C{n,}（Cは文字かクラス、nは2以上）にも適用され
203 |     うるように拡張。
204 |   ・次の条件を満たす場合に用いられる統合スタックを導入。
205 |     1) 照合用函数に渡されたiteratorがポインタであること、または、
206 |     2) コンパイラがstd::is_trivially_copyableに対応していて、かつ
207 |        渡されたiteratorの型Iに対する
208 |        std::is_trivially_copyable<I>::valueがtrueであること。
209 |     どちらの条件も満たさない時は従来の個別スタックが使われます。
210 | 
211 | 20240519; version 4.041:
212 |   ・4.040で行った仮修正の仕上げ。
213 |   ・使用していない函数の削除。
214 |   ・64GBを超えるメモリ割り当てが可能な環境において理論上起こり得る問題に
215 |     対処。
216 | 
217 | 20240131; version 4.040:
218 |   ・もう1行復元。?? (non-greedy {0,1})が最適化バグを引き起こすことがあっ
219 |     たため。
220 | 
221 | 20240127; version 4.039:
222 |   ・Version 4.037で誤って削除してしまったコードを復元。
223 | 
224 | 20240124; version 4.038:
225 |   ・/(?:ab)+|cd/が"ababcd"にマッチしてしまうバグを修正。
226 |     発生条件：|の左右が異なる文字で始まり、かつ左方が(?:)+の中に入ってい
227 |     る。
228 |   ・その他改良など。
229 | 
230 | 20240122; version 4.037:
231 |   ・Version 4.021以降最適化のバグにより、/(?:a|ab|abc)$/が"ac"にマッチす
232 |     るようになっていた問題を修正。
233 |     発生条件：(?:A|B|C) のような表現でAがBの前部と一致し、BがCの前部と一
234 |     致している場合。
235 |     →誤った最適化によりAの終端からCの後部に至るパスが発生してしまう。こ
236 |       のパスは普段は隠れているものの、バックトラッキングが発生すると使わ
237 |       れてしまう。
238 |   ・その他、細々とした修正や改良など。
239 | 
240 | 20240114; version 4.036:
241 |   ・Lookaround （lookaheadとlookbehind）の改良とバグ修正：
242 |     1. 不要なスタック処理を削除。
243 |     2. "abc"に対する/(?:(?=(\w))|b)c$/の1番括弧が"b"ではなく未定義になる
244 |        ように、version 3.003で廃止したstateを復活。
245 |        発生条件：1: Lookaroundが捕獲括弧を含んでいて、2: そのlookaround
246 |        が成功した後、後続のマッチに失敗し、3: '|'で区切られた別の
247 |        subpatternを試し、正規表現全体のマッチが成功する。
248 |        →Lookaround内で捕獲された文字列がundefinedに戻らず残ってしまって
249 |          いた。
250 |   ・misc/sample01.cppをconftest.cppに置き換え。
251 |   ・各epsilonにタグ付け。
252 | 
253 | 20231229; version 4.035:
254 |   ・文字クラスのcase foldingを改良（Icase時の\p{Any}のコンパイル速度の改
255 |     善）。
256 |   ・(?i:)対応の下準備。
257 |   ・updataout3.cppを更新。前版で内部のnamespaceを変更した影響でコンパイ
258 |     ルできなくなっていました。
259 | 
260 | 20231209; version 4.034:
261 |   ・照合用函数に渡されたiteratorがcontiguous_iteratorかどうかを調
262 |     べる時、std::contiguous_iteratorが使えるならそれを使うように。
263 |   ・正規表現中に存在しないグループ名が、match_results型のoperator[]()メ
264 |     ンバ函数に引数として渡された場合、error_backrefをthrowするのをやめ、
265 |     「何にもマッチしていない」ことを表すsub_match型インスタンスへの参照
266 |     を返すように変更。
267 |     この変更に併せて、match_results::operator[](size_type n)が
268 |     n >= size()の時も同様の参照を返すように変更（std::regex準拠の挙動。
269 |     従来はSRELL_STRICT_IMPLが定義されていた時のみ準拠）。
270 |   ・例外をthrowしないモードを実装。
271 |   ・例外を投げないモード用に、直前のコンパイル時にthrowされるはずであっ
272 |     たerror_typeを返す basic_regex::ecode() を追加。
273 |   ・例外を投げないモード用に、直前の検索時にthrowされるはずであった
274 |     error_typeを返す match_results::ecode() を追加。
275 | 
276 | 20230926; version 4.033:
277 |   ・Version 4.020以降、64ビット環境でアクセス違反を起こすことがあった問
278 |     題を修正（報告してくださったYuriy Skvortsov氏に感謝します）。
279 |     発生条件：/ab|ac|ad/のように、3つ以上のAlternativesが同じ文字から始
280 |     まる。
281 |   ・utf_traits中の使用されていないメンバ函数を削除。
282 |   ・その他コードの整理など。
283 | 
284 | 20230916; version 4.032:
285 |   ・UTF-8/UTF-16のデコーダが常にinline展開されるようdirectiveを追加。
286 |   ・オートマトンの呼び出し部を整理。
287 | 
288 | 20230913; version 4.031:
289 |   ・ucfdata2.h, updata3.hをUnicode 15.1.0対応に更新。
290 |   ・updataout3.cppを更新。Unicode property escapeのScriptまたは
291 |     Script_Extensionsで指定できる値に"Unknown"を追加。
292 |     この値はScripts.txt内で言及されているものの、ECMAScript仕様書の「対
293 |     応すべきscript名一覧」になかったのでこれまで対応していませんでした。
294 |     しかし仕様書から一覧表が削除され、除外する理由がなくなったのでV8に倣
295 |     ってSRELLも対応することにしました。
296 | 
297 | 20230909; version 4.030:
298 |   ・^ $ \b \Bだけのrewinderが作られることのないようにパターンコンパイラ
299 |     を変更。
300 |   ・Unicode propertyの名前や値の照合を二分探索で行うように。
301 |   ・上記変更に併せてunicode/updataout2.cppを更新し、updataout3.cppに。
302 |     また、ECMAScript仕様書が対応すべきscript名を一覧表で示すことをやめた
303 |     ため、Scripts.txtとPropertyValueAliases.txtとから読み取るように。
304 |   ・Unicodeデータファイルの拡張子を*.hppから*.hに変更。
305 |   ・unicode/ucfdataout2.cppを更新。上記の拡張子変更に対応。
306 | 
307 | 20230903; version 4.029:
308 |   ・unicode/updataout2.cppを更新。SRELL内部で使う型を4.023で統合したせい
309 |     でコンパイルが通らなくなっていた問題を修正。
310 |   ・srell_updata2.hppを作り直し（Unicode 15で新規追加された2つのスクリプ
311 |     トのデータが入っていなかったため。どうも古いupdataout2.cppで出力した
312 |     ものだったようです）。
313 | 
314 | 20230831; version 4.028:
315 |   ・regex型またはwregex型が使われる時、単体のchar/wchar_tで表現できない
316 |     Unicode値についてはオートマトンを呼び出さないように改良。
317 | 
318 | 20230821; version 4.027:
319 |   ・"2023-8-21"に対して/(?:(\d+-)?)+(\d{1,2})-(\d{1,2})/で検索した時に、
320 |     1番括弧が何もキャプチャしないバグを修正（相当前からあったバグ）。
321 |   ・同じ条件で全体のマッチが "23-8-21" だけになるバグを修正（4.019で混入
322 |     したバグで、4.026の修正でもカヴァーできていなかったもの）。
323 | 
324 | 20230820; version 4.026:
325 |   ・Version 4.019以降、"2023-8-20"に対して/(\d+-)?\d{1,2}-\d{1,2}/で検索
326 |     すると"23-8-20"にマッチしてしまうようになっていたバグを修正。
327 | 
328 | 20230819; version 4.025:
329 |   ・movzxを避けるため内部表現中のフラグ管理をbool型から整数型に変更。
330 |   ・オートマトン中でよく使う構造体のメンバ変数名を短いものに置換。
331 | 
332 | 20230817; version 4.024:
333 |   ・4.019以降出番のなくなっていた最適化処理用コードをコメントアウト。
334 |   ・細々とした改良と問題の修正。
335 | 
336 | 20230804; version 4.023:
337 |   ・内部で使う2種類の整数型を1種類に統合。
338 |   ・4.019で導入したentry state selectorにより効果が限定的となった最適化
339 |     処理を簡略に。
340 |   ・Entry state selectorの改良。
341 |   ・変数名の修正。
342 | 
343 | 20230730; version 4.022:
344 |   ・ソースコードの整理と細々とした問題の修正。
345 | 
346 | 20230727; version 4.021:
347 |   ・新たな内部状態を挿入すること無しに分岐の最適化が行えるよう改良。
348 | 
349 | 20230724; version 4.020:
350 |   ・Properties of stringsの内部表現への変換方法を簡略化。
351 |   ・その他細々とした改良。
352 | 
353 | ・[4.000～4.019, vフラグモード] 手元のソースファイルからリリース用ファイ
354 |   ルを作るための設定にミスがあり、version 4.000～4.019ではvモードが正し
355 |   く実装されていませんでした。
356 |   optimise_pos()函数の最後に次の行を書き足すとこれらの版でも正常に動作し
357 |   ます。
358 |     insert_btbranch(piece, ins_bt);
359 |   この函数がどこからも呼ばれていなかったのがバグの原因です。
360 | 
361 | 20230114; version 4.019:
362 |   ・新しいentry state selectorを実装。
363 | 
364 | 20230109; version 4.018:
365 |   ・4.016のオートマトン統合をキャンセル。パターンコンパイラ側に変更を加
366 |     え始めるとicase検索が著しく速度低下したため。
367 | 
368 | 20230107; version 4.017:
369 |   ・Version 4.006以降、bidirectional iteratorで検索しようとするとコンパ
370 |     イルエラーが発生するようになっていた問題を修正。
371 | 
372 | 20230106; version 4.016/3.018（＊のみ）:
373 |   ・4つのオートマタを2つに統合（i-modifier対応の下準備）。
374 |   ＊/a{0,0}/がエラー扱いになっていた問題を修正。
375 |   ＊その他細かい修正など。
376 | 
377 | 20221227; version 4.015:
378 |   ・VCで_ITERATOR_DEBUG_LEVELを1以上にすると、エラー扱いされる
379 |     regex_iterator2中のコードを修正。
380 |   ・その他改良など。
381 | 
382 | 20221220; version 4.014:
383 |   ・誤って前版で抜けていたmatch_resultsのメンバ函数を補充。
384 |   ・regex_token_iteratorの簡素化。
385 | 
386 | 20221220; version 4.013:
387 |   ・"abc"を/$/でsplit()すると、{"abc"}となるべきところが{"abc", ""}にな
388 |     ってしまっていた問題を修正。
389 |   ・replace()のoverload函数の数を減らし、ラムダ使用時は常にコールバック
390 |     函数で受け取りたいmatch_resultsの型をテンプレート実引数で明示するよ
391 |     うに。
392 |   ・regex_iterator2を追加。
393 | 
394 | 20221216; version 4.012:
395 |   ・コンパイラによってreplace()のコンパイルに失敗する問題を修正。
396 | 
397 | 20221214; version 4.011/3.017（＊のみ）:
398 |   ＊[LWG Issue 3204] sub_matchにswap()を追加。
399 |   ・replace()の仕様変更。std::basic_string風のコンテナ型なら何でも置換で
400 |     きるように。
401 |   ・srell::str_clipを追加。
402 |   ・split()にイテレータ、ポインタに対応するoverloadを追加。
403 | 
404 | 20221212; version 4.010:
405 |   ・split()の実装が説明文と合うように修正。文ではsub_matchがリストコンテ
406 |     ナにpushされるとなっているのに対して、コードではbasic_stringがpushさ
407 |     れていました。
408 |   ・sub_matchクラスのbasic_stringへの変換函数（キャスト及びstr()）に、カ
409 |     スタムtraits/allocator対応版を追加。
410 | 
411 | 20221210; version 4.009/3.016（＊のみ）:
412 |   ＊regex_iteratorのiterator (it) が0幅にマッチすると、次に++した時に
413 |     it->prefix().matchedがtrueにならなかった問題を修正。
414 |   ＊match_resultsのテンプレート引数にカスタムallocatorを渡すとコンパイル
415 |     できなかった問題を修正。
416 |   ・basic_regexに新しいメンバ函数（拡張API）を追加。
417 | 
418 | 20221130; version 4.008:
419 |   ・4.006で導入したfinderよりもBMHの優先度が上になるよう調整。
420 |   ・\b/\Bおよびmultilineモードにおける^, $の改良。
421 | 
422 | 20221124; version 4.007:
423 |   ・正規表現の最初でのみ使える埋込フラグ (?ims-ims) に対応（Python 3.11
424 |     と同様）。
425 |     註：この機能は独自拡張で、ECMAScriptの仕様にはありません。また
426 |     regexp-modifiers提案とも異なっています。この機能はSRELL_NO_UBMODを定
427 |     義することにより、無効にできます。
428 | 
429 | 20221123; version 4.006:
430 |   ・最初にマッチする文字が一種類である正規表現用のfinderを追加。
431 | 
432 | 20221030; version 4.005/3.015（＊のみ）:
433 |   ＊int型とlong型とでビット幅が異なる環境（LP64, 4/8/8等）で未定義動作と
434 |     なるコードを修正（報告してくださったTravers Ching氏に感謝します）。
435 |   ・unicode/ucfdataout2.cpp, updataout2.cppを更新。Unicodeデータファイル
436 |     (srell_ucfdata2.hpp, srell_updata2.hpp) なしでもコンパイルできるよう
437 |     に。
438 |   ・その他コードの整理など。
439 | 
440 | 20221022; version 4.004/3.014:
441 |   ・srell_ucfdata2.hppとsrell_updata2.hppとをUnicode 15.0.0対応に更新。
442 |   ・unicode/updataout2.cppをUnicode 15対応に更新（ECMAScript 2023で対応
443 |     される見込みのスクリプト名の先行対応）。
444 |   ・先の後方参照バグを直した結果、無意味になったコードを削除。
445 | 
446 | 20221012; version 4.003/3.013:
447 |   ・後方参照バグを再々修正。ちなみにこのバグは可変幅の戻り読みに対応した
448 |     ことに付随するものであるため、version 2.000以降のSRELL全版に存在しま
449 |     す。
450 |     （可変幅の戻り読みでは/(?<=\1\s+(\d+))/のように、パーザが捕獲括弧よ
451 |     りも先に後方参照に出合ってしまうことがあるため、対応する括弧がその正
452 |     規表現中に実在するのかすぐに判断できないことに由来しています）
453 | 
454 | 20221012; version 4.002/3.012:
455 |   ・前版の後方参照バグを違う方法で再修正。20221011の修正では/(?:\1+)*()/
456 |     のような表現に対応できていなかったため。同時に/()(?:\1+)*/のような表
457 |     現が無限ループに陥るのも修正。
458 | 
459 | 20221011; version 4.001/3.011（＊のみ）:
460 |   ＊/\1*()/や/(\1+)/のように、対応する捕獲括弧の閉じ括弧よりも先に出現す
461 |     る後方参照に*または+が付いているとnullポインタを参照してしまう、もし
462 |     くは無限ループに陥るバグを修正（バグを見つけてくださったsrellcomの作
463 |     者、@datadiode氏に感謝します）。
464 |   ・ECMAScriptの仕様に従い、[]内で'-'をエスケープせず書ける位置のチェッ
465 |     クを厳密に行うよう変更。定義済み文字クラス（\d, \s等）直後の'-'は、
466 |     それが文字クラス最後の文字でない限りはエラーに（[\s-\d]はエラー、
467 |     [\s-]はOK）。
468 |   ・UTF-8用内部iteratorの調整。
469 | 
470 | 20220618; version 4.000:
471 |   ・ECMAScriptに追加される見込みのvフラグモードに対応。
472 |   ・srell_updata.hppの仕様変更。srell_updata2.hppに。
473 |   ・上記変更に併せてunicode/updataout.cppを更新し、updataout2.cppに。
474 |   ・64ビット環境でclang-tidyが "excessive padding" と警告する問題に対応
475 |     するため構造体メンバの順番を変更（ご報告に感謝します）。
476 |   ・unicode/ucfdataout2.cppを更新。
477 | 
478 | 20220529; version 3.010:
479 |   ・\pや\Pを含む文字クラスのメモリ使用量を削減。
480 |   ・\pや\Pの{}内が不正の時にthrowされるエラーの種類を、
481 |     regex_constants::error_escapeから新設の
482 |     regex_constants::error_propertyに変更。
483 |   ・その他細々とした改良。
484 | 
485 | 20220511; version 3.009:
486 |   ・最適化バグにより /abcd|ab/ が "abc" にマッチしなかった問題を修正。
487 | 
488 | 20220504; version 3.008:
489 |   ・icase指定時の[^\P{...}]の振る舞いが、TC39で提案中のv-modeのそれに近
490 |     いものになっていた問題を修正。
491 | 
492 | 20220429; version 3.007:
493 |   ・カウンタの仕組みをさらに変更。
494 | 
495 | 20220428; version 3.006:
496 |   ・繰り返し処理用のカウンタを調整。
497 |   ・小さな文字クラス用の線形探索を再削除。
498 | 
499 | 20220424; version 3.005:
500 |   ・multiline指定時に /(?<=$.*)/ が "a" の終わりにマッチしなかった問題を
501 |     修正。
502 |   ・TC39で提案中の\A, \z, (?m:)の準備。
503 | 
504 | 20220420; version 3.004:
505 |   ・'*' または '+' 付きの文字クラスが後続する文字または文字クラスと排他
506 |     的になっていない表現用の最適化処理を追加。例：/[A-Za-z]+ing/,
507 |     /".*"/ など。
508 | 
509 | 20220416; version 3.003:
510 |   ・2つの最適化函数を1つに統合。
511 |   ・先読み (lookahead)・戻り読み (lookbehind) 用のコード量を削減。
512 | 
513 | 20220416; version 3.002:
514 |   ・3.000で導入した簡易entry state選択の使用時に、regex_matchや
515 |     match_continuousフラグが指定されたregex_searchが機能しない場合があっ
516 |     た問題を修正。
517 | 
518 | 20211025; version 3.001:
519 |   ・カウンタ分割を廃止。効果がないかむしろ若干速度が低下しているように見
520 |     えるため。
521 |   ・潜在的なバグを修正。
522 |   ・その他細かな改良など。
523 | 
524 | 20211023; version 3.000:
525 |   ・srell_ucfdata2.hppとsrell_updata.hppとをUnicode 14.0.0対応に更新。
526 |   ・unicode/updataout.cppをUnicode 14対応に更新（ECMAScript 2022で対応さ
527 |     れる見込みのスクリプト名の先行対応）。
528 |   ・char32_t未対応のコンパイラでUnicode値を保持するため内部で使用する型
529 |     を「21ビット以上あるunsigned整数型」から「32ビット以上あるunsigned整
530 |     数型」に変更。
531 |   ・char32_t未対応のコンパイラで繰り返し回数や文字クラス番号を保持するの
532 |     に使う型を「unsigned int」から「32ビット以上あるunsigned整数型」に変
533 |     更。
534 |   ・数値用パーザにoverflowチェックを追加。例：unsigned int型が32ビットの
535 |     幅の時、前の版まで /a{0,4294967297}/ は /a{0,1}/ 相当になってしまっ
536 |     ていましたが、前記のチェックを入れたことによりこのような場合には
537 |     error_braceがthrowされるようになっています。
538 |   ・非multilineモード時に /[^;]*^;?/ が入力文字列の先頭にマッチしなかっ
539 |     たバグを修正。
540 |   ・ごく簡易なentry state選択を実装。
541 | 
542 | 20211004; version 2.930:
543 |   ・WCHAR_MAXの値に基づいてUTF-16/UTF-32対応が切り替わるu1632w-型を新規
544 |     に追加（WCHAR_MAXが0xFFFF以上・0x10FFFF未満ならu1632w-型はu16w-型の
545 |     別名となり、WCHAR_MAXが0x10FFFF以上ならu1632w-型はu32w-型の別名とな
546 |     ります）。
547 |   ・Eytzinger layout検索時に使われるメモリ使用量を削減。
548 |   ・その他細かな改良など（いくつかはNIREに対するMarko Njezic氏の改善案に
549 |     基づきます）。
550 | 
551 | 20210624; version 2.920:
552 |   ・?（{0,1}相当）用の最適化処理を追加。
553 |   ・misc/sample01.cpp内で参照しているECMAScript仕様書の版を2021に変更。
554 | 
555 | 20210429; version 2.912:
556 |   ・2.900で導入した最適化処理のバグにより /aa|a|aa/ が "a" にマッチしな
557 |     くなっていた問題を修正（報告してくださったJan Schrötter氏に感謝しま
558 |     す）。
559 |     ちなみにこの最適化処理は、srell.hppをincludeする前に
560 |     SRELLDBG_NO_BRANCH_OPT2マクロを定義しておくと無効化できます。
561 | 
562 | 20210424; version 2.911:
563 |   ・2.900で導入した最適化処理内の不用意な行削除が原因で、/abc|ab|ac/ が
564 |     "ac" に対してマッチしなくなっていた問題を修正（バグ報告に感謝します）。
565 | 
566 | 20210407; version 2.910:
567 |   ・2.900以降、パターンコンパイラ内部でmove代入演算子が使われる時にメモ
568 |     リリークしていた問題を修正（報告してくださったMichal Švec氏に感謝し
569 |     ます）。
570 | 
571 | 20210214; version 2.901:
572 |   ・不要なテンプレートの特殊化を削除。
573 | 
574 | 20210214; version 2.900:
575 |   ・文字列のみからなる選択（例：/abc|abd|acde/）用の最適化処理を新規に追
576 |     加。
577 |   ・u(8|16)[cs]regex_(token_)?iteratorがコンパイルエラーとなり使用できな
578 |     かった問題を修正。
579 |   ・その他細かな改良など。
580 | 
581 | 20210131; version 2.810:
582 |   ・UTF-8用内部iteratorの改良。
583 | 
584 | 20200724; version 2.800:
585 |   ・文字クラスの二分探索にEytzinger layoutを導入。
586 |   ・小さな文字クラス用に線形探索を再実装。
587 |   ・名前付き括弧の名前部分をパーズするためのプロパティーデータの扱いを変
588 |     更。basic_regex型インスタンス内に読み込むのを止めて、必要な時のみ読
589 |     み込むように。
590 | 
591 | 20200714; version 2.730:
592 |   ・入れ子になった捕獲括弧で冗長な退避・復元処理をせぬように変更。
593 |   ・regex_iteratorの改良。
594 | 
595 | 20200703; version 2.720:
596 |   ・非ASCII文字を含むUTF-8文字列または非BMPの文字を含むUTF-16文字列を、
597 |     Boyer-Moore-Horspoolアルゴリズムを用いて、大文字小文字の区別無しで
598 |     (icase/case-insensitiveで) 検索する場合の処理の改良。
599 |   ・Version 2.650での変更により、regex_iterator->prefix().firstが前回マ
600 |     ッチした位置の終端ではなく文字列全体の最初を指すようになってしまっ
601 |     ていたのを修正。
602 |   ・上記修正に合わせて3イテレータ版のregex_search()が呼ばれる場合、
603 |     match_results.position()は戻り読みの逆行限界として渡された位置
604 |     （regex_searchの第3引数）を起点とした位置を返し、
605 |     match_results.prefix().firstは検索開始位置（同第1引数）を指すように
606 |     変更。
607 |   ・BMH検索時に、不正なUTF-8シークウェンスの前後にある有効なシークウェン
608 |     スが読み飛ばされてしまう問題を修正（2.630でUTF-8の処理方法を変えた時
609 |     に混入したバグ）。
610 | 
611 | 20200701; version 2.710:
612 |   ・Boyer-Moore-Horspool検索の調整。
613 | 
614 | 20200630; version 2.700:
615 |   ・最適化処理の調整。
616 | 
617 | 20200620; version 2.651:
618 |   ・グループ名のチェックを行う位置を\uエスケープの解釈後に移動。
619 |   ・misc/sample01.cppをversion 1.103に更新。参照しているECMAScript仕様書
620 |     の版を2020(ES11)に変更。
621 | 
622 | 20200618; version 2.650:
623 |   ・名前付き括弧に捕獲された文字列へのアクセス用函数に、グループ名をポイ
624 |     ンタで指定するoverloadをmatch_resultsに追加。
625 |   ・3イテレータ版のregex_search()使用時には、検索の開始位置ではなく戻り
626 |     読み (lookbehind) の逆行限界として渡された位置のほうを
627 |     match_results::prefix::firstにセットするよう変更。
628 |   ・不要と思われる処理をいくつか削除。
629 | 
630 | 20200601; version 2.643:
631 |   ・syntax_option_typeおよびmatch_flag_typeのoperator函数にinline指定を
632 |     追加（これがないとリンク時に多重定義エラーが出ることがあるとのご指摘
633 |     がありました）。
634 |   ・その他細かな改良など。
635 | 
636 | 20200530; version 2.642:
637 |   ・basic_regex型インスタンスが確保するメモリのサイズを削減。
638 | 
639 | 20200528; version 2.641:
640 |   ・2.640での修正1が不完全であったため再修正。
641 |   ・最適化処理の調整。
642 | 
643 | 20200516; version 2.640:
644 |   ・最適化バグの修正1: regex_matchが入力文字列の終端を通り過ぎてしまうこ
645 |     とがあった問題を修正。
646 |   ・最適化バグの修正2: multilineフラグ指定時に ^ や $ が適切な位置でのマ
647 |     ッチングをさせてもらえなくなってしまっていた問題を修正。
648 |   ・srell_ucfdata2.hppとsrell_updata.hppとを更新。
649 | 
650 | 20200509; version 2.630:
651 |   ・正規表現中に不正なUTF-8のシークウェンスがあった場合、パターンコンパ
652 |     イラがregex_utf8をthrowするように仕様変更（検索対象文字列中に不正な
653 |     UTF-8の並びがあってもエラー扱いされません）。
654 |   ・UTF-8でBMH検索が行われる際、マッチした箇所の直後に余分な後続
655 |     (trailing) バイトが続いていた場合にその部分もマッチング結果に含めて
656 |     しまう問題を修正。
657 |   ・basic_regex.flags() が正しい値を返さないことがあったのを修正。
658 |   ・正規表現中で実際には使われていないグループ名 (NAME) を
659 |     match_results.format()に渡す書式文字列の中で$<NAME>のようにして指定
660 |     すると、その部分が空文字に置換されずそのまま残ってしまう問題を修正。
661 | 
662 | 20200502; version 2.620:
663 |   ・Boyer-Moore-Horspoolアルゴリズム用クラスからmatch_continuous指定時用
664 |     およびregex_match用の函数を削除。これらの処理時は以前のようにオート
665 |     マトンを使うように変更。
666 |   ・その他クリーンナップ。
667 | 
668 | 20200428; version 2.611:
669 |   ・/\d*/ が "abc" の冒頭にマッチせず末尾にマッチする問題を修正（Version
670 |     2.210で混入したバグ）。
671 | 
672 | 20200426; version 2.610:
673 |   ・Case-insensitive (icase) なBMH検索が行われる際、探している文字列が検
674 |     索対象テキスト全体の先頭にあった場合に読み飛ばされてしまうことがある
675 |     バグを修正（UTF-8またはUTF-16で、検索文字列の末尾が複数のコードユニ
676 |     ットからなる文字である場合に発生）。
677 |   ・キャプチャグループ名のパーズをECMAScriptの仕様書通りきっちり行うよう
678 |     に変更。これにより、前の版までは受理されていた /(?<,>...)/ のような
679 |     グループ名はregex_errorがthrowされるように。
680 | 
681 | 20200418; version 2.600:
682 |   ・戻り読み (lookbehind) の逆行限界を直接regex_search()に渡せるように
683 |     3イテレータ版のregex_search()を追加。
684 |   ・[非互換変更] 2.300で導入したmatch_flag_typeのmatch_lblim_availフラグ
685 |     と、match_resultsのlookbehind_limitメンバとを廃止。
686 |   ・srell_ucfdata2.hppとsrell_updata.hppとをUnicode 13.0.0対応に更新。
687 |   ・unicode/updataout.cppをUnicode 13対応に更新（ECMAScript 2020で対応さ
688 |     れる見込みのスクリプト名の先行対応）。
689 | 
690 | 20191118; version 2.500:
691 |   ・初めてbasic_regex型インスタンスが作られた時にcase foldingデータから
692 |     icaseマッチング用テーブルを展開するのに代えて、最初から計算済みテー
693 |     ブルを保持しているように仕様変更。
694 |   ・上記変更に併せてsrell_ucfdata.hppおよびそれを出力するucfdataout.cpp
695 |     はお役御免とし、代わりに展開済みicase用テーブルを保持する
696 |     srell_ucfdata2.hppとそれを出力するucfdataout2.cppとを追加。
697 |   ・文字クラスの照合方法を線形探索から二分探索に変更。
698 |   ・文字クラスの最適化処理のタイミングを「']' が見つかった時にまとめて一
699 |     括」から「文字または文字コードの範囲をpushするたびごと逐次」に変更。
700 |   ・assertをすべて削除。
701 |   ・連続する\uHHHHがサロゲートペアをなしている場合はUnicode値として解釈
702 |     するように変更（これによりECMAScript仕様との相違はなくなりました）。
703 |   ・SRELL_NO_NAMEDCAPTUREマクロ使用時にコンパイルエラーが出ていたのを修
704 |     正。
705 |   ・updataout.cppを1.101にヴァージョンアップ。
706 |   ・単体版のsrellを追加（single-headerディレクトリ内）。
707 | 
708 | 20190914; version 2.401:
709 |   ・basic_regex型インスタンスのサイズを削減（Unicode property escapes対
710 |     応時にうっかり膨張させてしまっていました）。
711 |   ・basic_regex::swap()の改良。
712 | 
713 | 20190907; version 2.400:
714 |   ・文字クラスの照合速度を改善。
715 |   ・パターンコンパイル時にグループ名中の\uエスケープを解釈するように変更
716 |     （ECMAScriptの仕様に準拠）。
717 |   ・ucfdataout.cppを1.200にヴァージョンアップ。このプログラムが出力する
718 |     srell_ucfdata.hpp中のunicode_casefoldingクラスに、新たにメンバ変数が
719 |     追加されました。
720 |     SRELL 2.400以降はこの追加されたメンバ変数をコンパイル時に必要とする
721 |     ため、ucfdataout.cpp 1.101以前によって出力されたsrell_ucfdata.hppを
722 |     SRELL 2.400以降で使うことはできません（古いSRELLで新しい
723 |     srell_ucfdata.hppを使うことは可）。
724 |   ・その他コードの整理や改良など。
725 | 
726 | 20190902; version 2.304:
727 |   ・Version 2.303のコード整理で壊れてしまっていたregex_iteratorを修復。
728 | 
729 | 20190810; version 2.303:
730 |   ・2.302の修正が不完全であったため再修正。
731 |   ・その他コードの整理。
732 | 
733 | 20190809; version 2.302:
734 |   ・(?...) に繰り返し指定がついている時、内側の括弧によって捕獲された文
735 |     字列がループごとにクリアされず持ち越されていたバグを修正。
736 |     例：/(?:(ab)|(cd))+/.exec("abcd") → 1番括弧はundefinedになるはずが
737 |     "ab"になってしまっていた。
738 |   ・misc/sample01.cppをversion 1.102に更新。テスト名中の章番号を
739 |     ECMAScript 2019 (ES10) 準拠に変更。
740 | 
741 | 20190724; version 2.301:
742 |   ・ECMAScriptの仕様に準じて、\でエスケープ可能な文字の種類を次の15字に
743 |     限定。^$\.*+?()[]{}|/
744 |     文字クラス内（[]内）ではこの15字に加えて '-' も対象に。
745 | 
746 | 20190717; version 2.300:
747 |   ・検索対象範囲とは別に、戻り読み (lookbehind) の逆行限界を指定できる機
748 |     能を追加（match_flag_typeへのmatch_lblim_availフラグの追加と
749 |     match_resultsへのlookbehind_limitメンバの追加）。
750 |     これに併せてregex_iteratorのコンストラクタ内でも、内部で使うprivate
751 |     なmatch_results型インスタンスのlookbehind_limitメンバに値を設定する
752 |     ように変更。
753 |   ・ECMAScriptの仕様に合わせて、後方参照が対応する捕獲括弧より先に出現し
754 |     てもエラー扱いせぬように変更。/\1(.)/, /(?<=(.)\1)/, /\k<a>(?<a>.)/
755 |     などすべてOKに。
756 |   ・misc/sample01.cppをversion 1.101に更新。misc.jsより準拠テストを1つ追
757 |     加。
758 | 
759 | 20190714; version 2.230:
760 |   ・正規表現が '*' か '+' かを伴う文字または文字クラスで始まる場合の検索
761 |     速度を改善（例：/[A-Za-z]+ing/）。
762 | 
763 | 20190707; version 2.221:
764 |   ・std::u8stringの利用可否は__cpp_char8_tではなく__cpp_lib_char8_tを用
765 |     いて判断するように変更。
766 |   ・icase指定時にcase-folding処理をした結果、文字クラス内の文字がすべて
767 |     同じ文字になった場合には、文字クラスを解消して文字リテラルとして処理
768 |     するように変更。例：/r[Ss\u017F]t/i → /rst/i。
769 |   ・その他問題を修正。
770 | 
771 | 20190617; version 2.220:
772 |   ・カウンタを使わぬほうが内部表現がコンパクトになる繰り返しはカウンタを
773 |     使わぬように変更。
774 |   ・最適化バグにより、/a{1,2}?b/.exec("aab") が "aab" ではなく "ab" を返
775 |     していたのを修正（発生条件：最短一致優先の回数指定が付いている文字ま
776 |     たは文字クラスの後ろに、その文字集合と排他的な文字または文字クラスが
777 |     続いている場合）。
778 | 
779 | 20190613; version 2.210:
780 |   ・/ab|cd|ef/ のような表現（'|' で区切られている文字列の先頭文字が互い
781 |     に排他的な場合）の照合方法を改良。
782 | 
783 | 20190603; version 2.202:
784 |   ・BMHアルゴリズムが使われる状況で、regex_matchがregex_search相当の処理
785 |     をしてしまうバグを修正。
786 | 
787 | 20190531; version 2.200:
788 |   ・通常の（正規表現ではない）テキスト検索用に、Boyer-Moore-Horspoolアル
789 |     ゴリズムに基づく実装を追加。
790 |   ・UTF-8用iteratorの改良。
791 |   ・icase指定時の\b/\Bの挙動を修正。/.\B./i が "s\u017F" にマッチするよ
792 |     うに。
793 |   ・その他問題を修正。
794 | 
795 | 20190508; version 2.100:
796 |   ・Lookbehind中に文字列のキャプチャがあり、かつその中および左方に可変長
797 |     の正規表現があった場合、文字列の捕獲に失敗することがあったのを修正。
798 |     例："1053" =~ /(?<=(\d+)(\d+))$/ で$2に適切な文字列がセットされず。
799 |   ・srell_ucfdata.hppとsrell_updata.hppとをUnicode 12.1.0対応に更新。
800 |   ・unicode/updataout.cppをUnicode 12対応に更新（ECMAScript 2020で対応さ
801 |     れる見込みのスクリプト名の先行対応）。
802 |   ・srell.hpp中の改行コードをCR+LFからLFに変更。
803 |   ・unicode/*.cppが出力するファイルの改行コードをCR+LFからLFに変更。
804 |   ・misc/sample01.cppをversion 1.010に更新。
805 |     1. テスト名中の章番号をECMAScript 2018 (ES9) 準拠に変更（前版までは
806 |        ECMAScript 5.1までの章番号準拠でした）。
807 |     2. ECMAScript 2018規格の2.2.2.3 NOTEから準拠テストを1つ追加。
808 |   ・C++11の機能の使用可否を判定するマクロを変更。
809 |   ・文字クラスの処理方法を変更。
810 |   ・basic_regexの全コンストラクタと全assign函数とでflag_typeのdefault引
811 |     数を指定できるように、syntax_option_typeとmatch_flag_typeとを再実装
812 |     （TR1→C++11間の変更の見落とし）。
813 |   ・char8_t型に試験対応。コンパイラがchar8_tに対応している場合
814 |     （__cpp_char8_tマクロ定義の有無で判断）、"u8-"というprefixの付いた
815 |     クラスは「char8_t型文字列を受け取り、それをUTF-8として扱う」ように。
816 |     char8_tに未対応の場合は従来通り、char型文字列をUTF-8として処理。
817 |   ・常に「char型文字列をUTF-8として扱う」クラスとして新規に"u8c-"という
818 |     prefixの付いたクラスを追加。2.002までの"u8-"付きクラス相当。
819 |     ・u8cregex; u8ccmatch, u8csmatch; u8ccsub_match, u8cssub_match;
820 |       u8ccregex_iterator, u8csregex_iterator; u8ccregex_token_iterator,
821 |       u8csregex_token_iterator.
822 | 
823 | 20180717; version 2.002:
824 |   ・ECMAScriptの仕様に合わせて \u{h...} の h... 部分の最大桁数を6から無
825 |     制限に変更（変更前の1～6桁というのは提案書に基づく実装でした）。
826 |   ・updataout.cppを1.001に更新。新規に追加されたスクリプト名をエラー扱い
827 |     せぬように修整。
828 |   ・srell_ucfdata.hppとsrell_updata.hppとをUnicode 11.0.0対応に更新。
829 | 
830 | 20180204; version 2.001:
831 |   ・icase指定時に、[\W]（\Wを含む文字class）が [KkSs\u017F\u212A] のいず
832 |     れにもマッチせぬよう変更（関連：ecma262 issue #512）。
833 | 
834 | 20180127; version 2.000:
835 |   ・ECMAScript 2018のRegExpに追加されることになった次の機能を実装:
836 |     ・'.' があらゆるコードポイントにマッチするようにするための指定
837 |       "dotall" フラグを、srell::regex_constants内の syntax_option_type
838 |       および srell::basic_regex内の flag_type に追加。
839 |     ・Unicode property用の表現、\p{...} と \P{...} とを追加。
840 |     ・名前付きキャプチャ (?<NAME>...) と、名前付きキャプチャによって捕獲
841 |       された文字列を後方参照するための正規表現、\k<NAME> とを追加。
842 |   ・戻り読み (lookbehind) の振る舞いを変更。(?<=...), (?<!...) とも可変
843 |     幅の戻り読みに対応。
844 | 
845 | 20180125; version 1.401:
846 |   ・ECMAScriptの仕様に合わせて、match_results.format()内で後方参照として
847 |     認識される数値を99までに制限（即ち$1～$9および$01～$99のみ有効）。
848 |   ・長い間メンテナンスしていないマクロを削除。
849 | 
850 | 20180101; version 1.400:
851 |   ・/(?:)*/ のように、空のnon-capturingグループにも量指定子を付けられる
852 |     ように変更（ECMAScriptのRegExpとの互換性確保のための変更で、使い道は
853 |     おそらくありません）。
854 |   ・次の3条件が揃った時に固まってしまったのを修正: 1) non-capturingグル
855 |     ープに量指定子が付いていて、2) そのグループ自身が0幅になり得て、3)
856 |     そのグループ内の最後以外の場所に、0幅になり得る後方参照が現れる時。
857 |     たとえば /(.*)(?:\1.*)*/ のような表現。
858 | 
859 | 20171216; version 1.300:
860 |   ・最適化処理のバグにより、/^(;[^;]*)*$/ が ";;;;" にマッチしなかった問
861 |     題を修正。この問題の発生条件は次の通り:
862 |     ・/(A...B*)*$/ のような終わり方をしていて、かつAとBとが互いに排他的
863 |       な文字または文字集合である場合。
864 | 
865 | 20170621; version 1.200:
866 |   ・srell_ucfdata.hppをUnicode 10.0.0対応に。
867 |   ・不正なUTF-8 sequenceに対するu8regex_traitsの振る舞いを改善。
868 | 
869 | 20150618; version 1.141:
870 |     srell_ucfdata.hppをUnicode 8.0.0対応に。
871 | 
872 | 20150517; version 1.140:
873 |   ・regex_match()がマッチの成否を判定する方法の変更。
874 |     （C++ Standard Library Issues List #2273 への対応）
875 |   ・ECMAScriptの仕様に合わせて \cX の X の範囲を [A-Za-z] に制限。
876 |   ・look-around assertions中の丸括弧が、ある条件下で正しく文字列をキャプ
877 |     チャせぬ場合があった問題を修正。Version 1.111での修正が不完全であっ
878 |     たことによるもの。
879 | 
880 | 20150503; version 1.130:
881 |   ・case-folding用函数の改善。
882 |   ・unicode/ucfdataout.cppをversion 1.100に。
883 |   ・u(16|32)[cs]match用の#if directives中にあったtypoを修正。
884 | 
885 | 20150425; version 1.120:
886 |   ・UTF-8文字列においてU+010000-U+10FFFFの範囲の文字（4オクテット長の文
887 |     字）が認識されぬバグを修正。
888 |   ・misc/sample01.cppをversion 1.010に。
889 | 
890 | 20150402; version 1.111:
891 |   ・最適化処理のバグにより、"aaa" =~ /((.*)*)/ の $2 が "aaa" ではなく空
892 |     になってしまう問題を修正。
893 | 
894 | 20141101; version 1.110:
895 |   ・バグ報告による修正：
896 |       1. basic_regex::assign() 内の compile() に "this->" を追加。
897 |       2. operator=() 函数を明示的に実装。
898 |   ・unicode/ucfdataout.cppをversion 1.001 に。
899 | 
900 | 20140622; version 1.101:
901 |     srell_ucfdata.hppをUnicode 7.0.0対応に。
902 | 
903 | 20121118; version 1.100:
904 |     最初のリリース版。
905 | 
906 | 


--------------------------------------------------------------------------------
/history_en.txt:
--------------------------------------------------------------------------------
   1 | 20251010; version 4.100:
   2 |   * Now SRELL requires that the bit widths of int and size_t be at least
   3 |     32.
   4 |   * Based on the additional requirement above, simplified the code in
   5 |     some places.
   6 |   * Modified overloads of constructors, assign(), and operator=() of
   7 |     basic_regex for supporting string_view etc.
   8 |   * Other minor improvements:
   9 |     1. Switch to CPU cache friendly Boyer-Moore-Horspool data.
  10 |     2. Pre-C++11 compilers had been unable to compile
  11 |        regex_token_iterator since version 4.065 owing to lack of data()
  12 |        in std::vector.
  13 |     3. Coexistence of SIMD info and info for non-contiguous iterators.
  14 | 
  15 | 20250928; version 4.090:
  16 |   * Improved memory efficiency in lookup for Unicode property names and
  17 |     values.
  18 |   * Removed the code intended for the no bad_alloc mode
  19 |     (SRELL_NO_THROW 2), as supporting it was postponed indefinitely.
  20 | 
  21 | 20250920; version 4.080:
  22 |   * Implemented the nosubs option.
  23 | 
  24 | 20250914; version 4.070:
  25 |   * Added v and y to unbounded flag modifiers.
  26 | 
  27 | 20250910; version 4.069:
  28 |   * Updated ucfdata2.h and updata3.h to support Unicode 17.0.0.
  29 |   * Modified what() of regex_error to return an error name instead of
  30 |     "regex_error".
  31 |   * Added overloads to match() and search() of basic_regex for
  32 |     supporting string_view etc.
  33 |   * Updated misc/conftest.cpp:
  34 |     1. for GCC 13 and later not to generate a warning when the -Wall
  35 |        option is specified,
  36 |     2. and for Clang 19 and later to exclude tests with unsupported
  37 |        character types among char(8|16|32)_t instead of using a
  38 |        substituted type, because the base template implementation for
  39 |        std::char_traits has been removed.
  40 |   * Updated unicode/updataout3.cpp.
  41 | 
  42 | 20250814; version 4.068:
  43 |   * Updated unicode/updataout3.cpp:
  44 |     1. This file had been unable to be compiled since version 4.065
  45 |        because of the name change of an internal function in SRELL.
  46 |     2. Added a new option to decide whether string literals are escaped
  47 |        or not at runtime, instead of compile-time.
  48 | 
  49 | 20250608; version 4.067:
  50 |   * Adjusted internal memory handling.
  51 |   * Various minor improvements.
  52 | 
  53 | 20250518; version 4.066:
  54 |   * Fixed three minor issues being inconsistent with the ECMAScript
  55 |     specification:
  56 |     1. In the result of searching /()*/ against "", $1 was "" instead of
  57 |        being unmatched.
  58 |     2. No error was thrown even if \0 was followed by any digit.
  59 |     3. No error was thrown even if the regex sequence ended with "\c".
  60 |   * Added the "quiet" flag to syntax_option_type.
  61 |   * Added the "vmode" flag as an alias to "unicodesets" to
  62 |     syntax_option_type.
  63 |   * Combined eight points of throwing an exception into one in the
  64 |     matching algorithm function.
  65 |   * Fixed several wrong expected results in conftest.
  66 |   * Various minor fixes.
  67 | 
  68 | 20250420; version 4.065:
  69 |   * Improved internal memory handling.
  70 |   * Added ecode() to regex_token_iterator for SRELL_NO_THROW.
  71 |   * Several improvements on the matching algorithm function.
  72 |   * Improvements on match_results::format().
  73 |   * Replaced reinterpret_cast that had been used inappropriately with
  74 |     more proper code.
  75 |   * Improved the pattern compiler so that SRELL_STRICT_IMPL and
  76 |     SRELL_NO_THROW can be defined together.
  77 |   * Fixed the pattern compiler to throw error_backref when a
  78 |     backreference number greater than the total number of capturing
  79 |     bracket pairs in the regular expression is used, instead of ignoring
  80 |     it silently.
  81 |   * Fixed the bug in the pattern compiler which caused a flag
  82 |     modification in an outer group not to be inherited to its inner
  83 |     groups when the inner ones have their own modifiers. E.g.,
  84 |     (?i:a(?s:a)) had not matched "AA".
  85 |   * Added a new overload to regex_search that searches in
  86 |     std::basic_string, in which the start position can be specified.
  87 |   * Added the overload same as above also to basic_regex.search().
  88 |   * Added considerations for extreme cases in pattern compiling:
  89 |     * If any total number of capturing groups, repetitions that need an
  90 |       internal counter, or 0-width match checkers reaches 0xFFFFFFFF,
  91 |       now error_complexity is thrown.
  92 |     * When parsing nested groups and/or character classes (only in the
  93 |       v-flags mode), if the total depth of their levels exceeds
  94 |       SRELL_MAX_DEPTH (the default value is 256), error_complexity is
  95 |       thrown to avoid a stack overflow.
  96 |   * Fixed a problem that caused a compilation error when SRELL_NO_ICASE
  97 |     was defined.
  98 |   * Removed one of API extensions, which takes three iterators and does
  99 |     not take match_results as parameters, from overloads of
 100 |     regex_search().
 101 |   * Preparations for SRELL_NO_THROW level 2.
 102 | 
 103 | 20250214; version 4.064:
 104 |   * Fixed a bug that caused memory corruption when u32regex's pattern
 105 |     compiler is used with SIMD support being enabled.
 106 |   * Restored some of API extensions to basic_regex, match() and search()
 107 |     member functions that had been removed mistakenly in version 4.057.
 108 |     Only overloads that take match_results as a parameter are restored;
 109 |     ones that do not take it are dropped officially.
 110 |   * Other minor modifications.
 111 | 
 112 | 20241208; version 4.063:
 113 |   * Adjusted the pattern compiler to consider surrogate pairs when
 114 |     generating SIMD data for UTF-16.
 115 | 
 116 | 20241208; version 4.062:
 117 |   * Simplified debugging macro checks.
 118 |   * Minor improvements and cleanup.
 119 | 
 120 | 20241204; version 4.061:
 121 |   * Introduced a simple SIMD acceleration (x86/x64 only).
 122 | 
 123 | 20241101; version 4.060:
 124 |   * Corrected several offset values that had been off by 1 in
 125 |     srell_updata3.h since the format change of data tables in 4.030.
 126 |     Because of this problem, the pattern compiler had failed to look up
 127 |     a Unicode property value being last in code point order, e.g.,
 128 |     \p{sc=Zzzz}, \p{space}. (Thanks to Eugene Levelev for the bug
 129 |     report).
 130 |   * Updated unicode/updataout3.cpp to output a data file in which the
 131 |     problem above is fixed.
 132 |   * Cancelled the code change in 4.050 that had caused a problem which
 133 |     had been fixed in 4.059, in favour of code simplification.
 134 | 
 135 | 20241016; version 4.059:
 136 |   * Fixed a problem that caused match_results::length(), position(), and
 137 |     str() not to compile when an argument was omitted since version
 138 |     4.050 (Thanks to Winfried Schenke for the bug report).
 139 | 
 140 | 20241004; version 4.058:
 141 |   * Now modifiers (?ims-ims:) are enabled by default.
 142 | 
 143 | 20240922; version 4.057:
 144 |   * Fixed an issue that caused the pattern compiler not to return an
 145 |     error even when an invalid Unicode property name or value is
 146 |     specified in \p{}/\P{} since version 4.054.
 147 |   * Removed replace() and split() from basic_regex.
 148 |   * Removed SRELL_CPP* macros.
 149 |   * Updated misc/conftest.cpp. The wrong mask value bug fix (same fix in
 150 |     srell.hpp 4.052) and changes for the removal of SRELL_CPP* macros.
 151 |   * Updated unicode/updataout3.cpp.
 152 | 
 153 | 20240911; version 4.056:
 154 |   * Updated ucfdata2.h and updata3.h to support Unicode 16.0.0.
 155 |   * Other minor modifications.
 156 | 
 157 | 20240904; version 4.055:
 158 |   * Adjusted the prefilter for UTF-8/UTF-16 so that search against a
 159 |     string containing not a few characters encoded in multiple code
 160 |     units (0080..10FFFF in UTF-8, 10000..10FFFF in UTF-16) would not be
 161 |     slowed down to excess.
 162 |   * Lowered the default value of limit_counter from 1 << 24 to 1 << 21.
 163 |   * Removed all member functions from regex_traits as unused.
 164 | 
 165 | 20240831; version 4.054:
 166 |   * Code size reduction. Unified parsers that had been separate for
 167 |     u-mode and v-mode.
 168 | 
 169 | 20240824; version 4.053:
 170 |   * Simplified the creation of the predefined character classes.
 171 |   * Improved internal UTF-8 iterators. Reduced the number of conditional
 172 |     jumps.
 173 | 
 174 | 20240818; version 4.052:
 175 |   * Fixed the wrong mask value in utf16_traits.
 176 |   * Minor improvements.
 177 | 
 178 | 20240816; version 4.051:
 179 |   * Reimplemented the optimisation introduced in 4.050 in a different
 180 |     way to minimise memory usage.
 181 |   * Minor improvements.
 182 | 
 183 | 20240810; version 4.050:
 184 |   * Added a new optimisation for C{n,m} where C is a character or
 185 |     character class and n < m != infinity.
 186 |   * Minor improvements.
 187 | 
 188 | 20240720; version 4.049:
 189 |   * Added the new flag, "sticky" to syntax_option_type.
 190 |   * Added split_aptrange() to regex_iterator2.
 191 |   * Removed the feature of generating a data file in an old format from
 192 |     ucfdataout2.cpp and updataout3.cpp.
 193 | 
 194 | 20240714; version 4.048:
 195 |   * Removed two types of internal iterators, which read a codepoint
 196 |     value at the current position or the previous position with keeping
 197 |     its pointing position.
 198 |   * Modified internal UTF-8 iterators not to accept non-shortest forms.
 199 | 
 200 | 20240707; version 4.047:
 201 |   * Performance improvement in searching with srell::regex (only if
 202 |     CHAR_BIT is 8), srell::u8cregex, or srell::u8regex.
 203 | 
 204 | 20240613; version 4.046:
 205 |   * Code size reduction. SRELL no longer gives priority to finding a
 206 |     literal sequence.
 207 |   * Various minor improvements and fixes.
 208 | 
 209 | 20240608; version 4.045:
 210 |   * Implemented the regex modifiers feature. But until the proposal is
 211 |     merged into the draft specification of ECMAScript, this feature is
 212 |     disabled and available only when SRELL_ENABLE_MODIFIERS is defined.
 213 |   * Added a missing check to see whether a backreference number exceeds
 214 |     the max number of capturing groups or not, which should have been
 215 |     added with the modification for the duplicate named capturing groups
 216 |     support in version 4.043.
 217 | 
 218 | 20240602; version 4.044:
 219 |   * Added several missing #if ~ #endif directives for
 220 |     SRELL_NO_NAMEDCAPTURE.
 221 |   * Retired the older state insertion function in favour of the newer
 222 |     one.
 223 | 
 224 | 20240526; version 4.043:
 225 |   * Implemented the duplicate named capturing groups feature.
 226 | 
 227 | 20240524; version 4.042:
 228 |   * Expanded the scope of the optimisation for * and + also to support
 229 |     C{n,} where C is a character or character class and n >= 2.
 230 |   * Introduced the unified stack, which is used when either of the
 231 |     following conditions is met:
 232 |     1) The iterator passed to the matching function is a pointer, or
 233 |     2) std::is_trivially_copyable is supported by the compiler and for
 234 |        the type I of the passed iterator,
 235 |        std::is_trivially_copyable<I>::value is true.
 236 |     Otherwise separate stacks that have been present from early versions
 237 |     are used.
 238 | 
 239 | 20240519; version 4.041:
 240 |   * Completed the temporary fix in 4.040.
 241 |   * Removed unused functions.
 242 |   * Fixed a potential issue on systems where memory of more than 64 GB
 243 |     can be allocated.
 244 | 
 245 | 20240131; version 4.040:
 246 |   * Restored one more line for ?? (non-greedy {0,1}) not to cause an
 247 |     optimisation bug.
 248 | 
 249 | 20240127; version 4.039:
 250 |   * Restored some code that had been removed mistakenly in 4.037.
 251 | 
 252 | 20240124; version 4.038:
 253 |   * Fixed a bug that caused /(?:ab)+|cd/ to match "ababcd".
 254 |     Condition: Both sides of | begin with different characters, and the
 255 |     left side character is contained in (?:)+.
 256 |   * Minor improvements.
 257 | 
 258 | 20240122; version 4.037:
 259 |   * Fixed an optimisation bug that caused /(?:a|ab|abc)$/ to match "ac"
 260 |     since version 4.021.
 261 |     Condition: (?:A|B|C) where A is a prefix of B, and B is a prefix of
 262 |     C.
 263 |     -> A path from the end of A to a suffix of C occurred through the
 264 |        wrong optimisation. This path was usually hidden, but could be
 265 |        used when backtracking is performed.
 266 |   * Other various improvements and fixes.
 267 | 
 268 | 20240114; version 4.036:
 269 |   * Improvement and bugfix of lookaround (lookahead and lookbehind):
 270 |     1. Removed unnecessary stack operations.
 271 |     2. Restored the state type that had been removed in version 3.003 so
 272 |        that the value of the first capturing group in /(?:(?=(\w))|b)c$/
 273 |        against "abc" will be undefined, not "b".
 274 |        Condition: 1. A lookaround assertion contains a capturing group,
 275 |        2. After the lookaround assertion is successful, matching with
 276 |        succeeding expressions fails, 3. Another subpattern separated by
 277 |        '|' is tried and a match with total expressions is found.
 278 |        -> A subsequence captured by the group in the lookaround remained
 279 |           without being reverted to "undefined".
 280 |   * Replaced misc/sample01.cpp with conftest.cpp.
 281 |   * Tagged each kind of epsilon.
 282 | 
 283 | 20231229; version 4.035:
 284 |   * Improved case folding of character classes. (Compilation of \p{Any}
 285 |     was a bit slow when the icase flag was set).
 286 |   * Several preparations for (?i:) support.
 287 |   * Updated updataout3.cpp. It could not be compiled because an internal
 288 |     namespace was changed in the previous version.
 289 | 
 290 | 20231209; version 4.034:
 291 |   * Modified to use std::contiguous_iterator when it is available, to
 292 |     check if the iterator passed to the matching function is a
 293 |     contiguous_iterator.
 294 |   * Modified match_results::operator[]() not to throw error_backref when
 295 |     a group name not existing in the regular expression is passed as
 296 |     an argument, but to return a reference to a sub_match object
 297 |     representing an unmatched sub-expression.
 298 |     In accordance with this change, now
 299 |     match_results::operator[](size_type n) also returns the same object
 300 |     when n >= size(). (Behaviour accordant to std::regex. Until the
 301 |     previous version, this object was returned only when
 302 |     SRELL_STRICT_IMPL is defined).
 303 |   * Implemented the no throw/exception mode.
 304 |   * For the no throw mode, added basic_regex::ecode() that returns
 305 |     error_type that should have been thrown during the previous pattern
 306 |     compilation.
 307 |   * For the no throw mode, added match_results::ecode() that returns
 308 |     error_type that should have been thrown during the previous pattern
 309 |     matching/searching.
 310 | 
 311 | 20230926; version 4.033:
 312 |   * Fixed a bug that could cause a crash on 64-bit systems since version
 313 |     4.020 (Thanks to Yuriy Skvortsov for the bug report).
 314 |     Condition: 3 or more Alternatives begin with the same character such
 315 |     as /ab|ac|ad/.
 316 |   * Removed an unused member function from utf_traits.
 317 |   * Some clean-ups.
 318 | 
 319 | 20230916; version 4.032:
 320 |   * Added directives for decoders of UTF-8/UTF-16 to be always inlined.
 321 |   * Improved several internal functions that call the automaton.
 322 | 
 323 | 20230913; version 4.031:
 324 |   * Updated ucfdata2.h and updata3.h to support Unicode 15.1.0.
 325 |   * Updated updataout3.cpp so that the "Unknown" value can be used as a
 326 |     value for Script/Script_Extensions of Unicode property escapes.
 327 |     Although this value is mentioned in Scripts.txt, SRELL did not
 328 |     support it because it was not included in the table in the
 329 |     ECMAScript specification which shows what script names must be
 330 |     supported. However, as the table was removed from the specification
 331 |     and the rationale to exclude it has disappeared, SRELL has begun to
 332 |     support this value, following V8.
 333 | 
 334 | 20230909; version 4.030:
 335 |   * Modified the pattern compiler not to create a rewinder only for ^,
 336 |     $, or \b/\B.
 337 |   * Introduced the binary search to look up names and values for Unicode
 338 |     properties.
 339 |   * In accordance with the change above, updated unicode/updataout2.cpp
 340 |     to updataout3.cpp.
 341 |     Furthermore, since the ECMAScript specification ceased to list the
 342 |     script names that must be supported, modified to read them from
 343 |     Scripts.txt and PropertyValueAliases.txt.
 344 |   * Changed the suffix of Unicode data files from *.hpp to *.h.
 345 |   * Updated unicode/ucfdataout2.cpp to follow the suffix change above.
 346 | 
 347 | 20230903; version 4.029:
 348 |   * Updated unicode/updataout2.cpp to fix an issue that caused
 349 |     compilation error because internal integer types were unified in
 350 |     version 4.023.
 351 |   * Recreated srell_updata2.hpp (Data for the two scripts that were
 352 |     newly added to Unicode 15 were missing from the previous version.
 353 |     Apparently, it was output by the previous version of
 354 |     updataout2.cpp).
 355 | 
 356 | 20230831; version 4.028:
 357 |   * Improved not to call the automaton for Unicode code point values
 358 |     that cannot be held by a single char/wchar_t when the regex or
 359 |     wregex type is used.
 360 | 
 361 | 20230821; version 4.027:
 362 |   * Fixed a bug that caused the first capturing group to be empty in the
 363 |     match result of /(?:(\d+-)?)+(\d{1,2})-(\d{1,2})/ against
 364 |     "2023-8-21" (This bug had been existing from an early version).
 365 |   * Fixed a bug that caused the entire match to be only "23-8-21" in the
 366 |     same match result (This bug was introduced in 4.019 and was not
 367 |     covered by the fix in 4.026).
 368 | 
 369 | 20230820; version 4.026:
 370 |   * Fixed a bug that caused a search for /(\d+-)?\d{1,2}-\d{1,2}/ in
 371 |     "2023-8-20" to match only "23-8-20" since version 4.019.
 372 | 
 373 | 20230819; version 4.025:
 374 |   * To avoid movzx, changed the type to hold a flag in the internal
 375 |     representation from bool to an integer type.
 376 |   * Replaced names of member variables in structs that are used
 377 |     frequently in the automaton with shorter names.
 378 | 
 379 | 20230817; version 4.024:
 380 |   * Commented out the code for an optimisation that had become unused
 381 |     since 4.019.
 382 |   * Various minor improvements and fixes.
 383 | 
 384 | 20230804; version 4.023:
 385 |   * Unified two internal integer types to one type.
 386 |   * Simplified several optimisations that had become less effective
 387 |     because of the entry state selector introduced in 4.019.
 388 |   * Improved the entry state selector.
 389 |   * Corrected misnamed variable names.
 390 | 
 391 | 20230730; version 4.022:
 392 |   * Refinement of source code and various minor fixes.
 393 | 
 394 | 20230727; version 4.021:
 395 |   * Improved the branch optimisation so that SRELL can optimise
 396 |     Alternatives without inserting new additional internal states.
 397 | 
 398 | 20230724; version 4.020:
 399 |   * Simplified the method of converting properties of strings to
 400 |     internal representations.
 401 |   * Other minor improvements.
 402 | 
 403 | * [4.000-4.019, v flag mode] Because there was a mistake in the setting
 404 |   for compiling my own source files to a release version, the v flag
 405 |   mode was not correctly implemented in SRELL 4.000-4.019. Adding the
 406 |   following line at the end of the optimise_pos() function fixes the
 407 |   problem:
 408 |     insert_btbranch(piece, ins_bt);
 409 |   The bug originated from the fact that this function was not called
 410 |   from anywhere.
 411 | 
 412 | 20230114; version 4.019:
 413 |   * Implemented a new entry state selector.
 414 | 
 415 | 20230109; version 4.018:
 416 |   * Cancelled the mergence of automata that was done in version 4.016,
 417 |     because once I began to modify the pattern compiler for i-modifier
 418 |     support, icase search performance degradation that had not surfaced
 419 |     in preliminary examinations appeared.
 420 | 
 421 | 20230107; version 4.017:
 422 |   * Fixed a bug that caused compilation to fail since version 4.006 when
 423 |     bidirectional iterators were passed to the matching function.
 424 | 
 425 | 20230106; version 4.016/3.018 (@ only):
 426 |   * Merged four automata into two (preparation for i-modifier support).
 427 |   @ Fixed the pattern compiler not to treat /a{0,0}/ as an error.
 428 |   @ Other minor fixes.
 429 | 
 430 | 20221227; version 4.015:
 431 |   * Fixed a minor issue in regex_iterator2 that was treated as an error
 432 |     by VC when _ITERATOR_DEBUG_LEVEL >= 1.
 433 |   * Other improvements.
 434 | 
 435 | 20221220; version 4.014:
 436 |   * Supplemented some member functions that were missing accidentally
 437 |     from match_results in the previous release.
 438 |   * Simplified regex_token_iterator.
 439 | 
 440 | 20221220; version 4.013:
 441 |   * Fixed a minor issue in split(). When splitting "abc" by /$/, split()
 442 |     had returned {"abc", ""} instead of {"abc"} that is correct.
 443 |   * Reduced the number of overload functions of replace(). Now when the
 444 |     lambda expression is used, the type of match_results that will be
 445 |     passed to a callback function needs specifying explicitly as the
 446 |     template argument.
 447 |   * Added regex_iterator2.
 448 | 
 449 | 20221216; version 4.012:
 450 |   * Fixed replace(). VC2005 could not compile it.
 451 | 
 452 | 20221214; version 4.011/3.017 (@ only):
 453 |   @ [LWG Issue 3204] Added swap() to sub_match.
 454 |   * Modified replace() so that it can replace any container type that
 455 |     looks like std::basic_string.
 456 |   * Added srell::str_clip.
 457 |   * Added overload functions to split() that support a pair of iterators
 458 |     and a pointer.
 459 | 
 460 | 20221212; version 4.010:
 461 |   * Adjusted the behaviour of split() in accordance with the document.
 462 |     While the document says sub_match is pushed to a list container,
 463 |     in the code basic_string was pushed to the list container.
 464 |   * Added overloads to the sub_match class that support implicit and
 465 |     explicit conversion to std::basic_string instantiated with a custom
 466 |     traits/allocator.
 467 | 
 468 | 20221210; version 4.009/3.016 (@ only):
 469 |   @ Fixed a problem so that regex_iterator.prefix().matched can be set
 470 |     to true in incrementing after it matched an empty sequence.
 471 |   @ Fixed the core of the matching functions not to cause a compilation
 472 |     error when an object of match_results instantiated with a custom
 473 |     allocator is provided.
 474 |   * Added new member functions to basic_regex as API extensions.
 475 | 
 476 | 20221130; version 4.008:
 477 |   * Priority was given back to the BMH matcher from the finder
 478 |     introduced in 4.006.
 479 |   * Minor improvements of ^ and $ in the multiline mode and \b, \B.
 480 | 
 481 | 20221124; version 4.007:
 482 |   * Added support for the unbounded flag modifiers ((?ims-ims)), which
 483 |     are available only at the beginning of a regular expression (the
 484 |     same as Python 3.11).
 485 |     Note: This feature is not defined in the ECMAScript specification
 486 |     nor compatible with the regexp-modifiers proposal. This feature can
 487 |     be disabled by defining SRELL_NO_UBMOD.
 488 | 
 489 | 20221123; version 4.006:
 490 |   * Added a new finder for expressions whose first matching character is
 491 |     a single character.
 492 | 
 493 | 20221030; version 4.005/3.015 (@ only):
 494 |   @ Fixed a problem that caused undefined behaviour in conditions where
 495 |     sizeof (int) != sizeof (long), e.g. LP64 (4/8/8). (Thanks to Travers
 496 |     Ching for the report).
 497 |   * Updated unicode/ucfdataout2.cpp and updataout2.cpp. Now they can
 498 |     compile even without srell_ucfdata2.hpp and srell_updata2.hpp.
 499 |   * Some clean-ups.
 500 | 
 501 | 20221022; version 4.004/3.014:
 502 |   * Updated srell_ucfdata2.hpp and srell_updata2.hpp to support Unicode
 503 |     15.0.0.
 504 |   * Updated unicode/updataout2.cpp to support Unicode 15. (Support in
 505 |     advance new script names that are expected to be available in RegExp
 506 |     of ECMAScript 2023).
 507 |   * Removed some code that had become unused or meaningless as a result
 508 |     of the previous backreference bug fixes.
 509 | 
 510 | 20221012; version 4.003/3.013:
 511 |   * Re-refixed the backreference bug. Incidentally, this bug was
 512 |     introduced along with the addition of the variable-length lookbehind
 513 |     feature. Therefore, SRELL versions 2.000- have this bug.
 514 |     (It originated from the fact that in a variable length lookbehind
 515 |     assertion, it is possible that the parser encounters a backreference
 516 |     prior to the corresponding bracket pair, such as /(?<=\1\s+(\d+))/,
 517 |     and the parser cannot know immediately whether the corresponding
 518 |     capturing bracket pair really exists in the expression).
 519 | 
 520 | 20221012; version 4.002/3.012:
 521 |   * Refixed the backreference bug in a different way because the fix of
 522 |     20221011 did not cover the problem caused by such an expression as
 523 |     /(?:\1+)*()/. Fixed also an infinite loop caused by an expression
 524 |     like /()(?:\1+)*/.
 525 | 
 526 | 20221011; version 4.001/3.011 (@ only):
 527 |   @ Fixed a bug that caused dereferencing a null pointer or infinite
 528 |     loop when a backreference is followed by * or +, and the
 529 |     backreference appears prior to the close bracket of the
 530 |     corresponding pair of capturing brackets, such as /\1*()/, /(\1+)/.
 531 |     (Thanks to @datadiode, the author of srellcom, for finding the bug).
 532 |   * In accordance with the ECMAScript specification, restricted
 533 |     positions where '-' can be written without escaping in character
 534 |     classes. Now '-' following a predefined character class such as \d,
 535 |     \s causes an error, unless it is the final character in a character
 536 |     class. ([\s-0] causes an error, [\s-] is accepted).
 537 |   * Adjusted internal UTF-8 iterators.
 538 | 
 539 | 20220618; version 4.000:
 540 |   * Added support for the v-flag mode that is expected to be added to
 541 |     a future version of ECMAScript.
 542 |   * Changed the format of srell_updata.hpp and renamed to
 543 |     srell_updata2.hpp.
 544 |   * In accordance with the change above, unicode/updataout.cpp was
 545 |     updated and renamed to updataout2.cpp.
 546 |   * Fixed an issue of a struct layout that clang-tidy warns as
 547 |     "excessive padding" on 64-bit systems (Thanks for the report).
 548 |   * Updated unicode/ucfdataout2.cpp.
 549 | 
 550 | 20220529; version 3.010:
 551 |   * Reduced the amount of memory used to hold a character class that
 552 |     contains Unicode property escapes.
 553 |   * Changed the value of error_type thrown when an invalid name or value
 554 |     is specified in curly brackets of \p or \P, from
 555 |     regex_constants::error_escape to newly-introduced
 556 |     regex_constants::error_property.
 557 |   * Other minor improvements.
 558 | 
 559 | 20220511; version 3.009:
 560 |   * Fixed an optimisation bug that caused /abcd|ab/ not to match "abc".
 561 | 
 562 | 20220504; version 3.008:
 563 |   * Fixed the behaviour of [^\P{...}] when the icase flag is set, as it
 564 |     behaved similarly to the one in v-mode that has been proposed in
 565 |     TC39.
 566 | 
 567 | 20220429; version 3.007:
 568 |   * Further modification to the counter mechanism.
 569 | 
 570 | 20220428; version 3.006:
 571 |   * Modified the mechanism of the counter used for repetition.
 572 |   * Re-removed the implementation of linear search for small character
 573 |     classes.
 574 | 
 575 | 20220424; version 3.005:
 576 |   * Fixed a bug that caused /(?<=$.*)/ not to match the end of "a" when
 577 |     the multiline flag is set.
 578 |   * Preparations for \A, \z, (?m:) that have been proposed in TC39.
 579 | 
 580 | 20220420; version 3.004:
 581 |   * Added a new optimisation for /A*B/ and /A+B/ where a character class
 582 |     A overlaps a character or character class B, such as /[A-Za-z]+ing/,
 583 |     /".*"/.
 584 | 
 585 | 20220416; version 3.003:
 586 |   * Combined two optimisation functions into one.
 587 |   * Reduced the amount of code for lookaround (lookahead and lookbehind)
 588 |     assertions.
 589 | 
 590 | 20220416; version 3.002:
 591 |   * Fixed a bug that caused regex_match or regex_search with the
 592 |     match_continuous flag being set to fail when the entry state
 593 |     selector introduced in version 3.000 was used internally.
 594 | 
 595 | 20211025; version 3.001:
 596 |   * Removed the code for splitting counter as it seemed to have no
 597 |     effect or to make performance a bit worse.
 598 |   * Fixed potential bugs.
 599 |   * Minor improvements.
 600 | 
 601 | 20211023; version 3.000:
 602 |   * Updated srell_ucfdata2.hpp and srell_updata.hpp to support Unicode
 603 |     14.0.0.
 604 |   * Updated unicode/updataout.cpp to support Unicode 14. (Support in
 605 |     advance new script names that are expected to be available in RegExp
 606 |     of ECMAScript 2022).
 607 |   * Changed the type used to store a Unicode value when char32_t is not
 608 |     available, from an "unsigned integer type with width of at least 21
 609 |     bits" to a "one of at least 32 bits".
 610 |   * Changed the type used to store a repetition count or character class
 611 |     number when char32_t is not available, from "unsigned int" to
 612 |     "unsigned integer type of at least 32-bit width".
 613 |   * Added overflow check in the function that translates digits into a
 614 |     numeric value. For example, while up to the previous version
 615 |     /a{0,4294967297}/ was treated as /a{0,1}/ because of overflow when
 616 |     the unsigned int type is 32-bit width, SRELL now throws error_brace
 617 |     in cases like this.
 618 |   * Fixed a bug that caused /[^;]*^;?/ not to match the beginning of an
 619 |     input string when the multiline flag is not set.
 620 |   * Implemented a very simple and limited entry state selector.
 621 | 
 622 | 20211004; version 2.930:
 623 |   * Added new typedefs whose prefix is u1632w- and support UTF-16 or
 624 |     UTF-32 depending on the value of WCHAR_MAX. (When 0xFFFF <=
 625 |     WCHAR_MAX < 0x10FFFF, u1632w- types are aliases of u16w- types.
 626 |     When 0x10FFFF <= WCHAR_MAX, u1632w- types are aliases of u32w-
 627 |     types).
 628 |   * Reduced the amount of memory used for Eytzinger layout search.
 629 |   * Various improvements. (Some of them are based on suggestions to NIRE
 630 |     by Marko Njezic).
 631 | 
 632 | 20210624; version 2.920:
 633 |   * Added a new optimisation for the quantifier '?' (I.e., {0,1}).
 634 |   * Changed the version number of the ECMAScript specification
 635 |     referenced in misc/sample01.cpp to 2021.
 636 | 
 637 | 20210429; version 2.912:
 638 |   * Fixed another bug in the optimisation introduced in version 2.900,
 639 |     which caused /aa|a|aa/ not to match "a" (Thanks to Jan Schrötter for
 640 |     the report).
 641 |     Incidentally, this optimisation can be disabled by defining
 642 |     SRELLDBG_NO_BRANCH_OPT2 prior to including srell.hpp.
 643 | 
 644 | 20210424; version 2.911:
 645 |   * Fixed a bug in the optimisation introduced in version 2.900, which
 646 |     caused /abc|ab|ac/ not to match "ac". (Thanks for the bug report [As
 647 |     my email to the reporter was rejected by the email server and
 648 |     returned, it is unclear whether mentioning the name here is okay
 649 |     with the reporter. So, I refrain]).
 650 | 
 651 | 20210407; version 2.910:
 652 |   * Fixed a potential memory leak in move assignment operators used by
 653 |     the pattern compiler since 2.900. (Thanks to Michal Švec for the
 654 |     report).
 655 | 
 656 | 20210214; version 2.901:
 657 |   * Removed redundant template specialisations.
 658 | 
 659 | 20210214; version 2.900:
 660 |   * Added a new optimisation for the alternative expression that
 661 |     consists of string literals, such as /abc|abd|acde/.
 662 |   * Fixed the problem that brought u(8|16)[cs]regex_(token_)?iterator
 663 |     (i.e., regex (token) iterators specialised for char8_t or char16_t)
 664 |     to a compile error.
 665 |   * Minor improvements.
 666 | 
 667 | 20210131; version 2.810:
 668 |   * Improved internal UTF-8 iterators.
 669 | 
 670 | 20200724; version 2.800:
 671 |   * Introduced the Eytzinger layout for binary search in the character
 672 |     class.
 673 |   * Reimplemented linear search for small character classes.
 674 |   * Modified handling of the property data used for parsing the name for
 675 |     a named capturing group. Now they are loaded only when needed
 676 |     instead of being loaded into an instance of basic_regex always.
 677 | 
 678 | 20200714; version 2.730:
 679 |   * Added code to prevent redundant save and restore operations when
 680 |     nested capturing round brackets are processed.
 681 |   * Improved regex_iterator.
 682 | 
 683 | 20200703; version 2.720:
 684 |   * Improved case-insensitive (icase) search using the
 685 |     Boyer-Moore-Horspool algorithm for UTF-8 string that includes
 686 |     non-ASCII characters or UTF-16 string that includes non-BMP
 687 |     characters.
 688 |   * Fixed a bug that caused regex_iterator->prefix().first to point to
 689 |     the beginning of the subject string instead of the end of the
 690 |     previous match (regression introduced in version 2.650, when
 691 |     three-iterators overloads were added to regex_search()).
 692 |   * In accordance with the fix above, when a three-iterators version of
 693 |     regex_search() is called, now match_results.position() returns a
 694 |     distance from the position passed as the lookbehind limit (3rd
 695 |     param of regex_search) and match_results.prefix().first points to
 696 |     the position passed as the beginning of the subject string (1st
 697 |     param of regex_search).
 698 |   * Fixed a bug that could cause a valid UTF-8 sequence being adjacent
 699 |     to an invalid UTF-8 sequence to be skipped when the BMH algorithm
 700 |     was used (regression introduced in version 2.630, when UTF-8
 701 |     handling was modified).
 702 | 
 703 | 20200701; version 2.710:
 704 |   * Minor modifications to Boyer-Moore-Horspool search.
 705 | 
 706 | 20200630; version 2.700:
 707 |   * Optimisation adjustments.
 708 | 
 709 | 20200620; version 2.651:
 710 |   * Moved the group name validity check to after parsing the \u escape.
 711 |   * Updated misc/sample01.cpp to version 1.103. Changed the version
 712 |     number of the ECMAScript specification it references to 2020 (ES11).
 713 | 
 714 | 20200618; version 2.650:
 715 |   * To element access functions in match_results, added overload
 716 |     functions for specifying the group name by a pointer.
 717 |   * When a three-iterators version of regex_search() is used, SRELL now
 718 |     sets match_results::prefix::first to the position passed as the
 719 |     lookbehind limit (third param) instead of the position passed as
 720 |     the beginning of the subject (first param).
 721 |   * Removed some operations that seem to be redundant.
 722 | 
 723 | 20200601; version 2.643:
 724 |   * Added "inline" to operators in syntax_option_type and
 725 |     match_flag_type types, based on a report that it is needed not to
 726 |     cause the multiple definition error.
 727 |   * Minor improvements.
 728 | 
 729 | 20200530; version 2.642:
 730 |   * Reduced the size of memory allocated by the basic_regex instance.
 731 | 
 732 | 20200528; version 2.641:
 733 |   * The fix in 2.640 was incomplete. Fixed the optimisation bug 1 again.
 734 |   * Optimisation adjustments.
 735 | 
 736 | 20200516; version 2.640:
 737 |   * Fixed an optimisation bug 1: It was possible for regex_match to pass
 738 |     the end of a subject string under certain conditions.
 739 |   * Fixed an optimisation bug 2: ^ and $ were not given a chance to
 740 |     match an appropriate position in some cases when the multiline flag
 741 |     is set to true.
 742 |   * Updated srell_ucfdata2.hpp and srell_updata.hpp.
 743 | 
 744 | 20200509; version 2.630:
 745 |   * SRELL's pattern compiler no longer permits invalid UTF-8 sequences
 746 |     in regular expressions. It throws regex_utf8. (Invalid UTF-8
 747 |     sequences in the subject string are not treated as an error.)
 748 |   * Fixed BMH search functions not to include extra (invalid) UTF-8
 749 |     trailing bytes following the real matched substring, in a returned
 750 |     result.
 751 |   * Fixed minor issues: 1) basic_regex.flags() did not return the
 752 |     correct value in some cases, 2) match_results.format() did not
 753 |     replace $<NAME> with an empty string when any capturing group whose
 754 |     name is NAME did not exist.
 755 | 
 756 | 20200502; version 2.620:
 757 |   * Removed methods used for match_continuous and regex_match in the
 758 |     class for the Boyer-Moore-Horspool algorithm. Now SRELL always uses
 759 |     the automaton like earlier versions when they are processed.
 760 |   * Some clean-ups.
 761 | 
 762 | 20200428; version 2.611:
 763 |   * Fixed a bug that caused /\d*/ not to match the head of "abc" but to
 764 |     match the end of it. (regression introduced in version 2.210.)
 765 | 
 766 | 20200426; version 2.610:
 767 |   * Fixed a bug that caused case-insensitive (icase) BMH search to skip
 768 |     a matched sequence at the beginning of the entire text, when 1)
 769 |     search is done against UTF-8 or UTF-16 text, and 2) the searched
 770 |     pattern ends with a character that consists of multiple code units
 771 |     in that encoding.
 772 |   * Now SRELL parses a capturing group name according to the ECMA
 773 |     specification and strictly checks its validity. Group names like
 774 |     /(?<,>...)/ cause regex_error.
 775 | 
 776 | 20200418; version 2.600:
 777 |   * To pass to regex_search() directly the limit of a sequence until
 778 |     where the automaton can lookbehind, added three-iterators versions
 779 |     of regex_search().
 780 |   * [Breaking Change] Removed the match_lblim_avail flag from
 781 |     match_flag_type and the lookbehind_limit member from match_results
 782 |     which were added in version 2.300.
 783 |   * Updated srell_ucfdata2.hpp and srell_updata.hpp to support Unicode
 784 |     13.0.0.
 785 |   * Updated unicode/updataout.cpp to support Unicode 13. (Support in
 786 |     advance new script names that will be available in RegExp of
 787 |     ECMAScript 2020).
 788 | 
 789 | 20191118; version 2.500:
 790 |   * Modified basic_regex to hold precomputed tables for icase matching,
 791 |     instead of creating them from case folding data when its instance is
 792 |     first created.
 793 |   * In accordance with the change above, srell_ucfdata.hpp and
 794 |     ucfdataout.cpp were replaced with srell_ucfdata2.hpp and
 795 |     ucfdataout2.cpp, accordingly.
 796 |   * Changed the method of character class matching from linear search to
 797 |     binary search.
 798 |   * Changed the timing of optimisation of a character class from "when a
 799 |     closing bracket ']' is found" to "every time a character or
 800 |     character range is pushed to its character class array".
 801 |   * Removed all asserts.
 802 |   * Modified the pattern compiler to interpret sequential \uHHHH escapes
 803 |     as a Unicode code point value if they represent a valid surrogate
 804 |     pair. (By this change, incompatibilities with the ECMAScript
 805 |     specification disappeared.)
 806 |   * Fixed the position of an endif directive that caused a compiler
 807 |     error when -DSRELL_NO_NAMEDCAPTURE is specified.
 808 |   * Updated updataout.cpp to version 1.101.
 809 |   * Added a standalone version of SRELL in the single-header directory.
 810 | 
 811 | 20190914; version 2.401:
 812 |   * Reduced the size of basic_regex. (It was bloated by my carelessness
 813 |     when support for Unicode property escapes was added).
 814 |   * Improved basic_regex::swap().
 815 | 
 816 | 20190907; version 2.400:
 817 |   * Improved the performance of character class matching.
 818 |   * Modified the pattern compiler to interpret the \u escape sequence in
 819 |     the group name in accordance with the ECMAScript specification.
 820 |   * Updated ucfdataout.cpp to version 1.200. A new member has been added
 821 |     to the unicode_casefolding class in srell_ucfdata.hpp that
 822 |     ucfdataout.cpp generates.
 823 |     Because SRELL 2.400 and later need this added member, they cannot be
 824 |     used with srell_ucfdata.hpp output by ucfdataout.cpp version 1.101
 825 |     or earlier. (No problem in using an older version of SRELL with a
 826 |     newer version of srell_ucfdata.hpp).
 827 |   * Some clean-ups and improvements.
 828 | 
 829 | 20190902; version 2.304:
 830 |   * Fixed regex_iterator that had been broken by the code clean-up in
 831 |     version 2.303.
 832 | 
 833 | 20190810; version 2.303:
 834 |   * Refixed the problem that was fixed in version 2.302 as the fix was
 835 |     incomplete.
 836 |   * Cleaned up code.
 837 | 
 838 | 20190809; version 2.302:
 839 |   * Bug fix: When (?...) has a quantifier, strings captured by round
 840 |     brackets inside it were not cleared in each repetition but carried
 841 |     over to the next loop. For example,
 842 |     /(?:(ab)|(cd))+/.exec("abcd") returned ["abcd", "ab", "cd"], instead
 843 |     of ["abcd", undefined, "cd"]. (The latter is correct).
 844 |   * Updated misc/sample01.cpp to version 1.102. Rewrote the chapter
 845 |     numbers in accordance with ECMAScript 2019 (ES10).
 846 | 
 847 | 20190724; version 2.301:
 848 |   * In accordance with the ECMAScript spec, restricted the characters
 849 |     which can be escaped by '\', to the following fifteen characters:
 850 |     ^$\.*+?()[]{}|/
 851 |     Only in the character class, i.e., inside [], '-' also becomes a
 852 |     member of the group.
 853 | 
 854 | 20190717; version 2.300:
 855 |   * Added a feature for specifying the limit until where the automaton
 856 |     can lookbehind, separated from the beginning of a target sequence.
 857 |     (Addition of the match_lblim_avail flag to match_flag_type and the
 858 |     lookbehind_limit member to match_results).
 859 |     And, lookbehind_limit of match_results being private and used
 860 |     internally in regex_iterator is also set in its constructor.
 861 |   * Removed order restriction of capturing parentheses and
 862 |     backreferences, in accordance with the ECMAScript spec. Now /\1(.)/,
 863 |     /(?<=(.)\1)/, and /\k<a>(?<a>.)/ are all okay.
 864 |   * Updated misc/sample01.cpp to version 1.101. Added one compliance
 865 |     test from misc.js.
 866 | 
 867 | 20190714; version 2.230:
 868 |   * Improved the performance of searching when regular expressions begin
 869 |     with a character or character class followed by a '*' or '+'. (E.g.
 870 |     /[A-Za-z]+ing/).
 871 | 
 872 | 20190707; version 2.221:
 873 |   * Changed the feature test macro used for checking availability of
 874 |     std::u8string, from __cpp_char8_t to __cpp_lib_char8_t.
 875 |   * When icase specified, if all characters in a character class become
 876 |     the same character as a result of case-folding, the pattern compiler
 877 |     has been changed to convert the character class to the character
 878 |     literal (e.g. /r[Ss\u017F]t/i -> /rst/i).
 879 |   * Fixed a minor issue.
 880 | 
 881 | 20190617; version 2.220:
 882 |   * Changed the internal representation of repetition in the case that
 883 |     it becomes more compact by not using the counter.
 884 |   * Fixed an optimisation bug that caused searching for /a{1,2}?b/
 885 |     against "aab" to return "ab" instead of "aab". (Condition: a
 886 |     character or character class with a non-greedy quantifier is
 887 |     followed by its exclusive character or character class).
 888 | 
 889 | 20190613; version 2.210:
 890 |   * Improved a method of matching for expressions like /ab|cd|ef/ (where
 891 |     string literals separated by '|' begin with a character exclusive
 892 |     to each other).
 893 | 
 894 | 20190603; version 2.202:
 895 |   * Fixed a bug that caused regex_match to behave like regex_search in
 896 |     the situation where the BMH algorithm is used.
 897 | 
 898 | 20190531; version 2.200:
 899 |   * For searching with an ordinary (non-regex) string, added an
 900 |     implementation based on the Boyer-Moore-Horspool algorithm.
 901 |   * Improved UTF-8 iterators.
 902 |   * Fixed behaviours of \b and \B when icase specified, to match /.\B./i
 903 |     against "s\u017F".
 904 |   * Fixed minor issues.
 905 | 
 906 | 20190508; version 2.100:
 907 |   * Fixed a bug that caused failure of capturing when 1) a pair of
 908 |     capturing brackets exists in a lookbehind assertion, and 2) variable
 909 |     length expressions exist in both the left side of and the inside of
 910 |     the pair of brackets. E.g. given "1053" =~ /(?<=(\d+)(\d+))$/, no
 911 |     appropriate string was set for $2.
 912 |   * Updated srell_ucfdata.hpp and srell_updata.hpp to support Unicode
 913 |     12.1.0.
 914 |   * Updated unicode/updataout.cpp to support Unicode 12. (Support in
 915 |     advance a new binary property and new script names that will be
 916 |     available in RegExp of ECMAScript 2019 and new script names that are
 917 |     anticipated to be available in RegExp of ECMAScript 2020).
 918 |   * Changed the newline character in srell.hpp from CR+LF to LF.
 919 |   * Modified unicode/*.cpp to output LF as a newline instead of CR+LF.
 920 |   * Updated misc/sample01.cpp to version 1.100:
 921 |     1. Rewrote the chapter numbers in subtitles of compliance tests, in
 922 |        accordance with ECMAScript 2018 Language Specification (ES9).
 923 |        (The old chapter numbers were based on ECMAScript specifications
 924 |        up to version 5.1).
 925 |     2. Added one compliance test from ECMAScript 2018 Language
 926 |        Specification 21.2.2.3, NOTE.
 927 |   * Modified the macros for detecting C++11 features.
 928 |   * Changed the method of the character class.
 929 |   * For all the constructors and assign functions of basic_regex to have
 930 |     a default argument for flag_type, reimplemented syntax_option_type
 931 |     and match_flag_type (missed changes between TR1 -> C++11).
 932 |   * Experimental support for the char8_t type. If a compiler supports
 933 |     char8_t (detected by the __cpp_char8_t macro), classes whose names
 934 |     have the "u8-" prefix accept a sequence of char8_t and handle it as
 935 |     a UTF-8 string. If char8_t is not supported, the classes handle a
 936 |     sequence of char as a UTF-8 string, as before.
 937 |   * As classes that always handle a sequence of char as a UTF-8 string,
 938 |     new classes whose names have the "u8c-" prefix were added. They
 939 |     correspond to the classes having the "u8-" prefix in their names up
 940 |     to version 2.002:
 941 |     * u8cregex; u8ccmatch, u8csmatch; u8ccsub_match, u8cssub_match;
 942 |       u8ccregex_iterator, u8csregex_iterator; u8ccregex_token_iterator,
 943 |       u8csregex_token_iterator.
 944 | 
 945 | 20180717; version 2.002:
 946 |   * Changed the maximum number of hexdigits in \u{h...} from six to
 947 |     'unlimited' in accordance with the ECMAScript specification. ("one
 948 |     to six hexadecimal digits" of the old implementation was based on
 949 |     the proposal document).
 950 |   * Updated updataout.cpp to version 1.001. Encountering unknown
 951 |     (newly-encoded) script names is no longer treated as an error.
 952 |   * Updated srell_ucfdata.hpp and srell_updata.hpp to support Unicode
 953 |     11.0.0.
 954 | 
 955 | 20180204; version 2.001:
 956 |   * When icase is specified, [\W] (a character class containing \W) no
 957 |     longer matches any of [KkSs\u017F\u212A] (ecma262 issue #512).
 958 | 
 959 | 20180127; version 2.000:
 960 |   * Added the following features that are to be included into RegExp of
 961 |     ECMAScript 2018:
 962 |     * New syntax option flag for '.' to match every code point, dotall,
 963 |       was added to srell::regex_constants as a value of
 964 |       syntax_option_type and to srell::basic_regex as a value of
 965 |       flag_type.
 966 |     * New expressions to support the Unicode property, \p{...} and
 967 |       \P{...}.
 968 |     * Named capture groups (?<NAME>...) and the new expression for
 969 |       backreference to a named capture group, \k<NAME>.
 970 |   * The behaviors of lookbehind assertions changed. Now both (?<=...)
 971 |     and (?<!...) support variable-length lookbehind.
 972 | 
 973 | 20180125; version 1.401:
 974 |   * Limited the maximum of numbers that are recognised as backreference
 975 |     in match_results.format() up to 99, in accordance with the
 976 |     ECMAScript specification. (I.e., restricted to $1..$9 and $01..$99).
 977 |   * Removed an unused macro and its related code.
 978 | 
 979 | 20180101; version 1.400:
 980 |   * Changed the behaviour of the pattern compiler so that an empty
 981 |     non-capturing group can have a quantifier, for example, /(?:)*/. It
 982 |     is a meaningless expression, but changed just for compatibility with
 983 |     RegExp of ECMAScript.
 984 |   * Fixed a hang bug: This occurred when 1) a non-capturing group has a
 985 |     quantifier, 2) and the length of the group itself can be zero-width,
 986 |     3) and a backreference that can be zero-width is included in the
 987 |     group somewhere other than the last, such as /(.*)(?:\1.*)*/.
 988 | 
 989 | 20171216; version 1.300:
 990 |   * Fixed an important bug: /^(;[^;]*)*$/ did not match ";;;;" because
 991 |     of a bug in optimisation. This problem occurred when a sequence of
 992 |     regular expressions ended like /(A...B*)*$/ where a character or
 993 |     character set that A represents and the one that B represents are
 994 |     exclusive to each other.
 995 | 
 996 | 20170621; version 1.200:
 997 |   * Updated srell_ucfdata.hpp to support Unicode 10.0.0.
 998 |   * Improved u8regex_traits to handle corrupt UTF-8 sequences more
 999 |     safely.
1000 | 
1001 | 20150618; version 1.141:
1002 |     Updated srell_ucfdata.hpp to support Unicode 8.0.0.
1003 | 
1004 | 20150517; version 1.140:
1005 |   * Modified the method for regex_match() to determine whether a
1006 |     sequence of regular expressions is matched against a sequence of
1007 |     characters. (Issue raised at #2273 in C++ Standard Library Issues
1008 |     List).
1009 |   * Restricted the accepted range of X in the expression "\cX" to
1010 |     [A-Za-z] in accordance with the ECMAScript specification.
1011 |   * Fixed the problem that caused parens in a lookaround assertion not
1012 |     to capture a sequence correctly in some circumstances because the
1013 |     bug fix done in version 1.111 was imperfect.
1014 | 
1015 | 20150503; version 1.130:
1016 |   * Improved case-folding functions.
1017 |   * Updated unicode/ucfdataout.cpp to version 1.100.
1018 |   * Fixed a typo in #if directives for u(16|32)[cs]match.
1019 | 
1020 | 20150425; version 1.120:
1021 |   * Fixed the bug that caused characters in U+010000-U+10FFFF in UTF-8
1022 |     (i.e., four octet length characters) not to have been recognised.
1023 |   * Updated misc/sample01.cpp to version 1.010.
1024 | 
1025 | 20150402; version 1.111:
1026 |   * Fixed the problem that caused $2 of "aaa" =~ /((.*)*)/ to be empty
1027 |     instead of "aaa" because of a bug in optimisation.
1028 | 
1029 | 20141101; version 1.110:
1030 |   * Several fixes based on a bug report:
1031 |       1. Added "this->" to compile() in basic_regex::assign().
1032 |       2. Implemented operator=() functions explicitly instead of using
1033 |          default ones generated automatically.
1034 |   * unicode/ucfdataout.cpp revised and updated to version 1.001.
1035 | 
1036 | 20140622; version 1.101:
1037 |     Updated srell_ucfdata.hpp to support Unicode 7.0.0.
1038 | 
1039 | 20121118; version 1.100:
1040 |     The first released version.
1041 | 
1042 | 


--------------------------------------------------------------------------------
/unicode/updataout3.cpp:
--------------------------------------------------------------------------------
   1 | //
   2 | //  updataout.cpp: version 3.007 (2025/08/23).
   3 | //
   4 | //  This is a program that generates srell_updata3.h from:
   5 | //    DerivedCoreProperties.txt
   6 | //    DerivedNormalizationProps.txt
   7 | //    PropList.txt
   8 | //    PropertyValueAliases.txt
   9 | //    ScriptExtensions.txt
  10 | //    Scripts.txt
  11 | //    UnicodeData.txt
  12 | //    emoji-data.txt
  13 | //    emoji-sequences.txt
  14 | //    emoji-zwj-sequences.txt
  15 | //  provided by the Unicode Consortium. The latest versions of them are
  16 | //  available at:
  17 | //    emoji-data.txt: http://www.unicode.org/Public/UNIDATA/emoji/
  18 | //    emoji-sequences.txt and emoji-zwj-sequences.txt:
  19 | //      http://www.unicode.org/Public/emoji/
  20 | //    others: http://www.unicode.org/Public/UNIDATA/
  21 | //
  22 | 
  23 | #include <cstdio>
  24 | #include <cstdlib>
  25 | #include <cstdarg>
  26 | #include <string>
  27 | #include <vector>
  28 | #include <map>
  29 | #include <stdexcept>
  30 | #include <algorithm>	//  For std::swap in C++98/03
  31 | #include <utility>	//  For std::swap in C++11-
  32 | #define SRELL_NO_UNICODE_DATA
  33 | #include "../srell.hpp"
  34 | 
  35 | #if defined(_MSC_VER) && _MSC_VER >= 1400
  36 | #pragma warning(disable:4996)
  37 | #endif
  38 | 
  39 | namespace updata
  40 | {
  41 | static const char *const property_names[] = {	//  3
  42 | 	"General_Category:gc", "Script:sc", "Script_Extensions:scx", ""
  43 | };
  44 | static const char *const binary_property_names[] = {	//  53 (52+1)
  45 | 	//  *1: http://unicode.org/reports/tr18/#General_Category_Property
  46 | 	//  *2: 9th field in UnicodeData.txt
  47 | 	"ASCII",								//  *1
  48 | 	"ASCII_Hex_Digit:AHex",					//  PropList.txt
  49 | 	"Alphabetic:Alpha",						//  DerivedCoreProperties.txt
  50 | 	"Any",									//  *1
  51 | 	"Assigned",								//  *1
  52 | 	"Bidi_Control:Bidi_C",					//  PropList.txt
  53 | 	"Bidi_Mirrored:Bidi_M",					//  *2
  54 | 	"Case_Ignorable:CI",					//  DerivedCoreProperties.txt
  55 | 	"Cased",								//  DerivedCoreProperties.txt
  56 | 	"Changes_When_Casefolded:CWCF",			//  DerivedCoreProperties.txt
  57 | 	"Changes_When_Casemapped:CWCM",			//  DerivedCoreProperties.txt
  58 | 	"Changes_When_Lowercased:CWL",			//  DerivedCoreProperties.txt
  59 | 	"Changes_When_NFKC_Casefolded:CWKCF",	//  DerivedNormalizationProps.txt
  60 | 	"Changes_When_Titlecased:CWT",			//  DerivedCoreProperties.txt
  61 | 	"Changes_When_Uppercased:CWU",			//  DerivedCoreProperties.txt
  62 | 	"Dash",									//  PropList.txt
  63 | 	"Default_Ignorable_Code_Point:DI",		//  DerivedCoreProperties.txt
  64 | 	"Deprecated:Dep",						//  PropList.txt
  65 | 	"Diacritic:Dia",						//  PropList.txt
  66 | 	"Emoji",								//  emoji-data.txt
  67 | 	"Emoji_Component:EComp",				//  emoji-data.txt
  68 | 	"Emoji_Modifier:EMod",					//  emoji-data.txt
  69 | 	"Emoji_Modifier_Base:EBase",			//  emoji-data.txt
  70 | 	"Emoji_Presentation:EPres",				//  emoji-data.txt
  71 | 	"Extended_Pictographic:ExtPict",		//  emoji-data.txt
  72 | 	"Extender:Ext",							//  PropList.txt
  73 | 	"Grapheme_Base:Gr_Base",				//  DerivedCoreProperties.txt
  74 | 	"Grapheme_Extend:Gr_Ext",				//  DerivedCoreProperties.txt
  75 | 	"Hex_Digit:Hex",						//  PropList.txt
  76 | 	"IDS_Binary_Operator:IDSB",				//  PropList.txt
  77 | 	"IDS_Trinary_Operator:IDST",			//  PropList.txt
  78 | 	"ID_Continue:IDC",						//  DerivedCoreProperties.txt
  79 | 	"ID_Start:IDS",							//  DerivedCoreProperties.txt
  80 | 	"Ideographic:Ideo",						//  PropList.txt
  81 | 	"Join_Control:Join_C",					//  PropList.txt
  82 | 	"Logical_Order_Exception:LOE",			//  PropList.txt
  83 | 	"Lowercase:Lower",						//  DerivedCoreProperties.txt
  84 | 	"Math",									//  DerivedCoreProperties.txt
  85 | 	"Noncharacter_Code_Point:NChar",		//  PropList.txt
  86 | 	"Pattern_Syntax:Pat_Syn",				//  PropList.txt
  87 | 	"Pattern_White_Space:Pat_WS",			//  PropList.txt
  88 | 	"Quotation_Mark:QMark",					//  PropList.txt
  89 | 	"Radical",								//  PropList.txt
  90 | 	"Regional_Indicator:RI",				//  PropList.txt
  91 | 	"Sentence_Terminal:STerm",				//  PropList.txt
  92 | 	"Soft_Dotted:SD",						//  PropList.txt
  93 | 	"Terminal_Punctuation:Term",			//  PropList.txt
  94 | 	"Unified_Ideograph:UIdeo",				//  PropList.txt
  95 | 	"Uppercase:Upper",						//  DerivedCoreProperties.txt
  96 | 	"Variation_Selector:VS",				//  PropList.txt
  97 | 	"White_Space:space",					//  PropList.txt
  98 | 	"XID_Continue:XIDC",					//  DerivedCoreProperties.txt
  99 | 	"XID_Start:XIDS",						//  DerivedCoreProperties.txt
 100 | 	//  ECMAScript 2019/Unicode 11:
 101 | 	//    "Extended_Pictographic:ExtPict",
 102 | 	//  ECMAScript 2021/Unicode 13:
 103 | 	//    Aliases: EComp, EMod, EBase, EPres, and ExtPict
 104 | 	""
 105 | };
 106 | static const char *const emoseq_property_names[] = {
 107 | 	"RGI_Emoji",
 108 | 	"Basic_Emoji",							//  emoji-sequences.txt
 109 | 	"Emoji_Keycap_Sequence",				//  emoji-sequences.txt
 110 | 	"RGI_Emoji_Modifier_Sequence",			//  emoji-sequences.txt
 111 | 	"RGI_Emoji_Flag_Sequence",				//  emoji-sequences.txt
 112 | 	"RGI_Emoji_Tag_Sequence",				//  emoji-sequences.txt
 113 | 	"RGI_Emoji_ZWJ_Sequence",				//  emoji-zwj-sequences.txt
 114 | 	""
 115 | };
 116 | static const char *const gc_values[] = {	//  38
 117 | 	"Other:C", "Control:Cc:cntrl", "Format:Cf", "Unassigned:Cn",
 118 | 	"Private_Use:Co", "Surrogate:Cs", "Letter:L", "Cased_Letter:LC",
 119 | 	"Lowercase_Letter:Ll", "Titlecase_Letter:Lt", "Uppercase_Letter:Lu", "Modifier_Letter:Lm",
 120 | 	"Other_Letter:Lo", "Mark:M:Combining_Mark", "Spacing_Mark:Mc", "Enclosing_Mark:Me",
 121 | 	"Nonspacing_Mark:Mn", "Number:N", "Decimal_Number:Nd:digit", "Letter_Number:Nl",
 122 | 	"Other_Number:No", "Punctuation:P:punct", "Connector_Punctuation:Pc", "Dash_Punctuation:Pd",
 123 | 	"Close_Punctuation:Pe", "Final_Punctuation:Pf", "Initial_Punctuation:Pi", "Other_Punctuation:Po",
 124 | 	"Open_Punctuation:Ps", "Symbol:S", "Currency_Symbol:Sc", "Modifier_Symbol:Sk",
 125 | 	"Math_Symbol:Sm", "Other_Symbol:So", "Separator:Z", "Line_Separator:Zl",
 126 | 	"Paragraph_Separator:Zp", "Space_Separator:Zs", ""
 127 | };
 128 | }	//  namespace updata
 129 | 
 130 | namespace unishared
 131 | {
 132 | template <typename T>
 133 | std::string to_string(T value, int radix = 10, const int precision = 1)
 134 | {
 135 | 	std::string num;
 136 | 
 137 | 	if (radix >= 2 && radix <= 16)
 138 | 	{
 139 | 		typedef typename std::string::size_type size_type;
 140 | 		const bool minus = value < 0 ? (value = 0 - value, true) : false;
 141 | 
 142 | 		for (; value; value /= radix)
 143 | 			num.push_back("0123456789ABCDEF"[value % radix]);
 144 | 
 145 | 		if (precision > 0 && num.size() < static_cast<size_type>(precision))
 146 | 			num.append(static_cast<size_type>(precision) - num.size(), static_cast<char>('0'));
 147 | 
 148 | 		if (minus)
 149 | 			num.push_back(static_cast<char>('-'));
 150 | 
 151 | 		const size_type mid = num.size() / 2;
 152 | 
 153 | 		for (size_type i = 0; i < mid; ++i)
 154 | 			std::swap(num[i], num[num.size() - i - 1]);
 155 | 	}
 156 | 	return num;
 157 | }
 158 | 
 159 | void throw_error(const char *const s, ...)
 160 | {
 161 | 	char buffer[256];
 162 | 
 163 | 	va_list va;
 164 | 	va_start(va, s);
 165 | 	std::vsprintf(buffer, s, va);
 166 | 	va_end(va);
 167 | 	throw std::runtime_error(buffer);
 168 | }
 169 | 
 170 | void read_file(std::string &str, const char *const filename, const char *const dir)
 171 | {
 172 | 	const std::string path(std::string(dir ? dir : "") + filename);
 173 | 	FILE *const fp = std::fopen(path.c_str(), "r");
 174 | 
 175 | 	std::fprintf(stdout, "Reading '%s'... ", path.c_str());
 176 | 
 177 | 	if (fp)
 178 | 	{
 179 | 		static const std::size_t bufsize = 4096;
 180 | 		char *const buffer = static_cast<char *>(std::malloc(bufsize));
 181 | 
 182 | 		if (buffer)
 183 | 		{
 184 | 			for (;;)
 185 | 			{
 186 | 				const std::size_t size = std::fread(buffer, 1, bufsize, fp);
 187 | 
 188 | 				if (!size)
 189 | 					break;
 190 | 
 191 | 				str.append(buffer, size);
 192 | 			}
 193 | 			std::fclose(fp);
 194 | 			std::fputs("done.\n", stdout);
 195 | 			std::free(buffer);
 196 | 			return;
 197 | 		}
 198 | 	}
 199 | 	std::fputs("failed...", stdout);
 200 | 	throw_error("could not open!");
 201 | }
 202 | 
 203 | bool write_file(const char *const filename, const std::string &str)
 204 | {
 205 | 	FILE *const fp = std::fopen(filename, "wb");
 206 | 
 207 | 	std::fprintf(stdout, "Writing '%s'... ", filename);
 208 | 
 209 | 	if (fp)
 210 | 	{
 211 | 		const bool success = std::fwrite(str.c_str(), 1, str.size(), fp) == str.size();
 212 | 		std::fclose(fp);
 213 | 		if (success)
 214 | 		{
 215 | 			std::fputs("done.\n", stdout);
 216 | 			return true;
 217 | 		}
 218 | 	}
 219 | 	std::fputs("failed...\n", stdout);
 220 | 	return false;
 221 | }
 222 | }	//  namespace unishared
 223 | 
 224 | struct up_options
 225 | {
 226 | 	const char *outfilename;
 227 | 	const char *indir;
 228 | 	int version;
 229 | 	int errorno;
 230 | 	bool noesc;
 231 | 
 232 | 	up_options(const int argc, const char *const *const argv)
 233 | 		: outfilename("srell_updata3.h")
 234 | 		, indir("")
 235 | 		, version(301)
 236 | 		, errorno(0)
 237 | 		, noesc(false)
 238 | 	{
 239 | 		for (int index = 1; index < argc; ++index)
 240 | 		{
 241 | 			const char firstchar = argv[index][0];
 242 | 
 243 | 			if (firstchar == '-' || firstchar == '/')
 244 | 			{
 245 | 				const char *const option = argv[index] + 1;
 246 | 
 247 | 				if (std::strcmp(option, "o") == 0)
 248 | 				{
 249 | 					if (index + 1 >= argc)
 250 | 						goto NO_ARGUMENT;
 251 | 					outfilename = argv[++index];
 252 | 				}
 253 | 				else if (std::strcmp(option, "i") == 0 || std::strcmp(option, "id") == 0)
 254 | 				{
 255 | 					if (index + 1 >= argc)
 256 | 						goto NO_ARGUMENT;
 257 | 					indir = argv[++index];
 258 | 				}
 259 | 				else if (std::strcmp(option, "noesc") == 0)
 260 | 				{
 261 | 					if (index + 1 >= argc)
 262 | 						goto NO_ARGUMENT;
 263 | 					noesc = static_cast<int>(std::strtod(argv[++index], NULL)) ? true : false;
 264 | 				}
 265 | 				else if (std::strcmp(option, "?") == 0 || std::strcmp(option, "h") == 0)
 266 | 				{
 267 | 					std::fputs("Usage: updataout2 [options]\nOptions:\n", stdout);
 268 | 					std::fputs("  -i <DIRECTORY>\tSame as -id.\n", stdout);
 269 | 					std::fputs("  -id <DIRECTORY>\tAssume that input files exist in <DIRECTORY>.\n\t\t\t<DIRECTORY> must ends with '/' or '\\'.\n", stdout);
 270 | 					std::fputs("  -o <FILE>\t\tOutput to <FILE>.\n", stdout);
 271 | 					std::fputs("  -noesc [1|0]\t\tDo not escape literal strings.\n", stdout);
 272 | 					errorno = 1;
 273 | 					return;
 274 | 				}
 275 | 				else
 276 | 					goto UNKNOWN_OPTION;
 277 | 
 278 | 				continue;
 279 | 
 280 | 				NO_ARGUMENT:
 281 | 				std::fprintf(stdout, "[Error] no argument for \"%s\" specified.\n", argv[index]);
 282 | 				errorno = -2;
 283 | 			}
 284 | 			else
 285 | 			{
 286 | 				UNKNOWN_OPTION:
 287 | 				std::fprintf(stdout, "[Error] unknown option \"%s\" found.\n", argv[index]);
 288 | 				errorno = -1;
 289 | 			}
 290 | 		}
 291 | 	}
 292 | };
 293 | //  struct up_options
 294 | 
 295 | class unicode_property
 296 | {
 297 | public:
 298 | 
	//  Constructs the generator; the only work done here is initialising
	//  the member re_colon_ from the string ":".
	//  NOTE(review): re_colon_ is declared outside this view; presumably it
	//  is a regex used to split colon-separated name lists -- confirm.
	unicode_property()
		: re_colon_(":")
	{
	}
 303 | 
 304 | 	int create_updata(std::string &outdata, const up_options &opts)
 305 | 	{
 306 | 		int errorno = opts.errorno;
 307 | 		const char *const unidatafilename = "UnicodeData.txt";
 308 | 		const char *const propdatafiles[] = { "PropList.txt", "DerivedCoreProperties.txt", "emoji-data.txt", "DerivedNormalizationProps.txt", "" };
 309 | 		const char *const emodatafiles[] = { "emoji-sequences.txt", "emoji-zwj-sequences.txt", "" };
 310 | 		const char *const scfilename = "Scripts.txt";
 311 | 		const char *const scxfilename = "ScriptExtensions.txt";
 312 | 		const char *const pvafilename = "PropertyValueAliases.txt";
 313 | 		canonicalname_mapper scriptname_maps;
 314 | 		strings_type scriptname_aliases;
 315 | 		std::string licensetext;
 316 | 		rangeholder general_category_values;
 317 | 		rangeholder binary_properties;
 318 | 		seqholder emoseq_properties;
 319 | 		rangeholder scripts;
 320 | 		rangeholder scriptextensions;
 321 | 		sortedrangeholder combined_properties;
 322 | 		sortedseqholder combined_pos;
 323 | //		scriptnameholder ucs_to_scriptname;	//  codepoint->scriptname.
 324 | 
 325 | 		if (errorno)
 326 | 			return errorno;
 327 | 
 328 | 		try
 329 | 		{
 330 | 			licensetext = "//  ";
 331 | 			licensetext += unidatafilename;
 332 | 			licensetext += "\n//\n";
 333 | 
 334 | 			read_unidata(general_category_values, binary_properties, unidatafilename, opts.indir);
 335 | 			set_additionalbinprops(binary_properties, general_category_values);	//  for ASCII, Any, Cn.
 336 | 			create_compositecategories(general_category_values);	//  This needs "Cn".
 337 | 
 338 | 			read_binprops(binary_properties, licensetext, propdatafiles, opts.indir);
 339 | #if !defined(SRELL_NO_VMODE)
 340 | 			read_emoseq(emoseq_properties, licensetext, emodatafiles, opts.indir);
 341 | #endif
 342 | 
 343 | 			read_scriptnames(scriptname_maps, scriptname_aliases, licensetext, scfilename, pvafilename, opts);
 344 | 
 345 | 			read_scripts(scripts, licensetext, scfilename, opts.indir);
 346 | 
 347 | 			scriptextensions = scripts;
 348 | 			modify_for_scx(scriptextensions, scriptname_maps, licensetext, scxfilename, opts.indir);
 349 | 
 350 | 			combine_properties(combined_properties, general_category_values, "gc", updata::gc_values);
 351 | 			combine_properties(combined_properties, binary_properties, "bp", updata::binary_property_names);
 352 | 			combine_properties(combined_properties, scripts, "sc", scriptname_aliases);
 353 | 			combine_properties(combined_properties, scriptextensions, "scx", scriptname_aliases);
 354 | #if !defined(SRELL_NO_VMODE)
 355 | 			combine_pos(combined_pos, emoseq_properties, "bp", updata::emoseq_property_names);
 356 | #endif
 357 | 
 358 | 			do_formatting(outdata, combined_properties, combined_pos, opts);
 359 | 
 360 | 			licensetext.append(1, '\n');
 361 | 			outdata.insert(0, licensetext);
 362 | 		}
 363 | 		catch (srell::regex_error &e)
 364 | 		{
 365 | 			std::printf("\nError: %s,%d\n", e.what(), e.code());
 366 | 			errorno = 1;
 367 | 		}
 368 | 		catch (std::runtime_error &e)
 369 | 		{
 370 | 			std::printf("\nError: %s\n", e.what());
 371 | 			errorno = 2;
 372 | 		}
 373 | 		return errorno;
 374 | 	}
 375 | 
 376 | private:
 377 | 
	//  Shorthands for the SRELL internal types reused by this generator.

	//  Unsigned integer type used below for code point values and counters.
	typedef srell::re_detail::ui_l32 ui_l32;
	//  Array of range_pair elements; the readers below add ranges to it via join().
	typedef srell::re_detail::range_pairs ucprange_array;
	//  A single (first, second) pair of ui_l32 values.
	typedef srell::re_detail::range_pair u32pair;
	typedef u32pair ucprange;
	//  Helper constructible from two values, used wherever a range is built inline.
	typedef srell::re_detail::range_pair_helper u32rp_helper;
	typedef u32rp_helper ucprange_helper;
	//  Property (value) name -> ranges registered for it.
	typedef std::map<std::string, ucprange_array> rangeholder;
	typedef srell::re_detail::simple_array<ui_l32> u32array;
	//  Property name -> flat array of encoded sequence entries (see read_emoseq()).
	typedef std::map<std::string, u32array> seqholder;
	typedef std::vector<std::string> strings_type;
	//  Pointer pairs into a text buffer, as filled in by split2().
	typedef std::vector<srell::csub_match> matchranges_type;
	//  Only referenced from commented-out code in this part of the file.
	typedef std::map<ui_l32, std::string> scriptnameholder;
	typedef std::map<std::string, std::string> name_mapper;
	typedef std::map<std::string, ui_l32> namenumber_mapper;
	//  Any alias -> canonical name.
	typedef name_mapper canonicalname_mapper;
	static const ui_l32 invalid_u32value = srell::re_detail::constants::invalid_u32value;
	//  Marker stored in the first element of an entry that describes a
	//  composite class instead of holding real data.
	static const ui_l32 compositeclass = invalid_u32value;
 395 | 
	//  One output entry for a range-based property value, as assembled by
	//  combine_properties().
	struct sorted_name_and_ranges
	{
		std::string ptype;	//  Property type tag passed by the caller ("gc", "bp", "sc", "scx").
		std::string canonicalname;	//  First name in namealiases.
		std::string namealiases;	//  Colon-separated list of all names.
		ucprange_array ucpranges;	//  Ranges copied from the source holder.
	};
	//  Entries are kept in insertion order.
	typedef std::vector<sorted_name_and_ranges> sortedrangeholder;
 404 | 
	//  One output entry for a sequence-based property, as assembled by
	//  combine_pos().
	struct sorted_name_and_seqs
	{
		std::string ptype;	//  Property type tag passed by the caller.
		std::string canonicalname;	//  First name in namealiases.
		std::string namealiases;	//  Colon-separated list of all names.
		u32array ucpseqs;	//  Encoded sequence data copied from the source holder.
	};
	//  Entries are kept in insertion order.
	typedef std::vector<sorted_name_and_seqs> sortedseqholder;
 413 | 
 414 | 	void split2(matchranges_type &parts, const std::string &data, const char splitter)
 415 | 	{
 416 | 		std::string::size_type readpos = 0;
 417 | 		srell::csub_match csm;
 418 | 
 419 | 		csm.matched = true;
 420 | 		for (;;)
 421 | 		{
 422 | 			std::string::size_type lineend = data.find(splitter, readpos);
 423 | 
 424 | 			csm.first = data.data() + readpos;
 425 | 			if (lineend == std::string::npos)
 426 | 			{
 427 | 				csm.second = data.data() + data.size();
 428 | 				parts.push_back(csm);
 429 | 				break;
 430 | 			}
 431 | 
 432 | 			csm.second = data.data() + lineend;
 433 | 			parts.push_back(csm);
 434 | 			++lineend;
 435 | 			readpos = lineend;
 436 | 		}
 437 | 	}
 438 | 
 439 | 	std::string join(const char c, const strings_type &parts, const bool add_final_also = false)
 440 | 	{
 441 | 		std::string out;
 442 | 
 443 | 		for (strings_type::size_type i = 0; i < parts.size(); ++i)
 444 | 			out.append(parts[i] + c);
 445 | 
 446 | 		if (!add_final_also && out.size())
 447 | 			out.resize(out.size() - 1);
 448 | 
 449 | 		return out;
 450 | 	}
 451 | 
 452 | 	void read_unidata(rangeholder &gc, rangeholder &bp, const char *const unidatafilename, const char *const indir)
 453 | 	{
 454 | 		const srell::regex re_dataline("^([0-9A-F]+);([^;]*);(([^;]*);(?:[^;]*;){6}([^;]*)(?:;[^;]*){5})$");
 455 | 		const srell::regex re_rangefirst("^<(.*), First>$");
 456 | 
 457 | 		const std::string stringY("Y");
 458 | 		const std::string stringN("N");
 459 | 		ui_l32 prevucp = invalid_u32value;
 460 | 		std::string data;
 461 | 		matchranges_type lines;
 462 | 		srell::cmatch cmatch;
 463 | //		matchranges_type parts;
 464 | 		std::string rangename;
 465 | 		std::string rangefirstproperty;
 466 | 		ui_l32 rangefirstcp = 0;
 467 | 		ucprange range;
 468 | 		ucprange_array bidi_mirrored_ranges;
 469 | 
 470 | 		unishared::read_file(data, unidatafilename, indir);
 471 | 		split2(lines, data, '\n');
 472 | 
 473 | 		for (matchranges_type::size_type i = 0; i < lines.size(); ++i)
 474 | 		{
 475 | 			const srell::csub_match &line = lines[i];
 476 | 
 477 | 			if (srell::regex_match(line.first, line.second, cmatch, re_dataline))
 478 | 			{
 479 | 				const srell::cmatch::value_type &codepoint = cmatch[1];
 480 | 				const srell::cmatch::value_type &name = cmatch[2];
 481 | 				const std::string name_string(name.str());
 482 | 				const std::string property(cmatch[3].str());
 483 | 
 484 | 				range.first = range.second = static_cast<ui_l32>(std::strtol(codepoint.first, NULL, 16));
 485 | 
 486 | 				if (prevucp >= range.first && prevucp != invalid_u32value)
 487 | 					unishared::throw_error("Out of order: %.4lX >= %.4lX", prevucp, range.first);
 488 | 
 489 | //				parts.clear();
 490 | //				split2(parts, property, ';');
 491 | //				if (parts.size() != 13)
 492 | //					unishared::throw_error("number of fields is not 13, but %u\n\t[%s]", parts.size(), line.str().c_str());
 493 | 
 494 | //				const std::string &general_category = parts[0];
 495 | //				const std::string &bidi_mirrored = parts[7];
 496 | 				const std::string general_category(cmatch[4].str());
 497 | 				const std::string bidi_mirrored(cmatch[5].str());
 498 | 
 499 | 				prevucp = range.first;
 500 | 
 501 | 				if (rangename.size())
 502 | 				{
 503 | 					if (name_string.compare("<" + rangename + ", Last>") != 0)
 504 | 						unishared::throw_error("<%s, Last> does not follow its First line.\n\t%s follows insteadly.", rangename.c_str(), name_string.c_str());
 505 | 
 506 | 					if (property != rangefirstproperty)
 507 | 					{
 508 | 						unishared::throw_error("\"%s\": properties of First and Last are different.\n\tFirst: %s\n\tLast:  %s", rangename.c_str(), rangefirstproperty.c_str(), property.c_str());
 509 | 					}
 510 | 
 511 | 					range.first = rangefirstcp;
 512 | 					rangename.clear();
 513 | 				}
 514 | 				else if (srell::regex_match(name.first, name.second, cmatch, re_rangefirst))
 515 | 				{
 516 | 					rangename = cmatch[1];
 517 | 					rangefirstproperty = property;
 518 | 					rangefirstcp = range.first;
 519 | 					continue;
 520 | 				}
 521 | 
 522 | 				//  Registers "general_category" value.
 523 | 				gc[general_category].join(range);
 524 | 
 525 | 				//  Registers "bidi_mirrored" value.
 526 | 				if (bidi_mirrored == stringY)
 527 | 				{
 528 | 					bidi_mirrored_ranges.join(range);
 529 | 				}
 530 | 				else if (bidi_mirrored != stringN)
 531 | 					unishared::throw_error("Unknown Bidi_Mirrored value [%s] in %s.", bidi_mirrored.c_str(), line.str().c_str());
 532 | 			}
 533 | 			else if (line.first != line.second)
 534 | 				unishared::throw_error("Unknown format [%s]", line.str().c_str());
 535 | 		}
 536 | 		bp["Bidi_Mirrored"] = bidi_mirrored_ranges;
 537 | 	}
 538 | 
 539 | 	void read_scriptnames(canonicalname_mapper &sn_maps, strings_type &sn_aliases, std::string &licensetext, const char *const scfilename, const char *const pvafilename, const up_options &opts)
 540 | 	{
 541 | 		const srell::regex re_scline("^[0-9A-Fa-f.]+\\s*;\\s*(\\S+)");
 542 | 		const srell::regex re_pvaline("scx?\\s*;\\s*(\\S.*)\\r?\\n?");
 543 | 		const srell::regex re_split("[ ;]+");
 544 | 		ui_l32 count = 0;
 545 | 		std::string data;
 546 | 		matchranges_type lines;
 547 | 		srell::cmatch cmatch;
 548 | 		namenumber_mapper seennames;
 549 | 
 550 | 		unishared::read_file(data, scfilename, opts.indir);
 551 | 
 552 | 		lines.clear();
 553 | 		split2(lines, data, '\n');
 554 | 
 555 | 		for (matchranges_type::size_type i = 0; i < lines.size(); ++i)
 556 | 		{
 557 | 			const srell::csub_match &line = lines[i];
 558 | 
 559 | 			if (srell::regex_search(line.first, line.second, cmatch, re_scline, srell::regex_constants::match_continuous))
 560 | 			{
 561 | 				const std::string scname(cmatch.str(1));
 562 | 
 563 | 				if (!seennames.count(scname))
 564 | 				{
 565 | 					seennames[scname] = count++;
 566 | 				}
 567 | 			}
 568 | 		}
 569 | 
 570 | 		seennames["Unknown"] = count++;
 571 | 		sn_aliases.resize(count);
 572 | 
 573 | 		typedef std::vector<srell::csub_match> scnames_type;
 574 | 		canonicalname_mapper aliases_tmp;
 575 | 		scnames_type scnames;
 576 | 
 577 | 		data.clear();
 578 | 		unishared::read_file(data, pvafilename, opts.indir);
 579 | 
 580 | 		lines.clear();
 581 | 		split2(lines, data, '\n');
 582 | 
 583 | 		matchranges_type::size_type i = read_license(licensetext, lines, 0);
 584 | 
 585 | 		for (; i < lines.size(); ++i)
 586 | 		{
 587 | 			const srell::csub_match &line = lines[i];
 588 | 
 589 | 			if (srell::regex_match(line.first, line.second, cmatch, re_pvaline, srell::regex_constants::match_continuous))
 590 | 			{
 591 | 				srell::cregex_iterator2 rei2s(cmatch[1].first, cmatch[1].second, re_split);
 592 | 
 593 | 				scnames.clear();
 594 | 				for (rei2s.split_begin();; rei2s.split_next())
 595 | 				{
 596 | 					scnames.push_back(rei2s.split_aptrange());
 597 | 					if (rei2s.done())
 598 | 						break;
 599 | 				}
 600 | 
 601 | 				if (scnames.size() >= 2)
 602 | 				{
 603 | 					const std::string canonicalname(scnames[1]);
 604 | 
 605 | 					if (seennames.count(canonicalname))
 606 | 					{
 607 | 						std::string aliases(canonicalname);
 608 | 
 609 | 						for (scnames_type::size_type j = 0; j < scnames.size(); ++j)
 610 | 						{
 611 | 							const std::string scname(scnames[j].str());
 612 | 
 613 | 							sn_maps[scname] = canonicalname;
 614 | 
 615 | 							if (scname != canonicalname)
 616 | 							{
 617 | 								aliases += ':';
 618 | 								aliases += scname;
 619 | 							}
 620 | 						}
 621 | 						sn_aliases[seennames[canonicalname]] = aliases;
 622 | 					}
 623 | 				}
 624 | 			}
 625 | 		}
 626 | 	}
 627 | 
 628 | 	matchranges_type::size_type read_license(std::string &licensetext, const matchranges_type &lines, matchranges_type::size_type pos)
 629 | 	{
 630 | 		static const srell::regex re_license("^#[ \\t]*(\\S.*)?$");
 631 | 		srell::cmatch cm;
 632 | 
 633 | 		for (; pos < lines.size(); ++pos)
 634 | 		{
 635 | 			const srell::csub_match &line = lines[pos];
 636 | 
 637 | 			if (srell::regex_search(line.first, line.second, cm, re_license, srell::regex_constants::match_continuous))
 638 | 			{
 639 | 				const std::string comment(cm[1].str());
 640 | 
 641 | 				if (comment.size())
 642 | 					licensetext += "//  " + comment + '\n';
 643 | 				else
 644 | 				{
 645 | 					licensetext += "//\n";
 646 | 					break;
 647 | 				}
 648 | 			}
 649 | 		}
 650 | 		return pos;
 651 | 	}
 652 | 
 653 | 	//  binary properties created from UnicodeData.txt.
 654 | 	void set_additionalbinprops(rangeholder &bp, rangeholder &gc)
 655 | 	{
 656 | 		ucprange_array assigned_ranges;
 657 | 
 658 | 		for (rangeholder::iterator it = gc.begin(); it != gc.end(); ++it)
 659 | 			assigned_ranges.merge(it->second);
 660 | 
 661 | 		bp["Any"].join(ucprange_helper(0x0000, 0x10ffff));
 662 | 		bp["ASCII"].join(ucprange_helper(0x0000, 0x007f));
 663 | 		bp["Assigned"];	//  Only creates. No data.
 664 | 
 665 | //		bp["Assigned"] = assigned_ranges;
 666 | 		assigned_ranges.negation();
 667 | 		gc["Cn"] = assigned_ranges;
 668 | 	}
 669 | 
 670 | 	void create_compositecategory(rangeholder &gc, const char *const newname, const char *const *categories)
 671 | 	{
 672 | 		ucprange_array array;
 673 | 		ui_l32 total = 0;
 674 | 
 675 | 		array.append_newpair(ucprange_helper(compositeclass, 0));
 676 | 
 677 | 		for (; **categories; ++categories)
 678 | 		{
 679 | 			const char *const c = *categories;
 680 | 			const ui_l32 count = static_cast<ui_l32>(gc[*categories].size());
 681 | 
 682 | 			array.append_newpair(ucprange_helper(c[0], c[1]));
 683 | 			array.append_newpair(ucprange_helper(count, 0));
 684 | 			total += count;
 685 | 		}
 686 | 		array[0].second = total;
 687 | 		gc[newname] = array;
 688 | 	}
 689 | 
 690 | 	void create_compositecategories(rangeholder &gc)
 691 | 	{
 692 | 		const char *const categoryLC[] = { "Ll", "Lt", "Lu", "" };
 693 | 		const char *const categoryL[] = { "Ll", "Lt", "Lu", "Lm", "Lo", "" };
 694 | 		const char *const categoryM[] = { "Mc", "Me", "Mn", "" };
 695 | 		const char *const categoryN[] = { "Nd", "Nl", "No", "" };
 696 | 		const char *const categoryC[] = { "Cc", "Cf", "Cn", "Co", "Cs", "" };
 697 | 		const char *const categoryP[] = { "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps", "" };
 698 | 		const char *const categoryZ[] = { "Zl", "Zp", "Zs", "" };
 699 | 		const char *const categoryS[] = { "Sc", "Sk", "Sm", "So", "" };
 700 | 
 701 | 		create_compositecategory(gc, "LC", categoryLC);
 702 | 		create_compositecategory(gc, "L", categoryL);
 703 | 		create_compositecategory(gc, "M", categoryM);
 704 | 		create_compositecategory(gc, "N", categoryN);
 705 | 		create_compositecategory(gc, "C", categoryC);
 706 | 		create_compositecategory(gc, "P", categoryP);
 707 | 		create_compositecategory(gc, "Z", categoryZ);
 708 | 		create_compositecategory(gc, "S", categoryS);
 709 | 	}
 710 | 
 711 | 	void read_binprops(rangeholder &bp, std::string &licensetext, const char *const *propdatafiles, const char *const indir)
 712 | 	{
 713 | 		static const srell::regex re_propfmt("^\\s*([0-9A-Fa-f]{4,})(?:\\.\\.([0-9A-Fa-f]{4,}))?\\s*;\\s*([^\\s;#]+)\\s*");	//  (#.*)?$");
 714 | 		ucprange range;
 715 | 		std::string data;
 716 | 		matchranges_type lines;
 717 | 		srell::cmatch cmatch;
 718 | 
 719 | 		for (; **propdatafiles; ++propdatafiles)
 720 | 		{
 721 | 			data.clear();
 722 | 			unishared::read_file(data, *propdatafiles, indir);
 723 | 
 724 | 			lines.clear();
 725 | 			split2(lines, data, '\n');
 726 | 
 727 | 			matchranges_type::size_type i = read_license(licensetext, lines, 0);
 728 | 
 729 | 			for (; i < lines.size(); ++i)
 730 | 			{
 731 | 				const srell::csub_match &line = lines[i];
 732 | 
 733 | 				if (srell::regex_search(line.first, line.second, cmatch, re_propfmt, srell::regex_constants::match_continuous))
 734 | 				{
 735 | 					const srell::cmatch::value_type &begin = cmatch[1];
 736 | 					const srell::cmatch::value_type &end = cmatch[2];
 737 | 					const srell::cmatch::value_type &property = cmatch[3];
 738 | //					const srell::cmatch::value_type &comment = cmatch[4];
 739 | 
 740 | 					range.first = static_cast<ui_l32>(std::strtol(begin.first, NULL, 16));
 741 | 					range.second = end.matched ? static_cast<ui_l32>(std::strtol(end.first, NULL, 16)) : range.first;
 742 | 
 743 | 					bp[property.str()].join(range);
 744 | 				}
 745 | 			}
 746 | 		}
 747 | 	}
 748 | 
 749 | 	void read_emoseq(seqholder &emsq, std::string &licensetext, const char *const *emodatafiles, const char *const indir)
 750 | 	{
 751 | 		const srell::regex re_emsqfmt("^\\s*([0-9A-Fa-f]{4,})(?:\\.\\.([0-9A-Fa-f]{4,})|((?:\\s+[0-9A-Fa-f]{4,})+))?\\s*;\\s*([^\\s;#]+)\\s*");	//  (?:\\s*;[^#]*)(#.*)?$");
 752 | 		const srell::regex re_emsq2fmt("\\s*([0-9A-Fa-f]{4,})");
 753 | 		std::string data;
 754 | 		matchranges_type lines;
 755 | 		srell::cmatch cmatch;
 756 | 
 757 | 		for (; **emodatafiles; ++emodatafiles)
 758 | 		{
 759 | 			data.clear();
 760 | 			unishared::read_file(data, *emodatafiles, indir);
 761 | 
 762 | 			lines.clear();
 763 | 			split2(lines, data, '\n');
 764 | 
 765 | 			matchranges_type::size_type i = read_license(licensetext, lines, 0);
 766 | 
 767 | 			for (; i < lines.size(); ++i)
 768 | 			{
 769 | 				const srell::csub_match &line = lines[i];
 770 | 
 771 | 				if (srell::regex_search(line.first, line.second, cmatch, re_emsqfmt, srell::regex_constants::match_continuous))
 772 | 				{
 773 | 					const srell::cmatch::value_type &begin = cmatch[1];
 774 | 					const srell::cmatch::value_type &end = cmatch[2];
 775 | 					const srell::cmatch::value_type &seqs = cmatch[3];
 776 | 					const std::string seqname = cmatch[4].str();
 777 | //					const srell::cmatch::value_type &comment = cmatch[5];
 778 | 					const ui_l32 first = static_cast<ui_l32>(std::strtol(begin.first, NULL, 16));
 779 | 
 780 | 					if (seqs.matched)
 781 | 					{
 782 | 						const u32array::size_type orgsize = emsq[seqname].size();
 783 | 						srell::cregex_iterator2 it(seqs.first, seqs.second, re_emsq2fmt, srell::regex_constants::match_continuous);
 784 | 						ui_l32 count = 2;
 785 | 
 786 | 						emsq[seqname].push_back_c(0);	//  Number of code points.
 787 | 						emsq[seqname].push_back(first);
 788 | 
 789 | 						for (; !it.done(); ++it, ++count)
 790 | 						{
 791 | 							const srell::cmatch::value_type &ucp = (*it)[1];
 792 | 
 793 | 							emsq[seqname].push_back(static_cast<ui_l32>(std::strtol(ucp.first, NULL, 16)));
 794 | 						}
 795 | 						emsq[seqname][orgsize] = count;
 796 | 					}
 797 | 					else
 798 | 					{
 799 | 						if (end.matched)
 800 | 						{
 801 | 							emsq[seqname].push_back_c(1);	//  Range.
 802 | 							emsq[seqname].push_back(first);
 803 | 							emsq[seqname].push_back(static_cast<ui_l32>(std::strtol(end.first, NULL, 16)));
 804 | 						}
 805 | 						else
 806 | 						{
 807 | 							emsq[seqname].push_back_c(2);	//  Single code point.
 808 | 							emsq[seqname].push_back(first);
 809 | 						}
 810 | 					}
 811 | 				}
 812 | 			}
 813 | 		}
 814 | 
 815 | 		for (seqholder::iterator it = emsq.begin(); it != emsq.end(); ++it)
 816 | 		{
 817 | 			if (it->second.size() & 1)
 818 | 			{
 819 | 				std::printf("[Info] Padding added to \"%s\" (%u).\n", it->first.c_str(), static_cast<unsigned int>(it->second.size()));
 820 | 				it->second.push_back_c(0);
 821 | 			}
 822 | 		}
 823 | 
 824 | 		emsq["RGI_Emoji"].push_back_c(compositeclass);	//  Dummy data.
 825 | 	}
 826 | 
 827 | 	void read_scripts(rangeholder &sc, std::string &licensetext, const char *const filename, const char *const indir)
 828 | 	{
 829 | 		const srell::regex re_scriptdata("^\\s*([0-9A-Fa-f]{4,})(?:\\.\\.([0-9A-Fa-f]{4,}))?\\s*;\\s*([^\\s;#]+)\\s*");	//  (#.*)?$");
 830 | 		ucprange range;
 831 | 		std::string data;
 832 | 		matchranges_type lines;
 833 | 		srell::cmatch cmatch;
 834 | 		ucprange_array assigned_ranges;
 835 | 
 836 | 		data.clear();
 837 | 		unishared::read_file(data, filename, indir);
 838 | 
 839 | 		lines.clear();
 840 | 		split2(lines, data, '\n');
 841 | 
 842 | 		matchranges_type::size_type i = read_license(licensetext, lines, 0);
 843 | 
 844 | 		for (; i < lines.size(); ++i)
 845 | 		{
 846 | 			const srell::csub_match &line = lines[i];
 847 | 
 848 | 			if (srell::regex_search(line.first, line.second, cmatch, re_scriptdata, srell::regex_constants::match_continuous))
 849 | 			{
 850 | 				const srell::cmatch::value_type &begin = cmatch[1];
 851 | 				const srell::cmatch::value_type &end = cmatch[2];
 852 | 				const srell::cmatch::value_type &scriptname = cmatch[3];
 853 | //				const srell::cmatch::value_type &comment = cmatch[4];
 854 | 
 855 | 				range.first = static_cast<ui_l32>(std::strtol(begin.first, NULL, 16));
 856 | 				range.second = end.matched ? static_cast<ui_l32>(std::strtol(end.first, NULL, 16)) : range.first;
 857 | 
 858 | 				sc[scriptname].join(range);
 859 | 				assigned_ranges.join(range);
 860 | 			}
 861 | 		}
 862 | 		assigned_ranges.negation();
 863 | 		sc["Unknown"] = assigned_ranges;
 864 | 	}
 865 | 
 866 | 	canonicalname_mapper load_canonicalnames(const char *const *names)
 867 | 	{
 868 | 		canonicalname_mapper canonicalnames;
 869 | 		matchranges_type parts;
 870 | 
 871 | 		for (; **names; ++names)
 872 | 		{
 873 | 			parts.clear();
 874 | 			split2(parts, *names, ':');
 875 | 			const std::string canonicalname(parts[0].str());
 876 | 			for (matchranges_type::size_type i = 0; i < parts.size(); ++i)
 877 | 			{
 878 | 				canonicalnames[parts[i].str()] = canonicalname;
 879 | 			}
 880 | 		}
 881 | 		return canonicalnames;
 882 | 	}
 883 | 
 884 | 	void modify_for_scx(rangeholder &scx, const canonicalname_mapper &canonicalnames, std::string &licensetext, const char *const filename, const char *const indir)
 885 | 	{
 886 | 		const srell::regex re_scxdata("^\\s*([0-9A-Fa-f]{4,})(?:\\.\\.([0-9A-Fa-f]{4,}))?\\s*;\\s*([^\\s;#][^;#]*[^\\s;#])\\s*", srell::regex::multiline);	//  (#.*)?$");
 887 | 		const srell::regex re_space(" ");
 888 | 		const std::string name_common("Common");
 889 | 		const std::string name_inherited("Inherited");
 890 | 		ucprange_array common = scx[name_common];
 891 | 		ucprange_array inherited = scx[name_inherited];
 892 | 		ucprange range;
 893 | 		std::map<std::string, bool> warning_out;
 894 | 		std::string data;
 895 | 		matchranges_type lines;
 896 | 		srell::cmatch cmatch;
 897 | 
 898 | 		unishared::read_file(data, filename, indir);
 899 | 
 900 | 		lines.clear();
 901 | 		split2(lines, data, '\n');
 902 | 
 903 | 		matchranges_type::size_type i = read_license(licensetext, lines, 0);
 904 | 
 905 | 		for (; i < lines.size(); ++i)
 906 | 		{
 907 | 			const srell::csub_match &line = lines[i];
 908 | 
 909 | 			if (srell::regex_search(line.first, line.second, cmatch, re_scxdata, srell::regex_constants::match_continuous))
 910 | 			{
 911 | 				const srell::cmatch::value_type &begin = cmatch[1];
 912 | 				const srell::cmatch::value_type &end = cmatch[2];
 913 | 				const srell::cmatch::value_type &scxnames = cmatch[3];
 914 | //				const srell::cmatch::value_type &comment = cmatch[4];
 915 | 
 916 | 				range.first = static_cast<ui_l32>(std::strtol(begin.str().c_str(), NULL, 16));
 917 | 				range.second = end.matched ? static_cast<ui_l32>(std::strtol(end.str().c_str(), NULL, 16)) : range.first;
 918 | 
 919 | 				common.remove_range(range);
 920 | 				inherited.remove_range(range);
 921 | 
 922 | 				srell::cregex_iterator2 rei2s(scxnames.first, scxnames.second, re_space);
 923 | 
 924 | 				for (rei2s.split_begin();; rei2s.split_next())
 925 | 				{
 926 | 					const std::string scriptname(!rei2s.done() ? rei2s.split_range() : rei2s.split_remainder());
 927 | 
 928 | 					if (scriptname.size())
 929 | 					{
 930 | 						const canonicalname_mapper::const_iterator it = canonicalnames.find(scriptname);
 931 | 
 932 | 						if (it != canonicalnames.end())
 933 | 							scx[it->second].join(range);
 934 | 						else
 935 | 						{
 936 | //							unishared::throw_error("Canonical name for \"%s\" is not found.", scriptname.c_str());
 937 | 							if (!warning_out.count(scriptname))
 938 | 							{
 939 | 								std::printf("[Info] Canonical name for \"%s\" is not found. New script?\n", scriptname.c_str());
 940 | 								warning_out[scriptname] = true;
 941 | 							}
 942 | 						}
 943 | 					}
 944 | 					if (rei2s.done())
 945 | 						break;
 946 | 				}
 947 | 			}
 948 | 		}
 949 | 		scx[name_common] = common;
 950 | 		scx[name_inherited] = inherited;
 951 | 	}
 952 | 
 953 | 	void combine_properties(sortedrangeholder &base, const rangeholder &addition, const char *const ptype, const char *const *aliasnames)
 954 | 	{
 955 | 		strings_type aliases;
 956 | 
 957 | 		for (; **aliasnames; ++aliasnames)
 958 | 			aliases.push_back(std::string(*aliasnames));
 959 | 
 960 | 		return combine_properties(base, addition, ptype, aliases);
 961 | 	}
 962 | 
 963 | 	void combine_properties(sortedrangeholder &base, const rangeholder &addition, const char *const ptype, const strings_type &aliasnames)
 964 | 	{
 965 | 		sorted_name_and_ranges elem;
 966 | 		matchranges_type names;
 967 | 
 968 | 		for (strings_type::size_type i = 0; i < aliasnames.size(); ++i)
 969 | 		{
 970 | 			const std::string &aliases = aliasnames[i];
 971 | 			bool pdata_found = false;
 972 | 
 973 | 			names.clear();
 974 | 			split2(names, aliases, ':');
 975 | 
 976 | 			const std::string canonicalname(names[0].str());
 977 | 
 978 | 			for (matchranges_type::size_type j = 0; j < names.size(); ++j)
 979 | 			{
 980 | 				const rangeholder::const_iterator it = addition.find(names[j].str());
 981 | 
 982 | 				if (it != addition.end())
 983 | 				{
 984 | 					elem.ucpranges = it->second;
 985 | 					pdata_found = true;
 986 | 					break;
 987 | 				}
 988 | 			}
 989 | 
 990 | 			if (!pdata_found)
 991 | 				unishared::throw_error("No property value for \"%s\" found.", aliases.c_str());
 992 | 
 993 | 			elem.ptype = ptype;
 994 | 			elem.canonicalname = canonicalname;
 995 | 			elem.namealiases = aliases;
 996 | 			base.push_back(elem);
 997 | 		}
 998 | 	}
 999 | 
1000 | #if !defined(SRELL_NO_VMODE)
1001 | 
1002 | 	void combine_pos(sortedseqholder &base, const seqholder &addition, const char *const ptype, const char *const *aliasnames)
1003 | 	{
1004 | 		ui_l32 total = 0;
1005 | 		sorted_name_and_seqs elem;
1006 | 		matchranges_type names;
1007 | 		u32array compclass;
1008 | 
1009 | 		//  Composite class.
1010 | 		compclass.push_back_c(compositeclass);
1011 | 		compclass.push_back_c(0);
1012 | 
1013 | 		elem.ptype = ptype;
1014 | 		for (; **aliasnames; ++aliasnames)
1015 | 		{
1016 | 			const std::string aliases(*aliasnames);
1017 | 			bool pdata_found = false;
1018 | 
1019 | 			names.clear();
1020 | 			split2(names, aliases, ':');
1021 | 
1022 | 			const std::string canonicalname(names[0].str());
1023 | 
1024 | 			for (strings_type::size_type i = 0; i < names.size(); ++i)
1025 | 			{
1026 | 				const seqholder::const_iterator it = addition.find(names[i].str());
1027 | 
1028 | 				if (it != addition.end())
1029 | 				{
1030 | 					elem.ucpseqs = it->second;
1031 | 					pdata_found = true;
1032 | 					if (elem.ucpseqs.size() != 1 || elem.ucpseqs[0] != compositeclass)
1033 | 					{
1034 | 						compclass.push_back(static_cast<ui_l32>(elem.ucpseqs.size()));
1035 | 						total += static_cast<ui_l32>(elem.ucpseqs.size());
1036 | 					}
1037 | 					break;
1038 | 				}
1039 | 			}
1040 | 
1041 | 			if (!pdata_found)
1042 | 				unishared::throw_error("No property value for \"%s\" found.", aliases.c_str());
1043 | 
1044 | 			elem.canonicalname = canonicalname;
1045 | 			elem.namealiases = aliases;
1046 | 			base.push_back(elem);
1047 | 		}
1048 | 
1049 | 		//  Composite class.
1050 | 		compclass[1] = total;
1051 | 		base[0].ucpseqs = compclass;	//  [0] = RGI_Emoji.
1052 | 	}
1053 | 
1054 | #endif	//  !defined(SRELL_NO_VMODE)
1055 | 
1056 | 	name_mapper create_ptype_mappings()
1057 | 	{
1058 | 		name_mapper categories;
1059 | 
1060 | 		categories["gc"] = "general_category";
1061 | 		categories["bp"] = "binary";
1062 | 		categories["sc"] = "script";
1063 | 		categories["scx"] = "script_extensions";
1064 | 		return categories;
1065 | 	}
1066 | 
1067 | 	std::string create_ptypes(const name_mapper &ptypes)
1068 | 	{
1069 | 		const char *names[] = { "bp", "gc", "sc", "scx", "" };
1070 | 		const std::string t2prefix = "\tuptype_";
1071 | 		std::string ptypedef;
1072 | 
1073 | 		for (unsigned int i = 0; *names[i];)
1074 | 		{
1075 | 			const char *const name = names[i];
1076 | 			const name_mapper::const_iterator it = ptypes.find(name);
1077 | 
1078 | 			if (it == ptypes.end())
1079 | 				unishared::throw_error("Name for ptype \"%s\" is not found.", name);
1080 | 
1081 | 			ptypedef += t2prefix + name + " = " + unishared::to_string(++i) + ",\n";
1082 | 		}
1083 | 		return ptypedef;
1084 | 	}
1085 | 
1086 | 	std::string ranges_to_string(const ucprange_array &array, const std::string &indent, const bool composite)
1087 | 	{
1088 | 		std::string rangestring(indent);
1089 | 
1090 | 		if (composite)
1091 | 		{
1092 | 			rangestring += "//  ";
1093 | 
1094 | 			for (ucprange_array::size_type i = 1; i < array.size(); ++i)
1095 | 			{
1096 | 				const ucprange &range = array[i];
1097 | 
1098 | 				if (i > 1)
1099 | 					rangestring += " + ";
1100 | 				rangestring += static_cast<char>(range.first);
1101 | 				rangestring += static_cast<char>(range.second);
1102 | 				rangestring += ':' + unishared::to_string(array[++i].first);
1103 | 			}
1104 | 		}
1105 | 		else
1106 | 		{
1107 | 			unsigned count = 0;
1108 | 
1109 | 			for (ucprange_array::size_type i = 0; i < array.size(); ++i)
1110 | 			{
1111 | 				const ucprange &range = array[i];
1112 | 				if (count == 4)
1113 | 				{
1114 | 					count = 0;
1115 | 					rangestring += '\n' + indent;
1116 | 				}
1117 | 				else if (count)
1118 | 					rangestring += ' ';
1119 | 
1120 | 				rangestring += "0x" + unishared::to_string(range.first, 16, 4) + ", 0x" + unishared::to_string(range.second, 16, 4) + ',';
1121 | 				++count;
1122 | 			}
1123 | 		}
1124 | 		return rangestring;
1125 | 	}
1126 | 
1127 | #if !defined(SRELL_NO_VMODE)
1128 | 	std::string seqs_to_string(const u32array &array, const std::string &indent)
1129 | 	{
1130 | 		std::string seqstring;
1131 | 
1132 | 		if (array.size() == 1 && array[0] == compositeclass)
1133 | 		{
1134 | 		}
1135 | 		else
1136 | 		{
1137 | 			for (u32array::size_type i = 0; i < array.size();)
1138 | 			{
1139 | 				const ui_l32 num = array[i];
1140 | 
1141 | 				if (num == compositeclass)
1142 | 					break;
1143 | 
1144 | 				if (num == 0)	//  Padding.
1145 | 				{
1146 | 					seqstring += indent + "0,\t//  Padding.\n";
1147 | 					break;
1148 | 				}
1149 | 
1150 | 				if (++i == array.size())
1151 | 					unishared::throw_error("[InternalError] No data follows %u.", num);
1152 | 
1153 | 				seqstring += indent + unishared::to_string(num);
1154 | 				seqstring += ", 0x" + unishared::to_string(array[i++], 16, 4);
1155 | 
1156 | 				if (num == 1)	//  Range.
1157 | 				{
1158 | 					if (i == array.size())
1159 | 						unishared::throw_error("[InternalError] No pair for %.4lX.", array[i - 1]);
1160 | 
1161 | 					seqstring += ", 0x" + unishared::to_string(array[i++], 16, 4);
1162 | 				}
1163 | 				else
1164 | 				{
1165 | 					for (ui_l32 j = 2; j < num; ++j)
1166 | 					{
1167 | 						if (i == array.size())
1168 | 							unishared::throw_error("[InternalError] Broken after %.4lX.", array[i - 1]);
1169 | 
1170 | 						seqstring += ", 0x" + unishared::to_string(array[i++], 16, 4);
1171 | 					}
1172 | 				}
1173 | 				seqstring += ",\n";
1174 | 			}
1175 | 
1176 | 			if (seqstring.size())
1177 | 				seqstring.resize(seqstring.size() - 1);
1178 | 		}
1179 | 		return seqstring;
1180 | 	}
1181 | #endif	//  !defined(SRELL_NO_VMODE)
1182 | 
1183 | 	void drop_finalcomma(std::string &data)
1184 | 	{
1185 | 		std::string::size_type commapos = data.rfind(',');
1186 | 		if (commapos != std::string::npos)
1187 | 			data.erase(commapos, 1);
1188 | 	}
1189 | 
1190 | 	std::string create_pnametable(ui_l32 &count, const std::string &indent, const up_options &opts)
1191 | 	{
1192 | 		const char *const *pnames = updata::property_names;
1193 | 		std::string out;
1194 | 		namenumber_mapper categories;
1195 | 
1196 | 		count = 0u;
1197 | 		for (unsigned int i = 2; **pnames; ++pnames, ++i)
1198 | 		{
1199 | 			const std::string names(*pnames);
1200 | 			srell::cregex_iterator2 rei2(names, re_colon_);
1201 | 
1202 | 			for (rei2.split_begin();; rei2.split_next())
1203 | 			{
1204 | 				const std::string name(!rei2.done() ? rei2.split_range() : rei2.split_remainder());
1205 | 				categories[name] = i;
1206 | 				++count;
1207 | 
1208 | 				if (rei2.done())
1209 | 					break;
1210 | 			}
1211 | 		}
1212 | 
1213 | 		out.assign(indent + "{ \"\", " + unishared::to_string(count) + " },\n");
1214 | 
1215 | 		for (namenumber_mapper::const_iterator it = categories.begin(); it != categories.end(); ++it)
1216 | 		{
1217 | 			out.append(indent);
1218 | 			out.append("{ \"");
1219 | 			if (!opts.noesc)
1220 | 				out.append(escape_string(it->first));
1221 | 			else
1222 | 				out.append(it->first);
1223 | 			out.append("\", " + unishared::to_string(it->second) + " },\n");
1224 | 		}
1225 | 		return out;
1226 | 	}
1227 | 
1228 | 	std::string join_dropcomma_append(const strings_type &s, const std::string &return_table)
1229 | 	{
1230 | 		std::string tmp(join('\n', s, true));
1231 | 
1232 | 		drop_finalcomma(tmp);
1233 | 		tmp.append(return_table);
1234 | 		return tmp;
1235 | 	}
1236 | 
1237 | 	void do_formatting(std::string &out, const sortedrangeholder &alldata, const sortedseqholder &emsq, const up_options &opts)
1238 | 	{	//  Appends to 'out' the generated text: enum upid_type, struct unicode_property_data, its three tables, and SRELL_UPDATA_VERSION.
1239 | 		const std::size_t numofproperties = sizeof (updata::property_names) / sizeof (updata::property_names[0]) + 1;
1240 | 		const std::string template1("template <typename T1, typename T2, typename T3>\n");
1241 | 		const std::string template2("unicode_property_data<T1, T2, T3>::");
1242 | 		const std::string return_table("};\n");
1243 | 		const std::string indent("\t");
1244 | 		name_mapper ptype_mappings(create_ptype_mappings());
1245 | 		const std::string ptypes(create_ptypes(ptype_mappings));
1246 | 		const std::string t1head("\t");
1247 | 		const std::string t1prefix(t1head + "upid_");
1248 | 
1249 | 		const ui_l32 pno_base = numofproperties;	//  Numbering starts after the slots merge_posinfo() fills later (#1 .. #numofproperties-1).
1250 | 		ui_l32 offset = 0u;
1251 | 		ui_l32 property_number = pno_base;
1252 | 		ui_l32 property_id_number = pno_base;
1253 | 
1254 | 		std::string pnumbers(t1prefix + "unknown = 0,\n");
1255 | 		strings_type rangetable;
1256 | 		strings_type lookup_ranges;
1257 | 		std::string lookup_numbers;
1258 | 		namenumber_mapper rangeno_map;
1259 | 
1260 | 		pnumbers += t1prefix + "invalid = 0,\n";
1261 | 		pnumbers += t1prefix + "error = 0,\n";
1262 | 		pnumbers += ptypes;
1263 | 
1264 | 		namenumber_mapper registered;
1265 | 		srell::re_detail::simple_array<ucprange> rangepos;
1266 | 		srell::cregex_iterator2 rei2;
1267 | 		//  Pass 1: range properties. An entry whose range text was already emitted reuses that property number.
1268 | 		for (sortedrangeholder::size_type i = 0; i < alldata.size(); ++i)
1269 | 		{
1270 | 			const sorted_name_and_ranges &elem = alldata[i];
1271 | 			const std::string ptype = elem.ptype;
1272 | 			const std::string name = elem.canonicalname;
1273 | 			const std::string aliases = elem.namealiases;
1274 | 			const ucprange_array &array = elem.ucpranges;
1275 | 			const std::string pnumber_keyname(ptype + '_' + name);
1276 | 			const std::string position_comment(' ' + ptype + '=' + aliases);
1277 | 			const bool compositeclass_found = array.size() && array[0].first == compositeclass;
1278 | 			std::string rangestring(ranges_to_string(array, indent, compositeclass_found));
1279 | 			ui_l32 numofranges = static_cast<ui_l32 >(array.size());
1280 | 			ui_l32 pno = property_number;
1281 | 			const namenumber_mapper::const_iterator rit = registered.find(rangestring);
1282 | 
1283 | 			if (rit != registered.end())
1284 | 			{
1285 | 				pno = rit->second;
1286 | 
1287 | 				lookup_ranges[pno - pno_base] += position_comment;
1288 | 				rangetable[(pno - pno_base) * 2] += position_comment;
1289 | 
1290 | 				rei2.assign(aliases, re_colon_);
1291 | 
1292 | 				for (rei2.split_begin();; rei2.split_next())
1293 | 				{
1294 | 					const std::string alias(!rei2.done() ? rei2.split_range() : rei2.split_remainder());
1295 | 
1296 | 					rangeno_map[ptype + ':' + alias] = pno;
1297 | 					if (rei2.done())
1298 | 						break;
1299 | 				}
1300 | 			}
1301 | 			else
1302 | 			{
1303 | 				//  ucpranges of "Assigned" is empty.
1304 | 				if (compositeclass_found)
1305 | 				{
1306 | 					std::printf("[Info] Composite property \"%s\" found.\n", aliases.c_str());
1307 | 					numofranges = array[0].second;
1308 | 				}
1309 | 				else
1310 | 					registered[rangestring] = property_number;
1311 | 
1312 | 				rei2.assign(aliases, re_colon_);
1313 | 
1314 | 				for (rei2.split_begin();; rei2.split_next())
1315 | 				{
1316 | 					const std::string alias(!rei2.done() ? rei2.split_range() : rei2.split_remainder());
1317 | 
1318 | 					rangeno_map[ptype + ':' + alias] = property_number;
1319 | 					if (rei2.done())
1320 | 						break;
1321 | 				}
1322 | 
1323 | 				lookup_ranges.push_back(indent + "{ " + unishared::to_string(offset) + ", " + unishared::to_string(numofranges) + " },\t//  #" + unishared::to_string(pno) + position_comment);
1324 | 
1325 | 				rangetable.push_back(indent + "//  #" + unishared::to_string(pno) + " (" + unishared::to_string(offset) + '+' + unishared::to_string(numofranges) + "):" + position_comment);
1326 | 				rangetable.push_back(rangestring);
1327 | 
1328 | 				rangepos.push_back(ucprange_helper(offset, numofranges));
1329 | 
1330 | 				if (!compositeclass_found)
1331 | 					offset += numofranges;
1332 | 
1333 | 				++property_number;
1334 | 			}
1335 | 
1336 | 			pnumbers.append(t1head + pnumber_keyname + " = " + unishared::to_string(pno) + "," + (pno != property_id_number ? ("\t//  #" + unishared::to_string(property_id_number)) : "") + '\n');
1337 | 
1338 | 			++property_id_number;
1339 | 		}
1340 | 
1341 | 		pnumbers.append(t1prefix + "max_property_number = " + unishared::to_string(property_number - 1) + ",\n");
1342 | 
1343 | #if !defined(SRELL_NO_VMODE)
1344 | 		if (rangetable.size())
1345 | 			drop_finalcomma(rangetable[rangetable.size() - 1]);
1346 | 
1347 | 		rangetable.push_back("#if !defined(SRELL_NO_UNICODE_POS)\n" + indent + ",");
1348 | 		//  Pass 2: sequence entries from emsq; every entry takes a new property number.
1349 | 		for (sortedseqholder::size_type i = 0; i < emsq.size(); ++i)
1350 | 		{
1351 | 			const sorted_name_and_seqs &elem = emsq[i];
1352 | 			const std::string ptype = elem.ptype;
1353 | 			const std::string name = elem.canonicalname;
1354 | 			const std::string aliases = elem.namealiases;
1355 | 			const u32array &array = elem.ucpseqs;
1356 | 			const bool compositeclass_found = array.size() && array[0] == compositeclass;
1357 | 			const std::string pnumber_keyname(ptype + '_' + name);
1358 | 			const std::string position_comment(' ' + ptype + '=' + aliases);
1359 | 			ui_l32 numofseqs = static_cast<ui_l32>(array.size());
1360 | 			std::string seqstring;
1361 | 
1362 | 			if (compositeclass_found)
1363 | 			{
1364 | 				std::printf("[Info] Composite property \"%s\" found.\n", aliases.c_str());
1365 | 				numofseqs = array[1];
1366 | 				seqstring = indent + "//  ";
1367 | 
1368 | 				for (u32array::size_type j = 2; j < array.size(); ++j)
1369 | 				{
1370 | 					if (j > 2)
1371 | 						seqstring += " + ";
1372 | 					seqstring += unishared::to_string(array[j]) + "/2";
1373 | 				}
1374 | 			}
1375 | 			else
1376 | 				seqstring = seqs_to_string(array, indent);
1377 | 
1378 | 			const ui_l32 numofranges = numofseqs / 2;
1379 | 
1380 | 			pnumbers.append(t1head + pnumber_keyname + " = " + unishared::to_string(property_number) + ",\t//  #" + unishared::to_string(property_id_number) + '\n');
1381 | 
1382 | 			rei2.assign(aliases, re_colon_);
1383 | 
1384 | 			for (rei2.split_begin();; rei2.split_next())
1385 | 			{
1386 | 				const std::string alias(!rei2.done() ? rei2.split_range() : rei2.split_remainder());
1387 | 				//  Key on each single alias (not the whole colon-joined list), matching the two range loops above.
1388 | 				rangeno_map[ptype + ':' + alias] = property_number;
1389 | 				if (rei2.done())
1390 | 					break;
1391 | 			}
1392 | 
1393 | 			lookup_ranges.push_back(indent + "{ " + unishared::to_string(offset) + ", " + unishared::to_string(numofranges) + " },\t//  #" + unishared::to_string(property_number) + position_comment);
1394 | 			rangetable.push_back(indent + "//  #" + unishared::to_string(property_number) + " (" + unishared::to_string(offset) + '+' + unishared::to_string(numofseqs) + "/2):" + position_comment);
1395 | 			rangetable.push_back(seqstring);
1396 | 
1397 | 			++property_number;
1398 | 			++property_id_number;
1399 | 			if (!compositeclass_found)
1400 | 				offset += numofranges;
1401 | 		}
1402 | 
1403 | 		pnumbers.append(t1prefix + "max_pos_number = " + unishared::to_string(--property_number) + "\n");
1404 | 		rangetable.push_back("#endif\t//  !defined(SRELL_NO_UNICODE_POS)");
1405 | 
1406 | #endif	//  !defined(SRELL_NO_VMODE)
1407 | 		//  Build the name tables, then put the per-property-type position lines at the front of lookup_ranges.
1408 | 		ui_l32 basepos = 0u;
1409 | 		std::string pnames(create_pnametable(basepos, indent, opts));
1410 | 		u32pair posinfo[numofproperties];
1411 | 
1412 | 		sort_rangeno_table(posinfo, basepos, lookup_numbers, rangeno_map, indent, opts);
1413 | 		lookup_numbers.append(return_table);
1414 | 		merge_posinfo(lookup_ranges, posinfo, numofproperties, indent);
1415 | 		pnames.insert(0, template1 + "const T1 " + template2 + "propertynumbertable[] =\n{\n");
1416 | 
1417 | 		out.append("enum upid_type\n{\n");
1418 | 		out.append(pnumbers);
1419 | 		out.append("};\n\n");
1420 | 
1421 | 		out.append(template1 + "struct unicode_property_data\n{\n");
1422 | 		out.append("\tstatic const T1 propertynumbertable[];\n");
1423 | 		out.append("\tstatic const T2 positiontable[];\n");
1424 | 		out.append("\tstatic const T3 rangetable[];\n");
1425 | 		out.append("};\n\n");
1426 | 
1427 | 		out.append(pnames);
1428 | 		out.append(lookup_numbers);
1429 | 		out.append("\n");
1430 | 
1431 | 		out.append(template1 + "const T2 " + template2 + "positiontable[] =\n{\n\t{ 0, 0 },\t//  #0 unknown\n");
1432 | 		out.append(join_dropcomma_append(lookup_ranges, return_table));
1433 | 		out.append("\n");
1434 | 
1435 | 		out.append(template1 + "const T3 " + template2 + "rangetable[] =\n{\n");
1436 | 		out.append(join_dropcomma_append(rangetable, return_table));
1437 | 
1438 | 		out.append("#define SRELL_UPDATA_VERSION " + unishared::to_string(static_cast<unsigned int>(opts.version)) + "\n");
1439 | 	}
1440 | 
1441 | 	void sort_rangeno_table(u32pair *const posinfo, ui_l32 offset, std::string &lookup_numbers, const namenumber_mapper &rangeno_map, const std::string &indent, const up_options &opts)
1442 | 	{
1443 | 		//  Splits every key of rangeno_map with re_colon_ into (property, value) and groups the output lines per property.
1444 | 		static const char *const emit_order[4] = { "gc", "bp", "sc", "scx" };
1445 | 		static const unsigned int posinfo_slot[4] = { 2u, 1u, 3u, 4u };
1446 | 		name_mapper lines_of;
1447 | 		namenumber_mapper count_of;
1448 | 		strings_type parts;
1449 | 		srell::cregex_iterator2 splitter;
1450 | 
1451 | 		for (namenumber_mapper::const_iterator entry = rangeno_map.begin(); entry != rangeno_map.end(); ++entry)
1452 | 		{
1453 | 			parts.clear();
1454 | 			splitter.assign(entry->first, re_colon_);
1455 | 			splitter.split_begin();
1456 | 
1457 | 			//  Collect at most three pieces; a key that does not yield exactly two is skipped below.
1458 | 			for (;;)
1459 | 			{
1460 | 				parts.push_back(splitter.split_aptrange());
1461 | 				if (splitter.done() || parts.size() > 2)
1462 | 					break;
1463 | 				splitter.split_next();
1464 | 			}
1465 | 
1466 | 			if (parts.size() != 2)
1467 | 				continue;
1468 | 
1469 | 			const std::string shown(!opts.noesc ? escape_string(parts[1]) : parts[1]);
1470 | 
1471 | 			lines_of[parts[0]] += indent + "{ \"" + shown + "\", " + unishared::to_string(entry->second) + " },\n";
1472 | 			++count_of[parts[0]];
1473 | 		}
1474 | 
1475 | 		for (unsigned int k = 0; k < 4; ++k)	//  Output order is gc, bp, sc, scx; each group is recorded in its own posinfo slot.
1476 | 			offset += set_pvalue_and_count(lookup_numbers, posinfo[posinfo_slot[k]], emit_order[k], offset, count_of, lines_of, indent);
1477 | 		drop_finalcomma(lookup_numbers);
1478 | 	}
1479 | 
1480 | 	ui_l32 set_pvalue_and_count(std::string &lookup_numbers, u32pair &posinfo, const std::string category, const ui_l32 offset, namenumber_mapper &pcounts, name_mapper &pvalues, const std::string &indent)
1481 | 	{
1482 | 		posinfo.set(offset + 1, pcounts[category]);	//  Records (offset + 1, count stored for this category).
1483 | 		lookup_numbers += indent + "//  " + category + ": " + unishared::to_string(pcounts[category]) + "\n" + pvalues[category];
1484 | 		return posinfo.second;	//  The caller adds this to its running offset.
1485 | 	}
1486 | 
1487 | 	void merge_posinfo(strings_type &lookup_ranges, const u32pair *const posinfo, const std::size_t numofproperties, const std::string &indent)
1488 | 	{	//  Inserts one "{ first, second }" line for each of posinfo[1 .. numofproperties-1] at the front of lookup_ranges, in index order.
1489 | 		for (std::size_t pno = 1; pno < numofproperties; ++pno)
1490 | 		{
1491 | 			const char *const label = (pno == 1) ? "binary" : updata::property_names[pno - 2];
1492 | 			std::string entry(indent + "{ " + unishared::to_string(posinfo[pno].first) + ", " + unishared::to_string(posinfo[pno].second) + " },\t//  #");
1493 | 			entry += unishared::to_string(pno) + ' ' + label;
1494 | 			lookup_ranges.insert(lookup_ranges.begin() + pno - 1, entry);
1495 | 		}
1496 | 	}
1497 | 
1498 | 	std::string escape_string(const std::string &s)
1499 | 	{
1500 | 		//  Every byte of s becomes "\xHH" (two uppercase hex digits), whether printable or not.
1501 | 		static const char digits[] = "0123456789ABCDEF";
1502 | 		std::string escaped;
1503 | 		for (std::string::const_iterator cur = s.begin(); cur != s.end(); ++cur)
1504 | 		{
1505 | 			const char quad[4] = { '\\', 'x', digits[(*cur >> 4) & 15], digits[*cur & 15] };
1506 | 
1507 | 			escaped.append(quad, 4);
1508 | 		}
1509 | 		return escaped;
1510 | 	}
1511 | 
1512 | 	srell::regex re_colon_;
1513 | };
1514 | //  class unicode_property
1515 | 
1516 | int main(const int argc, const char *const *const argv)
1517 | {
1518 | 	//  Generate the data text first; write it out only when generation reported no error.
1519 | 	up_options options(argc, argv);
1520 | 	std::string generated;
1521 | 	unicode_property generator;
1522 | 	const int status = generator.create_updata(generated, options);
1523 | 
1524 | 	if (status != 0)
1525 | 		return status;	//  Pass the generator's error code through unchanged.
1526 | 
1527 | 	//  A failed write is reported as exit code 2.
1528 | 	return unishared::write_file(options.outfilename, generated) ? 0 : 2;
1529 | }
1530 | 


--------------------------------------------------------------------------------