├── README.md
├── ReflectionTypeHint.php
├── ReflectionTypeHint_example.php
├── Text
└── Censure.php
├── UTF8-CHANGELOG.txt
└── UTF8.php
/README.md:
--------------------------------------------------------------------------------
1 | php-censure
2 | ===========
3 |
4 | Клон одноименной библиотеки антимата с google-code
5 |
6 | Оригинал можно найти по ссылке: http://code.google.com/p/php-censure/
7 |
8 | Библиотека позволяет определить наличие в тексте на русском языке мата (в том числе многие криптованные варианты)
9 | и/или заменить его произвольным набором символов
10 |
11 | Оригинальное описание
12 |
13 | Алгоритм достаточно надёжен и быстр, в т.ч. на больших объёмах данных
14 | Метод обнаружения мата основывается на корнях и предлогах русского языка, а не на словаре
15 | Слова "лох", "хер", "залупа", "сука" матерными словами не считаются (см. словарь Даля)
16 | Разработка ведётся с 2005 года
17 |
18 | Согласно статье 20.1 КоАП РФ нецензурная брань в общественных местах (интернет — место общественное)
19 | расценивается как мелкое хулиганство, за что установлена административная ответственность — наложение
20 | штрафа в размере от пятисот до одной тысячи рублей или административный арест на срок до пятнадцати суток.
21 |
--------------------------------------------------------------------------------
/ReflectionTypeHint.php:
--------------------------------------------------------------------------------
1 | 'is_int',
31 | 'integer' => 'is_int',
32 | 'digit' => 'ctype_digit',
33 | 'number' => 'ctype_digit',
34 | 'float' => 'is_float',
35 | 'double' => 'is_float',
36 | 'real' => 'is_float',
37 | 'numeric' => 'is_numeric',
38 | 'str' => 'is_string',
39 | 'string' => 'is_string',
40 | 'char' => 'is_string',
41 | 'bool' => 'is_bool',
42 | 'boolean' => 'is_bool',
43 | 'null' => 'is_null',
44 | 'array' => 'is_array',
45 | 'obj' => 'is_object',
46 | 'object' => 'is_object',
47 | 'res' => 'is_resource',
48 | 'resource' => 'is_resource',
49 | 'scalar' => 'is_scalar', #integer, float, string or boolean
50 | 'cb' => 'is_callable',
51 | 'callback' => 'is_callable',
52 | );
53 |
54 | # Static-only utility class: the private constructor prevents instantiation, so every method must be called statically.
55 | private function __construct() {}
56 |
/**
 * Validates the arguments of the calling function/method against the
 * "@param" tags of its phpDoc comment.
 *
 * Must be called from inside the function whose arguments are to be checked.
 * The check is skipped (TRUE is returned) when assertions are inactive,
 * when the caller received no arguments, or when the caller frame cannot be
 * determined.
 *
 * Emits E_USER_NOTICE when the phpDoc is incomplete or names a parameter
 * differently from the signature, and E_USER_WARNING when an argument has a
 * type that is not allowed by its "@param" tag.
 *
 * @return bool  FALSE if some argument has a disallowed type, TRUE otherwise
 */
public static function isValid()
{
    if (! assert_options(ASSERT_ACTIVE)) return true;

    #frame #1 describes the call of the function which has called isValid()
    $frame = self::debugBacktrace(null, 1);

    #read the frame fields explicitly: not every frame has 'class', 'type' or 'args',
    #and debugBacktrace() returns a plain list when the requested frame does not exist
    $args = (is_array($frame) && isset($frame['args'])) ? $frame['args'] : array();
    if (! $args) return true; #speed improve
    if (! isset($frame['function'])) return true;

    $function = $frame['function'];
    $class    = isset($frame['class']) ? $frame['class'] : '';
    $type     = isset($frame['type'])  ? $frame['type']  : '';
    $file     = isset($frame['file'])  ? $frame['file']  : '';
    $line     = isset($frame['line'])  ? $frame['line']  : 0;

    #plain functions have no class, they are reflected with ReflectionFunction
    $r = ($class !== '') ? new ReflectionMethod($class, $function)
                         : new ReflectionFunction($function);

    #getDocComment() returns FALSE when there is no doc comment
    $doc = (string) $r->getDocComment();
    preg_match_all('~ [\r\n]++ [\x20\t]++ \* [\x20\t]++
                      @param
                      [\x20\t]++
                      \K #memory reduce
                      ( [_a-z]++[_a-z\d]*+
                        (?>[|/,][_a-z]+[_a-z\d]*)*+
                      ) #1 types
                      [\x20\t]++
                      &?+\$([_a-z]++[_a-z\d]*+) #2 name
                    ~sixSX', $doc, $params, PREG_SET_ORDER);
    $parameters = $r->getParameters();

    if (count($parameters) > count($params))
    {
        $message = 'phpDoc %d piece(s) @param description expected in %s%s%s(), %s given, ' . PHP_EOL
                 . 'called in %s on line %d ' . PHP_EOL
                 . 'and defined in %s on line %d';
        $message = sprintf($message, count($parameters), $class, $type, $function, count($params), $file, $line, $r->getFileName(), $r->getStartLine());
        trigger_error($message, E_USER_NOTICE);
    }

    foreach ($args as $i => $value)
    {
        #nothing to compare with: the argument is not documented
        #or it was passed beyond the declared parameters list
        if (! isset($params[$i]) || ! isset($parameters[$i])) return true;

        if ($parameters[$i]->name !== $params[$i][2])
        {
            $param_num = $i + 1;
            $message = 'phpDoc @param %d in %s%s%s() must be named as $%s, $%s given, ' . PHP_EOL
                     . 'called in %s on line %d ' . PHP_EOL
                     . 'and defined in %s on line %d';
            $message = sprintf($message, $param_num, $class, $type, $function, $parameters[$i]->name, $params[$i][2], $file, $line, $r->getFileName(), $r->getStartLine());
            trigger_error($message, E_USER_NOTICE);
        }

        $hints = preg_split('~[|/,]~sSX', $params[$i][1]);
        if (! self::checkValueTypes($hints, $value))
        {
            $param_num = $i + 1;
            $message = 'Argument %d passed to %s%s%s() must be an %s, %s given, ' . PHP_EOL
                     . 'called in %s on line %d ' . PHP_EOL
                     . 'and defined in %s on line %d';
            $message = sprintf($message, $param_num, $class, $type, $function, implode('|', $hints), (is_object($value) ? get_class($value) . ' ' : '') . gettype($value), $file, $line, $r->getFileName(), $r->getStartLine());
            trigger_error($message, E_USER_WARNING);
            return false;
        }
    }
    return true;
}
113 |
/**
 * Builds a call stack in which call_user_func*() frames are merged into the
 * dummy frame that precedes them, so caller references stay correct.
 * The frame of this method itself is never included.
 * When $return_frame is given, only the frame with that index (0-based,
 * counted after all skipping) is returned instead of the whole stack.
 *
 * @param   string|null  $re_ignore     regexp for "Class::function" names whose frames are skipped,
 *                                      example: '~^' . preg_quote(__CLASS__, '~') . '(?![a-zA-Z\d])~sSX'
 * @param   int|null     $return_frame  index of the single frame to return
 * @return  array        the requested frame, or the list of all collected frames
 */
public static function debugBacktrace($re_ignore = null, $return_frame = null)
{
    $stack     = debug_backtrace();
    $total     = count($stack);
    $collected = array();
    $seen      = 0;

    for ($idx = 0; $idx < $total; $idx++)
    {
        $frame = $stack[$idx];
        #frames merged into the previous one were replaced with NULL
        if (! $frame) continue;

        $following = isset($stack[$idx + 1]) ? $stack[$idx + 1] : null;

        #a frame without 'file' is the dummy one placed before call_user_func*()
        if (! isset($frame['file']) && $following)
        {
            $frame['over_function'] = $following['function'];
            $frame += $following;
            $stack[$idx + 1] = null; #the call_user_func*() frame is consumed
        }

        #the very first frame is the call of this method
        $seen++;
        if ($seen < 2) continue;

        #the next frame tells inside which function the current frame was produced
        if ($re_ignore && $following)
        {
            $origin = '';
            if (isset($following['class']))    $origin .= $following['class'] . $following['type'];
            if (isset($following['function'])) $origin .= $following['function'];
            if (preg_match($re_ignore, $origin)) continue;
        }

        if (count($collected) === $return_frame) return $frame;
        $collected[] = $frame;
    }
    return $collected;
}
164 |
/**
 * Checks whether a value matches at least one of the allowed types.
 *
 * A type is either a key of self::$hints (its checker function is applied),
 * the word "mixed" (anything is accepted), or a class/interface name
 * (the value must be an instance of it). Type names are case-insensitive.
 *
 * @param   array  $types  list of type names
 * @param   mixed  $value  value to check
 * @return  bool           TRUE if the value matches any of the types
 */
public static function checkValueTypes(array $types, $value)
{
    foreach ($types as $type)
    {
        $type = strtolower($type);
        if ($type === 'mixed') return true;

        if (array_key_exists($type, self::$hints))
        {
            $checker = self::$hints[$type];
            if ($checker === 'ctype_digit')
            {
                #ctype_digit() treats integers from -128 to 255 as ASCII codes
                #and is deprecated for non-string arguments since PHP 8.1,
                #so non-negative integers are accepted here directly
                if (is_int($value) ? $value >= 0 : (is_string($value) && ctype_digit($value))) return true;
            }
            elseif (call_user_func($checker, $value)) return true;
        }

        #instanceof with a class name in a string never autoloads and never emits errors
        if (is_object($value) && $value instanceof $type) return true;
    }
    return false;
}
183 | }
--------------------------------------------------------------------------------
/ReflectionTypeHint_example.php:
--------------------------------------------------------------------------------
1 | myMethod('sss', 75467, new Exception(), true);
24 |
--------------------------------------------------------------------------------
/Text/Censure.php:
--------------------------------------------------------------------------------
1 | = 5.2.0
41 | * @param string $charset кодировка символов (родная кодировка -- UTF-8, для других будет прозрачное перекодирование)
42 | * @return bool|string|int|null Если $replace === NULL, то возвращает FALSE, если мат не обнаружен, иначе фрагмент текста с матерным словом.
43 | * Если $replace !== NULL, то возвращает исходную строку, где фрагменты мата заменены на $replace.
44 | * В случае возникновения ошибки возвращает код ошибки > 0 (integer):
45 | * * PREG_INTERNAL_ERROR
46 | * * PREG_BACKTRACK_LIMIT_ERROR (see also pcre.backtrack_limit)
47 | * * PREG_RECURSION_LIMIT_ERROR (see also pcre.recursion_limit)
48 | * * PREG_BAD_UTF8_ERROR
49 | * * PREG_BAD_UTF8_OFFSET_ERROR (since PHP 5.3.0)
50 | * Или -1, если ReflectionTypeHint вернул ошибку
51 | */
52 | public static function parse(
53 | $s,
54 | $delta = 3,
55 | $continue = "\xe2\x80\xa6",
56 | $is_html = true,
57 | $replace = null,
58 | $charset = 'UTF-8')
59 | {
60 | if (! ReflectionTypeHint::isValid()) return -1;
61 | if ($s === null) return null;
62 |
63 | static $re_badwords = null;
64 |
65 | if ($re_badwords === null)
66 | {
67 | #предлоги русского языка:
68 | #[всуо]|
69 | #по|за|на|об|до|от|вы|вс|вз|из|ис|
70 | #под|про|при|над|низ|раз|рас|воз|вос|
71 | #пооб|повы|пона|поза|недо|пере|одно|
72 | #полуза|произ|пораз|много|
73 | $pretext = array(
74 | #1
75 | '[уyоoаa]_? (?=[еёeхx])', #у, о (уебать, охуеть, ахуеть)
76 | '[вvbсc]_? (?=[хпбмгжxpmgj])', #в, с (впиздячить, схуярить)
77 | '[вvbсc]_?[ъь]_? (?=[еёe])', #въ, съ (съебаться, въебать)
78 | 'ё_? (?=[бb6])', #ё (ёбля)
79 | #2
80 | '[вvb]_?[ыi]_?', #вы
81 | '[зz3]_?[аa]_?', #за
82 | '[нnh]_?[аaеeиi]_?', #на, не, ни
83 | '[вvb]_?[сc]_? (?=[хпбмгжxpmgj])', #вс (вспизднуть)
84 | '[оo]_?[тtбb6]_? (?=[хпбмгжxpmgj])', #от, об
85 | '[оo]_?[тtбb6]_?[ъь]_? (?=[еёe])', #отъ, объ
86 | '[иiвvb]_?[зz3]_? (?=[хпбмгжxpmgj])', #[ив]з
87 | '[иiвvb]_?[зz3]_?[ъь]_? (?=[еёe])', #[ив]зъ
88 | '[иi]_?[сc]_? (?=[хпбмгжxpmgj])', #ис
89 | '[пpдdg]_?[оo]_? (?> [бb6]_? (?=[хпбмгжxpmgj])
90 | | [бb6]_? [ъь]_? (?=[еёe])
91 | | [зz3]_? [аa] _?
92 | )?', #по, до, пообъ, дообъ, поза, доза (двойные символы вырезаются!)
93 | #3
94 | '[пp]_?[рr]_?[оoиi]_?', #пр[ои]
95 | '[зz3]_?[лl]_?[оo]_?', #зло (злоебучая)
96 | '[нnh]_?[аa]_?[дdg]_? (?=[хпбмгжxpmgj])', #над
97 | '[нnh]_?[аa]_?[дdg]_?[ъь]_? (?=[еёe])', #надъ
98 | '[пp]_?[оoаa]_?[дdg]_? (?=[хпбмгжxpmgj])', #под
99 | '[пp]_?[оoаa]_?[дdg]_?[ъь]_? (?=[еёe])', #подъ
100 | '[рr]_?[аa]_?[зz3сc]_? (?=[хпбмгжxpmgj])', #ра[зс]
101 | '[рr]_?[аa]_?[зz3сc]_?[ъь]_? (?=[еёe])', #ра[зс]ъ
102 | '[вvb]_?[оo]_?[зz3сc]_? (?=[хпбмгжxpmgj])', #во[зс]
103 | '[вvb]_?[оo]_?[зz3сc]_?[ъь]_? (?=[еёe])', #во[зс]ъ
104 | #4
105 | '[нnh]_?[еe]_?[дdg]_?[оo]_?', #недо
106 | '[пp]_?[еe]_?[рr]_?[еe]_?', #пере
107 | '[oо]_?[дdg]_?[нnh]_?[оo]_?', #одно
108 | '[кk]_?[oо]_?[нnh]_?[оo]_?', #коно (коноебиться)
109 | '[мm]_?[уy]_?[дdg]_?[oоaа]_?', #муд[оа] (мудаёб)
110 | '[oо]_?[сc]_?[тt]_?[оo]_?', #осто (остопиздело)
111 | '[дdg]_?[уy]_?[рpr]_?[оoаa]_?', #дур[оа]
112 | '[хx]_?[уy]_?[дdg]_?[оoаa]_?', #худ[оа] (худоебина)
113 | #5
114 | '[мm]_?[нnh]_?[оo]_?[гg]_?[оo]_?', #много
115 | '[мm]_?[оo]_?[рpr]_?[дdg]_?[оoаa]_?', #морд[оа]
116 | '[мm]_?[оo]_?[зz3]_?[гg]_?[оoаa]_?', #мозг[оа]
117 | '[дdg]_?[оo]_?[лl]_?[бb6]_?[оoаa]_?', #долб[оа]
118 | '[оo]_?[сc]_?[тt]_?[рpr]_?[оo]_?', #остро
119 | );
120 |
121 | $badwords = array(
122 | #Слово на букву Х
123 | '(?<=\PL) %RE_PRETEXT%?
124 | [hхx]_?[уyu]_?[ийiеeёяюju] #хуй, хуя, хую, хуем, хуёвый, охуительный
125 | #исключения:
126 | (? '\x20', #пробел
218 | '\pL' => '[^\x20\d]', #буква
219 | '\PL' => '[\x20\d]', #не буква
220 | '[:vowel:]' => '[аеиоуыэюяёaeioyu]', #гласные буквы
221 | '[:consonant:]' => '[^аеиоуыэюяёaeioyu\x20\d]', #согласные буквы
222 | );
223 |
224 | $re_badwords = str_replace(
225 | '%RE_PRETEXT%',
226 | '(?:' . implode('|', $pretext) . ')', #однократный шаблон с альтернативами использовать нельзя!
227 | '~' . implode('|', $badwords) . '~sxuSX'
228 | );
229 | $re_badwords = strtr($re_badwords, $trans);
230 | }
231 |
232 | $s = UTF8::convert_from($s, $charset);
233 | $replace = UTF8::convert_from($replace, $charset);
234 |
235 | $ss = $s; #saves original string
236 |
237 | if ($is_html)
238 | {
239 | #скрипты не вырезаем, т.к. м.б. обходной маневр на с кодом на javascript:
240 | #
241 | #хотя давать пользователю возможность использовать код на javascript нехорошо
242 | $s = is_callable(array('HTML', 'strip_tags')) ? HTML::strip_tags($s, null, true, array('comment', 'style', 'map', 'frameset', 'object', 'applet'))
243 | : strip_tags($s);
244 | #заменяем html-сущности в "чистый" UTF-8
245 | $s = UTF8::html_entity_decode($s, $is_htmlspecialchars = true);
246 | }
247 |
248 | if (strtoupper(substr($charset, 0, 3)) === 'UTF') #UTF-8, UTF-16, UTF-32
249 | {
250 | #remove combining diactrical marks
251 | $additional_chars = array(
252 | "\xc2\xad", #"мягкие" переносы строк ()
253 | );
254 | $s = UTF8::diactrical_remove($s, $additional_chars);
255 | }
256 |
257 | #ВотБ/\яПидорыОхуелиБлятьНахуйПохуйПи3децПолный
258 | if (version_compare(PHP_VERSION, '5.2.0', '>='))
259 | {
260 | $s = preg_replace('~ [\p{Lu}3] (?>\p{Ll}+|/\\\\|[@36]+)++ #Вот
261 | (?= [\p{Lu}3] (?:\p{Ll} |/\\\\|[@36] ) ) #Бля
262 | ~sxuSX', '$0 ', $s);
263 | }
264 |
265 | $s = mb_strtolower($s);
266 |
267 | #получаем в массив только буквы и цифры
268 | #"с_л@о#во,с\xc2\xa7лово.Слово" -> "с л о во с лово слово слово слово слово"
269 | preg_match_all('~(?> \xd0[\xb0-\xbf]|\xd1[\x80-\x8f\x91] #[а-я]
270 | | /\\\\ #л
271 | | @ #а
272 | | [a-z\d]+
273 | )+
274 | ~sxSX', $s, $m);
275 | $s = ' ' . implode(' ', $m[0]) . ' ';
276 |
277 | $trans = array(
278 | '/\\' => 'л', #Б/\ЯТЬ --> БЛЯТЬ
279 | '@' => 'а', #пизд@ --> пизда
280 | );
281 | $s = strtr($s, $trans);
282 |
283 | #цифровые подделки под буквы
284 | $trans = array(
285 | '~ [3з]++ [3з\x20]*+ ~sxuSX' => 'з',
286 | '~ [6б]++ [6б\x20]*+ ~sxuSX' => 'б',
287 | );
288 | $s = preg_replace(array_keys($trans), array_values($trans), $s);
289 |
290 | #убираем все повторяющиеся символы, ловим обман типа "х-у-у-й"
291 | #"сллоооовоо слово х у у й" --> "слово слово х у й"
292 | $s = preg_replace('/( [\xd0\xd1][\x80-\xbf] \x20? #optimized [а-я]
293 | | [a-z\d] \x20?
294 | ) \\1+
295 | /sxSX', '$1', $s);
296 |
297 | if ($replace === null || version_compare(PHP_VERSION, '5.2.0', '<'))
298 | {
299 | $result = preg_match($re_badwords, $s, $m, PREG_OFFSET_CAPTURE);
300 | if (function_exists('preg_last_error') && preg_last_error() !== PREG_NO_ERROR) return preg_last_error();
301 | if ($result === false) return 1; #PREG_INTERNAL_ERROR = 1
302 | if ($result && $replace === null)
303 | {
304 | list($word, $offset) = $m[0];
305 | $s1 = substr($s, 0, $offset);
306 | $s2 = substr($s, $offset + strlen($word));
307 | $delta = intval($delta);
308 | if ($delta === 0) $fragment = '[' . trim($word) . ']';
309 | else
310 | {
311 | if ($delta < 1 || $delta > 10) $delta = 3;
312 | preg_match('/ (?> \x20 (?>[\xd0\xd1][\x80-\xbf]|[a-z\d]+)++ ){1,' . $delta . '}+
313 | \x20?+
314 | $/sxSX', $s1, $m1);
315 | preg_match('/^ (?>[\xd0\xd1][\x80-\xbf]|[a-z\d]+)*+ #ending
316 | \x20?+
317 | (?> (?>[\xd0\xd1][\x80-\xbf]|[a-z\d]+)++ \x20 ){0,' . $delta . '}+
318 | /sxSX', $s2, $m2);
319 | $fragment = (ltrim(@$m1[0]) !== ltrim($s1) ? $continue : '') .
320 | trim((isset($m1[0]) ? $m1[0] : '') . '[' . trim($word) . ']' . (isset($m2[0]) ? $m2[0] : '')) .
321 | (rtrim(@$m2[0]) !== rtrim($s2) ? $continue : '');
322 | }
323 | return UTF8::convert_to($fragment, $charset);
324 | }
325 | return false;
326 | }
327 |
328 | $result = preg_match_all($re_badwords, $s, $m);
329 | if (function_exists('preg_last_error') && preg_last_error() !== PREG_NO_ERROR) return preg_last_error();
330 | if ($result === false) return 1; #PREG_INTERNAL_ERROR = 1
331 | if ($result > 0)
332 | {
333 | #d($s, $m[0]);
334 | $s = $ss;
335 | #замена матного фрагмента на $replace
336 | foreach ($m[0] as $w)
337 | {
338 | $re_w = '~' . preg_replace_callback('~(?:/\\\\|[^\x20])~suSX', array('self', '_make_regexp_callback'), $w) . '~sxuiSX';
339 | $ss = preg_replace($re_w, $replace, $ss);
340 | #d($re_w);
341 | }
342 | while ($ss !== $s) $ss = self::parse($s = $ss, $delta, $continue, $is_html, $replace, 'UTF-8');
343 | }
344 | return UTF8::convert_to($ss, $charset);
345 | }
346 |
/**
 * preg_replace_callback() handler: turns one character of a detected word
 * into a regexp fragment (for a pattern with the /x flag) which matches that
 * character in both cases, its repetitions, its look-alike substitutes
 * ("@" for "а", "3" for "з", "6" for "б", "/\" for "л") and any
 * non-letter, non-digit "holes" following it.
 *
 * @param   array   $m  match data, $m[0] is the character
 * @return  string      regexp fragment ending with CRLF
 */
private static function _make_regexp_callback(array $m)
{
    #non letter, non digit, non '/\'
    $holes  = '(?!/\\\\)[^\p{L}\d]';
    $symbol = $m[0];

    #letters which have digit/sign look-alikes
    $lookalikes = array(
        'а' => '[@аА]',
        'з' => '[3зЗ]',
        'б' => '[6бБ]',
    );

    if ($symbol === 'л')
    {
        $fragment = '(?>[лЛ]+|/\\\\)++ (?>[:holes:]|[лЛ]+|/\\\\)*+';
    }
    else
    {
        if (is_string($symbol) && isset($lookalikes[$symbol])) $set = $lookalikes[$symbol];
        else
        {
            #/i together with /u does not work in PCRE-7.2 (BUG?),
            #so the character class lists the letter in both cases
            $set = '[' . preg_quote($symbol . UTF8::uppercase($symbol), '~') . ']';
        }
        $fragment = $set . '++ (?>[:holes:]|' . $set . '+)*+';
    }

    return str_replace('[:holes:]', $holes, $fragment . "\r\n");
}
365 | }
366 |
--------------------------------------------------------------------------------
/UTF8-CHANGELOG.txt:
--------------------------------------------------------------------------------
1 | 2.2.2 / 2011-06-24
2 |
3 | * Convert case functions improved: from all russian charsets to UTF8 native support was added
4 | * UTF8::stripos() speed improved
5 | * constant REPLACEMENT_CHAR added
6 |
7 | 2.2.1 / 2011-06-08
8 |
9 | * UTF8::preg_quote_case_insensitive() added
10 | * UTF8::stripos() speed improved
11 |
12 | 2.2.0 / 2011-06-06
13 |
14 | * UTF8::strlen(), UTF8::substr(), UTF8::strpos(),
15 | UTF8::html_entity_encode(), UTF8::html_entity_decode(),
16 | UTF8::convert_case(), UTF8::lowercase(), UTF8::uppercase() speed improved
17 | * UTF8::stripos(), UTF8::to_unicode(), UTF8::from_unicode() added
18 | * UTF8::strtolower(), UTF8::strtoupper() as wrapper to UTF8::convert_case() added
19 | * Unicode character database to 6.0.0 (2010-06-04) updated
20 | * UTF8::$convert_case_table improved
21 |
22 | 2.1.3 / 2011-05-31
23 |
24 | * UTF8::truncate() small bug fixed
25 |
26 | 2.1.2 / 2011-03-25
27 |
28 | * Класс требует PHP-5.3.x
29 | * UTF8::$char_re deprecated
30 | * Добавлен метод UTF8::tests(), который тестирует методы класса на правильность работы
31 | * Добавлены методы UTF8::strcmp(), UTF8::strncmp(), UTF8::strcasecmp()
32 | * UTF8::is_utf8(), UTF8::str_limit(), UTF8::str_split() speed improved
33 | * Добавлен 2-й параметр в UTF8::html_entity_encode()
34 | * Добавлен 3-й параметр в UTF8::ucwords()
35 | * Методы UTF8::convert_case(), UTF8::lowercase(), UTF8::uppercase() могут принимать массив в 1-м параметре
36 | * Мелкие улучшения в UTF8::strtr()
37 | * Модернизирован класс ReflectionTypeHint
38 |
39 | 2.1.1 / 2010-07-19
40 |
41 | * Добавлены методы array_change_key_case(), range(), strtr()
42 | * Улучшен метод convert_files_from()
43 | * Unicode Character Database 5.2.0
44 | * Исправлены ошибки в trim(), ltrim(), rtrim(), str_pad(), которые могут возникать в некоторых случаях
45 |
46 | 2.1.0 / 2010-03-26
47 |
48 | * Удалён метод unescape_recursive()
49 | * Добавлен метод convert_files_from()
50 | * Несколько методов теперь могут принимать массив и делать их обход рекурсивно
51 | * Почти все методы для обработки строк могут принимать и возвращать NULL
52 |
53 | 2.0.2 / 2010-02-13
54 |
55 | * Новые методы is_ascii(), ltrim(), rtrim(), trim(), str_pad(), strspn()
56 | * Исправлена небольшая ошибка в str_limit()
57 | * Исправлена ошибка в методах convert_from() и convert_to(): они ошибочно возвращали FALSE,
58 | если подать на вход массив, содержащий элементы типа boolean со значением FALSE
59 |
60 | 2.0.1 / 2010-02-08
61 |
62 | * Удалён метод convert_from_cp1259(), используйте convert_from('cp1251')
63 | * Метод convert_from_utf16() теперь приватный, используйте convert_from('UTF-16')
64 | * Добавлены методы convert_to(), diactrical_remove(), diactrical_restore()
65 | * Другие мелкие исправления
66 |
--------------------------------------------------------------------------------
/UTF8.php:
--------------------------------------------------------------------------------
1 | = 5.3.x
19 | *
20 | * In Russian:
21 | *
22 | * Поддержка UTF-8 в PHP 5.
23 | *
24 | * Возможности и преимущества использования этого класса
25 | * * Совместимость с интерфейсом стандартных PHP функций, работающих с однобайтовыми кодировками
26 | * * Возможность работы без PHP расширений ICONV и MBSTRING, если они есть, то активно используются!
27 | * * Полезные функции, отсутствующие в ICONV и MBSTRING
28 | * * Методы, которые принимают и возвращают строку, умеют принимать и возвращать null (удобно при выборках значений из базы данных)
29 | * * Несколько методов умеют обрабатывать массивы рекурсивно
30 | * * Единый интерфейс и инкапсуляция (можно унаследоваться и переопределить методы)
31 | * * Высокая производительность, надёжность и качественный код
32 | * * PHP >= 5.3.x
33 | *
34 | * Example:
35 | * $s = 'Hello, Привет';
36 | * if (UTF8::is_utf8($s)) echo UTF8::strlen($s);
37 | *
38 | * UTF-8 encoding scheme:
39 | * 2^7 0x00000000 — 0x0000007F 0xxxxxxx
40 | * 2^11 0x00000080 — 0x000007FF 110xxxxx 10xxxxxx
41 | * 2^16 0x00000800 — 0x0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
42 | * 2^21 0x00010000 — 0x001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
43 | * 1-4 bytes length: 2^7 + 2^11 + 2^16 + 2^21 = 2 164 864
44 | *
45 | * If I was a owner of the world, I would leave only 2 encoding: UTF-8 and UTF-32 ;-)
46 | *
47 | * Useful links
48 | * http://ru.wikipedia.org/wiki/UTF8
49 | * http://www.madore.org/~david/misc/unitest/ A Unicode Test Page
50 | * http://www.unicode.org/
51 | * http://www.unicode.org/reports/
52 | * http://www.unicode.org/reports/tr10/ Unicode Collation Algorithm
53 | * http://www.unicode.org/Public/UCA/6.0.0/ Unicode Collation Algorithm
54 | * http://www.unicode.org/reports/tr6/ A Standard Compression Scheme for Unicode
55 | * http://www.fileformat.info/info/unicode/char/search.htm Unicode Character Search
56 | *
57 | * @link http://code.google.com/p/php5-utf8/
58 | * @license http://creativecommons.org/licenses/by-sa/3.0/
59 | * @author Nasibullin Rinat
60 | * @version 2.2.2
61 | */
62 | class UTF8
63 | {
64 | #REPLACEMENT CHARACTER (for broken char)
65 | const REPLACEMENT_CHAR = "\xEF\xBF\xBD"; #U+FFFD
66 |
67 | /**
68 | * Regular expression for a character in UTF-8 without the use of a flag /u
69 | * @deprecated Instead, use a dot (".") and the flag /u, it works faster!
70 | * @var string
71 | */
72 | public static $char_re = ' [\x09\x0A\x0D\x20-\x7E] # ASCII strict
73 | # [\x00-\x7F] # ASCII non-strict (including control chars)
74 | | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
75 | | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
76 | | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
77 | | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
78 | | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
79 | | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
80 | | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
81 | ';
82 |
83 | /**
84 | * Combining diactrical marks (Unicode 5.1).
85 | *
86 | * For example, russian letters in composed form: "Ё" (U+0401), "Й" (U+0419),
87 | * decomposed form: (U+0415 U+0308), (U+0418 U+0306)
88 | *
89 | * @link http://www.unicode.org/charts/PDF/U0300.pdf
90 | * @link http://www.unicode.org/charts/PDF/U1DC0.pdf
91 | * @link http://www.unicode.org/charts/PDF/UFE20.pdf
92 | * @var string
93 | */
94 | #public static $diactrical_re = '\p{M}'; #alternative, but only with /u flag
95 | public static $diactrical_re = ' \xcc[\x80-\xb9]|\xcd[\x80-\xaf] #UNICODE range: U+0300 — U+036F (for letters)
96 | | \xe2\x83[\x90-\xbf] #UNICODE range: U+20D0 — U+20FF (for symbols)
97 | | \xe1\xb7[\x80-\xbf] #UNICODE range: U+1DC0 — U+1DFF (supplement)
98 | | \xef\xb8[\xa0-\xaf] #UNICODE range: U+FE20 — U+FE2F (combining half marks)
99 | ';
100 |
101 | /**
102 | * @var array
103 | */
104 | public static $html_special_chars_table = array(
105 | '"' => "\x22", #U+0022 ["] " quotation mark = APL quote
106 | '&' => "\x26", #U+0026 [&] & ampersand
107 | '<' => "\x3c", #U+003C [<] < less-than sign
108 | '>' => "\x3e", #U+003E [>] > greater-than sign
109 | );
110 |
111 | /**
112 | * @link http://www.fileformat.info/format/w3c/entitytest.htm?sort=Unicode%20Character HTML Entity Browser Test Page
113 | * @var array
114 | */
115 | public static $html_entity_table = array(
116 | #Latin-1 Entities:
117 | ' ' => "\xc2\xa0", #U+00A0 [ ] no-break space = non-breaking space
118 | '¡' => "\xc2\xa1", #U+00A1 [¡] inverted exclamation mark
119 | '¢' => "\xc2\xa2", #U+00A2 [¢] cent sign
120 | '£' => "\xc2\xa3", #U+00A3 [£] pound sign
121 | '¤' => "\xc2\xa4", #U+00A4 [¤] currency sign
122 | '¥' => "\xc2\xa5", #U+00A5 [¥] yen sign = yuan sign
123 | '¦' => "\xc2\xa6", #U+00A6 [¦] broken bar = broken vertical bar
124 | '§' => "\xc2\xa7", #U+00A7 [§] section sign
125 | '¨' => "\xc2\xa8", #U+00A8 [¨] diaeresis = spacing diaeresis
126 | '©' => "\xc2\xa9", #U+00A9 [©] copyright sign
127 | 'ª' => "\xc2\xaa", #U+00AA [ª] feminine ordinal indicator
128 | '«' => "\xc2\xab", #U+00AB [«] left-pointing double angle quotation mark = left pointing guillemet
129 | '¬' => "\xc2\xac", #U+00AC [¬] not sign
130 | '' => "\xc2\xad", #U+00AD [ ] soft hyphen = discretionary hyphen
131 | '®' => "\xc2\xae", #U+00AE [®] registered sign = registered trade mark sign
132 | '¯' => "\xc2\xaf", #U+00AF [¯] macron = spacing macron = overline = APL overbar
133 | '°' => "\xc2\xb0", #U+00B0 [°] degree sign
134 | '±' => "\xc2\xb1", #U+00B1 [±] plus-minus sign = plus-or-minus sign
135 | '²' => "\xc2\xb2", #U+00B2 [²] superscript two = superscript digit two = squared
136 | '³' => "\xc2\xb3", #U+00B3 [³] superscript three = superscript digit three = cubed
137 | '´' => "\xc2\xb4", #U+00B4 [´] acute accent = spacing acute
138 | 'µ' => "\xc2\xb5", #U+00B5 [µ] micro sign
139 | '¶' => "\xc2\xb6", #U+00B6 [¶] pilcrow sign = paragraph sign
140 | '·' => "\xc2\xb7", #U+00B7 [·] middle dot = Georgian comma = Greek middle dot
141 | '¸' => "\xc2\xb8", #U+00B8 [¸] cedilla = spacing cedilla
142 | '¹' => "\xc2\xb9", #U+00B9 [¹] superscript one = superscript digit one
143 | 'º' => "\xc2\xba", #U+00BA [º] masculine ordinal indicator
144 | '»' => "\xc2\xbb", #U+00BB [»] right-pointing double angle quotation mark = right pointing guillemet
145 | '¼' => "\xc2\xbc", #U+00BC [¼] vulgar fraction one quarter = fraction one quarter
146 | '½' => "\xc2\xbd", #U+00BD [½] vulgar fraction one half = fraction one half
147 | '¾' => "\xc2\xbe", #U+00BE [¾] vulgar fraction three quarters = fraction three quarters
148 | '¿' => "\xc2\xbf", #U+00BF [¿] inverted question mark = turned question mark
149 | #Latin capital letter
150 | 'À' => "\xc3\x80", #Latin capital letter A with grave = Latin capital letter A grave
151 | 'Á' => "\xc3\x81", #Latin capital letter A with acute
152 | 'Â' => "\xc3\x82", #Latin capital letter A with circumflex
153 | 'Ã' => "\xc3\x83", #Latin capital letter A with tilde
154 | 'Ä' => "\xc3\x84", #Latin capital letter A with diaeresis
155 | 'Å' => "\xc3\x85", #Latin capital letter A with ring above = Latin capital letter A ring
156 | 'Æ' => "\xc3\x86", #Latin capital letter AE = Latin capital ligature AE
157 | 'Ç' => "\xc3\x87", #Latin capital letter C with cedilla
158 | 'È' => "\xc3\x88", #Latin capital letter E with grave
159 | 'É' => "\xc3\x89", #Latin capital letter E with acute
160 | 'Ê' => "\xc3\x8a", #Latin capital letter E with circumflex
161 | 'Ë' => "\xc3\x8b", #Latin capital letter E with diaeresis
162 | 'Ì' => "\xc3\x8c", #Latin capital letter I with grave
163 | 'Í' => "\xc3\x8d", #Latin capital letter I with acute
164 | 'Î' => "\xc3\x8e", #Latin capital letter I with circumflex
165 | 'Ï' => "\xc3\x8f", #Latin capital letter I with diaeresis
166 | 'Ð' => "\xc3\x90", #Latin capital letter ETH
167 | 'Ñ' => "\xc3\x91", #Latin capital letter N with tilde
168 | 'Ò' => "\xc3\x92", #Latin capital letter O with grave
169 | 'Ó' => "\xc3\x93", #Latin capital letter O with acute
170 | 'Ô' => "\xc3\x94", #Latin capital letter O with circumflex
171 | 'Õ' => "\xc3\x95", #Latin capital letter O with tilde
172 | 'Ö' => "\xc3\x96", #Latin capital letter O with diaeresis
173 | '×' => "\xc3\x97", #U+00D7 [×] multiplication sign
174 | 'Ø' => "\xc3\x98", #Latin capital letter O with stroke = Latin capital letter O slash
175 | 'Ù' => "\xc3\x99", #Latin capital letter U with grave
176 | 'Ú' => "\xc3\x9a", #Latin capital letter U with acute
177 | 'Û' => "\xc3\x9b", #Latin capital letter U with circumflex
178 | 'Ü' => "\xc3\x9c", #Latin capital letter U with diaeresis
179 | 'Ý' => "\xc3\x9d", #Latin capital letter Y with acute
180 | 'Þ' => "\xc3\x9e", #Latin capital letter THORN
181 | #Latin small letter
182 | 'ß' => "\xc3\x9f", #Latin small letter sharp s = ess-zed
183 | 'à' => "\xc3\xa0", #Latin small letter a with grave = Latin small letter a grave
184 | 'á' => "\xc3\xa1", #Latin small letter a with acute
185 | 'â' => "\xc3\xa2", #Latin small letter a with circumflex
186 | 'ã' => "\xc3\xa3", #Latin small letter a with tilde
187 | 'ä' => "\xc3\xa4", #Latin small letter a with diaeresis
188 | 'å' => "\xc3\xa5", #Latin small letter a with ring above = Latin small letter a ring
189 | 'æ' => "\xc3\xa6", #Latin small letter ae = Latin small ligature ae
190 | 'ç' => "\xc3\xa7", #Latin small letter c with cedilla
191 | 'è' => "\xc3\xa8", #Latin small letter e with grave
192 | 'é' => "\xc3\xa9", #Latin small letter e with acute
193 | 'ê' => "\xc3\xaa", #Latin small letter e with circumflex
194 | 'ë' => "\xc3\xab", #Latin small letter e with diaeresis
195 | 'ì' => "\xc3\xac", #Latin small letter i with grave
196 | 'í' => "\xc3\xad", #Latin small letter i with acute
197 | 'î' => "\xc3\xae", #Latin small letter i with circumflex
198 | 'ï' => "\xc3\xaf", #Latin small letter i with diaeresis
199 | 'ð' => "\xc3\xb0", #Latin small letter eth
200 | 'ñ' => "\xc3\xb1", #Latin small letter n with tilde
201 | 'ò' => "\xc3\xb2", #Latin small letter o with grave
202 | 'ó' => "\xc3\xb3", #Latin small letter o with acute
203 | 'ô' => "\xc3\xb4", #Latin small letter o with circumflex
204 | 'õ' => "\xc3\xb5", #Latin small letter o with tilde
205 | 'ö' => "\xc3\xb6", #Latin small letter o with diaeresis
206 | '÷' => "\xc3\xb7", #U+00F7 [÷] division sign
207 | 'ø' => "\xc3\xb8", #Latin small letter o with stroke = Latin small letter o slash
208 | 'ù' => "\xc3\xb9", #Latin small letter u with grave
209 | 'ú' => "\xc3\xba", #Latin small letter u with acute
210 | 'û' => "\xc3\xbb", #Latin small letter u with circumflex
211 | 'ü' => "\xc3\xbc", #Latin small letter u with diaeresis
212 | 'ý' => "\xc3\xbd", #Latin small letter y with acute
213 | 'þ' => "\xc3\xbe", #Latin small letter thorn
214 | 'ÿ' => "\xc3\xbf", #Latin small letter y with diaeresis
215 | #Symbols and Greek Letters:
216 | 'ƒ' => "\xc6\x92", #U+0192 [ƒ] Latin small f with hook = function = florin
217 | 'Α' => "\xce\x91", #Greek capital letter alpha
218 | 'Β' => "\xce\x92", #Greek capital letter beta
219 | 'Γ' => "\xce\x93", #Greek capital letter gamma
220 | 'Δ' => "\xce\x94", #Greek capital letter delta
221 | 'Ε' => "\xce\x95", #Greek capital letter epsilon
222 | 'Ζ' => "\xce\x96", #Greek capital letter zeta
223 | 'Η' => "\xce\x97", #Greek capital letter eta
224 | 'Θ' => "\xce\x98", #Greek capital letter theta
225 | 'Ι' => "\xce\x99", #Greek capital letter iota
226 | 'Κ' => "\xce\x9a", #Greek capital letter kappa
227 | 'Λ' => "\xce\x9b", #Greek capital letter lambda
228 | 'Μ' => "\xce\x9c", #Greek capital letter mu
229 | 'Ν' => "\xce\x9d", #Greek capital letter nu
230 | 'Ξ' => "\xce\x9e", #Greek capital letter xi
231 | 'Ο' => "\xce\x9f", #Greek capital letter omicron
232 | 'Π' => "\xce\xa0", #Greek capital letter pi
233 | 'Ρ' => "\xce\xa1", #Greek capital letter rho
234 | 'Σ' => "\xce\xa3", #Greek capital letter sigma
235 | 'Τ' => "\xce\xa4", #Greek capital letter tau
236 | 'Υ' => "\xce\xa5", #Greek capital letter upsilon
237 | 'Φ' => "\xce\xa6", #Greek capital letter phi
238 | 'Χ' => "\xce\xa7", #Greek capital letter chi
239 | 'Ψ' => "\xce\xa8", #Greek capital letter psi
240 | 'Ω' => "\xce\xa9", #Greek capital letter omega
241 | 'α' => "\xce\xb1", #Greek small letter alpha
242 | 'β' => "\xce\xb2", #Greek small letter beta
243 | 'γ' => "\xce\xb3", #Greek small letter gamma
244 | 'δ' => "\xce\xb4", #Greek small letter delta
245 | 'ε' => "\xce\xb5", #Greek small letter epsilon
246 | 'ζ' => "\xce\xb6", #Greek small letter zeta
247 | 'η' => "\xce\xb7", #Greek small letter eta
248 | 'θ' => "\xce\xb8", #Greek small letter theta
249 | 'ι' => "\xce\xb9", #Greek small letter iota
250 | 'κ' => "\xce\xba", #Greek small letter kappa
251 | 'λ' => "\xce\xbb", #Greek small letter lambda
252 | 'μ' => "\xce\xbc", #Greek small letter mu
253 | 'ν' => "\xce\xbd", #Greek small letter nu
254 | 'ξ' => "\xce\xbe", #Greek small letter xi
255 | 'ο' => "\xce\xbf", #Greek small letter omicron
256 | 'π' => "\xcf\x80", #Greek small letter pi
257 | 'ρ' => "\xcf\x81", #Greek small letter rho
258 | 'ς' => "\xcf\x82", #Greek small letter final sigma
259 | 'σ' => "\xcf\x83", #Greek small letter sigma
260 | 'τ' => "\xcf\x84", #Greek small letter tau
261 | 'υ' => "\xcf\x85", #Greek small letter upsilon
262 | 'φ' => "\xcf\x86", #Greek small letter phi
263 | 'χ' => "\xcf\x87", #Greek small letter chi
264 | 'ψ' => "\xcf\x88", #Greek small letter psi
265 | 'ω' => "\xcf\x89", #Greek small letter omega
266 | 'ϑ'=> "\xcf\x91", #Greek small letter theta symbol
267 | 'ϒ' => "\xcf\x92", #Greek upsilon with hook symbol
268 | 'ϖ' => "\xcf\x96", #U+03D6 [ϖ] Greek pi symbol
269 |
270 | '•' => "\xe2\x80\xa2", #U+2022 [•] bullet = black small circle
271 | '…' => "\xe2\x80\xa6", #U+2026 […] horizontal ellipsis = three dot leader
272 | '′' => "\xe2\x80\xb2", #U+2032 [′] prime = minutes = feet (для обозначения минут и футов)
273 | '″' => "\xe2\x80\xb3", #U+2033 [″] double prime = seconds = inches (для обозначения секунд и дюймов).
274 | '‾' => "\xe2\x80\xbe", #U+203E [‾] overline = spacing overscore
275 | '⁄' => "\xe2\x81\x84", #U+2044 [⁄] fraction slash
276 | '℘' => "\xe2\x84\x98", #U+2118 [℘] script capital P = power set = Weierstrass p
277 | 'ℑ' => "\xe2\x84\x91", #U+2111 [ℑ] blackletter capital I = imaginary part
278 | 'ℜ' => "\xe2\x84\x9c", #U+211C [ℜ] blackletter capital R = real part symbol
279 | '™' => "\xe2\x84\xa2", #U+2122 [™] trade mark sign
280 | 'ℵ' => "\xe2\x84\xb5", #U+2135 [ℵ] alef symbol = first transfinite cardinal
281 | '←' => "\xe2\x86\x90", #U+2190 [←] leftwards arrow
282 | '↑' => "\xe2\x86\x91", #U+2191 [↑] upwards arrow
283 | '→' => "\xe2\x86\x92", #U+2192 [→] rightwards arrow
284 | '↓' => "\xe2\x86\x93", #U+2193 [↓] downwards arrow
285 | '↔' => "\xe2\x86\x94", #U+2194 [↔] left right arrow
286 | '↵' => "\xe2\x86\xb5", #U+21B5 [↵] downwards arrow with corner leftwards = carriage return
287 | '⇐' => "\xe2\x87\x90", #U+21D0 [⇐] leftwards double arrow
288 | '⇑' => "\xe2\x87\x91", #U+21D1 [⇑] upwards double arrow
289 | '⇒' => "\xe2\x87\x92", #U+21D2 [⇒] rightwards double arrow
290 | '⇓' => "\xe2\x87\x93", #U+21D3 [⇓] downwards double arrow
291 | '⇔' => "\xe2\x87\x94", #U+21D4 [⇔] left right double arrow
292 | '∀' => "\xe2\x88\x80", #U+2200 [∀] for all
293 | '∂' => "\xe2\x88\x82", #U+2202 [∂] partial differential
294 | '∃' => "\xe2\x88\x83", #U+2203 [∃] there exists
295 | '∅' => "\xe2\x88\x85", #U+2205 [∅] empty set = null set = diameter
296 | '∇' => "\xe2\x88\x87", #U+2207 [∇] nabla = backward difference
297 | '∈' => "\xe2\x88\x88", #U+2208 [∈] element of
298 | '∉' => "\xe2\x88\x89", #U+2209 [∉] not an element of
299 | '∋' => "\xe2\x88\x8b", #U+220B [∋] contains as member
300 | '∏' => "\xe2\x88\x8f", #U+220F [∏] n-ary product = product sign
301 | '∑' => "\xe2\x88\x91", #U+2211 [∑] n-ary sumation
302 | '−' => "\xe2\x88\x92", #U+2212 [−] minus sign
303 | '∗' => "\xe2\x88\x97", #U+2217 [∗] asterisk operator
304 | '√' => "\xe2\x88\x9a", #U+221A [√] square root = radical sign
305 | '∝' => "\xe2\x88\x9d", #U+221D [∝] proportional to
306 | '∞' => "\xe2\x88\x9e", #U+221E [∞] infinity
307 | '∠' => "\xe2\x88\xa0", #U+2220 [∠] angle
308 | '∧' => "\xe2\x88\xa7", #U+2227 [∧] logical and = wedge
309 | '∨' => "\xe2\x88\xa8", #U+2228 [∨] logical or = vee
310 | '∩' => "\xe2\x88\xa9", #U+2229 [∩] intersection = cap
311 | '∪' => "\xe2\x88\xaa", #U+222A [∪] union = cup
312 | '∫' => "\xe2\x88\xab", #U+222B [∫] integral
313 | '∴' => "\xe2\x88\xb4", #U+2234 [∴] therefore
314 | '∼' => "\xe2\x88\xbc", #U+223C [∼] tilde operator = varies with = similar to
315 | '≅' => "\xe2\x89\x85", #U+2245 [≅] approximately equal to
316 | '≈' => "\xe2\x89\x88", #U+2248 [≈] almost equal to = asymptotic to
317 | '≠' => "\xe2\x89\xa0", #U+2260 [≠] not equal to
318 | '≡' => "\xe2\x89\xa1", #U+2261 [≡] identical to
319 | '≤' => "\xe2\x89\xa4", #U+2264 [≤] less-than or equal to
320 | '≥' => "\xe2\x89\xa5", #U+2265 [≥] greater-than or equal to
321 | '⊂' => "\xe2\x8a\x82", #U+2282 [⊂] subset of
322 | '⊃' => "\xe2\x8a\x83", #U+2283 [⊃] superset of
323 | '⊄' => "\xe2\x8a\x84", #U+2284 [⊄] not a subset of
324 | '⊆' => "\xe2\x8a\x86", #U+2286 [⊆] subset of or equal to
325 | '⊇' => "\xe2\x8a\x87", #U+2287 [⊇] superset of or equal to
326 | '⊕' => "\xe2\x8a\x95", #U+2295 [⊕] circled plus = direct sum
327 | '⊗' => "\xe2\x8a\x97", #U+2297 [⊗] circled times = vector product
328 | '⊥' => "\xe2\x8a\xa5", #U+22A5 [⊥] up tack = orthogonal to = perpendicular
329 | '⋅' => "\xe2\x8b\x85", #U+22C5 [⋅] dot operator
330 | '⌈' => "\xe2\x8c\x88", #U+2308 [⌈] left ceiling = APL upstile
331 | '⌉' => "\xe2\x8c\x89", #U+2309 [⌉] right ceiling
332 | '⌊' => "\xe2\x8c\x8a", #U+230A [⌊] left floor = APL downstile
333 | '⌋' => "\xe2\x8c\x8b", #U+230B [⌋] right floor
334 | '〈' => "\xe2\x8c\xa9", #U+2329 [〈] left-pointing angle bracket = bra
335 | '〉' => "\xe2\x8c\xaa", #U+232A [〉] right-pointing angle bracket = ket
336 | '◊' => "\xe2\x97\x8a", #U+25CA [◊] lozenge
337 | '♠' => "\xe2\x99\xa0", #U+2660 [♠] black spade suit
338 | '♣' => "\xe2\x99\xa3", #U+2663 [♣] black club suit = shamrock
339 | '♥' => "\xe2\x99\xa5", #U+2665 [♥] black heart suit = valentine
340 | '♦' => "\xe2\x99\xa6", #U+2666 [♦] black diamond suit
341 | #Other Special Characters:
342 | 'Œ' => "\xc5\x92", #U+0152 [Œ] Latin capital ligature OE
343 | 'œ' => "\xc5\x93", #U+0153 [œ] Latin small ligature oe
344 | 'Š' => "\xc5\xa0", #U+0160 [Š] Latin capital letter S with caron
345 | 'š' => "\xc5\xa1", #U+0161 [š] Latin small letter s with caron
346 | 'Ÿ' => "\xc5\xb8", #U+0178 [Ÿ] Latin capital letter Y with diaeresis
347 | 'ˆ' => "\xcb\x86", #U+02C6 [ˆ] modifier letter circumflex accent
348 | '˜' => "\xcb\x9c", #U+02DC [˜] small tilde
349 | ' ' => "\xe2\x80\x82", #U+2002 [ ] en space
350 | ' ' => "\xe2\x80\x83", #U+2003 [ ] em space
351 | ' ' => "\xe2\x80\x89", #U+2009 [ ] thin space
352 | '' => "\xe2\x80\x8c", #U+200C [] zero width non-joiner
353 | '' => "\xe2\x80\x8d", #U+200D [] zero width joiner
354 | '' => "\xe2\x80\x8e", #U+200E [] left-to-right mark
355 | '' => "\xe2\x80\x8f", #U+200F [] right-to-left mark
356 | '–' => "\xe2\x80\x93", #U+2013 [–] en dash
357 | '—' => "\xe2\x80\x94", #U+2014 [—] em dash
358 | '‘' => "\xe2\x80\x98", #U+2018 [‘] left single quotation mark
359 | '’' => "\xe2\x80\x99", #U+2019 [’] right single quotation mark (and apostrophe!)
360 | '‚' => "\xe2\x80\x9a", #U+201A [‚] single low-9 quotation mark
361 | '“' => "\xe2\x80\x9c", #U+201C [“] left double quotation mark
362 | '”' => "\xe2\x80\x9d", #U+201D [”] right double quotation mark
363 | '„' => "\xe2\x80\x9e", #U+201E [„] double low-9 quotation mark
364 | '†' => "\xe2\x80\xa0", #U+2020 [†] dagger
365 | '‡' => "\xe2\x80\xa1", #U+2021 [‡] double dagger
366 | '‰' => "\xe2\x80\xb0", #U+2030 [‰] per mille sign
367 | '‹' => "\xe2\x80\xb9", #U+2039 [‹] single left-pointing angle quotation mark
368 | '›' => "\xe2\x80\xba", #U+203A [›] single right-pointing angle quotation mark
369 | '€' => "\xe2\x82\xac", #U+20AC [€] euro sign
370 | );
371 |
372 | /**
373 | * Maps each cp1259 byte (0x80-0xFF) to the UTF-8 byte sequence of its Unicode character.
374 | * cp1259 is a legacy single-byte encoding for the Tatar Cyrillic alphabet, derived from the
375 | * cp1251 table; it keeps all the Russian letters of cp1251. Bytes 0x00-0x7F (ASCII) need no
376 | * conversion and are not listed; byte 0x98 is undefined and has no entry.
377 | *
378 | * @link http://search.cpan.org/CPAN/authors/id/A/AM/AMICHAUER/Lingua-TT-Yanalif-0.08.tar.gz
379 | * @link http://www.unicode.org/charts/PDF/U0400.pdf
380 | */
381 | public static $cp1259_table = array(
382 | #bytes from 0x00 to 0x7F (ASCII) are kept as is, so they have no entries in this table
383 | "\x80" => "\xd3\x98", #U+04d8 CYRILLIC CAPITAL LETTER SCHWA
384 | "\x81" => "\xd0\x83", #U+0403 CYRILLIC CAPITAL LETTER GJE
385 | "\x82" => "\xe2\x80\x9a", #U+201a SINGLE LOW-9 QUOTATION MARK
386 | "\x83" => "\xd1\x93", #U+0453 CYRILLIC SMALL LETTER GJE
387 | "\x84" => "\xe2\x80\x9e", #U+201e DOUBLE LOW-9 QUOTATION MARK
388 | "\x85" => "\xe2\x80\xa6", #U+2026 HORIZONTAL ELLIPSIS
389 | "\x86" => "\xe2\x80\xa0", #U+2020 DAGGER
390 | "\x87" => "\xe2\x80\xa1", #U+2021 DOUBLE DAGGER
391 | "\x88" => "\xe2\x82\xac", #U+20ac EURO SIGN
392 | "\x89" => "\xe2\x80\xb0", #U+2030 PER MILLE SIGN
393 | "\x8a" => "\xd3\xa8", #U+04e8 CYRILLIC CAPITAL LETTER BARRED O
394 | "\x8b" => "\xe2\x80\xb9", #U+2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
395 | "\x8c" => "\xd2\xae", #U+04ae CYRILLIC CAPITAL LETTER STRAIGHT U
396 | "\x8d" => "\xd2\x96", #U+0496 CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER
397 | "\x8e" => "\xd2\xa2", #U+04a2 CYRILLIC CAPITAL LETTER EN WITH DESCENDER
398 | "\x8f" => "\xd2\xba", #U+04ba CYRILLIC CAPITAL LETTER SHHA
399 | "\x90" => "\xd3\x99", #U+04d9 CYRILLIC SMALL LETTER SCHWA
400 | "\x91" => "\xe2\x80\x98", #U+2018 LEFT SINGLE QUOTATION MARK
401 | "\x92" => "\xe2\x80\x99", #U+2019 RIGHT SINGLE QUOTATION MARK
402 | "\x93" => "\xe2\x80\x9c", #U+201c LEFT DOUBLE QUOTATION MARK
403 | "\x94" => "\xe2\x80\x9d", #U+201d RIGHT DOUBLE QUOTATION MARK
404 | "\x95" => "\xe2\x80\xa2", #U+2022 BULLET
405 | "\x96" => "\xe2\x80\x93", #U+2013 EN DASH
406 | "\x97" => "\xe2\x80\x94", #U+2014 EM DASH
407 | #"\x98" #UNDEFINED (no character assigned to this byte, so it is intentionally left out)
408 | "\x99" => "\xe2\x84\xa2", #U+2122 TRADE MARK SIGN
409 | "\x9a" => "\xd3\xa9", #U+04e9 CYRILLIC SMALL LETTER BARRED O
410 | "\x9b" => "\xe2\x80\xba", #U+203a SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
411 | "\x9c" => "\xd2\xaf", #U+04af CYRILLIC SMALL LETTER STRAIGHT U
412 | "\x9d" => "\xd2\x97", #U+0497 CYRILLIC SMALL LETTER ZHE WITH DESCENDER
413 | "\x9e" => "\xd2\xa3", #U+04a3 CYRILLIC SMALL LETTER EN WITH DESCENDER
414 | "\x9f" => "\xd2\xbb", #U+04bb CYRILLIC SMALL LETTER SHHA
415 | "\xa0" => "\xc2\xa0", #U+00a0 NO-BREAK SPACE
416 | "\xa1" => "\xd0\x8e", #U+040e CYRILLIC CAPITAL LETTER SHORT U
417 | "\xa2" => "\xd1\x9e", #U+045e CYRILLIC SMALL LETTER SHORT U
418 | "\xa3" => "\xd0\x88", #U+0408 CYRILLIC CAPITAL LETTER JE
419 | "\xa4" => "\xc2\xa4", #U+00a4 CURRENCY SIGN
420 | "\xa5" => "\xd2\x90", #U+0490 CYRILLIC CAPITAL LETTER GHE WITH UPTURN
421 | "\xa6" => "\xc2\xa6", #U+00a6 BROKEN BAR
422 | "\xa7" => "\xc2\xa7", #U+00a7 SECTION SIGN
423 | "\xa8" => "\xd0\x81", #U+0401 CYRILLIC CAPITAL LETTER IO
424 | "\xa9" => "\xc2\xa9", #U+00a9 COPYRIGHT SIGN
425 | "\xaa" => "\xd0\x84", #U+0404 CYRILLIC CAPITAL LETTER UKRAINIAN IE
426 | "\xab" => "\xc2\xab", #U+00ab LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
427 | "\xac" => "\xc2\xac", #U+00ac NOT SIGN
428 | "\xad" => "\xc2\xad", #U+00ad SOFT HYPHEN
429 | "\xae" => "\xc2\xae", #U+00ae REGISTERED SIGN
430 | "\xaf" => "\xd0\x87", #U+0407 CYRILLIC CAPITAL LETTER YI
431 | "\xb0" => "\xc2\xb0", #U+00b0 DEGREE SIGN
432 | "\xb1" => "\xc2\xb1", #U+00b1 PLUS-MINUS SIGN
433 | "\xb2" => "\xd0\x86", #U+0406 CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I
434 | "\xb3" => "\xd1\x96", #U+0456 CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
435 | "\xb4" => "\xd2\x91", #U+0491 CYRILLIC SMALL LETTER GHE WITH UPTURN
436 | "\xb5" => "\xc2\xb5", #U+00b5 MICRO SIGN
437 | "\xb6" => "\xc2\xb6", #U+00b6 PILCROW SIGN
438 | "\xb7" => "\xc2\xb7", #U+00b7 MIDDLE DOT
439 | "\xb8" => "\xd1\x91", #U+0451 CYRILLIC SMALL LETTER IO
440 | "\xb9" => "\xe2\x84\x96", #U+2116 NUMERO SIGN
441 | "\xba" => "\xd1\x94", #U+0454 CYRILLIC SMALL LETTER UKRAINIAN IE
442 | "\xbb" => "\xc2\xbb", #U+00bb RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
443 | "\xbc" => "\xd1\x98", #U+0458 CYRILLIC SMALL LETTER JE
444 | "\xbd" => "\xd0\x85", #U+0405 CYRILLIC CAPITAL LETTER DZE
445 | "\xbe" => "\xd1\x95", #U+0455 CYRILLIC SMALL LETTER DZE
446 | "\xbf" => "\xd1\x97", #U+0457 CYRILLIC SMALL LETTER YI
447 | "\xc0" => "\xd0\x90", #U+0410 CYRILLIC CAPITAL LETTER A
448 | "\xc1" => "\xd0\x91", #U+0411 CYRILLIC CAPITAL LETTER BE
449 | "\xc2" => "\xd0\x92", #U+0412 CYRILLIC CAPITAL LETTER VE
450 | "\xc3" => "\xd0\x93", #U+0413 CYRILLIC CAPITAL LETTER GHE
451 | "\xc4" => "\xd0\x94", #U+0414 CYRILLIC CAPITAL LETTER DE
452 | "\xc5" => "\xd0\x95", #U+0415 CYRILLIC CAPITAL LETTER IE
453 | "\xc6" => "\xd0\x96", #U+0416 CYRILLIC CAPITAL LETTER ZHE
454 | "\xc7" => "\xd0\x97", #U+0417 CYRILLIC CAPITAL LETTER ZE
455 | "\xc8" => "\xd0\x98", #U+0418 CYRILLIC CAPITAL LETTER I
456 | "\xc9" => "\xd0\x99", #U+0419 CYRILLIC CAPITAL LETTER SHORT I
457 | "\xca" => "\xd0\x9a", #U+041a CYRILLIC CAPITAL LETTER KA
458 | "\xcb" => "\xd0\x9b", #U+041b CYRILLIC CAPITAL LETTER EL
459 | "\xcc" => "\xd0\x9c", #U+041c CYRILLIC CAPITAL LETTER EM
460 | "\xcd" => "\xd0\x9d", #U+041d CYRILLIC CAPITAL LETTER EN
461 | "\xce" => "\xd0\x9e", #U+041e CYRILLIC CAPITAL LETTER O
462 | "\xcf" => "\xd0\x9f", #U+041f CYRILLIC CAPITAL LETTER PE
463 | "\xd0" => "\xd0\xa0", #U+0420 CYRILLIC CAPITAL LETTER ER
464 | "\xd1" => "\xd0\xa1", #U+0421 CYRILLIC CAPITAL LETTER ES
465 | "\xd2" => "\xd0\xa2", #U+0422 CYRILLIC CAPITAL LETTER TE
466 | "\xd3" => "\xd0\xa3", #U+0423 CYRILLIC CAPITAL LETTER U
467 | "\xd4" => "\xd0\xa4", #U+0424 CYRILLIC CAPITAL LETTER EF
468 | "\xd5" => "\xd0\xa5", #U+0425 CYRILLIC CAPITAL LETTER HA
469 | "\xd6" => "\xd0\xa6", #U+0426 CYRILLIC CAPITAL LETTER TSE
470 | "\xd7" => "\xd0\xa7", #U+0427 CYRILLIC CAPITAL LETTER CHE
471 | "\xd8" => "\xd0\xa8", #U+0428 CYRILLIC CAPITAL LETTER SHA
472 | "\xd9" => "\xd0\xa9", #U+0429 CYRILLIC CAPITAL LETTER SHCHA
473 | "\xda" => "\xd0\xaa", #U+042a CYRILLIC CAPITAL LETTER HARD SIGN
474 | "\xdb" => "\xd0\xab", #U+042b CYRILLIC CAPITAL LETTER YERU
475 | "\xdc" => "\xd0\xac", #U+042c CYRILLIC CAPITAL LETTER SOFT SIGN
476 | "\xdd" => "\xd0\xad", #U+042d CYRILLIC CAPITAL LETTER E
477 | "\xde" => "\xd0\xae", #U+042e CYRILLIC CAPITAL LETTER YU
478 | "\xdf" => "\xd0\xaf", #U+042f CYRILLIC CAPITAL LETTER YA
479 | "\xe0" => "\xd0\xb0", #U+0430 CYRILLIC SMALL LETTER A
480 | "\xe1" => "\xd0\xb1", #U+0431 CYRILLIC SMALL LETTER BE
481 | "\xe2" => "\xd0\xb2", #U+0432 CYRILLIC SMALL LETTER VE
482 | "\xe3" => "\xd0\xb3", #U+0433 CYRILLIC SMALL LETTER GHE
483 | "\xe4" => "\xd0\xb4", #U+0434 CYRILLIC SMALL LETTER DE
484 | "\xe5" => "\xd0\xb5", #U+0435 CYRILLIC SMALL LETTER IE
485 | "\xe6" => "\xd0\xb6", #U+0436 CYRILLIC SMALL LETTER ZHE
486 | "\xe7" => "\xd0\xb7", #U+0437 CYRILLIC SMALL LETTER ZE
487 | "\xe8" => "\xd0\xb8", #U+0438 CYRILLIC SMALL LETTER I
488 | "\xe9" => "\xd0\xb9", #U+0439 CYRILLIC SMALL LETTER SHORT I
489 | "\xea" => "\xd0\xba", #U+043a CYRILLIC SMALL LETTER KA
490 | "\xeb" => "\xd0\xbb", #U+043b CYRILLIC SMALL LETTER EL
491 | "\xec" => "\xd0\xbc", #U+043c CYRILLIC SMALL LETTER EM
492 | "\xed" => "\xd0\xbd", #U+043d CYRILLIC SMALL LETTER EN
493 | "\xee" => "\xd0\xbe", #U+043e CYRILLIC SMALL LETTER O
494 | "\xef" => "\xd0\xbf", #U+043f CYRILLIC SMALL LETTER PE
495 | "\xf0" => "\xd1\x80", #U+0440 CYRILLIC SMALL LETTER ER
496 | "\xf1" => "\xd1\x81", #U+0441 CYRILLIC SMALL LETTER ES
497 | "\xf2" => "\xd1\x82", #U+0442 CYRILLIC SMALL LETTER TE
498 | "\xf3" => "\xd1\x83", #U+0443 CYRILLIC SMALL LETTER U
499 | "\xf4" => "\xd1\x84", #U+0444 CYRILLIC SMALL LETTER EF
500 | "\xf5" => "\xd1\x85", #U+0445 CYRILLIC SMALL LETTER HA
501 | "\xf6" => "\xd1\x86", #U+0446 CYRILLIC SMALL LETTER TSE
502 | "\xf7" => "\xd1\x87", #U+0447 CYRILLIC SMALL LETTER CHE
503 | "\xf8" => "\xd1\x88", #U+0448 CYRILLIC SMALL LETTER SHA
504 | "\xf9" => "\xd1\x89", #U+0449 CYRILLIC SMALL LETTER SHCHA
505 | "\xfa" => "\xd1\x8a", #U+044a CYRILLIC SMALL LETTER HARD SIGN
506 | "\xfb" => "\xd1\x8b", #U+044b CYRILLIC SMALL LETTER YERU
507 | "\xfc" => "\xd1\x8c", #U+044c CYRILLIC SMALL LETTER SOFT SIGN
508 | "\xfd" => "\xd1\x8d", #U+044d CYRILLIC SMALL LETTER E
509 | "\xfe" => "\xd1\x8e", #U+044e CYRILLIC SMALL LETTER YU
510 | "\xff" => "\xd1\x8f", #U+044f CYRILLIC SMALL LETTER YA
511 | );
512 |
513 | /**
514 | * UTF-8 Case lookup table
515 | *
516 | * This lookup table maps upper case letters to their corresponding
517 | * lower case letters in UTF-8.
518 | *
519 | * @author Andreas Gohr
520 | */
521 | public static $convert_case_table = array(
522 | #CASE_UPPER => case_lower
523 | "\x41" => "\x61", #A a
524 | "\x42" => "\x62", #B b
525 | "\x43" => "\x63", #C c
526 | "\x44" => "\x64", #D d
527 | "\x45" => "\x65", #E e
528 | "\x46" => "\x66", #F f
529 | "\x47" => "\x67", #G g
530 | "\x48" => "\x68", #H h
531 | "\x49" => "\x69", #I i
532 | "\x4a" => "\x6a", #J j
533 | "\x4b" => "\x6b", #K k
534 | "\x4c" => "\x6c", #L l
535 | "\x4d" => "\x6d", #M m
536 | "\x4e" => "\x6e", #N n
537 | "\x4f" => "\x6f", #O o
538 | "\x50" => "\x70", #P p
539 | "\x51" => "\x71", #Q q
540 | "\x52" => "\x72", #R r
541 | "\x53" => "\x73", #S s
542 | "\x54" => "\x74", #T t
543 | "\x55" => "\x75", #U u
544 | "\x56" => "\x76", #V v
545 | "\x57" => "\x77", #W w
546 | "\x58" => "\x78", #X x
547 | "\x59" => "\x79", #Y y
548 | "\x5a" => "\x7a", #Z z
549 | "\xc3\x80" => "\xc3\xa0",
550 | "\xc3\x81" => "\xc3\xa1",
551 | "\xc3\x82" => "\xc3\xa2",
552 | "\xc3\x83" => "\xc3\xa3",
553 | "\xc3\x84" => "\xc3\xa4",
554 | "\xc3\x85" => "\xc3\xa5",
555 | "\xc3\x86" => "\xc3\xa6",
556 | "\xc3\x87" => "\xc3\xa7",
557 | "\xc3\x88" => "\xc3\xa8",
558 | "\xc3\x89" => "\xc3\xa9",
559 | "\xc3\x8a" => "\xc3\xaa",
560 | "\xc3\x8b" => "\xc3\xab",
561 | "\xc3\x8c" => "\xc3\xac",
562 | "\xc3\x8d" => "\xc3\xad",
563 | "\xc3\x8e" => "\xc3\xae",
564 | "\xc3\x8f" => "\xc3\xaf",
565 | "\xc3\x90" => "\xc3\xb0",
566 | "\xc3\x91" => "\xc3\xb1",
567 | "\xc3\x92" => "\xc3\xb2",
568 | "\xc3\x93" => "\xc3\xb3",
569 | "\xc3\x94" => "\xc3\xb4",
570 | "\xc3\x95" => "\xc3\xb5",
571 | "\xc3\x96" => "\xc3\xb6",
572 | "\xc3\x98" => "\xc3\xb8",
573 | "\xc3\x99" => "\xc3\xb9",
574 | "\xc3\x9a" => "\xc3\xba",
575 | "\xc3\x9b" => "\xc3\xbb",
576 | "\xc3\x9c" => "\xc3\xbc",
577 | "\xc3\x9d" => "\xc3\xbd",
578 | "\xc3\x9e" => "\xc3\xbe",
579 | "\xc4\x80" => "\xc4\x81",
580 | "\xc4\x82" => "\xc4\x83",
581 | "\xc4\x84" => "\xc4\x85",
582 | "\xc4\x86" => "\xc4\x87",
583 | "\xc4\x88" => "\xc4\x89",
584 | "\xc4\x8a" => "\xc4\x8b",
585 | "\xc4\x8c" => "\xc4\x8d",
586 | "\xc4\x8e" => "\xc4\x8f",
587 | "\xc4\x90" => "\xc4\x91",
588 | "\xc4\x92" => "\xc4\x93",
589 | "\xc4\x94" => "\xc4\x95",
590 | "\xc4\x96" => "\xc4\x97",
591 | "\xc4\x98" => "\xc4\x99",
592 | "\xc4\x9a" => "\xc4\x9b",
593 | "\xc4\x9c" => "\xc4\x9d",
594 | "\xc4\x9e" => "\xc4\x9f",
595 | "\xc4\xa0" => "\xc4\xa1",
596 | "\xc4\xa2" => "\xc4\xa3",
597 | "\xc4\xa4" => "\xc4\xa5",
598 | "\xc4\xa6" => "\xc4\xa7",
599 | "\xc4\xa8" => "\xc4\xa9",
600 | "\xc4\xaa" => "\xc4\xab",
601 | "\xc4\xac" => "\xc4\xad",
602 | "\xc4\xae" => "\xc4\xaf",
603 | "\xc4\xb2" => "\xc4\xb3",
604 | "\xc4\xb4" => "\xc4\xb5",
605 | "\xc4\xb6" => "\xc4\xb7",
606 | "\xc4\xb9" => "\xc4\xba",
607 | "\xc4\xbb" => "\xc4\xbc",
608 | "\xc4\xbd" => "\xc4\xbe",
609 | "\xc4\xbf" => "\xc5\x80",
610 | "\xc5\x81" => "\xc5\x82",
611 | "\xc5\x83" => "\xc5\x84",
612 | "\xc5\x85" => "\xc5\x86",
613 | "\xc5\x87" => "\xc5\x88",
614 | "\xc5\x8a" => "\xc5\x8b",
615 | "\xc5\x8c" => "\xc5\x8d",
616 | "\xc5\x8e" => "\xc5\x8f",
617 | "\xc5\x90" => "\xc5\x91",
618 | "\xc5\x92" => "\xc5\x93",
619 | "\xc5\x94" => "\xc5\x95",
620 | "\xc5\x96" => "\xc5\x97",
621 | "\xc5\x98" => "\xc5\x99",
622 | "\xc5\x9a" => "\xc5\x9b",
623 | "\xc5\x9c" => "\xc5\x9d",
624 | "\xc5\x9e" => "\xc5\x9f",
625 | "\xc5\xa0" => "\xc5\xa1",
626 | "\xc5\xa2" => "\xc5\xa3",
627 | "\xc5\xa4" => "\xc5\xa5",
628 | "\xc5\xa6" => "\xc5\xa7",
629 | "\xc5\xa8" => "\xc5\xa9",
630 | "\xc5\xaa" => "\xc5\xab",
631 | "\xc5\xac" => "\xc5\xad",
632 | "\xc5\xae" => "\xc5\xaf",
633 | "\xc5\xb0" => "\xc5\xb1",
634 | "\xc5\xb2" => "\xc5\xb3",
635 | "\xc5\xb4" => "\xc5\xb5",
636 | "\xc5\xb6" => "\xc5\xb7",
637 | "\xc5\xb8" => "\xc3\xbf",
638 | "\xc5\xb9" => "\xc5\xba",
639 | "\xc5\xbb" => "\xc5\xbc",
640 | "\xc5\xbd" => "\xc5\xbe",
641 | "\xc6\x81" => "\xc9\x93",
642 | "\xc6\x82" => "\xc6\x83",
643 | "\xc6\x84" => "\xc6\x85",
644 | "\xc6\x86" => "\xc9\x94",
645 | "\xc6\x87" => "\xc6\x88",
646 | "\xc6\x89" => "\xc9\x96",
647 | "\xc6\x8a" => "\xc9\x97",
648 | "\xc6\x8b" => "\xc6\x8c",
649 | "\xc6\x8e" => "\xc7\x9d",
650 | "\xc6\x8f" => "\xc9\x99",
651 | "\xc6\x90" => "\xc9\x9b",
652 | "\xc6\x91" => "\xc6\x92",
653 | "\xc6\x94" => "\xc9\xa3",
654 | "\xc6\x96" => "\xc9\xa9",
655 | "\xc6\x97" => "\xc9\xa8",
656 | "\xc6\x98" => "\xc6\x99",
657 | "\xc6\x9c" => "\xc9\xaf",
658 | "\xc6\x9d" => "\xc9\xb2",
659 | "\xc6\x9f" => "\xc9\xb5",
660 | "\xc6\xa0" => "\xc6\xa1",
661 | "\xc6\xa2" => "\xc6\xa3",
662 | "\xc6\xa4" => "\xc6\xa5",
663 | "\xc6\xa6" => "\xca\x80",
664 | "\xc6\xa7" => "\xc6\xa8",
665 | "\xc6\xa9" => "\xca\x83",
666 | "\xc6\xac" => "\xc6\xad",
667 | "\xc6\xae" => "\xca\x88",
668 | "\xc6\xaf" => "\xc6\xb0",
669 | "\xc6\xb1" => "\xca\x8a",
670 | "\xc6\xb2" => "\xca\x8b",
671 | "\xc6\xb3" => "\xc6\xb4",
672 | "\xc6\xb5" => "\xc6\xb6",
673 | "\xc6\xb7" => "\xca\x92",
674 | "\xc6\xb8" => "\xc6\xb9",
675 | "\xc6\xbc" => "\xc6\xbd",
676 | "\xc7\x85" => "\xc7\x86",
677 | "\xc7\x88" => "\xc7\x89",
678 | "\xc7\x8b" => "\xc7\x8c",
679 | "\xc7\x8d" => "\xc7\x8e",
680 | "\xc7\x8f" => "\xc7\x90",
681 | "\xc7\x91" => "\xc7\x92",
682 | "\xc7\x93" => "\xc7\x94",
683 | "\xc7\x95" => "\xc7\x96",
684 | "\xc7\x97" => "\xc7\x98",
685 | "\xc7\x99" => "\xc7\x9a",
686 | "\xc7\x9b" => "\xc7\x9c",
687 | "\xc7\x9e" => "\xc7\x9f",
688 | "\xc7\xa0" => "\xc7\xa1",
689 | "\xc7\xa2" => "\xc7\xa3",
690 | "\xc7\xa4" => "\xc7\xa5",
691 | "\xc7\xa6" => "\xc7\xa7",
692 | "\xc7\xa8" => "\xc7\xa9",
693 | "\xc7\xaa" => "\xc7\xab",
694 | "\xc7\xac" => "\xc7\xad",
695 | "\xc7\xae" => "\xc7\xaf",
696 | "\xc7\xb2" => "\xc7\xb3",
697 | "\xc7\xb4" => "\xc7\xb5",
698 | "\xc7\xb6" => "\xc6\x95",
699 | "\xc7\xb7" => "\xc6\xbf",
700 | "\xc7\xb8" => "\xc7\xb9",
701 | "\xc7\xba" => "\xc7\xbb",
702 | "\xc7\xbc" => "\xc7\xbd",
703 | "\xc7\xbe" => "\xc7\xbf",
704 | "\xc8\x80" => "\xc8\x81",
705 | "\xc8\x82" => "\xc8\x83",
706 | "\xc8\x84" => "\xc8\x85",
707 | "\xc8\x86" => "\xc8\x87",
708 | "\xc8\x88" => "\xc8\x89",
709 | "\xc8\x8a" => "\xc8\x8b",
710 | "\xc8\x8c" => "\xc8\x8d",
711 | "\xc8\x8e" => "\xc8\x8f",
712 | "\xc8\x90" => "\xc8\x91",
713 | "\xc8\x92" => "\xc8\x93",
714 | "\xc8\x94" => "\xc8\x95",
715 | "\xc8\x96" => "\xc8\x97",
716 | "\xc8\x98" => "\xc8\x99",
717 | "\xc8\x9a" => "\xc8\x9b",
718 | "\xc8\x9c" => "\xc8\x9d",
719 | "\xc8\x9e" => "\xc8\x9f",
720 | "\xc8\xa0" => "\xc6\x9e",
721 | "\xc8\xa2" => "\xc8\xa3",
722 | "\xc8\xa4" => "\xc8\xa5",
723 | "\xc8\xa6" => "\xc8\xa7",
724 | "\xc8\xa8" => "\xc8\xa9",
725 | "\xc8\xaa" => "\xc8\xab",
726 | "\xc8\xac" => "\xc8\xad",
727 | "\xc8\xae" => "\xc8\xaf",
728 | "\xc8\xb0" => "\xc8\xb1",
729 | "\xc8\xb2" => "\xc8\xb3",
730 | "\xce\x86" => "\xce\xac",
731 | "\xce\x88" => "\xce\xad",
732 | "\xce\x89" => "\xce\xae",
733 | "\xce\x8a" => "\xce\xaf",
734 | "\xce\x8c" => "\xcf\x8c",
735 | "\xce\x8e" => "\xcf\x8d",
736 | "\xce\x8f" => "\xcf\x8e",
737 | "\xce\x91" => "\xce\xb1",
738 | "\xce\x92" => "\xce\xb2",
739 | "\xce\x93" => "\xce\xb3",
740 | "\xce\x94" => "\xce\xb4",
741 | "\xce\x95" => "\xce\xb5",
742 | "\xce\x96" => "\xce\xb6",
743 | "\xce\x97" => "\xce\xb7",
744 | "\xce\x98" => "\xce\xb8",
745 | "\xce\x99" => "\xce\xb9",
746 | "\xce\x9a" => "\xce\xba",
747 | "\xce\x9b" => "\xce\xbb",
748 | "\xce\x9c" => "\xc2\xb5",
749 | "\xce\x9d" => "\xce\xbd",
750 | "\xce\x9e" => "\xce\xbe",
751 | "\xce\x9f" => "\xce\xbf",
752 | "\xce\xa0" => "\xcf\x80",
753 | "\xce\xa1" => "\xcf\x81",
754 | "\xce\xa3" => "\xcf\x82",
755 | "\xce\xa4" => "\xcf\x84",
756 | "\xce\xa5" => "\xcf\x85",
757 | "\xce\xa6" => "\xcf\x86",
758 | "\xce\xa7" => "\xcf\x87",
759 | "\xce\xa8" => "\xcf\x88",
760 | "\xce\xa9" => "\xcf\x89",
761 | "\xce\xaa" => "\xcf\x8a",
762 | "\xce\xab" => "\xcf\x8b",
763 | "\xcf\x98" => "\xcf\x99",
764 | "\xcf\x9a" => "\xcf\x9b",
765 | "\xcf\x9c" => "\xcf\x9d",
766 | "\xcf\x9e" => "\xcf\x9f",
767 | "\xcf\xa0" => "\xcf\xa1",
768 | "\xcf\xa2" => "\xcf\xa3",
769 | "\xcf\xa4" => "\xcf\xa5",
770 | "\xcf\xa6" => "\xcf\xa7",
771 | "\xcf\xa8" => "\xcf\xa9",
772 | "\xcf\xaa" => "\xcf\xab",
773 | "\xcf\xac" => "\xcf\xad",
774 | "\xcf\xae" => "\xcf\xaf",
775 | "\xd0\x80" => "\xd1\x90",
776 | "\xd0\x81" => "\xd1\x91",
777 | "\xd0\x82" => "\xd1\x92",
778 | "\xd0\x83" => "\xd1\x93",
779 | "\xd0\x84" => "\xd1\x94",
780 | "\xd0\x85" => "\xd1\x95",
781 | "\xd0\x86" => "\xd1\x96",
782 | "\xd0\x87" => "\xd1\x97",
783 | "\xd0\x88" => "\xd1\x98",
784 | "\xd0\x89" => "\xd1\x99",
785 | "\xd0\x8a" => "\xd1\x9a",
786 | "\xd0\x8b" => "\xd1\x9b",
787 | "\xd0\x8c" => "\xd1\x9c",
788 | "\xd0\x8d" => "\xd1\x9d",
789 | "\xd0\x8e" => "\xd1\x9e",
790 | "\xd0\x8f" => "\xd1\x9f",
791 | "\xd0\x90" => "\xd0\xb0",
792 | "\xd0\x91" => "\xd0\xb1",
793 | "\xd0\x92" => "\xd0\xb2",
794 | "\xd0\x93" => "\xd0\xb3",
795 | "\xd0\x94" => "\xd0\xb4",
796 | "\xd0\x95" => "\xd0\xb5",
797 | "\xd0\x96" => "\xd0\xb6",
798 | "\xd0\x97" => "\xd0\xb7",
799 | "\xd0\x98" => "\xd0\xb8",
800 | "\xd0\x99" => "\xd0\xb9",
801 | "\xd0\x9a" => "\xd0\xba",
802 | "\xd0\x9b" => "\xd0\xbb",
803 | "\xd0\x9c" => "\xd0\xbc",
804 | "\xd0\x9d" => "\xd0\xbd",
805 | "\xd0\x9e" => "\xd0\xbe",
806 | "\xd0\x9f" => "\xd0\xbf",
807 | "\xd0\xa0" => "\xd1\x80",
808 | "\xd0\xa1" => "\xd1\x81",
809 | "\xd0\xa2" => "\xd1\x82",
810 | "\xd0\xa3" => "\xd1\x83",
811 | "\xd0\xa4" => "\xd1\x84",
812 | "\xd0\xa5" => "\xd1\x85",
813 | "\xd0\xa6" => "\xd1\x86",
814 | "\xd0\xa7" => "\xd1\x87",
815 | "\xd0\xa8" => "\xd1\x88",
816 | "\xd0\xa9" => "\xd1\x89",
817 | "\xd0\xaa" => "\xd1\x8a",
818 | "\xd0\xab" => "\xd1\x8b",
819 | "\xd0\xac" => "\xd1\x8c",
820 | "\xd0\xad" => "\xd1\x8d",
821 | "\xd0\xae" => "\xd1\x8e",
822 | "\xd0\xaf" => "\xd1\x8f",
823 | "\xd1\xa0" => "\xd1\xa1",
824 | "\xd1\xa2" => "\xd1\xa3",
825 | "\xd1\xa4" => "\xd1\xa5",
826 | "\xd1\xa6" => "\xd1\xa7",
827 | "\xd1\xa8" => "\xd1\xa9",
828 | "\xd1\xaa" => "\xd1\xab",
829 | "\xd1\xac" => "\xd1\xad",
830 | "\xd1\xae" => "\xd1\xaf",
831 | "\xd1\xb0" => "\xd1\xb1",
832 | "\xd1\xb2" => "\xd1\xb3",
833 | "\xd1\xb4" => "\xd1\xb5",
834 | "\xd1\xb6" => "\xd1\xb7",
835 | "\xd1\xb8" => "\xd1\xb9",
836 | "\xd1\xba" => "\xd1\xbb",
837 | "\xd1\xbc" => "\xd1\xbd",
838 | "\xd1\xbe" => "\xd1\xbf",
839 | "\xd2\x80" => "\xd2\x81",
840 | "\xd2\x8a" => "\xd2\x8b",
841 | "\xd2\x8c" => "\xd2\x8d",
842 | "\xd2\x8e" => "\xd2\x8f",
843 | "\xd2\x90" => "\xd2\x91",
844 | "\xd2\x92" => "\xd2\x93",
845 | "\xd2\x94" => "\xd2\x95",
846 | "\xd2\x96" => "\xd2\x97",
847 | "\xd2\x98" => "\xd2\x99",
848 | "\xd2\x9a" => "\xd2\x9b",
849 | "\xd2\x9c" => "\xd2\x9d",
850 | "\xd2\x9e" => "\xd2\x9f",
851 | "\xd2\xa0" => "\xd2\xa1",
852 | "\xd2\xa2" => "\xd2\xa3",
853 | "\xd2\xa4" => "\xd2\xa5",
854 | "\xd2\xa6" => "\xd2\xa7",
855 | "\xd2\xa8" => "\xd2\xa9",
856 | "\xd2\xaa" => "\xd2\xab",
857 | "\xd2\xac" => "\xd2\xad",
858 | "\xd2\xae" => "\xd2\xaf",
859 | "\xd2\xb0" => "\xd2\xb1",
860 | "\xd2\xb2" => "\xd2\xb3",
861 | "\xd2\xb4" => "\xd2\xb5",
862 | "\xd2\xb6" => "\xd2\xb7",
863 | "\xd2\xb8" => "\xd2\xb9",
864 | "\xd2\xba" => "\xd2\xbb",
865 | "\xd2\xbc" => "\xd2\xbd",
866 | "\xd2\xbe" => "\xd2\xbf",
867 | "\xd3\x81" => "\xd3\x82",
868 | "\xd3\x83" => "\xd3\x84",
869 | "\xd3\x85" => "\xd3\x86",
870 | "\xd3\x87" => "\xd3\x88",
871 | "\xd3\x89" => "\xd3\x8a",
872 | "\xd3\x8b" => "\xd3\x8c",
873 | "\xd3\x8d" => "\xd3\x8e",
874 | "\xd3\x90" => "\xd3\x91",
875 | "\xd3\x92" => "\xd3\x93",
876 | "\xd3\x94" => "\xd3\x95",
877 | "\xd3\x96" => "\xd3\x97",
878 | "\xd3\x98" => "\xd3\x99",
879 | "\xd3\x9a" => "\xd3\x9b",
880 | "\xd3\x9c" => "\xd3\x9d",
881 | "\xd3\x9e" => "\xd3\x9f",
882 | "\xd3\xa0" => "\xd3\xa1",
883 | "\xd3\xa2" => "\xd3\xa3",
884 | "\xd3\xa4" => "\xd3\xa5",
885 | "\xd3\xa6" => "\xd3\xa7",
886 | "\xd3\xa8" => "\xd3\xa9",
887 | "\xd3\xaa" => "\xd3\xab",
888 | "\xd3\xac" => "\xd3\xad",
889 | "\xd3\xae" => "\xd3\xaf",
890 | "\xd3\xb0" => "\xd3\xb1",
891 | "\xd3\xb2" => "\xd3\xb3",
892 | "\xd3\xb4" => "\xd3\xb5",
893 | "\xd3\xb8" => "\xd3\xb9",
894 | "\xd4\x80" => "\xd4\x81",
895 | "\xd4\x82" => "\xd4\x83",
896 | "\xd4\x84" => "\xd4\x85",
897 | "\xd4\x86" => "\xd4\x87",
898 | "\xd4\x88" => "\xd4\x89",
899 | "\xd4\x8a" => "\xd4\x8b",
900 | "\xd4\x8c" => "\xd4\x8d",
901 | "\xd4\x8e" => "\xd4\x8f",
902 | "\xd4\xb1" => "\xd5\xa1",
903 | "\xd4\xb2" => "\xd5\xa2",
904 | "\xd4\xb3" => "\xd5\xa3",
905 | "\xd4\xb4" => "\xd5\xa4",
906 | "\xd4\xb5" => "\xd5\xa5",
907 | "\xd4\xb6" => "\xd5\xa6",
908 | "\xd4\xb7" => "\xd5\xa7",
909 | "\xd4\xb8" => "\xd5\xa8",
910 | "\xd4\xb9" => "\xd5\xa9",
911 | "\xd4\xba" => "\xd5\xaa",
912 | "\xd4\xbb" => "\xd5\xab",
913 | "\xd4\xbc" => "\xd5\xac",
914 | "\xd4\xbd" => "\xd5\xad",
915 | "\xd4\xbe" => "\xd5\xae",
916 | "\xd4\xbf" => "\xd5\xaf",
917 | "\xd5\x80" => "\xd5\xb0",
918 | "\xd5\x81" => "\xd5\xb1",
919 | "\xd5\x82" => "\xd5\xb2",
920 | "\xd5\x83" => "\xd5\xb3",
921 | "\xd5\x84" => "\xd5\xb4",
922 | "\xd5\x85" => "\xd5\xb5",
923 | "\xd5\x86" => "\xd5\xb6",
924 | "\xd5\x87" => "\xd5\xb7",
925 | "\xd5\x88" => "\xd5\xb8",
926 | "\xd5\x89" => "\xd5\xb9",
927 | "\xd5\x8a" => "\xd5\xba",
928 | "\xd5\x8b" => "\xd5\xbb",
929 | "\xd5\x8c" => "\xd5\xbc",
930 | "\xd5\x8d" => "\xd5\xbd",
931 | "\xd5\x8e" => "\xd5\xbe",
932 | "\xd5\x8f" => "\xd5\xbf",
933 | "\xd5\x90" => "\xd6\x80",
934 | "\xd5\x91" => "\xd6\x81",
935 | "\xd5\x92" => "\xd6\x82",
936 | "\xd5\x93" => "\xd6\x83",
937 | "\xd5\x94" => "\xd6\x84",
938 | "\xd5\x95" => "\xd6\x85",
939 | "\xd5\x96" => "\xd6\x86",
940 | "\xe1\xb8\x80" => "\xe1\xb8\x81",
941 | "\xe1\xb8\x82" => "\xe1\xb8\x83",
942 | "\xe1\xb8\x84" => "\xe1\xb8\x85",
943 | "\xe1\xb8\x86" => "\xe1\xb8\x87",
944 | "\xe1\xb8\x88" => "\xe1\xb8\x89",
945 | "\xe1\xb8\x8a" => "\xe1\xb8\x8b",
946 | "\xe1\xb8\x8c" => "\xe1\xb8\x8d",
947 | "\xe1\xb8\x8e" => "\xe1\xb8\x8f",
948 | "\xe1\xb8\x90" => "\xe1\xb8\x91",
949 | "\xe1\xb8\x92" => "\xe1\xb8\x93",
950 | "\xe1\xb8\x94" => "\xe1\xb8\x95",
951 | "\xe1\xb8\x96" => "\xe1\xb8\x97",
952 | "\xe1\xb8\x98" => "\xe1\xb8\x99",
953 | "\xe1\xb8\x9a" => "\xe1\xb8\x9b",
954 | "\xe1\xb8\x9c" => "\xe1\xb8\x9d",
955 | "\xe1\xb8\x9e" => "\xe1\xb8\x9f",
956 | "\xe1\xb8\xa0" => "\xe1\xb8\xa1",
957 | "\xe1\xb8\xa2" => "\xe1\xb8\xa3",
958 | "\xe1\xb8\xa4" => "\xe1\xb8\xa5",
959 | "\xe1\xb8\xa6" => "\xe1\xb8\xa7",
960 | "\xe1\xb8\xa8" => "\xe1\xb8\xa9",
961 | "\xe1\xb8\xaa" => "\xe1\xb8\xab",
962 | "\xe1\xb8\xac" => "\xe1\xb8\xad",
963 | "\xe1\xb8\xae" => "\xe1\xb8\xaf",
964 | "\xe1\xb8\xb0" => "\xe1\xb8\xb1",
965 | "\xe1\xb8\xb2" => "\xe1\xb8\xb3",
966 | "\xe1\xb8\xb4" => "\xe1\xb8\xb5",
967 | "\xe1\xb8\xb6" => "\xe1\xb8\xb7",
968 | "\xe1\xb8\xb8" => "\xe1\xb8\xb9",
969 | "\xe1\xb8\xba" => "\xe1\xb8\xbb",
970 | "\xe1\xb8\xbc" => "\xe1\xb8\xbd",
971 | "\xe1\xb8\xbe" => "\xe1\xb8\xbf",
972 | "\xe1\xb9\x80" => "\xe1\xb9\x81",
973 | "\xe1\xb9\x82" => "\xe1\xb9\x83",
974 | "\xe1\xb9\x84" => "\xe1\xb9\x85",
975 | "\xe1\xb9\x86" => "\xe1\xb9\x87",
976 | "\xe1\xb9\x88" => "\xe1\xb9\x89",
977 | "\xe1\xb9\x8a" => "\xe1\xb9\x8b",
978 | "\xe1\xb9\x8c" => "\xe1\xb9\x8d",
979 | "\xe1\xb9\x8e" => "\xe1\xb9\x8f",
980 | "\xe1\xb9\x90" => "\xe1\xb9\x91",
981 | "\xe1\xb9\x92" => "\xe1\xb9\x93",
982 | "\xe1\xb9\x94" => "\xe1\xb9\x95",
983 | "\xe1\xb9\x96" => "\xe1\xb9\x97",
984 | "\xe1\xb9\x98" => "\xe1\xb9\x99",
985 | "\xe1\xb9\x9a" => "\xe1\xb9\x9b",
986 | "\xe1\xb9\x9c" => "\xe1\xb9\x9d",
987 | "\xe1\xb9\x9e" => "\xe1\xb9\x9f",
988 | "\xe1\xb9\xa0" => "\xe1\xb9\xa1",
989 | "\xe1\xb9\xa2" => "\xe1\xb9\xa3",
990 | "\xe1\xb9\xa4" => "\xe1\xb9\xa5",
991 | "\xe1\xb9\xa6" => "\xe1\xb9\xa7",
992 | "\xe1\xb9\xa8" => "\xe1\xb9\xa9",
993 | "\xe1\xb9\xaa" => "\xe1\xb9\xab",
994 | "\xe1\xb9\xac" => "\xe1\xb9\xad",
995 | "\xe1\xb9\xae" => "\xe1\xb9\xaf",
996 | "\xe1\xb9\xb0" => "\xe1\xb9\xb1",
997 | "\xe1\xb9\xb2" => "\xe1\xb9\xb3",
998 | "\xe1\xb9\xb4" => "\xe1\xb9\xb5",
999 | "\xe1\xb9\xb6" => "\xe1\xb9\xb7",
1000 | "\xe1\xb9\xb8" => "\xe1\xb9\xb9",
1001 | "\xe1\xb9\xba" => "\xe1\xb9\xbb",
1002 | "\xe1\xb9\xbc" => "\xe1\xb9\xbd",
1003 | "\xe1\xb9\xbe" => "\xe1\xb9\xbf",
1004 | "\xe1\xba\x80" => "\xe1\xba\x81",
1005 | "\xe1\xba\x82" => "\xe1\xba\x83",
1006 | "\xe1\xba\x84" => "\xe1\xba\x85",
1007 | "\xe1\xba\x86" => "\xe1\xba\x87",
1008 | "\xe1\xba\x88" => "\xe1\xba\x89",
1009 | "\xe1\xba\x8a" => "\xe1\xba\x8b",
1010 | "\xe1\xba\x8c" => "\xe1\xba\x8d",
1011 | "\xe1\xba\x8e" => "\xe1\xba\x8f",
1012 | "\xe1\xba\x90" => "\xe1\xba\x91",
1013 | "\xe1\xba\x92" => "\xe1\xba\x93",
1014 | "\xe1\xba\x94" => "\xe1\xba\x95",
1015 | "\xe1\xba\xa0" => "\xe1\xba\xa1",
1016 | "\xe1\xba\xa2" => "\xe1\xba\xa3",
1017 | "\xe1\xba\xa4" => "\xe1\xba\xa5",
1018 | "\xe1\xba\xa6" => "\xe1\xba\xa7",
1019 | "\xe1\xba\xa8" => "\xe1\xba\xa9",
1020 | "\xe1\xba\xaa" => "\xe1\xba\xab",
1021 | "\xe1\xba\xac" => "\xe1\xba\xad",
1022 | "\xe1\xba\xae" => "\xe1\xba\xaf",
1023 | "\xe1\xba\xb0" => "\xe1\xba\xb1",
1024 | "\xe1\xba\xb2" => "\xe1\xba\xb3",
1025 | "\xe1\xba\xb4" => "\xe1\xba\xb5",
1026 | "\xe1\xba\xb6" => "\xe1\xba\xb7",
1027 | "\xe1\xba\xb8" => "\xe1\xba\xb9",
1028 | "\xe1\xba\xba" => "\xe1\xba\xbb",
1029 | "\xe1\xba\xbc" => "\xe1\xba\xbd",
1030 | "\xe1\xba\xbe" => "\xe1\xba\xbf",
1031 | "\xe1\xbb\x80" => "\xe1\xbb\x81",
1032 | "\xe1\xbb\x82" => "\xe1\xbb\x83",
1033 | "\xe1\xbb\x84" => "\xe1\xbb\x85",
1034 | "\xe1\xbb\x86" => "\xe1\xbb\x87",
1035 | "\xe1\xbb\x88" => "\xe1\xbb\x89",
1036 | "\xe1\xbb\x8a" => "\xe1\xbb\x8b",
1037 | "\xe1\xbb\x8c" => "\xe1\xbb\x8d",
1038 | "\xe1\xbb\x8e" => "\xe1\xbb\x8f",
1039 | "\xe1\xbb\x90" => "\xe1\xbb\x91",
1040 | "\xe1\xbb\x92" => "\xe1\xbb\x93",
1041 | "\xe1\xbb\x94" => "\xe1\xbb\x95",
1042 | "\xe1\xbb\x96" => "\xe1\xbb\x97",
1043 | "\xe1\xbb\x98" => "\xe1\xbb\x99",
1044 | "\xe1\xbb\x9a" => "\xe1\xbb\x9b",
1045 | "\xe1\xbb\x9c" => "\xe1\xbb\x9d",
1046 | "\xe1\xbb\x9e" => "\xe1\xbb\x9f",
1047 | "\xe1\xbb\xa0" => "\xe1\xbb\xa1",
1048 | "\xe1\xbb\xa2" => "\xe1\xbb\xa3",
1049 | "\xe1\xbb\xa4" => "\xe1\xbb\xa5",
1050 | "\xe1\xbb\xa6" => "\xe1\xbb\xa7",
1051 | "\xe1\xbb\xa8" => "\xe1\xbb\xa9",
1052 | "\xe1\xbb\xaa" => "\xe1\xbb\xab",
1053 | "\xe1\xbb\xac" => "\xe1\xbb\xad",
1054 | "\xe1\xbb\xae" => "\xe1\xbb\xaf",
1055 | "\xe1\xbb\xb0" => "\xe1\xbb\xb1",
1056 | "\xe1\xbb\xb2" => "\xe1\xbb\xb3",
1057 | "\xe1\xbb\xb4" => "\xe1\xbb\xb5",
1058 | "\xe1\xbb\xb6" => "\xe1\xbb\xb7",
1059 | "\xe1\xbb\xb8" => "\xe1\xbb\xb9",
1060 | "\xe1\xbc\x88" => "\xe1\xbc\x80",
1061 | "\xe1\xbc\x89" => "\xe1\xbc\x81",
1062 | "\xe1\xbc\x8a" => "\xe1\xbc\x82",
1063 | "\xe1\xbc\x8b" => "\xe1\xbc\x83",
1064 | "\xe1\xbc\x8c" => "\xe1\xbc\x84",
1065 | "\xe1\xbc\x8d" => "\xe1\xbc\x85",
1066 | "\xe1\xbc\x8e" => "\xe1\xbc\x86",
1067 | "\xe1\xbc\x8f" => "\xe1\xbc\x87",
1068 | "\xe1\xbc\x98" => "\xe1\xbc\x90",
1069 | "\xe1\xbc\x99" => "\xe1\xbc\x91",
1070 | "\xe1\xbc\x9a" => "\xe1\xbc\x92",
1071 | "\xe1\xbc\x9b" => "\xe1\xbc\x93",
1072 | "\xe1\xbc\x9c" => "\xe1\xbc\x94",
1073 | "\xe1\xbc\x9d" => "\xe1\xbc\x95",
1074 | "\xe1\xbc\xa9" => "\xe1\xbc\xa1",
1075 | "\xe1\xbc\xaa" => "\xe1\xbc\xa2",
1076 | "\xe1\xbc\xab" => "\xe1\xbc\xa3",
1077 | "\xe1\xbc\xac" => "\xe1\xbc\xa4",
1078 | "\xe1\xbc\xad" => "\xe1\xbc\xa5",
1079 | "\xe1\xbc\xae" => "\xe1\xbc\xa6",
1080 | "\xe1\xbc\xaf" => "\xe1\xbc\xa7",
1081 | "\xe1\xbc\xb8" => "\xe1\xbc\xb0",
1082 | "\xe1\xbc\xb9" => "\xe1\xbc\xb1",
1083 | "\xe1\xbc\xba" => "\xe1\xbc\xb2",
1084 | "\xe1\xbc\xbb" => "\xe1\xbc\xb3",
1085 | "\xe1\xbc\xbc" => "\xe1\xbc\xb4",
1086 | "\xe1\xbc\xbd" => "\xe1\xbc\xb5",
1087 | "\xe1\xbc\xbe" => "\xe1\xbc\xb6",
1088 | "\xe1\xbc\xbf" => "\xe1\xbc\xb7",
1089 | "\xe1\xbd\x88" => "\xe1\xbd\x80",
1090 | "\xe1\xbd\x89" => "\xe1\xbd\x81",
1091 | "\xe1\xbd\x8a" => "\xe1\xbd\x82",
1092 | "\xe1\xbd\x8b" => "\xe1\xbd\x83",
1093 | "\xe1\xbd\x8c" => "\xe1\xbd\x84",
1094 | "\xe1\xbd\x8d" => "\xe1\xbd\x85",
1095 | "\xe1\xbd\x99" => "\xe1\xbd\x91",
1096 | "\xe1\xbd\x9b" => "\xe1\xbd\x93",
1097 | "\xe1\xbd\x9d" => "\xe1\xbd\x95",
1098 | "\xe1\xbd\x9f" => "\xe1\xbd\x97",
1099 | "\xe1\xbd\xa9" => "\xe1\xbd\xa1",
1100 | "\xe1\xbd\xaa" => "\xe1\xbd\xa2",
1101 | "\xe1\xbd\xab" => "\xe1\xbd\xa3",
1102 | "\xe1\xbd\xac" => "\xe1\xbd\xa4",
1103 | "\xe1\xbd\xad" => "\xe1\xbd\xa5",
1104 | "\xe1\xbd\xae" => "\xe1\xbd\xa6",
1105 | "\xe1\xbd\xaf" => "\xe1\xbd\xa7",
1106 | "\xe1\xbe\x88" => "\xe1\xbe\x80",
1107 | "\xe1\xbe\x89" => "\xe1\xbe\x81",
1108 | "\xe1\xbe\x8a" => "\xe1\xbe\x82",
1109 | "\xe1\xbe\x8b" => "\xe1\xbe\x83",
1110 | "\xe1\xbe\x8c" => "\xe1\xbe\x84",
1111 | "\xe1\xbe\x8d" => "\xe1\xbe\x85",
1112 | "\xe1\xbe\x8e" => "\xe1\xbe\x86",
1113 | "\xe1\xbe\x8f" => "\xe1\xbe\x87",
1114 | "\xe1\xbe\x98" => "\xe1\xbe\x90",
1115 | "\xe1\xbe\x99" => "\xe1\xbe\x91",
1116 | "\xe1\xbe\x9a" => "\xe1\xbe\x92",
1117 | "\xe1\xbe\x9b" => "\xe1\xbe\x93",
1118 | "\xe1\xbe\x9c" => "\xe1\xbe\x94",
1119 | "\xe1\xbe\x9d" => "\xe1\xbe\x95",
1120 | "\xe1\xbe\x9e" => "\xe1\xbe\x96",
1121 | "\xe1\xbe\x9f" => "\xe1\xbe\x97",
1122 | "\xe1\xbe\xa9" => "\xe1\xbe\xa1",
1123 | "\xe1\xbe\xaa" => "\xe1\xbe\xa2",
1124 | "\xe1\xbe\xab" => "\xe1\xbe\xa3",
1125 | "\xe1\xbe\xac" => "\xe1\xbe\xa4",
1126 | "\xe1\xbe\xad" => "\xe1\xbe\xa5",
1127 | "\xe1\xbe\xae" => "\xe1\xbe\xa6",
1128 | "\xe1\xbe\xaf" => "\xe1\xbe\xa7",
1129 | "\xe1\xbe\xb8" => "\xe1\xbe\xb0",
1130 | "\xe1\xbe\xb9" => "\xe1\xbe\xb1",
1131 | "\xe1\xbe\xba" => "\xe1\xbd\xb0",
1132 | "\xe1\xbe\xbb" => "\xe1\xbd\xb1",
1133 | "\xe1\xbe\xbc" => "\xe1\xbe\xb3",
1134 | "\xe1\xbf\x88" => "\xe1\xbd\xb2",
1135 | "\xe1\xbf\x89" => "\xe1\xbd\xb3",
1136 | "\xe1\xbf\x8a" => "\xe1\xbd\xb4",
1137 | "\xe1\xbf\x8b" => "\xe1\xbd\xb5",
1138 | "\xe1\xbf\x8c" => "\xe1\xbf\x83",
1139 | "\xe1\xbf\x98" => "\xe1\xbf\x90",
1140 | "\xe1\xbf\x99" => "\xe1\xbf\x91",
1141 | "\xe1\xbf\x9a" => "\xe1\xbd\xb6",
1142 | "\xe1\xbf\x9b" => "\xe1\xbd\xb7",
1143 | "\xe1\xbf\xa9" => "\xe1\xbf\xa1",
1144 | "\xe1\xbf\xaa" => "\xe1\xbd\xba",
1145 | "\xe1\xbf\xab" => "\xe1\xbd\xbb",
1146 | "\xe1\xbf\xac" => "\xe1\xbf\xa5",
1147 | "\xe1\xbf\xb8" => "\xe1\xbd\xb8",
1148 | "\xe1\xbf\xb9" => "\xe1\xbd\xb9",
1149 | "\xe1\xbf\xba" => "\xe1\xbd\xbc",
1150 | "\xe1\xbf\xbb" => "\xe1\xbd\xbd",
1151 | "\xe1\xbf\xbc" => "\xe1\xbf\xb3",
1152 | "\xef\xbc\xa1" => "\xef\xbd\x81",
1153 | "\xef\xbc\xa2" => "\xef\xbd\x82",
1154 | "\xef\xbc\xa3" => "\xef\xbd\x83",
1155 | "\xef\xbc\xa4" => "\xef\xbd\x84",
1156 | "\xef\xbc\xa5" => "\xef\xbd\x85",
1157 | "\xef\xbc\xa6" => "\xef\xbd\x86",
1158 | "\xef\xbc\xa7" => "\xef\xbd\x87",
1159 | "\xef\xbc\xa8" => "\xef\xbd\x88",
1160 | "\xef\xbc\xa9" => "\xef\xbd\x89",
1161 | "\xef\xbc\xaa" => "\xef\xbd\x8a",
1162 | "\xef\xbc\xab" => "\xef\xbd\x8b",
1163 | "\xef\xbc\xac" => "\xef\xbd\x8c",
1164 | "\xef\xbc\xad" => "\xef\xbd\x8d",
1165 | "\xef\xbc\xae" => "\xef\xbd\x8e",
1166 | "\xef\xbc\xaf" => "\xef\xbd\x8f",
1167 | "\xef\xbc\xb0" => "\xef\xbd\x90",
1168 | "\xef\xbc\xb1" => "\xef\xbd\x91",
1169 | "\xef\xbc\xb2" => "\xef\xbd\x92",
1170 | "\xef\xbc\xb3" => "\xef\xbd\x93",
1171 | "\xef\xbc\xb4" => "\xef\xbd\x94",
1172 | "\xef\xbc\xb5" => "\xef\xbd\x95",
1173 | "\xef\xbc\xb6" => "\xef\xbd\x96",
1174 | "\xef\xbc\xb7" => "\xef\xbd\x97",
1175 | "\xef\xbc\xb8" => "\xef\xbd\x98",
1176 | "\xef\xbc\xb9" => "\xef\xbd\x99",
1177 | "\xef\xbc\xba" => "\xef\xbd\x9a",
1178 | );
1179 |
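#Unicode Character Database 6.0.0 (2010-06-04)
#autogenerated by unicode_blocks_txt2php() PHP function at 2011-06-04 00:19:39, 209 blocks total
#each entry: block name => array(0 => first code point, 1 => last code point, 2 => sequential block number, starting from 0)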
1182 | public static $unicode_blocks = array(
1183 | 'Basic Latin' => array(
1184 | 0 => 0x0000,
1185 | 1 => 0x007F,
1186 | 2 => 0,
1187 | ),
1188 | 'Latin-1 Supplement' => array(
1189 | 0 => 0x0080,
1190 | 1 => 0x00FF,
1191 | 2 => 1,
1192 | ),
1193 | 'Latin Extended-A' => array(
1194 | 0 => 0x0100,
1195 | 1 => 0x017F,
1196 | 2 => 2,
1197 | ),
1198 | 'Latin Extended-B' => array(
1199 | 0 => 0x0180,
1200 | 1 => 0x024F,
1201 | 2 => 3,
1202 | ),
1203 | 'IPA Extensions' => array(
1204 | 0 => 0x0250,
1205 | 1 => 0x02AF,
1206 | 2 => 4,
1207 | ),
1208 | 'Spacing Modifier Letters' => array(
1209 | 0 => 0x02B0,
1210 | 1 => 0x02FF,
1211 | 2 => 5,
1212 | ),
1213 | 'Combining Diacritical Marks' => array(
1214 | 0 => 0x0300,
1215 | 1 => 0x036F,
1216 | 2 => 6,
1217 | ),
1218 | 'Greek and Coptic' => array(
1219 | 0 => 0x0370,
1220 | 1 => 0x03FF,
1221 | 2 => 7,
1222 | ),
1223 | 'Cyrillic' => array(
1224 | 0 => 0x0400,
1225 | 1 => 0x04FF,
1226 | 2 => 8,
1227 | ),
1228 | 'Cyrillic Supplement' => array(
1229 | 0 => 0x0500,
1230 | 1 => 0x052F,
1231 | 2 => 9,
1232 | ),
1233 | 'Armenian' => array(
1234 | 0 => 0x0530,
1235 | 1 => 0x058F,
1236 | 2 => 10,
1237 | ),
1238 | 'Hebrew' => array(
1239 | 0 => 0x0590,
1240 | 1 => 0x05FF,
1241 | 2 => 11,
1242 | ),
1243 | 'Arabic' => array(
1244 | 0 => 0x0600,
1245 | 1 => 0x06FF,
1246 | 2 => 12,
1247 | ),
1248 | 'Syriac' => array(
1249 | 0 => 0x0700,
1250 | 1 => 0x074F,
1251 | 2 => 13,
1252 | ),
1253 | 'Arabic Supplement' => array(
1254 | 0 => 0x0750,
1255 | 1 => 0x077F,
1256 | 2 => 14,
1257 | ),
1258 | 'Thaana' => array(
1259 | 0 => 0x0780,
1260 | 1 => 0x07BF,
1261 | 2 => 15,
1262 | ),
1263 | 'NKo' => array(
1264 | 0 => 0x07C0,
1265 | 1 => 0x07FF,
1266 | 2 => 16,
1267 | ),
1268 | 'Samaritan' => array(
1269 | 0 => 0x0800,
1270 | 1 => 0x083F,
1271 | 2 => 17,
1272 | ),
1273 | 'Mandaic' => array(
1274 | 0 => 0x0840,
1275 | 1 => 0x085F,
1276 | 2 => 18,
1277 | ),
1278 | 'Devanagari' => array(
1279 | 0 => 0x0900,
1280 | 1 => 0x097F,
1281 | 2 => 19,
1282 | ),
1283 | 'Bengali' => array(
1284 | 0 => 0x0980,
1285 | 1 => 0x09FF,
1286 | 2 => 20,
1287 | ),
1288 | 'Gurmukhi' => array(
1289 | 0 => 0x0A00,
1290 | 1 => 0x0A7F,
1291 | 2 => 21,
1292 | ),
1293 | 'Gujarati' => array(
1294 | 0 => 0x0A80,
1295 | 1 => 0x0AFF,
1296 | 2 => 22,
1297 | ),
1298 | 'Oriya' => array(
1299 | 0 => 0x0B00,
1300 | 1 => 0x0B7F,
1301 | 2 => 23,
1302 | ),
1303 | 'Tamil' => array(
1304 | 0 => 0x0B80,
1305 | 1 => 0x0BFF,
1306 | 2 => 24,
1307 | ),
1308 | 'Telugu' => array(
1309 | 0 => 0x0C00,
1310 | 1 => 0x0C7F,
1311 | 2 => 25,
1312 | ),
1313 | 'Kannada' => array(
1314 | 0 => 0x0C80,
1315 | 1 => 0x0CFF,
1316 | 2 => 26,
1317 | ),
1318 | 'Malayalam' => array(
1319 | 0 => 0x0D00,
1320 | 1 => 0x0D7F,
1321 | 2 => 27,
1322 | ),
1323 | 'Sinhala' => array(
1324 | 0 => 0x0D80,
1325 | 1 => 0x0DFF,
1326 | 2 => 28,
1327 | ),
1328 | 'Thai' => array(
1329 | 0 => 0x0E00,
1330 | 1 => 0x0E7F,
1331 | 2 => 29,
1332 | ),
1333 | 'Lao' => array(
1334 | 0 => 0x0E80,
1335 | 1 => 0x0EFF,
1336 | 2 => 30,
1337 | ),
1338 | 'Tibetan' => array(
1339 | 0 => 0x0F00,
1340 | 1 => 0x0FFF,
1341 | 2 => 31,
1342 | ),
1343 | 'Myanmar' => array(
1344 | 0 => 0x1000,
1345 | 1 => 0x109F,
1346 | 2 => 32,
1347 | ),
1348 | 'Georgian' => array(
1349 | 0 => 0x10A0,
1350 | 1 => 0x10FF,
1351 | 2 => 33,
1352 | ),
1353 | 'Hangul Jamo' => array(
1354 | 0 => 0x1100,
1355 | 1 => 0x11FF,
1356 | 2 => 34,
1357 | ),
1358 | 'Ethiopic' => array(
1359 | 0 => 0x1200,
1360 | 1 => 0x137F,
1361 | 2 => 35,
1362 | ),
1363 | 'Ethiopic Supplement' => array(
1364 | 0 => 0x1380,
1365 | 1 => 0x139F,
1366 | 2 => 36,
1367 | ),
1368 | 'Cherokee' => array(
1369 | 0 => 0x13A0,
1370 | 1 => 0x13FF,
1371 | 2 => 37,
1372 | ),
1373 | 'Unified Canadian Aboriginal Syllabics' => array(
1374 | 0 => 0x1400,
1375 | 1 => 0x167F,
1376 | 2 => 38,
1377 | ),
1378 | 'Ogham' => array(
1379 | 0 => 0x1680,
1380 | 1 => 0x169F,
1381 | 2 => 39,
1382 | ),
1383 | 'Runic' => array(
1384 | 0 => 0x16A0,
1385 | 1 => 0x16FF,
1386 | 2 => 40,
1387 | ),
1388 | 'Tagalog' => array(
1389 | 0 => 0x1700,
1390 | 1 => 0x171F,
1391 | 2 => 41,
1392 | ),
1393 | 'Hanunoo' => array(
1394 | 0 => 0x1720,
1395 | 1 => 0x173F,
1396 | 2 => 42,
1397 | ),
1398 | 'Buhid' => array(
1399 | 0 => 0x1740,
1400 | 1 => 0x175F,
1401 | 2 => 43,
1402 | ),
1403 | 'Tagbanwa' => array(
1404 | 0 => 0x1760,
1405 | 1 => 0x177F,
1406 | 2 => 44,
1407 | ),
1408 | 'Khmer' => array(
1409 | 0 => 0x1780,
1410 | 1 => 0x17FF,
1411 | 2 => 45,
1412 | ),
1413 | 'Mongolian' => array(
1414 | 0 => 0x1800,
1415 | 1 => 0x18AF,
1416 | 2 => 46,
1417 | ),
1418 | 'Unified Canadian Aboriginal Syllabics Extended' => array(
1419 | 0 => 0x18B0,
1420 | 1 => 0x18FF,
1421 | 2 => 47,
1422 | ),
1423 | 'Limbu' => array(
1424 | 0 => 0x1900,
1425 | 1 => 0x194F,
1426 | 2 => 48,
1427 | ),
1428 | 'Tai Le' => array(
1429 | 0 => 0x1950,
1430 | 1 => 0x197F,
1431 | 2 => 49,
1432 | ),
1433 | 'New Tai Lue' => array(
1434 | 0 => 0x1980,
1435 | 1 => 0x19DF,
1436 | 2 => 50,
1437 | ),
1438 | 'Khmer Symbols' => array(
1439 | 0 => 0x19E0,
1440 | 1 => 0x19FF,
1441 | 2 => 51,
1442 | ),
1443 | 'Buginese' => array(
1444 | 0 => 0x1A00,
1445 | 1 => 0x1A1F,
1446 | 2 => 52,
1447 | ),
1448 | 'Tai Tham' => array(
1449 | 0 => 0x1A20,
1450 | 1 => 0x1AAF,
1451 | 2 => 53,
1452 | ),
1453 | 'Balinese' => array(
1454 | 0 => 0x1B00,
1455 | 1 => 0x1B7F,
1456 | 2 => 54,
1457 | ),
1458 | 'Sundanese' => array(
1459 | 0 => 0x1B80,
1460 | 1 => 0x1BBF,
1461 | 2 => 55,
1462 | ),
1463 | 'Batak' => array(
1464 | 0 => 0x1BC0,
1465 | 1 => 0x1BFF,
1466 | 2 => 56,
1467 | ),
1468 | 'Lepcha' => array(
1469 | 0 => 0x1C00,
1470 | 1 => 0x1C4F,
1471 | 2 => 57,
1472 | ),
1473 | 'Ol Chiki' => array(
1474 | 0 => 0x1C50,
1475 | 1 => 0x1C7F,
1476 | 2 => 58,
1477 | ),
1478 | 'Vedic Extensions' => array(
1479 | 0 => 0x1CD0,
1480 | 1 => 0x1CFF,
1481 | 2 => 59,
1482 | ),
1483 | 'Phonetic Extensions' => array(
1484 | 0 => 0x1D00,
1485 | 1 => 0x1D7F,
1486 | 2 => 60,
1487 | ),
1488 | 'Phonetic Extensions Supplement' => array(
1489 | 0 => 0x1D80,
1490 | 1 => 0x1DBF,
1491 | 2 => 61,
1492 | ),
1493 | 'Combining Diacritical Marks Supplement' => array(
1494 | 0 => 0x1DC0,
1495 | 1 => 0x1DFF,
1496 | 2 => 62,
1497 | ),
1498 | 'Latin Extended Additional' => array(
1499 | 0 => 0x1E00,
1500 | 1 => 0x1EFF,
1501 | 2 => 63,
1502 | ),
1503 | 'Greek Extended' => array(
1504 | 0 => 0x1F00,
1505 | 1 => 0x1FFF,
1506 | 2 => 64,
1507 | ),
1508 | 'General Punctuation' => array(
1509 | 0 => 0x2000,
1510 | 1 => 0x206F,
1511 | 2 => 65,
1512 | ),
1513 | 'Superscripts and Subscripts' => array(
1514 | 0 => 0x2070,
1515 | 1 => 0x209F,
1516 | 2 => 66,
1517 | ),
1518 | 'Currency Symbols' => array(
1519 | 0 => 0x20A0,
1520 | 1 => 0x20CF,
1521 | 2 => 67,
1522 | ),
1523 | 'Combining Diacritical Marks for Symbols' => array(
1524 | 0 => 0x20D0,
1525 | 1 => 0x20FF,
1526 | 2 => 68,
1527 | ),
1528 | 'Letterlike Symbols' => array(
1529 | 0 => 0x2100,
1530 | 1 => 0x214F,
1531 | 2 => 69,
1532 | ),
1533 | 'Number Forms' => array(
1534 | 0 => 0x2150,
1535 | 1 => 0x218F,
1536 | 2 => 70,
1537 | ),
1538 | 'Arrows' => array(
1539 | 0 => 0x2190,
1540 | 1 => 0x21FF,
1541 | 2 => 71,
1542 | ),
1543 | 'Mathematical Operators' => array(
1544 | 0 => 0x2200,
1545 | 1 => 0x22FF,
1546 | 2 => 72,
1547 | ),
1548 | 'Miscellaneous Technical' => array(
1549 | 0 => 0x2300,
1550 | 1 => 0x23FF,
1551 | 2 => 73,
1552 | ),
1553 | 'Control Pictures' => array(
1554 | 0 => 0x2400,
1555 | 1 => 0x243F,
1556 | 2 => 74,
1557 | ),
1558 | 'Optical Character Recognition' => array(
1559 | 0 => 0x2440,
1560 | 1 => 0x245F,
1561 | 2 => 75,
1562 | ),
1563 | 'Enclosed Alphanumerics' => array(
1564 | 0 => 0x2460,
1565 | 1 => 0x24FF,
1566 | 2 => 76,
1567 | ),
1568 | 'Box Drawing' => array(
1569 | 0 => 0x2500,
1570 | 1 => 0x257F,
1571 | 2 => 77,
1572 | ),
1573 | 'Block Elements' => array(
1574 | 0 => 0x2580,
1575 | 1 => 0x259F,
1576 | 2 => 78,
1577 | ),
1578 | 'Geometric Shapes' => array(
1579 | 0 => 0x25A0,
1580 | 1 => 0x25FF,
1581 | 2 => 79,
1582 | ),
1583 | 'Miscellaneous Symbols' => array(
1584 | 0 => 0x2600,
1585 | 1 => 0x26FF,
1586 | 2 => 80,
1587 | ),
1588 | 'Dingbats' => array(
1589 | 0 => 0x2700,
1590 | 1 => 0x27BF,
1591 | 2 => 81,
1592 | ),
1593 | 'Miscellaneous Mathematical Symbols-A' => array(
1594 | 0 => 0x27C0,
1595 | 1 => 0x27EF,
1596 | 2 => 82,
1597 | ),
1598 | 'Supplemental Arrows-A' => array(
1599 | 0 => 0x27F0,
1600 | 1 => 0x27FF,
1601 | 2 => 83,
1602 | ),
1603 | 'Braille Patterns' => array(
1604 | 0 => 0x2800,
1605 | 1 => 0x28FF,
1606 | 2 => 84,
1607 | ),
1608 | 'Supplemental Arrows-B' => array(
1609 | 0 => 0x2900,
1610 | 1 => 0x297F,
1611 | 2 => 85,
1612 | ),
1613 | 'Miscellaneous Mathematical Symbols-B' => array(
1614 | 0 => 0x2980,
1615 | 1 => 0x29FF,
1616 | 2 => 86,
1617 | ),
1618 | 'Supplemental Mathematical Operators' => array(
1619 | 0 => 0x2A00,
1620 | 1 => 0x2AFF,
1621 | 2 => 87,
1622 | ),
1623 | 'Miscellaneous Symbols and Arrows' => array(
1624 | 0 => 0x2B00,
1625 | 1 => 0x2BFF,
1626 | 2 => 88,
1627 | ),
1628 | 'Glagolitic' => array(
1629 | 0 => 0x2C00,
1630 | 1 => 0x2C5F,
1631 | 2 => 89,
1632 | ),
1633 | 'Latin Extended-C' => array(
1634 | 0 => 0x2C60,
1635 | 1 => 0x2C7F,
1636 | 2 => 90,
1637 | ),
1638 | 'Coptic' => array(
1639 | 0 => 0x2C80,
1640 | 1 => 0x2CFF,
1641 | 2 => 91,
1642 | ),
1643 | 'Georgian Supplement' => array(
1644 | 0 => 0x2D00,
1645 | 1 => 0x2D2F,
1646 | 2 => 92,
1647 | ),
1648 | 'Tifinagh' => array(
1649 | 0 => 0x2D30,
1650 | 1 => 0x2D7F,
1651 | 2 => 93,
1652 | ),
1653 | 'Ethiopic Extended' => array(
1654 | 0 => 0x2D80,
1655 | 1 => 0x2DDF,
1656 | 2 => 94,
1657 | ),
1658 | 'Cyrillic Extended-A' => array(
1659 | 0 => 0x2DE0,
1660 | 1 => 0x2DFF,
1661 | 2 => 95,
1662 | ),
1663 | 'Supplemental Punctuation' => array(
1664 | 0 => 0x2E00,
1665 | 1 => 0x2E7F,
1666 | 2 => 96,
1667 | ),
1668 | 'CJK Radicals Supplement' => array(
1669 | 0 => 0x2E80,
1670 | 1 => 0x2EFF,
1671 | 2 => 97,
1672 | ),
1673 | 'Kangxi Radicals' => array(
1674 | 0 => 0x2F00,
1675 | 1 => 0x2FDF,
1676 | 2 => 98,
1677 | ),
1678 | 'Ideographic Description Characters' => array(
1679 | 0 => 0x2FF0,
1680 | 1 => 0x2FFF,
1681 | 2 => 99,
1682 | ),
1683 | 'CJK Symbols and Punctuation' => array(
1684 | 0 => 0x3000,
1685 | 1 => 0x303F,
1686 | 2 => 100,
1687 | ),
1688 | 'Hiragana' => array(
1689 | 0 => 0x3040,
1690 | 1 => 0x309F,
1691 | 2 => 101,
1692 | ),
1693 | 'Katakana' => array(
1694 | 0 => 0x30A0,
1695 | 1 => 0x30FF,
1696 | 2 => 102,
1697 | ),
1698 | 'Bopomofo' => array(
1699 | 0 => 0x3100,
1700 | 1 => 0x312F,
1701 | 2 => 103,
1702 | ),
1703 | 'Hangul Compatibility Jamo' => array(
1704 | 0 => 0x3130,
1705 | 1 => 0x318F,
1706 | 2 => 104,
1707 | ),
1708 | 'Kanbun' => array(
1709 | 0 => 0x3190,
1710 | 1 => 0x319F,
1711 | 2 => 105,
1712 | ),
1713 | 'Bopomofo Extended' => array(
1714 | 0 => 0x31A0,
1715 | 1 => 0x31BF,
1716 | 2 => 106,
1717 | ),
1718 | 'CJK Strokes' => array(
1719 | 0 => 0x31C0,
1720 | 1 => 0x31EF,
1721 | 2 => 107,
1722 | ),
1723 | 'Katakana Phonetic Extensions' => array(
1724 | 0 => 0x31F0,
1725 | 1 => 0x31FF,
1726 | 2 => 108,
1727 | ),
1728 | 'Enclosed CJK Letters and Months' => array(
1729 | 0 => 0x3200,
1730 | 1 => 0x32FF,
1731 | 2 => 109,
1732 | ),
1733 | 'CJK Compatibility' => array(
1734 | 0 => 0x3300,
1735 | 1 => 0x33FF,
1736 | 2 => 110,
1737 | ),
1738 | 'CJK Unified Ideographs Extension A' => array(
1739 | 0 => 0x3400,
1740 | 1 => 0x4DBF,
1741 | 2 => 111,
1742 | ),
1743 | 'Yijing Hexagram Symbols' => array(
1744 | 0 => 0x4DC0,
1745 | 1 => 0x4DFF,
1746 | 2 => 112,
1747 | ),
1748 | 'CJK Unified Ideographs' => array(
1749 | 0 => 0x4E00,
1750 | 1 => 0x9FFF,
1751 | 2 => 113,
1752 | ),
1753 | 'Yi Syllables' => array(
1754 | 0 => 0xA000,
1755 | 1 => 0xA48F,
1756 | 2 => 114,
1757 | ),
1758 | 'Yi Radicals' => array(
1759 | 0 => 0xA490,
1760 | 1 => 0xA4CF,
1761 | 2 => 115,
1762 | ),
1763 | 'Lisu' => array(
1764 | 0 => 0xA4D0,
1765 | 1 => 0xA4FF,
1766 | 2 => 116,
1767 | ),
1768 | 'Vai' => array(
1769 | 0 => 0xA500,
1770 | 1 => 0xA63F,
1771 | 2 => 117,
1772 | ),
1773 | 'Cyrillic Extended-B' => array(
1774 | 0 => 0xA640,
1775 | 1 => 0xA69F,
1776 | 2 => 118,
1777 | ),
1778 | 'Bamum' => array(
1779 | 0 => 0xA6A0,
1780 | 1 => 0xA6FF,
1781 | 2 => 119,
1782 | ),
1783 | 'Modifier Tone Letters' => array(
1784 | 0 => 0xA700,
1785 | 1 => 0xA71F,
1786 | 2 => 120,
1787 | ),
1788 | 'Latin Extended-D' => array(
1789 | 0 => 0xA720,
1790 | 1 => 0xA7FF,
1791 | 2 => 121,
1792 | ),
1793 | 'Syloti Nagri' => array(
1794 | 0 => 0xA800,
1795 | 1 => 0xA82F,
1796 | 2 => 122,
1797 | ),
1798 | 'Common Indic Number Forms' => array(
1799 | 0 => 0xA830,
1800 | 1 => 0xA83F,
1801 | 2 => 123,
1802 | ),
1803 | 'Phags-pa' => array(
1804 | 0 => 0xA840,
1805 | 1 => 0xA87F,
1806 | 2 => 124,
1807 | ),
1808 | 'Saurashtra' => array(
1809 | 0 => 0xA880,
1810 | 1 => 0xA8DF,
1811 | 2 => 125,
1812 | ),
1813 | 'Devanagari Extended' => array(
1814 | 0 => 0xA8E0,
1815 | 1 => 0xA8FF,
1816 | 2 => 126,
1817 | ),
1818 | 'Kayah Li' => array(
1819 | 0 => 0xA900,
1820 | 1 => 0xA92F,
1821 | 2 => 127,
1822 | ),
1823 | 'Rejang' => array(
1824 | 0 => 0xA930,
1825 | 1 => 0xA95F,
1826 | 2 => 128,
1827 | ),
1828 | 'Hangul Jamo Extended-A' => array(
1829 | 0 => 0xA960,
1830 | 1 => 0xA97F,
1831 | 2 => 129,
1832 | ),
1833 | 'Javanese' => array(
1834 | 0 => 0xA980,
1835 | 1 => 0xA9DF,
1836 | 2 => 130,
1837 | ),
1838 | 'Cham' => array(
1839 | 0 => 0xAA00,
1840 | 1 => 0xAA5F,
1841 | 2 => 131,
1842 | ),
1843 | 'Myanmar Extended-A' => array(
1844 | 0 => 0xAA60,
1845 | 1 => 0xAA7F,
1846 | 2 => 132,
1847 | ),
1848 | 'Tai Viet' => array(
1849 | 0 => 0xAA80,
1850 | 1 => 0xAADF,
1851 | 2 => 133,
1852 | ),
1853 | 'Ethiopic Extended-A' => array(
1854 | 0 => 0xAB00,
1855 | 1 => 0xAB2F,
1856 | 2 => 134,
1857 | ),
1858 | 'Meetei Mayek' => array(
1859 | 0 => 0xABC0,
1860 | 1 => 0xABFF,
1861 | 2 => 135,
1862 | ),
1863 | 'Hangul Syllables' => array(
1864 | 0 => 0xAC00,
1865 | 1 => 0xD7AF,
1866 | 2 => 136,
1867 | ),
1868 | 'Hangul Jamo Extended-B' => array(
1869 | 0 => 0xD7B0,
1870 | 1 => 0xD7FF,
1871 | 2 => 137,
1872 | ),
1873 | 'High Surrogates' => array(
1874 | 0 => 0xD800,
1875 | 1 => 0xDB7F,
1876 | 2 => 138,
1877 | ),
1878 | 'High Private Use Surrogates' => array(
1879 | 0 => 0xDB80,
1880 | 1 => 0xDBFF,
1881 | 2 => 139,
1882 | ),
1883 | 'Low Surrogates' => array(
1884 | 0 => 0xDC00,
1885 | 1 => 0xDFFF,
1886 | 2 => 140,
1887 | ),
1888 | 'Private Use Area' => array(
1889 | 0 => 0xE000,
1890 | 1 => 0xF8FF,
1891 | 2 => 141,
1892 | ),
1893 | 'CJK Compatibility Ideographs' => array(
1894 | 0 => 0xF900,
1895 | 1 => 0xFAFF,
1896 | 2 => 142,
1897 | ),
1898 | 'Alphabetic Presentation Forms' => array(
1899 | 0 => 0xFB00,
1900 | 1 => 0xFB4F,
1901 | 2 => 143,
1902 | ),
1903 | 'Arabic Presentation Forms-A' => array(
1904 | 0 => 0xFB50,
1905 | 1 => 0xFDFF,
1906 | 2 => 144,
1907 | ),
1908 | 'Variation Selectors' => array(
1909 | 0 => 0xFE00,
1910 | 1 => 0xFE0F,
1911 | 2 => 145,
1912 | ),
1913 | 'Vertical Forms' => array(
1914 | 0 => 0xFE10,
1915 | 1 => 0xFE1F,
1916 | 2 => 146,
1917 | ),
1918 | 'Combining Half Marks' => array(
1919 | 0 => 0xFE20,
1920 | 1 => 0xFE2F,
1921 | 2 => 147,
1922 | ),
1923 | 'CJK Compatibility Forms' => array(
1924 | 0 => 0xFE30,
1925 | 1 => 0xFE4F,
1926 | 2 => 148,
1927 | ),
1928 | 'Small Form Variants' => array(
1929 | 0 => 0xFE50,
1930 | 1 => 0xFE6F,
1931 | 2 => 149,
1932 | ),
1933 | 'Arabic Presentation Forms-B' => array(
1934 | 0 => 0xFE70,
1935 | 1 => 0xFEFF,
1936 | 2 => 150,
1937 | ),
1938 | 'Halfwidth and Fullwidth Forms' => array(
1939 | 0 => 0xFF00,
1940 | 1 => 0xFFEF,
1941 | 2 => 151,
1942 | ),
1943 | 'Specials' => array(
1944 | 0 => 0xFFF0,
1945 | 1 => 0xFFFF,
1946 | 2 => 152,
1947 | ),
1948 | 'Linear B Syllabary' => array(
1949 | 0 => 0x10000,
1950 | 1 => 0x1007F,
1951 | 2 => 153,
1952 | ),
1953 | 'Linear B Ideograms' => array(
1954 | 0 => 0x10080,
1955 | 1 => 0x100FF,
1956 | 2 => 154,
1957 | ),
1958 | 'Aegean Numbers' => array(
1959 | 0 => 0x10100,
1960 | 1 => 0x1013F,
1961 | 2 => 155,
1962 | ),
1963 | 'Ancient Greek Numbers' => array(
1964 | 0 => 0x10140,
1965 | 1 => 0x1018F,
1966 | 2 => 156,
1967 | ),
1968 | 'Ancient Symbols' => array(
1969 | 0 => 0x10190,
1970 | 1 => 0x101CF,
1971 | 2 => 157,
1972 | ),
1973 | 'Phaistos Disc' => array(
1974 | 0 => 0x101D0,
1975 | 1 => 0x101FF,
1976 | 2 => 158,
1977 | ),
1978 | 'Lycian' => array(
1979 | 0 => 0x10280,
1980 | 1 => 0x1029F,
1981 | 2 => 159,
1982 | ),
1983 | 'Carian' => array(
1984 | 0 => 0x102A0,
1985 | 1 => 0x102DF,
1986 | 2 => 160,
1987 | ),
1988 | 'Old Italic' => array(
1989 | 0 => 0x10300,
1990 | 1 => 0x1032F,
1991 | 2 => 161,
1992 | ),
1993 | 'Gothic' => array(
1994 | 0 => 0x10330,
1995 | 1 => 0x1034F,
1996 | 2 => 162,
1997 | ),
1998 | 'Ugaritic' => array(
1999 | 0 => 0x10380,
2000 | 1 => 0x1039F,
2001 | 2 => 163,
2002 | ),
2003 | 'Old Persian' => array(
2004 | 0 => 0x103A0,
2005 | 1 => 0x103DF,
2006 | 2 => 164,
2007 | ),
2008 | 'Deseret' => array(
2009 | 0 => 0x10400,
2010 | 1 => 0x1044F,
2011 | 2 => 165,
2012 | ),
2013 | 'Shavian' => array(
2014 | 0 => 0x10450,
2015 | 1 => 0x1047F,
2016 | 2 => 166,
2017 | ),
2018 | 'Osmanya' => array(
2019 | 0 => 0x10480,
2020 | 1 => 0x104AF,
2021 | 2 => 167,
2022 | ),
2023 | 'Cypriot Syllabary' => array(
2024 | 0 => 0x10800,
2025 | 1 => 0x1083F,
2026 | 2 => 168,
2027 | ),
2028 | 'Imperial Aramaic' => array(
2029 | 0 => 0x10840,
2030 | 1 => 0x1085F,
2031 | 2 => 169,
2032 | ),
2033 | 'Phoenician' => array(
2034 | 0 => 0x10900,
2035 | 1 => 0x1091F,
2036 | 2 => 170,
2037 | ),
2038 | 'Lydian' => array(
2039 | 0 => 0x10920,
2040 | 1 => 0x1093F,
2041 | 2 => 171,
2042 | ),
2043 | 'Kharoshthi' => array(
2044 | 0 => 0x10A00,
2045 | 1 => 0x10A5F,
2046 | 2 => 172,
2047 | ),
2048 | 'Old South Arabian' => array(
2049 | 0 => 0x10A60,
2050 | 1 => 0x10A7F,
2051 | 2 => 173,
2052 | ),
2053 | 'Avestan' => array(
2054 | 0 => 0x10B00,
2055 | 1 => 0x10B3F,
2056 | 2 => 174,
2057 | ),
2058 | 'Inscriptional Parthian' => array(
2059 | 0 => 0x10B40,
2060 | 1 => 0x10B5F,
2061 | 2 => 175,
2062 | ),
2063 | 'Inscriptional Pahlavi' => array(
2064 | 0 => 0x10B60,
2065 | 1 => 0x10B7F,
2066 | 2 => 176,
2067 | ),
2068 | 'Old Turkic' => array(
2069 | 0 => 0x10C00,
2070 | 1 => 0x10C4F,
2071 | 2 => 177,
2072 | ),
2073 | 'Rumi Numeral Symbols' => array(
2074 | 0 => 0x10E60,
2075 | 1 => 0x10E7F,
2076 | 2 => 178,
2077 | ),
2078 | 'Brahmi' => array(
2079 | 0 => 0x11000,
2080 | 1 => 0x1107F,
2081 | 2 => 179,
2082 | ),
2083 | 'Kaithi' => array(
2084 | 0 => 0x11080,
2085 | 1 => 0x110CF,
2086 | 2 => 180,
2087 | ),
2088 | 'Cuneiform' => array(
2089 | 0 => 0x12000,
2090 | 1 => 0x123FF,
2091 | 2 => 181,
2092 | ),
2093 | 'Cuneiform Numbers and Punctuation' => array(
2094 | 0 => 0x12400,
2095 | 1 => 0x1247F,
2096 | 2 => 182,
2097 | ),
2098 | 'Egyptian Hieroglyphs' => array(
2099 | 0 => 0x13000,
2100 | 1 => 0x1342F,
2101 | 2 => 183,
2102 | ),
2103 | 'Bamum Supplement' => array(
2104 | 0 => 0x16800,
2105 | 1 => 0x16A3F,
2106 | 2 => 184,
2107 | ),
2108 | 'Kana Supplement' => array(
2109 | 0 => 0x1B000,
2110 | 1 => 0x1B0FF,
2111 | 2 => 185,
2112 | ),
2113 | 'Byzantine Musical Symbols' => array(
2114 | 0 => 0x1D000,
2115 | 1 => 0x1D0FF,
2116 | 2 => 186,
2117 | ),
2118 | 'Musical Symbols' => array(
2119 | 0 => 0x1D100,
2120 | 1 => 0x1D1FF,
2121 | 2 => 187,
2122 | ),
2123 | 'Ancient Greek Musical Notation' => array(
2124 | 0 => 0x1D200,
2125 | 1 => 0x1D24F,
2126 | 2 => 188,
2127 | ),
2128 | 'Tai Xuan Jing Symbols' => array(
2129 | 0 => 0x1D300,
2130 | 1 => 0x1D35F,
2131 | 2 => 189,
2132 | ),
2133 | 'Counting Rod Numerals' => array(
2134 | 0 => 0x1D360,
2135 | 1 => 0x1D37F,
2136 | 2 => 190,
2137 | ),
2138 | 'Mathematical Alphanumeric Symbols' => array(
2139 | 0 => 0x1D400,
2140 | 1 => 0x1D7FF,
2141 | 2 => 191,
2142 | ),
2143 | 'Mahjong Tiles' => array(
2144 | 0 => 0x1F000,
2145 | 1 => 0x1F02F,
2146 | 2 => 192,
2147 | ),
2148 | 'Domino Tiles' => array(
2149 | 0 => 0x1F030,
2150 | 1 => 0x1F09F,
2151 | 2 => 193,
2152 | ),
2153 | 'Playing Cards' => array(
2154 | 0 => 0x1F0A0,
2155 | 1 => 0x1F0FF,
2156 | 2 => 194,
2157 | ),
2158 | 'Enclosed Alphanumeric Supplement' => array(
2159 | 0 => 0x1F100,
2160 | 1 => 0x1F1FF,
2161 | 2 => 195,
2162 | ),
2163 | 'Enclosed Ideographic Supplement' => array(
2164 | 0 => 0x1F200,
2165 | 1 => 0x1F2FF,
2166 | 2 => 196,
2167 | ),
2168 | 'Miscellaneous Symbols And Pictographs' => array(
2169 | 0 => 0x1F300,
2170 | 1 => 0x1F5FF,
2171 | 2 => 197,
2172 | ),
2173 | 'Emoticons' => array(
2174 | 0 => 0x1F600,
2175 | 1 => 0x1F64F,
2176 | 2 => 198,
2177 | ),
2178 | 'Transport And Map Symbols' => array(
2179 | 0 => 0x1F680,
2180 | 1 => 0x1F6FF,
2181 | 2 => 199,
2182 | ),
2183 | 'Alchemical Symbols' => array(
2184 | 0 => 0x1F700,
2185 | 1 => 0x1F77F,
2186 | 2 => 200,
2187 | ),
2188 | 'CJK Unified Ideographs Extension B' => array(
2189 | 0 => 0x20000,
2190 | 1 => 0x2A6DF,
2191 | 2 => 201,
2192 | ),
2193 | 'CJK Unified Ideographs Extension C' => array(
2194 | 0 => 0x2A700,
2195 | 1 => 0x2B73F,
2196 | 2 => 202,
2197 | ),
2198 | 'CJK Unified Ideographs Extension D' => array(
2199 | 0 => 0x2B740,
2200 | 1 => 0x2B81F,
2201 | 2 => 203,
2202 | ),
2203 | 'CJK Compatibility Ideographs Supplement' => array(
2204 | 0 => 0x2F800,
2205 | 1 => 0x2FA1F,
2206 | 2 => 204,
2207 | ),
2208 | 'Tags' => array(
2209 | 0 => 0xE0000,
2210 | 1 => 0xE007F,
2211 | 2 => 205,
2212 | ),
2213 | 'Variation Selectors Supplement' => array(
2214 | 0 => 0xE0100,
2215 | 1 => 0xE01EF,
2216 | 2 => 206,
2217 | ),
2218 | 'Supplementary Private Use Area-A' => array(
2219 | 0 => 0xF0000,
2220 | 1 => 0xFFFFF,
2221 | 2 => 207,
2222 | ),
2223 | 'Supplementary Private Use Area-B' => array(
2224 | 0 => 0x100000,
2225 | 1 => 0x10FFFF,
2226 | 2 => 208,
2227 | ),
2228 | );
2229 |
/**
 * Instantiation is disabled: the methods of this class are meant to be called statically only.
 */
private function __construct()
{
}
2232 |
/**
 * Removes combining diacritical marks from a string and, on request, records
 * what was removed so that self::diactrical_restore() can put it back later.
 *
 * Extra characters to strip (for example a soft hyphen) can be passed in
 * $additional_chars; each one is matched literally, byte for byte.
 *
 * When $is_can_restored is TRUE, $restore_table is filled with:
 *   'offsets' => array(position in the stripped string => removed sequence),
 *   'length'  => length of the stripped string,
 * both measured with self::strlen() (character positions, not byte positions).
 * If nothing was removed, $restore_table is left as an empty array.
 *
 * NOTE(review): ReflectionTypeHint::isValid() presumably validates the passed
 * arguments against the parameter tags of this docblock, so keep those tags
 * in sync with the signature -- confirm before editing them.
 *
 * @param string|null $s
 * @param array|null $additional_chars for example: "\xc2\xad" #soft hyphen = discretionary hyphen
 * @param bool $is_can_restored
 * @param array|null &$restore_table
 * @return string|bool|null Returns FALSE if error occurred
 */
public static function diactrical_remove($s, $additional_chars = null, $is_can_restored = false, &$restore_table = null)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($s)) return $s;

    $alternatives = array(self::$diactrical_re);
    if ($additional_chars)
    {
        foreach ($additional_chars as $char)
        {
            $char = (string) $char;
            #an empty alternative would match everywhere and break preg_split() below
            if ($char === '') continue;
            #the pattern uses the "x" modifier, so literal whitespace and "#" would be ignored or
            #treated as a comment; \xHH escapes match the same bytes and are immune to that
            $alternatives[] = '\\x' . implode('\\x', str_split(bin2hex($char), 2));
        }
    }
    $re = '/((?>' . implode('|', $alternatives) . ')+)/sxSX';

    if (! $is_can_restored)
    {
        $stripped = preg_replace($re, '', $s);
        #preg_replace() returns NULL on a PCRE failure; report it as the documented FALSE
        return is_null($stripped) ? false : $stripped;
    }

    $restore_table = array();
    $parts = preg_split($re, $s, -1, PREG_SPLIT_DELIM_CAPTURE);
    if ($parts === false) return false;
    $count = count($parts);
    if ($count === 1) return $s; #nothing was removed

    #with PREG_SPLIT_DELIM_CAPTURE even indexes hold kept text, odd indexes hold removed sequences
    $pos = 0;
    $s2 = '';
    for ($i = 0; $i < $count - 1; $i += 2)
    {
        $s2 .= $parts[$i];
        #remember character (not byte!) positions
        $pos += self::strlen($parts[$i]);
        $restore_table['offsets'][$pos] = $parts[$i + 1];
    }
    $tail = $parts[$count - 1];
    $restore_table['length'] = $pos + self::strlen($tail);
    return $s2 . $tail;
}
2272 |
/**
 * Restores the combining diacritical marks removed by self::diactrical_remove().
 *
 * This works only if the character positions and the number of characters of
 * the text have not changed since the marks were removed: the length stored in
 * $restore_table['length'] must equal self::strlen($s).
 *
 * An empty $restore_table means "nothing to restore" and returns $s unchanged.
 *
 * NOTE(review): ReflectionTypeHint::isValid() presumably validates the passed
 * arguments against the parameter tags of this docblock, so keep those tags
 * in sync with the signature -- confirm before editing them.
 *
 * @see self::diactrical_remove()
 * @param string|null $s
 * @param array $restore_table
 * @return string|bool|null Returns FALSE if error occurred (broken $restore_table)
 */
public static function diactrical_restore($s, array $restore_table)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($s)) return $s;

    if (! $restore_table) return $s;
    if (! isset($restore_table['length'], $restore_table['offsets']) ||
        ! is_int($restore_table['length']) ||
        ! is_array($restore_table['offsets']) ||
        $restore_table['length'] !== self::strlen($s)) return false;

    $offset = 0;
    $s2 = '';
    foreach ($restore_table['offsets'] as $pos => $diactricals)
    {
        #a broken table (non-integer, decreasing or out-of-range position, non-string value)
        #is reported as an error instead of producing a garbled string
        if (! is_int($pos) || $pos < $offset || $pos > $restore_table['length'] || ! is_string($diactricals)) return false;
        $s2 .= self::substr($s, $offset, $pos - $offset) . $diactricals;
        $offset = $pos;
    }
    #strlen($s) is used only as an upper bound for the length, i.e. "up to the end of the string"
    return $s2 . self::substr($s, $offset, strlen($s));
}
2303 |
/**
 * Recodes data from the given character encoding to UTF-8.
 * Thin wrapper around self::_convert().
 *
 * @param array|scalar|null $data
 * @param string $charset
 * @return array|scalar|null Returns FALSE if error occurred
 */
public static function convert_from($data, $charset = 'cp1251')
{
    return ReflectionTypeHint::isValid()
        ? self::_convert($data, $charset, 'UTF-8')
        : false;
}
2316 |
/**
 * Recodes data from UTF-8 to the given character encoding.
 * Thin wrapper around self::_convert().
 *
 * @param array|scalar|null $data
 * @param string $charset
 * @return array|scalar|null Returns FALSE if error occurred
 */
public static function convert_to($data, $charset = 'cp1251')
{
    return ReflectionTypeHint::isValid()
        ? self::_convert($data, 'UTF-8', $charset)
        : false;
}
2329 |
/**
 * Recodes data of any structure between two character encodings.
 * Arrays are walked recursively; both keys and values are recoded.
 * Non-string scalars and NULL are returned unchanged.
 *
 * For strings the converters are tried in this order: iconv, mbstring,
 * built-in table based fallbacks, recode_string().
 *
 * @see mb_encoding_aliases()
 * @param array|scalar|null $data
 * @param string $charset_from
 * @param string $charset_to
 * @return array|scalar|null Returns FALSE if error occurred
 */
private static function _convert($data, $charset_from, $charset_to)
{
    if (! ReflectionTypeHint::isValid()) return false; #for recursive calls
    if ($charset_from === $charset_to) return $data;

    if (is_array($data))
    {
        $recoded = array();
        foreach ($data as $key => $value)
        {
            $key = self::_convert($key, $charset_from, $charset_to);
            if ($key === false) return false;
            $recoded[$key] = self::_convert($value, $charset_from, $charset_to);
            #a FALSE result is an error unless the source value itself was a boolean
            if ($recoded[$key] === false && ! is_bool($value)) return false;
        }
        return $recoded;
    }

    if (! is_string($data)) return $data;

    #leave the string alone when it is not what the caller claims (or is already UTF-8)
    if ($charset_from === 'UTF-8' && ! self::is_utf8($data)) return $data;
    if ($charset_to === 'UTF-8' && self::is_utf8($data)) return $data;

    #native extensions first
    if (function_exists('iconv'))
    {
        return iconv($charset_from, $charset_to . '//IGNORE//TRANSLIT', $data);
    }
    if (function_exists('mb_convert_encoding'))
    {
        return mb_convert_encoding($data, $charset_to, $charset_from);
    }

    #fallbacks selected by the source charset
    if (in_array($charset_from, array('UTF-16', 'UCS-2'), true))
    {
        return self::_convert_from_utf16($data);
    }
    if (in_array($charset_from, array('cp1251', 'cp1259'), true))
    {
        return strtr($data, self::$cp1259_table);
    }
    #other Cyrillic charsets go through convert_cyr_string() to windows-1251 first
    $cyr_codes = array(
        'koi8-r'       => 'k',
        'KOI8-R'       => 'k',
        'iso8859-5'    => 'i',
        'cp866'        => 'a',
        'mac-cyrillic' => 'm',
    );
    foreach ($cyr_codes as $charset_name => $cyr_code)
    {
        if ($charset_from === $charset_name)
        {
            return strtr(convert_cyr_string($data, $cyr_code, 'w'), self::$cp1259_table);
        }
    }

    #fallback selected by the target charset
    if (in_array($charset_to, array('cp1251', 'cp1259'), true))
    {
        return strtr($data, array_flip(self::$cp1259_table));
    }

    #last attempt
    if (function_exists('recode_string'))
    {
        $result = @recode_string($charset_from . '..' . $charset_to, $data);
        if (is_string($result)) return $result;
    }

    trigger_error('Convert "' . $charset_from . '" --> "' . $charset_to . '" is not supported native, "iconv" or "mbstring" extension required', E_USER_WARNING);
    return false;
}
2389 |
/**
 * Converts a UTF-16 / UCS-2 encoded string to UTF-8.
 * UTF-16 surrogate pairs are supported.
 *
 * iconv or mbstring is used when available; otherwise the string is decoded
 * manually, 16 bits at a time. In the manual decoder every broken surrogate
 * (a high surrogate without a following low one, or a low surrogate without
 * a preceding high one) yields one U+FFFD REPLACEMENT CHARACTER and no other
 * code unit is lost.
 *
 * @param string $s
 * @param string $type 'BE' -- big endian byte order
 *                     'LE' -- little endian byte order
 * @param bool $to_array returns array chars instead whole string?
 * @return string|array|bool UTF-8 string, array chars or FALSE if error occurred
 */
private static function _convert_from_utf16($s, $type = 'BE', $to_array = false)
{
    static $types = array(
        'BE' => 'n', #unsigned short (always 16 bit, big endian byte order)
        'LE' => 'v', #unsigned short (always 16 bit, little endian byte order)
    );
    if (! array_key_exists($type, $types))
    {
        trigger_error('Unexpected value in 2-nd parameter, "' . $type . '" given!', E_USER_WARNING);
        return false;
    }

    #the fastest way:
    if (function_exists('iconv') || function_exists('mb_convert_encoding'))
    {
        if (function_exists('iconv')) $s = iconv('UTF-16' . $type, 'UTF-8', $s);
        else $s = mb_convert_encoding($s, 'UTF-8', 'UTF-16' . $type);
        #a failed conversion must not be handed to str_split()
        if (! is_string($s)) return false;
        if (! $to_array) return $s;
        return self::str_split($s);
    }

    /*
    http://en.wikipedia.org/wiki/UTF-16

    UTF-16 represents characters outside the BMP (U+10000 through U+10FFFF)
    with a pair of 16-bit words, known as a surrogate pair.
    First 0x10000 is subtracted from the code point to give a 20-bit value.
    This is then split into two 10-bit halves: the most significant half goes
    into the first (high) surrogate, 0xD800-0xDBFF, the least significant half
    into the second (low) surrogate, 0xDC00-0xDFFF.

    For example, U+10000 becomes 0xD800 0xDC00 and U+10FFFD becomes 0xDBFF 0xDFFD.
    Code points in U+D800-U+DFFF are never assigned to characters, so a single
    surrogate never represents a character on its own.

    Conversion of a Unicode scalar value S to a surrogate pair:
      H = floor((S - 0x10000) / 0x400) + 0xD800;
      L = ((S - 0x10000) % 0x400) + 0xDC00;
    Conversion of a surrogate pair to a scalar value:
      N = ((H - 0xD800) * 0x400) + (L - 0xDC00) + 0x10000;
    */
    $units = unpack($types[$type] . '*', $s);
    if (! is_array($units)) return false;

    $replacement = "\xEF\xBF\xBD"; #U+FFFD REPLACEMENT CHARACTER (for broken char)
    $a = array();
    $hi = false; #pending high surrogate, FALSE if none
    foreach ($units as $unit)
    {
        if ($hi !== false)
        {
            if ($unit >= 0xDC00 && $unit <= 0xDFFF)
            {
                #valid pair
                $a[] = self::chr((($hi - 0xD800) * 0x400) + ($unit - 0xDC00) + 0x10000);
                $hi = false;
                continue;
            }
            #the pending high surrogate is broken; the current unit is processed below
            $a[] = $replacement;
            $hi = false;
        }
        if ($unit >= 0xD800 && $unit <= 0xDBFF) $hi = $unit;                  #high surrogate was found
        elseif ($unit >= 0xDC00 && $unit <= 0xDFFF) $a[] = $replacement;      #low surrogate without a high one
        else $a[] = self::chr($unit);                                         #not surrogate
    }
    #input ended in the middle of a surrogate pair
    if ($hi !== false) $a[] = $replacement;

    return $to_array ? $a : implode('', $a);
}
2475 |
/**
 * Removes device control codes of the ASCII range from a string.
 * Bytes 0x00-0x08, 0x0B, 0x0C and 0x0E-0x1F are removed;
 * tab (0x09), line feed (0x0A) and carriage return (0x0D) are kept.
 *
 * @param string|null $s String to clean
 * @return string|bool|null Returns FALSE if error occurred
 */
public static function strict($s)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if ($s === null) return null;

    $control_codes_re = '/[\x00-\x08\x0B\x0C\x0E-\x1F]+/sSX';
    return preg_replace($control_codes_re, '', $s);
}
2488 |
/**
 * Tells whether the data consists of ASCII characters only.
 * For null, integer, float, boolean returns TRUE.
 *
 * Arrays are walked recursively (keys and values); if at least one
 * key or value is not ASCII, FALSE is returned.
 * Objects and resources always give FALSE.
 *
 * @param array|scalar|null $data
 * @return bool
 */
public static function is_ascii($data)
{
    if (! ReflectionTypeHint::isValid()) return false;

    #ltrim() is a little faster than preg_match()
    if (is_string($data)) return ltrim($data, "\x00..\x7f") === '';

    if (! is_array($data))
    {
        #null, integer, float, boolean => TRUE; object or resource => FALSE
        return is_scalar($data) || is_null($data);
    }

    foreach ($data as $key => $value)
    {
        if (! (self::is_ascii($key) && self::is_ascii($value))) return false;
    }
    return true;
}
2516 |
/**
 * Tells whether the data is valid UTF-8.
 * For null, integer, float, boolean returns TRUE.
 *
 * Arrays are walked recursively (keys and values); if at least one
 * key or value is not UTF-8, FALSE is returned.
 * Objects and resources always give FALSE.
 *
 * @link http://www.w3.org/International/questions/qa-forms-utf-8.html
 * @link http://ru3.php.net/mb_detect_encoding
 * @link http://webtest.philigon.ru/articles/utf8/
 * @link http://unicode.coeurlumiere.com/
 * @param array|scalar|null $data
 * @param bool $is_strict strict the range of ASCII?
 * @return bool
 */
public static function is_utf8($data, $is_strict = true)
{
    if (! ReflectionTypeHint::isValid()) return false;

    if (is_array($data))
    {
        foreach ($data as $key => $value)
        {
            if (! (self::is_utf8($key, $is_strict) && self::is_utf8($value, $is_strict))) return false;
        }
        return true;
    }

    if (! is_string($data))
    {
        #null, integer, float, boolean => TRUE; object or resource => FALSE
        return is_scalar($data) || is_null($data);
    }

    #an empty pattern with the /u modifier lets PCRE validate the subject as UTF-8;
    #per the original author this is much faster than mb_check_encoding($data, 'UTF-8')
    if (! preg_match('~~suSX', $data)) return false;
    if (function_exists('preg_last_error') && preg_last_error() !== PREG_NO_ERROR) return false;

    #strict mode additionally rejects any byte outside the listed ranges
    return ! ($is_strict && preg_match('/[^\x09\x0A\x0D\x20-\xBF\xC2-\xF7]/sSX', $data));
}
2555 |
/**
 * Tries to detect whether a string is UTF-8 by walking its bytes.
 * Lead bytes of 2 to 6 byte sequences are recognised and each must be
 * followed by the matching number of 10bbbbbb continuation bytes.
 *
 * @deprecated Slowly, use self::is_utf8() instead
 * @see self::is_utf8()
 * @param string $s text
 * @param bool $is_strict strict check of the ASCII range?
 * @return bool
 */
public static function check($s, $is_strict = true)
{
    if (! ReflectionTypeHint::isValid()) return false;

    $len = strlen($s);
    $i = 0;
    while ($i < $len)
    {
        $byte = ord($s[$i]);

        #1 byte 0bbbbbbb: accepted as is, unless strict mode forbids this control code
        $is_accepted_ascii = $byte < 0x80
            && ($is_strict === false
                || ($byte > 0x1F && $byte < 0x7F)
                || $byte == 0x09 || $byte == 0x0A || $byte == 0x0D);
        if ($is_accepted_ascii)
        {
            $i++;
            continue;
        }

        #how many continuation bytes must follow the lead byte
        if (($byte & 0xE0) == 0xC0) $tail = 1;      #2 bytes 110bbbbb 10bbbbbb
        elseif (($byte & 0xF0) == 0xE0) $tail = 2;  #3 bytes 1110bbbb 10bbbbbb 10bbbbbb
        elseif (($byte & 0xF8) == 0xF0) $tail = 3;  #4 bytes 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb
        elseif (($byte & 0xFC) == 0xF8) $tail = 4;  #5 bytes 111110bb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb
        elseif (($byte & 0xFE) == 0xFC) $tail = 5;  #6 bytes 1111110b 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb
        else return false;                          #does not match any model

        #do $tail bytes matching 10bbbbbb follow?
        while ($tail-- > 0)
        {
            $i++;
            if ($i == $len || (ord($s[$i]) & 0xC0) != 0x80) return false;
        }
        $i++;
    }
    return true;
}
2590 |
/**
 * Checks that every character of UTF-8 data belongs to the given UNICODE ranges.
 * A suitable alternative to regular expressions.
 *
 * For null, integer, float, boolean returns TRUE.
 *
 * Arrays are walked recursively (keys and values).
 * If at least one key or value does not pass the check, FALSE is returned.
 *
 * Each element of $blocks is one of:
 *   - string: a named block, looked up in self::$unicode_blocks
 *   - array:  a direct range array(min, max)
 *   - int:    a single codepoint
 *
 * @example
 *   #A simple check against the standard named ranges:
 *   UTF8::blocks_check('поисковые системы Google и Yandex', array('Basic Latin', 'Cyrillic'));
 *   #Named blocks, direct ranges and codepoints can be mixed:
 *   UTF8::blocks_check('поисковые системы Google и Yandex', array(array(0x20, 0x7E),     #[\x20-\x7E]
 *                                                                 array(0x0410, 0x044F), #[А-Яа-я]
 *                                                                 0x0401, #russian capital yo (Ё)
 *                                                                 0x0451, #russian small yo (ё)
 *                                                                 'Arrows',
 *                                                                ));
 *
 * @link http://www.unicode.org/charts/
 * @param array|scalar|null $data
 * @param array|string $blocks
 * @return bool Returns TRUE if all characters of the text belong to the given ranges
 *              and FALSE otherwise, or if the UTF-8 is broken.
 */
public static function blocks_check($data, $blocks)
{
    if (! ReflectionTypeHint::isValid()) return false;

    if (is_array($data))
    {
        foreach ($data as $k => $v)
        {
            if (! self::blocks_check($k, $blocks) || ! self::blocks_check($v, $blocks)) return false;
        }
        return true;
    }

    if (! is_string($data))
    {
        #null, integer, float, boolean => TRUE; object or resource => FALSE
        return is_scalar($data) || is_null($data);
    }

    $chars = self::str_split($data);
    if ($chars === false) return false; #broken UTF-8
    unset($data); #memory free

    $blocks = (array)$blocks; #loop invariant, normalised once
    $skip = array();          #cache of the characters already checked
    foreach ($chars as $char)
    {
        if (array_key_exists($char, $skip)) continue; #speed improve
        $codepoint = self::ord($char);
        if ($codepoint === false) return false; #broken UTF-8

        $is_valid = false;
        foreach ($blocks as $j => $block)
        {
            if (is_string($block))
            {
                if (! array_key_exists($block, self::$unicode_blocks))
                {
                    trigger_error('Unknown block "' . $block . '"!', E_USER_WARNING);
                    return false;
                }
                list ($min, $max) = self::$unicode_blocks[$block];
            }
            elseif (is_array($block)) list ($min, $max) = $block;
            elseif (is_int($block)) $min = $max = $block;
            else
            {
                trigger_error('A string/array/int type expected for block[' . $j . ']!', E_USER_ERROR);
                #if an error handler lets execution continue, never compare
                #against undefined bounds or bounds left over from the previous block
                continue;
            }
            if ($codepoint >= $min && $codepoint <= $max)
            {
                $is_valid = true;
                break;
            }
        }#foreach
        if (! $is_valid) return false;
        $skip[$char] = null;
    }#foreach
    return true;
}
2671 |
/**
 * Recodes the request superglobals from $charset to UTF-8, if necessary.
 * Thus web forms can be sent to the server in two encodings: $charset and UTF-8.
 * For example: ?тест[тест]=тест
 *
 * How it works:
 *   1) The keys and values of $_GET, $_POST, $_COOKIE and $_FILES are checked for valid UTF-8.
 *   2) Anything that is not UTF-8 is taken to be in $charset and converted to UTF-8.
 *   3) The converted data is checked again. If it still is not UTF-8,
 *      it is considered broken and the function returns FALSE.
 *   4) If anything was converted, $_REQUEST is rebuilt as $_COOKIE + $_POST + $_GET.
 *
 * NOTICE
 *   The function must be called after self::unescape_request()!
 *
 * @see self::unescape_request()
 * @param bool $is_hex2bin Decode HEX data?
 *                         Example: 0xd09ec2a0d0bad0bed0bcd0bfd0b0d0bdd0b8d0b8 => О компании
 *                         URL parameters are sometimes easier to encode not with rawurlencode()
 *                         but with the following (also more compact) scheme:
 *                         '0x' . bin2hex($string)
 * @param string $charset
 * @return bool Returns TRUE if all keys/values of the arrays are in UTF-8
 *              and FALSE + E_USER_WARNING otherwise.
 */
public static function autoconvert_request($is_hex2bin = false, $charset = 'cp1251')
{
    if (! ReflectionTypeHint::isValid()) return false;

    $is_broken = false;
    $is_converted = false;
    $superglobals = array('_GET', '_POST', '_COOKIE', '_FILES');
    foreach ($superglobals as $name)
    {
        if (! array_key_exists($name, $GLOBALS)) continue;
        #array_walk_recursive() cannot be used here,
        #because its callback does not receive the key by reference
        $GLOBALS[$name] = self::_autoconvert_request_recursive($GLOBALS[$name], $is_converted, $is_broken, $is_hex2bin, $charset);
        if ($is_broken)
        {
            trigger_error('Array $' . $name . ' does not have keys/values in UTF-8 charset!', E_USER_WARNING);
            return false;
        }
    }

    if (! $is_converted) return true;

    #keep $_REQUEST in sync with the recoded arrays
    $cookie = isset($_COOKIE) ? $_COOKIE : array();
    $post   = isset($_POST) ? $_POST : array();
    $get    = isset($_GET) ? $_GET : array();
    $_REQUEST = $cookie + $post + $get;
    return true;
}
2725 |
/**
 * Recursive worker of self::autoconvert_request().
 * Walks $data and passes every key and every non-array value through
 * self::_autoconvert_request(). Once $is_broken is set, the walk stops
 * and $data is returned as it is at that moment.
 *
 * @param array|scalar|null $data
 * @param bool $is_converted is set to TRUE when something was converted
 * @param bool $is_broken is set to TRUE when data could not be brought to UTF-8
 * @param bool $is_hex2bin
 * @param string $charset
 * @return array|scalar|null
 */
private static function _autoconvert_request_recursive(&$data, &$is_converted, &$is_broken, $is_hex2bin, $charset)
{
    if ($is_broken) return $data; #speed improve

    if (! is_array($data))
    {
        return self::_autoconvert_request($data, $is_converted, $is_broken, $is_hex2bin, $charset);
    }

    $result = array();
    foreach ($data as $key => &$value)
    {
        $key = self::_autoconvert_request($key, $is_converted, $is_broken, $is_hex2bin, $charset);
        if ($is_broken) return $data; #speed improve
        $result[$key] = self::_autoconvert_request_recursive($value, $is_converted, $is_broken, $is_hex2bin, $charset);
        if ($is_broken) return $data; #speed improve
    }
    return $result;
}
2743 |
/**
 * Brings one key or value of the request to UTF-8 (worker of self::autoconvert_request()).
 * With $is_hex2bin a value of the form 0x<hex pairs> is decoded to binary first.
 * A value that is not UTF-8 is converted from $charset and checked again.
 *
 * @param scalar|null $s is modified in place
 * @param bool $is_converted is set to TRUE when $s was decoded or converted
 * @param bool $is_broken is set to TRUE when $s could not be brought to UTF-8
 * @param bool $is_hex2bin
 * @param string $charset
 * @return scalar|null the resulting value of $s
 */
private static function _autoconvert_request(&$s, &$is_converted, &$is_broken, $is_hex2bin, $charset)
{
    #the cheap strpos() test saves most regexp calls
    if ($is_hex2bin && strpos($s, '0x') === 0 && preg_match('/^0x((?:[\da-fA-F]{2})+)$/sSX', $s, $m))
    {
        $hex = $m[1];
        $s = pack('H' . strlen($hex), $hex); #hex2bin()
        $is_converted = true;
    }

    if (self::is_utf8($s)) return $s;

    $s = self::convert_from($s, $charset);
    if ($s === false)
    {
        $is_broken = true;
        return $s;
    }
    if (self::is_utf8($s))
    {
        $is_converted = true;
        return $s;
    }

    trigger_error('String 0x ' . substr(bin2hex($s), 0, 100) . '... is not UTF-8!', E_USER_WARNING);
    $is_broken = true;
    return $s;
}
2765 |
/**
 * Compares two strings.
 * When the intl extension is available the comparison is done by a Collator
 * for the given locale, otherwise by the native byte-wise strcmp().
 *
 * @param string|null $s1
 * @param string|null $s2
 * @param string $locale For example, 'en_CA', 'ru_RU'
 * @return int|bool|null Returns FALSE if error occurred
 *                       Returns < 0 if $s1 is less than $s2;
 *                               > 0 if $s1 is greater than $s2;
 *                               0 if they are equal.
 *                       Returns NULL if $s1 or $s2 is NULL.
 */
public static function strcmp($s1, $s2, $locale = '')
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($s1) || is_null($s2)) return null;
    if (! function_exists('collator_create')) return strcmp($s1, $s2);

    # PHP 5 >= 5.3.0, PECL intl >= 1.0.0
    # If empty string ("") or "root" are passed, UCA rules will be used.
    # collator_create() returns NULL on failure, while "new Collator()" never
    # evaluates to FALSE, so the error branch below would be unreachable with it.
    $collator = collator_create($locale);
    if (! $collator)
    {
        trigger_error(intl_get_error_message(), E_USER_WARNING);
        return false;
    }
    return $collator->compare($s1, $s2);
}
2793 |
/**
 * Compares the first N characters of two strings.
 * Both strings are cut with self::substr() and compared with self::strcmp().
 *
 * @param string|null $s1
 * @param string|null $s2
 * @param int $length
 * @return int|bool|null Returns FALSE if error occurred
 *                       Returns < 0 if $s1 is less than $s2;
 *                               > 0 if $s1 is greater than $s2;
 *                               0 if they are equal.
 *                       Returns NULL if $s1 or $s2 is NULL.
 */
public static function strncmp($s1, $s2, $length)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if ($s1 === null || $s2 === null) return null;

    $head1 = self::substr($s1, 0, $length);
    $head2 = self::substr($s2, 0, $length);
    return self::strcmp($head1, $head2);
}
2811 |
/**
 * Case-insensitive string comparison for UTF-8 strings.
 * Both strings are lowercased with self::lowercase() and compared with self::strcmp().
 *
 * @param string|null $s1
 * @param string|null $s2
 * @return int|bool|null Returns FALSE if error occurred
 *                       Returns < 0 if $s1 is less than $s2;
 *                               > 0 if $s1 is greater than $s2;
 *                               0 if they are equal.
 *                       Returns NULL if $s1 or $s2 is NULL.
 */
public static function strcasecmp($s1, $s2)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if ($s1 === null || $s2 === null) return null;

    $lower1 = self::lowercase($s1);
    $lower2 = self::lowercase($s2);
    return self::strcmp($lower1, $lower2);
}
2828 |
/**
 * Converts a UTF-8 string to a list of UNICODE codepoints.
 * The string is recoded to UCS-4BE by iconv or mbstring and unpacked as
 * 32-bit integers; without these extensions it is split into characters
 * and every character goes through self::ord().
 *
 * @param string|null $s UTF-8 string
 * @return array|bool|null Unicode codepoints
 *                         Returns FALSE if $s broken (not UTF-8)
 */
public static function to_unicode($s)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if ($s === null) return null;

    #since PHP-5.3.x iconv() is a little faster than mb_convert_encoding()
    $has_iconv = function_exists('iconv');
    if ($has_iconv || function_exists('mb_convert_encoding'))
    {
        $ucs4 = $has_iconv
            ? @iconv('UTF-8', 'UCS-4BE', $s)
            : @mb_convert_encoding($s, 'UCS-4BE', 'UTF-8');
        if (is_string($ucs4)) return array_values(unpack('N*', $ucs4));
        if ($ucs4 !== null) return false;
    }

    $chars = self::str_split($s);
    if ($chars === false) return false;
    return array_map(array(__CLASS__, 'ord'), $chars);
}
2852 |
/**
 * Converts a list of UNICODE codepoints to a UTF-8 string.
 * The codepoints are packed as 32-bit big endian integers (UCS-4BE) and
 * recoded by iconv or mbstring; without these extensions every codepoint
 * goes through self::chr().
 *
 * @param array|null $a Unicode codepoints
 * @return string|bool|null UTF-8 string
 *                          Returns FALSE if error occurred
 */
public static function from_unicode($a)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if ($a === null) return null;

    #since PHP-5.3.x iconv() is a little faster than mb_convert_encoding()
    $has_iconv = function_exists('iconv');
    if (! $has_iconv && ! function_exists('mb_convert_encoding'))
    {
        return implode('', array_map(array(__CLASS__, 'chr'), $a));
    }

    $ucs4 = '';
    foreach ($a as $codepoint) $ucs4 .= pack('N', $codepoint);

    $utf8 = $has_iconv
        ? @iconv('UCS-4BE', 'UTF-8', $ucs4)
        : mb_convert_encoding($ucs4, 'UTF-8', 'UCS-4BE');
    return is_string($utf8) ? $utf8 : false;
}
2883 |
/**
 * Converts a UTF-8 character to a UNICODE codepoint.
 * The codepoint is assembled from the payload bits of the 1 to 4 bytes
 * of the character; results are cached per character.
 *
 * @param string|null $char UTF-8 character
 * @return int|bool|null Unicode codepoint
 *                       Returns FALSE if $char broken (not UTF-8)
 */
public static function ord($char)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($char)) return $char;

    static $cache = array();
    if (array_key_exists($char, $cache)) return $cache[$char]; #speed improve

    #square brackets are used for byte access: the curly brace offset syntax
    #is deprecated since PHP 7.4 and is a parse error since PHP 8.0
    switch (strlen($char))
    {
        case 1 : return $cache[$char] = ord($char);
        case 2 : return $cache[$char] = (ord($char[1]) & 63) |
                                        ((ord($char[0]) & 31) << 6);
        case 3 : return $cache[$char] = (ord($char[2]) & 63) |
                                        ((ord($char[1]) & 63) << 6) |
                                        ((ord($char[0]) & 15) << 12);
        case 4 : return $cache[$char] = (ord($char[3]) & 63) |
                                        ((ord($char[2]) & 63) << 6) |
                                        ((ord($char[1]) & 63) << 12) |
                                        ((ord($char[0]) & 7)  << 18);
        default :
            trigger_error('Character 0x' . bin2hex($char) . ' is not UTF-8!', E_USER_WARNING);
            return false;
    }
}
2916 |
/**
 * Converts a UNICODE codepoint to a UTF-8 character.
 * Codepoints up to 0x10FFFF are encoded into 1 to 4 bytes; anything larger
 * gives U+FFFD REPLACEMENT CHARACTER. Results are cached per codepoint.
 *
 * @param int|digit|null $cp Unicode codepoint
 * @return string|bool|null UTF-8 character
 *                          Returns FALSE if error occurred
 */
public static function chr($cp)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if ($cp === null) return null;

    static $cache = array();
    if (array_key_exists($cp, $cache)) return $cache[$cp]; #speed improve

    if ($cp <= 0x7f)
    {
        $char = chr($cp);
    }
    elseif ($cp <= 0x7ff)
    {
        $char = chr(0xc0 | ($cp >> 6))
              . chr(0x80 | ($cp & 0x3f));
    }
    elseif ($cp <= 0xffff)
    {
        $char = chr(0xe0 | ($cp >> 12))
              . chr(0x80 | (($cp >> 6) & 0x3f))
              . chr(0x80 | ($cp & 0x3f));
    }
    elseif ($cp <= 0x10ffff)
    {
        $char = chr(0xf0 | ($cp >> 18))
              . chr(0x80 | (($cp >> 12) & 0x3f))
              . chr(0x80 | (($cp >> 6) & 0x3f))
              . chr(0x80 | ($cp & 0x3f));
    }
    else
    {
        $char = "\xEF\xBF\xBD"; #U+FFFD REPLACEMENT CHARACTER
    }
    return $cache[$cp] = $char;
}
2945 |
/**
 * chunk_split() for UTF-8 strings.
 * The string is split into chunks of $length characters with self::str_split()
 * and the chunks are joined with $glue by implode(), so no glue is added
 * after the last chunk.
 * A $length below 1 falls back to 76, an empty $glue falls back to "\r\n".
 *
 * @param string|null $s
 * @param int|digit|null $length
 * @param string|null $glue
 * @return string|bool|null Returns FALSE if error occurred
 */
public static function chunk_split($s, $length = null, $glue = null)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if ($s === null) return null;

    $length = (int)$length;
    if ($length < 1) $length = 76;

    $glue = (string)$glue;
    if ($glue === '') $glue = "\r\n";

    $chunks = self::str_split($s, $length);
    return is_array($chunks) ? implode($glue, $chunks) : false;
}
2966 |
/**
 * Changes the case of all string keys of an array.
 * Integer keys are left as they are; the values are not touched.
 * A non-array argument is returned unchanged.
 *
 * @param array|null $a
 * @param int $mode {CASE_LOWER|CASE_UPPER}
 * @return array|bool|null Returns FALSE if error occurred
 */
public static function array_change_key_case($a, $mode)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (! is_array($a)) return $a;

    $result = array();
    foreach ($a as $key => $value)
    {
        if (! is_string($key))
        {
            $result[$key] = $value;
            continue;
        }
        $new_key = self::convert_case($key, $mode);
        if ($new_key === false) return false;
        $result[$new_key] = $value;
    }
    return $result;
}
2990 |
/**
 * Converts the letter case of data in UTF-8 encoding.
 * Arrays are walked recursively; only the values are converted,
 * the keys stay unchanged.
 * To convert the keys only, use self::array_change_key_case().
 * Non-string values and empty strings are returned unchanged.
 *
 * @see self::array_change_key_case()
 * @link http://www.unicode.org/charts/PDF/U0400.pdf
 * @link http://ru.wikipedia.org/wiki/ISO_639-1
 * @param array|scalar|null $data Data of arbitrary structure
 * @param int $mode {CASE_LOWER|CASE_UPPER}
 * @param bool $is_ascii_optimization for speed improve
 * @return array|scalar|bool|null Returns FALSE if error occurred
 */
public static function convert_case($data, $mode, $is_ascii_optimization = true)
{
    if (! ReflectionTypeHint::isValid()) return false;

    if (is_array($data))
    {
        #the optimization flag must reach the nested values as well
        foreach ($data as $k => &$v) $v = self::convert_case($v, $mode, $is_ascii_optimization);
        unset($v); #do not leave a dangling reference to the last element
        return $data;
    }
    if (! is_string($data) || ! $data) return $data;

    if ($mode === CASE_UPPER)
    {
        if ($is_ascii_optimization && self::is_ascii($data)) return strtoupper($data); #speed improve!
        #per the original author, since PHP-5.3.x strtr() is 2-3 times faster than mb_strtoupper()
        return strtr($data, array_flip(self::$convert_case_table));
    }
    if ($mode === CASE_LOWER)
    {
        if ($is_ascii_optimization && self::is_ascii($data)) return strtolower($data); #speed improve!
        #per the original author, since PHP-5.3.x strtr() is 2-3 times faster than mb_strtolower()
        return strtr($data, self::$convert_case_table);
    }
    trigger_error('Parameter 2 should be a constant of CASE_LOWER or CASE_UPPER!', E_USER_WARNING);
    return $data;
}
3033 |
/**
 * Converts data to lower case.
 * Shortcut for self::convert_case($data, CASE_LOWER).
 *
 * @param array|scalar|null $data
 * @return scalar|bool|null Returns FALSE if error occurred
 */
public static function lowercase($data)
{
    return ReflectionTypeHint::isValid()
        ? self::convert_case($data, CASE_LOWER)
        : false;
}
3044 |
/**
 * Converts data to upper case.
 * Shortcut for self::convert_case($data, CASE_UPPER).
 *
 * @param array|scalar|null $data
 * @return scalar|bool|null Returns FALSE if error occurred
 */
public static function uppercase($data)
{
    return ReflectionTypeHint::isValid()
        ? self::convert_case($data, CASE_UPPER)
        : false;
}
3056 |
/**
 * Converts data to lower case.
 * Shortcut for self::convert_case($data, CASE_LOWER).
 *
 * @param array|scalar|null $data
 * @return scalar|bool|null Returns FALSE if error occurred
 */
public static function strtolower($data)
{
    return ReflectionTypeHint::isValid()
        ? self::convert_case($data, CASE_LOWER)
        : false;
}
3068 |
/**
 * Converts data to upper case.
 * Shortcut for self::convert_case($data, CASE_UPPER).
 *
 * @param array|scalar|null $data
 * @return scalar|bool|null Returns FALSE if error occurred
 */
public static function strtoupper($data)
{
    return ReflectionTypeHint::isValid()
        ? self::convert_case($data, CASE_UPPER)
        : false;
}
3080 |
3081 |
/**
 * Converts HTML entities to native UTF-8 characters.
 * Named entities are replaced through self::$html_entity_table;
 * decimal (&#NNN;) and hexadecimal (&#xHHH;) entities are converted to UTF-8 too.
 *
 * Example: '&quot;' or '&#34;' or '&#x22;' will be converted to '"'
 * (when $is_special_chars is TRUE).
 *
 * When $is_special_chars is FALSE the special characters stay encoded:
 * their named entities are not decoded and a numeric entity of such a character
 * is replaced with the entity found for that character in the flipped
 * self::$html_special_chars_table.
 *
 * @link http://www.htmlhelp.com/reference/html40/entities/
 * @link http://www.alanwood.net/demos/ent4_frame.html (HTML 4.01 Character Entity References)
 * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset1.asp?frame=true
 * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp?frame=true
 * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp?frame=true
 *
 * @param scalar|null $s
 * @param bool $is_special_chars Also decode the special html entities? (&lt; &gt; &amp; &quot;)
 * @return scalar|null Returns FALSE if error occurred
 */
public static function html_entity_decode($s, $is_special_chars = false)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (! is_string($s)) return $s;

    #speed improve: the shortest entity takes 4 bytes (&#d; or &xx;)
    if (strlen($s) < 4) return $s;
    #the assignment is done separately: "$pos = strpos(...) === false"
    #would store the result of the comparison, not the position
    $pos = strpos($s, '&');
    if ($pos === false || strpos($s, ';', $pos) === false) return $s;

    $table = self::$html_entity_table;
    if ($is_special_chars) $table += self::$html_special_chars_table;

    #replace named entities
    $s = strtr($s, $table);

    #replace numeric dec and hex entities
    if (strpos($s, '&#') !== false) #speed improve
    {
        $class = __CLASS__;
        $html_special_chars_table_flipped = array_flip(self::$html_special_chars_table);
        $s = preg_replace_callback('/&#((x)[\da-fA-F]{1,6}+|\d{1,7}+);/sSX',
                                   function (array $m) use ($class, $html_special_chars_table_flipped, $is_special_chars)
                                   {
                                       #$m[1] is "x<hex digits>" or "<dec digits>", $m[2] is "x" for hex entities
                                       $is_hex = isset($m[2]) && $m[2] === 'x';
                                       $codepoint = $is_hex ? hexdec(substr($m[1], 1)) : intval($m[1]);
                                       #only a real ASCII codepoint can be a special char: packing a larger
                                       #codepoint into one byte would keep its low 8 bits only
                                       if (! $is_special_chars && $codepoint < 0x80)
                                       {
                                           $char = chr($codepoint);
                                           if (array_key_exists($char, $html_special_chars_table_flipped)) return $html_special_chars_table_flipped[$char];
                                       }
                                       return $class::chr($codepoint);
                                   }, $s);
    }
    return $s;
}
3141 |
/**
 * Convert special UTF-8 characters to HTML entities.
 *
 * By default the characters are replaced with named entities taken from the
 * flipped self::$html_entity_table (the original author notes that it covers
 * many more entities than the standard htmlentities()).
 *
 * @link http://www.htmlhelp.com/reference/html40/entities/
 * @link http://www.alanwood.net/demos/ent4_frame.html (HTML 4.01 Character Entity References)
 * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset1.asp?frame=true
 * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp?frame=true
 * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp?frame=true
 *
 * @param scalar|null $s
 * @param bool $is_special_chars_only Encode only the special HTML characters (< > & ")?
 * @return scalar|null Returns FALSE if error occurred
 */
public static function html_entity_encode($s, $is_special_chars_only = false)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (! is_string($s)) return $s;

    if (! $is_special_chars_only)
    {
        #flip the decoding table to get a "UTF-8 character => named entity" map
        $char_to_entity = array_flip(self::$html_entity_table);
        return strtr($s, $char_to_entity);
    }

    #flags and charset are left to the htmlspecialchars() defaults of the running PHP version
    return htmlspecialchars($s);
}
3181 |
/**
 * Build a regular expression fragment that matches the given string case-insensitively.
 *
 * Example (non ASCII): "123_слово_test" => "123_(с|С)(л|Л)(о|О)(в|В)(о|О)_[tT][eE][sS][tT]"
 * Example (only ASCII): "123_test" => "(?i:123_test)"
 *
 * @param string $s
 * @param string|null $delimiter If the optional delimiter is specified, it will also be escaped.
 *                               This is useful for escaping the delimiter that is required by the PCRE functions.
 *                               The / is the most commonly used delimiter.
 * @return string|bool|null Returns FALSE if error occurred
 */
public static function preg_quote_case_insensitive($s, $delimiter = null)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($s)) return $s;

    #speed improve: a pure ASCII string only needs an inline (?i:...) group
    if (self::is_ascii($s))
    {
        return '(?i:' . preg_quote($s, $delimiter) . ')';
    }

    $lowered = UTF8::lowercase($s);
    if ($lowered === false) return false;
    $uppered = UTF8::uppercase($s);
    if ($uppered === false) return false;

    $lower_chars = UTF8::str_split($lowered);
    if ($lower_chars === false) return false;
    $upper_chars = UTF8::str_split($uppered);
    if ($upper_chars === false) return false;

    #walk both character lists in parallel and emit one sub-expression per character
    $pattern = '';
    foreach ($lower_chars as $index => $lower)
    {
        $upper = $upper_chars[$index];
        if ($lower === $upper)
        {
            #the character has no case: quote it as is
            $pattern .= preg_quote($lower, $delimiter);
            continue;
        }
        if (self::is_ascii($lower))
        {
            #ASCII letter: a character class is enough
            $pattern .= '[' . preg_quote($lower . $upper, $delimiter) . ']';
            continue;
        }
        #non-ASCII letter: alternation of both forms
        $pattern .= '(' . preg_quote($lower, $delimiter) . '|' . preg_quote($upper, $delimiter) . ')';
    }
    return $pattern;
}
3219 |
/**
 * Call preg_match_all() and convert byte offsets into character offsets for PREG_OFFSET_CAPTURE flag.
 * This is regardless of whether you use /u modifier.
 *
 * $char_offset is given in characters and is converted to a byte offset before
 * the native function is called. Offsets of subpatterns that did not take part
 * in the match (reported as -1) are left untouched.
 *
 * @link http://bolknote.ru/2010/09/08/~2704
 *
 * @param string $pattern
 * @param string|null $subject
 * @param array $matches
 * @param int $flags
 * @param int $char_offset
 * @return int|bool|null Result of the native preg_match_all() call,
 *                       FALSE if error occurred, NULL if $subject is NULL
 */
public static function preg_match_all($pattern, $subject, &$matches, $flags = PREG_PATTERN_ORDER, $char_offset = 0)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($subject)) return null;

    #the native function expects the start position in bytes
    $byte_offset = ($char_offset > 0) ? strlen(self::substr($subject, 0, $char_offset)) : $char_offset;

    $return = preg_match_all($pattern, $subject, $matches, $flags, $byte_offset);
    if ($return === false) return false;

    if ($flags & PREG_OFFSET_CAPTURE)
    {
        foreach ($matches as &$match)
        {
            foreach ($match as &$a)
            {
                #-1 marks a subpattern without a match; substr($subject, 0, -1)
                #would turn it into a bogus position, so keep the marker
                if ($a[1] < 0) continue;
                #character offset = number of characters in the bytes before the match
                $a[1] = self::strlen(substr($subject, 0, $a[1]));
            }
            unset($a);
        }
        unset($match);
    }

    return $return;
}
3253 |
#Alias for self::str_limit(): all arguments are forwarded unchanged
#($is_cutted is passed on by reference) and the result is returned as is.
public static function truncate($s, $maxlength = null, $continue = "\xe2\x80\xa6", &$is_cutted = null, $tail_min_length = 20)
{
    $limited = self::str_limit($s, $maxlength, $continue, $is_cutted, $tail_min_length);
    return $limited;
}
3259 |
/**
 * Cut a UTF-8 text down to the given length.
 * The last word is kept whole instead of being cut in the middle.
 * HTML entities and "\r\n" line breaks are each counted as one unit and are never split.
 *
 * @param string|null $s UTF-8 encoded text
 * @param int|null|digit $maxlength Maximum text length (256 is used when empty)
 * @param string $continue Trailing string appended to the text if it was cut
 * @param bool|null &$is_cutted Was the text cut?
 * @param int|digit $tail_min_length If the "tail" left after cutting is shorter than $tail_min_length,
 *                                   the text is returned unchanged
 * @return string|bool|null Returns FALSE if error occurred
 */
public static function str_limit($s, $maxlength = null, $continue = "\xe2\x80\xa6", &$is_cutted = null, $tail_min_length = 20) #"\xe2\x80\xa6" = "…"
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($s)) return $s;

    $is_cutted = false;
    if ($continue === null) $continue = "\xe2\x80\xa6";
    if (! $maxlength) $maxlength = 256;

    #speed improve block
    #{{{
    if (strlen($s) <= $maxlength) return $s;
    #collapse every line break and HTML entity to a single "?" to estimate the length cheaply
    $s2 = str_replace("\r\n", '?', $s);
    $s2 = preg_replace('/&(?> [a-zA-Z][a-zA-Z\d]+
                            | \#(?> \d{1,4}
                                  | x[\da-fA-F]{2,4}
                                )
                          );  # html сущности (< > & ")
                        /sxSX', '?', $s2);
    if (strlen($s2) <= $maxlength || self::strlen($s2) <= $maxlength) return $s;
    #}}}

    #split the text into units: a line break, an HTML entity or a single character
    $r = preg_match_all('/(?> \r\n   # переносы строк
                            | &(?> [a-zA-Z][a-zA-Z\d]+
                                 | \#(?> \d{1,4}
                                       | x[\da-fA-F]{2,4}
                                     )
                               );  # html сущности (< > & ")
                            | .
                          )
                        /sxuSX', $s, $m);
    if ($r === false) return false;

    if (count($m[0]) <= $maxlength) return $s;

    $left = implode('', array_slice($m[0], 0, $maxlength));
    #trim trailing ASCII control characters, spaces and most punctuation;
    #letters, digits and some other characters are not in the list.
    #";" must not be trimmed at the end, because it terminates entities (&xxx;)
    $left2 = rtrim($left, "\x00..\x28\x2A..\x2F\x3A\x3C..\x3E\x40\x5B\x5C\x5E..\x60\x7B\x7C\x7E\x7F");
    if (strlen($left) !== strlen($left2)) $return = $left2 . $continue;
    else
    {
        #the cut fell inside a word: append the rest of that word
        $right = implode('', array_slice($m[0], $maxlength));
        #the pattern uses the /u modifier, so the quotes are written as code points:
        #a byte-wise \xe2\x80\x9d would be read as three separate characters here
        preg_match('/^(?> [\d\)\]\}\-\.:]+  #цифры, закрывающие парные символы, дефис для составных слов, дата, время, IP-адреса, URL типа www.ya.ru:80!
                        | \p{L}+        #буквы
                        | \x{201d}      #закрывающие кавычки
                        | \x{2019}      #закрывающие кавычки
                        | \x{201c}      #закрывающие кавычки
                        | \x{bb}        #закрывающие кавычки
                      )+
                    /suxSX', $right, $m);
        $right = isset($m[0]) ? rtrim($m[0], '.-') : '';
        $return = $left . $right;
        if (strlen($return) !== strlen($s)) $return .= $continue;
    }
    #do not cut at all when only a short tail would be removed
    if (self::strlen($s) - self::strlen($return) < $tail_min_length) return $s;

    $is_cutted = true;
    return $return;
}
3336 |
/**
 * Implementation of str_split() for a UTF-8 encoded string.
 *
 * The string is split into characters with a /u regular expression;
 * for $length > 1 the characters are glued back into chunks of that size.
 *
 * @param string|null $s
 * @param int|null|digit $length
 * @return array|bool|null Returns FALSE if error occurred
 */
public static function str_split($s, $length = null)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($s)) return $s;

    $chunk_size = is_null($length) ? 1 : intval($length);
    if ($chunk_size < 1) return false;

    #one match per character; a {min,max} quantifier is not used because regexp limits apply to it
    $found = array();
    $matched = preg_match_all('~.~suSX', $s, $found);
    if ($matched === false) return false;
    if (function_exists('preg_last_error') && preg_last_error() !== PREG_NO_ERROR) return false;

    $chars = $found[0];
    if ($chunk_size === 1) return $chars;

    #start a new chunk at every $chunk_size-th character and append to the current one
    $pieces = array();
    $slot = -1;
    foreach ($chars as $index => $char)
    {
        if ($index % $chunk_size === 0) $pieces[++$slot] = '';
        $pieces[$slot] .= $char;
    }
    return $pieces;
}
3362 |
/**
 * Implementation of strlen() for a UTF-8 encoded string.
 *
 * Uses mb_strlen() when the mbstring extension is available. Otherwise the
 * length is the number of bytes that are not UTF-8 continuation bytes
 * (10xxxxxx), which equals the number of characters for valid UTF-8.
 *
 * @param string|null $s
 * @return int|bool|null Returns FALSE if error occurred
 */
public static function strlen($s)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($s)) return $s;

    #the original author measured mb_strlen() as the fastest option since PHP-5.3.x
    if (function_exists('mb_strlen')) return mb_strlen($s, 'utf-8');

    #fallback without utf8_decode(), which is deprecated in current PHP versions:
    #drop the continuation bytes and count what is left (one byte per character)
    return strlen(preg_replace('/[\x80-\xBF]/sSX', '', $s));
}
3406 |
/**
 * Implementation of strpos() for a UTF-8 encoded string.
 *
 * @param string|null $s The entire string
 * @param string|int $needle The searched substring
 * @param int|null $offset The optional offset parameter specifies the position from which the search should be performed
 * @return int|bool|null Returns the numeric position of the first occurrence of needle in haystack.
 *                       If needle is not found, will return FALSE.
 */
public static function strpos($s, $needle, $offset = null)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($s)) return $s;

    if ($offset === null || $offset < 0) $offset = 0;
    if (function_exists('mb_strpos')) return mb_strpos($s, $needle, $offset, 'utf-8');

    #iconv_strpos() is not used: the original author found it slower than self::strlen(substr())
    #the character offset is used as the first byte position to search from
    $search_from = $offset;
    while (true)
    {
        $found_at = strpos($s, $needle, $search_from);
        if ($found_at === false) return false;

        #byte position => character position
        $char_pos = self::strlen(substr($s, 0, $found_at));
        if (! ($char_pos < $offset)) return $char_pos;

        #the match lies before the requested character offset: look for the next one
        $search_from = $found_at + 1;
    }
}
3430 |
/**
 * Find position of first occurrence of a case-insensitive string.
 *
 * Uses mb_stripos() when available. Otherwise both strings are converted to
 * lower case with self::convert_case() and searched with self::strpos().
 *
 * @param string|null $s The entire string
 * @param string|int $needle The searched substring
 * @param int|null $offset The optional offset parameter specifies the position from which the search should be performed
 * @return int|bool|null Returns the numeric position of the first occurrence of needle in haystack.
 *                       If needle is not found, will return FALSE.
 */
public static function stripos($s, $needle, $offset = null)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($s)) return $s;

    if ($offset === null || $offset < 0) $offset = 0;
    if (function_exists('mb_stripos')) return mb_stripos($s, $needle, $offset, 'utf-8');

    #optimization block (speed improve)
    #{{{
    if (self::is_ascii($s))
    {
        #both strings are ASCII: the native byte-wise function gives the same positions
        if (self::is_ascii($needle)) return stripos($s, $needle, $offset);
        #ASCII haystack and non-ASCII needle: treated as "not found"
        return false;
    }
    #a non-ASCII haystack may still contain an ASCII needle, so no shortcut is taken here
    #}}}

    $s = self::convert_case($s, CASE_LOWER, false);
    if ($s === false) return false;
    $needle = self::convert_case($needle, CASE_LOWER, false);
    if ($needle === false) return false;
    return self::strpos($s, $needle, $offset);
}
3461 |
/**
 * Implementation of strrev() for a UTF-8 encoded string.
 *
 * The string is split into characters, the list is reversed and joined again.
 *
 * @param string|null $s
 * @return string|bool|null Returns FALSE if error occurred
 */
public static function strrev($s)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($s)) return $s;

    #TODO test speed of the alternative: convert to UTF-32, reverse the 4-byte groups, convert back

    $chars = self::str_split($s);
    if (! is_array($chars)) return false;

    $reversed = array_reverse($chars);
    return implode('', $reversed);
}
3484 |
/**
 * Implementation of substr() for a UTF-8 encoded string.
 *
 * Prefers mb_substr(), then iconv_substr(); without both extensions the string
 * is split into characters and sliced as an array.
 *
 * @link http://www.w3.org/International/questions/qa-forms-utf-8.html
 * @param string|null $s
 * @param int|digit $offset
 * @param int|null|digit $length
 * @return string|bool|null Returns FALSE if error occurred
 */
public static function substr($s, $offset, $length = null)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($s)) return $s;

    #mb_substr() is tried first: the original author measured it faster than iconv_substr() since PHP-5.3.x
    $has_mb = function_exists('mb_substr');
    if ($has_mb || function_exists('iconv_substr'))
    {
        #both extension functions get an explicit length
        if ($length === null) $length = self::strlen($s);
        return $has_mb
            ? mb_substr($s, $offset, $length, 'utf-8')
            : iconv_substr($s, $offset, $length, 'utf-8');
    }

    #the character list of the last processed string is kept between calls
    static $cached_string = null;
    static $cached_chars = null;

    if ($cached_string !== $s)
    {
        $cached_string = $s;
        $cached_chars = self::str_split($s);
    }
    if (! is_array($cached_chars)) return false;

    $slice = ($length === null)
        ? array_slice($cached_chars, $offset)
        : array_slice($cached_chars, $offset, $length);
    return implode('', $slice);
}
3520 |
/**
 * Implementation of substr_replace() for a UTF-8 encoded string.
 *
 * The string is split into characters, the requested range is replaced with
 * array_splice() and the characters are joined again. When $length is NULL,
 * everything from $start to the end of the string is replaced.
 *
 * @param string|null $s
 * @param string|int $replacement
 * @param int|digit $start
 * @param int|null $length
 * @return string|bool|null Returns FALSE if error occurred
 */
public static function substr_replace($s, $replacement, $start, $length = null)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($s)) return $s;

    if (! is_array($a = self::str_split($s))) return false;

    #make "up to the end" explicit: array_splice() accepts NULL as length
    #only since PHP 8.0, older versions would treat it as 0 and just insert
    if ($length === null) $length = count($a);

    array_splice($a, $start, $length, $replacement);
    return implode('', $a);
}
3539 |
/**
 * Implementation of ucfirst() for a UTF-8 encoded string.
 * Converts the first character of the string to upper case.
 *
 * @param string|null $s
 * @param bool $is_other_to_lowercase Convert the remaining characters to lower case?
 * @return string|bool|null Returns FALSE if error occurred
 */
public static function ucfirst($s, $is_other_to_lowercase = true)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($s)) return $s;

    #nothing to do for non-strings and for the empty string
    if (! is_string($s) || $s === '') return $s;

    #group 1 is the first character, group 2 is the rest of the string
    $parts = array();
    if (! preg_match('/^(.)(.*)$/suSX', $s, $parts)) return false;

    $head = self::uppercase($parts[1]);
    $tail = $parts[2];
    if ($is_other_to_lowercase) $tail = self::lowercase($tail);
    return $head . $tail;
}
3557 |
/**
 * Implementation of ucwords() for a UTF-8 encoded string.
 * Converts the first character of every word to upper case; the remaining
 * characters of each word are converted to lower case when
 * $is_other_to_lowercase is TRUE and left as they are otherwise.
 *
 * The string is split with $spaces_re; the captured separators are kept,
 * so the whitespace of the original string is preserved.
 *
 * @param string|null $s
 * @param bool $is_other_to_lowercase Convert the remaining characters to lower case?
 * @param string $spaces_re Pattern that separates words; its capturing group keeps the separators in the result
 * @return string|bool|null Returns FALSE if error occurred
 */
public static function ucwords($s, $is_other_to_lowercase = true, $spaces_re = '~([\pZ\s]+)~suSX') #\pXps is POSIX space: property Z or tab, NL, VT, FF, CR
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($s)) return $s;

    $words = preg_split($spaces_re, $s, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
    #preg_split() reports a failure (e.g. a broken pattern) with FALSE
    if ($words === false) return false;

    foreach ($words as $k => $word)
    {
        #pass the caller's flag through instead of overwriting it
        $words[$k] = self::ucfirst($word, $is_other_to_lowercase);
        if ($words[$k] === false) return false;
    }
    return implode('', $words);
}
3581 |
/**
 * Decodes a string in the format %uXXXX or %u{XXXXXX} into a UTF-8 string.
 *
 * Used to decode data such as "%u0442%u0435%u0441%u0442" produced by the
 * outdated javascript function escape().
 * It is recommended to use the javascript function encodeURIComponent() instead.
 *
 * Arrays are processed recursively, keys included.
 *
 * NOTICE
 * The outdated %uXXXX format can only carry unicode from the UCS-2 range, i.e. from U+0 to U+FFFF
 *
 * @param scalar|array|null $data
 * @param bool $is_rawurlencode Pass every decoded character through rawurlencode()?
 * @return scalar|array|null Returns FALSE if error occurred
 */
public static function unescape($data, $is_rawurlencode = false)
{
    if (! ReflectionTypeHint::isValid()) return false;

    if (is_array($data))
    {
        $decoded = array();
        foreach ($data as $key => $value)
        {
            $key = self::unescape($key, $is_rawurlencode);
            if ($key === false) return false;
            $decoded[$key] = self::unescape($value, $is_rawurlencode);
            #FALSE is an error unless the source value was a boolean itself
            if ($decoded[$key] === false && ! is_bool($value)) return false;
        }
        return $decoded;
    }

    if (! is_string($data))
    {
        #null, integer, float and boolean pass through; an object or a resource is an error
        return (is_scalar($data) || is_null($data)) ? $data : false;
    }

    #use strpos() for speed improving
    if (strpos($data, '%u') === false) return $data;

    return preg_replace_callback('/%u(  [\da-fA-F]{4}+          #%uXXXX     only UCS-2
                                      | \{ [\da-fA-F]{1,6}+ \}  #%u{XXXXXX} extended form for all UNICODE charts
                                     )
                                  /sxSX',
        function (array $m) use ($is_rawurlencode)
        {
            #the braces of the extended form are not part of the number
            $codepoint = hexdec(trim($m[1], '{}'));
            $char = self::chr($codepoint);
            return $is_rawurlencode ? rawurlencode($char) : $char;
        },
        $data);
}
3629 |
/**
 * 1) Corrects the global arrays $_GET, $_POST, $_COOKIE, $_REQUEST:
 *    values in the format %uXXXX and %u{XXXXXX}, encoded, for example,
 *    by the outdated javascript function escape(), are decoded.
 *    Standard PHP5 cannot do it.
 * 2) If HTTP_COOKIE contains parameters with the same name,
 *    the last value is taken, not the first, the same way as for QUERY_STRING.
 * 3) Creates the $_POST array for a non-standard Content-Type, for example, "Content-Type: application/octet-stream".
 *    Standard PHP5 creates the array for "Content-Type: application/x-www-form-urlencoded" and "Content-Type: multipart/form-data".
 *
 * Sessions, cookies and independent authorization on subdomains (background notes of the original author).
 *
 * EXAMPLE 1
 * A production site http://domain.com got subdomains.
 * For cross-domain authorization through sessions the cookie host was changed from "domain.com" to ".domain.com".
 * As a result authorization stopped working.
 * Clearing the cookies helps, but forcing that on thousands of user computers is impractical.
 * In more detail:
 * PHP handles the HTTP_COOKIE header incorrectly (?) when it contains parameters with the same name but different values.
 * Example of a request header sent by the client: "Cookie: sid=chpgs2fiak-330mzqza; sid=cmz5tnp5zz-xlbbgqp"
 * In this case the server takes the first value, not the last,
 * although in the same situation in QUERY_STRING the last parameter always wins.
 * Two parameters with the same name can appear in HTTP_COOKIE if the client was sent the following HTTP headers:
 * "Set-Cookie: sid=chpgs2fiak-330mzqza; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=domain.com" (domain.com only)
 * "Set-Cookie: sid=cmz6uqorzv-1bn35110; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=.domain.com" (domain.com and all its subdomains)
 * Solution: change the session name.
 *
 * EXAMPLE 2
 * There are production sites: http://domain.com (main), http://admin.domain.com (admin panel),
 * http://sub1.domain.com (subproject 1), http://sub2.domain.com (subproject 2).
 * There is also a development server http://dev.domain.com, which may have subdomains of its own.
 * Independent cross-domain authorization is required for http://*.domain.com and http://*.dev.domain.com.
 * The authorization state is kept in a session whose name and value are written to a cookie.
 * Because the domains http://*.dev.domain.com overlap with the domains http://*.domain.com,
 * different session names have to be used for independent authorization.
 * Example of HTTP response headers of the server:
 * "Set-Cookie: sid=chpgs2fiak-330mzqza; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=.domain.com" (.domain.com and all its subdomains)
 * "Set-Cookie: sid.dev=cmz6uqorzv-1bn35110; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=.dev.domain.com" (dev.domain.com and all its subdomains)
 *
 * @link http://tools.ietf.org/html/rfc2965 RFC 2965 - HTTP State Management Mechanism
 * @return void
 */
public static function unescape_request()
{
    #ATTENTION! HTTP_RAW_POST_DATA is only accessible when Content-Type of POST request is NOT default "application/x-www-form-urlencoded"!
    $raw_post = null;
    if (isset($_SERVER['REQUEST_METHOD']) && $_SERVER['REQUEST_METHOD'] === 'POST')
    {
        if (isset($GLOBALS['HTTP_RAW_POST_DATA'])) $raw_post = $GLOBALS['HTTP_RAW_POST_DATA'];
        else $raw_post = @file_get_contents('php://input');
    }
    if (ini_get('always_populate_raw_post_data')) $GLOBALS['HTTP_RAW_POST_DATA'] = $raw_post;

    #name of the global array => raw data it is built from
    $sources = array(
        '_GET'    => isset($_SERVER['QUERY_STRING']) ? $_SERVER['QUERY_STRING'] : null,
        '_POST'   => $raw_post,
        '_COOKIE' => isset($_SERVER['HTTP_COOKIE']) ? $_SERVER['HTTP_COOKIE'] : null,
    );

    $changed = false;
    foreach ($sources as $global_name => $raw)
    {
        if (! is_string($raw)) continue;

        if ($global_name === '_COOKIE')
        {
            #turn the cookie header into query string form
            $raw = preg_replace('/; *+/sSX', '&', $raw);
            unset($_COOKIE); #HTTP_COOKIE is parsed here, so that it is processed the same way as QUERY_STRING
        }

        $has_escapes = strpos($raw, '%u') !== false;
        #without %u sequences an already existing global array is left alone
        if (! $has_escapes && array_key_exists($global_name, $GLOBALS)) continue;

        #decoded characters are raw-url-encoded again, because parse_str() url-decodes its input
        parse_str($has_escapes ? self::unescape($raw, true) : $raw, $GLOBALS[$global_name]);
        $changed = true;
    }

    if (! $changed) return;

    #rebuild $_REQUEST; with "+" the leftmost array wins on equal keys
    $_REQUEST =
        (isset($_COOKIE) ? $_COOKIE : array()) +
        (isset($_POST) ? $_POST : array()) +
        (isset($_GET) ? $_GET : array());
}
3709 |
3710 | /**
3711 | * Calculates the height of the edit text in