├── README.md ├── ReflectionTypeHint.php ├── ReflectionTypeHint_example.php ├── Text └── Censure.php ├── UTF8-CHANGELOG.txt └── UTF8.php /README.md: -------------------------------------------------------------------------------- 1 | php-censure 2 | =========== 3 | 4 | Клон одноименной библиотеки антимата с google-code 5 | 6 | Оригинал можно найти по ссылке: http://code.google.com/p/php-censure/ 7 | 8 | Библиотека позволяет определить наличие в тексте на русском языке мата (в том числе многие криптованные варианты) 9 | и/или заменить его произвольным набором символов 10 | 11 | Оригинальное описание 12 | 13 | Алгоритм достаточно надёжен и быстр, в т.ч. на больших объёмах данных 14 | Метод обнаружения мата основывается на корнях и предлогах русского языка, а не на словаре 15 | Слова "лох", "хер", "залупа", "сука" матерными словами не считаются (см. словарь Даля) 16 | Разработка ведётся с 2005 года 17 | 18 | Согласно статье 20.1 КоАП РФ нецензурная брань в общественных местах (интернет — место общественное) 19 | расценивается как мелкое хулиганство, за что установлена административная ответственность — наложение 20 | штрафа в размере от пятисот до одной тысячи рублей или административный арест на срок до пятнадцати суток. 21 | -------------------------------------------------------------------------------- /ReflectionTypeHint.php: -------------------------------------------------------------------------------- 1 | 'is_int', 31 | 'integer' => 'is_int', 32 | 'digit' => 'ctype_digit', 33 | 'number' => 'ctype_digit', 34 | 'float' => 'is_float', 35 | 'double' => 'is_float', 36 | 'real' => 'is_float', 37 | 'numeric' => 'is_numeric', 38 | 'str' => 'is_string', 39 | 'string' => 'is_string', 40 | 'char' => 'is_string', 41 | 'bool' => 'is_bool', 42 | 'boolean' => 'is_bool', 43 | 'null' => 'is_null', 44 | 'array' => 'is_array', 45 | 'obj' => 'is_object', 46 | 'object' => 'is_object', 47 | 'res' => 'is_resource', 48 | 'resource' => 'is_resource', 49 | 'scalar' => 'is_scalar', #integer, float, string or boolean 50 | 'cb' => 'is_callable', 51 | 'callback' => 'is_callable', 52 | ); 53 | 54 | #calling the methods of this class only statically! 55 | private function __construct() {} 56 | 57 | public static function isValid() 58 | { 59 | if (! assert_options(ASSERT_ACTIVE)) return true; 60 | $bt = self::debugBacktrace(null, 1); 61 | extract($bt); //to $file, $line, $function, $class, $object, $type, $args 62 | if (! $args) return true; #speed improve 63 | $r = new ReflectionMethod($class, $function); 64 | $doc = $r->getDocComment(); 65 | $cache_id = $class. $type. $function; 66 | preg_match_all('~ [\r\n]++ [\x20\t]++ \* [\x20\t]++ 67 | @param 68 | [\x20\t]++ 69 | \K #memory reduce 70 | ( [_a-z]++[_a-z\d]*+ 71 | (?>[|/,][_a-z]+[_a-z\d]*)*+ 72 | ) #1 types 73 | [\x20\t]++ 74 | &?+\$([_a-z]++[_a-z\d]*+) #2 name 75 | ~sixSX', $doc, $params, PREG_SET_ORDER); 76 | $parameters = $r->getParameters(); 77 | //d($args, $params, $parameters); 78 | if (count($parameters) > count($params)) 79 | { 80 | $message = 'phpDoc %d piece(s) @param description expected in %s%s%s(), %s given, ' . PHP_EOL 81 | . 'called in %s on line %d ' . PHP_EOL 82 | . 'and defined in %s on line %d'; 83 | $message = sprintf($message, count($parameters), $class, $type, $function, count($params), $file, $line, $r->getFileName(), $r->getStartLine()); 84 | trigger_error($message, E_USER_NOTICE); 85 | } 86 | foreach ($args as $i => $value) 87 | { 88 | if (! isset($params[$i])) return true; 89 | if ($parameters[$i]->name !== $params[$i][2]) 90 | { 91 | $param_num = $i + 1; 92 | $message = 'phpDoc @param %d in %s%s%s() must be named as $%s, $%s given, ' . PHP_EOL 93 | . 'called in %s on line %d ' . PHP_EOL 94 | . 'and defined in %s on line %d'; 95 | $message = sprintf($message, $param_num, $class, $type, $function, $parameters[$i]->name, $params[$i][2], $file, $line, $r->getFileName(), $r->getStartLine()); 96 | trigger_error($message, E_USER_NOTICE); 97 | } 98 | 99 | $hints = preg_split('~[|/,]~sSX', $params[$i][1]); 100 | if (! self::checkValueTypes($hints, $value)) 101 | { 102 | $param_num = $i + 1; 103 | $message = 'Argument %d passed to %s%s%s() must be an %s, %s given, ' . PHP_EOL 104 | . 'called in %s on line %d ' . PHP_EOL 105 | . 'and defined in %s on line %d'; 106 | $message = sprintf($message, $param_num, $class, $type, $function, implode('|', $hints), (is_object($value) ? get_class($value) . ' ' : '') . gettype($value), $file, $line, $r->getFileName(), $r->getStartLine()); 107 | trigger_error($message, E_USER_WARNING); 108 | return false; 109 | } 110 | } 111 | return true; 112 | } 113 | 114 | /** 115 | * Return stacktrace. Correctly work with call_user_func*() 116 | * (totally skip them correcting caller references). 117 | * If $return_frame is present, return only $return_frame matched caller, not all stacktrace. 118 | * 119 | * @param string|null $re_ignore example: '~^' . preg_quote(__CLASS__, '~') . '(?![a-zA-Z\d])~sSX' 120 | * @param int|null $return_frame 121 | * @return array 122 | */ 123 | public static function debugBacktrace($re_ignore = null, $return_frame = null) 124 | { 125 | $trace = debug_backtrace(); 126 | 127 | $a = array(); 128 | $frames = 0; 129 | for ($i = 0, $n = count($trace); $i < $n; $i++) 130 | { 131 | $t = $trace[$i]; 132 | if (! $t) continue; 133 | 134 | // Next frame. 135 | $next = isset($trace[$i+1])? $trace[$i+1] : null; 136 | 137 | // Dummy frame before call_user_func*() frames. 138 | if (! isset($t['file']) && $next) 139 | { 140 | $t['over_function'] = $trace[$i+1]['function']; 141 | $t = $t + $trace[$i+1]; 142 | $trace[$i+1] = null; // skip call_user_func on next iteration 143 | } 144 | 145 | // Skip myself frame. 146 | if (++$frames < 2) continue; 147 | 148 | // 'class' and 'function' field of next frame define where this frame function situated. 149 | // Skip frames for functions situated in ignored places. 150 | if ($re_ignore && $next) 151 | { 152 | // Name of function "inside which" frame was generated. 153 | $frame_caller = (isset($next['class']) ? $next['class'] . $next['type'] : '') 154 | . (isset($next['function']) ? $next['function'] : ''); 155 | if (preg_match($re_ignore, $frame_caller)) continue; 156 | } 157 | 158 | // On each iteration we consider ability to add PREVIOUS frame to $a stack. 159 | if (count($a) === $return_frame) return $t; 160 | $a[] = $t; 161 | } 162 | return $a; 163 | } 164 | 165 | /** 166 | * Checks a value to the allowed types 167 | * 168 | * @param array $types 169 | * @param mixed $value 170 | * @return bool 171 | */ 172 | public static function checkValueTypes(array $types, $value) 173 | { 174 | foreach ($types as $type) 175 | { 176 | $type = strtolower($type); 177 | if (array_key_exists($type, self::$hints) && call_user_func(self::$hints[$type], $value)) return true; 178 | if (is_object($value) && @is_a($value, $type)) return true; 179 | if ($type === 'mixed') return true; 180 | } 181 | return false; 182 | } 183 | } -------------------------------------------------------------------------------- /ReflectionTypeHint_example.php: -------------------------------------------------------------------------------- 1 | myMethod('sss', 75467, new Exception(), true); 24 | -------------------------------------------------------------------------------- /Text/Censure.php: -------------------------------------------------------------------------------- 1 | = 5.2.0 41 | * @param string $charset кодировка символов (родная кодировка -- UTF-8, для других будет прозрачное перекодирование) 42 | * @return bool|string|int|null Если $replace === NULL, то возвращает FALSE, если мат не обнаружен, иначе фрагмент текста с матерным словом. 43 | * Если $replace !== NULL, то возвращает исходную строку, где фрагменты мата заменены на $replace. 44 | * В случае возникновения ошибки возвращает код ошибки > 0 (integer): 45 | * * PREG_INTERNAL_ERROR 46 | * * PREG_BACKTRACK_LIMIT_ERROR (see also pcre.backtrack_limit) 47 | * * PREG_RECURSION_LIMIT_ERROR (see also pcre.recursion_limit) 48 | * * PREG_BAD_UTF8_ERROR 49 | * * PREG_BAD_UTF8_OFFSET_ERROR (since PHP 5.3.0) 50 | * Или -1, если ReflectionTypeHint вернул ошибку 51 | */ 52 | public static function parse( 53 | $s, 54 | $delta = 3, 55 | $continue = "\xe2\x80\xa6", 56 | $is_html = true, 57 | $replace = null, 58 | $charset = 'UTF-8') 59 | { 60 | if (! ReflectionTypeHint::isValid()) return -1; 61 | if ($s === null) return null; 62 | 63 | static $re_badwords = null; 64 | 65 | if ($re_badwords === null) 66 | { 67 | #предлоги русского языка: 68 | #[всуо]| 69 | #по|за|на|об|до|от|вы|вс|вз|из|ис| 70 | #под|про|при|над|низ|раз|рас|воз|вос| 71 | #пооб|повы|пона|поза|недо|пере|одно| 72 | #полуза|произ|пораз|много| 73 | $pretext = array( 74 | #1 75 | '[уyоoаa]_? (?=[еёeхx])', #у, о (уебать, охуеть, ахуеть) 76 | '[вvbсc]_? (?=[хпбмгжxpmgj])', #в, с (впиздячить, схуярить) 77 | '[вvbсc]_?[ъь]_? (?=[еёe])', #въ, съ (съебаться, въебать) 78 | 'ё_? (?=[бb6])', #ё (ёбля) 79 | #2 80 | '[вvb]_?[ыi]_?', #вы 81 | '[зz3]_?[аa]_?', #за 82 | '[нnh]_?[аaеeиi]_?', #на, не, ни 83 | '[вvb]_?[сc]_? (?=[хпбмгжxpmgj])', #вс (вспизднуть) 84 | '[оo]_?[тtбb6]_? (?=[хпбмгжxpmgj])', #от, об 85 | '[оo]_?[тtбb6]_?[ъь]_? (?=[еёe])', #отъ, объ 86 | '[иiвvb]_?[зz3]_? (?=[хпбмгжxpmgj])', #[ив]з 87 | '[иiвvb]_?[зz3]_?[ъь]_? (?=[еёe])', #[ив]зъ 88 | '[иi]_?[сc]_? (?=[хпбмгжxpmgj])', #ис 89 | '[пpдdg]_?[оo]_? (?> [бb6]_? (?=[хпбмгжxpmgj]) 90 | | [бb6]_? [ъь]_? (?=[еёe]) 91 | | [зz3]_? [аa] _? 92 | )?', #по, до, пообъ, дообъ, поза, доза (двойные символы вырезаются!) 93 | #3 94 | '[пp]_?[рr]_?[оoиi]_?', #пр[ои] 95 | '[зz3]_?[лl]_?[оo]_?', #зло (злоебучая) 96 | '[нnh]_?[аa]_?[дdg]_? (?=[хпбмгжxpmgj])', #над 97 | '[нnh]_?[аa]_?[дdg]_?[ъь]_? (?=[еёe])', #надъ 98 | '[пp]_?[оoаa]_?[дdg]_? (?=[хпбмгжxpmgj])', #под 99 | '[пp]_?[оoаa]_?[дdg]_?[ъь]_? (?=[еёe])', #подъ 100 | '[рr]_?[аa]_?[зz3сc]_? (?=[хпбмгжxpmgj])', #ра[зс] 101 | '[рr]_?[аa]_?[зz3сc]_?[ъь]_? (?=[еёe])', #ра[зс]ъ 102 | '[вvb]_?[оo]_?[зz3сc]_? (?=[хпбмгжxpmgj])', #во[зс] 103 | '[вvb]_?[оo]_?[зz3сc]_?[ъь]_? (?=[еёe])', #во[зс]ъ 104 | #4 105 | '[нnh]_?[еe]_?[дdg]_?[оo]_?', #недо 106 | '[пp]_?[еe]_?[рr]_?[еe]_?', #пере 107 | '[oо]_?[дdg]_?[нnh]_?[оo]_?', #одно 108 | '[кk]_?[oо]_?[нnh]_?[оo]_?', #коно (коноебиться) 109 | '[мm]_?[уy]_?[дdg]_?[oоaа]_?', #муд[оа] (мудаёб) 110 | '[oо]_?[сc]_?[тt]_?[оo]_?', #осто (остопиздело) 111 | '[дdg]_?[уy]_?[рpr]_?[оoаa]_?', #дур[оа] 112 | '[хx]_?[уy]_?[дdg]_?[оoаa]_?', #худ[оа] (худоебина) 113 | #5 114 | '[мm]_?[нnh]_?[оo]_?[гg]_?[оo]_?', #много 115 | '[мm]_?[оo]_?[рpr]_?[дdg]_?[оoаa]_?', #морд[оа] 116 | '[мm]_?[оo]_?[зz3]_?[гg]_?[оoаa]_?', #мозг[оа] 117 | '[дdg]_?[оo]_?[лl]_?[бb6]_?[оoаa]_?', #долб[оа] 118 | '[оo]_?[сc]_?[тt]_?[рpr]_?[оo]_?', #остро 119 | ); 120 | 121 | $badwords = array( 122 | #Слово на букву Х 123 | '(?<=\PL) %RE_PRETEXT%? 124 | [hхx]_?[уyu]_?[ийiеeёяюju] #хуй, хуя, хую, хуем, хуёвый, охуительный 125 | #исключения: 126 | (? '\x20', #пробел 218 | '\pL' => '[^\x20\d]', #буква 219 | '\PL' => '[\x20\d]', #не буква 220 | '[:vowel:]' => '[аеиоуыэюяёaeioyu]', #гласные буквы 221 | '[:consonant:]' => '[^аеиоуыэюяёaeioyu\x20\d]', #согласные буквы 222 | ); 223 | 224 | $re_badwords = str_replace( 225 | '%RE_PRETEXT%', 226 | '(?:' . implode('|', $pretext) . ')', #однократный шаблон с альтернативами использовать нельзя! 227 | '~' . implode('|', $badwords) . '~sxuSX' 228 | ); 229 | $re_badwords = strtr($re_badwords, $trans); 230 | } 231 | 232 | $s = UTF8::convert_from($s, $charset); 233 | $replace = UTF8::convert_from($replace, $charset); 234 | 235 | $ss = $s; #saves original string 236 | 237 | if ($is_html) 238 | { 239 | #скрипты не вырезаем, т.к. м.б. обходной маневр на с кодом на javascript: 240 | # 241 | #хотя давать пользователю возможность использовать код на javascript нехорошо 242 | $s = is_callable(array('HTML', 'strip_tags')) ? HTML::strip_tags($s, null, true, array('comment', 'style', 'map', 'frameset', 'object', 'applet')) 243 | : strip_tags($s); 244 | #заменяем html-сущности в "чистый" UTF-8 245 | $s = UTF8::html_entity_decode($s, $is_htmlspecialchars = true); 246 | } 247 | 248 | if (strtoupper(substr($charset, 0, 3)) === 'UTF') #UTF-8, UTF-16, UTF-32 249 | { 250 | #remove combining diactrical marks 251 | $additional_chars = array( 252 | "\xc2\xad", #"мягкие" переносы строк (­) 253 | ); 254 | $s = UTF8::diactrical_remove($s, $additional_chars); 255 | } 256 | 257 | #ВотБ/\яПидорыОхуелиБлятьНахуйПохуйПи3децПолный 258 | if (version_compare(PHP_VERSION, '5.2.0', '>=')) 259 | { 260 | $s = preg_replace('~ [\p{Lu}3] (?>\p{Ll}+|/\\\\|[@36]+)++ #Вот 261 | (?= [\p{Lu}3] (?:\p{Ll} |/\\\\|[@36] ) ) #Бля 262 | ~sxuSX', '$0 ', $s); 263 | } 264 | 265 | $s = mb_strtolower($s); 266 | 267 | #получаем в массив только буквы и цифры 268 | #"с_л@о#во,с\xc2\xa7лово.Слово" -> "с л о во с лово слово слово слово слово" 269 | preg_match_all('~(?> \xd0[\xb0-\xbf]|\xd1[\x80-\x8f\x91] #[а-я] 270 | | /\\\\ #л 271 | | @ #а 272 | | [a-z\d]+ 273 | )+ 274 | ~sxSX', $s, $m); 275 | $s = ' ' . implode(' ', $m[0]) . ' '; 276 | 277 | $trans = array( 278 | '/\\' => 'л', #Б/\ЯТЬ --> БЛЯТЬ 279 | '@' => 'а', #пизд@ --> пизда 280 | ); 281 | $s = strtr($s, $trans); 282 | 283 | #цифровые подделки под буквы 284 | $trans = array( 285 | '~ [3з]++ [3з\x20]*+ ~sxuSX' => 'з', 286 | '~ [6б]++ [6б\x20]*+ ~sxuSX' => 'б', 287 | ); 288 | $s = preg_replace(array_keys($trans), array_values($trans), $s); 289 | 290 | #убираем все повторяющиеся символы, ловим обман типа "х-у-у-й" 291 | #"сллоооовоо слово х у у й" --> "слово слово х у й" 292 | $s = preg_replace('/( [\xd0\xd1][\x80-\xbf] \x20? #optimized [а-я] 293 | | [a-z\d] \x20? 294 | ) \\1+ 295 | /sxSX', '$1', $s); 296 | 297 | if ($replace === null || version_compare(PHP_VERSION, '5.2.0', '<')) 298 | { 299 | $result = preg_match($re_badwords, $s, $m, PREG_OFFSET_CAPTURE); 300 | if (function_exists('preg_last_error') && preg_last_error() !== PREG_NO_ERROR) return preg_last_error(); 301 | if ($result === false) return 1; #PREG_INTERNAL_ERROR = 1 302 | if ($result && $replace === null) 303 | { 304 | list($word, $offset) = $m[0]; 305 | $s1 = substr($s, 0, $offset); 306 | $s2 = substr($s, $offset + strlen($word)); 307 | $delta = intval($delta); 308 | if ($delta === 0) $fragment = '[' . trim($word) . ']'; 309 | else 310 | { 311 | if ($delta < 1 || $delta > 10) $delta = 3; 312 | preg_match('/ (?> \x20 (?>[\xd0\xd1][\x80-\xbf]|[a-z\d]+)++ ){1,' . $delta . '}+ 313 | \x20?+ 314 | $/sxSX', $s1, $m1); 315 | preg_match('/^ (?>[\xd0\xd1][\x80-\xbf]|[a-z\d]+)*+ #ending 316 | \x20?+ 317 | (?> (?>[\xd0\xd1][\x80-\xbf]|[a-z\d]+)++ \x20 ){0,' . $delta . '}+ 318 | /sxSX', $s2, $m2); 319 | $fragment = (ltrim(@$m1[0]) !== ltrim($s1) ? $continue : '') . 320 | trim((isset($m1[0]) ? $m1[0] : '') . '[' . trim($word) . ']' . (isset($m2[0]) ? $m2[0] : '')) . 321 | (rtrim(@$m2[0]) !== rtrim($s2) ? $continue : ''); 322 | } 323 | return UTF8::convert_to($fragment, $charset); 324 | } 325 | return false; 326 | } 327 | 328 | $result = preg_match_all($re_badwords, $s, $m); 329 | if (function_exists('preg_last_error') && preg_last_error() !== PREG_NO_ERROR) return preg_last_error(); 330 | if ($result === false) return 1; #PREG_INTERNAL_ERROR = 1 331 | if ($result > 0) 332 | { 333 | #d($s, $m[0]); 334 | $s = $ss; 335 | #замена матного фрагмента на $replace 336 | foreach ($m[0] as $w) 337 | { 338 | $re_w = '~' . preg_replace_callback('~(?:/\\\\|[^\x20])~suSX', array('self', '_make_regexp_callback'), $w) . '~sxuiSX'; 339 | $ss = preg_replace($re_w, $replace, $ss); 340 | #d($re_w); 341 | } 342 | while ($ss !== $s) $ss = self::parse($s = $ss, $delta, $continue, $is_html, $replace, 'UTF-8'); 343 | } 344 | return UTF8::convert_to($ss, $charset); 345 | } 346 | 347 | private static function _make_regexp_callback(array $m) 348 | { 349 | #$re_holes = '[\x00-\x20\-_\*\~\.\'"\^=`:]'; 350 | #$re_holes = '[\x00-\x2f\x3a-\x40\x5b-\x60\x7b-\x7f]'; 351 | $re_holes = '(?!/\\\\)[^\p{L}\d]'; #non letter, non digit, non '/\' 352 | if ($m[0] === 'а') $re = '[@аА]++ (?>[:holes:]|[@аА]+)*+'; 353 | elseif ($m[0] === 'з') $re = '[3зЗ]++ (?>[:holes:]|[3зЗ]+)*+'; 354 | elseif ($m[0] === 'б') $re = '[6бБ]++ (?>[:holes:]|[6бБ]+)*+'; 355 | elseif ($m[0] === 'л') $re = '(?>[лЛ]+|/\\\\)++ (?>[:holes:]|[лЛ]+|/\\\\)*+'; 356 | else 357 | { 358 | #в PCRE-7.2 флаг /i в комбинации с /u в регулярном выражении почему-то не работает (BUG?) 359 | #поэтому делаем класс символов с буквами в обоих регистрах 360 | $char = '[' . preg_quote($m[0] . UTF8::uppercase($m[0]), '~') . ']'; 361 | $re = str_replace('$0', $char, '$0++ (?>[:holes:]|$0+)*+'); 362 | } 363 | return str_replace('[:holes:]', $re_holes, $re . "\r\n"); 364 | } 365 | } 366 | -------------------------------------------------------------------------------- /UTF8-CHANGELOG.txt: -------------------------------------------------------------------------------- 1 | 2.2.2 / 2011-06-24 2 | 3 | * Convert case functions improved: from all russian charsets to UTF8 native support was added 4 | * UTF8::stripos() speed improved 5 | * constant REPLACEMENT_CHAR added 6 | 7 | 2.2.1 / 2011-06-08 8 | 9 | * UTF8::preg_quote_case_insensitive() added 10 | * UTF8::stripos() speed improved 11 | 12 | 2.2.0 / 2011-06-06 13 | 14 | * UTF8::strlen(), UTF8::substr(), UTF8::strpos(), 15 | UTF8::html_entity_encode(), UTF8::html_entity_decode(), 16 | UTF8::convert_case(), UTF8::lowercase(), UTF8::uppercase() speed improved 17 | * UTF8::stripos(), UTF8::to_unicode(), UTF8::from_unicode() added 18 | * UTF8::strtolower(), UTF8::strtoupper() as wrapper to UTF8::convert_case() added 19 | * Unicode character database to 6.0.0 (2010-06-04) updated 20 | * UTF8::$convert_case_table improved 21 | 22 | 2.1.3 / 2011-05-31 23 | 24 | * UTF8::truncate() small bug fixed 25 | 26 | 2.1.2 / 2011-03-25 27 | 28 | * Класс требует PHP-5.3.x 29 | * UTF8::$char_re deprecated 30 | * Добавлен метод UTF8::tests(), который тестирует методы класса на правильность работы 31 | * Добавлены методы UTF8::strcmp(), UTF8::strncmp(), UTF8::strcasecmp() 32 | * UTF8::is_utf8(), UTF8::str_limit(), UTF8::str_split() speed improved 33 | * Добавлен 2-й параметр в UTF8::html_entity_encode() 34 | * Добавлен 3-й параметр в UTF8::ucwords() 35 | * Методы UTF8::convert_case(), UTF8::lowercase(), UTF8::uppercase() могут принимать массив в 1-м параметре 36 | * Мелкие улучшения в UTF8::strtr() 37 | * Модернизирован класс ReflectionTypeHint 38 | 39 | 2.1.1 / 2010-07-19 40 | 41 | * Добавлены методы array_change_key_case(), range(), strtr() 42 | * Улучшен метод convert_files_from() 43 | * Unicode Character Database 5.2.0 44 | * Исправлены ошибки в trim(), ltrim(), rtrim(), str_pad(), которые могут возникать в некоторых случаях 45 | 46 | 2.1.0 / 2010-03-26 47 | 48 | * Удалён метод unescape_recursive() 49 | * Добавлен метод convert_files_from() 50 | * Несколько методов теперь могут принимать массив и делать их обход рекурсивно 51 | * Почти все методы для обработки строк могут принимать и возвращать NULL 52 | 53 | 2.0.2 / 2010-02-13 54 | 55 | * Новые методы is_ascii(), ltrim(), rtrim(), trim(), str_pad(), strspn() 56 | * Исправлена небольшая ошибка в str_limit() 57 | * Исправлена ошибка в методах convert_from() и convert_to(): они ошибочно возвращали FALSE, 58 | если подать на вход массив, содержащий элементы типа boolean со значением FALSE 59 | 60 | 2.0.1 / 2010-02-08 61 | 62 | * Удалён метод convert_from_cp1259(), используйте convert_from('cp1251') 63 | * Метод convert_from_utf16() теперь приватный, используйте convert_from('UTF-16') 64 | * Добавлены методы convert_to(), diactrical_remove(), diactrical_restore() 65 | * Другие мелкие исправления 66 | -------------------------------------------------------------------------------- /UTF8.php: -------------------------------------------------------------------------------- 1 | = 5.3.x 19 | * 20 | * In Russian: 21 | * 22 | * Поддержка UTF-8 в PHP 5. 23 | * 24 | * Возможности и преимущества использования этого класса 25 | * * Совместимость с интерфейсом стандартных PHP функций, работающих с однобайтовыми кодировками 26 | * * Возможность работы без PHP расширений ICONV и MBSTRING, если они есть, то активно используются! 27 | * * Полезные функции, отсутствующие в ICONV и MBSTRING 28 | * * Методы, которые принимают и возвращают строку, умеют принимать и возвращать null (удобно при выборках значений из базы данных) 29 | * * Несколько методов умеют обрабатывать массивы рекурсивно 30 | * * Единый интерфейс и инкапсуляция (можно унаследоваться и переопределить методы) 31 | * * Высокая производительность, надёжность и качественный код 32 | * * PHP >= 5.3.x 33 | * 34 | * Example: 35 | * $s = 'Hello, Привет'; 36 | * if (UTF8::is_utf8($s)) echo UTF8::strlen($s); 37 | * 38 | * UTF-8 encoding scheme: 39 | * 2^7 0x00000000 — 0x0000007F 0xxxxxxx 40 | * 2^11 0x00000080 — 0x000007FF 110xxxxx 10xxxxxx 41 | * 2^16 0x00000800 — 0x0000FFFF 1110xxxx 10xxxxxx 10xxxxxx 42 | * 2^21 0x00010000 — 0x001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 43 | * 1-4 bytes length: 2^7 + 2^11 + 2^16 + 2^21 = 2 164 864 44 | * 45 | * If I was a owner of the world, I would leave only 2 encoding: UTF-8 and UTF-32 ;-) 46 | * 47 | * Useful links 48 | * http://ru.wikipedia.org/wiki/UTF8 49 | * http://www.madore.org/~david/misc/unitest/ A Unicode Test Page 50 | * http://www.unicode.org/ 51 | * http://www.unicode.org/reports/ 52 | * http://www.unicode.org/reports/tr10/ Unicode Collation Algorithm 53 | * http://www.unicode.org/Public/UCA/6.0.0/ Unicode Collation Algorithm 54 | * http://www.unicode.org/reports/tr6/ A Standard Compression Scheme for Unicode 55 | * http://www.fileformat.info/info/unicode/char/search.htm Unicode Character Search 56 | * 57 | * @link http://code.google.com/p/php5-utf8/ 58 | * @license http://creativecommons.org/licenses/by-sa/3.0/ 59 | * @author Nasibullin Rinat 60 | * @version 2.2.2 61 | */ 62 | class UTF8 63 | { 64 | #REPLACEMENT CHARACTER (for broken char) 65 | const REPLACEMENT_CHAR = "\xEF\xBF\xBD"; #U+FFFD 66 | 67 | /** 68 | * Regular expression for a character in UTF-8 without the use of a flag /u 69 | * @deprecated Instead, use a dot (".") and the flag /u, it works faster! 70 | * @var string 71 | */ 72 | public static $char_re = ' [\x09\x0A\x0D\x20-\x7E] # ASCII strict 73 | # [\x00-\x7F] # ASCII non-strict (including control chars) 74 | | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte 75 | | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs 76 | | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte 77 | | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates 78 | | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3 79 | | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15 80 | | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 81 | '; 82 | 83 | /** 84 | * Combining diactrical marks (Unicode 5.1). 85 | * 86 | * For example, russian letters in composed form: "Ё" (U+0401), "Й" (U+0419), 87 | * decomposed form: (U+0415 U+0308), (U+0418 U+0306) 88 | * 89 | * @link http://www.unicode.org/charts/PDF/U0300.pdf 90 | * @link http://www.unicode.org/charts/PDF/U1DC0.pdf 91 | * @link http://www.unicode.org/charts/PDF/UFE20.pdf 92 | * @var string 93 | */ 94 | #public static $diactrical_re = '\p{M}'; #alternative, but only with /u flag 95 | public static $diactrical_re = ' \xcc[\x80-\xb9]|\xcd[\x80-\xaf] #UNICODE range: U+0300 — U+036F (for letters) 96 | | \xe2\x83[\x90-\xbf] #UNICODE range: U+20D0 — U+20FF (for symbols) 97 | | \xe1\xb7[\x80-\xbf] #UNICODE range: U+1DC0 — U+1DFF (supplement) 98 | | \xef\xb8[\xa0-\xaf] #UNICODE range: U+FE20 — U+FE2F (combining half marks) 99 | '; 100 | 101 | /** 102 | * @var array 103 | */ 104 | public static $html_special_chars_table = array( 105 | '"' => "\x22", #U+0022 ["] " quotation mark = APL quote 106 | '&' => "\x26", #U+0026 [&] & ampersand 107 | '<' => "\x3c", #U+003C [<] < less-than sign 108 | '>' => "\x3e", #U+003E [>] > greater-than sign 109 | ); 110 | 111 | /** 112 | * @link http://www.fileformat.info/format/w3c/entitytest.htm?sort=Unicode%20Character HTML Entity Browser Test Page 113 | * @var array 114 | */ 115 | public static $html_entity_table = array( 116 | #Latin-1 Entities: 117 | ' ' => "\xc2\xa0", #U+00A0 [ ] no-break space = non-breaking space 118 | '¡' => "\xc2\xa1", #U+00A1 [¡] inverted exclamation mark 119 | '¢' => "\xc2\xa2", #U+00A2 [¢] cent sign 120 | '£' => "\xc2\xa3", #U+00A3 [£] pound sign 121 | '¤' => "\xc2\xa4", #U+00A4 [¤] currency sign 122 | '¥' => "\xc2\xa5", #U+00A5 [¥] yen sign = yuan sign 123 | '¦' => "\xc2\xa6", #U+00A6 [¦] broken bar = broken vertical bar 124 | '§' => "\xc2\xa7", #U+00A7 [§] section sign 125 | '¨' => "\xc2\xa8", #U+00A8 [¨] diaeresis = spacing diaeresis 126 | '©' => "\xc2\xa9", #U+00A9 [©] copyright sign 127 | 'ª' => "\xc2\xaa", #U+00AA [ª] feminine ordinal indicator 128 | '«' => "\xc2\xab", #U+00AB [«] left-pointing double angle quotation mark = left pointing guillemet 129 | '¬' => "\xc2\xac", #U+00AC [¬] not sign 130 | '­' => "\xc2\xad", #U+00AD [ ] soft hyphen = discretionary hyphen 131 | '®' => "\xc2\xae", #U+00AE [®] registered sign = registered trade mark sign 132 | '¯' => "\xc2\xaf", #U+00AF [¯] macron = spacing macron = overline = APL overbar 133 | '°' => "\xc2\xb0", #U+00B0 [°] degree sign 134 | '±' => "\xc2\xb1", #U+00B1 [±] plus-minus sign = plus-or-minus sign 135 | '²' => "\xc2\xb2", #U+00B2 [²] superscript two = superscript digit two = squared 136 | '³' => "\xc2\xb3", #U+00B3 [³] superscript three = superscript digit three = cubed 137 | '´' => "\xc2\xb4", #U+00B4 [´] acute accent = spacing acute 138 | 'µ' => "\xc2\xb5", #U+00B5 [µ] micro sign 139 | '¶' => "\xc2\xb6", #U+00B6 [¶] pilcrow sign = paragraph sign 140 | '·' => "\xc2\xb7", #U+00B7 [·] middle dot = Georgian comma = Greek middle dot 141 | '¸' => "\xc2\xb8", #U+00B8 [¸] cedilla = spacing cedilla 142 | '¹' => "\xc2\xb9", #U+00B9 [¹] superscript one = superscript digit one 143 | 'º' => "\xc2\xba", #U+00BA [º] masculine ordinal indicator 144 | '»' => "\xc2\xbb", #U+00BB [»] right-pointing double angle quotation mark = right pointing guillemet 145 | '¼' => "\xc2\xbc", #U+00BC [¼] vulgar fraction one quarter = fraction one quarter 146 | '½' => "\xc2\xbd", #U+00BD [½] vulgar fraction one half = fraction one half 147 | '¾' => "\xc2\xbe", #U+00BE [¾] vulgar fraction three quarters = fraction three quarters 148 | '¿' => "\xc2\xbf", #U+00BF [¿] inverted question mark = turned question mark 149 | #Latin capital letter 150 | 'À' => "\xc3\x80", #Latin capital letter A with grave = Latin capital letter A grave 151 | 'Á' => "\xc3\x81", #Latin capital letter A with acute 152 | 'Â' => "\xc3\x82", #Latin capital letter A with circumflex 153 | 'Ã' => "\xc3\x83", #Latin capital letter A with tilde 154 | 'Ä' => "\xc3\x84", #Latin capital letter A with diaeresis 155 | 'Å' => "\xc3\x85", #Latin capital letter A with ring above = Latin capital letter A ring 156 | 'Æ' => "\xc3\x86", #Latin capital letter AE = Latin capital ligature AE 157 | 'Ç' => "\xc3\x87", #Latin capital letter C with cedilla 158 | 'È' => "\xc3\x88", #Latin capital letter E with grave 159 | 'É' => "\xc3\x89", #Latin capital letter E with acute 160 | 'Ê' => "\xc3\x8a", #Latin capital letter E with circumflex 161 | 'Ë' => "\xc3\x8b", #Latin capital letter E with diaeresis 162 | 'Ì' => "\xc3\x8c", #Latin capital letter I with grave 163 | 'Í' => "\xc3\x8d", #Latin capital letter I with acute 164 | 'Î' => "\xc3\x8e", #Latin capital letter I with circumflex 165 | 'Ï' => "\xc3\x8f", #Latin capital letter I with diaeresis 166 | 'Ð' => "\xc3\x90", #Latin capital letter ETH 167 | 'Ñ' => "\xc3\x91", #Latin capital letter N with tilde 168 | 'Ò' => "\xc3\x92", #Latin capital letter O with grave 169 | 'Ó' => "\xc3\x93", #Latin capital letter O with acute 170 | 'Ô' => "\xc3\x94", #Latin capital letter O with circumflex 171 | 'Õ' => "\xc3\x95", #Latin capital letter O with tilde 172 | 'Ö' => "\xc3\x96", #Latin capital letter O with diaeresis 173 | '×' => "\xc3\x97", #U+00D7 [×] multiplication sign 174 | 'Ø' => "\xc3\x98", #Latin capital letter O with stroke = Latin capital letter O slash 175 | 'Ù' => "\xc3\x99", #Latin capital letter U with grave 176 | 'Ú' => "\xc3\x9a", #Latin capital letter U with acute 177 | 'Û' => "\xc3\x9b", #Latin capital letter U with circumflex 178 | 'Ü' => "\xc3\x9c", #Latin capital letter U with diaeresis 179 | 'Ý' => "\xc3\x9d", #Latin capital letter Y with acute 180 | 'Þ' => "\xc3\x9e", #Latin capital letter THORN 181 | #Latin small letter 182 | 'ß' => "\xc3\x9f", #Latin small letter sharp s = ess-zed 183 | 'à' => "\xc3\xa0", #Latin small letter a with grave = Latin small letter a grave 184 | 'á' => "\xc3\xa1", #Latin small letter a with acute 185 | 'â' => "\xc3\xa2", #Latin small letter a with circumflex 186 | 'ã' => "\xc3\xa3", #Latin small letter a with tilde 187 | 'ä' => "\xc3\xa4", #Latin small letter a with diaeresis 188 | 'å' => "\xc3\xa5", #Latin small letter a with ring above = Latin small letter a ring 189 | 'æ' => "\xc3\xa6", #Latin small letter ae = Latin small ligature ae 190 | 'ç' => "\xc3\xa7", #Latin small letter c with cedilla 191 | 'è' => "\xc3\xa8", #Latin small letter e with grave 192 | 'é' => "\xc3\xa9", #Latin small letter e with acute 193 | 'ê' => "\xc3\xaa", #Latin small letter e with circumflex 194 | 'ë' => "\xc3\xab", #Latin small letter e with diaeresis 195 | 'ì' => "\xc3\xac", #Latin small letter i with grave 196 | 'í' => "\xc3\xad", #Latin small letter i with acute 197 | 'î' => "\xc3\xae", #Latin small letter i with circumflex 198 | 'ï' => "\xc3\xaf", #Latin small letter i with diaeresis 199 | 'ð' => "\xc3\xb0", #Latin small letter eth 200 | 'ñ' => "\xc3\xb1", #Latin small letter n with tilde 201 | 'ò' => "\xc3\xb2", #Latin small letter o with grave 202 | 'ó' => "\xc3\xb3", #Latin small letter o with acute 203 | 'ô' => "\xc3\xb4", #Latin small letter o with circumflex 204 | 'õ' => "\xc3\xb5", #Latin small letter o with tilde 205 | 'ö' => "\xc3\xb6", #Latin small letter o with diaeresis 206 | '÷' => "\xc3\xb7", #U+00F7 [÷] division sign 207 | 'ø' => "\xc3\xb8", #Latin small letter o with stroke = Latin small letter o slash 208 | 'ù' => "\xc3\xb9", #Latin small letter u with grave 209 | 'ú' => "\xc3\xba", #Latin small letter u with acute 210 | 'û' => "\xc3\xbb", #Latin small letter u with circumflex 211 | 'ü' => "\xc3\xbc", #Latin small letter u with diaeresis 212 | 'ý' => "\xc3\xbd", #Latin small letter y with acute 213 | 'þ' => "\xc3\xbe", #Latin small letter thorn 214 | 'ÿ' => "\xc3\xbf", #Latin small letter y with diaeresis 215 | #Symbols and Greek Letters: 216 | 'ƒ' => "\xc6\x92", #U+0192 [ƒ] Latin small f with hook = function = florin 217 | 'Α' => "\xce\x91", #Greek capital letter alpha 218 | 'Β' => "\xce\x92", #Greek capital letter beta 219 | 'Γ' => "\xce\x93", #Greek capital letter gamma 220 | 'Δ' => "\xce\x94", #Greek capital letter delta 221 | 'Ε' => "\xce\x95", #Greek capital letter epsilon 222 | 'Ζ' => "\xce\x96", #Greek capital letter zeta 223 | 'Η' => "\xce\x97", #Greek capital letter eta 224 | 'Θ' => "\xce\x98", #Greek capital letter theta 225 | 'Ι' => "\xce\x99", #Greek capital letter iota 226 | 'Κ' => "\xce\x9a", #Greek capital letter kappa 227 | 'Λ' => "\xce\x9b", #Greek capital letter lambda 228 | 'Μ' => "\xce\x9c", #Greek capital letter mu 229 | 'Ν' => "\xce\x9d", #Greek capital letter nu 230 | 'Ξ' => "\xce\x9e", #Greek capital letter xi 231 | 'Ο' => "\xce\x9f", #Greek capital letter omicron 232 | 'Π' => "\xce\xa0", #Greek capital letter pi 233 | 'Ρ' => "\xce\xa1", #Greek capital letter rho 234 | 'Σ' => "\xce\xa3", #Greek capital letter sigma 235 | 'Τ' => "\xce\xa4", #Greek capital letter tau 236 | 'Υ' => "\xce\xa5", #Greek capital letter upsilon 237 | 'Φ' => "\xce\xa6", #Greek capital letter phi 238 | 'Χ' => "\xce\xa7", #Greek capital letter chi 239 | 'Ψ' => "\xce\xa8", #Greek capital letter psi 240 | 'Ω' => "\xce\xa9", #Greek capital letter omega 241 | 'α' => "\xce\xb1", #Greek small letter alpha 242 | 'β' => "\xce\xb2", #Greek small letter beta 243 | 'γ' => "\xce\xb3", #Greek small letter gamma 244 | 'δ' => "\xce\xb4", #Greek small letter delta 245 | 'ε' => "\xce\xb5", #Greek small letter epsilon 246 | 'ζ' => "\xce\xb6", #Greek small letter zeta 247 | 'η' => "\xce\xb7", #Greek small letter eta 248 | 'θ' => "\xce\xb8", #Greek small letter theta 249 | 'ι' => "\xce\xb9", #Greek small letter iota 250 | 'κ' => "\xce\xba", #Greek small letter kappa 251 | 'λ' => "\xce\xbb", #Greek small letter lambda 252 | 'μ' => "\xce\xbc", #Greek small letter mu 253 | 'ν' => "\xce\xbd", #Greek small letter nu 254 | 'ξ' => "\xce\xbe", #Greek small letter xi 255 | 'ο' => "\xce\xbf", #Greek small letter omicron 256 | 'π' => "\xcf\x80", #Greek small letter pi 257 | 'ρ' => "\xcf\x81", #Greek small letter rho 258 | 'ς' => "\xcf\x82", #Greek small letter final sigma 259 | 'σ' => "\xcf\x83", #Greek small letter sigma 260 | 'τ' => "\xcf\x84", #Greek small letter tau 261 | 'υ' => "\xcf\x85", #Greek small letter upsilon 262 | 'φ' => "\xcf\x86", #Greek small letter phi 263 | 'χ' => "\xcf\x87", #Greek small letter chi 264 | 'ψ' => "\xcf\x88", #Greek small letter psi 265 | 'ω' => "\xcf\x89", #Greek small letter omega 266 | 'ϑ'=> "\xcf\x91", #Greek small letter theta symbol 267 | 'ϒ' => "\xcf\x92", #Greek upsilon with hook symbol 268 | 'ϖ' => "\xcf\x96", #U+03D6 [ϖ] Greek pi symbol 269 | 270 | '•' => "\xe2\x80\xa2", #U+2022 [•] bullet = black small circle 271 | '…' => "\xe2\x80\xa6", #U+2026 […] horizontal ellipsis = three dot leader 272 | '′' => "\xe2\x80\xb2", #U+2032 [′] prime = minutes = feet (для обозначения минут и футов) 273 | '″' => "\xe2\x80\xb3", #U+2033 [″] double prime = seconds = inches (для обозначения секунд и дюймов). 274 | '‾' => "\xe2\x80\xbe", #U+203E [‾] overline = spacing overscore 275 | '⁄' => "\xe2\x81\x84", #U+2044 [⁄] fraction slash 276 | '℘' => "\xe2\x84\x98", #U+2118 [℘] script capital P = power set = Weierstrass p 277 | 'ℑ' => "\xe2\x84\x91", #U+2111 [ℑ] blackletter capital I = imaginary part 278 | 'ℜ' => "\xe2\x84\x9c", #U+211C [ℜ] blackletter capital R = real part symbol 279 | '™' => "\xe2\x84\xa2", #U+2122 [™] trade mark sign 280 | 'ℵ' => "\xe2\x84\xb5", #U+2135 [ℵ] alef symbol = first transfinite cardinal 281 | '←' => "\xe2\x86\x90", #U+2190 [←] leftwards arrow 282 | '↑' => "\xe2\x86\x91", #U+2191 [↑] upwards arrow 283 | '→' => "\xe2\x86\x92", #U+2192 [→] rightwards arrow 284 | '↓' => "\xe2\x86\x93", #U+2193 [↓] downwards arrow 285 | '↔' => "\xe2\x86\x94", #U+2194 [↔] left right arrow 286 | '↵' => "\xe2\x86\xb5", #U+21B5 [↵] downwards arrow with corner leftwards = carriage return 287 | '⇐' => "\xe2\x87\x90", #U+21D0 [⇐] leftwards double arrow 288 | '⇑' => "\xe2\x87\x91", #U+21D1 [⇑] upwards double arrow 289 | '⇒' => "\xe2\x87\x92", #U+21D2 [⇒] rightwards double arrow 290 | '⇓' => "\xe2\x87\x93", #U+21D3 [⇓] downwards double arrow 291 | '⇔' => "\xe2\x87\x94", #U+21D4 [⇔] left right double arrow 292 | '∀' => "\xe2\x88\x80", #U+2200 [∀] for all 293 | '∂' => "\xe2\x88\x82", #U+2202 [∂] partial differential 294 | '∃' => "\xe2\x88\x83", #U+2203 [∃] there exists 295 | '∅' => "\xe2\x88\x85", #U+2205 [∅] empty set = null set = diameter 296 | '∇' => "\xe2\x88\x87", #U+2207 [∇] nabla = backward difference 297 | '∈' => "\xe2\x88\x88", #U+2208 [∈] element of 298 | '∉' => "\xe2\x88\x89", #U+2209 [∉] not an element of 299 | '∋' => "\xe2\x88\x8b", #U+220B [∋] contains as member 300 | '∏' => "\xe2\x88\x8f", #U+220F [∏] n-ary product = product sign 301 | '∑' => "\xe2\x88\x91", #U+2211 [∑] n-ary sumation 302 | '−' => "\xe2\x88\x92", #U+2212 [−] minus sign 303 | '∗' => "\xe2\x88\x97", #U+2217 [∗] asterisk operator 304 | '√' => "\xe2\x88\x9a", #U+221A [√] square root = radical sign 305 | '∝' => "\xe2\x88\x9d", #U+221D [∝] proportional to 306 | '∞' => "\xe2\x88\x9e", #U+221E [∞] infinity 307 | '∠' => "\xe2\x88\xa0", #U+2220 [∠] angle 308 | '∧' => "\xe2\x88\xa7", #U+2227 [∧] logical and = wedge 309 | '∨' => "\xe2\x88\xa8", #U+2228 [∨] logical or = vee 310 | '∩' => "\xe2\x88\xa9", #U+2229 [∩] intersection = cap 311 | '∪' => "\xe2\x88\xaa", #U+222A [∪] union = cup 312 | '∫' => "\xe2\x88\xab", #U+222B [∫] integral 313 | '∴' => "\xe2\x88\xb4", #U+2234 [∴] therefore 314 | '∼' => "\xe2\x88\xbc", #U+223C [∼] tilde operator = varies with = similar to 315 | '≅' => "\xe2\x89\x85", #U+2245 [≅] approximately equal to 316 | '≈' => "\xe2\x89\x88", #U+2248 [≈] almost equal to = asymptotic to 317 | '≠' => "\xe2\x89\xa0", #U+2260 [≠] not equal to 318 | '≡' => "\xe2\x89\xa1", #U+2261 [≡] identical to 319 | '≤' => "\xe2\x89\xa4", #U+2264 [≤] less-than or equal to 320 | '≥' => "\xe2\x89\xa5", #U+2265 [≥] greater-than or equal to 321 | '⊂' => "\xe2\x8a\x82", #U+2282 [⊂] subset of 322 | '⊃' => "\xe2\x8a\x83", #U+2283 [⊃] superset of 323 | '⊄' => "\xe2\x8a\x84", #U+2284 [⊄] not a subset of 324 | '⊆' => "\xe2\x8a\x86", #U+2286 [⊆] subset of or equal to 325 | '⊇' => "\xe2\x8a\x87", #U+2287 [⊇] superset of or equal to 326 | '⊕' => "\xe2\x8a\x95", #U+2295 [⊕] circled plus = direct sum 327 | '⊗' => "\xe2\x8a\x97", #U+2297 [⊗] circled times = vector product 328 | '⊥' => "\xe2\x8a\xa5", #U+22A5 [⊥] up tack = orthogonal to = perpendicular 329 | '⋅' => "\xe2\x8b\x85", #U+22C5 [⋅] dot operator 330 | '⌈' => "\xe2\x8c\x88", #U+2308 [⌈] left ceiling = APL upstile 331 | '⌉' => "\xe2\x8c\x89", #U+2309 [⌉] right ceiling 332 | '⌊' => "\xe2\x8c\x8a", #U+230A [⌊] left floor = APL downstile 333 | '⌋' => "\xe2\x8c\x8b", #U+230B [⌋] right floor 334 | '⟨' => "\xe2\x8c\xa9", #U+2329 [〈] left-pointing angle bracket = bra 335 | '⟩' => "\xe2\x8c\xaa", #U+232A [〉] right-pointing angle bracket = ket 336 | '◊' => "\xe2\x97\x8a", #U+25CA [◊] lozenge 337 | '♠' => "\xe2\x99\xa0", #U+2660 [♠] black spade suit 338 | '♣' => "\xe2\x99\xa3", #U+2663 [♣] black club suit = shamrock 339 | '♥' => "\xe2\x99\xa5", #U+2665 [♥] black heart suit = valentine 340 | '♦' => "\xe2\x99\xa6", #U+2666 [♦] black diamond suit 341 | #Other Special Characters: 342 | 'Œ' => "\xc5\x92", #U+0152 [Œ] Latin capital ligature OE 343 | 'œ' => "\xc5\x93", #U+0153 [œ] Latin small ligature oe 344 | 'Š' => "\xc5\xa0", #U+0160 [Š] Latin capital letter S with caron 345 | 'š' => "\xc5\xa1", #U+0161 [š] Latin small letter s with caron 346 | 'Ÿ' => "\xc5\xb8", #U+0178 [Ÿ] Latin capital letter Y with diaeresis 347 | 'ˆ' => "\xcb\x86", #U+02C6 [ˆ] modifier letter circumflex accent 348 | '˜' => "\xcb\x9c", #U+02DC [˜] small tilde 349 | ' ' => "\xe2\x80\x82", #U+2002 [ ] en space 350 | ' ' => "\xe2\x80\x83", #U+2003 [ ] em space 351 | ' ' => "\xe2\x80\x89", #U+2009 [ ] thin space 352 | '‌' => "\xe2\x80\x8c", #U+200C [‌] zero width non-joiner 353 | '‍' => "\xe2\x80\x8d", #U+200D [‍] zero width joiner 354 | '‎' => "\xe2\x80\x8e", #U+200E [‎] left-to-right mark 355 | '‏' => "\xe2\x80\x8f", #U+200F [‏] right-to-left mark 356 | '–' => "\xe2\x80\x93", #U+2013 [–] en dash 357 | '—' => "\xe2\x80\x94", #U+2014 [—] em dash 358 | '‘' => "\xe2\x80\x98", #U+2018 [‘] left single quotation mark 359 | '’' => "\xe2\x80\x99", #U+2019 [’] right single quotation mark (and apostrophe!) 360 | '‚' => "\xe2\x80\x9a", #U+201A [‚] single low-9 quotation mark 361 | '“' => "\xe2\x80\x9c", #U+201C [“] left double quotation mark 362 | '”' => "\xe2\x80\x9d", #U+201D [”] right double quotation mark 363 | '„' => "\xe2\x80\x9e", #U+201E [„] double low-9 quotation mark 364 | '†' => "\xe2\x80\xa0", #U+2020 [†] dagger 365 | '‡' => "\xe2\x80\xa1", #U+2021 [‡] double dagger 366 | '‰' => "\xe2\x80\xb0", #U+2030 [‰] per mille sign 367 | '‹' => "\xe2\x80\xb9", #U+2039 [‹] single left-pointing angle quotation mark 368 | '›' => "\xe2\x80\xba", #U+203A [›] single right-pointing angle quotation mark 369 | '€' => "\xe2\x82\xac", #U+20AC [€] euro sign 370 | ); 371 | 372 | /** 373 | * This table contains the data on how cp1259 characters map into Unicode (UTF-8). 374 | * The cp1259 map describes standart tatarish cyrillic charset and based on the cp1251 table. 375 | * cp1259 -- this is an outdated one byte encoding of the Tatar language, 376 | * which includes all the Russian letters from cp1251. 377 | * 378 | * @link http://search.cpan.org/CPAN/authors/id/A/AM/AMICHAUER/Lingua-TT-Yanalif-0.08.tar.gz 379 | * @link http://www.unicode.org/charts/PDF/U0400.pdf 380 | */ 381 | public static $cp1259_table = array( 382 | #bytes from 0x00 to 0x7F (ASCII) saved as is 383 | "\x80" => "\xd3\x98", #U+04d8 CYRILLIC CAPITAL LETTER SCHWA 384 | "\x81" => "\xd0\x83", #U+0403 CYRILLIC CAPITAL LETTER GJE 385 | "\x82" => "\xe2\x80\x9a", #U+201a SINGLE LOW-9 QUOTATION MARK 386 | "\x83" => "\xd1\x93", #U+0453 CYRILLIC SMALL LETTER GJE 387 | "\x84" => "\xe2\x80\x9e", #U+201e DOUBLE LOW-9 QUOTATION MARK 388 | "\x85" => "\xe2\x80\xa6", #U+2026 HORIZONTAL ELLIPSIS 389 | "\x86" => "\xe2\x80\xa0", #U+2020 DAGGER 390 | "\x87" => "\xe2\x80\xa1", #U+2021 DOUBLE DAGGER 391 | "\x88" => "\xe2\x82\xac", #U+20ac EURO SIGN 392 | "\x89" => "\xe2\x80\xb0", #U+2030 PER MILLE SIGN 393 | "\x8a" => "\xd3\xa8", #U+04e8 CYRILLIC CAPITAL LETTER BARRED O 394 | "\x8b" => "\xe2\x80\xb9", #U+2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK 395 | "\x8c" => "\xd2\xae", #U+04ae CYRILLIC CAPITAL LETTER STRAIGHT U 396 | "\x8d" => "\xd2\x96", #U+0496 CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER 397 | "\x8e" => "\xd2\xa2", #U+04a2 CYRILLIC CAPITAL LETTER EN WITH HOOK 398 | "\x8f" => "\xd2\xba", #U+04ba CYRILLIC CAPITAL LETTER SHHA 399 | "\x90" => "\xd3\x99", #U+04d9 CYRILLIC SMALL LETTER SCHWA 400 | "\x91" => "\xe2\x80\x98", #U+2018 LEFT SINGLE QUOTATION MARK 401 | "\x92" => "\xe2\x80\x99", #U+2019 RIGHT SINGLE QUOTATION MARK 402 | "\x93" => "\xe2\x80\x9c", #U+201c LEFT DOUBLE QUOTATION MARK 403 | "\x94" => "\xe2\x80\x9d", #U+201d RIGHT DOUBLE QUOTATION MARK 404 | "\x95" => "\xe2\x80\xa2", #U+2022 BULLET 405 | "\x96" => "\xe2\x80\x93", #U+2013 EN DASH 406 | "\x97" => "\xe2\x80\x94", #U+2014 EM DASH 407 | #"\x98" #UNDEFINED 408 | "\x99" => "\xe2\x84\xa2", #U+2122 TRADE MARK SIGN 409 | "\x9a" => "\xd3\xa9", #U+04e9 CYRILLIC SMALL LETTER BARRED O 410 | "\x9b" => "\xe2\x80\xba", #U+203a SINGLE RIGHT-POINTING ANGLE QUOTATION MARK 411 | "\x9c" => "\xd2\xaf", #U+04af CYRILLIC SMALL LETTER STRAIGHT U 412 | "\x9d" => "\xd2\x97", #U+0497 CYRILLIC SMALL LETTER ZHE WITH DESCENDER 413 | "\x9e" => "\xd2\xa3", #U+04a3 CYRILLIC SMALL LETTER EN WITH HOOK 414 | "\x9f" => "\xd2\xbb", #U+04bb CYRILLIC SMALL LETTER SHHA 415 | "\xa0" => "\xc2\xa0", #U+00a0 NO-BREAK SPACE 416 | "\xa1" => "\xd0\x8e", #U+040e CYRILLIC CAPITAL LETTER SHORT U 417 | "\xa2" => "\xd1\x9e", #U+045e CYRILLIC SMALL LETTER SHORT U 418 | "\xa3" => "\xd0\x88", #U+0408 CYRILLIC CAPITAL LETTER JE 419 | "\xa4" => "\xc2\xa4", #U+00a4 CURRENCY SIGN 420 | "\xa5" => "\xd2\x90", #U+0490 CYRILLIC CAPITAL LETTER GHE WITH UPTURN 421 | "\xa6" => "\xc2\xa6", #U+00a6 BROKEN BAR 422 | "\xa7" => "\xc2\xa7", #U+00a7 SECTION SIGN 423 | "\xa8" => "\xd0\x81", #U+0401 CYRILLIC CAPITAL LETTER IO 424 | "\xa9" => "\xc2\xa9", #U+00a9 COPYRIGHT SIGN 425 | "\xaa" => "\xd0\x84", #U+0404 CYRILLIC CAPITAL LETTER UKRAINIAN IE 426 | "\xab" => "\xc2\xab", #U+00ab LEFT-POINTING DOUBLE ANGLE QUOTATION MARK 427 | "\xac" => "\xc2\xac", #U+00ac NOT SIGN 428 | "\xad" => "\xc2\xad", #U+00ad SOFT HYPHEN 429 | "\xae" => "\xc2\xae", #U+00ae REGISTERED SIGN 430 | "\xaf" => "\xd0\x87", #U+0407 CYRILLIC CAPITAL LETTER YI 431 | "\xb0" => "\xc2\xb0", #U+00b0 DEGREE SIGN 432 | "\xb1" => "\xc2\xb1", #U+00b1 PLUS-MINUS SIGN 433 | "\xb2" => "\xd0\x86", #U+0406 CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I 434 | "\xb3" => "\xd1\x96", #U+0456 CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I 435 | "\xb4" => "\xd2\x91", #U+0491 CYRILLIC SMALL LETTER GHE WITH UPTURN 436 | "\xb5" => "\xc2\xb5", #U+00b5 MICRO SIGN 437 | "\xb6" => "\xc2\xb6", #U+00b6 PILCROW SIGN 438 | "\xb7" => "\xc2\xb7", #U+00b7 MIDDLE DOT 439 | "\xb8" => "\xd1\x91", #U+0451 CYRILLIC SMALL LETTER IO 440 | "\xb9" => "\xe2\x84\x96", #U+2116 NUMERO SIGN 441 | "\xba" => "\xd1\x94", #U+0454 CYRILLIC SMALL LETTER UKRAINIAN IE 442 | "\xbb" => "\xc2\xbb", #U+00bb RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK 443 | "\xbc" => "\xd1\x98", #U+0458 CYRILLIC SMALL LETTER JE 444 | "\xbd" => "\xd0\x85", #U+0405 CYRILLIC CAPITAL LETTER DZE 445 | "\xbe" => "\xd1\x95", #U+0455 CYRILLIC SMALL LETTER DZE 446 | "\xbf" => "\xd1\x97", #U+0457 CYRILLIC SMALL LETTER YI 447 | "\xc0" => "\xd0\x90", #U+0410 CYRILLIC CAPITAL LETTER A 448 | "\xc1" => "\xd0\x91", #U+0411 CYRILLIC CAPITAL LETTER BE 449 | "\xc2" => "\xd0\x92", #U+0412 CYRILLIC CAPITAL LETTER VE 450 | "\xc3" => "\xd0\x93", #U+0413 CYRILLIC CAPITAL LETTER GHE 451 | "\xc4" => "\xd0\x94", #U+0414 CYRILLIC CAPITAL LETTER DE 452 | "\xc5" => "\xd0\x95", #U+0415 CYRILLIC CAPITAL LETTER IE 453 | "\xc6" => "\xd0\x96", #U+0416 CYRILLIC CAPITAL LETTER ZHE 454 | "\xc7" => "\xd0\x97", #U+0417 CYRILLIC CAPITAL LETTER ZE 455 | "\xc8" => "\xd0\x98", #U+0418 CYRILLIC CAPITAL LETTER I 456 | "\xc9" => "\xd0\x99", #U+0419 CYRILLIC CAPITAL LETTER SHORT I 457 | "\xca" => "\xd0\x9a", #U+041a CYRILLIC CAPITAL LETTER KA 458 | "\xcb" => "\xd0\x9b", #U+041b CYRILLIC CAPITAL LETTER EL 459 | "\xcc" => "\xd0\x9c", #U+041c CYRILLIC CAPITAL LETTER EM 460 | "\xcd" => "\xd0\x9d", #U+041d CYRILLIC CAPITAL LETTER EN 461 | "\xce" => "\xd0\x9e", #U+041e CYRILLIC CAPITAL LETTER O 462 | "\xcf" => "\xd0\x9f", #U+041f CYRILLIC CAPITAL LETTER PE 463 | "\xd0" => "\xd0\xa0", #U+0420 CYRILLIC CAPITAL LETTER ER 464 | "\xd1" => "\xd0\xa1", #U+0421 CYRILLIC CAPITAL LETTER ES 465 | "\xd2" => "\xd0\xa2", #U+0422 CYRILLIC CAPITAL LETTER TE 466 | "\xd3" => "\xd0\xa3", #U+0423 CYRILLIC CAPITAL LETTER U 467 | "\xd4" => "\xd0\xa4", #U+0424 CYRILLIC CAPITAL LETTER EF 468 | "\xd5" => "\xd0\xa5", #U+0425 CYRILLIC CAPITAL LETTER HA 469 | "\xd6" => "\xd0\xa6", #U+0426 CYRILLIC CAPITAL LETTER TSE 470 | "\xd7" => "\xd0\xa7", #U+0427 CYRILLIC CAPITAL LETTER CHE 471 | "\xd8" => "\xd0\xa8", #U+0428 CYRILLIC CAPITAL LETTER SHA 472 | "\xd9" => "\xd0\xa9", #U+0429 CYRILLIC CAPITAL LETTER SHCHA 473 | "\xda" => "\xd0\xaa", #U+042a CYRILLIC CAPITAL LETTER HARD SIGN 474 | "\xdb" => "\xd0\xab", #U+042b CYRILLIC CAPITAL LETTER YERU 475 | "\xdc" => "\xd0\xac", #U+042c CYRILLIC CAPITAL LETTER SOFT SIGN 476 | "\xdd" => "\xd0\xad", #U+042d CYRILLIC CAPITAL LETTER E 477 | "\xde" => "\xd0\xae", #U+042e CYRILLIC CAPITAL LETTER YU 478 | "\xdf" => "\xd0\xaf", #U+042f CYRILLIC CAPITAL LETTER YA 479 | "\xe0" => "\xd0\xb0", #U+0430 CYRILLIC SMALL LETTER A 480 | "\xe1" => "\xd0\xb1", #U+0431 CYRILLIC SMALL LETTER BE 481 | "\xe2" => "\xd0\xb2", #U+0432 CYRILLIC SMALL LETTER VE 482 | "\xe3" => "\xd0\xb3", #U+0433 CYRILLIC SMALL LETTER GHE 483 | "\xe4" => "\xd0\xb4", #U+0434 CYRILLIC SMALL LETTER DE 484 | "\xe5" => "\xd0\xb5", #U+0435 CYRILLIC SMALL LETTER IE 485 | "\xe6" => "\xd0\xb6", #U+0436 CYRILLIC SMALL LETTER ZHE 486 | "\xe7" => "\xd0\xb7", #U+0437 CYRILLIC SMALL LETTER ZE 487 | "\xe8" => "\xd0\xb8", #U+0438 CYRILLIC SMALL LETTER I 488 | "\xe9" => "\xd0\xb9", #U+0439 CYRILLIC SMALL LETTER SHORT I 489 | "\xea" => "\xd0\xba", #U+043a CYRILLIC SMALL LETTER KA 490 | "\xeb" => "\xd0\xbb", #U+043b CYRILLIC SMALL LETTER EL 491 | "\xec" => "\xd0\xbc", #U+043c CYRILLIC SMALL LETTER EM 492 | "\xed" => "\xd0\xbd", #U+043d CYRILLIC SMALL LETTER EN 493 | "\xee" => "\xd0\xbe", #U+043e CYRILLIC SMALL LETTER O 494 | "\xef" => "\xd0\xbf", #U+043f CYRILLIC SMALL LETTER PE 495 | "\xf0" => "\xd1\x80", #U+0440 CYRILLIC SMALL LETTER ER 496 | "\xf1" => "\xd1\x81", #U+0441 CYRILLIC SMALL LETTER ES 497 | "\xf2" => "\xd1\x82", #U+0442 CYRILLIC SMALL LETTER TE 498 | "\xf3" => "\xd1\x83", #U+0443 CYRILLIC SMALL LETTER U 499 | "\xf4" => "\xd1\x84", #U+0444 CYRILLIC SMALL LETTER EF 500 | "\xf5" => "\xd1\x85", #U+0445 CYRILLIC SMALL LETTER HA 501 | "\xf6" => "\xd1\x86", #U+0446 CYRILLIC SMALL LETTER TSE 502 | "\xf7" => "\xd1\x87", #U+0447 CYRILLIC SMALL LETTER CHE 503 | "\xf8" => "\xd1\x88", #U+0448 CYRILLIC SMALL LETTER SHA 504 | "\xf9" => "\xd1\x89", #U+0449 CYRILLIC SMALL LETTER SHCHA 505 | "\xfa" => "\xd1\x8a", #U+044a CYRILLIC SMALL LETTER HARD SIGN 506 | "\xfb" => "\xd1\x8b", #U+044b CYRILLIC SMALL LETTER YERU 507 | "\xfc" => "\xd1\x8c", #U+044c CYRILLIC SMALL LETTER SOFT SIGN 508 | "\xfd" => "\xd1\x8d", #U+044d CYRILLIC SMALL LETTER E 509 | "\xfe" => "\xd1\x8e", #U+044e CYRILLIC SMALL LETTER YU 510 | "\xff" => "\xd1\x8f", #U+044f CYRILLIC SMALL LETTER YA 511 | ); 512 | 513 | /** 514 | * UTF-8 Case lookup table 515 | * 516 | * This lookuptable defines the upper case letters to their correspponding 517 | * lower case letter in UTF-8 518 | * 519 | * @author Andreas Gohr 520 | */ 521 | public static $convert_case_table = array( 522 | #CASE_UPPER => case_lower 523 | "\x41" => "\x61", #A a 524 | "\x42" => "\x62", #B b 525 | "\x43" => "\x63", #C c 526 | "\x44" => "\x64", #D d 527 | "\x45" => "\x65", #E e 528 | "\x46" => "\x66", #F f 529 | "\x47" => "\x67", #G g 530 | "\x48" => "\x68", #H h 531 | "\x49" => "\x69", #I i 532 | "\x4a" => "\x6a", #J j 533 | "\x4b" => "\x6b", #K k 534 | "\x4c" => "\x6c", #L l 535 | "\x4d" => "\x6d", #M m 536 | "\x4e" => "\x6e", #N n 537 | "\x4f" => "\x6f", #O o 538 | "\x50" => "\x70", #P p 539 | "\x51" => "\x71", #Q q 540 | "\x52" => "\x72", #R r 541 | "\x53" => "\x73", #S s 542 | "\x54" => "\x74", #T t 543 | "\x55" => "\x75", #U u 544 | "\x56" => "\x76", #V v 545 | "\x57" => "\x77", #W w 546 | "\x58" => "\x78", #X x 547 | "\x59" => "\x79", #Y y 548 | "\x5a" => "\x7a", #Z z 549 | "\xc3\x80" => "\xc3\xa0", 550 | "\xc3\x81" => "\xc3\xa1", 551 | "\xc3\x82" => "\xc3\xa2", 552 | "\xc3\x83" => "\xc3\xa3", 553 | "\xc3\x84" => "\xc3\xa4", 554 | "\xc3\x85" => "\xc3\xa5", 555 | "\xc3\x86" => "\xc3\xa6", 556 | "\xc3\x87" => "\xc3\xa7", 557 | "\xc3\x88" => "\xc3\xa8", 558 | "\xc3\x89" => "\xc3\xa9", 559 | "\xc3\x8a" => "\xc3\xaa", 560 | "\xc3\x8b" => "\xc3\xab", 561 | "\xc3\x8c" => "\xc3\xac", 562 | "\xc3\x8d" => "\xc3\xad", 563 | "\xc3\x8e" => "\xc3\xae", 564 | "\xc3\x8f" => "\xc3\xaf", 565 | "\xc3\x90" => "\xc3\xb0", 566 | "\xc3\x91" => "\xc3\xb1", 567 | "\xc3\x92" => "\xc3\xb2", 568 | "\xc3\x93" => "\xc3\xb3", 569 | "\xc3\x94" => "\xc3\xb4", 570 | "\xc3\x95" => "\xc3\xb5", 571 | "\xc3\x96" => "\xc3\xb6", 572 | "\xc3\x98" => "\xc3\xb8", 573 | "\xc3\x99" => "\xc3\xb9", 574 | "\xc3\x9a" => "\xc3\xba", 575 | "\xc3\x9b" => "\xc3\xbb", 576 | "\xc3\x9c" => "\xc3\xbc", 577 | "\xc3\x9d" => "\xc3\xbd", 578 | "\xc3\x9e" => "\xc3\xbe", 579 | "\xc4\x80" => "\xc4\x81", 580 | "\xc4\x82" => "\xc4\x83", 581 | "\xc4\x84" => "\xc4\x85", 582 | "\xc4\x86" => "\xc4\x87", 583 | "\xc4\x88" => "\xc4\x89", 584 | "\xc4\x8a" => "\xc4\x8b", 585 | "\xc4\x8c" => "\xc4\x8d", 586 | "\xc4\x8e" => "\xc4\x8f", 587 | "\xc4\x90" => "\xc4\x91", 588 | "\xc4\x92" => "\xc4\x93", 589 | "\xc4\x94" => "\xc4\x95", 590 | "\xc4\x96" => "\xc4\x97", 591 | "\xc4\x98" => "\xc4\x99", 592 | "\xc4\x9a" => "\xc4\x9b", 593 | "\xc4\x9c" => "\xc4\x9d", 594 | "\xc4\x9e" => "\xc4\x9f", 595 | "\xc4\xa0" => "\xc4\xa1", 596 | "\xc4\xa2" => "\xc4\xa3", 597 | "\xc4\xa4" => "\xc4\xa5", 598 | "\xc4\xa6" => "\xc4\xa7", 599 | "\xc4\xa8" => "\xc4\xa9", 600 | "\xc4\xaa" => "\xc4\xab", 601 | "\xc4\xac" => "\xc4\xad", 602 | "\xc4\xae" => "\xc4\xaf", 603 | "\xc4\xb2" => "\xc4\xb3", 604 | "\xc4\xb4" => "\xc4\xb5", 605 | "\xc4\xb6" => "\xc4\xb7", 606 | "\xc4\xb9" => "\xc4\xba", 607 | "\xc4\xbb" => "\xc4\xbc", 608 | "\xc4\xbd" => "\xc4\xbe", 609 | "\xc4\xbf" => "\xc5\x80", 610 | "\xc5\x81" => "\xc5\x82", 611 | "\xc5\x83" => "\xc5\x84", 612 | "\xc5\x85" => "\xc5\x86", 613 | "\xc5\x87" => "\xc5\x88", 614 | "\xc5\x8a" => "\xc5\x8b", 615 | "\xc5\x8c" => "\xc5\x8d", 616 | "\xc5\x8e" => "\xc5\x8f", 617 | "\xc5\x90" => "\xc5\x91", 618 | "\xc5\x92" => "\xc5\x93", 619 | "\xc5\x94" => "\xc5\x95", 620 | "\xc5\x96" => "\xc5\x97", 621 | "\xc5\x98" => "\xc5\x99", 622 | "\xc5\x9a" => "\xc5\x9b", 623 | "\xc5\x9c" => "\xc5\x9d", 624 | "\xc5\x9e" => "\xc5\x9f", 625 | "\xc5\xa0" => "\xc5\xa1", 626 | "\xc5\xa2" => "\xc5\xa3", 627 | "\xc5\xa4" => "\xc5\xa5", 628 | "\xc5\xa6" => "\xc5\xa7", 629 | "\xc5\xa8" => "\xc5\xa9", 630 | "\xc5\xaa" => "\xc5\xab", 631 | "\xc5\xac" => "\xc5\xad", 632 | "\xc5\xae" => "\xc5\xaf", 633 | "\xc5\xb0" => "\xc5\xb1", 634 | "\xc5\xb2" => "\xc5\xb3", 635 | "\xc5\xb4" => "\xc5\xb5", 636 | "\xc5\xb6" => "\xc5\xb7", 637 | "\xc5\xb8" => "\xc3\xbf", 638 | "\xc5\xb9" => "\xc5\xba", 639 | "\xc5\xbb" => "\xc5\xbc", 640 | "\xc5\xbd" => "\xc5\xbe", 641 | "\xc6\x81" => "\xc9\x93", 642 | "\xc6\x82" => "\xc6\x83", 643 | "\xc6\x84" => "\xc6\x85", 644 | "\xc6\x86" => "\xc9\x94", 645 | "\xc6\x87" => "\xc6\x88", 646 | "\xc6\x89" => "\xc9\x96", 647 | "\xc6\x8a" => "\xc9\x97", 648 | "\xc6\x8b" => "\xc6\x8c", 649 | "\xc6\x8e" => "\xc7\x9d", 650 | "\xc6\x8f" => "\xc9\x99", 651 | "\xc6\x90" => "\xc9\x9b", 652 | "\xc6\x91" => "\xc6\x92", 653 | "\xc6\x94" => "\xc9\xa3", 654 | "\xc6\x96" => "\xc9\xa9", 655 | "\xc6\x97" => "\xc9\xa8", 656 | "\xc6\x98" => "\xc6\x99", 657 | "\xc6\x9c" => "\xc9\xaf", 658 | "\xc6\x9d" => "\xc9\xb2", 659 | "\xc6\x9f" => "\xc9\xb5", 660 | "\xc6\xa0" => "\xc6\xa1", 661 | "\xc6\xa2" => "\xc6\xa3", 662 | "\xc6\xa4" => "\xc6\xa5", 663 | "\xc6\xa6" => "\xca\x80", 664 | "\xc6\xa7" => "\xc6\xa8", 665 | "\xc6\xa9" => "\xca\x83", 666 | "\xc6\xac" => "\xc6\xad", 667 | "\xc6\xae" => "\xca\x88", 668 | "\xc6\xaf" => "\xc6\xb0", 669 | "\xc6\xb1" => "\xca\x8a", 670 | "\xc6\xb2" => "\xca\x8b", 671 | "\xc6\xb3" => "\xc6\xb4", 672 | "\xc6\xb5" => "\xc6\xb6", 673 | "\xc6\xb7" => "\xca\x92", 674 | "\xc6\xb8" => "\xc6\xb9", 675 | "\xc6\xbc" => "\xc6\xbd", 676 | "\xc7\x85" => "\xc7\x86", 677 | "\xc7\x88" => "\xc7\x89", 678 | "\xc7\x8b" => "\xc7\x8c", 679 | "\xc7\x8d" => "\xc7\x8e", 680 | "\xc7\x8f" => "\xc7\x90", 681 | "\xc7\x91" => "\xc7\x92", 682 | "\xc7\x93" => "\xc7\x94", 683 | "\xc7\x95" => "\xc7\x96", 684 | "\xc7\x97" => "\xc7\x98", 685 | "\xc7\x99" => "\xc7\x9a", 686 | "\xc7\x9b" => "\xc7\x9c", 687 | "\xc7\x9e" => "\xc7\x9f", 688 | "\xc7\xa0" => "\xc7\xa1", 689 | "\xc7\xa2" => "\xc7\xa3", 690 | "\xc7\xa4" => "\xc7\xa5", 691 | "\xc7\xa6" => "\xc7\xa7", 692 | "\xc7\xa8" => "\xc7\xa9", 693 | "\xc7\xaa" => "\xc7\xab", 694 | "\xc7\xac" => "\xc7\xad", 695 | "\xc7\xae" => "\xc7\xaf", 696 | "\xc7\xb2" => "\xc7\xb3", 697 | "\xc7\xb4" => "\xc7\xb5", 698 | "\xc7\xb6" => "\xc6\x95", 699 | "\xc7\xb7" => "\xc6\xbf", 700 | "\xc7\xb8" => "\xc7\xb9", 701 | "\xc7\xba" => "\xc7\xbb", 702 | "\xc7\xbc" => "\xc7\xbd", 703 | "\xc7\xbe" => "\xc7\xbf", 704 | "\xc8\x80" => "\xc8\x81", 705 | "\xc8\x82" => "\xc8\x83", 706 | "\xc8\x84" => "\xc8\x85", 707 | "\xc8\x86" => "\xc8\x87", 708 | "\xc8\x88" => "\xc8\x89", 709 | "\xc8\x8a" => "\xc8\x8b", 710 | "\xc8\x8c" => "\xc8\x8d", 711 | "\xc8\x8e" => "\xc8\x8f", 712 | "\xc8\x90" => "\xc8\x91", 713 | "\xc8\x92" => "\xc8\x93", 714 | "\xc8\x94" => "\xc8\x95", 715 | "\xc8\x96" => "\xc8\x97", 716 | "\xc8\x98" => "\xc8\x99", 717 | "\xc8\x9a" => "\xc8\x9b", 718 | "\xc8\x9c" => "\xc8\x9d", 719 | "\xc8\x9e" => "\xc8\x9f", 720 | "\xc8\xa0" => "\xc6\x9e", 721 | "\xc8\xa2" => "\xc8\xa3", 722 | "\xc8\xa4" => "\xc8\xa5", 723 | "\xc8\xa6" => "\xc8\xa7", 724 | "\xc8\xa8" => "\xc8\xa9", 725 | "\xc8\xaa" => "\xc8\xab", 726 | "\xc8\xac" => "\xc8\xad", 727 | "\xc8\xae" => "\xc8\xaf", 728 | "\xc8\xb0" => "\xc8\xb1", 729 | "\xc8\xb2" => "\xc8\xb3", 730 | "\xce\x86" => "\xce\xac", 731 | "\xce\x88" => "\xce\xad", 732 | "\xce\x89" => "\xce\xae", 733 | "\xce\x8a" => "\xce\xaf", 734 | "\xce\x8c" => "\xcf\x8c", 735 | "\xce\x8e" => "\xcf\x8d", 736 | "\xce\x8f" => "\xcf\x8e", 737 | "\xce\x91" => "\xce\xb1", 738 | "\xce\x92" => "\xce\xb2", 739 | "\xce\x93" => "\xce\xb3", 740 | "\xce\x94" => "\xce\xb4", 741 | "\xce\x95" => "\xce\xb5", 742 | "\xce\x96" => "\xce\xb6", 743 | "\xce\x97" => "\xce\xb7", 744 | "\xce\x98" => "\xce\xb8", 745 | "\xce\x99" => "\xce\xb9", 746 | "\xce\x9a" => "\xce\xba", 747 | "\xce\x9b" => "\xce\xbb", 748 | "\xce\x9c" => "\xc2\xb5", 749 | "\xce\x9d" => "\xce\xbd", 750 | "\xce\x9e" => "\xce\xbe", 751 | "\xce\x9f" => "\xce\xbf", 752 | "\xce\xa0" => "\xcf\x80", 753 | "\xce\xa1" => "\xcf\x81", 754 | "\xce\xa3" => "\xcf\x82", 755 | "\xce\xa4" => "\xcf\x84", 756 | "\xce\xa5" => "\xcf\x85", 757 | "\xce\xa6" => "\xcf\x86", 758 | "\xce\xa7" => "\xcf\x87", 759 | "\xce\xa8" => "\xcf\x88", 760 | "\xce\xa9" => "\xcf\x89", 761 | "\xce\xaa" => "\xcf\x8a", 762 | "\xce\xab" => "\xcf\x8b", 763 | "\xcf\x98" => "\xcf\x99", 764 | "\xcf\x9a" => "\xcf\x9b", 765 | "\xcf\x9c" => "\xcf\x9d", 766 | "\xcf\x9e" => "\xcf\x9f", 767 | "\xcf\xa0" => "\xcf\xa1", 768 | "\xcf\xa2" => "\xcf\xa3", 769 | "\xcf\xa4" => "\xcf\xa5", 770 | "\xcf\xa6" => "\xcf\xa7", 771 | "\xcf\xa8" => "\xcf\xa9", 772 | "\xcf\xaa" => "\xcf\xab", 773 | "\xcf\xac" => "\xcf\xad", 774 | "\xcf\xae" => "\xcf\xaf", 775 | "\xd0\x80" => "\xd1\x90", 776 | "\xd0\x81" => "\xd1\x91", 777 | "\xd0\x82" => "\xd1\x92", 778 | "\xd0\x83" => "\xd1\x93", 779 | "\xd0\x84" => "\xd1\x94", 780 | "\xd0\x85" => "\xd1\x95", 781 | "\xd0\x86" => "\xd1\x96", 782 | "\xd0\x87" => "\xd1\x97", 783 | "\xd0\x88" => "\xd1\x98", 784 | "\xd0\x89" => "\xd1\x99", 785 | "\xd0\x8a" => "\xd1\x9a", 786 | "\xd0\x8b" => "\xd1\x9b", 787 | "\xd0\x8c" => "\xd1\x9c", 788 | "\xd0\x8d" => "\xd1\x9d", 789 | "\xd0\x8e" => "\xd1\x9e", 790 | "\xd0\x8f" => "\xd1\x9f", 791 | "\xd0\x90" => "\xd0\xb0", 792 | "\xd0\x91" => "\xd0\xb1", 793 | "\xd0\x92" => "\xd0\xb2", 794 | "\xd0\x93" => "\xd0\xb3", 795 | "\xd0\x94" => "\xd0\xb4", 796 | "\xd0\x95" => "\xd0\xb5", 797 | "\xd0\x96" => "\xd0\xb6", 798 | "\xd0\x97" => "\xd0\xb7", 799 | "\xd0\x98" => "\xd0\xb8", 800 | "\xd0\x99" => "\xd0\xb9", 801 | "\xd0\x9a" => "\xd0\xba", 802 | "\xd0\x9b" => "\xd0\xbb", 803 | "\xd0\x9c" => "\xd0\xbc", 804 | "\xd0\x9d" => "\xd0\xbd", 805 | "\xd0\x9e" => "\xd0\xbe", 806 | "\xd0\x9f" => "\xd0\xbf", 807 | "\xd0\xa0" => "\xd1\x80", 808 | "\xd0\xa1" => "\xd1\x81", 809 | "\xd0\xa2" => "\xd1\x82", 810 | "\xd0\xa3" => "\xd1\x83", 811 | "\xd0\xa4" => "\xd1\x84", 812 | "\xd0\xa5" => "\xd1\x85", 813 | "\xd0\xa6" => "\xd1\x86", 814 | "\xd0\xa7" => "\xd1\x87", 815 | "\xd0\xa8" => "\xd1\x88", 816 | "\xd0\xa9" => "\xd1\x89", 817 | "\xd0\xaa" => "\xd1\x8a", 818 | "\xd0\xab" => "\xd1\x8b", 819 | "\xd0\xac" => "\xd1\x8c", 820 | "\xd0\xad" => "\xd1\x8d", 821 | "\xd0\xae" => "\xd1\x8e", 822 | "\xd0\xaf" => "\xd1\x8f", 823 | "\xd1\xa0" => "\xd1\xa1", 824 | "\xd1\xa2" => "\xd1\xa3", 825 | "\xd1\xa4" => "\xd1\xa5", 826 | "\xd1\xa6" => "\xd1\xa7", 827 | "\xd1\xa8" => "\xd1\xa9", 828 | "\xd1\xaa" => "\xd1\xab", 829 | "\xd1\xac" => "\xd1\xad", 830 | "\xd1\xae" => "\xd1\xaf", 831 | "\xd1\xb0" => "\xd1\xb1", 832 | "\xd1\xb2" => "\xd1\xb3", 833 | "\xd1\xb4" => "\xd1\xb5", 834 | "\xd1\xb6" => "\xd1\xb7", 835 | "\xd1\xb8" => "\xd1\xb9", 836 | "\xd1\xba" => "\xd1\xbb", 837 | "\xd1\xbc" => "\xd1\xbd", 838 | "\xd1\xbe" => "\xd1\xbf", 839 | "\xd2\x80" => "\xd2\x81", 840 | "\xd2\x8a" => "\xd2\x8b", 841 | "\xd2\x8c" => "\xd2\x8d", 842 | "\xd2\x8e" => "\xd2\x8f", 843 | "\xd2\x90" => "\xd2\x91", 844 | "\xd2\x92" => "\xd2\x93", 845 | "\xd2\x94" => "\xd2\x95", 846 | "\xd2\x96" => "\xd2\x97", 847 | "\xd2\x98" => "\xd2\x99", 848 | "\xd2\x9a" => "\xd2\x9b", 849 | "\xd2\x9c" => "\xd2\x9d", 850 | "\xd2\x9e" => "\xd2\x9f", 851 | "\xd2\xa0" => "\xd2\xa1", 852 | "\xd2\xa2" => "\xd2\xa3", 853 | "\xd2\xa4" => "\xd2\xa5", 854 | "\xd2\xa6" => "\xd2\xa7", 855 | "\xd2\xa8" => "\xd2\xa9", 856 | "\xd2\xaa" => "\xd2\xab", 857 | "\xd2\xac" => "\xd2\xad", 858 | "\xd2\xae" => "\xd2\xaf", 859 | "\xd2\xb0" => "\xd2\xb1", 860 | "\xd2\xb2" => "\xd2\xb3", 861 | "\xd2\xb4" => "\xd2\xb5", 862 | "\xd2\xb6" => "\xd2\xb7", 863 | "\xd2\xb8" => "\xd2\xb9", 864 | "\xd2\xba" => "\xd2\xbb", 865 | "\xd2\xbc" => "\xd2\xbd", 866 | "\xd2\xbe" => "\xd2\xbf", 867 | "\xd3\x81" => "\xd3\x82", 868 | "\xd3\x83" => "\xd3\x84", 869 | "\xd3\x85" => "\xd3\x86", 870 | "\xd3\x87" => "\xd3\x88", 871 | "\xd3\x89" => "\xd3\x8a", 872 | "\xd3\x8b" => "\xd3\x8c", 873 | "\xd3\x8d" => "\xd3\x8e", 874 | "\xd3\x90" => "\xd3\x91", 875 | "\xd3\x92" => "\xd3\x93", 876 | "\xd3\x94" => "\xd3\x95", 877 | "\xd3\x96" => "\xd3\x97", 878 | "\xd3\x98" => "\xd3\x99", 879 | "\xd3\x9a" => "\xd3\x9b", 880 | "\xd3\x9c" => "\xd3\x9d", 881 | "\xd3\x9e" => "\xd3\x9f", 882 | "\xd3\xa0" => "\xd3\xa1", 883 | "\xd3\xa2" => "\xd3\xa3", 884 | "\xd3\xa4" => "\xd3\xa5", 885 | "\xd3\xa6" => "\xd3\xa7", 886 | "\xd3\xa8" => "\xd3\xa9", 887 | "\xd3\xaa" => "\xd3\xab", 888 | "\xd3\xac" => "\xd3\xad", 889 | "\xd3\xae" => "\xd3\xaf", 890 | "\xd3\xb0" => "\xd3\xb1", 891 | "\xd3\xb2" => "\xd3\xb3", 892 | "\xd3\xb4" => "\xd3\xb5", 893 | "\xd3\xb8" => "\xd3\xb9", 894 | "\xd4\x80" => "\xd4\x81", 895 | "\xd4\x82" => "\xd4\x83", 896 | "\xd4\x84" => "\xd4\x85", 897 | "\xd4\x86" => "\xd4\x87", 898 | "\xd4\x88" => "\xd4\x89", 899 | "\xd4\x8a" => "\xd4\x8b", 900 | "\xd4\x8c" => "\xd4\x8d", 901 | "\xd4\x8e" => "\xd4\x8f", 902 | "\xd4\xb1" => "\xd5\xa1", 903 | "\xd4\xb2" => "\xd5\xa2", 904 | "\xd4\xb3" => "\xd5\xa3", 905 | "\xd4\xb4" => "\xd5\xa4", 906 | "\xd4\xb5" => "\xd5\xa5", 907 | "\xd4\xb6" => "\xd5\xa6", 908 | "\xd4\xb7" => "\xd5\xa7", 909 | "\xd4\xb8" => "\xd5\xa8", 910 | "\xd4\xb9" => "\xd5\xa9", 911 | "\xd4\xba" => "\xd5\xaa", 912 | "\xd4\xbb" => "\xd5\xab", 913 | "\xd4\xbc" => "\xd5\xac", 914 | "\xd4\xbd" => "\xd5\xad", 915 | "\xd4\xbe" => "\xd5\xae", 916 | "\xd4\xbf" => "\xd5\xaf", 917 | "\xd5\x80" => "\xd5\xb0", 918 | "\xd5\x81" => "\xd5\xb1", 919 | "\xd5\x82" => "\xd5\xb2", 920 | "\xd5\x83" => "\xd5\xb3", 921 | "\xd5\x84" => "\xd5\xb4", 922 | "\xd5\x85" => "\xd5\xb5", 923 | "\xd5\x86" => "\xd5\xb6", 924 | "\xd5\x87" => "\xd5\xb7", 925 | "\xd5\x88" => "\xd5\xb8", 926 | "\xd5\x89" => "\xd5\xb9", 927 | "\xd5\x8a" => "\xd5\xba", 928 | "\xd5\x8b" => "\xd5\xbb", 929 | "\xd5\x8c" => "\xd5\xbc", 930 | "\xd5\x8d" => "\xd5\xbd", 931 | "\xd5\x8e" => "\xd5\xbe", 932 | "\xd5\x8f" => "\xd5\xbf", 933 | "\xd5\x90" => "\xd6\x80", 934 | "\xd5\x91" => "\xd6\x81", 935 | "\xd5\x92" => "\xd6\x82", 936 | "\xd5\x93" => "\xd6\x83", 937 | "\xd5\x94" => "\xd6\x84", 938 | "\xd5\x95" => "\xd6\x85", 939 | "\xd5\x96" => "\xd6\x86", 940 | "\xe1\xb8\x80" => "\xe1\xb8\x81", 941 | "\xe1\xb8\x82" => "\xe1\xb8\x83", 942 | "\xe1\xb8\x84" => "\xe1\xb8\x85", 943 | "\xe1\xb8\x86" => "\xe1\xb8\x87", 944 | "\xe1\xb8\x88" => "\xe1\xb8\x89", 945 | "\xe1\xb8\x8a" => "\xe1\xb8\x8b", 946 | "\xe1\xb8\x8c" => "\xe1\xb8\x8d", 947 | "\xe1\xb8\x8e" => "\xe1\xb8\x8f", 948 | "\xe1\xb8\x90" => "\xe1\xb8\x91", 949 | "\xe1\xb8\x92" => "\xe1\xb8\x93", 950 | "\xe1\xb8\x94" => "\xe1\xb8\x95", 951 | "\xe1\xb8\x96" => "\xe1\xb8\x97", 952 | "\xe1\xb8\x98" => "\xe1\xb8\x99", 953 | "\xe1\xb8\x9a" => "\xe1\xb8\x9b", 954 | "\xe1\xb8\x9c" => "\xe1\xb8\x9d", 955 | "\xe1\xb8\x9e" => "\xe1\xb8\x9f", 956 | "\xe1\xb8\xa0" => "\xe1\xb8\xa1", 957 | "\xe1\xb8\xa2" => "\xe1\xb8\xa3", 958 | "\xe1\xb8\xa4" => "\xe1\xb8\xa5", 959 | "\xe1\xb8\xa6" => "\xe1\xb8\xa7", 960 | "\xe1\xb8\xa8" => "\xe1\xb8\xa9", 961 | "\xe1\xb8\xaa" => "\xe1\xb8\xab", 962 | "\xe1\xb8\xac" => "\xe1\xb8\xad", 963 | "\xe1\xb8\xae" => "\xe1\xb8\xaf", 964 | "\xe1\xb8\xb0" => "\xe1\xb8\xb1", 965 | "\xe1\xb8\xb2" => "\xe1\xb8\xb3", 966 | "\xe1\xb8\xb4" => "\xe1\xb8\xb5", 967 | "\xe1\xb8\xb6" => "\xe1\xb8\xb7", 968 | "\xe1\xb8\xb8" => "\xe1\xb8\xb9", 969 | "\xe1\xb8\xba" => "\xe1\xb8\xbb", 970 | "\xe1\xb8\xbc" => "\xe1\xb8\xbd", 971 | "\xe1\xb8\xbe" => "\xe1\xb8\xbf", 972 | "\xe1\xb9\x80" => "\xe1\xb9\x81", 973 | "\xe1\xb9\x82" => "\xe1\xb9\x83", 974 | "\xe1\xb9\x84" => "\xe1\xb9\x85", 975 | "\xe1\xb9\x86" => "\xe1\xb9\x87", 976 | "\xe1\xb9\x88" => "\xe1\xb9\x89", 977 | "\xe1\xb9\x8a" => "\xe1\xb9\x8b", 978 | "\xe1\xb9\x8c" => "\xe1\xb9\x8d", 979 | "\xe1\xb9\x8e" => "\xe1\xb9\x8f", 980 | "\xe1\xb9\x90" => "\xe1\xb9\x91", 981 | "\xe1\xb9\x92" => "\xe1\xb9\x93", 982 | "\xe1\xb9\x94" => "\xe1\xb9\x95", 983 | "\xe1\xb9\x96" => "\xe1\xb9\x97", 984 | "\xe1\xb9\x98" => "\xe1\xb9\x99", 985 | "\xe1\xb9\x9a" => "\xe1\xb9\x9b", 986 | "\xe1\xb9\x9c" => "\xe1\xb9\x9d", 987 | "\xe1\xb9\x9e" => "\xe1\xb9\x9f", 988 | "\xe1\xb9\xa0" => "\xe1\xb9\xa1", 989 | "\xe1\xb9\xa2" => "\xe1\xb9\xa3", 990 | "\xe1\xb9\xa4" => "\xe1\xb9\xa5", 991 | "\xe1\xb9\xa6" => "\xe1\xb9\xa7", 992 | "\xe1\xb9\xa8" => "\xe1\xb9\xa9", 993 | "\xe1\xb9\xaa" => "\xe1\xb9\xab", 994 | "\xe1\xb9\xac" => "\xe1\xb9\xad", 995 | "\xe1\xb9\xae" => "\xe1\xb9\xaf", 996 | "\xe1\xb9\xb0" => "\xe1\xb9\xb1", 997 | "\xe1\xb9\xb2" => "\xe1\xb9\xb3", 998 | "\xe1\xb9\xb4" => "\xe1\xb9\xb5", 999 | "\xe1\xb9\xb6" => "\xe1\xb9\xb7", 1000 | "\xe1\xb9\xb8" => "\xe1\xb9\xb9", 1001 | "\xe1\xb9\xba" => "\xe1\xb9\xbb", 1002 | "\xe1\xb9\xbc" => "\xe1\xb9\xbd", 1003 | "\xe1\xb9\xbe" => "\xe1\xb9\xbf", 1004 | "\xe1\xba\x80" => "\xe1\xba\x81", 1005 | "\xe1\xba\x82" => "\xe1\xba\x83", 1006 | "\xe1\xba\x84" => "\xe1\xba\x85", 1007 | "\xe1\xba\x86" => "\xe1\xba\x87", 1008 | "\xe1\xba\x88" => "\xe1\xba\x89", 1009 | "\xe1\xba\x8a" => "\xe1\xba\x8b", 1010 | "\xe1\xba\x8c" => "\xe1\xba\x8d", 1011 | "\xe1\xba\x8e" => "\xe1\xba\x8f", 1012 | "\xe1\xba\x90" => "\xe1\xba\x91", 1013 | "\xe1\xba\x92" => "\xe1\xba\x93", 1014 | "\xe1\xba\x94" => "\xe1\xba\x95", 1015 | "\xe1\xba\xa0" => "\xe1\xba\xa1", 1016 | "\xe1\xba\xa2" => "\xe1\xba\xa3", 1017 | "\xe1\xba\xa4" => "\xe1\xba\xa5", 1018 | "\xe1\xba\xa6" => "\xe1\xba\xa7", 1019 | "\xe1\xba\xa8" => "\xe1\xba\xa9", 1020 | "\xe1\xba\xaa" => "\xe1\xba\xab", 1021 | "\xe1\xba\xac" => "\xe1\xba\xad", 1022 | "\xe1\xba\xae" => "\xe1\xba\xaf", 1023 | "\xe1\xba\xb0" => "\xe1\xba\xb1", 1024 | "\xe1\xba\xb2" => "\xe1\xba\xb3", 1025 | "\xe1\xba\xb4" => "\xe1\xba\xb5", 1026 | "\xe1\xba\xb6" => "\xe1\xba\xb7", 1027 | "\xe1\xba\xb8" => "\xe1\xba\xb9", 1028 | "\xe1\xba\xba" => "\xe1\xba\xbb", 1029 | "\xe1\xba\xbc" => "\xe1\xba\xbd", 1030 | "\xe1\xba\xbe" => "\xe1\xba\xbf", 1031 | "\xe1\xbb\x80" => "\xe1\xbb\x81", 1032 | "\xe1\xbb\x82" => "\xe1\xbb\x83", 1033 | "\xe1\xbb\x84" => "\xe1\xbb\x85", 1034 | "\xe1\xbb\x86" => "\xe1\xbb\x87", 1035 | "\xe1\xbb\x88" => "\xe1\xbb\x89", 1036 | "\xe1\xbb\x8a" => "\xe1\xbb\x8b", 1037 | "\xe1\xbb\x8c" => "\xe1\xbb\x8d", 1038 | "\xe1\xbb\x8e" => "\xe1\xbb\x8f", 1039 | "\xe1\xbb\x90" => "\xe1\xbb\x91", 1040 | "\xe1\xbb\x92" => "\xe1\xbb\x93", 1041 | "\xe1\xbb\x94" => "\xe1\xbb\x95", 1042 | "\xe1\xbb\x96" => "\xe1\xbb\x97", 1043 | "\xe1\xbb\x98" => "\xe1\xbb\x99", 1044 | "\xe1\xbb\x9a" => "\xe1\xbb\x9b", 1045 | "\xe1\xbb\x9c" => "\xe1\xbb\x9d", 1046 | "\xe1\xbb\x9e" => "\xe1\xbb\x9f", 1047 | "\xe1\xbb\xa0" => "\xe1\xbb\xa1", 1048 | "\xe1\xbb\xa2" => "\xe1\xbb\xa3", 1049 | "\xe1\xbb\xa4" => "\xe1\xbb\xa5", 1050 | "\xe1\xbb\xa6" => "\xe1\xbb\xa7", 1051 | "\xe1\xbb\xa8" => "\xe1\xbb\xa9", 1052 | "\xe1\xbb\xaa" => "\xe1\xbb\xab", 1053 | "\xe1\xbb\xac" => "\xe1\xbb\xad", 1054 | "\xe1\xbb\xae" => "\xe1\xbb\xaf", 1055 | "\xe1\xbb\xb0" => "\xe1\xbb\xb1", 1056 | "\xe1\xbb\xb2" => "\xe1\xbb\xb3", 1057 | "\xe1\xbb\xb4" => "\xe1\xbb\xb5", 1058 | "\xe1\xbb\xb6" => "\xe1\xbb\xb7", 1059 | "\xe1\xbb\xb8" => "\xe1\xbb\xb9", 1060 | "\xe1\xbc\x88" => "\xe1\xbc\x80", 1061 | "\xe1\xbc\x89" => "\xe1\xbc\x81", 1062 | "\xe1\xbc\x8a" => "\xe1\xbc\x82", 1063 | "\xe1\xbc\x8b" => "\xe1\xbc\x83", 1064 | "\xe1\xbc\x8c" => "\xe1\xbc\x84", 1065 | "\xe1\xbc\x8d" => "\xe1\xbc\x85", 1066 | "\xe1\xbc\x8e" => "\xe1\xbc\x86", 1067 | "\xe1\xbc\x8f" => "\xe1\xbc\x87", 1068 | "\xe1\xbc\x98" => "\xe1\xbc\x90", 1069 | "\xe1\xbc\x99" => "\xe1\xbc\x91", 1070 | "\xe1\xbc\x9a" => "\xe1\xbc\x92", 1071 | "\xe1\xbc\x9b" => "\xe1\xbc\x93", 1072 | "\xe1\xbc\x9c" => "\xe1\xbc\x94", 1073 | "\xe1\xbc\x9d" => "\xe1\xbc\x95", 1074 | "\xe1\xbc\xa9" => "\xe1\xbc\xa1", 1075 | "\xe1\xbc\xaa" => "\xe1\xbc\xa2", 1076 | "\xe1\xbc\xab" => "\xe1\xbc\xa3", 1077 | "\xe1\xbc\xac" => "\xe1\xbc\xa4", 1078 | "\xe1\xbc\xad" => "\xe1\xbc\xa5", 1079 | "\xe1\xbc\xae" => "\xe1\xbc\xa6", 1080 | "\xe1\xbc\xaf" => "\xe1\xbc\xa7", 1081 | "\xe1\xbc\xb8" => "\xe1\xbc\xb0", 1082 | "\xe1\xbc\xb9" => "\xe1\xbc\xb1", 1083 | "\xe1\xbc\xba" => "\xe1\xbc\xb2", 1084 | "\xe1\xbc\xbb" => "\xe1\xbc\xb3", 1085 | "\xe1\xbc\xbc" => "\xe1\xbc\xb4", 1086 | "\xe1\xbc\xbd" => "\xe1\xbc\xb5", 1087 | "\xe1\xbc\xbe" => "\xe1\xbc\xb6", 1088 | "\xe1\xbc\xbf" => "\xe1\xbc\xb7", 1089 | "\xe1\xbd\x88" => "\xe1\xbd\x80", 1090 | "\xe1\xbd\x89" => "\xe1\xbd\x81", 1091 | "\xe1\xbd\x8a" => "\xe1\xbd\x82", 1092 | "\xe1\xbd\x8b" => "\xe1\xbd\x83", 1093 | "\xe1\xbd\x8c" => "\xe1\xbd\x84", 1094 | "\xe1\xbd\x8d" => "\xe1\xbd\x85", 1095 | "\xe1\xbd\x99" => "\xe1\xbd\x91", 1096 | "\xe1\xbd\x9b" => "\xe1\xbd\x93", 1097 | "\xe1\xbd\x9d" => "\xe1\xbd\x95", 1098 | "\xe1\xbd\x9f" => "\xe1\xbd\x97", 1099 | "\xe1\xbd\xa9" => "\xe1\xbd\xa1", 1100 | "\xe1\xbd\xaa" => "\xe1\xbd\xa2", 1101 | "\xe1\xbd\xab" => "\xe1\xbd\xa3", 1102 | "\xe1\xbd\xac" => "\xe1\xbd\xa4", 1103 | "\xe1\xbd\xad" => "\xe1\xbd\xa5", 1104 | "\xe1\xbd\xae" => "\xe1\xbd\xa6", 1105 | "\xe1\xbd\xaf" => "\xe1\xbd\xa7", 1106 | "\xe1\xbe\x88" => "\xe1\xbe\x80", 1107 | "\xe1\xbe\x89" => "\xe1\xbe\x81", 1108 | "\xe1\xbe\x8a" => "\xe1\xbe\x82", 1109 | "\xe1\xbe\x8b" => "\xe1\xbe\x83", 1110 | "\xe1\xbe\x8c" => "\xe1\xbe\x84", 1111 | "\xe1\xbe\x8d" => "\xe1\xbe\x85", 1112 | "\xe1\xbe\x8e" => "\xe1\xbe\x86", 1113 | "\xe1\xbe\x8f" => "\xe1\xbe\x87", 1114 | "\xe1\xbe\x98" => "\xe1\xbe\x90", 1115 | "\xe1\xbe\x99" => "\xe1\xbe\x91", 1116 | "\xe1\xbe\x9a" => "\xe1\xbe\x92", 1117 | "\xe1\xbe\x9b" => "\xe1\xbe\x93", 1118 | "\xe1\xbe\x9c" => "\xe1\xbe\x94", 1119 | "\xe1\xbe\x9d" => "\xe1\xbe\x95", 1120 | "\xe1\xbe\x9e" => "\xe1\xbe\x96", 1121 | "\xe1\xbe\x9f" => "\xe1\xbe\x97", 1122 | "\xe1\xbe\xa9" => "\xe1\xbe\xa1", 1123 | "\xe1\xbe\xaa" => "\xe1\xbe\xa2", 1124 | "\xe1\xbe\xab" => "\xe1\xbe\xa3", 1125 | "\xe1\xbe\xac" => "\xe1\xbe\xa4", 1126 | "\xe1\xbe\xad" => "\xe1\xbe\xa5", 1127 | "\xe1\xbe\xae" => "\xe1\xbe\xa6", 1128 | "\xe1\xbe\xaf" => "\xe1\xbe\xa7", 1129 | "\xe1\xbe\xb8" => "\xe1\xbe\xb0", 1130 | "\xe1\xbe\xb9" => "\xe1\xbe\xb1", 1131 | "\xe1\xbe\xba" => "\xe1\xbd\xb0", 1132 | "\xe1\xbe\xbb" => "\xe1\xbd\xb1", 1133 | "\xe1\xbe\xbc" => "\xe1\xbe\xb3", 1134 | "\xe1\xbf\x88" => "\xe1\xbd\xb2", 1135 | "\xe1\xbf\x89" => "\xe1\xbd\xb3", 1136 | "\xe1\xbf\x8a" => "\xe1\xbd\xb4", 1137 | "\xe1\xbf\x8b" => "\xe1\xbd\xb5", 1138 | "\xe1\xbf\x8c" => "\xe1\xbf\x83", 1139 | "\xe1\xbf\x98" => "\xe1\xbf\x90", 1140 | "\xe1\xbf\x99" => "\xe1\xbf\x91", 1141 | "\xe1\xbf\x9a" => "\xe1\xbd\xb6", 1142 | "\xe1\xbf\x9b" => "\xe1\xbd\xb7", 1143 | "\xe1\xbf\xa9" => "\xe1\xbf\xa1", 1144 | "\xe1\xbf\xaa" => "\xe1\xbd\xba", 1145 | "\xe1\xbf\xab" => "\xe1\xbd\xbb", 1146 | "\xe1\xbf\xac" => "\xe1\xbf\xa5", 1147 | "\xe1\xbf\xb8" => "\xe1\xbd\xb8", 1148 | "\xe1\xbf\xb9" => "\xe1\xbd\xb9", 1149 | "\xe1\xbf\xba" => "\xe1\xbd\xbc", 1150 | "\xe1\xbf\xbb" => "\xe1\xbd\xbd", 1151 | "\xe1\xbf\xbc" => "\xe1\xbf\xb3", 1152 | "\xef\xbc\xa1" => "\xef\xbd\x81", 1153 | "\xef\xbc\xa2" => "\xef\xbd\x82", 1154 | "\xef\xbc\xa3" => "\xef\xbd\x83", 1155 | "\xef\xbc\xa4" => "\xef\xbd\x84", 1156 | "\xef\xbc\xa5" => "\xef\xbd\x85", 1157 | "\xef\xbc\xa6" => "\xef\xbd\x86", 1158 | "\xef\xbc\xa7" => "\xef\xbd\x87", 1159 | "\xef\xbc\xa8" => "\xef\xbd\x88", 1160 | "\xef\xbc\xa9" => "\xef\xbd\x89", 1161 | "\xef\xbc\xaa" => "\xef\xbd\x8a", 1162 | "\xef\xbc\xab" => "\xef\xbd\x8b", 1163 | "\xef\xbc\xac" => "\xef\xbd\x8c", 1164 | "\xef\xbc\xad" => "\xef\xbd\x8d", 1165 | "\xef\xbc\xae" => "\xef\xbd\x8e", 1166 | "\xef\xbc\xaf" => "\xef\xbd\x8f", 1167 | "\xef\xbc\xb0" => "\xef\xbd\x90", 1168 | "\xef\xbc\xb1" => "\xef\xbd\x91", 1169 | "\xef\xbc\xb2" => "\xef\xbd\x92", 1170 | "\xef\xbc\xb3" => "\xef\xbd\x93", 1171 | "\xef\xbc\xb4" => "\xef\xbd\x94", 1172 | "\xef\xbc\xb5" => "\xef\xbd\x95", 1173 | "\xef\xbc\xb6" => "\xef\xbd\x96", 1174 | "\xef\xbc\xb7" => "\xef\xbd\x97", 1175 | "\xef\xbc\xb8" => "\xef\xbd\x98", 1176 | "\xef\xbc\xb9" => "\xef\xbd\x99", 1177 | "\xef\xbc\xba" => "\xef\xbd\x9a", 1178 | ); 1179 | 1180 | #Unicode Character Database 6.0.0 (2010-06-04) 1181 | #autogenerated by unicode_blocks_txt2php() PHP function at 2011-06-04 00:19:39, 209 blocks total 1182 | public static $unicode_blocks = array( 1183 | 'Basic Latin' => array( 1184 | 0 => 0x0000, 1185 | 1 => 0x007F, 1186 | 2 => 0, 1187 | ), 1188 | 'Latin-1 Supplement' => array( 1189 | 0 => 0x0080, 1190 | 1 => 0x00FF, 1191 | 2 => 1, 1192 | ), 1193 | 'Latin Extended-A' => array( 1194 | 0 => 0x0100, 1195 | 1 => 0x017F, 1196 | 2 => 2, 1197 | ), 1198 | 'Latin Extended-B' => array( 1199 | 0 => 0x0180, 1200 | 1 => 0x024F, 1201 | 2 => 3, 1202 | ), 1203 | 'IPA Extensions' => array( 1204 | 0 => 0x0250, 1205 | 1 => 0x02AF, 1206 | 2 => 4, 1207 | ), 1208 | 'Spacing Modifier Letters' => array( 1209 | 0 => 0x02B0, 1210 | 1 => 0x02FF, 1211 | 2 => 5, 1212 | ), 1213 | 'Combining Diacritical Marks' => array( 1214 | 0 => 0x0300, 1215 | 1 => 0x036F, 1216 | 2 => 6, 1217 | ), 1218 | 'Greek and Coptic' => array( 1219 | 0 => 0x0370, 1220 | 1 => 0x03FF, 1221 | 2 => 7, 1222 | ), 1223 | 'Cyrillic' => array( 1224 | 0 => 0x0400, 1225 | 1 => 0x04FF, 1226 | 2 => 8, 1227 | ), 1228 | 'Cyrillic Supplement' => array( 1229 | 0 => 0x0500, 1230 | 1 => 0x052F, 1231 | 2 => 9, 1232 | ), 1233 | 'Armenian' => array( 1234 | 0 => 0x0530, 1235 | 1 => 0x058F, 1236 | 2 => 10, 1237 | ), 1238 | 'Hebrew' => array( 1239 | 0 => 0x0590, 1240 | 1 => 0x05FF, 1241 | 2 => 11, 1242 | ), 1243 | 'Arabic' => array( 1244 | 0 => 0x0600, 1245 | 1 => 0x06FF, 1246 | 2 => 12, 1247 | ), 1248 | 'Syriac' => array( 1249 | 0 => 0x0700, 1250 | 1 => 0x074F, 1251 | 2 => 13, 1252 | ), 1253 | 'Arabic Supplement' => array( 1254 | 0 => 0x0750, 1255 | 1 => 0x077F, 1256 | 2 => 14, 1257 | ), 1258 | 'Thaana' => array( 1259 | 0 => 0x0780, 1260 | 1 => 0x07BF, 1261 | 2 => 15, 1262 | ), 1263 | 'NKo' => array( 1264 | 0 => 0x07C0, 1265 | 1 => 0x07FF, 1266 | 2 => 16, 1267 | ), 1268 | 'Samaritan' => array( 1269 | 0 => 0x0800, 1270 | 1 => 0x083F, 1271 | 2 => 17, 1272 | ), 1273 | 'Mandaic' => array( 1274 | 0 => 0x0840, 1275 | 1 => 0x085F, 1276 | 2 => 18, 1277 | ), 1278 | 'Devanagari' => array( 1279 | 0 => 0x0900, 1280 | 1 => 0x097F, 1281 | 2 => 19, 1282 | ), 1283 | 'Bengali' => array( 1284 | 0 => 0x0980, 1285 | 1 => 0x09FF, 1286 | 2 => 20, 1287 | ), 1288 | 'Gurmukhi' => array( 1289 | 0 => 0x0A00, 1290 | 1 => 0x0A7F, 1291 | 2 => 21, 1292 | ), 1293 | 'Gujarati' => array( 1294 | 0 => 0x0A80, 1295 | 1 => 0x0AFF, 1296 | 2 => 22, 1297 | ), 1298 | 'Oriya' => array( 1299 | 0 => 0x0B00, 1300 | 1 => 0x0B7F, 1301 | 2 => 23, 1302 | ), 1303 | 'Tamil' => array( 1304 | 0 => 0x0B80, 1305 | 1 => 0x0BFF, 1306 | 2 => 24, 1307 | ), 1308 | 'Telugu' => array( 1309 | 0 => 0x0C00, 1310 | 1 => 0x0C7F, 1311 | 2 => 25, 1312 | ), 1313 | 'Kannada' => array( 1314 | 0 => 0x0C80, 1315 | 1 => 0x0CFF, 1316 | 2 => 26, 1317 | ), 1318 | 'Malayalam' => array( 1319 | 0 => 0x0D00, 1320 | 1 => 0x0D7F, 1321 | 2 => 27, 1322 | ), 1323 | 'Sinhala' => array( 1324 | 0 => 0x0D80, 1325 | 1 => 0x0DFF, 1326 | 2 => 28, 1327 | ), 1328 | 'Thai' => array( 1329 | 0 => 0x0E00, 1330 | 1 => 0x0E7F, 1331 | 2 => 29, 1332 | ), 1333 | 'Lao' => array( 1334 | 0 => 0x0E80, 1335 | 1 => 0x0EFF, 1336 | 2 => 30, 1337 | ), 1338 | 'Tibetan' => array( 1339 | 0 => 0x0F00, 1340 | 1 => 0x0FFF, 1341 | 2 => 31, 1342 | ), 1343 | 'Myanmar' => array( 1344 | 0 => 0x1000, 1345 | 1 => 0x109F, 1346 | 2 => 32, 1347 | ), 1348 | 'Georgian' => array( 1349 | 0 => 0x10A0, 1350 | 1 => 0x10FF, 1351 | 2 => 33, 1352 | ), 1353 | 'Hangul Jamo' => array( 1354 | 0 => 0x1100, 1355 | 1 => 0x11FF, 1356 | 2 => 34, 1357 | ), 1358 | 'Ethiopic' => array( 1359 | 0 => 0x1200, 1360 | 1 => 0x137F, 1361 | 2 => 35, 1362 | ), 1363 | 'Ethiopic Supplement' => array( 1364 | 0 => 0x1380, 1365 | 1 => 0x139F, 1366 | 2 => 36, 1367 | ), 1368 | 'Cherokee' => array( 1369 | 0 => 0x13A0, 1370 | 1 => 0x13FF, 1371 | 2 => 37, 1372 | ), 1373 | 'Unified Canadian Aboriginal Syllabics' => array( 1374 | 0 => 0x1400, 1375 | 1 => 0x167F, 1376 | 2 => 38, 1377 | ), 1378 | 'Ogham' => array( 1379 | 0 => 0x1680, 1380 | 1 => 0x169F, 1381 | 2 => 39, 1382 | ), 1383 | 'Runic' => array( 1384 | 0 => 0x16A0, 1385 | 1 => 0x16FF, 1386 | 2 => 40, 1387 | ), 1388 | 'Tagalog' => array( 1389 | 0 => 0x1700, 1390 | 1 => 0x171F, 1391 | 2 => 41, 1392 | ), 1393 | 'Hanunoo' => array( 1394 | 0 => 0x1720, 1395 | 1 => 0x173F, 1396 | 2 => 42, 1397 | ), 1398 | 'Buhid' => array( 1399 | 0 => 0x1740, 1400 | 1 => 0x175F, 1401 | 2 => 43, 1402 | ), 1403 | 'Tagbanwa' => array( 1404 | 0 => 0x1760, 1405 | 1 => 0x177F, 1406 | 2 => 44, 1407 | ), 1408 | 'Khmer' => array( 1409 | 0 => 0x1780, 1410 | 1 => 0x17FF, 1411 | 2 => 45, 1412 | ), 1413 | 'Mongolian' => array( 1414 | 0 => 0x1800, 1415 | 1 => 0x18AF, 1416 | 2 => 46, 1417 | ), 1418 | 'Unified Canadian Aboriginal Syllabics Extended' => array( 1419 | 0 => 0x18B0, 1420 | 1 => 0x18FF, 1421 | 2 => 47, 1422 | ), 1423 | 'Limbu' => array( 1424 | 0 => 0x1900, 1425 | 1 => 0x194F, 1426 | 2 => 48, 1427 | ), 1428 | 'Tai Le' => array( 1429 | 0 => 0x1950, 1430 | 1 => 0x197F, 1431 | 2 => 49, 1432 | ), 1433 | 'New Tai Lue' => array( 1434 | 0 => 0x1980, 1435 | 1 => 0x19DF, 1436 | 2 => 50, 1437 | ), 1438 | 'Khmer Symbols' => array( 1439 | 0 => 0x19E0, 1440 | 1 => 0x19FF, 1441 | 2 => 51, 1442 | ), 1443 | 'Buginese' => array( 1444 | 0 => 0x1A00, 1445 | 1 => 0x1A1F, 1446 | 2 => 52, 1447 | ), 1448 | 'Tai Tham' => array( 1449 | 0 => 0x1A20, 1450 | 1 => 0x1AAF, 1451 | 2 => 53, 1452 | ), 1453 | 'Balinese' => array( 1454 | 0 => 0x1B00, 1455 | 1 => 0x1B7F, 1456 | 2 => 54, 1457 | ), 1458 | 'Sundanese' => array( 1459 | 0 => 0x1B80, 1460 | 1 => 0x1BBF, 1461 | 2 => 55, 1462 | ), 1463 | 'Batak' => array( 1464 | 0 => 0x1BC0, 1465 | 1 => 0x1BFF, 1466 | 2 => 56, 1467 | ), 1468 | 'Lepcha' => array( 1469 | 0 => 0x1C00, 1470 | 1 => 0x1C4F, 1471 | 2 => 57, 1472 | ), 1473 | 'Ol Chiki' => array( 1474 | 0 => 0x1C50, 1475 | 1 => 0x1C7F, 1476 | 2 => 58, 1477 | ), 1478 | 'Vedic Extensions' => array( 1479 | 0 => 0x1CD0, 1480 | 1 => 0x1CFF, 1481 | 2 => 59, 1482 | ), 1483 | 'Phonetic Extensions' => array( 1484 | 0 => 0x1D00, 1485 | 1 => 0x1D7F, 1486 | 2 => 60, 1487 | ), 1488 | 'Phonetic Extensions Supplement' => array( 1489 | 0 => 0x1D80, 1490 | 1 => 0x1DBF, 1491 | 2 => 61, 1492 | ), 1493 | 'Combining Diacritical Marks Supplement' => array( 1494 | 0 => 0x1DC0, 1495 | 1 => 0x1DFF, 1496 | 2 => 62, 1497 | ), 1498 | 'Latin Extended Additional' => array( 1499 | 0 => 0x1E00, 1500 | 1 => 0x1EFF, 1501 | 2 => 63, 1502 | ), 1503 | 'Greek Extended' => array( 1504 | 0 => 0x1F00, 1505 | 1 => 0x1FFF, 1506 | 2 => 64, 1507 | ), 1508 | 'General Punctuation' => array( 1509 | 0 => 0x2000, 1510 | 1 => 0x206F, 1511 | 2 => 65, 1512 | ), 1513 | 'Superscripts and Subscripts' => array( 1514 | 0 => 0x2070, 1515 | 1 => 0x209F, 1516 | 2 => 66, 1517 | ), 1518 | 'Currency Symbols' => array( 1519 | 0 => 0x20A0, 1520 | 1 => 0x20CF, 1521 | 2 => 67, 1522 | ), 1523 | 'Combining Diacritical Marks for Symbols' => array( 1524 | 0 => 0x20D0, 1525 | 1 => 0x20FF, 1526 | 2 => 68, 1527 | ), 1528 | 'Letterlike Symbols' => array( 1529 | 0 => 0x2100, 1530 | 1 => 0x214F, 1531 | 2 => 69, 1532 | ), 1533 | 'Number Forms' => array( 1534 | 0 => 0x2150, 1535 | 1 => 0x218F, 1536 | 2 => 70, 1537 | ), 1538 | 'Arrows' => array( 1539 | 0 => 0x2190, 1540 | 1 => 0x21FF, 1541 | 2 => 71, 1542 | ), 1543 | 'Mathematical Operators' => array( 1544 | 0 => 0x2200, 1545 | 1 => 0x22FF, 1546 | 2 => 72, 1547 | ), 1548 | 'Miscellaneous Technical' => array( 1549 | 0 => 0x2300, 1550 | 1 => 0x23FF, 1551 | 2 => 73, 1552 | ), 1553 | 'Control Pictures' => array( 1554 | 0 => 0x2400, 1555 | 1 => 0x243F, 1556 | 2 => 74, 1557 | ), 1558 | 'Optical Character Recognition' => array( 1559 | 0 => 0x2440, 1560 | 1 => 0x245F, 1561 | 2 => 75, 1562 | ), 1563 | 'Enclosed Alphanumerics' => array( 1564 | 0 => 0x2460, 1565 | 1 => 0x24FF, 1566 | 2 => 76, 1567 | ), 1568 | 'Box Drawing' => array( 1569 | 0 => 0x2500, 1570 | 1 => 0x257F, 1571 | 2 => 77, 1572 | ), 1573 | 'Block Elements' => array( 1574 | 0 => 0x2580, 1575 | 1 => 0x259F, 1576 | 2 => 78, 1577 | ), 1578 | 'Geometric Shapes' => array( 1579 | 0 => 0x25A0, 1580 | 1 => 0x25FF, 1581 | 2 => 79, 1582 | ), 1583 | 'Miscellaneous Symbols' => array( 1584 | 0 => 0x2600, 1585 | 1 => 0x26FF, 1586 | 2 => 80, 1587 | ), 1588 | 'Dingbats' => array( 1589 | 0 => 0x2700, 1590 | 1 => 0x27BF, 1591 | 2 => 81, 1592 | ), 1593 | 'Miscellaneous Mathematical Symbols-A' => array( 1594 | 0 => 0x27C0, 1595 | 1 => 0x27EF, 1596 | 2 => 82, 1597 | ), 1598 | 'Supplemental Arrows-A' => array( 1599 | 0 => 0x27F0, 1600 | 1 => 0x27FF, 1601 | 2 => 83, 1602 | ), 1603 | 'Braille Patterns' => array( 1604 | 0 => 0x2800, 1605 | 1 => 0x28FF, 1606 | 2 => 84, 1607 | ), 1608 | 'Supplemental Arrows-B' => array( 1609 | 0 => 0x2900, 1610 | 1 => 0x297F, 1611 | 2 => 85, 1612 | ), 1613 | 'Miscellaneous Mathematical Symbols-B' => array( 1614 | 0 => 0x2980, 1615 | 1 => 0x29FF, 1616 | 2 => 86, 1617 | ), 1618 | 'Supplemental Mathematical Operators' => array( 1619 | 0 => 0x2A00, 1620 | 1 => 0x2AFF, 1621 | 2 => 87, 1622 | ), 1623 | 'Miscellaneous Symbols and Arrows' => array( 1624 | 0 => 0x2B00, 1625 | 1 => 0x2BFF, 1626 | 2 => 88, 1627 | ), 1628 | 'Glagolitic' => array( 1629 | 0 => 0x2C00, 1630 | 1 => 0x2C5F, 1631 | 2 => 89, 1632 | ), 1633 | 'Latin Extended-C' => array( 1634 | 0 => 0x2C60, 1635 | 1 => 0x2C7F, 1636 | 2 => 90, 1637 | ), 1638 | 'Coptic' => array( 1639 | 0 => 0x2C80, 1640 | 1 => 0x2CFF, 1641 | 2 => 91, 1642 | ), 1643 | 'Georgian Supplement' => array( 1644 | 0 => 0x2D00, 1645 | 1 => 0x2D2F, 1646 | 2 => 92, 1647 | ), 1648 | 'Tifinagh' => array( 1649 | 0 => 0x2D30, 1650 | 1 => 0x2D7F, 1651 | 2 => 93, 1652 | ), 1653 | 'Ethiopic Extended' => array( 1654 | 0 => 0x2D80, 1655 | 1 => 0x2DDF, 1656 | 2 => 94, 1657 | ), 1658 | 'Cyrillic Extended-A' => array( 1659 | 0 => 0x2DE0, 1660 | 1 => 0x2DFF, 1661 | 2 => 95, 1662 | ), 1663 | 'Supplemental Punctuation' => array( 1664 | 0 => 0x2E00, 1665 | 1 => 0x2E7F, 1666 | 2 => 96, 1667 | ), 1668 | 'CJK Radicals Supplement' => array( 1669 | 0 => 0x2E80, 1670 | 1 => 0x2EFF, 1671 | 2 => 97, 1672 | ), 1673 | 'Kangxi Radicals' => array( 1674 | 0 => 0x2F00, 1675 | 1 => 0x2FDF, 1676 | 2 => 98, 1677 | ), 1678 | 'Ideographic Description Characters' => array( 1679 | 0 => 0x2FF0, 1680 | 1 => 0x2FFF, 1681 | 2 => 99, 1682 | ), 1683 | 'CJK Symbols and Punctuation' => array( 1684 | 0 => 0x3000, 1685 | 1 => 0x303F, 1686 | 2 => 100, 1687 | ), 1688 | 'Hiragana' => array( 1689 | 0 => 0x3040, 1690 | 1 => 0x309F, 1691 | 2 => 101, 1692 | ), 1693 | 'Katakana' => array( 1694 | 0 => 0x30A0, 1695 | 1 => 0x30FF, 1696 | 2 => 102, 1697 | ), 1698 | 'Bopomofo' => array( 1699 | 0 => 0x3100, 1700 | 1 => 0x312F, 1701 | 2 => 103, 1702 | ), 1703 | 'Hangul Compatibility Jamo' => array( 1704 | 0 => 0x3130, 1705 | 1 => 0x318F, 1706 | 2 => 104, 1707 | ), 1708 | 'Kanbun' => array( 1709 | 0 => 0x3190, 1710 | 1 => 0x319F, 1711 | 2 => 105, 1712 | ), 1713 | 'Bopomofo Extended' => array( 1714 | 0 => 0x31A0, 1715 | 1 => 0x31BF, 1716 | 2 => 106, 1717 | ), 1718 | 'CJK Strokes' => array( 1719 | 0 => 0x31C0, 1720 | 1 => 0x31EF, 1721 | 2 => 107, 1722 | ), 1723 | 'Katakana Phonetic Extensions' => array( 1724 | 0 => 0x31F0, 1725 | 1 => 0x31FF, 1726 | 2 => 108, 1727 | ), 1728 | 'Enclosed CJK Letters and Months' => array( 1729 | 0 => 0x3200, 1730 | 1 => 0x32FF, 1731 | 2 => 109, 1732 | ), 1733 | 'CJK Compatibility' => array( 1734 | 0 => 0x3300, 1735 | 1 => 0x33FF, 1736 | 2 => 110, 1737 | ), 1738 | 'CJK Unified Ideographs Extension A' => array( 1739 | 0 => 0x3400, 1740 | 1 => 0x4DBF, 1741 | 2 => 111, 1742 | ), 1743 | 'Yijing Hexagram Symbols' => array( 1744 | 0 => 0x4DC0, 1745 | 1 => 0x4DFF, 1746 | 2 => 112, 1747 | ), 1748 | 'CJK Unified Ideographs' => array( 1749 | 0 => 0x4E00, 1750 | 1 => 0x9FFF, 1751 | 2 => 113, 1752 | ), 1753 | 'Yi Syllables' => array( 1754 | 0 => 0xA000, 1755 | 1 => 0xA48F, 1756 | 2 => 114, 1757 | ), 1758 | 'Yi Radicals' => array( 1759 | 0 => 0xA490, 1760 | 1 => 0xA4CF, 1761 | 2 => 115, 1762 | ), 1763 | 'Lisu' => array( 1764 | 0 => 0xA4D0, 1765 | 1 => 0xA4FF, 1766 | 2 => 116, 1767 | ), 1768 | 'Vai' => array( 1769 | 0 => 0xA500, 1770 | 1 => 0xA63F, 1771 | 2 => 117, 1772 | ), 1773 | 'Cyrillic Extended-B' => array( 1774 | 0 => 0xA640, 1775 | 1 => 0xA69F, 1776 | 2 => 118, 1777 | ), 1778 | 'Bamum' => array( 1779 | 0 => 0xA6A0, 1780 | 1 => 0xA6FF, 1781 | 2 => 119, 1782 | ), 1783 | 'Modifier Tone Letters' => array( 1784 | 0 => 0xA700, 1785 | 1 => 0xA71F, 1786 | 2 => 120, 1787 | ), 1788 | 'Latin Extended-D' => array( 1789 | 0 => 0xA720, 1790 | 1 => 0xA7FF, 1791 | 2 => 121, 1792 | ), 1793 | 'Syloti Nagri' => array( 1794 | 0 => 0xA800, 1795 | 1 => 0xA82F, 1796 | 2 => 122, 1797 | ), 1798 | 'Common Indic Number Forms' => array( 1799 | 0 => 0xA830, 1800 | 1 => 0xA83F, 1801 | 2 => 123, 1802 | ), 1803 | 'Phags-pa' => array( 1804 | 0 => 0xA840, 1805 | 1 => 0xA87F, 1806 | 2 => 124, 1807 | ), 1808 | 'Saurashtra' => array( 1809 | 0 => 0xA880, 1810 | 1 => 0xA8DF, 1811 | 2 => 125, 1812 | ), 1813 | 'Devanagari Extended' => array( 1814 | 0 => 0xA8E0, 1815 | 1 => 0xA8FF, 1816 | 2 => 126, 1817 | ), 1818 | 'Kayah Li' => array( 1819 | 0 => 0xA900, 1820 | 1 => 0xA92F, 1821 | 2 => 127, 1822 | ), 1823 | 'Rejang' => array( 1824 | 0 => 0xA930, 1825 | 1 => 0xA95F, 1826 | 2 => 128, 1827 | ), 1828 | 'Hangul Jamo Extended-A' => array( 1829 | 0 => 0xA960, 1830 | 1 => 0xA97F, 1831 | 2 => 129, 1832 | ), 1833 | 'Javanese' => array( 1834 | 0 => 0xA980, 1835 | 1 => 0xA9DF, 1836 | 2 => 130, 1837 | ), 1838 | 'Cham' => array( 1839 | 0 => 0xAA00, 1840 | 1 => 0xAA5F, 1841 | 2 => 131, 1842 | ), 1843 | 'Myanmar Extended-A' => array( 1844 | 0 => 0xAA60, 1845 | 1 => 0xAA7F, 1846 | 2 => 132, 1847 | ), 1848 | 'Tai Viet' => array( 1849 | 0 => 0xAA80, 1850 | 1 => 0xAADF, 1851 | 2 => 133, 1852 | ), 1853 | 'Ethiopic Extended-A' => array( 1854 | 0 => 0xAB00, 1855 | 1 => 0xAB2F, 1856 | 2 => 134, 1857 | ), 1858 | 'Meetei Mayek' => array( 1859 | 0 => 0xABC0, 1860 | 1 => 0xABFF, 1861 | 2 => 135, 1862 | ), 1863 | 'Hangul Syllables' => array( 1864 | 0 => 0xAC00, 1865 | 1 => 0xD7AF, 1866 | 2 => 136, 1867 | ), 1868 | 'Hangul Jamo Extended-B' => array( 1869 | 0 => 0xD7B0, 1870 | 1 => 0xD7FF, 1871 | 2 => 137, 1872 | ), 1873 | 'High Surrogates' => array( 1874 | 0 => 0xD800, 1875 | 1 => 0xDB7F, 1876 | 2 => 138, 1877 | ), 1878 | 'High Private Use Surrogates' => array( 1879 | 0 => 0xDB80, 1880 | 1 => 0xDBFF, 1881 | 2 => 139, 1882 | ), 1883 | 'Low Surrogates' => array( 1884 | 0 => 0xDC00, 1885 | 1 => 0xDFFF, 1886 | 2 => 140, 1887 | ), 1888 | 'Private Use Area' => array( 1889 | 0 => 0xE000, 1890 | 1 => 0xF8FF, 1891 | 2 => 141, 1892 | ), 1893 | 'CJK Compatibility Ideographs' => array( 1894 | 0 => 0xF900, 1895 | 1 => 0xFAFF, 1896 | 2 => 142, 1897 | ), 1898 | 'Alphabetic Presentation Forms' => array( 1899 | 0 => 0xFB00, 1900 | 1 => 0xFB4F, 1901 | 2 => 143, 1902 | ), 1903 | 'Arabic Presentation Forms-A' => array( 1904 | 0 => 0xFB50, 1905 | 1 => 0xFDFF, 1906 | 2 => 144, 1907 | ), 1908 | 'Variation Selectors' => array( 1909 | 0 => 0xFE00, 1910 | 1 => 0xFE0F, 1911 | 2 => 145, 1912 | ), 1913 | 'Vertical Forms' => array( 1914 | 0 => 0xFE10, 1915 | 1 => 0xFE1F, 1916 | 2 => 146, 1917 | ), 1918 | 'Combining Half Marks' => array( 1919 | 0 => 0xFE20, 1920 | 1 => 0xFE2F, 1921 | 2 => 147, 1922 | ), 1923 | 'CJK Compatibility Forms' => array( 1924 | 0 => 0xFE30, 1925 | 1 => 0xFE4F, 1926 | 2 => 148, 1927 | ), 1928 | 'Small Form Variants' => array( 1929 | 0 => 0xFE50, 1930 | 1 => 0xFE6F, 1931 | 2 => 149, 1932 | ), 1933 | 'Arabic Presentation Forms-B' => array( 1934 | 0 => 0xFE70, 1935 | 1 => 0xFEFF, 1936 | 2 => 150, 1937 | ), 1938 | 'Halfwidth and Fullwidth Forms' => array( 1939 | 0 => 0xFF00, 1940 | 1 => 0xFFEF, 1941 | 2 => 151, 1942 | ), 1943 | 'Specials' => array( 1944 | 0 => 0xFFF0, 1945 | 1 => 0xFFFF, 1946 | 2 => 152, 1947 | ), 1948 | 'Linear B Syllabary' => array( 1949 | 0 => 0x10000, 1950 | 1 => 0x1007F, 1951 | 2 => 153, 1952 | ), 1953 | 'Linear B Ideograms' => array( 1954 | 0 => 0x10080, 1955 | 1 => 0x100FF, 1956 | 2 => 154, 1957 | ), 1958 | 'Aegean Numbers' => array( 1959 | 0 => 0x10100, 1960 | 1 => 0x1013F, 1961 | 2 => 155, 1962 | ), 1963 | 'Ancient Greek Numbers' => array( 1964 | 0 => 0x10140, 1965 | 1 => 0x1018F, 1966 | 2 => 156, 1967 | ), 1968 | 'Ancient Symbols' => array( 1969 | 0 => 0x10190, 1970 | 1 => 0x101CF, 1971 | 2 => 157, 1972 | ), 1973 | 'Phaistos Disc' => array( 1974 | 0 => 0x101D0, 1975 | 1 => 0x101FF, 1976 | 2 => 158, 1977 | ), 1978 | 'Lycian' => array( 1979 | 0 => 0x10280, 1980 | 1 => 0x1029F, 1981 | 2 => 159, 1982 | ), 1983 | 'Carian' => array( 1984 | 0 => 0x102A0, 1985 | 1 => 0x102DF, 1986 | 2 => 160, 1987 | ), 1988 | 'Old Italic' => array( 1989 | 0 => 0x10300, 1990 | 1 => 0x1032F, 1991 | 2 => 161, 1992 | ), 1993 | 'Gothic' => array( 1994 | 0 => 0x10330, 1995 | 1 => 0x1034F, 1996 | 2 => 162, 1997 | ), 1998 | 'Ugaritic' => array( 1999 | 0 => 0x10380, 2000 | 1 => 0x1039F, 2001 | 2 => 163, 2002 | ), 2003 | 'Old Persian' => array( 2004 | 0 => 0x103A0, 2005 | 1 => 0x103DF, 2006 | 2 => 164, 2007 | ), 2008 | 'Deseret' => array( 2009 | 0 => 0x10400, 2010 | 1 => 0x1044F, 2011 | 2 => 165, 2012 | ), 2013 | 'Shavian' => array( 2014 | 0 => 0x10450, 2015 | 1 => 0x1047F, 2016 | 2 => 166, 2017 | ), 2018 | 'Osmanya' => array( 2019 | 0 => 0x10480, 2020 | 1 => 0x104AF, 2021 | 2 => 167, 2022 | ), 2023 | 'Cypriot Syllabary' => array( 2024 | 0 => 0x10800, 2025 | 1 => 0x1083F, 2026 | 2 => 168, 2027 | ), 2028 | 'Imperial Aramaic' => array( 2029 | 0 => 0x10840, 2030 | 1 => 0x1085F, 2031 | 2 => 169, 2032 | ), 2033 | 'Phoenician' => array( 2034 | 0 => 0x10900, 2035 | 1 => 0x1091F, 2036 | 2 => 170, 2037 | ), 2038 | 'Lydian' => array( 2039 | 0 => 0x10920, 2040 | 1 => 0x1093F, 2041 | 2 => 171, 2042 | ), 2043 | 'Kharoshthi' => array( 2044 | 0 => 0x10A00, 2045 | 1 => 0x10A5F, 2046 | 2 => 172, 2047 | ), 2048 | 'Old South Arabian' => array( 2049 | 0 => 0x10A60, 2050 | 1 => 0x10A7F, 2051 | 2 => 173, 2052 | ), 2053 | 'Avestan' => array( 2054 | 0 => 0x10B00, 2055 | 1 => 0x10B3F, 2056 | 2 => 174, 2057 | ), 2058 | 'Inscriptional Parthian' => array( 2059 | 0 => 0x10B40, 2060 | 1 => 0x10B5F, 2061 | 2 => 175, 2062 | ), 2063 | 'Inscriptional Pahlavi' => array( 2064 | 0 => 0x10B60, 2065 | 1 => 0x10B7F, 2066 | 2 => 176, 2067 | ), 2068 | 'Old Turkic' => array( 2069 | 0 => 0x10C00, 2070 | 1 => 0x10C4F, 2071 | 2 => 177, 2072 | ), 2073 | 'Rumi Numeral Symbols' => array( 2074 | 0 => 0x10E60, 2075 | 1 => 0x10E7F, 2076 | 2 => 178, 2077 | ), 2078 | 'Brahmi' => array( 2079 | 0 => 0x11000, 2080 | 1 => 0x1107F, 2081 | 2 => 179, 2082 | ), 2083 | 'Kaithi' => array( 2084 | 0 => 0x11080, 2085 | 1 => 0x110CF, 2086 | 2 => 180, 2087 | ), 2088 | 'Cuneiform' => array( 2089 | 0 => 0x12000, 2090 | 1 => 0x123FF, 2091 | 2 => 181, 2092 | ), 2093 | 'Cuneiform Numbers and Punctuation' => array( 2094 | 0 => 0x12400, 2095 | 1 => 0x1247F, 2096 | 2 => 182, 2097 | ), 2098 | 'Egyptian Hieroglyphs' => array( 2099 | 0 => 0x13000, 2100 | 1 => 0x1342F, 2101 | 2 => 183, 2102 | ), 2103 | 'Bamum Supplement' => array( 2104 | 0 => 0x16800, 2105 | 1 => 0x16A3F, 2106 | 2 => 184, 2107 | ), 2108 | 'Kana Supplement' => array( 2109 | 0 => 0x1B000, 2110 | 1 => 0x1B0FF, 2111 | 2 => 185, 2112 | ), 2113 | 'Byzantine Musical Symbols' => array( 2114 | 0 => 0x1D000, 2115 | 1 => 0x1D0FF, 2116 | 2 => 186, 2117 | ), 2118 | 'Musical Symbols' => array( 2119 | 0 => 0x1D100, 2120 | 1 => 0x1D1FF, 2121 | 2 => 187, 2122 | ), 2123 | 'Ancient Greek Musical Notation' => array( 2124 | 0 => 0x1D200, 2125 | 1 => 0x1D24F, 2126 | 2 => 188, 2127 | ), 2128 | 'Tai Xuan Jing Symbols' => array( 2129 | 0 => 0x1D300, 2130 | 1 => 0x1D35F, 2131 | 2 => 189, 2132 | ), 2133 | 'Counting Rod Numerals' => array( 2134 | 0 => 0x1D360, 2135 | 1 => 0x1D37F, 2136 | 2 => 190, 2137 | ), 2138 | 'Mathematical Alphanumeric Symbols' => array( 2139 | 0 => 0x1D400, 2140 | 1 => 0x1D7FF, 2141 | 2 => 191, 2142 | ), 2143 | 'Mahjong Tiles' => array( 2144 | 0 => 0x1F000, 2145 | 1 => 0x1F02F, 2146 | 2 => 192, 2147 | ), 2148 | 'Domino Tiles' => array( 2149 | 0 => 0x1F030, 2150 | 1 => 0x1F09F, 2151 | 2 => 193, 2152 | ), 2153 | 'Playing Cards' => array( 2154 | 0 => 0x1F0A0, 2155 | 1 => 0x1F0FF, 2156 | 2 => 194, 2157 | ), 2158 | 'Enclosed Alphanumeric Supplement' => array( 2159 | 0 => 0x1F100, 2160 | 1 => 0x1F1FF, 2161 | 2 => 195, 2162 | ), 2163 | 'Enclosed Ideographic Supplement' => array( 2164 | 0 => 0x1F200, 2165 | 1 => 0x1F2FF, 2166 | 2 => 196, 2167 | ), 2168 | 'Miscellaneous Symbols And Pictographs' => array( 2169 | 0 => 0x1F300, 2170 | 1 => 0x1F5FF, 2171 | 2 => 197, 2172 | ), 2173 | 'Emoticons' => array( 2174 | 0 => 0x1F600, 2175 | 1 => 0x1F64F, 2176 | 2 => 198, 2177 | ), 2178 | 'Transport And Map Symbols' => array( 2179 | 0 => 0x1F680, 2180 | 1 => 0x1F6FF, 2181 | 2 => 199, 2182 | ), 2183 | 'Alchemical Symbols' => array( 2184 | 0 => 0x1F700, 2185 | 1 => 0x1F77F, 2186 | 2 => 200, 2187 | ), 2188 | 'CJK Unified Ideographs Extension B' => array( 2189 | 0 => 0x20000, 2190 | 1 => 0x2A6DF, 2191 | 2 => 201, 2192 | ), 2193 | 'CJK Unified Ideographs Extension C' => array( 2194 | 0 => 0x2A700, 2195 | 1 => 0x2B73F, 2196 | 2 => 202, 2197 | ), 2198 | 'CJK Unified Ideographs Extension D' => array( 2199 | 0 => 0x2B740, 2200 | 1 => 0x2B81F, 2201 | 2 => 203, 2202 | ), 2203 | 'CJK Compatibility Ideographs Supplement' => array( 2204 | 0 => 0x2F800, 2205 | 1 => 0x2FA1F, 2206 | 2 => 204, 2207 | ), 2208 | 'Tags' => array( 2209 | 0 => 0xE0000, 2210 | 1 => 0xE007F, 2211 | 2 => 205, 2212 | ), 2213 | 'Variation Selectors Supplement' => array( 2214 | 0 => 0xE0100, 2215 | 1 => 0xE01EF, 2216 | 2 => 206, 2217 | ), 2218 | 'Supplementary Private Use Area-A' => array( 2219 | 0 => 0xF0000, 2220 | 1 => 0xFFFFF, 2221 | 2 => 207, 2222 | ), 2223 | 'Supplementary Private Use Area-B' => array( 2224 | 0 => 0x100000, 2225 | 1 => 0x10FFFF, 2226 | 2 => 208, 2227 | ), 2228 | ); 2229 | 2230 | #calling the methods of this class only statically! 2231 | private function __construct() {} 2232 | 2233 | /** 2234 | * Remove combining diactrical marks, with possibility of the restore 2235 | * Удаляет диакритические знаки в тексте, с возможностью восстановления (опция) 2236 | * 2237 | * @param string|null $s 2238 | * @param array|null $additional_chars for example: "\xc2\xad" #soft hyphen = discretionary hyphen 2239 | * @param bool $is_can_restored 2240 | * @param array|null &$restore_table 2241 | * @return string|bool|null Returns FALSE if error occurred 2242 | */ 2243 | public static function diactrical_remove($s, $additional_chars = null, $is_can_restored = false, &$restore_table = null) 2244 | { 2245 | if (! ReflectionTypeHint::isValid()) return false; 2246 | if (is_null($s)) return $s; 2247 | 2248 | if ($additional_chars) 2249 | { 2250 | foreach ($additional_chars as $k => &$v) $v = preg_quote($v, '/'); 2251 | $re = '/((?>' . self::$diactrical_re . '|' . implode('|', $additional_chars) . ')+)/sxSX'; 2252 | } 2253 | else $re = '/((?>' . self::$diactrical_re . ')+)/sxSX'; 2254 | if (! $is_can_restored) return preg_replace($re, '', $s); 2255 | 2256 | $restore_table = array(); 2257 | $a = preg_split($re, $s, -1, PREG_SPLIT_DELIM_CAPTURE); 2258 | $c = count($a); 2259 | if ($c === 1) return $s; 2260 | $pos = 0; 2261 | $s2 = ''; 2262 | for ($i = 0; $i < $c - 1; $i += 2) 2263 | { 2264 | $s2 .= $a[$i]; 2265 | #запоминаем символьные (не байтовые!) позиции 2266 | $pos += self::strlen($a[$i]); 2267 | $restore_table['offsets'][$pos] = $a[$i + 1]; 2268 | } 2269 | $restore_table['length'] = $pos + self::strlen(end($a)); 2270 | return $s2 . end($a); 2271 | } 2272 | 2273 | /** 2274 | * Restore combining diactrical marks, removed by self::diactrical_remove() 2275 | * In Russian: 2276 | * Восстанавливает диакритические знаки в тексте, при условии, что их символьные позиции и кол-во символов не изменились! 2277 | * 2278 | * @see self::diactrical_remove() 2279 | * @param string|null $s 2280 | * @param array $restore_table 2281 | * @return string|bool|null Returns FALSE if error occurred (broken $restore_table) 2282 | */ 2283 | public static function diactrical_restore($s, array $restore_table) 2284 | { 2285 | if (! ReflectionTypeHint::isValid()) return false; 2286 | if (is_null($s)) return $s; 2287 | 2288 | if (! $restore_table) return $s; 2289 | if (! is_int(@$restore_table['length']) || 2290 | ! is_array(@$restore_table['offsets']) || 2291 | $restore_table['length'] !== self::strlen($s)) return false; 2292 | $a = array(); 2293 | $length = $offset = 0; 2294 | $s2 = ''; 2295 | foreach ($restore_table['offsets'] as $pos => $diactricals) 2296 | { 2297 | $length = $pos - $offset; 2298 | $s2 .= self::substr($s, $offset, $length) . $diactricals; 2299 | $offset = $pos; 2300 | } 2301 | return $s2 . self::substr($s, $offset, strlen($s)); 2302 | } 2303 | 2304 | /** 2305 | * Encodes data from another character encoding to UTF-8. 2306 | * 2307 | * @param array|scalar|null $data 2308 | * @param string $charset 2309 | * @return array|scalar|null Returns FALSE if error occurred 2310 | */ 2311 | public static function convert_from($data, $charset = 'cp1251') 2312 | { 2313 | if (! ReflectionTypeHint::isValid()) return false; 2314 | return self::_convert($data, $charset, 'UTF-8'); 2315 | } 2316 | 2317 | /** 2318 | * Encodes data from UTF-8 to another character encoding. 2319 | * 2320 | * @param array|scalar|null $data 2321 | * @param string $charset 2322 | * @return array|scalar|null Returns FALSE if error occurred 2323 | */ 2324 | public static function convert_to($data, $charset = 'cp1251') 2325 | { 2326 | if (! ReflectionTypeHint::isValid()) return false; 2327 | return self::_convert($data, 'UTF-8', $charset); 2328 | } 2329 | 2330 | /** 2331 | * Recoding the data of any structure to/from UTF-8. 2332 | * Arrays traversed recursively, recoded keys and values. 2333 | * 2334 | * @see mb_encoding_aliases() 2335 | * @param array|scalar|null $data 2336 | * @param string $charset_from 2337 | * @param string $charset_to 2338 | * @return array|scalar|null Returns FALSE if error occurred 2339 | */ 2340 | private static function _convert($data, $charset_from, $charset_to) 2341 | { 2342 | if (! ReflectionTypeHint::isValid()) return false; #for recursive calls 2343 | if ($charset_from === $charset_to) return $data; 2344 | if (is_array($data)) 2345 | { 2346 | $d = array(); 2347 | foreach ($data as $k => &$v) 2348 | { 2349 | $k = self::_convert($k, $charset_from, $charset_to); 2350 | if ($k === false) return false; 2351 | $d[$k] = self::_convert($v, $charset_from, $charset_to); 2352 | if ($d[$k] === false && ! is_bool($v)) return false; 2353 | } 2354 | return $d; 2355 | } 2356 | if (is_string($data)) 2357 | { 2358 | #smart behaviour for errors protected + speed improve 2359 | if ($charset_from === 'UTF-8' && ! self::is_utf8($data)) return $data; 2360 | if ($charset_to === 'UTF-8' && self::is_utf8($data)) return $data; 2361 | 2362 | #since PHP-5.3.x iconv() faster then mb_convert_encoding() 2363 | if (function_exists('iconv')) return iconv($charset_from, $charset_to . '//IGNORE//TRANSLIT', $data); 2364 | if (function_exists('mb_convert_encoding')) return mb_convert_encoding($data, $charset_to, $charset_from); 2365 | 2366 | #charset_from 2367 | if ($charset_from === 'UTF-16' || $charset_from === 'UCS-2') return self::_convert_from_utf16($data); 2368 | if ($charset_from === 'cp1251' || $charset_from === 'cp1259') return strtr($data, self::$cp1259_table); 2369 | if ($charset_from === 'koi8-r' || $charset_from === 'KOI8-R') return strtr(convert_cyr_string($data, 'k', 'w'), self::$cp1259_table); 2370 | if ($charset_from === 'iso8859-5') return strtr(convert_cyr_string($data, 'i', 'w'), self::$cp1259_table); 2371 | if ($charset_from === 'cp866') return strtr(convert_cyr_string($data, 'a', 'w'), self::$cp1259_table); 2372 | if ($charset_from === 'mac-cyrillic') return strtr(convert_cyr_string($data, 'm', 'w'), self::$cp1259_table); 2373 | 2374 | #charset_to 2375 | if ($charset_to === 'cp1251' || $charset_to === 'cp1259') return strtr($data, array_flip(self::$cp1259_table)); 2376 | 2377 | #last trying 2378 | if (function_exists('recode_string')) 2379 | { 2380 | $s = @recode_string($charset_from . '..' . $charset_to, $data); 2381 | if (is_string($s)) return $s; 2382 | } 2383 | 2384 | trigger_error('Convert "' . $charset_from . '" --> "' . $charset_to . '" is not supported native, "iconv" or "mbstring" extension required', E_USER_WARNING); 2385 | return false; 2386 | } 2387 | return $data; 2388 | } 2389 | 2390 | /** 2391 | * Convert UTF-16 / UCS-2 encoding string to UTF-8. 2392 | * Surrogates UTF-16 are supported! 2393 | * 2394 | * In Russian: 2395 | * Преобразует строку из кодировки UTF-16 / UCS-2 в UTF-8. 2396 | * Суррогаты UTF-16 поддерживаются! 2397 | * 2398 | * @param string $s 2399 | * @param string $type 'BE' -- big endian byte order 2400 | * 'LE' -- little endian byte order 2401 | * @param bool $to_array returns array chars instead whole string? 2402 | * @return string|array|bool UTF-8 string, array chars or FALSE if error occurred 2403 | */ 2404 | private static function _convert_from_utf16($s, $type = 'BE', $to_array = false) 2405 | { 2406 | static $types = array( 2407 | 'BE' => 'n', #unsigned short (always 16 bit, big endian byte order) 2408 | 'LE' => 'v', #unsigned short (always 16 bit, little endian byte order) 2409 | ); 2410 | if (! array_key_exists($type, $types)) 2411 | { 2412 | trigger_error('Unexpected value in 2-nd parameter, "' . $type . '" given!', E_USER_WARNING); 2413 | return false; 2414 | } 2415 | #the fastest way: 2416 | if (function_exists('iconv') || function_exists('mb_convert_encoding')) 2417 | { 2418 | if (function_exists('iconv')) $s = iconv('UTF-16' . $type, 'UTF-8', $s); 2419 | elseif (function_exists('mb_convert_encoding')) $s = mb_convert_encoding($s, 'UTF-8', 'UTF-16' . $type); 2420 | if (! $to_array) return $s; 2421 | return self::str_split($s); 2422 | } 2423 | 2424 | /* 2425 | http://en.wikipedia.org/wiki/UTF-16 2426 | 2427 | The improvement that UTF-16 made over UCS-2 is its ability to encode 2428 | characters in planes 1-16, not just those in plane 0 (BMP). 2429 | 2430 | UTF-16 represents non-BMP characters (those from U+10000 through U+10FFFF) 2431 | using a pair of 16-bit words, known as a surrogate pair. 2432 | First 1000016 is subtracted from the code point to give a 20-bit value. 2433 | This is then split into two separate 10-bit values each of which is represented 2434 | as a surrogate with the most significant half placed in the first surrogate. 2435 | To allow safe use of simple word-oriented string processing, separate ranges 2436 | of values are used for the two surrogates: 0xD800-0xDBFF for the first, most 2437 | significant surrogate and 0xDC00-0xDFFF for the second, least significant surrogate. 2438 | 2439 | For example, the character at code point U+10000 becomes the code unit sequence 0xD800 0xDC00, 2440 | and the character at U+10FFFD, the upper limit of Unicode, becomes the sequence 0xDBFF 0xDFFD. 2441 | Unicode and ISO/IEC 10646 do not, and will never, assign characters to any of the code points 2442 | in the U+D800-U+DFFF range, so an individual code value from a surrogate pair does not ever 2443 | represent a character. 2444 | 2445 | http://www.russellcottrell.com/greek/utilities/SurrogatePairCalculator.htm 2446 | http://www.russellcottrell.com/greek/utilities/UnicodeRanges.htm 2447 | 2448 | Conversion of a Unicode scalar value S to a surrogate pair : 2449 | H = Math.floor((S - 0x10000) / 0x400) + 0xD800; 2450 | L = ((S - 0x10000) % 0x400) + 0xDC00; 2451 | The conversion of a surrogate pair to a scalar value: 2452 | N = ((H - 0xD800) * 0x400) + (L - 0xDC00) + 0x10000; 2453 | */ 2454 | $a = array(); 2455 | $hi = false; 2456 | foreach (unpack($types[$type] . '*', $s) as $codepoint) 2457 | { 2458 | #surrogate process 2459 | if ($hi !== false) 2460 | { 2461 | $lo = $codepoint; 2462 | if ($lo < 0xDC00 || $lo > 0xDFFF) $a[] = "\xEF\xBF\xBD"; #U+FFFD REPLACEMENT CHARACTER (for broken char) 2463 | else 2464 | { 2465 | $codepoint = (($hi - 0xD800) * 0x400) + ($lo - 0xDC00) + 0x10000; 2466 | $a[] = self::chr($codepoint); 2467 | } 2468 | $hi = false; 2469 | } 2470 | elseif ($codepoint < 0xD800 || $codepoint > 0xDBFF) $a[] = self::chr($codepoint); #not surrogate 2471 | else $hi = $codepoint; #surrogate was found 2472 | } 2473 | return $to_array ? $a : implode('', $a); 2474 | } 2475 | 2476 | /** 2477 | * Strips out device control codes in the ASCII range. 2478 | * 2479 | * @param string|null String to clean 2480 | * @return string|bool|null Returns FALSE if error occurred 2481 | */ 2482 | public static function strict($s) 2483 | { 2484 | if (! ReflectionTypeHint::isValid()) return false; 2485 | if (is_null($s)) return $s; 2486 | return preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F]+/sSX', '', $s); 2487 | } 2488 | 2489 | /** 2490 | * Check the data accessory to the class of characters ASCII. 2491 | * For null, integer, float, boolean returns TRUE. 2492 | * 2493 | * Массивы обходятся рекурсивно, если в хотябы одном элементе массива 2494 | * его значение не ASCII, возвращается FALSE. 2495 | * 2496 | * @param array|scalar|null $data 2497 | * @return bool 2498 | */ 2499 | public static function is_ascii($data) 2500 | { 2501 | if (! ReflectionTypeHint::isValid()) return false; 2502 | if (is_array($data)) 2503 | { 2504 | foreach ($data as $k => &$v) 2505 | { 2506 | if (! self::is_ascii($k) || ! self::is_ascii($v)) return false; 2507 | } 2508 | return true; 2509 | } 2510 | #ltrim() little faster then preg_match() 2511 | #if (is_string($data)) return preg_match('/^[\x00-\x7f]*$/sSX', $data); #deprecated 2512 | if (is_string($data)) return ltrim($data, "\x00..\x7f") === ''; 2513 | if (is_scalar($data) || is_null($data)) return true; #~ null, integer, float, boolean 2514 | return false; #object or resource 2515 | } 2516 | 2517 | /** 2518 | * Returns true if data is valid UTF-8 and false otherwise. 2519 | * For null, integer, float, boolean returns TRUE. 2520 | * 2521 | * The arrays are traversed recursively, if At least one element of the array 2522 | * its value is not in UTF-8, returns FALSE. 2523 | * 2524 | * @link http://www.w3.org/International/questions/qa-forms-utf-8.html 2525 | * @link http://ru3.php.net/mb_detect_encoding 2526 | * @link http://webtest.philigon.ru/articles/utf8/ 2527 | * @link http://unicode.coeurlumiere.com/ 2528 | * @param array|scalar|null $data 2529 | * @param bool $is_strict strict the range of ASCII? 2530 | * @return bool 2531 | */ 2532 | public static function is_utf8($data, $is_strict = true) 2533 | { 2534 | if (! ReflectionTypeHint::isValid()) return false; 2535 | if (is_array($data)) 2536 | { 2537 | foreach ($data as $k => &$v) 2538 | { 2539 | if (! self::is_utf8($k, $is_strict) || ! self::is_utf8($v, $is_strict)) return false; 2540 | } 2541 | return true; 2542 | } 2543 | if (is_string($data)) 2544 | { 2545 | if (! preg_match('~~suSX', $data)) return false; 2546 | if (function_exists('preg_last_error') && preg_last_error() !== PREG_NO_ERROR) return false; 2547 | #preg_match('~~suSX') much faster (up to 4 times), then mb_check_encoding($data, 'UTF-8')! 2548 | #if (function_exists('mb_check_encoding') && ! mb_check_encoding($data, 'UTF-8')) return false; #DEPRECATED 2549 | if ($is_strict && preg_match('/[^\x09\x0A\x0D\x20-\xBF\xC2-\xF7]/sSX', $data)) return false; 2550 | return true; 2551 | } 2552 | if (is_scalar($data) || is_null($data)) return true; #~ null, integer, float, boolean 2553 | return false; #object or resource 2554 | } 2555 | 2556 | /** 2557 | * Tries to detect if a string is in Unicode encoding 2558 | * 2559 | * @deprecated Slowly, use self::is_utf8() instead 2560 | * @see self::is_utf8() 2561 | * @param string $s текст 2562 | * @param bool $is_strict строгая проверка диапазона ASCII? 2563 | * @return bool 2564 | */ 2565 | public static function check($s, $is_strict = true) 2566 | { 2567 | if (! ReflectionTypeHint::isValid()) return false; 2568 | for ($i = 0, $len = strlen($s); $i < $len; $i++) 2569 | { 2570 | $c = ord($s[$i]); 2571 | if ($c < 0x80) #1 byte 0bbbbbbb 2572 | { 2573 | if ($is_strict === false || ($c > 0x1F && $c < 0x7F) || $c == 0x09 || $c == 0x0A || $c == 0x0D) continue; 2574 | } 2575 | if (($c & 0xE0) == 0xC0) $n = 1; #2 bytes 110bbbbb 10bbbbbb 2576 | elseif (($c & 0xF0) == 0xE0) $n = 2; #3 bytes 1110bbbb 10bbbbbb 10bbbbbb 2577 | elseif (($c & 0xF8) == 0xF0) $n = 3; #4 bytes 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb 2578 | elseif (($c & 0xFC) == 0xF8) $n = 4; #5 bytes 111110bb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb 2579 | elseif (($c & 0xFE) == 0xFC) $n = 5; #6 bytes 1111110b 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb 2580 | else return false; #does not match any model 2581 | #n bytes matching 10bbbbbb follow ? 2582 | for ($j = 0; $j < $n; $j++) 2583 | { 2584 | $i++; 2585 | if ($i == $len || ((ord($s[$i]) & 0xC0) != 0x80) ) return false; 2586 | } 2587 | } 2588 | return true; 2589 | } 2590 | 2591 | /** 2592 | * Check the data in UTF-8 charset on given ranges of the standard UNICODE. 2593 | * The suitable alternative to regular expressions. 2594 | * 2595 | * For null, integer, float, boolean returns TRUE. 2596 | * 2597 | * Arrays traversed recursively (keys and values). 2598 | * At least if one array element value is not passed checking, it returns FALSE. 2599 | * 2600 | * @example 2601 | * #A simple check the standard named ranges: 2602 | * UTF8::blocks_check('поисковые системы Google и Yandex', array('Basic Latin', 'Cyrillic')); 2603 | * #You can check the named, direct ranges or codepoints together: 2604 | * UTF8::blocks_check('поисковые системы Google и Yandex', array(array(0x20, 0x7E), #[\x20-\x7E] 2605 | * array(0x0410, 0x044F), #[A-Яa-я] 2606 | * 0x0401, #russian yo (Ё) 2607 | * 0x0451, #russian ye (ё) 2608 | * 'Arrows', 2609 | * )); 2610 | * 2611 | * @link http://www.unicode.org/charts/ 2612 | * @param array|scalar|null $data 2613 | * @param array|string $blocks 2614 | * @return bool Возвращает TRUE, если все символы из текста принадлежат указанным диапазонам 2615 | * и FALSE в противном случае или для разбитого UTF-8. 2616 | */ 2617 | public static function blocks_check($data, $blocks) 2618 | { 2619 | if (! ReflectionTypeHint::isValid()) return false; 2620 | 2621 | if (is_array($data)) 2622 | { 2623 | foreach ($data as $k => &$v) 2624 | { 2625 | if (! self::blocks_check($k, $blocks) || ! self::blocks_check($v, $blocks)) return false; 2626 | } 2627 | return true; 2628 | } 2629 | 2630 | if (is_string($data)) 2631 | { 2632 | $chars = self::str_split($data); 2633 | if ($chars === false) return false; #broken UTF-8 2634 | unset($data); #memory free 2635 | $skip = array(); #save to cache already checked symbols 2636 | foreach ($chars as $i => $char) 2637 | { 2638 | if (array_key_exists($char, $skip)) continue; #speed improve 2639 | $codepoint = self::ord($char); 2640 | if ($codepoint === false) return false; #broken UTF-8 2641 | $is_valid = false; 2642 | $blocks = (array)$blocks; 2643 | foreach ($blocks as $j => $block) 2644 | { 2645 | if (is_string($block)) 2646 | { 2647 | if (! array_key_exists($block, self::$unicode_blocks)) 2648 | { 2649 | trigger_error('Unknown block "' . $block . '"!', E_USER_WARNING); 2650 | return false; 2651 | } 2652 | list ($min, $max) = self::$unicode_blocks[$block]; 2653 | } 2654 | elseif (is_array($block)) list ($min, $max) = $block; 2655 | elseif (is_int($block)) $min = $max = $block; 2656 | else trigger_error('A string/array/int type expected for block[' . $j . ']!', E_USER_ERROR); 2657 | if ($codepoint >= $min && $codepoint <= $max) 2658 | { 2659 | $is_valid = true; 2660 | break; 2661 | } 2662 | }#foreach 2663 | if (! $is_valid) return false; 2664 | $skip[$char] = null; 2665 | }#foreach 2666 | return true; 2667 | } 2668 | if (is_scalar($data) || is_null($data)) return true; #~ null, integer, float, boolean 2669 | return false; #object or resource 2670 | } 2671 | 2672 | /** 2673 | * Recode $_GET, $_POST, $_COOKIE, $_REQUEST, $_FILES from $charset encoding to UTF-8, if necessary. 2674 | * A side effect is a positive protection against XSS attacks with non-printable characters on the vulnerable PHP function. 2675 | * Thus web forms can be sent to the server in 2-encoding: $charset and UTF-8. 2676 | * For example: ?тест[тест]=тест 2677 | * 2678 | * Алгоритм работы: 2679 | * 1) Функция проверяет массивы $_GET, $_POST, $_COOKIE, $_REQUEST, $_FILES 2680 | * на корректность значений элементов кодировке UTF-8. 2681 | * 2) Значения не в UTF-8 принимаются как $charset и конвертируется в UTF-8, 2682 | * при этом байты от 0x00 до 0x7F (ASCII) сохраняются как есть. 2683 | * 3) Сконвертированные значения снова проверяются. 2684 | * Если данные опять не в кодировке UTF-8, то они считаются разбитыми и функция возвращает FALSE. 2685 | * 2686 | * NOTICE 2687 | * Функция должна вызываться после self::unescape_request()! 2688 | * 2689 | * @see self::unescape_request() 2690 | * @param bool $is_hex2bin Декодировать HEX-данные? 2691 | * Пример: 0xd09ec2a0d0bad0bed0bcd0bfd0b0d0bdd0b8d0b8 => О компании 2692 | * Параметры в URL адресах иногда бывает удобно кодировать не функцией rawurlencode(), 2693 | * а использовать следующий механизм (к тому же кодирующий данные более компактно): 2694 | * '0x' . bin2hex($string) 2695 | * @param string $charset 2696 | * @return bool Возвращает TRUE, если все значения элементов массивов в кодировке UTF-8 2697 | * и FALSE + E_USER_WARNING в противном случае. 2698 | */ 2699 | public static function autoconvert_request($is_hex2bin = false, $charset = 'cp1251') 2700 | { 2701 | if (! ReflectionTypeHint::isValid()) return false; 2702 | $is_converted = false; 2703 | $is_broken = false; 2704 | foreach (array('_GET', '_POST', '_COOKIE', '_FILES') as $k => $v) 2705 | { 2706 | if (! array_key_exists($v, $GLOBALS)) continue; 2707 | #использовать array_walk_recursive() не предоставляется возможным, 2708 | #т.к. его callback функция не поддерживает передачу ключа по ссылке 2709 | $GLOBALS[$v] = self::_autoconvert_request_recursive($GLOBALS[$v], $is_converted, $is_broken, $is_hex2bin, $charset); 2710 | if ($is_broken) 2711 | { 2712 | trigger_error('Array $' . $v . ' does not have keys/values in UTF-8 charset!', E_USER_WARNING); 2713 | return false; 2714 | } 2715 | } 2716 | if ($is_converted) 2717 | { 2718 | $_REQUEST = 2719 | (isset($_COOKIE) ? $_COOKIE : array()) + 2720 | (isset($_POST) ? $_POST : array()) + 2721 | (isset($_GET) ? $_GET : array()); 2722 | } 2723 | return true; 2724 | } 2725 | 2726 | private static function _autoconvert_request_recursive(&$data, &$is_converted, &$is_broken, $is_hex2bin, $charset) 2727 | { 2728 | if ($is_broken) return $data; #speed improve 2729 | if (is_array($data)) 2730 | { 2731 | $d = array(); 2732 | foreach ($data as $k => &$v) 2733 | { 2734 | $k = self::_autoconvert_request($k, $is_converted, $is_broken, $is_hex2bin, $charset); 2735 | if ($is_broken) return $data; #speed improve 2736 | $d[$k] = self::_autoconvert_request_recursive($v, $is_converted, $is_broken, $is_hex2bin, $charset); 2737 | if ($is_broken) return $data; #speed improve 2738 | } 2739 | return $d; 2740 | } 2741 | return self::_autoconvert_request($data, $is_converted, $is_broken, $is_hex2bin, $charset); 2742 | } 2743 | 2744 | private static function _autoconvert_request(&$s, &$is_converted, &$is_broken, $is_hex2bin, $charset) 2745 | { 2746 | #regexp speed improve by using strpos() 2747 | if ($is_hex2bin && strpos($s, '0x') === 0 && preg_match('/^0x((?:[\da-fA-F]{2})+)$/sSX', $s, $m)) 2748 | { 2749 | $s = pack('H' . strlen($m[1]), $m[1]); #hex2bin() 2750 | $is_converted = true; 2751 | } 2752 | if (! self::is_utf8($s)) 2753 | { 2754 | $s = self::convert_from($s, $charset); 2755 | if ($s === false) $is_broken = true; 2756 | elseif (! self::is_utf8($s)) 2757 | { 2758 | trigger_error('String 0x ' . substr(bin2hex($s), 0, 100) . '... is not UTF-8!', E_USER_WARNING); 2759 | $is_broken = true; 2760 | } 2761 | else $is_converted = true; 2762 | } 2763 | return $s; 2764 | } 2765 | 2766 | /** 2767 | * Сравнение строк 2768 | * 2769 | * @param string|null $s1 2770 | * @param string|null $s2 2771 | * @param string $locale For example, 'en_CA', 'ru_RU' 2772 | * @return int|bool|null Returns FALSE if error occurred 2773 | * Returns < 0 if $s1 is less than $s2; 2774 | * > 0 if $s1 is greater than $s2; 2775 | * 0 if they are equal. 2776 | */ 2777 | public static function strcmp($s1, $s2, $locale = '') 2778 | { 2779 | if (! ReflectionTypeHint::isValid()) return false; 2780 | if (is_null($s1) || is_null($s2)) return null; 2781 | if (! function_exists('collator_create')) return strcmp($s1, $s2); 2782 | # PHP 5 >= 5.3.0, PECL intl >= 1.0.0 2783 | # If empty string ("") or "root" are passed, UCA rules will be used. 2784 | $c = new Collator($locale); 2785 | if (! $c) 2786 | { 2787 | # Returns an "empty" object on error. You can use intl_get_error_code() and/or intl_get_error_message() to know what happened. 2788 | trigger_error(intl_get_error_message(), E_USER_WARNING); 2789 | return false; 2790 | } 2791 | return $c->compare($s1, $s2); 2792 | } 2793 | 2794 | /** 2795 | * Сравнение строк для N первых символов 2796 | * 2797 | * @param string|null $s1 2798 | * @param string|null $s2 2799 | * @param int $length 2800 | * @return int|bool|null Returns FALSE if error occurred 2801 | * Returns < 0 if $s1 is less than $s2; 2802 | * > 0 if $s1 is greater than $s2; 2803 | * 0 if they are equal. 2804 | */ 2805 | public static function strncmp($s1, $s2, $length) 2806 | { 2807 | if (! ReflectionTypeHint::isValid()) return false; 2808 | if (is_null($s1) || is_null($s2)) return null; 2809 | return self::strcmp(self::substr($s1, 0, $length), self::substr($s2, 0, $length)); 2810 | } 2811 | 2812 | /** 2813 | * Implementation strcasecmp() function for UTF-8 encoding string. 2814 | * 2815 | * @param string|null $s1 2816 | * @param string|null $s2 2817 | * @return int|bool|null Returns FALSE if error occurred 2818 | * Returns < 0 if $s1 is less than $s2; 2819 | * > 0 if $s1 is greater than $s2; 2820 | * 0 if they are equal. 2821 | */ 2822 | public static function strcasecmp($s1, $s2) 2823 | { 2824 | if (! ReflectionTypeHint::isValid()) return false; 2825 | if (is_null($s1) || is_null($s2)) return null; 2826 | return self::strcmp(self::lowercase($s1), self::lowercase($s2)); 2827 | } 2828 | 2829 | /** 2830 | * Converts a UTF-8 string to a UNICODE codepoints 2831 | * 2832 | * @param string|null $s UTF-8 string 2833 | * @return array|bool|null Unicode codepoints 2834 | * Returns FALSE if $s broken (not UTF-8) 2835 | */ 2836 | public static function to_unicode($s) 2837 | { 2838 | if (! ReflectionTypeHint::isValid()) return false; 2839 | if (is_null($s)) return $s; 2840 | 2841 | $s2 = null; 2842 | #since PHP-5.3.x iconv() little faster then mb_convert_encoding() 2843 | if (function_exists('iconv')) $s2 = @iconv('UTF-8', 'UCS-4BE', $s); 2844 | elseif (function_exists('mb_convert_encoding')) $s2 = @mb_convert_encoding($s, 'UCS-4BE', 'UTF-8'); 2845 | if (is_string($s2)) return array_values(unpack('N*', $s2)); 2846 | if ($s2 !== null) return false; 2847 | 2848 | $a = self::str_split($s); 2849 | if ($a === false) return false; 2850 | return array_map(array(__CLASS__, 'ord'), $a); 2851 | } 2852 | 2853 | /** 2854 | * Converts a UNICODE codepoints to a UTF-8 string 2855 | * 2856 | * @param array|null $a Unicode codepoints 2857 | * @return string|bool|null UTF-8 string 2858 | * Returns FALSE if error occurred 2859 | */ 2860 | public static function from_unicode($a) 2861 | { 2862 | if (! ReflectionTypeHint::isValid()) return false; 2863 | if (is_null($a)) return $a; 2864 | 2865 | #since PHP-5.3.x iconv() little faster then mb_convert_encoding() 2866 | if (function_exists('iconv')) 2867 | { 2868 | array_walk($a, function(&$cp) { $cp = pack('N', $cp); }); 2869 | $s = @iconv('UCS-4BE', 'UTF-8', implode('', $a)); 2870 | if (! is_string($s)) return false; 2871 | return $s; 2872 | } 2873 | if (function_exists('mb_convert_encoding')) 2874 | { 2875 | array_walk($a, function(&$cp) { $cp = pack('N', $cp); }); 2876 | $s = mb_convert_encoding(implode('', $a), 'UTF-8', 'UCS-4BE'); 2877 | if (! is_string($s)) return false; 2878 | return $s; 2879 | } 2880 | 2881 | return implode('', array_map(array(__CLASS__, 'chr'), $a)); 2882 | } 2883 | 2884 | /** 2885 | * Converts a UTF-8 character to a UNICODE codepoint 2886 | * 2887 | * @param string|null $char UTF-8 character 2888 | * @return int|bool|null Unicode codepoint 2889 | * Returns FALSE if $char broken (not UTF-8) 2890 | */ 2891 | public static function ord($char) 2892 | { 2893 | if (! ReflectionTypeHint::isValid()) return false; 2894 | if (is_null($char)) return $char; 2895 | 2896 | static $cache = array(); 2897 | if (array_key_exists($char, $cache)) return $cache[$char]; #speed improve 2898 | 2899 | switch (strlen($char)) 2900 | { 2901 | case 1 : return $cache[$char] = ord($char); 2902 | case 2 : return $cache[$char] = (ord($char{1}) & 63) | 2903 | ((ord($char{0}) & 31) << 6); 2904 | case 3 : return $cache[$char] = (ord($char{2}) & 63) | 2905 | ((ord($char{1}) & 63) << 6) | 2906 | ((ord($char{0}) & 15) << 12); 2907 | case 4 : return $cache[$char] = (ord($char{3}) & 63) | 2908 | ((ord($char{2}) & 63) << 6) | 2909 | ((ord($char{1}) & 63) << 12) | 2910 | ((ord($char{0}) & 7) << 18); 2911 | default : 2912 | trigger_error('Character 0x' . bin2hex($char) . ' is not UTF-8!', E_USER_WARNING); 2913 | return false; 2914 | } 2915 | } 2916 | 2917 | /** 2918 | * Converts a UNICODE codepoint to a UTF-8 character 2919 | * 2920 | * @param int|digit|null $cp Unicode codepoint 2921 | * @return string|bool|null UTF-8 character 2922 | * Returns FALSE if error occurred 2923 | */ 2924 | public static function chr($cp) 2925 | { 2926 | if (! ReflectionTypeHint::isValid()) return false; 2927 | if (is_null($cp)) return $cp; 2928 | 2929 | static $cache = array(); 2930 | if (array_key_exists($cp, $cache)) return $cache[$cp]; #speed improve 2931 | 2932 | if ($cp <= 0x7f) return $cache[$cp] = chr($cp); 2933 | if ($cp <= 0x7ff) return $cache[$cp] = chr(0xc0 | ($cp >> 6)) . 2934 | chr(0x80 | ($cp & 0x3f)); 2935 | if ($cp <= 0xffff) return $cache[$cp] = chr(0xe0 | ($cp >> 12)) . 2936 | chr(0x80 | (($cp >> 6) & 0x3f)) . 2937 | chr(0x80 | ($cp & 0x3f)); 2938 | if ($cp <= 0x10ffff) return $cache[$cp] = chr(0xf0 | ($cp >> 18)) . 2939 | chr(0x80 | (($cp >> 12) & 0x3f)) . 2940 | chr(0x80 | (($cp >> 6) & 0x3f)) . 2941 | chr(0x80 | ($cp & 0x3f)); 2942 | #U+FFFD REPLACEMENT CHARACTER 2943 | return $cache[$cp] = "\xEF\xBF\xBD"; 2944 | } 2945 | 2946 | /** 2947 | * Implementation chunk_split() function for UTF-8 encoding string. 2948 | * 2949 | * @param string|null $s 2950 | * @param int|digit|null $length 2951 | * @param string|null $glue 2952 | * @return string|bool|null Returns FALSE if error occurred 2953 | */ 2954 | public static function chunk_split($s, $length = null, $glue = null) 2955 | { 2956 | if (! ReflectionTypeHint::isValid()) return false; 2957 | if (is_null($s)) return $s; 2958 | 2959 | $length = intval($length); 2960 | $glue = strval($glue); 2961 | if ($length < 1) $length = 76; 2962 | if ($glue === '') $glue = "\r\n"; 2963 | if (! is_array($a = self::str_split($s, $length))) return false; 2964 | return implode($glue, $a); 2965 | } 2966 | 2967 | /** 2968 | * Changes all keys in an array 2969 | * 2970 | * @param array|null $a 2971 | * @param int $mode {CASE_LOWER|CASE_UPPER} 2972 | * @return array|bool|null Returns FALSE if error occurred 2973 | */ 2974 | public static function array_change_key_case($a, $mode) 2975 | { 2976 | if (! ReflectionTypeHint::isValid()) return false; 2977 | if (! is_array($a)) return $a; 2978 | $a2 = array(); 2979 | foreach ($a as $k => $v) 2980 | { 2981 | if (is_string($k)) 2982 | { 2983 | $k = self::convert_case($k, $mode); 2984 | if ($k === false) return false; 2985 | } 2986 | $a2[$k] = $v; 2987 | } 2988 | return $a2; 2989 | } 2990 | 2991 | /** 2992 | * Конвертирует регистр букв в данных в кодировке UTF-8. 2993 | * Массивы обходятся рекурсивно, при этом конвертируются только значения 2994 | * в элементах массива, а ключи остаются без изменений. 2995 | * Для конвертирования только ключей используйте метод self::array_change_key_case(). 2996 | * 2997 | * @see self::array_change_key_case() 2998 | * @link http://www.unicode.org/charts/PDF/U0400.pdf 2999 | * @link http://ru.wikipedia.org/wiki/ISO_639-1 3000 | * @param array|scalar|null $data Данные произвольной структуры 3001 | * @param int $mode {CASE_LOWER|CASE_UPPER} 3002 | * @param bool $is_ascii_optimization for speed improve 3003 | * @return scalar|bool|null Returns FALSE if error occurred 3004 | */ 3005 | public static function convert_case($data, $mode, $is_ascii_optimization = true) 3006 | { 3007 | if (! ReflectionTypeHint::isValid()) return false; 3008 | 3009 | if (is_array($data)) 3010 | { 3011 | foreach ($data as $k => &$v) $v = self::convert_case($v, $mode); 3012 | return $data; 3013 | } 3014 | if (! is_string($data) || ! $data) return $data; 3015 | 3016 | if ($mode === CASE_UPPER) 3017 | { 3018 | if ($is_ascii_optimization && self::is_ascii($data)) return strtoupper($data); #speed improve! 3019 | #deprecated, since PHP-5.3.x strtr() 2-3 times faster then mb_strtolower() 3020 | #if (function_exists('mb_strtoupper')) return mb_strtoupper($data, 'utf-8'); 3021 | return strtr($data, array_flip(self::$convert_case_table)); 3022 | } 3023 | if ($mode === CASE_LOWER) 3024 | { 3025 | if ($is_ascii_optimization && self::is_ascii($data)) return strtolower($data); #speed improve! 3026 | #deprecated, since PHP-5.3.x strtr() 2-3 times faster then mb_strtolower() 3027 | #if (function_exists('mb_strtolower')) return mb_strtolower($data, 'utf-8'); 3028 | return strtr($data, self::$convert_case_table); 3029 | } 3030 | trigger_error('Parameter 2 should be a constant of CASE_LOWER or CASE_UPPER!', E_USER_WARNING); 3031 | return $data; 3032 | } 3033 | 3034 | /** 3035 | * Convert a data to lower case 3036 | * 3037 | * @param array|scalar|null $data 3038 | * @return scalar|bool|null Returns FALSE if error occurred */ 3039 | public static function lowercase($data) 3040 | { 3041 | if (! ReflectionTypeHint::isValid()) return false; 3042 | return self::convert_case($data, CASE_LOWER); 3043 | } 3044 | 3045 | /** 3046 | * Convert a data to upper case 3047 | * 3048 | * @param array|scalar|null $data 3049 | * @return scalar|null Returns FALSE if error occurred 3050 | */ 3051 | public static function uppercase($data) 3052 | { 3053 | if (! ReflectionTypeHint::isValid()) return false; 3054 | return self::convert_case($data, CASE_UPPER); 3055 | } 3056 | 3057 | /** 3058 | * Convert a data to lower case 3059 | * 3060 | * @param array|scalar|null $data 3061 | * @return scalar|bool|null Returns FALSE if error occurred 3062 | */ 3063 | public static function strtolower($data) 3064 | { 3065 | if (! ReflectionTypeHint::isValid()) return false; 3066 | return self::convert_case($data, CASE_LOWER); 3067 | } 3068 | 3069 | /** 3070 | * Convert a data to upper case 3071 | * 3072 | * @param array|scalar|null $data 3073 | * @return scalar|null Returns FALSE if error occurred 3074 | */ 3075 | public static function strtoupper($data) 3076 | { 3077 | if (! ReflectionTypeHint::isValid()) return false; 3078 | return self::convert_case($data, CASE_UPPER); 3079 | } 3080 | 3081 | 3082 | /** 3083 | * Convert all HTML entities to native UTF-8 characters 3084 | * Функция декодирует гораздо больше именованных сущностей, чем стандартная html_entity_decode() 3085 | * Все dec и hex сущности так же переводятся в UTF-8. 3086 | * 3087 | * Example: '"' or '"' or '"' will be converted to '"'. 3088 | * 3089 | * @link http://www.htmlhelp.com/reference/html40/entities/ 3090 | * @link http://www.alanwood.net/demos/ent4_frame.html (HTML 4.01 Character Entity References) 3091 | * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset1.asp?frame=true 3092 | * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp?frame=true 3093 | * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp?frame=true 3094 | * 3095 | * @param scalar|null $s 3096 | * @param bool $is_special_chars Дополнительно обрабатывать специальные html сущности? (< > & ") 3097 | * @return scalar|null Returns FALSE if error occurred 3098 | */ 3099 | public static function html_entity_decode($s, $is_special_chars = false) 3100 | { 3101 | if (! ReflectionTypeHint::isValid()) return false; 3102 | if (! is_string($s)) return $s; 3103 | 3104 | #speed improve 3105 | if (strlen($s) < 4 #по минимальной длине сущности - 4 байта: &#d; &xx; 3106 | || ($pos = strpos($s, '&') === false) || strpos($s, ';', $pos) === false) return $s; 3107 | 3108 | $table = self::$html_entity_table; 3109 | if ($is_special_chars) $table += self::$html_special_chars_table; 3110 | 3111 | #replace named entities 3112 | $s = strtr($s, $table); 3113 | #block below deprecated, since PHP-5.3.x strtr() 1.5 times faster 3114 | if (0 && preg_match_all('/&[a-zA-Z]++\d*+;/sSX', $s, $m, null, $pos)) 3115 | { 3116 | foreach (array_unique($m[0]) as $entity) 3117 | { 3118 | if (array_key_exists($entity, $table)) $s = str_replace($entity, $table[$entity], $s); 3119 | } 3120 | } 3121 | 3122 | #заменяем числовые dec и hex сущности: 3123 | if (strpos($s, '&#') !== false) #speed improve 3124 | { 3125 | $class = __CLASS__; 3126 | $html_special_chars_table_flipped = array_flip(self::$html_special_chars_table); 3127 | $s = preg_replace_callback('/&#((x)[\da-fA-F]{1,6}+|\d{1,7}+);/sSX', 3128 | function (array $m) use ($class, $html_special_chars_table_flipped, $is_special_chars) 3129 | { 3130 | $codepoint = isset($m[2]) && $m[2] === 'x' ? hexdec($m[1]) : $m[1]; 3131 | if (! $is_special_chars) 3132 | { 3133 | $char = pack('C', $codepoint); 3134 | if (array_key_exists($char, $html_special_chars_table_flipped)) return $html_special_chars_table_flipped[$char]; 3135 | } 3136 | return $class::chr($codepoint); 3137 | }, $s); 3138 | } 3139 | return $s; 3140 | } 3141 | 3142 | /** 3143 | * Convert special UTF-8 characters to HTML entities. 3144 | * Функция кодирует гораздо больше именованных сущностей, чем стандартная htmlentities() 3145 | * 3146 | * @link http://www.htmlhelp.com/reference/html40/entities/ 3147 | * @link http://www.alanwood.net/demos/ent4_frame.html (HTML 4.01 Character Entity References) 3148 | * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset1.asp?frame=true 3149 | * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp?frame=true 3150 | * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp?frame=true 3151 | * 3152 | * @param scalar|null $s 3153 | * @param bool $is_special_chars_only Обрабатывать только специальные html сущности? (< > & ") 3154 | * @return scalar|null Returns FALSE if error occurred 3155 | */ 3156 | public static function html_entity_encode($s, $is_special_chars_only = false) 3157 | { 3158 | if (! ReflectionTypeHint::isValid()) return false; 3159 | if (! is_string($s)) return $s; 3160 | 3161 | #if ($is_special_chars_only) return strtr($s, array_flip(self::$html_special_chars_table)); 3162 | if ($is_special_chars_only) return htmlspecialchars($s); 3163 | 3164 | #replace UTF-8 chars to named entities: 3165 | $s = strtr($s, array_flip(self::$html_entity_table)); 3166 | #block below deprecated, since PHP-5.3.x strtr() 3 times faster 3167 | if (0 && preg_match_all('~(?> [\xc2\xc3\xc5\xc6\xcb\xce\xcf][\x80-\xbf] #2 bytes 3168 | | \xe2[\x80-\x99][\x82-\xac] #3 bytes 3169 | ) 3170 | ~sxSX', $s, $m)) 3171 | { 3172 | $table = array_flip(self::$html_entity_table); 3173 | foreach (array_unique($m[0]) as $char) 3174 | { 3175 | if (array_key_exists($char, $table)) $s = str_replace($char, $table[$char], $s); 3176 | } 3177 | } 3178 | 3179 | return $s; 3180 | } 3181 | 3182 | /** 3183 | * Make regular expression for case insensitive match 3184 | * Example (non ASCII): "123_слово_test" => "123_(с|С)(л|Л)(о|О)(в|В)(о|О)_[tT][eE][sS][tT]" 3185 | * Example (only ASCII): "123_test" => "(?i:123_test)" 3186 | * 3187 | * @param string $s 3188 | * @param string|null $delimiter If the optional delimiter is specified, it will also be escaped. 3189 | * This is useful for escaping the delimiter that is required by the PCRE functions. 3190 | * The / is the most commonly used delimiter. 3191 | * @return string|bool|null Returns FALSE if error occurred 3192 | */ 3193 | public static function preg_quote_case_insensitive($s, $delimiter = null) 3194 | { 3195 | if (! ReflectionTypeHint::isValid()) return false; 3196 | if (is_null($s)) return $s; 3197 | 3198 | if (self::is_ascii($s)) return '(?i:' . preg_quote($s, $delimiter) . ')'; #speed improve 3199 | 3200 | $s_re = ''; 3201 | $s_lc = UTF8::lowercase($s); if ($s_lc === false) return false; 3202 | $s_uc = UTF8::uppercase($s); if ($s_uc === false) return false; 3203 | 3204 | $chars_lc = UTF8::str_split($s_lc); if ($chars_lc === false) return false; 3205 | $chars_uc = UTF8::str_split($s_uc); if ($chars_uc === false) return false; 3206 | 3207 | foreach ($chars_lc as $i => $char) 3208 | { 3209 | if ($chars_lc[$i] === $chars_uc[$i]) 3210 | $s_re .= preg_quote($chars_lc[$i], $delimiter); 3211 | elseif (self::is_ascii($chars_lc[$i])) 3212 | $s_re .= '[' . preg_quote($chars_lc[$i] . $chars_uc[$i], $delimiter) . ']'; 3213 | else 3214 | $s_re .= '(' . preg_quote($chars_lc[$i], $delimiter) . '|' 3215 | . preg_quote($chars_uc[$i], $delimiter) . ')'; 3216 | } 3217 | return $s_re; 3218 | } 3219 | 3220 | /** 3221 | * Call preg_match_all() and convert byte offsets into character offsets for PREG_OFFSET_CAPTURE flag. 3222 | * This is regardless of whether you use /u modifier. 3223 | * 3224 | * @link http://bolknote.ru/2010/09/08/~2704 3225 | * 3226 | * @param string $pattern 3227 | * @param string|null $subject 3228 | * @param array $matches 3229 | * @param int $flags 3230 | * @param int $char_offset 3231 | * @return array|bool|null Returns FALSE if error occurred 3232 | */ 3233 | public static function preg_match_all($pattern, $subject, &$matches, $flags = PREG_PATTERN_ORDER, $char_offset = 0) 3234 | { 3235 | if (! ReflectionTypeHint::isValid()) return false; 3236 | if (is_null($subject)) return null; 3237 | 3238 | $byte_offset = ($char_offset > 0) ? strlen(self::substr($subject, 0, $char_offset)) : $char_offset; 3239 | 3240 | $return = preg_match_all($pattern, $subject, $matches, $flags, $byte_offset); 3241 | if ($return === false) return false; 3242 | 3243 | if ($flags & PREG_OFFSET_CAPTURE) 3244 | { 3245 | foreach ($matches as &$match) 3246 | { 3247 | foreach ($match as &$a) $a[1] = self::strlen(substr($subject, 0, $a[1])); 3248 | } 3249 | } 3250 | 3251 | return $return; 3252 | } 3253 | 3254 | #alias for self::str_limit() 3255 | public static function truncate($s, $maxlength = null, $continue = "\xe2\x80\xa6", &$is_cutted = null, $tail_min_length = 20) 3256 | { 3257 | return self::str_limit($s, $maxlength, $continue, $is_cutted, $tail_min_length); 3258 | } 3259 | 3260 | /** 3261 | * Обрезает текст в кодировке UTF-8 до заданной длины, 3262 | * причём последнее слово показывается целиком, а не обрывается на середине. 3263 | * Html сущности корректно обрабатываются. 3264 | * 3265 | * @param string|null $s Текст в кодировке UTF-8 3266 | * @param int|null|digit $maxlength Ограничение длины текста 3267 | * @param string $continue Завершающая строка, которая будет вставлена после текста, если он обрежется 3268 | * @param bool|null &$is_cutted Текст был обрезан? 3269 | * @param int|digit $tail_min_length Если длина "хвоста", оставшегося после обрезки текста, меньше $tail_min_length, 3270 | * то текст возвращается без изменений 3271 | * @return string|bool|null Returns FALSE if error occurred 3272 | */ 3273 | public static function str_limit($s, $maxlength = null, $continue = "\xe2\x80\xa6", &$is_cutted = null, $tail_min_length = 20) #"\xe2\x80\xa6" = "…" 3274 | { 3275 | if (! ReflectionTypeHint::isValid()) return false; 3276 | if (is_null($s)) return $s; 3277 | 3278 | $is_cutted = false; 3279 | if ($continue === null) $continue = "\xe2\x80\xa6"; 3280 | if (! $maxlength) $maxlength = 256; 3281 | 3282 | #speed improve block 3283 | #{{{ 3284 | if (strlen($s) <= $maxlength) return $s; 3285 | $s2 = str_replace("\r\n", '?', $s); 3286 | $s2 = preg_replace('/&(?> [a-zA-Z][a-zA-Z\d]+ 3287 | | \#(?> \d{1,4} 3288 | | x[\da-fA-F]{2,4} 3289 | ) 3290 | ); # html сущности (< > & ") 3291 | /sxSX', '?', $s2); 3292 | if (strlen($s2) <= $maxlength || self::strlen($s2) <= $maxlength) return $s; 3293 | #}}} 3294 | 3295 | $r = preg_match_all('/(?> \r\n # переносы строк 3296 | | &(?> [a-zA-Z][a-zA-Z\d]+ 3297 | | \#(?> \d{1,4} 3298 | | x[\da-fA-F]{2,4} 3299 | ) 3300 | ); # html сущности (< > & ") 3301 | | . 3302 | ) 3303 | /sxuSX', $s, $m); 3304 | if ($r === false) return false; 3305 | 3306 | #d($m); 3307 | if (count($m[0]) <= $maxlength) return $s; 3308 | 3309 | $left = implode('', array_slice($m[0], 0, $maxlength)); 3310 | #из диапазона ASCII исключаем буквы, цифры, открывающие парные символы [a-zA-Z\d\(\{\[] и некоторые др. символы 3311 | #нельзя вырезать в конце строки символ ";", т.к. он используются в сущностях &xxx; 3312 | $left2 = rtrim($left, "\x00..\x28\x2A..\x2F\x3A\x3C..\x3E\x40\x5B\x5C\x5E..\x60\x7B\x7C\x7E\x7F"); 3313 | if (strlen($left) !== strlen($left2)) $return = $left2 . $continue; 3314 | else 3315 | { 3316 | #добавляем остаток к обрезанному слову 3317 | $right = implode('', array_slice($m[0], $maxlength)); 3318 | preg_match('/^(?> [\d\)\]\}\-\.:]+ #цифры, закрывающие парные символы, дефис для составных слов, дата, время, IP-адреса, URL типа www.ya.ru:80! 3319 | | \p{L}+ #буквы 3320 | | \xe2\x80\x9d #закрывающие кавычки 3321 | | \xe2\x80\x99 #закрывающие кавычки 3322 | | \xe2\x80\x9c #закрывающие кавычки 3323 | | \xc2\xbb #закрывающие кавычки 3324 | )+ 3325 | /suxSX', $right, $m); 3326 | #d($m); 3327 | $right = isset($m[0]) ? rtrim($m[0], '.-') : ''; 3328 | $return = $left . $right; 3329 | if (strlen($return) !== strlen($s)) $return .= $continue; 3330 | } 3331 | if (self::strlen($s) - self::strlen($return) < $tail_min_length) return $s; 3332 | 3333 | $is_cutted = true; 3334 | return $return; 3335 | } 3336 | 3337 | /** 3338 | * Implementation str_split() function for UTF-8 encoding string. 3339 | * 3340 | * @param string|null $s 3341 | * @param int|null|digit $length 3342 | * @return array|bool|null Returns FALSE if error occurred 3343 | */ 3344 | public static function str_split($s, $length = null) 3345 | { 3346 | if (! ReflectionTypeHint::isValid()) return false; 3347 | if (is_null($s)) return $s; 3348 | 3349 | $length = ($length === null) ? 1 : intval($length); 3350 | if ($length < 1) return false; 3351 | #there are limits in regexp for {min,max}! 3352 | if (preg_match_all('~.~suSX', $s, $m) === false) return false; 3353 | if (function_exists('preg_last_error') && preg_last_error() !== PREG_NO_ERROR) return false; 3354 | if ($length === 1) $a = $m[0]; 3355 | else 3356 | { 3357 | $a = array(); 3358 | for ($i = 0, $c = count($m[0]); $i < $c; $i += $length) $a[] = implode('', array_slice($m[0], $i, $length)); 3359 | } 3360 | return $a; 3361 | } 3362 | 3363 | /** 3364 | * Implementation strlen() function for UTF-8 encoding string. 3365 | * 3366 | * @param string|null $s 3367 | * @return int|bool|null Returns FALSE if error occurred 3368 | */ 3369 | public static function strlen($s) 3370 | { 3371 | if (! ReflectionTypeHint::isValid()) return false; 3372 | if (is_null($s)) return $s; 3373 | 3374 | //since PHP-5.3.x mb_strlen() faster then strlen(utf8_decode()) 3375 | if (function_exists('mb_strlen')) return mb_strlen($s, 'utf-8'); 3376 | 3377 | /* 3378 | utf8_decode() converts characters that are not in ISO-8859-1 to '?', which, for the purpose of counting, is quite alright. 3379 | It's much faster than iconv_strlen() 3380 | Note: this function does not count bad UTF-8 bytes in the string - these are simply ignored 3381 | */ 3382 | return strlen(utf8_decode($s)); 3383 | 3384 | /* 3385 | #slowly then strlen(utf8_decode()) 3386 | if (function_exists('iconv_strlen')) return iconv_strlen($s, 'utf-8'); 3387 | 3388 | #Do not count UTF-8 continuation bytes 3389 | #return strlen(preg_replace('/[\x80-\xBF]/sSX', '', $s)); 3390 | 3391 | #slowly then strlen(utf8_decode()) 3392 | preg_match_all('~.~suSX', $str, $m); 3393 | return count($m[0]); 3394 | 3395 | #slowly then preg_match_all() + count() 3396 | $n = 0; 3397 | for ($i = 0, $len = strlen($s); $i < $len; $i++) 3398 | { 3399 | $c = ord(substr($s, $i, 1)); 3400 | if ($c < 0x80) $n++; #single-byte (0xxxxxx) 3401 | elseif (($c & 0xC0) == 0xC0) $n++; #multi-byte starting byte (11xxxxxx) 3402 | } 3403 | return $n; 3404 | */ 3405 | } 3406 | 3407 | /** 3408 | * Implementation strpos() function for UTF-8 encoding string 3409 | * 3410 | * @param string|null $s The entire string 3411 | * @param string|int $needle The searched substring 3412 | * @param int|null $offset The optional offset parameter specifies the position from which the search should be performed 3413 | * @return int|bool|null Returns the numeric position of the first occurrence of needle in haystack. 3414 | * If needle is not found, will return FALSE. 3415 | */ 3416 | public static function strpos($s, $needle, $offset = null) 3417 | { 3418 | if (! ReflectionTypeHint::isValid()) return false; 3419 | if (is_null($s)) return $s; 3420 | 3421 | if ($offset === null || $offset < 0) $offset = 0; 3422 | if (function_exists('mb_strpos')) return mb_strpos($s, $needle, $offset, 'utf-8'); 3423 | #iconv_strpos() deprecated, because slowly than self::strlen(substr()) 3424 | #if (function_exists('iconv_strpos')) return iconv_strpos($s, $needle, $offset, 'utf-8'); 3425 | $byte_pos = $offset; 3426 | do if (($byte_pos = strpos($s, $needle, $byte_pos)) === false) return false; 3427 | while (($char_pos = self::strlen(substr($s, 0, $byte_pos++))) < $offset); 3428 | return $char_pos; 3429 | } 3430 | 3431 | /** 3432 | * Find position of first occurrence of a case-insensitive string. 3433 | * 3434 | * @param string|null $s The entire string 3435 | * @param string|int $needle The searched substring 3436 | * @param int|null $offset The optional offset parameter specifies the position from which the search should be performed 3437 | * @return int|bool|null Returns the numeric position of the first occurrence of needle in haystack. 3438 | * If needle is not found, will return FALSE. 3439 | */ 3440 | public static function stripos($s, $needle, $offset = null) 3441 | { 3442 | if (! ReflectionTypeHint::isValid()) return false; 3443 | if (is_null($s)) return $s; 3444 | 3445 | if ($offset === null || $offset < 0) $offset = 0; 3446 | if (function_exists('mb_stripos')) return mb_stripos($s, $needle, $offset, 'utf-8'); 3447 | 3448 | #optimization block (speed improve) 3449 | #{{{ 3450 | $ascii_int = intval(self::is_ascii($s)) + intval(self::is_ascii($needle)); 3451 | if ($ascii_int === 1) return false; 3452 | if ($ascii_int === 2) return stripos($s, $needle, $offset); 3453 | #}}} 3454 | 3455 | $s = self::convert_case($s, CASE_LOWER, false); 3456 | if ($s === false) return false; 3457 | $needle = self::convert_case($needle, CASE_LOWER, false); 3458 | if ($needle === false) return false; 3459 | return self::strpos($s, $needle, $offset); 3460 | } 3461 | 3462 | /** 3463 | * Implementation strrev() function for UTF-8 encoding string 3464 | * 3465 | * @param string|null $s 3466 | * @return string|bool|null Returns FALSE if error occurred 3467 | */ 3468 | public static function strrev($s) 3469 | { 3470 | if (! ReflectionTypeHint::isValid()) return false; 3471 | if (is_null($s)) return $s; 3472 | 3473 | if (0) #TODO test speed 3474 | { 3475 | $s = self::_convert($s, 'UTF-8', 'UTF-32'); 3476 | if (! is_string($s)) return false; 3477 | $s = implode('', array_reverse(str_split($s, 4))); 3478 | return self::_convert($s, 'UTF-32', 'UTF-8'); 3479 | } 3480 | 3481 | if (! is_array($a = self::str_split($s))) return false; 3482 | return implode('', array_reverse($a)); 3483 | } 3484 | 3485 | /** 3486 | * Implementation substr() function for UTF-8 encoding string. 3487 | * 3488 | * @link http://www.w3.org/International/questions/qa-forms-utf-8.html 3489 | * @param string|null $s 3490 | * @param int|digit $offset 3491 | * @param int|null|digit $length 3492 | * @return string|bool|null Returns FALSE if error occurred 3493 | */ 3494 | public static function substr($s, $offset, $length = null) 3495 | { 3496 | if (! ReflectionTypeHint::isValid()) return false; 3497 | if (is_null($s)) return $s; 3498 | 3499 | #since PHP-5.3.x mb_substr() faster then iconv_substr() 3500 | if (function_exists('mb_substr')) 3501 | { 3502 | if ($length === null) $length = self::strlen($s); 3503 | return mb_substr($s, $offset, $length, 'utf-8'); 3504 | } 3505 | if (function_exists('iconv_substr')) 3506 | { 3507 | if ($length === null) $length = self::strlen($s); 3508 | return iconv_substr($s, $offset, $length, 'utf-8'); 3509 | } 3510 | 3511 | static $_s = null; 3512 | static $_a = null; 3513 | 3514 | if ($_s !== $s) $_a = self::str_split($_s = $s); 3515 | if (! is_array($_a)) return false; 3516 | if ($length !== null) $a = array_slice($_a, $offset, $length); 3517 | else $a = array_slice($_a, $offset); 3518 | return implode('', $a); 3519 | } 3520 | 3521 | /** 3522 | * Implementation substr_replace() function for UTF-8 encoding string. 3523 | * 3524 | * @param string|null $s 3525 | * @param string|int $replacement 3526 | * @param int|digit $start 3527 | * @param int|null $length 3528 | * @return string|bool|null Returns FALSE if error occurred 3529 | */ 3530 | public static function substr_replace($s, $replacement, $start, $length = null) 3531 | { 3532 | if (! ReflectionTypeHint::isValid()) return false; 3533 | if (is_null($s)) return $s; 3534 | 3535 | if (! is_array($a = self::str_split($s))) return false; 3536 | array_splice($a, $start, $length, $replacement); 3537 | return implode('', $a); 3538 | } 3539 | 3540 | /** 3541 | * Implementation ucfirst() function for UTF-8 encoding string. 3542 | * Преобразует первый символ строки в кодировке UTF-8 в верхний регистр. 3543 | * 3544 | * @param string|null $s 3545 | * @param bool $is_other_to_lowercase остальные символы преобразуются в нижний регистр? 3546 | * @return string|bool|null Returns FALSE if error occurred 3547 | */ 3548 | public static function ucfirst($s, $is_other_to_lowercase = true) 3549 | { 3550 | if (! ReflectionTypeHint::isValid()) return false; 3551 | if (is_null($s)) return $s; 3552 | 3553 | if ($s === '' || ! is_string($s)) return $s; 3554 | if (! preg_match('/^(.)(.*)$/suSX', $s, $m)) return false; 3555 | return self::uppercase($m[1]) . ($is_other_to_lowercase ? self::lowercase($m[2]) : $m[2]); 3556 | } 3557 | 3558 | /** 3559 | * Implementation ucwords() function for UTF-8 encoding string. 3560 | * Преобразует в верхний регистр первый символ каждого слова в строке в кодировке UTF-8, 3561 | * остальные символы каждого слова преобразуются в нижний регистр. 3562 | * 3563 | * @param string|null $s 3564 | * @param bool $is_other_to_lowercase остальные символы преобразуются в нижний регистр? 3565 | * @param string $spaces_re 3566 | * @return string|bool|null Returns FALSE if error occurred 3567 | */ 3568 | public static function ucwords($s, $is_other_to_lowercase = true, $spaces_re = '~([\pZ\s]+)~suSX') #\pXps is POSIX space: property Z or tab, NL, VT, FF, CR 3569 | { 3570 | if (! ReflectionTypeHint::isValid()) return false; 3571 | if (is_null($s)) return $s; 3572 | 3573 | $words = preg_split($spaces_re, $s, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE); 3574 | foreach ($words as $k => $word) 3575 | { 3576 | $words[$k] = self::ucfirst($word, $is_other_to_lowercase = true); 3577 | if ($words[$k] === false) return false; 3578 | } 3579 | return implode('', $words); 3580 | } 3581 | 3582 | /** 3583 | * Decodes a string in the format %uXXXX or %u{XXXXXX} in the UTF-8 string. 3584 | * 3585 | * Используется для декодирования данных типа "%u0442%u0435%u0441%u0442", 3586 | * закодированных устаревшей функцией javascript://encode(). 3587 | * Рекомендуется использовать функцию javascript://encodeURIComponent(). 3588 | * 3589 | * NOTICE 3590 | * Устаревший формат %uXXXX позволяет использовать юникод только из диапазона UCS-2, т.е. от U+0 до U+FFFF 3591 | * 3592 | * @param scalar|array|null $data 3593 | * @param bool $is_rawurlencode 3594 | * @return scalar|array|null Returns FALSE if error occurred 3595 | */ 3596 | public static function unescape($data, $is_rawurlencode = false) 3597 | { 3598 | if (! ReflectionTypeHint::isValid()) return false; 3599 | if (is_array($data)) 3600 | { 3601 | $d = array(); 3602 | foreach ($data as $k => &$v) 3603 | { 3604 | $k = self::unescape($k, $is_rawurlencode); 3605 | if ($k === false) return false; 3606 | $d[$k] = self::unescape($v, $is_rawurlencode); 3607 | if ($d[$k] === false && ! is_bool($v)) return false; 3608 | } 3609 | return $d; 3610 | } 3611 | if (is_string($data)) 3612 | { 3613 | if (strpos($data, '%u') === false) return $data; #use strpos() for speed improving 3614 | return preg_replace_callback('/%u( [\da-fA-F]{4}+ #%uXXXX only UCS-2 3615 | | \{ [\da-fA-F]{1,6}+ \} #%u{XXXXXX} extended form for all UNICODE charts 3616 | ) 3617 | /sxSX', 3618 | function (array $m) use ($is_rawurlencode) 3619 | { 3620 | $codepoint = hexdec(trim($m[1], '{}')); 3621 | $char = self::chr($codepoint); 3622 | return $is_rawurlencode ? rawurlencode($char) : $char; 3623 | }, 3624 | $data); 3625 | } 3626 | if (is_scalar($data) || is_null($data)) return $data; #~ null, integer, float, boolean 3627 | return false; #object or resource 3628 | } 3629 | 3630 | /** 3631 | * 1) Corrects the global arrays $_GET, $_POST, $_COOKIE, $_REQUEST 3632 | * decoded values ​​in the format %uXXXX and %u{XXXXXX}, encoded, 3633 | * for example, through an outdated javascript function escape(). 3634 | * Standard PHP5 cannot do it. 3635 | * 2) If in the HTTP_COOKIE there are parameters with the same name, 3636 | * takes the last value, not the first, as in the QUERY_STRING. 3637 | * 3) Creates an array of $_POST for non-standard Content-Type, for example, "Content-Type: application/octet-stream". 3638 | * Standard PHP5 creates an array for "Content-Type: application/x-www-form-urlencoded" and "Content-Type: multipart/form-data". 3639 | * 3640 | * Сессии, куки и независимая авторизация на поддоменах. 3641 | * 3642 | * ПРИМЕР 1 3643 | * У рабочего сайта http://domain.com появились поддомены. 3644 | * Для кроссдоменной авторизации через механизм сессий имя хоста для COOKIE было изменено с "domain.com" на ".domain.com" 3645 | * В результате авторизация не работает. 3646 | * Помогает очистка COOKIE, но их принудительная очистка на тысячах пользовательских компьютеров проблематична. 3647 | * Проблема в следующем: если в HTTP_COOKIE есть параметры с одинаковым именем, то берётся последнее значение, 3648 | * а не первое, как в QUERY_STRING. 3649 | * Более подробное описание: 3650 | * PHP не правильно (?) обрабатывает заголовок HTTP_COOKIE, если там встречаются параметры с одинаковым именем, но разными значениями. 3651 | * Пример запроса HTTP-заголовка клиентом: "Cookie: sid=chpgs2fiak-330mzqza; sid=cmz5tnp5zz-xlbbgqp" 3652 | * В этом случае сервер берёт первое значение, а не последнее. 3653 | * Хотя если в QUERY_STRING есть такая ситуация, всегда берётся последний параметр. 3654 | * В HTTP_COOKIE два параметра с одинаковым именем могут появиться, если отправить клиенту следующие HTTP-заголовки: 3655 | * "Set-Cookie: sid=chpgs2fiak-330mzqza; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=domain.com" (только domain.com) 3656 | * "Set-Cookie: sid=cmz6uqorzv-1bn35110; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=.domain.com" (domain.com и все его поддомены) 3657 | * Решение: поменять имя сессии. 3658 | * 3659 | * ПРИМЕР 2 3660 | * Есть рабочие сайты: http://domain.com (основной), http://admin.domain.com (админка), 3661 | * http://sub1.domain.com (подпроект 1), http://sub2.domain.com, (подпроект 2). 3662 | * Так же имеется сервер разработки http://dev.domain.com, на котором м. б. свои поддомены. 3663 | * Требуется сделать независимую кросс-доменную авторизацию для http://*.domain.com и http://*.dev.domain.com. 3664 | * Для сохранения статуса авторизации будем использовать сессию, имя и значение которой пишется в COOKIE. 3665 | * Т. к. домены http://*.dev.domain.com имеют пересечение с доменами http://*.domain.com, 3666 | * для независимой авторизации нужно использовать разные имена сессий. 3667 | * Пример HTTP заголовков ответа сервера: 3668 | * "Set-Cookie: sid=chpgs2fiak-330mzqza; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=.domain.com" (.domain.com и все его поддомены) 3669 | * "Set-Cookie: sid.dev=cmz6uqorzv-1bn35110; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=.dev.domain.com" (dev.domain.com и все его поддомены) 3670 | * 3671 | * @link http://tools.ietf.org/html/rfc2965 RFC 2965 - HTTP State Management Mechanism 3672 | * @return void 3673 | */ 3674 | public static function unescape_request() 3675 | { 3676 | $fixed = false; 3677 | #ATTENTION! HTTP_RAW_POST_DATA is only accessible when Content-Type of POST request is NOT default "application/x-www-form-urlencoded"! 3678 | $HTTP_RAW_POST_DATA = isset($_SERVER['REQUEST_METHOD']) && $_SERVER['REQUEST_METHOD'] === 'POST' ? (isset($GLOBALS['HTTP_RAW_POST_DATA']) ? $GLOBALS['HTTP_RAW_POST_DATA'] : @file_get_contents('php://input')) : null; 3679 | if (ini_get('always_populate_raw_post_data')) $GLOBALS['HTTP_RAW_POST_DATA'] = $HTTP_RAW_POST_DATA; 3680 | foreach (array( '_GET' => isset($_SERVER['QUERY_STRING']) ? $_SERVER['QUERY_STRING'] : null, 3681 | '_POST' => $HTTP_RAW_POST_DATA, 3682 | '_COOKIE' => isset($_SERVER['HTTP_COOKIE']) ? $_SERVER['HTTP_COOKIE'] : null, 3683 | ) as $k => $v) 3684 | { 3685 | if (! is_string($v)) continue; 3686 | if ($k === '_COOKIE') 3687 | { 3688 | $v = preg_replace('/; *+/sSX', '&', $v); 3689 | unset($_COOKIE); #будем парсить HTTP_COOKIE сами, чтобы сделать обработку как у QUERY_STRING 3690 | } 3691 | if (strpos($v, '%u') !== false) 3692 | { 3693 | parse_str(self::unescape($v, $is_rawurlencode = true), $GLOBALS[$k]); 3694 | $fixed = true; 3695 | continue; 3696 | } 3697 | if (array_key_exists($k, $GLOBALS)) continue; 3698 | parse_str($v, $GLOBALS[$k]); 3699 | $fixed = true; 3700 | } 3701 | if ($fixed) 3702 | { 3703 | $_REQUEST = 3704 | (isset($_COOKIE) ? $_COOKIE : array()) + 3705 | (isset($_POST) ? $_POST : array()) + 3706 | (isset($_GET) ? $_GET : array()); 3707 | } 3708 | } 3709 | 3710 | /** 3711 | * Calculates the height of the edit text in