├── LICENSE ├── README.md ├── SECURITY.md ├── composer.json └── src ├── Inflector.php ├── Normalise.php ├── StringHelper.php └── phputf8 ├── LICENSE ├── README ├── mbstring └── core.php ├── native └── core.php ├── ord.php ├── str_ireplace.php ├── str_pad.php ├── str_split.php ├── strcasecmp.php ├── strcspn.php ├── stristr.php ├── strrev.php ├── strspn.php ├── substr_replace.php ├── trim.php ├── ucfirst.php ├── ucwords.php ├── utf8.php └── utils ├── ascii.php ├── bad.php ├── patterns.php ├── position.php ├── specials.php ├── unicode.php └── validation.php /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc. 5 | 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Library General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License 307 | along with this program; if not, write to the Free Software 308 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 309 | 310 | 311 | Also add information on how to contact you by electronic and paper mail. 312 | 313 | If the program is interactive, make it output a short notice like this 314 | when it starts in an interactive mode: 315 | 316 | Gnomovision version 69, Copyright (C) year name of author 317 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 318 | This is free software, and you are welcome to redistribute it 319 | under certain conditions; type `show c' for details. 320 | 321 | The hypothetical commands `show w' and `show c' should show the appropriate 322 | parts of the General Public License. Of course, the commands you use may 323 | be called something other than `show w' and `show c'; they could even be 324 | mouse-clicks or menu items--whatever suits your program. 325 | 326 | You should also get your employer (if you work as a programmer) or your 327 | school, if any, to sign a "copyright disclaimer" for the program, if 328 | necessary. Here is a sample; alter the names: 329 | 330 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 331 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 332 | 333 | , 1 April 1989 334 | Ty Coon, President of Vice 335 | 336 | This General Public License does not permit incorporating your program into 337 | proprietary programs. If your program is a subroutine library, you may 338 | consider it more useful to permit linking proprietary applications with the 339 | library. If this is what you want to do, use the GNU Library General 340 | Public License instead of this License. 341 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # The String Package [![Build Status](https://ci.joomla.org/api/badges/joomla-framework/string/status.svg?ref=refs/heads/3.x-dev)](https://ci.joomla.org/joomla-framework/string) 2 | 3 | [![Latest Stable Version](https://poser.pugx.org/joomla/string/v/stable)](https://packagist.org/packages/joomla/string) 4 | [![Total Downloads](https://poser.pugx.org/joomla/string/downloads)](https://packagist.org/packages/joomla/string) 5 | [![Latest Unstable Version](https://poser.pugx.org/joomla/string/v/unstable)](https://packagist.org/packages/joomla/string) 6 | [![License](https://poser.pugx.org/joomla/string/license)](https://packagist.org/packages/joomla/string) 7 | 8 | ## Installation via Composer 9 | 10 | Add `"joomla/string": "~3.0"` to the require block in your composer.json and then run `composer install`. 11 | 12 | ```json 13 | { 14 | "require": { 15 | "joomla/string": "~3.0" 16 | } 17 | } 18 | ``` 19 | 20 | Alternatively, you can simply run the following from the command line: 21 | 22 | ```sh 23 | composer require joomla/string "~3.0" 24 | ``` 25 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 5 | These versions are currently being supported with security updates: 6 | 7 | | Version | Supported | 8 | | ------- | ------------------ | 9 | | 3.x.x | :white_check_mark: | 10 | | 2.0.x | :white_check_mark: | 11 | | 1.4.x | :x: | 12 | | < 1.4 | :x: | 13 | 14 | ## Reporting a Vulnerability 15 | 16 | To report a security issue in the core Joomla! CMS or Framework, or with a joomla.org website, please submit 17 | [the form on our portal](https://developer.joomla.org/security/contact-the-team.html) containing as much detail 18 | as possible about the issue. Additional information about our security team and their processes may be found on 19 | our [Security page](https://developer.joomla.org/security.html). 20 | 21 | To report an issue in a Joomla! extension, please submit it to the [Vulnerable Extensions List](https://vel.joomla.org/submit-vel). 22 | 23 | For support with a site which has been attacked, please visit the [Joomla! Forum](https://forum.joomla.org/viewforum.php?f=714). 24 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "joomla/string", 3 | "type": "joomla-package", 4 | "description": "Joomla String Package", 5 | "keywords": ["joomla", "framework", "string"], 6 | "homepage": "https://github.com/joomla-framework/string", 7 | "license": "GPL-2.0-or-later", 8 | "require": { 9 | "php": "^8.1.0", 10 | "symfony/deprecation-contracts": "^2|^3" 11 | }, 12 | "require-dev": { 13 | "doctrine/inflector": "^1.2", 14 | "joomla/test": "^3.0", 15 | "phpunit/phpunit": "^9.5.28", 16 | "squizlabs/php_codesniffer": "^3.7.2", 17 | "phpstan/phpstan": "^2.0", 18 | "phpstan/phpstan-deprecation-rules": "^2.0", 19 | "phan/phan": "^5.4.2" 20 | }, 21 | "conflict": { 22 | "doctrine/inflector": "<1.2" 23 | }, 24 | "suggest": { 25 | "ext-mbstring": "For improved processing", 26 | "doctrine/inflector": "To use the string inflector" 27 | }, 28 | "autoload": { 29 | "psr-4": { 30 | "Joomla\\String\\": "src/" 31 | }, 32 | "files": [ 33 | "src/phputf8/utf8.php", 34 | "src/phputf8/ord.php", 35 | "src/phputf8/str_ireplace.php", 36 | "src/phputf8/str_pad.php", 37 | "src/phputf8/str_split.php", 38 | "src/phputf8/strcasecmp.php", 39 | "src/phputf8/strcspn.php", 40 | "src/phputf8/stristr.php", 41 | "src/phputf8/strrev.php", 42 | "src/phputf8/strspn.php", 43 | "src/phputf8/trim.php", 44 | "src/phputf8/ucfirst.php", 45 | "src/phputf8/ucwords.php", 46 | "src/phputf8/utils/ascii.php", 47 | "src/phputf8/utils/validation.php" 48 | ] 49 | }, 50 | "autoload-dev": { 51 | "psr-4": { 52 | "Joomla\\String\\Tests\\": "Tests/" 53 | } 54 | }, 55 | "minimum-stability": "dev", 56 | "extra": { 57 | "branch-alias": { 58 | "dev-2.0-dev": "2.0-dev", 59 | "dev-3.x-dev": "3.0-dev" 60 | } 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/Inflector.php: -------------------------------------------------------------------------------- 1 | [ 40 | 'id', 41 | 'hits', 42 | 'clicks', 43 | ], 44 | ]; 45 | 46 | /** 47 | * Adds inflection regex rules to the inflector. 48 | * 49 | * @param mixed $data A string or an array of strings or regex rules to add. 50 | * @param string $ruleType The rule type: singular | plural | countable 51 | * 52 | * @return void 53 | * 54 | * @since 1.0 55 | * @throws \InvalidArgumentException 56 | */ 57 | private function addRule($data, string $ruleType) 58 | { 59 | if (\is_string($data)) { 60 | $data = [$data]; 61 | } elseif (!\is_array($data)) { 62 | throw new \InvalidArgumentException('Invalid inflector rule data.'); 63 | } elseif (!\in_array($ruleType, ['singular', 'plural', 'countable'])) { 64 | throw new \InvalidArgumentException('Unsupported rule type.'); 65 | } 66 | 67 | if ($ruleType === 'countable') { 68 | foreach ($data as $rule) { 69 | // Ensure a string is pushed. 70 | array_push(self::$countable['rules'], (string) $rule); 71 | } 72 | } else { 73 | static::rules($ruleType, $data); 74 | } 75 | } 76 | 77 | /** 78 | * Adds a countable word. 79 | * 80 | * @param mixed $data A string or an array of strings to add. 81 | * 82 | * @return $this 83 | * 84 | * @since 1.0 85 | */ 86 | public function addCountableRule($data) 87 | { 88 | $this->addRule($data, 'countable'); 89 | 90 | return $this; 91 | } 92 | 93 | /** 94 | * Adds a specific singular-plural pair for a word. 95 | * 96 | * @param string $singular The singular form of the word. 97 | * @param string $plural The plural form of the word. If omitted, it is assumed the singular and plural are identical. 98 | * 99 | * @return $this 100 | * 101 | * @since 1.0 102 | * @deprecated 3.0 Use Doctrine\Common\Inflector\Inflector::rules() instead. 103 | */ 104 | public function addWord($singular, $plural = '') 105 | { 106 | trigger_deprecation( 107 | 'joomla/string', 108 | '2.0.0', 109 | '%s() is deprecated and will be removed in 3.0, use %s::rules() instead.', 110 | __METHOD__, 111 | DoctrineInflector::class 112 | ); 113 | 114 | if ($plural !== '') { 115 | static::rules( 116 | 'plural', 117 | [ 118 | 'irregular' => [$plural => $singular], 119 | ] 120 | ); 121 | 122 | static::rules( 123 | 'singular', 124 | [ 125 | 'irregular' => [$singular => $plural], 126 | ] 127 | ); 128 | } else { 129 | static::rules( 130 | 'plural', 131 | [ 132 | 'uninflected' => [$singular], 133 | ] 134 | ); 135 | 136 | static::rules( 137 | 'singular', 138 | [ 139 | 'uninflected' => [$singular], 140 | ] 141 | ); 142 | } 143 | 144 | return $this; 145 | } 146 | 147 | /** 148 | * Adds a pluralisation rule. 149 | * 150 | * @param mixed $data A string or an array of regex rules to add. 151 | * 152 | * @return $this 153 | * 154 | * @since 1.0 155 | * @deprecated 3.0 Use Doctrine\Common\Inflector\Inflector::rules() instead. 156 | */ 157 | public function addPluraliseRule($data) 158 | { 159 | trigger_deprecation( 160 | 'joomla/string', 161 | '2.0.0', 162 | '%s() is deprecated and will be removed in 3.0, use %s::rules() instead.', 163 | __METHOD__, 164 | DoctrineInflector::class 165 | ); 166 | 167 | $this->addRule($data, 'plural'); 168 | 169 | return $this; 170 | } 171 | 172 | /** 173 | * Adds a singularisation rule. 174 | * 175 | * @param mixed $data A string or an array of regex rules to add. 176 | * 177 | * @return $this 178 | * 179 | * @since 1.0 180 | * @deprecated 3.0 Use Doctrine\Common\Inflector\Inflector::rules() instead. 181 | */ 182 | public function addSingulariseRule($data) 183 | { 184 | trigger_deprecation( 185 | 'joomla/string', 186 | '2.0.0', 187 | '%s() is deprecated and will be removed in 3.0, use %s::rules() instead.', 188 | __METHOD__, 189 | DoctrineInflector::class 190 | ); 191 | 192 | $this->addRule($data, 'singular'); 193 | 194 | return $this; 195 | } 196 | 197 | /** 198 | * Gets an instance of the Inflector singleton. 199 | * 200 | * @param boolean $new If true (default is false), returns a new instance regardless if one exists. This argument is mainly used for testing. 201 | * 202 | * @return static 203 | * 204 | * @since 1.0 205 | * @deprecated 3.0 Use static methods without a class instance instead. 206 | */ 207 | public static function getInstance($new = false) 208 | { 209 | trigger_deprecation( 210 | 'joomla/string', 211 | '2.0.0', 212 | '%s() is deprecated and will be removed in 3.0.', 213 | __METHOD__ 214 | ); 215 | 216 | if ($new) { 217 | return new static(); 218 | } 219 | 220 | if (!\is_object(self::$instance)) { 221 | self::$instance = new static(); 222 | } 223 | 224 | return self::$instance; 225 | } 226 | 227 | /** 228 | * Checks if a word is countable. 229 | * 230 | * @param string $word The string input. 231 | * 232 | * @return boolean True if word is countable, false otherwise. 233 | * 234 | * @since 1.0 235 | */ 236 | public function isCountable($word) 237 | { 238 | return \in_array($word, self::$countable['rules']); 239 | } 240 | 241 | /** 242 | * Checks if a word is in a plural form. 243 | * 244 | * @param string $word The string input. 245 | * 246 | * @return boolean True if word is plural, false if not. 247 | * 248 | * @since 1.0 249 | */ 250 | public function isPlural($word) 251 | { 252 | return $this->toPlural($this->toSingular($word)) === $word; 253 | } 254 | 255 | /** 256 | * Checks if a word is in a singular form. 257 | * 258 | * @param string $word The string input. 259 | * 260 | * @return boolean True if word is singular, false if not. 261 | * 262 | * @since 1.0 263 | */ 264 | public function isSingular($word) 265 | { 266 | return $this->toSingular($word) === $word; 267 | } 268 | 269 | /** 270 | * Converts a word into its plural form. 271 | * 272 | * @param string $word The singular word to pluralise. 273 | * 274 | * @return string The word in plural form. 275 | * 276 | * @since 1.0 277 | * @deprecated 3.0 Use Doctrine\Common\Inflector\Inflector::pluralize() instead. 278 | */ 279 | public function toPlural($word) 280 | { 281 | trigger_deprecation( 282 | 'joomla/string', 283 | '2.0.0', 284 | '%s() is deprecated and will be removed in 3.0, use %s::pluralize() instead.', 285 | __METHOD__, 286 | DoctrineInflector::class 287 | ); 288 | 289 | return static::pluralize($word); 290 | } 291 | 292 | /** 293 | * Converts a word into its singular form. 294 | * 295 | * @param string $word The plural word to singularise. 296 | * 297 | * @return string The word in singular form. 298 | * 299 | * @since 1.0 300 | * @deprecated 3.0 Use Doctrine\Common\Inflector\Inflector::singularize() instead. 301 | */ 302 | public function toSingular($word) 303 | { 304 | trigger_deprecation( 305 | 'joomla/string', 306 | '2.0.0', 307 | '%s() is deprecated and will be removed in 3.0, use %s::singularize() instead.', 308 | __METHOD__, 309 | DoctrineInflector::class 310 | ); 311 | 312 | return static::singularize($word); 313 | } 314 | } 315 | -------------------------------------------------------------------------------- /src/Normalise.php: -------------------------------------------------------------------------------- 1 | [ 30 | '#-(\d+)$#', 31 | '-%d', 32 | ], 33 | 'default' => [ 34 | ['#\((\d+)\)$#', '#\(\d+\)$#'], 35 | [' (%d)', '(%d)'], 36 | ], 37 | ]; 38 | 39 | /** 40 | * Increments a trailing number in a string. 41 | * 42 | * Used to easily create distinct labels when copying objects. The method has the following styles: 43 | * 44 | * default: "Label" becomes "Label (2)" 45 | * dash: "Label" becomes "Label-2" 46 | * 47 | * @param string $string The source string. 48 | * @param string|null $style The the style (default|dash). 49 | * @param integer $n If supplied, this number is used for the copy, otherwise it is the 'next' number. 50 | * 51 | * @return string The incremented string. 52 | * 53 | * @since 1.3.0 54 | */ 55 | public static function increment($string, $style = 'default', $n = 0) 56 | { 57 | $styleSpec = static::$incrementStyles[$style] ?? static::$incrementStyles['default']; 58 | 59 | // Regular expression search and replace patterns. 60 | if (\is_array($styleSpec[0])) { 61 | $rxSearch = $styleSpec[0][0]; 62 | $rxReplace = $styleSpec[0][1]; 63 | } else { 64 | $rxSearch = $rxReplace = $styleSpec[0]; 65 | } 66 | 67 | // New and old (existing) sprintf formats. 68 | if (\is_array($styleSpec[1])) { 69 | $newFormat = $styleSpec[1][0]; 70 | $oldFormat = $styleSpec[1][1]; 71 | } else { 72 | $newFormat = $oldFormat = $styleSpec[1]; 73 | } 74 | 75 | // Check if we are incrementing an existing pattern, or appending a new one. 76 | if (preg_match($rxSearch, $string, $matches)) { 77 | $n = empty($n) ? ($matches[1] + 1) : $n; 78 | $string = preg_replace($rxReplace, sprintf($oldFormat, $n), $string); 79 | } else { 80 | $n = empty($n) ? 2 : $n; 81 | $string .= sprintf($newFormat, $n); 82 | } 83 | 84 | return $string; 85 | } 86 | 87 | /** 88 | * Tests whether a string contains only 7bit ASCII bytes. 89 | * 90 | * You might use this to conditionally check whether a string needs handling as UTF-8 or not, potentially offering performance 91 | * benefits by using the native PHP equivalent if it's just ASCII e.g.; 92 | * 93 | * 94 | * if (StringHelper::is_ascii($someString)) 95 | * { 96 | * // It's just ASCII - use the native PHP version 97 | * $someString = strtolower($someString); 98 | * } 99 | * else 100 | * { 101 | * $someString = StringHelper::strtolower($someString); 102 | * } 103 | * 104 | * 105 | * @param string $str The string to test. 106 | * 107 | * @return boolean True if the string is all ASCII 108 | * 109 | * @since 1.3.0 110 | */ 111 | public static function is_ascii($str) 112 | { 113 | return utf8_is_ascii($str); 114 | } 115 | 116 | /** 117 | * UTF-8 aware alternative to ord() 118 | * 119 | * Returns the unicode ordinal for a character. 120 | * 121 | * @param string $chr UTF-8 encoded character 122 | * 123 | * @return integer Unicode ordinal for the character 124 | * 125 | * @link https://www.php.net/ord 126 | * @since 1.4.0 127 | */ 128 | public static function ord($chr) 129 | { 130 | return utf8_ord($chr); 131 | } 132 | 133 | /** 134 | * UTF-8 aware alternative to strpos() 135 | * 136 | * Find position of first occurrence of a string. 137 | * 138 | * @param string $str String being examined 139 | * @param string $search String being searched for 140 | * @param integer|null|boolean $offset Optional, specifies the position from which the search should be performed 141 | * 142 | * @return integer|boolean Number of characters before the first match or FALSE on failure 143 | * 144 | * @link https://www.php.net/strpos 145 | * @since 1.3.0 146 | */ 147 | public static function strpos($str, $search, $offset = false) 148 | { 149 | if ($offset === false) { 150 | return utf8_strpos($str, $search); 151 | } 152 | 153 | return utf8_strpos($str, $search, $offset); 154 | } 155 | 156 | /** 157 | * UTF-8 aware alternative to strrpos() 158 | * 159 | * Finds position of last occurrence of a string. 160 | * 161 | * @param string $str String being examined. 162 | * @param string $search String being searched for. 163 | * @param integer $offset Offset from the left of the string. 164 | * 165 | * @return integer|boolean Number of characters before the last match or false on failure 166 | * 167 | * @link https://www.php.net/strrpos 168 | * @since 1.3.0 169 | */ 170 | public static function strrpos($str, $search, $offset = 0) 171 | { 172 | return utf8_strrpos($str, $search, $offset); 173 | } 174 | 175 | /** 176 | * UTF-8 aware alternative to substr() 177 | * 178 | * Return part of a string given character offset (and optionally length). 179 | * 180 | * @param string $str String being processed 181 | * @param integer $offset Number of UTF-8 characters offset (from left) 182 | * @param integer|null|boolean $length Optional length in UTF-8 characters from offset 183 | * 184 | * @return string|boolean 185 | * 186 | * @link https://www.php.net/substr 187 | * @since 1.3.0 188 | */ 189 | public static function substr($str, $offset, $length = false) 190 | { 191 | if ($length === false) { 192 | return utf8_substr($str, $offset); 193 | } 194 | 195 | return utf8_substr($str, $offset, $length); 196 | } 197 | 198 | /** 199 | * UTF-8 aware alternative to strtolower() 200 | * 201 | * Make a string lowercase 202 | * 203 | * Note: The concept of a characters "case" only exists is some alphabets such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does 204 | * not exist in the Chinese alphabet, for example. See Unicode Standard Annex #21: Case Mappings 205 | * 206 | * @param string $str String being processed 207 | * 208 | * @return string|boolean Either string in lowercase or FALSE is UTF-8 invalid 209 | * 210 | * @link https://www.php.net/strtolower 211 | * @since 1.3.0 212 | */ 213 | public static function strtolower($str) 214 | { 215 | return utf8_strtolower($str); 216 | } 217 | 218 | /** 219 | * UTF-8 aware alternative to strtoupper() 220 | * 221 | * Make a string uppercase 222 | * 223 | * Note: The concept of a characters "case" only exists is some alphabets such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does 224 | * not exist in the Chinese alphabet, for example. See Unicode Standard Annex #21: Case Mappings 225 | * 226 | * @param string $str String being processed 227 | * 228 | * @return string|boolean Either string in uppercase or FALSE is UTF-8 invalid 229 | * 230 | * @link https://www.php.net/strtoupper 231 | * @since 1.3.0 232 | */ 233 | public static function strtoupper($str) 234 | { 235 | return utf8_strtoupper($str); 236 | } 237 | 238 | /** 239 | * UTF-8 aware alternative to strlen() 240 | * 241 | * Returns the number of characters in the string (NOT THE NUMBER OF BYTES). 242 | * 243 | * @param string $str UTF-8 string. 244 | * 245 | * @return integer Number of UTF-8 characters in string. 246 | * 247 | * @link https://www.php.net/strlen 248 | * @since 1.3.0 249 | */ 250 | public static function strlen($str) 251 | { 252 | return utf8_strlen($str); 253 | } 254 | 255 | /** 256 | * UTF-8 aware alternative to str_ireplace() 257 | * 258 | * Case-insensitive version of str_replace() 259 | * 260 | * @param string|string[] $search String to search 261 | * @param string|string[] $replace Existing string to replace 262 | * @param string $str New string to replace with 263 | * @param integer|null|boolean $count Optional count value to be passed by referene 264 | * 265 | * @return string UTF-8 String 266 | * 267 | * @link https://www.php.net/str_ireplace 268 | * @since 1.3.0 269 | */ 270 | public static function str_ireplace($search, $replace, $str, $count = null) 271 | { 272 | if ($count === false) { 273 | return utf8_ireplace($search, $replace, $str); 274 | } 275 | 276 | return utf8_ireplace($search, $replace, $str, $count); 277 | } 278 | 279 | /** 280 | * UTF-8 aware alternative to str_pad() 281 | * 282 | * Pad a string to a certain length with another string. 283 | * $padStr may contain multi-byte characters. 284 | * 285 | * @param string $input The input string. 286 | * @param integer $length If the value is negative, less than, or equal to the length of the input string, no padding takes place. 287 | * @param string $padStr The string may be truncated if the number of padding characters can't be evenly divided by the string's length. 288 | * @param integer $type The type of padding to apply 289 | * 290 | * @return string 291 | * 292 | * @link https://www.php.net/str_pad 293 | * @since 1.4.0 294 | */ 295 | public static function str_pad($input, $length, $padStr = ' ', $type = STR_PAD_RIGHT) 296 | { 297 | return utf8_str_pad($input, $length, $padStr, $type); 298 | } 299 | 300 | /** 301 | * UTF-8 aware alternative to str_split() 302 | * 303 | * Convert a string to an array. 304 | * 305 | * @param string $str UTF-8 encoded string to process 306 | * @param integer $splitLen Number to characters to split string by 307 | * 308 | * @return array|string|boolean 309 | * 310 | * @link https://www.php.net/str_split 311 | * @since 1.3.0 312 | */ 313 | public static function str_split($str, $splitLen = 1) 314 | { 315 | return utf8_str_split($str, $splitLen); 316 | } 317 | 318 | /** 319 | * UTF-8/LOCALE aware alternative to strcasecmp() 320 | * 321 | * A case insensitive string comparison. 322 | * 323 | * @param string $str1 string 1 to compare 324 | * @param string $str2 string 2 to compare 325 | * @param string|boolean $locale The locale used by strcoll or false to use classical comparison 326 | * 327 | * @return integer Either < 0 if str1 is less than str2; > 0 if str1 is greater than str2, and 0 if they are equal. 328 | * 329 | * @link https://www.php.net/strcasecmp 330 | * @link https://www.php.net/strcoll 331 | * @link https://www.php.net/setlocale 332 | * @since 1.3.0 333 | */ 334 | public static function strcasecmp($str1, $str2, $locale = false) 335 | { 336 | if ($locale === false) { 337 | return utf8_strcasecmp($str1, $str2); 338 | } 339 | 340 | // Get current locale 341 | $locale0 = setlocale(LC_COLLATE, 0); 342 | 343 | if (!$locale = setlocale(LC_COLLATE, $locale)) { 344 | $locale = $locale0; 345 | } 346 | 347 | // See if we have successfully set locale to UTF-8 348 | if (!stristr($locale, 'UTF-8') && stristr($locale, '_') && preg_match('~\.(\d+)$~', $locale, $m)) { 349 | $encoding = 'CP' . $m[1]; 350 | } elseif (stristr($locale, 'UTF-8') || stristr($locale, 'utf8')) { 351 | $encoding = 'UTF-8'; 352 | } else { 353 | $encoding = 'nonrecodable'; 354 | } 355 | 356 | // If we successfully set encoding it to utf-8 or encoding is sth weird don't recode 357 | if ($encoding == 'UTF-8' || $encoding == 'nonrecodable') { 358 | return strcoll(utf8_strtolower($str1), utf8_strtolower($str2)); 359 | } 360 | 361 | return strcoll( 362 | static::transcode(utf8_strtolower($str1), 'UTF-8', $encoding), 363 | static::transcode(utf8_strtolower($str2), 'UTF-8', $encoding) 364 | ); 365 | } 366 | 367 | /** 368 | * UTF-8/LOCALE aware alternative to strcmp() 369 | * 370 | * A case sensitive string comparison. 371 | * 372 | * @param string $str1 string 1 to compare 373 | * @param string $str2 string 2 to compare 374 | * @param mixed $locale The locale used by strcoll or false to use classical comparison 375 | * 376 | * @return integer Either < 0 if str1 is less than str2; > 0 if str1 is greater than str2, and 0 if they are equal. 377 | * 378 | * @link https://www.php.net/strcmp 379 | * @link https://www.php.net/strcoll 380 | * @link https://www.php.net/setlocale 381 | * @since 1.3.0 382 | */ 383 | public static function strcmp($str1, $str2, $locale = false) 384 | { 385 | if ($locale) { 386 | // Get current locale 387 | $locale0 = setlocale(LC_COLLATE, 0); 388 | 389 | if (!$locale = setlocale(LC_COLLATE, $locale)) { 390 | $locale = $locale0; 391 | } 392 | 393 | // See if we have successfully set locale to UTF-8 394 | if (!stristr($locale, 'UTF-8') && stristr($locale, '_') && preg_match('~\.(\d+)$~', $locale, $m)) { 395 | $encoding = 'CP' . $m[1]; 396 | } elseif (stristr($locale, 'UTF-8') || stristr($locale, 'utf8')) { 397 | $encoding = 'UTF-8'; 398 | } else { 399 | $encoding = 'nonrecodable'; 400 | } 401 | 402 | // If we successfully set encoding it to utf-8 or encoding is sth weird don't recode 403 | if ($encoding == 'UTF-8' || $encoding == 'nonrecodable') { 404 | return strcoll($str1, $str2); 405 | } 406 | 407 | return strcoll(static::transcode($str1, 'UTF-8', $encoding), static::transcode($str2, 'UTF-8', $encoding)); 408 | } 409 | 410 | return strcmp($str1, $str2); 411 | } 412 | 413 | /** 414 | * UTF-8 aware alternative to strcspn() 415 | * 416 | * Find length of initial segment not matching mask. 417 | * 418 | * @param string $str The string to process 419 | * @param string $mask The mask 420 | * @param integer|boolean $start Optional starting character position (in characters) 421 | * @param integer|boolean $length Optional length 422 | * 423 | * @return integer The length of the initial segment of str1 which does not contain any of the characters in str2 424 | * 425 | * @link https://www.php.net/strcspn 426 | * @since 1.3.0 427 | */ 428 | public static function strcspn($str, $mask, $start = null, $length = null) 429 | { 430 | if ($start === false && $length === false) { 431 | return utf8_strcspn($str, $mask); 432 | } 433 | 434 | if ($length === false) { 435 | return utf8_strcspn($str, $mask, $start); 436 | } 437 | 438 | return utf8_strcspn($str, $mask, $start, $length); 439 | } 440 | 441 | /** 442 | * UTF-8 aware alternative to stristr() 443 | * 444 | * Returns all of haystack from the first occurrence of needle to the end. Needle and haystack are examined in a case-insensitive manner to 445 | * find the first occurrence of a string using case insensitive comparison. 446 | * 447 | * @param string $str The haystack 448 | * @param string $search The needle 449 | * 450 | * @return string|boolean 451 | * 452 | * @link https://www.php.net/stristr 453 | * @since 1.3.0 454 | */ 455 | public static function stristr($str, $search) 456 | { 457 | return utf8_stristr($str, $search); 458 | } 459 | 460 | /** 461 | * UTF-8 aware alternative to strrev() 462 | * 463 | * Reverse a string. 464 | * 465 | * @param string $str String to be reversed 466 | * 467 | * @return string The string in reverse character order 468 | * 469 | * @link https://www.php.net/strrev 470 | * @since 1.3.0 471 | */ 472 | public static function strrev($str) 473 | { 474 | return utf8_strrev($str); 475 | } 476 | 477 | /** 478 | * UTF-8 aware alternative to strspn() 479 | * 480 | * Find length of initial segment matching mask. 481 | * 482 | * @param string $str The haystack 483 | * @param string $mask The mask 484 | * @param integer|null $start Start optional 485 | * @param integer|null $length Length optional 486 | * 487 | * @return integer 488 | * 489 | * @link https://www.php.net/strspn 490 | * @since 1.3.0 491 | */ 492 | public static function strspn($str, $mask, $start = null, $length = null) 493 | { 494 | if ($start === null && $length === null) { 495 | return utf8_strspn($str, $mask); 496 | } 497 | 498 | if ($length === null) { 499 | return utf8_strspn($str, $mask, $start); 500 | } 501 | 502 | return utf8_strspn($str, $mask, $start, $length); 503 | } 504 | 505 | /** 506 | * UTF-8 aware alternative to substr_replace() 507 | * 508 | * Replace text within a portion of a string. 509 | * 510 | * @param string $str The haystack 511 | * @param string $repl The replacement string 512 | * @param integer $start Start 513 | * @param integer|boolean|null $length Length (optional) 514 | * 515 | * @return string 516 | * 517 | * @link https://www.php.net/substr_replace 518 | * @since 1.3.0 519 | */ 520 | public static function substr_replace($str, $repl, $start, $length = null) 521 | { 522 | // Loaded by library loader 523 | if ($length === false) { 524 | return utf8_substr_replace($str, $repl, $start); 525 | } 526 | 527 | return utf8_substr_replace($str, $repl, $start, $length); 528 | } 529 | 530 | /** 531 | * UTF-8 aware replacement for ltrim() 532 | * 533 | * Strip whitespace (or other characters) from the beginning of a string. You only need to use this if you are supplying the charlist 534 | * optional arg and it contains UTF-8 characters. Otherwise ltrim will work normally on a UTF-8 string. 535 | * 536 | * @param string $str The string to be trimmed 537 | * @param string|boolean $charlist The optional charlist of additional characters to trim 538 | * 539 | * @return string The trimmed string 540 | * 541 | * @link https://www.php.net/ltrim 542 | * @since 1.3.0 543 | */ 544 | public static function ltrim($str, $charlist = false) 545 | { 546 | if (empty($charlist) && $charlist !== false) { 547 | return $str; 548 | } 549 | 550 | if ($charlist === false) { 551 | return utf8_ltrim($str); 552 | } 553 | 554 | return utf8_ltrim($str, $charlist); 555 | } 556 | 557 | /** 558 | * UTF-8 aware replacement for rtrim() 559 | * 560 | * Strip whitespace (or other characters) from the end of a string. You only need to use this if you are supplying the charlist 561 | * optional arg and it contains UTF-8 characters. Otherwise rtrim will work normally on a UTF-8 string. 562 | * 563 | * @param string $str The string to be trimmed 564 | * @param string|boolean $charlist The optional charlist of additional characters to trim 565 | * 566 | * @return string The trimmed string 567 | * 568 | * @link https://www.php.net/rtrim 569 | * @since 1.3.0 570 | */ 571 | public static function rtrim($str, $charlist = false) 572 | { 573 | if (empty($charlist) && $charlist !== false) { 574 | return $str; 575 | } 576 | 577 | if ($charlist === false) { 578 | return utf8_rtrim($str); 579 | } 580 | 581 | return utf8_rtrim($str, $charlist); 582 | } 583 | 584 | /** 585 | * UTF-8 aware replacement for trim() 586 | * 587 | * Strip whitespace (or other characters) from the beginning and end of a string. You only need to use this if you are supplying the charlist 588 | * optional arg and it contains UTF-8 characters. Otherwise trim will work normally on a UTF-8 string 589 | * 590 | * @param string $str The string to be trimmed 591 | * @param string|boolean $charlist The optional charlist of additional characters to trim 592 | * 593 | * @return string The trimmed string 594 | * 595 | * @link https://www.php.net/trim 596 | * @since 1.3.0 597 | */ 598 | public static function trim($str, $charlist = false) 599 | { 600 | if (empty($charlist) && $charlist !== false) { 601 | return $str; 602 | } 603 | 604 | if ($charlist === false) { 605 | return utf8_trim($str); 606 | } 607 | 608 | return utf8_trim($str, $charlist); 609 | } 610 | 611 | /** 612 | * UTF-8 aware alternative to ucfirst() 613 | * 614 | * Make a string's first character uppercase or all words' first character uppercase. 615 | * 616 | * @param string $str String to be processed 617 | * @param string|null $delimiter The words delimiter (null means do not split the string) 618 | * @param string|null $newDelimiter The new words delimiter (null means equal to $delimiter) 619 | * 620 | * @return string If $delimiter is null, return the string with first character as upper case (if applicable) 621 | * else consider the string of words separated by the delimiter, apply the ucfirst to each words 622 | * and return the string with the new delimiter 623 | * 624 | * @link https://www.php.net/ucfirst 625 | * @since 1.3.0 626 | */ 627 | public static function ucfirst($str, $delimiter = null, $newDelimiter = null) 628 | { 629 | if ($delimiter === null) { 630 | return utf8_ucfirst($str); 631 | } 632 | 633 | if ($newDelimiter === null) { 634 | $newDelimiter = $delimiter; 635 | } 636 | 637 | return implode($newDelimiter, array_map('utf8_ucfirst', explode($delimiter, $str))); 638 | } 639 | 640 | /** 641 | * UTF-8 aware alternative to ucwords() 642 | * 643 | * Uppercase the first character of each word in a string. 644 | * 645 | * @param string $str String to be processed 646 | * 647 | * @return string String with first char of each word uppercase 648 | * 649 | * @link https://www.php.net/ucwords 650 | * @since 1.3.0 651 | */ 652 | public static function ucwords($str) 653 | { 654 | return utf8_ucwords($str); 655 | } 656 | 657 | /** 658 | * Transcode a string. 659 | * 660 | * @param string $source The string to transcode. 661 | * @param string $fromEncoding The source encoding. 662 | * @param string $toEncoding The target encoding. 663 | * 664 | * @return string|null The transcoded string, or null if the source was not a string. 665 | * 666 | * @link https://bugs.php.net/bug.php?id=48147 667 | * 668 | * @since 1.3.0 669 | */ 670 | public static function transcode($source, $fromEncoding, $toEncoding) 671 | { 672 | switch (ICONV_IMPL) { 673 | case 'glibc': 674 | return @iconv($fromEncoding, $toEncoding . '//TRANSLIT,IGNORE', $source); 675 | 676 | case 'libiconv': 677 | default: 678 | return iconv($fromEncoding, $toEncoding . '//IGNORE//TRANSLIT', $source); 679 | } 680 | } 681 | 682 | /** 683 | * Tests a string as to whether it's valid UTF-8 and supported by the Unicode standard. 684 | * 685 | * Note: this function has been modified to simple return true or false. 686 | * 687 | * @param string $str UTF-8 encoded string. 688 | * 689 | * @return boolean true if valid 690 | * 691 | * @author 692 | * @link https://hsivonen.fi/php-utf8/ 693 | * @see compliant 694 | * @since 1.3.0 695 | */ 696 | public static function valid($str) 697 | { 698 | return utf8_is_valid($str); 699 | } 700 | 701 | /** 702 | * Tests whether a string complies as UTF-8. 703 | * 704 | * This will be much faster than StringHelper::valid() but will pass five and six octet UTF-8 sequences, which are not supported by Unicode and 705 | * so cannot be displayed correctly in a browser. In other words it is not as strict as StringHelper::valid() but it's faster. If you use it to 706 | * validate user input, you place yourself at the risk that attackers will be able to inject 5 and 6 byte sequences (which may or may not be a 707 | * significant risk, depending on what you are are doing). 708 | * 709 | * @param string $str UTF-8 string to check 710 | * 711 | * @return boolean TRUE if string is valid UTF-8 712 | * 713 | * @see StringHelper::valid 714 | * @link https://www.php.net/manual/en/reference.pcre.pattern.modifiers.php#54805 715 | * @since 1.3.0 716 | */ 717 | public static function compliant($str) 718 | { 719 | return utf8_compliant($str); 720 | } 721 | 722 | /** 723 | * Converts Unicode sequences to UTF-8 string. 724 | * 725 | * @param string $str Unicode string to convert 726 | * 727 | * @return string UTF-8 string 728 | * 729 | * @since 1.3.0 730 | */ 731 | public static function unicode_to_utf8($str) 732 | { 733 | if (\extension_loaded('mbstring')) { 734 | return preg_replace_callback( 735 | '/\\\\u([0-9a-fA-F]{4})/', 736 | static function ($match) { 737 | return mb_convert_encoding(pack('H*', $match[1]), 'UTF-8', 'UCS-2BE'); 738 | }, 739 | $str 740 | ); 741 | } 742 | 743 | return $str; 744 | } 745 | 746 | /** 747 | * Converts Unicode sequences to UTF-16 string. 748 | * 749 | * @param string $str Unicode string to convert 750 | * 751 | * @return string UTF-16 string 752 | * 753 | * @since 1.3.0 754 | */ 755 | public static function unicode_to_utf16($str) 756 | { 757 | if (\extension_loaded('mbstring')) { 758 | return preg_replace_callback( 759 | '/\\\\u([0-9a-fA-F]{4})/', 760 | static function ($match) { 761 | return mb_convert_encoding(pack('H*', $match[1]), 'UTF-8', 'UTF-16BE'); 762 | }, 763 | $str 764 | ); 765 | } 766 | 767 | return $str; 768 | } 769 | } 770 | -------------------------------------------------------------------------------- /src/phputf8/LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 2.1, February 1999 3 | 4 | Copyright (C) 1991, 1999 Free Software Foundation, Inc. 5 | 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | [This is the first released version of the Lesser GPL. It also counts 10 | as the successor of the GNU Library Public License, version 2, hence 11 | the version number 2.1.] 12 | 13 | Preamble 14 | 15 | The licenses for most software are designed to take away your 16 | freedom to share and change it. By contrast, the GNU General Public 17 | Licenses are intended to guarantee your freedom to share and change 18 | free software--to make sure the software is free for all its users. 19 | 20 | This license, the Lesser General Public License, applies to some 21 | specially designated software packages--typically libraries--of the 22 | Free Software Foundation and other authors who decide to use it. You 23 | can use it too, but we suggest you first think carefully about whether 24 | this license or the ordinary General Public License is the better 25 | strategy to use in any particular case, based on the explanations below. 26 | 27 | When we speak of free software, we are referring to freedom of use, 28 | not price. Our General Public Licenses are designed to make sure that 29 | you have the freedom to distribute copies of free software (and charge 30 | for this service if you wish); that you receive source code or can get 31 | it if you want it; that you can change the software and use pieces of 32 | it in new free programs; and that you are informed that you can do 33 | these things. 34 | 35 | To protect your rights, we need to make restrictions that forbid 36 | distributors to deny you these rights or to ask you to surrender these 37 | rights. These restrictions translate to certain responsibilities for 38 | you if you distribute copies of the library or if you modify it. 39 | 40 | For example, if you distribute copies of the library, whether gratis 41 | or for a fee, you must give the recipients all the rights that we gave 42 | you. You must make sure that they, too, receive or can get the source 43 | code. If you link other code with the library, you must provide 44 | complete object files to the recipients, so that they can relink them 45 | with the library after making changes to the library and recompiling 46 | it. And you must show them these terms so they know their rights. 47 | 48 | We protect your rights with a two-step method: (1) we copyright the 49 | library, and (2) we offer you this license, which gives you legal 50 | permission to copy, distribute and/or modify the library. 51 | 52 | To protect each distributor, we want to make it very clear that 53 | there is no warranty for the free library. Also, if the library is 54 | modified by someone else and passed on, the recipients should know 55 | that what they have is not the original version, so that the original 56 | author's reputation will not be affected by problems that might be 57 | introduced by others. 58 | 59 | Finally, software patents pose a constant threat to the existence of 60 | any free program. We wish to make sure that a company cannot 61 | effectively restrict the users of a free program by obtaining a 62 | restrictive license from a patent holder. Therefore, we insist that 63 | any patent license obtained for a version of the library must be 64 | consistent with the full freedom of use specified in this license. 65 | 66 | Most GNU software, including some libraries, is covered by the 67 | ordinary GNU General Public License. This license, the GNU Lesser 68 | General Public License, applies to certain designated libraries, and 69 | is quite different from the ordinary General Public License. We use 70 | this license for certain libraries in order to permit linking those 71 | libraries into non-free programs. 72 | 73 | When a program is linked with a library, whether statically or using 74 | a shared library, the combination of the two is legally speaking a 75 | combined work, a derivative of the original library. The ordinary 76 | General Public License therefore permits such linking only if the 77 | entire combination fits its criteria of freedom. The Lesser General 78 | Public License permits more lax criteria for linking other code with 79 | the library. 80 | 81 | We call this license the "Lesser" General Public License because it 82 | does Less to protect the user's freedom than the ordinary General 83 | Public License. It also provides other free software developers Less 84 | of an advantage over competing non-free programs. These disadvantages 85 | are the reason we use the ordinary General Public License for many 86 | libraries. However, the Lesser license provides advantages in certain 87 | special circumstances. 88 | 89 | For example, on rare occasions, there may be a special need to 90 | encourage the widest possible use of a certain library, so that it becomes 91 | a de-facto standard. To achieve this, non-free programs must be 92 | allowed to use the library. A more frequent case is that a free 93 | library does the same job as widely used non-free libraries. In this 94 | case, there is little to gain by limiting the free library to free 95 | software only, so we use the Lesser General Public License. 96 | 97 | In other cases, permission to use a particular library in non-free 98 | programs enables a greater number of people to use a large body of 99 | free software. For example, permission to use the GNU C Library in 100 | non-free programs enables many more people to use the whole GNU 101 | operating system, as well as its variant, the GNU/Linux operating 102 | system. 103 | 104 | Although the Lesser General Public License is Less protective of the 105 | users' freedom, it does ensure that the user of a program that is 106 | linked with the Library has the freedom and the wherewithal to run 107 | that program using a modified version of the Library. 108 | 109 | The precise terms and conditions for copying, distribution and 110 | modification follow. Pay close attention to the difference between a 111 | "work based on the library" and a "work that uses the library". The 112 | former contains code derived from the library, whereas the latter must 113 | be combined with the library in order to run. 114 | 115 | GNU LESSER GENERAL PUBLIC LICENSE 116 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 117 | 118 | 0. This License Agreement applies to any software library or other 119 | program which contains a notice placed by the copyright holder or 120 | other authorized party saying it may be distributed under the terms of 121 | this Lesser General Public License (also called "this License"). 122 | Each licensee is addressed as "you". 123 | 124 | A "library" means a collection of software functions and/or data 125 | prepared so as to be conveniently linked with application programs 126 | (which use some of those functions and data) to form executables. 127 | 128 | The "Library", below, refers to any such software library or work 129 | which has been distributed under these terms. A "work based on the 130 | Library" means either the Library or any derivative work under 131 | copyright law: that is to say, a work containing the Library or a 132 | portion of it, either verbatim or with modifications and/or translated 133 | straightforwardly into another language. (Hereinafter, translation is 134 | included without limitation in the term "modification".) 135 | 136 | "Source code" for a work means the preferred form of the work for 137 | making modifications to it. For a library, complete source code means 138 | all the source code for all modules it contains, plus any associated 139 | interface definition files, plus the scripts used to control compilation 140 | and installation of the library. 141 | 142 | Activities other than copying, distribution and modification are not 143 | covered by this License; they are outside its scope. The act of 144 | running a program using the Library is not restricted, and output from 145 | such a program is covered only if its contents constitute a work based 146 | on the Library (independent of the use of the Library in a tool for 147 | writing it). Whether that is true depends on what the Library does 148 | and what the program that uses the Library does. 149 | 150 | 1. You may copy and distribute verbatim copies of the Library's 151 | complete source code as you receive it, in any medium, provided that 152 | you conspicuously and appropriately publish on each copy an 153 | appropriate copyright notice and disclaimer of warranty; keep intact 154 | all the notices that refer to this License and to the absence of any 155 | warranty; and distribute a copy of this License along with the 156 | Library. 157 | 158 | You may charge a fee for the physical act of transferring a copy, 159 | and you may at your option offer warranty protection in exchange for a 160 | fee. 161 | 162 | 2. You may modify your copy or copies of the Library or any portion 163 | of it, thus forming a work based on the Library, and copy and 164 | distribute such modifications or work under the terms of Section 1 165 | above, provided that you also meet all of these conditions: 166 | 167 | a) The modified work must itself be a software library. 168 | 169 | b) You must cause the files modified to carry prominent notices 170 | stating that you changed the files and the date of any change. 171 | 172 | c) You must cause the whole of the work to be licensed at no 173 | charge to all third parties under the terms of this License. 174 | 175 | d) If a facility in the modified Library refers to a function or a 176 | table of data to be supplied by an application program that uses 177 | the facility, other than as an argument passed when the facility 178 | is invoked, then you must make a good faith effort to ensure that, 179 | in the event an application does not supply such function or 180 | table, the facility still operates, and performs whatever part of 181 | its purpose remains meaningful. 182 | 183 | (For example, a function in a library to compute square roots has 184 | a purpose that is entirely well-defined independent of the 185 | application. Therefore, Subsection 2d requires that any 186 | application-supplied function or table used by this function must 187 | be optional: if the application does not supply it, the square 188 | root function must still compute square roots.) 189 | 190 | These requirements apply to the modified work as a whole. If 191 | identifiable sections of that work are not derived from the Library, 192 | and can be reasonably considered independent and separate works in 193 | themselves, then this License, and its terms, do not apply to those 194 | sections when you distribute them as separate works. But when you 195 | distribute the same sections as part of a whole which is a work based 196 | on the Library, the distribution of the whole must be on the terms of 197 | this License, whose permissions for other licensees extend to the 198 | entire whole, and thus to each and every part regardless of who wrote 199 | it. 200 | 201 | Thus, it is not the intent of this section to claim rights or contest 202 | your rights to work written entirely by you; rather, the intent is to 203 | exercise the right to control the distribution of derivative or 204 | collective works based on the Library. 205 | 206 | In addition, mere aggregation of another work not based on the Library 207 | with the Library (or with a work based on the Library) on a volume of 208 | a storage or distribution medium does not bring the other work under 209 | the scope of this License. 210 | 211 | 3. You may opt to apply the terms of the ordinary GNU General Public 212 | License instead of this License to a given copy of the Library. To do 213 | this, you must alter all the notices that refer to this License, so 214 | that they refer to the ordinary GNU General Public License, version 2, 215 | instead of to this License. (If a newer version than version 2 of the 216 | ordinary GNU General Public License has appeared, then you can specify 217 | that version instead if you wish.) Do not make any other change in 218 | these notices. 219 | 220 | Once this change is made in a given copy, it is irreversible for 221 | that copy, so the ordinary GNU General Public License applies to all 222 | subsequent copies and derivative works made from that copy. 223 | 224 | This option is useful when you wish to copy part of the code of 225 | the Library into a program that is not a library. 226 | 227 | 4. You may copy and distribute the Library (or a portion or 228 | derivative of it, under Section 2) in object code or executable form 229 | under the terms of Sections 1 and 2 above provided that you accompany 230 | it with the complete corresponding machine-readable source code, which 231 | must be distributed under the terms of Sections 1 and 2 above on a 232 | medium customarily used for software interchange. 233 | 234 | If distribution of object code is made by offering access to copy 235 | from a designated place, then offering equivalent access to copy the 236 | source code from the same place satisfies the requirement to 237 | distribute the source code, even though third parties are not 238 | compelled to copy the source along with the object code. 239 | 240 | 5. A program that contains no derivative of any portion of the 241 | Library, but is designed to work with the Library by being compiled or 242 | linked with it, is called a "work that uses the Library". Such a 243 | work, in isolation, is not a derivative work of the Library, and 244 | therefore falls outside the scope of this License. 245 | 246 | However, linking a "work that uses the Library" with the Library 247 | creates an executable that is a derivative of the Library (because it 248 | contains portions of the Library), rather than a "work that uses the 249 | library". The executable is therefore covered by this License. 250 | Section 6 states terms for distribution of such executables. 251 | 252 | When a "work that uses the Library" uses material from a header file 253 | that is part of the Library, the object code for the work may be a 254 | derivative work of the Library even though the source code is not. 255 | Whether this is true is especially significant if the work can be 256 | linked without the Library, or if the work is itself a library. The 257 | threshold for this to be true is not precisely defined by law. 258 | 259 | If such an object file uses only numerical parameters, data 260 | structure layouts and accessors, and small macros and small inline 261 | functions (ten lines or less in length), then the use of the object 262 | file is unrestricted, regardless of whether it is legally a derivative 263 | work. (Executables containing this object code plus portions of the 264 | Library will still fall under Section 6.) 265 | 266 | Otherwise, if the work is a derivative of the Library, you may 267 | distribute the object code for the work under the terms of Section 6. 268 | Any executables containing that work also fall under Section 6, 269 | whether or not they are linked directly with the Library itself. 270 | 271 | 6. As an exception to the Sections above, you may also combine or 272 | link a "work that uses the Library" with the Library to produce a 273 | work containing portions of the Library, and distribute that work 274 | under terms of your choice, provided that the terms permit 275 | modification of the work for the customer's own use and reverse 276 | engineering for debugging such modifications. 277 | 278 | You must give prominent notice with each copy of the work that the 279 | Library is used in it and that the Library and its use are covered by 280 | this License. You must supply a copy of this License. If the work 281 | during execution displays copyright notices, you must include the 282 | copyright notice for the Library among them, as well as a reference 283 | directing the user to the copy of this License. Also, you must do one 284 | of these things: 285 | 286 | a) Accompany the work with the complete corresponding 287 | machine-readable source code for the Library including whatever 288 | changes were used in the work (which must be distributed under 289 | Sections 1 and 2 above); and, if the work is an executable linked 290 | with the Library, with the complete machine-readable "work that 291 | uses the Library", as object code and/or source code, so that the 292 | user can modify the Library and then relink to produce a modified 293 | executable containing the modified Library. (It is understood 294 | that the user who changes the contents of definitions files in the 295 | Library will not necessarily be able to recompile the application 296 | to use the modified definitions.) 297 | 298 | b) Use a suitable shared library mechanism for linking with the 299 | Library. A suitable mechanism is one that (1) uses at run time a 300 | copy of the library already present on the user's computer system, 301 | rather than copying library functions into the executable, and (2) 302 | will operate properly with a modified version of the library, if 303 | the user installs one, as long as the modified version is 304 | interface-compatible with the version that the work was made with. 305 | 306 | c) Accompany the work with a written offer, valid for at 307 | least three years, to give the same user the materials 308 | specified in Subsection 6a, above, for a charge no more 309 | than the cost of performing this distribution. 310 | 311 | d) If distribution of the work is made by offering access to copy 312 | from a designated place, offer equivalent access to copy the above 313 | specified materials from the same place. 314 | 315 | e) Verify that the user has already received a copy of these 316 | materials or that you have already sent this user a copy. 317 | 318 | For an executable, the required form of the "work that uses the 319 | Library" must include any data and utility programs needed for 320 | reproducing the executable from it. However, as a special exception, 321 | the materials to be distributed need not include anything that is 322 | normally distributed (in either source or binary form) with the major 323 | components (compiler, kernel, and so on) of the operating system on 324 | which the executable runs, unless that component itself accompanies 325 | the executable. 326 | 327 | It may happen that this requirement contradicts the license 328 | restrictions of other proprietary libraries that do not normally 329 | accompany the operating system. Such a contradiction means you cannot 330 | use both them and the Library together in an executable that you 331 | distribute. 332 | 333 | 7. You may place library facilities that are a work based on the 334 | Library side-by-side in a single library together with other library 335 | facilities not covered by this License, and distribute such a combined 336 | library, provided that the separate distribution of the work based on 337 | the Library and of the other library facilities is otherwise 338 | permitted, and provided that you do these two things: 339 | 340 | a) Accompany the combined library with a copy of the same work 341 | based on the Library, uncombined with any other library 342 | facilities. This must be distributed under the terms of the 343 | Sections above. 344 | 345 | b) Give prominent notice with the combined library of the fact 346 | that part of it is a work based on the Library, and explaining 347 | where to find the accompanying uncombined form of the same work. 348 | 349 | 8. You may not copy, modify, sublicense, link with, or distribute 350 | the Library except as expressly provided under this License. Any 351 | attempt otherwise to copy, modify, sublicense, link with, or 352 | distribute the Library is void, and will automatically terminate your 353 | rights under this License. However, parties who have received copies, 354 | or rights, from you under this License will not have their licenses 355 | terminated so long as such parties remain in full compliance. 356 | 357 | 9. You are not required to accept this License, since you have not 358 | signed it. However, nothing else grants you permission to modify or 359 | distribute the Library or its derivative works. These actions are 360 | prohibited by law if you do not accept this License. Therefore, by 361 | modifying or distributing the Library (or any work based on the 362 | Library), you indicate your acceptance of this License to do so, and 363 | all its terms and conditions for copying, distributing or modifying 364 | the Library or works based on it. 365 | 366 | 10. Each time you redistribute the Library (or any work based on the 367 | Library), the recipient automatically receives a license from the 368 | original licensor to copy, distribute, link with or modify the Library 369 | subject to these terms and conditions. You may not impose any further 370 | restrictions on the recipients' exercise of the rights granted herein. 371 | You are not responsible for enforcing compliance by third parties with 372 | this License. 373 | 374 | 11. If, as a consequence of a court judgment or allegation of patent 375 | infringement or for any other reason (not limited to patent issues), 376 | conditions are imposed on you (whether by court order, agreement or 377 | otherwise) that contradict the conditions of this License, they do not 378 | excuse you from the conditions of this License. If you cannot 379 | distribute so as to satisfy simultaneously your obligations under this 380 | License and any other pertinent obligations, then as a consequence you 381 | may not distribute the Library at all. For example, if a patent 382 | license would not permit royalty-free redistribution of the Library by 383 | all those who receive copies directly or indirectly through you, then 384 | the only way you could satisfy both it and this License would be to 385 | refrain entirely from distribution of the Library. 386 | 387 | If any portion of this section is held invalid or unenforceable under any 388 | particular circumstance, the balance of the section is intended to apply, 389 | and the section as a whole is intended to apply in other circumstances. 390 | 391 | It is not the purpose of this section to induce you to infringe any 392 | patents or other property right claims or to contest validity of any 393 | such claims; this section has the sole purpose of protecting the 394 | integrity of the free software distribution system which is 395 | implemented by public license practices. Many people have made 396 | generous contributions to the wide range of software distributed 397 | through that system in reliance on consistent application of that 398 | system; it is up to the author/donor to decide if he or she is willing 399 | to distribute software through any other system and a licensee cannot 400 | impose that choice. 401 | 402 | This section is intended to make thoroughly clear what is believed to 403 | be a consequence of the rest of this License. 404 | 405 | 12. If the distribution and/or use of the Library is restricted in 406 | certain countries either by patents or by copyrighted interfaces, the 407 | original copyright holder who places the Library under this License may add 408 | an explicit geographical distribution limitation excluding those countries, 409 | so that distribution is permitted only in or among countries not thus 410 | excluded. In such case, this License incorporates the limitation as if 411 | written in the body of this License. 412 | 413 | 13. The Free Software Foundation may publish revised and/or new 414 | versions of the Lesser General Public License from time to time. 415 | Such new versions will be similar in spirit to the present version, 416 | but may differ in detail to address new problems or concerns. 417 | 418 | Each version is given a distinguishing version number. If the Library 419 | specifies a version number of this License which applies to it and 420 | "any later version", you have the option of following the terms and 421 | conditions either of that version or of any later version published by 422 | the Free Software Foundation. If the Library does not specify a 423 | license version number, you may choose any version ever published by 424 | the Free Software Foundation. 425 | 426 | 14. If you wish to incorporate parts of the Library into other free 427 | programs whose distribution conditions are incompatible with these, 428 | write to the author to ask for permission. For software which is 429 | copyrighted by the Free Software Foundation, write to the Free 430 | Software Foundation; we sometimes make exceptions for this. Our 431 | decision will be guided by the two goals of preserving the free status 432 | of all derivatives of our free software and of promoting the sharing 433 | and reuse of software generally. 434 | 435 | NO WARRANTY 436 | 437 | 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO 438 | WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 439 | EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR 440 | OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY 441 | KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE 442 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 443 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE 444 | LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME 445 | THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 446 | 447 | 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN 448 | WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY 449 | AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU 450 | FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR 451 | CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE 452 | LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING 453 | RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A 454 | FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF 455 | SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH 456 | DAMAGES. 457 | 458 | END OF TERMS AND CONDITIONS 459 | 460 | How to Apply These Terms to Your New Libraries 461 | 462 | If you develop a new library, and you want it to be of the greatest 463 | possible use to the public, we recommend making it free software that 464 | everyone can redistribute and change. You can do so by permitting 465 | redistribution under these terms (or, alternatively, under the terms of the 466 | ordinary General Public License). 467 | 468 | To apply these terms, attach the following notices to the library. It is 469 | safest to attach them to the start of each source file to most effectively 470 | convey the exclusion of warranty; and each file should have at least the 471 | "copyright" line and a pointer to where the full notice is found. 472 | 473 | 474 | Copyright (C) 475 | 476 | This library is free software; you can redistribute it and/or 477 | modify it under the terms of the GNU Lesser General Public 478 | License as published by the Free Software Foundation; either 479 | version 2.1 of the License, or (at your option) any later version. 480 | 481 | This library is distributed in the hope that it will be useful, 482 | but WITHOUT ANY WARRANTY; without even the implied warranty of 483 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 484 | Lesser General Public License for more details. 485 | 486 | You should have received a copy of the GNU Lesser General Public 487 | License along with this library; if not, write to the Free Software 488 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 489 | 490 | Also add information on how to contact you by electronic and paper mail. 491 | 492 | You should also get your employer (if you work as a programmer) or your 493 | school, if any, to sign a "copyright disclaimer" for the library, if 494 | necessary. Here is a sample; alter the names: 495 | 496 | Yoyodyne, Inc., hereby disclaims all copyright interest in the 497 | library `Frob' (a library for tweaking knobs) written by James Random Hacker. 498 | 499 | , 1 April 1990 500 | Ty Coon, President of Vice 501 | 502 | That's all there is to it! 503 | 504 | 505 | -------------------------------------------------------------------------------- /src/phputf8/README: -------------------------------------------------------------------------------- 1 | ++PHP UTF-8++ 2 | 3 | Version 0.5 4 | 5 | ++DOCUMENTATION++ 6 | 7 | Documentation in progress in ./docs dir 8 | 9 | http://www.phpwact.org/php/i18n/charsets 10 | http://www.phpwact.org/php/i18n/utf-8 11 | 12 | Important Note: DO NOT use these functions without understanding WHY 13 | you are using them. In particular, do not blindly replace all use of PHP's 14 | string functions which functions found here - most of the time you will 15 | not need to, and you will be introducing a significant performance 16 | overhead to your application. You can get a good idea of when to use what 17 | from reading: http://www.phpwact.org/php/i18n/utf-8 18 | 19 | Important Note: For sake of performance most of the functions here are 20 | not "defensive" (e.g. there is not extensive parameter checking, well 21 | formed UTF-8 is assumed). This is particularily relevant when is comes to 22 | catching badly formed UTF-8 - you should screen input on the "outer 23 | perimeter" with help from functions in the utf8_validation.php and 24 | utf8_bad.php files. 25 | 26 | Important Note: this library treats ALL ASCII characters as valid, including ASCII control characters. But if you use some ASCII control characters in XML, it will render the XML ill-formed. Don't be a bozo: http://hsivonen.iki.fi/producing-xml/#controlchar 27 | 28 | ++BUGS / SUPPORT / FEATURE REQUESTS ++ 29 | 30 | Please report bugs to: 31 | http://sourceforge.net/tracker/?group_id=142846&atid=753842 32 | - if you are able, please submit a failing unit test 33 | (http://www.lastcraft.com/simple_test.php) with your bug report. 34 | 35 | For feature requests / faster implementation of functions found here, 36 | please drop them in via the RFE tracker: http://sourceforge.net/tracker/?group_id=142846&atid=753845 37 | Particularily interested in faster implementations! 38 | 39 | For general support / help, use: 40 | http://sourceforge.net/tracker/?group_id=142846&atid=753843 41 | 42 | In the VERY WORST case, you can email me: hfuecks gmail com - I tend to be slow to respond though so be warned. 43 | 44 | Important Note: when reporting bugs, please provide the following 45 | information; 46 | 47 | PHP version, whether the iconv extension is loaded (in PHP5 it's 48 | there by default), whether the mbstring extension is loaded. The 49 | following PHP script can be used to determine this information; 50 | 51 | "; 53 | if ( extension_loaded('mbstring') ) { 54 | print "mbstring available
"; 55 | } else { 56 | print "mbstring not available
"; 57 | } 58 | if ( extension_loaded('iconv') ) { 59 | print "iconv available
"; 60 | } else { 61 | print "iconv not available
"; 62 | } 63 | ?> 64 | 65 | ++LICENSING++ 66 | 67 | Parts of the code in this library come from other places, under different 68 | licenses. 69 | The authors involved have been contacted (see below). Attribution for 70 | which code came from elsewhere can be found in the source code itself. 71 | 72 | +Andreas Gohr / Chris Smith - Dokuwiki 73 | There is a fair degree of collaboration / exchange of ideas and code 74 | beteen Dokuwiki's UTF-8 library; 75 | http://dev.splitbrain.org/view/darcs/dokuwiki/inc/utf8.php 76 | and phputf8. Although Dokuwiki is released under GPL, its UTF-8 77 | library is released under LGPL, hence no conflict with phputf8 78 | 79 | +Henri Sivonen (http://hsivonen.iki.fi/php-utf8/ / 80 | http://hsivonen.iki.fi/php-utf8/) has also given permission for his 81 | code to be released under the terms of the LGPL. He ported a Unicode / UTF-8 82 | converter from the Mozilla codebase to PHP, which is re-used in phputf8 83 | -------------------------------------------------------------------------------- /src/phputf8/mbstring/core.php: -------------------------------------------------------------------------------- 1 | 25 | * @link http://www.php.net/manual/en/function.strlen.php 26 | * @link http://www.php.net/manual/en/function.utf8-decode.php 27 | * @param string UTF-8 string 28 | * @return int number of UTF-8 characters in string 29 | * @package utf8 30 | */ 31 | function utf8_strlen($str) 32 | { 33 | return strlen(utf8_decode($str)); 34 | } 35 | 36 | 37 | //-------------------------------------------------------------------- 38 | /** 39 | * UTF-8 aware alternative to strpos 40 | * Find position of first occurrence of a string 41 | * Note: This will get alot slower if offset is used 42 | * Note: requires utf8_strlen amd utf8_substr to be loaded 43 | * @param string haystack 44 | * @param string needle (you should validate this with utf8_is_valid) 45 | * @param integer offset in characters (from left) 46 | * @return mixed integer position or FALSE on failure 47 | * @see http://www.php.net/strpos 48 | * @see utf8_strlen 49 | * @see utf8_substr 50 | * @package utf8 51 | */ 52 | function utf8_strpos($str, $needle, $offset = null) 53 | { 54 | if (is_null($offset)) { 55 | $ar = explode($needle, $str, 2); 56 | if (count($ar) > 1) { 57 | return utf8_strlen($ar[0]); 58 | } 59 | return false; 60 | } else { 61 | if (!is_int($offset)) { 62 | trigger_error('utf8_strpos: Offset must be an integer', E_USER_ERROR); 63 | return false; 64 | } 65 | 66 | $str = utf8_substr($str, $offset); 67 | 68 | if (false !== ($pos = utf8_strpos($str, $needle))) { 69 | return $pos + $offset; 70 | } 71 | 72 | return false; 73 | } 74 | } 75 | 76 | //-------------------------------------------------------------------- 77 | /** 78 | * UTF-8 aware alternative to strrpos 79 | * Find position of last occurrence of a char in a string 80 | * Note: This will get alot slower if offset is used 81 | * Note: requires utf8_substr and utf8_strlen to be loaded 82 | * @param string haystack 83 | * @param string needle (you should validate this with utf8_is_valid) 84 | * @param integer (optional) offset (from left) 85 | * @return mixed integer position or FALSE on failure 86 | * @see http://www.php.net/strrpos 87 | * @see utf8_substr 88 | * @see utf8_strlen 89 | * @package utf8 90 | */ 91 | function utf8_strrpos($str, $needle, $offset = null) 92 | { 93 | if (is_null($offset)) { 94 | $ar = explode($needle, $str); 95 | 96 | if (count($ar) > 1) { 97 | // Pop off the end of the string where the last match was made 98 | array_pop($ar); 99 | $str = join($needle, $ar); 100 | return utf8_strlen($str); 101 | } 102 | return false; 103 | } else { 104 | if (!is_int($offset)) { 105 | trigger_error('utf8_strrpos expects parameter 3 to be long', E_USER_WARNING); 106 | return false; 107 | } 108 | 109 | $str = utf8_substr($str, $offset); 110 | 111 | if (false !== ($pos = utf8_strrpos($str, $needle))) { 112 | return $pos + $offset; 113 | } 114 | 115 | return false; 116 | } 117 | } 118 | 119 | //-------------------------------------------------------------------- 120 | /** 121 | * UTF-8 aware alternative to substr 122 | * Return part of a string given character offset (and optionally length) 123 | * 124 | * Note arguments: comparied to substr - if offset or length are 125 | * not integers, this version will not complain but rather massages them 126 | * into an integer. 127 | * 128 | * Note on returned values: substr documentation states false can be 129 | * returned in some cases (e.g. offset > string length) 130 | * mb_substr never returns false, it will return an empty string instead. 131 | * This adopts the mb_substr approach 132 | * 133 | * Note on implementation: PCRE only supports repetitions of less than 134 | * 65536, in order to accept up to MAXINT values for offset and length, 135 | * we'll repeat a group of 65535 characters when needed. 136 | * 137 | * Note on implementation: calculating the number of characters in the 138 | * string is a relatively expensive operation, so we only carry it out when 139 | * necessary. It isn't necessary for +ve offsets and no specified length 140 | * 141 | * @author Chris Smith 142 | * @param string 143 | * @param integer number of UTF-8 characters offset (from left) 144 | * @param integer (optional) length in UTF-8 characters from offset 145 | * @return mixed string or FALSE if failure 146 | * @package utf8 147 | */ 148 | function utf8_substr($str, $offset, $length = null) 149 | { 150 | // generates E_NOTICE 151 | // for PHP4 objects, but not PHP5 objects 152 | $str = (string)$str; 153 | $offset = (int)$offset; 154 | if (!is_null($length)) { 155 | $length = (int)$length; 156 | } 157 | 158 | // handle trivial cases 159 | if ($length === 0) { 160 | return ''; 161 | } 162 | if ($offset < 0 && $length < 0 && $length < $offset) { 163 | return ''; 164 | } 165 | 166 | // normalise negative offsets (we could use a tail 167 | // anchored pattern, but they are horribly slow!) 168 | if ($offset < 0) { 169 | // see notes 170 | $strlen = strlen(utf8_decode($str)); 171 | $offset = $strlen + $offset; 172 | if ($offset < 0) { 173 | $offset = 0; 174 | } 175 | } 176 | 177 | $Op = ''; 178 | $Lp = ''; 179 | 180 | // establish a pattern for offset, a 181 | // non-captured group equal in length to offset 182 | if ($offset > 0) { 183 | $Ox = (int)($offset / 65535); 184 | $Oy = $offset % 65535; 185 | 186 | if ($Ox) { 187 | $Op = '(?:.{65535}){' . $Ox . '}'; 188 | } 189 | 190 | $Op = '^(?:' . $Op . '.{' . $Oy . '})'; 191 | } else { 192 | // offset == 0; just anchor the pattern 193 | $Op = '^'; 194 | } 195 | 196 | // establish a pattern for length 197 | if (is_null($length)) { 198 | // the rest of the string 199 | $Lp = '(.*)$'; 200 | } else { 201 | if (!isset($strlen)) { 202 | // see notes 203 | $strlen = strlen(utf8_decode($str)); 204 | } 205 | 206 | // another trivial case 207 | if ($offset > $strlen) { 208 | return ''; 209 | } 210 | 211 | if ($length > 0) { 212 | // reduce any length that would 213 | // go passed the end of the string 214 | $length = min($strlen - $offset, $length); 215 | 216 | $Lx = (int)($length / 65535); 217 | $Ly = $length % 65535; 218 | 219 | // negative length requires a captured group 220 | // of length characters 221 | if ($Lx) { 222 | $Lp = '(?:.{65535}){' . $Lx . '}'; 223 | } 224 | $Lp = '(' . $Lp . '.{' . $Ly . '})'; 225 | } elseif ($length < 0) { 226 | if ($length < ($offset - $strlen)) { 227 | return ''; 228 | } 229 | 230 | $Lx = (int)((-$length) / 65535); 231 | $Ly = (-$length) % 65535; 232 | 233 | // negative length requires ... capture everything 234 | // except a group of -length characters 235 | // anchored at the tail-end of the string 236 | if ($Lx) { 237 | $Lp = '(?:.{65535}){' . $Lx . '}'; 238 | } 239 | $Lp = '(.*)(?:' . $Lp . '.{' . $Ly . '})$'; 240 | } 241 | } 242 | 243 | if (!preg_match('#' . $Op . $Lp . '#us', $str, $match)) { 244 | return ''; 245 | } 246 | 247 | return $match[1]; 248 | } 249 | 250 | //--------------------------------------------------------------- 251 | /** 252 | * UTF-8 aware alternative to strtolower 253 | * Make a string lowercase 254 | * Note: The concept of a characters "case" only exists is some alphabets 255 | * such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does 256 | * not exist in the Chinese alphabet, for example. See Unicode Standard 257 | * Annex #21: Case Mappings 258 | * Note: requires utf8_to_unicode and utf8_from_unicode 259 | * @author Andreas Gohr 260 | * @param string 261 | * @return mixed either string in lowercase or FALSE is UTF-8 invalid 262 | * @see http://www.php.net/strtolower 263 | * @see utf8_to_unicode 264 | * @see utf8_from_unicode 265 | * @see http://www.unicode.org/reports/tr21/tr21-5.html 266 | * @see http://dev.splitbrain.org/view/darcs/dokuwiki/inc/utf8.php 267 | * @package utf8 268 | */ 269 | function utf8_strtolower($string) 270 | { 271 | static $UTF8_UPPER_TO_LOWER = null; 272 | 273 | if (is_null($UTF8_UPPER_TO_LOWER)) { 274 | $UTF8_UPPER_TO_LOWER = [ 275 | 0x0041 => 0x0061, 0x03A6 => 0x03C6, 0x0162 => 0x0163, 0x00C5 => 0x00E5, 0x0042 => 0x0062, 276 | 0x0139 => 0x013A, 0x00C1 => 0x00E1, 0x0141 => 0x0142, 0x038E => 0x03CD, 0x0100 => 0x0101, 277 | 0x0490 => 0x0491, 0x0394 => 0x03B4, 0x015A => 0x015B, 0x0044 => 0x0064, 0x0393 => 0x03B3, 278 | 0x00D4 => 0x00F4, 0x042A => 0x044A, 0x0419 => 0x0439, 0x0112 => 0x0113, 0x041C => 0x043C, 279 | 0x015E => 0x015F, 0x0143 => 0x0144, 0x00CE => 0x00EE, 0x040E => 0x045E, 0x042F => 0x044F, 280 | 0x039A => 0x03BA, 0x0154 => 0x0155, 0x0049 => 0x0069, 0x0053 => 0x0073, 0x1E1E => 0x1E1F, 281 | 0x0134 => 0x0135, 0x0427 => 0x0447, 0x03A0 => 0x03C0, 0x0418 => 0x0438, 0x00D3 => 0x00F3, 282 | 0x0420 => 0x0440, 0x0404 => 0x0454, 0x0415 => 0x0435, 0x0429 => 0x0449, 0x014A => 0x014B, 283 | 0x0411 => 0x0431, 0x0409 => 0x0459, 0x1E02 => 0x1E03, 0x00D6 => 0x00F6, 0x00D9 => 0x00F9, 284 | 0x004E => 0x006E, 0x0401 => 0x0451, 0x03A4 => 0x03C4, 0x0423 => 0x0443, 0x015C => 0x015D, 285 | 0x0403 => 0x0453, 0x03A8 => 0x03C8, 0x0158 => 0x0159, 0x0047 => 0x0067, 0x00C4 => 0x00E4, 286 | 0x0386 => 0x03AC, 0x0389 => 0x03AE, 0x0166 => 0x0167, 0x039E => 0x03BE, 0x0164 => 0x0165, 287 | 0x0116 => 0x0117, 0x0108 => 0x0109, 0x0056 => 0x0076, 0x00DE => 0x00FE, 0x0156 => 0x0157, 288 | 0x00DA => 0x00FA, 0x1E60 => 0x1E61, 0x1E82 => 0x1E83, 0x00C2 => 0x00E2, 0x0118 => 0x0119, 289 | 0x0145 => 0x0146, 0x0050 => 0x0070, 0x0150 => 0x0151, 0x042E => 0x044E, 0x0128 => 0x0129, 290 | 0x03A7 => 0x03C7, 0x013D => 0x013E, 0x0422 => 0x0442, 0x005A => 0x007A, 0x0428 => 0x0448, 291 | 0x03A1 => 0x03C1, 0x1E80 => 0x1E81, 0x016C => 0x016D, 0x00D5 => 0x00F5, 0x0055 => 0x0075, 292 | 0x0176 => 0x0177, 0x00DC => 0x00FC, 0x1E56 => 0x1E57, 0x03A3 => 0x03C3, 0x041A => 0x043A, 293 | 0x004D => 0x006D, 0x016A => 0x016B, 0x0170 => 0x0171, 0x0424 => 0x0444, 0x00CC => 0x00EC, 294 | 0x0168 => 0x0169, 0x039F => 0x03BF, 0x004B => 0x006B, 0x00D2 => 0x00F2, 0x00C0 => 0x00E0, 295 | 0x0414 => 0x0434, 0x03A9 => 0x03C9, 0x1E6A => 0x1E6B, 0x00C3 => 0x00E3, 0x042D => 0x044D, 296 | 0x0416 => 0x0436, 0x01A0 => 0x01A1, 0x010C => 0x010D, 0x011C => 0x011D, 0x00D0 => 0x00F0, 297 | 0x013B => 0x013C, 0x040F => 0x045F, 0x040A => 0x045A, 0x00C8 => 0x00E8, 0x03A5 => 0x03C5, 298 | 0x0046 => 0x0066, 0x00DD => 0x00FD, 0x0043 => 0x0063, 0x021A => 0x021B, 0x00CA => 0x00EA, 299 | 0x0399 => 0x03B9, 0x0179 => 0x017A, 0x00CF => 0x00EF, 0x01AF => 0x01B0, 0x0045 => 0x0065, 300 | 0x039B => 0x03BB, 0x0398 => 0x03B8, 0x039C => 0x03BC, 0x040C => 0x045C, 0x041F => 0x043F, 301 | 0x042C => 0x044C, 0x00DE => 0x00FE, 0x00D0 => 0x00F0, 0x1EF2 => 0x1EF3, 0x0048 => 0x0068, 302 | 0x00CB => 0x00EB, 0x0110 => 0x0111, 0x0413 => 0x0433, 0x012E => 0x012F, 0x00C6 => 0x00E6, 303 | 0x0058 => 0x0078, 0x0160 => 0x0161, 0x016E => 0x016F, 0x0391 => 0x03B1, 0x0407 => 0x0457, 304 | 0x0172 => 0x0173, 0x0178 => 0x00FF, 0x004F => 0x006F, 0x041B => 0x043B, 0x0395 => 0x03B5, 305 | 0x0425 => 0x0445, 0x0120 => 0x0121, 0x017D => 0x017E, 0x017B => 0x017C, 0x0396 => 0x03B6, 306 | 0x0392 => 0x03B2, 0x0388 => 0x03AD, 0x1E84 => 0x1E85, 0x0174 => 0x0175, 0x0051 => 0x0071, 307 | 0x0417 => 0x0437, 0x1E0A => 0x1E0B, 0x0147 => 0x0148, 0x0104 => 0x0105, 0x0408 => 0x0458, 308 | 0x014C => 0x014D, 0x00CD => 0x00ED, 0x0059 => 0x0079, 0x010A => 0x010B, 0x038F => 0x03CE, 309 | 0x0052 => 0x0072, 0x0410 => 0x0430, 0x0405 => 0x0455, 0x0402 => 0x0452, 0x0126 => 0x0127, 310 | 0x0136 => 0x0137, 0x012A => 0x012B, 0x038A => 0x03AF, 0x042B => 0x044B, 0x004C => 0x006C, 311 | 0x0397 => 0x03B7, 0x0124 => 0x0125, 0x0218 => 0x0219, 0x00DB => 0x00FB, 0x011E => 0x011F, 312 | 0x041E => 0x043E, 0x1E40 => 0x1E41, 0x039D => 0x03BD, 0x0106 => 0x0107, 0x03AB => 0x03CB, 313 | 0x0426 => 0x0446, 0x00DE => 0x00FE, 0x00C7 => 0x00E7, 0x03AA => 0x03CA, 0x0421 => 0x0441, 314 | 0x0412 => 0x0432, 0x010E => 0x010F, 0x00D8 => 0x00F8, 0x0057 => 0x0077, 0x011A => 0x011B, 315 | 0x0054 => 0x0074, 0x004A => 0x006A, 0x040B => 0x045B, 0x0406 => 0x0456, 0x0102 => 0x0103, 316 | 0x039B => 0x03BB, 0x00D1 => 0x00F1, 0x041D => 0x043D, 0x038C => 0x03CC, 0x00C9 => 0x00E9, 317 | 0x00D0 => 0x00F0, 0x0407 => 0x0457, 0x0122 => 0x0123, 318 | ]; 319 | } 320 | 321 | $uni = utf8_to_unicode($string); 322 | 323 | if (!$uni) { 324 | return false; 325 | } 326 | 327 | $cnt = count($uni); 328 | for ($i = 0; $i < $cnt; $i++) { 329 | if (isset($UTF8_UPPER_TO_LOWER[$uni[$i]])) { 330 | $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]]; 331 | } 332 | } 333 | 334 | return utf8_from_unicode($uni); 335 | } 336 | 337 | //--------------------------------------------------------------- 338 | /** 339 | * UTF-8 aware alternative to strtoupper 340 | * Make a string uppercase 341 | * Note: The concept of a characters "case" only exists is some alphabets 342 | * such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does 343 | * not exist in the Chinese alphabet, for example. See Unicode Standard 344 | * Annex #21: Case Mappings 345 | * Note: requires utf8_to_unicode and utf8_from_unicode 346 | * @author Andreas Gohr 347 | * @param string 348 | * @return mixed either string in lowercase or FALSE is UTF-8 invalid 349 | * @see http://www.php.net/strtoupper 350 | * @see utf8_to_unicode 351 | * @see utf8_from_unicode 352 | * @see http://www.unicode.org/reports/tr21/tr21-5.html 353 | * @see http://dev.splitbrain.org/view/darcs/dokuwiki/inc/utf8.php 354 | * @package utf8 355 | */ 356 | function utf8_strtoupper($string) 357 | { 358 | static $UTF8_LOWER_TO_UPPER = null; 359 | 360 | if (is_null($UTF8_LOWER_TO_UPPER)) { 361 | $UTF8_LOWER_TO_UPPER = [ 362 | 0x0061 => 0x0041, 0x03C6 => 0x03A6, 0x0163 => 0x0162, 0x00E5 => 0x00C5, 0x0062 => 0x0042, 363 | 0x013A => 0x0139, 0x00E1 => 0x00C1, 0x0142 => 0x0141, 0x03CD => 0x038E, 0x0101 => 0x0100, 364 | 0x0491 => 0x0490, 0x03B4 => 0x0394, 0x015B => 0x015A, 0x0064 => 0x0044, 0x03B3 => 0x0393, 365 | 0x00F4 => 0x00D4, 0x044A => 0x042A, 0x0439 => 0x0419, 0x0113 => 0x0112, 0x043C => 0x041C, 366 | 0x015F => 0x015E, 0x0144 => 0x0143, 0x00EE => 0x00CE, 0x045E => 0x040E, 0x044F => 0x042F, 367 | 0x03BA => 0x039A, 0x0155 => 0x0154, 0x0069 => 0x0049, 0x0073 => 0x0053, 0x1E1F => 0x1E1E, 368 | 0x0135 => 0x0134, 0x0447 => 0x0427, 0x03C0 => 0x03A0, 0x0438 => 0x0418, 0x00F3 => 0x00D3, 369 | 0x0440 => 0x0420, 0x0454 => 0x0404, 0x0435 => 0x0415, 0x0449 => 0x0429, 0x014B => 0x014A, 370 | 0x0431 => 0x0411, 0x0459 => 0x0409, 0x1E03 => 0x1E02, 0x00F6 => 0x00D6, 0x00F9 => 0x00D9, 371 | 0x006E => 0x004E, 0x0451 => 0x0401, 0x03C4 => 0x03A4, 0x0443 => 0x0423, 0x015D => 0x015C, 372 | 0x0453 => 0x0403, 0x03C8 => 0x03A8, 0x0159 => 0x0158, 0x0067 => 0x0047, 0x00E4 => 0x00C4, 373 | 0x03AC => 0x0386, 0x03AE => 0x0389, 0x0167 => 0x0166, 0x03BE => 0x039E, 0x0165 => 0x0164, 374 | 0x0117 => 0x0116, 0x0109 => 0x0108, 0x0076 => 0x0056, 0x00FE => 0x00DE, 0x0157 => 0x0156, 375 | 0x00FA => 0x00DA, 0x1E61 => 0x1E60, 0x1E83 => 0x1E82, 0x00E2 => 0x00C2, 0x0119 => 0x0118, 376 | 0x0146 => 0x0145, 0x0070 => 0x0050, 0x0151 => 0x0150, 0x044E => 0x042E, 0x0129 => 0x0128, 377 | 0x03C7 => 0x03A7, 0x013E => 0x013D, 0x0442 => 0x0422, 0x007A => 0x005A, 0x0448 => 0x0428, 378 | 0x03C1 => 0x03A1, 0x1E81 => 0x1E80, 0x016D => 0x016C, 0x00F5 => 0x00D5, 0x0075 => 0x0055, 379 | 0x0177 => 0x0176, 0x00FC => 0x00DC, 0x1E57 => 0x1E56, 0x03C3 => 0x03A3, 0x043A => 0x041A, 380 | 0x006D => 0x004D, 0x016B => 0x016A, 0x0171 => 0x0170, 0x0444 => 0x0424, 0x00EC => 0x00CC, 381 | 0x0169 => 0x0168, 0x03BF => 0x039F, 0x006B => 0x004B, 0x00F2 => 0x00D2, 0x00E0 => 0x00C0, 382 | 0x0434 => 0x0414, 0x03C9 => 0x03A9, 0x1E6B => 0x1E6A, 0x00E3 => 0x00C3, 0x044D => 0x042D, 383 | 0x0436 => 0x0416, 0x01A1 => 0x01A0, 0x010D => 0x010C, 0x011D => 0x011C, 0x00F0 => 0x00D0, 384 | 0x013C => 0x013B, 0x045F => 0x040F, 0x045A => 0x040A, 0x00E8 => 0x00C8, 0x03C5 => 0x03A5, 385 | 0x0066 => 0x0046, 0x00FD => 0x00DD, 0x0063 => 0x0043, 0x021B => 0x021A, 0x00EA => 0x00CA, 386 | 0x03B9 => 0x0399, 0x017A => 0x0179, 0x00EF => 0x00CF, 0x01B0 => 0x01AF, 0x0065 => 0x0045, 387 | 0x03BB => 0x039B, 0x03B8 => 0x0398, 0x03BC => 0x039C, 0x045C => 0x040C, 0x043F => 0x041F, 388 | 0x044C => 0x042C, 0x00FE => 0x00DE, 0x00F0 => 0x00D0, 0x1EF3 => 0x1EF2, 0x0068 => 0x0048, 389 | 0x00EB => 0x00CB, 0x0111 => 0x0110, 0x0433 => 0x0413, 0x012F => 0x012E, 0x00E6 => 0x00C6, 390 | 0x0078 => 0x0058, 0x0161 => 0x0160, 0x016F => 0x016E, 0x03B1 => 0x0391, 0x0457 => 0x0407, 391 | 0x0173 => 0x0172, 0x00FF => 0x0178, 0x006F => 0x004F, 0x043B => 0x041B, 0x03B5 => 0x0395, 392 | 0x0445 => 0x0425, 0x0121 => 0x0120, 0x017E => 0x017D, 0x017C => 0x017B, 0x03B6 => 0x0396, 393 | 0x03B2 => 0x0392, 0x03AD => 0x0388, 0x1E85 => 0x1E84, 0x0175 => 0x0174, 0x0071 => 0x0051, 394 | 0x0437 => 0x0417, 0x1E0B => 0x1E0A, 0x0148 => 0x0147, 0x0105 => 0x0104, 0x0458 => 0x0408, 395 | 0x014D => 0x014C, 0x00ED => 0x00CD, 0x0079 => 0x0059, 0x010B => 0x010A, 0x03CE => 0x038F, 396 | 0x0072 => 0x0052, 0x0430 => 0x0410, 0x0455 => 0x0405, 0x0452 => 0x0402, 0x0127 => 0x0126, 397 | 0x0137 => 0x0136, 0x012B => 0x012A, 0x03AF => 0x038A, 0x044B => 0x042B, 0x006C => 0x004C, 398 | 0x03B7 => 0x0397, 0x0125 => 0x0124, 0x0219 => 0x0218, 0x00FB => 0x00DB, 0x011F => 0x011E, 399 | 0x043E => 0x041E, 0x1E41 => 0x1E40, 0x03BD => 0x039D, 0x0107 => 0x0106, 0x03CB => 0x03AB, 400 | 0x0446 => 0x0426, 0x00FE => 0x00DE, 0x00E7 => 0x00C7, 0x03CA => 0x03AA, 0x0441 => 0x0421, 401 | 0x0432 => 0x0412, 0x010F => 0x010E, 0x00F8 => 0x00D8, 0x0077 => 0x0057, 0x011B => 0x011A, 402 | 0x0074 => 0x0054, 0x006A => 0x004A, 0x045B => 0x040B, 0x0456 => 0x0406, 0x0103 => 0x0102, 403 | 0x03BB => 0x039B, 0x00F1 => 0x00D1, 0x043D => 0x041D, 0x03CC => 0x038C, 0x00E9 => 0x00C9, 404 | 0x00F0 => 0x00D0, 0x0457 => 0x0407, 0x0123 => 0x0122, 405 | ]; 406 | } 407 | 408 | $uni = utf8_to_unicode($string); 409 | 410 | if (!$uni) { 411 | return false; 412 | } 413 | 414 | $cnt = count($uni); 415 | for ($i = 0; $i < $cnt; $i++) { 416 | if (isset($UTF8_LOWER_TO_UPPER[$uni[$i]])) { 417 | $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]]; 418 | } 419 | } 420 | 421 | return utf8_from_unicode($uni); 422 | } 423 | -------------------------------------------------------------------------------- /src/phputf8/ord.php: -------------------------------------------------------------------------------- 1 | = 0 && $ord0 <= 127) { 27 | return $ord0; 28 | } 29 | 30 | if (!isset($chr[1])) { 31 | trigger_error('Short sequence - at least 2 bytes expected, only 1 seen'); 32 | return false; 33 | } 34 | 35 | $ord1 = ord($chr[1]); 36 | if ($ord0 >= 192 && $ord0 <= 223) { 37 | return ($ord0 - 192) * 64 38 | + ($ord1 - 128); 39 | } 40 | 41 | if (!isset($chr[2])) { 42 | trigger_error('Short sequence - at least 3 bytes expected, only 2 seen'); 43 | return false; 44 | } 45 | $ord2 = ord($chr[2]); 46 | if ($ord0 >= 224 && $ord0 <= 239) { 47 | return ($ord0 - 224) * 4096 48 | + ($ord1 - 128) * 64 49 | + ($ord2 - 128); 50 | } 51 | 52 | if (!isset($chr[3])) { 53 | trigger_error('Short sequence - at least 4 bytes expected, only 3 seen'); 54 | return false; 55 | } 56 | $ord3 = ord($chr[3]); 57 | if ($ord0 >= 240 && $ord0 <= 247) { 58 | return ($ord0 - 240) * 262144 59 | + ($ord1 - 128) * 4096 60 | + ($ord2 - 128) * 64 61 | + ($ord3 - 128); 62 | } 63 | 64 | if (!isset($chr[4])) { 65 | trigger_error('Short sequence - at least 5 bytes expected, only 4 seen'); 66 | return false; 67 | } 68 | $ord4 = ord($chr[4]); 69 | if ($ord0 >= 248 && $ord0 <= 251) { 70 | return ($ord0 - 248) * 16777216 71 | + ($ord1 - 128) * 262144 72 | + ($ord2 - 128) * 4096 73 | + ($ord3 - 128) * 64 74 | + ($ord4 - 128); 75 | } 76 | 77 | if (!isset($chr[5])) { 78 | trigger_error('Short sequence - at least 6 bytes expected, only 5 seen'); 79 | return false; 80 | } 81 | if ($ord0 >= 252 && $ord0 <= 253) { 82 | return ($ord0 - 252) * 1073741824 83 | + ($ord1 - 128) * 16777216 84 | + ($ord2 - 128) * 262144 85 | + ($ord3 - 128) * 4096 86 | + ($ord4 - 128) * 64 87 | + (ord($chr[5]) - 128); 88 | } 89 | 90 | if ($ord0 >= 254 && $ord0 <= 255) { 91 | trigger_error('Invalid UTF-8 with surrogate ordinal ' . $ord0); 92 | return false; 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/phputf8/str_ireplace.php: -------------------------------------------------------------------------------- 1 | 12 | * @param string $input 13 | * @param int $length 14 | * @param string $padStr 15 | * @param int $type ( same constants as str_pad ) 16 | * @return string 17 | * @see http://www.php.net/str_pad 18 | * @see utf8_substr 19 | * @package utf8 20 | */ 21 | function utf8_str_pad($input, $length, $padStr = ' ', $type = STR_PAD_RIGHT) 22 | { 23 | $inputLen = utf8_strlen($input); 24 | if ($length <= $inputLen) { 25 | return $input; 26 | } 27 | 28 | $padStrLen = utf8_strlen($padStr); 29 | $padLen = $length - $inputLen; 30 | 31 | if ($type == STR_PAD_RIGHT) { 32 | $repeatTimes = ceil($padLen / $padStrLen); 33 | return utf8_substr($input . str_repeat($padStr, $repeatTimes), 0, $length); 34 | } 35 | 36 | if ($type == STR_PAD_LEFT) { 37 | $repeatTimes = ceil($padLen / $padStrLen); 38 | return utf8_substr(str_repeat($padStr, $repeatTimes), 0, floor($padLen)) . $input; 39 | } 40 | 41 | if ($type == STR_PAD_BOTH) { 42 | $padLen /= 2; 43 | $padAmountLeft = floor($padLen); 44 | $padAmountRight = ceil($padLen); 45 | $repeatTimesLeft = ceil($padAmountLeft / $padStrLen); 46 | $repeatTimesRight = ceil($padAmountRight / $padStrLen); 47 | 48 | $paddingLeft = utf8_substr(str_repeat($padStr, $repeatTimesLeft), 0, $padAmountLeft); 49 | $paddingRight = utf8_substr(str_repeat($padStr, $repeatTimesRight), 0, $padAmountLeft); 50 | return $paddingLeft . $input . $paddingRight; 51 | } 52 | 53 | trigger_error('utf8_str_pad: Unknown padding type (' . $type . ')', E_USER_ERROR); 54 | } 55 | -------------------------------------------------------------------------------- /src/phputf8/str_split.php: -------------------------------------------------------------------------------- 1 | 14 | * @see http://www.php.net/ltrim 15 | * @see http://dev.splitbrain.org/view/darcs/dokuwiki/inc/utf8.php 16 | * @return string 17 | * @package utf8 18 | */ 19 | function utf8_ltrim($str, $charlist = false) 20 | { 21 | if ($charlist === false) { 22 | return ltrim($str); 23 | } 24 | 25 | //quote charlist for use in a characterclass 26 | $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!', '\\\${1}', $charlist); 27 | 28 | return preg_replace('/^[' . $charlist . ']+/u', '', $str); 29 | } 30 | 31 | //--------------------------------------------------------------- 32 | /** 33 | * UTF-8 aware replacement for rtrim() 34 | * Note: you only need to use this if you are supplying the charlist 35 | * optional arg and it contains UTF-8 characters. Otherwise rtrim will 36 | * work normally on a UTF-8 string 37 | * @author Andreas Gohr 38 | * @see http://www.php.net/rtrim 39 | * @see http://dev.splitbrain.org/view/darcs/dokuwiki/inc/utf8.php 40 | * @return string 41 | * @package utf8 42 | */ 43 | function utf8_rtrim($str, $charlist = false) 44 | { 45 | if ($charlist === false) { 46 | return rtrim($str); 47 | } 48 | 49 | //quote charlist for use in a characterclass 50 | $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!', '\\\${1}', $charlist); 51 | 52 | return preg_replace('/[' . $charlist . ']+$/u', '', $str); 53 | } 54 | 55 | //--------------------------------------------------------------- 56 | /** 57 | * UTF-8 aware replacement for trim() 58 | * Note: you only need to use this if you are supplying the charlist 59 | * optional arg and it contains UTF-8 characters. Otherwise trim will 60 | * work normally on a UTF-8 string 61 | * @author Andreas Gohr 62 | * @see http://www.php.net/trim 63 | * @see http://dev.splitbrain.org/view/darcs/dokuwiki/inc/utf8.php 64 | * @return string 65 | * @package utf8 66 | */ 67 | function utf8_trim($str, $charlist = false) 68 | { 69 | if ($charlist === false) { 70 | return trim($str); 71 | } 72 | return utf8_ltrim(utf8_rtrim($str, $charlist), $charlist); 73 | } 74 | -------------------------------------------------------------------------------- /src/phputf8/ucfirst.php: -------------------------------------------------------------------------------- 1 | 17 | * if ( utf8_is_ascii($someString) ) { 18 | * // It's just ASCII - use the native PHP version 19 | * $someString = strtolower($someString); 20 | * } else { 21 | * $someString = utf8_strtolower($someString); 22 | * } 23 | * 24 | * 25 | * @param string 26 | * @return boolean TRUE if it's all ASCII 27 | * @package utf8 28 | * @see utf8_is_ascii_ctrl 29 | */ 30 | function utf8_is_ascii($str) 31 | { 32 | // Search for any bytes which are outside the ASCII range... 33 | return (preg_match('/(?:[^\x00-\x7F])/', $str) !== 1); 34 | } 35 | 36 | //-------------------------------------------------------------------- 37 | /** 38 | * Tests whether a string contains only 7bit ASCII bytes with device 39 | * control codes omitted. The device control codes can be found on the 40 | * second table here: http://www.w3schools.com/tags/ref_ascii.asp 41 | * 42 | * @param string 43 | * @return boolean TRUE if it's all ASCII without device control codes 44 | * @package utf8 45 | * @see utf8_is_ascii 46 | */ 47 | function utf8_is_ascii_ctrl($str) 48 | { 49 | if (strlen($str) > 0) { 50 | // Search for any bytes which are outside the ASCII range, 51 | // or are device control codes 52 | return (preg_match('/[^\x09\x0A\x0D\x20-\x7E]/', $str) !== 1); 53 | } 54 | return false; 55 | } 56 | 57 | //-------------------------------------------------------------------- 58 | /** 59 | * Strip out all non-7bit ASCII bytes 60 | * If you need to transmit a string to system which you know can only 61 | * support 7bit ASCII, you could use this function. 62 | * @param string 63 | * @return string with non ASCII bytes removed 64 | * @package utf8 65 | * @see utf8_strip_non_ascii_ctrl 66 | */ 67 | function utf8_strip_non_ascii($str) 68 | { 69 | ob_start(); 70 | while ( 71 | preg_match( 72 | '/^([\x00-\x7F]+)|([^\x00-\x7F]+)/S', 73 | $str, 74 | $matches 75 | ) 76 | ) { 77 | if (!isset($matches[2])) { 78 | echo $matches[0]; 79 | } 80 | $str = substr($str, strlen($matches[0])); 81 | } 82 | $result = ob_get_contents(); 83 | ob_end_clean(); 84 | return $result; 85 | } 86 | 87 | //-------------------------------------------------------------------- 88 | /** 89 | * Strip out device control codes in the ASCII range 90 | * which are not permitted in XML. Note that this leaves 91 | * multi-byte characters untouched - it only removes device 92 | * control codes 93 | * @see http://hsivonen.iki.fi/producing-xml/#controlchar 94 | * @param string 95 | * @return string control codes removed 96 | */ 97 | function utf8_strip_ascii_ctrl($str) 98 | { 99 | ob_start(); 100 | while ( 101 | preg_match( 102 | '/^([^\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+)|([\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+)/S', 103 | $str, 104 | $matches 105 | ) 106 | ) { 107 | if (!isset($matches[2])) { 108 | echo $matches[0]; 109 | } 110 | $str = substr($str, strlen($matches[0])); 111 | } 112 | $result = ob_get_contents(); 113 | ob_end_clean(); 114 | return $result; 115 | } 116 | 117 | //-------------------------------------------------------------------- 118 | /** 119 | * Strip out all non 7bit ASCII bytes and ASCII device control codes. 120 | * For a list of ASCII device control codes see the 2nd table here: 121 | * http://www.w3schools.com/tags/ref_ascii.asp 122 | * 123 | * @param string 124 | * @return boolean TRUE if it's all ASCII 125 | * @package utf8 126 | */ 127 | function utf8_strip_non_ascii_ctrl($str) 128 | { 129 | ob_start(); 130 | while ( 131 | preg_match( 132 | '/^([\x09\x0A\x0D\x20-\x7E]+)|([^\x09\x0A\x0D\x20-\x7E]+)/S', 133 | $str, 134 | $matches 135 | ) 136 | ) { 137 | if (!isset($matches[2])) { 138 | echo $matches[0]; 139 | } 140 | $str = substr($str, strlen($matches[0])); 141 | } 142 | $result = ob_get_contents(); 143 | ob_end_clean(); 144 | return $result; 145 | } 146 | 147 | //--------------------------------------------------------------- 148 | /** 149 | * Replace accented UTF-8 characters by unaccented ASCII-7 "equivalents". 150 | * The purpose of this function is to replace characters commonly found in Latin 151 | * alphabets with something more or less equivalent from the ASCII range. This can 152 | * be useful for converting a UTF-8 to something ready for a filename, for example. 153 | * Following the use of this function, you would probably also pass the string 154 | * through utf8_strip_non_ascii to clean out any other non-ASCII chars 155 | * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1) 156 | * letters. Default is to deaccent both cases ($case = 0) 157 | * 158 | * For a more complete implementation of transliteration, see the utf8_to_ascii package 159 | * available from the phputf8 project downloads: 160 | * http://prdownloads.sourceforge.net/phputf8 161 | * 162 | * @param string UTF-8 string 163 | * @param int (optional) -1 lowercase only, +1 uppercase only, 1 both cases 164 | * @param string UTF-8 with accented characters replaced by ASCII chars 165 | * @return string accented chars replaced with ascii equivalents 166 | * @author Andreas Gohr 167 | * @package utf8 168 | */ 169 | function utf8_accents_to_ascii($str, $case = 0) 170 | { 171 | static $UTF8_LOWER_ACCENTS = null; 172 | static $UTF8_UPPER_ACCENTS = null; 173 | 174 | if ($case <= 0) { 175 | if (is_null($UTF8_LOWER_ACCENTS)) { 176 | $UTF8_LOWER_ACCENTS = [ 177 | 'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o', 178 | 'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k', 179 | 'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o', 180 | 'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o', 181 | 'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c', 182 | 'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't', 183 | 'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l', 184 | 'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z', 185 | 'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't', 186 | 'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o', 187 | 'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j', 188 | 'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o', 189 | 'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g', 190 | 'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a', 191 | 'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e', 192 | ]; 193 | } 194 | 195 | $str = str_replace( 196 | array_keys($UTF8_LOWER_ACCENTS), 197 | array_values($UTF8_LOWER_ACCENTS), 198 | $str 199 | ); 200 | } 201 | 202 | if ($case >= 0) { 203 | if (is_null($UTF8_UPPER_ACCENTS)) { 204 | $UTF8_UPPER_ACCENTS = [ 205 | 'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O', 206 | 'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K', 207 | 'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O', 208 | 'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O', 209 | 'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C', 210 | 'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T', 211 | 'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L', 212 | 'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z', 213 | 'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T', 214 | 'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O', 215 | 'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J', 216 | 'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O', 217 | 'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G', 218 | 'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A', 219 | 'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E', 220 | ]; 221 | } 222 | $str = str_replace( 223 | array_keys($UTF8_UPPER_ACCENTS), 224 | array_values($UTF8_UPPER_ACCENTS), 225 | $str 226 | ); 227 | } 228 | 229 | return $str; 230 | } 231 | -------------------------------------------------------------------------------- /src/phputf8/utils/bad.php: -------------------------------------------------------------------------------- 1 | 0) { 91 | return $badList; 92 | } 93 | return false; 94 | } 95 | 96 | //-------------------------------------------------------------------- 97 | /** 98 | * Strips out any bad bytes from a UTF-8 string and returns the rest 99 | * PCRE Pattern to locate bad bytes in a UTF-8 string 100 | * Comes from W3 FAQ: Multilingual Forms 101 | * Note: modified to include full ASCII range including control chars 102 | * @see http://www.w3.org/International/questions/qa-forms-utf-8 103 | * @param string 104 | * @return string 105 | * @package utf8 106 | */ 107 | function utf8_bad_strip($str) 108 | { 109 | $UTF8_BAD = 110 | '([\x00-\x7F]' . # ASCII (including control chars) 111 | '|[\xC2-\xDF][\x80-\xBF]' . # non-overlong 2-byte 112 | '|\xE0[\xA0-\xBF][\x80-\xBF]' . # excluding overlongs 113 | '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}' . # straight 3-byte 114 | '|\xED[\x80-\x9F][\x80-\xBF]' . # excluding surrogates 115 | '|\xF0[\x90-\xBF][\x80-\xBF]{2}' . # planes 1-3 116 | '|[\xF1-\xF3][\x80-\xBF]{3}' . # planes 4-15 117 | '|\xF4[\x80-\x8F][\x80-\xBF]{2}' . # plane 16 118 | '|(.{1}))'; # invalid byte 119 | ob_start(); 120 | while (preg_match('/' . $UTF8_BAD . '/S', $str, $matches)) { 121 | if (!isset($matches[2])) { 122 | echo $matches[0]; 123 | } 124 | $str = substr($str, strlen($matches[0])); 125 | } 126 | $result = ob_get_contents(); 127 | ob_end_clean(); 128 | return $result; 129 | } 130 | 131 | //-------------------------------------------------------------------- 132 | /** 133 | * Replace bad bytes with an alternative character - ASCII character 134 | * recommended is replacement char 135 | * PCRE Pattern to locate bad bytes in a UTF-8 string 136 | * Comes from W3 FAQ: Multilingual Forms 137 | * Note: modified to include full ASCII range including control chars 138 | * @see http://www.w3.org/International/questions/qa-forms-utf-8 139 | * @param string to search 140 | * @param string to replace bad bytes with (defaults to '?') - use ASCII 141 | * @return string 142 | * @package utf8 143 | */ 144 | function utf8_bad_replace($str, $replace = '?') 145 | { 146 | $UTF8_BAD = 147 | '([\x00-\x7F]' . # ASCII (including control chars) 148 | '|[\xC2-\xDF][\x80-\xBF]' . # non-overlong 2-byte 149 | '|\xE0[\xA0-\xBF][\x80-\xBF]' . # excluding overlongs 150 | '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}' . # straight 3-byte 151 | '|\xED[\x80-\x9F][\x80-\xBF]' . # excluding surrogates 152 | '|\xF0[\x90-\xBF][\x80-\xBF]{2}' . # planes 1-3 153 | '|[\xF1-\xF3][\x80-\xBF]{3}' . # planes 4-15 154 | '|\xF4[\x80-\x8F][\x80-\xBF]{2}' . # plane 16 155 | '|(.{1}))'; # invalid byte 156 | ob_start(); 157 | while (preg_match('/' . $UTF8_BAD . '/S', $str, $matches)) { 158 | if (!isset($matches[2])) { 159 | echo $matches[0]; 160 | } else { 161 | echo $replace; 162 | } 163 | $str = substr($str, strlen($matches[0])); 164 | } 165 | $result = ob_get_contents(); 166 | ob_end_clean(); 167 | return $result; 168 | } 169 | 170 | //-------------------------------------------------------------------- 171 | /** 172 | * Return code from utf8_bad_identify() when a five octet sequence is detected. 173 | * Note: 5 octets sequences are valid UTF-8 but are not supported by Unicode so 174 | * do not represent a useful character 175 | * @see utf8_bad_identify 176 | * @package utf8 177 | */ 178 | define('UTF8_BAD_5OCTET', 1); 179 | 180 | /** 181 | * Return code from utf8_bad_identify() when a six octet sequence is detected. 182 | * Note: 6 octets sequences are valid UTF-8 but are not supported by Unicode so 183 | * do not represent a useful character 184 | * @see utf8_bad_identify 185 | * @package utf8 186 | */ 187 | define('UTF8_BAD_6OCTET', 2); 188 | 189 | /** 190 | * Return code from utf8_bad_identify(). 191 | * Invalid octet for use as start of multi-byte UTF-8 sequence 192 | * @see utf8_bad_identify 193 | * @package utf8 194 | */ 195 | define('UTF8_BAD_SEQID', 3); 196 | 197 | /** 198 | * Return code from utf8_bad_identify(). 199 | * From Unicode 3.1, non-shortest form is illegal 200 | * @see utf8_bad_identify 201 | * @package utf8 202 | */ 203 | define('UTF8_BAD_NONSHORT', 4); 204 | 205 | /** 206 | * Return code from utf8_bad_identify(). 207 | * From Unicode 3.2, surrogate characters are illegal 208 | * @see utf8_bad_identify 209 | * @package utf8 210 | */ 211 | define('UTF8_BAD_SURROGATE', 5); 212 | 213 | /** 214 | * Return code from utf8_bad_identify(). 215 | * Codepoints outside the Unicode range are illegal 216 | * @see utf8_bad_identify 217 | * @package utf8 218 | */ 219 | define('UTF8_BAD_UNIOUTRANGE', 6); 220 | 221 | /** 222 | * Return code from utf8_bad_identify(). 223 | * Incomplete multi-octet sequence 224 | * Note: this is kind of a "catch-all" 225 | * @see utf8_bad_identify 226 | * @package utf8 227 | */ 228 | define('UTF8_BAD_SEQINCOMPLETE', 7); 229 | 230 | //-------------------------------------------------------------------- 231 | /** 232 | * Reports on the type of bad byte found in a UTF-8 string. Returns a 233 | * status code on the first bad byte found 234 | * 235 | * Joomla modification - As of PHP 7.4, curly brace access has been deprecated. As a result this function has been 236 | * modified to use square brace syntax 237 | * See https://github.com/php/php-src/commit/d574df63dc375f5fc9202ce5afde23f866b6450a 238 | * for additional references 239 | * 240 | * @author 241 | * @param string UTF-8 encoded string 242 | * @return mixed integer constant describing problem or FALSE if valid UTF-8 243 | * @see utf8_bad_explain 244 | * @see http://hsivonen.iki.fi/php-utf8/ 245 | * @package utf8 246 | */ 247 | function utf8_bad_identify($str, &$i) 248 | { 249 | $mState = 0; // cached expected number of octets after the current octet 250 | // until the beginning of the next UTF8 character sequence 251 | $mUcs4 = 0; // cached Unicode character 252 | $mBytes = 1; // cached expected number of octets in the current sequence 253 | 254 | $len = strlen($str); 255 | 256 | for ($i = 0; $i < $len; $i++) { 257 | $in = ord($str[$i]); 258 | 259 | if ($mState == 0) { 260 | // When mState is zero we expect either a US-ASCII character or a 261 | // multi-octet sequence. 262 | if (0 == (0x80 & ($in))) { 263 | // US-ASCII, pass straight through. 264 | $mBytes = 1; 265 | } elseif (0xC0 == (0xE0 & ($in))) { 266 | // First octet of 2 octet sequence 267 | $mUcs4 = ($in); 268 | $mUcs4 = ($mUcs4 & 0x1F) << 6; 269 | $mState = 1; 270 | $mBytes = 2; 271 | } elseif (0xE0 == (0xF0 & ($in))) { 272 | // First octet of 3 octet sequence 273 | $mUcs4 = ($in); 274 | $mUcs4 = ($mUcs4 & 0x0F) << 12; 275 | $mState = 2; 276 | $mBytes = 3; 277 | } elseif (0xF0 == (0xF8 & ($in))) { 278 | // First octet of 4 octet sequence 279 | $mUcs4 = ($in); 280 | $mUcs4 = ($mUcs4 & 0x07) << 18; 281 | $mState = 3; 282 | $mBytes = 4; 283 | } elseif (0xF8 == (0xFC & ($in))) { 284 | /* First octet of 5 octet sequence. 285 | * 286 | * This is illegal because the encoded codepoint must be either 287 | * (a) not the shortest form or 288 | * (b) outside the Unicode range of 0-0x10FFFF. 289 | */ 290 | 291 | return UTF8_BAD_5OCTET; 292 | } elseif (0xFC == (0xFE & ($in))) { 293 | // First octet of 6 octet sequence, see comments for 5 octet sequence. 294 | return UTF8_BAD_6OCTET; 295 | } else { 296 | // Current octet is neither in the US-ASCII range nor a legal first 297 | // octet of a multi-octet sequence. 298 | return UTF8_BAD_SEQID; 299 | } 300 | } else { 301 | // When mState is non-zero, we expect a continuation of the multi-octet 302 | // sequence 303 | if (0x80 == (0xC0 & ($in))) { 304 | // Legal continuation. 305 | $shift = ($mState - 1) * 6; 306 | $tmp = $in; 307 | $tmp = ($tmp & 0x0000003F) << $shift; 308 | $mUcs4 |= $tmp; 309 | 310 | /** 311 | * End of the multi-octet sequence. mUcs4 now contains the final 312 | * Unicode codepoint to be output 313 | */ 314 | if (0 == --$mState) { 315 | // From Unicode 3.1, non-shortest form is illegal 316 | if ( 317 | ((2 == $mBytes) && ($mUcs4 < 0x0080)) || 318 | ((3 == $mBytes) && ($mUcs4 < 0x0800)) || 319 | ((4 == $mBytes) && ($mUcs4 < 0x10000)) 320 | ) { 321 | return UTF8_BAD_NONSHORT; 322 | 323 | // From Unicode 3.2, surrogate characters are illegal 324 | } elseif (($mUcs4 & 0xFFFFF800) == 0xD800) { 325 | return UTF8_BAD_SURROGATE; 326 | 327 | // Codepoints outside the Unicode range are illegal 328 | } elseif ($mUcs4 > 0x10FFFF) { 329 | return UTF8_BAD_UNIOUTRANGE; 330 | } 331 | 332 | //initialize UTF8 cache 333 | $mState = 0; 334 | $mUcs4 = 0; 335 | $mBytes = 1; 336 | } 337 | } else { 338 | // ((0xC0 & (*in) != 0x80) && (mState != 0)) 339 | // Incomplete multi-octet sequence. 340 | $i--; 341 | return UTF8_BAD_SEQINCOMPLETE; 342 | } 343 | } 344 | } 345 | 346 | if ($mState != 0) { 347 | // Incomplete multi-octet sequence. 348 | $i--; 349 | return UTF8_BAD_SEQINCOMPLETE; 350 | } 351 | 352 | // No bad octets found 353 | $i = null; 354 | return false; 355 | } 356 | 357 | //-------------------------------------------------------------------- 358 | /** 359 | * Takes a return code from utf8_bad_identify() are returns a message 360 | * (in English) explaining what the problem is. 361 | * @param int return code from utf8_bad_identify 362 | * @return mixed string message or FALSE if return code unknown 363 | * @see utf8_bad_identify 364 | * @package utf8 365 | */ 366 | function utf8_bad_explain($code) 367 | { 368 | switch ($code) { 369 | case UTF8_BAD_5OCTET: 370 | return 'Five octet sequences are valid UTF-8 but are not supported by Unicode'; 371 | break; 372 | 373 | case UTF8_BAD_6OCTET: 374 | return 'Six octet sequences are valid UTF-8 but are not supported by Unicode'; 375 | break; 376 | 377 | case UTF8_BAD_SEQID: 378 | return 'Invalid octet for use as start of multi-byte UTF-8 sequence'; 379 | break; 380 | 381 | case UTF8_BAD_NONSHORT: 382 | return 'From Unicode 3.1, non-shortest form is illegal'; 383 | break; 384 | 385 | case UTF8_BAD_SURROGATE: 386 | return 'From Unicode 3.2, surrogate characters are illegal'; 387 | break; 388 | 389 | case UTF8_BAD_UNIOUTRANGE: 390 | return 'Codepoints outside the Unicode range are illegal'; 391 | break; 392 | 393 | case UTF8_BAD_SEQINCOMPLETE: 394 | return 'Incomplete multi-octet sequence'; 395 | break; 396 | } 397 | 398 | trigger_error('Unknown error code: ' . $code, E_USER_WARNING); 399 | return false; 400 | } 401 | -------------------------------------------------------------------------------- /src/phputf8/utils/patterns.php: -------------------------------------------------------------------------------- 1 | 22 | * @param string string to locate index in 23 | * @param int (n times) 24 | * @return mixed - int if only one input int, array if more 25 | * @return boolean TRUE if it's all ASCII 26 | * @package utf8 27 | */ 28 | function utf8_byte_position() 29 | { 30 | $args = func_get_args(); 31 | $str =& array_shift($args); 32 | if (!is_string($str)) { 33 | return false; 34 | } 35 | 36 | $result = []; 37 | 38 | // trivial byte index, character offset pair 39 | $prev = [0,0]; 40 | 41 | // use a short piece of str to estimate bytes per character 42 | // $i (& $j) -> byte indexes into $str 43 | $i = utf8_locate_next_chr($str, 300); 44 | 45 | // $c -> character offset into $str 46 | $c = strlen(utf8_decode(substr($str, 0, $i))); 47 | 48 | // deal with arguments from lowest to highest 49 | sort($args); 50 | 51 | foreach ($args as $offset) { 52 | // sanity checks FIXME 53 | 54 | // 0 is an easy check 55 | if ($offset == 0) { 56 | $result[] = 0; 57 | continue; 58 | } 59 | 60 | // ensure no endless looping 61 | $safety_valve = 50; 62 | 63 | do { 64 | if (($c - $prev[1]) == 0) { 65 | // Hack: gone past end of string 66 | $error = 0; 67 | $i = strlen($str); 68 | break; 69 | } 70 | 71 | $j = $i + (int)(($offset - $c) * ($i - $prev[0]) / ($c - $prev[1])); 72 | 73 | // correct to utf8 character boundary 74 | $j = utf8_locate_next_chr($str, $j); 75 | 76 | // save the index, offset for use next iteration 77 | $prev = [$i,$c]; 78 | 79 | if ($j > $i) { 80 | // determine new character offset 81 | $c += strlen(utf8_decode(substr($str, $i, $j - $i))); 82 | } else { 83 | // ditto 84 | $c -= strlen(utf8_decode(substr($str, $j, $i - $j))); 85 | } 86 | 87 | $error = abs($c - $offset); 88 | 89 | // ready for next time around 90 | $i = $j; 91 | 92 | // from 7 it is faster to iterate over the string 93 | } while (($error > 7) && --$safety_valve); 94 | 95 | if ($error && $error <= 7) { 96 | if ($c < $offset) { 97 | // move up 98 | while ($error--) { 99 | $i = utf8_locate_next_chr($str, ++$i); 100 | } 101 | } else { 102 | // move down 103 | while ($error--) { 104 | $i = utf8_locate_current_chr($str, --$i); 105 | } 106 | } 107 | 108 | // ready for next arg 109 | $c = $offset; 110 | } 111 | $result[] = $i; 112 | } 113 | 114 | if (count($result) == 1) { 115 | return $result[0]; 116 | } 117 | 118 | return $result; 119 | } 120 | 121 | //-------------------------------------------------------------------- 122 | /** 123 | * Given a string and any byte index, returns the byte index 124 | * of the start of the current UTF-8 character, relative to supplied 125 | * position. If the current character begins at the same place as the 126 | * supplied byte index, that byte index will be returned. Otherwise 127 | * this function will step backwards, looking for the index where 128 | * current UTF-8 character begins 129 | * @author Chris Smith 130 | * @param string 131 | * @param int byte index in the string 132 | * @return int byte index of start of next UTF-8 character 133 | * @package utf8 134 | */ 135 | function utf8_locate_current_chr(&$str, $idx) 136 | { 137 | if ($idx <= 0) { 138 | return 0; 139 | } 140 | 141 | $limit = strlen($str); 142 | if ($idx >= $limit) { 143 | return $limit; 144 | } 145 | 146 | // Binary value for any byte after the first in a multi-byte UTF-8 character 147 | // will be like 10xxxxxx so & 0xC0 can be used to detect this kind 148 | // of byte - assuming well formed UTF-8 149 | while ($idx && ((ord($str[$idx]) & 0xC0) == 0x80)) { 150 | $idx--; 151 | } 152 | 153 | return $idx; 154 | } 155 | 156 | //-------------------------------------------------------------------- 157 | /** 158 | * Given a string and any byte index, returns the byte index 159 | * of the start of the next UTF-8 character, relative to supplied 160 | * position. If the next character begins at the same place as the 161 | * supplied byte index, that byte index will be returned. 162 | * @author Chris Smith 163 | * @param string 164 | * @param int byte index in the string 165 | * @return int byte index of start of next UTF-8 character 166 | * @package utf8 167 | */ 168 | function utf8_locate_next_chr(&$str, $idx) 169 | { 170 | if ($idx <= 0) { 171 | return 0; 172 | } 173 | 174 | $limit = strlen($str); 175 | if ($idx >= $limit) { 176 | return $limit; 177 | } 178 | 179 | // Binary value for any byte after the first in a multi-byte UTF-8 character 180 | // will be like 10xxxxxx so & 0xC0 can be used to detect this kind 181 | // of byte - assuming well formed UTF-8 182 | while (($idx < $limit) && ((ord($str[$idx]) & 0xC0) == 0x80)) { 183 | $idx++; 184 | } 185 | 186 | return $idx; 187 | } 188 | -------------------------------------------------------------------------------- /src/phputf8/utils/specials.php: -------------------------------------------------------------------------------- 1 | 120 | * @param string $string The UTF8 string to strip of special chars 121 | * @param string (optional) $repl Replace special with this string 122 | * @return string with common non-alphanumeric characters removed 123 | * @see utf8_specials_pattern 124 | */ 125 | function utf8_strip_specials($string, $repl = '') 126 | { 127 | return preg_replace(utf8_specials_pattern(), $repl, $string); 128 | } 129 | -------------------------------------------------------------------------------- /src/phputf8/utils/unicode.php: -------------------------------------------------------------------------------- 1 | 0xFFFF. Occurrances of the BOM are ignored. Surrogates 23 | * are not allowed. 24 | * Returns false if the input string isn't a valid UTF-8 octet sequence 25 | * and raises a PHP error at level E_USER_WARNING 26 | * Note: this function has been modified slightly in this library to 27 | * trigger errors on encountering bad bytes 28 | * 29 | * Joomla modification - As of PHP 7.4, curly brace access has been deprecated. As a result this function has been 30 | * modified to use square brace syntax 31 | * See https://github.com/php/php-src/commit/d574df63dc375f5fc9202ce5afde23f866b6450a 32 | * for additional references 33 | * 34 | * @author 35 | * @param string UTF-8 encoded string 36 | * @return mixed array of unicode code points or FALSE if UTF-8 invalid 37 | * @see utf8_from_unicode 38 | * @see http://hsivonen.iki.fi/php-utf8/ 39 | * @package utf8 40 | */ 41 | function utf8_to_unicode($str) 42 | { 43 | $mState = 0; // cached expected number of octets after the current octet 44 | // until the beginning of the next UTF8 character sequence 45 | $mUcs4 = 0; // cached Unicode character 46 | $mBytes = 1; // cached expected number of octets in the current sequence 47 | 48 | $out = []; 49 | 50 | $len = strlen($str); 51 | 52 | for ($i = 0; $i < $len; $i++) { 53 | $in = ord($str[$i]); 54 | 55 | if ($mState == 0) { 56 | // When mState is zero we expect either a US-ASCII character or a 57 | // multi-octet sequence. 58 | if (0 == (0x80 & ($in))) { 59 | // US-ASCII, pass straight through. 60 | $out[] = $in; 61 | $mBytes = 1; 62 | } elseif (0xC0 == (0xE0 & ($in))) { 63 | // First octet of 2 octet sequence 64 | $mUcs4 = ($in); 65 | $mUcs4 = ($mUcs4 & 0x1F) << 6; 66 | $mState = 1; 67 | $mBytes = 2; 68 | } elseif (0xE0 == (0xF0 & ($in))) { 69 | // First octet of 3 octet sequence 70 | $mUcs4 = ($in); 71 | $mUcs4 = ($mUcs4 & 0x0F) << 12; 72 | $mState = 2; 73 | $mBytes = 3; 74 | } elseif (0xF0 == (0xF8 & ($in))) { 75 | // First octet of 4 octet sequence 76 | $mUcs4 = ($in); 77 | $mUcs4 = ($mUcs4 & 0x07) << 18; 78 | $mState = 3; 79 | $mBytes = 4; 80 | } elseif (0xF8 == (0xFC & ($in))) { 81 | /* First octet of 5 octet sequence. 82 | * 83 | * This is illegal because the encoded codepoint must be either 84 | * (a) not the shortest form or 85 | * (b) outside the Unicode range of 0-0x10FFFF. 86 | * Rather than trying to resynchronize, we will carry on until the end 87 | * of the sequence and let the later error handling code catch it. 88 | */ 89 | $mUcs4 = ($in); 90 | $mUcs4 = ($mUcs4 & 0x03) << 24; 91 | $mState = 4; 92 | $mBytes = 5; 93 | } elseif (0xFC == (0xFE & ($in))) { 94 | // First octet of 6 octet sequence, see comments for 5 octet sequence. 95 | $mUcs4 = ($in); 96 | $mUcs4 = ($mUcs4 & 1) << 30; 97 | $mState = 5; 98 | $mBytes = 6; 99 | } else { 100 | /* Current octet is neither in the US-ASCII range nor a legal first 101 | * octet of a multi-octet sequence. 102 | */ 103 | trigger_error( 104 | 'utf8_to_unicode: Illegal sequence identifier ' . 105 | 'in UTF-8 at byte ' . $i, 106 | E_USER_WARNING 107 | ); 108 | return false; 109 | } 110 | } else { 111 | // When mState is non-zero, we expect a continuation of the multi-octet 112 | // sequence 113 | if (0x80 == (0xC0 & ($in))) { 114 | // Legal continuation. 115 | $shift = ($mState - 1) * 6; 116 | $tmp = $in; 117 | $tmp = ($tmp & 0x0000003F) << $shift; 118 | $mUcs4 |= $tmp; 119 | 120 | /** 121 | * End of the multi-octet sequence. mUcs4 now contains the final 122 | * Unicode codepoint to be output 123 | */ 124 | if (0 == --$mState) { 125 | /* 126 | * Check for illegal sequences and codepoints. 127 | */ 128 | // From Unicode 3.1, non-shortest form is illegal 129 | if ( 130 | ((2 == $mBytes) && ($mUcs4 < 0x0080)) || 131 | ((3 == $mBytes) && ($mUcs4 < 0x0800)) || 132 | ((4 == $mBytes) && ($mUcs4 < 0x10000)) || 133 | (4 < $mBytes) || 134 | // From Unicode 3.2, surrogate characters are illegal 135 | (($mUcs4 & 0xFFFFF800) == 0xD800) || 136 | // Codepoints outside the Unicode range are illegal 137 | ($mUcs4 > 0x10FFFF) 138 | ) { 139 | trigger_error( 140 | 'utf8_to_unicode: Illegal sequence or codepoint ' . 141 | 'in UTF-8 at byte ' . $i, 142 | E_USER_WARNING 143 | ); 144 | 145 | return false; 146 | } 147 | 148 | if (0xFEFF != $mUcs4) { 149 | // BOM is legal but we don't want to output it 150 | $out[] = $mUcs4; 151 | } 152 | 153 | //initialize UTF8 cache 154 | $mState = 0; 155 | $mUcs4 = 0; 156 | $mBytes = 1; 157 | } 158 | } else { 159 | /** 160 | *((0xC0 & (*in) != 0x80) && (mState != 0)) 161 | * Incomplete multi-octet sequence. 162 | */ 163 | trigger_error( 164 | 'utf8_to_unicode: Incomplete multi-octet ' . 165 | ' sequence in UTF-8 at byte ' . $i, 166 | E_USER_WARNING 167 | ); 168 | 169 | return false; 170 | } 171 | } 172 | } 173 | return $out; 174 | } 175 | 176 | //-------------------------------------------------------------------- 177 | /** 178 | * Takes an array of ints representing the Unicode characters and returns 179 | * a UTF-8 string. Astral planes are supported ie. the ints in the 180 | * input can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates 181 | * are not allowed. 182 | * Returns false if the input array contains ints that represent 183 | * surrogates or are outside the Unicode range 184 | * and raises a PHP error at level E_USER_WARNING 185 | * Note: this function has been modified slightly in this library to use 186 | * output buffering to concatenate the UTF-8 string (faster) as well as 187 | * reference the array by it's keys 188 | * @param array of unicode code points representing a string 189 | * @return mixed UTF-8 string or FALSE if array contains invalid code points 190 | * @author 191 | * @see utf8_to_unicode 192 | * @see http://hsivonen.iki.fi/php-utf8/ 193 | * @package utf8 194 | */ 195 | function utf8_from_unicode($arr) 196 | { 197 | ob_start(); 198 | 199 | foreach (array_keys($arr) as $k) { 200 | # ASCII range (including control chars) 201 | if (($arr[$k] >= 0) && ($arr[$k] <= 0x007f)) { 202 | echo chr($arr[$k]); 203 | 204 | # 2 byte sequence 205 | } elseif ($arr[$k] <= 0x07ff) { 206 | echo chr(0xc0 | ($arr[$k] >> 6)); 207 | echo chr(0x80 | ($arr[$k] & 0x003f)); 208 | 209 | # Byte order mark (skip) 210 | } elseif ($arr[$k] == 0xFEFF) { 211 | // nop -- zap the BOM 212 | 213 | # Test for illegal surrogates 214 | } elseif ($arr[$k] >= 0xD800 && $arr[$k] <= 0xDFFF) { 215 | // found a surrogate 216 | trigger_error( 217 | 'utf8_from_unicode: Illegal surrogate ' . 218 | 'at index: ' . $k . ', value: ' . $arr[$k], 219 | E_USER_WARNING 220 | ); 221 | 222 | return false; 223 | 224 | # 3 byte sequence 225 | } elseif ($arr[$k] <= 0xffff) { 226 | echo chr(0xe0 | ($arr[$k] >> 12)); 227 | echo chr(0x80 | (($arr[$k] >> 6) & 0x003f)); 228 | echo chr(0x80 | ($arr[$k] & 0x003f)); 229 | 230 | # 4 byte sequence 231 | } elseif ($arr[$k] <= 0x10ffff) { 232 | echo chr(0xf0 | ($arr[$k] >> 18)); 233 | echo chr(0x80 | (($arr[$k] >> 12) & 0x3f)); 234 | echo chr(0x80 | (($arr[$k] >> 6) & 0x3f)); 235 | echo chr(0x80 | ($arr[$k] & 0x3f)); 236 | } else { 237 | trigger_error( 238 | 'utf8_from_unicode: Codepoint out of Unicode range ' . 239 | 'at index: ' . $k . ', value: ' . $arr[$k], 240 | E_USER_WARNING 241 | ); 242 | 243 | // out of range 244 | return false; 245 | } 246 | } 247 | 248 | $result = ob_get_contents(); 249 | ob_end_clean(); 250 | return $result; 251 | } 252 | -------------------------------------------------------------------------------- /src/phputf8/utils/validation.php: -------------------------------------------------------------------------------- 1 | 24 | * @param string UTF-8 encoded string 25 | * @return boolean true if valid 26 | * @see http://hsivonen.iki.fi/php-utf8/ 27 | * @see utf8_compliant 28 | * @package utf8 29 | */ 30 | function utf8_is_valid($str) 31 | { 32 | $mState = 0; // cached expected number of octets after the current octet 33 | // until the beginning of the next UTF8 character sequence 34 | $mUcs4 = 0; // cached Unicode character 35 | $mBytes = 1; // cached expected number of octets in the current sequence 36 | 37 | $len = strlen($str); 38 | 39 | for ($i = 0; $i < $len; $i++) { 40 | /* 41 | * Joomla modification - As of PHP 7.4, curly brace access has been deprecated. As a result the line below has 42 | * been modified to use square brace syntax 43 | * See https://github.com/php/php-src/commit/d574df63dc375f5fc9202ce5afde23f866b6450a 44 | * for additional references 45 | */ 46 | $in = ord($str[$i]); 47 | 48 | if ($mState == 0) { 49 | // When mState is zero we expect either a US-ASCII character or a 50 | // multi-octet sequence. 51 | if (0 == (0x80 & ($in))) { 52 | // US-ASCII, pass straight through. 53 | $mBytes = 1; 54 | } elseif (0xC0 == (0xE0 & ($in))) { 55 | // First octet of 2 octet sequence 56 | $mUcs4 = ($in); 57 | $mUcs4 = ($mUcs4 & 0x1F) << 6; 58 | $mState = 1; 59 | $mBytes = 2; 60 | } elseif (0xE0 == (0xF0 & ($in))) { 61 | // First octet of 3 octet sequence 62 | $mUcs4 = ($in); 63 | $mUcs4 = ($mUcs4 & 0x0F) << 12; 64 | $mState = 2; 65 | $mBytes = 3; 66 | } elseif (0xF0 == (0xF8 & ($in))) { 67 | // First octet of 4 octet sequence 68 | $mUcs4 = ($in); 69 | $mUcs4 = ($mUcs4 & 0x07) << 18; 70 | $mState = 3; 71 | $mBytes = 4; 72 | } elseif (0xF8 == (0xFC & ($in))) { 73 | /* First octet of 5 octet sequence. 74 | * 75 | * This is illegal because the encoded codepoint must be either 76 | * (a) not the shortest form or 77 | * (b) outside the Unicode range of 0-0x10FFFF. 78 | * Rather than trying to resynchronize, we will carry on until the end 79 | * of the sequence and let the later error handling code catch it. 80 | */ 81 | $mUcs4 = ($in); 82 | $mUcs4 = ($mUcs4 & 0x03) << 24; 83 | $mState = 4; 84 | $mBytes = 5; 85 | } elseif (0xFC == (0xFE & ($in))) { 86 | // First octet of 6 octet sequence, see comments for 5 octet sequence. 87 | $mUcs4 = ($in); 88 | $mUcs4 = ($mUcs4 & 1) << 30; 89 | $mState = 5; 90 | $mBytes = 6; 91 | } else { 92 | /* Current octet is neither in the US-ASCII range nor a legal first 93 | * octet of a multi-octet sequence. 94 | */ 95 | return false; 96 | } 97 | } else { 98 | // When mState is non-zero, we expect a continuation of the multi-octet 99 | // sequence 100 | if (0x80 == (0xC0 & ($in))) { 101 | // Legal continuation. 102 | $shift = ($mState - 1) * 6; 103 | $tmp = $in; 104 | $tmp = ($tmp & 0x0000003F) << $shift; 105 | $mUcs4 |= $tmp; 106 | 107 | /** 108 | * End of the multi-octet sequence. mUcs4 now contains the final 109 | * Unicode codepoint to be output 110 | */ 111 | if (0 == --$mState) { 112 | /* 113 | * Check for illegal sequences and codepoints. 114 | */ 115 | // From Unicode 3.1, non-shortest form is illegal 116 | if ( 117 | ((2 == $mBytes) && ($mUcs4 < 0x0080)) || 118 | ((3 == $mBytes) && ($mUcs4 < 0x0800)) || 119 | ((4 == $mBytes) && ($mUcs4 < 0x10000)) || 120 | (4 < $mBytes) || 121 | // From Unicode 3.2, surrogate characters are illegal 122 | (($mUcs4 & 0xFFFFF800) == 0xD800) || 123 | // Codepoints outside the Unicode range are illegal 124 | ($mUcs4 > 0x10FFFF) 125 | ) { 126 | return false; 127 | } 128 | 129 | //initialize UTF8 cache 130 | $mState = 0; 131 | $mUcs4 = 0; 132 | $mBytes = 1; 133 | } 134 | } else { 135 | /** 136 | *((0xC0 & (*in) != 0x80) && (mState != 0)) 137 | * Incomplete multi-octet sequence. 138 | */ 139 | 140 | return false; 141 | } 142 | } 143 | } 144 | return true; 145 | } 146 | 147 | //-------------------------------------------------------------------- 148 | /** 149 | * Tests whether a string complies as UTF-8. This will be much 150 | * faster than utf8_is_valid but will pass five and six octet 151 | * UTF-8 sequences, which are not supported by Unicode and 152 | * so cannot be displayed correctly in a browser. In other words 153 | * it is not as strict as utf8_is_valid but it's faster. If you use 154 | * is to validate user input, you place yourself at the risk that 155 | * attackers will be able to inject 5 and 6 byte sequences (which 156 | * may or may not be a significant risk, depending on what you are 157 | * are doing) 158 | * @see utf8_is_valid 159 | * @see http://www.php.net/manual/en/reference.pcre.pattern.modifiers.php#54805 160 | * @param string UTF-8 string to check 161 | * @return boolean TRUE if string is valid UTF-8 162 | * @package utf8 163 | */ 164 | function utf8_compliant($str) 165 | { 166 | if (strlen($str) == 0) { 167 | return true; 168 | } 169 | // If even just the first character can be matched, when the /u 170 | // modifier is used, then it's valid UTF-8. If the UTF-8 is somehow 171 | // invalid, nothing at all will match, even if the string contains 172 | // some valid sequences 173 | return (preg_match('/^.{1}/us', $str, $ar) == 1); 174 | } 175 | --------------------------------------------------------------------------------