├── LICENSE
├── README.md
├── SECURITY.md
├── composer.json
└── src
    ├── Inflector.php
    ├── Normalise.php
    ├── StringHelper.php
    └── phputf8
        ├── LICENSE
        ├── README
        ├── mbstring
            └── core.php
        ├── native
            └── core.php
        ├── ord.php
        ├── str_ireplace.php
        ├── str_pad.php
        ├── str_split.php
        ├── strcasecmp.php
        ├── strcspn.php
        ├── stristr.php
        ├── strrev.php
        ├── strspn.php
        ├── substr_replace.php
        ├── trim.php
        ├── ucfirst.php
        ├── ucwords.php
        ├── utf8.php
        └── utils
            ├── ascii.php
            ├── bad.php
            ├── patterns.php
            ├── position.php
            ├── specials.php
            ├── unicode.php
            └── validation.php


/LICENSE:
--------------------------------------------------------------------------------
  1 | GNU GENERAL PUBLIC LICENSE
  2 | 				Version 2, June 1991
  3 | 
  4 |  Copyright (C) 1989, 1991 Free Software Foundation, Inc.
  5 |  59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  6 |  Everyone is permitted to copy and distribute verbatim copies
  7 |  of this license document, but changing it is not allowed.
  8 | 
  9 | 				Preamble
 10 | 
 11 |   The licenses for most software are designed to take away your
 12 | freedom to share and change it.  By contrast, the GNU General Public
 13 | License is intended to guarantee your freedom to share and change free
 14 | software--to make sure the software is free for all its users.  This
 15 | General Public License applies to most of the Free Software
 16 | Foundation's software and to any other program whose authors commit to
 17 | using it.  (Some other Free Software Foundation software is covered by
 18 | the GNU Library General Public License instead.)  You can apply it to
 19 | your programs, too.
 20 | 
 21 |   When we speak of free software, we are referring to freedom, not
 22 | price.  Our General Public Licenses are designed to make sure that you
 23 | have the freedom to distribute copies of free software (and charge for
 24 | this service if you wish), that you receive source code or can get it
 25 | if you want it, that you can change the software or use pieces of it
 26 | in new free programs; and that you know you can do these things.
 27 | 
 28 |   To protect your rights, we need to make restrictions that forbid
 29 | anyone to deny you these rights or to ask you to surrender the rights.
 30 | These restrictions translate to certain responsibilities for you if you
 31 | distribute copies of the software, or if you modify it.
 32 | 
 33 |   For example, if you distribute copies of such a program, whether
 34 | gratis or for a fee, you must give the recipients all the rights that
 35 | you have.  You must make sure that they, too, receive or can get the
 36 | source code.  And you must show them these terms so they know their
 37 | rights.
 38 | 
 39 |   We protect your rights with two steps: (1) copyright the software, and
 40 | (2) offer you this license which gives you legal permission to copy,
 41 | distribute and/or modify the software.
 42 | 
 43 |   Also, for each author's protection and ours, we want to make certain
 44 | that everyone understands that there is no warranty for this free
 45 | software.  If the software is modified by someone else and passed on, we
 46 | want its recipients to know that what they have is not the original, so
 47 | that any problems introduced by others will not reflect on the original
 48 | authors' reputations.
 49 | 
 50 |   Finally, any free program is threatened constantly by software
 51 | patents.  We wish to avoid the danger that redistributors of a free
 52 | program will individually obtain patent licenses, in effect making the
 53 | program proprietary.  To prevent this, we have made it clear that any
 54 | patent must be licensed for everyone's free use or not licensed at all.
 55 | 
 56 |   The precise terms and conditions for copying, distribution and
 57 | modification follow.
 58 | 
 59 | 			GNU GENERAL PUBLIC LICENSE
 60 |    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
 61 | 
 62 |   0. This License applies to any program or other work which contains
 63 | a notice placed by the copyright holder saying it may be distributed
 64 | under the terms of this General Public License.  The "Program", below,
 65 | refers to any such program or work, and a "work based on the Program"
 66 | means either the Program or any derivative work under copyright law:
 67 | that is to say, a work containing the Program or a portion of it,
 68 | either verbatim or with modifications and/or translated into another
 69 | language.  (Hereinafter, translation is included without limitation in
 70 | the term "modification".)  Each licensee is addressed as "you".
 71 | 
 72 | Activities other than copying, distribution and modification are not
 73 | covered by this License; they are outside its scope.  The act of
 74 | running the Program is not restricted, and the output from the Program
 75 | is covered only if its contents constitute a work based on the
 76 | Program (independent of having been made by running the Program).
 77 | Whether that is true depends on what the Program does.
 78 | 
 79 |   1. You may copy and distribute verbatim copies of the Program's
 80 | source code as you receive it, in any medium, provided that you
 81 | conspicuously and appropriately publish on each copy an appropriate
 82 | copyright notice and disclaimer of warranty; keep intact all the
 83 | notices that refer to this License and to the absence of any warranty;
 84 | and give any other recipients of the Program a copy of this License
 85 | along with the Program.
 86 | 
 87 | You may charge a fee for the physical act of transferring a copy, and
 88 | you may at your option offer warranty protection in exchange for a fee.
 89 | 
 90 |   2. You may modify your copy or copies of the Program or any portion
 91 | of it, thus forming a work based on the Program, and copy and
 92 | distribute such modifications or work under the terms of Section 1
 93 | above, provided that you also meet all of these conditions:
 94 | 
 95 | 	a) You must cause the modified files to carry prominent notices
 96 | 	stating that you changed the files and the date of any change.
 97 | 
 98 | 	b) You must cause any work that you distribute or publish, that in
 99 | 	whole or in part contains or is derived from the Program or any
100 | 	part thereof, to be licensed as a whole at no charge to all third
101 | 	parties under the terms of this License.
102 | 
103 | 	c) If the modified program normally reads commands interactively
104 | 	when run, you must cause it, when started running for such
105 | 	interactive use in the most ordinary way, to print or display an
106 | 	announcement including an appropriate copyright notice and a
107 | 	notice that there is no warranty (or else, saying that you provide
108 | 	a warranty) and that users may redistribute the program under
109 | 	these conditions, and telling the user how to view a copy of this
110 | 	License.  (Exception: if the Program itself is interactive but
111 | 	does not normally print such an announcement, your work based on
112 | 	the Program is not required to print an announcement.)
113 | 
114 | These requirements apply to the modified work as a whole.  If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works.  But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 | 
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 | 
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 | 
134 |   3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 | 
138 | 	a) Accompany it with the complete corresponding machine-readable
139 | 	source code, which must be distributed under the terms of Sections
140 | 	1 and 2 above on a medium customarily used for software interchange; or,
141 | 
142 | 	b) Accompany it with a written offer, valid for at least three
143 | 	years, to give any third party, for a charge no more than your
144 | 	cost of physically performing source distribution, a complete
145 | 	machine-readable copy of the corresponding source code, to be
146 | 	distributed under the terms of Sections 1 and 2 above on a medium
147 | 	customarily used for software interchange; or,
148 | 
149 | 	c) Accompany it with the information you received as to the offer
150 | 	to distribute corresponding source code.  (This alternative is
151 | 	allowed only for noncommercial distribution and only if you
152 | 	received the program in object code or executable form with such
153 | 	an offer, in accord with Subsection b above.)
154 | 
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it.  For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable.  However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 | 
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 | 
172 |   4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License.  Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 | 
180 |   5. You are not required to accept this License, since you have not
181 | signed it.  However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works.  These actions are
183 | prohibited by law if you do not accept this License.  Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 | 
189 |   6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions.  You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 | 
197 |   7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License.  If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all.  For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 | 
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 | 
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices.  Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 | 
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 | 
229 |   8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded.  In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 | 
237 |   9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time.  Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 | 
242 | Each version is given a distinguishing version number.  If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation.  If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 | 
250 |   10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission.  For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this.  Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 | 
258 | 				NO WARRANTY
259 | 
260 |   11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 | 
270 |   12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 | 
280 | 			 END OF TERMS AND CONDITIONS
281 | 
282 | 		How to Apply These Terms to Your New Programs
283 | 
284 |   If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 | 
288 |   To do so, attach the following notices to the program.  It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 | 
293 | 	<one line to give the program's name and a brief idea of what it does.>
294 | 	Copyright (C) <year>  <name of author>
295 | 
296 | 	This program is free software; you can redistribute it and/or modify
297 | 	it under the terms of the GNU General Public License as published by
298 | 	the Free Software Foundation; either version 2 of the License, or
299 | 	(at your option) any later version.
300 | 
301 | 	This program is distributed in the hope that it will be useful,
302 | 	but WITHOUT ANY WARRANTY; without even the implied warranty of
303 | 	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
304 | 	GNU General Public License for more details.
305 | 
306 | 	You should have received a copy of the GNU General Public License
307 | 	along with this program; if not, write to the Free Software
308 | 	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
309 | 
310 | 
311 | Also add information on how to contact you by electronic and paper mail.
312 | 
313 | If the program is interactive, make it output a short notice like this
314 | when it starts in an interactive mode:
315 | 
316 | 	Gnomovision version 69, Copyright (C) year name of author
317 | 	Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
318 | 	This is free software, and you are welcome to redistribute it
319 | 	under certain conditions; type `show c' for details.
320 | 
321 | The hypothetical commands `show w' and `show c' should show the appropriate
322 | parts of the General Public License.  Of course, the commands you use may
323 | be called something other than `show w' and `show c'; they could even be
324 | mouse-clicks or menu items--whatever suits your program.
325 | 
326 | You should also get your employer (if you work as a programmer) or your
327 | school, if any, to sign a "copyright disclaimer" for the program, if
328 | necessary.  Here is a sample; alter the names:
329 | 
330 |   Yoyodyne, Inc., hereby disclaims all copyright interest in the program
331 |   `Gnomovision' (which makes passes at compilers) written by James Hacker.
332 | 
333 |   <signature of Ty Coon>, 1 April 1989
334 |   Ty Coon, President of Vice
335 | 
336 | This General Public License does not permit incorporating your program into
337 | proprietary programs.  If your program is a subroutine library, you may
338 | consider it more useful to permit linking proprietary applications with the
339 | library.  If this is what you want to do, use the GNU Library General
340 | Public License instead of this License.
341 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # The String Package [![Build Status](https://ci.joomla.org/api/badges/joomla-framework/string/status.svg?ref=refs/heads/3.x-dev)](https://ci.joomla.org/joomla-framework/string)
 2 | 
 3 | [![Latest Stable Version](https://poser.pugx.org/joomla/string/v/stable)](https://packagist.org/packages/joomla/string)
 4 | [![Total Downloads](https://poser.pugx.org/joomla/string/downloads)](https://packagist.org/packages/joomla/string)
 5 | [![Latest Unstable Version](https://poser.pugx.org/joomla/string/v/unstable)](https://packagist.org/packages/joomla/string)
 6 | [![License](https://poser.pugx.org/joomla/string/license)](https://packagist.org/packages/joomla/string)
 7 | 
 8 | ## Installation via Composer
 9 | 
10 | Add `"joomla/string": "~3.0"` to the require block in your composer.json and then run `composer install`.
11 | 
12 | ```json
13 | {
14 | 	"require": {
15 | 		"joomla/string": "~3.0"
16 | 	}
17 | }
18 | ```
19 | 
20 | Alternatively, you can simply run the following from the command line:
21 | 
22 | ```sh
23 | composer require joomla/string "~3.0"
24 | ```
25 | 


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
 1 | # Security Policy
 2 | 
 3 | ## Supported Versions
 4 | 
 5 | These versions are currently being supported with security updates:
 6 | 
 7 | | Version | Supported          |
 8 | | ------- | ------------------ |
 9 | | 3.x.x   | :white_check_mark: |
10 | | 2.0.x   | :white_check_mark: |
11 | | 1.4.x   | :x:                |
12 | | < 1.4   | :x:                |
13 | 
14 | ## Reporting a Vulnerability
15 | 
16 | To report a security issue in the core Joomla! CMS or Framework, or with a joomla.org website, please submit
17 | [the form on our portal](https://developer.joomla.org/security/contact-the-team.html) containing as much detail
18 | as possible about the issue. Additional information about our security team and their processes may be found on
19 | our [Security page](https://developer.joomla.org/security.html).
20 | 
21 | To report an issue in a Joomla! extension, please submit it to the [Vulnerable Extensions List](https://vel.joomla.org/submit-vel).
22 | 
23 | For support with a site which has been attacked, please visit the [Joomla! Forum](https://forum.joomla.org/viewforum.php?f=714).
24 | 


--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "joomla/string",
 3 |     "type": "joomla-package",
 4 |     "description": "Joomla String Package",
 5 |     "keywords": ["joomla", "framework", "string"],
 6 |     "homepage": "https://github.com/joomla-framework/string",
 7 |     "license": "GPL-2.0-or-later",
 8 |     "require": {
 9 |         "php": "^8.1.0",
10 |         "symfony/deprecation-contracts": "^2|^3"
11 |     },
12 |     "require-dev": {
13 |         "doctrine/inflector": "^1.2",
14 |         "joomla/test": "^3.0",
15 |         "phpunit/phpunit": "^9.5.28",
16 |         "squizlabs/php_codesniffer": "^3.7.2",
17 |         "phpstan/phpstan": "^2.0",
18 |         "phpstan/phpstan-deprecation-rules": "^2.0",
19 |         "phan/phan": "^5.4.2"
20 |     },
21 |     "conflict": {
22 |         "doctrine/inflector": "<1.2"
23 |     },
24 |     "suggest": {
25 |         "ext-mbstring": "For improved processing",
26 |         "doctrine/inflector": "To use the string inflector"
27 |     },
28 |     "autoload": {
29 |         "psr-4": {
30 |             "Joomla\\String\\": "src/"
31 |         },
32 |         "files": [
33 |             "src/phputf8/utf8.php",
34 |             "src/phputf8/ord.php",
35 |             "src/phputf8/str_ireplace.php",
36 |             "src/phputf8/str_pad.php",
37 |             "src/phputf8/str_split.php",
38 |             "src/phputf8/strcasecmp.php",
39 |             "src/phputf8/strcspn.php",
40 |             "src/phputf8/stristr.php",
41 |             "src/phputf8/strrev.php",
42 |             "src/phputf8/strspn.php",
43 |             "src/phputf8/trim.php",
44 |             "src/phputf8/ucfirst.php",
45 |             "src/phputf8/ucwords.php",
46 |             "src/phputf8/utils/ascii.php",
47 |             "src/phputf8/utils/validation.php"
48 |         ]
49 |     },
50 |     "autoload-dev": {
51 |         "psr-4": {
52 |             "Joomla\\String\\Tests\\": "Tests/"
53 |         }
54 |     },
55 |     "minimum-stability": "dev",
56 |     "extra": {
57 |         "branch-alias": {
58 |             "dev-2.0-dev": "2.0-dev",
59 |             "dev-3.x-dev": "3.0-dev"
60 |         }
61 |     }
62 | }
63 | 


--------------------------------------------------------------------------------
/src/Inflector.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | /**
  4 |  * Part of the Joomla Framework String Package
  5 |  *
  6 |  * @copyright  Copyright (C) 2005 - 2021 Open Source Matters, Inc. All rights reserved.
  7 |  * @license    GNU General Public License version 2 or later; see LICENSE
  8 |  */
  9 | 
 10 | namespace Joomla\String;
 11 | 
 12 | use Doctrine\Common\Inflector\Inflector as DoctrineInflector;
 13 | 
 14 | /**
 15 |  * Joomla Framework String Inflector Class
 16 |  *
 17 |  * The Inflector transforms words
 18 |  *
 19 |  * @since  1.0
 20 |  */
 21 | class Inflector extends DoctrineInflector
 22 | {
    /**
     * The singleton instance.
     *
     * Created on demand and returned by getInstance() when the caller does not ask for a new instance.
     *
     * @var    Inflector
     * @since  1.0
     * @deprecated  3.0
     */
    private static $instance;

    /**
     * The inflector rules for countability.
     *
     * The 'rules' key holds the list of words regarded as countable. addRule() appends to this list
     * when the rule type is 'countable', and isCountable() searches it.
     *
     * @var    array
     * @since  2.0.0
     */
    private static $countable = [
        'rules' => [
            'id',
            'hits',
            'clicks',
        ],
    ];
 45 | 
 46 |     /**
 47 |      * Adds inflection regex rules to the inflector.
 48 |      *
 49 |      * @param   mixed   $data      A string or an array of strings or regex rules to add.
 50 |      * @param   string  $ruleType  The rule type: singular | plural | countable
 51 |      *
 52 |      * @return  void
 53 |      *
 54 |      * @since   1.0
 55 |      * @throws  \InvalidArgumentException
 56 |      */
 57 |     private function addRule($data, string $ruleType)
 58 |     {
 59 |         if (\is_string($data)) {
 60 |             $data = [$data];
 61 |         } elseif (!\is_array($data)) {
 62 |             throw new \InvalidArgumentException('Invalid inflector rule data.');
 63 |         } elseif (!\in_array($ruleType, ['singular', 'plural', 'countable'])) {
 64 |             throw new \InvalidArgumentException('Unsupported rule type.');
 65 |         }
 66 | 
 67 |         if ($ruleType === 'countable') {
 68 |             foreach ($data as $rule) {
 69 |                 // Ensure a string is pushed.
 70 |                 array_push(self::$countable['rules'], (string) $rule);
 71 |             }
 72 |         } else {
 73 |             static::rules($ruleType, $data);
 74 |         }
 75 |     }
 76 | 
 77 |     /**
 78 |      * Adds a countable word.
 79 |      *
 80 |      * @param   mixed  $data  A string or an array of strings to add.
 81 |      *
 82 |      * @return  $this
 83 |      *
 84 |      * @since   1.0
 85 |      */
 86 |     public function addCountableRule($data)
 87 |     {
 88 |         $this->addRule($data, 'countable');
 89 | 
 90 |         return $this;
 91 |     }
 92 | 
 93 |     /**
 94 |      * Adds a specific singular-plural pair for a word.
 95 |      *
 96 |      * @param   string  $singular  The singular form of the word.
 97 |      * @param   string  $plural    The plural form of the word. If omitted, it is assumed the singular and plural are identical.
 98 |      *
 99 |      * @return  $this
100 |      *
101 |      * @since   1.0
102 |      * @deprecated  3.0  Use Doctrine\Common\Inflector\Inflector::rules() instead.
103 |      */
104 |     public function addWord($singular, $plural = '')
105 |     {
106 |         trigger_deprecation(
107 |             'joomla/string',
108 |             '2.0.0',
109 |             '%s() is deprecated and will be removed in 3.0, use %s::rules() instead.',
110 |             __METHOD__,
111 |             DoctrineInflector::class
112 |         );
113 | 
114 |         if ($plural !== '') {
115 |             static::rules(
116 |                 'plural',
117 |                 [
118 |                     'irregular' => [$plural => $singular],
119 |                 ]
120 |             );
121 | 
122 |             static::rules(
123 |                 'singular',
124 |                 [
125 |                     'irregular' => [$singular => $plural],
126 |                 ]
127 |             );
128 |         } else {
129 |             static::rules(
130 |                 'plural',
131 |                 [
132 |                     'uninflected' => [$singular],
133 |                 ]
134 |             );
135 | 
136 |             static::rules(
137 |                 'singular',
138 |                 [
139 |                     'uninflected' => [$singular],
140 |                 ]
141 |             );
142 |         }
143 | 
144 |         return $this;
145 |     }
146 | 
147 |     /**
148 |      * Adds a pluralisation rule.
149 |      *
150 |      * @param   mixed  $data  A string or an array of regex rules to add.
151 |      *
152 |      * @return  $this
153 |      *
154 |      * @since   1.0
155 |      * @deprecated  3.0  Use Doctrine\Common\Inflector\Inflector::rules() instead.
156 |      */
157 |     public function addPluraliseRule($data)
158 |     {
159 |         trigger_deprecation(
160 |             'joomla/string',
161 |             '2.0.0',
162 |             '%s() is deprecated and will be removed in 3.0, use %s::rules() instead.',
163 |             __METHOD__,
164 |             DoctrineInflector::class
165 |         );
166 | 
167 |         $this->addRule($data, 'plural');
168 | 
169 |         return $this;
170 |     }
171 | 
172 |     /**
173 |      * Adds a singularisation rule.
174 |      *
175 |      * @param   mixed  $data  A string or an array of regex rules to add.
176 |      *
177 |      * @return  $this
178 |      *
179 |      * @since   1.0
180 |      * @deprecated  3.0  Use Doctrine\Common\Inflector\Inflector::rules() instead.
181 |      */
182 |     public function addSingulariseRule($data)
183 |     {
184 |         trigger_deprecation(
185 |             'joomla/string',
186 |             '2.0.0',
187 |             '%s() is deprecated and will be removed in 3.0, use %s::rules() instead.',
188 |             __METHOD__,
189 |             DoctrineInflector::class
190 |         );
191 | 
192 |         $this->addRule($data, 'singular');
193 | 
194 |         return $this;
195 |     }
196 | 
197 |     /**
198 |      * Returns an Inflector instance, creating the shared one on first use.
199 |      *
200 |      * A deprecation notice is emitted on every call; an instance created because $new is true is never stored.
201 |      *
202 |      * @param   boolean  $new  If true (default is false), returns a new instance regardless if one exists. This argument is mainly used for testing.
203 |      *
204 |      * @return  static  The shared instance, or a brand new one when $new is true.
205 |      *
206 |      * @since   1.0
207 |      * @deprecated  3.0  Use static methods without a class instance instead.
208 |      */
209 |     public static function getInstance($new = false)
210 |     {
211 |         $notice = '%s() is deprecated and will be removed in 3.0.';
212 | 
213 |         trigger_deprecation('joomla/string', '2.0.0', $notice, __METHOD__);
214 | 
215 |         if (!$new) {
216 |             // Lazily build the shared instance the first time it is requested.
217 |             if (!\is_object(self::$instance)) {
218 |                 self::$instance = new static();
219 |             }
220 | 
221 |             return self::$instance;
222 |         }
223 | 
224 |         return new static();
225 |     }
226 | 
227 |     /**
228 |      * Checks if a word is countable (strict string match; loose in_array() would e.g. match 0 against any word on PHP 7).
229 |      *
230 |      * @param   string  $word  The string input.
231 |      *
232 |      * @return  boolean  True if word is countable, false otherwise.
233 |      *
234 |      * @since   1.0
235 |      */
236 |     public function isCountable($word)
237 |     {
238 |         return \in_array((string) $word, self::$countable['rules'], true);
239 |     }
240 | 
241 |     /**
242 |      * Tells whether a word is already in its plural form.
243 |      *
244 |      * @param   string  $word  The string input.
245 |      *
246 |      * @return  boolean  True if singularising and then re-pluralising the word gives back the same word, false if not.
247 |      *
248 |      * @since   1.0
249 |      */
250 |     public function isPlural($word)
251 |     {
252 |         return $word === $this->toPlural($this->toSingular($word));
253 |     }
254 | 
255 |     /**
256 |      * Tells whether a word is already in its singular form.
257 |      *
258 |      * @param   string  $word  The string input.
259 |      *
260 |      * @return  boolean  True if singularising the word leaves it unchanged, false if not.
261 |      *
262 |      * @since   1.0
263 |      */
264 |     public function isSingular($word)
265 |     {
266 |         return $word === $this->toSingular($word);
267 |     }
268 | 
269 |     /**
270 |      * Converts a word into its plural form.
271 |      *
272 |      * Emits a deprecation notice, then returns the result of the static pluralize() method.
273 |      *
274 |      * @param   string  $word  The singular word to pluralise.
275 |      *
276 |      * @return  string  The word in plural form.
277 |      *
278 |      * @since   1.0
279 |      * @deprecated  3.0  Use Doctrine\Common\Inflector\Inflector::pluralize() instead.
280 |      */
281 |     public function toPlural($word)
282 |     {
283 |         $notice = '%s() is deprecated and will be removed in 3.0, use %s::pluralize() instead.';
284 | 
285 |         trigger_deprecation('joomla/string', '2.0.0', $notice, __METHOD__, DoctrineInflector::class);
286 | 
287 |         $plural = static::pluralize($word);
288 | 
289 |         return $plural;
290 |     }
291 | 
292 |     /**
293 |      * Converts a word into its singular form.
294 |      *
295 |      * Emits a deprecation notice, then returns the result of the static singularize() method.
296 |      *
297 |      * @param   string  $word  The plural word to singularise.
298 |      *
299 |      * @return  string  The word in singular form.
300 |      *
301 |      * @since   1.0
302 |      * @deprecated  3.0  Use Doctrine\Common\Inflector\Inflector::singularize() instead.
303 |      */
304 |     public function toSingular($word)
305 |     {
306 |         $notice = '%s() is deprecated and will be removed in 3.0, use %s::singularize() instead.';
307 | 
308 |         trigger_deprecation('joomla/string', '2.0.0', $notice, __METHOD__, DoctrineInflector::class);
309 | 
310 |         $singular = static::singularize($word);
311 | 
312 |         return $singular;
313 |     }
314 | }
315 | 


--------------------------------------------------------------------------------
/src/Normalise.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | /**
  4 |  * Part of the Joomla Framework String Package
  5 |  *
  6 |  * @copyright  Copyright (C) 2005 - 2021 Open Source Matters, Inc. All rights reserved.
  7 |  * @license    GNU General Public License version 2 or later; see LICENSE
  8 |  */
  9 | 
 10 | namespace Joomla\String;
 11 | 
 12 | /**
 13 |  * Joomla Framework String Normalise Class: static helpers that convert ASCII strings between naming styles.
 14 |  *
 15 |  * @since  1.0
 16 |  */
 17 | abstract class Normalise
 18 | {
 19 |     /**
 20 |      * Method to convert a string from camel case.
 21 |      *
 22 |      * This method offers two modes. Grouped allows for splitting on groups of uppercase characters as follows:
 23 |      *
 24 |      * "FooBarABCDef"            becomes  array("Foo", "Bar", "ABC", "Def")
 25 |      * "JFooBar"                 becomes  array("J", "Foo", "Bar")
 26 |      * "J001FooBar002"           becomes  array("J001", "Foo", "Bar002")
 27 |      * "abcDef"                  becomes  array("abc", "Def")
 28 |      * "abc_defGhi_Jkl"          becomes  array("abc_def", "Ghi_Jkl")
 29 |      * "ThisIsA_NASAAstronaut"   becomes  array("This", "Is", "A_NASA", "Astronaut")
 30 |      * "JohnFitzgerald_Kennedy"  becomes  array("John", "Fitzgerald_Kennedy")
 31 |      *
 32 |      * Non-grouped will insert a space before each uppercase character, e.g. "FooBar" becomes "Foo Bar".
 33 |      *
 34 |      * @param   string   $input    The string input (ASCII only).
 35 |      * @param   boolean  $grouped  Optionally allows splitting on groups of uppercase characters.
 36 |      *
 37 |      * @return  array|string  An array of the parts if grouped, otherwise the space separated string.
 38 |      *
 39 |      * @since   1.0
 40 |      */
 41 |     public static function fromCamelCase($input, $grouped = false)
 42 |     {
 43 |         return $grouped
 44 |             ? preg_split('/(?<=[^A-Z_])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][^A-Z_])/x', $input)
 45 |             : trim(preg_replace('#([A-Z])#', ' $1', $input));
 46 |     }
 47 | 
 48 |     /**
 49 |      * Method to convert a string into camel case (the first letter of every word is capitalised, e.g. "foo bar" becomes "FooBar").
 50 |      *
 51 |      * @param   string  $input  The string input (ASCII only).
 52 |      *
 53 |      * @return  string  The camel case string.
 54 |      *
 55 |      * @since   1.0
 56 |      */
 57 |     public static function toCamelCase($input)
 58 |     {
 59 |         // Capitalise the first letter of each word and then remove the spaces.
 60 |         $input = static::toSpaceSeparated($input);
 61 |         $input = ucwords($input);
 62 |         $input = str_replace(' ', '', $input);
 63 | 
 64 |         return $input;
 65 |     }
 66 | 
 67 |     /**
 68 |      * Method to convert a string into dash separated form.
 69 |      *
 70 |      * @param   string  $input  The string input (ASCII only).
 71 |      *
 72 |      * @return  string  The dash separated string.
 73 |      *
 74 |      * @since   1.0
 75 |      */
 76 |     public static function toDashSeparated($input)
 77 |     {
 78 |         // Collapse each run of spaces, dashes and underscores into a single dash.
 79 |         return preg_replace('#[ \-_]+#', '-', $input);
 80 |     }
 81 | 
 82 |     /**
 83 |      * Method to convert a string into space separated form.
 84 |      *
 85 |      * @param   string  $input  The string input (ASCII only).
 86 |      *
 87 |      * @return  string  The space separated string.
 88 |      *
 89 |      * @since   1.0
 90 |      */
 91 |     public static function toSpaceSeparated($input)
 92 |     {
 93 |         // Collapse each run of spaces, dashes and underscores into a single space.
 94 |         return preg_replace('#[ \-_]+#', ' ', $input);
 95 |     }
 96 | 
 97 |     /**
 98 |      * Method to convert a string into underscore separated form.
 99 |      *
100 |      * @param   string  $input  The string input (ASCII only).
101 |      *
102 |      * @return  string  The underscore separated string.
103 |      *
104 |      * @since   1.0
105 |      */
106 |     public static function toUnderscoreSeparated($input)
107 |     {
108 |         // Collapse each run of spaces, dashes and underscores into a single underscore.
109 |         return preg_replace('#[ \-_]+#', '_', $input);
110 |     }
111 | 
112 |     /**
113 |      * Method to convert a string into variable form (camel case with leading digits removed and a lower case first character).
114 |      *
115 |      * @param   string  $input  The string input (ASCII only).
116 |      *
117 |      * @return  string  The variable string.
118 |      *
119 |      * @since   1.0
120 |      */
121 |     public static function toVariable($input)
122 |     {
123 |         // Remove spaces, dashes and underscores by converting to camel case.
124 |         $input = static::toCamelCase($input);
125 | 
126 |         // Remove leading digits.
127 |         $input = preg_replace('#^[0-9]+#', '', $input);
128 | 
129 |         // Lowercase the first character.
130 |         $input = lcfirst($input);
131 | 
132 |         return $input;
133 |     }
134 | 
135 |     /**
136 |      * Method to convert a string into key form (lower case, underscore separated).
137 |      *
138 |      * @param   string  $input  The string input (ASCII only).
139 |      *
140 |      * @return  string  The key string.
141 |      *
142 |      * @since   1.0
143 |      */
144 |     public static function toKey($input)
145 |     {
146 |         // Convert spaces and dashes to underscores, then convert to lower case.
147 |         return strtolower(static::toUnderscoreSeparated($input));
148 |     }
149 | }
150 | 


--------------------------------------------------------------------------------
/src/StringHelper.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | /**
  4 |  * Part of the Joomla Framework String Package
  5 |  *
  6 |  * @copyright  Copyright (C) 2005 - 2021 Open Source Matters, Inc. All rights reserved.
  7 |  * @license    GNU General Public License version 2 or later; see LICENSE
  8 |  */
  9 | 
 10 | namespace Joomla\String;
 11 | 
 12 | // PHP mbstring and iconv local configuration: make UTF-8 the default charset (any ini_set() error is suppressed by @).
 13 | @ini_set('default_charset', 'UTF-8');
 14 | 
 15 | /**
 16 |  * String handling class for UTF-8 data wrapping the phputf8 library. All functions assume the validity of UTF-8 strings.
 17 |  *
 18 |  * @since  1.3.0
 19 |  */
 20 | abstract class StringHelper
 21 | {
 22 |     /**
 23 |      * Increment styles keyed by style name; each is [regex or [search, replace] regexes, format or [new, existing] sprintf formats].
 24 |      *
 25 |      * @var    array
 26 |      * @since  1.3.0
 27 |      */
 28 |     protected static $incrementStyles = [
 29 |         'dash' => [
 30 |             '#-(\d+)$#',
 31 |             '-%d',
 32 |         ],
 33 |         'default' => [
 34 |             ['#\((\d+)\)$#', '#\(\d+\)$#'],
 35 |             [' (%d)', '(%d)'],
 36 |         ],
 37 |     ];
 38 | 
 39 |     /**
 40 |      * Increments a trailing number in a string.
 41 |      *
 42 |      * Used to easily create distinct labels when copying objects. The method has the following styles:
 43 |      *
 44 |      * default: "Label" becomes "Label (2)"
 45 |      * dash:    "Label" becomes "Label-2"
 46 |      *
 47 |      * A string that already ends in a counter of the chosen style has that counter replaced rather
 48 |      * than a second one appended, e.g. "Label (2)" becomes "Label (3)" and "Label-2" becomes "Label-3".
 49 |      * An unrecognised style name is handled as "default".
 50 |      *
 51 |      * @param   string       $string  The source string.
 52 |      * @param   string|null  $style   The style (default|dash).
 53 |      * @param   integer      $n       If supplied, this number is used for the copy, otherwise it is the 'next' number.
 54 |      *
 55 |      * @return  string  The incremented string.
 56 |      *
 57 |      * @since   1.3.0
 58 |      */
 59 |     public static function increment($string, $style = 'default', $n = 0)
 60 |     {
 61 |         // Unknown styles fall back to the default one.
 62 |         $spec = static::$incrementStyles[$style] ?? static::$incrementStyles['default'];
 63 | 
 64 |         // One regex serves for both matching and replacing, unless a [search, replace] pair is given.
 65 |         $rxSearch  = \is_array($spec[0]) ? $spec[0][0] : $spec[0];
 66 |         $rxReplace = \is_array($spec[0]) ? $spec[0][1] : $spec[0];
 67 | 
 68 |         // Likewise one sprintf format serves both cases, unless a [new, existing] pair is given.
 69 |         $newFormat = \is_array($spec[1]) ? $spec[1][0] : $spec[1];
 70 |         $oldFormat = \is_array($spec[1]) ? $spec[1][1] : $spec[1];
 71 | 
 72 |         $hasCounter = preg_match($rxSearch, $string, $matches);
 73 | 
 74 |         // No trailing counter yet: append one, starting at 2 unless a number was supplied.
 75 |         if (!$hasCounter) {
 76 |             $suffix = sprintf($newFormat, empty($n) ? 2 : $n);
 77 | 
 78 |             return $string . $suffix;
 79 |         }
 80 | 
 81 |         // Otherwise rewrite the existing counter with the supplied number, or with its successor.
 82 |         $next = empty($n) ? ($matches[1] + 1) : $n;
 83 | 
 84 |         return preg_replace($rxReplace, sprintf($oldFormat, $next), $string);
 85 |     }
 86 | 
 87 |     /**
 88 |      * Tests whether a string contains only 7bit ASCII bytes.
 89 |      * The check itself is delegated to utf8_is_ascii().
 90 |      *
 91 |      * You might use this to conditionally check whether a string needs handling as UTF-8 or not, potentially offering performance
 92 |      * benefits by using the native PHP equivalent if it's just ASCII, e.g.:
 93 |      *
 94 |      * <code>
 95 |      * if (StringHelper::is_ascii($someString)) {
 96 |      *     // It's just ASCII - use the native PHP version
 97 |      *     $someString = strtolower($someString);
 98 |      * } else {
 99 |      *     $someString = StringHelper::strtolower($someString);
100 |      * }
101 |      * </code>
102 |      *
103 |      * @param   string  $str  The string to test.
104 |      *
105 |      * @return  boolean True if the string is all ASCII
106 |      *
107 |      * @since   1.3.0
108 |      */
109 |     public static function is_ascii($str)
110 |     {
111 |         $onlyAscii = utf8_is_ascii($str);
112 | 
113 |         return $onlyAscii;
114 |     }
115 | 
116 |     /**
117 |      * UTF-8 aware alternative to ord(): returns the Unicode ordinal for a character.
118 |      *
119 |      * @param   string  $chr  UTF-8 encoded character
120 |      *
121 |      * @return  integer Unicode ordinal for the character
122 |      *
123 |      * @link    https://www.php.net/ord
124 |      * @since   1.4.0
125 |      */
126 |     public static function ord($chr)
127 |     {
128 |         $ordinal = utf8_ord($chr);
129 | 
130 |         return $ordinal;
131 |     }
132 | 
133 |     /**
134 |      * UTF-8 aware alternative to strpos()
135 |      *
136 |      * Find position of first occurrence of a string.
137 |      * When $offset is false (the default) no offset argument is forwarded to utf8_strpos().
138 |      *
139 |      * @param   string                $str     String being examined
140 |      * @param   string                $search  String being searched for
141 |      * @param   integer|null|boolean  $offset  Optional, specifies the position from which the search should be performed
142 |      *
143 |      * @return  integer|boolean  Number of characters before the first match or FALSE on failure
144 |      *
145 |      * @link    https://www.php.net/strpos
146 |      * @since   1.3.0
147 |      */
148 |     public static function strpos($str, $search, $offset = false)
149 |     {
150 |         // Only a caller-supplied offset is passed on.
151 |         return $offset === false
152 |             ? utf8_strpos($str, $search)
153 |             : utf8_strpos($str, $search, $offset);
154 |     }
155 | 
156 |     /**
157 |      * UTF-8 aware alternative to strrpos(): finds the position of the last occurrence of a string.
158 |      *
159 |      * @param   string   $str     String being examined.
160 |      * @param   string   $search  String being searched for.
161 |      * @param   integer  $offset  Offset from the left of the string.
162 |      *
163 |      * @return  integer|boolean  Number of characters before the last match or false on failure
164 |      *
165 |      * @link    https://www.php.net/strrpos
166 |      * @since   1.3.0
167 |      */
168 |     public static function strrpos($str, $search, $offset = 0)
169 |     {
170 |         $position = utf8_strrpos($str, $search, $offset);
171 | 
172 |         return $position;
173 |     }
174 | 
175 |     /**
176 |      * UTF-8 aware alternative to substr()
177 |      *
178 |      * Return part of a string given character offset (and optionally length).
179 |      * When $length is false (the default) no length argument is forwarded to utf8_substr().
180 |      *
181 |      * @param   string                $str     String being processed
182 |      * @param   integer               $offset  Number of UTF-8 characters offset (from left)
183 |      * @param   integer|null|boolean  $length  Optional length in UTF-8 characters from offset
184 |      *
185 |      * @return  string|boolean
186 |      *
187 |      * @link    https://www.php.net/substr
188 |      * @since   1.3.0
189 |      */
190 |     public static function substr($str, $offset, $length = false)
191 |     {
192 |         // Only a caller-supplied length is passed on.
193 |         return $length === false
194 |             ? utf8_substr($str, $offset)
195 |             : utf8_substr($str, $offset, $length);
196 |     }
197 | 
198 |     /**
199 |      * UTF-8 aware alternative to strtolower(): makes a string lowercase.
200 |      *
201 |      * Note: The concept of a character's "case" only exists in some alphabets such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
202 |      * not exist in the Chinese alphabet, for example. See Unicode Standard Annex #21: Case Mappings
203 |      *
204 |      * @param   string  $str  String being processed
205 |      *
206 |      * @return  string|boolean  Either string in lowercase or FALSE if UTF-8 invalid
207 |      *
208 |      * @link    https://www.php.net/strtolower
209 |      * @since   1.3.0
210 |      */
211 |     public static function strtolower($str)
212 |     {
213 |         $lowered = utf8_strtolower($str);
214 | 
215 |         return $lowered;
216 |     }
217 | 
218 |     /**
219 |      * UTF-8 aware alternative to strtoupper(): makes a string uppercase.
220 |      *
221 |      * Note: The concept of a character's "case" only exists in some alphabets such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
222 |      * not exist in the Chinese alphabet, for example. See Unicode Standard Annex #21: Case Mappings
223 |      *
224 |      * @param   string  $str  String being processed
225 |      *
226 |      * @return  string|boolean  Either string in uppercase or FALSE if UTF-8 invalid
227 |      *
228 |      * @link    https://www.php.net/strtoupper
229 |      * @since   1.3.0
230 |      */
231 |     public static function strtoupper($str)
232 |     {
233 |         $uppered = utf8_strtoupper($str);
234 | 
235 |         return $uppered;
236 |     }
237 | 
238 |     /**
239 |      * UTF-8 aware alternative to strlen(): returns the number of characters in the string (NOT THE NUMBER OF BYTES).
240 |      *
241 |      * @param   string  $str  UTF-8 string.
242 |      *
243 |      * @return  integer  Number of UTF-8 characters in string.
244 |      *
245 |      * @link    https://www.php.net/strlen
246 |      * @since   1.3.0
247 |      */
248 |     public static function strlen($str)
249 |     {
250 |         $characters = utf8_strlen($str);
251 | 
252 |         return $characters;
253 |     }
254 | 
255 |     /**
256 |      * UTF-8 aware alternative to str_ireplace(): case-insensitive version of str_replace()
257 |      * The work is delegated to utf8_ireplace().
258 |      *
259 |      * @param   string|string[]       $search   String(s) to search for
260 |      * @param   string|string[]       $replace  Replacement string(s)
261 |      * @param   string                $str      String to search in
262 |      * @param   integer|null|boolean  $count    Optional count value handed to utf8_ireplace(); null or false means it is omitted
263 |      *
264 |      * @return  string  UTF-8 String
265 |      *
266 |      * @link    https://www.php.net/str_ireplace
267 |      * @since   1.3.0
268 |      */
269 |     public static function str_ireplace($search, $replace, $str, $count = null)
270 |     {
271 |         // null (the declared default) and false (the legacy sentinel) both mean "no count supplied".
272 |         if ($count === null || $count === false) {
273 |             return utf8_ireplace($search, $replace, $str);
274 |         }
275 | 
276 |         return utf8_ireplace($search, $replace, $str, $count);
277 |     }
278 | 
279 |     /**
280 |      * UTF-8 aware alternative to str_pad(): pad a string to a certain length with another string.
281 |      * $padStr may contain multi-byte characters.
282 |      *
283 |      * @param   string   $input   The input string.
284 |      * @param   integer  $length  If the value is negative, less than, or equal to the length of the input string, no padding takes place.
285 |      * @param   string   $padStr  The string may be truncated if the number of padding characters can't be evenly divided by the string's length.
286 |      * @param   integer  $type    The type of padding to apply
287 |      *
288 |      * @return  string
289 |      *
290 |      * @link    https://www.php.net/str_pad
291 |      * @since   1.4.0
292 |      */
293 |     public static function str_pad($input, $length, $padStr = ' ', $type = STR_PAD_RIGHT)
294 |     {
295 |         $padded = utf8_str_pad($input, $length, $padStr, $type);
296 | 
297 |         return $padded;
298 |     }
299 | 
300 |     /**
301 |      * UTF-8 aware alternative to str_split(): convert a string to an array.
302 |      *
303 |      * @param   string   $str       UTF-8 encoded string to process
304 |      * @param   integer  $splitLen  Number of characters to split string by
305 |      *
306 |      * @return  array|string|boolean
307 |      *
308 |      * @link    https://www.php.net/str_split
309 |      * @since   1.3.0
310 |      */
311 |     public static function str_split($str, $splitLen = 1)
312 |     {
313 |         $pieces = utf8_str_split($str, $splitLen);
314 | 
315 |         return $pieces;
316 |     }
317 | 
318 |     /**
319 |      * UTF-8/LOCALE aware alternative to strcasecmp()
320 |      *
321 |      * A case insensitive string comparison.
322 |      *
323 |      * With $locale === false the comparison is delegated to utf8_strcasecmp(). Otherwise LC_COLLATE is switched to $locale
324 |      * (the previous setting is not restored afterwards) and the lower-cased strings are compared with strcoll(). If the
325 |      * resulting locale name does not contain "UTF-8", contains "_" and ends in ".<digits>", both strings are first
326 |      * transcoded from UTF-8 to the code page "CP<digits>".
327 |      *
328 |      * @param   string          $str1    string 1 to compare
329 |      * @param   string          $str2    string 2 to compare
330 |      * @param   string|boolean  $locale  The locale used by strcoll or false to use classical comparison
331 |      *
332 |      * @return  integer   Either < 0 if str1 is less than str2; > 0 if str1 is greater than str2, and 0 if they are equal.
333 |      *
334 |      * @link    https://www.php.net/strcasecmp
335 |      * @link    https://www.php.net/strcoll
336 |      * @link    https://www.php.net/setlocale
337 |      * @since   1.3.0
338 |      */
339 |     public static function strcasecmp($str1, $str2, $locale = false)
340 |     {
341 |         // Without a locale, use the plain UTF-8 case-insensitive comparison.
342 |         if ($locale === false) {
343 |             return utf8_strcasecmp($str1, $str2);
344 |         }
345 | 
346 |         // Switch the collation locale; if that fails, carry on under the name of the one already active.
347 |         $previous = setlocale(LC_COLLATE, 0);
348 |         $locale   = setlocale(LC_COLLATE, $locale) ?: $previous;
349 | 
350 |         $lower1 = utf8_strtolower($str1);
351 |         $lower2 = utf8_strtolower($str2);
352 | 
353 |         // A non UTF-8 locale name ending in ".<digits>" denotes a code page, so recode before collating.
354 |         if (!stristr($locale, 'UTF-8') && stristr($locale, '_') && preg_match('~\.(\d+)$~', $locale, $m)) {
355 |             $encoding = 'CP' . $m[1];
356 | 
357 |             return strcoll(
358 |                 static::transcode($lower1, 'UTF-8', $encoding),
359 |                 static::transcode($lower2, 'UTF-8', $encoding)
360 |             );
361 |         }
362 | 
363 |         // UTF-8 locales, and names that cannot be mapped to a code page, are collated without recoding.
364 |         return strcoll($lower1, $lower2);
365 |     }
366 | 
367 |     /**
368 |      * UTF-8/LOCALE aware alternative to strcmp()
369 |      *
370 |      * A case sensitive string comparison.
371 |      *
372 |      * When $locale is empty the result is that of the native strcmp(). Otherwise LC_COLLATE is switched to $locale
373 |      * (the previous setting is not restored afterwards) and the strings are compared with strcoll(). If the
374 |      * resulting locale name does not contain "UTF-8", contains "_" and ends in ".<digits>", both strings are first
375 |      * transcoded from UTF-8 to the code page "CP<digits>".
376 |      *
377 |      * @param   string  $str1    string 1 to compare
378 |      * @param   string  $str2    string 2 to compare
379 |      * @param   mixed   $locale  The locale used by strcoll or false to use classical comparison
380 |      *
381 |      * @return  integer  Either < 0 if str1 is less than str2; > 0 if str1 is greater than str2, and 0 if they are equal.
382 |      *
383 |      * @link    https://www.php.net/strcmp
384 |      * @link    https://www.php.net/strcoll
385 |      * @link    https://www.php.net/setlocale
386 |      * @since   1.3.0
387 |      */
388 |     public static function strcmp($str1, $str2, $locale = false)
389 |     {
390 |         // No (or an empty) locale: plain strcmp() comparison.
391 |         if (!$locale) {
392 |             return strcmp($str1, $str2);
393 |         }
394 | 
395 |         // Switch the collation locale; if that fails, carry on under the name of the one already active.
396 |         $previous = setlocale(LC_COLLATE, 0);
397 |         $locale   = setlocale(LC_COLLATE, $locale) ?: $previous;
398 | 
399 |         // A non UTF-8 locale name ending in ".<digits>" denotes a code page, so recode before collating.
400 |         if (!stristr($locale, 'UTF-8') && stristr($locale, '_') && preg_match('~\.(\d+)$~', $locale, $m)) {
401 |             $encoding = 'CP' . $m[1];
402 | 
403 |             return strcoll(
404 |                 static::transcode($str1, 'UTF-8', $encoding),
405 |                 static::transcode($str2, 'UTF-8', $encoding)
406 |             );
407 |         }
408 | 
409 |         // UTF-8 locales, and names that cannot be mapped to a code page, are collated without recoding.
410 |         return strcoll($str1, $str2);
411 |     }
412 | 
413 |     /**
414 |      * UTF-8 aware alternative to strcspn(): find length of initial segment not matching mask.
415 |      * Trailing optional arguments that were not supplied are not forwarded to utf8_strcspn().
416 |      *
417 |      * @param   string                $str     The string to process
418 |      * @param   string                $mask    The mask
419 |      * @param   integer|boolean|null  $start   Optional starting character position (in characters); null or false when not supplied
420 |      * @param   integer|boolean|null  $length  Optional length; null or false when not supplied
421 |      *
422 |      * @return  integer  The length of the initial segment of $str which does not contain any of the characters in $mask
423 |      *
424 |      * @link    https://www.php.net/strcspn
425 |      * @since   1.3.0
426 |      */
427 |     public static function strcspn($str, $mask, $start = null, $length = null)
428 |     {
429 |         // null (the declared default) and false (the legacy sentinel) both mean "not supplied".
430 |         if ($length !== null && $length !== false) {
431 |             return utf8_strcspn($str, $mask, $start, $length);
432 |         }
433 | 
434 |         if ($start !== null && $start !== false) {
435 |             return utf8_strcspn($str, $mask, $start);
436 |         }
437 | 
438 |         return utf8_strcspn($str, $mask);
439 |     }
440 | 
441 |     /**
442 |      * UTF-8 aware alternative to stristr(): returns all of haystack from the first occurrence of needle to the end.
443 |      * Needle and haystack are examined in a case-insensitive manner.
444 |      *
445 |      * @param   string  $str     The haystack
446 |      * @param   string  $search  The needle
447 |      *
448 |      * @return  string|boolean
449 |      *
450 |      * @link    https://www.php.net/stristr
451 |      * @since   1.3.0
452 |      */
453 |     public static function stristr($str, $search)
454 |     {
455 |         $remainder = utf8_stristr($str, $search);
456 | 
457 |         return $remainder;
458 |     }
459 | 
460 |     /**
461 |      * UTF-8 aware alternative to strrev(): reverse a string.
462 |      *
463 |      * @param   string  $str  String to be reversed
464 |      *
465 |      * @return  string   The string in reverse character order
466 |      *
467 |      * @link    https://www.php.net/strrev
468 |      * @since   1.3.0
469 |      */
470 |     public static function strrev($str)
471 |     {
472 |         $reversed = utf8_strrev($str);
473 | 
474 |         return $reversed;
475 |     }
476 | 
477 |     /**
478 |      * UTF-8 aware alternative to strspn(): find length of initial segment matching mask.
479 |      * Trailing optional arguments left at null are not forwarded to utf8_strspn().
480 |      *
481 |      * @param   string        $str     The haystack
482 |      * @param   string        $mask    The mask
483 |      * @param   integer|null  $start   Start optional
484 |      * @param   integer|null  $length  Length optional
485 |      *
486 |      * @return  integer
487 |      *
488 |      * @link    https://www.php.net/strspn
489 |      * @since   1.3.0
490 |      */
491 |     public static function strspn($str, $mask, $start = null, $length = null)
492 |     {
493 |         // Forward only as many optional arguments as are needed to carry the supplied ones.
494 |         if ($length !== null) {
495 |             return utf8_strspn($str, $mask, $start, $length);
496 |         }
497 | 
498 |         if ($start !== null) {
499 |             return utf8_strspn($str, $mask, $start);
500 |         }
501 | 
502 |         return utf8_strspn($str, $mask);
503 |     }
504 | 
505 |     /**
506 |      * UTF-8 aware alternative to substr_replace()
507 |      *
508 |      * Replace text within a portion of a string.
509 |      *
510 |      * @param   string                $str     The haystack
511 |      * @param   string                $repl    The replacement string
512 |      * @param   integer               $start   Start
513 |      * @param   integer|boolean|null  $length  Length (optional); null or false means no length is forwarded
514 |      *
515 |      * @return  string
516 |      *
517 |      * @link    https://www.php.net/substr_replace
518 |      * @since   1.3.0
519 |      */
520 |     public static function substr_replace($str, $repl, $start, $length = null)
521 |     {
522 |         // null (the declared default) and false (the legacy sentinel) both mean "no length supplied".
523 |         if ($length === null || $length === false) {
524 |             return utf8_substr_replace($str, $repl, $start);
525 |         }
526 | 
527 |         return utf8_substr_replace($str, $repl, $start, $length);
528 |     }
529 | 
530 |     /**
531 |      * UTF-8 aware replacement for ltrim(): strips whitespace (or other characters) from the beginning of a string. You only need to use
532 |      * this if you are supplying the charlist optional arg and it contains UTF-8 characters. Otherwise ltrim will work normally on a UTF-8 string.
533 |      * An empty charlist (e.g. '' or null) returns the string untouched; '0' is treated as a regular one-character charlist.
534 |      *
535 |      * @param   string          $str       The string to be trimmed
536 |      * @param   string|boolean  $charlist  The optional charlist of additional characters to trim
537 |      *
538 |      * @return  string  The trimmed string
539 |      *
540 |      * @link    https://www.php.net/ltrim
541 |      * @since   1.3.0
542 |      */
543 |     public static function ltrim($str, $charlist = false)
544 |     {
545 |         // '0' is a valid charlist although empty('0') is true, so it must not count as "nothing to strip".
546 |         if (empty($charlist) && $charlist !== false && $charlist !== '0') {
547 |             return $str;
548 |         }
549 | 
550 |         if ($charlist === false) {
551 |             return utf8_ltrim($str);
552 |         }
553 | 
554 |         return utf8_ltrim($str, $charlist);
555 |     }
556 | 
557 |     /**
558 |      * UTF-8 aware replacement for rtrim(): strips whitespace (or other characters) from the end of a string. You only need to use
559 |      * this if you are supplying the charlist optional arg and it contains UTF-8 characters. Otherwise rtrim will work normally on a UTF-8 string.
560 |      * An empty charlist (e.g. '' or null) returns the string untouched; '0' is treated as a regular one-character charlist.
561 |      *
562 |      * @param   string          $str       The string to be trimmed
563 |      * @param   string|boolean  $charlist  The optional charlist of additional characters to trim
564 |      *
565 |      * @return  string  The trimmed string
566 |      *
567 |      * @link    https://www.php.net/rtrim
568 |      * @since   1.3.0
569 |      */
570 |     public static function rtrim($str, $charlist = false)
571 |     {
572 |         // '0' is a valid charlist although empty('0') is true, so it must not count as "nothing to strip".
573 |         if (empty($charlist) && $charlist !== false && $charlist !== '0') {
574 |             return $str;
575 |         }
576 | 
577 |         if ($charlist === false) {
578 |             return utf8_rtrim($str);
579 |         }
580 | 
581 |         return utf8_rtrim($str, $charlist);
582 |     }
583 | 
584 |     /**
585 |      * UTF-8 aware replacement for trim(): strips whitespace (or other characters) from the beginning and end of a string. You only need to use
586 |      * this if you are supplying the charlist optional arg and it contains UTF-8 characters. Otherwise trim will work normally on a UTF-8 string.
587 |      * An empty charlist (e.g. '' or null) returns the string untouched; '0' is treated as a regular one-character charlist.
588 |      *
589 |      * @param   string          $str       The string to be trimmed
590 |      * @param   string|boolean  $charlist  The optional charlist of additional characters to trim
591 |      *
592 |      * @return  string  The trimmed string
593 |      *
594 |      * @link    https://www.php.net/trim
595 |      * @since   1.3.0
596 |      */
597 |     public static function trim($str, $charlist = false)
598 |     {
599 |         // '0' is a valid charlist although empty('0') is true, so it must not count as "nothing to strip".
600 |         if (empty($charlist) && $charlist !== false && $charlist !== '0') {
601 |             return $str;
602 |         }
603 | 
604 |         if ($charlist === false) {
605 |             return utf8_trim($str);
606 |         }
607 | 
608 |         return utf8_trim($str, $charlist);
609 |     }
610 | 
611 |     /**
612 |      * UTF-8 aware alternative to ucfirst()
613 |      *
614 |      * Uppercases the first character of the string or, when a delimiter is given, the first character of every delimited word.
615 |      *
616 |      * @param   string       $str           The string to process
617 |      * @param   string|null  $delimiter     The delimiter separating the words (null: treat the string as a single word)
618 |      * @param   string|null  $newDelimiter  The delimiter used to re-join the words (null: reuse $delimiter)
619 |      *
620 |      * @return  string  With a null $delimiter, the string with its first character in upper case (if applicable);
621 |      *                  otherwise the string is split on $delimiter, ucfirst is applied to each word, and the
622 |      *                  words are joined again with the new delimiter
623 |      *
624 |      * @link    https://www.php.net/ucfirst
625 |      * @since   1.3.0
626 |      */
627 |     public static function ucfirst($str, $delimiter = null, $newDelimiter = null)
628 |     {
629 |         // Without a delimiter the whole string is handled as one word.
630 |         if ($delimiter === null) {
631 |             return utf8_ucfirst($str);
632 |         }
633 | 
634 |         $glue  = $newDelimiter === null ? $delimiter : $newDelimiter;
635 |         $words = array_map('utf8_ucfirst', explode($delimiter, $str));
636 | 
637 |         return implode($glue, $words);
638 |     }
639 | 
640 |     /**
641 |      * UTF-8 aware alternative to ucwords()
642 |      *
643 |      * Uppercases the first character of each word in a string by delegating to utf8_ucwords().
644 |      *
645 |      * @param   string  $str  The string to process
646 |      *
647 |      * @return  string  The string with the first character of each word in upper case
648 |      *
649 |      * @link    https://www.php.net/ucwords
650 |      * @since   1.3.0
651 |      */
652 |     public static function ucwords($str)
653 |     {
654 |         return utf8_ucwords($str);
655 |     }
656 | 
657 |     /**
658 |      * Transcode a string from one character encoding to another using iconv.
659 |      *
660 |      * @param   string  $source        The string to transcode.
661 |      * @param   string  $fromEncoding  The source encoding.
662 |      * @param   string  $toEncoding    The target encoding.
663 |      *
664 |      * @return  string|false|null  The transcoded string, false if iconv() fails, or null if the source was not a string.
665 |      *
666 |      * @link    https://bugs.php.net/bug.php?id=48147
667 |      * @since   1.3.0
668 |      */
669 |     public static function transcode($source, $fromEncoding, $toEncoding)
670 |     {
671 |         if (!\is_string($source)) {
672 |             return null; // Documented contract: only strings are transcoded.
673 |         }
674 | 
675 |         if (ICONV_IMPL === 'glibc') {
676 |             return @iconv($fromEncoding, $toEncoding . '//TRANSLIT,IGNORE', $source); // flag notation used for glibc
677 |         }
678 | 
679 |         return iconv($fromEncoding, $toEncoding . '//IGNORE//TRANSLIT', $source); // libiconv and any other implementation
680 |     }
681 | 
682 |     /**
683 |      * Tests whether a string is valid UTF-8 and supported by the Unicode standard, delegating the check to utf8_is_valid().
684 |      *
685 |      * Note: this function has been modified to simply return true or false.
686 |      *
687 |      * @param   string  $str  UTF-8 encoded string.
688 |      *
689 |      * @return  boolean  true if valid, false otherwise
690 |      *
691 |      * @author  <hsivonen@iki.fi>
692 |      * @link    https://hsivonen.fi/php-utf8/
693 |      * @see     compliant
694 |      * @since   1.3.0
695 |      */
696 |     public static function valid($str)
697 |     {
698 |         return utf8_is_valid($str);
699 |     }
700 | 
701 |     /**
702 |      * Tests whether a string complies as UTF-8.
703 |      *
704 |      * This will be much faster than StringHelper::valid() but will pass five and six octet UTF-8 sequences, which are not supported by Unicode and
705 |      * so cannot be displayed correctly in a browser. In other words it is not as strict as StringHelper::valid() but it's faster. If you use it to
706 |      * validate user input, you place yourself at the risk that attackers will be able to inject 5 and 6 byte sequences (which may or may not be a
707 |      * significant risk, depending on what you are doing). The check itself is delegated to utf8_compliant().
708 |      *
709 |      * @param   string  $str  UTF-8 string to check
710 |      *
711 |      * @return  boolean  TRUE if string is valid UTF-8
712 |      *
713 |      * @see     StringHelper::valid
714 |      * @link    https://www.php.net/manual/en/reference.pcre.pattern.modifiers.php#54805
715 |      * @since   1.3.0
716 |      */
717 |     public static function compliant($str)
718 |     {
719 |         return utf8_compliant($str);
720 |     }
721 | 
722 |     /**
723 |      * Replaces \uXXXX escape sequences in a string with the UTF-8 characters they denote.
724 |      *
725 |      * @param   string  $str  The string containing the escape sequences
726 |      *
727 |      * @return  string  The string with the sequences replaced by UTF-8 characters (unchanged when mbstring is not loaded)
728 |      *
729 |      * @since   1.3.0
730 |      */
731 |     public static function unicode_to_utf8($str)
732 |     {
733 |         // The conversion relies on mbstring; without it the input is handed back as is.
734 |         if (!\extension_loaded('mbstring')) {
735 |             return $str;
736 |         }
737 | 
738 |         $decode = static function ($hit) {
739 |             // $hit[1] holds the four hex digits of one escape; they are packed to bytes and read as UCS-2BE.
740 |             return mb_convert_encoding(pack('H*', $hit[1]), 'UTF-8', 'UCS-2BE');
741 |         };
742 | 
743 |         return preg_replace_callback('/\\\\u([0-9a-fA-F]{4})/', $decode, $str);
744 |     }
745 | 
746 |     /**
747 |      * Converts \uXXXX escape sequences, read as UTF-16BE code units, to UTF-8 characters (a surrogate pair becomes one character).
748 |      *
749 |      * @param   string  $str  The string containing the escape sequences
750 |      *
751 |      * @return  string  The string with the sequences replaced by UTF-8 characters (unchanged when mbstring is not loaded)
752 |      *
753 |      * @since   1.3.0
754 |      */
755 |     public static function unicode_to_utf16($str)
756 |     {
757 |         if (!\extension_loaded('mbstring')) {
758 |             return $str;
759 |         }
760 | 
761 |         // A high surrogate directly followed by a low surrogate is matched as one unit so the pair decodes to a single character.
762 |         $pattern = '/\\\\u[dD][89abAB][0-9a-fA-F]{2}\\\\u[dD][c-fC-F][0-9a-fA-F]{2}|\\\\u[0-9a-fA-F]{4}/';
763 | 
764 |         return preg_replace_callback($pattern, static function ($match) {
765 |             // Strip the "\u" prefixes, leaving 4 hex digits (one code unit) or 8 (a surrogate pair).
766 |             return mb_convert_encoding(pack('H*', str_replace('\\u', '', $match[0])), 'UTF-8', 'UTF-16BE');
767 |         }, $str);
768 |     }
769 | }
770 | 


--------------------------------------------------------------------------------
/src/phputf8/LICENSE:
--------------------------------------------------------------------------------
  1 | 		  GNU LESSER GENERAL PUBLIC LICENSE
  2 | 		       Version 2.1, February 1999
  3 | 
  4 |  Copyright (C) 1991, 1999 Free Software Foundation, Inc.
  5 |      51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  6 |  Everyone is permitted to copy and distribute verbatim copies
  7 |  of this license document, but changing it is not allowed.
  8 | 
  9 | [This is the first released version of the Lesser GPL.  It also counts
 10 |  as the successor of the GNU Library Public License, version 2, hence
 11 |  the version number 2.1.]
 12 | 
 13 | 			    Preamble
 14 | 
 15 |   The licenses for most software are designed to take away your
 16 | freedom to share and change it.  By contrast, the GNU General Public
 17 | Licenses are intended to guarantee your freedom to share and change
 18 | free software--to make sure the software is free for all its users.
 19 | 
 20 |   This license, the Lesser General Public License, applies to some
 21 | specially designated software packages--typically libraries--of the
 22 | Free Software Foundation and other authors who decide to use it.  You
 23 | can use it too, but we suggest you first think carefully about whether
 24 | this license or the ordinary General Public License is the better
 25 | strategy to use in any particular case, based on the explanations below.
 26 | 
 27 |   When we speak of free software, we are referring to freedom of use,
 28 | not price.  Our General Public Licenses are designed to make sure that
 29 | you have the freedom to distribute copies of free software (and charge
 30 | for this service if you wish); that you receive source code or can get
 31 | it if you want it; that you can change the software and use pieces of
 32 | it in new free programs; and that you are informed that you can do
 33 | these things.
 34 | 
 35 |   To protect your rights, we need to make restrictions that forbid
 36 | distributors to deny you these rights or to ask you to surrender these
 37 | rights.  These restrictions translate to certain responsibilities for
 38 | you if you distribute copies of the library or if you modify it.
 39 | 
 40 |   For example, if you distribute copies of the library, whether gratis
 41 | or for a fee, you must give the recipients all the rights that we gave
 42 | you.  You must make sure that they, too, receive or can get the source
 43 | code.  If you link other code with the library, you must provide
 44 | complete object files to the recipients, so that they can relink them
 45 | with the library after making changes to the library and recompiling
 46 | it.  And you must show them these terms so they know their rights.
 47 | 
 48 |   We protect your rights with a two-step method: (1) we copyright the
 49 | library, and (2) we offer you this license, which gives you legal
 50 | permission to copy, distribute and/or modify the library.
 51 | 
 52 |   To protect each distributor, we want to make it very clear that
 53 | there is no warranty for the free library.  Also, if the library is
 54 | modified by someone else and passed on, the recipients should know
 55 | that what they have is not the original version, so that the original
 56 | author's reputation will not be affected by problems that might be
 57 | introduced by others.
 58 | 
 59 |   Finally, software patents pose a constant threat to the existence of
 60 | any free program.  We wish to make sure that a company cannot
 61 | effectively restrict the users of a free program by obtaining a
 62 | restrictive license from a patent holder.  Therefore, we insist that
 63 | any patent license obtained for a version of the library must be
 64 | consistent with the full freedom of use specified in this license.
 65 | 
 66 |   Most GNU software, including some libraries, is covered by the
 67 | ordinary GNU General Public License.  This license, the GNU Lesser
 68 | General Public License, applies to certain designated libraries, and
 69 | is quite different from the ordinary General Public License.  We use
 70 | this license for certain libraries in order to permit linking those
 71 | libraries into non-free programs.
 72 | 
 73 |   When a program is linked with a library, whether statically or using
 74 | a shared library, the combination of the two is legally speaking a
 75 | combined work, a derivative of the original library.  The ordinary
 76 | General Public License therefore permits such linking only if the
 77 | entire combination fits its criteria of freedom.  The Lesser General
 78 | Public License permits more lax criteria for linking other code with
 79 | the library.
 80 | 
 81 |   We call this license the "Lesser" General Public License because it
 82 | does Less to protect the user's freedom than the ordinary General
 83 | Public License.  It also provides other free software developers Less
 84 | of an advantage over competing non-free programs.  These disadvantages
 85 | are the reason we use the ordinary General Public License for many
 86 | libraries.  However, the Lesser license provides advantages in certain
 87 | special circumstances.
 88 | 
 89 |   For example, on rare occasions, there may be a special need to
 90 | encourage the widest possible use of a certain library, so that it becomes
 91 | a de-facto standard.  To achieve this, non-free programs must be
 92 | allowed to use the library.  A more frequent case is that a free
 93 | library does the same job as widely used non-free libraries.  In this
 94 | case, there is little to gain by limiting the free library to free
 95 | software only, so we use the Lesser General Public License.
 96 | 
 97 |   In other cases, permission to use a particular library in non-free
 98 | programs enables a greater number of people to use a large body of
 99 | free software.  For example, permission to use the GNU C Library in
100 | non-free programs enables many more people to use the whole GNU
101 | operating system, as well as its variant, the GNU/Linux operating
102 | system.
103 | 
104 |   Although the Lesser General Public License is Less protective of the
105 | users' freedom, it does ensure that the user of a program that is
106 | linked with the Library has the freedom and the wherewithal to run
107 | that program using a modified version of the Library.
108 | 
109 |   The precise terms and conditions for copying, distribution and
110 | modification follow.  Pay close attention to the difference between a
111 | "work based on the library" and a "work that uses the library".  The
112 | former contains code derived from the library, whereas the latter must
113 | be combined with the library in order to run.
114 | 
115 | 		  GNU LESSER GENERAL PUBLIC LICENSE
116 |    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
117 | 
118 |   0. This License Agreement applies to any software library or other
119 | program which contains a notice placed by the copyright holder or
120 | other authorized party saying it may be distributed under the terms of
121 | this Lesser General Public License (also called "this License").
122 | Each licensee is addressed as "you".
123 | 
124 |   A "library" means a collection of software functions and/or data
125 | prepared so as to be conveniently linked with application programs
126 | (which use some of those functions and data) to form executables.
127 | 
128 |   The "Library", below, refers to any such software library or work
129 | which has been distributed under these terms.  A "work based on the
130 | Library" means either the Library or any derivative work under
131 | copyright law: that is to say, a work containing the Library or a
132 | portion of it, either verbatim or with modifications and/or translated
133 | straightforwardly into another language.  (Hereinafter, translation is
134 | included without limitation in the term "modification".)
135 | 
136 |   "Source code" for a work means the preferred form of the work for
137 | making modifications to it.  For a library, complete source code means
138 | all the source code for all modules it contains, plus any associated
139 | interface definition files, plus the scripts used to control compilation
140 | and installation of the library.
141 | 
142 |   Activities other than copying, distribution and modification are not
143 | covered by this License; they are outside its scope.  The act of
144 | running a program using the Library is not restricted, and output from
145 | such a program is covered only if its contents constitute a work based
146 | on the Library (independent of the use of the Library in a tool for
147 | writing it).  Whether that is true depends on what the Library does
148 | and what the program that uses the Library does.
149 | 
150 |   1. You may copy and distribute verbatim copies of the Library's
151 | complete source code as you receive it, in any medium, provided that
152 | you conspicuously and appropriately publish on each copy an
153 | appropriate copyright notice and disclaimer of warranty; keep intact
154 | all the notices that refer to this License and to the absence of any
155 | warranty; and distribute a copy of this License along with the
156 | Library.
157 | 
158 |   You may charge a fee for the physical act of transferring a copy,
159 | and you may at your option offer warranty protection in exchange for a
160 | fee.
161 | 
162 |   2. You may modify your copy or copies of the Library or any portion
163 | of it, thus forming a work based on the Library, and copy and
164 | distribute such modifications or work under the terms of Section 1
165 | above, provided that you also meet all of these conditions:
166 | 
167 |     a) The modified work must itself be a software library.
168 | 
169 |     b) You must cause the files modified to carry prominent notices
170 |     stating that you changed the files and the date of any change.
171 | 
172 |     c) You must cause the whole of the work to be licensed at no
173 |     charge to all third parties under the terms of this License.
174 | 
175 |     d) If a facility in the modified Library refers to a function or a
176 |     table of data to be supplied by an application program that uses
177 |     the facility, other than as an argument passed when the facility
178 |     is invoked, then you must make a good faith effort to ensure that,
179 |     in the event an application does not supply such function or
180 |     table, the facility still operates, and performs whatever part of
181 |     its purpose remains meaningful.
182 | 
183 |     (For example, a function in a library to compute square roots has
184 |     a purpose that is entirely well-defined independent of the
185 |     application.  Therefore, Subsection 2d requires that any
186 |     application-supplied function or table used by this function must
187 |     be optional: if the application does not supply it, the square
188 |     root function must still compute square roots.)
189 | 
190 | These requirements apply to the modified work as a whole.  If
191 | identifiable sections of that work are not derived from the Library,
192 | and can be reasonably considered independent and separate works in
193 | themselves, then this License, and its terms, do not apply to those
194 | sections when you distribute them as separate works.  But when you
195 | distribute the same sections as part of a whole which is a work based
196 | on the Library, the distribution of the whole must be on the terms of
197 | this License, whose permissions for other licensees extend to the
198 | entire whole, and thus to each and every part regardless of who wrote
199 | it.
200 | 
201 | Thus, it is not the intent of this section to claim rights or contest
202 | your rights to work written entirely by you; rather, the intent is to
203 | exercise the right to control the distribution of derivative or
204 | collective works based on the Library.
205 | 
206 | In addition, mere aggregation of another work not based on the Library
207 | with the Library (or with a work based on the Library) on a volume of
208 | a storage or distribution medium does not bring the other work under
209 | the scope of this License.
210 | 
211 |   3. You may opt to apply the terms of the ordinary GNU General Public
212 | License instead of this License to a given copy of the Library.  To do
213 | this, you must alter all the notices that refer to this License, so
214 | that they refer to the ordinary GNU General Public License, version 2,
215 | instead of to this License.  (If a newer version than version 2 of the
216 | ordinary GNU General Public License has appeared, then you can specify
217 | that version instead if you wish.)  Do not make any other change in
218 | these notices.
219 | 
220 |   Once this change is made in a given copy, it is irreversible for
221 | that copy, so the ordinary GNU General Public License applies to all
222 | subsequent copies and derivative works made from that copy.
223 | 
224 |   This option is useful when you wish to copy part of the code of
225 | the Library into a program that is not a library.
226 | 
227 |   4. You may copy and distribute the Library (or a portion or
228 | derivative of it, under Section 2) in object code or executable form
229 | under the terms of Sections 1 and 2 above provided that you accompany
230 | it with the complete corresponding machine-readable source code, which
231 | must be distributed under the terms of Sections 1 and 2 above on a
232 | medium customarily used for software interchange.
233 | 
234 |   If distribution of object code is made by offering access to copy
235 | from a designated place, then offering equivalent access to copy the
236 | source code from the same place satisfies the requirement to
237 | distribute the source code, even though third parties are not
238 | compelled to copy the source along with the object code.
239 | 
240 |   5. A program that contains no derivative of any portion of the
241 | Library, but is designed to work with the Library by being compiled or
242 | linked with it, is called a "work that uses the Library".  Such a
243 | work, in isolation, is not a derivative work of the Library, and
244 | therefore falls outside the scope of this License.
245 | 
246 |   However, linking a "work that uses the Library" with the Library
247 | creates an executable that is a derivative of the Library (because it
248 | contains portions of the Library), rather than a "work that uses the
249 | library".  The executable is therefore covered by this License.
250 | Section 6 states terms for distribution of such executables.
251 | 
252 |   When a "work that uses the Library" uses material from a header file
253 | that is part of the Library, the object code for the work may be a
254 | derivative work of the Library even though the source code is not.
255 | Whether this is true is especially significant if the work can be
256 | linked without the Library, or if the work is itself a library.  The
257 | threshold for this to be true is not precisely defined by law.
258 | 
259 |   If such an object file uses only numerical parameters, data
260 | structure layouts and accessors, and small macros and small inline
261 | functions (ten lines or less in length), then the use of the object
262 | file is unrestricted, regardless of whether it is legally a derivative
263 | work.  (Executables containing this object code plus portions of the
264 | Library will still fall under Section 6.)
265 | 
266 |   Otherwise, if the work is a derivative of the Library, you may
267 | distribute the object code for the work under the terms of Section 6.
268 | Any executables containing that work also fall under Section 6,
269 | whether or not they are linked directly with the Library itself.
270 | 
271 |   6. As an exception to the Sections above, you may also combine or
272 | link a "work that uses the Library" with the Library to produce a
273 | work containing portions of the Library, and distribute that work
274 | under terms of your choice, provided that the terms permit
275 | modification of the work for the customer's own use and reverse
276 | engineering for debugging such modifications.
277 | 
278 |   You must give prominent notice with each copy of the work that the
279 | Library is used in it and that the Library and its use are covered by
280 | this License.  You must supply a copy of this License.  If the work
281 | during execution displays copyright notices, you must include the
282 | copyright notice for the Library among them, as well as a reference
283 | directing the user to the copy of this License.  Also, you must do one
284 | of these things:
285 | 
286 |     a) Accompany the work with the complete corresponding
287 |     machine-readable source code for the Library including whatever
288 |     changes were used in the work (which must be distributed under
289 |     Sections 1 and 2 above); and, if the work is an executable linked
290 |     with the Library, with the complete machine-readable "work that
291 |     uses the Library", as object code and/or source code, so that the
292 |     user can modify the Library and then relink to produce a modified
293 |     executable containing the modified Library.  (It is understood
294 |     that the user who changes the contents of definitions files in the
295 |     Library will not necessarily be able to recompile the application
296 |     to use the modified definitions.)
297 | 
298 |     b) Use a suitable shared library mechanism for linking with the
299 |     Library.  A suitable mechanism is one that (1) uses at run time a
300 |     copy of the library already present on the user's computer system,
301 |     rather than copying library functions into the executable, and (2)
302 |     will operate properly with a modified version of the library, if
303 |     the user installs one, as long as the modified version is
304 |     interface-compatible with the version that the work was made with.
305 | 
306 |     c) Accompany the work with a written offer, valid for at
307 |     least three years, to give the same user the materials
308 |     specified in Subsection 6a, above, for a charge no more
309 |     than the cost of performing this distribution.
310 | 
311 |     d) If distribution of the work is made by offering access to copy
312 |     from a designated place, offer equivalent access to copy the above
313 |     specified materials from the same place.
314 | 
315 |     e) Verify that the user has already received a copy of these
316 |     materials or that you have already sent this user a copy.
317 | 
318 |   For an executable, the required form of the "work that uses the
319 | Library" must include any data and utility programs needed for
320 | reproducing the executable from it.  However, as a special exception,
321 | the materials to be distributed need not include anything that is
322 | normally distributed (in either source or binary form) with the major
323 | components (compiler, kernel, and so on) of the operating system on
324 | which the executable runs, unless that component itself accompanies
325 | the executable.
326 | 
327 |   It may happen that this requirement contradicts the license
328 | restrictions of other proprietary libraries that do not normally
329 | accompany the operating system.  Such a contradiction means you cannot
330 | use both them and the Library together in an executable that you
331 | distribute.
332 | 
333 |   7. You may place library facilities that are a work based on the
334 | Library side-by-side in a single library together with other library
335 | facilities not covered by this License, and distribute such a combined
336 | library, provided that the separate distribution of the work based on
337 | the Library and of the other library facilities is otherwise
338 | permitted, and provided that you do these two things:
339 | 
340 |     a) Accompany the combined library with a copy of the same work
341 |     based on the Library, uncombined with any other library
342 |     facilities.  This must be distributed under the terms of the
343 |     Sections above.
344 | 
345 |     b) Give prominent notice with the combined library of the fact
346 |     that part of it is a work based on the Library, and explaining
347 |     where to find the accompanying uncombined form of the same work.
348 | 
349 |   8. You may not copy, modify, sublicense, link with, or distribute
350 | the Library except as expressly provided under this License.  Any
351 | attempt otherwise to copy, modify, sublicense, link with, or
352 | distribute the Library is void, and will automatically terminate your
353 | rights under this License.  However, parties who have received copies,
354 | or rights, from you under this License will not have their licenses
355 | terminated so long as such parties remain in full compliance.
356 | 
357 |   9. You are not required to accept this License, since you have not
358 | signed it.  However, nothing else grants you permission to modify or
359 | distribute the Library or its derivative works.  These actions are
360 | prohibited by law if you do not accept this License.  Therefore, by
361 | modifying or distributing the Library (or any work based on the
362 | Library), you indicate your acceptance of this License to do so, and
363 | all its terms and conditions for copying, distributing or modifying
364 | the Library or works based on it.
365 | 
366 |   10. Each time you redistribute the Library (or any work based on the
367 | Library), the recipient automatically receives a license from the
368 | original licensor to copy, distribute, link with or modify the Library
369 | subject to these terms and conditions.  You may not impose any further
370 | restrictions on the recipients' exercise of the rights granted herein.
371 | You are not responsible for enforcing compliance by third parties with
372 | this License.
373 | 
374 |   11. If, as a consequence of a court judgment or allegation of patent
375 | infringement or for any other reason (not limited to patent issues),
376 | conditions are imposed on you (whether by court order, agreement or
377 | otherwise) that contradict the conditions of this License, they do not
378 | excuse you from the conditions of this License.  If you cannot
379 | distribute so as to satisfy simultaneously your obligations under this
380 | License and any other pertinent obligations, then as a consequence you
381 | may not distribute the Library at all.  For example, if a patent
382 | license would not permit royalty-free redistribution of the Library by
383 | all those who receive copies directly or indirectly through you, then
384 | the only way you could satisfy both it and this License would be to
385 | refrain entirely from distribution of the Library.
386 | 
387 | If any portion of this section is held invalid or unenforceable under any
388 | particular circumstance, the balance of the section is intended to apply,
389 | and the section as a whole is intended to apply in other circumstances.
390 | 
391 | It is not the purpose of this section to induce you to infringe any
392 | patents or other property right claims or to contest validity of any
393 | such claims; this section has the sole purpose of protecting the
394 | integrity of the free software distribution system which is
395 | implemented by public license practices.  Many people have made
396 | generous contributions to the wide range of software distributed
397 | through that system in reliance on consistent application of that
398 | system; it is up to the author/donor to decide if he or she is willing
399 | to distribute software through any other system and a licensee cannot
400 | impose that choice.
401 | 
402 | This section is intended to make thoroughly clear what is believed to
403 | be a consequence of the rest of this License.
404 | 
405 |   12. If the distribution and/or use of the Library is restricted in
406 | certain countries either by patents or by copyrighted interfaces, the
407 | original copyright holder who places the Library under this License may add
408 | an explicit geographical distribution limitation excluding those countries,
409 | so that distribution is permitted only in or among countries not thus
410 | excluded.  In such case, this License incorporates the limitation as if
411 | written in the body of this License.
412 | 
413 |   13. The Free Software Foundation may publish revised and/or new
414 | versions of the Lesser General Public License from time to time.
415 | Such new versions will be similar in spirit to the present version,
416 | but may differ in detail to address new problems or concerns.
417 | 
418 | Each version is given a distinguishing version number.  If the Library
419 | specifies a version number of this License which applies to it and
420 | "any later version", you have the option of following the terms and
421 | conditions either of that version or of any later version published by
422 | the Free Software Foundation.  If the Library does not specify a
423 | license version number, you may choose any version ever published by
424 | the Free Software Foundation.
425 | 
426 |   14. If you wish to incorporate parts of the Library into other free
427 | programs whose distribution conditions are incompatible with these,
428 | write to the author to ask for permission.  For software which is
429 | copyrighted by the Free Software Foundation, write to the Free
430 | Software Foundation; we sometimes make exceptions for this.  Our
431 | decision will be guided by the two goals of preserving the free status
432 | of all derivatives of our free software and of promoting the sharing
433 | and reuse of software generally.
434 | 
435 | 			    NO WARRANTY
436 | 
437 |   15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
438 | WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
439 | EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
440 | OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
441 | KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
442 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
443 | PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
444 | LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
445 | THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
446 | 
447 |   16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
448 | WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
449 | AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
450 | FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
451 | CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
452 | LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
453 | RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
454 | FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
455 | SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
456 | DAMAGES.
457 | 
458 | 		     END OF TERMS AND CONDITIONS
459 | 
460 |            How to Apply These Terms to Your New Libraries
461 | 
462 |   If you develop a new library, and you want it to be of the greatest
463 | possible use to the public, we recommend making it free software that
464 | everyone can redistribute and change.  You can do so by permitting
465 | redistribution under these terms (or, alternatively, under the terms of the
466 | ordinary General Public License).
467 | 
468 |   To apply these terms, attach the following notices to the library.  It is
469 | safest to attach them to the start of each source file to most effectively
470 | convey the exclusion of warranty; and each file should have at least the
471 | "copyright" line and a pointer to where the full notice is found.
472 | 
473 |     <one line to give the library's name and a brief idea of what it does.>
474 |     Copyright (C) <year>  <name of author>
475 | 
476 |     This library is free software; you can redistribute it and/or
477 |     modify it under the terms of the GNU Lesser General Public
478 |     License as published by the Free Software Foundation; either
479 |     version 2.1 of the License, or (at your option) any later version.
480 | 
481 |     This library is distributed in the hope that it will be useful,
482 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
483 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
484 |     Lesser General Public License for more details.
485 | 
486 |     You should have received a copy of the GNU Lesser General Public
487 |     License along with this library; if not, write to the Free Software
488 |     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
489 | 
490 | Also add information on how to contact you by electronic and paper mail.
491 | 
492 | You should also get your employer (if you work as a programmer) or your
493 | school, if any, to sign a "copyright disclaimer" for the library, if
494 | necessary.  Here is a sample; alter the names:
495 | 
496 |   Yoyodyne, Inc., hereby disclaims all copyright interest in the
497 |   library `Frob' (a library for tweaking knobs) written by James Random Hacker.
498 | 
499 |   <signature of Ty Coon>, 1 April 1990
500 |   Ty Coon, President of Vice
501 | 
502 | That's all there is to it!
503 | 
504 | 
505 | 


--------------------------------------------------------------------------------
/src/phputf8/README:
--------------------------------------------------------------------------------
 1 | ++PHP UTF-8++
 2 | 
 3 | Version 0.5
 4 | 
 5 | ++DOCUMENTATION++
 6 | 
 7 | Documentation in progress in ./docs dir
 8 | 
 9 | http://www.phpwact.org/php/i18n/charsets
10 | http://www.phpwact.org/php/i18n/utf-8
11 | 
12 | Important Note: DO NOT use these functions without understanding WHY
13 | you are using them. In particular, do not blindly replace all use of PHP's
14 | string functions with the functions found here - most of the time you will
15 | not need to, and you will be introducing a significant performance
16 | overhead to your application. You can get a good idea of when to use what
17 | from reading: http://www.phpwact.org/php/i18n/utf-8
18 | 
19 | Important Note: For sake of performance most of the functions here are
20 | not "defensive" (e.g. there is not extensive parameter checking, well
21 | formed UTF-8 is assumed). This is particularly relevant when it comes to
22 | catching badly formed UTF-8 - you should screen input on the "outer
23 | perimeter" with help from functions in the utf8_validation.php and
24 | utf8_bad.php files.
25 | 
26 | Important Note: this library treats ALL ASCII characters as valid, including ASCII control characters. But if you use some ASCII control characters in XML, it will render the XML ill-formed. Don't be a bozo: http://hsivonen.iki.fi/producing-xml/#controlchar
27 | 
28 | ++BUGS / SUPPORT / FEATURE REQUESTS ++
29 | 
30 | Please report bugs to:
31 | http://sourceforge.net/tracker/?group_id=142846&atid=753842
32 | - if you are able, please submit a failing unit test
33 | (http://www.lastcraft.com/simple_test.php) with your bug report.
34 | 
35 | For feature requests / faster implementation of functions found here,
36 | please drop them in via the RFE tracker: http://sourceforge.net/tracker/?group_id=142846&atid=753845
37 | Particularly interested in faster implementations!
38 | 
39 | For general support / help, use:
40 | http://sourceforge.net/tracker/?group_id=142846&atid=753843
41 | 
42 | In the VERY WORST case, you can email me: hfuecks gmail com - I tend to be slow to respond though so be warned.
43 | 
44 | Important Note: when reporting bugs, please provide the following
45 | information;
46 | 
47 | PHP version, whether the iconv extension is loaded (in PHP5 it's
48 | there by default), whether the mbstring extension is loaded. The
49 | following PHP script can be used to determine this information;
50 | 
51 | <?php
52 | print "PHP Version: " .phpversion()."<br>";
53 | if ( extension_loaded('mbstring') ) {
54 |     print "mbstring available<br>";
55 | } else {
56 |     print "mbstring not available<br>";
57 | }
58 | if ( extension_loaded('iconv') ) {
59 |     print "iconv available<br>";
60 | } else {
61 |     print "iconv not available<br>";
62 | }
63 | ?>
64 | 
65 | ++LICENSING++
66 | 
67 | Parts of the code in this library come from other places, under different
68 | licenses.
69 | The authors involved have been contacted (see below). Attribution for
70 | which code came from elsewhere can be found in the source code itself.
71 | 
72 | +Andreas Gohr / Chris Smith - Dokuwiki
73 | There is a fair degree of collaboration / exchange of ideas and code
74 | between Dokuwiki's UTF-8 library;
75 | http://dev.splitbrain.org/view/darcs/dokuwiki/inc/utf8.php
76 | and phputf8. Although Dokuwiki is released under GPL, its UTF-8
77 | library is released under LGPL, hence no conflict with phputf8
78 | 
79 | +Henri Sivonen (http://hsivonen.iki.fi/php-utf8/ /
80 | http://hsivonen.iki.fi/php-utf8/) has also given permission for his
81 | code to be released under the terms of the LGPL. He ported a Unicode / UTF-8
82 | converter from the Mozilla codebase to PHP, which is re-used in phputf8
83 | 


--------------------------------------------------------------------------------
/src/phputf8/mbstring/core.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | /**
  4 | * @package utf8
  5 | */
  6 | 
  7 | /**
  8 | * Define the UTF8_CORE constant as TRUE unless it already exists.
  9 | * The defined() check comes first, so an existing definition is
 10 | * never redefined.
 11 | */
 12 | defined('UTF8_CORE') || define('UTF8_CORE', true);
 13 | 
 14 | //--------------------------------------------------------------------
 15 | /**
 16 | * Wrapper round mb_strlen
 17 | * UTF-8 is passed to mb_strlen explicitly, so the result does not
 18 | * depend on the current mb_internal_encoding setting
 19 | * Note: how malformed byte sequences are counted is left to mbstring
 20 | * @param string UTF-8 string
 21 | * @return int number of UTF-8 characters in string
 22 | * @package utf8
 23 | */
 24 | function utf8_strlen($str)
 25 | {
 26 |     return mb_strlen($str, 'UTF-8');
 27 | }
 28 | 
 29 | 
 30 | //--------------------------------------------------------------------
 31 | /**
 32 | * Wrapper around mb_strpos
 33 | * Find position of first occurrence of a string
 34 | * UTF-8 is passed to mb_strpos explicitly, so the current
 35 | * mb_internal_encoding setting does not affect the result
 36 | * @param string haystack
 37 | * @param string needle (you should validate this with utf8_is_valid)
 38 | * @param integer offset in characters (from left); FALSE means no offset
 39 | * @return mixed integer position or FALSE on failure
 40 | * @package utf8
 41 | */
 42 | function utf8_strpos($str, $search, $offset = false)
 43 | {
 44 |     // FALSE is the "no offset" sentinel; mb_strpos expects 0 for that
 45 |     $start = ($offset === false) ? 0 : $offset;
 46 | 
 47 |     return mb_strpos($str, $search, $start, 'UTF-8');
 48 | }
 49 | 
 50 | //--------------------------------------------------------------------
 51 | /**
 52 | * Wrapper around mb_strrpos (UTF-8 is passed to mbstring explicitly)
 53 | * Find position of last occurrence of a char in a string
 54 | * @param string haystack
 55 | * @param string needle (you should validate this with utf8_is_valid)
 56 | * @param integer (optional) offset (from left)
 57 | * @return mixed integer position or FALSE on failure
 58 | * @package utf8
 59 | */
 60 | function utf8_strrpos($str, $search, $offset = false)
 61 | {
 62 |     if ($offset === false) {
 63 |         // Emulate behaviour of strrpos rather than raising a warning.
 64 |         // Compare against '' instead of using empty(): empty('0') is
 65 |         // TRUE and would wrongly report "not found" for the haystack "0"
 66 |         if ((string) $str === '') {
 67 |             return false;
 68 |         }
 69 |         return mb_strrpos($str, $search, 0, 'UTF-8');
 70 |     }
 71 | 
 72 |     if (!is_int($offset)) {
 73 |         trigger_error('utf8_strrpos expects parameter 3 to be long', E_USER_WARNING);
 74 |         return false;
 75 |     }
 76 | 
 77 |     // Search only the part after $offset, then shift the match
 78 |     // position back into the coordinates of the full string
 79 |     $str = mb_substr($str, $offset, null, 'UTF-8');
 80 |     $pos = mb_strrpos($str, $search, 0, 'UTF-8');
 81 | 
 82 |     return ($pos === false) ? false : $pos + $offset;
 83 | }
 84 | 
 85 | //--------------------------------------------------------------------
 86 | /**
 87 | * Wrapper around mb_substr
 88 | * Return part of a string given character offset (and optionally length)
 89 | * UTF-8 is passed to mb_substr explicitly, so the current
 90 | * mb_internal_encoding setting does not affect the result
 91 | * @param string
 92 | * @param integer number of UTF-8 characters offset (from left)
 93 | * @param integer (optional) length in UTF-8 characters from offset
 94 | * @return string the requested part of the string
 95 | * @package utf8
 96 | */
 97 | function utf8_substr($str, $offset, $length = false)
 98 | {
 99 |     // FALSE is the "no length" sentinel; NULL tells mb_substr to
100 |     // return everything up to the end of the string
101 |     $count = ($length === false) ? null : $length;
102 |     return mb_substr($str, (int) $offset, $count, 'UTF-8');
103 | }
104 | 
105 | //--------------------------------------------------------------------
106 | /**
107 | * Wrapper around mb_strtolower
108 | * Make a string lowercase
109 | * UTF-8 is passed explicitly; mb_internal_encoding is not consulted
110 | * Note: The concept of a character's "case" only exists in some alphabets
111 | * such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
112 | * not exist in the Chinese alphabet, for example. See Unicode Standard
113 | * Annex #21: Case Mappings
114 | * @param string
115 | * @return string the string in lowercase
116 | * @package utf8
117 | */
118 | function utf8_strtolower($str)
119 | {
120 |     return mb_strtolower($str, 'UTF-8');
121 | }
122 | 
123 | //--------------------------------------------------------------------
124 | /**
125 | * Wrapper around mb_strtoupper
126 | * Make a string uppercase
127 | * UTF-8 is passed explicitly; mb_internal_encoding is not consulted
128 | * Note: The concept of a character's "case" only exists in some alphabets
129 | * such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
130 | * not exist in the Chinese alphabet, for example. See Unicode Standard
131 | * Annex #21: Case Mappings
132 | * @param string
133 | * @return string the string in uppercase
134 | * @package utf8
135 | */
136 | function utf8_strtoupper($str)
137 | {
138 |     return mb_strtoupper($str, 'UTF-8');
139 | }
140 | 


--------------------------------------------------------------------------------
/src/phputf8/native/core.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | /**
  4 | * @package utf8
  5 | */
  6 | 
  7 | /**
  8 | * Define the UTF8_CORE constant as TRUE unless it already exists.
  9 | * The defined() check comes first, so an existing definition is
 10 | * never redefined.
 11 | */
 12 | defined('UTF8_CORE') || define('UTF8_CORE', true);
 13 | 
 14 | //--------------------------------------------------------------------
 15 | /**
 16 | * Unicode aware replacement for strlen(). Returns the number
 17 | * of characters in the string (not the number of bytes).
 18 | * utf8_decode() maps every multibyte character to a single byte
 19 | * (characters outside ISO-8859-1 become '?'), which is good enough
 20 | * for counting - and much faster than iconv_strlen
 21 | * Note: this function does not count bad UTF-8 bytes in the string
 22 | * - these are simply ignored
 23 | * @author <chernyshevsky at hotmail dot com>
 24 | * @link   http://www.php.net/manual/en/function.strlen.php
 25 | * @link   http://www.php.net/manual/en/function.utf8-decode.php
 26 | * @param string UTF-8 string
 27 | * @return int number of UTF-8 characters in string
 28 | * @package utf8
 29 | */
 30 | function utf8_strlen($str)
 31 | {
 32 |     $singleByte = utf8_decode($str);
 33 |     return strlen($singleByte);
 34 | }
 35 | 
 36 | 
 37 | //--------------------------------------------------------------------
 38 | /**
 39 | * UTF-8 aware alternative to strpos
 40 | * Find position of first occurrence of a string
 41 | * Note: This will get a lot slower if offset is used
 42 | * Note: requires utf8_strlen and utf8_substr to be loaded
 43 | * Note: a non-integer offset raises E_USER_WARNING and returns FALSE
 44 | * @param string haystack
 45 | * @param string needle (you should validate this with utf8_is_valid)
 46 | * @param integer offset in characters (from left)
 47 | * @return mixed integer position or FALSE on failure
 48 | * @see http://www.php.net/strpos
 49 | * @see utf8_strlen
 50 | * @see utf8_substr
 51 | * @package utf8
 52 | */
 53 | function utf8_strpos($str, $needle, $offset = null)
 54 | {
 55 |     if (is_null($offset)) {
 56 |         // the number of characters in front of the first match
 57 |         // is the position of that match
 58 |         $ar = explode($needle, $str, 2);
 59 |         return (count($ar) > 1) ? utf8_strlen($ar[0]) : false;
 60 |     }
 61 | 
 62 |     if (!is_int($offset)) {
 63 |         // E_USER_WARNING (as in utf8_strrpos): by default E_USER_ERROR
 64 |         // stops the script before the documented FALSE can be returned
 65 |         trigger_error('utf8_strpos: Offset must be an integer', E_USER_WARNING);
 66 |         return false;
 67 |     }
 68 | 
 69 |     // search the tail only, then shift the result back by the offset
 70 |     $str = utf8_substr($str, $offset);
 71 |     $pos = utf8_strpos($str, $needle);
 72 | 
 73 |     return ($pos === false) ? false : $pos + $offset;
 74 | }
 75 | 
 76 | //--------------------------------------------------------------------
 77 | /**
 78 | * UTF-8 aware alternative to strrpos
 79 | * Find position of last occurrence of a char in a string
 80 | * Note: This will get a lot slower if offset is used
 81 | * Note: requires utf8_substr and utf8_strlen to be loaded
 82 | * Note: a non-integer offset raises E_USER_WARNING and returns FALSE
 83 | * @param string haystack
 84 | * @param string needle (you should validate this with utf8_is_valid)
 85 | * @param integer (optional) offset (from left)
 86 | * @return mixed integer position or FALSE on failure
 87 | * @see http://www.php.net/strrpos
 88 | * @see utf8_substr
 89 | * @see utf8_strlen
 90 | * @package utf8
 91 | */
 92 | function utf8_strrpos($str, $needle, $offset = null)
 93 | {
 94 |     if ($offset !== null) {
 95 |         if (is_int($offset) === false) {
 96 |             trigger_error('utf8_strrpos expects parameter 3 to be long', E_USER_WARNING);
 97 |             return false;
 98 |         }
 99 | 
100 |         // Look at the tail only, then shift the hit back by the offset
101 |         $tail  = utf8_substr($str, $offset);
102 |         $found = utf8_strrpos($tail, $needle);
103 | 
104 |         return ($found === false) ? false : $found + $offset;
105 |     }
106 | 
107 |     $pieces = explode($needle, $str);
108 |     if (count($pieces) <= 1) {
109 |         return false;
110 |     }
111 | 
112 |     // Drop the piece after the last match; what remains, glued back
113 |     // together, is exactly the text in front of that match
114 |     array_pop($pieces);
115 | 
116 |     return utf8_strlen(implode($needle, $pieces));
117 | }
118 | 
119 | //--------------------------------------------------------------------
120 | /**
121 | * UTF-8 aware alternative to substr
122 | * Return part of a string given character offset (and optionally length)
123 | *
124 | * Note arguments: compared to substr - if offset or length are
125 | * not integers, this version will not complain but rather massages them
126 | * into an integer.
127 | *
128 | * Note on returned values: substr documentation states false can be
129 | * returned in some cases (e.g. offset > string length)
130 | * mb_substr never returns false, it will return an empty string instead.
131 | * This adopts the mb_substr approach
132 | *
133 | * Note on implementation: PCRE only supports repetitions of less than
134 | * 65536, in order to accept up to MAXINT values for offset and length,
135 | * we'll repeat a group of 65535 characters when needed.
136 | *
137 | * Note on implementation: calculating the number of characters in the
138 | * string is a relatively expensive operation, so we only carry it out when
139 | * necessary. It isn't necessary for +ve offsets and no specified length
140 | *
141 | * @author Chris Smith<chris@jalakai.co.uk>
142 | * @param string
143 | * @param integer number of UTF-8 characters offset (from left)
144 | * @param integer (optional) length in UTF-8 characters from offset
145 | * @return string the requested part, or an empty string if nothing matches
146 | * @package utf8
147 | */
148 | function utf8_substr($str, $offset, $length = null)
149 | {
150 |     // generates E_NOTICE
151 |     // for PHP4 objects, but not PHP5 objects
152 |     $str    = (string)$str;
153 |     $offset = (int)$offset;
154 |     if (!is_null($length)) {
155 |         $length = (int)$length;
156 |     }
157 | 
158 |     // handle trivial cases
159 |     if ($length === 0) {
160 |         return '';
161 |     }
162 |     if ($offset < 0 && $length < 0 && $length < $offset) {
163 |         return '';
164 |     }
165 | 
166 |     // normalise negative offsets (we could use a tail
167 |     // anchored pattern, but they are horribly slow!)
168 |     if ($offset < 0) {
169 |         // character count is needed here (expensive - see notes above)
170 |         $strlen = strlen(utf8_decode($str));
171 |         $offset = $strlen + $offset;
172 |         if ($offset < 0) {
173 |             $offset = 0;
174 |         }
175 |     }
176 | 
177 |     $Op = '';
178 |     $Lp = '';
179 | 
180 |     // establish a pattern for offset, a
181 |     // non-captured group equal in length to offset
182 |     if ($offset > 0) {
183 |         $Ox = (int)($offset / 65535);
184 |         $Oy = $offset % 65535;
185 | 
186 |         if ($Ox) {
187 |             $Op = '(?:.{65535}){' . $Ox . '}';
188 |         }
189 | 
190 |         $Op = '^(?:' . $Op . '.{' . $Oy . '})';
191 |     } else {
192 |         // offset == 0; just anchor the pattern
193 |         $Op = '^';
194 |     }
195 | 
196 |     // establish a pattern for length
197 |     if (is_null($length)) {
198 |         // the rest of the string
199 |         $Lp = '(.*)$';
200 |     } else {
201 |         if (!isset($strlen)) {
202 |             // character count not computed yet (see notes above)
203 |             $strlen = strlen(utf8_decode($str));
204 |         }
205 | 
206 |         // another trivial case
207 |         if ($offset > $strlen) {
208 |             return '';
209 |         }
210 | 
211 |         if ($length > 0) {
212 |             // reduce any length that would
213 |             // go past the end of the string
214 |             $length = min($strlen - $offset, $length);
215 | 
216 |             $Lx = (int)($length / 65535);
217 |             $Ly = $length % 65535;
218 | 
219 |             // positive length requires a captured group
220 |             // of length characters
221 |             if ($Lx) {
222 |                 $Lp = '(?:.{65535}){' . $Lx . '}';
223 |             }
224 |             $Lp = '(' . $Lp . '.{' . $Ly . '})';
225 |         } elseif ($length < 0) {
226 |             if ($length < ($offset - $strlen)) {
227 |                 return '';
228 |             }
229 | 
230 |             $Lx = (int)((-$length) / 65535);
231 |             $Ly = (-$length) % 65535;
232 | 
233 |             // negative length requires ... capture everything
234 |             // except a group of -length characters
235 |             // anchored at the tail-end of the string
236 |             if ($Lx) {
237 |                 $Lp = '(?:.{65535}){' . $Lx . '}';
238 |             }
239 |             $Lp = '(.*)(?:' . $Lp . '.{' . $Ly . '})$';
240 |         }
241 |     }
242 |     // 'u' = treat pattern and subject as UTF-8, 's' = dot also matches newlines
243 |     if (!preg_match('#' . $Op . $Lp . '#us', $str, $match)) {
244 |         return '';
245 |     }
246 | 
247 |     return $match[1];
248 | }
249 | 
250 | //---------------------------------------------------------------
251 | /**
252 | * UTF-8 aware alternative to strtolower
253 | * Make a string lowercase
254 | * Note: The concept of a character's "case" only exists in some alphabets
255 | * such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
256 | * not exist in the Chinese alphabet, for example. See Unicode Standard
257 | * Annex #21: Case Mappings
258 | * Note: requires utf8_to_unicode and utf8_from_unicode (defined elsewhere)
259 | * @author Andreas Gohr <andi@splitbrain.org>
260 | * @param string
261 | * @return mixed lowercased string ('' for empty input) or FALSE if UTF-8 invalid
262 | * @see http://www.php.net/strtolower
263 | * @see utf8_to_unicode
264 | * @see utf8_from_unicode
265 | * @see http://www.unicode.org/reports/tr21/tr21-5.html
266 | * @see http://dev.splitbrain.org/view/darcs/dokuwiki/inc/utf8.php
267 | * @package utf8
268 | */
269 | function utf8_strtolower($string)
270 | {
271 |     static $UTF8_UPPER_TO_LOWER = null;
272 | 
273 |     if (is_null($UTF8_UPPER_TO_LOWER)) {
274 |         $UTF8_UPPER_TO_LOWER = [
275 |         0x0041 => 0x0061, 0x03A6 => 0x03C6, 0x0162 => 0x0163, 0x00C5 => 0x00E5, 0x0042 => 0x0062,
276 |         0x0139 => 0x013A, 0x00C1 => 0x00E1, 0x0141 => 0x0142, 0x038E => 0x03CD, 0x0100 => 0x0101,
277 |         0x0490 => 0x0491, 0x0394 => 0x03B4, 0x015A => 0x015B, 0x0044 => 0x0064, 0x0393 => 0x03B3,
278 |         0x00D4 => 0x00F4, 0x042A => 0x044A, 0x0419 => 0x0439, 0x0112 => 0x0113, 0x041C => 0x043C,
279 |         0x015E => 0x015F, 0x0143 => 0x0144, 0x00CE => 0x00EE, 0x040E => 0x045E, 0x042F => 0x044F,
280 |         0x039A => 0x03BA, 0x0154 => 0x0155, 0x0049 => 0x0069, 0x0053 => 0x0073, 0x1E1E => 0x1E1F,
281 |         0x0134 => 0x0135, 0x0427 => 0x0447, 0x03A0 => 0x03C0, 0x0418 => 0x0438, 0x00D3 => 0x00F3,
282 |         0x0420 => 0x0440, 0x0404 => 0x0454, 0x0415 => 0x0435, 0x0429 => 0x0449, 0x014A => 0x014B,
283 |         0x0411 => 0x0431, 0x0409 => 0x0459, 0x1E02 => 0x1E03, 0x00D6 => 0x00F6, 0x00D9 => 0x00F9,
284 |         0x004E => 0x006E, 0x0401 => 0x0451, 0x03A4 => 0x03C4, 0x0423 => 0x0443, 0x015C => 0x015D,
285 |         0x0403 => 0x0453, 0x03A8 => 0x03C8, 0x0158 => 0x0159, 0x0047 => 0x0067, 0x00C4 => 0x00E4,
286 |         0x0386 => 0x03AC, 0x0389 => 0x03AE, 0x0166 => 0x0167, 0x039E => 0x03BE, 0x0164 => 0x0165,
287 |         0x0116 => 0x0117, 0x0108 => 0x0109, 0x0056 => 0x0076, 0x00DE => 0x00FE, 0x0156 => 0x0157,
288 |         0x00DA => 0x00FA, 0x1E60 => 0x1E61, 0x1E82 => 0x1E83, 0x00C2 => 0x00E2, 0x0118 => 0x0119,
289 |         0x0145 => 0x0146, 0x0050 => 0x0070, 0x0150 => 0x0151, 0x042E => 0x044E, 0x0128 => 0x0129,
290 |         0x03A7 => 0x03C7, 0x013D => 0x013E, 0x0422 => 0x0442, 0x005A => 0x007A, 0x0428 => 0x0448,
291 |         0x03A1 => 0x03C1, 0x1E80 => 0x1E81, 0x016C => 0x016D, 0x00D5 => 0x00F5, 0x0055 => 0x0075,
292 |         0x0176 => 0x0177, 0x00DC => 0x00FC, 0x1E56 => 0x1E57, 0x03A3 => 0x03C3, 0x041A => 0x043A,
293 |         0x004D => 0x006D, 0x016A => 0x016B, 0x0170 => 0x0171, 0x0424 => 0x0444, 0x00CC => 0x00EC,
294 |         0x0168 => 0x0169, 0x039F => 0x03BF, 0x004B => 0x006B, 0x00D2 => 0x00F2, 0x00C0 => 0x00E0,
295 |         0x0414 => 0x0434, 0x03A9 => 0x03C9, 0x1E6A => 0x1E6B, 0x00C3 => 0x00E3, 0x042D => 0x044D,
296 |         0x0416 => 0x0436, 0x01A0 => 0x01A1, 0x010C => 0x010D, 0x011C => 0x011D, 0x00D0 => 0x00F0,
297 |         0x013B => 0x013C, 0x040F => 0x045F, 0x040A => 0x045A, 0x00C8 => 0x00E8, 0x03A5 => 0x03C5,
298 |         0x0046 => 0x0066, 0x00DD => 0x00FD, 0x0043 => 0x0063, 0x021A => 0x021B, 0x00CA => 0x00EA,
299 |         0x0399 => 0x03B9, 0x0179 => 0x017A, 0x00CF => 0x00EF, 0x01AF => 0x01B0, 0x0045 => 0x0065,
300 |         0x039B => 0x03BB, 0x0398 => 0x03B8, 0x039C => 0x03BC, 0x040C => 0x045C, 0x041F => 0x043F,
301 |         0x042C => 0x044C, 0x00DE => 0x00FE, 0x00D0 => 0x00F0, 0x1EF2 => 0x1EF3, 0x0048 => 0x0068,
302 |         0x00CB => 0x00EB, 0x0110 => 0x0111, 0x0413 => 0x0433, 0x012E => 0x012F, 0x00C6 => 0x00E6,
303 |         0x0058 => 0x0078, 0x0160 => 0x0161, 0x016E => 0x016F, 0x0391 => 0x03B1, 0x0407 => 0x0457,
304 |         0x0172 => 0x0173, 0x0178 => 0x00FF, 0x004F => 0x006F, 0x041B => 0x043B, 0x0395 => 0x03B5,
305 |         0x0425 => 0x0445, 0x0120 => 0x0121, 0x017D => 0x017E, 0x017B => 0x017C, 0x0396 => 0x03B6,
306 |         0x0392 => 0x03B2, 0x0388 => 0x03AD, 0x1E84 => 0x1E85, 0x0174 => 0x0175, 0x0051 => 0x0071,
307 |         0x0417 => 0x0437, 0x1E0A => 0x1E0B, 0x0147 => 0x0148, 0x0104 => 0x0105, 0x0408 => 0x0458,
308 |         0x014C => 0x014D, 0x00CD => 0x00ED, 0x0059 => 0x0079, 0x010A => 0x010B, 0x038F => 0x03CE,
309 |         0x0052 => 0x0072, 0x0410 => 0x0430, 0x0405 => 0x0455, 0x0402 => 0x0452, 0x0126 => 0x0127,
310 |         0x0136 => 0x0137, 0x012A => 0x012B, 0x038A => 0x03AF, 0x042B => 0x044B, 0x004C => 0x006C,
311 |         0x0397 => 0x03B7, 0x0124 => 0x0125, 0x0218 => 0x0219, 0x00DB => 0x00FB, 0x011E => 0x011F,
312 |         0x041E => 0x043E, 0x1E40 => 0x1E41, 0x039D => 0x03BD, 0x0106 => 0x0107, 0x03AB => 0x03CB,
313 |         0x0426 => 0x0446, 0x00DE => 0x00FE, 0x00C7 => 0x00E7, 0x03AA => 0x03CA, 0x0421 => 0x0441,
314 |         0x0412 => 0x0432, 0x010E => 0x010F, 0x00D8 => 0x00F8, 0x0057 => 0x0077, 0x011A => 0x011B,
315 |         0x0054 => 0x0074, 0x004A => 0x006A, 0x040B => 0x045B, 0x0406 => 0x0456, 0x0102 => 0x0103,
316 |         0x039B => 0x03BB, 0x00D1 => 0x00F1, 0x041D => 0x043D, 0x038C => 0x03CC, 0x00C9 => 0x00E9,
317 |         0x00D0 => 0x00F0, 0x0407 => 0x0457, 0x0122 => 0x0123,
318 |             ];
319 |     }
320 | 
321 |     // [] (empty input) is valid; only a non-array result signals bad UTF-8
322 |     $uni = utf8_to_unicode($string);
323 |     if (!is_array($uni)) {
324 |         return false;
325 |     }
326 | 
327 |     $cnt = count($uni);
328 |     for ($i = 0; $i < $cnt; $i++) {
329 |         if (isset($UTF8_UPPER_TO_LOWER[$uni[$i]])) {
330 |             $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]];
331 |         }
332 |     }
333 | 
334 |     return utf8_from_unicode($uni);
335 | }
336 | 
337 | //---------------------------------------------------------------
338 | /**
339 | * UTF-8 aware alternative to strtoupper
340 | * Make a string uppercase
341 | * Note: The concept of a character's "case" only exists in some alphabets
342 | * such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
343 | * not exist in the Chinese alphabet, for example. See Unicode Standard
344 | * Annex #21: Case Mappings
345 | * Note: requires utf8_to_unicode and utf8_from_unicode
346 | * @author Andreas Gohr <andi@splitbrain.org>
347 | * @param string
348 | * @return mixed either string in uppercase or FALSE if UTF-8 invalid
349 | * @see http://www.php.net/strtoupper
350 | * @see utf8_to_unicode
351 | * @see utf8_from_unicode
352 | * @see http://www.unicode.org/reports/tr21/tr21-5.html
353 | * @see http://dev.splitbrain.org/view/darcs/dokuwiki/inc/utf8.php
354 | * @package utf8
355 | */
356 | function utf8_strtoupper($string)
357 | {
358 |     static $UTF8_LOWER_TO_UPPER = null;
359 | 
360 |     if (is_null($UTF8_LOWER_TO_UPPER)) {
361 |         $UTF8_LOWER_TO_UPPER = [
362 |         0x0061 => 0x0041, 0x03C6 => 0x03A6, 0x0163 => 0x0162, 0x00E5 => 0x00C5, 0x0062 => 0x0042,
363 |         0x013A => 0x0139, 0x00E1 => 0x00C1, 0x0142 => 0x0141, 0x03CD => 0x038E, 0x0101 => 0x0100,
364 |         0x0491 => 0x0490, 0x03B4 => 0x0394, 0x015B => 0x015A, 0x0064 => 0x0044, 0x03B3 => 0x0393,
365 |         0x00F4 => 0x00D4, 0x044A => 0x042A, 0x0439 => 0x0419, 0x0113 => 0x0112, 0x043C => 0x041C,
366 |         0x015F => 0x015E, 0x0144 => 0x0143, 0x00EE => 0x00CE, 0x045E => 0x040E, 0x044F => 0x042F,
367 |         0x03BA => 0x039A, 0x0155 => 0x0154, 0x0069 => 0x0049, 0x0073 => 0x0053, 0x1E1F => 0x1E1E,
368 |         0x0135 => 0x0134, 0x0447 => 0x0427, 0x03C0 => 0x03A0, 0x0438 => 0x0418, 0x00F3 => 0x00D3,
369 |         0x0440 => 0x0420, 0x0454 => 0x0404, 0x0435 => 0x0415, 0x0449 => 0x0429, 0x014B => 0x014A,
370 |         0x0431 => 0x0411, 0x0459 => 0x0409, 0x1E03 => 0x1E02, 0x00F6 => 0x00D6, 0x00F9 => 0x00D9,
371 |         0x006E => 0x004E, 0x0451 => 0x0401, 0x03C4 => 0x03A4, 0x0443 => 0x0423, 0x015D => 0x015C,
372 |         0x0453 => 0x0403, 0x03C8 => 0x03A8, 0x0159 => 0x0158, 0x0067 => 0x0047, 0x00E4 => 0x00C4,
373 |         0x03AC => 0x0386, 0x03AE => 0x0389, 0x0167 => 0x0166, 0x03BE => 0x039E, 0x0165 => 0x0164,
374 |         0x0117 => 0x0116, 0x0109 => 0x0108, 0x0076 => 0x0056, 0x00FE => 0x00DE, 0x0157 => 0x0156,
375 |         0x00FA => 0x00DA, 0x1E61 => 0x1E60, 0x1E83 => 0x1E82, 0x00E2 => 0x00C2, 0x0119 => 0x0118,
376 |         0x0146 => 0x0145, 0x0070 => 0x0050, 0x0151 => 0x0150, 0x044E => 0x042E, 0x0129 => 0x0128,
377 |         0x03C7 => 0x03A7, 0x013E => 0x013D, 0x0442 => 0x0422, 0x007A => 0x005A, 0x0448 => 0x0428,
378 |         0x03C1 => 0x03A1, 0x1E81 => 0x1E80, 0x016D => 0x016C, 0x00F5 => 0x00D5, 0x0075 => 0x0055,
379 |         0x0177 => 0x0176, 0x00FC => 0x00DC, 0x1E57 => 0x1E56, 0x03C3 => 0x03A3, 0x043A => 0x041A,
380 |         0x006D => 0x004D, 0x016B => 0x016A, 0x0171 => 0x0170, 0x0444 => 0x0424, 0x00EC => 0x00CC,
381 |         0x0169 => 0x0168, 0x03BF => 0x039F, 0x006B => 0x004B, 0x00F2 => 0x00D2, 0x00E0 => 0x00C0,
382 |         0x0434 => 0x0414, 0x03C9 => 0x03A9, 0x1E6B => 0x1E6A, 0x00E3 => 0x00C3, 0x044D => 0x042D,
383 |         0x0436 => 0x0416, 0x01A1 => 0x01A0, 0x010D => 0x010C, 0x011D => 0x011C, 0x00F0 => 0x00D0,
384 |         0x013C => 0x013B, 0x045F => 0x040F, 0x045A => 0x040A, 0x00E8 => 0x00C8, 0x03C5 => 0x03A5,
385 |         0x0066 => 0x0046, 0x00FD => 0x00DD, 0x0063 => 0x0043, 0x021B => 0x021A, 0x00EA => 0x00CA,
386 |         0x03B9 => 0x0399, 0x017A => 0x0179, 0x00EF => 0x00CF, 0x01B0 => 0x01AF, 0x0065 => 0x0045,
387 |         0x03BB => 0x039B, 0x03B8 => 0x0398, 0x03BC => 0x039C, 0x045C => 0x040C, 0x043F => 0x041F,
388 |         0x044C => 0x042C, 0x00FE => 0x00DE, 0x00F0 => 0x00D0, 0x1EF3 => 0x1EF2, 0x0068 => 0x0048,
389 |         0x00EB => 0x00CB, 0x0111 => 0x0110, 0x0433 => 0x0413, 0x012F => 0x012E, 0x00E6 => 0x00C6,
390 |         0x0078 => 0x0058, 0x0161 => 0x0160, 0x016F => 0x016E, 0x03B1 => 0x0391, 0x0457 => 0x0407,
391 |         0x0173 => 0x0172, 0x00FF => 0x0178, 0x006F => 0x004F, 0x043B => 0x041B, 0x03B5 => 0x0395,
392 |         0x0445 => 0x0425, 0x0121 => 0x0120, 0x017E => 0x017D, 0x017C => 0x017B, 0x03B6 => 0x0396,
393 |         0x03B2 => 0x0392, 0x03AD => 0x0388, 0x1E85 => 0x1E84, 0x0175 => 0x0174, 0x0071 => 0x0051,
394 |         0x0437 => 0x0417, 0x1E0B => 0x1E0A, 0x0148 => 0x0147, 0x0105 => 0x0104, 0x0458 => 0x0408,
395 |         0x014D => 0x014C, 0x00ED => 0x00CD, 0x0079 => 0x0059, 0x010B => 0x010A, 0x03CE => 0x038F,
396 |         0x0072 => 0x0052, 0x0430 => 0x0410, 0x0455 => 0x0405, 0x0452 => 0x0402, 0x0127 => 0x0126,
397 |         0x0137 => 0x0136, 0x012B => 0x012A, 0x03AF => 0x038A, 0x044B => 0x042B, 0x006C => 0x004C,
398 |         0x03B7 => 0x0397, 0x0125 => 0x0124, 0x0219 => 0x0218, 0x00FB => 0x00DB, 0x011F => 0x011E,
399 |         0x043E => 0x041E, 0x1E41 => 0x1E40, 0x03BD => 0x039D, 0x0107 => 0x0106, 0x03CB => 0x03AB,
400 |         0x0446 => 0x0426, 0x00FE => 0x00DE, 0x00E7 => 0x00C7, 0x03CA => 0x03AA, 0x0441 => 0x0421,
401 |         0x0432 => 0x0412, 0x010F => 0x010E, 0x00F8 => 0x00D8, 0x0077 => 0x0057, 0x011B => 0x011A,
402 |         0x0074 => 0x0054, 0x006A => 0x004A, 0x045B => 0x040B, 0x0456 => 0x0406, 0x0103 => 0x0102,
403 |         0x03BB => 0x039B, 0x00F1 => 0x00D1, 0x043D => 0x041D, 0x03CC => 0x038C, 0x00E9 => 0x00C9,
404 |         0x00F0 => 0x00D0, 0x0457 => 0x0407, 0x0123 => 0x0122,
405 |             ];
406 |     }
407 | 
408 |     $uni = utf8_to_unicode($string);
409 | 
410 |     if (!$uni) {
411 |         return false;
412 |     }
413 | 
414 |     $cnt = count($uni);
415 |     for ($i = 0; $i < $cnt; $i++) {
416 |         if (isset($UTF8_LOWER_TO_UPPER[$uni[$i]])) {
417 |             $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]];
418 |         }
419 |     }
420 | 
421 |     return utf8_from_unicode($uni);
422 | }
423 | 


--------------------------------------------------------------------------------
/src/phputf8/ord.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /**
 4 | * @package utf8
 5 | */
 6 | 
 7 | //---------------------------------------------------------------
 8 | /**
 9 | * UTF-8 aware alternative to ord
10 | * Returns the unicode ordinal for a character
11 | *
12 | * Joomla modification - As of PHP 7.4, curly brace access has been deprecated. As a result this function has been
13 | * modified to use square brace syntax
14 | * See https://github.com/php/php-src/commit/d574df63dc375f5fc9202ce5afde23f866b6450a
15 | * for additional references
16 | *
17 | * @param string UTF-8 encoded character
18 | * @return int unicode ordinal for the character
19 | * @see http://www.php.net/ord
20 | * @see http://www.php.net/manual/en/function.ord.php#46267
21 | */
22 | function utf8_ord($chr)
23 | {
24 |     $ord0 = ord($chr);
25 | 
26 |     if ($ord0 >= 0 && $ord0 <= 127) {
27 |         return $ord0;
28 |     }
29 | 
30 |     if (!isset($chr[1])) {
31 |         trigger_error('Short sequence - at least 2 bytes expected, only 1 seen');
32 |         return false;
33 |     }
34 | 
35 |     $ord1 = ord($chr[1]);
36 |     if ($ord0 >= 192 && $ord0 <= 223) {
37 |         return ($ord0 - 192) * 64
38 |             + ($ord1 - 128);
39 |     }
40 | 
41 |     if (!isset($chr[2])) {
42 |         trigger_error('Short sequence - at least 3 bytes expected, only 2 seen');
43 |         return false;
44 |     }
45 |     $ord2 = ord($chr[2]);
46 |     if ($ord0 >= 224 && $ord0 <= 239) {
47 |         return ($ord0 - 224) * 4096
48 |             + ($ord1 - 128) * 64
49 |                 + ($ord2 - 128);
50 |     }
51 | 
52 |     if (!isset($chr[3])) {
53 |         trigger_error('Short sequence - at least 4 bytes expected, only 3 seen');
54 |         return false;
55 |     }
56 |     $ord3 = ord($chr[3]);
57 |     if ($ord0 >= 240 && $ord0 <= 247) {
58 |         return ($ord0 - 240) * 262144
59 |             + ($ord1 - 128) * 4096
60 |                 + ($ord2 - 128) * 64
61 |                     + ($ord3 - 128);
62 |     }
63 | 
64 |     if (!isset($chr[4])) {
65 |         trigger_error('Short sequence - at least 5 bytes expected, only 4 seen');
66 |         return false;
67 |     }
68 |     $ord4 = ord($chr[4]);
69 |     if ($ord0 >= 248 && $ord0 <= 251) {
70 |         return ($ord0 - 248) * 16777216
71 |             + ($ord1 - 128) * 262144
72 |                 + ($ord2 - 128) * 4096
73 |                     + ($ord3 - 128) * 64
74 |                         + ($ord4 - 128);
75 |     }
76 | 
77 |     if (!isset($chr[5])) {
78 |         trigger_error('Short sequence - at least 6 bytes expected, only 5 seen');
79 |         return false;
80 |     }
81 |     if ($ord0 >= 252 && $ord0 <= 253) {
82 |         return ($ord0 - 252) * 1073741824
83 |             + ($ord1 - 128) * 16777216
84 |                 + ($ord2 - 128) * 262144
85 |                     + ($ord3 - 128) * 4096
86 |                         + ($ord4 - 128) * 64
87 |                             + (ord($chr[5]) - 128);
88 |     }
89 | 
90 |     if ($ord0 >= 254 && $ord0 <= 255) {
91 |         trigger_error('Invalid UTF-8 with surrogate ordinal ' . $ord0);
92 |         return false;
93 |     }
94 | }
95 | 


--------------------------------------------------------------------------------
/src/phputf8/str_ireplace.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /**
 4 | * @package utf8
 5 | */
 6 | 
 7 | //---------------------------------------------------------------
 8 | /**
 9 | * UTF-8 aware alternative to str_ireplace
10 | * Case-insensitive version of str_replace
11 | * Note: requires utf8_strtolower
12 | * Note: it's not fast and gets slower if $search / $replace is array
13 | * Notes: it's based on the assumption that the lower and uppercase
14 | * versions of a UTF-8 character will have the same length in bytes
15 | * which is currently true given the hash table to strtolower
16 | * @param string
17 | * @return string
18 | * @see http://www.php.net/str_ireplace
19 | * @see utf8_strtolower
20 | * @package utf8
21 | */
22 | function utf8_ireplace($search, $replace, $str, $count = null)
23 | {
24 |     if (!is_array($search)) {
25 |         $slen = strlen($search);
26 |         if ($slen == 0) {
27 |             return $str;
28 |         }
29 | 
30 |         $lendif = strlen($replace) - strlen($search);
31 |         $search = utf8_strtolower($search);
32 | 
33 |         $search  = preg_quote($search, '/');
34 |         $lstr    = utf8_strtolower($str);
35 |         $i       = 0;
36 |         $matched = 0;
37 |         while (preg_match('/(.*)' . $search . '/Us', $lstr, $matches)) {
38 |             if ($i === $count) {
39 |                 break;
40 |             }
41 |             $mlen = strlen($matches[0]);
42 |             $lstr = substr($lstr, $mlen);
43 |             $str  = substr_replace($str, $replace, $matched + strlen($matches[1]), $slen);
44 |             $matched += $mlen + $lendif;
45 |             $i++;
46 |         }
47 |         return $str;
48 |     } else {
49 |         foreach (array_keys($search) as $k) {
50 |             if (is_array($replace)) {
51 |                 if (array_key_exists($k, $replace)) {
52 |                     $str = utf8_ireplace($search[$k], $replace[$k], $str, $count);
53 |                 } else {
54 |                     $str = utf8_ireplace($search[$k], '', $str, $count);
55 |                 }
56 |             } else {
57 |                 $str = utf8_ireplace($search[$k], $replace, $str, $count);
58 |             }
59 |         }
60 |         return $str;
61 |     }
62 | }
63 | 


--------------------------------------------------------------------------------
/src/phputf8/str_pad.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /**
 4 | * @package utf8
 5 | */
 6 | 
 7 | //---------------------------------------------------------------
 8 | /**
 9 | * Replacement for str_pad. $padStr may contain multi-byte characters.
10 | *
11 | * @author Oliver Saunders <oliver (a) osinternetservices.com>
12 | * @param string $input
13 | * @param int $length
14 | * @param string $padStr
15 | * @param int $type ( same constants as str_pad )
16 | * @return string
17 | * @see http://www.php.net/str_pad
18 | * @see utf8_substr
19 | * @package utf8
20 | */
21 | function utf8_str_pad($input, $length, $padStr = ' ', $type = STR_PAD_RIGHT)
22 | {
23 |     $inputLen = utf8_strlen($input);
24 |     if ($length <= $inputLen) {
25 |         return $input;
26 |     }
27 | 
28 |     $padStrLen = utf8_strlen($padStr);
29 |     $padLen    = $length - $inputLen;
30 | 
31 |     if ($type == STR_PAD_RIGHT) {
32 |         $repeatTimes = ceil($padLen / $padStrLen);
33 |         return utf8_substr($input . str_repeat($padStr, $repeatTimes), 0, $length);
34 |     }
35 | 
36 |     if ($type == STR_PAD_LEFT) {
37 |         $repeatTimes = ceil($padLen / $padStrLen);
38 |         return utf8_substr(str_repeat($padStr, $repeatTimes), 0, floor($padLen)) . $input;
39 |     }
40 | 
41 |     if ($type == STR_PAD_BOTH) {
42 |         $padLen /= 2;
43 |         $padAmountLeft    = floor($padLen);
44 |         $padAmountRight   = ceil($padLen);
45 |         $repeatTimesLeft  = ceil($padAmountLeft / $padStrLen);
46 |         $repeatTimesRight = ceil($padAmountRight / $padStrLen);
47 | 
48 |         $paddingLeft  = utf8_substr(str_repeat($padStr, $repeatTimesLeft), 0, $padAmountLeft);
49 |         $paddingRight = utf8_substr(str_repeat($padStr, $repeatTimesRight), 0, $padAmountLeft);
50 |         return $paddingLeft . $input . $paddingRight;
51 |     }
52 | 
53 |     trigger_error('utf8_str_pad: Unknown padding type (' . $type . ')', E_USER_ERROR);
54 | }
55 | 


--------------------------------------------------------------------------------
/src/phputf8/str_split.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /**
 4 | * @package utf8
 5 | */
 6 | 
 7 | //---------------------------------------------------------------
 8 | /**
 9 | * UTF-8 aware alternative to str_split
10 | * Convert a string to an array
11 | * Note: requires utf8_strlen to be loaded
12 | * @param string UTF-8 encoded
13 | * @param int number to characters to split string by
14 | * @return string characters in string reverses
15 | * @see http://www.php.net/str_split
16 | * @see utf8_strlen
17 | * @package utf8
18 | */
19 | function utf8_str_split($str, $split_len = 1)
20 | {
21 |     if (!preg_match('/^[0-9]+$/', $split_len) || $split_len < 1) {
22 |         return false;
23 |     }
24 | 
25 |     $len = utf8_strlen($str);
26 |     if ($len <= $split_len) {
27 |         return [$str];
28 |     }
29 | 
30 |     preg_match_all('/.{' . $split_len . '}|[^\x00]{1,' . $split_len . '}$/us', $str, $ar);
31 |     return $ar[0];
32 | }
33 | 


--------------------------------------------------------------------------------
/src/phputf8/strcasecmp.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /**
 4 | * @package utf8
 5 | */
 6 | 
 7 | //---------------------------------------------------------------
 8 | /**
 9 | * UTF-8 aware alternative to strcasecmp
10 | * A case insensivite string comparison
11 | * Note: requires utf8_strtolower
12 | * @param string
13 | * @param string
14 | * @return int
15 | * @see http://www.php.net/strcasecmp
16 | * @see utf8_strtolower
17 | * @package utf8
18 | */
19 | function utf8_strcasecmp($strX, $strY)
20 | {
21 |     $strX = utf8_strtolower($strX);
22 |     $strY = utf8_strtolower($strY);
23 |     return strcmp($strX, $strY);
24 | }
25 | 


--------------------------------------------------------------------------------
/src/phputf8/strcspn.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /**
 4 | * @package utf8
 5 | */
 6 | 
 7 | //---------------------------------------------------------------
 8 | /**
 9 | * UTF-8 aware alternative to strcspn
10 | * Find length of initial segment not matching mask
11 | * Note: requires utf8_strlen and utf8_substr (if start, length are used)
12 | * @param string
13 | * @return int
14 | * @see http://www.php.net/strcspn
15 | * @see utf8_strlen
16 | * @package utf8
17 | */
18 | function utf8_strcspn($str, $mask, $start = null, $length = null)
19 | {
20 |     if (empty($mask) || strlen($mask) == 0) {
21 |         return null;
22 |     }
23 | 
24 |     $mask = preg_replace('!([\\\\\\-\\]\\[/^])!', '\\\${1}', $mask);
25 | 
26 |     if ($start !== null || $length !== null) {
27 |         $str = utf8_substr($str, $start, $length);
28 |     }
29 | 
30 |     preg_match('/^[^' . $mask . ']+/u', $str, $matches);
31 | 
32 |     if (isset($matches[0])) {
33 |         return utf8_strlen($matches[0]);
34 |     }
35 | 
36 |     return 0;
37 | }
38 | 


--------------------------------------------------------------------------------
/src/phputf8/stristr.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /**
 4 | * @package utf8
 5 | */
 6 | 
 7 | //---------------------------------------------------------------
 8 | /**
 9 | * UTF-8 aware alternative to stristr
10 | * Find first occurrence of a string using case insensitive comparison
11 | * Note: requires utf8_strtolower
12 | * @param string
13 | * @param string
14 | * @return int
15 | * @see http://www.php.net/strcasecmp
16 | * @see utf8_strtolower
17 | * @package utf8
18 | */
19 | function utf8_stristr($str, $search)
20 | {
21 |     if (strlen($search) == 0) {
22 |         return $str;
23 |     }
24 | 
25 |     $lstr    = utf8_strtolower($str);
26 |     $lsearch = utf8_strtolower($search);
27 |     //JOOMLA SPECIFIC FIX - BEGIN
28 |     preg_match('/^(.*)' . preg_quote($lsearch, '/') . '/Us', $lstr, $matches);
29 |     //JOOMLA SPECIFIC FIX - END
30 | 
31 |     if (count($matches) == 2) {
32 |         return substr($str, strlen($matches[1]));
33 |     }
34 | 
35 |     return false;
36 | }
37 | 


--------------------------------------------------------------------------------
/src/phputf8/strrev.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /**
 4 | * @package utf8
 5 | */
 6 | 
 7 | //---------------------------------------------------------------
 8 | /**
 9 | * UTF-8 aware alternative to strrev
10 | * Reverse a string
11 | * @param string UTF-8 encoded
12 | * @return string characters in string reverses
13 | * @see http://www.php.net/strrev
14 | * @package utf8
15 | */
16 | function utf8_strrev($str)
17 | {
18 |     preg_match_all('/./us', $str, $ar);
19 |     return join('', array_reverse($ar[0]));
20 | }
21 | 


--------------------------------------------------------------------------------
/src/phputf8/strspn.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /**
 4 | * @package utf8
 5 | */
 6 | 
 7 | //---------------------------------------------------------------
 8 | /**
 9 | * UTF-8 aware alternative to strspn
10 | * Find length of initial segment matching mask
11 | * Note: requires utf8_strlen and utf8_substr (if start, length are used)
12 | * @param string
13 | * @return int
14 | * @see http://www.php.net/strspn
15 | * @package utf8
16 | */
17 | function utf8_strspn($str, $mask, $start = null, $length = null)
18 | {
19 |     $mask = preg_replace('!([\\\\\\-\\]\\[/^])!', '\\\${1}', $mask);
20 | 
21 |     // Fix for $start but no $length argument.
22 |     if ($start !== null && $length === null) {
23 |         $length = utf8_strlen($str);
24 |     }
25 | 
26 |     if ($start !== null || $length !== null) {
27 |         $str = utf8_substr($str, $start, $length);
28 |     }
29 | 
30 |     preg_match('/^[' . $mask . ']+/u', $str, $matches);
31 | 
32 |     if (isset($matches[0])) {
33 |         return utf8_strlen($matches[0]);
34 |     }
35 | 
36 |     return 0;
37 | }
38 | 


--------------------------------------------------------------------------------
/src/phputf8/substr_replace.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /**
 4 | * @package utf8
 5 | */
 6 | 
 7 | //---------------------------------------------------------------
 8 | /**
 9 | * UTF-8 aware substr_replace.
10 | * Note: requires utf8_substr to be loaded
11 | * @see http://www.php.net/substr_replace
12 | * @see utf8_strlen
13 | * @see utf8_substr
14 | */
15 | function utf8_substr_replace($str, $repl, $start, $length = null)
16 | {
17 |     preg_match_all('/./us', $str, $ar);
18 |     preg_match_all('/./us', $repl, $rar);
19 |     if ($length === null) {
20 |         $length = utf8_strlen($str);
21 |     }
22 |     array_splice($ar[0], $start, $length, $rar[0]);
23 |     return join('', $ar[0]);
24 | }
25 | 


--------------------------------------------------------------------------------
/src/phputf8/trim.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /**
 4 | * @package utf8
 5 | */
 6 | 
 7 | //---------------------------------------------------------------
 8 | /**
 9 | * UTF-8 aware replacement for ltrim()
10 | * Note: you only need to use this if you are supplying the charlist
11 | * optional arg and it contains UTF-8 characters. Otherwise ltrim will
12 | * work normally on a UTF-8 string
13 | * @author Andreas Gohr <andi@splitbrain.org>
14 | * @see http://www.php.net/ltrim
15 | * @see http://dev.splitbrain.org/view/darcs/dokuwiki/inc/utf8.php
16 | * @return string
17 | * @package utf8
18 | */
19 | function utf8_ltrim($str, $charlist = false)
20 | {
21 |     if ($charlist === false) {
22 |         return ltrim($str);
23 |     }
24 | 
25 |     //quote charlist for use in a characterclass
26 |     $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!', '\\\${1}', $charlist);
27 | 
28 |     return preg_replace('/^[' . $charlist . ']+/u', '', $str);
29 | }
30 | 
31 | //---------------------------------------------------------------
/**
* UTF-8 aware replacement for rtrim()
* Note: you only need to use this if you are supplying the charlist
* optional arg and it contains UTF-8 characters. Otherwise rtrim will
* work normally on a UTF-8 string
* @author Andreas Gohr <andi@splitbrain.org>
* @see http://www.php.net/rtrim
* @see http://dev.splitbrain.org/view/darcs/dokuwiki/inc/utf8.php
* @param string text to trim
* @param string|false characters to strip; FALSE uses the defaults of rtrim()
* @return string
* @package utf8
*/
function utf8_rtrim($str, $charlist = false)
{
    if ($charlist === false) {
        return rtrim($str);
    }

    // Nothing to strip. An empty list would also produce the invalid
    // character class "[]" and make preg_replace fail with a warning.
    if ((string) $charlist === '') {
        return $str;
    }

    //quote charlist for use in a characterclass
    $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!', '\\\${1}', $charlist);

    // D (dollar end only): without it "$" also matches just before a final
    // newline, so characters in front of a trailing "\n" were stripped
    return preg_replace('/[' . $charlist . ']+$/uD', '', $str);
}
54 | 
55 | //---------------------------------------------------------------
/**
* UTF-8 aware replacement for trim()
* Note: you only need to use this if you are supplying the charlist
* optional arg and it contains UTF-8 characters. Otherwise trim will
* work normally on a UTF-8 string
* @author Andreas Gohr <andi@splitbrain.org>
* @see http://www.php.net/trim
* @see http://dev.splitbrain.org/view/darcs/dokuwiki/inc/utf8.php
* @see utf8_ltrim
* @see utf8_rtrim
* @param string text to trim
* @param string|false characters to strip; FALSE uses the defaults of trim()
* @return string
* @package utf8
*/
function utf8_trim($str, $charlist = false)
{
    // With a charlist the right side is stripped first, then the left side
    return $charlist === false
        ? trim($str)
        : utf8_ltrim(utf8_rtrim($str, $charlist), $charlist);
}
74 | 


--------------------------------------------------------------------------------
/src/phputf8/ucfirst.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /**
 4 | * @package utf8
 5 | */
 6 | 
 7 | //---------------------------------------------------------------
 8 | /**
 9 | * UTF-8 aware alternative to ucfirst
10 | * Make a string's first character uppercase
11 | * Note: requires utf8_strtoupper
12 | * @param string
13 | * @return string with first character as upper case (if applicable)
14 | * @see http://www.php.net/ucfirst
15 | * @see utf8_strtoupper
16 | * @package utf8
17 | */
18 | function utf8_ucfirst($str)
19 | {
20 |     switch (utf8_strlen($str)) {
21 |         case 0:
22 |             return '';
23 |             break;
24 |         case 1:
25 |             return utf8_strtoupper($str);
26 |             break;
27 |         default:
28 |             preg_match('/^(.{1})(.*)$/us', $str, $matches);
29 |             return utf8_strtoupper($matches[1]) . $matches[2];
30 |             break;
31 |     }
32 | }
33 | 


--------------------------------------------------------------------------------
/src/phputf8/ucwords.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /**
 4 | * @package utf8
 5 | */
 6 | 
 7 | //---------------------------------------------------------------
 8 | /**
 9 | * UTF-8 aware alternative to ucwords
10 | * Uppercase the first character of each word in a string
11 | * Note: requires utf8_substr_replace and utf8_strtoupper
12 | * @param string
13 | * @return string with first char of each word uppercase
14 | * @see http://www.php.net/ucwords
15 | * @package utf8
16 | */
17 | function utf8_ucwords($str)
18 | {
19 |     // Note: [\x0c\x09\x0b\x0a\x0d\x20] matches;
20 |     // form feeds, horizontal tabs, vertical tabs, linefeeds and carriage returns
21 |     // This corresponds to the definition of a "word" defined at http://www.php.net/ucwords
22 |     $pattern = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';
23 |     return preg_replace_callback($pattern, 'utf8_ucwords_callback', $str);
24 | }
25 | 
26 | //---------------------------------------------------------------
/**
* Callback function for preg_replace_callback call in utf8_ucwords
* You don't need to call this yourself
* Note: requires utf8_strtoupper
* @param array of matches corresponding to a single word: [0] whitespace
* plus word, [2] the leading whitespace, [3] the first character of the word
* @return string with first char of the word in uppercase
* @see utf8_ucwords
* @see utf8_strtoupper
* @package utf8
*/
function utf8_ucwords_callback($matches)
{
    $leadingws = $matches[2];
    $first     = $matches[3];

    // The match is exactly: leading whitespace + first character + rest.
    // Cut by byte length instead of ltrim(): ltrim's default list does not
    // contain the form feed (treated as whitespace by the pattern) but does
    // contain NUL (treated as a word character), which corrupted such words.
    $rest = substr($matches[0], strlen($leadingws) + strlen($first));

    return $leadingws . utf8_strtoupper($first) . $rest;
}
43 | 


--------------------------------------------------------------------------------
/src/phputf8/utf8.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /**
 4 | * This is the dynamic loader for the library. It checks whether you have
 5 | * the mbstring extension available and includes relevant files
 6 | * on that basis, falling back to the native (as in written in PHP) version
 7 | * if mbstring is unavailabe.
 8 | *
 9 | * It's probably easiest to use this, if you don't want to understand
10 | * the dependencies involved, in conjunction with PHP versions etc. At
11 | * the same time, you might get better performance by managing loading
12 | * yourself. The smartest way to do this, bearing in mind performance,
13 | * is probably to "load on demand" - i.e. just before you use these
14 | * functions in your code, load the version you need.
15 | *
16 | * It makes sure the the following functions are available;
17 | * utf8_strlen, utf8_strpos, utf8_strrpos, utf8_substr,
18 | * utf8_strtolower, utf8_strtoupper
19 | * Other functions in the ./native directory depend on these
20 | * six functions being available
21 | * @package utf8
22 | */
23 | 
24 | /**
25 | * Put the current directory in this constant
26 | */
27 | if (!defined('UTF8')) {
28 |     define('UTF8', dirname(__FILE__));
29 | }
30 | 
31 | /**
32 | * If string overloading is active, it will break many of the
33 | * native implementations. mbstring.func_overload must be set
34 | * to 0, 1 or 4 in php.ini (string overloading disabled).
35 | * Also need to check we have the correct internal mbstring
36 | * encoding
37 | */
38 | if (extension_loaded('mbstring')) {
39 |     /*
40 |      * Joomla modification - As of PHP 8, the `mbstring.func_overload` configuration has been removed and the
41 |      * MB_OVERLOAD_STRING constant will no longer be present, so this check only runs for PHP 7 and older
42 |      * See https://github.com/php/php-src/commit/331e56ce38a91e87a6fb8e88154bb5bde445b132
43 |      * and https://github.com/php/php-src/commit/97df99a6d7d96a886ac143337fecad775907589a
44 |      * for additional references
45 |      */
46 |     if (PHP_VERSION_ID < 80000 && ((int) ini_get('mbstring.func_overload')) & MB_OVERLOAD_STRING) {
47 |         trigger_error('String functions are overloaded by mbstring', E_USER_ERROR);
48 |     }
49 |     mb_internal_encoding('UTF-8');
50 | }
51 | 
52 | /**
53 | * Check whether PCRE has been compiled with UTF-8 support
54 | */
55 | $UTF8_ar = [];
56 | if (preg_match('/^.{1}$/u', "ñ", $UTF8_ar) != 1) {
57 |     trigger_error('PCRE is not compiled with UTF-8 support', E_USER_ERROR);
58 | }
59 | unset($UTF8_ar);
60 | 
61 | 
62 | /**
63 | * Load the smartest implementations of utf8_strpos, utf8_strrpos
64 | * and utf8_substr
65 | */
66 | if (!defined('UTF8_CORE')) {
67 |     if (function_exists('mb_substr')) {
68 |         require_once UTF8 . '/mbstring/core.php';
69 |     } else {
70 |         require_once UTF8 . '/utils/unicode.php';
71 |         require_once UTF8 . '/native/core.php';
72 |     }
73 | }
74 | 
75 | /**
76 | * Load the native implementation of utf8_substr_replace
77 | */
78 | require_once UTF8 . '/substr_replace.php';
79 | 
80 | /**
81 | * You should now be able to use all the other utf_* string functions
82 | */
83 | 


--------------------------------------------------------------------------------
/src/phputf8/utils/ascii.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | /**
  4 | * Tools to help with ASCII in UTF-8
  5 | *
  6 | * @package utf8
  7 | */
  8 | 
  9 | //--------------------------------------------------------------------
 10 | /**
 11 | * Tests whether a string contains only 7bit ASCII bytes.
 12 | * You might use this to conditionally check whether a string
 13 | * needs handling as UTF-8 or not, potentially offering performance
 14 | * benefits by using the native PHP equivalent if it's just ASCII e.g.;
 15 | *
 16 | * <code>
 17 | * if ( utf8_is_ascii($someString) ) {
 18 | *     // It's just ASCII - use the native PHP version
 19 | *     $someString = strtolower($someString);
 20 | * } else {
 21 | *     $someString = utf8_strtolower($someString);
 22 | * }
 23 | * </code>
 24 | *
 25 | * @param string
 26 | * @return boolean TRUE if it's all ASCII
 27 | * @package utf8
 28 | * @see utf8_is_ascii_ctrl
 29 | */
 30 | function utf8_is_ascii($str)
 31 | {
 32 |     // Search for any bytes which are outside the ASCII range...
 33 |     return (preg_match('/(?:[^\x00-\x7F])/', $str) !== 1);
 34 | }
 35 | 
 36 | //--------------------------------------------------------------------
 37 | /**
 38 | * Tests whether a string contains only 7bit ASCII bytes with device
 39 | * control codes omitted. The device control codes can be found on the
 40 | * second table here: http://www.w3schools.com/tags/ref_ascii.asp
 41 | *
 42 | * @param string
 43 | * @return boolean TRUE if it's all ASCII without device control codes
 44 | * @package utf8
 45 | * @see utf8_is_ascii
 46 | */
 47 | function utf8_is_ascii_ctrl($str)
 48 | {
 49 |     if (strlen($str) > 0) {
 50 |         // Search for any bytes which are outside the ASCII range,
 51 |         // or are device control codes
 52 |         return (preg_match('/[^\x09\x0A\x0D\x20-\x7E]/', $str) !== 1);
 53 |     }
 54 |     return false;
 55 | }
 56 | 
 57 | //--------------------------------------------------------------------
 58 | /**
 59 | * Strip out all non-7bit ASCII bytes
 60 | * If you need to transmit a string to system which you know can only
 61 | * support 7bit ASCII, you could use this function.
 62 | * @param string
 63 | * @return string with non ASCII bytes removed
 64 | * @package utf8
 65 | * @see utf8_strip_non_ascii_ctrl
 66 | */
 67 | function utf8_strip_non_ascii($str)
 68 | {
 69 |     ob_start();
 70 |     while (
 71 |         preg_match(
 72 |             '/^([\x00-\x7F]+)|([^\x00-\x7F]+)/S',
 73 |             $str,
 74 |             $matches
 75 |         )
 76 |     ) {
 77 |         if (!isset($matches[2])) {
 78 |             echo $matches[0];
 79 |         }
 80 |         $str = substr($str, strlen($matches[0]));
 81 |     }
 82 |     $result = ob_get_contents();
 83 |     ob_end_clean();
 84 |     return $result;
 85 | }
 86 | 
 87 | //--------------------------------------------------------------------
 88 | /**
 89 | * Strip out device control codes in the ASCII range
 90 | * which are not permitted in XML. Note that this leaves
 91 | * multi-byte characters untouched - it only removes device
 92 | * control codes
 93 | * @see http://hsivonen.iki.fi/producing-xml/#controlchar
 94 | * @param string
 95 | * @return string control codes removed
 96 | */
 97 | function utf8_strip_ascii_ctrl($str)
 98 | {
 99 |     ob_start();
100 |     while (
101 |         preg_match(
102 |             '/^([^\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+)|([\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+)/S',
103 |             $str,
104 |             $matches
105 |         )
106 |     ) {
107 |         if (!isset($matches[2])) {
108 |             echo $matches[0];
109 |         }
110 |         $str = substr($str, strlen($matches[0]));
111 |     }
112 |     $result = ob_get_contents();
113 |     ob_end_clean();
114 |     return $result;
115 | }
116 | 
117 | //--------------------------------------------------------------------
/**
* Strip out all non 7bit ASCII bytes and ASCII device control codes.
* For a list of ASCII device control codes see the 2nd table here:
* http://www.w3schools.com/tags/ref_ascii.asp
* Only tab, linefeed, carriage return and the bytes 0x20-0x7E are kept.
*
* @param string
* @return string with non ASCII bytes and device control codes removed
* @package utf8
*/
function utf8_strip_non_ascii_ctrl($str)
{
    // One byte-wise replace instead of collecting the kept parts through
    // output buffering, which depended on the global output buffer state
    $clean = preg_replace('/[^\x09\x0A\x0D\x20-\x7E]+/', '', $str);

    return $clean === null ? '' : $clean;
}
146 | 
147 | //---------------------------------------------------------------
/**
* Replace accented UTF-8 characters by unaccented ASCII-7 "equivalents".
* Characters commonly found in Latin alphabets are swapped for something
* more or less equivalent from the ASCII range, which is handy when turning
* UTF-8 text into, for example, a filename. Characters that are not listed
* in the tables below are left untouched, so you would probably also pass
* the result through utf8_strip_non_ascii to clean out any other non-ASCII
* chars.
*
* For a more complete implementation of transliteration, see the utf8_to_ascii package
* available from the phputf8 project downloads:
* http://prdownloads.sourceforge.net/phputf8
*
* @param string $str UTF-8 string
* @param int $case (optional) negative: lowercase letters only, positive:
*                  uppercase letters only, 0 (default): both cases
* @return string accented chars replaced with ascii equivalents
* @author Andreas Gohr <andi@splitbrain.org>
* @package utf8
*/
function utf8_accents_to_ascii($str, $case = 0)
{
    // Both tables are built on first use and kept for later calls.
    static $lowerMap = null;
    static $upperMap = null;

    $deaccentLower = ($case <= 0);
    $deaccentUpper = ($case >= 0);

    if ($deaccentLower) {
        if ($lowerMap === null) {
            $lowerMap = [
            'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
            'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
            'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
            'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
            'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
            'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
            'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
            'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
            'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
            'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
            'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
            'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
            'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
            'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
            'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
            ];
        }

        $accented = array_keys($lowerMap);
        $plain    = array_values($lowerMap);
        $str      = str_replace($accented, $plain, $str);
    }

    if ($deaccentUpper) {
        if ($upperMap === null) {
            $upperMap = [
            'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
            'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
            'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
            'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
            'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
            'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
            'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
            'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
            'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
            'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
            'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
            'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
            'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
            'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
            'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
            ];
        }

        $accented = array_keys($upperMap);
        $plain    = array_values($upperMap);
        $str      = str_replace($accented, $plain, $str);
    }

    return $str;
}
231 | 


--------------------------------------------------------------------------------
/src/phputf8/utils/bad.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | /**
  4 | * Tools for locating / replacing bad bytes in UTF-8 strings
  5 | * The Original Code is Mozilla Communicator client code.
  6 | * The Initial Developer of the Original Code is
  7 | * Netscape Communications Corporation.
  8 | * Portions created by the Initial Developer are Copyright (C) 1998
  9 | * the Initial Developer. All Rights Reserved.
 10 | * Ported to PHP by Henri Sivonen (http://hsivonen.iki.fi)
 11 | * Slight modifications to fit with phputf8 library by Harry Fuecks (hfuecks gmail com)
 12 | * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
 13 | * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
 14 | * @see http://hsivonen.iki.fi/php-utf8/
 15 | * @package utf8
 16 | * @see utf8_is_valid
 17 | */
 18 | 
 19 | //--------------------------------------------------------------------
 20 | /**
 21 | * Locates the first bad byte in a UTF-8 string returning it's
 22 | * byte index in the string
 23 | * PCRE Pattern to locate bad bytes in a UTF-8 string
 24 | * Comes from W3 FAQ: Multilingual Forms
 25 | * Note: modified to include full ASCII range including control chars
 26 | * @see http://www.w3.org/International/questions/qa-forms-utf-8
 27 | * @param string
 28 | * @return mixed integer byte index or FALSE if no bad found
 29 | * @package utf8
 30 | */
 31 | function utf8_bad_find($str)
 32 | {
 33 |     $UTF8_BAD =
 34 |     '([\x00-\x7F]' .                          # ASCII (including control chars)
 35 |     '|[\xC2-\xDF][\x80-\xBF]' .               # non-overlong 2-byte
 36 |     '|\xE0[\xA0-\xBF][\x80-\xBF]' .           # excluding overlongs
 37 |     '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}' .    # straight 3-byte
 38 |     '|\xED[\x80-\x9F][\x80-\xBF]' .           # excluding surrogates
 39 |     '|\xF0[\x90-\xBF][\x80-\xBF]{2}' .        # planes 1-3
 40 |     '|[\xF1-\xF3][\x80-\xBF]{3}' .            # planes 4-15
 41 |     '|\xF4[\x80-\x8F][\x80-\xBF]{2}' .        # plane 16
 42 |     '|(.{1}))';                              # invalid byte
 43 |     $pos     = 0;
 44 |     $badList = [];
 45 |     while (preg_match('/' . $UTF8_BAD . '/S', $str, $matches)) {
 46 |         $bytes = strlen($matches[0]);
 47 |         if (isset($matches[2])) {
 48 |             return $pos;
 49 |         }
 50 |         $pos += $bytes;
 51 |         $str = substr($str, $bytes);
 52 |     }
 53 |     return false;
 54 | }
 55 | 
 56 | //--------------------------------------------------------------------
 57 | /**
 58 | * Locates all bad bytes in a UTF-8 string and returns a list of their
 59 | * byte index in the string
 60 | * PCRE Pattern to locate bad bytes in a UTF-8 string
 61 | * Comes from W3 FAQ: Multilingual Forms
 62 | * Note: modified to include full ASCII range including control chars
 63 | * @see http://www.w3.org/International/questions/qa-forms-utf-8
 64 | * @param string
 65 | * @return mixed array of integers or FALSE if no bad found
 66 | * @package utf8
 67 | */
 68 | function utf8_bad_findall($str)
 69 | {
 70 |     $UTF8_BAD =
 71 |     '([\x00-\x7F]' .                          # ASCII (including control chars)
 72 |     '|[\xC2-\xDF][\x80-\xBF]' .               # non-overlong 2-byte
 73 |     '|\xE0[\xA0-\xBF][\x80-\xBF]' .           # excluding overlongs
 74 |     '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}' .    # straight 3-byte
 75 |     '|\xED[\x80-\x9F][\x80-\xBF]' .           # excluding surrogates
 76 |     '|\xF0[\x90-\xBF][\x80-\xBF]{2}' .        # planes 1-3
 77 |     '|[\xF1-\xF3][\x80-\xBF]{3}' .            # planes 4-15
 78 |     '|\xF4[\x80-\x8F][\x80-\xBF]{2}' .        # plane 16
 79 |     '|(.{1}))';                              # invalid byte
 80 |     $pos     = 0;
 81 |     $badList = [];
 82 |     while (preg_match('/' . $UTF8_BAD . '/S', $str, $matches)) {
 83 |         $bytes = strlen($matches[0]);
 84 |         if (isset($matches[2])) {
 85 |             $badList[] = $pos;
 86 |         }
 87 |         $pos += $bytes;
 88 |         $str = substr($str, $bytes);
 89 |     }
 90 |     if (count($badList) > 0) {
 91 |         return $badList;
 92 |     }
 93 |     return false;
 94 | }
 95 | 
 96 | //--------------------------------------------------------------------
 97 | /**
 98 | * Strips out any bad bytes from a UTF-8 string and returns the rest
 99 | * PCRE Pattern to locate bad bytes in a UTF-8 string
100 | * Comes from W3 FAQ: Multilingual Forms
101 | * Note: modified to include full ASCII range including control chars
102 | * @see http://www.w3.org/International/questions/qa-forms-utf-8
103 | * @param string
104 | * @return string
105 | * @package utf8
106 | */
107 | function utf8_bad_strip($str)
108 | {
109 |     $UTF8_BAD =
110 |     '([\x00-\x7F]' .                          # ASCII (including control chars)
111 |     '|[\xC2-\xDF][\x80-\xBF]' .               # non-overlong 2-byte
112 |     '|\xE0[\xA0-\xBF][\x80-\xBF]' .           # excluding overlongs
113 |     '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}' .    # straight 3-byte
114 |     '|\xED[\x80-\x9F][\x80-\xBF]' .           # excluding surrogates
115 |     '|\xF0[\x90-\xBF][\x80-\xBF]{2}' .        # planes 1-3
116 |     '|[\xF1-\xF3][\x80-\xBF]{3}' .            # planes 4-15
117 |     '|\xF4[\x80-\x8F][\x80-\xBF]{2}' .        # plane 16
118 |     '|(.{1}))';                              # invalid byte
119 |     ob_start();
120 |     while (preg_match('/' . $UTF8_BAD . '/S', $str, $matches)) {
121 |         if (!isset($matches[2])) {
122 |             echo $matches[0];
123 |         }
124 |         $str = substr($str, strlen($matches[0]));
125 |     }
126 |     $result = ob_get_contents();
127 |     ob_end_clean();
128 |     return $result;
129 | }
130 | 
131 | //--------------------------------------------------------------------
/**
* Replace bad bytes with an alternative character - an ASCII character
* is recommended as the replacement.
* The PCRE pattern used to recognise well formed characters comes from
* the W3 FAQ: Multilingual Forms.
* Note: modified to include full ASCII range including control chars
* @see http://www.w3.org/International/questions/qa-forms-utf-8
* @param string $str string to search
* @param string $replace text to replace each bad byte with (defaults to '?') - use ASCII
* @return string the input with every invalid byte replaced
* @package utf8
*/
function utf8_bad_replace($str, $replace = '?')
{
    // Each alternative matches one well formed UTF-8 character. The final
    // alternative is the only one inside capture group 2, so that group is
    // set exactly when the byte at the current position is invalid.
    $UTF8_BAD =
    '([\x00-\x7F]' .                          // ASCII (including control chars)
    '|[\xC2-\xDF][\x80-\xBF]' .               // non-overlong 2-byte
    '|\xE0[\xA0-\xBF][\x80-\xBF]' .           // excluding overlongs
    '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}' .    // straight 3-byte
    '|\xED[\x80-\x9F][\x80-\xBF]' .           // excluding surrogates
    '|\xF0[\x90-\xBF][\x80-\xBF]{2}' .        // planes 1-3
    '|[\xF1-\xF3][\x80-\xBF]{3}' .            // planes 4-15
    '|\xF4[\x80-\x8F][\x80-\xBF]{2}' .        // plane 16
    '|(.{1}))';                               // invalid byte

    // 'A' anchors every attempt at the requested offset and 's' lets '.'
    // consume any byte regardless of PCRE's newline convention.
    $pattern = '/' . $UTF8_BAD . '/AsS';

    $str    = (string) $str;
    $len    = strlen($str);
    $pos    = 0;
    $result = '';

    // Build the result in a plain string. The previous
    // echo-into-output-buffer approach breaks when called from an output
    // handler, and chopping the input with substr() per character made
    // the scan quadratic.
    while ($pos < $len && preg_match($pattern, $str, $matches, 0, $pos)) {
        // One replacement is emitted per bad byte; good characters are
        // copied through unchanged.
        $result .= isset($matches[2]) ? $replace : $matches[0];
        $pos    += strlen($matches[0]);
    }

    return $result;
}
169 | 
170 | //--------------------------------------------------------------------
/**
* Return code from utf8_bad_identify() when the first octet of a five
* octet sequence is detected.
* Note: such a sequence can only encode a non-shortest form or a codepoint
* outside the Unicode range 0-0x10FFFF, so it never represents a useful
* character
* @see utf8_bad_identify
* @package utf8
*/
define('UTF8_BAD_5OCTET', 1);

/**
* Return code from utf8_bad_identify() when the first octet of a six
* octet sequence is detected.
* Note: such a sequence can only encode a non-shortest form or a codepoint
* outside the Unicode range 0-0x10FFFF, so it never represents a useful
* character
* @see utf8_bad_identify
* @package utf8
*/
define('UTF8_BAD_6OCTET', 2);

/**
* Return code from utf8_bad_identify().
* Invalid octet for use as start of multi-byte UTF-8 sequence
* @see utf8_bad_identify
* @package utf8
*/
define('UTF8_BAD_SEQID', 3);

/**
* Return code from utf8_bad_identify().
* From Unicode 3.1, non-shortest form is illegal
* @see utf8_bad_identify
* @package utf8
*/
define('UTF8_BAD_NONSHORT', 4);

/**
* Return code from utf8_bad_identify().
* From Unicode 3.2, surrogate characters are illegal
* @see utf8_bad_identify
* @package utf8
*/
define('UTF8_BAD_SURROGATE', 5);

/**
* Return code from utf8_bad_identify().
* Codepoints outside the Unicode range (above 0x10FFFF) are illegal
* @see utf8_bad_identify
* @package utf8
*/
define('UTF8_BAD_UNIOUTRANGE', 6);

/**
* Return code from utf8_bad_identify().
* Incomplete multi-octet sequence: a continuation octet was expected but
* another octet, or the end of the string, was found instead.
* Note: this is kind of a "catch-all"
* @see utf8_bad_identify
* @package utf8
*/
define('UTF8_BAD_SEQINCOMPLETE', 7);
229 | 
230 | //--------------------------------------------------------------------
/**
* Reports on the type of bad byte found in a UTF-8 string. Scanning stops
* at the first problem and a status code describing it is returned.
*
* Joomla modification - As of PHP 7.4, curly brace access has been deprecated. As a result this function has been
* modified to use square brace syntax
* See https://github.com/php/php-src/commit/d574df63dc375f5fc9202ce5afde23f866b6450a
* for additional references
*
* @author <hsivonen@iki.fi>
* @param string $str UTF-8 encoded string
* @param int $i (by reference) receives the byte index at which scanning
*               stopped, or NULL when the string is valid. For an incomplete
*               sequence this is the octet just before the point where a
*               continuation octet was missing.
* @return mixed integer constant describing problem or FALSE if valid UTF-8
* @see utf8_bad_explain
* @see http://hsivonen.iki.fi/php-utf8/
* @package utf8
*/
function utf8_bad_identify($str, &$i)
{
    $remaining = 0; // continuation octets still expected for the current sequence
    $codepoint = 0; // codepoint assembled so far
    $seqLength = 1; // total number of octets announced by the lead octet

    $total = strlen($str);

    for ($i = 0; $i < $total; $i++) {
        $octet = ord($str[$i]);

        if ($remaining != 0) {
            // In the middle of a multi-octet sequence: only 10xxxxxx is legal.
            if (($octet & 0xC0) != 0x80) {
                // Incomplete multi-octet sequence.
                $i--;
                return UTF8_BAD_SEQINCOMPLETE;
            }

            // Slot the six payload bits into their place in the codepoint.
            $codepoint |= ($octet & 0x0000003F) << (($remaining - 1) * 6);
            $remaining--;

            if ($remaining != 0) {
                continue;
            }

            // The sequence is complete; validate the decoded codepoint.

            // From Unicode 3.1, non-shortest form is illegal
            $overlong = ((2 == $seqLength) && ($codepoint < 0x0080))
                || ((3 == $seqLength) && ($codepoint < 0x0800))
                || ((4 == $seqLength) && ($codepoint < 0x10000));

            if ($overlong) {
                return UTF8_BAD_NONSHORT;
            }

            // From Unicode 3.2, surrogate characters are illegal
            if (($codepoint & 0xFFFFF800) == 0xD800) {
                return UTF8_BAD_SURROGATE;
            }

            // Codepoints outside the Unicode range are illegal
            if ($codepoint > 0x10FFFF) {
                return UTF8_BAD_UNIOUTRANGE;
            }

            // Reset the decoder state for the next character.
            $codepoint = 0;
            $seqLength = 1;
            continue;
        }

        // Not inside a sequence: expect US-ASCII or a lead octet.
        if (($octet & 0x80) == 0) {
            // US-ASCII, pass straight through.
            $seqLength = 1;
            continue;
        }

        if (($octet & 0xE0) == 0xC0) {
            // First octet of 2 octet sequence
            $codepoint = ($octet & 0x1F) << 6;
            $remaining = 1;
            $seqLength = 2;
            continue;
        }

        if (($octet & 0xF0) == 0xE0) {
            // First octet of 3 octet sequence
            $codepoint = ($octet & 0x0F) << 12;
            $remaining = 2;
            $seqLength = 3;
            continue;
        }

        if (($octet & 0xF8) == 0xF0) {
            // First octet of 4 octet sequence
            $codepoint = ($octet & 0x07) << 18;
            $remaining = 3;
            $seqLength = 4;
            continue;
        }

        if (($octet & 0xFC) == 0xF8) {
            // First octet of 5 octet sequence. Illegal because the encoded
            // codepoint must be either not the shortest form or outside the
            // Unicode range of 0-0x10FFFF.
            return UTF8_BAD_5OCTET;
        }

        if (($octet & 0xFE) == 0xFC) {
            // First octet of 6 octet sequence, illegal for the same reason.
            return UTF8_BAD_6OCTET;
        }

        // Neither in the US-ASCII range nor a legal first octet of a
        // multi-octet sequence.
        return UTF8_BAD_SEQID;
    }

    if ($remaining != 0) {
        // The string ended in the middle of a multi-octet sequence.
        $i--;
        return UTF8_BAD_SEQINCOMPLETE;
    }

    // No bad octets found
    $i = null;
    return false;
}
356 | 
357 | //--------------------------------------------------------------------
/**
* Takes a return code from utf8_bad_identify() and returns a message
* (in English) explaining what the problem is. An unrecognised code
* raises an E_USER_WARNING.
* @param int $code return code from utf8_bad_identify
* @return mixed string message or FALSE if return code unknown
* @see utf8_bad_identify
* @package utf8
*/
function utf8_bad_explain($code)
{
    // Message for every known return code, in the order they are checked.
    $explanations = [
        UTF8_BAD_5OCTET        => 'Five octet sequences are valid UTF-8 but are not supported by Unicode',
        UTF8_BAD_6OCTET        => 'Six octet sequences are valid UTF-8 but are not supported by Unicode',
        UTF8_BAD_SEQID         => 'Invalid octet for use as start of multi-byte UTF-8 sequence',
        UTF8_BAD_NONSHORT      => 'From Unicode 3.1, non-shortest form is illegal',
        UTF8_BAD_SURROGATE     => 'From Unicode 3.2, surrogate characters are illegal',
        UTF8_BAD_UNIOUTRANGE   => 'Codepoints outside the Unicode range are illegal',
        UTF8_BAD_SEQINCOMPLETE => 'Incomplete multi-octet sequence',
    ];

    // Loose comparison on purpose: it accepts the same inputs a switch would.
    foreach ($explanations as $known => $message) {
        if ($code == $known) {
            return $message;
        }
    }

    trigger_error('Unknown error code: ' . $code, E_USER_WARNING);
    return false;
}
401 | 


--------------------------------------------------------------------------------
/src/phputf8/utils/patterns.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /**
 4 | * PCRE Regular expressions for UTF-8. Note this file is not actually used by
 5 | * the rest of the library but these regular expressions can be useful to have
 6 | * available.
 7 | * @see http://www.w3.org/International/questions/qa-forms-utf-8
 8 | * @package utf8
 9 | */
10 | 
11 | //--------------------------------------------------------------------
/**
* PCRE Pattern to check a UTF-8 string is valid: the whole subject must
* consist of zero or more well formed characters.
* Comes from W3 FAQ: Multilingual Forms
* Note: modified to include full ASCII range including control chars
* Note: the pattern carries no delimiters; wrap it before use.
* @see http://www.w3.org/International/questions/qa-forms-utf-8
* @package utf8
*/
$UTF8_VALID = '^(' . implode('|', [
    '[\x00-\x7F]',                          // ASCII (including control chars)
    '[\xC2-\xDF][\x80-\xBF]',               // non-overlong 2-byte
    '\xE0[\xA0-\xBF][\x80-\xBF]',           // excluding overlongs
    '[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}',    // straight 3-byte
    '\xED[\x80-\x9F][\x80-\xBF]',           // excluding surrogates
    '\xF0[\x90-\xBF][\x80-\xBF]{2}',        // planes 1-3
    '[\xF1-\xF3][\x80-\xBF]{3}',            // planes 4-15
    '\xF4[\x80-\x8F][\x80-\xBF]{2}',        // plane 16
]) . ')*$';

//--------------------------------------------------------------------
/**
* PCRE Pattern to match single UTF-8 characters; each kind of character
* sits in its own capture group.
* Comes from W3 FAQ: Multilingual Forms
* Note: modified to include full ASCII range including control chars
* Note: the pattern carries no delimiters; wrap it before use.
* @see http://www.w3.org/International/questions/qa-forms-utf-8
* @package utf8
*/
$UTF8_MATCH = implode('|', [
    '([\x00-\x7F])',                        // ASCII (including control chars)
    '([\xC2-\xDF][\x80-\xBF])',             // non-overlong 2-byte
    '(\xE0[\xA0-\xBF][\x80-\xBF])',         // excluding overlongs
    '([\xE1-\xEC\xEE\xEF][\x80-\xBF]{2})',  // straight 3-byte
    '(\xED[\x80-\x9F][\x80-\xBF])',         // excluding surrogates
    '(\xF0[\x90-\xBF][\x80-\xBF]{2})',      // planes 1-3
    '([\xF1-\xF3][\x80-\xBF]{3})',          // planes 4-15
    '(\xF4[\x80-\x8F][\x80-\xBF]{2})',      // plane 16
]);

//--------------------------------------------------------------------
/**
* PCRE Pattern to locate bad bytes in a UTF-8 string: group 1 wraps the
* whole match and group 2 is set only when the matched byte is invalid.
* Comes from W3 FAQ: Multilingual Forms
* Note: modified to include full ASCII range including control chars
* Note: the pattern carries no delimiters; wrap it before use.
* @see http://www.w3.org/International/questions/qa-forms-utf-8
* @package utf8
*/
$UTF8_BAD = '(' . implode('|', [
    '[\x00-\x7F]',                          // ASCII (including control chars)
    '[\xC2-\xDF][\x80-\xBF]',               // non-overlong 2-byte
    '\xE0[\xA0-\xBF][\x80-\xBF]',           // excluding overlongs
    '[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}',    // straight 3-byte
    '\xED[\x80-\x9F][\x80-\xBF]',           // excluding surrogates
    '\xF0[\x90-\xBF][\x80-\xBF]{2}',        // planes 1-3
    '[\xF1-\xF3][\x80-\xBF]{3}',            // planes 4-15
    '\xF4[\x80-\x8F][\x80-\xBF]{2}',        // plane 16
    '(.{1})',                               // invalid byte
]) . ')';
66 | 


--------------------------------------------------------------------------------
/src/phputf8/utils/position.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | /**
  4 | * Locate a byte index given a UTF-8 character index
  5 | * @package utf8
  6 | */
  7 | 
  8 | //--------------------------------------------------------------------
  9 | /**
 10 | * Given a string and a character index in the string, in
 11 | * terms of the UTF-8 character position, returns the byte
 12 | * index of that character. Can be useful when you want to
 13 | * PHP's native string functions but we warned, locating
 14 | * the byte can be expensive
 15 | * Takes variable number of parameters - first must be
 16 | * the search string then 1 to n UTF-8 character positions
 17 | * to obtain byte indexes for - it is more efficient to search
 18 | * the string for multiple characters at once, than make
 19 | * repeated calls to this function
 20 | *
 21 | * @author Chris Smith<chris@jalakai.co.uk>
 22 | * @param string string to locate index in
 23 | * @param int (n times)
 24 | * @return mixed - int if only one input int, array if more
 25 | * @return boolean TRUE if it's all ASCII
 26 | * @package utf8
 27 | */
 28 | function utf8_byte_position()
 29 | {
 30 |     $args = func_get_args();
 31 |     $str  =& array_shift($args);
 32 |     if (!is_string($str)) {
 33 |         return false;
 34 |     }
 35 | 
 36 |     $result = [];
 37 | 
 38 |     // trivial byte index, character offset pair
 39 |     $prev = [0,0];
 40 | 
 41 |     // use a short piece of str to estimate bytes per character
 42 |     // $i (& $j) -> byte indexes into $str
 43 |     $i = utf8_locate_next_chr($str, 300);
 44 | 
 45 |     // $c -> character offset into $str
 46 |     $c = strlen(utf8_decode(substr($str, 0, $i)));
 47 | 
 48 |     // deal with arguments from lowest to highest
 49 |     sort($args);
 50 | 
 51 |     foreach ($args as $offset) {
 52 |         // sanity checks FIXME
 53 | 
 54 |         // 0 is an easy check
 55 |         if ($offset == 0) {
 56 |             $result[] = 0;
 57 |             continue;
 58 |         }
 59 | 
 60 |         // ensure no endless looping
 61 |         $safety_valve = 50;
 62 | 
 63 |         do {
 64 |             if (($c - $prev[1]) == 0) {
 65 |                 // Hack: gone past end of string
 66 |                 $error = 0;
 67 |                 $i     = strlen($str);
 68 |                 break;
 69 |             }
 70 | 
 71 |             $j = $i + (int)(($offset - $c) * ($i - $prev[0]) / ($c - $prev[1]));
 72 | 
 73 |             // correct to utf8 character boundary
 74 |             $j = utf8_locate_next_chr($str, $j);
 75 | 
 76 |             // save the index, offset for use next iteration
 77 |             $prev = [$i,$c];
 78 | 
 79 |             if ($j > $i) {
 80 |                 // determine new character offset
 81 |                 $c += strlen(utf8_decode(substr($str, $i, $j - $i)));
 82 |             } else {
 83 |                 // ditto
 84 |                 $c -= strlen(utf8_decode(substr($str, $j, $i - $j)));
 85 |             }
 86 | 
 87 |             $error = abs($c - $offset);
 88 | 
 89 |             // ready for next time around
 90 |             $i = $j;
 91 | 
 92 |             // from 7 it is faster to iterate over the string
 93 |         } while (($error > 7) && --$safety_valve);
 94 | 
 95 |         if ($error && $error <= 7) {
 96 |             if ($c < $offset) {
 97 |                 // move up
 98 |                 while ($error--) {
 99 |                     $i = utf8_locate_next_chr($str, ++$i);
100 |                 }
101 |             } else {
102 |                 // move down
103 |                 while ($error--) {
104 |                     $i = utf8_locate_current_chr($str, --$i);
105 |                 }
106 |             }
107 | 
108 |             // ready for next arg
109 |             $c = $offset;
110 |         }
111 |         $result[] = $i;
112 |     }
113 | 
114 |     if (count($result) == 1) {
115 |         return $result[0];
116 |     }
117 | 
118 |     return $result;
119 | }
120 | 
121 | //--------------------------------------------------------------------
/**
* Given a string and any byte index, returns the byte index
* of the start of the current UTF-8 character, relative to the supplied
* position. If the current character begins at the supplied byte index,
* that index is returned unchanged. Otherwise the function steps
* backwards until it reaches the byte where the current character begins.
* An index at or below zero yields 0; an index at or beyond the end of
* the string yields the string's byte length.
* @author Chris Smith<chris@jalakai.co.uk>
* @param string $str string to inspect
* @param int $idx byte index in the string
* @return int byte index of start of current UTF-8 character
* @package utf8
*/
function utf8_locate_current_chr(&$str, $idx)
{
    if ($idx <= 0) {
        return 0;
    }

    $byteLength = strlen($str);

    if ($idx >= $byteLength) {
        return $byteLength;
    }

    // Every byte after the first in a multi-byte UTF-8 character looks like
    // 10xxxxxx, so masking with 0xC0 identifies it - assuming well formed
    // UTF-8. Walk back until a byte that is not of that kind (or index 0).
    while ($idx) {
        $isContinuation = (ord($str[$idx]) & 0xC0) == 0x80;

        if (!$isContinuation) {
            break;
        }

        $idx--;
    }

    return $idx;
}
155 | 
156 | //--------------------------------------------------------------------
/**
* Given a string and any byte index, returns the byte index
* of the start of the next UTF-8 character, relative to the supplied
* position. If a character begins at the supplied byte index, that
* index is returned unchanged. An index at or below zero yields 0; an
* index at or beyond the end of the string yields the string's byte length.
* @author Chris Smith<chris@jalakai.co.uk>
* @param string $str string to inspect
* @param int $idx byte index in the string
* @return int byte index of start of next UTF-8 character
* @package utf8
*/
function utf8_locate_next_chr(&$str, $idx)
{
    if ($idx <= 0) {
        return 0;
    }

    $byteLength = strlen($str);

    if ($idx >= $byteLength) {
        return $byteLength;
    }

    // Every byte after the first in a multi-byte UTF-8 character looks like
    // 10xxxxxx, so masking with 0xC0 identifies it - assuming well formed
    // UTF-8. Skip forward over such bytes until a character start or the
    // end of the string is reached.
    for (; $idx < $byteLength; $idx++) {
        $isContinuation = (ord($str[$idx]) & 0xC0) == 0x80;

        if (!$isContinuation) {
            break;
        }
    }

    return $idx;
}
188 | 


--------------------------------------------------------------------------------
/src/phputf8/utils/specials.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | /**
  4 | * Utilities for processing "special" characters in UTF-8. "Special" largely means anything which would
  5 | * be regarded as a non-word character, like ASCII control characters and punctuation. This has a "Roman"
  6 | * bias - it would be unaware of modern Chinese "punctuation" characters for example.
  7 | * Note: requires utils/unicode.php to be loaded
  8 | * @package utf8
  9 | * @see utf8_is_valid
 10 | */
 11 | 
 12 | //--------------------------------------------------------------------
 13 | /**
 14 | * Used internally. Builds a PCRE pattern from the $UTF8_SPECIAL_CHARS
 15 | * array defined in this file
 16 | * The $UTF8_SPECIAL_CHARS should contain all special characters (non-letter/non-digit)
 17 | * defined in the various local charsets - it's not a complete list of
 18 | * non-alphanum characters in UTF-8. It's not perfect but should match most
 19 | * cases of special chars.
 20 | * This function adds the control chars 0x00 to 0x19 to the array of
 21 | * special chars (they are not included in $UTF8_SPECIAL_CHARS)
 22 | * @package utf8
 23 | * @return string
 24 | * @see utf8_from_unicode
 25 | * @see utf8_is_word_chars
 26 | * @see utf8_strip_specials
 27 | */
 28 | function utf8_specials_pattern()
 29 | {
 30 |     static $pattern = null;
 31 | 
 32 |     if (!$pattern) {
 33 |         $UTF8_SPECIAL_CHARS = [
 34 |         0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
 35 |         0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c,
 36 |         0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
 37 |         0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
 38 |         0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
 39 |         0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
 40 |         0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
 41 |         0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
 42 |         0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
 43 |         0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
 44 |         0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
 45 |         0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
 46 |         0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
 47 |         0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
 48 |         0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
 49 |         0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
 50 |         0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
 51 |         0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
 52 |         0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
 53 |         0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
 54 |         0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
 55 |         0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
 56 |         0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
 57 |         0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
 58 |         0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
 59 |         0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
 60 |         0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
 61 |         0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
 62 |         0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
 63 |         0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
 64 |         0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
 65 |         0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
 66 |         0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
 67 |         0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
 68 |         0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
 69 |         0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
 70 |         0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
 71 |         0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
 72 |         0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
 73 |         0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
 74 |         0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
 75 |         0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
 76 |         0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
 77 |         0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
 78 |         0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
 79 |         0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
 80 |         0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
 81 |         0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
 82 |         0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
 83 |         0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
 84 |             ];
 85 |         $pattern = preg_quote(utf8_from_unicode($UTF8_SPECIAL_CHARS), '/');
 86 |         $pattern = '/[\x00-\x19' . $pattern . ']/u';
 87 |     }
 88 | 
 89 |     return $pattern;
 90 | }
 91 | 
 92 | //--------------------------------------------------------------------
 93 | /**
 94 | * Checks a string for whether it contains only word characters. This
 95 | * is logically equivalent to the \w PCRE meta character. Note that
 96 | * this is not a 100% guarantee that the string only contains alpha /
 97 | * numeric characters but just that common non-alphanumeric are not
 98 | * in the string, including ASCII device control characters.
 99 | * @package utf8
100 | * @param string to check
101 | * @return boolean TRUE if the string only contains word characters
102 | * @see utf8_specials_pattern
103 | */
104 | function utf8_is_word_chars($str)
105 | {
106 |     return !(bool)preg_match(utf8_specials_pattern(), $str);
107 | }
108 | 
109 | //--------------------------------------------------------------------
/**
* Strips special (non-alphanumeric) characters out of a UTF-8 string, or
* swaps each of them for a replacement string
*
* Handy as a building block when deriving something like a file name or
* a unique identifier from free text. Be warned though that it does not
* cover every possible non-alphanumeric character and is not intended
* as some kind of security / injection filter.
*
* @package utf8
* @author Andreas Gohr <andi@splitbrain.org>
* @param string $string The UTF8 string to strip of special chars
* @param string (optional) $repl   Replace special with this string
* @return string with common non-alphanumeric characters removed
* @see utf8_specials_pattern
*/
function utf8_strip_specials($string, $repl = '')
{
    $specialsPattern = utf8_specials_pattern();

    return preg_replace($specialsPattern, $repl, $string);
}
129 | 


--------------------------------------------------------------------------------
/src/phputf8/utils/unicode.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | /**
  4 | * Tools for conversion between UTF-8 and unicode
  5 | * The Original Code is Mozilla Communicator client code.
  6 | * The Initial Developer of the Original Code is
  7 | * Netscape Communications Corporation.
  8 | * Portions created by the Initial Developer are Copyright (C) 1998
  9 | * the Initial Developer. All Rights Reserved.
 10 | * Ported to PHP by Henri Sivonen (http://hsivonen.iki.fi)
 11 | * Slight modifications to fit with phputf8 library by Harry Fuecks (hfuecks gmail com)
 12 | * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
 13 | * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
 14 | * @see http://hsivonen.iki.fi/php-utf8/
 15 | * @package utf8
 16 | */
 17 | 
 18 | //--------------------------------------------------------------------
 19 | /**
 20 | * Takes an UTF-8 string and returns an array of ints representing the
 21 | * Unicode characters. Astral planes are supported ie. the ints in the
 22 | * output can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
 23 | * are not allowed.
 24 | * Returns false if the input string isn't a valid UTF-8 octet sequence
 25 | * and raises a PHP error at level E_USER_WARNING
 26 | * Note: this function has been modified slightly in this library to
 27 | * trigger errors on encountering bad bytes
 28 | *
 29 | * Joomla modification - As of PHP 7.4, curly brace access has been deprecated. As a result this function has been
 30 | * modified to use square brace syntax
 31 | * See https://github.com/php/php-src/commit/d574df63dc375f5fc9202ce5afde23f866b6450a
 32 | * for additional references
 33 | *
 34 | * @author <hsivonen@iki.fi>
 35 | * @param string UTF-8 encoded string
 36 | * @return mixed array of unicode code points or FALSE if UTF-8 invalid
 37 | * @see utf8_from_unicode
 38 | * @see http://hsivonen.iki.fi/php-utf8/
 39 | * @package utf8
 40 | */
 41 | function utf8_to_unicode($str)
 42 | {
 43 |     $mState = 0;     // cached expected number of octets after the current octet
 44 |                      // until the beginning of the next UTF8 character sequence
 45 |     $mUcs4  = 0;     // cached Unicode character
 46 |     $mBytes = 1;     // cached expected number of octets in the current sequence
 47 | 
 48 |     $out = [];
 49 | 
 50 |     $len = strlen($str);
 51 | 
 52 |     for ($i = 0; $i < $len; $i++) {
 53 |         $in = ord($str[$i]);
 54 | 
 55 |         if ($mState == 0) {
 56 |             // When mState is zero we expect either a US-ASCII character or a
 57 |             // multi-octet sequence.
 58 |             if (0 == (0x80 & ($in))) {
 59 |                 // US-ASCII, pass straight through.
 60 |                 $out[]  = $in;
 61 |                 $mBytes = 1;
 62 |             } elseif (0xC0 == (0xE0 & ($in))) {
 63 |                 // First octet of 2 octet sequence
 64 |                 $mUcs4  = ($in);
 65 |                 $mUcs4  = ($mUcs4 & 0x1F) << 6;
 66 |                 $mState = 1;
 67 |                 $mBytes = 2;
 68 |             } elseif (0xE0 == (0xF0 & ($in))) {
 69 |                 // First octet of 3 octet sequence
 70 |                 $mUcs4  = ($in);
 71 |                 $mUcs4  = ($mUcs4 & 0x0F) << 12;
 72 |                 $mState = 2;
 73 |                 $mBytes = 3;
 74 |             } elseif (0xF0 == (0xF8 & ($in))) {
 75 |                 // First octet of 4 octet sequence
 76 |                 $mUcs4  = ($in);
 77 |                 $mUcs4  = ($mUcs4 & 0x07) << 18;
 78 |                 $mState = 3;
 79 |                 $mBytes = 4;
 80 |             } elseif (0xF8 == (0xFC & ($in))) {
 81 |                 /* First octet of 5 octet sequence.
 82 |                 *
 83 |                 * This is illegal because the encoded codepoint must be either
 84 |                 * (a) not the shortest form or
 85 |                 * (b) outside the Unicode range of 0-0x10FFFF.
 86 |                 * Rather than trying to resynchronize, we will carry on until the end
 87 |                 * of the sequence and let the later error handling code catch it.
 88 |                 */
 89 |                 $mUcs4  = ($in);
 90 |                 $mUcs4  = ($mUcs4 & 0x03) << 24;
 91 |                 $mState = 4;
 92 |                 $mBytes = 5;
 93 |             } elseif (0xFC == (0xFE & ($in))) {
 94 |                 // First octet of 6 octet sequence, see comments for 5 octet sequence.
 95 |                 $mUcs4  = ($in);
 96 |                 $mUcs4  = ($mUcs4 & 1) << 30;
 97 |                 $mState = 5;
 98 |                 $mBytes = 6;
 99 |             } else {
100 |                 /* Current octet is neither in the US-ASCII range nor a legal first
101 |                  * octet of a multi-octet sequence.
102 |                  */
103 |                 trigger_error(
104 |                     'utf8_to_unicode: Illegal sequence identifier ' .
105 |                         'in UTF-8 at byte ' . $i,
106 |                     E_USER_WARNING
107 |                 );
108 |                 return false;
109 |             }
110 |         } else {
111 |             // When mState is non-zero, we expect a continuation of the multi-octet
112 |             // sequence
113 |             if (0x80 == (0xC0 & ($in))) {
114 |                 // Legal continuation.
115 |                 $shift = ($mState - 1) * 6;
116 |                 $tmp   = $in;
117 |                 $tmp   = ($tmp & 0x0000003F) << $shift;
118 |                 $mUcs4 |= $tmp;
119 | 
120 |                 /**
121 |                 * End of the multi-octet sequence. mUcs4 now contains the final
122 |                 * Unicode codepoint to be output
123 |                 */
124 |                 if (0 == --$mState) {
125 |                     /*
126 |                     * Check for illegal sequences and codepoints.
127 |                     */
128 |                     // From Unicode 3.1, non-shortest form is illegal
129 |                     if (
130 |                         ((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
131 |                         ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
132 |                         ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
133 |                         (4 < $mBytes) ||
134 |                         // From Unicode 3.2, surrogate characters are illegal
135 |                         (($mUcs4 & 0xFFFFF800) == 0xD800) ||
136 |                         // Codepoints outside the Unicode range are illegal
137 |                         ($mUcs4 > 0x10FFFF)
138 |                     ) {
139 |                         trigger_error(
140 |                             'utf8_to_unicode: Illegal sequence or codepoint ' .
141 |                                 'in UTF-8 at byte ' . $i,
142 |                             E_USER_WARNING
143 |                         );
144 | 
145 |                         return false;
146 |                     }
147 | 
148 |                     if (0xFEFF != $mUcs4) {
149 |                         // BOM is legal but we don't want to output it
150 |                         $out[] = $mUcs4;
151 |                     }
152 | 
153 |                     //initialize UTF8 cache
154 |                     $mState = 0;
155 |                     $mUcs4  = 0;
156 |                     $mBytes = 1;
157 |                 }
158 |             } else {
159 |                 /**
160 |                 *((0xC0 & (*in) != 0x80) && (mState != 0))
161 |                 * Incomplete multi-octet sequence.
162 |                 */
163 |                 trigger_error(
164 |                     'utf8_to_unicode: Incomplete multi-octet ' .
165 |                     '   sequence in UTF-8 at byte ' . $i,
166 |                     E_USER_WARNING
167 |                 );
168 | 
169 |                 return false;
170 |             }
171 |         }
172 |     }
173 |     return $out;
174 | }
175 | 
176 | //--------------------------------------------------------------------
/**
* Takes an array of ints representing the Unicode characters and returns
* a UTF-8 string. Astral planes are supported ie. the ints in the
* input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
* are not allowed.
* Returns false if the input array contains ints that represent
* surrogates or are outside the Unicode range (negative values included)
* and raises a PHP error at level E_USER_WARNING
* Note: the string is assembled by plain concatenation instead of output
* buffering, so no output buffer is left open when an invalid code point
* makes the function return early
* @param array of unicode code points representing a string
* @return mixed UTF-8 string or FALSE if array contains invalid code points
* @author <hsivonen@iki.fi>
* @see utf8_to_unicode
* @see http://hsivonen.iki.fi/php-utf8/
* @package utf8
*/
function utf8_from_unicode($arr)
{
    $result = '';

    foreach ($arr as $k => $codePoint) {
        // Reject anything outside 0..0x10FFFF up front; a negative value must
        // not reach the shifting code below, which would emit garbage bytes
        if ($codePoint < 0 || $codePoint > 0x10ffff) {
            trigger_error(
                'utf8_from_unicode: Codepoint out of Unicode range ' .
                    'at index: ' . $k . ', value: ' . $codePoint,
                E_USER_WARNING
            );

            return false;
        }

        if ($codePoint <= 0x007f) {
            // ASCII range (including control chars)
            $result .= chr($codePoint);
        } elseif ($codePoint <= 0x07ff) {
            // 2 byte sequence
            $result .= chr(0xc0 | ($codePoint >> 6))
                . chr(0x80 | ($codePoint & 0x003f));
        } elseif ($codePoint == 0xFEFF) {
            // Byte order mark (skip)
            continue;
        } elseif ($codePoint >= 0xD800 && $codePoint <= 0xDFFF) {
            // Surrogates are not legal code points
            trigger_error(
                'utf8_from_unicode: Illegal surrogate ' .
                    'at index: ' . $k . ', value: ' . $codePoint,
                E_USER_WARNING
            );

            return false;
        } elseif ($codePoint <= 0xffff) {
            // 3 byte sequence
            $result .= chr(0xe0 | ($codePoint >> 12))
                . chr(0x80 | (($codePoint >> 6) & 0x003f))
                . chr(0x80 | ($codePoint & 0x003f));
        } else {
            // 4 byte sequence (upper bound already enforced by the guard above)
            $result .= chr(0xf0 | ($codePoint >> 18))
                . chr(0x80 | (($codePoint >> 12) & 0x3f))
                . chr(0x80 | (($codePoint >> 6) & 0x3f))
                . chr(0x80 | ($codePoint & 0x3f));
        }
    }

    return $result;
}
252 | 


--------------------------------------------------------------------------------
/src/phputf8/utils/validation.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | /**
  4 | * Tools for validating that a UTF-8 string is well formed.
  5 | * The Original Code is Mozilla Communicator client code.
  6 | * The Initial Developer of the Original Code is
  7 | * Netscape Communications Corporation.
  8 | * Portions created by the Initial Developer are Copyright (C) 1998
  9 | * the Initial Developer. All Rights Reserved.
 10 | * Ported to PHP by Henri Sivonen (http://hsivonen.iki.fi)
 11 | * Slight modifications to fit with phputf8 library by Harry Fuecks (hfuecks gmail com)
 12 | * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
 13 | * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
 14 | * @see http://hsivonen.iki.fi/php-utf8/
 15 | * @package utf8
 16 | */
 17 | 
 18 | //--------------------------------------------------------------------
 19 | /**
 20 | * Tests a string as to whether it's valid UTF-8 and supported by the
 21 | * Unicode standard
 22 | * Note: this function has been modified to simple return true or false
 23 | * @author <hsivonen@iki.fi>
 24 | * @param string UTF-8 encoded string
 25 | * @return boolean true if valid
 26 | * @see http://hsivonen.iki.fi/php-utf8/
 27 | * @see utf8_compliant
 28 | * @package utf8
 29 | */
 30 | function utf8_is_valid($str)
 31 | {
 32 |     $mState = 0;     // cached expected number of octets after the current octet
 33 |                      // until the beginning of the next UTF8 character sequence
 34 |     $mUcs4  = 0;     // cached Unicode character
 35 |     $mBytes = 1;     // cached expected number of octets in the current sequence
 36 | 
 37 |     $len = strlen($str);
 38 | 
 39 |     for ($i = 0; $i < $len; $i++) {
 40 |         /*
 41 |          * Joomla modification - As of PHP 7.4, curly brace access has been deprecated. As a result the line below has
 42 |          * been modified to use square brace syntax
 43 |          * See https://github.com/php/php-src/commit/d574df63dc375f5fc9202ce5afde23f866b6450a
 44 |          * for additional references
 45 |          */
 46 |         $in = ord($str[$i]);
 47 | 
 48 |         if ($mState == 0) {
 49 |             // When mState is zero we expect either a US-ASCII character or a
 50 |             // multi-octet sequence.
 51 |             if (0 == (0x80 & ($in))) {
 52 |                 // US-ASCII, pass straight through.
 53 |                 $mBytes = 1;
 54 |             } elseif (0xC0 == (0xE0 & ($in))) {
 55 |                 // First octet of 2 octet sequence
 56 |                 $mUcs4  = ($in);
 57 |                 $mUcs4  = ($mUcs4 & 0x1F) << 6;
 58 |                 $mState = 1;
 59 |                 $mBytes = 2;
 60 |             } elseif (0xE0 == (0xF0 & ($in))) {
 61 |                 // First octet of 3 octet sequence
 62 |                 $mUcs4  = ($in);
 63 |                 $mUcs4  = ($mUcs4 & 0x0F) << 12;
 64 |                 $mState = 2;
 65 |                 $mBytes = 3;
 66 |             } elseif (0xF0 == (0xF8 & ($in))) {
 67 |                 // First octet of 4 octet sequence
 68 |                 $mUcs4  = ($in);
 69 |                 $mUcs4  = ($mUcs4 & 0x07) << 18;
 70 |                 $mState = 3;
 71 |                 $mBytes = 4;
 72 |             } elseif (0xF8 == (0xFC & ($in))) {
 73 |                 /* First octet of 5 octet sequence.
 74 |                 *
 75 |                 * This is illegal because the encoded codepoint must be either
 76 |                 * (a) not the shortest form or
 77 |                 * (b) outside the Unicode range of 0-0x10FFFF.
 78 |                 * Rather than trying to resynchronize, we will carry on until the end
 79 |                 * of the sequence and let the later error handling code catch it.
 80 |                 */
 81 |                 $mUcs4  = ($in);
 82 |                 $mUcs4  = ($mUcs4 & 0x03) << 24;
 83 |                 $mState = 4;
 84 |                 $mBytes = 5;
 85 |             } elseif (0xFC == (0xFE & ($in))) {
 86 |                 // First octet of 6 octet sequence, see comments for 5 octet sequence.
 87 |                 $mUcs4  = ($in);
 88 |                 $mUcs4  = ($mUcs4 & 1) << 30;
 89 |                 $mState = 5;
 90 |                 $mBytes = 6;
 91 |             } else {
 92 |                 /* Current octet is neither in the US-ASCII range nor a legal first
 93 |                  * octet of a multi-octet sequence.
 94 |                  */
 95 |                 return false;
 96 |             }
 97 |         } else {
 98 |             // When mState is non-zero, we expect a continuation of the multi-octet
 99 |             // sequence
100 |             if (0x80 == (0xC0 & ($in))) {
101 |                 // Legal continuation.
102 |                 $shift = ($mState - 1) * 6;
103 |                 $tmp   = $in;
104 |                 $tmp   = ($tmp & 0x0000003F) << $shift;
105 |                 $mUcs4 |= $tmp;
106 | 
107 |                 /**
108 |                 * End of the multi-octet sequence. mUcs4 now contains the final
109 |                 * Unicode codepoint to be output
110 |                 */
111 |                 if (0 == --$mState) {
112 |                     /*
113 |                     * Check for illegal sequences and codepoints.
114 |                     */
115 |                     // From Unicode 3.1, non-shortest form is illegal
116 |                     if (
117 |                         ((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
118 |                         ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
119 |                         ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
120 |                         (4 < $mBytes) ||
121 |                         // From Unicode 3.2, surrogate characters are illegal
122 |                         (($mUcs4 & 0xFFFFF800) == 0xD800) ||
123 |                         // Codepoints outside the Unicode range are illegal
124 |                         ($mUcs4 > 0x10FFFF)
125 |                     ) {
126 |                         return false;
127 |                     }
128 | 
129 |                     //initialize UTF8 cache
130 |                     $mState = 0;
131 |                     $mUcs4  = 0;
132 |                     $mBytes = 1;
133 |                 }
134 |             } else {
135 |                 /**
136 |                 *((0xC0 & (*in) != 0x80) && (mState != 0))
137 |                 * Incomplete multi-octet sequence.
138 |                 */
139 | 
140 |                 return false;
141 |             }
142 |         }
143 |     }
144 |     return true;
145 | }
146 | 
147 | //--------------------------------------------------------------------
/**
* Quick check of whether a string complies as UTF-8, done by letting
* PCRE try to match a single character with the /u modifier. This will
* be much faster than utf8_is_valid but, as noted by the original
* authors, it will pass five and six octet UTF-8 sequences, which are
* not supported by Unicode and so cannot be displayed correctly in a
* browser. In other words it is not as strict as utf8_is_valid but
* it's faster. If you use it to validate user input, you place yourself
* at the risk that attackers will be able to inject 5 and 6 byte
* sequences (which may or may not be a significant risk, depending on
* what you are doing)
* An empty string is always reported as compliant.
* @see utf8_is_valid
* @see http://www.php.net/manual/en/reference.pcre.pattern.modifiers.php#54805
* @param string UTF-8 string to check
* @return boolean TRUE if string is valid UTF-8
* @package utf8
*/
function utf8_compliant($str)
{
    // With the /u modifier a successful match of even the first character
    // means the string is valid UTF-8; if the UTF-8 is somehow invalid,
    // nothing at all will match, even if the string contains some valid
    // sequences. The empty string has no character to match and is
    // accepted up front.
    return strlen($str) === 0 || preg_match('/^.{1}/us', $str) === 1;
}
175 | 


--------------------------------------------------------------------------------