├── .State ├── .gitignore ├── Bin ├── Fromcode.php └── Tocode.php ├── CHANGELOG.md ├── Documentation ├── En │ └── Index.xyl └── Fr │ └── Index.xyl ├── README.md ├── Source ├── Exception.php ├── Search.php └── Ustring.php ├── Test └── Unit │ ├── Issue.php │ ├── Search.php │ └── Ustring.php └── composer.json /.State: -------------------------------------------------------------------------------- 1 | finalized 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /vendor/ 2 | /composer.lock 3 | -------------------------------------------------------------------------------- /Bin/Fromcode.php: -------------------------------------------------------------------------------- 1 | getOption($v)) { 70 | switch ($c) { 71 | case 'b': 72 | $base = intval($v); 73 | 74 | break; 75 | 76 | case '__ambiguous': 77 | $this->resolveOptionAmbiguity($v); 78 | 79 | break; 80 | 81 | case 'h': 82 | case '?': 83 | default: 84 | return $this->usage(); 85 | } 86 | } 87 | 88 | $this->parser->listInputs($code); 89 | 90 | $char = Ustring::fromCode(base_convert($code, $base, 10)); 91 | 92 | echo $char; 93 | 94 | return 0; 95 | } 96 | 97 | /** 98 | * The command usage. 99 | */ 100 | public function usage(): void 101 | { 102 | echo 103 | 'Usage : ustring:fromcode ', "\n", 104 | 'Options :', "\n", 105 | $this->makeUsageOptionsList([ 106 | 'b' => 'Specify the base of the code (16 by default).', 107 | 'help' => 'This help.' 108 | ]), "\n"; 109 | } 110 | } 111 | 112 | __halt_compiler(); 113 | Get a character from its code. 
114 | -------------------------------------------------------------------------------- /Bin/Tocode.php: -------------------------------------------------------------------------------- 1 | getOption($v)) { 68 | switch ($c) { 69 | case 'b': 70 | $base = intval($v); 71 | 72 | break; 73 | 74 | case '__ambiguous': 75 | $this->resolveOptionAmbiguity($v); 76 | 77 | break; 78 | 79 | case 'h': 80 | case '?': 81 | default: 82 | return $this->usage(); 83 | } 84 | } 85 | 86 | $this->parser->listInputs($char); 87 | 88 | $code = base_convert((string) Ustring::toCode($char), 10, $base); 89 | 90 | echo $code, "\n"; 91 | 92 | return 0; 93 | } 94 | 95 | /** 96 | * The command usage. 97 | */ 98 | public function usage(): void 99 | { 100 | echo 101 | 'Usage : ustring:tocode ', "\n", 102 | 'Options :', "\n", 103 | $this->makeUsageOptionsList([ 104 | 'b' => 'Get the code in a specific base (16 by default).', 105 | 'help' => 'This help.' 106 | ]), "\n"; 107 | } 108 | } 109 | 110 | __halt_compiler(); 111 | Transform a character into its code. 112 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 4.17.01.16 2 | 3 | * Quality: Happy new year! (Alexis von Glasow, 2017-01-12T14:01:18+01:00) 4 | * Test: Add the `Decorrelated` interface. (Ivan Enderlin, 2016-10-25T07:58:13+02:00) 5 | * Documentation: New `README.md` file. (Ivan Enderlin, 2016-10-18T15:10:49+02:00) 6 | * Documentation: Update `support` properties. (Ivan Enderlin, 2016-10-11T08:45:51+02:00) 7 | 8 | # 4.16.01.11 9 | 10 | * Quality: Drop PHP5.4. (Ivan Enderlin, 2016-01-11T09:15:27+01:00) 11 | * Quality: Run devtools:cs. (Ivan Enderlin, 2016-01-09T09:11:00+01:00) 12 | * Core: Remove `Hoa\Core`. (Ivan Enderlin, 2016-01-09T08:27:47+01:00) 13 | * Consistency: Use `Hoa\Consistency`. (Ivan Enderlin, 2015-12-08T22:11:40+01:00) 14 | * Exception: Use `Hoa\Exception`. 
(Ivan Enderlin, 2015-11-20T13:19:42+01:00) 15 | 16 | # 3.15.11.09 17 | 18 | * Fixed leftover typos (string -> ustring) (David Thalmann, 2015-09-08T14:45:12+02:00) 19 | * Add a `.gitignore` file. (Stéphane HULARD, 2015-08-03T11:49:32+02:00) 20 | 21 | # 3.15.08.03 22 | 23 | * `ext/iconv` is suggested, no longer required. (Ivan Enderlin, 2015-08-03T07:06:46+02:00) 24 | * Fix CS. (Ivan Enderlin, 2015-08-03T07:05:10+02:00) 25 | * Test `ext/mbstring` availability globally. (Ivan Enderlin, 2015-08-03T07:04:30+02:00) 26 | 27 | # 3.15.07.28 28 | 29 | * Fix CS. (Ivan Enderlin, 2015-07-28T14:13:48+02:00) 30 | * Fix the CHANGELOG. (Ivan Enderlin, 2015-07-28T14:13:27+02:00) 31 | * Prepare 3.15.05.29. (Ivan Enderlin, 2015-05-29T15:36:54+02:00) 32 | 33 | # 3.15.05.29 34 | 35 | * Update installation section. (Ivan Enderlin, 2015-05-29T14:13:22+02:00) 36 | * Rename `Hoa\String` to `Hoa\Ustring`. (Ivan Enderlin, 2015-05-29T12:24:23+02:00) 37 | * Move to PSR-1 and PSR-2. (Ivan Enderlin, 2015-05-18T09:49:37+02:00) 38 | 39 | # 2.15.03.25 40 | 41 | * `toCode` supports invalid UTF-8 character. (Ivan Enderlin, 2015-03-25T08:52:52+01:00) 42 | * Fix a typo in an exception message. (bureX, 2015-01-27T01:41:08+01:00) 43 | 44 | # 2.15.02.19 45 | 46 | * Add the CHANGELOG.md file. (Ivan Enderlin, 2015-02-19T09:11:32+01:00) 47 | * Add `require-dev` with `hoa/test`. (Ivan Enderlin, 2015-01-29T14:55:20+01:00) 48 | * Add `hoa string:fromcode` and `hoa string:tocode`. (Ivan Enderlin, 2015-01-23T22:29:55+01:00) 49 | * Translate the documentation in English. (Ivan Enderlin, 2015-01-23T19:27:04+01:00) 50 | * Add examples, present new features and update links in the documentation. (Ivan Enderlin, 2015-01-23T19:27:00+01:00) 51 | * Implement the `getCharWidth` method. (Ivan Enderlin, 2015-01-07T11:00:06+01:00) 52 | * Accept other `intl` implementations. (Ivan Enderlin, 2015-01-06T13:42:20+01:00) 53 | * Remove a useless test. 
(Ivan Enderlin, 2015-01-06T11:24:39+01:00) 54 | * Add more tests for Math symbols. (Ivan Enderlin, 2015-01-06T11:22:53+01:00) 55 | * Add emoji and other symbols supports to `toAscii`. (Ivan Enderlin, 2015-01-06T11:17:32+01:00) 56 | * Update `toAscii` to use a transliterator. (Ivan Enderlin, 2015-01-06T10:58:07+01:00) 57 | * Add transliterator support. (Ivan Enderlin, 2015-01-06T10:57:30+01:00) 58 | * Happy new year! (Ivan Enderlin, 2015-01-05T14:52:34+01:00) 59 | 60 | # 2.14.12.24 61 | 62 | * Clean code. (Julien Bianchi, 2014-12-24T08:44:59+01:00) 63 | * Add tests for `Hoa\String\Search`. (Ivan Enderlin, 2014-12-23T14:15:21+01:00) 64 | * `toBinaryCode` has a better semantics. (Ivan Enderlin, 2014-12-23T14:11:02+01:00) 65 | * Use hexadecimal everywhere. (Ivan Enderlin, 2014-12-23T12:33:00+01:00) 66 | * Tests are green now. (Ivan Enderlin, 2014-12-23T12:27:38+01:00) 67 | * New `toCode` method, without UCS-2. (Ivan Enderlin, 2014-12-23T12:07:22+01:00) 68 | * Add tests. (Ivan Enderlin, 2014-12-23T02:15:55+01:00) 69 | * Fix flags between global and local in `match`. (Ivan Enderlin, 2014-12-23T02:13:50+01:00) 70 | * Fix `compare` if `Collator` is not present. (Ivan Enderlin, 2014-12-23T02:13:16+01:00) 71 | * Wrong append and prepend algorithm. (Ivan Enderlin, 2014-12-23T02:12:30+01:00) 72 | * Move to PHP5.4 and remove `from`/`import`. (Ivan Enderlin, 2014-12-22T22:44:40+01:00) 73 | 74 | # 2.14.12.10 75 | 76 | * Move to PSR-4. (Ivan Enderlin, 2014-12-09T18:48:53+01:00) 77 | 78 | # 2.14.11.09 79 | 80 | * Format code. #mania (Ivan Enderlin, 2014-10-05T15:09:31+02:00) 81 | * Implement the `String::copy` method. (Marc Lemay, 2014-10-05T15:08:33+02:00) 82 | 83 | # 2.14.09.23 84 | 85 | * Add `branch-alias`. (Stéphane PY, 2014-09-23T11:55:55+02:00) 86 | 87 | # 2.14.09.16 88 | 89 | * Drop PHP5.3. (Ivan Enderlin, 2014-09-17T17:20:54+02:00) 90 | * Add the installation section. 
(Ivan Enderlin, 2014-09-17T17:20:46+02:00) 91 | 92 | (first snapshot) 93 | -------------------------------------------------------------------------------- /Documentation/En/Index.xyl: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 |

Strings can sometimes be complex, especially when they use 7 | the Unicode encoding format. The Hoa\Ustring library 8 | provides several operations on UTF-8 strings.

9 | 10 |

Table of contents

11 | 12 | 13 | 14 |

Introduction

15 | 16 |

When we manipulate strings, the Unicode 17 | format establishes itself because of its compatibility with 18 | historical formats (like ASCII) and its capacity to understand a 19 | large range of characters and symbols for all cultures and 20 | all regions in the world. PHP provides several tools to manipulate such 21 | strings, like the following extensions: 22 | mbstring, 23 | iconv or also the excellent 24 | intl which is based on 25 | ICU, the reference implementation of 26 | Unicode. Unfortunately, sometimes we have to mix these extensions to achieve 27 | our aims and at the cost of a certain complexity along with 28 | a regrettable verbosity.

29 |

The Hoa\Ustring library addresses these issues by providing a 30 | simple way to manipulate strings with 31 | performance and efficiency in mind. It 32 | also provides some advanced algorithms to perform search 33 | operations on strings.

34 | 35 |

Unicode strings

36 | 37 |

The Hoa\Ustring\Ustring class represents a 38 | UTF-8 Unicode string and allows us to manipulate it easily. 39 | This class implements the 40 | ArrayAccess, 41 | Countable and 42 | IteratorAggregate 43 | interfaces. We are going to use three examples in three different languages: 44 | French, Arabic and Japanese. Thus:

45 |
$french   = new Hoa\Ustring\Ustring('Je t\'aime');
 46 | $arabic   = new Hoa\Ustring\Ustring('أحبك');
 47 | $japanese = new Hoa\Ustring\Ustring('私はあなたを愛して');
48 |

Now, let's see what we can do on these three strings.

49 | 50 |

String manipulation

51 | 52 |

Let's start with elementary operations. If we would like 53 | to count the number of characters (not bytes), we will use 54 | the count function. Thus:

55 |
var_dump(
 56 |     count($french),
 57 |     count($arabic),
 58 |     count($japanese)
 59 | );
 60 | 
 61 | /**
 62 |  * Will output:
 63 |  *     int(9)
 64 |  *     int(4)
 65 |  *     int(9)
 66 |  */
67 |

When we speak about text position, it is not suitable to speak about the 68 | right or the left, but rather about a beginning or an 69 | end, and based on the direction of writing. 70 | We can know this direction thanks to the 71 | Hoa\Ustring\Ustring::getDirection method. It returns the value of 72 | one of the following constants:

73 |
    74 |
  • Hoa\Ustring\Ustring::LTR, for left-to-right, if the text is 75 | written from the left to the right,
  • 76 |
  • Hoa\Ustring\Ustring::RTL, for right-to-left, if the text is 77 | written from the right to the left.
  • 78 |
79 |

Let's observe the result with our examples:

80 |
var_dump(
 81 |     $french->getDirection()   === Hoa\Ustring\Ustring::LTR, // is left-to-right?
 82 |     $arabic->getDirection()   === Hoa\Ustring\Ustring::RTL, // is right-to-left?
 83 |     $japanese->getDirection() === Hoa\Ustring\Ustring::LTR  // is left-to-right?
 84 | );
 85 | 
 86 | /**
 87 |  * Will output:
 88 |  *     bool(true)
 89 |  *     bool(true)
 90 |  *     bool(true)
 91 |  */
92 |

The result of this method is computed thanks to the 93 | Hoa\Ustring\Ustring::getCharDirection static method which computes 94 | the direction of only one character.

95 |

If we would like to concatenate another string to the end 96 | or to the beginning, we will respectively use the 97 | Hoa\Ustring\Ustring::append and 98 | Hoa\Ustring\Ustring::prepend methods. These methods, like most of 99 | the ones which modify the string, return the object itself, in order to 100 | chain the calls. For instance:

101 |
echo $french->append('… et toi, m\'aimes-tu ?')->prepend('Mam\'zelle ! ');
102 | 
103 | /**
104 |  * Will output:
105 |  *     Mam'zelle ! Je t'aime… et toi, m'aimes-tu ?
106 |  */
107 |

We also have the Hoa\Ustring\Ustring::toLowerCase and 108 | Hoa\Ustring\Ustring::toUpperCase methods to, respectively, set 109 | the case of the string to lower or upper. For instance:

110 |
echo $french->toUpperCase();
111 | 
112 | /**
113 |  * Will output:
114 |  *     MAM'ZELLE ! JE T'AIME… ET TOI, M'AIMES-TU ?
115 |  */
116 |

We can also add characters to the beginning or to the end of the string to 117 | reach a minimum length. This operation is frequently called 118 | the padding (for historical reasons dating back to typewriters). 119 | That's why we have the Hoa\Ustring\Ustring::pad method which 120 | takes three arguments: the minimum length, characters to add and a constant 121 | indicating whether we have to add at the end or at the beginning of the string 122 | (respectively Hoa\Ustring\Ustring::END, by default, and 123 | Hoa\Ustring\Ustring::BEGINNING).

124 |
echo $arabic->pad(20, ' ');
125 | 
126 | /**
127 |  * Will output:
128 |  *                     أحبك
129 |  */
130 |

A similar operation allows us to remove, by default, spaces 131 | at the beginning and at the end of the string thanks to the 132 | Hoa\Ustring\Ustring::trim method. For example, to retrieve our 133 | original Arabic string:

134 |
echo $arabic->trim();
135 | 
136 | /**
137 |  * Will output:
138 |  *     أحبك
139 |  */
140 |

If we would like to remove other characters, we can use its first argument 141 | which must be a regular expression. Finally, its second argument allows to 142 | specify from what side we would like to remove character: at the beginning, at 143 | the end or both, still by using the 144 | Hoa\Ustring\Ustring::BEGINNING and 145 | Hoa\Ustring\Ustring::END constants.

146 |

If we would like to remove other characters, we can use its first argument 147 | which must be a regular expression. Finally, its second argument allows to 148 | specify the side where to remove characters: at the beginning, at the end or 149 | both, still by using the Hoa\Ustring\Ustring::BEGINNING and 150 | Hoa\Ustring\Ustring::END constants. We can combine these 151 | constants to express “both sides”, which is the default value: 152 | Hoa\Ustring\Ustring::BEGINNING | 153 | Hoa\Ustring\Ustring::END. For example, to remove all the numbers and 154 | the spaces only at the end, we will write:

155 |
$arabic->trim('\s|\d', Hoa\Ustring\Ustring::END);
156 |

We can also reduce the string to a 157 | sub-string by specifying the position of the first character 158 | followed by the length of the sub-string to the 159 | Hoa\Ustring\Ustring::reduce method:

160 |
echo $french->reduce(3, 6)->reduce(2, 4);
161 | 
162 | /**
163 |  * Will output:
164 |  *     aime
165 |  */
166 |

If we would like to get a specific character, we can rely on the 167 | ArrayAccess interface. For instance, to get the first character 168 | of each of our examples (from their original definitions):

169 |
var_dump(
170 |     $french[0],
171 |     $arabic[0],
172 |     $japanese[0]
173 | );
174 | 
175 | /**
176 |  * Will output:
177 |  *     string(1) "J"
178 |  *     string(2) "أ"
179 |  *     string(3) "私"
180 |  */
181 |

If we would like the last character, we will use the -1 index. The index is 182 | not bounded to the length of the string. If the index exceeds this length, 183 | then a modulo will be applied.

184 |

We can also modify or remove a specific character with this method. For 185 | example:

186 |
$french->append(' ?');
187 | $french[-1] = '!';
188 | echo $french;
189 | 
190 | /**
191 |  * Will output:
192 |  *     Je t'aime !
193 |  */
194 |

Another very useful method is the ASCII transformation. 195 | Be careful, this is not always possible, according to your settings. For 196 | example:

197 |
$title = new Hoa\Ustring\Ustring('Un été brûlant sur la côte');
198 | echo $title->toAscii();
199 | 
200 | /**
201 |  * Will output:
202 |  *     Un ete brulant sur la cote
203 |  */
204 |

We can also transform from Arabic or Japanese to ASCII. Symbols, like 205 | mathematical symbols or emojis, are also transformed:

206 |
$emoji = new Hoa\Ustring\Ustring('I ❤ Unicode');
207 | $maths = new Hoa\Ustring\Ustring('∀ i ∈ ℕ');
208 | 
209 | echo
210 |     $arabic->toAscii(), "\n",
211 |     $japanese->toAscii(), "\n",
212 |     $emoji->toAscii(), "\n",
213 |     $maths->toAscii(), "\n";
214 | 
215 | /**
216 |  * Will output:
217 |  *     ahbk
218 |  *     sihaanatawo aishite
219 |  *     I (heavy black heart)️ Unicode
220 |  *     (for all) i (element of) N
221 |  */
222 |

In order for this method to work correctly, the 223 | intl extension needs to be 224 | present, so that the 225 | Transliterator class 226 | is present. If it does not exist, the 227 | Normalizer class must 228 | exist. If this class does not exist either, the 229 | Hoa\Ustring\Ustring::toAscii method can still try a 230 | transformation, but it is less efficient. To activate this last solution, 231 | true must be passed as a single argument. This tour 232 | de force is not recommended in most cases.

233 |

We also find the getTransliterator method which returns a 234 | Transliterator object, or null if this class does 235 | not exist. This method takes a transliteration identifier as argument. We 236 | suggest reading 237 | the documentation about the transliterator of ICU to understand this 238 | identifier. The transliterate method allows us to transliterate the 239 | current string based on an identifier and a beginning index and an end 240 | one. This method works the same way as the 241 | Transliterator::transliterate 242 | method.

243 |

More generally, to change the encoding format, we can use 244 | the Hoa\Ustring\Ustring::transcode static method, with a string 245 | as first argument, the original encoding format as second argument and the 246 | expected encoding format as third argument (UTF-8 by default). To get the 247 | list of encoding formats, we have to refer to the 248 | iconv extension or to use the 249 | following command line in a terminal:

250 |
$ iconv --list
251 |

To know if a string is encoded in UTF-8, we can use the 252 | Hoa\Ustring\Ustring::isUtf8 static method; for instance:

253 |
var_dump(
254 |     Hoa\Ustring\Ustring::isUtf8('a'),
255 |     Hoa\Ustring\Ustring::isUtf8(Hoa\Ustring\Ustring::transcode('a', 'UTF-8', 'UTF-16'))
256 | );
257 | 
258 | /**
259 |  * Will output:
260 |  *     bool(true)
261 |  *     bool(false)
262 |  */
263 |

We can split the string into several sub-strings by using 264 | the Hoa\Ustring\Ustring::split method. As first argument, we have 265 | a regular expression (of kind PCRE), then an 266 | integer representing the maximum number of elements to return and finally a 267 | combination of constants. These constants are the same as the ones of 268 | preg_split.

269 |

By default, the second argument is set to -1, which means infinity, and the 270 | last argument is set to PREG_SPLIT_NO_EMPTY. Thus, if we would 271 | like to get all the words of a string, we will write:

272 |
print_r($title->split('#\b|\s#'));
273 | 
274 | /**
275 |  * Will output:
276 |  *     Array
277 |  *     (
278 |  *         [0] => Un
279 |  *         [1] => ete
280 |  *         [2] => brulant
281 |  *         [3] => sur
282 |  *         [4] => la
283 |  *         [5] => cote
284 |  *     )
285 |  */
286 |

If we would like to iterate over all the 287 | characters, it is recommended to use the 288 | IteratorAggregate method, being the 289 | Hoa\Ustring\Ustring::getIterator method. Let's see on the Arabic 290 | example:

291 |
foreach ($arabic as $letter) {
292 |     echo $letter, "\n";
293 | }
294 | 
295 | /**
296 |  * Will output:
297 |  *     أ
298 |  *     ح
299 |  *     ب
300 |  *     ك
301 |  */
302 |

We notice that the iteration is based on the text direction, it means that 303 | the first element of the iteration is the first letter of the string starting 304 | from the beginning.

305 |

Of course, if we would like to get an array of characters, we can use the 306 | iterator_to_array 307 | PHP function:

308 |
print_r(iterator_to_array($arabic));
309 | 
310 | /**
311 |  * Will output:
312 |  *     Array
313 |  *     (
314 |  *         [0] => أ
315 |  *         [1] => ح
316 |  *         [2] => ب
317 |  *         [3] => ك
318 |  *     )
319 |  */
320 | 321 | 322 | 323 |

Strings can also be compared thanks to the 324 | Hoa\Ustring\Ustring::compare method:

325 |
$string = new Hoa\Ustring\Ustring('abc');
326 | var_dump(
327 |     $string->compare('wxyz')
328 | );
329 | 
330 | /**
331 |  * Will output:
332 |  *     int(-1)
333 |  */
334 |

This method returns -1 if the initial string comes before (in the 335 | alphabetical order), 0 if it is identical and 1 if it comes after. If we 336 | would like to use all the power of the underlying mechanism, we can call the 337 | Hoa\Ustring\Ustring::getCollator static method (if the 338 | Collator class exists, else 339 | Hoa\Ustring\Ustring::compare will use a simple byte-to-byte 340 | comparison without taking care of the other parameters). Thus, if we would 341 | like to sort an array of strings, we will write:

342 |
$strings = array('c', 'Σ', 'd', 'x', 'α', 'a');
343 | Hoa\Ustring\Ustring::getCollator()->sort($strings);
344 | print_r($strings);
345 | 
346 | /**
347 |  * Could output:
348 |  *     Array
349 |  *     (
350 |  *         [0] => a
351 |  *         [1] => c
352 |  *         [2] => d
353 |  *         [3] => x
354 |  *         [4] => α
355 |  *         [5] => Σ
356 |  *     )
357 |  */
358 |

Comparison between two strings depends on the locale, it 359 | means of the localization of the system, like the language, the country, the 360 | region etc. We can use the 361 | Hoa\Locale library to modify 362 | these data, but it's not a dependence of Hoa\Ustring.

363 |

We can also know if a string matches a certain pattern, 364 | still expressed with a regular expression. To achieve that, we will use the 365 | Hoa\Ustring\Ustring::match method. This method relies on the 366 | preg_match and 367 | preg_match_all PHP 368 | functions, but by modifying the pattern's options to ensure the Unicode 369 | support. We have the following parameters: the pattern, a variable passed by 370 | reference to collect the matches, flags, an offset and finally a boolean 371 | indicating whether the search is global or not (respectively if we have to use 372 | preg_match_all or preg_match). By default, the 373 | search is not global.

374 |

Thus, we will check that our French example contains aime with 375 | a direct object complement:

376 |
$french->match('#(?:(?<direct_object>\w)[\'\b])aime#', $matches);
377 | var_dump($matches['direct_object']);
378 | 
379 | /**
380 |  * Will output:
381 |  *     string(1) "t"
382 |  */
383 |

This method returns false if an error is raised (for example 384 | if the pattern is not correct), 0 if no match has been found, or the number of 385 | matches otherwise.

386 |

Similarly, we can search and replace 387 | sub-strings by other sub-strings based on a pattern, still expressed with a 388 | regular expression. To achieve that, we will use the 389 | Hoa\Ustring\Ustring::replace method. This method uses the 390 | preg_replace and 391 | preg_replace_callback 392 | PHP functions, but still by modifying the pattern's options to ensure the 393 | Unicode support. As first argument, we find one or more patterns, as second 394 | argument, one or more replacements and as last argument the limit of 395 | replacements to apply. If the replacement is a callable, then the 396 | preg_replace_callback function will be used.

397 |

Thus, we will modify our French example to be more polite:

398 |
$french->replace('#(?:\w[\'\b])(?<verb>aime)#', function ($matches) {
399 |     return 'vous ' . $matches['verb'];
400 | });
401 | 
402 | echo $french;
403 | 
404 | /**
405 |  * Will output:
406 |  *     Je vous aime
407 |  */
408 |

The Hoa\Ustring\Ustring class provides constants which are 409 | aliases of existing PHP constants and ensure a better readability of the 410 | code:

411 |
    412 |
  • Hoa\Ustring\Ustring::WITHOUT_EMPTY, alias of 413 | PREG_SPLIT_NO_EMPTY,
  • 414 |
  • Hoa\Ustring\Ustring::WITH_DELIMITERS, alias of 415 | PREG_SPLIT_DELIM_CAPTURE,
  • 416 |
  • Hoa\Ustring\Ustring::WITH_OFFSET, alias of 417 | PREG_OFFSET_CAPTURE and 418 | PREG_SPLIT_OFFSET_CAPTURE,
  • 419 |
  • Hoa\Ustring\Ustring::GROUP_BY_PATTERN, alias of 420 | PREG_PATTERN_ORDER,
  • 421 |
  • Hoa\Ustring\Ustring::GROUP_BY_TUPLE, alias of 422 | PREG_SET_ORDER.
  • 423 |
424 |

Because they are strict aliases, we can write:

425 |
$string = new Hoa\Ustring\Ustring('abc1 defg2 hikl3 xyz4');
426 | $string->match(
427 |     '#(\w+)(\d)#',
428 |     $matches,
429 |     Hoa\Ustring\Ustring::WITH_OFFSET
430 |   | Hoa\Ustring\Ustring::GROUP_BY_TUPLE,
431 |     0,
432 |     true
433 | );
434 | 435 |

Characters

436 | 437 |

The Hoa\Ustring\Ustring class offers static methods working on 438 | a single Unicode character. We have already mentioned the 439 | getCharDirection method which allows us to know the 440 | direction of a character. We also have the 441 | getCharWidth method which counts the number of columns 442 | necessary to print a single character. Thus:

443 |
var_dump(
444 |     Hoa\Ustring\Ustring::getCharWidth(Hoa\Ustring\Ustring::fromCode(0x7f)),
445 |     Hoa\Ustring\Ustring::getCharWidth('a'),
446 |     Hoa\Ustring\Ustring::getCharWidth('㽠')
447 | );
448 | 
449 | /**
450 |  * Will output:
451 |  *     int(-1)
452 |  *     int(1)
453 |  *     int(2)
454 |  */
455 |

This method returns -1 or 0 if the character is not 456 | printable (for instance, if this is a control character, like 457 | 0x7f which corresponds to DELETE), 1 or more if this 458 | is a character that can be printed. In our example, 㽠 requires 459 | 2 columns to be printed.

460 |

To get more semantics, we have the 461 | Hoa\Ustring\Ustring::isCharPrintable method which allows to know 462 | whether a character is printable or not.

463 |

If we would like to count the number of columns necessary for a whole 464 | string, we have to use the Hoa\Ustring\Ustring::getWidth method. 465 | Thus:

466 |
var_dump(
467 |     $french->getWidth(),
468 |     $arabic->getWidth(),
469 |     $japanese->getWidth()
470 | );
471 | 
472 | /**
473 |  * Will output:
474 |  *     int(9)
475 |  *     int(4)
476 |  *     int(18)
477 |  */
478 |

Try this in your terminal with a monospaced font. You will 479 | observe that Japanese requires 18 columns to be printed. This measure is very 480 | useful if we would like to know the length of a string to position it 481 | efficiently.

482 |

The getCharWidth method is different from getWidth 483 | because it includes control characters. This method is intended to be used, 484 | for example, with terminals (please, see the 485 | Hoa\Console library).

486 |

Finally, if this time we are not interested by Unicode characters but 487 | rather by machine characters char (being 488 | 1 byte), we have an extra operation. The 489 | Hoa\Ustring\Ustring::getBytesLength method will count the 490 | length of the string in bytes:

491 |
var_dump(
492 |     $arabic->getBytesLength(),
493 |     $japanese->getBytesLength()
494 | );
495 | 
496 | /**
497 |  * Will output:
498 |  *     int(8)
499 |  *     int(27)
500 |  */
501 |

If we compare these results with the ones of the 502 | Hoa\Ustring\Ustring::count method, we understand that the Arabic 503 | characters are encoded with 2 bytes whereas Japanese characters are encoded 504 | with 3 bytes. We can also get a specific byte thanks to the 505 | Hoa\Ustring\Ustring::getByteAt method. Once again, the index is 506 | not bounded.

507 | 508 |

Code-point

509 | 510 |

Each character is represented by an integer, called a 511 | code-point. To get the code-point of a character, we can 512 | use the Hoa\Ustring\Ustring::toCode static method, and to get a 513 | character based on its code-point, we can use the 514 | Hoa\Ustring\Ustring::fromCode static method. We also have the 515 | Hoa\Ustring\Ustring::toBinaryCode method which returns the binary 516 | representation of a character. Let's take an example:

517 |
var_dump(
518 |     Hoa\Ustring\Ustring::toCode('Σ'),
519 |     Hoa\Ustring\Ustring::toBinaryCode('Σ'),
520 |     Hoa\Ustring\Ustring::fromCode(0x3a3)
521 | );
522 | 
523 | /**
524 |  * Will output:
525 |  *     int(931)
526 |  *     string(16) "1100111010100011"
527 |  *     string(2) "Σ"
528 |  */
529 | 530 |

Search algorithms

531 | 532 |

The Hoa\Ustring library provides sophisticated 533 | search algorithms on strings through the 534 | Hoa\Ustring\Search class.

535 |

We will study the Hoa\Ustring\Search::approximated algorithm 536 | which searches a sub-string in a string up to k 537 | differences (a difference is an addition, a deletion or a 538 | modification). Let's take the classical example of a DNA representation: We 539 | will search all the sub-strings approximating GATAA with 540 | 1 difference (maximum) in CAGATAAGAGAA. So, we will write:

541 |
$x      = 'GATAA';
542 | $y      = 'CAGATAAGAGAA';
543 | $k      = 1;
544 | $search = Hoa\Ustring\Search::approximated($y, $x, $k);
545 | $n      = count($search);
546 | 
547 | echo 'Try to match ', $x, ' in ', $y, ' with at most ', $k, ' difference(s):', "\n";
548 | echo $n, ' match(es) found:', "\n";
549 | 
550 | foreach ($search as $position) {
551 |     echo '    • ', substr($y, $position['i'], $position['l']), "\n";
552 | }
553 | 
554 | /**
555 |  * Will output:
556 |  *     Try to match GATAA in CAGATAAGAGAA with at most 1 difference(s):
557 |  *     4 match(es) found:
558 |  *         • AGATA
559 |  *         • GATAA
560 |  *         • ATAAG
561 |  *         • GAGAA
562 |  */
563 |

This method returns an array of arrays. Each sub-array represents a result 564 | and contains three indexes: i for the position of the first 565 | character (byte) of the result, j for the position of the last 566 | character and l for the length of the result (simply 567 | j - i). Thus, we can compute the results by using 568 | our initial string (here $y) and its 569 | indexes.

570 |

With our example, we have four results. The first is AGATA, 571 | being GATAA with one moved character, and 572 | AGATA exists in CAGATAAGAGAA. The second 573 | result is GATAA, our sub-string, which well and truly exists in 574 | CAGATAAGAGAA. The third result is ATAAG, 575 | being GATAA with one moved character, and 576 | ATAAG exists in CAGATAAGAGAA. Finally, the 577 | last result is GAGAA, being GATAA with one 578 | modified character, and GAGAA exists in 579 | CAGATAAGAGAA.

580 |

Another example, more concrete this time. We will consider the 581 | --testIt --foobar --testThat --testAt string (which represents 582 | possible options of a command line), and we will search --testot, 583 | an option that should have been given by the user. This option does not exist 584 | as it is. We will then use our search algorithm with at most 1 difference. 585 | Let's see:

586 |
$x      = 'testot';
587 | $y      = '--testIt --foobar --testThat --testAt';
588 | $k      = 1;
589 | $search = Hoa\Ustring\Search::approximated($y, $x, $k);
590 | $n      = count($search);
591 | 
592 | // …
593 | 
594 | /**
595 |  * Will output:
596 |  *     Try to match testot in --testIt --foobar --testThat --testAt with at most 1 difference(s)
597 |  *     2 match(es) found:
598 |  *         • testIt
599 |  *         • testAt
600 |  */
601 |

The testIt and testAt results are true options, 602 | so we can suggest them to the user. This is a mechanism used by 603 | Hoa\Console to suggest corrections to the user in case of a 604 | mistyping.

605 | 606 |

Conclusion

607 | 608 |

The Hoa\Ustring library provides facilities to manipulate 609 | strings encoded with the Unicode format, but also to make sophisticated search 610 | on strings.

611 | 612 |
613 |
614 | -------------------------------------------------------------------------------- /Documentation/Fr/Index.xyl: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 |

Les chaînes de caractères peuvent parfois être complexes, 7 | particulièrement lorsqu'elles utilisent l'encodage Unicode. 8 | La bibliothèque Hoa\Ustring propose plusieurs opérations sur des 9 | chaînes de caractères UTF-8.

10 | 11 |

Table des matières

12 | 13 | 14 | 15 |

Introduction

16 | 17 |

Lorsque nous manipulons des chaînes de caractères, le format 18 | Unicode s'impose par sa 19 | compatibilité avec les formats de base historiques (comme 20 | ASCII) et par sa grande capacité à comprendre une très large 21 | plage de caractères et de symboles, pour toutes les cultures et toutes les 22 | régions de notre monde. PHP propose plusieurs outils pour manipuler de telles 23 | chaînes, comme les extensions 24 | mbstring, 25 | iconv ou encore l'excellente 26 | intl qui se base sur 27 | ICU, l'implémentation de référence 28 | d'Unicode. Malheureusement, il faut parfois mélanger ces extensions pour 29 | arriver à nos fins et au prix d'une certaine complexité et 30 | d'une verbosité regrettable.

31 |

La bibliothèque Hoa\Ustring répond à ces problématiques en 32 | proposant une façon simple de manipuler des chaînes de 33 | caractères, de manière performante et 34 | efficace. Elle propose également des algorithmes évolués pour 35 | des opérations de recherche sur des chaînes de 36 | caractères.

37 | 38 |

Chaîne de caractères Unicode

39 | 40 |

La classe Hoa\Ustring\Ustring représente une chaîne de 41 | caractères Unicode UTF-8 et permet de la manipuler 42 | facilement. Elle implémente les interfaces 43 | ArrayAccess, 44 | Countable et 45 | IteratorAggregate. 46 | Nous allons utiliser trois exemples dans trois langues différentes : français, 47 | arabe et japonais. Ainsi :

48 |
$french   = new Hoa\Ustring\Ustring('Je t\'aime');
 49 | $arabic   = new Hoa\Ustring\Ustring('أحبك');
 50 | $japanese = new Hoa\Ustring\Ustring('私はあなたを愛して');
51 |

Maintenant, voyons les opérations possibles sur ces trois chaînes.

52 | 53 |

Manipulation de la chaîne

54 | 55 |

Commençons par les opérations élémentaires. Si nous 56 | voulons compter le nombre de caractères (et non pas 57 | d'octets), nous allons utiliser la fonction 58 | count de PHP. Ainsi :

59 |
var_dump(
 60 |     count($french),
 61 |     count($arabic),
 62 |     count($japanese)
 63 | );
 64 | 
 65 | /**
 66 |  * Will output:
 67 |  *     int(9)
 68 |  *     int(4)
 69 |  *     int(9)
 70 |  */
71 |

Quand nous parlons de position sur un texte, il n'est pas adéquat de parler 72 | de droite ou de gauche, mais plutôt de début ou de 73 | fin, et cela à partir de la direction (sens 74 | d'écriture) du texte. Nous pouvons connaître cette direction grâce à la 75 | méthode Hoa\Ustring\Ustring::getDirection. Elle retourne la 76 | valeur d'une des constantes suivantes :

77 |
    78 |
  • Hoa\Ustring\Ustring::LTR, pour 79 | left-to-right, si le texte s'écrit de gauche à 80 | droite ;
  • 81 |
  • Hoa\Ustring\Ustring::RTL, pour 82 | right-to-left, si le texte s'écrit de droite à 83 | gauche.
  • 84 |
85 |

Observons le résultat sur nos exemples :

86 |
var_dump(
 87 |     $french->getDirection()   === Hoa\Ustring\Ustring::LTR, // is left-to-right?
 88 |     $arabic->getDirection()   === Hoa\Ustring\Ustring::RTL, // is right-to-left?
 89 |     $japanese->getDirection() === Hoa\Ustring\Ustring::LTR  // is left-to-right?
 90 | );
 91 | 
 92 | /**
 93 |  * Will output:
 94 |  *     bool(true)
 95 |  *     bool(true)
 96 |  *     bool(true)
 97 |  */
98 |

Le résultat de cette méthode est calculé grâce à la méthode statique 99 | Hoa\Ustring\Ustring::getCharDirection qui calcule la direction 100 | d'un seul caractère.

101 |

Si nous voulons concaténer une autre chaîne à la fin ou au 102 | début, nous utiliserons respectivement les méthodes 103 | Hoa\Ustring\Ustring::append et 104 | Hoa\Ustring\Ustring::prepend. Ces méthodes, comme la plupart de 105 | celles qui modifient la chaîne, retournent l'objet lui-même, ce afin de 106 | chaîner les appels. Par exemple :

107 |
echo $french->append('… et toi, m\'aimes-tu ?')->prepend('Mam\'zelle ! ');
108 | 
109 | /**
110 |  * Will output:
111 |  *     Mam'zelle ! Je t'aime… et toi, m'aimes-tu ?
112 |  */
113 |

Nous avons également les méthodes 114 | Hoa\Ustring\Ustring::toLowerCase et 115 | Hoa\Ustring\Ustring::toUpperCase pour, respectivement, mettre la 116 | chaîne en minuscules ou en majuscules. Par 117 | exemple :

118 |
echo $french->toUpperCase();
119 | 
120 | /**
121 |  * Will output:
122 |  *     MAM'ZELLE ! JE T'AIME… ET TOI, M'AIMES-TU ?
123 |  */
124 |

Nous pouvons aussi ajouter des caractères en début ou en fin de chaîne pour 125 | atteindre une taille minimum. Cette opération est plus 126 | couramment appelée le padding (pour des raisons historiques 127 | remontant aux machines à écrire). C'est pourquoi nous trouvons la méthode 128 | Hoa\Ustring\Ustring::pad qui prend trois arguments : la taille 129 | minimum, les caractères à ajouter et une constante indiquant si nous devons 130 | ajouter en fin ou en début de chaîne (respectivement 131 | Hoa\Ustring\Ustring::END, par défaut, et 132 | Hoa\Ustring\Ustring::BEGINNING).

133 |
echo $arabic->pad(20, ' ');
134 | 
135 | /**
136 |  * Will output:
137 |  *                     أحبك
138 |  */
139 |

Une opération similairement inverse permet de supprimer, par défaut, les 140 | espaces en début et en fin de chaîne grâce à la méthode 141 | Hoa\Ustring\Ustring::trim. Par exemple, pour revenir à notre 142 | chaîne arabe originale :

143 |
echo $arabic->trim();
144 | 
145 | /**
146 |  * Will output:
147 |  *     أحبك
148 |  */
149 |

Si nous voulons supprimer d'autres caractères, nous pouvons utiliser son 150 | premier argument qui doit être une expression régulière. Enfin, son second 151 | argument permet de préciser de quel côté nous voulons supprimer les 152 | caractères : en début, en fin ou les deux, toujours en utilisant les 153 | constantes Hoa\Ustring\Ustring::BEGINNING et 154 | Hoa\Ustring\Ustring::END. Nous pouvons combiner ces constantes 155 | pour exprimer « les deux côtés », ce qui est la valeur par défaut : 156 | Hoa\Ustring\Ustring::BEGINNING | 157 | Hoa\Ustring\Ustring::END. Par exemple, pour supprimer tous les nombres 158 | et les espaces uniquement à la fin, nous écrirons :

159 |
$arabic->trim('\s|\d', Hoa\Ustring\Ustring::END);
160 |

Nous pouvons également réduire la chaîne à une 161 | sous-chaîne en précisant la position du premier caractère 162 | puis la taille de la sous-chaîne à la méthode 163 | Hoa\Ustring\Ustring::reduce :

164 |
echo $french->reduce(3, 6)->reduce(2, 4);
165 | 
166 | /**
167 |  * Will output:
168 |  *     aime
169 |  */
170 |

Si nous voulons obtenir un caractère en particulier, nous pouvons exploiter 171 | l'interface ArrayAccess. Par exemple, pour obtenir le premier 172 | caractère de chacun de nos exemples (en les reprenant depuis le début) :

173 |
var_dump(
174 |     $french[0],
175 |     $arabic[0],
176 |     $japanese[0]
177 | );
178 | 
179 | /**
180 |  * Will output:
181 |  *     string(1) "J"
182 |  *     string(2) "أ"
183 |  *     string(3) "私"
184 |  */
185 |

Si nous voulons le dernier caractère, nous utiliserons l'index -1. L'index 186 | n'est pas borné à la taille de la chaîne. Si jamais l'index dépasse cette 187 | taille, alors un modulo sera appliqué.

188 |

Nous pouvons aussi modifier ou supprimer un caractère précis avec cette 189 | méthode. Par exemple :

190 |
$french->append(' ?');
191 | $french[-1] = '!';
192 | echo $french;
193 | 
194 | /**
195 |  * Will output:
196 |  *     Je t'aime !
197 |  */
198 |

Une autre méthode fort utile est la transformation en 199 | ASCII. Attention, ce n'est pas toujours possible, selon votre 200 | installation. Par exemple :

201 |
$title = new Hoa\Ustring\Ustring('Un été brûlant sur la côte');
202 | echo $title->toAscii();
203 | 
204 | /**
205 |  * Will output:
206 |  *     Un ete brulant sur la cote
207 |  */
208 |

Nous pouvons aussi transformer de l'arabe ou du japonais vers de l'ASCII. 209 | Les symboles, comme les symboles mathématiques ou les emojis, sont aussi 210 | transformés :

211 |
$emoji = new Hoa\Ustring\Ustring('I ❤ Unicode');
212 | $maths = new Hoa\Ustring\Ustring('∀ i ∈ ℕ');
213 | 
214 | echo
215 |     $arabic->toAscii(), "\n",
216 |     $japanese->toAscii(), "\n",
217 |     $emoji->toAscii(), "\n",
218 |     $maths->toAscii(), "\n";
219 | 
220 | /**
221 |  * Will output:
222 |  *     ahbk
223 |  *     sihaanatawo aishite
224 |  *     I (heavy black heart)️ Unicode
225 |  *     (for all) i (element of) N
226 |  */
227 |

Pour que cette méthode fonctionne correctement, il faut que l'extension 228 | intl soit présente, pour que la 229 | classe Transliterator 230 | existe. Si elle n'existe pas, la classe 231 | Normalizer doit exister. 232 | Si cette classe n'existe pas non plus, la méthode 233 | Hoa\Ustring\Ustring::toAscii peut quand même essayer une 234 | transformation mais moins efficace. Pour cela, il faut passer 235 | true en seul argument. Ce tour de force est déconseillé dans la 236 | plupart des cas.

237 |

Nous trouvons également la méthode getTransliterator qui 238 | retourne un objet Transliterator, ou null si cette 239 | classe n'existe pas. Cette méthode prend en argument un identifiant de 240 | translitération. Nous conseillons de 241 | lire la 242 | documentation sur le translitérateur d'ICU pour comprendre cet 243 | identifiant. La méthode transliterate permet de translitérer la 244 | chaîne courante à partir d'un identifiant et d'un index de début et de 245 | fin. Elle fonctionne de la même façon que la méthode 246 | Transliterator::transliterate.

247 | 248 |

Plus généralement, pour des changements d'encodage brut, 249 | nous pouvons utiliser la méthode statique 250 | Hoa\Ustring\Ustring::transcode, avec en premier argument une chaîne 251 | de caractères, en deuxième argument l'encodage d'origine et en dernier 252 | argument l'encodage final souhaité (par défaut UTF-8). Pour la liste des 253 | encodages, il faut se reporter à l'extension 254 | iconv ou entrer la commande 255 | suivante dans un terminal :

256 |
$ iconv --list
257 |

Pour savoir si une chaîne est encodée en UTF-8, nous pouvons utiliser la 258 | méthode statique Hoa\Ustring\Ustring::isUtf8 ; par exemple :

259 |
var_dump(
260 |     Hoa\Ustring\Ustring::isUtf8('a'),
261 |     Hoa\Ustring\Ustring::isUtf8(Hoa\Ustring\Ustring::transcode('a', 'UTF-8', 'UTF-16'))
262 | );
263 | 
264 | /**
265 |  * Will output:
266 |  *     bool(true)
267 |  *     bool(false)
268 |  */
269 |

Nous pouvons éclater la chaîne en plusieurs sous-chaînes 270 | en utilisant la méthode Hoa\Ustring\Ustring::split. En premier 271 | argument, nous avons une expression régulière (type 272 | PCRE), puis un entier représentant le nombre 273 | maximum d'éléments à retourner et enfin une combinaison de constantes. Ces 274 | constantes sont les mêmes que celles de 275 | preg_split.

276 |

Par défaut, le deuxième argument vaut -1, qui symbolise l'infini, et le 277 | dernier argument vaut PREG_SPLIT_NO_EMPTY. Ainsi, si nous 278 | voulons obtenir tous les mots d'une chaîne, nous écrirons :

279 |
print_r($title->split('#\b|\s#'));
280 | 
281 | /**
282 |  * Will output:
283 |  *     Array
284 |  *     (
285 |  *         [0] => Un
286 |  *         [1] => ete
287 |  *         [2] => brulant
288 |  *         [3] => sur
289 |  *         [4] => la
290 |  *         [5] => cote
291 |  *     )
292 |  */
293 |

Si nous voulons itérer sur tous les 294 | caractères, il est préférable d'exploiter l'interface 295 | IteratorAggregate, soit la méthode 296 | Hoa\Ustring\Ustring::getIterator. Voyons plutôt sur l'exemple en 297 | arabe :

298 |
foreach ($arabic as $letter) {
299 |     echo $letter, "\n";
300 | }
301 | 
302 | /**
303 |  * Will output:
304 |  *     أ
305 |  *     ح
306 |  *     ب
307 |  *     ك
308 |  */
309 |

Nous remarquons que l'itération se fait suivant la direction du texte, 310 | c'est à dire que le premier élément de l'itération est la première lettre de 311 | la chaîne en partant du début.

312 |

Bien sûr, si nous voulons obtenir un tableau des caractères, nous pouvons 313 | utiliser la fonction 314 | iterator_to_array 315 | de PHP :

316 |
print_r(iterator_to_array($arabic));
317 | 
318 | /**
319 |  * Will output:
320 |  *     Array
321 |  *     (
322 |  *         [0] => أ
323 |  *         [1] => ح
324 |  *         [2] => ب
325 |  *         [3] => ك
326 |  *     )
327 |  */
328 | 329 | 330 | 331 |

Les chaînes peuvent également être comparées entre elles 332 | grâce à la méthode Hoa\Ustring\Ustring::compare :

333 |
$string = new Hoa\Ustring\Ustring('abc');
334 | var_dump(
335 |     $string->compare('wxyz')
336 | );
337 | 
338 | /**
339 |  * Will output:
340 |  *     int(-1)
341 |  */
342 |

Cette méthode retourne -1 si la chaîne initiale vient avant (par ordre 343 | alphabétique), 0 si elle est identique et 1 si elle vient après. Si nous 344 | voulons utiliser la pleine 345 | puissance du mécanisme sous-jacent, nous pouvons appeler la méthode statique 346 | Hoa\Ustring\Ustring::getCollator (si la classe 347 | Collator existe, sinon 348 | Hoa\Ustring\Ustring::compare utilisera une comparaison simple 349 | octet par octet sans tenir compte d'autres paramètres). Ainsi, si nous 350 | voulons trier un tableau de chaînes, nous écrirons plutôt :

351 |
$strings = array('c', 'Σ', 'd', 'x', 'α', 'a');
352 | Hoa\Ustring\Ustring::getCollator()->sort($strings);
353 | print_r($strings);
354 | 
355 | /**
356 |  * Could output:
357 |  *     Array
358 |  *     (
359 |  *         [0] => a
360 |  *         [1] => c
361 |  *         [2] => d
362 |  *         [3] => x
363 |  *         [4] => α
364 |  *         [5] => Σ
365 |  *     )
366 |  */
367 |

La comparaison entre deux chaînes dépend de la locale, 368 | c'est à dire de la régionalisation du système, comme la langue, le pays, la 369 | région etc. Nous pouvons utiliser la 370 | bibliothèque Hoa\Locale pour modifier ces données, mais ce 371 | n'est pas une dépendance de Hoa\Ustring pour autant.

372 |

Nous pouvons également savoir si une chaîne correspond à 373 | un certain motif, toujours exprimé avec une expression régulière. Pour cela, 374 | nous allons utiliser la méthode Hoa\Ustring\Ustring::match. Cette 375 | méthode repose sur les fonctions 376 | preg_match et 377 | preg_match_all de 378 | PHP, mais en modifiant les options du motif afin qu'il supporte Unicode. Nous 379 | avons les paramètres suivants : le motif, une variable par référence pour 380 | récupérer les captures, les flags, la position de début de 381 | recherche (offset) et enfin un booléen indiquant si la 382 | recherche est globale ou non (respectivement si nous devons utiliser 383 | preg_match_all ou preg_match). Par défaut, la 384 | recherche n'est pas globale.

385 |

Ainsi, nous allons vérifier que notre exemple en français contient bien 386 | aime avec son complément d'objet direct :

387 |
$french->match('#(?:(?<direct_object>\w)[\'\b])aime#', $matches);
388 | var_dump($matches['direct_object']);
389 | 
390 | /**
391 |  * Will output:
392 |  *     string(1) "t"
393 |  */
394 |

Cette méthode retourne false si une erreur est survenue (par 395 | exemple si le motif n'est pas correct), 0 si aucune correspondance n'a été 396 | trouvée, le nombre de correspondances trouvées sinon.

397 |

Similairement, nous pouvons chercher et 398 | remplacer des sous-chaînes par d'autres sous-chaînes suivant 399 | un motif, toujours exprimé avec une expression régulière. Pour cela, nous 400 | allons utiliser la méthode Hoa\Ustring\Ustring::replace. Cette 401 | méthode repose sur les fonctions 402 | preg_replace et 403 | preg_replace_callback 404 | de PHP, mais toujours en modifiant les options du motif afin qu'il supporte 405 | Unicode. En premier argument, nous trouvons le ou les motifs, en deuxième 406 | argument, le ou les remplacements et en dernier argument la limite de 407 | remplacements à faire. Si le remplacement est un callable, 408 | alors la fonction preg_replace_callback sera utilisée.

409 |

Ainsi, nous allons modifier notre exemple français pour qu'il soit plus 410 | poli :

411 |
$french->replace('#(?:\w[\'\b])(?<verb>aime)#', function ($matches) {
412 |     return 'vous ' . $matches['verb'];
413 | });
414 | 
415 | echo $french;
416 | 
417 | /**
418 |  * Will output:
419 |  *     Je vous aime
420 |  */
421 |

La classe Hoa\Ustring\Ustring propose des constantes qui sont 422 | des alias de constantes PHP et qui permettent une meilleure lecture du 423 | code :

424 |
    425 |
  • Hoa\Ustring\Ustring::WITHOUT_EMPTY, alias de 426 | PREG_SPLIT_NO_EMPTY ;
  • 427 |
  • Hoa\Ustring\Ustring::WITH_DELIMITERS, alias de 428 | PREG_SPLIT_DELIM_CAPTURE ;
  • 429 |
  • Hoa\Ustring\Ustring::WITH_OFFSET, alias de 430 | PREG_OFFSET_CAPTURE et 431 | PREG_SPLIT_OFFSET_CAPTURE ;
  • 432 |
  • Hoa\Ustring\Ustring::GROUP_BY_PATTERN, alias de 433 | PREG_PATTERN_ORDER ;
  • 434 |
  • Hoa\Ustring\Ustring::GROUP_BY_TUPLE, alias de 435 | PREG_SET_ORDER.
  • 436 |
437 |

Comme ce sont des aliases stricts, nous pouvons écrire :

438 |
$string = new Hoa\Ustring\Ustring('abc1 defg2 hikl3 xyz4');
439 | $string->match(
440 |     '#(\w+)(\d)#',
441 |     $matches,
442 |     Hoa\Ustring\Ustring::WITH_OFFSET
443 |   | Hoa\Ustring\Ustring::GROUP_BY_TUPLE,
444 |     0,
445 |     true
446 | );
447 | 448 |

Caractères

449 | 450 |

La classe Hoa\Ustring\Ustring offre des méthodes statiques 451 | travaillant sur un seul caractère Unicode. Nous avons déjà évoqué la méthode 452 | getCharDirection qui permet de connaître la 453 | direction d'un caractère. Nous trouvons aussi 454 | getCharWidth qui calcule le nombre de colonnes 455 | nécessaires pour l'affichage d'un seul caractère. Ainsi :

456 |
var_dump(
457 |     Hoa\Ustring\Ustring::getCharWidth(Hoa\Ustring\Ustring::fromCode(0x7f)),
458 |     Hoa\Ustring\Ustring::getCharWidth('a'),
459 |     Hoa\Ustring\Ustring::getCharWidth('㽠')
460 | );
461 | 
462 | /**
463 |  * Will output:
464 |  *     int(-1)
465 |  *     int(1)
466 |  *     int(2)
467 |  */
468 |

Cette méthode retourne -1 ou 0 si le caractère n'est pas 469 | imprimable (par exemple si c'est un caractère de contrôle, 470 | comme 0x7f qui correspond à DELETE), 1 ou plus si 471 | c'est un caractère qui peut être imprimé. Dans notre exemple, 472 | 㽠 s'imprime sur 2 colonnes.

473 |

Pour plus de sémantique, nous avons accès à la méthode 474 | Hoa\Ustring\Ustring::isCharPrintable qui permet de savoir si un 475 | caractère est imprimable ou pas.

476 |

Si nous voulons calculer le nombre de colonnes pour toute une chaîne, il 477 | faut utiliser la méthode Hoa\Ustring\Ustring::getWidth. 478 | Ainsi :

479 |
var_dump(
480 |     $french->getWidth(),
481 |     $arabic->getWidth(),
482 |     $japanese->getWidth()
483 | );
484 | 
485 | /**
486 |  * Will output:
487 |  *     int(9)
488 |  *     int(4)
489 |  *     int(18)
490 |  */
491 |

Essayez dans un terminal avec une police mono-espacée. 492 | Vous verrez que le japonais demande 18 colonnes pour s'afficher. Cette mesure 493 | est très utile si nous voulons connaître la largeur d'une chaîne pour la 494 | positionner correctement.

495 |

La méthode getCharWidth est différente de 496 | getWidth car elle prend en compte des caractères de contrôle. 497 | Elle est destinée à être utilisée, par exemple, avec des terminaux (voir 498 | la bibliothèque 499 | Hoa\Console).

500 |

Enfin, si cette fois nous ne nous intéressons pas aux caractères Unicode 501 | mais aux caractères machines char (soit 1 502 | octet), nous avons une opération supplémentaire. La méthode 503 | Hoa\Ustring\Ustring::getBytesLength va compter la 504 | taille de la chaîne en octets :

505 |
var_dump(
506 |     $arabic->getBytesLength(),
507 |     $japanese->getBytesLength()
508 | );
509 | 
510 | /**
511 |  * Will output:
512 |  *     int(8)
513 |  *     int(27)
514 |  */
515 |

Si nous comparons ces résultats avec ceux de la méthode 516 | Hoa\Ustring\Ustring::count, nous comprenons que les caractères 517 | arabes sont encodés sur 2 octets alors que les caractères japonais sont 518 | encodés sur 3 octets. Nous pouvons également obtenir un octet précis à l'aide 519 | de la méthode Hoa\Ustring\Ustring::getByteAt. Encore une fois, 520 | l'index n'est pas borné.

521 | 522 |

Code-point

523 | 524 |

Chaque caractère est représenté en machine par un entier, appelé 525 | code-point. Pour obtenir le code-point d'un caractère, nous 526 | pouvons utiliser la méthode statique Hoa\Ustring\Ustring::toCode, 527 | et pour obtenir un caractère à partir d'un code, nous pouvons utiliser la 528 | méthode statique Hoa\Ustring\Ustring::fromCode. Nous avons aussi 529 | la méthode statique Hoa\Ustring\Ustring::toBinaryCode qui 530 | retourne la représentation sous forme binaire d'un caractère. Prenons un 531 | exemple :

532 |
var_dump(
533 |     Hoa\Ustring\Ustring::toCode('Σ'),
534 |     Hoa\Ustring\Ustring::toBinaryCode('Σ'),
535 |     Hoa\Ustring\Ustring::fromCode(0x3a3)
536 | );
537 | 
538 | /**
539 |  * Will output:
540 |  *     int(931)
541 |  *     string(16) "1100111010100011"
542 |  *     string(2) "Σ"
543 |  */
544 | 545 |

Algorithmes de recherche

546 | 547 |

La bibliothèque Hoa\Ustring propose des algorithmes de 548 | recherches sophistiquées sur les chaînes de caractères à 549 | travers la classe Hoa\Ustring\Search.

550 |

Nous allons étudier l'algorithme 551 | Hoa\Ustring\Search::approximated qui fait une recherche d'une 552 | sous-chaîne dans une chaîne avec au maximum k 553 | différences (une différence étant une insertion, une délétion ou une 554 | modification). Prenons un exemple classique avec une représentation 555 | ADN : nous allons chercher toutes les sous-chaînes s'approchant de 556 | GATAA à 1 différence près (au maximum) dans 557 | CAGATAAGAGAA. Pour cela, nous allons donc écrire :

558 |
$x      = 'GATAA';
559 | $y      = 'CAGATAAGAGAA';
560 | $k      = 1;
561 | $search = Hoa\Ustring\Search::approximated($y, $x, $k);
562 | $n      = count($search);
563 | 
564 | echo 'Try to match ', $x, ' in ', $y, ' with at most ', $k, ' difference(s):', "\n";
565 | echo $n, ' match(es) found:', "\n";
566 | 
567 | foreach ($search as $position) {
568 |     echo '    • ', substr($y, $position['i'], $position['l']), "\n";
569 | }
570 | 
571 | /**
572 |  * Will output:
573 |  *     Try to match GATAA in CAGATAAGAGAA with at most 1 difference(s):
574 |  *     4 match(es) found:
575 |  *         • AGATA
576 |  *         • GATAA
577 |  *         • ATAAG
578 |  *         • GAGAA
579 |  */
580 |

Cette méthode retourne un tableau de tableaux. Chaque sous-tableau 581 | représente un résultat et contient trois indexes : i pour la 582 | position du premier caractère (octet) du résultat, j pour la 583 | position du dernier caractère et l pour la taille du résultat 584 | (tout simplement j - i). 585 | Ainsi, nous pouvons calculer les résultats en utilisant notre chaîne initiale 586 | (ici $y) et ces indexes.

587 |

Avec notre exemple, nous avons quatre résultats. Le premier est 588 | AGATA, soit GATAA avec un caractère 589 | déplacé, et AGATA existe bien dans 590 | CAGATAAGAGAA. Le deuxième résultat est 591 | GATAA, notre sous-chaîne, qui existe bel et bien dans 592 | CAGATAAGAGAA. Le troisième résultat est 593 | ATAAG, soit GATAA avec un caractère 594 | déplacé, et ATAAG existe bien dans 595 | CAGATAAGAGAA. Enfin, le dernier résultat est 596 | GAGAA, soit GATAA avec un caractère 597 | modifié, et GAGAA existe bien dans 598 | CAGATAAGAGAA.

599 |

Prenons un autre exemple, plus concret cette fois-ci. Nous allons 600 | considérer la chaîne --testIt --foobar --testThat --testAt (qui 601 | représente les options possibles d'une ligne de commande), et nous allons 602 | chercher --testot, une option qu'aurait pu donner 603 | l'utilisateur. Cette option n'existe pas telle quelle. Nous allons donc 604 | utiliser notre algorithme de recherche avec 1 différence au maximum. Voyons 605 | plutôt :

606 |
$x      = 'testot';
607 | $y      = '--testIt --foobar --testThat --testAt';
608 | $k      = 1;
609 | $search = Hoa\Ustring\Search::approximated($y, $x, $k);
610 | $n      = count($search);
611 | 
612 | // …
613 | 
614 | /**
615 |  * Will output:
616 |  *     Try to match testot in --testIt --foobar --testThat --testAt with at most 1 difference(s)
617 |  *     2 match(es) found:
618 |  *         • testIt
619 |  *         • testAt
620 |  */
621 |

Les résultats testIt et testAt sont des vraies 622 | options, donc nous pouvons les proposer à l'utilisateur. C'est un mécanisme 623 | utilisé par Hoa\Console pour proposer des corrections à 624 | l'utilisateur s'il se trompe.

625 | 626 |

Conclusion

627 | 628 |

La bibliothèque Hoa\Ustring propose des facilités pour 629 | manipuler des chaînes encodées au format Unicode, mais aussi pour effectuer 630 | des recherches sophistiquées sur des chaînes.

631 | 632 |
633 |
634 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | Hoa 3 |

4 | 5 | --- 6 | 7 |

8 | Build status 9 | Code coverage 10 | Packagist 11 | License 12 |

13 |

14 | Hoa is a modular, extensible and 15 | structured set of PHP libraries.
16 | Moreover, Hoa aims at being a bridge between industrial and research worlds. 17 |

18 | 19 | # Hoa\Ustring 20 | 21 | [![Help on IRC](https://img.shields.io/badge/help-%23hoaproject-ff0066.svg)](https://webchat.freenode.net/?channels=#hoaproject) 22 | [![Help on Gitter](https://img.shields.io/badge/help-gitter-ff0066.svg)](https://gitter.im/hoaproject/central) 23 | [![Documentation](https://img.shields.io/badge/documentation-hack_book-ff0066.svg)](https://central.hoa-project.net/Documentation/Library/Ustring) 24 | [![Board](https://img.shields.io/badge/organisation-board-ff0066.svg)](https://waffle.io/hoaproject/ustring) 25 | 26 | This library allows to manipulate UTF-8 strings easily with some search 27 | algorithms. 28 | 29 | [Learn more](https://central.hoa-project.net/Documentation/Library/Ustring). 30 | 31 | ## Installation 32 | 33 | With [Composer](https://getcomposer.org/), to include this library into 34 | your dependencies, you need to 35 | require [`hoa/ustring`](https://packagist.org/packages/hoa/ustring): 36 | 37 | ```sh 38 | $ composer require hoa/ustring '~4.0' 39 | ``` 40 | 41 | For more installation procedures, please read [the Source 42 | page](https://hoa-project.net/Source.html). 43 | 44 | ## Testing 45 | 46 | Before running the test suites, the development dependencies must be installed: 47 | 48 | ```sh 49 | $ composer install 50 | ``` 51 | 52 | Then, to run all the test suites: 53 | 54 | ```sh 55 | $ vendor/bin/hoa test:run 56 | ``` 57 | 58 | For more information, please read the [contributor 59 | guide](https://hoa-project.net/Literature/Contributor/Guide.html). 60 | 61 | ## Quick usage 62 | 63 | We propose a quick overview of two usages: manipulate UTF-8 strings and one 64 | search algorithm. 65 | 66 | ### Natural UTF-8 strings manipulation 67 | 68 | The `Hoa\Ustring\Ustring` class allows to manipulate easily UTF-8 strings in a 69 | very natural way. This class implements the `\ArrayAccess`, `\Countable` and 70 | `\IteratorAggregate` interfaces. 
We will use the following examples: 71 | 72 | ```php 73 | $french = new Hoa\Ustring\Ustring('Je t\'aime'); 74 | $arabic = new Hoa\Ustring\Ustring('أحبك'); 75 | $japanese = new Hoa\Ustring\Ustring('私はあなたを愛して'); 76 | ``` 77 | 78 | To get the first character, we will do: 79 | 80 | ```php 81 | var_dump( 82 | $french[0], // string(1) "J" 83 | $arabic[0], // string(2) "أ" 84 | $japanese[0] // string(3) "私" 85 | ); 86 | ``` 87 | 88 | And to get the last character, we will do `[-1]`. It supports unbounded (and 89 | modulo) indexes. 90 | 91 | We note that it cares about text **direction**. Look at `$arabic[0]`, it returns 92 | `أ` and not `ك`. To get the direction, we can use the 93 | `Hoa\Ustring\Ustring::getDirection` method (which call the 94 | `Hoa\Ustring\Ustring::getCharDirection` static method), it returns either 95 | `Hoa\Ustring\Ustring::LTR` (`0`) or `Hoa\Ustring\Ustring::RTL` (`1`): 96 | 97 | ```php 98 | var_dump( 99 | $french->getDirection(), // int(0) 100 | $arabic->getDirection(), // int(1) 101 | $japanese->getDirection() // int(0) 102 | ); 103 | ``` 104 | 105 | Text direction is also important for the `append`, `prepend`, `pad`… methods on 106 | `Hoa\Ustring\Ustring` for example. 107 | 108 | To get the length of a string, we can use the `count` function: 109 | 110 | ```php 111 | var_dump( 112 | count($french), // int(9) 113 | count($arabic), // int(4) 114 | count($japanese) // int(9) 115 | ); 116 | ``` 117 | 118 | We are also able to iterate over the string: 119 | 120 | ```php 121 | foreach ($arabic as $letter) { 122 | var_dump($letter); 123 | } 124 | 125 | /** 126 | * Will output: 127 | * string(2) "أ" 128 | * string(2) "ح" 129 | * string(2) "ب" 130 | * string(2) "ك" 131 | */ 132 | ``` 133 | 134 | Again, text direction is useful here. For `$arabic`, the iteration is done from 135 | right to left. 
136 | 137 | Some static methods are helpful, such as `fromCode`, `toCode` or `isUtf8` on 138 | `Hoa\Ustring\Ustring`: 139 | 140 | ```php 141 | var_dump( 142 | Hoa\Ustring\Ustring::fromCode(0x1a9), // string(2) "Ʃ" 143 | Hoa\Ustring\Ustring::toCode('Ʃ'), // int(425) == 0x1a9 144 | Hoa\Ustring\Ustring::isUtf8('Ʃ') // bool(true) 145 | ); 146 | ``` 147 | 148 | We can also transform any text into ASCII: 149 | 150 | ```php 151 | $emoji = new Hoa\Ustring\Ustring('I ❤ Unicode'); 152 | $maths = new Hoa\Ustring\Ustring('∀ i ∈ ℕ'); 153 | 154 | echo 155 | $emoji->toAscii(), "\n", 156 | $maths->toAscii(), "\n"; 157 | 158 | /** 159 | * Will output: 160 | * I (heavy black heart) Unicode 161 | * (for all) i (element of) N 162 | */ 163 | ``` 164 | 165 | ### Search algorithm 166 | 167 | The `Hoa\Ustring\Search` class implements search algorithms on strings. 168 | 169 | For example, the `Hoa\Ustring\Search::approximated` method makes a search by 170 | approximated patterns (with *k* differences based upon the principle of diagonal 171 | monotony). If we search for the word `GATAA` in `CAGATAAGAGAA` with 1 difference, we 172 | will do: 173 | 174 | ```php 175 | $search = Hoa\Ustring\Search::approximated( 176 | $haystack = 'CAGATAAGAGAA', 177 | $needle = 'GATAA', 178 | $k = 1 179 | ); 180 | $solutions = array(); 181 | 182 | foreach ($search as $pos) { 183 | $solutions[] = substr($haystack, $pos['i'], $pos['l']); 184 | } 185 | ``` 186 | 187 | We will find `AGATA`, `GATAA`, `ATAAG` and `GAGAA`. 188 | 189 | The result is not very handy but the algorithm is highly optimized and has found many 190 | applications. 191 | 192 | ## Documentation 193 | 194 | The 195 | [hack book of `Hoa\Ustring`](https://central.hoa-project.net/Documentation/Library/Ustring) contains 196 | detailed information about how to use this library and how it works. 
197 | 198 | To generate the documentation locally, execute the following commands: 199 | 200 | ```sh 201 | $ composer require --dev hoa/devtools 202 | $ vendor/bin/hoa devtools:documentation --open 203 | ``` 204 | 205 | More documentation can be found on the project's website: 206 | [hoa-project.net](https://hoa-project.net/). 207 | 208 | ## Getting help 209 | 210 | There are mainly two ways to get help: 211 | 212 | * On the [`#hoaproject`](https://webchat.freenode.net/?channels=#hoaproject) 213 | IRC channel, 214 | * On the forum at [users.hoa-project.net](https://users.hoa-project.net). 215 | 216 | ## Contribution 217 | 218 | Do you want to contribute? Thanks! A detailed [contributor 219 | guide](https://hoa-project.net/Literature/Contributor/Guide.html) explains 220 | everything you need to know. 221 | 222 | ## License 223 | 224 | Hoa is under the New BSD License (BSD-3-Clause). Please, see 225 | [`LICENSE`](https://hoa-project.net/LICENSE) for details. 226 | -------------------------------------------------------------------------------- /Source/Exception.php: -------------------------------------------------------------------------------- 1 | array_fill(-1, $n - $m + $k + 3, -2)]; 58 | 59 | for ($q = 0, $max = $k - 1; $q <= $max; ++$q) { 60 | $L[$q][-$q - 1] = $L[$q][-$q - 2] = $q - 1; 61 | } 62 | 63 | for ($q = 0; $q <= $k; ++$q) { 64 | for ($d = -$q, $max = $n - $m + $k - $q; $d <= $max; ++$d) { 65 | $l = min( 66 | max( 67 | $L[$q - 1][$d - 1], 68 | $L[$q - 1][$d ] + 1, 69 | $L[$q - 1][$d + 1] + 1 70 | ), 71 | $m - 1 72 | ); 73 | $a = substr($x, $l + 1, $m - $l); 74 | $b = substr($y, $l + 1 + $d, $n - $l - $d); 75 | $L[$q][$d] = $l + static::lcp($a, $b); 76 | 77 | if ($L[$q][$d] == $m - 1 || 78 | $d + $L[$q][$d] == $n - 1) { 79 | $j = $m + $d; 80 | $i = max(0, $j - $m); 81 | $offset[$q][] = ['i' => $i, 'j' => $j, 'l' => $j - $i]; 82 | } 83 | } 84 | } 85 | 86 | return empty($offset) ? 
$offset : $offset[$k]; 87 | } 88 | 89 | /** 90 | * Length of the longest common prefixes. 91 | */ 92 | public static function lcp(string $x, string $y): int 93 | { 94 | $max = min(strlen($x), strlen($y)); 95 | $i = 0; 96 | 97 | while ($i < $max && $x[$i] == $y[$i]) { 98 | ++$i; 99 | } 100 | 101 | return $i; 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /Source/Ustring.php: -------------------------------------------------------------------------------- 1 | append($string); 165 | } 166 | 167 | return; 168 | } 169 | 170 | /** 171 | * Check if ext/mbstring is available. 172 | */ 173 | public static function checkMbString(): bool 174 | { 175 | return function_exists('mb_substr'); 176 | } 177 | 178 | /** 179 | * Check if ext/iconv is available. 180 | */ 181 | public static function checkIconv(): bool 182 | { 183 | return function_exists('iconv'); 184 | } 185 | 186 | /** 187 | * Append a substring to the current string, i.e. add to the end. 188 | */ 189 | public function append(string $substring): self 190 | { 191 | $this->_string .= $substring; 192 | 193 | return $this; 194 | } 195 | 196 | /** 197 | * Prepend a substring to the current string, i.e. add to the start. 198 | */ 199 | public function prepend(string $substring): self 200 | { 201 | $this->_string = $substring . $this->_string; 202 | 203 | return $this; 204 | } 205 | 206 | /** 207 | * Pad the current string to a certain length with another piece, aka piece. 208 | */ 209 | public function pad(int $length, string $piece, int $side = self::END): self 210 | { 211 | $difference = $length - $this->count(); 212 | 213 | if (0 >= $difference) { 214 | return $this; 215 | } 216 | 217 | $handle = null; 218 | 219 | for ($i = $difference / mb_strlen($piece) - 1; $i >= 0; --$i) { 220 | $handle .= $piece; 221 | } 222 | 223 | $handle .= mb_substr($piece, 0, $difference - mb_strlen($handle)); 224 | 225 | return 226 | static::END === $side 227 | ? 
$this->append($handle) 228 | : $this->prepend($handle); 229 | } 230 | 231 | /** 232 | * Make a comparison with a string. 233 | * Return < 0 if current string is less than $string, > 0 if greater and 0 234 | * if equal. 235 | */ 236 | public function compare($string): int 237 | { 238 | if (null === $collator = static::getCollator()) { 239 | return strcmp($this->_string, (string) $string); 240 | } 241 | 242 | return $collator->compare($this->_string, $string); 243 | } 244 | 245 | /** 246 | * Get collator. 247 | */ 248 | public static function getCollator(): ?Collator 249 | { 250 | if (false === class_exists('Collator')) { 251 | return null; 252 | } 253 | 254 | if (null === static::$_collator) { 255 | static::$_collator = new Collator(setlocale(LC_COLLATE, null)); 256 | } 257 | 258 | return static::$_collator; 259 | } 260 | 261 | /** 262 | * Ensure that the pattern is safe for Unicode: add the “u” option. 263 | */ 264 | public static function safePattern(string $pattern): string 265 | { 266 | $delimiter = mb_substr($pattern, 0, 1); 267 | $options = mb_substr( 268 | mb_strrchr($pattern, $delimiter, false), 269 | mb_strlen($delimiter) 270 | ); 271 | 272 | if (false === strpos($options, 'u')) { 273 | $pattern .= 'u'; 274 | } 275 | 276 | return $pattern; 277 | } 278 | 279 | /** 280 | * Perform a regular expression (PCRE) match. 
*/
    public function match(
        string $pattern,
        ?array &$matches = null,
        int $flags = 0,
        int $offset = 0,
        bool $global = false
    ): int {
        $pattern = static::safePattern($pattern);

        if (0 === $flags) {
            if (true === $global) {
                $flags = static::GROUP_BY_PATTERN;
            }
        } else {
            // PREG_SPLIT_OFFSET_CAPTURE is a preg_split() flag: clear that bit
            // before handing the flags to preg_match()/preg_match_all().
            $flags &= ~PREG_SPLIT_OFFSET_CAPTURE;
        }

        // $offset is expressed in characters whereas PCRE expects a number of
        // bytes: measure the byte length of the first $offset characters.
        $offset = strlen(mb_substr($this->_string, 0, $offset));

        // NOTE(review): preg_match()/preg_match_all() return false on a PCRE
        // error, which does not satisfy the declared int return type.
        if (true === $global) {
            return preg_match_all(
                $pattern,
                $this->_string,
                $matches,
                $flags,
                $offset
            );
        }

        return preg_match($pattern, $this->_string, $matches, $flags, $offset);
    }

    /**
     * Perform a regular expression (PCRE) search and replace.
     *
     * When $replacement is callable, preg_replace_callback() is used,
     * otherwise preg_replace(). The current string is modified in place and
     * the current object is returned.
     *
     * NOTE(review): is_callable() is also true for a string that happens to be
     * a function name (e.g. 'trim'); such a replacement is run as a callback,
     * not inserted literally.
     */
    public function replace($pattern, $replacement, int $limit = -1): self
    {
        $pattern = static::safePattern($pattern);

        if (false === is_callable($replacement)) {
            $this->_string = preg_replace(
                $pattern,
                $replacement,
                $this->_string,
                $limit
            );
        } else {
            $this->_string = preg_replace_callback(
                $pattern,
                $replacement,
                $this->_string,
                $limit
            );
        }

        return $this;
    }

    /**
     * Split the current string according to a given pattern (PCRE).
     * The pattern goes through self::safePattern() first, so it is always
     * applied with the “u” option.
     */
    public function split(
        string $pattern,
        int $limit = -1,
        int $flags = self::WITHOUT_EMPTY
    ): array {
        return preg_split(
            static::safePattern($pattern),
            $this->_string,
            $limit,
            $flags
        );
    }

    /**
     * Iterator over chars.
     * The string is split on every empty position that is neither the very
     * beginning nor the very end, i.e. between each pair of characters.
     */
    public function getIterator(): ArrayIterator
    {
        return new ArrayIterator(preg_split('#(?<!^)(?!$)#u', $this->_string));
    }

    /**
     * Perform a lowercase folding on the current string.
*/
    public function toLowerCase(): self
    {
        $this->_string = mb_strtolower($this->_string);

        return $this;
    }

    /**
     * Perform an uppercase folding on the current string.
     *
     * @return \Hoa\Ustring
     */
    public function toUpperCase(): \Hoa\Ustring
    {
        $this->_string = mb_strtoupper($this->_string);

        return $this;
    }

    /**
     * Transform a UTF-8 string into an ASCII one.
     * First, try with a transliterator. If not available, will fallback to a
     * normalizer. If not available, will try something homemade (only when
     * $try is true, an exception is thrown otherwise).
     */
    public function toAscii(bool $try = false): self
    {
        // No byte above 0x7f: the string is already ASCII.
        if (0 === preg_match('#[\x80-\xff]#', $this->_string)) {
            return $this;
        }

        $string  = $this->_string;
        $transId =
            'Any-Latin; ' .
            '[\p{S}] Name; ' .
            'Latin-ASCII';

        if (null !== $transliterator = static::getTransliterator($transId)) {
            // Symbols come back as “\N{SOME NAME}”: rewrite them as
            // “(some name)”.
            $this->_string = preg_replace_callback(
                '#\\\N\{([A-Z ]+)\}#u',
                function (array $matches) {
                    return '(' . strtolower($matches[1]) . ')';
                },
                $transliterator->transliterate($string)
            );

            return $this;
        }

        if (false === class_exists('Normalizer')) {
            if (false === $try) {
                throw new Exception(
                    '%s needs the class Normalizer to work properly, ' .
                    'or you can force a try by using %1$s(true).',
                    0,
                    __METHOD__
                );
            }

            $string        = static::transcode($string, 'UTF-8', 'ASCII//IGNORE//TRANSLIT');
            $this->_string = preg_replace('#(?:[\'"`^](\w))#u', '\1', $string);

            return $this;
        }

        // Decompose, drop the non-spacing marks, then transcode to ASCII.
        $string        = \Normalizer::normalize($string, \Normalizer::NFKD);
        $string        = preg_replace('#\p{Mn}+#u', '', $string);
        $this->_string = static::transcode($string, 'UTF-8', 'ASCII//IGNORE//TRANSLIT');

        return $this;
    }

    /**
     * Transliterate the string into another.
     * See self::getTransliterator for more information.
     * A null $end means “until the end of the string”.
     */
    public function transliterate(string $identifier, int $start = 0, ?int $end = null): self
    {
        if (null === $transliterator = static::getTransliterator($identifier)) {
            throw new Exception(
                '%s needs the class Transliterator to work properly.',
                1,
                __METHOD__
            );
        }

        // Transliterator::transliterate() uses -1, not null, as its “until
        // the end” sentinel.
        $this->_string = $transliterator->transliterate($this->_string, $start, $end ?? -1);

        return $this;
    }

    /**
     * Get transliterator.
     * See http://userguide.icu-project.org/transforms/general for $identifier.
     * Return null when the Transliterator class is not available.
     */
    public static function getTransliterator(string $identifier): ?Transliterator
    {
        if (false === class_exists('Transliterator')) {
            return null;
        }

        return Transliterator::create($identifier);
    }

    /**
     * Strip characters (default \s) of the current string.
     * $regex is a PCRE fragment; $side is a combination of self::BEGINNING
     * and self::END.
     */
    public function trim(string $regex = '\s', int $side = self::BEGINNING | self::END): self
    {
        $regex  = '(?:' . $regex . ')+';
        $handle = null;

        if (0 !== ($side & static::BEGINNING)) {
            $handle .= '(^' . $regex . ')';
        }

        if (0 !== ($side & static::END)) {
            if (null !== $handle) {
                $handle .= '|';
            }

            $handle .= '(' . $regex . '$)';
        }

        $this->_string    = preg_replace('#' . $handle . '#u', '', $this->_string);
        // The first character may have changed: reset the cached direction.
        $this->_direction = null;

        return $this;
    }

    /**
     * Compute offset (negative, unbound etc.).
     * Offsets wrap around the string length: negative offsets count from the
     * end, offsets past the end start over from the beginning. For an empty
     * string, 0 is returned.
     */
    protected function computeOffset(int $offset): int
    {
        $length = mb_strlen($this->_string);

        // Nothing to wrap around: “% 0” would raise a DivisionByZeroError.
        if (0 === $length) {
            return 0;
        }

        if (0 > $offset) {
            $offset = -$offset % $length;

            if (0 !== $offset) {
                $offset = $length - $offset;
            }
        } elseif ($offset >= $length) {
            $offset %= $length;
        }

        return $offset;
    }

    /**
     * Get a specific chars of the current string.
     */
    public function offsetGet($offset): string
    {
        return mb_substr($this->_string, $this->computeOffset($offset), 1);
    }

    /**
     * Set a specific character of the current string.
     * The character at the (computed) offset is replaced by $value.
     */
    public function offsetSet($offset, $value): self
    {
        $head   = null;
        $offset = $this->computeOffset($offset);

        if (0 < $offset) {
            $head = mb_substr($this->_string, 0, $offset);
        }

        $tail             = mb_substr($this->_string, $offset + 1);
        $this->_string    = $head . $value . $tail;
        $this->_direction = null;

        return $this;
    }

    /**
     * Delete a specific character of the current string.
     */
    public function offsetUnset($offset): void
    {
        // Replacing the character by null removes it from the string.
        $this->offsetSet($offset, null);
    }

    /**
     * Check if a specific offset exists.
     * Always true: offsets wrap around (see self::computeOffset).
     */
    public function offsetExists($offset): bool
    {
        return true;
    }

    /**
     * Reduce the strings.
     * Keep $length characters starting at $start (same semantics as
     * mb_substr).
     */
    public function reduce(int $start, ?int $length = null): self
    {
        $this->_string = mb_substr($this->_string, $start, $length);

        return $this;
    }

    /**
     * Count number of characters of the current string.
*/
    public function count(): int
    {
        return mb_strlen($this->_string);
    }

    /**
     * Get byte (not character) at a specific offset.
     * Offsets wrap around the byte length: negative offsets count from the
     * end, offsets past the end start over from the beginning. An empty
     * string has no byte, an empty string is returned in this case.
     */
    public function getByteAt(int $offset): string
    {
        $length = strlen($this->_string);

        // Nothing to wrap around: “% 0” would raise a DivisionByZeroError.
        if (0 === $length) {
            return '';
        }

        if (0 > $offset) {
            $offset = -$offset % $length;

            if (0 !== $offset) {
                $offset = $length - $offset;
            }
        } elseif ($offset >= $length) {
            $offset %= $length;
        }

        return $this->_string[$offset];
    }

    /**
     * Count number of bytes (not characters) of the current string.
     */
    public function getBytesLength(): int
    {
        return strlen($this->_string);
    }

    /**
     * Get the width of the current string.
     * Useful when printing the string in monotype (some characters need more
     * than one column to be printed).
     */
    public function getWidth(): int
    {
        return mb_strwidth($this->_string);
    }

    /**
     * Get direction of the current string.
     * Please, see the self::LTR and self::RTL constants.
     * It does not yet support embedding directions.
     * The direction is the one of the first character; it is computed once
     * and cached. A null or empty string is left-to-right.
     */
    public function getDirection(): int
    {
        if (null === $this->_direction) {
            if (null === $this->_string || '' === $this->_string) {
                // No first character to inspect.
                $this->_direction = static::LTR;
            } else {
                $this->_direction = static::getCharDirection(
                    mb_substr($this->_string, 0, 1)
                );
            }
        }

        return $this->_direction;
    }

    /**
     * Get direction of a specific character.
     * Please, see the self::LTR and self::RTL constants.
*/
    public static function getCharDirection(string $char): int
    {
        $c = static::toCode($char);

        // Every right-to-left code point listed below lies in this range.
        if (!(0x5be <= $c && 0x10b7f >= $c)) {
            return static::LTR;
        }

        if (0x85e >= $c) {
            if (0x5be === $c ||
                0x5c0 === $c ||
                0x5c3 === $c ||
                0x5c6 === $c ||
                (0x5d0 <= $c && 0x5ea >= $c) ||
                (0x5f0 <= $c && 0x5f4 >= $c) ||
                0x608 === $c ||
                0x60b === $c ||
                0x60d === $c ||
                0x61b === $c ||
                (0x61e <= $c && 0x64a >= $c) ||
                (0x66d <= $c && 0x66f >= $c) ||
                (0x671 <= $c && 0x6d5 >= $c) ||
                (0x6e5 <= $c && 0x6e6 >= $c) ||
                (0x6ee <= $c && 0x6ef >= $c) ||
                (0x6fa <= $c && 0x70d >= $c) ||
                0x710 === $c ||
                (0x712 <= $c && 0x72f >= $c) ||
                (0x74d <= $c && 0x7a5 >= $c) ||
                0x7b1 === $c ||
                (0x7c0 <= $c && 0x7ea >= $c) ||
                (0x7f4 <= $c && 0x7f5 >= $c) ||
                0x7fa === $c ||
                (0x800 <= $c && 0x815 >= $c) ||
                0x81a === $c ||
                0x824 === $c ||
                0x828 === $c ||
                (0x830 <= $c && 0x83e >= $c) ||
                (0x840 <= $c && 0x858 >= $c) ||
                0x85e === $c) {
                return static::RTL;
            }
        } elseif (0x200f === $c) {
            return static::RTL;
        } elseif (0xfb1d <= $c) {
            if (0xfb1d === $c ||
                (0xfb1f <= $c && 0xfb28 >= $c) ||
                (0xfb2a <= $c && 0xfb36 >= $c) ||
                (0xfb38 <= $c && 0xfb3c >= $c) ||
                0xfb3e === $c ||
                (0xfb40 <= $c && 0xfb41 >= $c) ||
                (0xfb43 <= $c && 0xfb44 >= $c) ||
                (0xfb46 <= $c && 0xfbc1 >= $c) ||
                (0xfbd3 <= $c && 0xfd3d >= $c) ||
                (0xfd50 <= $c && 0xfd8f >= $c) ||
                (0xfd92 <= $c && 0xfdc7 >= $c) ||
                (0xfdf0 <= $c && 0xfdfc >= $c) ||
                (0xfe70 <= $c && 0xfe74 >= $c) ||
                (0xfe76 <= $c && 0xfefc >= $c) ||
                (0x10800 <= $c && 0x10805 >= $c) ||
                0x10808 === $c ||
                (0x1080a <= $c && 0x10835 >= $c) ||
                (0x10837 <= $c && 0x10838 >= $c) ||
                0x1083c === $c ||
                (0x1083f <= $c && 0x10855 >= $c) ||
                (0x10857 <= $c && 0x1085f >= $c) ||
                (0x10900 <= $c && 0x1091b >= $c) ||
                (0x10920 <= $c && 0x10939 >= $c) ||
                0x1093f === $c ||
                0x10a00 === $c ||
                (0x10a10 <= $c && 0x10a13 >= $c) ||
                (0x10a15 <= $c && 0x10a17 >= $c) ||
                (0x10a19 <= $c && 0x10a33 >= $c) ||
                (0x10a40 <= $c && 0x10a47 >= $c) ||
                (0x10a50 <= $c && 0x10a58 >= $c) ||
                (0x10a60 <= $c && 0x10a7f >= $c) ||
                (0x10b00 <= $c && 0x10b35 >= $c) ||
                (0x10b40 <= $c && 0x10b55 >= $c) ||
                (0x10b58 <= $c && 0x10b72 >= $c) ||
                (0x10b78 <= $c && 0x10b7f >= $c)) {
                return static::RTL;
            }
        }

        return static::LTR;
    }

    /**
     * Get the number of column positions of a wide-character.
     *
     * This is a PHP implementation of wcwidth() and wcswidth() (defined in IEEE
     * Std 1002.1-2001) for Unicode, by Markus Kuhn. Please, see
     * http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c.
     *
     * The wcwidth(wc) function shall either return 0 (if wc is a null
     * wide-character code), or return the number of column positions to be
     * occupied by the wide-character code wc, or return -1 (if wc does not
     * correspond to a printable wide-character code).
     */
    public static function getCharWidth(string $char): int
    {
        $c = static::toCode($char);

        // Test for 8-bit control characters.
        if (0x0 === $c) {
            return 0;
        }

        if (0x20 > $c || (0x7f <= $c && $c < 0xa0)) {
            return -1;
        }

        // Non-spacing characters.
        if (0xad !== $c &&
            0    !== preg_match('#^[\p{Mn}\p{Me}\p{Cf}\x{1160}-\x{11ff}\x{200b}]#u', $char)) {
            return 0;
        }

        // If we arrive here, $c is not a combining C0/C1 control character.
        $isWide =
            0x1100 <= $c &&
            (0x115f >= $c || // Hangul Jamo init. consonants
             0x2329 === $c || 0x232a === $c ||
             (0x2e80 <= $c && 0xa4cf >= $c &&
              0x303f !== $c) || // CJK…Yi
             (0xac00  <= $c && 0xd7a3 >= $c) || // Hangul Syllables
             (0xf900  <= $c && 0xfaff >= $c) || // CJK Compatibility Ideographs
             (0xfe10  <= $c && 0xfe19 >= $c) || // Vertical forms
             (0xfe30  <= $c && 0xfe6f >= $c) || // CJK Compatibility Forms
             (0xff00  <= $c && 0xff60 >= $c) || // Fullwidth Forms
             (0xffe0  <= $c && 0xffe6 >= $c) ||
             (0x20000 <= $c && 0x2fffd >= $c) ||
             (0x30000 <= $c && 0x3fffd >= $c));

        // Wide characters take two columns, all the others one.
        return true === $isWide ? 2 : 1;
    }

    /**
     * Check whether the character is printable or not.
     * A character is printable when its width is at least one column.
     */
    public static function isCharPrintable(string $char): bool
    {
        return 1 <= static::getCharWidth($char);
    }

    /**
     * Get a UTF-8 character from its decimal code representation.
     *
     * NOTE(review): the “HTML-ENTITIES” encoding of ext/mbstring is
     * deprecated as of PHP 8.2; consider another conversion when the minimum
     * supported PHP version allows it.
     */
    public static function fromCode(int $code): string
    {
        return mb_convert_encoding(
            '&#x' . dechex($code) . ';',
            'UTF-8',
            'HTML-ENTITIES'
        );
    }

    /**
     * Get a decimal code representation of a specific character.
     * Only the first character of $char is decoded. An empty string gives 0,
     * and the missing bytes of a truncated sequence count as zero bits.
     */
    public static function toCode(string $char): int
    {
        // There is no first byte to read.
        if ('' === $char) {
            return 0;
        }

        $code  = ord($char[0]);
        $bytes = 1;

        if (!($code & 0x80)) { // 0xxxxxxx
            return $code;
        }

        // The leading byte gives the length of the sequence; keep its
        // payload bits only.
        if (($code & 0xe0) === 0xc0) { // 110xxxxx
            $bytes = 2;
            $code  = $code & ~0xc0;
        } elseif (($code & 0xf0) === 0xe0) { // 1110xxxx
            $bytes = 3;
            $code  = $code & ~0xe0;
        } elseif (($code & 0xf8) === 0xf0) { // 11110xxx
            $bytes = 4;
            $code  = $code & ~0xf0;
        }

        for ($i = 2; $i <= $bytes; $i++) { // 10xxxxxx
            // Do not read past the end of a truncated sequence.
            $code = ($code << 6) + (ord($char[$i - 1] ?? "\0") & ~0x80);
        }

        return $code;
    }

    /**
     * Get a binary representation of a specific character.
*/
    public static function toBinaryCode(string $char): string
    {
        $out = '';

        for ($i = 0, $max = strlen($char); $i < $max; ++$i) {
            // One group of 8 bits per byte. sprintf() is used because
            // vsprintf() requires an array of values, not a single integer.
            $out .= sprintf('%08b', ord($char[$i]));
        }

        return $out;
    }

    /**
     * Transcode.
     * Convert $string from the $from encoding to the $to encoding with iconv;
     * an exception is thrown when ext/iconv is not available.
     *
     * NOTE(review): iconv() returns false on failure, which does not satisfy
     * the declared string return type.
     */
    public static function transcode(string $string, string $from, string $to = 'UTF-8'): string
    {
        if (false === static::checkIconv()) {
            throw new Exception(
                '%s needs the iconv extension.',
                2,
                __CLASS__
            );
        }

        return iconv($from, $to, $string);
    }

    /**
     * Check if a string is encoded in UTF-8.
     * An empty pattern with the “u” option only matches a valid UTF-8 subject.
     */
    public static function isUtf8(string $string): bool
    {
        return (bool) preg_match('##u', $string);
    }

    /**
     * Copy current object string
     */
    public function copy(): self
    {
        return clone $this;
    }

    /**
     * Transform the object as a string.
     */
    public function __toString(): string
    {
        return $this->_string;
    }
}

/**
 * Flex entity.
 */
Consistency::flexEntity(Ustring::class);

// The library cannot work without ext/mbstring: fail as soon as the file is
// loaded.
if (false === Ustring::checkMbString()) {
    throw new Exception(
        '%s needs the mbstring extension.',
        0,
        __NAMESPACE__ .
'\Ustring' 892 | ); 893 | } 894 | -------------------------------------------------------------------------------- /Test/Unit/Issue.php: -------------------------------------------------------------------------------- 1 | when($result = LUT::toCode(chr(160))) 53 | ->then 54 | ->integer($result) 55 | ->isEqualTo(0xa0); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /Test/Unit/Search.php: -------------------------------------------------------------------------------- 1 | given( 53 | $x = 'GATAA', 54 | $y = 'CAGATAAGAGAA', 55 | $k = 1 56 | ) 57 | ->when($result = LUT\Search::approximated($y, $x, $k)) 58 | ->then 59 | ->array($result) 60 | ->isEqualTo([ 61 | 0 => [ 62 | 'i' => 1, 63 | 'j' => 6, 64 | 'l' => 5 65 | ], 66 | 1 => [ 67 | 'i' => 2, 68 | 'j' => 7, 69 | 'l' => 5 70 | ], 71 | 2 => [ 72 | 'i' => 3, 73 | 'j' => 8, 74 | 'l' => 5 75 | ], 76 | 3 => [ 77 | 'i' => 7, 78 | 'j' => 12, 79 | 'l' => 5 80 | ] 81 | ]); 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /Test/Unit/Ustring.php: -------------------------------------------------------------------------------- 1 | given($this->function->function_exists = true) 53 | ->then 54 | ->boolean(LUT::checkMbString()) 55 | ->isTrue(); 56 | } 57 | 58 | public function case_append_ltr(): void 59 | { 60 | $this 61 | ->given($string = new LUT('je')) 62 | ->when($result = $string->append(' t\'aime')) 63 | ->then 64 | ->object($result) 65 | ->isIdenticalTo($string) 66 | ->string((string) $result) 67 | ->isEqualTo('je t\'aime'); 68 | } 69 | 70 | public function case_append_rtl(): void 71 | { 72 | $this 73 | ->given($string = new LUT('أ')) 74 | ->when($result = $string->append('حبك')) 75 | ->then 76 | ->object($result) 77 | ->isIdenticalTo($string) 78 | ->string((string) $result) 79 | ->isEqualTo('أحبك'); 80 | } 81 | 82 | public function case_prepend_ltr(): void 83 | { 84 | $this 85 | ->given($string = new LUT(' t\'aime')) 86 | 
->when($result = $string->prepend('je')) 87 | ->then 88 | ->object($result) 89 | ->isIdenticalTo($string) 90 | ->string((string) $result) 91 | ->isEqualTo('je t\'aime'); 92 | } 93 | 94 | public function case_prepend_rtl(): void 95 | { 96 | $this 97 | ->given($string = new LUT('ك')) 98 | ->when($result = $string->prepend('أحب')) 99 | ->then 100 | ->object($result) 101 | ->isIdenticalTo($string) 102 | ->string((string) $result) 103 | ->isEqualTo('أحبك'); 104 | } 105 | 106 | public function case_pad_beginning_ltr(): void 107 | { 108 | $this 109 | ->given($string = new LUT('je t\'aime')) 110 | ->when($result = $string->pad(20, '👍 💩 😄 ❤️ ', LUT::BEGINNING)) 111 | ->then 112 | ->object($result) 113 | ->isIdenticalTo($string) 114 | ->string((string) $result) 115 | ->isEqualTo('👍 💩 😄 ❤️ 👍 je t\'aime'); 116 | } 117 | 118 | public function case_pad_beginning_rtl(): void 119 | { 120 | $this 121 | ->given($string = new LUT('أحبك')) 122 | ->when($result = $string->pad(20, '👍 💩 😄 ❤️ ', LUT::BEGINNING)) 123 | ->then 124 | ->object($result) 125 | ->isIdenticalTo($string) 126 | ->string((string) $result) 127 | ->isEqualTo('👍 💩 😄 ❤️ 👍 💩 😄 ❤أحبك'); 128 | } 129 | 130 | public function case_pad_end_ltr(): void 131 | { 132 | $this 133 | ->given($string = new LUT('je t\'aime')) 134 | ->when($result = $string->pad(20, '👍 💩 😄 ❤️ ', LUT::END)) 135 | ->then 136 | ->object($result) 137 | ->isIdenticalTo($string) 138 | ->string((string) $result) 139 | ->isEqualTo('je t\'aime👍 💩 😄 ❤️ 👍 '); 140 | } 141 | 142 | public function case_pad_end_rtl(): void 143 | { 144 | $this 145 | ->given($string = new LUT('أحبك')) 146 | ->when($result = $string->pad(20, '👍 💩 😄 ❤️ ', LUT::END)) 147 | ->then 148 | ->object($result) 149 | ->isIdenticalTo($string) 150 | ->string((string) $result) 151 | ->isEqualTo('أحبك👍 💩 😄 ❤️ 👍 💩 😄 ❤'); 152 | } 153 | 154 | public function case_compare_no_collator(): void 155 | { 156 | $this 157 | ->given( 158 | $this->function->class_exists = function ($name) { 159 | return 'Collator' 
!== $name; 160 | }, 161 | $string = new LUT('b') 162 | ) 163 | ->case_compare(); 164 | } 165 | 166 | public function case_compare(): void 167 | { 168 | $this 169 | ->given($string = new LUT('b')) 170 | ->when($result = $string->compare('a')) 171 | ->then 172 | ->integer($result) 173 | ->isEqualTo(1) 174 | 175 | ->when($result = $string->compare('b')) 176 | ->then 177 | ->integer($result) 178 | ->isEqualTo(0) 179 | 180 | ->when($result = $string->compare('c')) 181 | ->then 182 | ->integer($result) 183 | ->isEqualTo(-1); 184 | } 185 | 186 | public function case_collator(): void 187 | { 188 | $this 189 | ->given( 190 | $this->function->setlocale = 'fr_FR', 191 | $collator = LUT::getCollator() 192 | ) 193 | ->when($result = $collator->getLocale(\Locale::VALID_LOCALE)) 194 | ->then 195 | ->string($result) 196 | ->isEqualTo('fr'); 197 | } 198 | 199 | public function case_safe_unsafe_pattern(): void 200 | { 201 | $this 202 | ->given($pattern = '/foo/i') 203 | ->when($result = LUT::safePattern($pattern)) 204 | ->then 205 | ->string($result) 206 | ->isEqualto('/foo/iu'); 207 | } 208 | 209 | public function case_safe_safe_pattern(): void 210 | { 211 | $this 212 | ->given($pattern = '/foo/ui') 213 | ->when($result = LUT::safePattern($pattern)) 214 | ->then 215 | ->string($result) 216 | ->isEqualto('/foo/ui'); 217 | } 218 | 219 | public function case_match_default(): void 220 | { 221 | $this 222 | ->given( 223 | $pattern = '/💩/u', 224 | $string = new LUT('foo 💩 bar') 225 | ) 226 | ->when($result = $string->match($pattern, $matches)) 227 | ->then 228 | ->integer($result) 229 | ->isEqualTo(1) 230 | ->array($matches) 231 | ->isEqualTo([ 232 | 0 => '💩' 233 | ]); 234 | } 235 | 236 | public function case_match_offset(): void 237 | { 238 | $this 239 | ->given( 240 | $pattern = '/💩/u', 241 | $string = new LUT('foo 💩 bar') 242 | ) 243 | ->when($result = $string->match($pattern, $matches, 0, 0)) 244 | ->then 245 | ->integer($result) 246 | ->isEqualTo(1) 247 | ->array($matches) 248 | 
->isEqualTo([0 => '💩']) 249 | 250 | ->when($result = $string->match($pattern, $matches, 0, 4)) 251 | ->then 252 | ->integer($result) 253 | ->isEqualTo(1) 254 | ->array($matches) 255 | ->isEqualTo([0 => '💩']) 256 | 257 | ->when($result = $string->match($pattern, $matches, 0, 5)) 258 | ->then 259 | ->integer($result) 260 | ->isEqualTo(0) 261 | ->array($matches) 262 | ->isEmpty(); 263 | } 264 | 265 | public function case_match_with_offset(): void 266 | { 267 | $this 268 | ->given( 269 | $pattern = '/💩/u', 270 | $string = new LUT('foo 💩 bar') 271 | ) 272 | ->when($result = $string->match($pattern, $matches, $string::WITH_OFFSET)) 273 | ->then 274 | ->integer($result) 275 | ->isEqualTo(1) 276 | ->array($matches) 277 | ->isEqualTo([ 278 | 0 => [ 279 | 0 => '💩', 280 | 1 => 4 281 | ] 282 | ]); 283 | } 284 | 285 | public function case_match_all_default(): void 286 | { 287 | $this 288 | ->given( 289 | $pattern = '/💩/u', 290 | $string = new LUT('foo 💩 bar 💩 baz') 291 | ) 292 | ->when($result = $string->match($pattern, $matches, 0, 0, true)) 293 | ->then 294 | ->integer($result) 295 | ->isEqualTo(2) 296 | ->array($matches) 297 | ->isEqualTo([ 298 | 0 => [ 299 | 0 => '💩', 300 | 1 => '💩' 301 | ] 302 | ]); 303 | } 304 | 305 | public function case_match_all_with_offset(): void 306 | { 307 | $this 308 | ->given( 309 | $pattern = '/💩/u', 310 | $string = new LUT('foo 💩 bar 💩 baz') 311 | ) 312 | ->when($result = $string->match($pattern, $matches, $string::WITH_OFFSET, 0, true)) 313 | ->then 314 | ->integer($result) 315 | ->isEqualTo(2) 316 | ->array($matches) 317 | ->isEqualTo([ 318 | 0 => [ 319 | 0 => [ 320 | 0 => '💩', 321 | 1 => 4 322 | ], 323 | 1 => [ 324 | 0 => '💩', 325 | 1 => 13 326 | ] 327 | ] 328 | ]); 329 | } 330 | 331 | public function case_match_all_grouped_by_pattern(): void 332 | { 333 | $this 334 | ->given( 335 | $pattern = '/(💩)/u', 336 | $string = new LUT('foo 💩 bar 💩 baz') 337 | ) 338 | ->when($result = $string->match($pattern, $matches, $string::GROUP_BY_PATTERN, 0, 
true)) 339 | ->then 340 | ->integer($result) 341 | ->isEqualTo(2) 342 | ->array($matches) 343 | ->isEqualTo([ 344 | 0 => [ 345 | 0 => '💩', 346 | 1 => '💩' 347 | ], 348 | 1 => [ 349 | 0 => '💩', 350 | 1 => '💩' 351 | ] 352 | ]); 353 | } 354 | 355 | public function case_match_all_grouped_by_tuple(): void 356 | { 357 | $this 358 | ->given( 359 | $pattern = '/(💩)/u', 360 | $string = new LUT('foo 💩 bar 💩 baz') 361 | ) 362 | ->when($result = $string->match($pattern, $matches, $string::GROUP_BY_TUPLE, 0, true)) 363 | ->then 364 | ->integer($result) 365 | ->isEqualTo(2) 366 | ->array($matches) 367 | ->isEqualTo([ 368 | 0 => [ 369 | 0 => '💩', 370 | 1 => '💩' 371 | ], 372 | 1 => [ 373 | 0 => '💩', 374 | 1 => '💩' 375 | ] 376 | ]); 377 | } 378 | 379 | public function case_replace(): void 380 | { 381 | $this 382 | ->given($string = new LUT('❤️ 💩 💩')) 383 | ->when($result = $string->replace('/💩/u', '😄')) 384 | ->then 385 | ->object($result) 386 | ->isIdenticalTo($string) 387 | ->string((string) $result) 388 | ->isEqualTo('❤️ 😄 😄'); 389 | } 390 | 391 | public function case_replace_limited(): void 392 | { 393 | $this 394 | ->given($string = new LUT('❤️ 💩 💩')) 395 | ->when($result = $string->replace('/💩/u', '😄', 1)) 396 | ->then 397 | ->object($result) 398 | ->isIdenticalTo($string) 399 | ->string((string) $result) 400 | ->isEqualTo('❤️ 😄 💩'); 401 | } 402 | 403 | public function case_split_default(): void 404 | { 405 | $this 406 | ->given($string = new LUT('❤️💩❤️💩❤️')) 407 | ->when($result = $string->split('/💩/')) 408 | ->then 409 | ->array($result) 410 | ->isEqualTo([ 411 | 0 => '❤️', 412 | 1 => '❤️', 413 | 2 => '❤️' 414 | ]); 415 | } 416 | 417 | public function case_split_default_limited(): void 418 | { 419 | $this 420 | ->given($string = new LUT('❤️💩❤️💩❤️')) 421 | ->when($result = $string->split('/💩/', 1)) 422 | ->then 423 | ->array($result) 424 | ->isEqualTo([ 425 | 0 => '❤️💩❤️💩❤️' 426 | ]); 427 | } 428 | 429 | public function case_split_with_delimiters(): void 430 | { 431 | $this 
432 | ->given($string = new LUT('❤️💩❤️💩❤️')) 433 | ->when($result = $string->split('/💩/', -1, $string::WITH_DELIMITERS)) 434 | ->then 435 | ->array($result) 436 | ->isEqualTo([ 437 | 0 => '❤️', 438 | 1 => '❤️', 439 | 2 => '❤️' 440 | ]); 441 | } 442 | 443 | public function case_split_with_offset(): void 444 | { 445 | $this 446 | ->given($string = new LUT('❤️💩❤️💩❤️')) 447 | ->when($result = $string->split('/💩/', -1, $string::WITH_OFFSET)) 448 | ->then 449 | ->array($result) 450 | ->isEqualTo([ 451 | 0 => [ 452 | 0 => '❤️', 453 | 1 => 0 454 | ], 455 | 1 => [ 456 | 0 => '❤️', 457 | 1 => 10 458 | ], 459 | 2 => [ 460 | 0 => '❤️', 461 | 1 => 20 462 | ] 463 | ]); 464 | } 465 | 466 | public function case_iterator_ltr(): void 467 | { 468 | $this 469 | ->given($string = new LUT('je t\'aime')) 470 | ->when($result = iterator_to_array($string)) 471 | ->then 472 | ->array($result) 473 | ->isEqualTo([ 474 | 'j', 475 | 'e', 476 | ' ', 477 | 't', 478 | '\'', 479 | 'a', 480 | 'i', 481 | 'm', 482 | 'e' 483 | ]); 484 | } 485 | 486 | public function case_iterator_rtl(): void 487 | { 488 | $this 489 | ->given($string = new LUT('أحبك')) 490 | ->when($result = iterator_to_array($string)) 491 | ->then 492 | ->array($result) 493 | ->isEqualTo([ 494 | 'أ', 495 | 'ح', 496 | 'ب', 497 | 'ك' 498 | ]); 499 | } 500 | 501 | public function case_to_lower(): void 502 | { 503 | $this 504 | ->given($string = new LUT('Σ \'ΑΓΑΠΏ')) 505 | ->when($result = $string->toLowerCase()) 506 | ->then 507 | ->object($result) 508 | ->isIdenticalTo($string) 509 | ->string((string) $result) 510 | ->isEqualTo('σ \'αγαπώ') 511 | 512 | ->given($string = new LUT('JE T\'AIME')) 513 | ->when($result = $string->toLowerCase()) 514 | ->then 515 | ->object($result) 516 | ->isIdenticalTo($string) 517 | ->string((string) $result) 518 | ->isEqualTo('je t\'aime'); 519 | } 520 | 521 | public function case_to_upper(): void 522 | { 523 | $this 524 | ->given($string = new LUT('σ \'αγαπώ')) 525 | ->when($result = $string->toUpperCase()) 
526 | ->then 527 | ->object($result) 528 | ->isIdenticalTo($string) 529 | ->string((string) $result) 530 | ->isEqualTo('Σ \'ΑΓΑΠΏ') 531 | 532 | ->given($string = new LUT('je t\'aime')) 533 | ->when($result = $string->toUpperCase()) 534 | ->then 535 | ->object($result) 536 | ->isIdenticalTo($string) 537 | ->string((string) $result) 538 | ->isEqualTo('JE T\'AIME'); 539 | } 540 | 541 | public function case_trim_default(): void 542 | { 543 | $this 544 | ->given($string = new LUT('💩💩❤️💩💩')) 545 | ->when($result = $string->trim('💩')) 546 | ->then 547 | ->object($result) 548 | ->isIdenticalTo($string) 549 | ->string((string) $result) 550 | ->isEqualTo('❤️'); 551 | } 552 | 553 | public function case_trim_beginning(): void 554 | { 555 | $this 556 | ->given($string = new LUT('💩💩❤️💩💩')) 557 | ->when($result = $string->trim('💩', $string::BEGINNING)) 558 | ->then 559 | ->object($result) 560 | ->isIdenticalTo($string) 561 | ->string((string) $result) 562 | ->isEqualTo('❤️💩💩'); 563 | } 564 | 565 | public function case_trim_end(): void 566 | { 567 | $this 568 | ->given($string = new LUT('💩💩❤️💩💩')) 569 | ->when($result = $string->trim('💩', $string::END)) 570 | ->then 571 | ->object($result) 572 | ->isIdenticalTo($string) 573 | ->string((string) $result) 574 | ->isEqualTo('💩💩❤️'); 575 | } 576 | 577 | public function case_offset_get_ltr(): void 578 | { 579 | $this 580 | ->given($string = new LUT('je t\'aime')) 581 | ->when($result = $string[0]) 582 | ->then 583 | ->string($result) 584 | ->isEqualTo('j') 585 | 586 | ->when($result = $string[-1]) 587 | ->then 588 | ->string($result) 589 | ->isEqualTo('e'); 590 | } 591 | 592 | public function case_offset_get_rtl(): void 593 | { 594 | $this 595 | ->given($string = new LUT('أحبك')) 596 | ->when($result = $string[0]) 597 | ->then 598 | ->string($result) 599 | ->isEqualTo('أ') 600 | 601 | ->when($result = $string[-1]) 602 | ->then 603 | ->string($result) 604 | ->isEqualTo('ك'); 605 | } 606 | 607 | public function case_offset_set(): void 608 
| { 609 | $this 610 | ->given($string = new LUT('أحبﻙ')) 611 | ->when($string[-1] = 'ك') 612 | ->then 613 | ->string((string) $string) 614 | ->isEqualTo('أحبك'); 615 | } 616 | 617 | public function case_offset_unset(): void 618 | { 619 | $this 620 | ->given($string = new LUT('أحبك😄')) 621 | ->when(function () use ($string): void { 622 | unset($string[-1]); 623 | }) 624 | ->then 625 | ->string((string) $string) 626 | ->isEqualTo('أحبك'); 627 | } 628 | 629 | public function case_reduce(): void 630 | { 631 | $this 632 | ->given($string = new LUT('أحبك')) 633 | ->when($result = $string->reduce(0, 1)) 634 | ->then 635 | ->object($result) 636 | ->isIdenticalTo($string) 637 | ->string((string) $result) 638 | ->isEqualTo('أ'); 639 | } 640 | 641 | public function case_count(): void 642 | { 643 | $this 644 | ->given($string = new LUT('je t\'aime')) 645 | ->when($result = count($string)) 646 | ->then 647 | ->integer($result) 648 | ->isEqualTo(9) 649 | 650 | ->given($string = new LUT('أحبك')) 651 | ->when($result = count($string)) 652 | ->then 653 | ->integer($result) 654 | ->isEqualTo(4) 655 | 656 | ->given($string = new LUT('💩')) 657 | ->when($result = count($string)) 658 | ->then 659 | ->integer($result) 660 | ->isEqualTo(1); 661 | } 662 | 663 | public function case_byte_at(): void 664 | { 665 | $this 666 | ->given($string = new LUT('💩')) 667 | ->when($result = $string->getByteAt(0)) 668 | ->then 669 | ->integer(ord($result)) 670 | ->isEqualTo(0xf0) 671 | 672 | ->when($result = $string->getByteAt(1)) 673 | ->then 674 | ->integer(ord($result)) 675 | ->isEqualTo(0x9f) 676 | 677 | ->when($result = $string->getByteAt(2)) 678 | ->then 679 | ->integer(ord($result)) 680 | ->isEqualTo(0x92) 681 | 682 | ->when($result = $string->getByteAt(3)) 683 | ->then 684 | ->integer(ord($result)) 685 | ->isEqualTo(0xa9) 686 | 687 | ->when($result = $string->getByteAt(-1)) 688 | ->then 689 | ->integer(ord($result)) 690 | ->isEqualTo(0xa9); 691 | } 692 | 693 | public function case_bytes_length(): 
void 694 | { 695 | $this 696 | ->given($string = new LUT('💩')) 697 | ->when($result = $string->getBytesLength()) 698 | ->then 699 | ->integer($result) 700 | ->isEqualTo(4); 701 | } 702 | 703 | /** getWidth() is expected to return 1 for the emoji and 2 for the ideograph used here. */ public function case_get_width(): void 704 | { 705 | $this 706 | ->given($string = new LUT('💩')) 707 | ->when($result = $string->getWidth()) 708 | ->then 709 | ->integer($result) 710 | ->isEqualTo(1) 711 | 712 | ->given($string = new LUT('習')) 713 | ->when($result = $string->getWidth()) 714 | ->then 715 | ->integer($result) 716 | ->isEqualTo(2); 717 | } 718 | 719 | /** getCharDirection() must return LUT::LTR for a Latin letter and LUT::RTL for an Arabic letter. */ public function case_get_char_direction(): void 720 | { 721 | $this 722 | ->when($result = LUT::getCharDirection('A')) 723 | ->then 724 | ->integer($result) 725 | ->isEqualTo(LUT::LTR) 726 | 727 | ->when($result = LUT::getCharDirection('ا')) 728 | ->then 729 | ->integer($result) 730 | ->isEqualTo(LUT::RTL); 731 | } 732 | 733 | /** Table-driven check of getCharWidth(): each pair is a code point and its expected width; the character is built with fromCode(). */ public function case_get_char_width(): void 734 | { 735 | $this 736 | ->given( 737 | $data = [ 738 | // 8-bit control character. 739 | [0x0, 0], 740 | [0x19, -1], 741 | [0x7f, -1], 742 | [0x9f, -1], 743 | 744 | // Regular. 745 | [0xa0, 1], 746 | 747 | // Non-spacing characters mark. 748 | [0x300, 0], // in Mn 749 | [0x488, 0], // in Me 750 | [0x600, 0], // in Cf 751 | [0xad, 1], // in Cf, but the only exception 752 | [0x1160, 0], 753 | [0x11ff, 0], 754 | [0x200b, 0], 755 | 756 | // To test the last return statement.
757 | [0x1100, 2], 758 | [0x2160, 1], 759 | [0x3f60, 2], 760 | [0x303f, 1], 761 | [0x2329, 2], 762 | [0xaed0, 2], 763 | [0x232a, 2], 764 | [0xffa4, 1], 765 | [0xfe10, 2], 766 | [0xfe30, 2], 767 | [0xff00, 2], 768 | [0xf900, 2] 769 | ] 770 | ) 771 | ->when(function () use ($data): void { 772 | foreach ($data as $datum) { 773 | list($code, $width) = $datum; 774 | 775 | $this 776 | ->when($result = LUT::getCharWidth(LUT::fromCode($code))) 777 | ->then 778 | ->integer($result) 779 | ->isEqualTo($width); 780 | } 781 | }); 782 | } 783 | 784 | /** isCharPrintable() must be false for code point 0x7f and true for code points 0xa0 and 0x1100. */ public function case_is_char_printable(): void 785 | { 786 | $this 787 | ->when($result = LUT::isCharPrintable(LUT::fromCode(0x7f))) 788 | ->then 789 | ->boolean($result) 790 | ->isFalse() 791 | 792 | ->when($result = LUT::isCharPrintable(LUT::fromCode(0xa0))) 793 | ->then 794 | ->boolean($result) 795 | ->isTrue() 796 | 797 | ->when($result = LUT::isCharPrintable(LUT::fromCode(0x1100))) 798 | ->then 799 | ->boolean($result) 800 | ->isTrue(); 801 | } 802 | 803 | /** fromCode() must produce the expected character for one sample code point in each of the four ranges noted inline. */ public function case_from_code(): void 804 | { 805 | $this 806 | // U+0000 to U+007F 807 | ->when($result = LUT::fromCode(0x7e)) 808 | ->then 809 | ->string($result) 810 | ->isEqualTo('~') 811 | 812 | // U+0080 to U+07FF 813 | ->when($result = LUT::fromCode(0xa7)) 814 | ->then 815 | ->string($result) 816 | ->isEqualTo('§') 817 | 818 | // U+0800 to U+FFFF 819 | ->when($result = LUT::fromCode(0x1207)) 820 | ->then 821 | ->string($result) 822 | ->isEqualTo('ሇ') 823 | 824 | // U+10000 to U+10FFFF 825 | ->when($result = LUT::fromCode(0x1f4a9)) 826 | ->then 827 | ->string($result) 828 | ->isEqualTo('💩'); 829 | } 830 | 831 | /** toCode() must return the integer code point of one sample character from each of the four ranges noted inline. */ public function case_to_code(): void 832 | { 833 | $this 834 | // U+0000 to U+007F 835 | ->when($result = LUT::toCode('~')) 836 | ->then 837 | ->integer($result) 838 | ->isEqualTo(0x7e) 839 | 840 | // U+0080 to U+07FF 841 | ->when($result = LUT::toCode('§')) 842 | ->then 843 | ->integer($result) 844 | ->isEqualTo(0xa7) 845 | 846 | // U+0800 to U+FFFF 847 |
->when($result = LUT::toCode('ሇ')) 848 | ->then 849 | ->integer($result) 850 | ->isEqualTo(0x1207) 851 | 852 | // U+10000 to U+10FFFF 853 | ->when($result = LUT::toCode('💩')) 854 | ->then 855 | ->integer($result) 856 | ->isEqualTo(0x1f4a9); 857 | } 858 | 859 | /** toBinaryCode() must return the bits of the encoded bytes as a string of 0 and 1 digits, checked on one sample character per range. */ public function case_to_binary_code(): void 860 | { 861 | $this 862 | // U+0000 to U+007F 863 | ->when($result = LUT::toBinaryCode('~')) 864 | ->then 865 | ->string($result) 866 | ->isEqualTo('01111110') 867 | 868 | // U+0080 to U+07FF 869 | ->when($result = LUT::toBinaryCode('§')) 870 | ->then 871 | ->string($result) 872 | ->isEqualTo('1100001010100111') 873 | 874 | // U+0800 to U+FFFF 875 | ->when($result = LUT::toBinaryCode('ሇ')) 876 | ->then 877 | ->string($result) 878 | ->isEqualTo('111000011000100010000111') 879 | 880 | // U+10000 to U+10FFFF 881 | ->when($result = LUT::toBinaryCode('💩')) 882 | ->then 883 | ->string($result) 884 | ->isEqualTo('11110000100111111001001010101001'); 885 | } 886 | 887 | /** With function_exists overridden to report iconv as missing, transcode() must throw the library exception. */ public function case_transcode_no_iconv(): void 888 | { 889 | $this 890 | ->given( 891 | $this->function->function_exists = function ($name) { 892 | return 'iconv' !== $name; 893 | } 894 | ) 895 | ->exception(function (): void { 896 | LUT::transcode('foo', 'UTF-8'); 897 | }) 898 | ->isInstanceOf(LUT\Exception::class); 899 | } 900 | 901 | /** Round-trips a sigma from UTF-8 to UTF-16 and back: the UTF-16 form must differ and fail isUtf8(), the restored form must match the input and pass it. */ public function case_transcode_and_isUtf8(): void 902 | { 903 | $this 904 | ->given($uΣ = 'Σ') 905 | ->when($Σ = LUT::transcode($uΣ, 'UTF-8', 'UTF-16')) 906 | ->then 907 | ->string($Σ) 908 | ->isNotEqualTo($uΣ) 909 | ->boolean(LUT::isUtf8($Σ)) 910 | ->isFalse() 911 | 912 | ->when($Σ = LUT::transcode($Σ, 'UTF-16', 'UTF-8')) 913 | ->string($Σ) 914 | ->isEqualTo($uΣ) 915 | ->boolean(LUT::isUtf8($Σ)) 916 | ->isTrue() 917 | ->boolean(LUT::isUtf8($uΣ)) 918 | ->isTrue(); 919 | } 920 | 921 | /** With class_exists overridden to hide both Transliterator and Normalizer, toAscii() without argument must throw the library exception. */ public function case_to_ascii_no_transliterator_no_normalizer(): void 922 | { 923 | $this 924 | ->given( 925 | $this->function->class_exists = function ($name) { 926 | return false ===
in_array($name, ['Transliterator', 'Normalizer']); 927 | }, 928 | $string = new LUT('Un été brûlant sur la côte') 929 | ) 930 | ->exception(function () use ($string): void { 931 | $string->toAscii(); 932 | }) 933 | ->isInstanceOf(LUT\Exception::class); 934 | } 935 | 936 | /** With Transliterator and Normalizer both hidden, toAscii(true) must not throw: it returns the same instance, now holding the accent-free text. */ public function case_to_ascii_no_transliterator_no_normalizer_try(): void 937 | { 938 | $this 939 | ->given( 940 | $this->function->class_exists = function ($name) { 941 | return false === in_array($name, ['Transliterator', 'Normalizer']); 942 | }, 943 | $string = new LUT('Un été brûlant sur la côte') 944 | ) 945 | ->when($result = $string->toAscii(true)) 946 | ->then 947 | ->object($result) 948 | ->isIdenticalTo($string) 949 | ->string((string) $result) 950 | ->isEqualTo('Un ete brulant sur la cote'); 951 | } 952 | 953 | /** With only Transliterator hidden, toAscii() must return the same instance, now holding the accent-free text. */ public function case_to_ascii_no_transliterator(): void 954 | { 955 | $this 956 | ->given( 957 | $this->function->class_exists = function ($name) { 958 | return 'Transliterator' !== $name; 959 | }, 960 | $string = new LUT('Un été brûlant sur la côte') 961 | ) 962 | ->when($result = $string->toAscii()) 963 | ->then 964 | ->object($result) 965 | ->isIdenticalTo($string) 966 | ->string((string) $result) 967 | ->isEqualTo('Un ete brulant sur la cote'); 968 | } 969 | 970 | /** Table-driven check of toAscii(): each key is an input string and each value is its expected ASCII rendering. */ public function case_to_ascii(): void 971 | { 972 | $this 973 | ->given( 974 | $strings = [ 975 | 'Un été brûlant sur la côte' 976 | => 'Un ete brulant sur la cote', 977 | 978 | 'Αυτή είναι μια δοκιμή' 979 | => 'Aute einai mia dokime', 980 | 981 | 'أحبك' 982 | => 'ahbk', 983 | 984 | 'キャンパス' 985 | => 'kyanpasu', 986 | 987 | 'биологическом' 988 | => 'biologiceskom', 989 | 990 | '정, 병호' 991 | => 'jeong, byeongho', 992 | 993 | 'ますだ, よしひこ' 994 | => 'masuda, yoshihiko', 995 | 996 | 'मोनिच' 997 | => 'monica', 998 | 999 | 'क्ष' 1000 | => 'ksa', 1001 | 1002 | 'أحبك 😀' 1003 | => 'ahbk (grinning face)', 1004 | 1005 | '∀ i ∈ ℕ' 1006 | => '(for all) i (element of) N' 1007 | ] 1008 | ) 1009 | ->when(function () use ($strings):
void { 1010 | foreach ($strings as $original => $asciied) { 1011 | $this 1012 | ->given($string = new LUT($original)) 1013 | ->when($result = $string->toAscii()) 1014 | ->then 1015 | ->object($result) 1016 | ->isIdenticalTo($string) 1017 | ->string((string) $result) 1018 | ->isEqualTo($asciied); 1019 | } 1020 | }); 1021 | } 1022 | 1023 | /** copy() must return an object that compares equal to the original. */ public function case_copy(): void 1024 | { 1025 | $this 1026 | ->given($string = new LUT('foo')) 1027 | ->when($result = $string->copy()) 1028 | ->then 1029 | ->object($result) 1030 | ->isEqualTo($string); 1031 | } 1032 | 1033 | /** Casting an instance to string must give back the sampled string (7 to 42 word characters) it was built from. */ public function case_toString(): void 1034 | { 1035 | $this 1036 | ->given($datum = $this->sample($this->realdom->regex('/\w{7,42}/'))) 1037 | ->when($result = new LUT($datum)) 1038 | ->then 1039 | ->castToString($result) 1040 | ->isEqualTo($datum); 1041 | } 1042 | } 1043 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name" : "hoa/ustring", 3 | "description": "The Hoa\\Ustring library.", 4 | "type" : "library", 5 | "keywords" : ["library", "string", "unicode", "search"], 6 | "homepage" : "https://hoa-project.net/", 7 | "license" : "BSD-3-Clause", 8 | "authors" : [ 9 | { 10 | "name" : "Ivan Enderlin", 11 | "email": "ivan.enderlin@hoa-project.net" 12 | }, 13 | { 14 | "name" : "Hoa community", 15 | "homepage": "https://hoa-project.net/" 16 | } 17 | ], 18 | "support": { 19 | "email" : "support@hoa-project.net", 20 | "irc" : "irc://chat.freenode.net/hoaproject", 21 | "forum" : "https://users.hoa-project.net/", 22 | "docs" : "https://central.hoa-project.net/Documentation/Library/Ustring", 23 | "source": "https://central.hoa-project.net/Resource/Library/Ustring" 24 | }, 25 | "require": { 26 | "php" : ">=7.1", 27 | "hoa/consistency": "dev-master", 28 | "hoa/exception" : "dev-master" 29 | }, 30 | "require-dev": { 31 | "hoa/test": "dev-master" 32 | }, 33 | "autoload": { 34 |
"psr-4": { 35 | "Hoa\\Ustring\\" : "Source", 36 | "Hoa\\Ustring\\Bin\\": "Bin" 37 | } 38 | }, 39 | "suggest": { 40 | "ext-iconv": "ext/iconv must be present (or a third-party implementation) to use Hoa\\Ustring::transcode().", 41 | "ext-intl" : "To get a better Hoa\\Ustring::toAscii() and Hoa\\Ustring::compareTo()." 42 | }, 43 | "extra" : { 44 | "branch-alias": { 45 | "dev-master": "4.x-dev" 46 | } 47 | } 48 | } 49 | --------------------------------------------------------------------------------