├── .State
├── .gitignore
├── Bin
    ├── Fromcode.php
    └── Tocode.php
├── CHANGELOG.md
├── Documentation
    ├── En
    │   └── Index.xyl
    └── Fr
    │   └── Index.xyl
├── README.md
├── Source
    ├── Exception.php
    ├── Search.php
    └── Ustring.php
├── Test
    └── Unit
    │   ├── Issue.php
    │   ├── Search.php
    │   └── Ustring.php
└── composer.json


/.State:
--------------------------------------------------------------------------------
1 | finalized
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /vendor/
2 | /composer.lock
3 | 


--------------------------------------------------------------------------------
/Bin/Fromcode.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | declare(strict_types=1);
  4 | 
  5 | /**
  6 |  * Hoa
  7 |  *
  8 |  *
  9 |  * @license
 10 |  *
 11 |  * New BSD License
 12 |  *
 13 |  * Copyright © 2007-2017, Hoa community. All rights reserved.
 14 |  *
 15 |  * Redistribution and use in source and binary forms, with or without
 16 |  * modification, are permitted provided that the following conditions are met:
 17 |  *     * Redistributions of source code must retain the above copyright
 18 |  *       notice, this list of conditions and the following disclaimer.
 19 |  *     * Redistributions in binary form must reproduce the above copyright
 20 |  *       notice, this list of conditions and the following disclaimer in the
 21 |  *       documentation and/or other materials provided with the distribution.
 22 |  *     * Neither the name of the Hoa nor the names of its contributors may be
 23 |  *       used to endorse or promote products derived from this software without
 24 |  *       specific prior written permission.
 25 |  *
 26 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 27 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 28 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 29 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS AND CONTRIBUTORS BE
 30 |  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 31 |  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 32 |  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 33 |  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 34 |  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 35 |  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 36 |  * POSSIBILITY OF SUCH DAMAGE.
 37 |  */
 38 | 
 39 | namespace Hoa\Ustring\Bin;
 40 | 
 41 | use Hoa\Console;
 42 | use Hoa\Ustring;
 43 | 
 44 | /**
 45 |  * Get a character from its code. Please, see Hoa\Ustring\Ustring::fromCode.
 46 |  */
 47 | class Fromcode extends Console\Dispatcher\Kit
 48 | {
 49 |     /**
 50 |      * Options description: the `base` option requires an argument, the
 51 |      * `help` option takes none and has two short names.
 52 |      *
 53 |      * @var array
 54 |      */
 55 |     protected $options = [
 56 |         ['base', Console\GetOption::REQUIRED_ARGUMENT, 'b'],
 57 |         ['help', Console\GetOption::NO_ARGUMENT,       'h'],
 58 |         ['help', Console\GetOption::NO_ARGUMENT,       '?']
 59 |     ];
 60 | 
 61 |     /**
 62 |      * The entry method: read the options, then print the character whose
 63 |      * code is given as input (read in base 16 unless `base` says otherwise).
 64 |      *
 65 |      * @return int|null 0 once the character is printed, null when the usage
 66 |      *                  is displayed instead.
 67 |      */
 68 |     public function main(): ?int
 69 |     {
 70 |         $base = 16;
 71 | 
 72 |         while (false !== $c = $this->getOption($v)) {
 73 |             switch ($c) {
 74 |                 case 'b':
 75 |                     $base = intval($v);
 76 |                     break;
 77 | 
 78 |                 case '__ambiguous':
 79 |                     $this->resolveOptionAmbiguity($v);
 80 |                     break;
 81 | 
 82 |                 case 'h':
 83 |                 case '?':
 84 |                 default:
 85 |                     return $this->usage();
 86 |             }
 87 |         }
 88 | 
 89 |         $this->parser->listInputs($code);
 90 | 
 91 |         // `base_convert` returns a string; cast it since strict types are on.
 92 |         echo Ustring::fromCode((int) base_convert($code, $base, 10));
 93 | 
 94 |         return 0;
 95 |     }
 96 | 
 97 |     /**
 98 |      * The command usage.
 99 |      */
100 |     public function usage(): void
101 |     {
102 |         echo
103 |             'Usage   : ustring:fromcode <code>', "\n",
104 |             'Options :', "\n",
105 |             $this->makeUsageOptionsList([
106 |                 'b'    => 'Specify the base of the code (16 by default).',
107 |                 'help' => 'This help.'
108 |             ]), "\n";
109 |     }
110 | }
111 | 
112 | __halt_compiler();
113 | Get a character from its code.
114 | 


--------------------------------------------------------------------------------
/Bin/Tocode.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | declare(strict_types=1);
  4 | 
  5 | /**
  6 |  * Hoa
  7 |  *
  8 |  *
  9 |  * @license
 10 |  *
 11 |  * New BSD License
 12 |  *
 13 |  * Copyright © 2007-2017, Hoa community. All rights reserved.
 14 |  *
 15 |  * Redistribution and use in source and binary forms, with or without
 16 |  * modification, are permitted provided that the following conditions are met:
 17 |  *     * Redistributions of source code must retain the above copyright
 18 |  *       notice, this list of conditions and the following disclaimer.
 19 |  *     * Redistributions in binary form must reproduce the above copyright
 20 |  *       notice, this list of conditions and the following disclaimer in the
 21 |  *       documentation and/or other materials provided with the distribution.
 22 |  *     * Neither the name of the Hoa nor the names of its contributors may be
 23 |  *       used to endorse or promote products derived from this software without
 24 |  *       specific prior written permission.
 25 |  *
 26 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 27 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 28 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 29 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS AND CONTRIBUTORS BE
 30 |  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 31 |  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 32 |  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 33 |  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 34 |  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 35 |  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 36 |  * POSSIBILITY OF SUCH DAMAGE.
 37 |  */
 38 | 
 39 | namespace Hoa\Ustring\Bin;
 40 | 
 41 | use Hoa\Console;
 42 | use Hoa\Ustring;
 43 | 
 44 | /**
 45 |  * Transform a character into its code. Please, see Hoa\Ustring\Ustring::toCode.
 46 |  */
 47 | class Tocode extends Console\Dispatcher\Kit
 48 | {
 49 |     /**
 50 |      * Options accepted by the command.
 51 |      *
 52 |      * @var array
 53 |      */
 54 |     protected $options = [
 55 |         ['base', Console\GetOption::REQUIRED_ARGUMENT, 'b'],
 56 |         ['help', Console\GetOption::NO_ARGUMENT,       'h'],
 57 |         ['help', Console\GetOption::NO_ARGUMENT,       '?']
 58 |     ];
 59 | 
 60 |     /**
 61 |      * Parse the options, then print the code of the given character.
 62 |      *
 63 |      * @return int|null 0 on success, null when the usage is printed.
 64 |      */
 65 |     public function main(): ?int
 66 |     {
 67 |         $base = 16;
 68 | 
 69 |         while (false !== ($option = $this->getOption($value))) {
 70 |             switch ($option) {
 71 |                 case 'b':
 72 |                     $base = (int) $value;
 73 |                     break;
 74 | 
 75 |                 case '__ambiguous':
 76 |                     $this->resolveOptionAmbiguity($value);
 77 |                     break;
 78 | 
 79 |                 case 'h':
 80 |                 case '?':
 81 |                 default:
 82 |                     return $this->usage();
 83 |             }
 84 |         }
 85 | 
 86 |         $this->parser->listInputs($char);
 87 | 
 88 |         // `toCode` result is cast to a string, as `base_convert` expects one.
 89 |         $decimal = (string) Ustring::toCode($char);
 90 |         echo base_convert($decimal, 10, $base), "\n";
 91 | 
 92 |         return 0;
 93 |     }
 94 | 
 95 |     /**
 96 |      * Print the command usage.
 97 |      */
 98 |     public function usage(): void
 99 |     {
100 |         echo 'Usage   : ustring:tocode <char>', "\n";
101 |         echo 'Options :', "\n";
102 |         echo
103 |             $this->makeUsageOptionsList([
104 |                 'b'    => 'Get the code in a specific base (16 by default).',
105 |                 'help' => 'This help.'
106 |             ]), "\n";
107 |     }
108 | }
109 | 
110 | __halt_compiler();
111 | Transform a character into its code.
112 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # 4.17.01.16
 2 | 
 3 |   * Quality: Happy new year! (Alexis von Glasow, 2017-01-12T14:01:18+01:00)
 4 |   * Test: Add the `Decorrelated` interface. (Ivan Enderlin, 2016-10-25T07:58:13+02:00)
 5 |   * Documentation: New `README.md` file. (Ivan Enderlin, 2016-10-18T15:10:49+02:00)
 6 |   * Documentation: Update `support` properties. (Ivan Enderlin, 2016-10-11T08:45:51+02:00)
 7 | 
 8 | # 4.16.01.11
 9 | 
10 |   * Quality: Drop PHP5.4. (Ivan Enderlin, 2016-01-11T09:15:27+01:00)
11 |   * Quality: Run devtools:cs. (Ivan Enderlin, 2016-01-09T09:11:00+01:00)
12 |   * Core: Remove `Hoa\Core`. (Ivan Enderlin, 2016-01-09T08:27:47+01:00)
13 |   * Consistency: Use `Hoa\Consistency`. (Ivan Enderlin, 2015-12-08T22:11:40+01:00)
14 |   * Exception: Use `Hoa\Exception`. (Ivan Enderlin, 2015-11-20T13:19:42+01:00)
15 | 
16 | # 3.15.11.09
17 | 
18 |   * Fixed leftover typos (string -> ustring) (David Thalmann, 2015-09-08T14:45:12+02:00)
19 |   * Add a `.gitignore` file. (Stéphane HULARD, 2015-08-03T11:49:32+02:00)
20 | 
21 | # 3.15.08.03
22 | 
23 |   * `ext/iconv` is suggested, no longer required. (Ivan Enderlin, 2015-08-03T07:06:46+02:00)
24 |   * Fix CS. (Ivan Enderlin, 2015-08-03T07:05:10+02:00)
25 |   * Test `ext/mbstring` availability globally. (Ivan Enderlin, 2015-08-03T07:04:30+02:00)
26 | 
27 | # 3.15.07.28
28 | 
29 |   * Fix CS. (Ivan Enderlin, 2015-07-28T14:13:48+02:00)
30 |   * Fix the CHANGELOG. (Ivan Enderlin, 2015-07-28T14:13:27+02:00)
31 |   * Prepare 3.15.05.29. (Ivan Enderlin, 2015-05-29T15:36:54+02:00)
32 | 
33 | # 3.15.05.29
34 | 
35 |   * Update installation section. (Ivan Enderlin, 2015-05-29T14:13:22+02:00)
36 |   * Rename `Hoa\String` to `Hoa\Ustring`. (Ivan Enderlin, 2015-05-29T12:24:23+02:00)
37 |   * Move to PSR-1 and PSR-2. (Ivan Enderlin, 2015-05-18T09:49:37+02:00)
38 | 
39 | # 2.15.03.25
40 | 
41 |   * `toCode` supports invalid UTF-8 character. (Ivan Enderlin, 2015-03-25T08:52:52+01:00)
42 |   * Fix a typo in an exception message. (bureX, 2015-01-27T01:41:08+01:00)
43 | 
44 | # 2.15.02.19
45 | 
46 |   * Add the CHANGELOG.md file. (Ivan Enderlin, 2015-02-19T09:11:32+01:00)
47 |   * Add `require-dev` with `hoa/test`. (Ivan Enderlin, 2015-01-29T14:55:20+01:00)
48 |   * Add `hoa string:fromcode` and `hoa string:tocode`. (Ivan Enderlin, 2015-01-23T22:29:55+01:00)
49 |   * Translate the documentation in English. (Ivan Enderlin, 2015-01-23T19:27:04+01:00)
50 |   * Add examples, present new features and update links in the documentation. (Ivan Enderlin, 2015-01-23T19:27:00+01:00)
51 |   * Implement the `getCharWidth` method. (Ivan Enderlin, 2015-01-07T11:00:06+01:00)
52 |   * Accept other `intl` implementations. (Ivan Enderlin, 2015-01-06T13:42:20+01:00)
53 |   * Remove a useless test. (Ivan Enderlin, 2015-01-06T11:24:39+01:00)
54 |   * Add more tests for Math symbols. (Ivan Enderlin, 2015-01-06T11:22:53+01:00)
55 |   * Add emoji and other symbols supports to `toAscii`. (Ivan Enderlin, 2015-01-06T11:17:32+01:00)
56 |   * Update `toAscii` to use a transliterator. (Ivan Enderlin, 2015-01-06T10:58:07+01:00)
57 |   * Add transliterator support. (Ivan Enderlin, 2015-01-06T10:57:30+01:00)
58 |   * Happy new year! (Ivan Enderlin, 2015-01-05T14:52:34+01:00)
59 | 
60 | # 2.14.12.24
61 | 
62 |   * Clean code. (Julien Bianchi, 2014-12-24T08:44:59+01:00)
63 |   * Add tests for `Hoa\String\Search`. (Ivan Enderlin, 2014-12-23T14:15:21+01:00)
64 |   * `toBinaryCode` has a better semantics. (Ivan Enderlin, 2014-12-23T14:11:02+01:00)
65 |   * Use hexadecimal everywhere. (Ivan Enderlin, 2014-12-23T12:33:00+01:00)
66 |   * Tests are green now. (Ivan Enderlin, 2014-12-23T12:27:38+01:00)
67 |   * New `toCode` method, without UCS-2. (Ivan Enderlin, 2014-12-23T12:07:22+01:00)
68 |   * Add tests. (Ivan Enderlin, 2014-12-23T02:15:55+01:00)
69 |   * Fix flags between global and local in `match`. (Ivan Enderlin, 2014-12-23T02:13:50+01:00)
70 |   * Fix `compare` if `Collator` is not present. (Ivan Enderlin, 2014-12-23T02:13:16+01:00)
71 |   * Wrong append and prepend algorithm. (Ivan Enderlin, 2014-12-23T02:12:30+01:00)
72 |   * Move to PHP5.4 and remove `from`/`import`. (Ivan Enderlin, 2014-12-22T22:44:40+01:00)
73 | 
74 | # 2.14.12.10
75 | 
76 |   * Move to PSR-4. (Ivan Enderlin, 2014-12-09T18:48:53+01:00)
77 | 
78 | # 2.14.11.09
79 | 
80 |   * Format code. #mania (Ivan Enderlin, 2014-10-05T15:09:31+02:00)
81 |   * Implement the `String::copy` method. (Marc Lemay, 2014-10-05T15:08:33+02:00)
82 | 
83 | # 2.14.09.23
84 | 
85 |   * Add `branch-alias`. (Stéphane PY, 2014-09-23T11:55:55+02:00)
86 | 
87 | # 2.14.09.16
88 | 
89 |   * Drop PHP5.3. (Ivan Enderlin, 2014-09-17T17:20:54+02:00)
90 |   * Add the installation section. (Ivan Enderlin, 2014-09-17T17:20:46+02:00)
91 | 
92 | (first snapshot)
93 | 


--------------------------------------------------------------------------------
/Documentation/En/Index.xyl:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="utf-8"?>
  2 | 
  3 | <overlay xmlns="http://hoa-project.net/xyl/xylophone">
  4 | <yield id="chapter">
  5 | 
  6 |   <p>Strings can sometimes be <strong>complex</strong>, especially when they use
  7 |   the <code>Unicode</code> encoding format. The <code>Hoa\Ustring</code> library
  8 |   provides several operations on UTF-8 strings.</p>
  9 | 
 10 |   <h2 id="Table_of_contents">Table of contents</h2>
 11 | 
 12 |   <tableofcontents id="main-toc" />
 13 | 
 14 |   <h2 id="Introduction" for="main-toc">Introduction</h2>
 15 | 
 16 |   <p>When we manipulate strings, the <a href="http://unicode.org/">Unicode</a>
 17 |   format establishes itself because of its <strong>compatibility</strong> with
 18 |   historical formats (like ASCII) and its capacity to understand a
 19 |   <strong>large</strong> range of characters and symbols for all cultures and
 20 |   all regions in the world. PHP provides several tools to manipulate such
 21 |   strings, like the following extensions:
 22 |   <a href="http://php.net/mbstring"><code>mbstring</code></a>,
 23 |   <a href="http://php.net/iconv"><code>iconv</code></a> or also the excellent
 24 |   <a href="http://php.net/intl"><code>intl</code></a>  which is based on
 25 |   <a href="http://icu-project.org/">ICU</a>, the reference implementation of
 26 |   Unicode. Unfortunately, sometimes we have to mix these extensions to achieve
 27 |   our aims and at the cost of a certain <strong>complexity</strong> along with
 28 |   a regrettable <strong>verbosity</strong>.</p>
 29 |   <p>The <code>Hoa\Ustring</code> library addresses these issues by providing a
 30 |   <strong>simple</strong> way to manipulate strings with
 31 |   <strong>performance</strong> and <strong>efficiency</strong> in mind. It
 32 |   also provides some advanced algorithms to perform <strong>search</strong>
 33 |   operations on strings.</p>
 34 | 
 35 |   <h2 id="Unicode_strings" for="main-toc">Unicode strings</h2>
 36 | 
 37 |   <p>The <code>Hoa\Ustring\Ustring</code> class represents a
 38 |   <strong>UTF-8</strong> Unicode string and allows us to manipulate it easily.
 39 |   This class implements the
 40 |   <a href="http://php.net/arrayaccess"><code>ArrayAccess</code></a>,
 41 |   <a href="http://php.net/countable"><code>Countable</code></a> and
 42 |   <a href="http://php.net/iteratoraggregate"><code>IteratorAggregate</code></a>
 43 |   interfaces. We are going to use three examples in three different languages:
 44 |   French, Arabic and Japanese. Thus:</p>
 45 |   <pre><code class="language-php">$french   = new Hoa\Ustring\Ustring('Je t\'aime');
 46 | $arabic   = new Hoa\Ustring\Ustring('أحبك');
 47 | $japanese = new Hoa\Ustring\Ustring('私はあなたを愛して');</code></pre>
 48 |   <p>Now, let's see what we can do on these three strings.</p>
 49 | 
 50 |   <h3 id="String_manipulation" for="main-toc">String manipulation</h3>
 51 | 
 52 |   <p>Let's start with <strong>elementary</strong> operations. If we would like
 53 |   to <strong>count</strong> the number of characters (not bytes), we will use
 54 |   the <a href="http://php.net/count"><code>count</code> function</a>. Thus:</p>
 55 |   <pre><code class="language-php">var_dump(
 56 |     count($french),
 57 |     count($arabic),
 58 |     count($japanese)
 59 | );
 60 | 
 61 | /**
 62 |  * Will output:
 63 |  *     int(9)
 64 |  *     int(4)
 65 |  *     int(9)
 66 |  */</code></pre>
 67 |   <p>When we speak about text position, it is not suitable to speak about the
 68 |   right or the left, but rather about a <strong>beginning</strong> or an
 69 |   <strong>end</strong>, and based on the <strong>direction</strong> of writing.
 70 |   We can know this direction thanks to the
 71 |   <code>Hoa\Ustring\Ustring::getDirection</code> method. It returns the value of
 72 |   one of the following constants:</p>
 73 |   <ul>
 74 |     <li><code>Hoa\Ustring\Ustring::LTR</code>, for left-to-right, if the text is
 75 |     written from the left to the right,</li>
 76 |     <li><code>Hoa\Ustring\Ustring::RTL</code>, for right-to-left, if the text is
 77 |     written from the right to the left.</li>
 78 |   </ul>
 79 |   <p>Let's observe the result with our examples:</p>
 80 |   <pre><code class="language-php">var_dump(
 81 |     $french->getDirection()   === Hoa\Ustring\Ustring::LTR, // is left-to-right?
 82 |     $arabic->getDirection()   === Hoa\Ustring\Ustring::RTL, // is right-to-left?
 83 |     $japanese->getDirection() === Hoa\Ustring\Ustring::LTR  // is left-to-right?
 84 | );
 85 | 
 86 | /**
 87 |  * Will output:
 88 |  *     bool(true)
 89 |  *     bool(true)
 90 |  *     bool(true)
 91 |  */</code></pre>
 92 |   <p>The result of this method is computed thanks to the
 93 |   <code>Hoa\Ustring\Ustring::getCharDirection</code> static method which computes
 94 |   the direction of only one character.</p>
 95 |   <p>If we would like to <strong>concatenate</strong> another string to the end
 96 |   or to the beginning, we will respectively use the
 97 |   <code>Hoa\Ustring\Ustring::append</code> and
 98 |   <code>Hoa\Ustring\Ustring::prepend</code> methods. These methods, like most of
 99 |   the ones which modify the string, return the object itself, in order to
100 |   chain the calls. For instance:</p>
101 |   <pre><code class="language-php">echo $french->append('… et toi, m\'aimes-tu ?')->prepend('Mam\'zelle ! ');
102 | 
103 | /**
104 |  * Will output:
105 |  *     Mam'zelle ! Je t'aime… et toi, m'aimes-tu ?
106 |  */</code></pre>
107 |   <p>We also have the <code>Hoa\Ustring\Ustring::toLowerCase</code> and
108 |   <code>Hoa\Ustring\Ustring::toUpperCase</code> methods to, respectively, set
109 |   the case of the string to lower or upper. For instance:</p>
110 |   <pre><code class="language-php">echo $french->toUpperCase();
111 | 
112 | /**
113 |  * Will output:
114 |  *     MAM'ZELLE ! JE T'AIME… ET TOI, M'AIMES-TU ?
115 |  */</code></pre>
116 |   <p>We can also add characters to the beginning or to the end of the string to
117 |   reach a <strong>minimum</strong> length. This operation is frequently called
118 |   the <em>padding</em> (for historical reasons dating back to typewriters).
119 |   That's why we have the <code>Hoa\Ustring\Ustring::pad</code> method which
120 |   takes three arguments: the minimum length, characters to add and a constant
121 |   indicating whether we have to add at the end or at the beginning of the string
122 |   (respectively <code>Hoa\Ustring\Ustring::END</code>, by default, and
123 |   <code>Hoa\Ustring\Ustring::BEGINNING</code>).</p>
124 |   <pre><code class="language-php">echo $arabic->pad(20, ' ');
125 | 
126 | /**
127 |  * Will output:
128 |  *                     أحبك
129 |  */</code></pre>
130 |   <p>A similar operation allows to remove, by default, <strong>spaces</strong>
131 |   at the beginning and at the end of the string thanks to the
132 |   <code>Hoa\Ustring\Ustring::trim</code> method. For example, to retrieve our
133 |   original Arabic string:</p>
134 |   <pre><code class="language-php">echo $arabic->trim();
135 | 
136 | /**
137 |  * Will output:
138 |  *     أحبك
139 |  */</code></pre>
140 |   <p>If we would like to remove other characters, we can use its first argument
141 |   which must be a regular expression. Finally, its second argument allows to
142 |   specify from what side we would like to remove character: at the beginning, at
143 |   the end or both, still by using the
144 |   <code>Hoa\Ustring\Ustring::BEGINNING</code> and
145 |   <code>Hoa\Ustring\Ustring::END</code> constants.</p>
146 |   <p>If we would like to remove other characters, we can use its first argument
147 |   which must be a regular expression. Finally, its second argument allows to
148 |   specify the side where to remove characters: at the beginning, at the end or
149 |   both, still by using the <code>Hoa\Ustring\Ustring::BEGINNING</code> and
150 |   <code>Hoa\Ustring\Ustring::END</code> constants. We can combine these
151 |   constants to express “both sides”, which is the default value:
152 |   <code class="language-php">Hoa\Ustring\Ustring::BEGINNING |
153 |   Hoa\Ustring\Ustring::END</code>. For example, to remove all the numbers and
154 |   the spaces only at the end, we will write:</p>
155 |   <pre><code class="language-php">$arabic->trim('\s|\d', Hoa\Ustring\Ustring::END);</code></pre>
156 |   <p>We can also <strong>reduce</strong> the string to a
157 |   <strong>sub-string</strong> by specifying the position of the first character
158 |   followed by the length of the sub-string to the
159 |   <code>Hoa\Ustring\Ustring::reduce</code> method:</p>
160 |   <pre><code class="language-php">echo $french->reduce(3, 6)->reduce(2, 4);
161 | 
162 | /**
163 |  * Will output:
164 |  *     aime
165 |  */</code></pre>
166 |   <p>If we would like to get a specific character, we can rely on the
167 |   <code>ArrayAccess</code> interface. For instance, to get the first character
168 |   of each of our examples (from their original definitions):</p>
169 |   <pre><code class="language-php">var_dump(
170 |     $french[0],
171 |     $arabic[0],
172 |     $japanese[0]
173 | );
174 | 
175 | /**
176 |  * Will output:
177 |  *     string(1) "J"
178 |  *     string(2) "أ"
179 |  *     string(3) "私"
180 |  */</code></pre>
181 |   <p>If we would like the last character, we will use the -1 index. The index is
182 |   not bounded to the length of the string. If the index exceeds this length,
183 |   then a <em>modulo</em> will be applied.</p>
184 |   <p>We can also modify or remove a specific character with this method. For
185 |   example:</p>
186 |   <pre><code class="language-php">$french->append(' ?');
187 | $french[-1] = '!';
188 | echo $french;
189 | 
190 | /**
191 |  * Will output:
192 |  *     Je t'aime !
193 |  */</code></pre>
194 |   <p>Another very useful method is the <strong>ASCII</strong> transformation.
195 |   Be careful, this is not always possible, according to your settings. For
196 |   example:</p>
197 |   <pre><code class="language-php">$title = new Hoa\Ustring\Ustring('Un été brûlant sur la côte');
198 | echo $title->toAscii();
199 | 
200 | /**
201 |  * Will output:
202 |  *     Un ete brulant sur la cote
203 |  */</code></pre>
204 |   <p>We can also transform from Arabic or Japanese to ASCII. Symbols, like
205 |   mathematical symbols or emojis, are also transformed:</p>
206 |   <pre><code class="language-php">$emoji = new Hoa\Ustring\Ustring('I ❤ Unicode');
207 | $maths = new Hoa\Ustring\Ustring('∀ i ∈ ℕ');
208 | 
209 | echo
210 |     $arabic->toAscii(), "\n",
211 |     $japanese->toAscii(), "\n",
212 |     $emoji->toAscii(), "\n",
213 |     $maths->toAscii(), "\n";
214 | 
215 | /**
216 |  * Will output:
217 |  *     ahbk
218 |  *     sihaanatawo aishite
219 |  *     I (heavy black heart)️ Unicode
220 |  *     (for all) i (element of) N
221 |  */</code></pre>
222 |   <p>In order for this method to work correctly, the
223 |   <a href="http://php.net/intl"><code>intl</code></a> extension needs to be
224 |   present, so that the
225 |   <a href="http://php.net/transliterator"><code>Transliterator</code></a> class
226 |   is present. If it does not exist, the
227 |   <a href="http://php.net/normalizer"><code>Normalizer</code></a> class must
228 |   exist. If this class does not exist either, the
229 |   <code>Hoa\Ustring\Ustring::toAscii</code> method can still try a
230 |   transformation, but it is less efficient. To activate this last solution,
231 |   <code>true</code> must be passed as a single argument. This <em lang="fr">tour
232 |   de force</em> is not recommended in most cases.</p>
233 |   <p>We also find the <code>getTransliterator</code> method which returns a
234 |   <code>Transliterator</code> object, or <code>null</code> if this class does
235 |   not exist. This method takes a transliteration identifier as argument. We
236 |   suggest to <a href="http://userguide.icu-project.org/transforms/general">read
237 |   the documentation about the transliterator of ICU</a> to understand this
238 |   identifier. The <code>transliterate</code> method allows to transliterate the
239 |   current string based on an identifier and a beginning index and an end
240 |   one. This method works the same way as the
241 |   <a href="http://php.net/transliterator.transliterate"><code>Transliterator::transliterate</code></a>
242 |   method.</p>
243 |   <p>More generally, to change the <strong>encoding</strong> format, we can use
244 |   the <code>Hoa\Ustring\Ustring::transcode</code> static method, with a string
245 |   as first argument, the original encoding format as second argument and the
246 |   expected encoding format as third argument (UTF-8 by default). To get the
247 |   list of encoding formats, we have to refer to the
248 |   <a href="http://php.net/iconv"><code>iconv</code></a> extension or to use the
249 |   following command line in a terminal:</p>
250 |   <pre><code class="language-shell">$ iconv --list</code></pre>
251 |   <p>To know if a string is encoded in UTF-8, we can use the
252 |   <code>Hoa\Ustring\Ustring::isUtf8</code> static method; for instance:</p>
253 |   <pre><code class="language-php">var_dump(
254 |     Hoa\Ustring\Ustring::isUtf8('a'),
255 |     Hoa\Ustring\Ustring::isUtf8(Hoa\Ustring\Ustring::transcode('a', 'UTF-8', 'UTF-16'))
256 | );
257 | 
258 | /**
259 |  * Will output:
260 |  *     bool(true)
261 |  *     bool(false)
262 |  */</code></pre>
263 |   <p>We can <strong>split</strong> the string into several sub-strings by using
264 |   the <code>Hoa\Ustring\Ustring::split</code> method. As first argument, we have
265 |   a regular expression (of kind <a href="http://pcre.org/">PCRE</a>), then an
266 |   integer representing the maximum number of elements to return and finally a
267 |   combination of constants. These constants are the same as the ones of
268 |   <a href="http://php.net/preg_split"><code>preg_split</code></a>.</p>
269 |   <p>By default, the second argument is set to -1, which means infinity, and the
270 |   last argument is set to <code>PREG_SPLIT_NO_EMPTY</code>. Thus, if we would
271 |   like to get all the words of a string, we will write:</p>
272 |   <pre><code class="language-php">print_r($title->split('#\b|\s#'));
273 | 
274 | /**
275 |  * Will output:
276 |  *     Array
277 |  *     (
278 |  *         [0] => Un
279 |  *         [1] => ete
280 |  *         [2] => brulant
281 |  *         [3] => sur
282 |  *         [4] => la
283 |  *         [5] => cote
284 |  *     )
285 |  */</code></pre>
286 |   <p>If we would like to <strong>iterate</strong> over all the
287 |   <strong>characters</strong>, it is recommended to use the
288 |   <code>IteratorAggregate</code> method, being the
289 |   <code>Hoa\Ustring\Ustring::getIterator</code> method. Let's see on the Arabic
290 |   example:</p>
291 |   <pre><code class="language-php">foreach ($arabic as $letter) {
292 |     echo $letter, "\n";
293 | }
294 | 
295 | /**
296 |  * Will output:
297 |  *     أ
298 |  *     ح
299 |  *     ب
300 |  *     ك
301 |  */</code></pre>
302 |   <p>We notice that the iteration is based on the text direction, it means that
303 |   the first element of the iteration is the first letter of the string starting
304 |   from the beginning.</p>
305 |   <p>Of course, if we would like to get an array of characters, we can use the
306 |   <a href="http://php.net/iterator_to_array"><code>iterator_to_array</code></a>
307 |   PHP function:</p>
308 |   <pre><code class="language-php">print_r(iterator_to_array($arabic));
309 | 
310 | /**
311 |  * Will output:
312 |  *     Array
313 |  *     (
314 |  *         [0] => أ
315 |  *         [1] => ح
316 |  *         [2] => ب
317 |  *         [3] => ك
318 |  *     )
319 |  */</code></pre>
320 | 
321 |   <h3 id="Comparison_and_search" for="main-toc">Comparison and search</h3>
322 | 
323 |   <p>Strings can also be <strong>compared</strong> thanks to the
324 |   <code>Hoa\Ustring\Ustring::compare</code> method:</p>
325 |   <pre><code class="language-php">$string = new Hoa\Ustring\Ustring('abc');
326 | var_dump(
327 |     $string->compare('wxyz')
328 | );
329 | 
330 | /**
331 |  * Will output:
332 |  *     int(-1)
333 |  */</code></pre>
334 |   <p>This method returns -1 if the initial string comes before (in the
335 |   alphabetical order), 0 if it is identical and 1 if it comes after. If we
336 |   would like to use all the power of the underlying mechanism, we can call the
337 |   <code>Hoa\Ustring\Ustring::getCollator</code> static method (if the
338 |   <a href="http://php.net/Collator"><code>Collator</code></a> class exists, else
339 |   <code>Hoa\Ustring\Ustring::compare</code> will use a simple byte-to-byte
340 |   comparison without taking care of the other parameters). Thus, if we would
341 |   like to sort an array of strings, we will write:</p>
342 |   <pre><code class="language-php">$strings = array('c', 'Σ', 'd', 'x', 'α', 'a');
343 | Hoa\Ustring\Ustring::getCollator()->sort($strings);
344 | print_r($strings);
345 | 
346 | /**
347 |  * Could output:
348 |  *     Array
349 |  *     (
350 |  *         [0] => a
351 |  *         [1] => c
352 |  *         [2] => d
353 |  *         [3] => x
354 |  *         [4] => α
355 |  *         [5] => Σ
356 |  *     )
357 |  */</code></pre>
358 |   <p>Comparison between two strings depends on the <strong>locale</strong>, it
359 |   means of the localization of the system, like the language, the country, the
360 |   region etc. We can use the
361 |   <a href="@hack:chapter=Locale"><code>Hoa\Locale</code> library</a> to modify
362 |   these data, but it's not a dependency of <code>Hoa\Ustring</code>.</p>
363 |   <p>We can also know if a string <strong>matches</strong> a certain pattern,
364 |   still expressed with a regular expression. To achieve that, we will use the
365 |   <code>Hoa\Ustring\Ustring::match</code> method. This method relies on the
366 |   <a href="http://php.net/preg_match"><code>preg_match</code></a> and
367 |   <a href="http://php.net/preg_match_all"><code>preg_match_all</code></a> PHP
368 |   functions, but by modifying the pattern's options to ensure the Unicode
369 |   support. We have the following parameters: the pattern, a variable passed by
370 |   reference to collect the matches, flags, an offset and finally a boolean
371 |   indicating whether the search is global or not (respectively if we have to use
372 |   <code>preg_match_all</code> or <code>preg_match</code>). By default, the
373 |   search is not global.</p>
374 |   <p>Thus, we will check that our French example contains <code>aime</code> with
375 |   a direct object complement:</p>
376 |   <pre><code class="language-php">$french->match('#(?:(?&amp;lt;direct_object>\w)[\'\b])aime#', $matches);
377 | var_dump($matches['direct_object']);
378 | 
379 | /**
380 |  * Will output:
381 |  *     string(1) "t"
382 |  */</code></pre>
383 |   <p>This method returns <code>false</code> if an error is raised (for example
384 |   if the pattern is not correct), 0 if no match has been found, and the number
385 |   of matches otherwise.</p>
386 |   <p>Similarly, we can <strong>search</strong> and <strong>replace</strong>
387 |   sub-strings by other sub-strings based on a pattern, still expressed with a
388 |   regular expression. To achieve that, we will use the
389 |   <code>Hoa\Ustring\Ustring::replace</code> method. This method uses the
390 |   <a href="http://php.net/preg_replace"><code>preg_replace</code></a> and
391 |   <a href="http://php.net/preg_replace_callback"><code>preg_replace_callback</code></a>
392 |   PHP functions, but still by modifying the pattern's options to ensure the
393 |   Unicode support. As first argument, we find one or more patterns, as second
394 |   argument, one or more replacements and as last argument the limit of
395 |   replacements to apply. If the replacement is a callable, then the
396 |   <code>preg_replace_callback</code> function will be used.</p>
397 |   <p>Thus, we will modify our French example to be more polite:</p>
398 |   <pre><code class="language-php">$french->replace('#(?:\w[\'\b])(?&amp;lt;verb>aime)#', function ($matches) {
399 |     return 'vous ' . $matches['verb'];
400 | });
401 | 
402 | echo $french;
403 | 
404 | /**
405 |  * Will output:
406 |  *     Je vous aime
407 |  */</code></pre>
408 |   <p>The <code>Hoa\Ustring\Ustring</code> class provides constants which are
409 |   aliases of existing PHP constants and ensure a better readability of the
410 |   code:</p>
411 |   <ul>
412 |     <li><code>Hoa\Ustring\Ustring::WITHOUT_EMPTY</code>, alias of
413 |     <code>PREG_SPLIT_NO_EMPTY</code>,</li>
414 |     <li><code>Hoa\Ustring\Ustring::WITH_DELIMITERS</code>, alias of
415 |     <code>PREG_SPLIT_DELIM_CAPTURE</code>,</li>
416 |     <li><code>Hoa\Ustring\Ustring::WITH_OFFSET</code>, alias of
417 |     <code>PREG_OFFSET_CAPTURE</code> and
418 |     <code>PREG_SPLIT_OFFSET_CAPTURE</code>,</li>
419 |     <li><code>Hoa\Ustring\Ustring::GROUP_BY_PATTERN</code>, alias of
420 |     <code>PREG_PATTERN_ORDER</code>,</li>
421 |     <li><code>Hoa\Ustring\Ustring::GROUP_BY_TUPLE</code>, alias of
422 |     <code>PREG_SET_ORDER</code>.</li>
423 |   </ul>
424 |   <p>Because they are strict aliases, we can write:</p>
425 |   <pre><code class="language-php">$string = new Hoa\Ustring\Ustring('abc1 defg2 hikl3 xyz4');
426 | $string->match(
427 |     '#(\w+)(\d)#',
428 |     $matches,
429 |     Hoa\Ustring\Ustring::WITH_OFFSET
430 |   | Hoa\Ustring\Ustring::GROUP_BY_TUPLE,
431 |     0,
432 |     true
433 | );</code></pre>
434 | 
435 |   <h3 id="Characters" for="main-toc">Characters</h3>
436 | 
437 |   <p>The <code>Hoa\Ustring\Ustring</code> class offers static methods working on
438 |   a single Unicode character. We have already mentioned the
439 |   <code>getCharDirection</code> method, which tells us the
440 |   <strong>direction</strong> of a character. We also have the
441 |   <code>getCharWidth</code> method, which counts the <strong>number of
442 |   columns</strong> necessary to print a single character. Thus:</p>
443 |   <pre><code class="language-php">var_dump(
444 |     Hoa\Ustring\Ustring::getCharWidth(Hoa\Ustring\Ustring::fromCode(0x7f)),
445 |     Hoa\Ustring\Ustring::getCharWidth('a'),
446 |     Hoa\Ustring\Ustring::getCharWidth('㽠')
447 | );
448 | 
449 | /**
450 |  * Will output:
451 |  *     int(-1)
452 |  *     int(1)
453 |  *     int(2)
454 |  */</code></pre>
455 |   <p>This method returns -1 or 0 if the character is not
456 |   <strong>printable</strong> (for instance, if this is a control character, like
457 |   <code>0x7f</code> which corresponds to <code>DELETE</code>), 1 or more if this
458 |   is a character that can be printed. In our example, <code>㽠</code> requires
459 |   2 columns to be printed.</p>
460 |   <p>To get more semantics, we have the
461 |   <code>Hoa\Ustring\Ustring::isCharPrintable</code> method, which tells us
462 |   whether a character is printable or not.</p>
463 |   <p>If we would like to count the number of columns necessary for a whole
464 |   string, we have to use the <code>Hoa\Ustring\Ustring::getWidth</code> method.
465 |   Thus:</p>
466 |   <pre><code class="language-php">var_dump(
467 |     $french->getWidth(),
468 |     $arabic->getWidth(),
469 |     $japanese->getWidth()
470 | );
471 | 
472 | /**
473 |  * Will output:
474 |  *     int(9)
475 |  *     int(4)
476 |  *     int(18)
477 |  */</code></pre>
478 |   <p>Try this in your terminal with a <strong>monospaced</strong> font. You will
479 |   observe that Japanese requires 18 columns to be printed. This measure is very
480 |   useful if we would like to know the length of a string to position it
481 |   efficiently.</p>
482 |   <p>The <code>getCharWidth</code> method differs from <code>getWidth</code>
483 |   because it includes control characters. This method is intended to be used,
484 |   for example, with terminals (please, see the
485 |   <a href="@hack:chapter=Console"><code>Hoa\Console</code> library</a>).</p>
486 |   <p>Finally, if this time we are not interested in Unicode characters but
487 |   rather by <strong>machine</strong> characters <code>char</code> (being
488 |   1 byte), we have an extra operation. The
489 |   <code>Hoa\Ustring\Ustring::getBytesLength</code> method will count the
490 |   <strong>length</strong> of the string in bytes:</p>
491 |   <pre><code class="language-php">var_dump(
492 |     $arabic->getBytesLength(),
493 |     $japanese->getBytesLength()
494 | );
495 | 
496 | /**
497 |  * Will output:
498 |  *     int(8)
499 |  *     int(27)
500 |  */</code></pre>
501 |   <p>If we compare these results with the ones of the
502 |   <code>Hoa\Ustring\Ustring::count</code> method, we understand that the Arabic
503 |   characters are encoded with 2 bytes whereas Japanese characters are encoded
504 |   with 3 bytes. We can also get a specific byte thanks to the
505 |   <code>Hoa\Ustring\Ustring::getByteAt</code> method. Once again, the index is
506 |   not bounded.</p>
507 | 
508 |   <h3 id="Code-point" for="main-toc">Code-point</h3>
509 | 
510 |   <p>Each character is represented by an integer, called a
511 |   <strong>code-point</strong>. To get the code-point of a character, we can
512 |   use the <code>Hoa\Ustring\Ustring::toCode</code> static method, and to get a
513 |   character based on its code-point, we can use the
514 |   <code>Hoa\Ustring\Ustring::fromCode</code> static method. We also have the
515 |   <code>Hoa\Ustring\Ustring::toBinaryCode</code> method which returns the binary
516 |   representation of a character. Let's take an example:</p>
517 |   <pre><code class="language-php">var_dump(
518 |     Hoa\Ustring\Ustring::toCode('Σ'),
519 |     Hoa\Ustring\Ustring::toBinaryCode('Σ'),
520 |     Hoa\Ustring\Ustring::fromCode(0x3a3)
521 | );
522 | 
523 | /**
524 |  * Will output:
525 |  *     int(931)
526 |  *     string(16) "1100111010100011"
527 |  *     string(2) "Σ"
528 |  */</code></pre>
529 | 
530 |   <h2 id="Search_algorithms" for="main-toc">Search algorithms</h2>
531 | 
532 |   <p>The <code>Hoa\Ustring</code> library provides sophisticated
533 |   <strong>search</strong> algorithms on strings through the
534 |   <code>Hoa\Ustring\Search</code> class.</p>
535 |   <p>We will study the <code>Hoa\Ustring\Search::approximated</code> algorithm
536 |   which searches a sub-string in a string up to <strong><em>k</em>
537 |   differences</strong> (a difference is an addition, a deletion or a
538 |   modification). Let's take the classical example of a DNA representation: We
539 |   will search all the sub-strings approximating <code>GATAA</code> with
540 |   1 difference (maximum) in <code>CAGATAAGAGAA</code>. So, we will write:</p>
541 |   <pre><code class="language-php">$x      = 'GATAA';
542 | $y      = 'CAGATAAGAGAA';
543 | $k      = 1;
544 | $search = Hoa\Ustring\Search::approximated($y, $x, $k);
545 | $n      = count($search);
546 | 
547 | echo 'Try to match ', $x, ' in ', $y, ' with at most ', $k, ' difference(s):', "\n";
548 | echo $n, ' match(es) found:', "\n";
549 | 
550 | foreach ($search as $position) {
551 |     echo '    • ', substr($y, $position['i'], $position['l']), "\n";
552 | }
553 | 
554 | /**
555 |  * Will output:
556 |  *     Try to match GATAA in CAGATAAGAGAA with at most 1 difference(s):
557 |  *     4 match(es) found:
558 |  *         • AGATA
559 |  *         • GATAA
560 |  *         • ATAAG
561 |  *         • GAGAA
562 |  */</code></pre>
563 |   <p>This method returns an array of arrays. Each sub-array represents a result
564 |   and contains three indexes: <code>i</code> for the position of the first
565 |   character (byte) of the result, <code>j</code> for the position of the last
566 |   character and <code>l</code> for the length of the result (simply
567 |   <code>j</code> - <code>i</code>). Thus, we can compute the results by using
568 |   our initial string (here <code class="language-php">$y</code>) and its
569 |   indexes.</p>
570 |   <p>With our example, we have four results. The first is <code>AGATA</code>,
571 |   being <code>GATA<em>A</em></code> with one moved character, and
572 |   <code>AGATA</code> exists in <code>C<em>AGATA</em>AGAGAA</code>.  The second
573 |   result is <code>GATAA</code>, our sub-string, which well and truly exists in
574 |   <code>CA<em>GATAA</em>GAGAA</code>. The third result is <code>ATAAG</code>,
575 |   being <code><em>G</em>ATAA</code> with one moved character, and
576 |   <code>ATAAG</code> exists in <code>CAG<em>ATAAG</em>AGAA</code>. Finally, the
577 |   last result is <code>GAGAA</code>, being <code>GA<em>T</em>AA</code> with one
578 |   modified character, and <code>GAGAA</code> exists in
579 |   <code>CAGATAA<em>GAGAA</em></code>.</p>
580 |   <p>Another example, more concrete this time. We will consider the
581 |   <code>--testIt --foobar --testThat --testAt</code> string (which represents
582 |   possible options of a command line), and we will search <code>--testot</code>,
583 |   an option that should have been given by the user. This option does not exist
584 |   as it is. We will then use our search algorithm with at most 1 difference.
585 |   Let's see:</p>
586 |   <pre><code class="language-php">$x      = 'testot';
587 | $y      = '--testIt --foobar --testThat --testAt';
588 | $k      = 1;
589 | $search = Hoa\Ustring\Search::approximated($y, $x, $k);
590 | $n      = count($search);
591 | 
592 | // …
593 | 
594 | /**
595 |  * Will output:
596 |  *     Try to match testot in --testIt --foobar --testThat --testAt with at most 1 difference(s):
597 |  *     2 match(es) found:
598 |  *         • testIt
599 |  *         • testAt
600 |  */</code></pre>
601 |   <p>The <code>testIt</code> and <code>testAt</code> results are true options,
602 |   so we can suggest them to the user. This is a mechanism used by
603 |   <code>Hoa\Console</code> to suggest corrections to the user in case of a
604 |   mistyping.</p>
605 | 
606 |   <h2 id="Conclusion" for="main-toc">Conclusion</h2>
607 | 
608 |   <p>The <code>Hoa\Ustring</code> library provides facilities to manipulate
609 |   strings encoded with the Unicode format, but also to perform sophisticated
610 |   searches on strings.</p>
611 | 
612 | </yield>
613 | </overlay>
614 | 


--------------------------------------------------------------------------------
/Documentation/Fr/Index.xyl:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="utf-8"?>
  2 | 
  3 | <overlay xmlns="http://hoa-project.net/xyl/xylophone">
  4 | <yield id="chapter">
  5 | 
  6 |   <p>Les chaînes de caractères peuvent parfois être <strong>complexes</strong>,
  7 |   particulièrement lorsqu'elles utilisent l'encodage <strong>Unicode</strong>.
  8 |   La bibliothèque <code>Hoa\Ustring</code> propose plusieurs opérations sur des
  9 |   chaînes de caractères UTF-8.</p>
 10 | 
 11 |   <h2 id="Table_of_contents">Table des matières</h2>
 12 | 
 13 |   <tableofcontents id="main-toc" />
 14 | 
 15 |   <h2 id="Introduction" for="main-toc">Introduction</h2>
 16 | 
 17 |   <p>Lorsque nous manipulons des chaînes de caractères, le format
 18 |   <a href="http://unicode.org/">Unicode</a> s'impose par sa
 19 |   <strong>compatibilité</strong> avec les formats de base historiques (comme
 20 |   ASCII) et par sa grande capacité à comprendre une très <strong>large</strong>
 21 |   plage de caractères et de symboles, pour toutes les cultures et toutes les
 22 |   régions de notre monde. PHP propose plusieurs outils pour manipuler de telles
 23 |   chaînes, comme les extensions
 24 |   <a href="http://php.net/mbstring"><code>mbstring</code></a>,
 25 |   <a href="http://php.net/iconv"><code>iconv</code></a> ou encore l'excellente
 26 |   <a href="http://php.net/intl"><code>intl</code></a> qui se base sur
 27 |   <a href="http://icu-project.org/">ICU</a>, l'implémentation de référence
 28 |   d'Unicode. Malheureusement, il faut parfois mélanger ces extensions pour
 29 |   arriver à nos fins et au prix d'une certaine <strong>complexité</strong> et
 30 |   d'une <strong>verbosité</strong> regrettable.</p>
 31 |   <p>La bibliothèque <code>Hoa\Ustring</code> répond à ces problématiques en
 32 |   proposant une façon <strong>simple</strong> de manipuler des chaînes de
 33 |   caractères, de manière <strong>performante</strong> et
 34 |   <strong>efficace</strong>. Elle propose également des algorithmes évolués pour
 35 |   des opérations de <strong>recherche</strong> sur des chaînes de
 36 |   caractères.</p>
 37 | 
 38 |   <h2 id="Unicode_strings" for="main-toc">Chaîne de caractères Unicode</h2>
 39 | 
 40 |   <p>La classe <code>Hoa\Ustring\Ustring</code> représente une chaîne de
 41 |   caractères Unicode <strong>UTF-8</strong> et permet de la manipuler
 42 |   facilement. Elle implémente les interfaces
 43 |   <a href="http://php.net/arrayaccess"><code>ArrayAccess</code></a>,
 44 |   <a href="http://php.net/countable"><code>Countable</code></a> et
 45 |   <a href="http://php.net/iteratoraggregate"><code>IteratorAggregate</code></a>.
 46 |   Nous allons utiliser trois exemples dans trois langues différentes : français,
 47 |   arabe et japonais. Ainsi :</p>
 48 |   <pre><code class="language-php">$french   = new Hoa\Ustring\Ustring('Je t\'aime');
 49 | $arabic   = new Hoa\Ustring\Ustring('أحبك');
 50 | $japanese = new Hoa\Ustring\Ustring('私はあなたを愛して');</code></pre>
 51 |   <p>Maintenant, voyons les opérations possibles sur ces trois chaînes.</p>
 52 | 
 53 |   <h3 id="String_manipulation" for="main-toc">Manipulation de la chaîne</h3>
 54 | 
 55 |   <p>Commençons par les opérations <strong>élémentaires</strong>. Si nous
 56 |   voulons <strong>compter</strong> le nombre de caractères (et non pas
 57 |   d'octets), nous allons utiliser <a href="http://php.net/count">la fonction
 58 |   <code>count</code></a> de PHP. Ainsi :</p>
 59 |   <pre><code class="language-php">var_dump(
 60 |     count($french),
 61 |     count($arabic),
 62 |     count($japanese)
 63 | );
 64 | 
 65 | /**
 66 |  * Will output:
 67 |  *     int(9)
 68 |  *     int(4)
 69 |  *     int(9)
 70 |  */</code></pre>
 71 |   <p>Quand nous parlons de position sur un texte, il n'est pas adéquat de parler
 72 |   de droite ou de gauche, mais plutôt de <strong>début</strong> ou de
 73 |   <strong>fin</strong>, et cela à partir de la <strong>direction</strong> (sens
 74 |   d'écriture) du texte. Nous pouvons connaître cette direction grâce à la
 75 |   méthode <code>Hoa\Ustring\Ustring::getDirection</code>. Elle retourne la
 76 |   valeur d'une des constantes suivantes :</p>
 77 |   <ul>
 78 |     <li><code>Hoa\Ustring\Ustring::LTR</code>, pour
 79 |     <em lang="en">left-to-right</em>, si le texte s'écrit de gauche à
 80 |     droite ;</li>
 81 |     <li><code>Hoa\Ustring\Ustring::RTL</code>, pour
 82 |     <em lang="en">right-to-left</em>, si le texte s'écrit de droite à
 83 |     gauche.</li>
 84 |   </ul>
 85 |   <p>Observons le résultat sur nos exemples :</p>
 86 |   <pre><code class="language-php">var_dump(
 87 |     $french->getDirection()   === Hoa\Ustring\Ustring::LTR, // is left-to-right?
 88 |     $arabic->getDirection()   === Hoa\Ustring\Ustring::RTL, // is right-to-left?
 89 |     $japanese->getDirection() === Hoa\Ustring\Ustring::LTR  // is left-to-right?
 90 | );
 91 | 
 92 | /**
 93 |  * Will output:
 94 |  *     bool(true)
 95 |  *     bool(true)
 96 |  *     bool(true)
 97 |  */</code></pre>
 98 |   <p>Le résultat de cette méthode est calculé grâce à la méthode statique
 99 |   <code>Hoa\Ustring\Ustring::getCharDirection</code> qui calcule la direction
100 |   d'un seul caractère.</p>
101 |   <p>Si nous voulons <strong>concaténer</strong> une autre chaîne à la fin ou au
102 |   début, nous utiliserons respectivement les méthodes
103 |   <code>Hoa\Ustring\Ustring::append</code> et
104 |   <code>Hoa\Ustring\Ustring::prepend</code>. Ces méthodes, comme la plupart de
105 |   celles qui modifient la chaîne, retournent l'objet lui-même, ce afin de
106 |   chaîner les appels. Par exemple :</p>
107 |   <pre><code class="language-php">echo $french->append('… et toi, m\'aimes-tu ?')->prepend('Mam\'zelle ! ');
108 | 
109 | /**
110 |  * Will output:
111 |  *     Mam'zelle ! Je t'aime… et toi, m'aimes-tu ?
112 |  */</code></pre>
113 |   <p>Nous avons également les méthodes
114 |   <code>Hoa\Ustring\Ustring::toLowerCase</code> et
115 |   <code>Hoa\Ustring\Ustring::toUpperCase</code> pour, respectivement, mettre la
116 |   chaîne en <strong>minuscules</strong> ou en <strong>majuscules</strong>. Par
117 |   exemple :</p>
118 |   <pre><code class="language-php">echo $french->toUpperCase();
119 | 
120 | /**
121 |  * Will output:
122 |  *     MAM'ZELLE ! JE T'AIME… ET TOI, M'AIMES-TU ?
123 |  */</code></pre>
124 |   <p>Nous pouvons aussi ajouter des caractères en début ou en fin de chaîne pour
125 |   atteindre une taille <strong>minimum</strong>. Cette opération est plus
126 |   couramment appelée le <em lang="en">padding</em> (pour des raisons historiques
127 |   remontant aux machines à écrire). C'est pourquoi nous trouvons la méthode
128 |   <code>Hoa\Ustring\Ustring::pad</code> qui prend trois arguments : la taille
129 |   minimum, les caractères à ajouter et une constante indiquant si nous devons
130 |   ajouter en fin ou en début de chaîne (respectivement
131 |   <code>Hoa\Ustring\Ustring::END</code>, par défaut, et
132 |   <code>Hoa\Ustring\Ustring::BEGINNING</code>).</p>
133 |   <pre><code class="language-php">echo $arabic->pad(20, ' ');
134 | 
135 | /**
136 |  * Will output:
137 |  *                     أحبك
138 |  */</code></pre>
139 |   <p>Une opération similairement inverse permet de supprimer, par défaut, les
140 |   <strong>espaces</strong> en début et en fin de chaîne grâce à la méthode
141 |   <code>Hoa\Ustring\Ustring::trim</code>. Par exemple, pour revenir à notre
142 |   chaîne arabe originale :</p>
143 |   <pre><code class="language-php">echo $arabic->trim();
144 | 
145 | /**
146 |  * Will output:
147 |  *     أحبك
148 |  */</code></pre>
149 |   <p>Si nous voulons supprimer d'autres caractères, nous pouvons utiliser son
150 |   premier argument qui doit être une expression régulière. Enfin, son second
151 |   argument permet de préciser de quel côté nous voulons supprimer les
152 |   caractères : en début, en fin ou les deux, toujours en utilisant les
153 |   constantes <code>Hoa\Ustring\Ustring::BEGINNING</code> et
154 |   <code>Hoa\Ustring\Ustring::END</code>.  Nous pouvons combiner ces constantes
155 |   pour exprimer « les deux côtés », ce qui est la valeur par défaut :
156 |   <code class="language-php">Hoa\Ustring\Ustring::BEGINNING |
157 |   Hoa\Ustring\Ustring::END</code>. Par exemple, pour supprimer tous les nombres
158 |   et les espaces uniquement à la fin, nous écrirons :</p>
159 |   <pre><code class="language-php">$arabic->trim('\s|\d', Hoa\Ustring\Ustring::END);</code></pre>
160 |   <p>Nous pouvons également <strong>réduire</strong> la chaîne à une
161 |   <strong>sous-chaîne</strong> en précisant la position du premier caractère
162 |   puis la taille de la sous-chaîne à la méthode
163 |   <code>Hoa\Ustring\Ustring::reduce</code> :</p>
164 |   <pre><code class="language-php">echo $french->reduce(3, 6)->reduce(2, 4);
165 | 
166 | /**
167 |  * Will output:
168 |  *     aime
169 |  */</code></pre>
170 |   <p>Si nous voulons obtenir un caractère en particulier, nous pouvons exploiter
171 |   l'interface <code>ArrayAccess</code>. Par exemple, pour obtenir le premier
172 |   caractère de chacun de nos exemples (en les reprenant depuis le début) :</p>
173 |   <pre><code class="language-php">var_dump(
174 |     $french[0],
175 |     $arabic[0],
176 |     $japanese[0]
177 | );
178 | 
179 | /**
180 |  * Will output:
181 |  *     string(1) "J"
182 |  *     string(2) "أ"
183 |  *     string(3) "私"
184 |  */</code></pre>
185 |   <p>Si nous voulons le dernier caractère, nous utiliserons l'index -1. L'index
186 |   n'est pas borné à la taille de la chaîne. Si jamais l'index dépasse cette
187 |   taille, alors un <em>modulo</em> sera appliqué.</p>
188 |   <p>Nous pouvons aussi modifier ou supprimer un caractère précis avec cette
189 |   méthode. Par exemple :</p>
190 |   <pre><code class="language-php">$french->append(' ?');
191 | $french[-1] = '!';
192 | echo $french;
193 | 
194 | /**
195 |  * Will output:
196 |  *     Je t'aime !
197 |  */</code></pre>
198 |   <p>Une autre méthode fort utile est la transformation en
199 |   <strong>ASCII</strong>. Attention, ce n'est pas toujours possible, selon votre
200 |   installation. Par exemple :</p>
201 |   <pre><code class="language-php">$title = new Hoa\Ustring\Ustring('Un été brûlant sur la côte');
202 | echo $title->toAscii();
203 | 
204 | /**
205 |  * Will output:
206 |  *     Un ete brulant sur la cote
207 |  */</code></pre>
208 |   <p>Nous pouvons aussi transformer de l'arabe ou du japonais vers de l'ASCII.
209 |   Les symboles, comme les symboles mathématiques ou les emojis, sont aussi
210 |   transformés :</p>
211 |   <pre><code class="language-php">$emoji = new Hoa\Ustring\Ustring('I ❤ Unicode');
212 | $maths = new Hoa\Ustring\Ustring('∀ i ∈ ℕ');
213 | 
214 | echo
215 |     $arabic->toAscii(), "\n",
216 |     $japanese->toAscii(), "\n",
217 |     $emoji->toAscii(), "\n",
218 |     $maths->toAscii(), "\n";
219 | 
220 | /**
221 |  * Will output:
222 |  *     ahbk
223 |  *     sihaanatawo aishite
224 |  *     I (heavy black heart)️ Unicode
225 |  *     (for all) i (element of) N
226 |  */</code></pre>
227 |   <p>Pour que cette méthode fonctionne correctement, il faut que l'extension
228 |   <a href="http://php.net/intl"><code>intl</code></a> soit présente, pour que la
229 |   classe <a href="http://php.net/transliterator"><code>Transliterator</code></a>
230 |   existe. Si elle n'existe pas, la classe
231 |   <a href="http://php.net/normalizer"><code>Normalizer</code></a> doit exister.
232 |   Si cette classe n'existe pas non plus, la méthode
233 |   <code>Hoa\Ustring\Ustring::toAscii</code> peut quand même essayer une
234 |   transformation mais moins efficace. Pour cela, il faut passer
235 |   <code>true</code> en seul argument. Ce tour de force est déconseillé dans la
236 |   plupart des cas.</p>
237 |   <p>Nous trouvons également la méthode <code>getTransliterator</code> qui
238 |   retourne un objet <code>Transliterator</code>, ou <code>null</code> si cette
239 |   classe n'existe pas. Cette méthode prend en argument un identifiant de
240 |   translitération. Nous conseillons de
241 |   <a href="http://userguide.icu-project.org/transforms/general">lire la
242 |   documentation sur le translitérateur d'ICU</a> pour comprendre cet
243 |   identifiant. La méthode <code>transliterate</code> permet de translitérer la
244 |   chaîne courante à partir d'un identifiant et d'un index de début et de
245 |   fin. Elle fonctionne de la même façon que la méthode
246 |   <a href="http://php.net/transliterator.transliterate"><code>Transliterator::transliterate</code></a>.</p>
247 | 
248 |   <p>Plus généralement, pour des changements d'<strong>encodage</strong> brut,
249 |   nous pouvons utiliser la méthode statique
250 |   <code>Hoa\Ustring\Ustring::transcode</code>, avec en premier argument une chaîne
251 |   de caractères, en deuxième argument l'encodage d'origine et en dernier
252 |   argument l'encodage final souhaité (par défaut UTF-8). Pour la liste des
253 |   encodages, il faut se reporter à l'extension
254 |   <a href="http://php.net/iconv"><code>iconv</code></a> ou entrer la commande
255 |   suivante dans un terminal :</p>
256 |   <pre><code class="language-php">$ iconv --list</code></pre>
257 |   <p>Pour savoir si une chaîne est encodée en UTF-8, nous pouvons utiliser la
258 |   méthode statique <code>Hoa\Ustring\Ustring::isUtf8</code> ; par exemple :</p>
259 |   <pre><code class="language-php">var_dump(
260 |     Hoa\Ustring\Ustring::isUtf8('a'),
261 |     Hoa\Ustring\Ustring::isUtf8(Hoa\Ustring\Ustring::transcode('a', 'UTF-8', 'UTF-16'))
262 | );
263 | 
264 | /**
265 |  * Will output:
266 |  *     bool(true)
267 |  *     bool(false)
268 |  */</code></pre>
269 |   <p>Nous pouvons <strong>éclater</strong> la chaîne en plusieurs sous-chaînes
270 |   en utilisant la méthode <code>Hoa\Ustring\Ustring::split</code>. En premier
271 |   argument, nous avons une expression régulière (type
272 |   <a href="http://pcre.org/">PCRE</a>), puis un entier représentant le nombre
273 |   maximum d'éléments à retourner et enfin une combinaison de constantes. Ces
274 |   constantes sont les mêmes que celles de
275 |   <a href="http://php.net/preg_split"><code>preg_split</code></a>.</p>
276 |   <p>Par défaut, le deuxième argument vaut -1, qui symbolise l'infini, et le
277 |   dernier argument vaut <code>PREG_SPLIT_NO_EMPTY</code>. Ainsi, si nous
278 |   voulons obtenir tous les mots d'une chaîne, nous écrirons :</p>
279 |   <pre><code class="language-php">print_r($title->split('#\b|\s#'));
280 | 
281 | /**
282 |  * Will output:
283 |  *     Array
284 |  *     (
285 |  *         [0] => Un
286 |  *         [1] => ete
287 |  *         [2] => brulant
288 |  *         [3] => sur
289 |  *         [4] => la
290 |  *         [5] => cote
291 |  *     )
292 |  */</code></pre>
293 |   <p>Si nous voulons <strong>itérer</strong> sur tous les
294 |   <strong>caractères</strong>, il est préférable d'exploiter l'interface
295 |   <code>IteratorAggregate</code>, soit la méthode
296 |   <code>Hoa\Ustring\Ustring::getIterator</code>. Voyons plutôt sur l'exemple en
297 |   arabe :</p>
298 |   <pre><code class="language-php">foreach ($arabic as $letter) {
299 |     echo $letter, "\n";
300 | }
301 | 
302 | /**
303 |  * Will output:
304 |  *     أ
305 |  *     ح
306 |  *     ب
307 |  *     ك
308 |  */</code></pre>
309 |   <p>Nous remarquons que l'itération se fait suivant la direction du texte,
310 |   c'est-à-dire que le premier élément de l'itération est la première lettre de
311 |   la chaîne en partant du début.</p>
312 |   <p>Bien sûr, si nous voulons obtenir un tableau des caractères, nous pouvons
313 |   utiliser la fonction
314 |   <a href="http://php.net/iterator_to_array"><code>iterator_to_array</code></a>
315 |   de PHP :</p>
316 |   <pre><code class="language-php">print_r(iterator_to_array($arabic));
317 | 
318 | /**
319 |  * Will output:
320 |  *     Array
321 |  *     (
322 |  *         [0] => أ
323 |  *         [1] => ح
324 |  *         [2] => ب
325 |  *         [3] => ك
326 |  *     )
327 |  */</code></pre>
328 | 
329 |   <h3 id="Comparison_and_search" for="main-toc">Comparaison et recherche</h3>
330 | 
331 |   <p>Les chaînes peuvent également être <strong>comparées</strong> entre elles
332 |   grâce à la méthode <code>Hoa\Ustring\Ustring::compare</code> :</p>
333 |   <pre><code class="language-php">$string = new Hoa\Ustring\Ustring('abc');
334 | var_dump(
335 |     $string->compare('wxyz')
336 | );
337 | 
338 | /**
339 |  * Will output:
340 |  *     int(-1)
341 |  */</code></pre>
342 |   <p>Cette méthode retourne -1 si la chaîne initiale vient avant (par ordre
343 |   alphabétique), 0 si elle est identique et 1 si elle vient après. Si nous
344 |   voulons utiliser la pleine
345 |   puissance du mécanisme sous-jacent, nous pouvons appeler la méthode statique
346 |   <code>Hoa\Ustring\Ustring::getCollator</code> (si la classe
347 |   <a href="http://php.net/Collator"><code>Collator</code></a> existe, sinon
348 |   <code>Hoa\Ustring\Ustring::compare</code> utilisera une comparaison simple
349 |   octet par octet sans tenir compte d'autres paramètres). Ainsi, si nous
350 |   voulons trier un tableau de chaînes, nous écrirons plutôt :</p>
351 |   <pre><code class="language-php">$strings = array('c', 'Σ', 'd', 'x', 'α', 'a');
352 | Hoa\Ustring\Ustring::getCollator()->sort($strings);
353 | print_r($strings);
354 | 
355 | /**
356 |  * Could output:
357 |  *     Array
358 |  *     (
359 |  *         [0] => a
360 |  *         [1] => c
361 |  *         [2] => d
362 |  *         [3] => x
363 |  *         [4] => α
364 |  *         [5] => Σ
365 |  *     )
366 |  */</code></pre>
367 |   <p>La comparaison entre deux chaînes dépend de la <strong>locale</strong>,
368 |   c'est-à-dire de la régionalisation du système, comme la langue, le pays, la
369 |   région etc. Nous pouvons utiliser <a href="@hack:chapter=Locale">la
370 |   bibliothèque <code>Hoa\Locale</code></a> pour modifier ces données, mais ce
371 |   n'est pas une dépendance de <code>Hoa\Ustring</code> pour autant.</p>
372 |   <p>Nous pouvons également savoir si une chaîne <strong>correspond</strong> à
373 |   un certain motif, toujours exprimé avec une expression régulière. Pour cela,
374 |   nous allons utiliser la méthode <code>Hoa\Ustring\Ustring::match</code>. Cette
375 |   méthode repose sur les fonctions
376 |   <a href="http://php.net/preg_match"><code>preg_match</code></a> et
377 |   <a href="http://php.net/preg_match_all"><code>preg_match_all</code></a> de
378 |   PHP, mais en modifiant les options du motif afin qu'il supporte Unicode. Nous
379 |   avons les paramètres suivants : le motif, une variable par référence pour
380 |   récupérer les captures, les <em lang="en">flags</em>, la position de début de
381 |   recherche (<em lang="en">offset</em>) et enfin un booléen indiquant si la
382 |   recherche est globale ou non (respectivement si nous devons utiliser
383 |   <code>preg_match_all</code> ou <code>preg_match</code>). Par défaut, la
384 |   recherche n'est pas globale.</p>
385 |   <p>Ainsi, nous allons vérifier que notre exemple en français contient bien
386 |   <code>aime</code> avec son complément d'objet direct :</p>
387 |   <pre><code class="language-php">$french->match('#(?:(?&amp;lt;direct_object>\w)[\'\b])aime#', $matches);
388 | var_dump($matches['direct_object']);
389 | 
390 | /**
391 |  * Will output:
392 |  *     string(1) "t"
393 |  */</code></pre>
394 |   <p>Cette méthode retourne <code>false</code> si une erreur est survenue (par
395 |   exemple si le motif n'est pas correct), 0 si aucune correspondance n'a été
396 |   trouvée, le nombre de correspondances trouvées sinon.</p>
397 |   <p>Similairement, nous pouvons <strong>chercher</strong> et
398 |   <strong>remplacer</strong> des sous-chaînes par d'autres sous-chaînes suivant
399 |   un motif, toujours exprimé avec une expression régulière. Pour cela, nous
400 |   allons utiliser la méthode <code>Hoa\Ustring\Ustring::replace</code>. Cette
401 |   méthode repose sur les fonctions
402 |   <a href="http://php.net/preg_replace"><code>preg_replace</code></a> et
403 |   <a href="http://php.net/preg_replace_callback"><code>preg_replace_callback</code></a>
404 |   de PHP, mais toujours en modifiant les options du motif afin qu'il supporte
405 |   Unicode. En premier argument, nous trouvons le ou les motifs, en deuxième
406 |   argument, le ou les remplacements et en dernier argument la limite de
407 |   remplacements à faire. Si le remplacement est un <em lang="en">callable</em>,
408 |   alors la fonction <code>preg_replace_callback</code> sera utilisée.</p>
409 |   <p>Ainsi, nous allons modifier notre exemple français pour qu'il soit plus
410 |   poli :</p>
411 |   <pre><code class="language-php">$french->replace('#(?:\w[\'\b])(?&amp;lt;verb>aime)#', function ($matches) {
412 |     return 'vous ' . $matches['verb'];
413 | });
414 | 
415 | echo $french;
416 | 
417 | /**
418 |  * Will output:
419 |  *     Je vous aime
420 |  */</code></pre>
421 |   <p>La classe <code>Hoa\Ustring\Ustring</code> propose des constantes qui sont
422 |   des alias de constantes PHP et qui permettent une meilleure lecture du
423 |   code :</p>
424 |   <ul>
425 |     <li><code>Hoa\Ustring\Ustring::WITHOUT_EMPTY</code>, alias de
426 |     <code>PREG_SPLIT_NO_EMPTY</code> ;</li>
427 |     <li><code>Hoa\Ustring\Ustring::WITH_DELIMITERS</code>, alias de
428 |     <code>PREG_SPLIT_DELIM_CAPTURE</code> ;</li>
429 |     <li><code>Hoa\Ustring\Ustring::WITH_OFFSET</code>, alias de
430 |     <code>PREG_OFFSET_CAPTURE</code> et
431 |     <code>PREG_SPLIT_OFFSET_CAPTURE</code> ;</li>
432 |     <li><code>Hoa\Ustring\Ustring::GROUP_BY_PATTERN</code>, alias de
433 |     <code>PREG_PATTERN_ORDER</code> ;</li>
434 |     <li><code>Hoa\Ustring\Ustring::GROUP_BY_TUPLE</code>, alias de
435 |     <code>PREG_SET_ORDER</code>.</li>
436 |   </ul>
437 |   <p>Comme ce sont des alias stricts, nous pouvons écrire :</p>
438 |   <pre><code class="language-php">$string = new Hoa\Ustring\Ustring('abc1 defg2 hikl3 xyz4');
439 | $string->match(
440 |     '#(\w+)(\d)#',
441 |     $matches,
442 |     Hoa\Ustring\Ustring::WITH_OFFSET
443 |   | Hoa\Ustring\Ustring::GROUP_BY_TUPLE,
444 |     0,
445 |     true
446 | );</code></pre>
447 | 
448 |   <h3 id="Characters" for="main-toc">Caractères</h3>
449 | 
450 |   <p>La classe <code>Hoa\Ustring\Ustring</code> offre des méthodes statiques
451 |   travaillant sur un seul caractère Unicode. Nous avons déjà évoqué la méthode
452 |   <code>getCharDirection</code> qui permet de connaître la
453 |   <strong>direction</strong> d'un caractère. Nous trouvons aussi
454 |   <code>getCharWidth</code> qui calcule le <strong>nombre de colonnes</strong>
455 |   nécessaires pour l'affichage d'un seul caractère. Ainsi :</p>
456 |   <pre><code class="language-php">var_dump(
457 |     Hoa\Ustring\Ustring::getCharWidth(Hoa\Ustring\Ustring::fromCode(0x7f)),
458 |     Hoa\Ustring\Ustring::getCharWidth('a'),
459 |     Hoa\Ustring\Ustring::getCharWidth('㽠')
460 | );
461 | 
462 | /**
463 |  * Will output:
464 |  *     int(-1)
465 |  *     int(1)
466 |  *     int(2)
467 |  */</code></pre>
468 |   <p>Cette méthode retourne -1 ou 0 si le caractère n'est pas
469 |   <strong>imprimable</strong> (par exemple si c'est un caractère de contrôle,
470 |   comme <code>0x7f</code> qui correspond à <code>DELETE</code>), 1 ou plus si
471 |   c'est un caractère qui peut être imprimé. Dans notre exemple, <code>㽠</code>
472 |   s'imprime sur 2 colonnes.</p>
473 |   <p>Pour plus de sémantique, nous avons accès à la méthode
474 |   <code>Hoa\Ustring\Ustring::isCharPrintable</code> qui permet de savoir si un
475 |   caractère est imprimable ou pas.</p>
476 |   <p>Si nous voulons calculer le nombre de colonnes pour toute une chaîne, il
477 |   faut utiliser la méthode <code>Hoa\Ustring\Ustring::getWidth</code>.
478 |   Ainsi :</p>
479 |   <pre><code class="language-php">var_dump(
480 |     $french->getWidth(),
481 |     $arabic->getWidth(),
482 |     $japanese->getWidth()
483 | );
484 | 
485 | /**
486 |  * Will output:
487 |  *     int(9)
488 |  *     int(4)
489 |  *     int(18)
490 |  */</code></pre>
491 |   <p>Essayez dans un terminal avec une police <strong>mono-espacée</strong>.
492 |   Vous verrez que le japonais demande 18 colonnes pour s'afficher. Cette mesure
493 |   est très utile si nous voulons connaître la largeur d'une chaîne pour la
494 |   positionner correctement.</p>
495 |   <p>La méthode <code>getCharWidth</code> est différente de
496 |   <code>getWidth</code> car elle prend en compte des caractères de contrôle.
497 |   Elle est destinée à être utilisée, par exemple, avec des terminaux (voir
498 |   <a href="@hack:chapter=Console">la bibliothèque
499 |   <code>Hoa\Console</code></a>).</p>
500 |   <p>Enfin, si cette fois nous ne nous intéressons pas aux caractères Unicode
501 |   mais aux caractères <strong>machines</strong> <code>char</code> (soit 1
502 |   octet), nous avons une opération supplémentaire. La méthode
503 |   <code>Hoa\Ustring\Ustring::getBytesLength</code> va compter la
504 |   <strong>taille</strong> de la chaîne en octets :</p>
505 |   <pre><code class="language-php">var_dump(
506 |     $arabic->getBytesLength(),
507 |     $japanese->getBytesLength()
508 | );
509 | 
510 | /**
511 |  * Will output:
512 |  *     int(8)
513 |  *     int(27)
514 |  */</code></pre>
515 |   <p>Si nous comparons ces résultats avec ceux de la méthode
516 |   <code>Hoa\Ustring\Ustring::count</code>, nous comprenons que les caractères
517 |   arabes sont encodés sur 2 octets alors que les caractères japonais sont
518 |   encodés sur 3 octets. Nous pouvons également obtenir un octet précis à l'aide
519 |   de la méthode <code>Hoa\Ustring\Ustring::getByteAt</code>. Encore une fois,
520 |   l'index n'est pas borné.</p>
521 | 
522 |   <h3 id="Code-point" for="main-toc">Code-point</h3>
523 | 
524 |   <p>Chaque caractère est représenté en machine par un entier, appelé
525 |   <strong>code-point</strong>. Pour obtenir le code-point d'un caractère, nous
526 |   pouvons utiliser la méthode statique <code>Hoa\Ustring\Ustring::toCode</code>,
527 |   et pour obtenir un caractère à partir d'un code, nous pouvons utiliser la
528 |   méthode statique <code>Hoa\Ustring\Ustring::fromCode</code>. Nous avons aussi
529 |   la méthode statique <code>Hoa\Ustring\Ustring::toBinaryCode</code> qui
530 |   retourne la représentation sous forme binaire d'un caractère. Prenons un
531 |   exemple :</p>
532 |   <pre><code class="language-php">var_dump(
533 |     Hoa\Ustring\Ustring::toCode('Σ'),
534 |     Hoa\Ustring\Ustring::toBinaryCode('Σ'),
535 |     Hoa\Ustring\Ustring::fromCode(0x3a3)
536 | );
537 | 
538 | /**
539 |  * Will output:
540 |  *     int(931)
541 |  *     string(16) "1100111010100011"
542 |  *     string(2) "Σ"
543 |  */</code></pre>
544 | 
545 |   <h2 id="Search_algorithms" for="main-toc">Algorithmes de recherche</h2>
546 | 
547 |   <p>La bibliothèque <code>Hoa\Ustring</code> propose des algorithmes de
548 |   <strong>recherches</strong> sophistiquées sur les chaînes de caractères à
549 |   travers la classe <code>Hoa\Ustring\Search</code>.</p>
550 |   <p>Nous allons étudier l'algorithme
551 |   <code>Hoa\Ustring\Search::approximated</code> qui fait une recherche d'une
552 |   sous-chaîne dans une chaîne avec au maximum <strong><em>k</em>
553 |   différences</strong> (une différence étant une insertion, une délétion ou une
554 |   modification). Prenons un exemple classique avec une représentation
555 |   ADN : nous allons chercher toutes les sous-chaînes s'approchant de
556 |   <code>GATAA</code> à 1 différence près (au maximum) dans
557 |   <code>CAGATAAGAGAA</code>. Pour cela, nous allons donc écrire :</p>
558 |   <pre><code class="language-php">$x      = 'GATAA';
559 | $y      = 'CAGATAAGAGAA';
560 | $k      = 1;
561 | $search = Hoa\Ustring\Search::approximated($y, $x, $k);
562 | $n      = count($search);
563 | 
564 | echo 'Try to match ', $x, ' in ', $y, ' with at most ', $k, ' difference(s):', "\n";
565 | echo $n, ' match(es) found:', "\n";
566 | 
567 | foreach ($search as $position) {
568 |     echo '    • ', substr($y, $position['i'], $position['l']), "\n";
569 | }
570 | 
571 | /**
572 |  * Will output:
573 |  *     Try to match GATAA in CAGATAAGAGAA with at most 1 difference(s):
574 |  *     4 match(es) found:
575 |  *         • AGATA
576 |  *         • GATAA
577 |  *         • ATAAG
578 |  *         • GAGAA
579 |  */</code></pre>
580 |   <p>Cette méthode retourne un tableau de tableaux. Chaque sous-tableau
581 |   représente un résultat et contient trois indexes : <code>i</code> pour la
582 |   position du premier caractère (octet) du résultat, <code>j</code> pour la
583 |   position du dernier caractère et <code>l</code> pour la taille du résultat
584 |   (tout simplement <code>j</code> - <code>i</code>).
585 |   Ainsi, nous pouvons calculer les résultats en utilisant notre chaîne initiale
586 |   (ici <code class="language-php">$y</code>) et ces indexes.</p>
587 |   <p>Avec notre exemple, nous avons quatre résultats. Le premier est
588 |   <code>AGATA</code>, soit <code>GATA<em>A</em></code> avec un caractère
589 |   déplacé, et <code>AGATA</code> existe bien dans
590 |   <code>C<em>AGATA</em>AGAGAA</code>. Le deuxième résultat est
591 |   <code>GATAA</code>, notre sous-chaîne, qui existe bel et bien dans
592 |   <code>CA<em>GATAA</em>GAGAA</code>. Le troisième résultat est
593 |   <code>ATAAG</code>, soit <code><em>G</em>ATAA</code> avec un caractère
594 |   déplacé, et <code>ATAAG</code> existe bien dans
595 |   <code>CAG<em>ATAAG</em>AGAA</code>. Enfin, le dernier résultat est
596 |   <code>GAGAA</code>, soit <code>GA<em>T</em>AA</code> avec un caractère
597 |   modifié, et <code>GAGAA</code> existe bien dans
598 |   <code>CAGATAA<em>GAGAA</em></code>.</p>
599 |   <p>Prenons un autre exemple, plus concret cette fois-ci. Nous allons
600 |   considérer la chaîne <code>--testIt --foobar --testThat --testAt</code> (qui
601 |   représente les options possibles d'une ligne de commande), et nous allons
602 |   chercher <code>--testot</code>, une option qu'aurait pu donner
603 |   l'utilisateur. Cette option n'existe pas telle quelle. Nous allons donc
604 |   utiliser notre algorithme de recherche avec 1 différence au maximum. Voyons
605 |   plutôt :</p>
606 |   <pre><code class="language-php">$x      = 'testot';
607 | $y      = '--testIt --foobar --testThat --testAt';
608 | $k      = 1;
609 | $search = Hoa\Ustring\Search::approximated($y, $x, $k);
610 | $n      = count($search);
611 | 
612 | // …
613 | 
614 | /**
615 |  * Will output:
616 |  *     Try to match testot in --testIt --foobar --testThat --testAt with at most 1 difference(s):
617 |  *     2 match(es) found:
618 |  *         • testIt
619 |  *         • testAt
620 |  */</code></pre>
621 |   <p>Les résultats <code>testIt</code> et <code>testAt</code> sont des vraies
622 |   options, donc nous pouvons les proposer à l'utilisateur. C'est un mécanisme
623 |   utilisé par <code>Hoa\Console</code> pour proposer des corrections à
624 |   l'utilisateur s'il se trompe.</p>
625 | 
626 |   <h2 id="Conclusion" for="main-toc">Conclusion</h2>
627 | 
628 |   <p>La bibliothèque <code>Hoa\Ustring</code> propose des facilités pour
629 |   manipuler des chaînes encodées au format Unicode, mais aussi pour effectuer
630 |   des recherches sophistiquées sur des chaînes.</p>
631 | 
632 | </yield>
633 | </overlay>
634 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <p align="center">
  2 |   <img src="https://static.hoa-project.net/Image/Hoa.svg" alt="Hoa" width="250px" />
  3 | </p>
  4 | 
  5 | ---
  6 | 
  7 | <p align="center">
  8 |   <a href="https://travis-ci.org/hoaproject/ustring"><img src="https://img.shields.io/travis/hoaproject/ustring/master.svg" alt="Build status" /></a>
  9 |   <a href="https://coveralls.io/github/hoaproject/ustring?branch=master"><img src="https://img.shields.io/coveralls/hoaproject/ustring/master.svg" alt="Code coverage" /></a>
 10 |   <a href="https://packagist.org/packages/hoa/ustring"><img src="https://img.shields.io/packagist/dt/hoa/ustring.svg" alt="Packagist" /></a>
 11 |   <a href="https://hoa-project.net/LICENSE"><img src="https://img.shields.io/packagist/l/hoa/ustring.svg" alt="License" /></a>
 12 | </p>
 13 | <p align="center">
 14 |   Hoa is a <strong>modular</strong>, <strong>extensible</strong> and
 15 |   <strong>structured</strong> set of PHP libraries.<br />
 16 |   Moreover, Hoa aims at being a bridge between industrial and research worlds.
 17 | </p>
 18 | 
 19 | # Hoa\Ustring
 20 | 
 21 | [![Help on IRC](https://img.shields.io/badge/help-%23hoaproject-ff0066.svg)](https://webchat.freenode.net/?channels=#hoaproject)
 22 | [![Help on Gitter](https://img.shields.io/badge/help-gitter-ff0066.svg)](https://gitter.im/hoaproject/central)
 23 | [![Documentation](https://img.shields.io/badge/documentation-hack_book-ff0066.svg)](https://central.hoa-project.net/Documentation/Library/Ustring)
 24 | [![Board](https://img.shields.io/badge/organisation-board-ff0066.svg)](https://waffle.io/hoaproject/ustring)
 25 | 
 26 | This library makes it easy to manipulate UTF-8 strings and provides some
 27 | search algorithms.
 28 | 
 29 | [Learn more](https://central.hoa-project.net/Documentation/Library/Ustring).
 30 | 
 31 | ## Installation
 32 | 
 33 | With [Composer](https://getcomposer.org/), to include this library into
 34 | your dependencies, you need to
 35 | require [`hoa/ustring`](https://packagist.org/packages/hoa/ustring):
 36 | 
 37 | ```sh
 38 | $ composer require hoa/ustring '~4.0'
 39 | ```
 40 | 
 41 | For more installation procedures, please read [the Source
 42 | page](https://hoa-project.net/Source.html).
 43 | 
 44 | ## Testing
 45 | 
 46 | Before running the test suites, the development dependencies must be installed:
 47 | 
 48 | ```sh
 49 | $ composer install
 50 | ```
 51 | 
 52 | Then, to run all the test suites:
 53 | 
 54 | ```sh
 55 | $ vendor/bin/hoa test:run
 56 | ```
 57 | 
 58 | For more information, please read the [contributor
 59 | guide](https://hoa-project.net/Literature/Contributor/Guide.html).
 60 | 
 61 | ## Quick usage
 62 | 
 63 | We propose a quick overview of two usages: manipulating UTF-8 strings and
 64 | using one search algorithm.
 65 | 
 66 | ### Natural UTF-8 strings manipulation
 67 | 
 68 | The `Hoa\Ustring\Ustring` class makes it easy to manipulate UTF-8 strings in a
 69 | very natural way. This class implements the `\ArrayAccess`, `\Countable` and
 70 | `\IteratorAggregate` interfaces. We will use the following examples:
 71 | 
 72 | ```php
 73 | $french   = new Hoa\Ustring\Ustring('Je t\'aime');
 74 | $arabic   = new Hoa\Ustring\Ustring('أحبك');
 75 | $japanese = new Hoa\Ustring\Ustring('私はあなたを愛して');
 76 | ```
 77 | 
 78 | To get the first character, we will do:
 79 | 
 80 | ```php
 81 | var_dump(
 82 |     $french[0],  // string(1) "J"
 83 |     $arabic[0],  // string(2) "أ"
 84 |     $japanese[0] // string(3) "私"
 85 | );
 86 | ```
 87 | 
 88 | And to get the last character, we will do `[-1]`. It supports unbounded (and
 89 | modulo) indexes.
 90 | 
 91 | We note that it cares about text **direction**. Look at `$arabic[0]`, it returns
 92 | `أ` and not `ك`. To get the direction, we can use the
 93 | `Hoa\Ustring\Ustring::getDirection` method (which calls the
 94 | `Hoa\Ustring\Ustring::getCharDirection` static method), it returns either
 95 | `Hoa\Ustring\Ustring::LTR` (`0`) or `Hoa\Ustring\Ustring::RTL` (`1`):
 96 | 
 97 | ```php
 98 | var_dump(
 99 |     $french->getDirection(),  // int(0)
100 |     $arabic->getDirection(),  // int(1)
101 |     $japanese->getDirection() // int(0)
102 | );
103 | ```
104 | 
105 | Text direction is also important for the `append`, `prepend`, `pad`… methods on
106 | `Hoa\Ustring\Ustring` for example. 
107 | 
108 | To get the length of a string, we can use the `count` function:
109 | 
110 | ```php
111 | var_dump(
112 |     count($french),  // int(9)
113 |     count($arabic),  // int(4)
114 |     count($japanese) // int(9)
115 | );
116 | ```
117 | 
118 | We are also able to iterate over the string:
119 | 
120 | ```php
121 | foreach ($arabic as $letter) {
122 |     var_dump($letter);
123 | }
124 | 
125 | /**
126 |  * Will output:
127 |  *     string(2) "أ"
128 |  *     string(2) "ح"
129 |  *     string(2) "ب"
130 |  *     string(2) "ك"
131 |  */
132 | ```
133 | 
134 | Again, text direction is useful here. For `$arabic`, the iteration is done from
135 | right to left.
136 | 
137 | Some static methods are helpful, such as `fromCode`, `toCode` or `isUtf8` on
138 | `Hoa\Ustring\Ustring`:
139 | 
140 | ```php
141 | var_dump(
142 |     Hoa\Ustring\Ustring::fromCode(0x1a9), // string(2) "Ʃ"
143 |     Hoa\Ustring\Ustring::toCode('Ʃ'),     // int(425) == 0x1a9
144 |     Hoa\Ustring\Ustring::isUtf8('Ʃ')      // bool(true)
145 | );
146 | ```
147 | 
148 | We can also transform any text into ASCII:
149 | 
150 | ```php
151 | $emoji = new Hoa\Ustring\Ustring('I ❤ Unicode');
152 | $maths = new Hoa\Ustring\Ustring('∀ i ∈ ℕ');
153 | 
154 | echo
155 |     $emoji->toAscii(), "\n",
156 |     $maths->toAscii(), "\n";
157 | 
158 | /**
159 |  * Will output:
160 |  *     I (heavy black heart) Unicode
161 |  *     (for all) i (element of) N
162 |  */
163 | ```
164 | 
165 | ### Search algorithm
166 | 
167 | The `Hoa\Ustring\Search` implements search algorithms on strings.
168 | 
169 | For example, the `Hoa\Ustring\Search::approximated` method performs a search
170 | by approximate patterns (with *k* differences, based upon the principle of
171 | diagonal monotony). If we search for the word `GATAA` in `CAGATAAGAGAA` with
172 | 1 difference, we will do:
173 | 
174 | ```php
175 | $search = Hoa\Ustring\Search::approximated(
176 |     $haystack = 'CAGATAAGAGAA',
177 |     $needle   = 'GATAA',
178 |     $k        = 1
179 | );
180 | $solutions = array();
181 | 
182 | foreach ($search as $pos) {
183 |     $solutions[] = substr($haystack, $pos['i'], $pos['l']);
184 | }
185 | ```
186 | 
187 | We will find `AGATA`, `GATAA`, `ATAAG` and `GAGAA`.
188 | 
189 | The result is not very handy but the algorithm is highly optimized and has
190 | many applications.
191 | 
192 | ## Documentation
193 | 
194 | The
195 | [hack book of `Hoa\Ustring`](https://central.hoa-project.net/Documentation/Library/Ustring) contains
196 | detailed information about how to use this library and how it works.
197 | 
198 | To generate the documentation locally, execute the following commands:
199 | 
200 | ```sh
201 | $ composer require --dev hoa/devtools
202 | $ vendor/bin/hoa devtools:documentation --open
203 | ```
204 | 
205 | More documentation can be found on the project's website:
206 | [hoa-project.net](https://hoa-project.net/).
207 | 
208 | ## Getting help
209 | 
210 | There are mainly two ways to get help:
211 | 
212 |   * On the [`#hoaproject`](https://webchat.freenode.net/?channels=#hoaproject)
213 |     IRC channel,
214 |   * On the forum at [users.hoa-project.net](https://users.hoa-project.net).
215 | 
216 | ## Contribution
217 | 
218 | Do you want to contribute? Thanks! A detailed [contributor
219 | guide](https://hoa-project.net/Literature/Contributor/Guide.html) explains
220 | everything you need to know.
221 | 
222 | ## License
223 | 
224 | Hoa is under the New BSD License (BSD-3-Clause). Please, see
225 | [`LICENSE`](https://hoa-project.net/LICENSE) for details.
226 | 


--------------------------------------------------------------------------------
/Source/Exception.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | declare(strict_types=1);
 4 | 
 5 | /**
 6 |  * Hoa
 7 |  *
 8 |  *
 9 |  * @license
10 |  *
11 |  * New BSD License
12 |  *
13 |  * Copyright © 2007-2017, Hoa community. All rights reserved.
14 |  *
15 |  * Redistribution and use in source and binary forms, with or without
16 |  * modification, are permitted provided that the following conditions are met:
17 |  *     * Redistributions of source code must retain the above copyright
18 |  *       notice, this list of conditions and the following disclaimer.
19 |  *     * Redistributions in binary form must reproduce the above copyright
20 |  *       notice, this list of conditions and the following disclaimer in the
21 |  *       documentation and/or other materials provided with the distribution.
22 |  *     * Neither the name of the Hoa nor the names of its contributors may be
23 |  *       used to endorse or promote products derived from this software without
24 |  *       specific prior written permission.
25 |  *
26 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS AND CONTRIBUTORS BE
30 |  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 |  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 |  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 |  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 |  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 |  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 |  * POSSIBILITY OF SUCH DAMAGE.
37 |  */
38 | 
39 | namespace Hoa\Ustring;
40 | 
41 | use Hoa\Exception as HoaException;
42 | 
43 | /**
44 |  * Library exception: an empty subclass of the class imported as `HoaException`.
45 |  */
46 | class Exception extends HoaException
47 | {
48 | }
49 | 


--------------------------------------------------------------------------------
/Source/Search.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | declare(strict_types=1);
  4 | 
  5 | /**
  6 |  * Hoa
  7 |  *
  8 |  *
  9 |  * @license
 10 |  *
 11 |  * New BSD License
 12 |  *
 13 |  * Copyright © 2007-2017, Hoa community. All rights reserved.
 14 |  *
 15 |  * Redistribution and use in source and binary forms, with or without
 16 |  * modification, are permitted provided that the following conditions are met:
 17 |  *     * Redistributions of source code must retain the above copyright
 18 |  *       notice, this list of conditions and the following disclaimer.
 19 |  *     * Redistributions in binary form must reproduce the above copyright
 20 |  *       notice, this list of conditions and the following disclaimer in the
 21 |  *       documentation and/or other materials provided with the distribution.
 22 |  *     * Neither the name of the Hoa nor the names of its contributors may be
 23 |  *       used to endorse or promote products derived from this software without
 24 |  *       specific prior written permission.
 25 |  *
 26 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 27 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 28 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 29 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS AND CONTRIBUTORS BE
 30 |  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 31 |  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 32 |  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 33 |  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 34 |  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 35 |  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 36 |  * POSSIBILITY OF SUCH DAMAGE.
 37 |  */
 38 | 
 39 | namespace Hoa\Ustring;
 40 | 
 41 | /**
 42 |  * Search algorithms on strings, operating on bytes (`strlen`, `substr`).
 43 |  */
 44 | class Search
 45 | {
 46 |     /**
 47 |      * Find the parts of `$y` matching `$x` with at most `$k` differences.
 48 |      * Returns a list of `['i' => …, 'j' => …, 'l' => j - i]` byte offsets.
 49 |      */
 50 |     public static function approximated(string $y, string $x, int $k): array
 51 |     {
 52 |         $m = strlen($x);
 53 |         $n = strlen($y);
 54 | 
 55 |         // No diagonal to explore: also avoids a negative `array_fill` count.
 56 |         if ($n - $m + $k < 0) {
 57 |             return [];
 58 |         }
 59 | 
 60 |         $offset = [];
 61 |         $L      = [-1 => array_fill(-1, $n - $m + $k + 3, -2)];
 62 | 
 63 |         for ($q = 0; $q < $k; ++$q) {
 64 |             $L[$q][-$q - 1] = $L[$q][-$q - 2] = $q - 1;
 65 |         }
 66 | 
 67 |         for ($q = 0; $q <= $k; ++$q) {
 68 |             for ($d = -$q, $max = $n - $m + $k - $q; $d <= $max; ++$d) {
 69 |                 $l = min(
 70 |                     max($L[$q - 1][$d - 1], $L[$q - 1][$d] + 1, $L[$q - 1][$d + 1] + 1),
 71 |                     $m - 1
 72 |                 );
 73 |                 // Cast: `substr` returns `false` before PHP 8 when out of range.
 74 |                 $a         = (string) substr($x, $l + 1, $m - $l);
 75 |                 $b         = (string) substr($y, $l + 1 + $d, $n - $l - $d);
 76 |                 $L[$q][$d] = $l + static::lcp($a, $b);
 77 | 
 78 |                 if ($L[$q][$d] === $m - 1 || $d + $L[$q][$d] === $n - 1) {
 79 |                     $j            = $m + $d;
 80 |                     $i            = max(0, $j - $m);
 81 |                     $offset[$q][] = ['i' => $i, 'j' => $j, 'l' => $j - $i];
 82 |                 }
 83 |             }
 84 |         }
 85 | 
 86 |         return $offset[$k] ?? [];
 87 |     }
 88 | 
 89 |     /**
 90 |      * Length, in bytes, of the longest common prefix of `$x` and `$y`.
 91 |      */
 92 |     public static function lcp(string $x, string $y): int
 93 |     {
 94 |         $max = min(strlen($x), strlen($y));
 95 |         $i   = 0;
 96 | 
 97 |         while ($i < $max && $x[$i] === $y[$i]) {
 98 |             ++$i;
 99 |         }
100 | 
101 |         return $i;
102 |     }
103 | }
104 | 


--------------------------------------------------------------------------------
/Source/Ustring.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | declare(strict_types=1);
  4 | 
  5 | /**
  6 |  * Hoa
  7 |  *
  8 |  *
  9 |  * @license
 10 |  *
 11 |  * New BSD License
 12 |  *
 13 |  * Copyright © 2007-2017, Hoa community. All rights reserved.
 14 |  *
 15 |  * Redistribution and use in source and binary forms, with or without
 16 |  * modification, are permitted provided that the following conditions are met:
 17 |  *     * Redistributions of source code must retain the above copyright
 18 |  *       notice, this list of conditions and the following disclaimer.
 19 |  *     * Redistributions in binary form must reproduce the above copyright
 20 |  *       notice, this list of conditions and the following disclaimer in the
 21 |  *       documentation and/or other materials provided with the distribution.
 22 |  *     * Neither the name of the Hoa nor the names of its contributors may be
 23 |  *       used to endorse or promote products derived from this software without
 24 |  *       specific prior written permission.
 25 |  *
 26 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 27 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 28 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 29 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS AND CONTRIBUTORS BE
 30 |  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 31 |  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 32 |  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 33 |  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 34 |  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 35 |  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 36 |  * POSSIBILITY OF SUCH DAMAGE.
 37 |  */
 38 | 
 39 | namespace Hoa\Ustring;
 40 | 
 41 | use ArrayIterator;
 42 | use Collator;
 43 | use Hoa\Consistency;
 44 | use Transliterator;
 45 | 
 46 | /**
 47 |  * This class represents a UTF-8 string.
 48 |  * Please, see:
 49 |  *   * http://www.ietf.org/rfc/rfc3454.txt,
 50 |  *   * http://unicode.org/reports/tr9/,
 51 |  *   * http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt.
 52 |  */
 53 | class Ustring implements \ArrayAccess, \Countable, \IteratorAggregate
 54 | {
 55 |     /**
 56 |      * Left-To-Right.
 57 |      */
 58 |     public const LTR              = 0;
 59 | 
 60 |     /**
 61 |      * Right-To-Left.
 62 |      */
 63 |     public const RTL              = 1;
 64 | 
 65 |     /**
 66 |      * ZERO WIDTH NO-BREAK SPACE (ZWNBSP, aka byte-order mark, BOM).
 67 |      */
 68 |     public const BOM              = 0xfeff;
 69 | 
 70 |     /**
 71 |      * LEFT-TO-RIGHT MARK.
 72 |      */
 73 |     public const LRM              = 0x200e;
 74 | 
 75 |     /**
 76 |      * RIGHT-TO-LEFT MARK.
 77 |      */
 78 |     public const RLM              = 0x200f;
 79 | 
 80 |     /**
 81 |      * LEFT-TO-RIGHT EMBEDDING.
 82 |      */
 83 |     public const LRE              = 0x202a;
 84 | 
 85 |     /**
 86 |      * RIGHT-TO-LEFT EMBEDDING.
 87 |      */
 88 |     public const RLE              = 0x202b;
 89 | 
 90 |     /**
 91 |      * POP DIRECTIONAL FORMATTING.
 92 |      */
 93 |     public const PDF              = 0x202c;
 94 | 
 95 |     /**
 96 |      * LEFT-TO-RIGHT OVERRIDE.
 97 |      */
 98 |     public const LRO              = 0x202d;
 99 | 
100 |     /**
101 |      * RIGHT-TO-LEFT OVERRIDE.
102 |      */
103 |     public const RLO              = 0x202e;
104 | 
105 |     /**
106 |      * Represent the beginning of the string.
107 |      */
108 |     public const BEGINNING        = 1;
109 | 
110 |     /**
111 |      * Represent the end of the string.
112 |      */
113 |     public const END              = 2;
114 | 
115 |     /**
116 |      * Split: only non-empty pieces are returned.
117 |      */
118 |     public const WITHOUT_EMPTY    = PREG_SPLIT_NO_EMPTY;
119 | 
120 |     /**
121 |      * Split: parenthesized expression in the delimiter pattern will be captured
122 |      * and returned.
123 |      */
124 |     public const WITH_DELIMITERS  = PREG_SPLIT_DELIM_CAPTURE;
125 | 
126 |     /**
127 |      * Split: offsets of captures will be returned.
128 |      */
129 |     public const WITH_OFFSET      = PREG_OFFSET_CAPTURE | PREG_SPLIT_OFFSET_CAPTURE;
130 | 
131 |     /**
132 |      * Group results by patterns.
133 |      */
134 |     public const GROUP_BY_PATTERN = PREG_PATTERN_ORDER;
135 | 
136 |     /**
137 |      * Group results by tuple (set of patterns).
138 |      */
139 |     public const GROUP_BY_TUPLE   = PREG_SET_ORDER;
140 | 
141 |     /**
142 |      * Current string.
143 |      */
144 |     protected $_string          = null;
145 | 
146 |     /**
147 |      * Direction. Please see self::LTR and self::RTL constants.
148 |      */
149 |     protected $_direction       = null;
150 | 
151 |     /**
152 |      * Collator.
153 |      */
154 |     protected static $_collator = null;
155 | 
156 | 
157 | 
158 |     /**
159 |      * Construct a UTF-8 string, optionally initialised with `$string`.
160 |      * A `null` argument leaves the internal string untouched.
161 |      */
162 |     public function __construct(?string $string = null)
163 |     {
164 |         // Explicit `?string`: implicit nullability is deprecated in PHP 8.4.
165 |         if (null !== $string) {
166 |             $this->append($string);
167 |         }
168 |     }
169 | 
170 |     /**
171 |      * Tell whether ext/mbstring is there, by probing for mb_substr().
172 |      */
173 |     public static function checkMbString(): bool
174 |     {
175 |         return true === function_exists('mb_substr');
176 |     }
177 | 
178 |     /**
179 |      * Tell whether ext/iconv is there, by probing for iconv().
180 |      */
181 |     public static function checkIconv(): bool
182 |     {
183 |         return true === function_exists('iconv');
184 |     }
185 | 
186 |     /**
187 |      * Append a substring to the current string, i.e. add to the end.
188 |      */
189 |     public function append(string $substring): self
190 |     {
191 |         $this->_string   .= $substring;
192 |         $this->_direction = null; // the first char changes if it was empty.
193 |         return $this;
194 |     }
195 | 
196 |     /**
197 |      * Prepend a substring to the current string, i.e. add to the start.
198 |      */
199 |     public function prepend(string $substring): self
200 |     {
201 |         $this->_string    = $substring . $this->_string;
202 |         $this->_direction = null; // the first character has changed.
203 |         return $this;
204 |     }
205 | 
206 |     /**
207 |      * Pad the current string up to $length characters by repeating $piece,
208 |      * at the end when $side is self::END, at the beginning otherwise. The
209 |      * string is untouched when already long enough or when $piece is empty.
210 |      */
211 |     public function pad(int $length, string $piece, int $side = self::END): self
212 |     {
213 |         $difference  = $length - $this->count();
214 |         $pieceLength = mb_strlen($piece);
215 | 
216 |         // Nothing to add, or nothing to add it with (avoids a division by 0).
217 |         if (0 >= $difference || 0 === $pieceLength) {
218 |             return $this;
219 |         }
220 | 
221 |         // Repeat the piece enough times, then cut to the exact number of
222 |         // characters (not bytes) that are missing.
223 |         $repeat = (int) ceil($difference / $pieceLength);
224 |         $handle = mb_substr(str_repeat($piece, $repeat), 0, $difference);
225 | 
226 |         return static::END === $side
227 |             ? $this->append($handle)
228 |             : $this->prepend($handle);
229 |     }
230 | 
231 |     /**
232 |      * Compare with $string: < 0 if the current string is less, > 0 if it is
233 |      * greater, 0 if equal. Falls back to strcmp() without a usable collator.
234 |      */
235 |     public function compare($string): int
236 |     {
237 |         $left     = (string) $this->_string;
238 |         $right    = (string) $string; // may be any value castable to string.
239 |         $collator = static::getCollator();
240 |         $result   = null === $collator ? false : $collator->compare($left, $right);
241 | 
242 |         return false === $result ? strcmp($left, $right) : $result;
243 |     }
244 | 
245 |     /**
246 |      * Get collator: one instance, lazily built and shared by all strings.
247 |      */
248 |     public static function getCollator(): ?Collator
249 |     {
250 |         if (false === class_exists('Collator')) {
251 |             return null;
252 |         }
253 | 
254 |         if (null !== static::$_collator) {
255 |             return static::$_collator;
256 |         }
257 | 
258 |         return static::$_collator = new Collator(setlocale(LC_COLLATE, null));
259 |     }
260 | 
261 |     /**
262 |      * Ensure that the pattern is safe for Unicode: add the “u” option.
263 |      * Bracket-style delimiters (“(…)”, “[…]”, “{…}”, “<…>”) are supported.
264 |      */
265 |     public static function safePattern(string $pattern): string
266 |     {
267 |         $opening  = mb_substr($pattern, 0, 1);
268 |         $closing  = ['(' => ')', '[' => ']', '{' => '}', '<' => '>'][$opening] ?? $opening;
269 |         $position = '' === $closing ? false : mb_strrpos($pattern, $closing);
270 |         // The options are whatever follows the last closing delimiter.
271 |         $options  = false === $position ? '' : mb_substr($pattern, $position + 1);
272 |         if (false === strpos($options, 'u')) {
273 |             $pattern .= 'u';
274 |         }
275 | 
276 |         return $pattern;
277 |     }
278 | 
279 |     /**
280 |      * Perform a regular expression (PCRE) match.
281 |      * $matches is filled by reference; $offset is expressed in characters,
282 |      * not bytes. When $global is true, every match is collected
283 |      * (preg_match_all), else only the first one (preg_match). Return the
284 |      * number of matches found.
285 |      */
286 |     public function match(
287 |         string $pattern,
288 |         ?array &$matches = null,
289 |         int $flags       = 0,
290 |         int $offset      = 0,
291 |         bool $global     = false
292 |     ): int {
293 |         $pattern = static::safePattern($pattern);
294 |         $string  = (string) $this->_string; // null as long as never filled.
295 | 
296 |         if (0 === $flags) {
297 |             if (true === $global) {
298 |                 $flags = static::GROUP_BY_PATTERN;
299 |             }
300 |         } else {
301 |             // self::WITH_OFFSET carries a split-only bit: drop it here.
302 |             $flags &= ~PREG_SPLIT_OFFSET_CAPTURE;
303 |         }
304 | 
305 |         // Convert the offset from characters to bytes, as PCRE expects.
306 |         $offset = strlen(mb_substr($string, 0, $offset));
307 | 
308 |         if (true === $global) {
309 |             return preg_match_all($pattern, $string, $matches, $flags, $offset);
310 |         }
311 | 
312 |         return preg_match($pattern, $string, $matches, $flags, $offset);
313 |     }
314 | 
315 |     /**
316 |      * Perform a regular expression (PCRE) search and replace.
317 |      * $pattern is a pattern or an array of patterns; $replacement is either a
318 |      * replacement or a callable receiving the matches.
319 |      * NOTE(review): a replacement string naming a function (e.g. “trim”) is
320 |      * callable, hence it is run as a callback and not used literally.
321 |      */
322 |     public function replace($pattern, $replacement, int $limit = -1): self
323 |     {
324 |         // Every pattern gets the “u” option, whether one or many are given.
325 |         $pattern =
326 |             is_array($pattern)
327 |                 ? array_map([static::class, 'safePattern'], $pattern)
328 |                 : static::safePattern($pattern);
329 |         $replace =
330 |             is_callable($replacement)
331 |                 ? 'preg_replace_callback'
332 |                 : 'preg_replace';
333 | 
334 |         // A never-filled Ustring holds null, which must not reach preg_*.
335 |         $subject          = (string) $this->_string;
336 |         $this->_string    = $replace($pattern, $replacement, $subject, $limit);
337 |         $this->_direction = null; // the first character may have changed.
338 |         return $this;
339 |     }
340 | 
341 |     /**
342 |      * Split the current string according to a given pattern (PCRE).
343 |      * $limit and $flags are given to preg_split() as is; see the
344 |      * self::WITHOUT_EMPTY, self::WITH_DELIMITERS and self::WITH_OFFSET flags.
345 |      */
346 |     public function split(
347 |         string $pattern,
348 |         int $limit = -1,
349 |         int $flags = self::WITHOUT_EMPTY
350 |     ): array {
351 |         // A never-filled Ustring holds null: split it as the empty string.
352 |         $string = (string) $this->_string;
353 | 
354 |         return preg_split(static::safePattern($pattern), $string, $limit, $flags);
355 |     }
356 | 
357 |     /**
358 |      * Iterator over chars; a final "\n" is a char of its own, '' has none.
359 |      */
360 |     public function getIterator(): ArrayIterator
361 |     {
362 |         return new ArrayIterator(preg_split('##u', (string) $this->_string, -1, PREG_SPLIT_NO_EMPTY));
363 |     }
364 | 
365 |     /**
366 |      * Perform a lowercase folding on the current string.
367 |      */
368 |     public function toLowerCase(): self
369 |     {
370 |         // Cast: a never-filled Ustring holds null, refused in strict mode.
371 |         $this->_string = mb_strtolower((string) $this->_string);
372 |         return $this;
373 |     }
374 | 
375 |     /**
376 |      * Perform an uppercase folding on the current string.
377 |      */
378 |     public function toUpperCase(): self
379 |     {
380 |         // Cast: a never-filled Ustring holds null, which mb_* functions
381 |         // refuse in strict mode.
382 |         $this->_string = mb_strtoupper((string) $this->_string);
383 | 
384 |         return $this;
385 |     }
386 | 
387 |     /**
388 |      * Transform a UTF-8 string into an ASCII one. Try a transliterator first,
389 |      * else a normalizer (NFKD, marks dropped), else (only when $try is true;
390 |      * an exception is thrown otherwise) a homemade iconv-based fallback.
391 |      */
392 |     public function toAscii(bool $try = false): self
393 |     {
394 |         if (0 === preg_match('#[\x80-\xff]#', $this->_string)) { // no byte above 0x7f.
395 |             return $this;
396 |         }
397 | 
398 |         $string  = $this->_string;
399 |         $transId =
400 |             'Any-Latin; ' .
401 |             '[\p{S}] Name; ' .
402 |             'Latin-ASCII';
403 | 
404 |         if (null !== $transliterator = static::getTransliterator($transId)) {
405 |             $this->_string = preg_replace_callback(
406 |                 '#\\\N\{([A-Z ]+)\}#u',
407 |                 function (array $matches) {
408 |                     return '(' . strtolower($matches[1]) . ')'; // \N{NAME} becomes (name).
409 |                 },
410 |                 $transliterator->transliterate($string)
411 |             );
412 | 
413 |             return $this;
414 |         }
415 | 
416 |         if (false === class_exists('Normalizer')) {
417 |             if (false === $try) {
418 |                 throw new Exception(
419 |                     '%s needs the class Normalizer to work properly, ' .
420 |                     'or you can force a try by using %1$s(true).',
421 |                     0,
422 |                     __METHOD__
423 |                 );
424 |             }
425 | 
426 |             $string        = static::transcode($string, 'UTF-8', 'ASCII//IGNORE//TRANSLIT');
427 |             $this->_string = preg_replace('#(?:[\'"`^](\w))#u', '\1', $string); // drop ', ", ` or ^ before a word char.
428 | 
429 |             return $this;
430 |         }
431 | 
432 |         $string        = \Normalizer::normalize($string, \Normalizer::NFKD);
433 |         $string        = preg_replace('#\p{Mn}+#u', '', $string); // strip the non-spacing marks.
434 |         $this->_string = static::transcode($string, 'UTF-8', 'ASCII//IGNORE//TRANSLIT');
435 | 
436 |         return $this;
437 |     }
438 | 
439 |     /**
440 |      * Transliterate the string into another.
441 |      * See self::getTransliterator for more information. $start and $end bound
442 |      * the transformed range; a null $end means “until the end of the string”.
443 |      */
444 |     public function transliterate(string $identifier, int $start = 0, ?int $end = null): self
445 |     {
446 |         if (null === $transliterator = static::getTransliterator($identifier)) {
447 |             throw new Exception('%s needs the class Transliterator to work properly.', 1, __METHOD__);
448 |         }
449 | 
450 |         // ICU expects -1, not null, to mean “until the end”; in strict mode a
451 |         // null $end (or a never-filled, null string) raises a TypeError.
452 |         $string           = (string) $this->_string;
453 |         $this->_string    = $transliterator->transliterate($string, $start, $end ?? -1);
454 |         $this->_direction = null; // the first character may have changed.
455 |         return $this;
456 |     }
457 | 
458 |     /**
459 |      * Create a transliterator; null if the Transliterator class is missing.
460 |      * See http://userguide.icu-project.org/transforms/general for $identifier.
461 |      */
462 |     public static function getTransliterator(string $identifier): ?Transliterator
463 |     {
464 |         if (true === class_exists('Transliterator')) {
465 |             return Transliterator::create($identifier);
466 |         }
467 | 
468 |         return null;
469 |     }
470 | 
471 |     /**
472 |      * Strip characters (default \s) of the current string.
473 |      * $regex is a PCRE fragment; $side is a mask of self::BEGINNING and
474 |      * self::END. The cached direction is reset.
475 |      */
476 |     public function trim(string $regex = '\s', int $side = self::BEGINNING | self::END): self
477 |     {
478 |         $regex = '(?:' . $regex . ')+';
479 |         $parts = [];
480 | 
481 |         if (0 !== ($side & static::BEGINNING)) {
482 |             $parts[] = '(^' . $regex . ')';
483 |         }
484 | 
485 |         if (0 !== ($side & static::END)) {
486 |             $parts[] = '(' . $regex . '$)';
487 |         }
488 | 
489 |         // “D” anchors “$” at the very end, not also before a final newline.
490 |         $pattern          = '#' . implode('|', $parts) . '#uD';
491 |         $this->_string    = preg_replace($pattern, '', (string) $this->_string);
492 |         $this->_direction = null;
493 | 
494 |         return $this;
495 |     }
496 | 
497 |     /**
498 |      * Compute offset (negative, unbound etc.).
499 |      * Offsets wrap around the string: -1 is the last character and an offset
500 |      * equal to the length is the first one. An empty string always gives 0.
501 |      */
502 |     protected function computeOffset(int $offset): int
503 |     {
504 |         $length = mb_strlen((string) $this->_string);
505 | 
506 |         // Guard against a modulo by zero (DivisionByZeroError) when empty.
507 |         if (0 === $length) {
508 |             return 0;
509 |         }
510 | 
511 |         // PHP keeps the sign of the dividend: shift negative remainders back
512 |         // into [0, $length).
513 |         $offset %= $length;
514 |         return 0 > $offset ? $offset + $length : $offset;
515 |     }
516 | 
517 |     /**
518 |      * Get the single character at $offset, as resolved by computeOffset().
519 |      */
520 |     public function offsetGet($offset): string
521 |     {
522 |         return mb_substr($this->_string, $this->computeOffset($offset), 1);
523 |     }
524 | 
525 |     /**
526 |      * Set a specific character of the current string.
527 |      * A null offset (i.e. “$string[] = …”) appends $value.
528 |      */
529 |     public function offsetSet($offset, $value): self
530 |     {
531 |         $this->_direction = null;
532 | 
533 |         if (null === $offset) {
534 |             return $this->append((string) $value);
535 |         }
536 | 
537 |         $offset        = $this->computeOffset($offset);
538 |         $head          = mb_substr($this->_string, 0, $offset);
539 |         $tail          = mb_substr($this->_string, $offset + 1);
540 |         $this->_string = $head . $value . $tail;
541 |         return $this;
542 |     }
543 | 
544 |     /**
545 |      * Delete the character at $offset, by setting it to null (i.e. nothing).
546 |      */
547 |     public function offsetUnset($offset): void
548 |     {
549 |         $this->offsetSet($offset, null);
550 |     }
551 | 
552 |     /**
553 |      * Check if a specific offset exists: always true, whatever the offset.
554 |      */
555 |     public function offsetExists($offset): bool
556 |     {
557 |         return true;
558 |     }
559 | 
560 |     /**
561 |      * Reduce the string to $length characters from $start (null: to the end).
562 |      */
563 |     public function reduce(int $start, ?int $length = null): self
564 |     {
565 |         $this->_string    = mb_substr((string) $this->_string, $start, $length);
566 |         $this->_direction = null; // the first character may have changed.
567 |         return $this;
568 |     }
569 | 
570 |     /**
571 |      * Count number of characters of the current string (0 if never filled).
572 |      */
573 |     public function count(): int
574 |     {
575 |         return mb_strlen((string) $this->_string);
576 |     }
577 | 
578 |     /**
579 |      * Get byte (not character) at a specific offset.
580 |      * Offsets wrap around, e.g. -1 is the last byte. An empty string has no
581 |      * byte to give: the empty string is returned.
582 |      */
583 |     public function getByteAt(int $offset): string
584 |     {
585 |         $string = (string) $this->_string;
586 |         $length = strlen($string);
587 | 
588 |         // Guard against a modulo by zero (DivisionByZeroError) when empty.
589 |         if (0 === $length) {
590 |             return '';
591 |         }
592 | 
593 |         // PHP keeps the sign of the dividend: shift negative remainders.
594 |         $offset %= $length;
595 |         return $string[0 > $offset ? $offset + $length : $offset];
596 |     }
597 | 
598 |     /**
599 |      * Count number of bytes (not characters) of the current string.
600 |      */
601 |     public function getBytesLength(): int
602 |     {
603 |         return strlen((string) $this->_string);
604 |     }
605 | 
606 |     /**
607 |      * Get the width of the current string.
608 |      * Useful when printing the string in monotype (some characters need more
609 |      * than one column to be printed).
610 |      */
611 |     public function getWidth(): int
612 |     {
613 |         return mb_strwidth((string) $this->_string);
614 |     }
615 | 
616 |     /**
617 |      * Get direction of the current string.
618 |      * Please, see the self::LTR and self::RTL constants.
619 |      * It does not yet support embedding directions: only the first character
620 |      * is inspected. An empty string is left-to-right.
621 |      */
622 |     public function getDirection(): int
623 |     {
624 |         if (null !== $this->_direction) {
625 |             return $this->_direction;
626 |         }
627 | 
628 |         // No first character to inspect (null or ''): default to LTR rather
629 |         // than handing an empty character to getCharDirection().
630 |         $first            = mb_substr((string) $this->_string, 0, 1);
631 |         $this->_direction = '' === $first ? static::LTR : static::getCharDirection($first);
632 | 
633 |         return $this->_direction;
634 |     }
635 | 
636 |     /**
637 |      * Get direction of a specific character (its code is tested against ranges).
638 |      * Please, see the self::LTR and self::RTL constants.
639 |      */
640 |     public static function getCharDirection(string $char): int
641 |     {
642 |         $c = static::toCode($char);
643 | 
644 |         if (!(0x5be <= $c && 0x10b7f >= $c)) { // outside of all the ranges below.
645 |             return static::LTR;
646 |         }
647 | 
648 |         if (0x85e >= $c) {
649 |             if (0x5be === $c ||
650 |                 0x5c0 === $c ||
651 |                 0x5c3 === $c ||
652 |                 0x5c6 === $c ||
653 |                 (0x5d0 <= $c && 0x5ea >= $c) ||
654 |                 (0x5f0 <= $c && 0x5f4 >= $c) ||
655 |                 0x608 === $c ||
656 |                 0x60b === $c ||
657 |                 0x60d === $c ||
658 |                 0x61b === $c ||
659 |                 (0x61e <= $c && 0x64a >= $c) ||
660 |                 (0x66d <= $c && 0x66f >= $c) ||
661 |                 (0x671 <= $c && 0x6d5 >= $c) ||
662 |                 (0x6e5 <= $c && 0x6e6 >= $c) ||
663 |                 (0x6ee <= $c && 0x6ef >= $c) ||
664 |                 (0x6fa <= $c && 0x70d >= $c) ||
665 |                 0x710 === $c ||
666 |                 (0x712 <= $c && 0x72f >= $c) ||
667 |                 (0x74d <= $c && 0x7a5 >= $c) ||
668 |                 0x7b1 === $c ||
669 |                 (0x7c0 <= $c && 0x7ea >= $c) ||
670 |                 (0x7f4 <= $c && 0x7f5 >= $c) ||
671 |                 0x7fa === $c ||
672 |                 (0x800 <= $c && 0x815 >= $c) ||
673 |                 0x81a === $c ||
674 |                 0x824 === $c ||
675 |                 0x828 === $c ||
676 |                 (0x830 <= $c && 0x83e >= $c) ||
677 |                 (0x840 <= $c && 0x858 >= $c) ||
678 |                 0x85e === $c) {
679 |                 return static::RTL;
680 |             }
681 |         } elseif (0x200f === $c) {
682 |             return static::RTL;
683 |         } elseif (0xfb1d <= $c) {
684 |             if (0xfb1d === $c ||
685 |                 (0xfb1f <= $c && 0xfb28 >= $c) ||
686 |                 (0xfb2a <= $c && 0xfb36 >= $c) ||
687 |                 (0xfb38 <= $c && 0xfb3c >= $c) ||
688 |                 0xfb3e === $c ||
689 |                 (0xfb40 <= $c && 0xfb41 >= $c) ||
690 |                 (0xfb43 <= $c && 0xfb44 >= $c) ||
691 |                 (0xfb46 <= $c && 0xfbc1 >= $c) ||
692 |                 (0xfbd3 <= $c && 0xfd3d >= $c) ||
693 |                 (0xfd50 <= $c && 0xfd8f >= $c) ||
694 |                 (0xfd92 <= $c && 0xfdc7 >= $c) ||
695 |                 (0xfdf0 <= $c && 0xfdfc >= $c) ||
696 |                 (0xfe70 <= $c && 0xfe74 >= $c) ||
697 |                 (0xfe76 <= $c && 0xfefc >= $c) ||
698 |                 (0x10800 <= $c && 0x10805 >= $c) ||
699 |                 0x10808 === $c ||
700 |                 (0x1080a <= $c && 0x10835 >= $c) ||
701 |                 (0x10837 <= $c && 0x10838 >= $c) ||
702 |                 0x1083c === $c ||
703 |                 (0x1083f <= $c && 0x10855 >= $c) ||
704 |                 (0x10857 <= $c && 0x1085f >= $c) ||
705 |                 (0x10900 <= $c && 0x1091b >= $c) ||
706 |                 (0x10920 <= $c && 0x10939 >= $c) ||
707 |                 0x1093f === $c ||
708 |                 0x10a00 === $c ||
709 |                 (0x10a10 <= $c && 0x10a13 >= $c) ||
710 |                 (0x10a15 <= $c && 0x10a17 >= $c) ||
711 |                 (0x10a19 <= $c && 0x10a33 >= $c) ||
712 |                 (0x10a40 <= $c && 0x10a47 >= $c) ||
713 |                 (0x10a50 <= $c && 0x10a58 >= $c) ||
714 |                 (0x10a60 <= $c && 0x10a7f >= $c) ||
715 |                 (0x10b00 <= $c && 0x10b35 >= $c) ||
716 |                 (0x10b40 <= $c && 0x10b55 >= $c) ||
717 |                 (0x10b58 <= $c && 0x10b72 >= $c) ||
718 |                 (0x10b78 <= $c && 0x10b7f >= $c)) {
719 |                 return static::RTL;
720 |             }
721 |         }
722 | 
723 |         return static::LTR;
724 |     }
725 | 
726 |     /**
727 |      * Get the number of column positions of a wide-character.
728 |      *
729 |      * This is a PHP implementation of wcwidth() and wcswidth() (defined in IEEE
730 |      * Std 1002.1-2001) for Unicode, by Markus Kuhn. Please, see
731 |      * http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c.
732 |      *
733 |      * The wcwidth(wc) function shall either return 0 (if wc is a null
734 |      * wide-character code), or return the number of column positions to be
735 |      * occupied by the wide-character code wc, or return -1 (if wc does not
736 |      * correspond to a printable wide-character code).
737 |      */
738 |     public static function getCharWidth(string $char): int
739 |     {
740 |         $char = (string) $char;
741 |         $c    = static::toCode($char);
742 | 
743 |         // Null character: no column.
744 |         if (0x0 === $c) {
745 |             return 0;
746 |         }
747 | 
748 |         if (0x20 > $c || (0x7f <= $c && $c < 0xa0)) { // 8-bit control characters.
749 |             return -1;
750 |         }
751 | 
752 |         // Non-spacing characters (U+00AD, the soft hyphen, is excluded).
753 |         if (0xad !== $c &&
754 |             0 !== preg_match('#^[\p{Mn}\p{Me}\p{Cf}\x{1160}-\x{11ff}\x{200b}]#u', $char)) {
755 |             return 0;
756 |         }
757 | 
758 |         // Neither control nor non-spacing: 1 column, plus 1 if in a range below.
759 |         return 1 +
760 |             (0x1100 <= $c &&
761 |                 (0x115f >= $c ||                        // Hangul Jamo init. consonants
762 |                  0x2329 === $c || 0x232a === $c ||
763 |                      (0x2e80 <= $c && 0xa4cf >= $c &&
764 |                       0x303f !== $c) ||                // CJK…Yi
765 |                      (0xac00 <= $c && 0xd7a3 >= $c) || // Hangul Syllables
766 |                      (0xf900 <= $c && 0xfaff >= $c) || // CJK Compatibility Ideographs
767 |                      (0xfe10 <= $c && 0xfe19 >= $c) || // Vertical forms
768 |                      (0xfe30 <= $c && 0xfe6f >= $c) || // CJK Compatibility Forms
769 |                      (0xff00 <= $c && 0xff60 >= $c) || // Fullwidth Forms
770 |                      (0xffe0 <= $c && 0xffe6 >= $c) ||
771 |                      (0x20000 <= $c && 0x2fffd >= $c) ||
772 |                      (0x30000 <= $c && 0x3fffd >= $c)));
773 |     }
774 | 
775 |     /**
776 |      * Tell whether the character is printable, i.e. at least 1 column wide.
777 |      */
778 |     public static function isCharPrintable(string $char): bool
779 |     {
780 |         return static::getCharWidth($char) >= 1;
781 |     }
782 | 
783 |     /**
784 |      * Get a UTF-8 character from its decimal code representation.
785 |      */
786 |     public static function fromCode(int $code): string
787 |     {
788 |         // “HTML-ENTITIES” is deprecated as of PHP 8.2: prefer mb_chr (7.2+).
789 |         if (function_exists('mb_chr')) {
790 |             return (string) mb_chr($code, 'UTF-8');
791 |         }
792 |         return mb_convert_encoding('&#x' . dechex($code) . ';', 'UTF-8', 'HTML-ENTITIES');
793 |     }
794 | 
795 |     /**
796 |      * Get a decimal code representation of the first character of $char.
797 |      * Bytes that are missing (empty or truncated input) are read as 0.
798 |      */
799 |     public static function toCode(string $char): int
800 |     {
801 |         $code  = ord($char[0] ?? "\0"); // “??”: no warning on a missing byte.
802 |         $bytes = 1;
803 | 
804 |         if (0 === ($code & 0x80)) { // 0xxxxxxx
805 |             return $code;
806 |         }
807 | 
808 |         if (0xc0 === ($code & 0xe0)) { // 110xxxxx
809 |             $bytes = 2;
810 |             $code &= ~0xc0;
811 |         } elseif (0xe0 === ($code & 0xf0)) { // 1110xxxx
812 |             $bytes = 3;
813 |             $code &= ~0xe0;
814 |         } elseif (0xf0 === ($code & 0xf8)) { // 11110xxx
815 |             $bytes = 4;
816 |             $code &= ~0xf0;
817 |         }
818 | 
819 |         for ($i = 1; $i < $bytes; ++$i) { // 10xxxxxx
820 |             $code = ($code << 6) + (ord($char[$i] ?? "\0") & ~0x80);
821 |         }
822 | 
823 |         return $code;
824 |     }
825 | 
826 |     /**
827 |      * Get a binary representation of a specific character: 8 bits per byte.
828 |      */
829 |     public static function toBinaryCode(string $char): string
830 |     {
831 |         $out = '';
832 | 
833 |         // sprintf, not vsprintf: the latter requires an array of values and
834 |         // rejects a lone integer as of PHP 8.
835 |         for ($i = 0, $max = strlen($char); $i < $max; ++$i) {
836 |             $out .= sprintf('%08b', ord($char[$i]));
837 |         }
838 |         return $out;
839 |     }
840 | 
841 |     /**
842 |      * Convert $string from the $from encoding to the $to one with iconv.
843 |      * An exception is thrown when ext/iconv is not available.
844 |      */
845 |     public static function transcode(string $string, string $from, string $to = 'UTF-8'): string
846 |     {
847 |         $available = static::checkIconv();
848 | 
849 |         if (true === $available) {
850 |             return iconv($from, $to, $string);
851 |         }
852 | 
853 |         $message = '%s needs the iconv extension.';
854 |         throw new Exception($message, 2, __CLASS__);
855 |     }
856 | 
857 |     /**
858 |      * Tell whether $string is UTF-8, i.e. accepted by PCRE in “u” mode.
859 |      */
860 |     public static function isUtf8(string $string): bool
861 |     {
862 |         return 1 === preg_match('##u', $string);
863 |     }
864 | 
865 |     /**
866 |      * Copy the current object: a clone of it is returned.
867 |      */
868 |     public function copy(): self
869 |     {
870 |         return clone $this;
871 |     }
872 | 
873 |     /**
874 |      * Transform the object as a string ('' when nothing was ever appended).
875 |      */
876 |     public function __toString(): string
877 |     {
878 |         return (string) $this->_string;
879 |     }
880 | }
881 | 
882 | /**
883 |  * Flex entity.
884 |  */
885 | Consistency::flexEntity(Ustring::class);
886 | 
887 | /**
888 |  * The class is built on top of the mb_* functions: loading it without the
889 |  * mbstring extension is an error.
890 |  */
891 | if (true !== Ustring::checkMbString()) {
892 |     throw new Exception('%s needs the mbstring extension.', 0, __NAMESPACE__ . '\Ustring');
893 | }
894 | 


--------------------------------------------------------------------------------
/Test/Unit/Issue.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | declare(strict_types=1);
 4 | 
 5 | /**
 6 |  * Hoa
 7 |  *
 8 |  *
 9 |  * @license
10 |  *
11 |  * New BSD License
12 |  *
13 |  * Copyright © 2007-2017, Hoa community. All rights reserved.
14 |  *
15 |  * Redistribution and use in source and binary forms, with or without
16 |  * modification, are permitted provided that the following conditions are met:
17 |  *     * Redistributions of source code must retain the above copyright
18 |  *       notice, this list of conditions and the following disclaimer.
19 |  *     * Redistributions in binary form must reproduce the above copyright
20 |  *       notice, this list of conditions and the following disclaimer in the
21 |  *       documentation and/or other materials provided with the distribution.
22 |  *     * Neither the name of the Hoa nor the names of its contributors may be
23 |  *       used to endorse or promote products derived from this software without
24 |  *       specific prior written permission.
25 |  *
26 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS AND CONTRIBUTORS BE
30 |  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 |  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 |  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 |  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 |  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 |  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 |  * POSSIBILITY OF SUCH DAMAGE.
37 |  */
38 | 
39 | namespace Hoa\Ustring\Test\Unit;
40 | 
41 | use Hoa\Test;
42 | use Hoa\Ustring as LUT;
43 | 
44 | /**
45 |  * Regression suite: one case per issue that has been reported.
46 |  */
47 | class Issue extends Test\Unit\Suite implements Test\Decorrelated
48 | {
49 |     public function case_github_26(): void
50 |     {
51 |         $this
52 |             ->given($byte = chr(160))
53 |             ->when($code = LUT::toCode($byte))
54 |             ->then
55 |                 ->integer($code)->isEqualTo(0xa0);
56 |     }
57 | }
58 | 


--------------------------------------------------------------------------------
/Test/Unit/Search.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | declare(strict_types=1);
 4 | 
 5 | /**
 6 |  * Hoa
 7 |  *
 8 |  *
 9 |  * @license
10 |  *
11 |  * New BSD License
12 |  *
13 |  * Copyright © 2007-2017, Hoa community. All rights reserved.
14 |  *
15 |  * Redistribution and use in source and binary forms, with or without
16 |  * modification, are permitted provided that the following conditions are met:
17 |  *     * Redistributions of source code must retain the above copyright
18 |  *       notice, this list of conditions and the following disclaimer.
19 |  *     * Redistributions in binary form must reproduce the above copyright
20 |  *       notice, this list of conditions and the following disclaimer in the
21 |  *       documentation and/or other materials provided with the distribution.
22 |  *     * Neither the name of the Hoa nor the names of its contributors may be
23 |  *       used to endorse or promote products derived from this software without
24 |  *       specific prior written permission.
25 |  *
26 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS AND CONTRIBUTORS BE
30 |  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 |  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 |  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 |  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 |  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 |  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 |  * POSSIBILITY OF SUCH DAMAGE.
37 |  */
38 | 
39 | namespace Hoa\Ustring\Test\Unit;
40 | 
41 | use Hoa\Test;
42 | use Hoa\Ustring as LUT;
43 | 
44 | /**
45 |  * Test suite of the search algorithms.
46 |  */
47 | class Search extends Test\Unit\Suite
48 | {
49 |     public function case_approximated(): void
50 |     {
51 |         $this
52 |             ->given(
53 |                 $x        = 'GATAA',
54 |                 $y        = 'CAGATAAGAGAA',
55 |                 $k        = 1,
56 |                 $expected = [
57 |                     [
58 |                         'i' => 1,
59 |                         'j' => 6,
60 |                         'l' => 5
61 |                     ],
62 |                     [
63 |                         'i' => 2,
64 |                         'j' => 7,
65 |                         'l' => 5
66 |                     ],
67 |                     [
68 |                         'i' => 3,
69 |                         'j' => 8,
70 |                         'l' => 5
71 |                     ],
72 |                     [
73 |                         'i' => 7,
74 |                         'j' => 12,
75 |                         'l' => 5
76 |                     ]
77 |                 ]
78 |             )
79 |             ->when($result = LUT\Search::approximated($y, $x, $k))
80 |             ->then
81 |                 ->array($result)->isEqualTo($expected);
82 |     }
83 | }
84 | 


--------------------------------------------------------------------------------
/Test/Unit/Ustring.php:
--------------------------------------------------------------------------------
   1 | <?php
   2 | 
   3 | declare(strict_types=1);
   4 | 
   5 | /**
   6 |  * Hoa
   7 |  *
   8 |  *
   9 |  * @license
  10 |  *
  11 |  * New BSD License
  12 |  *
  13 |  * Copyright © 2007-2017, Hoa community. All rights reserved.
  14 |  *
  15 |  * Redistribution and use in source and binary forms, with or without
  16 |  * modification, are permitted provided that the following conditions are met:
  17 |  *     * Redistributions of source code must retain the above copyright
  18 |  *       notice, this list of conditions and the following disclaimer.
  19 |  *     * Redistributions in binary form must reproduce the above copyright
  20 |  *       notice, this list of conditions and the following disclaimer in the
  21 |  *       documentation and/or other materials provided with the distribution.
  22 |  *     * Neither the name of the Hoa nor the names of its contributors may be
  23 |  *       used to endorse or promote products derived from this software without
  24 |  *       specific prior written permission.
  25 |  *
  26 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  27 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS AND CONTRIBUTORS BE
  30 |  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  31 |  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  32 |  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  33 |  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  34 |  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  35 |  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  36 |  * POSSIBILITY OF SUCH DAMAGE.
  37 |  */
  38 | 
  39 | namespace Hoa\Ustring\Test\Unit;
  40 | 
  41 | use Hoa\Test;
  42 | use Hoa\Ustring as LUT;
  43 | 
  44 | /**
  45 |  * Test suite of the string class.
  46 |  */
  47 | class Ustring extends Test\Unit\Suite
  48 | {
  49 |     /** With `function_exists` mocked to true, checkMbString() must return true. */
  50 |     public function case_check_mbstring(): void
  51 |     {
  52 |         $this
  53 |             ->given($this->function->function_exists = true)
  54 |             ->then
  55 |                 ->boolean(LUT::checkMbString())->isTrue();
  56 |     }
  57 | 
  58 |     // append() on a left-to-right string: the call must return the same
  59 |     // instance, now holding the concatenated text.
  60 |     public function case_append_ltr(): void
  61 |     {
  62 |         $this
  63 |             ->given($subject = new LUT('je'))
  64 |             ->when($returned = $subject->append(' t\'aime'))
  65 |             ->then
  66 |                 ->object($returned)->isIdenticalTo($subject)
  67 |                 ->string((string) $returned)->isEqualTo('je t\'aime');
  68 |     }
  69 | 
  70 |     // append() on a right-to-left (Arabic) string: the call must return the
  71 |     // same instance, now holding the concatenated text.
  72 |     public function case_append_rtl(): void
  73 |     {
  74 |         $this
  75 |             ->given($subject = new LUT('أ'))
  76 |             ->when($returned = $subject->append('حبك'))
  77 |             ->then
  78 |                 ->object($returned)->isIdenticalTo($subject)
  79 |                 ->string((string) $returned)->isEqualTo('أحبك');
  80 |     }
  81 | 
  82 |     // prepend() on a left-to-right string: the call must return the same
  83 |     // instance, with the new text placed in front.
  84 |     public function case_prepend_ltr(): void
  85 |     {
  86 |         $this
  87 |             ->given($subject = new LUT(' t\'aime'))
  88 |             ->when($returned = $subject->prepend('je'))
  89 |             ->then
  90 |                 ->object($returned)->isIdenticalTo($subject)
  91 |                 ->string((string) $returned)->isEqualTo('je t\'aime');
  92 |     }
  93 | 
  94 |     // prepend() on a right-to-left (Arabic) string: the call must return the
  95 |     // same instance, with the new text placed in front.
  96 |     public function case_prepend_rtl(): void
  97 |     {
  98 |         $this
  99 |             ->given($subject = new LUT('ك'))
 100 |             ->when($returned = $subject->prepend('أحب'))
 101 |             ->then
 102 |                 ->object($returned)->isIdenticalTo($subject)
 103 |                 ->string((string) $returned)->isEqualTo('أحبك');
 104 |     }
 105 | 
 106 |     // pad(20, …, BEGINNING) on a left-to-right string: the pad pattern is
 107 |     // repeated (and cut) in front of the text; the same instance is returned.
 108 |     public function case_pad_beginning_ltr(): void
 109 |     {
 110 |         $this
 111 |             ->given($subject = new LUT('je t\'aime'))
 112 |             ->when($returned = $subject->pad(20, '👍 💩 😄 ❤️ ', LUT::BEGINNING))
 113 |             ->then
 114 |                 ->object($returned)->isIdenticalTo($subject)
 115 |                 ->string((string) $returned)->isEqualTo('👍 💩 😄 ❤️ 👍 je t\'aime');
 116 |     }
 117 | 
 118 |     // pad(20, …, BEGINNING) on a right-to-left string: the pad pattern is
 119 |     // repeated (and cut) in front of the text; the same instance is returned.
 120 |     public function case_pad_beginning_rtl(): void
 121 |     {
 122 |         $this
 123 |             ->given($subject = new LUT('أحبك'))
 124 |             ->when($returned = $subject->pad(20, '👍 💩 😄 ❤️ ', LUT::BEGINNING))
 125 |             ->then
 126 |                 ->object($returned)->isIdenticalTo($subject)
 127 |                 ->string((string) $returned)->isEqualTo('👍 💩 😄 ❤️ 👍 💩 😄 ❤أحبك');
 128 |     }
 129 | 
 130 |     // pad(20, …, END) on a left-to-right string: the pad pattern is repeated
 131 |     // (and cut) after the text; the same instance is returned.
 132 |     public function case_pad_end_ltr(): void
 133 |     {
 134 |         $this
 135 |             ->given($subject = new LUT('je t\'aime'))
 136 |             ->when($returned = $subject->pad(20, '👍 💩 😄 ❤️ ', LUT::END))
 137 |             ->then
 138 |                 ->object($returned)->isIdenticalTo($subject)
 139 |                 ->string((string) $returned)->isEqualTo('je t\'aime👍 💩 😄 ❤️ 👍 ');
 140 |     }
 141 | 
 142 |     // pad(20, …, END) on a right-to-left string: the pad pattern is repeated
 143 |     // (and cut) after the text; the same instance is returned.
 144 |     public function case_pad_end_rtl(): void
 145 |     {
 146 |         $this
 147 |             ->given($subject = new LUT('أحبك'))
 148 |             ->when($returned = $subject->pad(20, '👍 💩 😄 ❤️ ', LUT::END))
 149 |             ->then
 150 |                 ->object($returned)->isIdenticalTo($subject)
 151 |                 ->string((string) $returned)->isEqualTo('أحبك👍 💩 😄 ❤️ 👍 💩 😄 ❤');
 152 |     }
 153 | 
 154 |     /**
 155 |      * Replays case_compare() with `class_exists` mocked to deny `Collator`.
 156 |      */
 157 |     public function case_compare_no_collator(): void
 158 |     {
 159 |         $this
 160 |             ->given($this->function->class_exists = function ($name) {
 161 |                 return 'Collator' !== $name;
 162 |             })
 163 |             ->case_compare();
 164 |     }
 165 | 
 166 |     /**
 167 |      * compare() on 'b' must yield 1, 0 and -1 against 'a', 'b' and 'c'.
 168 |      */
 169 |     public function case_compare(): void
 170 |     {
 171 |         $this
 172 |             ->given($subject = new LUT('b'))
 173 |             ->when($order = $subject->compare('a'))
 174 |             ->then
 175 |                 ->integer($order)->isEqualTo(1)
 176 | 
 177 |             ->when($order = $subject->compare('b'))
 178 |             ->then
 179 |                 ->integer($order)->isEqualTo(0)
 180 | 
 181 |             ->when($order = $subject->compare('c'))
 182 |             ->then
 183 |                 ->integer($order)->isEqualTo(-1);
 184 |     }
 185 | 
 186 |     /** With `setlocale` mocked to 'fr_FR', the collator's valid locale must be 'fr'. */
 187 |     public function case_collator(): void
 188 |     {
 189 |         $this
 190 |             ->given(
 191 |                 $this->function->setlocale = 'fr_FR',
 192 |                 $sorter = LUT::getCollator()
 193 |             )
 194 |             ->when($locale = $sorter->getLocale(\Locale::VALID_LOCALE))
 195 |             ->then
 196 |                 ->string($locale)->isEqualTo('fr');
 197 |     }
 198 | 
 199 |     /** safePattern() must add the `u` modifier to a pattern that lacks it. */
 200 |     public function case_safe_unsafe_pattern(): void
 201 |     {
 202 |         $this
 203 |             ->given($pattern = '/foo/i')
 204 |             ->when($result = LUT::safePattern($pattern))
 205 |             ->then
 206 |                 ->string($result)->isEqualTo('/foo/iu');
 207 |     }
 208 | 
 209 |     /** safePattern() must leave a pattern that already has the `u` modifier untouched. */
 210 |     public function case_safe_safe_pattern(): void
 211 |     {
 212 |         $this
 213 |             ->given($pattern = '/foo/ui')
 214 |             ->when($result = LUT::safePattern($pattern))
 215 |             ->then
 216 |                 ->string($result)->isEqualTo('/foo/ui');
 217 |     }
 218 | 
 219 |     /**
 220 |      * match() with only a pattern and a matches variable: one hit is
 221 |      * reported and the matches array holds the matched emoji.
 222 |      */
 223 |     public function case_match_default(): void
 224 |     {
 225 |         $this
 226 |             ->given(
 227 |                 $regex   = '/💩/u',
 228 |                 $subject = new LUT('foo 💩 bar')
 229 |             )
 230 |             ->when($count = $subject->match($regex, $found))
 231 |             ->then
 232 |                 ->integer($count)->isEqualTo(1)
 233 |                 ->array($found)->isEqualTo(['💩']);
 234 |     }
 235 | 
 236 |     /**
 237 |      * match() honours its fourth argument, used by this case as the start
 238 |      * offset. In 'foo 💩 bar' the emoji is the fifth character (offset 4):
 239 |      *   - starting at 0 or 4 still yields one match;
 240 |      *   - starting at 5 yields none and leaves the matches array empty.
 241 |      */
 242 |     public function case_match_offset(): void
 243 |     {
 244 |         $this
 245 |             ->given(
 246 |                 $regex   = '/💩/u',
 247 |                 $subject = new LUT('foo 💩 bar')
 248 |             )
 249 |             ->when($count = $subject->match($regex, $found, 0, 0))
 250 |             ->then
 251 |                 ->integer($count)->isEqualTo(1)
 252 |                 ->array($found)->isEqualTo(['💩'])
 253 | 
 254 |             ->when($count = $subject->match($regex, $found, 0, 4))
 255 |             ->then
 256 |                 ->integer($count)->isEqualTo(1)
 257 |                 ->array($found)->isEqualTo(['💩'])
 258 | 
 259 |             ->when($count = $subject->match($regex, $found, 0, 5))
 260 |             ->then
 261 |                 ->integer($count)->isEqualTo(0)
 262 |                 ->array($found)->isEmpty();
 263 |     }
 264 | 
 265 |     /**
 266 |      * With the WITH_OFFSET flag, match() reports the matched text together
 267 |      * with its position: the emoji of 'foo 💩 bar' is expected at 4.
 268 |      */
 269 |     public function case_match_with_offset(): void
 270 |     {
 271 |         $this
 272 |             ->given(
 273 |                 $regex   = '/💩/u',
 274 |                 $subject = new LUT('foo 💩 bar')
 275 |             )
 276 |             ->when($count = $subject->match($regex, $found, $subject::WITH_OFFSET))
 277 |             ->then
 278 |                 ->integer($count)->isEqualTo(1)
 279 |                 ->array($found)->isEqualTo([
 280 |                     // [matched text, offset]
 281 |                     ['💩', 4]
 282 |                 ]);
 283 |     }
 284 | 
 285 |     /**
 286 |      * "Match all" mode: with true as fifth argument, match() collects every
 287 |      * occurrence. The call must return 2 and list both emojis under
 288 |      * index 0 of the matches array.
 289 |      */
 290 |     public function case_match_all_default(): void
 291 |     {
 292 |         $this
 293 |             ->given(
 294 |                 $regex   = '/💩/u',
 295 |                 $subject = new LUT('foo 💩 bar 💩 baz')
 296 |             )
 297 |             ->when($count = $subject->match($regex, $found, 0, 0, true))
 298 |             ->then
 299 |                 ->integer($count)->isEqualTo(2)
 300 |                 ->array($found)->isEqualTo([
 301 |                     ['💩', '💩']
 302 |                 ]);
 303 |     }
 304 | 
 305 |     /**
 306 |      * "Match all" mode (fifth argument true) combined with WITH_OFFSET.
 307 |      *
 308 |      * Both emojis of 'foo 💩 bar 💩 baz' must be found: the call returns 2
 309 |      * and every hit is reported together with its offset, 4 and 13
 310 |      * respectively.
 311 |      */
 312 |     public function case_match_all_with_offset(): void
 313 |     {
 314 |         $this
 315 |             ->given(
 316 |                 $regex   = '/💩/u',
 317 |                 $subject = new LUT('foo 💩 bar 💩 baz')
 318 |             )
 319 |             ->when($count = $subject->match($regex, $found, $subject::WITH_OFFSET, 0, true))
 320 |             ->then
 321 |                 ->integer($count)->isEqualTo(2)
 322 |                 ->array($found)->isEqualTo([
 323 |                     // One [matched text, offset] pair per hit.
 324 |                     [
 325 |                         ['💩', 4],
 326 |                         ['💩', 13]
 327 |                     ]
 328 |                 ]);
 329 |     }
 330 | 
 331 |     /**
 332 |      * "Match all" mode with the GROUP_BY_PATTERN flag and one capturing group.
 333 |      *
 334 |      * The call must return 2 and produce two rows of two emojis each.
 335 |      * NOTE(review): presumably row 0 holds the full matches and row 1 the
 336 |      * captures of group 1; both rows are identical for this input, so the
 337 |      * assertion cannot tell the two layouts apart — confirm.
 338 |      */
 339 |     public function case_match_all_grouped_by_pattern(): void
 340 |     {
 341 |         $this
 342 |             ->given(
 343 |                 $regex   = '/(💩)/u',
 344 |                 $subject = new LUT('foo 💩 bar 💩 baz')
 345 |             )
 346 |             ->when($count = $subject->match($regex, $found, $subject::GROUP_BY_PATTERN, 0, true))
 347 |             ->then
 348 |                 ->integer($count)->isEqualTo(2)
 349 |                 ->array($found)->isEqualTo([
 350 |                     ['💩', '💩'],
 351 |                     ['💩', '💩']
 352 |                 ]);
 353 |     }
 354 | 
 355 |     /**
 356 |      * "Match all" mode with the GROUP_BY_TUPLE flag and one capturing group.
 357 |      *
 358 |      * The call must return 2 and produce two rows of two emojis each.
 359 |      * NOTE(review): presumably each row is one match (full text, group 1);
 360 |      * the expected value equals the GROUP_BY_PATTERN one for this input, so
 361 |      * the two flags are not distinguished here — confirm.
 362 |      */
 363 |     public function case_match_all_grouped_by_tuple(): void
 364 |     {
 365 |         $this
 366 |             ->given(
 367 |                 $regex   = '/(💩)/u',
 368 |                 $subject = new LUT('foo 💩 bar 💩 baz')
 369 |             )
 370 |             ->when($count = $subject->match($regex, $found, $subject::GROUP_BY_TUPLE, 0, true))
 371 |             ->then
 372 |                 ->integer($count)->isEqualTo(2)
 373 |                 ->array($found)->isEqualTo([
 374 |                     ['💩', '💩'],
 375 |                     ['💩', '💩']
 376 |                 ]);
 377 |     }
 378 | 
 379 |     // replace() without a limit: every match of the pattern is replaced and
 380 |     // the same instance is returned.
 381 |     public function case_replace(): void
 382 |     {
 383 |         $this
 384 |             ->given($subject = new LUT('❤️ 💩 💩'))
 385 |             ->when($returned = $subject->replace('/💩/u', '😄'))
 386 |             ->then
 387 |                 ->object($returned)->isIdenticalTo($subject)
 388 |                 ->string((string) $returned)->isEqualTo('❤️ 😄 😄');
 389 |     }
 390 | 
 391 |     // replace() with a limit of 1: only the first match is replaced and the
 392 |     // same instance is returned.
 393 |     public function case_replace_limited(): void
 394 |     {
 395 |         $this
 396 |             ->given($subject = new LUT('❤️ 💩 💩'))
 397 |             ->when($returned = $subject->replace('/💩/u', '😄', 1))
 398 |             ->then
 399 |                 ->object($returned)->isIdenticalTo($subject)
 400 |                 ->string((string) $returned)->isEqualTo('❤️ 😄 💩');
 401 |     }
 402 | 
 403 |     /** split() on the emoji pattern must yield the three hearts it separates. */
 404 |     public function case_split_default(): void
 405 |     {
 406 |         $this
 407 |             ->given($subject = new LUT('❤️💩❤️💩❤️'))
 408 |             ->when($pieces = $subject->split('/💩/'))
 409 |             ->then
 410 |                 ->array($pieces)->isEqualTo([
 411 |                     '❤️',
 412 |                     '❤️',
 413 |                     '❤️'
 414 |                 ]);
 415 |     }
 416 | 
 417 |     /**
 418 |      * With a limit of 1, split() returns the whole string as a single piece.
 419 |      */
 420 |     public function case_split_default_limited(): void
 421 |     {
 422 |         $this
 423 |             ->given($subject = new LUT('❤️💩❤️💩❤️'))
 424 |             ->when($pieces = $subject->split('/💩/', 1))
 425 |             ->then
 426 |                 ->array($pieces)->isEqualTo(['❤️💩❤️💩❤️']);
 427 |     }
 428 | 
 429 |     /**
 430 |      * split() with the WITH_DELIMITERS flag still yields only the three hearts.
 431 |      * NOTE(review): the pattern has no capturing group; if WITH_DELIMITERS maps
 432 |      * to PREG_SPLIT_DELIM_CAPTURE, no delimiter can ever be returned — confirm.
 433 |      */
 434 |     public function case_split_with_delimiters(): void
 435 |     {
 436 |         $this
 437 |             ->given($subject = new LUT('❤️💩❤️💩❤️'))
 438 |             ->when($pieces = $subject->split('/💩/', -1, $subject::WITH_DELIMITERS))
 439 |             ->then
 440 |                 ->array($pieces)->isEqualTo(['❤️', '❤️', '❤️']);
 441 |     }
 442 | 
 443 |     /**
 444 |      * split() with the WITH_OFFSET flag pairs every piece with its position.
 445 |      *
 446 |      * The expected positions 0, 10 and 20 are byte offsets, not character
 447 |      * offsets: in UTF-8 each '❤️' (heart followed by a variation selector)
 448 |      * occupies 6 bytes and each '💩' occupies 4, so two consecutive
 449 |      * hearts start 10 bytes apart.
 450 |      */
 451 |     public function case_split_with_offset(): void
 452 |     {
 453 |         $this
 454 |             ->given($subject = new LUT('❤️💩❤️💩❤️'))
 455 |             ->when($pieces = $subject->split('/💩/', -1, $subject::WITH_OFFSET))
 456 |             ->then
 457 |                 ->array($pieces)
 458 |                     ->isEqualTo([
 459 |                         // [piece, offset]
 460 |                         ['❤️', 0],
 461 |                         ['❤️', 10],
 462 |                         ['❤️', 20]
 463 |                     ]);
 464 |     }
 465 | 
 466 |     /** Iterating a left-to-right string must yield its characters one by one, in order. */
 467 |     public function case_iterator_ltr(): void
 468 |     {
 469 |         $this
 470 |             ->given($subject = new LUT('je t\'aime'))
 471 |             ->when($chars = iterator_to_array($subject))
 472 |             ->then
 473 |                 ->array($chars)->isEqualTo([
 474 |                     'j',
 475 |                     'e',
 476 |                     ' ',
 477 |                     't',
 478 |                     '\'',
 479 |                     'a',
 480 |                     'i',
 481 |                     'm',
 482 |                     'e'
 483 |                 ]);
 484 |     }
 485 | 
 486 |     /** Iterating a right-to-left string must yield its characters one by one, in order. */
 487 |     public function case_iterator_rtl(): void
 488 |     {
 489 |         $this
 490 |             ->given($subject = new LUT('أحبك'))
 491 |             ->when($chars = iterator_to_array($subject))
 492 |             ->then
 493 |                 ->array($chars)->isEqualTo([
 494 |                     'أ',
 495 |                     'ح',
 496 |                     'ب',
 497 |                     'ك'
 498 |                 ]);
 499 |     }
 500 | 
 501 |     /**
 502 |      * toLowerCase() must lower Greek as well as Latin text and return the
 503 |      * same instance.
 504 |      */
 505 |     public function case_to_lower(): void
 506 |     {
 507 |         $this
 508 |             ->given($subject = new LUT('Σ \'ΑΓΑΠΏ'))
 509 |             ->when($returned = $subject->toLowerCase())
 510 |             ->then
 511 |                 ->object($returned)->isIdenticalTo($subject)
 512 |                 ->string((string) $returned)->isEqualTo('σ \'αγαπώ')
 513 | 
 514 |             ->given($subject = new LUT('JE T\'AIME'))
 515 |             ->when($returned = $subject->toLowerCase())
 516 |             ->then
 517 |                 ->object($returned)->isIdenticalTo($subject)
 518 |                 ->string((string) $returned)->isEqualTo('je t\'aime');
 519 |     }
 520 | 
 521 |     /**
 522 |      * toUpperCase() must raise Greek as well as Latin text and return the
 523 |      * same instance.
 524 |      */
 525 |     public function case_to_upper(): void
 526 |     {
 527 |         $this
 528 |             ->given($subject = new LUT('σ \'αγαπώ'))
 529 |             ->when($returned = $subject->toUpperCase())
 530 |             ->then
 531 |                 ->object($returned)->isIdenticalTo($subject)
 532 |                 ->string((string) $returned)->isEqualTo('Σ \'ΑΓΑΠΏ')
 533 | 
 534 |             ->given($subject = new LUT('je t\'aime'))
 535 |             ->when($returned = $subject->toUpperCase())
 536 |             ->then
 537 |                 ->object($returned)->isIdenticalTo($subject)
 538 |                 ->string((string) $returned)->isEqualTo('JE T\'AIME');
 539 |     }
 540 | 
 541 |     // trim('💩') without a side argument: the emoji is stripped from both
 542 |     // ends and the same instance is returned.
 543 |     public function case_trim_default(): void
 544 |     {
 545 |         $this
 546 |             ->given($subject = new LUT('💩💩❤️💩💩'))
 547 |             ->when($returned = $subject->trim('💩'))
 548 |             ->then
 549 |                 ->object($returned)->isIdenticalTo($subject)
 550 |                 ->string((string) $returned)->isEqualTo('❤️');
 551 |     }
 552 | 
 553 |     // trim('💩', BEGINNING): only the leading emojis are stripped; the same
 554 |     // instance is returned.
 555 |     public function case_trim_beginning(): void
 556 |     {
 557 |         $this
 558 |             ->given($subject = new LUT('💩💩❤️💩💩'))
 559 |             ->when($returned = $subject->trim('💩', $subject::BEGINNING))
 560 |             ->then
 561 |                 ->object($returned)->isIdenticalTo($subject)
 562 |                 ->string((string) $returned)->isEqualTo('❤️💩💩');
 563 |     }
 564 | 
 565 |     // trim('💩', END): only the trailing emojis are stripped; the same
 566 |     // instance is returned.
 567 |     public function case_trim_end(): void
 568 |     {
 569 |         $this
 570 |             ->given($subject = new LUT('💩💩❤️💩💩'))
 571 |             ->when($returned = $subject->trim('💩', $subject::END))
 572 |             ->then
 573 |                 ->object($returned)->isIdenticalTo($subject)
 574 |                 ->string((string) $returned)->isEqualTo('💩💩❤️');
 575 |     }
 576 | 
 577 |     // Array access on a left-to-right string: index 0 reads the first
 578 |     // character, index -1 the last one.
 579 |     public function case_offset_get_ltr(): void
 580 |     {
 581 |         $this
 582 |             ->given($subject = new LUT('je t\'aime'))
 583 |             ->when($char = $subject[0])
 584 |             ->then
 585 |                 ->string($char)->isEqualTo('j')
 586 | 
 587 |             ->when($char = $subject[-1])
 588 |             ->then
 589 |                 ->string($char)->isEqualTo('e');
 590 |     }
 591 | 
 592 |     // Array access on a right-to-left string: index 0 reads the first
 593 |     // character, index -1 the last one.
 594 |     public function case_offset_get_rtl(): void
 595 |     {
 596 |         $this
 597 |             ->given($subject = new LUT('أحبك'))
 598 |             ->when($char = $subject[0])
 599 |             ->then
 600 |                 ->string($char)->isEqualTo('أ')
 601 | 
 602 |             ->when($char = $subject[-1])
 603 |             ->then
 604 |                 ->string($char)->isEqualTo('ك');
 605 |     }
 606 | 
 607 |     /** Assigning to index -1 must overwrite the last character of the string. */
 608 |     public function case_offset_set(): void
 609 |     {
 610 |         $this
 611 |             ->given($subject = new LUT('أحبﻙ'))
 612 |             ->when($subject[-1] = 'ك')
 613 |             ->then
 614 |                 ->string((string) $subject)->isEqualTo('أحبك');
 615 |     }
 616 | 
 617 |     /** Unsetting index -1 must remove the last character (here the trailing emoji). */
 618 |     public function case_offset_unset(): void
 619 |     {
 620 |         $this
 621 |             ->given($subject = new LUT('أحبك😄'))
 622 |             ->when(function () use ($subject): void {
 623 |                 unset($subject[-1]);
 624 |             })
 625 |             ->then
 626 |                 ->string((string) $subject)->isEqualTo('أحبك');
 627 |     }
 628 | 
 629 |     // reduce(0, 1) must keep only the first character and return the same
 630 |     // instance.
 631 |     public function case_reduce(): void
 632 |     {
 633 |         $this
 634 |             ->given($subject = new LUT('أحبك'))
 635 |             ->when($returned = $subject->reduce(0, 1))
 636 |             ->then
 637 |                 ->object($returned)->isIdenticalTo($subject)
 638 |                 ->string((string) $returned)->isEqualTo('أ');
 639 |     }
 640 | 
 641 |     /**
 642 |      * count() reports characters, not bytes: 9 for the Latin text, 4 for the Arabic one, 1 for an emoji.
 643 |      */
 644 |     public function case_count(): void
 645 |     {
 646 |         $this
 647 |             ->given($subject = new LUT('je t\'aime'))
 648 |             ->when($length = count($subject))
 649 |             ->then
 650 |                 ->integer($length)->isEqualTo(9)
 651 | 
 652 |             ->given($subject = new LUT('أحبك'))
 653 |             ->when($length = count($subject))
 654 |             ->then
 655 |                 ->integer($length)->isEqualTo(4)
 656 | 
 657 |             ->given($subject = new LUT('💩'))
 658 |             ->when($length = count($subject))
 659 |             ->then
 660 |                 ->integer($length)->isEqualTo(1);
 661 |     }
 662 | 
 663 |     /**
 664 |      * getByteAt() exposes the raw bytes of '💩' by index: F0, 9F, 92 and A9
 665 |      * are expected at 0 to 3 (its UTF-8 encoding), and index -1 must
 666 |      * address the last byte.
 667 |      */
 668 |     public function case_byte_at(): void
 669 |     {
 670 |         $this
 671 |             ->given($subject = new LUT('💩'))
 672 |             ->when($byte = $subject->getByteAt(0))
 673 |             ->then
 674 |                 ->integer(ord($byte))->isEqualTo(0xf0)
 675 | 
 676 |             ->when($byte = $subject->getByteAt(1))
 677 |             ->then
 678 |                 ->integer(ord($byte))->isEqualTo(0x9f)
 679 | 
 680 |             ->when($byte = $subject->getByteAt(2))
 681 |             ->then
 682 |                 ->integer(ord($byte))->isEqualTo(0x92)
 683 | 
 684 |             ->when($byte = $subject->getByteAt(3))
 685 |             ->then
 686 |                 ->integer(ord($byte))->isEqualTo(0xa9)
 687 | 
 688 |             ->when($byte = $subject->getByteAt(-1))
 689 |             ->then
 690 |                 ->integer(ord($byte))->isEqualTo(0xa9);
 691 |     }
 692 | 
 693 |     /** getBytesLength() must report 4 for the single emoji '💩'. */
 694 |     public function case_bytes_length(): void
 695 |     {
 696 |         $this
 697 |             ->given($subject = new LUT('💩'))
 698 |             ->when($length = $subject->getBytesLength())
 699 |             ->then
 700 |                 ->integer($length)->isEqualTo(4);
 701 |     }
 702 | 
 703 |     // getWidth(): 1 is expected for the emoji '💩' and 2 for the ideograph
 704 |     // '習'.
 705 |     public function case_get_width(): void
 706 |     {
 707 |         $this
 708 |             ->given($subject = new LUT('💩'))
 709 |             ->when($width = $subject->getWidth())
 710 |             ->then
 711 |                 ->integer($width)->isEqualTo(1)
 712 | 
 713 |             ->given($subject = new LUT('習'))
 714 |             ->when($width = $subject->getWidth())
 715 |             ->then
 716 |                 ->integer($width)->isEqualTo(2);
 717 |     }
 718 | 
 719 |     // getCharDirection() must answer LTR for the Latin 'A' and RTL for the
 720 |     // Arabic 'ا'.
 721 |     public function case_get_char_direction(): void
 722 |     {
 723 |         $this
 724 |             ->when($direction = LUT::getCharDirection('A'))
 725 |             ->then
 726 |                 ->integer($direction)->isEqualTo(LUT::LTR)
 727 | 
 728 |             ->when($direction = LUT::getCharDirection('ا'))
 729 |             ->then
 730 |                 ->integer($direction)->isEqualTo(LUT::RTL);
 731 |     }
 732 | 
 733 |     /**
 734 |      * getCharWidth() is checked against a table of [code point, expected width] pairs.
 735 |      */
 736 |     public function case_get_char_width(): void
 737 |     {
 738 |         $this
 739 |             ->given(
 740 |                 $table = [
 741 |                     // 8-bit control character.
 742 |                     [0x0,    0],
 743 |                     [0x19,  -1],
 744 |                     [0x7f,  -1],
 745 |                     [0x9f,  -1],
 746 | 
 747 |                     // Regular.
 748 |                     [0xa0,   1],
 749 | 
 750 |                     // Non-spacing characters mark.
 751 |                     [0x300,  0], // in Mn
 752 |                     [0x488,  0], // in Me
 753 |                     [0x600,  0], // in Cf
 754 |                     [0xad,   1], // in Cf, but the only exception
 755 |                     [0x1160, 0],
 756 |                     [0x11ff, 0],
 757 |                     [0x200b, 0],
 758 | 
 759 |                     // To test the last return statement.
 760 |                     [0x1100, 2],
 761 |                     [0x2160, 1],
 762 |                     [0x3f60, 2],
 763 |                     [0x303f, 1],
 764 |                     [0x2329, 2],
 765 |                     [0xaed0, 2],
 766 |                     [0x232a, 2],
 767 |                     [0xffa4, 1],
 768 |                     [0xfe10, 2],
 769 |                     [0xfe30, 2],
 770 |                     [0xff00, 2],
 771 |                     [0xf900, 2]
 772 |                 ]
 773 |             )
 774 |             ->when(function () use ($table): void {
 775 |                 foreach ($table as [$code, $expected]) {
 776 |                     $this
 777 |                         ->when($width = LUT::getCharWidth(LUT::fromCode($code)))
 778 |                         ->then
 779 |                             ->integer($width)->isEqualTo($expected);
 780 |                 }
 781 |             });
 782 |     }
 783 | 
 784 |     /**
 785 |      * isCharPrintable() must reject U+007F and accept U+00A0 and U+1100.
 786 |      */
 787 |     public function case_is_char_printable(): void
 788 |     {
 789 |         $this
 790 |             ->when($printable = LUT::isCharPrintable(LUT::fromCode(0x7f)))
 791 |             ->then
 792 |                 ->boolean($printable)->isFalse()
 793 | 
 794 |             ->when($printable = LUT::isCharPrintable(LUT::fromCode(0xa0)))
 795 |             ->then
 796 |                 ->boolean($printable)->isTrue()
 797 | 
 798 |             ->when($printable = LUT::isCharPrintable(LUT::fromCode(0x1100)))
 799 |             ->then
 800 |                 ->boolean($printable)->isTrue();
 801 |     }
 802 | 
 803 |     /**
 804 |      * fromCode() must turn a code point into the matching character; one
 805 |      * sample is taken from each of the four ranges named below.
 806 |      */
 807 |     public function case_from_code(): void
 808 |     {
 809 |         $this
 810 |             // U+0000 to U+007F
 811 |             ->when($char = LUT::fromCode(0x7e))
 812 |             ->then
 813 |                 ->string($char)->isEqualTo('~')
 814 | 
 815 |             // U+0080 to U+07FF
 816 |             ->when($char = LUT::fromCode(0xa7))
 817 |             ->then
 818 |                 ->string($char)->isEqualTo('§')
 819 | 
 820 |             // U+0800 to U+FFFF
 821 |             ->when($char = LUT::fromCode(0x1207))
 822 |             ->then
 823 |                 ->string($char)->isEqualTo('ሇ')
 824 | 
 825 |             // U+10000 to U+10FFFF
 826 |             ->when($char = LUT::fromCode(0x1f4a9))
 827 |             ->then
 828 |                 ->string($char)->isEqualTo('💩');
 829 |     }
 830 | 
 831 |     /**
 832 |      * toCode() must turn a character into its code point; one sample is
 833 |      * taken from each of the four ranges named below.
 834 |      */
 835 |     public function case_to_code(): void
 836 |     {
 837 |         $this
 838 |             // U+0000 to U+007F
 839 |             ->when($code = LUT::toCode('~'))
 840 |             ->then
 841 |                 ->integer($code)->isEqualTo(0x7e)
 842 | 
 843 |             // U+0080 to U+07FF
 844 |             ->when($code = LUT::toCode('§'))
 845 |             ->then
 846 |                 ->integer($code)->isEqualTo(0xa7)
 847 | 
 848 |             // U+0800 to U+FFFF
 849 |             ->when($code = LUT::toCode('ሇ'))
 850 |             ->then
 851 |                 ->integer($code)->isEqualTo(0x1207)
 852 | 
 853 |             // U+10000 to U+10FFFF
 854 |             ->when($code = LUT::toCode('💩'))
 855 |             ->then
 856 |                 ->integer($code)->isEqualTo(0x1f4a9);
 857 |     }
 858 | 
 859 |     /**
 860 |      * toBinaryCode() must return the UTF-8 bytes of a character as a string
 861 |      * of bits; one sample is taken from each of the four ranges named below.
 862 |      */
 863 |     public function case_to_binary_code(): void
 864 |     {
 865 |         $this
 866 |             // U+0000 to U+007F
 867 |             ->when($bits = LUT::toBinaryCode('~'))
 868 |             ->then
 869 |                 ->string($bits)->isEqualTo('01111110')
 870 | 
 871 |             // U+0080 to U+07FF
 872 |             ->when($bits = LUT::toBinaryCode('§'))
 873 |             ->then
 874 |                 ->string($bits)->isEqualTo('1100001010100111')
 875 | 
 876 |             // U+0800 to U+FFFF
 877 |             ->when($bits = LUT::toBinaryCode('ሇ'))
 878 |             ->then
 879 |                 ->string($bits)->isEqualTo('111000011000100010000111')
 880 | 
 881 |             // U+10000 to U+10FFFF
 882 |             ->when($bits = LUT::toBinaryCode('💩'))
 883 |             ->then
 884 |                 ->string($bits)->isEqualTo('11110000100111111001001010101001');
 885 |     }
 886 | 
 887 |     // transcode() must throw a LUT\Exception when the mocked
 888 |     // `function_exists` reports `iconv` as unavailable.
 889 |     public function case_transcode_no_iconv(): void
 890 |     {
 891 |         $this
 892 |             ->given($this->function->function_exists = function ($name) {
 893 |                 return $name !== 'iconv';
 894 |             })
 895 |             ->exception(function (): void {
 896 |                 LUT::transcode('foo', 'UTF-8');
 897 |             })
 898 |                 ->isInstanceOf(LUT\Exception::class);
 899 |     }
 900 | 
 901 |     /**
 902 |      * Round-trip a UTF-8 string through UTF-16 and check `isUtf8` at each step.
 903 |      */
 904 |     public function case_transcode_and_isUtf8(): void
 905 |     {
 906 |         $this
 907 |             ->given($utf8 = 'Σ')
 908 |             // UTF-8 to UTF-16: the bytes change and are no longer reported as UTF-8.
 909 |             ->when($utf16 = LUT::transcode($utf8, 'UTF-8', 'UTF-16'))
 910 |             ->then
 911 |                 ->string($utf16)->isNotEqualTo($utf8)
 912 |                 ->boolean(LUT::isUtf8($utf16))->isFalse()
 913 |             // UTF-16 back to UTF-8: the original string must be recovered.
 914 |             ->when($restored = LUT::transcode($utf16, 'UTF-16', 'UTF-8'))
 915 |             ->then
 916 |                 ->string($restored)->isEqualTo($utf8)
 917 |                 ->boolean(LUT::isUtf8($restored))->isTrue()
 918 |                 ->boolean(LUT::isUtf8($utf8))->isTrue();
 919 |     }
 920 | 
 921 |     /** `toAscii()` must throw a `LUT\Exception` when both intl classes are hidden. */
 922 |     public function case_to_ascii_no_transliterator_no_normalizer(): void
 923 |     {
 924 |         $this
 925 |             ->given(
 926 |                 // Mocked `class_exists`: hide `Transliterator` and `Normalizer` only.
 927 |                 $this->function->class_exists = function ($name) {
 928 |                     return !in_array($name, ['Transliterator', 'Normalizer'], true);
 929 |                 },
 930 |                 $string = new LUT('Un été brûlant sur la côte')
 931 |             )
 932 |             ->exception(function () use ($string): void { $string->toAscii(); })
 933 |                 ->isInstanceOf(LUT\Exception::class);
 934 |     }
 935 | 
 936 |     /** `toAscii(true)` must still give the unaccented text with both intl classes hidden. */
 937 |     public function case_to_ascii_no_transliterator_no_normalizer_try(): void
 938 |     {
 939 |         $this
 940 |             ->given(
 941 |                 // Mocked `class_exists`: hide `Transliterator` and `Normalizer` only.
 942 |                 $this->function->class_exists = function ($name) {
 943 |                     return !in_array($name, ['Transliterator', 'Normalizer'], true);
 944 |                 },
 945 |                 $string = new LUT('Un été brûlant sur la côte')
 946 |             )
 947 |             ->when($result = $string->toAscii(true))
 948 |             ->then
 949 |                 ->object($result)->isIdenticalTo($string)
 950 |                 ->string((string) $result)->isEqualTo('Un ete brulant sur la cote');
 951 |     }
 952 | 
 953 |     /** Only `Transliterator` is hidden; `toAscii()` must still strip the accents. */
 954 |     public function case_to_ascii_no_transliterator(): void
 955 |     {
 956 |         // Mocked `class_exists`: every class is reported present but `Transliterator`.
 957 |         $this->function->class_exists = function ($name) {
 958 |             return $name !== 'Transliterator';
 959 |         };
 960 |         $subject = new LUT('Un été brûlant sur la côte');
 961 | 
 962 |         $this
 963 |             ->when($returned = $subject->toAscii())
 964 |             ->then
 965 |                 ->object($returned)
 966 |                     ->isIdenticalTo($subject)
 967 |                 ->string((string) $returned)->isEqualTo('Un ete brulant sur la cote');
 968 |     }
 969 | 
 970 |     /** Each sample string must be transliterated by `toAscii()` to its expected ASCII form. */
 971 |     public function case_to_ascii(): void
 972 |     {
 973 |         $this
 974 |             ->given(
 975 |                 $expectations = [
 976 |                     // French, accented Latin letters.
 977 |                     'Un été brûlant sur la côte'
 978 |                     => 'Un ete brulant sur la cote',
 979 |                     // Greek.
 980 |                     'Αυτή είναι μια δοκιμή'
 981 |                     => 'Aute einai mia dokime',
 982 |                     // Arabic.
 983 |                     'أحبك'
 984 |                     => 'ahbk',
 985 |                     // Katakana.
 986 |                     'キャンパス'
 987 |                     => 'kyanpasu',
 988 |                     // Cyrillic.
 989 |                     'биологическом'
 990 |                     => 'biologiceskom',
 991 |                     // Hangul.
 992 |                     '정, 병호'
 993 |                     => 'jeong, byeongho',
 994 |                     // Hiragana.
 995 |                     'ますだ, よしひこ'
 996 |                     => 'masuda, yoshihiko',
 997 |                     // Devanagari.
 998 |                     'मोनिच'
 999 |                     => 'monica',
1000 |                     // Devanagari conjunct.
1001 |                     'क्ष'
1002 |                     => 'ksa',
1003 |                     // Arabic followed by an emoji.
1004 |                     'أحبك 😀'
1005 |                     => 'ahbk (grinning face)',
1006 |                     // Mathematical symbols.
1007 |                     '∀ i ∈ ℕ'
1008 |                     => '(for all) i (element of) N',
1009 |                 ]
1010 |             )
1011 |             ->when(function () use ($expectations): void {
1012 |                 foreach ($expectations as $source => $ascii) {
1013 |                     $subject = new LUT($source);
1014 |                     $returned = $subject->toAscii();
1015 | 
1016 |                     $this
1017 |                         ->object($returned)->isIdenticalTo($subject)
1018 |                         ->string((string) $returned)->isEqualTo($ascii);
1019 |                 }
1020 |             });
1021 |     }
1022 | 
1023 |     /** A copy must compare equal to the string object it was made from. */
1024 |     public function case_copy(): void
1025 |     {
1026 |         $original = new LUT('foo');
1027 |         $duplicate = $original->copy();
1028 | 
1029 |         // NOTE(review): only equality is asserted, not that the copy is a distinct object.
1030 |         $this->object($duplicate)->isEqualTo($original);
1031 |     }
1032 | 
1033 |     /** Casting a string object to `string` must give back the text it was built from. */
1034 |     public function case_toString(): void
1035 |     {
1036 |         // Presumably a generated string matching the regex; confirm against the test helpers.
1037 |         $text = $this->sample($this->realdom->regex('/\w{7,42}/'));
1038 |         $wrapped = new LUT($text);
1039 | 
1040 |         $this->castToString($wrapped)->isEqualTo($text);
1041 |     }
1042 | }
1043 | 


--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name"       : "hoa/ustring",
 3 |     "description": "The Hoa\\Ustring library.",
 4 |     "type"       : "library",
 5 |     "keywords"   : ["library", "string", "unicode", "search"],
 6 |     "homepage"   : "https://hoa-project.net/",
 7 |     "license"    : "BSD-3-Clause",
 8 |     "authors"    : [
 9 |         {
10 |             "name" : "Ivan Enderlin",
11 |             "email": "ivan.enderlin@hoa-project.net"
12 |         },
13 |         {
14 |             "name"    : "Hoa community",
15 |             "homepage": "https://hoa-project.net/"
16 |         }
17 |     ],
18 |     "support": {
19 |         "email" : "support@hoa-project.net",
20 |         "irc"   : "irc://chat.freenode.net/hoaproject",
21 |         "forum" : "https://users.hoa-project.net/",
22 |         "docs"  : "https://central.hoa-project.net/Documentation/Library/Ustring",
23 |         "source": "https://central.hoa-project.net/Resource/Library/Ustring"
24 |     },
25 |     "require": {
26 |         "php"            : ">=7.1",
27 |         "hoa/consistency": "dev-master",
28 |         "hoa/exception"  : "dev-master"
29 |     },
30 |     "require-dev": {
31 |         "hoa/test": "dev-master"
32 |     },
33 |     "autoload": {
34 |         "psr-4": {
35 |             "Hoa\\Ustring\\"     : "Source",
36 |             "Hoa\\Ustring\\Bin\\": "Bin"
37 |         }
38 |     },
39 |     "suggest": {
40 |         "ext-iconv": "ext/iconv must be present (or a third-party implementation) to use Hoa\\Ustring::transcode().",
41 |         "ext-intl" : "To get a better Hoa\\Ustring::toAscii() and Hoa\\Ustring::compareTo()."
42 |     },
43 |     "extra"     : {
44 |         "branch-alias": {
45 |             "dev-master": "4.x-dev"
46 |         }
47 |     }
48 | }
49 | 


--------------------------------------------------------------------------------