├── .State ├── .gitignore ├── CHANGELOG.md ├── README.md ├── Source ├── Exception.php ├── Grammar.pp └── Visitor │ └── Isotropic.php └── composer.json /.State: -------------------------------------------------------------------------------- 1 | beta 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /vendor/ 2 | /composer.lock 3 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 1.17.01.13 2 | 3 | * Quality: Happy new year! (Alexis von Glasow, 2017-01-12T14:00:43+01:00) 4 | * Documentation: New `README.md` file. (Ivan Enderlin, 2016-10-21T17:01:48+02:00) 5 | * Documentation: Update `support` properties. (Ivan Enderlin, 2016-10-11T16:56:16+02:00) 6 | 7 | # 1.16.09.06 8 | 9 | * Grammar: Update copyright. (Ivan Enderlin, 2016-01-17T14:21:02+01:00) 10 | 11 | # 1.16.01.15 12 | 13 | * Composer: New stable library. (Ivan Enderlin, 2016-01-14T22:08:58+01:00) 14 | 15 | # 1.16.01.14 16 | 17 | * Quality: Drop PHP5.4. (Ivan Enderlin, 2016-01-11T09:15:26+01:00) 18 | * Quality: Run devtools:cs. (Ivan Enderlin, 2016-01-09T09:07:46+01:00) 19 | * Core: Remove `Hoa\Core`. (Ivan Enderlin, 2016-01-09T08:22:53+01:00) 20 | * Consistency: Use `Hoa\Consistency`. (Ivan Enderlin, 2015-12-08T21:44:37+01:00) 21 | * Exception: Use `Hoa\Exception`. (Ivan Enderlin, 2015-11-20T13:06:20+01:00) 22 | 23 | # 0.15.08.13 24 | 25 | * Fix non-breaking space character encoding. (Metalaka, 2015-08-13T08:01:24+02:00) 26 | * Fix generatation of space (`\s`) characters. (Metalaka, 2015-08-11T21:04:15+02:00) 27 | * Add a `.gitignore` file. (Stéphane HULARD, 2015-08-03T11:43:58+02:00) 28 | 29 | # 0.15.05.29 30 | 31 | * Move to `Hoa\Ustring`. (Ivan Enderlin, 2015-05-29T14:51:52+02:00) 32 | * Move to PSR-1 and PSR-2. (Ivan Enderlin, 2015-05-18T09:28:32+02:00) 33 | 34 | # 0.15.02.24 35 | 36 | * Add the `CHANGELOG.md` file. (Ivan Enderlin, 2015-02-24T09:46:53+01:00) 37 | * Format code in an example. (Ivan Enderlin, 2015-02-22T14:22:09+01:00) 38 | * Better internal options support (no skip). (Ivan Enderlin, 2015-02-22T14:18:57+01:00) 39 | * Happy new year! (Ivan Enderlin, 2015-01-05T15:05:31+01:00) 40 | 41 | # 0.14.12.10 42 | 43 | * Move to PSR-4. (Ivan Enderlin, 2014-12-10T08:52:28+01:00) 44 | * Skip internal option setting. (Ivan Enderlin, 2014-12-05T10:02:41+01:00) 45 | 46 | # 0.14.11.25 47 | 48 | * `Hoa\Visitor` has been finalized. (Ivan Enderlin, 2014-11-15T22:23:45+01:00) 49 | 50 | # 0.14.11.14 51 | 52 | * Support the non-capturing construction. (Ivan Enderlin, 2014-11-14T10:46:02+01:00) 53 | * Throw an exception when a node is not supported. (Ivan Enderlin, 2014-11-14T10:43:56+01:00) 54 | * Add a `README.md` and a `.State` files. (Ivan Enderlin, 2014-11-13T20:47:33+01:00) 55 | * Remove `from`/`import` and update to PHP5.4. (Ivan Enderlin, 2014-11-13T20:15:14+01:00) 56 | 57 | # 0.14.09.23 58 | 59 | * Add `branch-alias`. (Stéphane PY, 2014-09-23T11:56:13+02:00) 60 | 61 | # 0.14.09.17 62 | 63 | * Drop PHP5.3. (Ivan Enderlin, 2014-09-17T17:32:53+02:00) 64 | 65 | (first snapshot) 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | Hoa 3 |

4 | 5 | --- 6 | 7 |

8 | Build status 9 | Code coverage 10 | Packagist 11 | License 12 |

13 |

14 | Hoa is a modular, extensible and 15 | structured set of PHP libraries.
16 | Moreover, Hoa aims at being a bridge between industrial and research worlds. 17 |

18 | 19 | # Hoa\Regex 20 | 21 | [![Help on IRC](https://img.shields.io/badge/help-%23hoaproject-ff0066.svg)](https://webchat.freenode.net/?channels=#hoaproject) 22 | [![Help on Gitter](https://img.shields.io/badge/help-gitter-ff0066.svg)](https://gitter.im/hoaproject/central) 23 | [![Documentation](https://img.shields.io/badge/documentation-hack_book-ff0066.svg)](https://central.hoa-project.net/Documentation/Library/Regex) 24 | [![Board](https://img.shields.io/badge/organisation-board-ff0066.svg)](https://waffle.io/hoaproject/regex) 25 | 26 | This library provides tools to analyze regular expressions and generate strings 27 | based on regular expressions ([Perl Compatible Regular 28 | Expressions](http://pcre.org)). 29 | 30 | [Learn more](https://central.hoa-project.net/Documentation/Library/Regex). 31 | 32 | ## Installation 33 | 34 | With [Composer](https://getcomposer.org/), to include this library into 35 | your dependencies, you need to 36 | require [`hoa/regex`](https://packagist.org/packages/hoa/regex): 37 | 38 | ```sh 39 | $ composer require hoa/regex '~1.0' 40 | ``` 41 | 42 | For more installation procedures, please read [the Source 43 | page](https://hoa-project.net/Source.html). 44 | 45 | ## Testing 46 | 47 | Before running the test suites, the development dependencies must be installed: 48 | 49 | ```sh 50 | $ composer install 51 | ``` 52 | 53 | Then, to run all the test suites: 54 | 55 | ```sh 56 | $ vendor/bin/hoa test:run 57 | ``` 58 | 59 | For more information, please read the [contributor 60 | guide](https://hoa-project.net/Literature/Contributor/Guide.html). 61 | 62 | ## Quick usage 63 | 64 | As a quick overview, we propose to see two examples. First, analyze a regular 65 | expression, i.e. lex, parse and produce an AST. Second, generate strings based 66 | on a regular expression by visiting its AST with an isotropic random approach. 67 | 68 | ### Analyze regular expressions 69 | 70 | We need the [`Hoa\Compiler` 71 | library](https://central.hoa-project.net/Resource/Library/Compiler) to lex, parse 72 | and produce an AST of the following regular expression: `ab(c|d){2,4}e?`. Thus: 73 | 74 | ```php 75 | // 1. Read the grammar. 76 | $grammar = new Hoa\File\Read('hoa://Library/Regex/Grammar.pp'); 77 | 78 | // 2. Load the compiler. 79 | $compiler = Hoa\Compiler\Llk\Llk::load($grammar); 80 | 81 | // 3. Lex, parse and produce the AST. 82 | $ast = $compiler->parse('ab(c|d){2,4}e?'); 83 | 84 | // 4. Dump the result. 85 | $dump = new Hoa\Compiler\Visitor\Dump(); 86 | echo $dump->visit($ast); 87 | 88 | /** 89 | * Will output: 90 | * > #expression 91 | * > > #concatenation 92 | * > > > token(literal, a) 93 | * > > > token(literal, b) 94 | * > > > #quantification 95 | * > > > > #alternation 96 | * > > > > > token(literal, c) 97 | * > > > > > token(literal, d) 98 | * > > > > token(n_to_m, {2,4}) 99 | * > > > #quantification 100 | * > > > > token(literal, e) 101 | * > > > > token(zero_or_one, ?) 102 | */ 103 | ``` 104 | 105 | We read that the whole expression is composed of a single concatenation of two 106 | tokens: `a` and `b`, followed by a quantification, followed by another 107 | quantification. The first quantification is an alternation of (a choice betwen) 108 | two tokens: `c` and `d`, between 2 to 4 times. The second quantification is the 109 | `e` token that can appear zero or one time. 110 | 111 | We can visit the tree with the help of the [`Hoa\Visitor` 112 | library](https://central.hoa-project.net/Resource/Library/Visitor). 113 | 114 | ### Generate strings based on regular expressions 115 | 116 | To generate strings based on the AST of a regular expressions, we will use the 117 | `Hoa\Regex\Visitor\Isotropic` visitor: 118 | 119 | ```php 120 | $generator = new Hoa\Regex\Visitor\Isotropic(new Hoa\Math\Sampler\Random()); 121 | echo $generator->visit($ast); 122 | 123 | /** 124 | * Could output: 125 | * abdcde 126 | */ 127 | ``` 128 | 129 | Strings are generated at random and match the given regular expression. 130 | 131 | ## Documentation 132 | 133 | The 134 | [hack book of `Hoa\Regex`](https://central.hoa-project.net/Documentation/Library/Regex) 135 | contains detailed information about how to use this library and how it works. 136 | 137 | To generate the documentation locally, execute the following commands: 138 | 139 | ```sh 140 | $ composer require --dev hoa/devtools 141 | $ vendor/bin/hoa devtools:documentation --open 142 | ``` 143 | 144 | More documentation can be found on the project's website: 145 | [hoa-project.net](https://hoa-project.net/). 146 | 147 | ## Getting help 148 | 149 | There are mainly two ways to get help: 150 | 151 | * On the [`#hoaproject`](https://webchat.freenode.net/?channels=#hoaproject) 152 | IRC channel, 153 | * On the forum at [users.hoa-project.net](https://users.hoa-project.net). 154 | 155 | ## Contribution 156 | 157 | Do you want to contribute? Thanks! A detailed [contributor 158 | guide](https://hoa-project.net/Literature/Contributor/Guide.html) explains 159 | everything you need to know. 160 | 161 | ## License 162 | 163 | Hoa is under the New BSD License (BSD-3-Clause). Please, see 164 | [`LICENSE`](https://hoa-project.net/LICENSE) for details. 165 | -------------------------------------------------------------------------------- /Source/Exception.php: -------------------------------------------------------------------------------- 1 | nc 62 | %token absolute_reference_ \(\?\((?=\d) -> c 63 | %token relative_reference_ \(\?\((?=[\+\-]) -> c 64 | %token c:index [\+\-]?\d+ -> default 65 | %token assertion_reference_ \(\?\( 66 | 67 | // Comments. 68 | %token comment_ \(\?# -> co 69 | %token co:_comment \) -> default 70 | %token co:comment .*?(?=(? nc 74 | %token nc:_named_capturing > -> default 75 | %token nc:capturing_name .+?(?=(?) 76 | %token non_capturing_ \(\?: 77 | %token non_capturing_reset_ \(\?\| 78 | %token atomic_group_ \(\?> 79 | %token capturing_ \( 80 | %token _capturing \) 81 | 82 | // Quantifiers (by default, greedy). 83 | %token zero_or_one_possessive \?\+ 84 | %token zero_or_one_lazy \?\? 85 | %token zero_or_one \? 86 | %token zero_or_more_possessive \*\+ 87 | %token zero_or_more_lazy \*\? 88 | %token zero_or_more \* 89 | %token one_or_more_possessive \+\+ 90 | %token one_or_more_lazy \+\? 91 | %token one_or_more \+ 92 | %token exactly_n \{[0-9]+\} 93 | %token n_to_m_possessive \{[0-9]+,[0-9]+\}\+ 94 | %token n_to_m_lazy \{[0-9]+,[0-9]+\}\? 95 | %token n_to_m \{[0-9]+,[0-9]+\} 96 | %token n_or_more_possessive \{[0-9]+,\}\+ 97 | %token n_or_more_lazy \{[0-9]+,\}\? 98 | %token n_or_more \{[0-9]+,\} 99 | 100 | // Alternation. 101 | %token alternation \| 102 | 103 | // Literal. 104 | %token character \\([aefnrt]|c[\x00-\x7f]) 105 | %token dynamic_character \\([0-7]{3}|x[0-9a-zA-Z]{2}|x{[0-9a-zA-Z]+}) 106 | // Please, see PCRESYNTAX(3), General Category properties, PCRE special category 107 | // properties and script names for \p{} and \P{}. 108 | %token character_type \\([CdDhHNRsSvVwWX]|[pP]{[^}]+}) 109 | %token anchor \\([bBAZzG])|\^|\$ 110 | %token match_point_reset \\K 111 | %token literal \\.|. 112 | 113 | 114 | // Rules. 115 | 116 | #expression: 117 | alternation() 118 | 119 | alternation: 120 | concatenation() ( ::alternation:: concatenation() #alternation )* 121 | 122 | concatenation: 123 | ( internal_options() | assertion() | quantification() | condition() ) 124 | ( ( internal_options() | assertion() | quantification() | condition() ) #concatenation )* 125 | 126 | #internal_options: 127 | 128 | 129 | #condition: 130 | ( 131 | ::named_reference_:: ::_named_capturing:: #namedcondition 132 | | ( 133 | ::relative_reference_:: #relativecondition 134 | | ::absolute_reference_:: #absolutecondition 135 | ) 136 | 137 | | ::assertion_reference_:: alternation() #assertioncondition 138 | ) 139 | ::_capturing:: concatenation()? 140 | ( ::alternation:: concatenation()? )? 141 | ::_capturing:: 142 | 143 | assertion: 144 | ( 145 | ::lookahead_:: #lookahead 146 | | ::negative_lookahead_:: #negativelookahead 147 | | ::lookbehind_:: #lookbehind 148 | | ::negative_lookbehind_:: #negativelookbehind 149 | ) 150 | alternation() ::_capturing:: 151 | 152 | quantification: 153 | ( class() | simple() ) ( quantifier() #quantification )? 154 | 155 | quantifier: 156 | | | 157 | | | | 158 | | | | 159 | | 160 | | | | 161 | | | | 162 | 163 | #class: 164 | ( 165 | ::negative_class_:: #negativeclass 166 | | ::class_:: 167 | ) 168 | ( | range() | literal() )+ ? 169 | ::_class:: 170 | 171 | #range: 172 | literal() ::range:: literal() 173 | 174 | simple: 175 | capturing() 176 | | literal() 177 | 178 | capturing: 179 | ::comment_:: ? ::_comment:: #comment 180 | | ( 181 | ::named_capturing_:: ::_named_capturing:: #namedcapturing 182 | | ::non_capturing_:: #noncapturing 183 | | ::non_capturing_reset_:: #noncapturingreset 184 | | ::atomic_group_:: #atomicgroup 185 | | ::capturing_:: 186 | ) 187 | alternation() ::_capturing:: 188 | 189 | literal: 190 | 191 | | 192 | | 193 | | 194 | | 195 | | 196 | -------------------------------------------------------------------------------- /Source/Visitor/Isotropic.php: -------------------------------------------------------------------------------- 1 | _sampler = $sampler; 66 | 67 | return; 68 | } 69 | 70 | /** 71 | * Visit an element. 72 | */ 73 | public function visit( 74 | Visitor\Element $element, 75 | &$handle = null, 76 | $eldnah = null 77 | ) { 78 | switch ($element->getId()) { 79 | case '#expression': 80 | case '#capturing': 81 | case '#noncapturing': 82 | case '#namedcapturing': 83 | return $element->getChild(0)->accept($this, $handle, $eldnah); 84 | 85 | case '#alternation': 86 | case '#class': 87 | return $element->getChild($this->_sampler->getInteger( 88 | 0, 89 | $element->getChildrenNumber() - 1 90 | ))->accept($this, $handle, $eldnah); 91 | 92 | case '#concatenation': 93 | $out = null; 94 | 95 | foreach ($element->getChildren() as $child) { 96 | $out .= $child->accept($this, $handle, $eldnah); 97 | } 98 | 99 | return $out; 100 | 101 | case '#quantification': 102 | $out = null; 103 | $xy = $element->getChild(1)->getValueValue(); 104 | $x = 0; 105 | $y = 0; 106 | 107 | switch ($element->getChild(1)->getValueToken()) { 108 | case 'zero_or_one': 109 | $y = 1; 110 | 111 | break; 112 | 113 | case 'zero_or_more': 114 | $y = mt_rand(5, 8); // why not? 115 | 116 | break; 117 | 118 | case 'one_or_more': 119 | $x = 1; 120 | $y = mt_rand(5, 8); // why not? 121 | 122 | break; 123 | 124 | case 'exactly_n': 125 | $x = $y = (int) substr($xy, 1, -1); 126 | 127 | break; 128 | 129 | case 'n_to_m': 130 | $xy = explode(',', substr($xy, 1, -1)); 131 | $x = (int) trim($xy[0]); 132 | $y = (int) trim($xy[1]); 133 | 134 | break; 135 | 136 | case 'n_or_more': 137 | $xy = explode(',', substr($xy, 1, -1)); 138 | $x = (int) trim($xy[0]); 139 | $y = mt_rand($x + 5, $x + 8); // why not? 140 | 141 | break; 142 | } 143 | 144 | for ( 145 | $i = 0, $max = $this->_sampler->getInteger($x, $y); 146 | $i < $max; 147 | ++$i 148 | ) { 149 | $out .= $element->getChild(0)->accept( 150 | $this, 151 | $handle, 152 | $eldnah 153 | ); 154 | } 155 | 156 | return $out; 157 | 158 | case '#negativeclass': 159 | $c = []; 160 | 161 | foreach ($element->getChildren() as $child) { 162 | $c[Ustring::toCode( 163 | $child->accept($this, $handle, $eldnah) 164 | )] = true; 165 | } 166 | 167 | do { 168 | // all printable ASCII. 169 | $i = $this->_sampler->getInteger(32, 126); 170 | } while (isset($c[$i])); 171 | 172 | return Ustring::fromCode($i); 173 | 174 | case '#range': 175 | $out = null; 176 | $left = $element->getChild(0)->accept($this, $handle, $eldnah); 177 | $right = $element->getChild(1)->accept($this, $handle, $eldnah); 178 | 179 | return 180 | Ustring::fromCode( 181 | $this->_sampler->getInteger( 182 | Ustring::toCode($left), 183 | Ustring::toCode($right) 184 | ) 185 | ); 186 | 187 | case 'token': 188 | $value = $element->getValueValue(); 189 | 190 | switch ($element->getValueToken()) { 191 | case 'character': 192 | $value = ltrim($value, '\\'); 193 | 194 | switch ($value) { 195 | case 'a': 196 | return "\a"; 197 | 198 | case 'e': 199 | return "\e"; 200 | 201 | case 'f': 202 | return "\f"; 203 | 204 | case 'n': 205 | return "\n"; 206 | 207 | case 'r': 208 | return "\r"; 209 | 210 | case 't': 211 | return "\t"; 212 | 213 | default: 214 | return 215 | Ustring::fromCode( 216 | intval( 217 | substr($value, 1) 218 | ) 219 | ); 220 | } 221 | 222 | break; 223 | 224 | case 'dynamic_character': 225 | $value = ltrim($value, '\\'); 226 | 227 | switch ($value[0]) { 228 | case 'x': 229 | $value = trim($value, 'x{}'); 230 | 231 | return Ustring::fromCode( 232 | hexdec($value) 233 | ); 234 | 235 | default: 236 | return Ustring::fromCode(octdec($value)); 237 | } 238 | 239 | break; 240 | 241 | case 'character_type': 242 | $value = ltrim($value, '\\'); 243 | 244 | if ('s' === $value) { 245 | $value = $this->_sampler->getInteger(0, 1) ? 'h' : 'v'; 246 | } 247 | 248 | switch ($value) { 249 | case 'C': 250 | return $this->_sampler->getInteger(0, 127); 251 | 252 | case 'd': 253 | return $this->_sampler->getInteger(0, 9); 254 | 255 | case 'h': 256 | $h = [ 257 | Ustring::fromCode(0x0009), 258 | Ustring::fromCode(0x0020), 259 | Ustring::fromCode(0x00a0) 260 | ]; 261 | 262 | return $h[$this->_sampler->getInteger(0, count($h) - 1)]; 263 | 264 | case 'v': 265 | $v = [ 266 | Ustring::fromCode(0x000a), 267 | Ustring::fromCode(0x000b), 268 | Ustring::fromCode(0x000c), 269 | Ustring::fromCode(0x000d) 270 | ]; 271 | 272 | return $v[$this->_sampler->getInteger(0, count($v) - 1)]; 273 | 274 | case 'w': 275 | $w = array_merge( 276 | range(0x41, 0x5a), 277 | range(0x61, 0x7a), 278 | [0x5f] 279 | ); 280 | 281 | return Ustring::fromCode($w[$this->_sampler->getInteger(0, count($w) - 1)]); 282 | 283 | default: 284 | return '?'; 285 | } 286 | 287 | break; 288 | 289 | case 'literal': 290 | if ('.' === $value) { 291 | $w = array_merge( 292 | range(0x41, 0x5a), 293 | range(0x61, 0x7a), 294 | [0x5f] 295 | ); 296 | 297 | return Ustring::fromCode($w[$this->_sampler->getInteger(0, count($w) - 1)]); 298 | } 299 | 300 | return 301 | str_replace( 302 | '\\\\', 303 | '\\', 304 | preg_replace( 305 | '#\\\(?!\\\)#', 306 | '', 307 | $value 308 | ) 309 | ); 310 | } 311 | 312 | break; 313 | 314 | case '#internal_options': 315 | break; 316 | 317 | default: 318 | throw new Regex\Exception( 319 | 'Unsupported node: %s.', 320 | 0, 321 | $element->getId() 322 | ); 323 | } 324 | 325 | return; 326 | } 327 | } 328 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name" : "hoa/regex", 3 | "description": "The Hoa\\Regex library.", 4 | "type" : "library", 5 | "keywords" : ["library", "regex", "compiler"], 6 | "homepage" : "https://hoa-project.net/", 7 | "license" : "BSD-3-Clause", 8 | "authors" : [ 9 | { 10 | "name" : "Ivan Enderlin", 11 | "email": "ivan.enderlin@hoa-project.net" 12 | }, 13 | { 14 | "name" : "Hoa community", 15 | "homepage": "https://hoa-project.net/" 16 | } 17 | ], 18 | "support": { 19 | "email" : "support@hoa-project.net", 20 | "irc" : "irc://chat.freenode.net/hoaproject", 21 | "forum" : "https://users.hoa-project.net/", 22 | "docs" : "https://central.hoa-project.net/Documentation/Library/Regex", 23 | "source": "https://central.hoa-project.net/Resource/Library/Regex" 24 | }, 25 | "require": { 26 | "php" : ">=7.1", 27 | "hoa/consistency": "dev-master", 28 | "hoa/exception" : "dev-master", 29 | "hoa/math" : "dev-master", 30 | "hoa/protocol" : "dev-master", 31 | "hoa/ustring" : "dev-master", 32 | "hoa/visitor" : "dev-master" 33 | }, 34 | "autoload": { 35 | "psr-4": { 36 | "Hoa\\Regex\\": "Source" 37 | } 38 | }, 39 | "extra": { 40 | "branch-alias": { 41 | "dev-master": "1.x-dev" 42 | } 43 | } 44 | } 45 | --------------------------------------------------------------------------------