├── LICENSE ├── README.md ├── UPGRADE.md ├── composer.json └── src ├── AbstractLexer.php └── Token.php /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2006-2018 Doctrine Project 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 7 | of the Software, and to permit persons to whom the Software is furnished to do 8 | so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Doctrine Lexer 2 | 3 | [![Build Status](https://github.com/doctrine/lexer/workflows/Continuous%20Integration/badge.svg)](https://github.com/doctrine/lexer/actions) 4 | 5 | Base library for a lexer that can be used in Top-Down, Recursive Descent Parsers. 6 | 7 | This lexer is used in Doctrine Annotations and in Doctrine ORM (DQL). 
8 | 9 | https://www.doctrine-project.org/projects/lexer.html 10 | -------------------------------------------------------------------------------- /UPGRADE.md: -------------------------------------------------------------------------------- 1 | Note about upgrading: Doctrine uses static and runtime mechanisms to raise 2 | awareness about deprecated code. 3 | 4 | - Use of `@deprecated` docblock that is detected by IDEs (like PhpStorm) or 5 | Static Analysis tools (like Psalm, PHPStan) 6 | - Use of our low-overhead runtime deprecation API, details: 7 | https://github.com/doctrine/deprecations/ 8 | 9 | # Upgrade to 3.0.0 10 | 11 | `Doctrine\Common\Lexer\Token` no longer implements `ArrayAccess`. 12 | Parameter type declarations have been added to 13 | `Doctrine\Common\Lexer\AbstractLexer` and `Doctrine\Common\Lexer\Token`. 14 | You should add both parameter type declarations and return type declarations to 15 | your lexers, based on the `@return` phpdoc. 16 | 17 | # Upgrade to 2.0.0 18 | 19 | `AbstractLexer::glimpse()` and `AbstractLexer::peek()` now return 20 | instances of `Doctrine\Common\Lexer\Token`, which is an array-like class. 21 | Using it as an array is deprecated in favor of using properties of that class. 22 | Using `count()` on it is deprecated with no replacement. 
23 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "doctrine/lexer", 3 | "description": "PHP Doctrine Lexer parser library that can be used in Top-Down, Recursive Descent Parsers.", 4 | "license": "MIT", 5 | "type": "library", 6 | "keywords": [ 7 | "php", 8 | "parser", 9 | "lexer", 10 | "annotations", 11 | "docblock" 12 | ], 13 | "authors": [ 14 | { 15 | "name": "Guilherme Blanco", 16 | "email": "guilhermeblanco@gmail.com" 17 | }, 18 | { 19 | "name": "Roman Borschel", 20 | "email": "roman@code-factory.org" 21 | }, 22 | { 23 | "name": "Johannes Schmitt", 24 | "email": "schmittjoh@gmail.com" 25 | } 26 | ], 27 | "homepage": "https://www.doctrine-project.org/projects/lexer.html", 28 | "require": { 29 | "php": "^8.1" 30 | }, 31 | "require-dev": { 32 | "doctrine/coding-standard": "^12", 33 | "phpstan/phpstan": "^1.10", 34 | "phpunit/phpunit": "^10.5" 35 | }, 36 | "autoload": { 37 | "psr-4": { 38 | "Doctrine\\Common\\Lexer\\": "src" 39 | } 40 | }, 41 | "autoload-dev": { 42 | "psr-4": { 43 | "Doctrine\\Tests\\Common\\Lexer\\": "tests" 44 | } 45 | }, 46 | "config": { 47 | "allow-plugins": { 48 | "composer/package-versions-deprecated": true, 49 | "dealerdirect/phpcodesniffer-composer-installer": true 50 | }, 51 | "sort-packages": true 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/AbstractLexer.php: -------------------------------------------------------------------------------- 1 | > 36 | */ 37 | private array $tokens = []; 38 | 39 | /** 40 | * Current lexer position in input string. 41 | */ 42 | private int $position = 0; 43 | 44 | /** 45 | * Current peek of current lexer position. 46 | */ 47 | private int $peek = 0; 48 | 49 | /** 50 | * The next token in the input. 
51 | * 52 | * @var Token|null 53 | */ 54 | public Token|null $lookahead = null; 55 | 56 | /** 57 | * The last matched/seen token. Like $lookahead it defaults to null, so both can be read before setInput() or reset() has run instead of raising an uninitialized-property Error. 58 | * 59 | * @var Token|null 60 | */ 61 | public Token|null $token = null; 62 | 63 | /** 64 | * Composed regex for input parsing; scan() builds it on first use and reuses it afterwards. 65 | * 66 | * @var non-empty-string|null 67 | */ 68 | private string|null $regex = null; 69 | 70 | /** 71 | * Sets the input data to be tokenized. 72 | * 73 | * The Lexer is immediately reset and the new input tokenized. 74 | * Any unprocessed tokens from any previous input are lost. 75 | * 76 | * @param string $input The input to be tokenized. 77 | * 78 | * @return void 79 | */ 80 | public function setInput(string $input) 81 | { 82 | $this->input = $input; 83 | $this->tokens = []; 84 | 85 | $this->reset(); 86 | $this->scan($input); 87 | } 88 | 89 | /** 90 | * Resets the lexer: clears the lookahead and the current token and rewinds the position and the peek offset to 0. The input and the already scanned tokens are kept. 91 | * 92 | * @return void 93 | */ 94 | public function reset() 95 | { 96 | $this->lookahead = null; 97 | $this->token = null; 98 | $this->peek = 0; 99 | $this->position = 0; 100 | } 101 | 102 | /** 103 | * Resets the peek pointer to 0. 104 | * 105 | * @return void 106 | */ 107 | public function resetPeek() 108 | { 109 | $this->peek = 0; 110 | } 111 | 112 | /** 113 | * Resets the lexer position to the given position. The position is an index into the list of scanned tokens (the token the next moveNext() will load), not a character offset in the input. 114 | * 115 | * @param int $position Token index to place the lexical scanner at. 116 | * 117 | * @return void 118 | */ 119 | public function resetPosition(int $position = 0) 120 | { 121 | $this->position = $position; 122 | } 123 | 124 | /** 125 | * Retrieve the original lexer's input from its start up to, but not including, a given position. The position is a byte offset into the input as understood by substr(), not a token index. 126 | * 127 | * @return string 128 | */ 129 | public function getInputUntilPosition(int $position) 130 | { 131 | return substr($this->input, 0, $position); 132 | } 133 | 134 | /** 135 | * Checks whether a given token matches the current lookahead. 
136 | * 137 | * @param T $type 138 | * 139 | * @return bool 140 | * 141 | * @phpstan-assert-if-true !=null $this->lookahead 142 | */ 143 | public function isNextToken(int|string|UnitEnum $type) 144 | { 145 | return $this->lookahead !== null && $this->lookahead->isA($type); 146 | } 147 | 148 | /** 149 | * Checks whether any of the given token types matches the current lookahead; always false when there is no lookahead. 150 | * 151 | * @param list<T> $types 152 | * 153 | * @return bool 154 | * 155 | * @phpstan-assert-if-true !=null $this->lookahead 156 | */ 157 | public function isNextTokenAny(array $types) 158 | { 159 | return $this->lookahead !== null && $this->lookahead->isA(...$types); 160 | } 161 | 162 | /** 163 | * Moves to the next token in the input string: the old lookahead becomes the current token, the peek offset is reset, and the position advances only while tokens remain. 164 | * 165 | * @return bool True if a lookahead token is available after the move, false once the tokens are exhausted. 166 | * 167 | * @phpstan-assert-if-true !null $this->lookahead 168 | */ 169 | public function moveNext() 170 | { 171 | $this->peek = 0; 172 | $this->token = $this->lookahead; 173 | $this->lookahead = isset($this->tokens[$this->position]) 174 | ? $this->tokens[$this->position++] : null; 175 | 176 | return $this->lookahead !== null; 177 | } 178 | 179 | /** 180 | * Tells the lexer to skip input tokens until the lookahead is a token of the given type or the input is exhausted. 181 | * 182 | * @param T $type The token type to skip until. 183 | * 184 | * @return void 185 | */ 186 | public function skipUntil(int|string|UnitEnum $type) 187 | { 188 | while ($this->lookahead !== null && ! $this->lookahead->isA($type)) { 189 | $this->moveNext(); 190 | } 191 | } 192 | 193 | /** 194 | * Checks if the type that getType() resolves for the given value is identical (===) to the given token type. 195 | * 196 | * @return bool 197 | */ 198 | public function isA(string $value, int|string|UnitEnum $token) 199 | { 200 | return $this->getType($value) === $token; 201 | } 202 | 203 | /** 204 | * Returns the token at the current peek offset and advances that offset; the lookahead and the position are left untouched. 205 | * 206 | * @return Token|null The next token or NULL if there are no more tokens ahead. 
207 | */ 208 | public function peek() 209 | { 210 | if (isset($this->tokens[$this->position + $this->peek])) { 211 | return $this->tokens[$this->position + $this->peek++]; 212 | } 213 | 214 | return null; 215 | } 216 | 217 | /** 218 | * Peeks at the next token, returns it and immediately resets the peek, so repeated calls keep returning the same token. 219 | * 220 | * @return Token|null The next token or NULL if there are no more tokens ahead. 221 | */ 222 | public function glimpse() 223 | { 224 | $peek = $this->peek(); 225 | $this->peek = 0; 226 | 227 | return $peek; 228 | } 229 | 230 | /** 231 | * Scans the input string for tokens and appends them to the token list. The split regex is composed once from the catchable patterns (each in its own capture group), the non-catchable patterns and the modifiers, and is then cached in $this->regex. 232 | * 233 | * @param string $input A query string. 234 | * 235 | * @return void 236 | */ 237 | protected function scan(string $input) 238 | { 239 | if (! isset($this->regex)) { 240 | $this->regex = sprintf( 241 | '/(%s)|%s/%s', 242 | implode(')|(', $this->getCatchablePatterns()), 243 | implode('|', $this->getNonCatchablePatterns()), 244 | $this->getModifiers(), 245 | ); 246 | } 247 | 248 | $flags = PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_OFFSET_CAPTURE; 249 | $matches = preg_split($this->regex, $input, -1, $flags); 250 | 251 | if ($matches === false) { 252 | // Work around https://bugs.php.net/78122 by treating the whole input as a single piece at offset 0 253 | $matches = [[$input, 0]]; 254 | } 255 | 256 | foreach ($matches as $match) { 257 | // getType() takes the value by reference and may rewrite it, so resolve the type before the Token is built 258 | $firstMatch = $match[0]; 259 | $type = $this->getType($firstMatch); 260 | 261 | $this->tokens[] = new Token( 262 | $firstMatch, 263 | $type, 264 | $match[1], 265 | ); 266 | } 267 | } 268 | 269 | /** 270 | * Gets the literal for a given token: "EnumClass::CaseName" for an enum case, otherwise "LexerClass::CONSTANT_NAME" for the first constant of the concrete lexer class whose value is identical to the token, or the token itself when no constant matches. 271 | * 272 | * @param T $token 273 | * 274 | * @return int|string 275 | */ 276 | public function getLiteral(int|string|UnitEnum $token) 277 | { 278 | if ($token instanceof UnitEnum) { 279 | return $token::class . '::' . 
$token->name; 280 | } 281 | 282 | $className = static::class; 283 | 284 | $reflClass = new ReflectionClass($className); 285 | $constants = $reflClass->getConstants(); 286 | 287 | foreach ($constants as $name => $value) { 288 | if ($value === $token) { 289 | return $className . '::' . $name; 290 | } 291 | } 292 | 293 | return $token; 294 | } 295 | 296 | /** 297 | * Regex modifiers appended to the composed pattern; the default 'iu' enables case-insensitive, UTF-8 aware matching. 298 | * 299 | * @return string 300 | */ 301 | protected function getModifiers() 302 | { 303 | return 'iu'; 304 | } 305 | 306 | /** 307 | * Lexical catchable patterns. scan() wraps each one in its own capture group, so text matching them is kept as a token. 308 | * 309 | * @return string[] 310 | */ 311 | abstract protected function getCatchablePatterns(); 312 | 313 | /** 314 | * Lexical non-catchable patterns. scan() adds them without a capture group, so text matching them only separates tokens and is discarded. 315 | * 316 | * @return string[] 317 | */ 318 | abstract protected function getNonCatchablePatterns(); 319 | 320 | /** 321 | * Retrieve token type. Also processes the token value if necessary; the value is taken by reference so changes are visible to the caller. 322 | * 323 | * @return T|null 324 | * 325 | * @param-out V $value 326 | */ 327 | abstract protected function getType(string &$value); 328 | } 329 | -------------------------------------------------------------------------------- /src/Token.php: -------------------------------------------------------------------------------- 1 | value = $value; 47 | $this->type = $type; 48 | $this->position = $position; 49 | } 50 | 51 | /** @param T ...$types */ 52 | public function isA(...$types): bool 53 | { 54 | return in_array($this->type, $types, true); 55 | } 56 | } 57 | --------------------------------------------------------------------------------