├── CHANGELOG.md ├── LICENSE ├── README.md ├── composer.json ├── composer.lock └── src ├── Exception.php ├── NamespaceUri.php ├── NamespaceUriException.php ├── Token.php ├── TokenCollection.php ├── TokenCollectionException.php ├── Tokenizer.php └── XMLSerializer.php /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to Tokenizer are documented in this file using the [Keep a CHANGELOG](http://keepachangelog.com/) principles. 4 | 5 | ## [1.2.3] - 2024-03-03 6 | 7 | ### Changed 8 | 9 | * Do not use implicitly nullable parameters 10 | 11 | ## [1.2.2] - 2023-11-20 12 | 13 | ### Fixed 14 | 15 | * [#18](https://github.com/theseer/tokenizer/issues/18): Tokenizer fails on protobuf metadata files 16 | 17 | 18 | ## [1.2.1] - 2021-07-28 19 | 20 | ### Fixed 21 | 22 | * [#13](https://github.com/theseer/tokenizer/issues/13): Fatal error when tokenizing files that contain only a single empty line 23 | 24 | 25 | ## [1.2.0] - 2020-07-13 26 | 27 | This release is now PHP 8.0 compliant. 28 | 29 | ### Fixed 30 | 31 | * Whitespace handling in general (only noticeable in the intermediate `TokenCollection`) is now consistent 32 | 33 | ### Changed 34 | 35 | * Updated `Tokenizer` to deal with changed whitespace handling in PHP 8.0 36 | The XMLSerializer was unaffected.
37 | 38 | 39 | ## [1.1.3] - 2019-06-14 40 | 41 | ### Changed 42 | 43 | * Ensure XMLSerializer can deal with empty token collections 44 | 45 | ### Fixed 46 | 47 | * [#2](https://github.com/theseer/tokenizer/issues/2): Fatal error in infection / phpunit 48 | 49 | 50 | ## [1.1.2] - 2019-04-04 51 | 52 | ### Changed 53 | 54 | * Reverted PHPUnit 8 test update to stay PHP 7.0 compliant 55 | 56 | 57 | ## [1.1.1] - 2019-04-03 58 | 59 | ### Fixed 60 | 61 | * [#1](https://github.com/theseer/tokenizer/issues/1): Empty file causes invalid array read 62 | 63 | ### Changed 64 | 65 | * Tests should now be PHPUnit 8 compliant 66 | 67 | 68 | ## [1.1.0] - 2017-04-07 69 | 70 | ### Added 71 | 72 | * Allow use of custom namespace for XML serialization 73 | 74 | 75 | ## [1.0.0] - 2017-04-05 76 | 77 | Initial Release 78 | 79 | [1.2.3]: https://github.com/theseer/tokenizer/compare/1.2.2...1.2.3 80 | [1.2.2]: https://github.com/theseer/tokenizer/compare/1.2.1...1.2.2 81 | [1.2.1]: https://github.com/theseer/tokenizer/compare/1.2.0...1.2.1 82 | [1.2.0]: https://github.com/theseer/tokenizer/compare/1.1.3...1.2.0 83 | [1.1.3]: https://github.com/theseer/tokenizer/compare/1.1.2...1.1.3 84 | [1.1.2]: https://github.com/theseer/tokenizer/compare/1.1.1...1.1.2 85 | [1.1.1]: https://github.com/theseer/tokenizer/compare/1.1.0...1.1.1 86 | [1.1.0]: https://github.com/theseer/tokenizer/compare/1.0.0...1.1.0 87 | [1.0.0]: https://github.com/theseer/tokenizer/compare/b2493e57de80c1b7414219b28503fa5c6b4d0a98...1.0.0 88 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Tokenizer 2 | 3 | Copyright (c) 2017 Arne Blankerts and contributors 4 | All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without modification, 7 | are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, 10 | this list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of Arne Blankerts nor the names of contributors 17 | may be used to endorse or promote products derived from this software 18 | without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 22 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 23 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS 24 | BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, 25 | OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 | POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tokenizer 2 | 3 | A small library for converting tokenized PHP source code into XML.
4 | 5 | [![Test](https://github.com/theseer/tokenizer/actions/workflows/ci.yml/badge.svg)](https://github.com/theseer/tokenizer/actions/workflows/ci.yml) 6 | 7 | ## Installation 8 | 9 | You can add this library as a local, per-project dependency to your project using [Composer](https://getcomposer.org/): 10 | 11 | composer require theseer/tokenizer 12 | 13 | If you only need this library during development, for instance to run your project's test suite, then you should add it as a development-time dependency: 14 | 15 | composer require --dev theseer/tokenizer 16 | 17 | ## Usage examples 18 | 19 | ```php 20 | $tokenizer = new TheSeer\Tokenizer\Tokenizer(); 21 | $tokens = $tokenizer->parse(file_get_contents(__DIR__ . '/src/XMLSerializer.php')); 22 | 23 | $serializer = new TheSeer\Tokenizer\XMLSerializer(); 24 | $xml = $serializer->toXML($tokens); 25 | 26 | echo $xml; 27 | ``` 28 | 29 | The generated XML structure looks something like this: 30 | 31 | ```xml 32 | 33 | 34 | 35 | <?php 36 | declare 37 | ( 38 | strict_types 39 | 40 | = 41 | 42 | 1 43 | ) 44 | ; 45 | 46 | 47 | ``` 48 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "theseer/tokenizer", 3 | "description": "A small library for converting tokenized PHP source code into XML and potentially other formats", 4 | "license": "BSD-3-Clause", 5 | "authors": [ 6 | { 7 | "name": "Arne Blankerts", 8 | "email": "arne@blankerts.de", 9 | "role": "Developer" 10 | } 11 | ], 12 | "support": { 13 | "issues": "https://github.com/theseer/tokenizer/issues" 14 | }, 15 | "require": { 16 | "php": "^7.2 || ^8.0", 17 | "ext-xmlwriter": "*", 18 | "ext-dom": "*", 19 | "ext-tokenizer": "*" 20 | }, 21 | "autoload": { 22 | "classmap": [ 23 | "src/" 24 | ] 25 | } 26 | } 27 | 28 | -------------------------------------------------------------------------------- /composer.lock: 
-------------------------------------------------------------------------------- 1 | { 2 | "_readme": [ 3 | "This file locks the dependencies of your project to a known state", 4 | "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", 5 | "This file is @generated automatically" 6 | ], 7 | "content-hash": "b010f1b3d9d47d431ee1cb54ac1de755", 8 | "packages": [], 9 | "packages-dev": [], 10 | "aliases": [], 11 | "minimum-stability": "stable", 12 | "stability-flags": [], 13 | "prefer-stable": false, 14 | "prefer-lowest": false, 15 | "platform": { 16 | "php": "^7.2 || ^8.0", 17 | "ext-xmlwriter": "*", 18 | "ext-dom": "*", 19 | "ext-tokenizer": "*" 20 | }, 21 | "platform-dev": [] 22 | } 23 | -------------------------------------------------------------------------------- /src/Exception.php: -------------------------------------------------------------------------------- 1 | ensureValidUri($value); 11 | $this->value = $value; 12 | } 13 | /** Returns the string stored in $this->value. */ 14 | public function asString(): string { 15 | return $this->value; 16 | } 17 | /** Throws a NamespaceUriException when $value contains no colon; otherwise does nothing. */ 18 | private function ensureValidUri($value): void { 19 | if (\strpos($value, ':') === false) { 20 | throw new NamespaceUriException( 21 | \sprintf("Namespace URI '%s' must contain at least one colon", $value) 22 | ); 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/NamespaceUriException.php: -------------------------------------------------------------------------------- 1 | line = $line; 20 | $this->name = $name; 21 | $this->value = $value; 22 | } 23 | /** Returns the line number stored in $this->line. */ 24 | public function getLine(): int { 25 | return $this->line; 26 | } 27 | /** Returns the name stored in $this->name. */ 28 | public function getName(): string { 29 | return $this->name; 30 | } 31 | /** Returns the value stored in $this->value. */ 32 | public function getValue(): string { 33 | return $this->value; 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/TokenCollection.php:
-------------------------------------------------------------------------------- 1 | tokens[] = $token; 14 | } 15 | /** Returns the token at the list's internal array pointer. */ 16 | public function current(): Token { 17 | return \current($this->tokens); 18 | } 19 | /** Returns the key at the list's internal array pointer. */ 20 | public function key(): int { 21 | return \key($this->tokens); 22 | } 23 | /** Advances the internal array pointer and the position counter by one. */ 24 | public function next(): void { 25 | \next($this->tokens); 26 | $this->pos++; 27 | } 28 | /** True while the position counter is smaller than the number of tokens. */ 29 | public function valid(): bool { 30 | return $this->count() > $this->pos; 31 | } 32 | /** Resets the internal array pointer and sets the position counter back to 0. */ 33 | public function rewind(): void { 34 | \reset($this->tokens); 35 | $this->pos = 0; 36 | } 37 | /** Returns the number of tokens in the list. */ 38 | public function count(): int { 39 | return \count($this->tokens); 40 | } 41 | /** True when an entry is set at $offset in the token list. */ 42 | public function offsetExists($offset): bool { 43 | return isset($this->tokens[$offset]); 44 | } 45 | 46 | /** 47 | * @throws TokenCollectionException if no token is set at $offset 48 | */ 49 | public function offsetGet($offset): Token { 50 | if (!$this->offsetExists($offset)) { 51 | throw new TokenCollectionException( 52 | \sprintf('No Token at offset %s', $offset) 53 | ); 54 | } 55 | 56 | return $this->tokens[$offset]; 57 | } 58 | 59 | /** 60 | * @param Token $value 61 | * 62 | * @throws TokenCollectionException if $offset is not an integer or $value is not a Token 63 | */ 64 | public function offsetSet($offset, $value): void { 65 | if (!\is_int($offset)) { 66 | $type = \gettype($offset); 67 | /* Name the class of the rejected offset itself, not of $value. */ 68 | throw new TokenCollectionException( 69 | \sprintf( 70 | 'Offset must be of type integer, %s given', 71 | $type === 'object' ? \get_class($offset) : $type 72 | ) 73 | ); 74 | } 75 | 76 | if (!$value instanceof Token) { 77 | $type = \gettype($value); 78 | 79 | throw new TokenCollectionException( 80 | \sprintf( 81 | 'Value must be of type %s, %s given', 82 | Token::class, 83 | $type === 'object' ?
\get_class($value) : $type 84 | ) 85 | ); 86 | } 87 | $this->tokens[$offset] = $value; 88 | } 89 | /** Removes the entry at $offset from the token list. */ 90 | public function offsetUnset($offset): void { 91 | unset($this->tokens[$offset]); 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/TokenCollectionException.php: -------------------------------------------------------------------------------- 1 | 'T_OPEN_BRACKET', 13 | ')' => 'T_CLOSE_BRACKET', 14 | '[' => 'T_OPEN_SQUARE', 15 | ']' => 'T_CLOSE_SQUARE', 16 | '{' => 'T_OPEN_CURLY', 17 | '}' => 'T_CLOSE_CURLY', 18 | ';' => 'T_SEMICOLON', 19 | '.' => 'T_DOT', 20 | ',' => 'T_COMMA', 21 | '=' => 'T_EQUAL', 22 | '<' => 'T_LT', 23 | '>' => 'T_GT', 24 | '+' => 'T_PLUS', 25 | '-' => 'T_MINUS', 26 | '*' => 'T_MULT', 27 | '/' => 'T_DIV', 28 | '?' => 'T_QUESTION_MARK', 29 | '!' => 'T_EXCLAMATION_MARK', 30 | ':' => 'T_COLON', 31 | '"' => 'T_DOUBLE_QUOTES', 32 | '@' => 'T_AT', 33 | '&' => 'T_AMPERSAND', 34 | '%' => 'T_PERCENT', 35 | '|' => 'T_PIPE', 36 | '$' => 'T_DOLLAR', 37 | '^' => 'T_CARET', 38 | '~' => 'T_TILDE', 39 | '`' => 'T_BACKTICK' 40 | ]; 41 | /** Tokenizes $source with token_get_all() and returns the tokens as a TokenCollection passed through fillBlanks(); an empty string yields an empty collection. */ 42 | public function parse(string $source): TokenCollection { 43 | $result = new TokenCollection(); 44 | 45 | if ($source === '') { 46 | return $result; 47 | } 48 | 49 | $tokens = \token_get_all($source); 50 | 51 | $lastToken = new Token( 52 | $tokens[0][2], 53 | 'Placeholder', 54 | '' 55 | ); 56 | /* String entries carry no line number of their own: the name comes from $this->map and the line from the previous token. */ 57 | foreach ($tokens as $pos => $tok) { 58 | if (\is_string($tok)) { 59 | $token = new Token( 60 | $lastToken->getLine(), 61 | $this->map[$tok], 62 | $tok 63 | ); 64 | $result->addToken($token); 65 | $lastToken = $token; 66 | 67 | continue; 68 | } 69 | /* Split the token text at line breaks so that each piece gets its own line number. */ 70 | $line = $tok[2]; 71 | $values = \preg_split('/\R+/Uu', $tok[1]); 72 | /* A falsy preg_split() result is replaced by a single '{binary data}' placeholder token. */ 73 | if (!$values) { 74 | $result->addToken( 75 | new Token( 76 | $line, 77 | \token_name($tok[0]), 78 | '{binary data}' 79 | ) 80 | ); 81 | 82 | continue; 83 | } 84 | 85 | foreach ($values as $v) { 86 | $token = new Token( 87 | $line, 88 |
\token_name($tok[0]), 89 | $v 90 | ); 91 | $lastToken = $token; 92 | $line++; 93 | /* Empty pieces have already advanced $lastToken and $line above; they are just not added to the result. */ 94 | if ($v === '') { 95 | continue; 96 | } 97 | 98 | $result->addToken($token); 99 | } 100 | } 101 | 102 | return $this->fillBlanks($result, $lastToken->getLine()); 103 | } 104 | /** Copies $tokens into a new TokenCollection, adding an empty-valued T_WHITESPACE token for every line number skipped before or between tokens and for the lines left over up to $maxLine. */ 105 | private function fillBlanks(TokenCollection $tokens, int $maxLine): TokenCollection { 106 | $prev = new Token( 107 | 0, 108 | 'Placeholder', 109 | '' 110 | ); 111 | 112 | $final = new TokenCollection(); 113 | 114 | foreach ($tokens as $token) { 115 | $gap = $token->getLine() - $prev->getLine(); 116 | 117 | while ($gap > 1) { 118 | $linebreak = new Token( 119 | $prev->getLine() + 1, 120 | 'T_WHITESPACE', 121 | '' 122 | ); 123 | $final->addToken($linebreak); 124 | $prev = $linebreak; 125 | $gap--; 126 | } 127 | 128 | $final->addToken($token); 129 | $prev = $token; 130 | } 131 | /* Pad the lines after the last token up to $maxLine. */ 132 | $gap = $maxLine - $prev->getLine(); 133 | 134 | while ($gap > 0) { 135 | $linebreak = new Token( 136 | $prev->getLine() + 1, 137 | 'T_WHITESPACE', 138 | '' 139 | ); 140 | $final->addToken($linebreak); 141 | $prev = $linebreak; 142 | $gap--; 143 | } 144 | 145 | return $final; 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /src/XMLSerializer.php: -------------------------------------------------------------------------------- 1 | xmlns = $xmlns; 27 | } 28 | /** Serializes $tokens via toXML() and loads the result into a new DOMDocument. */ 29 | public function toDom(TokenCollection $tokens): DOMDocument { 30 | $dom = new DOMDocument(); 31 | $dom->preserveWhiteSpace = false; 32 | $dom->loadXML($this->toXML($tokens)); 33 | 34 | return $dom; 35 | } 36 | /** Writes $tokens into an in-memory XMLWriter document whose 'source' root element carries the xmlns attribute, and returns the XML string. */ 37 | public function toXML(TokenCollection $tokens): string { 38 | $this->writer = new \XMLWriter(); 39 | $this->writer->openMemory(); 40 | $this->writer->setIndent(true); 41 | $this->writer->startDocument(); 42 | $this->writer->startElement('source'); 43 | $this->writer->writeAttribute('xmlns', $this->xmlns->asString()); 44 | 45 | if (\count($tokens) > 0) { 46 | $this->writer->startElement('line'); 47 |
$this->writer->writeAttribute('no', '1'); 48 | 49 | $this->previousToken = $tokens[0]; 50 | 51 | foreach ($tokens as $token) { 52 | $this->addToken($token); 53 | } 54 | } 55 | /* Close the elements still open: the current 'line' (when tokens were written) and the 'source' root. */ 56 | $this->writer->endElement(); 57 | $this->writer->endElement(); 58 | $this->writer->endDocument(); 59 | 60 | return $this->writer->outputMemory(); 61 | } 62 | /** Writes one token: opens a new 'line' element when the token's line number is greater than the previous token's, then emits a 'token' element unless the value is empty. */ 63 | private function addToken(Token $token): void { 64 | if ($this->previousToken->getLine() < $token->getLine()) { 65 | $this->writer->endElement(); 66 | 67 | $this->writer->startElement('line'); 68 | $this->writer->writeAttribute('no', (string)$token->getLine()); 69 | $this->previousToken = $token; 70 | } 71 | /* The value is escaped with htmlspecialchars() and then written unmodified via writeRaw(). */ 72 | if ($token->getValue() !== '') { 73 | $this->writer->startElement('token'); 74 | $this->writer->writeAttribute('name', $token->getName()); 75 | $this->writer->writeRaw(\htmlspecialchars($token->getValue(), \ENT_NOQUOTES | \ENT_DISALLOWED | \ENT_XML1)); 76 | $this->writer->endElement(); 77 | } 78 | } 79 | } 80 | --------------------------------------------------------------------------------