├── .php-cs-fixer.dist.php ├── .vscode ├── extensions.json └── settings.json ├── LICENSE.md ├── README.md ├── composer.json ├── phpstan.neon └── src ├── Microdata.php ├── MicrodataDOMDocument.php ├── MicrodataDOMElement.php └── MicrodataParser.php /.php-cs-fixer.dist.php: -------------------------------------------------------------------------------- 1 | in([ 5 | __DIR__ . '/src', 6 | __DIR__ . '/tests', 7 | ]) 8 | ->name('*.php') 9 | ->ignoreDotFiles(true) 10 | ->ignoreVCS(true); 11 | 12 | return (new PhpCsFixer\Config()) 13 | ->setRules([ 14 | '@Symfony' => true, 15 | '@Symfony:risky' => true, 16 | 'concat_space' => ['spacing' => 'one'], 17 | 'yoda_style' => ['equal' => false, 'identical' => false, 'less_and_greater' => false], 18 | ]) 19 | ->setRiskyAllowed(true) 20 | ->setFinder($finder); 21 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "editorconfig.editorconfig", 4 | "wayou.vscode-todo-highlight", 5 | 6 | "bmewburn.vscode-intelephense-client", 7 | "junstyle.php-cs-fixer" 8 | ] 9 | } 10 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.formatOnSave": true, 3 | "php.suggest.basic": false, 4 | "php.validate.enable": false, 5 | "[php]": { 6 | "editor.defaultFormatter": "junstyle.php-cs-fixer" 7 | }, 8 | "php-cs-fixer.executablePath": "${workspaceFolder}/vendor/bin/php-cs-fixer" 9 | } 10 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | 3 | Copyright (c) 2018 Yusuf Kandemir 4 | 5 | > Permission is hereby granted, free of charge, to any person obtaining a copy 6 | > of this 
software and associated documentation files (the "Software"), to deal 7 | > in the Software without restriction, including without limitation the rights 8 | > to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | > copies of the Software, and to permit persons to whom the Software is 10 | > furnished to do so, subject to the following conditions: 11 | > 12 | > The above copyright notice and this permission notice shall be included in 13 | > all copies or substantial portions of the Software. 14 | > 15 | > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | > IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | > FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | > AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | > LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | > OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | > THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # microdata-parser 2 | 3 | [![Latest Version on Packagist][ico-version]][link-packagist] 4 | [![PHP Version Support][ico-php-version]][link-packagist] 5 | [![Software License][ico-license]](LICENSE.md) 6 | [![Tests][ico-tests]][link-tests] 7 | [![Quality Checks][ico-code-quality]][link-code-quality] 8 | [![Total Downloads][ico-downloads]][link-packagist] 9 | 10 | This package aims to implement [W3C Microdata to JSON Specification](https://www.w3.org/TR/microdata/#json). 11 | 12 | **microdata-parser** extracts microdata from documents. 
13 | 14 | ## Installation 15 | 16 | Via Composer 17 | 18 | ```bash 19 | $ composer require yusufkandemir/microdata-parser 20 | ``` 21 | 22 | ## Usage 23 | 24 | ##### PHP 25 | 26 | ```php 27 | use YusufKandemir\MicrodataParser\Microdata; 28 | 29 | $microdata = Microdata::fromHTMLFile('source.html')->toJSON(); 30 | /* Other sources: 31 | fromHTML() // from HTML string 32 | fromDOMDocument() // from DOMDocument object 33 | Other output methods: 34 | toArray() // to Associative PHP Array 35 | toObject() // to PHP Object (stdClass) 36 | */ 37 | ``` 38 | 39 | ##### Source as HTML 40 | 41 | ```html 42 | 43 |
44 | 45 | 48 |
49 | ``` 50 | 51 | ##### Result as JSON 52 | 53 | ```json 54 | { 55 | "items": [ 56 | { 57 | "type": [ "http://schema.org/Product" ], 58 | "properties": { 59 | "image": [ "http://shop.example.com/test_product.jpg" ], 60 | "url": [ "http://shop.example.com/test_product" ], 61 | "name": [ "Test Product" ] 62 | } 63 | } 64 | ] 65 | } 66 | ``` 67 | 68 | ## Testing 69 | 70 | ```bash 71 | $ composer test 72 | ``` 73 | 74 | ## Contributing 75 | 76 | Please see [CONTRIBUTING](.github/CONTRIBUTING.md) for details. 77 | 78 | ## Credits 79 | 80 | - [Yusuf Kandemir][link-author] 81 | - [All Contributors][link-contributors] 82 | 83 | ## License 84 | 85 | The MIT License (MIT). Please see [License File](LICENSE.md) for more information. 86 | 87 | [ico-version]: https://img.shields.io/packagist/v/yusufkandemir/microdata-parser.svg?style=flat-square 88 | [ico-php-version]: https://img.shields.io/packagist/php-v/yusufkandemir/microdata-parser?style=flat-square 89 | [ico-license]: https://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat-square 90 | [ico-tests]: https://img.shields.io/github/actions/workflow/status/yusufkandemir/microdata-parser/run-tests.yml?style=flat-square&logo=github&label=tests 91 | [ico-code-quality]: https://img.shields.io/github/actions/workflow/status/yusufkandemir/microdata-parser/analyze-quality.yml?style=flat-square&logo=github&label=quality 92 | [ico-downloads]: https://img.shields.io/packagist/dt/yusufkandemir/microdata-parser.svg?style=flat-square 93 | 94 | [link-packagist]: https://packagist.org/packages/yusufkandemir/microdata-parser 95 | [link-tests]: https://github.com/yusufkandemir/microdata-parser/actions/workflows/run-tests.yml 96 | [link-code-quality]: https://github.com/yusufkandemir/microdata-parser/actions/workflows/analyze-quality.yml 97 | [link-author]: https://github.com/yusufkandemir 98 | [link-contributors]: ../../contributors 99 | -------------------------------------------------------------------------------- 
/composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "yusufkandemir/microdata-parser", 3 | "type": "library", 4 | "description": "Parse microdata from HTML documents with ease. PHP Implementation of W3C Microdata to JSON Specification.", 5 | "keywords": [ 6 | "microdata", 7 | "parser", 8 | "json", 9 | "w3c", 10 | "whatwg" 11 | ], 12 | "homepage": "https://github.com/yusufkandemir/microdata-parser", 13 | "license": "MIT", 14 | "authors": [ 15 | { 16 | "name": "Yusuf Kandemir", 17 | "email": "yusuf.kandemir@outlook.com.tr", 18 | "homepage": "https://github.com/yusufkandemir" 19 | } 20 | ], 21 | "scripts": { 22 | "analyze": "phpstan", 23 | "test": "pest", 24 | "lint": "php-cs-fixer fix --dry-run --verbose", 25 | "lint:fix": "php-cs-fixer fix" 26 | }, 27 | "require": { 28 | "php": "^8.1", 29 | "ext-dom": "*", 30 | "ext-libxml": "*" 31 | }, 32 | "require-dev": { 33 | "friendsofphp/php-cs-fixer": "^3.54", 34 | "pestphp/pest": "^1.23.1", 35 | "phpstan/phpstan": "^1.10.67" 36 | }, 37 | "suggest": { 38 | "ext-json": "Needed to convert results to JSON" 39 | }, 40 | "autoload": { 41 | "psr-4": { 42 | "YusufKandemir\\MicrodataParser\\": "src" 43 | } 44 | }, 45 | "autoload-dev": { 46 | "psr-4": { 47 | "YusufKandemir\\MicrodataParser\\Tests\\": "tests" 48 | } 49 | }, 50 | "extra": { 51 | "branch-alias": { 52 | "dev-master": "1.0-dev" 53 | } 54 | }, 55 | "config": { 56 | "sort-packages": true, 57 | "allow-plugins": { 58 | "pestphp/pest-plugin": true 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /phpstan.neon: -------------------------------------------------------------------------------- 1 | parameters: 2 | level: 6 3 | paths: 4 | - src 5 | tmpDir: build/phpstan 6 | -------------------------------------------------------------------------------- /src/Microdata.php: -------------------------------------------------------------------------------- 1 | 
loadHTML($html, \LIBXML_NOERROR);
    $dom->documentURI = $documentURI;

    return new MicrodataParser($dom);
}

/**
 * Builds a MicrodataParser for the HTML document stored in the given file.
 *
 * @param string $filename    Path to the file to be parsed
 * @param string $documentURI DocumentURI to be used in absolutizing URIs
 */
public static function fromHTMLFile(string $filename, string $documentURI = ''): MicrodataParser
{
    $document = new MicrodataDOMDocument();
    $document->loadHTMLFile($filename, \LIBXML_NOERROR);
    // Set once the file has been loaded, so the caller-supplied value is the one that stays.
    $document->documentURI = $documentURI;

    return new MicrodataParser($document);
}

/**
 * Builds a MicrodataParser from an existing DOMDocument instance.
 * If you already hold a MicrodataDOMDocument, instantiate MicrodataParser directly to avoid the conversion.
 *
 * @param \DOMDocument $domDocument DOMDocument to be parsed.
 *                                  Its documentURI property is what gets used in absolutizing URIs, if wanted.
 */
public static function fromDOMDocument(\DOMDocument $domDocument): MicrodataParser
{
    $document = new MicrodataDOMDocument();
    $document->loadDOMDocument($domDocument);

    return new MicrodataParser($document);
}
}
--------------------------------------------------------------------------------
/src/MicrodataDOMDocument.php:
--------------------------------------------------------------------------------
List of top level items as elements
 */
public function getItems(): \DOMNodeList
{
    // Elements that declare an item (itemscope) without being a property (itemprop) themselves.
    $topLevelItemsQuery = '//*[@itemscope and not(@itemprop)]';

    return $this->xpath->query($topLevelItemsQuery);
}

/**
 * {@inheritdoc}
 * Also assigns $xpath with DOMXPath of the freshly loaded DOMDocument.
*/
public function loadHTML($source, $options = 0): bool
{
    $loaded = parent::loadHTML($source, $options);

    // Rebuild the XPath helper for the freshly loaded content.
    $this->xpath = new \DOMXPath($this);

    return $loaded;
}

/**
 * {@inheritdoc}
 * Also assigns $xpath with DOMXPath of the freshly loaded DOMDocument.
 */
public function loadHTMLFile($filename, $options = 0): bool
{
    $loaded = parent::loadHTMLFile($filename, $options);

    // Rebuild the XPath helper for the freshly loaded content.
    $this->xpath = new \DOMXPath($this);

    return $loaded;
}

/**
 * Load a DOMDocument instance as the root of this document.
 * Also assigns $xpath with DOMXPath of the freshly loaded DOMDocument.
 * Also copies documentURI from the given DOMDocument.
 *
 * A source document without a root element (e.g. a DOMDocument nothing was loaded into)
 * is accepted: nothing is imported and this document simply stays without a root.
 */
public function loadDOMDocument(\DOMDocument $domDocument): void
{
    $this->documentURI = $domDocument->documentURI;

    // documentElement is null for an empty document; importNode() only accepts a node.
    $root = $domDocument->documentElement;
    if ($root !== null) {
        $this->appendChild($this->importNode($root, true));
    }

    $this->xpath = new \DOMXPath($this);
}
}
--------------------------------------------------------------------------------
/src/MicrodataDOMElement.php:
--------------------------------------------------------------------------------
"tag name" to "attribute name" mapping */
private static array $tagNameLookup = [
    'audio' => 'src',
    'embed' => 'src',
    'iframe' => 'src',
    'img' => 'src',
    'source' => 'src',
    'track' => 'src',
    'video' => 'src',
    'a' => 'href',
    'area' => 'href',
    'link' => 'href',
    'object' => 'data',
    'data' => 'value',
    'meter' => 'value',
    'time' => 'datetime',
];

/** @var string[] Attributes that have absolute values */
private static array $absoluteAttributes = ['src', 'href', 'data'];

/**
 * @see https://www.w3.org/TR/2018/WD-microdata-20180426/#dfn-item-properties for details of algorithm
 *
 * @return self[]
 */
public function getProperties(): array
{
    /** @var self[] $results */
    $results = [];
    // Nodes that were already visited; seeded with this element so it is never revisited.
    $memory = [$this];
    // Work stack: the child elements first, then the elements referenced by this element.
    $pending = array_merge($this->getChildElementNodes(), $this->getReferenceNodes());

    while ($pending) {
        $current = array_pop($pending);

        foreach ($memory as $memoryItem) {
            if ($current->isSameNode($memoryItem)) {
                continue 2; // Already visited: move on to the next pending node
            }
        }

        $memory[] = $current;

        // Children of an element that has itemscope are not queued.
        if (!$current->hasAttribute('itemscope')) {
            $pending = array_merge($pending, $current->getChildElementNodes());
        }

        if ($current->hasAttribute('itemprop') && $current->hasPropertyNames()) {
            $results[] = $current;
        }
    }

    // The stack is consumed from its end, so the collected order is reversed before returning.
    return array_reverse($results);
}

/**
 * Tells whether the itemprop attribute of this element carries at least one usable property name.
 */
public function hasPropertyNames(): bool
{
    return $this->getPropertyNames() !== [];
}

/**
 * Returns the distinct property names listed in the itemprop attribute, in order of first appearance.
 * Names are returned exactly as written; no vocabulary identifier is prepended.
 *
 * @see https://www.w3.org/TR/2018/WD-microdata-20180426/#dfn-property-name
 *
 * @return string[]
 */
public function getPropertyNames(): array
{
    $names = [];

    foreach ($this->tokenizeAttribute('itemprop') as $token) {
        // Guard against empty tokens (e.g. from an empty itemprop attribute); they are not property names.
        if ($token !== '') {
            $names[] = $token;
        }
    }

    // array_unique() keeps the original keys; re-index so callers always get a list.
    return array_values(array_unique($names));
}

/**
 * Resolves the value of this property element.
 *
 * Order of precedence: the element itself when it has itemscope, the content attribute when present,
 * the tag specific attribute from $tagNameLookup when it is not empty, and the text content otherwise.
 *
 * @see https://www.w3.org/TR/2018/WD-microdata-20180426/#dfn-property-value for details of algorithm
 *
 * @param callable|null $absoluteUriHandler Called as ($value, $documentURI) for non-absolute src/href/data
 *                                          values; when null such values are returned as written
 *
 * @return $this|string
 */
public function getPropertyValue(?callable $absoluteUriHandler = null): string|static
{
    if ($this->hasAttribute('itemscope')) {
        return $this;
    }

    if ($this->hasAttribute('content')) {
        return $this->getAttribute('content');
    }

    $value = '';
    $attribute = self::$tagNameLookup[$this->tagName] ?? null;

    if ($attribute !== null) {
        $value = $this->getAttribute($attribute);

        // Compare against '' explicitly: a value such as "0" is legitimate and must not be discarded.
        if (
            $value !== ''
            && $absoluteUriHandler !== null // the handler is optional, never call null
            && \in_array($attribute, self::$absoluteAttributes, true)
            && !$this->isAbsoluteUri($value)
        ) {
            $value = (string) $absoluteUriHandler($value, $this->ownerDocument->documentURI);
        }
    }

    return $value !== '' ? $value : $this->textContent;
}

/**
 * Checks whether a string starts with a URI scheme ("scheme:"), i.e. looks like an absolute URI.
 * Only the scheme prefix is inspected (a letter followed by letters, digits, "+", "-" or "."),
 * the rest of the string is not validated.
 */
protected function isAbsoluteUri(string $uri): bool
{
    return preg_match('/^[a-z][a-z0-9+.\-]*:/i', trim($uri)) === 1;
}

/**
 * Filters out TextNodes etc. and returns the DOMElements.
 *
 * @return self[]
 */
protected function getChildElementNodes(): array
{
    $childNodes = [];

    /** @var self $childNode */
    foreach ($this->childNodes as $childNode) {
        if ($childNode->nodeType === \XML_ELEMENT_NODE) {
            $childNodes[] = $childNode;
        }
    }

    return $childNodes;
}

/**
 * Tokenizes value of given attribute.
*
 * @param string $attributeName Name of the attribute
 *
 * @return string[]
 */
public function tokenizeAttribute(string $attributeName): array
{
    return $this->hasAttribute($attributeName)
        ? $this->tokenize($this->getAttribute($attributeName))
        : [];
}

/**
 * Splits given attribute value in space characters to array.
 * An empty or whitespace-only value yields an empty array, never an empty-string token.
 *
 * @see https://www.w3.org/TR/2018/WD-microdata-20180426/#dfn-split-a-string-on-spaces for definition of tokens
 *
 * @return string[]
 */
protected function tokenize(string $attribute): array
{
    // PREG_SPLIT_NO_EMPTY drops the empty pieces produced by leading/trailing whitespace or an empty value.
    return preg_split('/\s+/', $attribute, -1, \PREG_SPLIT_NO_EMPTY) ?: [];
}

/**
 * Finds the nodes that this node references through the document.
 * For every token of the itemref attribute the first element with a matching id is collected.
 *
 * @see https://www.w3.org/TR/microdata/#dfn-item-properties 4th step
 *
 * @return self[]
 */
protected function getReferenceNodes(): array
{
    /** @var self[] $referenceNodes */
    $referenceNodes = [];

    foreach ($this->tokenizeAttribute('itemref') as $token) {
        // XPath 1.0 has no escape syntax inside string literals: pick a delimiter the token does not
        // contain, or assemble the literal with concat() when it contains both kinds of quotes.
        if (!str_contains($token, '"')) {
            $literal = '"' . $token . '"';
        } elseif (!str_contains($token, "'")) {
            $literal = "'" . $token . "'";
        } else {
            $literal = 'concat("' . str_replace('"', '", \'"\', "', $token) . '")';
        }

        $references = $this->ownerDocument->xpath->query('//*[@id=' . $literal . ']');
        if ($references === false) {
            continue; // The expression could not be evaluated; treat it as "no such element"
        }

        /** @var self|null $first */
        $first = $references->item(0);
        if ($first !== null) {
            $referenceNodes[] = $first;
        }
    }

    return $referenceNodes;
}
}
--------------------------------------------------------------------------------
/src/MicrodataParser.php:
--------------------------------------------------------------------------------
registerNodeClass(\DOMElement::class, MicrodataDOMElement::class);

    $this->dom = $dom;
    $this->absoluteUriHandler = $absoluteUriHandler ?: function ($value, $base) {
        return $base . $value;
    };
}

/**
 * Extracts and converts microdata to associative array.
*
 * @return mixed[]
 *
 * @throws \JsonException
 */
public function toArray(): array
{
    // Somewhat hacky way to convert deep objects: encode to JSON, then decode as nested arrays
    $json = json_encode($this->extractMicrodata(), \JSON_THROW_ON_ERROR);

    return json_decode($json, true, flags: \JSON_THROW_ON_ERROR);
}

/**
 * Extracts the microdata and returns it as an object tree.
 */
public function toObject(): \stdClass
{
    return $this->extractMicrodata();
}

/**
 * Extracts the microdata and serializes it with \json_encode().
 * \JSON_THROW_ON_ERROR is always added to the given options.
 *
 * @see json_encode() to description of parameters
 *
 * @throws \JsonException
 */
public function toJSON(int $options = 0, int $depth = 512): string
{
    $microdata = $this->extractMicrodata();

    return json_encode($microdata, $options | \JSON_THROW_ON_ERROR, $depth);
}

/**
 * Builds the result object: an "items" list with one entry per item returned by the document.
 */
protected function extractMicrodata(): \stdClass
{
    $items = [];

    foreach ($this->dom->getItems() as $topLevelItem) {
        $items[] = $this->getObject($topLevelItem);
    }

    $result = new \stdClass();
    $result->items = $items;

    return $result;
}

/**
 * Converts an item element to an object with "type", optional "id" and "properties".
 * A property whose value is an item already present in $memory is emitted as the string 'ERROR'
 * instead of being converted again.
 *
 * @see https://www.w3.org/TR/2018/WD-microdata-20180426/#dfn-get-the-object
 *
 * @param MicrodataDOMElement[] $memory Items on the path from the top level item down to $item
 */
protected function getObject(MicrodataDOMElement $item, array $memory = []): \stdClass
{
    $memory[] = $item;

    $result = new \stdClass();

    $result->type = $item->tokenizeAttribute('itemtype');
    // @todo Check if types are valid absolute urls

    if ($item->hasAttribute('itemid')) {
        $result->id = $item->getAttribute('itemid');
    }
    // @todo Check if item ids are valid absolute urls or like isbn:xxx

    $properties = new \stdClass();

    foreach ($item->getProperties() as $element) {
        $value = $element->getPropertyValue($this->absoluteUriHandler);

        if ($this->isItem($value)) {
            // Look for the property element among the items that are currently being converted.
            $alreadySeen = false;
            foreach ($memory as $visited) {
                if ($element->isSameNode($visited)) {
                    $alreadySeen = true;

                    break;
                }
            }

            $value = $alreadySeen ? 'ERROR' : $this->getObject($value, $memory);
        }

        foreach ($element->getPropertyNames() as $name) {
            $properties->{$name}[] = $value;
        }
    }

    $result->properties = $properties;

    return $result;
}

/**
 * Replaces the handler that is passed to getPropertyValue() for every property.
 */
public function setAbsoluteUriHandler(callable $handler): void
{
    $this->absoluteUriHandler = $handler;
}

/**
 * Tells whether the given value is a MicrodataDOMElement that has the itemscope attribute.
 */
protected function isItem(mixed $element): bool
{
    if (!$element instanceof MicrodataDOMElement) {
        return false;
    }

    return $element->hasAttribute('itemscope');
}
}
--------------------------------------------------------------------------------