├── lib ├── html5 │ ├── HTML5 │ │ ├── Exception.php │ │ ├── Parser │ │ │ ├── ParseError.php │ │ │ ├── FileInputStream.php │ │ │ ├── CharacterReference.php │ │ │ ├── README.md │ │ │ ├── InputStream.php │ │ │ ├── TreeBuildingRules.php │ │ │ ├── EventHandler.php │ │ │ ├── UTF8Utils.php │ │ │ ├── StringInputStream.php │ │ │ └── Scanner.php │ │ ├── Serializer │ │ │ ├── README.md │ │ │ ├── RulesInterface.php │ │ │ └── Traverser.php │ │ └── InstructionProcessor.php │ ├── autoloader.php │ ├── UPGRADING.md │ ├── RELEASE.md │ ├── HTML5.php │ └── README.md └── mf2 │ └── LICENSE.md ├── includes ├── autoload.php ├── class-parse-this-json.php ├── class-parse-this-youtube.php ├── class-parse-this-opml.php ├── class-parse-this-twitter.php ├── class-parse-this-jsonfeed.php ├── class-rest-parse-this.php ├── class-parse-this-instagram.php ├── compat-functions.php ├── class-parse-this-base.php ├── class-parse-this-discovery.php ├── class-parse-this-rss.php ├── class-parse-this-restapi.php ├── class-parse-this-mf2-utils.php ├── class-parse-this-html.php └── class-parse-this.php ├── parse-this.php └── readme.txt /lib/html5/HTML5/Exception.php: -------------------------------------------------------------------------------- 1 | ....'); 10 | \HTML5::saveHTML($dom); 11 | 12 | After: 13 | 14 | use Masterminds\HTML5; 15 | 16 | $html5 = new HTML5(); 17 | 18 | $dom = $html5->loadHTML('....'); 19 | echo $html5->saveHTML($dom); 20 | 21 | 22 | -------------------------------------------------------------------------------- /parse-this.php: -------------------------------------------------------------------------------- 1 | query( "//script[@type='application/json']" ) as $script ) { 20 | $content = $script->textContent; // phpcs:ignore 21 | $json[] = json_decode( $content, true ); 22 | } 23 | $json = array_filter( $json ); 24 | 25 | $jf2 = array(); 26 | 27 | if ( 1 === count( $json ) && wp_is_numeric_array( $json ) ) { 28 | $json = $json[0]; 29 | if ( array_key_exists( 'props', $json ) ) { 30 | 
$props = $json['props']; 31 | if ( array_key_exists( 'pageProps', $props ) ) { 32 | $props = $props['pageProps']; 33 | if ( array_key_exists( 'article', $props ) ) { 34 | $jf2['type'] = 'entry'; 35 | $jf2['name'] = ifset( $props['article']['title'] ); 36 | if ( array_key_exists( 'meta', $props['article'] ) ) { 37 | $jf2['published'] = normalize_iso8601( ifset( $props['article']['meta']['date'] ) ); 38 | $jf2['category'] = ifset( $props['article']['meta']['tags'] ); 39 | } 40 | } 41 | } 42 | } 43 | } 44 | $jf2 = array_filter( $jf2 ); 45 | 46 | if ( WP_DEBUG ) { 47 | $jf2['_json'] = $json; 48 | } 49 | return array_filter( $jf2 ); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /readme.txt: -------------------------------------------------------------------------------- 1 | === Parse This === 2 | Contributors: dshanske 3 | Tags: indieweb 4 | Stable tag: trunk 5 | Requires at least: 4.9 6 | Requires PHP: 5.6 7 | Tested up to: 5.6 8 | License: GPLv2 or later 9 | License URI: http://www.gnu.org/licenses/gpl-2.0.html 10 | 11 | Parse This turns URLs into structured jf2 data 12 | 13 | == Description == 14 | 15 | Parse This is based on a variety of projects including the parsing code from Press This, which was removed from WordPress. 16 | 17 | * It supports parsing from MF2 if present 18 | * For sites that are not marked up with Microformats 2(MF2) it will fall back onto parsing JSON-LD, then HTML/OpenGraph/Dublin Core Tags/etc. 19 | * It supports parsing of JSONFeed and RSS/Atom feeds 20 | * It supports parsing of WordPress REST API endpoints to generate a site feed 21 | 22 | The goal is to produce structured jf2 data that can be used for previewing links as well as feed readers and other options. It is also bundled in the Post Kinds and Yarns Microsub plugins as a library. 
23 | 24 | It can be installed as a standalone plugin which will provide the necessary libraries and functionality as well as the REST API endpoint for getting JF2 data from an arbitrary URL or a WordPress Post. 25 | 26 | 27 | == Frequently Asked Questions == 28 | 29 | == Changelog == 30 | 31 | = 1.0.1 ( 2021-04-02 ) = 32 | * Remove SimplePie as a dependency as the latest version 1.5.6 is now bundled with WordPress as of 5.6. 33 | * Remove MB polyfill due issues with PHP8.0 compatibility in favor of simpler solution. 34 | 35 | = 1.0.0 ( 2020-12-15 ) = 36 | * First Official Release. Prior to this point it was in a point release. 37 | -------------------------------------------------------------------------------- /lib/html5/HTML5/Parser/CharacterReference.php: -------------------------------------------------------------------------------- 1 | ifset( $details['videoID'] ), 31 | 'name' => ifset( $details['title'] ), 32 | 'duration' => seconds_to_iso8601( ifset( $details['lengthSeconds'] ) ), 33 | 'category' => ifset( $details['keywords'] ), 34 | 'summary' => ifset( $details['shortDescription'] ), 35 | 'published' => normalize_iso8601( ifset( $microformat['publishDate'] ) ), 36 | ); 37 | $author = array( 38 | 'type' => 'card', 39 | 'url' => ifset( $microformat['ownerProfileUrl'] ), 40 | 'name' => ifset( $details['author'] ), 41 | ); 42 | $jf2['author'] = array_filter( $author ); 43 | 44 | if ( isset( $details['thumbnail'] ) ) { 45 | $thumbnail = end( $details['thumbnail']['thumbnails'] ); 46 | $jf2['featured'] = $thumbnail['url']; 47 | } 48 | if ( isset( $microformat['embed'] ) ) { 49 | $jf2['video'] = ifset( $microformat['embed']['iframeUrl'] ); 50 | } 51 | if ( WP_DEBUG ) { 52 | $jf2['_yt'] = $decode; 53 | } 54 | return array_filter( $jf2 ); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /lib/html5/HTML5/Parser/README.md: -------------------------------------------------------------------------------- 1 | # The 
Parser Model 2 | 3 | The parser model here follows the model in section 4 | [8.2.1](http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#parsing) 5 | of the HTML5 specification, though we do not assume a networking layer. 6 | 7 | [ InputStream ] // Generic support for reading input. 8 | || 9 | [ Scanner ] // Breaks down the stream into characters. 10 | || 11 | [ Tokenizer ] // Groups characters into syntactic units 12 | || 13 | [ Tree Builder ] // Organizes units into a tree of objects 14 | || 15 | [ DOM Document ] // The final state of the parsed document. 16 | 17 | 18 | ## InputStream 19 | 20 | This is an interface with at least two concrete implementations: 21 | 22 | - StringInputStream: Reads an HTML5 string. 23 | - FileInputStream: Reads an HTML5 file. 24 | 25 | ## Scanner 26 | 27 | This is a mechanical piece of the parser. 28 | 29 | ## Tokenizer 30 | 31 | This follows section 8.4 of the HTML5 spec. It is (roughly) a recursive 32 | descent parser. (Though there are plenty of optimizations that are less 33 | than purely functional.) 34 | 35 | ## EventHandler and DOMTree 36 | 37 | EventHandler is the interface for tree builders. Since not all 38 | implementations will necessarily build trees, we've chosen a more 39 | generic name. 40 | 41 | The event handler emits tokens during tokenization. 42 | 43 | The DOMTree is an event handler that builds a DOM tree. The output of 44 | the DOMTree builder is a DOMDocument. 45 | 46 | ## DOMDocument 47 | 48 | PHP has a DOMDocument class built-in (technically, it's part of libxml.) 49 | We use that, thus rendering the output of this process compatible with 50 | SimpleXML, QueryPath, and many other XML/HTML processing tools. 51 | 52 | For cases where the HTML5 is a fragment of an HTML5 document a 53 | DOMDocumentFragment is returned instead. This is another built-in class. 
/**
 * Convert an OPML document into a nested list of outlines.
 *
 * Every top-level <outline> under <body> becomes one entry holding its
 * 'title' attribute and a 'children' list; every <outline> nested directly
 * inside it contributes its 'title' and 'xmlUrl' attributes as 'name' and
 * 'url'.
 *
 * @param string $content Raw OPML markup.
 *
 * @return array List of array( 'title' => ..., 'children' => array( array( 'name' => ..., 'url' => ... ) ) ).
 *               Attribute values are passed through exactly as SimpleXML exposes them (they are not
 *               cast to strings here). An empty array is returned when $content is not a string,
 *               is blank, cannot be parsed as XML, or has no <body>/<outline> elements.
 */
public function convert( $content ) {
	// simplexml_load_string() only accepts strings; anything else (for example an
	// error object handed over by a failed retrieval) cannot be converted.
	if ( ! is_string( $content ) || '' === trim( $content ) ) {
		return array();
	}

	// Collect libxml parse errors internally instead of emitting one PHP warning
	// per error, then restore whatever error mode the caller had configured.
	$previous = libxml_use_internal_errors( true );
	$xml      = simplexml_load_string( $content );
	if ( ! $previous ) {
		// Only discard the buffer when we were the ones who enabled it.
		libxml_clear_errors();
	}
	libxml_use_internal_errors( $previous );

	// simplexml_load_string() returns false for malformed XML; a well-formed
	// document may still lack the <body>/<outline> structure OPML requires.
	if ( false === $xml || ! isset( $xml->body->outline ) ) {
		return array();
	}

	$return = array();
	foreach ( $xml->body->outline as $outline ) {
		$top = array(
			'title'    => $outline['title'],
			'children' => array(),
		);
		// Iterating an element walks its direct child elements.
		foreach ( $outline as $feed ) {
			$top['children'][] = array(
				'name' => $feed['title'],
				'url'  => $feed['xmlUrl'],
			);
		}
		$return[] = $top;
	}
	return $return;
}
'limit_response_size' => 1048576, 18 | 'redirection' => 5, 19 | // Use an explicit user-agent for Parse This 20 | 'user_agent' => 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:57.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36 Parse This/WP', 21 | ); 22 | $url = add_query_arg( 'url', $url, 'https://publish.twitter.com/oembed' ); 23 | $response = wp_safe_remote_get( $url, $args ); 24 | $oembed = json_decode( wp_remote_retrieve_body( $response ), true ); 25 | $jf2 = array(); 26 | if ( array_key_exists( 'url', $oembed ) ) { 27 | $jf2['url'] = $oembed['url']; 28 | } 29 | if ( array_key_exists( 'html', $oembed ) ) { 30 | $html = $oembed['html']; 31 | $dom = pt_load_domdocument( $html ); 32 | $html = explode( '—', $html ); 33 | $html = $html[0]; 34 | $text = wp_strip_all_tags( $html ); 35 | $text = explode( '—', $text ); 36 | $text = $text[0]; 37 | 38 | $links = $dom->getElementsByTagName( 'a' ); 39 | $names = array(); 40 | $category = array(); 41 | foreach ( $links as $link ) { 42 | $key = wp_strip_all_tags( $link->nodeValue ); // phpcs:ignore 43 | $value = $link->getAttribute( 'href' ); 44 | $parse = wp_parse_url( $value ); 45 | unset( $parse['query'] ); 46 | $value = build_url( $parse ); 47 | if ( '#' === $key[0] ) { 48 | $category[] = str_replace( '#', '', $key ); 49 | } elseif ( '@' === $key[0] ) { 50 | $category[] = $value; 51 | } elseif ( $jf2['url'] === $value ) { 52 | $published = new DateTime( $key ); 53 | $jf2['published'] = $published->format( DATE_W3C ); 54 | } else { 55 | $names[ wp_strip_all_tags( $key ) ] = normalize_url( $value ); // phpcs:ignore 56 | } 57 | } 58 | $jf2['links'] = $names; 59 | $jf2['category'] = $category; 60 | $jf2['content'] = array( 61 | 'html' => Parse_This::clean_content( $html, array( 'blockquote' => array() ) ), 62 | 'value' => $text, 63 | ); 64 | $jf2['summary'] = $jf2['content']['html']; 65 | } 66 | $jf2['author'] = array_filter( 67 | array( 68 | 'type' => 'card', 69 | 'name' => ifset( 
/**
 * Convert a decoded JSON Feed document into a jf2 feed structure.
 *
 * Feed-level properties are copied first, then every entry of 'items' is
 * mapped to a jf2 item. Attachments are folded into the item as 'audio',
 * 'photo' or 'video' according to the major part of their MIME type.
 *
 * @param array  $content Decoded JSON Feed document (associative array).
 * @param string $url     URL stored as the feed's 'url' property.
 *
 * @return array jf2 feed. Always contains 'items' (possibly empty),
 *               '_last_published' and '_last_updated'.
 */
public static function to_jf2( $content, $url ) {
	$return          = array_filter(
		array(
			'type'       => 'feed',
			'_feed_type' => 'jsonfeed',
			'name'       => self::ifset( 'title', $content ),
			'url'        => $url,
			'summary'    => self::ifset( 'description', $content ),
			'photo'      => self::ifset( 'icon', $content ),
			'author'     => self::get_author( $content ),
			'language'   => self::ifset( 'language', $content ),
		)
	);
	$return['items'] = array();

	// The document comes from a remote server: tolerate a missing or malformed
	// 'items' member instead of iterating over an undefined index.
	$items = ( isset( $content['items'] ) && is_array( $content['items'] ) ) ? $content['items'] : array();

	foreach ( $items as $item ) {
		// Every lookup below expects an associative array.
		if ( ! is_array( $item ) ) {
			continue;
		}

		$newitem = array_filter(
			array(
				'uid'         => self::ifset( 'id', $item ),
				'url'         => self::ifset( 'url', $item ),
				'in-reply-to' => self::ifset( 'external_url', $item ),
				'name'        => self::ifset( 'title', $item ),
				'content'     => array_filter(
					array(
						'html' => Parse_This::clean_content( self::ifset( 'content_html', $item ) ),
						'text' => self::ifset( 'content_text', $item ),
					)
				),
				'summary'     => self::ifset( 'summary', $item ),
				'featured'    => self::ifset( 'image', $item ),
				'published'   => normalize_iso8601( self::ifset( 'date_published', $item ) ),
				'updated'     => normalize_iso8601( self::ifset( 'date_modified', $item ) ),
				'author'      => self::get_author( $item ),
				'category'    => self::ifset( 'tags', $item ),
				'language'    => self::ifset( 'language', $item ),
			)
		);

		if ( isset( $item['attachments'] ) && is_array( $item['attachments'] ) ) {
			foreach ( $item['attachments'] as $attachment ) {
				// An attachment without a MIME type or URL cannot be classified or linked.
				if ( ! isset( $attachment['mime_type'], $attachment['url'] ) || ! is_string( $attachment['mime_type'] ) ) {
					continue;
				}

				// Major MIME type, e.g. "audio" from "audio/mpeg".
				$parts = explode( '/', $attachment['mime_type'] );
				$type  = $parts[0];

				if ( 'image' === $type ) {
					$newitem['photo'] = $attachment['url'];
				} elseif ( 'audio' === $type || 'video' === $type ) {
					// Audio and video are stored under a property named after their type
					// and may both carry a duration.
					$newitem[ $type ] = $attachment['url'];
					if ( isset( $attachment['duration_in_seconds'] ) ) {
						$newitem['duration'] = seconds_to_iso8601( $attachment['duration_in_seconds'] );
					}
				}
			}
		}

		$return['items'][] = $newitem;
	}

	$return['_last_published'] = self::find_last_published( $return['items'] );
	$return['_last_updated']   = self::find_last_updated( $return['items'] );
	return $return;
}
/**
 * Insert an <li> element, implicitly closing an open <li>.
 *
 * @param \DOMElement $ele     The element being inserted.
 * @param \DOMElement $current The current element.
 *
 * @return \DOMElement The inserted element.
 */
protected function handleLI($ele, $current)
{
    $closes = array('li');

    return $this->closeIfCurrentMatches($ele, $current, $closes);
}

/**
 * Insert a <dt> or <dd> element, implicitly closing an open <dt> or <dd>.
 *
 * @param \DOMElement $ele     The element being inserted.
 * @param \DOMElement $current The current element.
 *
 * @return \DOMElement The inserted element.
 */
protected function handleDT($ele, $current)
{
    $closes = array('dt', 'dd');

    return $this->closeIfCurrentMatches($ele, $current, $closes);
}

/**
 * Insert an <rt> or <rp> element, implicitly closing an open <rt> or <rp>.
 *
 * @param \DOMElement $ele     The element being inserted.
 * @param \DOMElement $current The current element.
 *
 * @return \DOMElement The inserted element.
 */
protected function handleRT($ele, $current)
{
    $closes = array('rt', 'rp');

    return $this->closeIfCurrentMatches($ele, $current, $closes);
}

/**
 * Attach $ele to the tree, treating $current as closed when its tag matches.
 *
 * When the current element's tag name is one of $match, the new element is
 * appended to the current element's parent (making it a sibling); otherwise
 * it is appended to the current element itself.
 *
 * @param \DOMElement $ele     The element being inserted.
 * @param \DOMElement $current The current element.
 * @param string[]    $match   Tag names that the new element implicitly closes.
 *
 * @return \DOMElement The inserted element.
 */
protected function closeIfCurrentMatches($ele, $current, $match)
{
    $isClosed = in_array($current->tagName, $match, true);
    $target = $isClosed ? $current->parentNode : $current;
    $target->appendChild($ele);

    return $ele;
}
-------------------------------------------------------------------------------- 1 | ). 65 | * 66 | * @return int one of the Tokenizer::TEXTMODE_* constants 67 | */ 68 | public function startTag($name, $attributes = array(), $selfClosing = false); 69 | 70 | /** 71 | * An end-tag. 72 | */ 73 | public function endTag($name); 74 | 75 | /** 76 | * A comment section (unparsed character data). 77 | */ 78 | public function comment($cdata); 79 | 80 | /** 81 | * A unit of parsed character data. 82 | * 83 | * Entities in this text are *already decoded*. 84 | */ 85 | public function text($cdata); 86 | 87 | /** 88 | * Indicates that the document has been entirely processed. 89 | */ 90 | public function eof(); 91 | 92 | /** 93 | * Emitted when the parser encounters an error condition. 94 | */ 95 | public function parseError($msg, $line, $col); 96 | 97 | /** 98 | * A CDATA section. 99 | * 100 | * @param string $data 101 | * The unparsed character data 102 | */ 103 | public function cdata($data); 104 | 105 | /** 106 | * This is a holdover from the XML spec. 107 | * 108 | * While user agents don't get PIs, server-side does. 109 | * 110 | * @param string $name The name of the processor (e.g. 'php'). 111 | * @param string $data The unparsed data. 112 | */ 113 | public function processingInstruction($name, $data = null); 114 | } 115 | -------------------------------------------------------------------------------- /lib/html5/HTML5/Serializer/Traverser.php: -------------------------------------------------------------------------------- 1 | 'html', 21 | 'http://www.w3.org/1998/Math/MathML' => 'math', 22 | 'http://www.w3.org/2000/svg' => 'svg', 23 | ); 24 | 25 | protected $dom; 26 | 27 | protected $options; 28 | 29 | protected $encode = false; 30 | 31 | protected $rules; 32 | 33 | protected $out; 34 | 35 | /** 36 | * Create a traverser. 37 | * 38 | * @param \DOMNode|\DOMNodeList $dom The document or node to traverse. 39 | * @param resource $out A stream that allows writing. 
/**
 * Create a traverser.
 *
 * The arguments are stored as given and the rules object is handed a
 * reference back to this traverser.
 *
 * @param \DOMNode|\DOMNodeList $dom     The document or node to traverse.
 * @param resource              $out     A stream that allows writing. The traverser will output into this
 *                                       stream.
 * @param RulesInterface        $rules   The rules object that serializes each node.
 * @param array                 $options An array of options for the traverser as key/value pairs. These include:
 *                                       - encode_entities: A bool to specify if full encoding should happen for all named
 *                                       character references. Defaults to false which escapes &'<>".
 *                                       - output_rules: The path to the class handling the output rules.
 */
public function __construct($dom, $out, RulesInterface $rules, $options = array())
{
    $this->options = $options;
    $this->rules = $rules;
    $this->out = $out;
    $this->dom = $dom;

    // Done last, once this object is fully populated.
    $this->rules->setTraverser($this);
}

/**
 * Tell the traverser to walk the DOM.
 *
 * @return resource $out Returns the output stream.
 */
public function walk()
{
    $root = $this->dom;

    // A complete document is handed to the rules object as a whole.
    if ($root instanceof \DOMDocument) {
        $this->rules->document($root);

        return $this->out;
    }

    // Document fragments are a special case. Only the children need to
    // be serialized.
    if ($root instanceof \DOMDocumentFragment) {
        if ($root->hasChildNodes()) {
            $this->children($root->childNodes);
        }

        return $this->out;
    }

    // A node list is looped over. If this is a NodeList of DOMDocuments
    // this will not work.
    if ($root instanceof \DOMNodeList) {
        $this->children($root);

        return $this->out;
    }

    // Anything else is assumed to be a DOMNode-like data structure.
    $this->node($root);

    return $this->out;
}
/**
 * Process a node in the DOM.
 *
 * The node is forwarded to the rules method that matches its type. Node
 * types without a matching method are ignored (currently we don't support
 * embedding DTDs).
 *
 * @param mixed $node A node implementing \DOMNode.
 */
public function node($node)
{
    // A listing of types is at http://php.net/manual/en/dom.constants.php
    $handlers = array(
        XML_ELEMENT_NODE => 'element',
        XML_TEXT_NODE => 'text',
        XML_CDATA_SECTION_NODE => 'cdata',
        XML_PI_NODE => 'processorInstruction',
        XML_COMMENT_NODE => 'comment',
    );

    $type = $node->nodeType;
    if (isset($handlers[$type])) {
        $method = $handlers[$type];
        $this->rules->$method($node);
    }
}

/**
 * Walk through all the nodes on a node list.
 *
 * @param \DOMNodeList $nl A list of child elements to walk through.
 */
public function children($nl)
{
    foreach ($nl as $child) {
        $this->node($child);
    }
}

/**
 * Is an element local?
 *
 * An element is local when it has a non-empty namespace URI that is a key
 * of the static $local_ns table.
 *
 * @param mixed $ele An element that implements \DOMNode.
 *
 * @return bool true if local and false otherwise.
 */
public function isLocalElement($ele)
{
    $namespace = $ele->namespaceURI;

    return !empty($namespace) && isset(static::$local_ns[$namespace]);
}
composer.json, improve phpdocs and remove dead code 56 | - #143: Remove experimental comment 57 | 58 | 2.3.1 (2018-10-18) 59 | 60 | - #121: Audio is not a block tag (fixed by #141) 61 | - #136: Handle illegal self-closing according to spec (fixed by #137) 62 | - #141: Minor fixes in the README 63 | 64 | 2.3.0 (2017-09-04) 65 | 66 | - #129: image within inline svg breaks system (fixed by #133) 67 | - #131: ² does not work (fixed by #132) 68 | - #134: Improve tokenizer performance by 20% (alternative version of #130 thanks to @MichaelHeerklotz) 69 | - #135: Raw & in attributes 70 | 71 | 2.2.2 (2016-09-22) 72 | 73 | - #116: In XML mode, tags are case sensitive 74 | - #115: Fix PHP Notice in OutputRules 75 | - #112: fix parsing of options of an optgroup 76 | - #111: Adding test for the address tag 77 | 78 | 2.2.1 (2016-05-10) 79 | 80 | - #109: Fixed issue where address tag could be written without closing tag (thanks sylus) 81 | 82 | 2.2.0 (2016-04-11) 83 | 84 | - #105: Enable composer cache (for CI/CD) 85 | - #100: Use mb_substitute_character instead of ini_set for environments where ini_set is disabled (e.g., shared hosting) 86 | - #98: Allow link, meta, style tags in noscript tags 87 | - #96: Fixed xml:href on svgs that use the "use" breaking 88 | - #94: Counting UTF8 characters performance improvement 89 | - #93: Use newer version of coveralls package 90 | - #90: Remove duplicate test 91 | - #87: Allow multiple root nodes 92 | 93 | 2.1.2 (2015-06-07) 94 | - #82: Support for PHP7 95 | - #84: Improved boolean attribute handling 96 | 97 | 2.1.1 (2015-03-23) 98 | - #78: Fixes bug where unmatched entity-like string drops everything after &. 
99 | 100 | 2.1.0 (2015-02-01) 101 | - #74: Added `disable_html_ns` and `target_doc` dom parsing options 102 | - Unified option names 103 | - #73: Fixed alphabet, ß now can be detected 104 | - #75 and #76: Allow whitespace in RCDATA tags 105 | - #77: Fixed parsing blunder for json embeds 106 | - #72: Add options to HTML methods 107 | 108 | 2.0.2 (2014-12-17) 109 | - #50: empty document handling 110 | - #63: tags with strange capitalization 111 | - #65: dashes and underscores as allowed characters in tag names 112 | - #68: Fixed issue with non-inline elements inside inline containers 113 | 114 | 2.0.1 (2014-09-23) 115 | - #59: Fixed issue parsing some fragments. 116 | - #56: Incorrectly saw 0 as empty string 117 | - Sami as new documentation generator 118 | 119 | 2.0.0 (2014-07-28) 120 | - #53: Improved boolean attributes handling 121 | - #52: Facebook HHVM compatibility 122 | - #48: Adopted PSR-2 as coding standard 123 | - #47: Moved everything to Masterminds namespace 124 | - #45: Added custom namespaces 125 | - #44: Added support for XML-style namespaces 126 | - #37: Refactored HTML5 class removing static methods 127 | 128 | 1.0.5 (2014-06-10) 129 | - #38: Set the dev-master branch as the 1.0.x branch for composer (goetas) 130 | - #34: Tests use PSR-4 for autoloading. (goetas) 131 | - #40, #41: Fix entity handling in RCDATA sections. (KitaitiMakoto) 132 | - #32: Fixed issue where character references were being incorrectly encoded in style tags. 133 | 134 | 1.0.4 (2014-04-29) 135 | - #30/#31 Don't throw an exception for invalid tag names. 136 | 137 | 1.0.3 (2014-02-28) 138 | - #23 and #29: Ignore attributes with illegal chars in name for the PHP DOM. 139 | 140 | 1.0.2 (2014-02-12) 141 | - #23: Handle missing tag close in attribute list. 142 | - #25: Fixed text escaping in the serializer (HTML5 8.3). 143 | - #27: Fixed tests on Windows: changed "\n" -> PHP_EOL. 144 | - #28: Fixed infinite loop for char "&" in unquoted attribute in parser. 
145 | - #26: Updated tag name case handling to deal with uppercase usage. 146 | - #24: Newlines and tabs are allowed inside quoted attributes (HTML5 8.2.4). 147 | - Fixed Travis CI testing. 148 | 149 | 1.0.1 (2013-11-07) 150 | - CDATA encoding is improved. (Non-standard; Issue #19) 151 | - Some parser rules were not returning the new current element. (Issue #20) 152 | - Added, to the README, details on code test coverage and to packagist version. 153 | - Fixed processor instructions. 154 | - Improved test coverage and documentation coverage. 155 | 156 | 1.0.0 (2013-10-02) 157 | - Initial release. 158 | -------------------------------------------------------------------------------- /includes/class-rest-parse-this.php: -------------------------------------------------------------------------------- 1 | 34 |
/**
 * REST callback: parse the requested URL and return the result.
 *
 * When the 'discovery' parameter is set, the URL is passed to
 * Parse_This_Discovery and its result is returned directly. Otherwise the
 * URL is fetched and parsed with Parse_This, using the 'return', 'follow',
 * 'references' and 'location' parameters as parse options.
 *
 * @param WP_REST_Request $request Request carrying the 'url' parameter and the optional flags above.
 *
 * @return mixed The discovery result, the WP_Error produced by the fetch, the
 *               'mf2' view of the parse when the 'mf2' parameter is set, or the
 *               default parse result.
 */
public static function read( $request ) {
	$url = $request->get_param( 'url' );

	// Discovery mode bypasses fetching and parsing of the page itself.
	if ( $request->get_param( 'discovery' ) ) {
		$discoverer = new Parse_This_Discovery();
		return $discoverer->fetch( $url );
	}

	$parser  = new Parse_This( $url );
	$fetched = $parser->fetch();
	if ( is_wp_error( $fetched ) ) {
		return $fetched;
	}

	$parser->parse(
		array(
			'return'     => $request->get_param( 'return' ),
			'follow'     => $request->get_param( 'follow' ),
			'references' => $request->get_param( 'references' ),
			'location'   => $request->get_param( 'location' ),
		)
	);

	return $request->get_param( 'mf2' ) ? $parser->get( 'mf2' ) : $parser->get();
}

/**
 * REST validation callback for the 'url' argument.
 *
 * Passes the value straight to wp_http_validate_url() and returns its result
 * unchanged. NOTE(review): per the WordPress reference that is the URL on
 * success and false on failure rather than a strict boolean; confirm before
 * relying on the exact type.
 *
 * @param string $url     URL to validate.
 * @param mixed  $request Unused; accepted for the REST validate_callback signature.
 * @param mixed  $key     Unused; accepted for the REST validate_callback signature.
 *
 * @return mixed Result of wp_http_validate_url().
 */
public static function is_valid_url( $url, $request = null, $key = null ) {
	$validated = wp_http_validate_url( $url );
	return $validated;
}
/**
 * Extract location data from the shared data of a locations page.
 *
 * @param array  $data Decoded shared data containing entry_data.LocationsPage.
 * @param string $url  Page URL, passed through to json_location().
 *
 * @return array Result of json_location(), or an empty array when the page
 *               carries no graphql location record.
 */
private static function html_location( $data, $url ) {
	$pages = $data['entry_data']['LocationsPage'];
	if ( ! isset( $pages[0]['graphql']['location'] ) ) {
		return array();
	}

	return self::json_location( $pages[0]['graphql']['location'], $url );
}

/**
 * Map a location record to location properties.
 *
 * The optional 'address_json' member is a JSON string; it is decoded and
 * supplies the street, postal code, region and country values.
 *
 * @param array  $data Location record.
 * @param string $url  Unused here.
 *
 * @return array Location properties with empty values removed.
 */
private static function json_location( $data, $url ) {
	if ( isset( $data['address_json'] ) ) {
		$address = json_decode( $data['address_json'], true );
	} else {
		$address = array();
	}

	return array_filter(
		array(
			'address'        => $address,
			'name'           => ifset( $data['name'] ),
			'latitude'       => ifset( $data['lat'] ),
			'longitude'      => ifset( $data['lng'] ),
			'url'            => ifset( $data['website'] ),
			'street_address' => ifset( $address['street_address'] ),
			'postal_code'    => ifset( $address['zip_code'] ),
			'region'         => ifset( $address['region_name'] ),
			'country'        => ifset( $address['country_code'] ),
		)
	);
}

/**
 * Build feed data by delegating to profile().
 *
 * @param array  $data Data handed to profile().
 * @param string $url  Unused here.
 *
 * @return mixed Whatever profile() returns.
 */
private static function feed( $data, $url ) {
	$profile = self::profile( $data );
	return $profile;
}

/**
 * Extract the media record from the shared data of a post page.
 *
 * Three known locations are tried in order; when none of them is present the
 * shared data is passed to json_photo() unchanged.
 *
 * @param array  $data Decoded shared data containing entry_data.PostPage.
 * @param string $url  Page URL, passed through to json_photo().
 *
 * @return array Result of json_photo().
 */
private static function html_photo( $data, $url ) {
	$pages = $data['entry_data']['PostPage'];
	$media = $data;

	if ( isset( $pages[0]['graphql']['shortcode_media'] ) ) {
		$media = $pages[0]['graphql']['shortcode_media'];
	} elseif ( isset( $pages[0]['graphql']['media'] ) ) {
		$media = $pages[0]['graphql']['media'];
	} elseif ( isset( $pages[0]['media'] ) ) {
		$media = $pages[0]['media'];
	}

	return self::json_photo( $media, $url );
}
if ( array_key_exists( 'edge_sidecar_to_children', $data ) ) { 118 | $entry['photo'] = array(); 119 | foreach ( $data['edge_sidecar_to_children']['edges'] as $edge ) { 120 | $entry['photo'][] = $edge['node']['display_url']; 121 | } 122 | } else { 123 | // Single photo or video 124 | if ( array_key_exists( 'display_src', $data ) ) { 125 | $entry['photo'] = array( $data['display_src'] ); 126 | } elseif ( array_key_exists( 'display_url', $data ) ) { 127 | $entry['photo'] = array( $data['display_url'] ); 128 | } 129 | 130 | if ( isset( $data['is_video'] ) && $data['is_video'] && isset( $data['video_url'] ) ) { 131 | $entry['video'] = array( $data['video_url'] ); 132 | } 133 | } 134 | 135 | // Published date: the capture timestamp when present, else the 'date' string, else the current time. 136 | $published = new DateTime(); 137 | if ( isset( $data['taken_at_timestamp'] ) ) { 138 | $published->setTimestamp( $data['taken_at_timestamp'] ); 139 | } elseif ( isset( $data['date'] ) ) { 140 | $published = new DateTime( $data['date'] ); 141 | } 142 | $entry['published'] = $published->format( DATE_W3C ); 143 | if ( isset( $data['location'] ) ) { // Every location field is optional; missing ones are dropped by array_filter() below. 144 | $entry['location'] = array(); 145 | if ( isset( $data['location']['address_json'] ) ) { 146 | $address = json_decode( $data['location']['address_json'], true ); // May be null or lack keys, so guard each field with ifset() exactly as json_location() does. 147 | $entry['location'] = array( 148 | 'street_address' => ifset( $address['street_address'] ), 149 | 'postal_code' => ifset( $address['zip_code'] ), 150 | 'region' => ifset( $address['region_name'] ), 151 | 'country' => ifset( $address['country_code'] ), 152 | ); 153 | } 154 | $entry['location']['name'] = ifset( $data['location']['name'] ); 155 | $entry['location']['url'] = isset( $data['location']['id'] ) ? sprintf( 'https://www.instagram.com/explore/locations/%1$s', $data['location']['id'] ) : false; // Only build the URL when an id exists. 156 | $entry['location'] = array_filter( $entry['location'] ); 157 | } 158 | if ( isset( $data['owner'] ) ) { 159 | $entry['author'] = array( 160 | 'type' => 'card', 161 | 'name' => ifset( $data['owner']['full_name'] ), 162 | 'nickname' => ifset( $data['owner']['username'] ), 163 | 'url' => sprintf( 'https://www.instagram.com/%1$s/', 
$data['owner']['username'] ), 164 | 'photo' => ifset( $data['owner']['profile_pic_url'] ), 165 | ); 166 | } 167 | return $entry; 168 | } 169 | 170 | private static function profile( $data ) { 171 | if ( isset( $data['entry_data']['ProfilePage'][0] ) ) { 172 | $profile = $data['entry_data']['ProfilePage'][0]; 173 | if ( $profile && isset( $profile['graphql']['user'] ) ) { 174 | $user = $profile['graphql']['user']; 175 | return $user; 176 | } 177 | } 178 | return array(); 179 | } 180 | 181 | 182 | 183 | } 184 | -------------------------------------------------------------------------------- /lib/mf2/LICENSE.md: -------------------------------------------------------------------------------- 1 | # Creative Commons Legal Code 2 | 3 | ## CC0 1.0 Universal 4 | 5 | http://creativecommons.org/publicdomain/zero/1.0 6 | 7 | Official translations of this legal tool are available> CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED HEREUNDER. 8 | 9 | ### _Statement of Purpose_ 10 | 11 | The laws of most jurisdictions throughout the world automatically confer exclusive Copyright and Related Rights (defined below) upon the creator and subsequent owner(s) (each and all, an "owner") of an original work of authorship and/or a database (each, a "Work"). 
12 | 13 | Certain owners wish to permanently relinquish those rights to a Work for the purpose of contributing to a commons of creative, cultural and scientific works ("Commons") that the public can reliably and without fear of later claims of infringement build upon, modify, incorporate in other works, reuse and redistribute as freely as possible in any form whatsoever and for any purposes, including without limitation commercial purposes. These owners may contribute to the Commons to promote the ideal of a free culture and the further production of creative, cultural and scientific works, or to gain reputation or greater distribution for their Work in part through the use and efforts of others. 14 | 15 | For these and/or other purposes and motivations, and without any expectation of additional consideration or compensation, the person associating CC0 with a Work (the "Affirmer"), to the extent that he or she is an owner of Copyright and Related Rights in the Work, voluntarily elects to apply CC0 to the Work and publicly distribute the Work under its terms, with knowledge of his or her Copyright and Related Rights in the Work and the meaning and intended legal effect of CC0 on those rights. 16 | 17 | **1. Copyright and Related Rights.** A Work made available under CC0 may be protected by copyright and related or neighboring rights ("Copyright and Related Rights"). Copyright and Related Rights include, but are not limited to, the following: 18 | 19 | 1. the right to reproduce, adapt, distribute, perform, display, communicate, and translate a Work; 20 | 2. moral rights retained by the original author(s) and/or performer(s); 21 | 3. publicity and privacy rights pertaining to a person's image or likeness depicted in a Work; 22 | 4. rights protecting against unfair competition in regards to a Work, subject to the limitations in paragraph 4(a), below; 23 | 5. rights protecting the extraction, dissemination, use and reuse of data in a Work; 24 | 6. 
database rights (such as those arising under Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, and under any national implementation thereof, including any amended or successor version of such directive); and 25 | 7. other similar, equivalent or corresponding rights throughout the world based on applicable law or treaty, and any national implementations thereof. 26 | 27 | **2. Waiver.** To the greatest extent permitted by, but not in contravention of, applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and unconditionally waives, abandons, and surrenders all of Affirmer's Copyright and Related Rights and associated claims and causes of action, whether now known or unknown (including existing as well as future claims and causes of action), in the Work (i) in all territories worldwide, (ii) for the maximum duration provided by applicable law or treaty (including future time extensions), (iii) in any current or future medium and for any number of copies, and (iv) for any purpose whatsoever, including without limitation commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each member of the public at large and to the detriment of Affirmer's heirs and successors, fully intending that such Waiver shall not be subject to revocation, rescission, cancellation, termination, or any other legal or equitable action to disrupt the quiet enjoyment of the Work by the public as contemplated by Affirmer's express Statement of Purpose. 28 | 29 | **3. Public License Fallback.** Should any part of the Waiver for any reason be judged legally invalid or ineffective under applicable law, then the Waiver shall be preserved to the maximum extent permitted taking into account Affirmer's express Statement of Purpose. 
In addition, to the extent the Waiver is so judged Affirmer hereby grants to each affected person a royalty-free, non transferable, non sublicensable, non exclusive, irrevocable and unconditional license to exercise Affirmer's Copyright and Related Rights in the Work (i) in all territories worldwide, (ii) for the maximum duration provided by applicable law or treaty (including future time extensions), (iii) in any current or future medium and for any number of copies, and (iv) for any purpose whatsoever, including without limitation commercial, advertising or promotional purposes (the "License"). The License shall be deemed effective as of the date CC0 was applied by Affirmer to the Work. Should any part of the License for any reason be judged legally invalid or ineffective under applicable law, such partial invalidity or ineffectiveness shall not invalidate the remainder of the License, and in such case Affirmer hereby affirms that he or she will not (i) exercise any of his or her remaining Copyright and Related Rights in the Work or (ii) assert any associated claims and causes of action with respect to the Work, in either case contrary to Affirmer's express Statement of Purpose. 30 | 31 | **4. Limitations and Disclaimers.** 32 | 33 | 1. No trademark or patent rights held by Affirmer are waived, abandoned, surrendered, licensed or otherwise affected by this document. 34 | 2. Affirmer offers the Work as-is and makes no representations or warranties of any kind concerning the Work, express, implied, statutory or otherwise, including without limitation warranties of title, merchantability, fitness for a particular purpose, non infringement, or the absence of latent or other defects, accuracy, or the present or absence of errors, whether or not discoverable, all to the greatest extent permissible under applicable law. 35 | 3. 
Affirmer disclaims responsibility for clearing rights of other persons that may apply to the Work or any use thereof, including without limitation any person's Copyright and Related Rights in the Work. Further, Affirmer disclaims responsibility for obtaining any necessary consents, permissions or other rights required for any use of the Work. 36 | 4. Affirmer understands and acknowledges that Creative Commons is not a party to this document and has no duty or obligation with respect to this CC0 or use of the Work. 37 | -------------------------------------------------------------------------------- /includes/compat-functions.php: -------------------------------------------------------------------------------- 1 | getTimestamp(); 36 | } 37 | } 38 | 39 | 40 | if ( ! function_exists( 'get_post_datetime' ) ) { 41 | /** 42 | * Retrieve post published or modified time as a `DateTime` object instance. 43 | * 44 | * The object will be set to the timezone from WordPress settings. 45 | * 46 | * @since 5.3.0 - backported to Parse This 47 | * 48 | * @param int|WP_Post $post Optional. WP_Post object or ID. Default is global `$post` object. 49 | * @param string $field Optional. Post field to use. Accepts 'date' or 'modified'. 50 | * @return DateTime|false Time object on success, false on failure. 51 | */ 52 | function get_post_datetime( $post = null, $field = 'date' ) { 53 | $post = get_post( $post ); 54 | if ( ! $post ) { 55 | return false; 56 | } 57 | $time = ( 'modified' === $field ) ? $post->post_modified : $post->post_date; 58 | if ( empty( $time ) || '0000-00-00 00:00:00' === $time ) { 59 | return false; 60 | } 61 | return date_create_immutable_from_format( 'Y-m-d H:i:s', $time, wp_timezone() ); 62 | } 63 | } 64 | 65 | if ( ! function_exists( 'wp_timezone_string' ) ) { 66 | /** 67 | * Retrieves the timezone from site settings as a string. 68 | * 69 | * Uses the `timezone_string` option to get a proper timezone if available, 70 | * otherwise falls back to an offset. 
71 | * 72 | * @since 5.3.0 - backported into Parse This 73 | * 74 | * @return string PHP timezone string or a ±HH:MM offset. 75 | */ 76 | function wp_timezone_string() { 77 | $timezone_string = get_option( 'timezone_string' ); 78 | if ( $timezone_string ) { 79 | return $timezone_string; 80 | } 81 | $offset = (float) get_option( 'gmt_offset' ); 82 | $hours = (int) $offset; 83 | $minutes = ( $offset - $hours ); 84 | $sign = ( $offset < 0 ) ? '-' : '+'; 85 | $abs_hour = abs( $hours ); 86 | $abs_mins = abs( $minutes * 60 ); 87 | $tz_offset = sprintf( '%s%02d:%02d', $sign, $abs_hour, $abs_mins ); 88 | return $tz_offset; 89 | } 90 | } 91 | 92 | if ( ! function_exists( 'wp_timezone' ) ) { 93 | /** 94 | * Retrieves the timezone from site settings as a `DateTimeZone` object. 95 | * 96 | * Timezone can be based on a PHP timezone string or a ±HH:MM offset. 97 | * 98 | * @since 5.3.0 - backported into Parse This 99 | * 100 | * @return DateTimeZone Timezone object. 101 | */ 102 | function wp_timezone() { 103 | return new DateTimeZone( wp_timezone_string() ); 104 | } 105 | } 106 | 107 | 108 | if ( ! function_exists( 'wp_date' ) ) { 109 | /** 110 | * Retrieves the date, in localized format. 111 | * 112 | * This is a newer function, intended to replace `date_i18n()` without legacy quirks in it. 113 | * 114 | * Note that, unlike `date_i18n()`, this function accepts a true Unix timestamp, not summed 115 | * with timezone offset. 116 | * 117 | * @since 5.3.0 - backported to Parse This 118 | * 119 | * @param string $format PHP date format. 120 | * @param int $timestamp Optional. Unix timestamp. Defaults to current time. 121 | * @param DateTimeZone $timezone Optional. Timezone to output result in. Defaults to timezone 122 | * from site settings. 123 | * @return string|false The date, translated if locale specifies it. False on invalid timestamp input. 
124 | */ 125 | function wp_date( $format, $timestamp = null, $timezone = null ) { 126 | global $wp_locale; 127 | if ( null === $timestamp ) { 128 | $timestamp = time(); 129 | } elseif ( ! is_numeric( $timestamp ) ) { 130 | return false; 131 | } 132 | if ( ! $timezone ) { 133 | $timezone = wp_timezone(); 134 | } 135 | $datetime = date_create( '@' . $timestamp ); 136 | $datetime->setTimezone( $timezone ); 137 | if ( empty( $wp_locale->month ) || empty( $wp_locale->weekday ) ) { 138 | $date = $datetime->format( $format ); 139 | } else { 140 | // We need to unpack shorthand `r` format because it has parts that might be localized. 141 | $format = preg_replace( '/(?get_month( $datetime->format( 'm' ) ); 145 | $weekday = $wp_locale->get_weekday( $datetime->format( 'w' ) ); 146 | for ( $i = 0; $i < $format_length; $i ++ ) { 147 | switch ( $format[ $i ] ) { 148 | case 'D': 149 | $new_format .= backslashit( $wp_locale->get_weekday_abbrev( $weekday ) ); 150 | break; 151 | case 'F': 152 | $new_format .= backslashit( $month ); 153 | break; 154 | case 'l': 155 | $new_format .= backslashit( $weekday ); 156 | break; 157 | case 'M': 158 | $new_format .= backslashit( $wp_locale->get_month_abbrev( $month ) ); 159 | break; 160 | case 'a': 161 | $new_format .= backslashit( $wp_locale->get_meridiem( $datetime->format( 'a' ) ) ); 162 | break; 163 | case 'A': 164 | $new_format .= backslashit( $wp_locale->get_meridiem( $datetime->format( 'A' ) ) ); 165 | break; 166 | case '\\': 167 | $new_format .= $format[ $i ]; 168 | // If character follows a slash, we add it without translating. 169 | if ( $i < $format_length ) { 170 | $new_format .= $format[ ++$i ]; 171 | } 172 | break; 173 | default: 174 | $new_format .= $format[ $i ]; 175 | break; 176 | } 177 | } 178 | $date = $datetime->format( $new_format ); 179 | $date = wp_maybe_decline_date( $date ); 180 | } 181 | /** 182 | * Filters the date formatted based on the locale. 
183 | * 184 | * @since 5.3.0 but backported to Parse This 185 | * 186 | * @param string $date Formatted date string. 187 | * @param string $format Format to display the date. 188 | * @param int $timestamp Unix timestamp. 189 | * @param DateTimeZone $timezone Timezone. 190 | */ 191 | $date = apply_filters( 'wp_date', $date, $format, $timestamp, $timezone ); 192 | return $date; 193 | } 194 | } 195 | 196 | if ( ! function_exists( 'str_contains' ) ) { // Polyfill for PHP < 8.0; mirrors the native str_contains( string $haystack, string $needle ): bool. 197 | function str_contains( $haystack, $needle ) { 198 | return '' === $needle || false !== strpos( $haystack, $needle ); // Byte-wise like the native function; an empty needle always matches. 199 | } 200 | } 201 | -------------------------------------------------------------------------------- /lib/html5/HTML5/Parser/UTF8Utils.php: -------------------------------------------------------------------------------- 1 | 9 | 10 | Permission is hereby granted, free of charge, to any person obtaining a 11 | copy of this software and associated documentation files (the 12 | "Software"), to deal in the Software without restriction, including 13 | without limitation the rights to use, copy, modify, merge, publish, 14 | distribute, sublicense, and/or sell copies of the Software, and to 15 | permit persons to whom the Software is furnished to do so, subject to 16 | the following conditions: 17 | 18 | The above copyright notice and this permission notice shall be included 19 | in all copies or substantial portions of the Software. 20 | 21 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 22 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 23 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 24 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 25 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 26 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 27 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
28 | */ 29 | 30 | use Masterminds\HTML5\Exception; 31 | 32 | class UTF8Utils 33 | { 34 | /** 35 | * The Unicode replacement character. 36 | */ 37 | const FFFD = "\xEF\xBF\xBD"; 38 | 39 | /** 40 | * Count the number of characters in a string. 41 | * UTF-8 aware. This will try (in order) mbstring, iconv, utf8_decode, and finally a custom byte-frequency counter. 42 | * 43 | * @param string $string 44 | * 45 | * @return int 46 | */ 47 | public static function countChars($string) 48 | { 49 | // Get the length for the string we need. 50 | if (function_exists('mb_strlen')) { 51 | return mb_strlen($string, 'utf-8'); 52 | } 53 | 54 | if (function_exists('iconv_strlen')) { 55 | return iconv_strlen($string, 'utf-8'); 56 | } 57 | 58 | if (function_exists('utf8_decode')) { 59 | // MPB: Will this work? Won't certain decodes lead to two chars 60 | // extrapolated out of 2-byte chars? 61 | return strlen(utf8_decode($string)); 62 | } 63 | 64 | $count = count_chars($string); 65 | 66 | // Sum the bytes in 0x00-0x7F and in 0xC2-0xF4. 0x80 = 0x7F - 0 + 1 (one added to get inclusive range) 67 | // 0x33 = 0xF4 - 0xC2 + 1 (one added to get inclusive range) 68 | return array_sum(array_slice($count, 0, 0x80)) + array_sum(array_slice($count, 0xC2, 0x33)); 69 | } 70 | 71 | /** 72 | * Convert data from the given encoding to UTF-8. 73 | * 74 | * This has not yet been tested with character sets other than UTF-8. 75 | * It should work with ISO-8859-1/-13 and standard Latin Win charsets. 76 | * 77 | * @param string $data The data to convert 78 | * @param string $encoding A valid encoding. 
Examples: http://www.php.net/manual/en/mbstring.supported-encodings.php 79 | * 80 | * @return string 81 | */ 82 | public static function convertToUTF8($data, $encoding = 'UTF-8') 83 | { 84 | /* 85 | * From the HTML5 spec: Given an encoding, the bytes in the input stream must be converted 86 | * to Unicode characters for the tokeniser, as described by the rules for that encoding, 87 | * except that the leading U+FEFF BYTE ORDER MARK character, if any, must not be stripped 88 | * by the encoding layer (it is stripped by the rule below). Bytes or sequences of bytes 89 | * in the original byte stream that could not be converted to Unicode characters must be 90 | * converted to U+FFFD REPLACEMENT CHARACTER code points. 91 | */ 92 | 93 | // mb_convert_encoding is chosen over iconv because of a bug. The best 94 | // details for the bug are on http://us1.php.net/manual/en/function.iconv.php#108643 95 | // which contains links to the actual but reports as well as work around 96 | // details. 97 | if (function_exists('mb_convert_encoding')) { 98 | // mb library has the following behaviors: 99 | // - UTF-16 surrogates result in false. 100 | // - Overlongs and outside Plane 16 result in empty strings. 101 | 102 | // Before we run mb_convert_encoding we need to tell it what to do with 103 | // characters it does not know. This could be different than the parent 104 | // application executing this library so we store the value, change it 105 | // to our needs, and then change it back when we are done. This feels 106 | // a little excessive and it would be great if there was a better way. 107 | $save = mb_substitute_character(); 108 | mb_substitute_character('none'); 109 | $data = mb_convert_encoding($data, 'UTF-8', $encoding); 110 | mb_substitute_character($save); 111 | } 112 | // @todo Get iconv running in at least some environments if that is possible. 
113 | elseif (function_exists('iconv') && 'auto' !== $encoding) { 114 | // fprintf(STDOUT, "iconv found\n"); 115 | // iconv has the following behaviors: 116 | // - Overlong representations are ignored. 117 | // - Beyond Plane 16 is replaced with a lower char. 118 | // - Incomplete sequences generate a warning. 119 | $data = @iconv($encoding, 'UTF-8//IGNORE', $data); 120 | } else { 121 | throw new Exception('Not implemented, please install mbstring or iconv'); 122 | } 123 | 124 | /* 125 | * One leading U+FEFF BYTE ORDER MARK character must be ignored if any are present. 126 | */ 127 | if ("\xEF\xBB\xBF" === substr($data, 0, 3)) { 128 | $data = substr($data, 3); 129 | } 130 | 131 | return $data; 132 | } 133 | 134 | /** 135 | * Checks for Unicode code points that are not valid in a document. 136 | * 137 | * @param string $data A string to analyze 138 | * 139 | * @return array An array of (string) error messages produced by the scanning 140 | */ 141 | public static function checkForIllegalCodepoints($data) 142 | { 143 | // Vestigal error handling. 144 | $errors = array(); 145 | 146 | /* 147 | * All U+0000 null characters in the input must be replaced by U+FFFD REPLACEMENT CHARACTERs. 148 | * Any occurrences of such characters is a parse error. 149 | */ 150 | for ($i = 0, $count = substr_count($data, "\0"); $i < $count; ++$i) { 151 | $errors[] = 'null-character'; 152 | } 153 | 154 | /* 155 | * Any occurrences of any characters in the ranges U+0001 to U+0008, U+000B, U+000E to U+001F, U+007F 156 | * to U+009F, U+D800 to U+DFFF , U+FDD0 to U+FDEF, and characters U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, 157 | * U+2FFFE, U+2FFFF, U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE, U+6FFFF, U+7FFFE, 158 | * U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF, U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, 159 | * U+DFFFE, U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, and U+10FFFF are parse errors. 
160 | * (These are all control characters or permanently undefined Unicode characters.) 161 | */ 162 | // Check PCRE is loaded. 163 | $count = preg_match_all( 164 | '/(?: 165 | [\x01-\x08\x0B\x0E-\x1F\x7F] # U+0001 to U+0008, U+000B, U+000E to U+001F and U+007F 166 | | 167 | \xC2[\x80-\x9F] # U+0080 to U+009F 168 | | 169 | \xED(?:\xA0[\x80-\xFF]|[\xA1-\xBE][\x00-\xFF]|\xBF[\x00-\xBF]) # U+D800 to U+DFFFF 170 | | 171 | \xEF\xB7[\x90-\xAF] # U+FDD0 to U+FDEF 172 | | 173 | \xEF\xBF[\xBE\xBF] # U+FFFE and U+FFFF 174 | | 175 | [\xF0-\xF4][\x8F-\xBF]\xBF[\xBE\xBF] # U+nFFFE and U+nFFFF (1 <= n <= 10_{16}) 176 | )/x', $data, $matches); 177 | for ($i = 0; $i < $count; ++$i) { 178 | $errors[] = 'invalid-codepoint'; 179 | } 180 | 181 | return $errors; 182 | } 183 | } 184 | -------------------------------------------------------------------------------- /includes/class-parse-this-base.php: -------------------------------------------------------------------------------- 1 | format( DATE_W3C ); 40 | } 41 | 42 | public static function validate_email( $email ) { 43 | return filter_var( $email, FILTER_VALIDATE_EMAIL ); 44 | } 45 | 46 | /** 47 | * 48 | */ 49 | protected static function find_last_updated( $items ) { 50 | $items = self::order_by_date( $items, 'updated' ); 51 | $return = new DateTime( $items[0]['updated'], wp_timezone() ); 52 | return $return->format( DATE_W3C ); 53 | } 54 | 55 | /** 56 | * Utility method to limit an array to 100 values. 57 | * Originally set to 50 but some sites are very detailed in their meta. 58 | * 59 | * @ignore 60 | * @since 4.2.0 61 | * 62 | * @param array $value Array to limit. 63 | * @return array Original array if fewer than 100 values, limited array, empty array otherwise. 
64 | */ 65 | protected static function limit_array( $value ) { 66 | if ( is_array( $value ) ) { 67 | if ( count( $value ) > 100 ) { 68 | return array_slice( $value, 0, 100 ); 69 | } 70 | 71 | return $value; 72 | } 73 | 74 | return array(); 75 | } 76 | 77 | /** 78 | * Utility method to limit the length of a given string to 5,000 characters. 79 | * 80 | * @ignore 81 | * @since 4.2.0 82 | * 83 | * @param string $value String to limit. 84 | * @return bool|int|string If boolean or integer, that value. If a string, the original value 85 | * if fewer than 5,000 characters, a truncated version, otherwise an 86 | * empty string. 87 | */ 88 | protected static function limit_string( $value ) { 89 | $return = ''; 90 | if ( is_numeric( $value ) || is_bool( $value ) ) { 91 | $return = $value; 92 | } elseif ( is_string( $value ) ) { 93 | if ( mb_strlen( $value ) > 5000 ) { 94 | $return = mb_substr( $value, 0, 5000 ); 95 | } else { 96 | $return = $value; 97 | } 98 | $return = sanitize_text_field( trim( $return ) ); 99 | } 100 | 101 | return $return; 102 | } 103 | 104 | /** 105 | * Utility method to limit a given URL to 2,048 characters. Relative URLs are resolved against $source_url before being validated. 106 | * 107 | * @ignore 108 | * @since 4.2.0 109 | * 110 | * @param string $url URL to check for length and validity. 111 | * @param string $source_url URL to use to resolve relative URLs. 112 | * @return string Escaped http(s) URL if of valid length (at most 2,048 characters) and makeup. Empty string otherwise. 113 | */ 114 | protected static function limit_url( $url, $source_url ) { 115 | if ( ! is_string( $url ) ) { 116 | return ''; 117 | } 118 | 119 | // HTTP 1.1 allows 8000 chars but the "de-facto" standard supported in all current browsers is 2048. 120 | if ( strlen( $url ) > 2048 ) { 121 | return ''; // Return empty rather than a truncated/invalid URL 122 | } 123 | 124 | // Resolve against the source page first: FILTER_VALIDATE_URL rejects relative URLs, so validating before resolving discarded them. 125 | $url = pt_make_absolute_url( $url, $source_url ); 126 | 127 | // Does not look like a URL, even after resolution. 128 | if ( ! 
is_string( $url ) || ! filter_var( $url, FILTER_VALIDATE_URL ) ) { 129 | return ''; 130 | } 131 | return esc_url_raw( $url, array( 'http', 'https' ) ); 132 | } 133 | 134 | /** 135 | * Utility method to limit image source URLs. 136 | * 137 | * Excluded URLs include share-this type buttons, loaders, spinners, spacers, WordPress interface images, 138 | * tiny buttons or thumbs, mathtag.com or quantserve.com images, or the WordPress.com stats gif. 139 | * 140 | * @param string $src Image source URL. 141 | * @return string If not matched an excluded URL type, the original URL, empty string otherwise. 142 | */ 143 | protected static function limit_img( $src, $source_url ) { 144 | $src = self::limit_url( $src, $source_url ); 145 | 146 | if ( preg_match( '!/ad[sx]?/!i', $src ) ) { 147 | // Ads 148 | return ''; 149 | } elseif ( preg_match( '!(/share-?this[^.]+?\.[a-z0-9]{3,4})(\?.*)?$!i', $src ) ) { 150 | // Share-this type button 151 | return ''; 152 | } elseif ( preg_match( '!/(spinner|loading|spacer|blank|rss)\.(gif|jpg|png)!i', $src ) ) { 153 | // Loaders, spinners, spacers 154 | return ''; 155 | } elseif ( preg_match( '!/([^./]+[-_])?(spinner|loading|spacer|blank)s?([-_][^./]+)?\.[a-z0-9]{3,4}!i', $src ) ) { 156 | // Fancy loaders, spinners, spacers 157 | return ''; 158 | } elseif ( preg_match( '!([^./]+[-_])?thumb[^.]*\.(gif|jpg|png)$!i', $src ) ) { 159 | // Thumbnails, too small, usually irrelevant to context 160 | return ''; 161 | } elseif ( false !== stripos( $src, '/wp-includes/' ) ) { 162 | // Classic WordPress interface images 163 | return ''; 164 | } elseif ( false !== stripos( $src, '/wp-content/themes' ) ) { 165 | // Anything within a WordPress theme directory 166 | return ''; 167 | } elseif ( false !== stripos( $src, '/wp-content/plugins' ) ) { 168 | // Anything within a WordPress plugin directory 169 | return ''; 170 | } elseif ( preg_match( '![^\d]\d{1,2}x\d+\.(gif|jpg|png)$!i', $src ) ) { 171 | // Most often 
tiny buttons/thumbs (< 100px wide) 172 | return ''; 173 | } elseif ( preg_match( '!/pixel\.(mathtag|quantserve)\.com!i', $src ) ) { 174 | // See mathtag.com and https://www.quantcast.com/how-we-do-it/iab-standard-measurement/how-we-collect-data/ 175 | return ''; 176 | } elseif ( preg_match( '!/[gb]\.gif(\?.+)?$!i', $src ) ) { 177 | // WordPress.com stats gif 178 | return ''; 179 | } 180 | // Optionally add additional limits 181 | return apply_filters( 'parse_this_img_filters', $src ); 182 | } 183 | 184 | /** 185 | * Limit embed source URLs to specific providers. 186 | * 187 | * Not all core oEmbed providers are supported. Supported providers include YouTube, Vimeo, 188 | * Vine, Daily Motion, SoundCloud, and Twitter. 189 | * 190 | * @param string $src Embed source URL. 191 | * @param string $source_url Source URL 192 | * @return string If not from a supported provider, an empty string. Otherwise, a reformatted embed URL. 193 | */ 194 | protected static function limit_embed( $src, $source_url ) { 195 | $src = self::limit_url( $src, $source_url ); 196 | 197 | if ( empty( $src ) ) { 198 | return ''; 199 | } 200 | 201 | if ( preg_match( '!//(m|www)\.youtube\.com/(embed|v)/([^?]+)\?.+$!i', $src, $src_matches ) ) { 202 | // Embedded Youtube videos (www or mobile) 203 | $src = 'https://www.youtube.com/watch?v=' . $src_matches[3]; 204 | } elseif ( preg_match( '!//player\.vimeo\.com/video/([\d]+)([?/].*)?$!i', $src, $src_matches ) ) { 205 | // Embedded Vimeo iframe videos 206 | $src = 'https://vimeo.com/' . (int) $src_matches[1]; 207 | } elseif ( preg_match( '!//vimeo\.com/moogaloop\.swf\?clip_id=([\d]+)$!i', $src, $src_matches ) ) { 208 | // Embedded Vimeo Flash videos 209 | $src = 'https://vimeo.com/' . (int) $src_matches[1]; 210 | } elseif ( preg_match( '!//vine\.co/v/([^/]+)/embed!i', $src, $src_matches ) ) { 211 | // Embedded Vine videos 212 | $src = 'https://vine.co/v/' . 
$src_matches[1]; 213 | } elseif ( preg_match( '!//(www\.)?dailymotion\.com/embed/video/([^/?]+)([/?].+)?!i', $src, $src_matches ) ) { 214 | // Embedded Daily Motion videos 215 | $src = 'https://www.dailymotion.com/video/' . $src_matches[2]; 216 | } else { 217 | $oembed = _wp_oembed_get_object(); 218 | 219 | if ( ! $oembed->get_provider( 220 | $src, 221 | array( 222 | 'discover' => false, 223 | ) 224 | ) ) { 225 | $src = ''; 226 | } 227 | } 228 | 229 | return $src; 230 | } 231 | 232 | public static function set( $array, $key, $value ) { 233 | if ( ! isset( $array[ $key ] ) ) { 234 | $array[ $key ] = $value; 235 | } elseif ( is_string( $array[ $key ] ) ) { 236 | $array[ $key ] = array( $array[ $key ], $value ); 237 | } elseif ( is_array( $array[ $key ] ) ) { 238 | $array[ $key ][] = $value; 239 | } 240 | return $array; 241 | } 242 | } 243 | -------------------------------------------------------------------------------- /lib/html5/HTML5.php: -------------------------------------------------------------------------------- 1 | false, 25 | 26 | // Prevents the parser from automatically assigning the HTML5 namespace to the DOM document. 27 | 'disable_html_ns' => false, 28 | ); 29 | 30 | protected $errors = array(); 31 | 32 | public function __construct(array $defaultOptions = array()) 33 | { 34 | $this->defaultOptions = array_merge($this->defaultOptions, $defaultOptions); 35 | } 36 | 37 | /** 38 | * Get the current default options. 39 | * 40 | * @return array 41 | */ 42 | public function getOptions() 43 | { 44 | return $this->defaultOptions; 45 | } 46 | 47 | /** 48 | * Load and parse an HTML file. 49 | * 50 | * This will apply the HTML5 parser, which is tolerant of many 51 | * varieties of HTML, including XHTML 1, HTML 4, and well-formed HTML 52 | * 3. Note that in these cases, not all of the old data will be 53 | * preserved. For example, XHTML's XML declaration will be removed. 54 | * 55 | * The rules governing parsing are set out in the HTML 5 spec. 
56 | * 57 | * @param string|resource $file The path to the file to parse. If this is a resource, it is 58 | * assumed to be an open stream whose pointer is set to the first 59 | * byte of input. 60 | * @param array $options Configuration options when parsing the HTML. 61 | * 62 | * @return \DOMDocument A DOM document. These object type is defined by the libxml 63 | * library, and should have been included with your version of PHP. 64 | */ 65 | public function load($file, array $options = array()) 66 | { 67 | // Handle the case where file is a resource. 68 | if (is_resource($file)) { 69 | return $this->parse(stream_get_contents($file), $options); 70 | } 71 | 72 | return $this->parse(file_get_contents($file), $options); 73 | } 74 | 75 | /** 76 | * Parse a HTML Document from a string. 77 | * 78 | * Take a string of HTML 5 (or earlier) and parse it into a 79 | * DOMDocument. 80 | * 81 | * @param string $string A html5 document as a string. 82 | * @param array $options Configuration options when parsing the HTML. 83 | * 84 | * @return \DOMDocument A DOM document. DOM is part of libxml, which is included with 85 | * almost all distribtions of PHP. 86 | */ 87 | public function loadHTML($string, array $options = array()) 88 | { 89 | return $this->parse($string, $options); 90 | } 91 | 92 | /** 93 | * Convenience function to load an HTML file. 94 | * 95 | * This is here to provide backwards compatibility with the 96 | * PHP DOM implementation. It simply calls load(). 97 | * 98 | * @param string $file The path to the file to parse. If this is a resource, it is 99 | * assumed to be an open stream whose pointer is set to the first 100 | * byte of input. 101 | * @param array $options Configuration options when parsing the HTML. 102 | * 103 | * @return \DOMDocument A DOM document. These object type is defined by the libxml 104 | * library, and should have been included with your version of PHP. 
105 | */ 106 | public function loadHTMLFile($file, array $options = array()) 107 | { 108 | return $this->load($file, $options); 109 | } 110 | 111 | /** 112 | * Parse a HTML fragment from a string. 113 | * 114 | * @param string $string the HTML5 fragment as a string 115 | * @param array $options Configuration options when parsing the HTML 116 | * 117 | * @return \DOMDocumentFragment A DOM fragment. The DOM is part of libxml, which is included with 118 | * almost all distributions of PHP. 119 | */ 120 | public function loadHTMLFragment($string, array $options = array()) 121 | { 122 | return $this->parseFragment($string, $options); 123 | } 124 | 125 | /** 126 | * Return all errors encountered into parsing phase. 127 | * 128 | * @return array 129 | */ 130 | public function getErrors() 131 | { 132 | return $this->errors; 133 | } 134 | 135 | /** 136 | * Return true it some errors were encountered into parsing phase. 137 | * 138 | * @return bool 139 | */ 140 | public function hasErrors() 141 | { 142 | return count($this->errors) > 0; 143 | } 144 | 145 | /** 146 | * Parse an input string. 147 | * 148 | * @param string $input 149 | * @param array $options 150 | * 151 | * @return \DOMDocument 152 | */ 153 | public function parse($input, array $options = array()) 154 | { 155 | $this->errors = array(); 156 | $options = array_merge($this->defaultOptions, $options); 157 | $events = new DOMTreeBuilder(false, $options); 158 | $scanner = new Scanner($input, !empty($options['encoding']) ? $options['encoding'] : 'UTF-8'); 159 | $parser = new Tokenizer($scanner, $events, !empty($options['xmlNamespaces']) ? Tokenizer::CONFORMANT_XML : Tokenizer::CONFORMANT_HTML); 160 | 161 | $parser->parse(); 162 | $this->errors = $events->getErrors(); 163 | 164 | return $events->document(); 165 | } 166 | 167 | /** 168 | * Parse an input stream where the stream is a fragment. 169 | * 170 | * Lower-level loading function. This requires an input stream instead 171 | * of a string, file, or resource. 
172 | * 173 | * @param string $input The input data to parse in the form of a string. 174 | * @param array $options An array of options. 175 | * 176 | * @return \DOMDocumentFragment 177 | */ 178 | public function parseFragment($input, array $options = array()) 179 | { 180 | $options = array_merge($this->defaultOptions, $options); 181 | $events = new DOMTreeBuilder(true, $options); 182 | $scanner = new Scanner($input, !empty($options['encoding']) ? $options['encoding'] : 'UTF-8'); 183 | $parser = new Tokenizer($scanner, $events, !empty($options['xmlNamespaces']) ? Tokenizer::CONFORMANT_XML : Tokenizer::CONFORMANT_HTML); 184 | 185 | $parser->parse(); 186 | $this->errors = $events->getErrors(); 187 | 188 | return $events->fragment(); 189 | } 190 | 191 | /** 192 | * Save a DOM into a given file as HTML5. 193 | * 194 | * @param mixed $dom The DOM to be serialized. 195 | * @param string|resource $file The filename to be written or resource to write to. 196 | * @param array $options Configuration options when serializing the DOM. These include: 197 | * - encode_entities: Text written to the output is escaped by default and not all 198 | * entities are encoded. If this is set to true all entities will be encoded. 199 | * Defaults to false. 200 | */ 201 | public function save($dom, $file, $options = array()) 202 | { 203 | $close = true; 204 | if (is_resource($file)) { 205 | $stream = $file; 206 | $close = false; 207 | } else { 208 | $stream = fopen($file, 'wb'); 209 | } 210 | $options = array_merge($this->defaultOptions, $options); 211 | $rules = new OutputRules($stream, $options); 212 | $trav = new Traverser($dom, $stream, $rules, $options); 213 | 214 | $trav->walk(); 215 | /* 216 | * release the traverser to avoid cyclic references and allow PHP to free memory without waiting for gc_collect_cycles 217 | */ 218 | $rules->unsetTraverser(); 219 | if ($close) { 220 | fclose($stream); 221 | } 222 | } 223 | 224 | /** 225 | * Convert a DOM into an HTML5 string. 
226 | * 227 | * @param mixed $dom The DOM to be serialized. 228 | * @param array $options Configuration options when serializing the DOM. These include: 229 | * - encode_entities: Text written to the output is escaped by default and not all 230 | * entities are encoded. If this is set to true all entities will be encoded. 231 | * Defaults to false. 232 | * 233 | * @return string A HTML5 documented generated from the DOM. 234 | */ 235 | public function saveHTML($dom, $options = array()) 236 | { 237 | $stream = fopen('php://temp', 'wb'); 238 | $this->save($dom, $stream, array_merge($this->defaultOptions, $options)); 239 | 240 | $html = stream_get_contents($stream, -1, 0); 241 | 242 | fclose($stream); 243 | 244 | return $html; 245 | } 246 | } 247 | -------------------------------------------------------------------------------- /includes/class-parse-this-discovery.php: -------------------------------------------------------------------------------- 1 | 15, 63 | 'limit_response_size' => 1048576, 64 | 'redirection' => 5, 65 | // Use an explicit user-agent for Parse This 66 | ); 67 | $links = array(); 68 | 69 | $response = wp_safe_remote_get( $url, $args ); 70 | $response_code = wp_remote_retrieve_response_code( $response ); 71 | $content_type = wp_remote_retrieve_header( $response, 'content-type' ); 72 | $wprest = array(); 73 | $linkheaders = wp_remote_retrieve_header( $response, 'link' ); 74 | if ( $linkheaders ) { 75 | if ( is_array( $linkheaders ) ) { 76 | foreach ( $linkheaders as $link ) { 77 | if ( preg_match( '/<(.[^>]+)>;\s+rel\s?=\s?[\"\']?(https:\/\/)?api.w.org?\/?[\"\']?/i', $link, $result ) ) { 78 | $wprest[] = array( 79 | 'url' => untrailingslashit( pt_make_absolute_url( $result[1], $url ) ), 80 | 'type' => 'feed', 81 | '_feed_type' => 'wordpress', 82 | 'name' => 'WordPress REST API', 83 | ); 84 | } 85 | } 86 | } else { 87 | if ( preg_match( '/<(.[^>]+)>;\s+rel\s?=\s?[\"\']?(https:\/\/)?api.w.org?\/?[\"\']?/i', $linkheaders, $result ) ) { 88 | $wprest[] = 
array( 89 | 'url' => untrailingslashit( pt_make_absolute_url( $result[1], $url ) ), 90 | 'type' => 'feed', 91 | '_feed_type' => 'wordpress', 92 | 'name' => 'WordPress REST API', 93 | ); 94 | } 95 | } 96 | } 97 | if ( in_array( $response_code, array( 403, 415 ), true ) ) { 98 | $args['user-agent'] = $user_agent; 99 | $response = wp_safe_remote_get( $url, $args ); 100 | $response_code = wp_remote_retrieve_response_code( $response ); 101 | if ( in_array( $response_code, array( 403, 415 ), true ) ) { 102 | return new WP_Error( 'source_error', 'Unable to Retrieve' ); 103 | } 104 | } 105 | 106 | // Strip any character set off the content type 107 | $ct = explode( ';', $content_type ); 108 | if ( is_array( $ct ) ) { 109 | $content_type = array_shift( $ct ); 110 | } 111 | $content_type = trim( $content_type ); 112 | 113 | $content = wp_remote_retrieve_body( $response ); 114 | // Find Youtube RSS Feeds 115 | if ( in_array( wp_parse_url( $url, PHP_URL_HOST ), array( 'www.youtube.com', 'm.youtube.com', 'youtube.com' ), true ) ) { 116 | $links[] = array( 117 | 'url' => self::youtube_rss( $url ), 118 | 'type' => 'feed', 119 | '_feed_type' => 'atom', 120 | 'name' => 'YouTube Feed', 121 | ); 122 | } 123 | // This is an RSS or Atom Feed URL and if it is not we do not know how to deal with XML anyway 124 | if ( ( in_array( $content_type, array( 'application/rss+xml', 'application/atom+xml', 'text/xml', 'application/xml', 'text/xml' ), true ) ) ) { 125 | $content = Parse_This::fetch_feed( $url ); 126 | if ( class_exists( 'Parse_This_RSS' ) ) { 127 | $links[] = array( 128 | 'url' => $url, 129 | 'type' => 'feed', 130 | '_feed_type' => Parse_This_RSS::get_type( $content ), 131 | 'name' => $content->get_title(), 132 | ); 133 | } 134 | return array( 'results' => $links ); 135 | } 136 | 137 | if ( in_array( $content_type, array( 'application/mf2+json', 'application/jf2+json', 'application/jf2feed+json' ), true ) ) { 138 | $content = json_decode( $content, true ); 139 | } 140 | if ( 
'application/json' === $content_type ) { 141 | $content = json_decode( $content, true ); 142 | if ( $content && isset( $content['version'] ) && 'https://jsonfeed.org/version/1' === $content['version'] ) { 143 | $links[] = array( 144 | 'url' => $url, 145 | 'type' => 'feed', 146 | '_feed_type' => 'jsonfeed', 147 | ); 148 | } 149 | return array( 'results' => $links ); 150 | } 151 | if ( 'text/html' === $content_type ) { 152 | $doc = pt_load_domdocument( $content ); 153 | if ( $doc instanceof DOMDocument ) { 154 | $xpath = new DOMXPath( $doc ); 155 | // Fetch and gather data. 156 | $mf2 = false; 157 | foreach ( $xpath->query( '(//link|//a)[@rel and @href]' ) as $link ) { 158 | $rel = $link->getAttribute( 'rel' ); 159 | $href = $link->getAttribute( 'href' ); 160 | $title = $link->getAttribute( 'title' ); 161 | $type = self::get_feed_type( $link->getAttribute( 'type' ) ); 162 | if ( 'microformats' === $type ) { 163 | $mf2 = true; 164 | } 165 | 166 | if ( in_array( $rel, array( 'alternate', 'feed' ), true ) && ! empty( $type ) ) { 167 | $links[] = array_filter( 168 | array( 169 | 'url' => pt_make_absolute_url( $href, $url ), 170 | 'type' => 'feed', 171 | '_feed_type' => $type, 172 | 'name' => $title, 173 | '_mime-type' => $link->getAttribute( 'type' ), 174 | '_rel' => $rel, 175 | ) 176 | ); 177 | } 178 | if ( 'https://api.w.org/' === $rel && empty( $wprest ) ) { 179 | $wprest[] = array_filter( 180 | array( 181 | 'url' => untrailingslashit( pt_make_absolute_url( $href, $url ) ), 182 | 'type' => 'feed', 183 | '_feed_type' => 'wordpress', 184 | 'name' => 'WordPress REST API', 185 | ) 186 | ); 187 | } 188 | } 189 | 190 | // If an mf2 feed was found, do not check to see if this page is also one. 191 | if ( ! $mf2 ) { 192 | // Check to see if the current page is an h-feed 193 | $feeds = Parse_This_MF2::find_hfeed( $doc, $url ); 194 | foreach ( $feeds as $key => $feed ) { 195 | if ( ! 
Parse_This_MF2::is_microformat( $feed ) ) { 196 | continue; 197 | } 198 | if ( array_key_exists( 'children', $feed ) ) { 199 | unset( $feed['children'] ); 200 | } 201 | $jf2 = mf2_to_jf2( $feed ); 202 | if ( isset( $jf2['type'] ) && 'feed' === $jf2['type'] ) { 203 | $author = array(); 204 | if ( array_key_exists( 'author', $jf2 ) ) { 205 | if ( is_array( $jf2['author'] ) ) { 206 | $author = $jf2['author']; 207 | } elseif ( is_string( $jf2['author'] ) ) { 208 | $author = array( 209 | 'type' => 'card', 210 | ); 211 | if ( wp_http_validate_url( $jf2['author'] ) ) { 212 | $author['url'] = $jf2['author']; 213 | } else { 214 | $author['name'] = $jf2['author']; 215 | } 216 | } 217 | } 218 | $links[] = array_filter( 219 | array( 220 | 'url' => $jf2['url'], 221 | 'type' => 'feed', 222 | '_feed_type' => 'microformats', 223 | 'name' => isset( $jf2['name'] ) ? $jf2['name'] : null, 224 | 'author' => $author, 225 | ) 226 | ); 227 | } 228 | } 229 | } 230 | } 231 | 232 | if ( ! empty( $wprest ) ) { 233 | $links = array_merge( $wprest, $links ); 234 | } 235 | 236 | // Sort feeds by priority 237 | $rank = array( 238 | 'jf2feed' => 0, 239 | 'microformats' => 1, 240 | 'jsonfeed' => 2, 241 | 'wordpress' => 3, 242 | 'atom' => 4, 243 | 'rss' => 5, 244 | ); 245 | usort( 246 | $links, 247 | function( $a, $b ) use ( $rank ) { 248 | return $rank[ $a['_feed_type'] ] - $rank[ $b['_feed_type'] ]; 249 | } 250 | ); 251 | 252 | return array( 'results' => $links ); 253 | 254 | } 255 | } 256 | 257 |

	/**
	 * Build the YouTube feed URL for a channel, user or playlist page URL.
	 *
	 * Recognised inputs are paths containing /channel/{id} or /user/{name}, and
	 * /playlist pages carrying a `list` query argument. Trailing slashes, sub-pages
	 * (such as /videos) and unrelated query arguments are ignored, so they can no
	 * longer prevent a match or end up inside the generated feed URL.
	 *
	 * @param string $url YouTube page URL.
	 * @return string|null Feed URL on the videos.xml endpoint, or null when the URL is not recognised.
	 */
	private static function youtube_rss( $url ) {
		$youtube_url_base = 'https://www.youtube.com/feeds/videos.xml';

		$path  = (string) wp_parse_url( $url, PHP_URL_PATH );
		$query = (string) wp_parse_url( $url, PHP_URL_QUERY );

		// Feed query argument => path segment that precedes the identifier.
		$path_entities = array(
			'channel_id' => 'channel',
			'user'       => 'user',
		);

		foreach ( $path_entities as $key => $segment ) {
			if ( preg_match( '#/' . $segment . '/([^/]+)#', $path, $matches ) ) {
				return $youtube_url_base . '?' . $key . '=' . $matches[1];
			}
		}

		// Playlists keep their identifier in the query string rather than in the path.
		if ( preg_match( '#/playlist/?$#', $path ) ) {
			$params = array();
			parse_str( $query, $params );
			if ( ! empty( $params['list'] ) && is_string( $params['list'] ) ) {
				// parse_str() URL-decodes the value, so encode it again for the feed URL.
				return $youtube_url_base . '?playlist_id=' . rawurlencode( $params['list'] );
			}
		}

		// Same result as before for unrecognised URLs, now stated explicitly.
		return null;
	}
}
275 | -------------------------------------------------------------------------------- /lib/html5/README.md: -------------------------------------------------------------------------------- 1 | > # UKRAINE NEEDS YOUR HELP NOW! 2 | > 3 | > On 24 February 2022, Russian [President Vladimir Putin ordered an invasion of Ukraine by Russian Armed Forces](https://www.bbc.com/news/world-europe-60504334). 4 | > 5 | > Your support is urgently needed. 6 | > 7 | > - Donate to the volunteers. Here is the volunteer fund helping the Ukrainian army to provide all the necessary equipment: 8 | > https://bank.gov.ua/en/news/all/natsionalniy-bank-vidkriv-spetsrahunok-dlya-zboru-koshtiv-na-potrebi-armiyi or https://savelife.in.ua/en/donate/ 9 | > - Triple-check social media sources. Russian disinformation is attempting to coverup and distort the reality in Ukraine. 10 | > - Help Ukrainian refugees who are fleeing Russian attacks and shellings: https://www.globalcitizen.org/en/content/ways-to-help-ukraine-conflict/ 11 | > - Put pressure on your political representatives to provide help to Ukraine. 12 | > - Believe in the Ukrainian people, they will not surrender, they don't have another Ukraine. 13 | > 14 | > THANK YOU! 15 | ---- 16 | 17 | # HTML5-PHP 18 | 19 | HTML5 is a standards-compliant HTML5 parser and writer written entirely in PHP. 20 | It is stable and used in many production websites, and has 21 | well over [five million downloads](https://packagist.org/packages/masterminds/html5). 22 | 23 | HTML5 provides the following features.
24 | 25 | - An HTML5 serializer 26 | - Support for PHP namespaces 27 | - Composer support 28 | - Event-based (SAX-like) parser 29 | - A DOM tree builder 30 | - Interoperability with [QueryPath](https://github.com/technosophos/querypath) 31 | - Runs on **PHP** 5.3.0 or newer 32 | 33 | [Build Status](https://travis-ci.org/Masterminds/html5-php) 34 | [Latest Stable Version](https://packagist.org/packages/masterminds/html5) 35 | [Code Coverage](https://scrutinizer-ci.com/g/Masterminds/html5-php/?branch=master) 36 | [Scrutinizer Code Quality](https://scrutinizer-ci.com/g/Masterminds/html5-php/?branch=master) 37 | [Stability: Sustained](https://masterminds.github.io/stability/sustained.html) 38 | 39 | ## Installation 40 | 41 | Install HTML5-PHP using [composer](http://getcomposer.org/). 42 | 43 | By adding the `masterminds/html5` dependency to your `composer.json` file: 44 | 45 | ```json 46 | { 47 | "require" : { 48 | "masterminds/html5": "^2.0" 49 | } 50 | } 51 | ``` 52 | 53 | Or by invoking the `require` command via the Composer executable: 54 | 55 | ```bash 56 | composer require masterminds/html5 57 | ``` 58 | 59 | ## Basic Usage 60 | 61 | HTML5-PHP has a high-level API and a low-level API. 62 | 63 | Here is how you use the high-level `HTML5` library API: 64 | 65 | ```php 66 | <?php 67 | // Assuming you installed from Composer: 68 | require "vendor/autoload.php"; 69 | 70 | use Masterminds\HTML5; 71 | 72 | // An example HTML document: 73 | $html = <<< 'HERE' 74 | <html> 75 | <head> 76 | <title>TEST</title> 77 | </head> 78 | <body id='foo'> 79 | <h1>Hello World</h1> 80 | <p>This is a test of the HTML5 parser.</p>
81 | </body> 82 | </html> 83 | HERE; 84 | 85 | // Parse the document. $dom is a DOMDocument. 86 | $html5 = new HTML5(); 87 | $dom = $html5->loadHTML($html); 88 | 89 | // Render it as HTML5: 90 | print $html5->saveHTML($dom); 91 | 92 | // Or save it to a file: 93 | $html5->save($dom, 'out.html'); 94 | ``` 95 | 96 | The `$dom` created by the parser is a full `DOMDocument` object. And the 97 | `save()` and `saveHTML()` methods will take any DOMDocument. 98 | 99 | ### Options 100 | 101 | It is possible to pass in an array of configuration options when loading 102 | an HTML5 document. 103 | 104 | ```php 105 | // An associative array of options 106 | $options = array( 107 | 'option_name' => 'option_value', 108 | ); 109 | 110 | // Provide the options to the constructor 111 | $html5 = new HTML5($options); 112 | 113 | $dom = $html5->loadHTML($html); 114 | ``` 115 | 116 | The following options are supported: 117 | 118 | * `encode_entities` (boolean): Indicates that the serializer should aggressively 119 | encode characters as entities. Without this, it only encodes the bare 120 | minimum. 121 | * `disable_html_ns` (boolean): Prevents the parser from automatically 122 | assigning the HTML5 namespace to the DOM document. This is for 123 | non-namespace aware DOM tools. 124 | * `target_document` (\DOMDocument): A DOM document that will be used as the 125 | destination for the parsed nodes. 126 | * `implicit_namespaces` (array): An assoc array of namespaces that should be 127 | used by the parser. Name is tag prefix, value is NS URI. 128 | 129 | ## The Low-Level API 130 | 131 | This library provides the following low-level APIs that you can use to 132 | create more customized HTML5 tools: 133 | 134 | - A SAX-like event-based parser that you can hook into for special kinds 135 | of parsing. 136 | - A flexible error-reporting mechanism that can be tuned to document 137 | syntax checking. 138 | - A DOM implementation that uses PHP's built-in DOM library.
139 | 140 | The unit tests exercise each piece of the API, and every public function 141 | is well-documented. 142 | 143 | ### Parser Design 144 | 145 | The parser is designed as follows: 146 | 147 | - The `Scanner` handles scanning on behalf of the parser. 148 | - The `Tokenizer` requests data off of the scanner, parses it, classifies 149 | it, and sends it to an `EventHandler`. It is a *recursive descent parser.* 150 | - The `EventHandler` receives notifications and data for each specific 151 | semantic event that occurs during tokenization. 152 | - The `DOMTreeBuilder` is an `EventHandler` that listens for tokenizing 153 | events and builds a document tree (`DOMDocument`) based on the events. 154 | 155 | ### Serializer Design 156 | 157 | The serializer takes a data structure (the `DOMDocument`) and transforms 158 | it into a character representation -- an HTML5 document. 159 | 160 | The serializer is broken into three parts: 161 | 162 | - The `OutputRules` contain the rules to turn DOM elements into strings. The 163 | rules are an implementation of the interface `RulesInterface` allowing for 164 | different rule sets to be used. 165 | - The `Traverser`, which is a special-purpose tree walker. It visits 166 | each node in the tree and uses the `OutputRules` to transform the node 167 | into a string. 168 | - `HTML5` manages the `Traverser` and stores the resultant data 169 | in the correct place. 170 | 171 | The serializer (`save()`, `saveHTML()`) follows 172 | [section 8.9 of the HTML 5.0 spec](http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#serializing-html-fragments).
173 | So tags are serialized according to these rules: 174 | 175 | - A tag with children: <foo>CHILDREN</foo> 176 | - A tag that cannot have content: <foo> (no closing tag) 177 | - A tag that could have content, but doesn't: <foo></foo> 178 | 179 | ## Known Issues (Or, Things We Designed Against the Spec) 180 | 181 | Please check the issue queue for a full list, but the following are 182 | known issues that are not presently on the roadmap: 183 | 184 | - Namespaces: HTML5 only [supports a selected list of namespaces](http://www.w3.org/TR/html5/infrastructure.html#namespaces) 185 | and they do not operate in the same way as XML namespaces. A `:` has no special 186 | meaning. 187 | By default the parser does not support XML style namespaces via `:`; 188 | to enable the XML namespaces see the [XML Namespaces section](#xml-namespaces). 189 | - Scripts: This parser does not contain a JavaScript or a CSS 190 | interpreter. While one may be supplied, not all features will be 191 | supported. 192 | - Reentrance: The current parser is not re-entrant. (Thus you can't pause 193 | the parser to modify the HTML string mid-parse.) 194 | - Validation: The current tree builder is **not** a validating parser. 195 | While it will correct some HTML, it does not check that the HTML 196 | conforms to the standard. (Should you wish, you can build a validating 197 | parser by extending `DOMTreeBuilder` or building your own `EventHandler` 198 | implementation.) 199 | * There is limited support for insertion modes. 200 | * Some autocorrection is done automatically. 201 | * Per the spec, many legacy tags are admitted and correctly handled, 202 | even though they are technically not part of HTML5. 203 | - Attribute names and values: Due to the implementation details of the 204 | PHP implementation of DOM, attribute names that do not follow the 205 | XML 1.0 standard are not inserted into the DOM. (Effectively, they 206 | are ignored.) If you've got a clever fix for this, jump in!
207 | - Processing Instructions: The HTML5 spec does not allow processing 208 | instructions. We do. Since this is a server-side library, we think 209 | this is useful. And that means, dear reader, that in some cases you 210 | can parse the HTML from a mixed PHP/HTML document. This, however, 211 | is an incidental feature, not a core feature. 212 | - HTML manifests: Unsupported. 213 | - PLAINTEXT: Unsupported. 214 | - Adoption Agency Algorithm: Not yet implemented. (8.2.5.4.7) 215 | 216 | ## XML Namespaces 217 | 218 | To use XML-style namespaces, enable the `xmlNamespaces` option on the main `HTML5` instance. 219 | 220 | ```php 221 | use Masterminds\HTML5; 222 | $html = new HTML5(array( 223 | "xmlNamespaces" => true 224 | )); 225 | 226 | $dom = $html->loadHTML('