├── lib ├── html5 │ ├── HTML5 │ │ ├── Exception.php │ │ ├── Parser │ │ │ ├── ParseError.php │ │ │ ├── FileInputStream.php │ │ │ ├── CharacterReference.php │ │ │ ├── README.md │ │ │ ├── InputStream.php │ │ │ ├── TreeBuildingRules.php │ │ │ ├── EventHandler.php │ │ │ ├── UTF8Utils.php │ │ │ ├── StringInputStream.php │ │ │ └── Scanner.php │ │ ├── Serializer │ │ │ ├── README.md │ │ │ ├── RulesInterface.php │ │ │ └── Traverser.php │ │ └── InstructionProcessor.php │ ├── autoloader.php │ ├── UPGRADING.md │ ├── RELEASE.md │ ├── HTML5.php │ └── README.md └── mf2 │ └── LICENSE.md ├── includes ├── autoload.php ├── class-parse-this-json.php ├── class-parse-this-youtube.php ├── class-parse-this-opml.php ├── class-parse-this-twitter.php ├── class-parse-this-jsonfeed.php ├── class-rest-parse-this.php ├── class-parse-this-instagram.php ├── compat-functions.php ├── class-parse-this-base.php ├── class-parse-this-discovery.php ├── class-parse-this-rss.php ├── class-parse-this-restapi.php ├── class-parse-this-mf2-utils.php ├── class-parse-this-html.php └── class-parse-this.php ├── parse-this.php └── readme.txt /lib/html5/HTML5/Exception.php: -------------------------------------------------------------------------------- 1 | ....'); 10 | \HTML5::saveHTML($dom); 11 | 12 | After: 13 | 14 | use Masterminds\HTML5; 15 | 16 | $html5 = new HTML5(); 17 | 18 | $dom = $html5->loadHTML('....'); 19 | echo $html5->saveHTML($dom); 20 | 21 | 22 | -------------------------------------------------------------------------------- /parse-this.php: -------------------------------------------------------------------------------- 1 | query( "//script[@type='application/json']" ) as $script ) { 20 | $content = $script->textContent; // phpcs:ignore 21 | $json[] = json_decode( $content, true ); 22 | } 23 | $json = array_filter( $json ); 24 | 25 | $jf2 = array(); 26 | 27 | if ( 1 === count( $json ) && wp_is_numeric_array( $json ) ) { 28 | $json = $json[0]; 29 | if ( array_key_exists( 'props', $json ) ) { 30 | 
$props = $json['props']; 31 | if ( array_key_exists( 'pageProps', $props ) ) { 32 | $props = $props['pageProps']; 33 | if ( array_key_exists( 'article', $props ) ) { 34 | $jf2['type'] = 'entry'; 35 | $jf2['name'] = ifset( $props['article']['title'] ); 36 | if ( array_key_exists( 'meta', $props['article'] ) ) { 37 | $jf2['published'] = normalize_iso8601( ifset( $props['article']['meta']['date'] ) ); 38 | $jf2['category'] = ifset( $props['article']['meta']['tags'] ); 39 | } 40 | } 41 | } 42 | } 43 | } 44 | $jf2 = array_filter( $jf2 ); 45 | 46 | if ( WP_DEBUG ) { 47 | $jf2['_json'] = $json; 48 | } 49 | return array_filter( $jf2 ); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /readme.txt: -------------------------------------------------------------------------------- 1 | === Parse This === 2 | Contributors: dshanske 3 | Tags: indieweb 4 | Stable tag: trunk 5 | Requires at least: 4.9 6 | Requires PHP: 5.6 7 | Tested up to: 5.6 8 | License: GPLv2 or later 9 | License URI: http://www.gnu.org/licenses/gpl-2.0.html 10 | 11 | Parse This turns URLs into structured jf2 data 12 | 13 | == Description == 14 | 15 | Parse This is based on a variety of projects including the parsing code from Press This, which was removed from WordPress. 16 | 17 | * It supports parsing from MF2 if present 18 | * For sites that are not marked up with Microformats 2(MF2) it will fall back onto parsing JSON-LD, then HTML/OpenGraph/Dublin Core Tags/etc. 19 | * It supports parsing of JSONFeed and RSS/Atom feeds 20 | * It supports parsing of WordPress REST API endpoints to generate a site feed 21 | 22 | The goal is to produce structured jf2 data that can be used for previewing links as well as feed readers and other options. It is also bundled in the Post Kinds and Yarns Microsub plugins as a library. 
23 | 24 | It can be installed as a standalone plugin which will provide the necessary libraries and functionality as well as the REST API endpoint for getting JF2 data from an arbitrary URL or a WordPress Post. 25 | 26 | 27 | == Frequently Asked Questions == 28 | 29 | == Changelog == 30 | 31 | = 1.0.1 ( 2021-04-02 ) = 32 | * Remove SimplePie as a dependency as the latest version 1.5.6 is now bundled with WordPress as of 5.6. 33 | * Remove MB polyfill due issues with PHP8.0 compatibility in favor of simpler solution. 34 | 35 | = 1.0.0 ( 2020-12-15 ) = 36 | * First Official Release. Prior to this point it was in a point release. 37 | -------------------------------------------------------------------------------- /lib/html5/HTML5/Parser/CharacterReference.php: -------------------------------------------------------------------------------- 1 | ifset( $details['videoID'] ), 31 | 'name' => ifset( $details['title'] ), 32 | 'duration' => seconds_to_iso8601( ifset( $details['lengthSeconds'] ) ), 33 | 'category' => ifset( $details['keywords'] ), 34 | 'summary' => ifset( $details['shortDescription'] ), 35 | 'published' => normalize_iso8601( ifset( $microformat['publishDate'] ) ), 36 | ); 37 | $author = array( 38 | 'type' => 'card', 39 | 'url' => ifset( $microformat['ownerProfileUrl'] ), 40 | 'name' => ifset( $details['author'] ), 41 | ); 42 | $jf2['author'] = array_filter( $author ); 43 | 44 | if ( isset( $details['thumbnail'] ) ) { 45 | $thumbnail = end( $details['thumbnail']['thumbnails'] ); 46 | $jf2['featured'] = $thumbnail['url']; 47 | } 48 | if ( isset( $microformat['embed'] ) ) { 49 | $jf2['video'] = ifset( $microformat['embed']['iframeUrl'] ); 50 | } 51 | if ( WP_DEBUG ) { 52 | $jf2['_yt'] = $decode; 53 | } 54 | return array_filter( $jf2 ); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /lib/html5/HTML5/Parser/README.md: -------------------------------------------------------------------------------- 1 | # The 
Parser Model 2 | 3 | The parser model here follows the model in section 4 | [8.2.1](http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#parsing) 5 | of the HTML5 specification, though we do not assume a networking layer. 6 | 7 | [ InputStream ] // Generic support for reading input. 8 | || 9 | [ Scanner ] // Breaks down the stream into characters. 10 | || 11 | [ Tokenizer ] // Groups characters into syntactic units 12 | || 13 | [ Tree Builder ] // Organizes units into a tree of objects 14 | || 15 | [ DOM Document ] // The final state of the parsed document. 16 | 17 | 18 | ## InputStream 19 | 20 | This is an interface with at least two concrete implementations: 21 | 22 | - StringInputStream: Reads an HTML5 string. 23 | - FileInputStream: Reads an HTML5 file. 24 | 25 | ## Scanner 26 | 27 | This is a mechanical piece of the parser. 28 | 29 | ## Tokenizer 30 | 31 | This follows section 8.4 of the HTML5 spec. It is (roughly) a recursive 32 | descent parser. (Though there are plenty of optimizations that are less 33 | than purely functional.) 34 | 35 | ## EventHandler and DOMTree 36 | 37 | EventHandler is the interface for tree builders. Since not all 38 | implementations will necessarily build trees, we've chosen a more 39 | generic name. 40 | 41 | The event handler emits tokens during tokenization. 42 | 43 | The DOMTree is an event handler that builds a DOM tree. The output of 44 | the DOMTree builder is a DOMDocument. 45 | 46 | ## DOMDocument 47 | 48 | PHP has a DOMDocument class built-in (technically, it's part of libxml.) 49 | We use that, thus rendering the output of this process compatible with 50 | SimpleXML, QueryPath, and many other XML/HTML processing tools. 51 | 52 | For cases where the HTML5 is a fragment of an HTML5 document a 53 | DOMDocumentFragment is returned instead. This is another built-in class. 
54 | -------------------------------------------------------------------------------- /lib/html5/HTML5/InstructionProcessor.php: -------------------------------------------------------------------------------- 1 | 15, 23 | 'limit_response_size' => 1048576, 24 | 'redirection' => 5, 25 | // Use an explicit user-agent for Parse This 26 | ); 27 | $links = array(); 28 | 29 | $response = wp_safe_remote_get( $url, $args ); 30 | $response_code = wp_remote_retrieve_response_code( $response ); 31 | $content_type = wp_remote_retrieve_header( $response, 'content-type' ); 32 | 33 | if ( in_array( $response_code, array( 403, 415 ), true ) ) { 34 | $args['user-agent'] = $user_agent; 35 | $response = wp_safe_remote_get( $url, $args ); 36 | $response_code = wp_remote_retrieve_response_code( $response ); 37 | if ( in_array( $response_code, array( 403, 415 ), true ) ) { 38 | return new WP_Error( 'source_error', 'Unable to Retrieve' ); 39 | } 40 | } 41 | 42 | // Strip any character set off the content type 43 | $ct = explode( ';', $content_type ); 44 | if ( is_array( $ct ) ) { 45 | $content_type = array_shift( $ct ); 46 | } 47 | $content_type = trim( $content_type ); 48 | 49 | $content = wp_remote_retrieve_body( $response ); 50 | return $content; 51 | } 52 | 53 | public function convert( $content ) { 54 | $xml = simplexml_load_string( $content ); 55 | $xml = $xml->body; 56 | $return = array(); 57 | foreach ( $xml->outline as $outline ) { 58 | $top = array( 59 | 'title' => $outline['title'], 60 | 'children' => array(), 61 | ); 62 | foreach ( $outline as $feed ) { 63 | $top['children'][] = array( 64 | 'name' => $feed['title'], 65 | 'url' => $feed['xmlUrl'], 66 | ); 67 | } 68 | $return[] = $top; 69 | } 70 | return $return; 71 | } 72 | } 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /includes/class-parse-this-twitter.php: -------------------------------------------------------------------------------- 1 | 15, 17 | 
'limit_response_size' => 1048576, 18 | 'redirection' => 5, 19 | // Use an explicit user-agent for Parse This 20 | 'user_agent' => 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:57.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36 Parse This/WP', 21 | ); 22 | $url = add_query_arg( 'url', $url, 'https://publish.twitter.com/oembed' ); 23 | $response = wp_safe_remote_get( $url, $args ); 24 | $oembed = json_decode( wp_remote_retrieve_body( $response ), true ); 25 | $jf2 = array(); 26 | if ( array_key_exists( 'url', $oembed ) ) { 27 | $jf2['url'] = $oembed['url']; 28 | } 29 | if ( array_key_exists( 'html', $oembed ) ) { 30 | $html = $oembed['html']; 31 | $dom = pt_load_domdocument( $html ); 32 | $html = explode( '—', $html ); 33 | $html = $html[0]; 34 | $text = wp_strip_all_tags( $html ); 35 | $text = explode( '—', $text ); 36 | $text = $text[0]; 37 | 38 | $links = $dom->getElementsByTagName( 'a' ); 39 | $names = array(); 40 | $category = array(); 41 | foreach ( $links as $link ) { 42 | $key = wp_strip_all_tags( $link->nodeValue ); // phpcs:ignore 43 | $value = $link->getAttribute( 'href' ); 44 | $parse = wp_parse_url( $value ); 45 | unset( $parse['query'] ); 46 | $value = build_url( $parse ); 47 | if ( '#' === $key[0] ) { 48 | $category[] = str_replace( '#', '', $key ); 49 | } elseif ( '@' === $key[0] ) { 50 | $category[] = $value; 51 | } elseif ( $jf2['url'] === $value ) { 52 | $published = new DateTime( $key ); 53 | $jf2['published'] = $published->format( DATE_W3C ); 54 | } else { 55 | $names[ wp_strip_all_tags( $key ) ] = normalize_url( $value ); // phpcs:ignore 56 | } 57 | } 58 | $jf2['links'] = $names; 59 | $jf2['category'] = $category; 60 | $jf2['content'] = array( 61 | 'html' => Parse_This::clean_content( $html, array( 'blockquote' => array() ) ), 62 | 'value' => $text, 63 | ); 64 | $jf2['summary'] = $jf2['content']['html']; 65 | } 66 | $jf2['author'] = array_filter( 67 | array( 68 | 'type' => 'card', 69 | 'name' => ifset( 
$oembed['author_name'] ), 70 | 'url' => ifset( $oembed['author_url'] ), 71 | ) 72 | ); 73 | $jf2['publication'] = 'Twitter'; 74 | if ( WP_DEBUG ) { 75 | $jf2['_ombed'] = $oembed; 76 | } 77 | 78 | return array_filter( $jf2 ); 79 | } 80 | 81 | } 82 | -------------------------------------------------------------------------------- /lib/html5/HTML5/Parser/InputStream.php: -------------------------------------------------------------------------------- 1 | self::ifset( 'name', $element ), 23 | 'url' => self::ifset( 'url', $element ), 24 | 'photo' => self::ifset( 'avatar', $element ), 25 | ) 26 | ); 27 | } 28 | $return = array_filter( $return ); 29 | if ( 1 === count( $return ) ) { 30 | return $return[0]; 31 | } 32 | return $return; 33 | } 34 | 35 | public static function to_jf2( $content, $url ) { 36 | $return = array_filter( 37 | array( 38 | 'type' => 'feed', 39 | '_feed_type' => 'jsonfeed', 40 | 'name' => self::ifset( 'title', $content ), 41 | 'url' => $url, 42 | 'summary' => self::ifset( 'description', $content ), 43 | 'photo' => self::ifset( 'icon', $content ), 44 | 'author' => self::get_author( $content ), 45 | 'language' => self::ifset( 'language', $content ), 46 | ) 47 | ); 48 | $return['items'] = array(); 49 | foreach ( $content['items'] as $item ) { 50 | $newitem = array_filter( 51 | array( 52 | 'uid' => self::ifset( 'id', $item ), 53 | 'url' => self::ifset( 'url', $item ), 54 | 'in-reply-to' => self::ifset( 'external_url', $item ), 55 | 'name' => self::ifset( 'title', $item ), 56 | 'content' => array_filter( 57 | array( 58 | 'html' => Parse_This::clean_content( self::ifset( 'content_html', $item ) ), 59 | 'text' => self::ifset( 'content_text', $item ), 60 | ) 61 | ), 62 | 'summary' => self::ifset( 'summary', $item ), 63 | 'featured' => self::ifset( 'image', $item ), 64 | 'published' => normalize_iso8601( self::ifset( 'date_published', $item ) ), 65 | 'updated' => normalize_iso8601( self::ifset( 'date_modified', $item ) ), 66 | 'author' => self::get_author( 
$item ), 67 | 'category' => self::ifset( 'tags', $item ), 68 | 'language' => self::ifset( 'language', $item ), 69 | ) 70 | ); 71 | if ( array_key_exists( 'attachments', $item ) ) { 72 | foreach ( $item['attachments'] as $attachment ) { 73 | $type = explode( '/', $attachment['mime_type'] ); 74 | $type = array_shift( $type ); 75 | switch ( $type ) { 76 | case 'audio': 77 | $newitem['audio'] = $attachment['url']; 78 | if ( isset( $attachment['duration_in_seconds'] ) ) { 79 | $newitem['duration'] = seconds_to_iso8601( $attachment['duration_in_seconds'] ); 80 | } 81 | break; 82 | case 'image': 83 | $newitem['photo'] = $attachment['url']; 84 | break; 85 | case 'video': 86 | $newitem['video'] = $attachment['url']; 87 | if ( isset( $attachment['duration_in_seconds'] ) ) { 88 | $newitem['duration'] = seconds_to_iso8601( $attachment['duration_in_seconds'] ); 89 | } 90 | break; 91 | } 92 | } 93 | } 94 | $return['items'][] = $newitem; 95 | } 96 | $return['_last_published'] = self::find_last_published( $return['items'] ); 97 | $return['_last_updated'] = self::find_last_updated( $return['items'] ); 98 | return $return; 99 | } 100 | } 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /lib/html5/HTML5/Parser/TreeBuildingRules.php: -------------------------------------------------------------------------------- 1 | 1, 20 | 'dd' => 1, 21 | 'dt' => 1, 22 | 'rt' => 1, 23 | 'rp' => 1, 24 | 'tr' => 1, 25 | 'th' => 1, 26 | 'td' => 1, 27 | 'thead' => 1, 28 | 'tfoot' => 1, 29 | 'tbody' => 1, 30 | 'table' => 1, 31 | 'optgroup' => 1, 32 | 'option' => 1, 33 | ); 34 | 35 | /** 36 | * Returns true if the given tagname has special processing rules. 37 | */ 38 | public function hasRules($tagname) 39 | { 40 | return isset(static::$tags[$tagname]); 41 | } 42 | 43 | /** 44 | * Evaluate the rule for the current tag name. 45 | * 46 | * This may modify the existing DOM. 47 | * 48 | * @return \DOMElement The new Current DOM element. 
49 | */ 50 | public function evaluate($new, $current) 51 | { 52 | switch ($new->tagName) { 53 | case 'li': 54 | return $this->handleLI($new, $current); 55 | case 'dt': 56 | case 'dd': 57 | return $this->handleDT($new, $current); 58 | case 'rt': 59 | case 'rp': 60 | return $this->handleRT($new, $current); 61 | case 'optgroup': 62 | return $this->closeIfCurrentMatches($new, $current, array( 63 | 'optgroup', 64 | )); 65 | case 'option': 66 | return $this->closeIfCurrentMatches($new, $current, array( 67 | 'option', 68 | )); 69 | case 'tr': 70 | return $this->closeIfCurrentMatches($new, $current, array( 71 | 'tr', 72 | )); 73 | case 'td': 74 | case 'th': 75 | return $this->closeIfCurrentMatches($new, $current, array( 76 | 'th', 77 | 'td', 78 | )); 79 | case 'tbody': 80 | case 'thead': 81 | case 'tfoot': 82 | case 'table': // Spec isn't explicit about this, but it's necessary. 83 | 84 | return $this->closeIfCurrentMatches($new, $current, array( 85 | 'thead', 86 | 'tfoot', 87 | 'tbody', 88 | )); 89 | } 90 | 91 | return $current; 92 | } 93 | 94 | protected function handleLI($ele, $current) 95 | { 96 | return $this->closeIfCurrentMatches($ele, $current, array( 97 | 'li', 98 | )); 99 | } 100 | 101 | protected function handleDT($ele, $current) 102 | { 103 | return $this->closeIfCurrentMatches($ele, $current, array( 104 | 'dt', 105 | 'dd', 106 | )); 107 | } 108 | 109 | protected function handleRT($ele, $current) 110 | { 111 | return $this->closeIfCurrentMatches($ele, $current, array( 112 | 'rt', 113 | 'rp', 114 | )); 115 | } 116 | 117 | protected function closeIfCurrentMatches($ele, $current, $match) 118 | { 119 | if (in_array($current->tagName, $match, true)) { 120 | $current->parentNode->appendChild($ele); 121 | } else { 122 | $current->appendChild($ele); 123 | } 124 | 125 | return $ele; 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /lib/html5/HTML5/Parser/EventHandler.php: 
-------------------------------------------------------------------------------- 1 | ). 65 | * 66 | * @return int one of the Tokenizer::TEXTMODE_* constants 67 | */ 68 | public function startTag($name, $attributes = array(), $selfClosing = false); 69 | 70 | /** 71 | * An end-tag. 72 | */ 73 | public function endTag($name); 74 | 75 | /** 76 | * A comment section (unparsed character data). 77 | */ 78 | public function comment($cdata); 79 | 80 | /** 81 | * A unit of parsed character data. 82 | * 83 | * Entities in this text are *already decoded*. 84 | */ 85 | public function text($cdata); 86 | 87 | /** 88 | * Indicates that the document has been entirely processed. 89 | */ 90 | public function eof(); 91 | 92 | /** 93 | * Emitted when the parser encounters an error condition. 94 | */ 95 | public function parseError($msg, $line, $col); 96 | 97 | /** 98 | * A CDATA section. 99 | * 100 | * @param string $data 101 | * The unparsed character data 102 | */ 103 | public function cdata($data); 104 | 105 | /** 106 | * This is a holdover from the XML spec. 107 | * 108 | * While user agents don't get PIs, server-side does. 109 | * 110 | * @param string $name The name of the processor (e.g. 'php'). 111 | * @param string $data The unparsed data. 112 | */ 113 | public function processingInstruction($name, $data = null); 114 | } 115 | -------------------------------------------------------------------------------- /lib/html5/HTML5/Serializer/Traverser.php: -------------------------------------------------------------------------------- 1 | 'html', 21 | 'http://www.w3.org/1998/Math/MathML' => 'math', 22 | 'http://www.w3.org/2000/svg' => 'svg', 23 | ); 24 | 25 | protected $dom; 26 | 27 | protected $options; 28 | 29 | protected $encode = false; 30 | 31 | protected $rules; 32 | 33 | protected $out; 34 | 35 | /** 36 | * Create a traverser. 37 | * 38 | * @param \DOMNode|\DOMNodeList $dom The document or node to traverse. 39 | * @param resource $out A stream that allows writing. 
The traverser will output into this 40 | * stream. 41 | * @param array $options An array of options for the traverser as key/value pairs. These include: 42 | * - encode_entities: A bool to specify if full encding should happen for all named 43 | * charachter references. Defaults to false which escapes &'<>". 44 | * - output_rules: The path to the class handling the output rules. 45 | */ 46 | public function __construct($dom, $out, RulesInterface $rules, $options = array()) 47 | { 48 | $this->dom = $dom; 49 | $this->out = $out; 50 | $this->rules = $rules; 51 | $this->options = $options; 52 | 53 | $this->rules->setTraverser($this); 54 | } 55 | 56 | /** 57 | * Tell the traverser to walk the DOM. 58 | * 59 | * @return resource $out Returns the output stream. 60 | */ 61 | public function walk() 62 | { 63 | if ($this->dom instanceof \DOMDocument) { 64 | $this->rules->document($this->dom); 65 | } elseif ($this->dom instanceof \DOMDocumentFragment) { 66 | // Document fragments are a special case. Only the children need to 67 | // be serialized. 68 | if ($this->dom->hasChildNodes()) { 69 | $this->children($this->dom->childNodes); 70 | } 71 | } // If NodeList, loop 72 | elseif ($this->dom instanceof \DOMNodeList) { 73 | // If this is a NodeList of DOMDocuments this will not work. 74 | $this->children($this->dom); 75 | } // Else assume this is a DOMNode-like datastructure. 76 | else { 77 | $this->node($this->dom); 78 | } 79 | 80 | return $this->out; 81 | } 82 | 83 | /** 84 | * Process a node in the DOM. 85 | * 86 | * @param mixed $node A node implementing \DOMNode. 
87 | */ 88 | public function node($node) 89 | { 90 | // A listing of types is at http://php.net/manual/en/dom.constants.php 91 | switch ($node->nodeType) { 92 | case XML_ELEMENT_NODE: 93 | $this->rules->element($node); 94 | break; 95 | case XML_TEXT_NODE: 96 | $this->rules->text($node); 97 | break; 98 | case XML_CDATA_SECTION_NODE: 99 | $this->rules->cdata($node); 100 | break; 101 | case XML_PI_NODE: 102 | $this->rules->processorInstruction($node); 103 | break; 104 | case XML_COMMENT_NODE: 105 | $this->rules->comment($node); 106 | break; 107 | // Currently we don't support embedding DTDs. 108 | default: 109 | //print ''; 110 | break; 111 | } 112 | } 113 | 114 | /** 115 | * Walk through all the nodes on a node list. 116 | * 117 | * @param \DOMNodeList $nl A list of child elements to walk through. 118 | */ 119 | public function children($nl) 120 | { 121 | foreach ($nl as $node) { 122 | $this->node($node); 123 | } 124 | } 125 | 126 | /** 127 | * Is an element local? 128 | * 129 | * @param mixed $ele An element that implement \DOMNode. 130 | * 131 | * @return bool true if local and false otherwise. 
132 | */ 133 | public function isLocalElement($ele) 134 | { 135 | $uri = $ele->namespaceURI; 136 | if (empty($uri)) { 137 | return false; 138 | } 139 | 140 | return isset(static::$local_ns[$uri]); 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /lib/html5/RELEASE.md: -------------------------------------------------------------------------------- 1 | # Release Notes 2 | 3 | 2.7.6 (2021-08-18) 4 | 5 | - #218: Address comment handling issues 6 | 7 | 2.7.5 (2021-07-01) 8 | 9 | - #204: Travis: Enable tests on PHP 8.0 10 | - #207: Fix PHP 8.1 deprecations 11 | 12 | 2.7.4 (2020-10-01) 13 | 14 | - #191: Fix travisci build 15 | - #195: Add .gitattributes file with export-ignore rules 16 | - #194: Fix query parameter parsed as character entity 17 | 18 | 2.7.3 (2020-07-05) 19 | 20 | - #190: mitigate cyclic reference between output rules and the traverser objects 21 | 22 | 2.7.2 (2020-07-01) 23 | 24 | - #187: Fixed memory leak in HTML5::saveHTML() 25 | - #186: Add special case for end tag
26 | 27 | 2.7.1 (2020-06-14) 28 | 29 | - #171: add PHP 7.4 job 30 | - #178: Prevent infinite loop on un-terminated entity declaration at EOF 31 | 32 | 2.7.0 (2019-07-25) 33 | 34 | - #164: Drop HHVM support 35 | - #168: Set default encoding in the DOMDocument object 36 | 37 | 2.6.0 (2019-03-10) 38 | 39 | - #163: Allow to pass a charset to the Scanner 40 | 41 | 2.5.0 (2018-12-27) 42 | 43 | - #162, #161, #155, #154, #153, #151: big performance improvements 44 | - #156: fixed typos 45 | - #160: adopt and enforce code style 46 | - #159: remove deprecated php unit base test case 47 | - #150: backport changes from old master branch 48 | 49 | 2.4.0 (2018-11-17) 50 | 51 | - #148: Improve performance by moving sequence matching 52 | - #147: Improve the Tokenizer performance 53 | - #146: Improve performance by relying on a native string instead of InputStream 54 | - #144: Add DOM extension in composer.json 55 | - #145: Add more extensions on composer.json, improve phpdocs and remove dead code 56 | - #143: Remove experimental comment 57 | 58 | 2.3.1 (2018-10-18) 59 | 60 | - #121: Audio is not a block tag (fixed by #141) 61 | - #136: Handle illegal self-closing according to spec (fixed by #137) 62 | - #141: Minor fixes in the README 63 | 64 | 2.3.0 (2017-09-04) 65 | 66 | - #129: image within inline svg breaks system (fixed by #133) 67 | - #131: ² does not work (fixed by #132) 68 | - #134: Improve tokenizer performance by 20% (alternative version of #130 thanks to @MichaelHeerklotz) 69 | - #135: Raw & in attributes 70 | 71 | 2.2.2 (2016-09-22) 72 | 73 | - #116: In XML mode, tags are case sensitive 74 | - #115: Fix PHP Notice in OutputRules 75 | - #112: fix parsing of options of an optgroup 76 | - #111: Adding test for the address tag 77 | 78 | 2.2.1 (2016-05-10) 79 | 80 | - #109: Fixed issue where address tag could be written without closing tag (thanks sylus) 81 | 82 | 2.2.0 (2016-04-11) 83 | 84 | - #105: Enable composer cache (for CI/CD) 85 | - #100: Use 
mb_substitute_character instead of ini_set for environments where ini_set is disabled (e.g., shared hosting) 86 | - #98: Allow link, meta, style tags in noscript tags 87 | - #96: Fixed xml:href on svgs that use the "use" breaking 88 | - #94: Counting UTF8 characters performance improvement 89 | - #93: Use newer version of coveralls package 90 | - #90: Remove duplicate test 91 | - #87: Allow multiple root nodes 92 | 93 | 2.1.2 (2015-06-07) 94 | - #82: Support for PHP7 95 | - #84: Improved boolean attribute handling 96 | 97 | 2.1.1 (2015-03-23) 98 | - #78: Fixes bug where unmatched entity-like string drops everything after &. 99 | 100 | 2.1.0 (2015-02-01) 101 | - #74: Added `disable_html_ns` and `target_doc` dom parsing options 102 | - Unified option names 103 | - #73: Fixed alphabet, ß now can be detected 104 | - #75 and #76: Allow whitespace in RCDATA tags 105 | - #77: Fixed parsing blunder for json embeds 106 | - #72: Add options to HTML methods 107 | 108 | 2.0.2 (2014-12-17) 109 | - #50: empty document handling 110 | - #63: tags with strange capitalization 111 | - #65: dashes and underscores as allowed characters in tag names 112 | - #68: Fixed issue with non-inline elements inside inline containers 113 | 114 | 2.0.1 (2014-09-23) 115 | - #59: Fixed issue parsing some fragments. 116 | - #56: Incorrectly saw 0 as empty string 117 | - Sami as new documentation generator 118 | 119 | 2.0.0 (2014-07-28) 120 | - #53: Improved boolean attributes handling 121 | - #52: Facebook HHVM compatibility 122 | - #48: Adopted PSR-2 as coding standard 123 | - #47: Moved everything to Masterminds namespace 124 | - #45: Added custom namespaces 125 | - #44: Added support to XML-style namespaces 126 | - #37: Refactored HTML5 class removing static methods 127 | 128 | 1.0.5 (2014-06-10) 129 | - #38: Set the dev-master branch as the 1.0.x branch for composer (goetas) 130 | - #34: Tests use PSR-4 for autoloading. (goetas) 131 | - #40, #41: Fix entity handling in RCDATA sections. 
(KitaitiMakoto) 132 | - #32: Fixed issue where character references were being incorrectly encoded in style tags. 133 | 134 | 1.0.4 (2014-04-29) 135 | - #30/#31 Don't throw an exception for invalid tag names. 136 | 137 | 1.0.3 (2014-02-28) 138 | - #23 and #29: Ignore attributes with illegal chars in name for the PHP DOM. 139 | 140 | 1.0.2 (2014-02-12) 141 | - #23: Handle missing tag close in attribute list. 142 | - #25: Fixed text escaping in the serializer (HTML5 8.3). 143 | - #27: Fixed tests on Windows: changed "\n" -> PHP_EOL. 144 | - #28: Fixed infinite loop for char "&" in unquoted attribute in parser. 145 | - #26: Updated tag name case handling to deal with uppercase usage. 146 | - #24: Newlines and tabs are allowed inside quoted attributes (HTML5 8.2.4). 147 | - Fixed Travis CI testing. 148 | 149 | 1.0.1 (2013-11-07) 150 | - CDATA encoding is improved. (Non-standard; Issue #19) 151 | - Some parser rules were not returning the new current element. (Issue #20) 152 | - Added, to the README, details on code test coverage and to packagist version. 153 | - Fixed processor instructions. 154 | - Improved test coverage and documentation coverage. 155 | 156 | 1.0.0 (2013-10-02) 157 | - Initial release. 158 | -------------------------------------------------------------------------------- /includes/class-rest-parse-this.php: -------------------------------------------------------------------------------- 1 | 34 |
35 |

36 |

37 | 38 |

39 | 40 |

41 | 46 |


47 |
48 |

49 | 50 |

51 | 52 | 53 | 54 | 57 | 60 | 61 | 62 | 65 | 68 | 69 | 70 | 73 | 76 | 77 | 78 | 81 | 84 | 85 | 86 | 89 | 95 | 96 | 97 | 100 | 103 | 104 | 105 | 106 | 107 | 108 |
109 |
110 | WP_REST_Server::READABLE, 125 | 'callback' => array( $cls, 'read' ), 126 | 'args' => array( 127 | 'url' => array( 128 | 'required' => true, 129 | 'validate_callback' => array( $cls, 'is_valid_url' ), 130 | 'sanitize_callback' => 'esc_url_raw', 131 | ), 132 | ), 133 | 'permission_callback' => function () { 134 | return current_user_can( 'read' ); 135 | }, 136 | ), 137 | ) 138 | ); 139 | } 140 | 141 | public static function read( $request ) { 142 | $url = $request->get_param( 'url' ); 143 | $mf2 = $request->get_param( 'mf2' ); 144 | $return = $request->get_param( 'return' ); 145 | $refs = $request->get_param( 'references' ); 146 | $discovery = $request->get_param( 'discovery' ); 147 | $location = $request->get_param( 'location' ); 148 | $follow = $request->get_param( 'follow' ); 149 | if ( $discovery ) { 150 | $parse = new Parse_This_Discovery(); 151 | return $parse->fetch( $url ); 152 | } 153 | $parse = new Parse_This( $url ); 154 | $r = $parse->fetch(); 155 | 156 | if ( is_wp_error( $r ) ) { 157 | return $r; 158 | } 159 | $parse->parse( 160 | array( 161 | 'return' => $return, 162 | 'follow' => $follow, 163 | 'references' => $refs, 164 | 'location' => $location, 165 | ) 166 | ); 167 | if ( $mf2 ) { 168 | return $parse->get( 'mf2' ); 169 | } 170 | return $parse->get(); 171 | } 172 | 173 | /** 174 | * Returns if valid URL for REST validation 175 | * 176 | * @param string $url 177 | * 178 | * @return boolean 179 | */ 180 | public static function is_valid_url( $url, $request = null, $key = null ) { 181 | return wp_http_validate_url( $url ); 182 | } 183 | 184 | 185 | public static function addscheme( $url, $scheme = 'http://' ) { 186 | return wp_parse_url( $url, PHP_URL_SCHEME ) === null ? $scheme . 
$url : $url; 187 | } 188 | 189 | } 190 | 191 | new REST_Parse_This(); 192 | -------------------------------------------------------------------------------- /includes/class-parse-this-instagram.php: -------------------------------------------------------------------------------- 1 | query( '//script' ) as $script ) { 16 | if ( preg_match( '/window\._sharedData = ({.+});/', $script->textContent, $match ) ) { // phpcs:ignore 17 | $data = json_decode( $match[1], true ); 18 | } 19 | } 20 | if ( empty( $data ) ) { 21 | return array(); 22 | } 23 | 24 | $jf2 = array(); 25 | if ( $data && is_array( $data ) && array_key_exists( 'entry_data', $data ) ) { 26 | if ( is_array( $data['entry_data'] ) ) { 27 | if ( array_key_exists( 'PostPage', $data['entry_data'] ) ) { 28 | // Photo Page 29 | $jf2 = self::html_photo( $data, $url ); 30 | } elseif ( array_key_exists( 'LocationsPage', $data['entry_data'] ) ) { 31 | // Locations Page 32 | $jf2 = self::html_location( $data, $url ); 33 | } elseif ( array_key_exists( 'LoginAndSignupPage', $data['entry_data'] ) ) { 34 | return array(); 35 | } 36 | } 37 | } 38 | if ( WP_DEBUG ) { 39 | $jf2['_ig'] = $data; 40 | } 41 | return array_filter( $jf2 ); 42 | } 43 | 44 | private static function html_location( $data, $url ) { 45 | $post = $data['entry_data']['LocationsPage']; 46 | if ( isset( $post[0]['graphql']['location'] ) ) { 47 | $data = $post[0]['graphql']['location']; 48 | } else { 49 | return array(); 50 | } 51 | return self::json_location( $data, $url ); 52 | } 53 | 54 | private static function json_location( $data, $url ) { 55 | $address = isset( $data['address_json'] ) ? 
json_decode( $data['address_json'], true ) : array(); 56 | $jf2 = array( 57 | 'address' => $address, 58 | 'name' => ifset( $data['name'] ), 59 | 'latitude' => ifset( $data['lat'] ), 60 | 'longitude' => ifset( $data['lng'] ), 61 | 'url' => ifset( $data['website'] ), 62 | 'street_address' => ifset( $address['street_address'] ), 63 | 'postal_code' => ifset( $address['zip_code'] ), 64 | 'region' => ifset( $address['region_name'] ), 65 | 'country' => ifset( $address['country_code'] ), 66 | ); 67 | return array_filter( $jf2 ); 68 | } 69 | 70 | private static function feed( $data, $url ) { 71 | return self::profile( $data ); 72 | } 73 | 74 | private static function html_photo( $data, $url ) { 75 | $post = $data['entry_data']['PostPage']; 76 | if ( isset( $post[0]['graphql']['shortcode_media'] ) ) { 77 | $data = $post[0]['graphql']['shortcode_media']; 78 | } elseif ( isset( $post[0]['graphql']['media'] ) ) { 79 | $data = $post[0]['graphql']['media']; 80 | } elseif ( isset( $post[0]['media'] ) ) { 81 | $data = $post[0]['media']; 82 | } 83 | return self::json_photo( $data, $url ); 84 | } 85 | 86 | public static function json_photo( $data, $url ) { 87 | // Start building the h-entry 88 | $entry = array( 89 | 'type' => 'entry', 90 | 'url' => $url, 91 | ); 92 | 93 | // Content and hashtags 94 | $caption = false; 95 | 96 | if ( isset( $data['caption'] ) ) { 97 | $caption = $data['caption']; 98 | } elseif ( isset( $data['edge_media_to_caption']['edges'][0]['node']['text'] ) ) { 99 | $caption = $data['edge_media_to_caption']['edges'][0]['node']['text']; 100 | } 101 | 102 | if ( $caption ) { 103 | if ( preg_match_all( '/#([a-z0-9_-]+)/i', $caption, $matches ) ) { 104 | $entry['category'] = array(); 105 | foreach ( $matches[1] as $match ) { 106 | $entry['category'][] = $match; 107 | } 108 | } 109 | 110 | $entry['content'] = array( 111 | 'text' => $caption, 112 | ); 113 | } 114 | 115 | // Include the photo/video media URLs 116 | // (Always return arrays, even for single images) 117 | 
if ( array_key_exists( 'edge_sidecar_to_children', $data ) ) { 118 | $entry['photo'] = array(); 119 | foreach ( $data['edge_sidecar_to_children']['edges'] as $edge ) { 120 | $entry['photo'][] = $edge['node']['display_url']; 121 | } 122 | } else { 123 | // Single photo or video 124 | if ( array_key_exists( 'display_src', $data ) ) { 125 | $entry['photo'] = array( $data['display_src'] ); 126 | } elseif ( array_key_exists( 'display_url', $data ) ) { 127 | $entry['photo'] = array( $data['display_url'] ); 128 | } 129 | 130 | if ( isset( $data['is_video'] ) && $data['is_video'] && isset( $data['video_url'] ) ) { 131 | $entry['video'] = array( $data['video_url'] ); 132 | } 133 | } 134 | 135 | // Published date 136 | $published = new Datetime(); 137 | if ( isset( $data['taken_at_timestamp'] ) ) { 138 | $published->setTimestamp( $data['taken_at_timestamp'] ); 139 | } elseif ( isset( $data['date'] ) ) { 140 | $published = new DateTime( $data['date'] ); 141 | } 142 | $entry['published'] = $published->format( DATE_W3C ); 143 | if ( isset( $data['location'] ) ) { 144 | $entry['location'] = array(); 145 | if ( isset( $data['location']['address_json'] ) ) { 146 | $address = json_decode( $data['location']['address_json'], true ); // decoding can fail or omit keys, so every field is read defensively, as json_location() does 147 | if ( is_array( $address ) ) { $entry['location'] = array( 148 | 'street_address' => ifset( $address['street_address'] ), 149 | 'postal_code' => ifset( $address['zip_code'] ), 150 | 'region' => ifset( $address['region_name'] ), 151 | 'country' => ifset( $address['country_code'] ), 152 | ); } 153 | } 154 | $entry['location']['name'] = ifset( $data['location']['name'] ); // the name is optional; empty values are dropped by array_filter() below 155 | if ( isset( $data['location']['id'] ) ) { $entry['location']['url'] = sprintf( 'https://www.instagram.com/explore/locations/%1$s', $data['location']['id'] ); } // only link the location page when an id is present 156 | $entry['location'] = array_filter( $entry['location'] ); 157 | } 158 | if ( isset( $data['owner'] ) ) { 159 | $entry['author'] = array( 160 | 'type' => 'card', 161 | 'name' => ifset( $data['owner']['full_name'] ), 162 | 'nickname' => ifset( $data['owner']['username'] ), 163 | 'url' => sprintf( 'https://www.instagram.com/%1$s/', 
$data['owner']['username'] ), 164 | 'photo' => ifset( $data['owner']['profile_pic_url'] ), 165 | ); 166 | } 167 | return $entry; 168 | } 169 | 170 | private static function profile( $data ) { 171 | if ( isset( $data['entry_data']['ProfilePage'][0] ) ) { 172 | $profile = $data['entry_data']['ProfilePage'][0]; 173 | if ( $profile && isset( $profile['graphql']['user'] ) ) { 174 | $user = $profile['graphql']['user']; 175 | return $user; 176 | } 177 | } 178 | return array(); 179 | } 180 | 181 | 182 | 183 | } 184 | -------------------------------------------------------------------------------- /lib/mf2/LICENSE.md: -------------------------------------------------------------------------------- 1 | # Creative Commons Legal Code 2 | 3 | ## CC0 1.0 Universal 4 | 5 | http://creativecommons.org/publicdomain/zero/1.0 6 | 7 | Official translations of this legal tool are available> CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED HEREUNDER. 8 | 9 | ### _Statement of Purpose_ 10 | 11 | The laws of most jurisdictions throughout the world automatically confer exclusive Copyright and Related Rights (defined below) upon the creator and subsequent owner(s) (each and all, an "owner") of an original work of authorship and/or a database (each, a "Work"). 
12 | 13 | Certain owners wish to permanently relinquish those rights to a Work for the purpose of contributing to a commons of creative, cultural and scientific works ("Commons") that the public can reliably and without fear of later claims of infringement build upon, modify, incorporate in other works, reuse and redistribute as freely as possible in any form whatsoever and for any purposes, including without limitation commercial purposes. These owners may contribute to the Commons to promote the ideal of a free culture and the further production of creative, cultural and scientific works, or to gain reputation or greater distribution for their Work in part through the use and efforts of others. 14 | 15 | For these and/or other purposes and motivations, and without any expectation of additional consideration or compensation, the person associating CC0 with a Work (the "Affirmer"), to the extent that he or she is an owner of Copyright and Related Rights in the Work, voluntarily elects to apply CC0 to the Work and publicly distribute the Work under its terms, with knowledge of his or her Copyright and Related Rights in the Work and the meaning and intended legal effect of CC0 on those rights. 16 | 17 | **1. Copyright and Related Rights.** A Work made available under CC0 may be protected by copyright and related or neighboring rights ("Copyright and Related Rights"). Copyright and Related Rights include, but are not limited to, the following: 18 | 19 | 1. the right to reproduce, adapt, distribute, perform, display, communicate, and translate a Work; 20 | 2. moral rights retained by the original author(s) and/or performer(s); 21 | 3. publicity and privacy rights pertaining to a person's image or likeness depicted in a Work; 22 | 4. rights protecting against unfair competition in regards to a Work, subject to the limitations in paragraph 4(a), below; 23 | 5. rights protecting the extraction, dissemination, use and reuse of data in a Work; 24 | 6. 
database rights (such as those arising under Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, and under any national implementation thereof, including any amended or successor version of such directive); and 25 | 7. other similar, equivalent or corresponding rights throughout the world based on applicable law or treaty, and any national implementations thereof. 26 | 27 | **2. Waiver.** To the greatest extent permitted by, but not in contravention of, applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and unconditionally waives, abandons, and surrenders all of Affirmer's Copyright and Related Rights and associated claims and causes of action, whether now known or unknown (including existing as well as future claims and causes of action), in the Work (i) in all territories worldwide, (ii) for the maximum duration provided by applicable law or treaty (including future time extensions), (iii) in any current or future medium and for any number of copies, and (iv) for any purpose whatsoever, including without limitation commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each member of the public at large and to the detriment of Affirmer's heirs and successors, fully intending that such Waiver shall not be subject to revocation, rescission, cancellation, termination, or any other legal or equitable action to disrupt the quiet enjoyment of the Work by the public as contemplated by Affirmer's express Statement of Purpose. 28 | 29 | **3. Public License Fallback.** Should any part of the Waiver for any reason be judged legally invalid or ineffective under applicable law, then the Waiver shall be preserved to the maximum extent permitted taking into account Affirmer's express Statement of Purpose. 
In addition, to the extent the Waiver is so judged Affirmer hereby grants to each affected person a royalty-free, non transferable, non sublicensable, non exclusive, irrevocable and unconditional license to exercise Affirmer's Copyright and Related Rights in the Work (i) in all territories worldwide, (ii) for the maximum duration provided by applicable law or treaty (including future time extensions), (iii) in any current or future medium and for any number of copies, and (iv) for any purpose whatsoever, including without limitation commercial, advertising or promotional purposes (the "License"). The License shall be deemed effective as of the date CC0 was applied by Affirmer to the Work. Should any part of the License for any reason be judged legally invalid or ineffective under applicable law, such partial invalidity or ineffectiveness shall not invalidate the remainder of the License, and in such case Affirmer hereby affirms that he or she will not (i) exercise any of his or her remaining Copyright and Related Rights in the Work or (ii) assert any associated claims and causes of action with respect to the Work, in either case contrary to Affirmer's express Statement of Purpose. 30 | 31 | **4. Limitations and Disclaimers.** 32 | 33 | 1. No trademark or patent rights held by Affirmer are waived, abandoned, surrendered, licensed or otherwise affected by this document. 34 | 2. Affirmer offers the Work as-is and makes no representations or warranties of any kind concerning the Work, express, implied, statutory or otherwise, including without limitation warranties of title, merchantability, fitness for a particular purpose, non infringement, or the absence of latent or other defects, accuracy, or the present or absence of errors, whether or not discoverable, all to the greatest extent permissible under applicable law. 35 | 3. 
Affirmer disclaims responsibility for clearing rights of other persons that may apply to the Work or any use thereof, including without limitation any person's Copyright and Related Rights in the Work. Further, Affirmer disclaims responsibility for obtaining any necessary consents, permissions or other rights required for any use of the Work. 36 | 4. Affirmer understands and acknowledges that Creative Commons is not a party to this document and has no duty or obligation with respect to this CC0 or use of the Work. 37 | -------------------------------------------------------------------------------- /includes/compat-functions.php: -------------------------------------------------------------------------------- 1 | getTimestamp(); 36 | } 37 | } 38 | 39 | 40 | if ( ! function_exists( 'get_post_datetime' ) ) { 41 | /** 42 | * Retrieve post published or modified time as a `DateTime` object instance. 43 | * 44 | * The object will be set to the timezone from WordPress settings. 45 | * 46 | * @since 5.3.0 - backported to Parse This 47 | * 48 | * @param int|WP_Post $post Optional. WP_Post object or ID. Default is global `$post` object. 49 | * @param string $field Optional. Post field to use. Accepts 'date' or 'modified'. 50 | * @return DateTime|false Time object on success, false on failure. 51 | */ 52 | function get_post_datetime( $post = null, $field = 'date' ) { 53 | $post = get_post( $post ); 54 | if ( ! $post ) { 55 | return false; 56 | } 57 | $time = ( 'modified' === $field ) ? $post->post_modified : $post->post_date; 58 | if ( empty( $time ) || '0000-00-00 00:00:00' === $time ) { 59 | return false; 60 | } 61 | return date_create_immutable_from_format( 'Y-m-d H:i:s', $time, wp_timezone() ); 62 | } 63 | } 64 | 65 | if ( ! function_exists( 'wp_timezone_string' ) ) { 66 | /** 67 | * Retrieves the timezone from site settings as a string. 68 | * 69 | * Uses the `timezone_string` option to get a proper timezone if available, 70 | * otherwise falls back to an offset. 
71 | * 72 | * @since 5.3.0 - backported into Parse This 73 | * 74 | * @return string PHP timezone string or a ±HH:MM offset. 75 | */ 76 | function wp_timezone_string() { 77 | $timezone_string = get_option( 'timezone_string' ); 78 | if ( $timezone_string ) { 79 | return $timezone_string; 80 | } 81 | $offset = (float) get_option( 'gmt_offset' ); 82 | $hours = (int) $offset; 83 | $minutes = ( $offset - $hours ); 84 | $sign = ( $offset < 0 ) ? '-' : '+'; 85 | $abs_hour = abs( $hours ); 86 | $abs_mins = abs( $minutes * 60 ); 87 | $tz_offset = sprintf( '%s%02d:%02d', $sign, $abs_hour, $abs_mins ); 88 | return $tz_offset; 89 | } 90 | } 91 | 92 | if ( ! function_exists( 'wp_timezone' ) ) { 93 | /** 94 | * Retrieves the timezone from site settings as a `DateTimeZone` object. 95 | * 96 | * Timezone can be based on a PHP timezone string or a ±HH:MM offset. 97 | * 98 | * @since 5.3.0 - backported into Parse This 99 | * 100 | * @return DateTimeZone Timezone object. 101 | */ 102 | function wp_timezone() { 103 | return new DateTimeZone( wp_timezone_string() ); 104 | } 105 | } 106 | 107 | 108 | if ( ! function_exists( 'wp_date' ) ) { 109 | /** 110 | * Retrieves the date, in localized format. 111 | * 112 | * This is a newer function, intended to replace `date_i18n()` without legacy quirks in it. 113 | * 114 | * Note that, unlike `date_i18n()`, this function accepts a true Unix timestamp, not summed 115 | * with timezone offset. 116 | * 117 | * @since 5.3.0 - backported to Parse This 118 | * 119 | * @param string $format PHP date format. 120 | * @param int $timestamp Optional. Unix timestamp. Defaults to current time. 121 | * @param DateTimeZone $timezone Optional. Timezone to output result in. Defaults to timezone 122 | * from site settings. 123 | * @return string|false The date, translated if locale specifies it. False on invalid timestamp input. 
124 | */ 125 | function wp_date( $format, $timestamp = null, $timezone = null ) { 126 | global $wp_locale; 127 | if ( null === $timestamp ) { 128 | $timestamp = time(); 129 | } elseif ( ! is_numeric( $timestamp ) ) { 130 | return false; 131 | } 132 | if ( ! $timezone ) { 133 | $timezone = wp_timezone(); 134 | } 135 | $datetime = date_create( '@' . $timestamp ); 136 | $datetime->setTimezone( $timezone ); 137 | if ( empty( $wp_locale->month ) || empty( $wp_locale->weekday ) ) { 138 | $date = $datetime->format( $format ); 139 | } else { 140 | // We need to unpack shorthand `r` format because it has parts that might be localized. 141 | $format = preg_replace( '/(?get_month( $datetime->format( 'm' ) ); 145 | $weekday = $wp_locale->get_weekday( $datetime->format( 'w' ) ); 146 | for ( $i = 0; $i < $format_length; $i ++ ) { 147 | switch ( $format[ $i ] ) { 148 | case 'D': 149 | $new_format .= backslashit( $wp_locale->get_weekday_abbrev( $weekday ) ); 150 | break; 151 | case 'F': 152 | $new_format .= backslashit( $month ); 153 | break; 154 | case 'l': 155 | $new_format .= backslashit( $weekday ); 156 | break; 157 | case 'M': 158 | $new_format .= backslashit( $wp_locale->get_month_abbrev( $month ) ); 159 | break; 160 | case 'a': 161 | $new_format .= backslashit( $wp_locale->get_meridiem( $datetime->format( 'a' ) ) ); 162 | break; 163 | case 'A': 164 | $new_format .= backslashit( $wp_locale->get_meridiem( $datetime->format( 'A' ) ) ); 165 | break; 166 | case '\\': 167 | $new_format .= $format[ $i ]; 168 | // If a character follows the slash, we add it without translating. A trailing slash has no following character, so the bound must be checked against $i + 1 to avoid reading past the end of the string. 169 | if ( $i + 1 < $format_length ) { 170 | $new_format .= $format[ ++$i ]; 171 | } 172 | break; 173 | default: 174 | $new_format .= $format[ $i ]; 175 | break; 176 | } 177 | } 178 | $date = $datetime->format( $new_format ); 179 | $date = wp_maybe_decline_date( $date ); 180 | } 181 | /** 182 | * Filters the date formatted based on the locale. 
183 | * 184 | * @since 5.3.0 but backported to Parse This 185 | * 186 | * @param string $date Formatted date string. 187 | * @param string $format Format to display the date. 188 | * @param int $timestamp Unix timestamp. 189 | * @param DateTimeZone $timezone Timezone. 190 | */ 191 | $date = apply_filters( 'wp_date', $date, $format, $timestamp, $timezone ); 192 | return $date; 193 | } 194 | } 195 | 196 | if ( ! function_exists( 'str_contains' ) ) { // Polyfill for the native PHP 8 str_contains() on older runtimes. 197 | function str_contains( $haystack, $needle ) { // Returns true when $needle occurs in $haystack; an empty needle is contained in every string, matching the native function. 198 | return '' === $needle || false !== mb_strpos( $haystack, $needle ); 199 | } 200 | } 201 | -------------------------------------------------------------------------------- /lib/html5/HTML5/Parser/UTF8Utils.php: -------------------------------------------------------------------------------- 1 | 9 | 10 | Permission is hereby granted, free of charge, to any person obtaining a 11 | copy of this software and associated documentation files (the 12 | "Software"), to deal in the Software without restriction, including 13 | without limitation the rights to use, copy, modify, merge, publish, 14 | distribute, sublicense, and/or sell copies of the Software, and to 15 | permit persons to whom the Software is furnished to do so, subject to 16 | the following conditions: 17 | 18 | The above copyright notice and this permission notice shall be included 19 | in all copies or substantial portions of the Software. 20 | 21 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 22 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 23 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 24 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 25 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 26 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 27 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
28 | */ 29 | 30 | use Masterminds\HTML5\Exception; 31 | 32 | class UTF8Utils 33 | { 34 | /** 35 | * The Unicode replacement character. 36 | */ 37 | const FFFD = "\xEF\xBF\xBD"; 38 | 39 | /** 40 | * Count the number of characters in a string. 41 | * UTF-8 aware. This will try (in order) mbstring, iconv, utf8_decode(), and finally a custom counter. 42 | * 43 | * @param string $string 44 | * 45 | * @return int 46 | */ 47 | public static function countChars($string) 48 | { 49 | // Get the length for the string we need. 50 | if (function_exists('mb_strlen')) { 51 | return mb_strlen($string, 'utf-8'); 52 | } 53 | 54 | if (function_exists('iconv_strlen')) { 55 | return iconv_strlen($string, 'utf-8'); 56 | } 57 | 58 | if (function_exists('utf8_decode')) { 59 | // MPB: Will this work? Won't certain decodes lead to two chars 60 | // extrapolated out of 2-byte chars? 61 | return strlen(utf8_decode($string)); 62 | } 63 | 64 | $count = count_chars($string); // per-byte-value frequencies; the sum below adds ASCII bytes and the bytes 0xC2-0xF4 that start a multi-byte UTF-8 sequence 65 | 66 | // 0x80 = 0x7F - 0 + 1 (one added to get inclusive range) 67 | // 0x33 = 0xF4 - 0xC2 + 1 (one added to get inclusive range) 68 | return array_sum(array_slice($count, 0, 0x80)) + array_sum(array_slice($count, 0xC2, 0x33)); 69 | } 70 | 71 | /** 72 | * Convert data from the given encoding to UTF-8. 73 | * 74 | * This has not yet been tested with character sets other than UTF-8. 75 | * It should work with ISO-8859-1/-13 and standard Latin Win charsets. 76 | * 77 | * @param string $data The data to convert 78 | * @param string $encoding A valid encoding. 
Examples: http://www.php.net/manual/en/mbstring.supported-encodings.php 79 | * 80 | * @return string 81 | */ 82 | public static function convertToUTF8($data, $encoding = 'UTF-8') 83 | { 84 | /* 85 | * From the HTML5 spec: Given an encoding, the bytes in the input stream must be converted 86 | * to Unicode characters for the tokeniser, as described by the rules for that encoding, 87 | * except that the leading U+FEFF BYTE ORDER MARK character, if any, must not be stripped 88 | * by the encoding layer (it is stripped by the rule below). Bytes or sequences of bytes 89 | * in the original byte stream that could not be converted to Unicode characters must be 90 | * converted to U+FFFD REPLACEMENT CHARACTER code points. 91 | */ 92 | 93 | // mb_convert_encoding is chosen over iconv because of a bug. The best 94 | // details for the bug are on http://us1.php.net/manual/en/function.iconv.php#108643 95 | // which contains links to the actual bug reports as well as workaround 96 | // details. 97 | if (function_exists('mb_convert_encoding')) { 98 | // mb library has the following behaviors: 99 | // - UTF-16 surrogates result in false. 100 | // - Overlongs and outside Plane 16 result in empty strings. 101 | 102 | // Before we run mb_convert_encoding we need to tell it what to do with 103 | // characters it does not know. This could be different than the parent 104 | // application executing this library so we store the value, change it 105 | // to our needs, and then change it back when we are done. This feels 106 | // a little excessive and it would be great if there was a better way. 107 | $save = mb_substitute_character(); 108 | mb_substitute_character('none'); // NOTE(review): 'none' drops unconvertible bytes instead of emitting U+FFFD as the spec text above describes - confirm this is intended 109 | $data = mb_convert_encoding($data, 'UTF-8', $encoding); 110 | mb_substitute_character($save); 111 | } 112 | // @todo Get iconv running in at least some environments if that is possible. 
113 | elseif (function_exists('iconv') && 'auto' !== $encoding) { 114 | // fprintf(STDOUT, "iconv found\n"); 115 | // iconv has the following behaviors: 116 | // - Overlong representations are ignored. 117 | // - Beyond Plane 16 is replaced with a lower char. 118 | // - Incomplete sequences generate a warning. 119 | $data = @iconv($encoding, 'UTF-8//IGNORE', $data); 120 | } else { 121 | throw new Exception('Not implemented, please install mbstring or iconv'); 122 | } 123 | 124 | /* 125 | * One leading U+FEFF BYTE ORDER MARK character must be ignored if any are present. 126 | */ 127 | if ("\xEF\xBB\xBF" === substr($data, 0, 3)) { 128 | $data = substr($data, 3); 129 | } 130 | 131 | return $data; 132 | } 133 | 134 | /** 135 | * Checks for Unicode code points that are not valid in a document. 136 | * 137 | * @param string $data A string to analyze 138 | * 139 | * @return array An array of (string) error messages produced by the scanning 140 | */ 141 | public static function checkForIllegalCodepoints($data) 142 | { 143 | // Vestigial error handling. 144 | $errors = array(); 145 | 146 | /* 147 | * All U+0000 null characters in the input must be replaced by U+FFFD REPLACEMENT CHARACTERs. 148 | * Any occurrence of such a character is a parse error. 149 | */ 150 | for ($i = 0, $count = substr_count($data, "\0"); $i < $count; ++$i) { 151 | $errors[] = 'null-character'; 152 | } 153 | 154 | /* 155 | * Any occurrences of any characters in the ranges U+0001 to U+0008, U+000B, U+000E to U+001F, U+007F 156 | * to U+009F, U+D800 to U+DFFF, U+FDD0 to U+FDEF, and characters U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, 157 | * U+2FFFE, U+2FFFF, U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE, U+6FFFF, U+7FFFE, 158 | * U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF, U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, 159 | * U+DFFFE, U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, and U+10FFFF are parse errors. 
160 | * (These are all control characters or permanently undefined Unicode characters.) 161 | */ 162 | // NOTE: no explicit check is made here; preg_match_all() assumes PCRE is available. 163 | $count = preg_match_all( 164 | '/(?: 165 | [\x01-\x08\x0B\x0E-\x1F\x7F] # U+0001 to U+0008, U+000B, U+000E to U+001F and U+007F 166 | | 167 | \xC2[\x80-\x9F] # U+0080 to U+009F 168 | | 169 | \xED(?:\xA0[\x80-\xFF]|[\xA1-\xBE][\x00-\xFF]|\xBF[\x00-\xBF]) # U+D800 to U+DFFFF 170 | | 171 | \xEF\xB7[\x90-\xAF] # U+FDD0 to U+FDEF 172 | | 173 | \xEF\xBF[\xBE\xBF] # U+FFFE and U+FFFF 174 | | 175 | [\xF0-\xF4][\x8F-\xBF]\xBF[\xBE\xBF] # U+nFFFE and U+nFFFF (1 <= n <= 10_{16}) 176 | )/x', $data, $matches); 177 | for ($i = 0; $i < $count; ++$i) { 178 | $errors[] = 'invalid-codepoint'; 179 | } 180 | 181 | return $errors; 182 | } 183 | } 184 | -------------------------------------------------------------------------------- /includes/class-parse-this-base.php: -------------------------------------------------------------------------------- 1 | format( DATE_W3C ); 40 | } 41 | 42 | public static function validate_email( $email ) { 43 | return filter_var( $email, FILTER_VALIDATE_EMAIL ); 44 | } 45 | 46 | /** 47 | * Formats the 'updated' value of the first item returned by order_by_date() as a W3C date string, passing wp_timezone() as the DateTime timezone. NOTE(review): assumes $items is non-empty and that order_by_date() puts the most recent item first - confirm. 48 | */ 49 | protected static function find_last_updated( $items ) { 50 | $items = self::order_by_date( $items, 'updated' ); 51 | $return = new DateTime( $items[0]['updated'], wp_timezone() ); 52 | return $return->format( DATE_W3C ); 53 | } 54 | 55 | /** 56 | * Utility method to limit an array to 100 values. 57 | * Originally set to 50 but some sites are very detailed in their meta. 58 | * 59 | * @ignore 60 | * @since 4.2.0 61 | * 62 | * @param array $value Array to limit. 63 | * @return array Original array if fewer than 100 values, limited array, empty array otherwise. 
64 | */ 65 | protected static function limit_array( $value ) { 66 | if ( is_array( $value ) ) { 67 | if ( count( $value ) > 100 ) { 68 | return array_slice( $value, 0, 100 ); 69 | } 70 | 71 | return $value; 72 | } 73 | 74 | return array(); 75 | } 76 | 77 | /** 78 | * Utility method to limit the length of a given string to 5,000 characters. 79 | * 80 | * @ignore 81 | * @since 4.2.0 82 | * 83 | * @param string $value String to limit. 84 | * @return bool|int|string If boolean or integer, that value. If a string, the original value 85 | * if fewer than 5,000 characters, a truncated version, otherwise an 86 | * empty string. 87 | */ 88 | protected static function limit_string( $value ) { 89 | $return = ''; 90 | if ( is_numeric( $value ) || is_bool( $value ) ) { 91 | $return = $value; 92 | } elseif ( is_string( $value ) ) { 93 | if ( mb_strlen( $value ) > 5000 ) { 94 | $return = mb_substr( $value, 0, 5000 ); 95 | } else { 96 | $return = $value; 97 | } 98 | $return = sanitize_text_field( trim( $return ) ); 99 | } 100 | 101 | return $return; 102 | } 103 | 104 | /** 105 | * Utility method to limit a given URL to 2,048 characters. 106 | * 107 | * @ignore 108 | * @since 4.2.0 109 | * 110 | * @param string $url URL to check for length and validity. 111 | * @param string $source_url URL URL to use to resolve relative URLs 112 | * @return string Escaped URL if of valid length (< 2048) and makeup. Empty string otherwise. 113 | */ 114 | protected static function limit_url( $url, $source_url ) { 115 | if ( ! is_string( $url ) ) { 116 | return ''; 117 | } 118 | 119 | // HTTP 1.1 allows 8000 chars but the "de-facto" standard supported in all current browsers is 2048. 120 | if ( strlen( $url ) > 2048 ) { 121 | return ''; // Return empty rather than a truncated/invalid URL 122 | } 123 | 124 | // Does not look like a URL. 125 | if ( ! 
filter_var( $url, FILTER_VALIDATE_URL ) ) { 126 | return ''; 127 | } 128 | 129 | $url = pt_make_absolute_url( $url, $source_url ); 130 | 131 | return esc_url_raw( $url, array( 'http', 'https' ) ); 132 | } 133 | 134 | /** 135 | * Utility method to limit image source URLs. 136 | * 137 | * Excluded URLs include share-this type buttons, loaders, spinners, spacers, WordPress interface images, 138 | * tiny buttons or thumbs, mathtag.com or quantserve.com images, or the WordPress.com stats gif. 139 | * 140 | * @param string $src Image source URL. 141 | * @return string If not matched an excluded URL type, the original URL, empty string otherwise. 142 | */ 143 | protected static function limit_img( $src, $source_url ) { 144 | $src = self::limit_url( $src, $source_url ); 145 | 146 | if ( preg_match( '!/ad[sx]?/!i', $src ) ) { 147 | // Ads 148 | return ''; 149 | } elseif ( preg_match( '!(/share-?this[^.]+?\.[a-z0-9]{3,4})(\?.*)?$!i', $src ) ) { 150 | // Share-this type button 151 | return ''; 152 | } elseif ( preg_match( '!/(spinner|loading|spacer|blank|rss)\.(gif|jpg|png)!i', $src ) ) { 153 | // Loaders, spinners, spacers 154 | return ''; 155 | } elseif ( preg_match( '!/([^./]+[-_])?(spinner|loading|spacer|blank)s?([-_][^./]+)?\.[a-z0-9]{3,4}!i', $src ) ) { 156 | // Fancy loaders, spinners, spacers 157 | return ''; 158 | } elseif ( preg_match( '!([^./]+[-_])?thumb[^.]*\.(gif|jpg|png)$!i', $src ) ) { 159 | // Thumbnails, too small, usually irrelevant to context 160 | return ''; 161 | } elseif ( false !== stripos( $src, '/wp-includes/' ) ) { 162 | // Classic WordPress interface images 163 | return ''; 164 | } elseif ( false !== stripos( $src, '/wp-content/themes' ) ) { 165 | // Anything within a WordPress theme directory 166 | return ''; 167 | } elseif ( false !== stripos( $src, '/wp-content/plugins' ) ) { 168 | // Anything within a WordPress plugin directory 169 | return ''; 170 | } elseif ( preg_match( '![^\d]\d{1,2}x\d+\.(gif|jpg|png)$!i', $src ) ) { 171 | // Most often 
tiny buttons/thumbs (< 100px wide) 172 | return ''; 173 | } elseif ( preg_match( '!/pixel\.(mathtag|quantserve)\.com!i', $src ) ) { 174 | // See mathtag.com and https://www.quantcast.com/how-we-do-it/iab-standard-measurement/how-we-collect-data/ 175 | return ''; 176 | } elseif ( preg_match( '!/[gb]\.gif(\?.+)?$!i', $src ) ) { 177 | // WordPress.com stats gif 178 | return ''; 179 | } 180 | // Optionally add additional limits 181 | return apply_filters( 'parse_this_img_filters', $src ); 182 | } 183 | 184 | /** 185 | * Limit embed source URLs to specific providers. 186 | * 187 | * Not all core oEmbed providers are supported. Supported providers include YouTube, Vimeo, 188 | * Vine, Daily Motion, SoundCloud, and Twitter. 189 | * 190 | * @param string $src Embed source URL. 191 | * @param string $source_url Source URL 192 | * @return string If not from a supported provider, an empty string. Otherwise, a reformatted embed URL. 193 | */ 194 | protected static function limit_embed( $src, $source_url ) { 195 | $src = self::limit_url( $src, $source_url ); 196 | 197 | if ( empty( $src ) ) { 198 | return ''; 199 | } 200 | 201 | if ( preg_match( '!//(m|www)\.youtube\.com/(embed|v)/([^?]+)\?.+$!i', $src, $src_matches ) ) { 202 | // Embedded Youtube videos (www or mobile) 203 | $src = 'https://www.youtube.com/watch?v=' . $src_matches[3]; 204 | } elseif ( preg_match( '!//player\.vimeo\.com/video/([\d]+)([?/].*)?$!i', $src, $src_matches ) ) { 205 | // Embedded Vimeo iframe videos 206 | $src = 'https://vimeo.com/' . (int) $src_matches[1]; 207 | } elseif ( preg_match( '!//vimeo\.com/moogaloop\.swf\?clip_id=([\d]+)$!i', $src, $src_matches ) ) { 208 | // Embedded Vimeo Flash videos 209 | $src = 'https://vimeo.com/' . (int) $src_matches[1]; 210 | } elseif ( preg_match( '!//vine\.co/v/([^/]+)/embed!i', $src, $src_matches ) ) { 211 | // Embedded Vine videos 212 | $src = 'https://vine.co/v/' . 
$src_matches[1]; 213 | } elseif ( preg_match( '!//(www\.)?dailymotion\.com/embed/video/([^/?]+)([/?].+)?!i', $src, $src_matches ) ) { 214 | // Embedded Daily Motion videos 215 | $src = 'https://www.dailymotion.com/video/' . $src_matches[2]; 216 | } else { 217 | $oembed = _wp_oembed_get_object(); 218 | 219 | if ( ! $oembed->get_provider( 220 | $src, 221 | array( 222 | 'discover' => false, 223 | ) 224 | ) ) { 225 | $src = ''; 226 | } 227 | } 228 | 229 | return $src; 230 | } 231 | 232 | public static function set( $array, $key, $value ) { 233 | if ( ! isset( $array[ $key ] ) ) { 234 | $array[ $key ] = $value; 235 | } elseif ( is_string( $array[ $key ] ) ) { 236 | $array[ $key ] = array( $array[ $key ], $value ); 237 | } elseif ( is_array( $array[ $key ] ) ) { 238 | $array[ $key ][] = $value; 239 | } 240 | return $array; 241 | } 242 | } 243 | -------------------------------------------------------------------------------- /lib/html5/HTML5.php: -------------------------------------------------------------------------------- 1 | false, 25 | 26 | // Prevents the parser from automatically assigning the HTML5 namespace to the DOM document. 27 | 'disable_html_ns' => false, 28 | ); 29 | 30 | protected $errors = array(); 31 | 32 | public function __construct(array $defaultOptions = array()) 33 | { 34 | $this->defaultOptions = array_merge($this->defaultOptions, $defaultOptions); 35 | } 36 | 37 | /** 38 | * Get the current default options. 39 | * 40 | * @return array 41 | */ 42 | public function getOptions() 43 | { 44 | return $this->defaultOptions; 45 | } 46 | 47 | /** 48 | * Load and parse an HTML file. 49 | * 50 | * This will apply the HTML5 parser, which is tolerant of many 51 | * varieties of HTML, including XHTML 1, HTML 4, and well-formed HTML 52 | * 3. Note that in these cases, not all of the old data will be 53 | * preserved. For example, XHTML's XML declaration will be removed. 54 | * 55 | * The rules governing parsing are set out in the HTML 5 spec. 
56 | * 57 | * @param string|resource $file The path to the file to parse. If this is a resource, it is 58 | * assumed to be an open stream whose pointer is set to the first 59 | * byte of input. 60 | * @param array $options Configuration options when parsing the HTML. 61 | * 62 | * @return \DOMDocument A DOM document. This object type is defined by the libxml 63 | * library, and should have been included with your version of PHP. 64 | */ 65 | public function load($file, array $options = array()) 66 | { 67 | // Handle the case where file is a resource. 68 | if (is_resource($file)) { 69 | return $this->parse(stream_get_contents($file), $options); 70 | } 71 | // Otherwise treat $file as a path and read the whole file before parsing. 72 | return $this->parse(file_get_contents($file), $options); 73 | } 74 | 75 | /** 76 | * Parse an HTML document from a string. 77 | * 78 | * Take a string of HTML 5 (or earlier) and parse it into a 79 | * DOMDocument. 80 | * 81 | * @param string $string An HTML5 document as a string. 82 | * @param array $options Configuration options when parsing the HTML. 83 | * 84 | * @return \DOMDocument A DOM document. DOM is part of libxml, which is included with 85 | * almost all distributions of PHP. 86 | */ 87 | public function loadHTML($string, array $options = array()) 88 | { 89 | return $this->parse($string, $options); 90 | } 91 | 92 | /** 93 | * Convenience function to load an HTML file. 94 | * 95 | * This is here to provide backwards compatibility with the 96 | * PHP DOM implementation. It simply calls load(). 97 | * 98 | * @param string|resource $file The path to the file to parse. If this is a resource, it is 99 | * assumed to be an open stream whose pointer is set to the first 100 | * byte of input. 101 | * @param array $options Configuration options when parsing the HTML. 102 | * 103 | * @return \DOMDocument A DOM document. This object type is defined by the libxml 104 | * library, and should have been included with your version of PHP. 
105 | */ 106 | public function loadHTMLFile($file, array $options = array()) 107 | { 108 | return $this->load($file, $options); 109 | } 110 | 111 | /** 112 | * Parse an HTML fragment from a string. 113 | * 114 | * @param string $string the HTML5 fragment as a string 115 | * @param array $options Configuration options when parsing the HTML 116 | * 117 | * @return \DOMDocumentFragment A DOM fragment. The DOM is part of libxml, which is included with 118 | * almost all distributions of PHP. 119 | */ 120 | public function loadHTMLFragment($string, array $options = array()) 121 | { 122 | return $this->parseFragment($string, $options); 123 | } 124 | 125 | /** 126 | * Return all errors encountered during the most recent parse. 127 | * 128 | * @return array 129 | */ 130 | public function getErrors() 131 | { 132 | return $this->errors; 133 | } 134 | 135 | /** 136 | * Return true if any errors were encountered during the most recent parse. 137 | * 138 | * @return bool 139 | */ 140 | public function hasErrors() 141 | { 142 | return count($this->errors) > 0; 143 | } 144 | 145 | /** 146 | * Parse an input string into a complete document. 147 | * 148 | * @param string $input The HTML to parse. 149 | * @param array $options Per-call options, merged over the default options. 150 | * 151 | * @return \DOMDocument 152 | */ 153 | public function parse($input, array $options = array()) 154 | { 155 | $this->errors = array(); 156 | $options = array_merge($this->defaultOptions, $options); 157 | $events = new DOMTreeBuilder(false, $options); 158 | $scanner = new Scanner($input, !empty($options['encoding']) ? $options['encoding'] : 'UTF-8'); 159 | $parser = new Tokenizer($scanner, $events, !empty($options['xmlNamespaces']) ? Tokenizer::CONFORMANT_XML : Tokenizer::CONFORMANT_HTML); 160 | 161 | $parser->parse(); 162 | $this->errors = $events->getErrors(); 163 | 164 | return $events->document(); 165 | } 166 | 167 | /** 168 | * Parse an input string as a document fragment. 169 | * 170 | * Lower-level loading function; loadHTMLFragment() delegates 171 | * to it. 
172 | * 173 | * @param string $input The input data to parse in the form of a string. 174 | * @param array $options An array of options. 175 | * 176 | * @return \DOMDocumentFragment 177 | */ 178 | public function parseFragment($input, array $options = array()) 179 | { 180 | $options = array_merge($this->defaultOptions, $options); 181 | $events = new DOMTreeBuilder(true, $options); 182 | $scanner = new Scanner($input, !empty($options['encoding']) ? $options['encoding'] : 'UTF-8'); 183 | $parser = new Tokenizer($scanner, $events, !empty($options['xmlNamespaces']) ? Tokenizer::CONFORMANT_XML : Tokenizer::CONFORMANT_HTML); 184 | 185 | $parser->parse(); 186 | $this->errors = $events->getErrors(); 187 | 188 | return $events->fragment(); 189 | } 190 | 191 | /** 192 | * Save a DOM into a given file as HTML5. A resource passed as $file is written to and left open; a filename is opened with mode 'wb' and closed afterwards. 193 | * 194 | * @param mixed $dom The DOM to be serialized. 195 | * @param string|resource $file The filename to be written or resource to write to. 196 | * @param array $options Configuration options when serializing the DOM. These include: 197 | * - encode_entities: Text written to the output is escaped by default and not all 198 | * entities are encoded. If this is set to true all entities will be encoded. 199 | * Defaults to false. 200 | */ 201 | public function save($dom, $file, $options = array()) 202 | { 203 | $close = true; 204 | if (is_resource($file)) { 205 | $stream = $file; 206 | $close = false; 207 | } else { 208 | $stream = fopen($file, 'wb'); 209 | } 210 | $options = array_merge($this->defaultOptions, $options); 211 | $rules = new OutputRules($stream, $options); 212 | $trav = new Traverser($dom, $stream, $rules, $options); 213 | 214 | $trav->walk(); 215 | /* 216 | * release the traverser to avoid cyclic references and allow PHP to free memory without waiting for gc_collect_cycles 217 | */ 218 | $rules->unsetTraverser(); 219 | if ($close) { 220 | fclose($stream); 221 | } 222 | } 223 | 224 | /** 225 | * Convert a DOM into an HTML5 string. 
226 | * 227 | * @param mixed $dom The DOM to be serialized. 228 | * @param array $options Configuration options when serializing the DOM. These include: 229 | * - encode_entities: Text written to the output is escaped by default and not all 230 | * entities are encoded. If this is set to true all entities will be encoded. 231 | * Defaults to false. 232 | * 233 | * @return string An HTML5 document generated from the DOM. 234 | */ 235 | public function saveHTML($dom, $options = array()) 236 | { 237 | $stream = fopen('php://temp', 'wb'); 238 | $this->save($dom, $stream, array_merge($this->defaultOptions, $options)); 239 | // Read everything back starting at offset 0 of the temporary stream. 240 | $html = stream_get_contents($stream, -1, 0); 241 | 242 | fclose($stream); 243 | 244 | return $html; 245 | } 246 | } 247 | -------------------------------------------------------------------------------- /includes/class-parse-this-discovery.php: -------------------------------------------------------------------------------- 1 | 15, 63 | 'limit_response_size' => 1048576, 64 | 'redirection' => 5, 65 | // Use an explicit user-agent for Parse This 66 | ); 67 | $links = array(); 68 | 69 | $response = wp_safe_remote_get( $url, $args ); 70 | $response_code = wp_remote_retrieve_response_code( $response ); 71 | $content_type = wp_remote_retrieve_header( $response, 'content-type' ); 72 | $wprest = array(); 73 | $linkheaders = wp_remote_retrieve_header( $response, 'link' ); 74 | if ( $linkheaders ) { 75 | if ( is_array( $linkheaders ) ) { 76 | foreach ( $linkheaders as $link ) { 77 | if ( preg_match( '/<(.[^>]+)>;\s+rel\s?=\s?[\"\']?(https:\/\/)?api.w.org?\/?[\"\']?/i', $link, $result ) ) { 78 | $wprest[] = array( 79 | 'url' => untrailingslashit( pt_make_absolute_url( $result[1], $url ) ), 80 | 'type' => 'feed', 81 | '_feed_type' => 'wordpress', 82 | 'name' => 'WordPress REST API', 83 | ); 84 | } 85 | } 86 | } else { 87 | if ( preg_match( '/<(.[^>]+)>;\s+rel\s?=\s?[\"\']?(https:\/\/)?api.w.org?\/?[\"\']?/i', $linkheaders, $result ) ) { 88 | $wprest[] = 
array( 89 | 'url' => untrailingslashit( pt_make_absolute_url( $result[1], $url ) ), 90 | 'type' => 'feed', 91 | '_feed_type' => 'wordpress', 92 | 'name' => 'WordPress REST API', 93 | ); 94 | } 95 | } 96 | } 97 | if ( in_array( $response_code, array( 403, 415 ), true ) ) { 98 | $args['user-agent'] = $user_agent; 99 | $response = wp_safe_remote_get( $url, $args ); 100 | $response_code = wp_remote_retrieve_response_code( $response ); 101 | if ( in_array( $response_code, array( 403, 415 ), true ) ) { 102 | return new WP_Error( 'source_error', 'Unable to Retrieve' ); 103 | } 104 | } 105 | 106 | // Strip any character set off the content type 107 | $ct = explode( ';', $content_type ); 108 | if ( is_array( $ct ) ) { 109 | $content_type = array_shift( $ct ); 110 | } 111 | $content_type = trim( $content_type ); 112 | 113 | $content = wp_remote_retrieve_body( $response ); 114 | // Find Youtube RSS Feeds 115 | if ( in_array( wp_parse_url( $url, PHP_URL_HOST ), array( 'www.youtube.com', 'm.youtube.com', 'youtube.com' ), true ) ) { 116 | $links[] = array( 117 | 'url' => self::youtube_rss( $url ), 118 | 'type' => 'feed', 119 | '_feed_type' => 'atom', 120 | 'name' => 'YouTube Feed', 121 | ); 122 | } 123 | // This is an RSS or Atom Feed URL and if it is not we do not know how to deal with XML anyway 124 | if ( ( in_array( $content_type, array( 'application/rss+xml', 'application/atom+xml', 'text/xml', 'application/xml', 'text/xml' ), true ) ) ) { 125 | $content = Parse_This::fetch_feed( $url ); 126 | if ( class_exists( 'Parse_This_RSS' ) ) { 127 | $links[] = array( 128 | 'url' => $url, 129 | 'type' => 'feed', 130 | '_feed_type' => Parse_This_RSS::get_type( $content ), 131 | 'name' => $content->get_title(), 132 | ); 133 | } 134 | return array( 'results' => $links ); 135 | } 136 | 137 | if ( in_array( $content_type, array( 'application/mf2+json', 'application/jf2+json', 'application/jf2feed+json' ), true ) ) { 138 | $content = json_decode( $content, true ); 139 | } 140 | if ( 
'application/json' === $content_type ) { 141 | $content = json_decode( $content, true ); 142 | if ( $content && isset( $content['version'] ) && 'https://jsonfeed.org/version/1' === $content['version'] ) { 143 | $links[] = array( 144 | 'url' => $url, 145 | 'type' => 'feed', 146 | '_feed_type' => 'jsonfeed', 147 | ); 148 | } 149 | return array( 'results' => $links ); 150 | } 151 | if ( 'text/html' === $content_type ) { 152 | $doc = pt_load_domdocument( $content ); 153 | if ( $doc instanceof DOMDocument ) { 154 | $xpath = new DOMXPath( $doc ); 155 | // Fetch and gather data. 156 | $mf2 = false; 157 | foreach ( $xpath->query( '(//link|//a)[@rel and @href]' ) as $link ) { 158 | $rel = $link->getAttribute( 'rel' ); 159 | $href = $link->getAttribute( 'href' ); 160 | $title = $link->getAttribute( 'title' ); 161 | $type = self::get_feed_type( $link->getAttribute( 'type' ) ); 162 | if ( 'microformats' === $type ) { 163 | $mf2 = true; 164 | } 165 | 166 | if ( in_array( $rel, array( 'alternate', 'feed' ), true ) && ! empty( $type ) ) { 167 | $links[] = array_filter( 168 | array( 169 | 'url' => pt_make_absolute_url( $href, $url ), 170 | 'type' => 'feed', 171 | '_feed_type' => $type, 172 | 'name' => $title, 173 | '_mime-type' => $link->getAttribute( 'type' ), 174 | '_rel' => $rel, 175 | ) 176 | ); 177 | } 178 | if ( 'https://api.w.org/' === $rel && empty( $wprest ) ) { 179 | $wprest[] = array_filter( 180 | array( 181 | 'url' => untrailingslashit( pt_make_absolute_url( $href, $url ) ), 182 | 'type' => 'feed', 183 | '_feed_type' => 'wordpress', 184 | 'name' => 'WordPress REST API', 185 | ) 186 | ); 187 | } 188 | } 189 | 190 | // If an mf2 feed was found, do not check to see if this page is also one. 191 | if ( ! $mf2 ) { 192 | // Check to see if the current page is an h-feed 193 | $feeds = Parse_This_MF2::find_hfeed( $doc, $url ); 194 | foreach ( $feeds as $key => $feed ) { 195 | if ( ! 
Parse_This_MF2::is_microformat( $feed ) ) { 196 | continue; 197 | } 198 | if ( array_key_exists( 'children', $feed ) ) { 199 | unset( $feed['children'] ); 200 | } 201 | $jf2 = mf2_to_jf2( $feed ); 202 | if ( isset( $jf2['type'] ) && 'feed' === $jf2['type'] ) { 203 | $author = array(); 204 | if ( array_key_exists( 'author', $jf2 ) ) { 205 | if ( is_array( $jf2['author'] ) ) { 206 | $author = $jf2['author']; 207 | } elseif ( is_string( $jf2['author'] ) ) { 208 | $author = array( 209 | 'type' => 'card', 210 | ); 211 | if ( wp_http_validate_url( $jf2['author'] ) ) { 212 | $author['url'] = $jf2['author']; 213 | } else { 214 | $author['name'] = $jf2['author']; 215 | } 216 | } 217 | } 218 | $links[] = array_filter( 219 | array( 220 | 'url' => $jf2['url'], 221 | 'type' => 'feed', 222 | '_feed_type' => 'microformats', 223 | 'name' => isset( $jf2['name'] ) ? $jf2['name'] : null, 224 | 'author' => $author, 225 | ) 226 | ); 227 | } 228 | } 229 | } 230 | } 231 | 232 | if ( ! empty( $wprest ) ) { 233 | $links = array_merge( $wprest, $links ); 234 | } 235 | 236 | // Sort feeds by priority (lowest rank first). The comparator returns an integer difference because usort() expects a negative/zero/positive integer and deprecates boolean results as of PHP 8. 237 | $rank = array( 238 | 'jf2feed' => 0, 239 | 'microformats' => 1, 240 | 'jsonfeed' => 2, 241 | 'wordpress' => 3, 242 | 'atom' => 4, 243 | 'rss' => 5, 244 | ); 245 | usort( 246 | $links, 247 | function( $a, $b ) use ( $rank ) { 248 | return $rank[ $a['_feed_type'] ] - $rank[ $b['_feed_type'] ]; 249 | } 250 | ); 251 | 252 | return array( 'results' => $links ); 253 | 254 | } 255 | } 256 | 257 | /** Builds a YouTube feed URL (feeds/videos.xml with a channel_id, user or playlist_id query argument) from a URL that ends in /channel/<id>, /user/<name> or /playlist?list=<id>. When none of the patterns match, the function falls off the end and therefore returns null. */ 258 | private static function youtube_rss( $url ) { 259 | $youtube_url_base = 'https://www.youtube.com/feeds/videos.xml'; 260 | $preg_entities = array( 261 | 'channel_id' => '\/channel\/(([^\/])+?)$', // match YouTube channel ID from url 262 | 'user' => '\/user\/(([^\/])+?)$', // match YouTube user from url 263 | 'playlist_id' => '\/playlist\?list=(([^\/])+?)$', // match YouTube playlist ID from url 264 | ); 265 | 266 | foreach ( $preg_entities as $key => $preg_entity ) { 267 | if ( preg_match( '/' . 
$preg_entity . '/', $url, $matches ) ) { 268 | if ( isset( $matches[1] ) ) { 269 | return $youtube_url_base . '?' . $key . '=' . $matches[1]; 270 | } 271 | } 272 | } 273 | } 274 | } 275 | -------------------------------------------------------------------------------- /lib/html5/README.md: -------------------------------------------------------------------------------- 1 | > # UKRAINE NEEDS YOUR HELP NOW! 2 | > 3 | > On 24 February 2022, Russian [President Vladimir Putin ordered an invasion of Ukraine by Russian Armed Forces](https://www.bbc.com/news/world-europe-60504334). 4 | > 5 | > Your support is urgently needed. 6 | > 7 | > - Donate to the volunteers. Here is the volunteer fund helping the Ukrainian army to provide all the necessary equipment: 8 | > https://bank.gov.ua/en/news/all/natsionalniy-bank-vidkriv-spetsrahunok-dlya-zboru-koshtiv-na-potrebi-armiyi or https://savelife.in.ua/en/donate/ 9 | > - Triple-check social media sources. Russian disinformation is attempting to coverup and distort the reality in Ukraine. 10 | > - Help Ukrainian refugees who are fleeing Russian attacks and shellings: https://www.globalcitizen.org/en/content/ways-to-help-ukraine-conflict/ 11 | > - Put pressure on your political representatives to provide help to Ukraine. 12 | > - Believe in the Ukrainian people, they will not surrender, they don't have another Ukraine. 13 | > 14 | > THANK YOU! 15 | ---- 16 | 17 | # HTML5-PHP 18 | 19 | HTML5 is a standards-compliant HTML5 parser and writer written entirely in PHP. 20 | It is stable and used in many production websites, and has 21 | well over [five million downloads](https://packagist.org/packages/masterminds/html5). 22 | 23 | HTML5 provides the following features. 
24 | 25 | - An HTML5 serializer 26 | - Support for PHP namespaces 27 | - Composer support 28 | - Event-based (SAX-like) parser 29 | - A DOM tree builder 30 | - Interoperability with [QueryPath](https://github.com/technosophos/querypath) 31 | - Runs on **PHP** 5.3.0 or newer 32 | 33 | [![Build Status](https://travis-ci.org/Masterminds/html5-php.png?branch=master)](https://travis-ci.org/Masterminds/html5-php) 34 | [![Latest Stable Version](https://poser.pugx.org/masterminds/html5/v/stable.png)](https://packagist.org/packages/masterminds/html5) 35 | [![Code Coverage](https://scrutinizer-ci.com/g/Masterminds/html5-php/badges/coverage.png?b=master)](https://scrutinizer-ci.com/g/Masterminds/html5-php/?branch=master) 36 | [![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/Masterminds/html5-php/badges/quality-score.png?b=master)](https://scrutinizer-ci.com/g/Masterminds/html5-php/?branch=master) 37 | [![Stability: Sustained](https://masterminds.github.io/stability/sustained.svg)](https://masterminds.github.io/stability/sustained.html) 38 | 39 | ## Installation 40 | 41 | Install HTML5-PHP using [composer](http://getcomposer.org/). 42 | 43 | By adding the `masterminds/html5` dependency to your `composer.json` file: 44 | 45 | ```json 46 | { 47 | "require" : { 48 | "masterminds/html5": "^2.0" 49 | } 50 | } 51 | ``` 52 | 53 | Or by invoking the `require` command via the composer executable: 54 | 55 | ```bash 56 | composer require masterminds/html5 57 | ``` 58 | 59 | ## Basic Usage 60 | 61 | HTML5-PHP has a high-level API and a low-level API. 62 | 63 | Here is how you use the high-level `HTML5` library API: 64 | 65 | ```php 66 | 75 | 76 | TEST 77 | 78 | 79 |

Hello World

80 |

This is a test of the HTML5 parser.

81 | 82 | 83 | HERE; 84 | 85 | // Parse the document. $dom is a DOMDocument. 86 | $html5 = new HTML5(); 87 | $dom = $html5->loadHTML($html); 88 | 89 | // Render it as HTML5: 90 | print $html5->saveHTML($dom); 91 | 92 | // Or save it to a file: 93 | $html5->save($dom, 'out.html'); 94 | ``` 95 | 96 | The `$dom` created by the parser is a full `DOMDocument` object. And the 97 | `save()` and `saveHTML()` methods will take any DOMDocument. 98 | 99 | ### Options 100 | 101 | It is possible to pass in an array of configuration options when loading 102 | an HTML5 document. 103 | 104 | ```php 105 | // An associative array of options 106 | $options = array( 107 | 'option_name' => 'option_value', 108 | ); 109 | 110 | // Provide the options to the constructor 111 | $html5 = new HTML5($options); 112 | 113 | $dom = $html5->loadHTML($html); 114 | ``` 115 | 116 | The following options are supported: 117 | 118 | * `encode_entities` (boolean): Indicates that the serializer should aggressively 119 | encode characters as entities. Without this, it only encodes the bare 120 | minimum. 121 | * `disable_html_ns` (boolean): Prevents the parser from automatically 122 | assigning the HTML5 namespace to the DOM document. This is for 123 | non-namespace aware DOM tools. 124 | * `target_document` (\DOMDocument): A DOM document that will be used as the 125 | destination for the parsed nodes. 126 | * `implicit_namespaces` (array): An assoc array of namespaces that should be 127 | used by the parser. Name is tag prefix, value is NS URI. 128 | 129 | ## The Low-Level API 130 | 131 | This library provides the following low-level APIs that you can use to 132 | create more customized HTML5 tools: 133 | 134 | - A SAX-like event-based parser that you can hook into for special kinds 135 | of parsing. 136 | - A flexible error-reporting mechanism that can be tuned to document 137 | syntax checking. 138 | - A DOM implementation that uses PHP's built-in DOM library. 
139 | 140 | The unit tests exercise each piece of the API, and every public function 141 | is well-documented. 142 | 143 | ### Parser Design 144 | 145 | The parser is designed as follows: 146 | 147 | - The `Scanner` handles scanning on behalf of the parser. 148 | - The `Tokenizer` requests data off of the scanner, parses it, classifies 149 | it, and sends it to an `EventHandler`. It is a *recursive descent parser.* 150 | - The `EventHandler` receives notifications and data for each specific 151 | semantic event that occurs during tokenization. 152 | - The `DOMTreeBuilder` is an `EventHandler` that listens for tokenizing 153 | events and builds a document tree (`DOMDocument`) based on the events. 154 | 155 | ### Serializer Design 156 | 157 | The serializer takes a data structure (the `DOMDocument`) and transforms 158 | it into a character representation -- an HTML5 document. 159 | 160 | The serializer is broken into three parts: 161 | 162 | - The `OutputRules` contain the rules to turn DOM elements into strings. The 163 | rules are an implementation of the interface `RulesInterface` allowing for 164 | different rule sets to be used. 165 | - The `Traverser`, which is a special-purpose tree walker. It visits 166 | each node in the tree and uses the `OutputRules` to transform the node 167 | into a string. 168 | - `HTML5` manages the `Traverser` and stores the resultant data 169 | in the correct place. 170 | 171 | The serializer (`save()`, `saveHTML()`) follows the 172 | [section 8.9 of the HTML 5.0 spec](http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#serializing-html-fragments). 
173 | So tags are serialized according to these rules: 174 | 175 | - A tag with children: <foo>CHILDREN</foo> 176 | - A tag that cannot have content: <foo> (no closing tag) 177 | - A tag that could have content, but doesn't: <foo></foo> 178 | 179 | ## Known Issues (Or, Things We Designed Against the Spec) 180 | 181 | Please check the issue queue for a full list, but the following are 182 | known issues that are not presently on the roadmap: 183 | 184 | - Namespaces: HTML5 only [supports a selected list of namespaces](http://www.w3.org/TR/html5/infrastructure.html#namespaces) 185 | and they do not operate in the same way as XML namespaces. A `:` has no special 186 | meaning. 187 | By default the parser does not support XML style namespaces via `:`; 188 | to enable the XML namespaces see the [XML Namespaces section](#xml-namespaces) 189 | - Scripts: This parser does not contain a JavaScript or a CSS 190 | interpreter. While one may be supplied, not all features will be 191 | supported. 192 | - Reentrance: The current parser is not re-entrant. (Thus you can't pause 193 | the parser to modify the HTML string mid-parse.) 194 | - Validation: The current tree builder is **not** a validating parser. 195 | While it will correct some HTML, it does not check that the HTML 196 | conforms to the standard. (Should you wish, you can build a validating 197 | parser by extending DOMTree or building your own EventHandler 198 | implementation.) 199 | * There is limited support for insertion modes. 200 | * Some autocorrection is done automatically. 201 | * Per the spec, many legacy tags are admitted and correctly handled, 202 | even though they are technically not part of HTML5. 203 | - Attribute names and values: Due to the implementation details of the 204 | PHP implementation of DOM, attribute names that do not follow the 205 | XML 1.0 standard are not inserted into the DOM. (Effectively, they 206 | are ignored.) If you've got a clever fix for this, jump in! 
207 | - Processor Instructions: The HTML5 spec does not allow processor 208 | instructions. We do. Since this is a server-side library, we think 209 | this is useful. And that means, dear reader, that in some cases you 210 | can parse the HTML from a mixed PHP/HTML document. This, however, 211 | is an incidental feature, not a core feature. 212 | - HTML manifests: Unsupported. 213 | - PLAINTEXT: Unsupported. 214 | - Adoption Agency Algorithm: Not yet implemented. (8.2.5.4.7) 215 | 216 | ## XML Namespaces 217 | 218 | To use XML style namespaces you have to configure the main `HTML5` instance accordingly. 219 | 220 | ```php 221 | use Masterminds\HTML5; 222 | $html = new HTML5(array( 223 | "xmlNamespaces" => true 224 | )); 225 | 226 | $dom = $html->loadHTML(''); 227 | 228 | $dom->documentElement->namespaceURI; // http://www.example.com 229 | 230 | ``` 231 | 232 | You can also add some default prefixes that will not require the namespace declaration, 233 | but its elements will be namespaced. 234 | 235 | ```php 236 | use Masterminds\HTML5; 237 | $html = new HTML5(array( 238 | "implicitNamespaces"=>array( 239 | "t"=>"http://www.example.com" 240 | ) 241 | )); 242 | 243 | $dom = $html->loadHTML(''); 244 | 245 | $dom->documentElement->namespaceURI; // http://www.example.com 246 | 247 | ``` 248 | 249 | ## Thanks to... 250 | 251 | The dedicated (and patient) contributors of patches small and large, 252 | who have already made this library better. See the CREDITS file for 253 | a list of contributors. 254 | 255 | We owe a huge debt of gratitude to the original authors of html5lib. 256 | 257 | While not much of the original parser remains, we learned a lot from 258 | reading the html5lib library. And some pieces remain here. In 259 | particular, much of the UTF-8 and Unicode handling is derived from the 260 | html5lib project. 261 | 262 | ## License 263 | 264 | This software is released under the MIT license. The original html5lib 265 | library was also released under the MIT license. 
266 | 267 | See LICENSE.txt 268 | 269 | Certain files contain copyright assertions by specific individuals 270 | involved with html5lib. Those have been retained where appropriate. 271 | -------------------------------------------------------------------------------- /lib/html5/HTML5/Parser/StringInputStream.php: -------------------------------------------------------------------------------- 1 | 13 | 14 | Permission is hereby granted, free of charge, to any person obtaining a 15 | copy of this software and associated documentation files (the 16 | "Software"), to deal in the Software without restriction, including 17 | without limitation the rights to use, copy, modify, merge, publish, 18 | distribute, sublicense, and/or sell copies of the Software, and to 19 | permit persons to whom the Software is furnished to do so, subject to 20 | the following conditions: 21 | 22 | The above copyright notice and this permission notice shall be included 23 | in all copies or substantial portions of the Software. 24 | 25 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 26 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 27 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 28 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 29 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 30 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 31 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 32 | 33 | */ 34 | 35 | // Some conventions: 36 | // - /* */ indicates verbatim text from the HTML 5 specification 37 | // MPB: Not sure which version of the spec. Moving from HTML5lib to 38 | // HTML5-PHP, I have been using this version: 39 | // http://www.w3.org/TR/2012/CR-html5-20121217/Overview.html#contents 40 | // 41 | // - // indicates regular comments 42 | 43 | /** 44 | * @deprecated since 2.4, to remove in 3.0. Use a string in the scanner instead. 
45 | */ 46 | class StringInputStream implements InputStream 47 | { 48 | /** 49 | * The string data we're parsing. 50 | */ 51 | private $data; 52 | 53 | /** 54 | * The current integer byte position we are in $data. 55 | */ 56 | private $char; 57 | 58 | /** 59 | * Length of $data; when $char === $EOF, we are at the end-of-file. 60 | */ 61 | private $EOF; 62 | 63 | /** 64 | * Parse errors. 65 | */ 66 | public $errors = array(); 67 | 68 | /** 69 | * Create a new InputStream wrapper. 70 | * 71 | * @param string $data Data to parse. 72 | * @param string $encoding The encoding to use for the data. 73 | * @param string $debug A fprintf format to use to echo the data on stdout. 74 | */ 75 | public function __construct($data, $encoding = 'UTF-8', $debug = '') 76 | { 77 | $data = UTF8Utils::convertToUTF8($data, $encoding); 78 | if ($debug) { 79 | fprintf(STDOUT, $debug, $data, strlen($data)); 80 | } 81 | 82 | // There is good reason to question whether it makes sense to 83 | // do this here, since most of these checks are done during 84 | // parsing, and since this check doesn't actually *do* anything. 85 | $this->errors = UTF8Utils::checkForIllegalCodepoints($data); 86 | 87 | $data = $this->replaceLinefeeds($data); 88 | 89 | $this->data = $data; 90 | $this->char = 0; 91 | $this->EOF = strlen($data); 92 | } 93 | 94 | public function __toString() 95 | { 96 | return $this->data; 97 | } 98 | 99 | /** 100 | * Replace linefeed characters according to the spec; NUL bytes are replaced with U+FFFD as well. 101 | */ 102 | protected function replaceLinefeeds($data) 103 | { 104 | /* 105 | * U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED (LF) characters are treated specially. 106 | * Any CR characters that are followed by LF characters must be removed, and any CR characters not 107 | * followed by LF characters must be converted to LF characters. Thus, newlines in HTML DOMs are 108 | * represented by LF characters, and there are never any CR characters in the input to the tokenization 109 | * stage. 
110 | */ 111 | $crlfTable = array( 112 | "\0" => "\xEF\xBF\xBD", 113 | "\r\n" => "\n", 114 | "\r" => "\n", 115 | ); 116 | 117 | return strtr($data, $crlfTable); 118 | } 119 | 120 | /** 121 | * Returns the current line that the tokenizer is at (1-based). 122 | */ 123 | public function currentLine() 124 | { 125 | if (empty($this->EOF) || 0 === $this->char) { 126 | return 1; 127 | } 128 | // Count the newlines before the current byte (capped at the end of 129 | // the data); adding one turns that count into a 1-based line number. 130 | return substr_count($this->data, "\n", 0, min($this->char, $this->EOF)) + 1; 131 | } 132 | 133 | /** 134 | * @deprecated 135 | */ 136 | public function getCurrentLine() 137 | { 138 | return $this->currentLine(); 139 | } 140 | 141 | /** 142 | * Returns the current column of the current line that the tokenizer is at. 143 | * Newlines are column 0. The first char after a newline is column 1. 144 | * 145 | * @return int The column number. 146 | */ 147 | public function columnOffset() 148 | { 149 | // Short circuit for the first char. 150 | if (0 === $this->char) { 151 | return 0; 152 | } 153 | // strrpos is weird, and the offset needs to be negative for what we 154 | // want (i.e., the last \n before $this->char). This needs to not have 155 | // one (to make it point to the next character, the one we want the 156 | // position of) added to it because strrpos's behaviour includes the 157 | // final offset byte. 158 | $backwardFrom = $this->char - 1 - strlen($this->data); 159 | $lastLine = strrpos($this->data, "\n", $backwardFrom); 160 | 161 | // However, for here we want the length up until the next byte to be 162 | // processed, so add one to the current byte ($this->char). 163 | if (false !== $lastLine) { 164 | $findLengthOf = substr($this->data, $lastLine + 1, $this->char - 1 - $lastLine); 165 | } else { 166 | // No newline precedes the cursor: still on the first line. 
167 | $findLengthOf = substr($this->data, 0, $this->char); 168 | } 169 | 170 | return UTF8Utils::countChars($findLengthOf); 171 | } 172 | 173 | /** 174 | * @deprecated 175 | */ 176 | public function getColumnOffset() 177 | { 178 | return $this->columnOffset(); 179 | } 180 | 181 | /** 182 | * Get the current character. 183 | * 184 | * @return string The byte at the current position. 185 | */ 186 | public function current() 187 | { 188 | return $this->data[$this->char]; 189 | } 190 | 191 | /** 192 | * Advance the pointer. 193 | * This is part of the Iterator interface. 194 | */ 195 | public function next() 196 | { 197 | ++$this->char; 198 | } 199 | 200 | /** 201 | * Rewind to the start of the string. 202 | */ 203 | public function rewind() 204 | { 205 | $this->char = 0; 206 | } 207 | 208 | /** 209 | * Is the current pointer location valid. 210 | * 211 | * @return bool Whether the current pointer location is valid. 212 | */ 213 | public function valid() 214 | { 215 | return $this->char < $this->EOF; 216 | } 217 | 218 | /** 219 | * Get all characters until EOF. 220 | * 221 | * This reads to the end of the file, and sets the read marker at the 222 | * end of the file. 223 | * 224 | * Note this performs bounds checking. 225 | * 226 | * @return string Returns the remaining text. If called when the InputStream is 227 | * already exhausted, it returns an empty string. 228 | */ 229 | public function remainingChars() 230 | { 231 | if ($this->char < $this->EOF) { 232 | $data = substr($this->data, $this->char); 233 | $this->char = $this->EOF; 234 | 235 | return $data; 236 | } 237 | 238 | return ''; // false; 239 | } 240 | 241 | /** 242 | * Read to a particular match (or until $max bytes are consumed). 243 | * 244 | * This operates on byte sequences, not characters. 245 | * 246 | * Matches as far as possible until we reach a certain set of bytes 247 | * and returns the matched substring. 248 | * 249 | * @param string $bytes Bytes to match. 
250 | * @param int $max Maximum number of bytes to scan. 251 | * 252 | * @return string|false The bytes consumed (possibly an empty string), or false when the stream is already at EOF. 253 | * Use strict equality when checking the result, since an empty string is also falsy. 254 | */ 255 | public function charsUntil($bytes, $max = null) 256 | { 257 | if ($this->char >= $this->EOF) { 258 | return false; 259 | } 260 | 261 | if (0 === $max || $max) { 262 | $len = strcspn($this->data, $bytes, $this->char, $max); 263 | } else { 264 | $len = strcspn($this->data, $bytes, $this->char); 265 | } 266 | 267 | $string = (string) substr($this->data, $this->char, $len); 268 | $this->char += $len; 269 | 270 | return $string; 271 | } 272 | 273 | /** 274 | * Returns the string so long as $bytes matches. 275 | * 276 | * Matches as far as possible with a certain set of bytes 277 | * and returns the matched substring. 278 | * 279 | * @param string $bytes A mask of bytes to match. If ANY byte in this mask matches the 280 | * current char, the pointer advances and the char is part of the 281 | * substring. 282 | * @param int $max The max number of chars to read. 283 | * 284 | * @return string|false The matched bytes (possibly an empty string), or false when the stream is already at EOF. 285 | */ 286 | public function charsWhile($bytes, $max = null) 287 | { 288 | if ($this->char >= $this->EOF) { 289 | return false; 290 | } 291 | 292 | if (0 === $max || $max) { 293 | $len = strspn($this->data, $bytes, $this->char, $max); 294 | } else { 295 | $len = strspn($this->data, $bytes, $this->char); 296 | } 297 | $string = (string) substr($this->data, $this->char, $len); 298 | $this->char += $len; 299 | 300 | return $string; 301 | } 302 | 303 | /** 304 | * Unconsume characters. 305 | * 306 | * @param int $howMany The number of characters to unconsume. 307 | */ 308 | public function unconsume($howMany = 1) 309 | { 310 | if (($this->char - $howMany) >= 0) { 311 | $this->char -= $howMany; 312 | } 313 | } 314 | 315 | /** 316 | * Look ahead without moving cursor. Returns the byte after the current one, or false when there is no next byte. 
317 | */ 318 | public function peek() 319 | { 320 | if (($this->char + 1) < $this->EOF) { 321 | return $this->data[$this->char + 1]; 322 | } 323 | 324 | return false; 325 | } 326 | /** Returns the current byte position within the data. */ 327 | public function key() 328 | { 329 | return $this->char; 330 | } 331 | } 332 | -------------------------------------------------------------------------------- /includes/class-parse-this-rss.php: -------------------------------------------------------------------------------- 1 | get_items(); 17 | $title = $feed->get_title(); 18 | foreach ( $rss_items as $item ) { 19 | $items[] = self::get_item( $item, $title ); 20 | } 21 | return array_filter( 22 | array( 23 | 'type' => 'feed', 24 | '_feed_type' => self::get_type( $feed ), 25 | '_last_updated' => self::last_updated( $feed ), 26 | '_last_published' => self::find_last_published( $items ), 27 | '_last_updated' => self::find_last_updated( $items ), 28 | 'summary' => $feed->get_description(), 29 | 'author' => self::get_authors( $feed->get_author() ), 30 | 'name' => htmlspecialchars_decode( $title, ENT_QUOTES ), 31 | 'url' => $feed->get_permalink(), 32 | 'photo' => $feed->get_image_url(), 33 | 'items' => $items, 34 | ) 35 | ); 36 | } 37 | 38 | public static function last_updated( $feed ) { 39 | $type = self::get_type( $feed ); 40 | $updated = null; 41 | if ( 'RSS' === $type ) { 42 | $updated = $feed->get_channel_tags( SIMPLEPIE_NAMESPACE_RSS_20, 'lastBuildDate' ); 43 | } elseif ( 'atom' === $type ) { 44 | $updated = $feed->get_channel_tags( SIMPLEPIE_NAMESPACE_ATOM_10, 'updated' ); 45 | } 46 | if ( $updated && isset( $updated[0]['data'] ) ) { 47 | $datetime = new DateTime( $updated[0]['data'] ); 48 | if ( $datetime ) { 49 | return $datetime->format( DATE_W3C ); 50 | } 51 | } 52 | 53 | return null; 54 | } 55 | 56 | public static function get_type( $feed ) { 57 | if ( $feed->get_type() & SIMPLEPIE_TYPE_NONE ) { 58 | return 'unknown'; 59 | } elseif ( $feed->get_type() & SIMPLEPIE_TYPE_RSS_ALL ) { 60 | return 'RSS'; 61 | } elseif ( 
$feed->get_type() & SIMPLEPIE_TYPE_ATOM_ALL ) { 62 | return 'atom'; 63 | } 64 | } 65 | 66 | /* 67 | * Takes a SimplePie_Author object and Turns it into a JF2 Author property 68 | * @param SimplePie_Author $author 69 | * @return JF2 array 70 | */ 71 | public static function get_authors( $author ) { 72 | if ( ! $author ) { 73 | return array(); 74 | } 75 | if ( $author instanceof SimplePie_Author ) { 76 | $author = array( $author ); 77 | } 78 | $return = array(); 79 | foreach ( $author as $a ) { 80 | $r = array( 81 | 'type' => 'card', 82 | 'name' => htmlspecialchars_decode( $a->get_name() ), 83 | 'url' => $a->get_link(), 84 | 'email' => self::validate_email( $a->get_email() ), 85 | ); 86 | $dom = pt_load_domdocument( $r['name'] ); 87 | $links = $dom->getElementsByTagName( 'a' ); 88 | $names = array(); 89 | foreach ( $links as $link ) { 90 | $names[ wp_strip_all_tags( $link->nodeValue ) ] = $link->getAttribute( 'href' ); // phpcs:ignore 91 | } 92 | if ( ! empty( $names ) ) { 93 | if ( 1 === count( $names ) ) { 94 | reset( $names ); 95 | $r['name'] = key( $names ); 96 | } else { 97 | foreach ( $names as $name => $url ) { 98 | $return[] = array( 99 | 'type' => 'card', 100 | 'name' => $name, 101 | 'url' => $url, 102 | ); 103 | } 104 | } 105 | } else { 106 | $r['name'] = wp_strip_all_tags( $r['name'] ); 107 | $return[] = array_filter( $r ); 108 | } 109 | } 110 | if ( 1 === count( $return ) ) { 111 | $return = array_shift( $return ); 112 | } 113 | return $return; 114 | } 115 | 116 | public static function credit_to_card( $credit ) { 117 | if ( ! $credit instanceof SimplePie_Credit ) { 118 | return null; 119 | } 120 | return array( 121 | 'type' => 'card', 122 | 'role' => $credit->get_role(), 123 | 'name' => $credit->get_name(), 124 | ); 125 | } 126 | 127 | public static function source_to_cite( $source ) { 128 | if ( ! 
$source instanceof SimplePie_Source ) { 129 | return null; 130 | } 131 | return array_filter( 132 | array( 133 | 'type' => 'cite', 134 | 'name' => $source->get_title(), 135 | 'summary' => $source->get_description(), 136 | 'url' => $source->get_permalink(), 137 | 'author' => self::get_authors( $source->get_authors() ), 138 | 'photo' => $sourece->get_image_url(), 139 | ) 140 | ); 141 | } 142 | 143 | 144 | public static function get_source( $item ) { 145 | $return = $item->get_item_tags( SIMPLEPIE_NAMESPACE_RSS_20, 'source' ); 146 | if ( $return ) { 147 | return array( 148 | 'url' => $return[0]['attribs']['']['url'], 149 | 'name' => $return[0]['data'], 150 | ); 151 | } 152 | return self::source_to_cite( $item->get_source() ); 153 | } 154 | 155 | public static function get_thumbnail( $item ) { 156 | if ( method_exists( $item, 'get_thumbnail' ) ) { 157 | $return = $item->get_thumbnail(); 158 | if ( is_string( $return ) ) { 159 | return $return; 160 | } 161 | if ( is_array( $return ) && isset( $return['url'] ) ) { 162 | return $return['url']; 163 | } 164 | } 165 | return null; 166 | } 167 | 168 | /* 169 | * Takes a SimplePie_Item object and Turns it into a JF2 entry 170 | * @param SimplePie_Item $item 171 | * @return JF2 172 | */ 173 | public static function get_item( $item, $title = '' ) { 174 | $content = Parse_This::clean_content( $item->get_content( true ) ); 175 | $return = array( 176 | 'type' => 'entry', 177 | 'name' => $item->get_title(), 178 | 'author' => self::get_authors( $item->get_authors() ), 179 | 'contributors' => self::get_authors( $item->get_contributors() ), 180 | 'publication' => $title, 181 | 'summary' => wp_strip_all_tags( $item->get_description( true ) ), 182 | 'content' => array_filter( 183 | array( 184 | 'html' => $content, 185 | 'text' => wp_strip_all_tags( $content ), 186 | ) 187 | ), 188 | '_source' => self::get_source( $item ), 189 | 'published' => self::get_date( $item ), 190 | 'updated' => self::get_updated_date( $item ), 191 | 'url' => 
$item->get_permalink(), 192 | 'uid' => $item->get_id(), 193 | 'location' => self::get_location( $item ), 194 | 'category' => self::get_categories( $item->get_categories() ), 195 | 'featured' => self::get_thumbnail( $item ), 196 | ); 197 | 198 | if ( ! is_array( $return['category'] ) ) { 199 | $return['category'] = array(); 200 | } 201 | 202 | // To cover the non obvious types 203 | $medium_map = array( 204 | 'application/x-shockwave-flash' => 'video', 205 | ); 206 | 207 | $enclosures = $item->get_enclosures(); 208 | foreach ( $enclosures as $enclosure ) { 209 | $medium = $enclosure->get_type(); 210 | if ( ! $medium ) { 211 | $medium = $enclosure->get_medium(); 212 | } else { 213 | if ( array_key_exists( $medium, $medium_map ) ) { 214 | $medium = $medium_map[ $medium ]; 215 | } else { 216 | $medium = explode( '/', $medium ); 217 | $medium = array_shift( $medium ); 218 | } 219 | } 220 | switch ( $medium ) { 221 | case 'audio': 222 | $medium = 'audio'; 223 | break; 224 | case 'image': 225 | $medium = 'photo'; 226 | break; 227 | case 'video': 228 | $medium = 'video'; 229 | break; 230 | } 231 | if ( array_key_exists( $medium, $return ) ) { 232 | if ( is_string( $return[ $medium ] ) ) { 233 | $return[ $medium ] = array( $return[ $medium ] ); 234 | } 235 | $return[ $medium ][] = $enclosure->get_link(); 236 | } else { 237 | $return[ $medium ] = $enclosure->get_link(); 238 | } 239 | if ( isset( $return['category'] ) && is_array( $return['category'] ) ) { 240 | $keywords = $enclosure->get_keywords(); 241 | if ( ! $keywords ) { 242 | $keywords = array(); 243 | } 244 | $return['category'] = array_merge( $return['category'], $keywords ); 245 | } else { 246 | $return['category'] = $enclosure->get_keywords(); 247 | } 248 | if ( ! 
isset( $return['duration'] ) ) { 249 | $duration = $enclosure->get_duration(); 250 | if ( 0 < $duration ) { 251 | $return['duration'] = seconds_to_iso8601( $duration ); 252 | } 253 | } 254 | if ( empty( $return['summary'] ) ) { 255 | $return['summary'] = $enclosure->get_description(); 256 | } 257 | if ( empty( $return['featured'] ) ) { 258 | $return['featured'] = self::get_thumbnail( $enclosure ); 259 | } 260 | $credits = $enclosure->get_credits(); 261 | if ( ! $credits ) { 262 | $credits = array(); 263 | } 264 | foreach ( $credits as $credit ) { 265 | if ( ! isset( $return['credits'] ) ) { 266 | $return['credits'] = array(); 267 | } 268 | $return['credits'][] = self::credit_to_card( $credit ); 269 | } 270 | } 271 | // If there is just one photo it is probably the featured image 272 | if ( isset( $return['photo'] ) && is_string( $return['photo'] ) && empty( $return['featured'] ) ) { 273 | $return['featured'] = $return['photo']; 274 | unset( $return['photo'] ); 275 | } 276 | if ( empty( $return['featured'] ) ) { 277 | $i = $item->get_item_tags( SIMPLEPIE_NAMESPACE_ITUNES, 'image' ); 278 | if ( is_array( $i ) ) { 279 | $i = array_shift( $i ); 280 | if ( isset( $i['attribs'] ) && is_array( $i['attribs'] ) ) { 281 | $i = array_shift( $i['attribs'] ); 282 | if ( isset( $i['href'] ) ) { 283 | $i = $i['href']; 284 | } 285 | } 286 | } 287 | if ( is_string( $i ) ) { 288 | $return['featured'] = $i; 289 | } 290 | } 291 | $return['post_type'] = post_type_discovery( $return ); 292 | foreach ( array( 'category', 'video', 'audio' ) as $prop ) { 293 | if ( array_key_exists( $prop, $return ) && is_array( $return[ $prop ] ) ) { 294 | $return[ $prop ] = array_unique( $return[ $prop ] ); 295 | } 296 | } 297 | return array_filter( $return ); 298 | } 299 | 300 | private static function get_categories( $categories ) { 301 | if ( ! 
is_array( $categories ) ) { 302 | return array(); 303 | } 304 | $return = array(); 305 | foreach ( $categories as $category ) { 306 | $return[] = $category->get_label(); 307 | } 308 | return $return; 309 | } 310 | 311 | private static function get_location_name( $item ) { 312 | $return = $item->get_item_tags( SIMPLEPIE_NAMESPACE_W3C_BASIC_GEO, 'featureName' ); 313 | if ( $return ) { 314 | return $return[0]['data']; 315 | } 316 | } 317 | 318 | 319 | public static function get_location( $item ) { 320 | return array_filter( 321 | array( 322 | 'latitude' => $item->get_latitude(), 323 | 'longitude' => $item->get_longitude(), 324 | 'name' => self::get_location_name( $item ), 325 | ) 326 | ); 327 | } 328 | 329 | public static function get_date( $item ) { 330 | $datetime = new DateTime( $item->get_date( null ) ); 331 | if ( $datetime ) { 332 | return $datetime->format( DATE_W3C ); 333 | } 334 | return null; 335 | } 336 | 337 | public static function get_updated_date( $item ) { 338 | $datetime = new DateTime( $item->get_updated_date( null ) ); 339 | if ( $datetime ) { 340 | return $datetime->format( DATE_W3C ); 341 | } 342 | return null; 343 | } 344 | 345 | 346 | } 347 | -------------------------------------------------------------------------------- /includes/class-parse-this-restapi.php: -------------------------------------------------------------------------------- 1 | $path, 34 | '_embed' => 1, 35 | ), 36 | trailingslashit( $rest_url ) 37 | ); 38 | } 39 | return false; 40 | } 41 | 42 | $rest_url = untrailingslashit( $rest_url ); 43 | return add_query_arg( '_embed', 1, $rest_url . $path ); 44 | } 45 | 46 | public static function get_rest_path( $rest_url, $url ) { 47 | if ( ! wp_http_validate_url( $rest_url ) ) { 48 | return false; 49 | } 50 | $query = wp_parse_url( $rest_url, PHP_URL_QUERY ); 51 | if ( ! 
empty( $query ) ) { 52 | $query = explode( '=', $query ); 53 | if ( array_key_exists( 'rest_route' ) ) { 54 | return $query['rest_route']; 55 | } 56 | } 57 | $path = str_replace( $rest_url, '', $url ); 58 | return '/' . ltrim( $path, '/' ); 59 | } 60 | 61 | 62 | public static function fetch( $rest_url, $path, $cache = false ) { 63 | if ( empty( $rest_url ) || ! $rest_url ) { 64 | return new WP_Error( 'no_url', __( 'No URL provided', 'parse-this' ) ); 65 | } 66 | 67 | $url = self::get_rest_url( $rest_url, $path ); 68 | $key = 'pt_rest_' . self::base64url_encode( $url ); 69 | if ( $cache ) { 70 | $transient = get_transient( $key ); 71 | if ( false !== $transient ) { 72 | return json_decode( $transient, true ); 73 | } 74 | } 75 | 76 | $user_agent = 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:57.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36 Parse This/WP'; 77 | $args = array( 78 | 'timeout' => 15, 79 | 'limit_response_size' => 1048576, 80 | 'redirection' => 5, 81 | // Use an explicit user-agent for Parse This 82 | ); 83 | 84 | $response = wp_safe_remote_get( $url, $args ); 85 | if ( is_wp_error( $response ) ) { 86 | return $response; 87 | } 88 | $response_code = (int) wp_remote_retrieve_response_code( $response ); 89 | $content_type = wp_remote_retrieve_header( $response, 'content-type' ); 90 | if ( in_array( $response_code, array( 404, 403, 415 ), true ) ) { 91 | $args['user-agent'] = $user_agent; 92 | $response = wp_safe_remote_get( $url, $args ); 93 | $response_code = wp_remote_retrieve_response_code( $response ); 94 | if ( in_array( $response_code, array( 404, 403, 415 ), true ) ) { 95 | return new WP_Error( 'source_error', 'Unable to Retrieve' ); 96 | } 97 | } 98 | 99 | // Strip any character set off the content type 100 | $ct = explode( ';', $content_type ); 101 | if ( is_array( $ct ) ) { 102 | $content_type = array_shift( $ct ); 103 | } 104 | $content_type = trim( $content_type ); 105 | // List of content types we know how to 
handle 106 | if ( 'application/json' !== $content_type ) { 107 | return new WP_Error( 'content-type', 'Retrieved incorrect page', array( 'content-type' => $content_type ) ); 108 | } 109 | 110 | $content = wp_remote_retrieve_body( $response ); 111 | if ( $cache ) { 112 | set_transient( $key, $content, WEEK_IN_SECONDS ); 113 | } 114 | 115 | $content = json_decode( $content, true ); 116 | 117 | if ( wp_remote_retrieve_header( $response, 'x-wp-total' ) ) { 118 | $return = array(); 119 | $return['_total'] = wp_remote_retrieve_header( $response, 'x-wp-total' ); 120 | $return['_pages'] = wp_remote_retrieve_header( $response, 'x-wp-totalpages' ); 121 | $return['items'] = $content; 122 | return $return; 123 | } else { 124 | return $content; 125 | } 126 | return false; 127 | 128 | } 129 | 130 | public static function parse( $content, $rest_url, $args ) { 131 | if ( is_wp_error( $content ) ) { 132 | return $content; 133 | } 134 | if ( array_key_exists( 'id', $content ) ) { 135 | return self::get_post( $content, $rest_url ); 136 | // This is the REST URL itself if it has this. 137 | } elseif ( array_key_exists( 'namespaces', $content ) ) { 138 | // Return site data if single otherwise feed data. 
139 | if ( 'single' === $args['return'] ) { 140 | $return = array( 141 | 'type' => 'card', 142 | ); 143 | $timezone = self::timezone( $content ); 144 | $return['tz'] = $timezone->getName(); 145 | if ( array_key_exists( '_embedded', $content ) ) { 146 | if ( array_key_exists( 'wp:featuredmedia', $content['_embedded'] ) ) { 147 | $photo = array(); 148 | foreach ( $content['_embedded']['wp:featuredmedia'] as $media ) { 149 | $photo[] = ifset( $media['source_url'] ); 150 | } 151 | $photo = array_unique( $photo ); 152 | if ( 1 === count( $photo ) ) { 153 | $return['photo'] = array_pop( $photo ); 154 | } else { 155 | $return['photo'] = array_filter( $photo ); 156 | } 157 | } 158 | } 159 | $return['url'] = $content['url']; 160 | $return['name'] = $content['name']; 161 | $return['note'] = $content['description']; 162 | return $return; 163 | } else { 164 | $content = self::fetch( $rest_url, '/wp/v2/posts?_embed=1' ); 165 | 166 | $content = self::posts_to_feed( $content, $rest_url ); 167 | return $content; 168 | } 169 | } 170 | return false; 171 | } 172 | 173 | public static function get_author( $item ) { 174 | if ( ! array_key_exists( '_embedded', $item ) ) { 175 | return null; 176 | } 177 | $author = $item['_embedded']['author'][0]; 178 | if ( array_key_exists( 'code', $author ) ) { 179 | return null; 180 | } 181 | $avatar_urls = self::ifset( 'avatar_urls', $author ); 182 | $avatar_urls = is_array( $avatar_urls ) ? end( $avatar_urls ) : null; 183 | $return = array( 184 | 'type' => 'card', 185 | 'name' => self::ifset( 'name', $author ), 186 | 'url' => self::ifset( 'url', $author ), 187 | 'note' => self::ifset( 'description', $author ), 188 | 'photo' => $avatar_urls, 189 | 'me' => self::ifset( 'me', $author ), 190 | ); 191 | return array_filter( $return ); 192 | } 193 | 194 | public static function format_author( $json ) { 195 | $avatar_urls = self::ifset( 'avatar_urls', $json ); 196 | $avatar_urls = is_array( $avatar_urls ) ? 
end( $avatar_urls ) : null; 197 | $return = array( 198 | 'type' => 'card', 199 | 'name' => self::ifset( 'name', $json ), 200 | 'url' => self::ifset( 'url', $json ), 201 | 'note' => self::ifset( 'description', $json ), 202 | 'photo' => $avatar_urls, 203 | ); 204 | return $return; 205 | } 206 | 207 | public static function get_datetime( $time, $timezone = null ) { 208 | $datetime = new DateTime( $time ); 209 | if ( 'UTC' === $datetime->getTimeZone()->getName() ) { 210 | $datetime = new DateTime( $time, $timezone ); 211 | } 212 | return $datetime->format( DATE_W3C ); 213 | } 214 | 215 | public static function site_data( $rest_url ) { 216 | $fetch = self::fetch( $rest_url, '', true ); 217 | return wp_array_slice_assoc( $fetch, array( 'name', 'url', 'timezone_string', 'gmt_offset', 'description' ) ); 218 | } 219 | 220 | public static function timezone( $fetch ) { 221 | $timezone_string = self::ifset( 'timezone_string', $fetch ); 222 | if ( $timezone_string ) { 223 | return new DateTimeZone( $timezone_string ); 224 | } 225 | 226 | $offset = (float) self::ifset( 'gmt_offset', $fetch ); 227 | $hours = (int) $offset; 228 | $minutes = ( $offset - $hours ); 229 | 230 | $sign = ( $offset < 0 ) ? 
'-' : '+'; 231 | $abs_hour = abs( $hours ); 232 | $abs_mins = abs( $minutes * 60 ); 233 | $tz_offset = sprintf( '%s%02d:%02d', $sign, $abs_hour, $abs_mins ); 234 | return new DateTimeZone( $tz_offset ); 235 | } 236 | 237 | public static function get_post( $item, $rest_url ) { 238 | $site_data = self::site_data( $rest_url ); 239 | $author = self::get_rest_path( $rest_url, $item['_links']['author'][0]['href'] ); 240 | $timezone = self::timezone( $site_data ); 241 | $newitem = array_filter( 242 | array( 243 | 'uid' => self::get_rendered( 'guid', $item ), 244 | 'url' => self::ifset( 'link', $item ), 245 | 'name' => self::get_rendered( 'title', $item ), 246 | 'content' => array_filter( 247 | array( 248 | 'html' => Parse_This::clean_content( self::get_rendered( 'content', $item ) ), 249 | 'text' => wp_strip_all_tags( self::get_rendered( 'content', $item ) ), 250 | ) 251 | ), 252 | 'summary' => self::get_rendered( 'excerpt', $item ), 253 | 'published' => self::get_datetime( self::ifset( 'date', $item ), $timezone ), 254 | 'updated' => self::get_datetime( self::ifset( 'modified', $item ), $timezone ), 255 | 'kind' => self::ifset( 'kind', $item ), 256 | ) 257 | ); 258 | 259 | if ( array_key_exists( '_embedded', $item ) ) { 260 | if ( array_key_exists( 'featured_media', $item ) && 0 !== $item['featured_media'] ) { 261 | $newitem['featured'] = $item['_embedded']['wp:featuredmedia'][0]['source_url']; 262 | } 263 | if ( array_key_exists( 'tags', $item ) && ! 
empty( $item['tags'] ) ) { 264 | foreach ( $item['_links']['wp:term'] as $term ) { 265 | if ( 'post_tag' === $term['taxonomy'] ) { 266 | $tag_path = self::get_rest_path( $rest_url, $term['href'] ); 267 | $tags = self::fetch( $rest_url, $tag_path ); 268 | $newitem['category'] = wp_list_pluck( $tags['items'], 'name' ); 269 | } 270 | } 271 | } 272 | $newitem['author'] = self::get_author( $item ); 273 | } 274 | return array_filter( $newitem ); 275 | } 276 | 277 | public static function posts_to_feed( $input, $url ) { 278 | $return = array_filter( 279 | array( 280 | 'type' => 'feed', 281 | '_feed_type' => 'wordpress', 282 | ) 283 | ); 284 | $items = $input['items']; 285 | $data = self::site_data( $url ); 286 | $timezone = self::timezone( $data ); 287 | $return['items'] = array(); 288 | $return['name'] = self::ifset( 'name', $data ); 289 | $return['summary'] = self::ifset( 'description', $data ); 290 | $return['url'] = self::ifset( 'url', $data ); 291 | foreach ( $items as $item ) { 292 | $newitem = array_filter( 293 | array( 294 | 'uid' => self::get_rendered( 'guid', $item ), 295 | 'url' => self::ifset( 'link', $item ), 296 | 'name' => self::get_rendered( 'title', $item ), 297 | 'content' => array_filter( 298 | array( 299 | 'html' => Parse_This::clean_content( self::get_rendered( 'content', $item ) ), 300 | 'text' => wp_strip_all_tags( self::get_rendered( 'content', $item ) ), 301 | ) 302 | ), 303 | 'summary' => self::get_rendered( 'excerpt', $item ), 304 | 'published' => self::get_datetime( self::ifset( 'date', $item ), $timezone ), 305 | 'updated' => self::get_datetime( self::ifset( 'modified', $item ), $timezone ), 306 | 'author' => self::get_author( $item ), 307 | 'kind' => self::ifset( 'kind', $item ), 308 | ) 309 | ); 310 | if ( array_key_exists( '_embedded', $item ) ) { 311 | if ( array_key_exists( 'wp:term', $item['_embedded'] ) ) { 312 | $category = array(); 313 | foreach ( $item['_embedded']['wp:term'] as $terms ) { 314 | foreach ( $terms as $term ) { 315 | if 
( in_array( $term['taxonomy'], array( 'category', 'post_tags' ), true ) && 'Uncategorized' !== $term['name'] ) { 316 | $category[] = $term['name']; 317 | } 318 | } 319 | } 320 | $newitem['category'] = $category; 321 | } 322 | if ( array_key_exists( 'wp:featuredmedia', $item['_embedded'] ) ) { 323 | $newitem['featured'] = $item['_embedded']['wp:featuredmedia'][0]['source_url']; 324 | } 325 | } 326 | if ( WP_DEBUG ) { 327 | $newitem['_rest'] = $item; 328 | } 329 | $return['items'][] = array_filter( $newitem ); 330 | } 331 | if ( array_key_exists( '_pages', $input ) ) { 332 | $return['_pages'] = $input['_pages']; 333 | $return['_total'] = $input['_total']; 334 | } 335 | return $return; 336 | } 337 | } 338 | 339 | 340 | 341 | -------------------------------------------------------------------------------- /includes/class-parse-this-mf2-utils.php: -------------------------------------------------------------------------------- 1 | $textcontent, 94 | ); 95 | if ( $htmlcontent && $textcontent !== $htmlcontent ) { 96 | $data['html'] = $htmlcontent; 97 | } 98 | return $data; 99 | } 100 | 101 | /** 102 | * Verifies if $p is an array without numeric keys and has key 'value' and 'html' set. 103 | * 104 | * @param $p 105 | * @return bool 106 | */ 107 | public static function is_embedded_html( $p ) { 108 | return is_array( $p ) && ! wp_is_numeric_array( $p ) && isset( $p['value'] ) && isset( $p['html'] ); 109 | } 110 | 111 | /** 112 | * Verifies if $p is an array without numeric keys and has key 'value' and 'alt' set. 113 | * 114 | * @param $p 115 | * @return bool 116 | */ 117 | public static function is_embedded_img( $p ) { 118 | return is_array( $p ) && ! wp_is_numeric_array( $p ) && isset( $p['value'] ) && isset( $p['alt'] ); 119 | } 120 | 121 | /** 122 | * Verifies if property named $propname is in array $mf. 123 | * 124 | * @param array $mf 125 | * @param $propname 126 | * @return bool 127 | */ 128 | public static function has_prop( array $mf, $propname ) { 129 | return ! 
empty( $mf['properties'][ $propname ] ) && is_array( $mf['properties'][ $propname ] ); 130 | } 131 | 132 | 133 | /** 134 | * Verifies if rel named $relname is in array $mf. 135 | * 136 | * @param array $mf 137 | * @param $relname 138 | * @return bool 139 | */ 140 | public static function has_rel( array $mf, $relname ) { 141 | return ! empty( $mf['rels'][ $relname ] ) && is_array( $mf['rels'][ $relname ] ); 142 | } 143 | 144 | /** 145 | * Returns rel property $relname in array $mf. 146 | * 147 | * @param array $mf 148 | * @param $relname 149 | * @return mixed 150 | */ 151 | public static function get_rel( array $mf, $relname ) { 152 | if ( self::has_rel( $mf, $relname ) ) { 153 | return $mf['rels'][ $relname ]; 154 | } 155 | return false; 156 | } 157 | 158 | /** 159 | * Verifies if rel-url named $url is in array $mf. 160 | * 161 | * @param array $mf 162 | * @param $url 163 | * @return bool 164 | */ 165 | public static function has_rel_urls( array $mf, $url ) { 166 | return ! empty( $mf['rel-urls'][ $url ] ) && is_array( $mf['rel-urls'][ $url ] ); 167 | } 168 | 169 | /** 170 | * Returns rel-url property $url array $mf. 171 | * 172 | * @param array $mf 173 | * @param $url 174 | * @return mixed 175 | */ 176 | public static function get_rel_urls( array $mf, $url ) { 177 | if ( self::has_rel_urls( $mf, $url ) ) { 178 | $return = array( 179 | 'url' => array( $url ), 180 | ); 181 | if ( array_key_exists( 'text', $mf['rel-urls'][ $url ] ) ) { 182 | $return['name'] = array( $mf['rel-urls'][ $url ]['text'] ); 183 | } else { 184 | $return['name'] = array( $mf['rel-urls'][ $url ]['title'] ); 185 | } 186 | return $return; 187 | } 188 | return false; 189 | } 190 | 191 | /** 192 | * shortcut for getPlaintext. 
193 | * 194 | * @deprecated use getPlaintext from now on 195 | * @param array $mf 196 | * @param $propname 197 | * @param null|string $fallback 198 | * @return mixed|null 199 | */ 200 | public static function get_prop( array $mf, $propname, $fallback = null ) { 201 | return self::get_plaintext( $mf, $propname, $fallback ); 202 | } 203 | 204 | /** 205 | * If $v is a microformat or embedded html, return $v['value']. Else return v. 206 | * 207 | * @param $v 208 | * @return mixed 209 | */ 210 | public static function to_plaintext( $v ) { 211 | if ( self::is_microformat( $v ) || self::is_embedded_html( $v ) || self::is_embedded_img( $v ) ) { 212 | return $v['value']; 213 | } elseif ( is_array( $v ) && isset( $v['text'] ) ) { 214 | return $v['text']; 215 | } 216 | return $v; 217 | } 218 | 219 | /** 220 | * Returns plaintext of $propname with optional $fallback 221 | * 222 | * @param array $mf 223 | * @param $propname 224 | * @param null|string $fallback 225 | * @return mixed|null 226 | * @link http://php.net/manual/en/function.current.php 227 | */ 228 | public static function get_plaintext( array $mf, $propname, $fallback = null ) { 229 | if ( ! empty( $mf['properties'][ $propname ] ) && is_array( $mf['properties'][ $propname ] ) ) { 230 | return self::to_plaintext( current( $mf['properties'][ $propname ] ) ); 231 | } 232 | return $fallback; 233 | } 234 | 235 | /** 236 | * Converts $propname in $mf into array_map plaintext, or $fallback if not valid. 237 | * 238 | * @param array $mf 239 | * @param $propname 240 | * @param null|string $fallback 241 | * @return null 242 | */ 243 | public static function get_plaintext_array( array $mf, $propname, $fallback = null ) { 244 | if ( ! 
empty( $mf['properties'][ $propname ] ) && is_array( $mf['properties'][ $propname ] ) ) { 245 | return array_map( array( static::class, 'to_plaintext' ), $mf['properties'][ $propname ] ); } 246 | return $fallback; 247 | } 248 | 249 | /** 250 | * Returns ['html'] element of $v, or ['value'] or just $v, in order of availablility. 251 | * 252 | * @param $v 253 | * @return mixed 254 | */ 255 | public static function to_html( $v ) { 256 | if ( self::is_embedded_html( $v ) ) { 257 | return $v['html']; } elseif ( self::is_microformat( $v ) ) { 258 | return htmlspecialchars( $v['value'] ); } 259 | return htmlspecialchars( $v ); 260 | } 261 | 262 | /** 263 | * Gets HTML of $propname or if not, $fallback 264 | * 265 | * @param array $mf 266 | * @param $propname 267 | * @param null|string $fallback 268 | * @return mixed|null 269 | */ 270 | public static function get_html( array $mf, $propname, $fallback = null ) { 271 | if ( ! empty( $mf['properties'][ $propname ] ) && is_array( $mf['properties'][ $propname ] ) ) { 272 | return self::to_html( current( $mf['properties'][ $propname ] ) ); } 273 | return $fallback; 274 | } 275 | 276 | 277 | 278 | /** 279 | * Returns 'summary' element of $mf or a truncated Plaintext of $mf['properties']['content'] with 19 chars and ellipsis. 280 | * 281 | * @deprecated as not often used 282 | * @param array $mf 283 | * @param array $content 284 | * @return mixed|null|string 285 | */ 286 | public static function get_summary( array $mf, $content = null ) { 287 | if ( self::has_prop( $mf, 'summary' ) ) { 288 | return self::get_prop( $mf, 'summary' ); 289 | } 290 | if ( ! 
$content ) { 291 | $content = self::parse_html_value( $mf, 'content' ); 292 | } 293 | if ( is_array( $content ) && array_key_exists( 'text', $content ) ) { 294 | $summary = substr( $content['text'], 0, 300 ); 295 | if ( 300 < strlen( $content['text'] ) ) { 296 | $summary .= '...'; 297 | } 298 | return $summary; 299 | } 300 | return ''; 301 | } 302 | 303 | 304 | /** 305 | * Gets the date published of $mf array. 306 | * 307 | * @param array $mf 308 | * @param bool $ensurevalid 309 | * @param null|string $fallback optional result if date not available 310 | * @return mixed|null 311 | */ 312 | public static function get_published( array $mf, $ensurevalid = false, $fallback = null ) { 313 | $date = self::get_datetime_property( 'published', $mf, $ensurevalid, $fallback ); 314 | if ( $date instanceof DateTimeImmutable ) { 315 | return $date->format( DATE_W3C ); 316 | } 317 | return null; 318 | } 319 | 320 | /** 321 | * Gets the date updated of $mf array. 322 | * 323 | * @param array $mf 324 | * @param bool $ensurevalid 325 | * @param null $fallback 326 | * @return mixed|null 327 | */ 328 | public static function get_updated( array $mf, $ensurevalid = false, $fallback = null ) { 329 | $date = self::get_datetime_property( 'updated', $mf, $ensurevalid, $fallback ); 330 | if ( $date instanceof DateTimeImmutable ) { 331 | return $date->format( DATE_W3C ); 332 | } 333 | return null; 334 | } 335 | 336 | /** 337 | * Gets the DateTime properties including published or updated, depending on params. 338 | * 339 | * @param $name string updated or published 340 | * @param array $mf 341 | * @param bool $ensurevalid 342 | * @param null|string $fallback 343 | * @return DateTime|null 344 | */ 345 | public static function get_datetime_property( $name, array $mf, $ensurevalid = false, $fallback = null ) { 346 | $compliment = 'published' === $name ? 
'updated' : 'published'; 347 | if ( self::has_prop( $mf, $name ) ) { 348 | $return = self::get_prop( $mf, $name ); 349 | } elseif ( self::has_prop( $mf, $compliment ) ) { 350 | $return = self::get_prop( $mf, $compliment ); 351 | } else { 352 | return $fallback; 353 | } 354 | if ( ! $ensurevalid ) { 355 | return $return; 356 | } else { 357 | try { 358 | return new DateTimeImmutable( $return ); 359 | } catch ( Exception $e ) { 360 | return $fallback; 361 | } 362 | } 363 | } 364 | 365 | /** 366 | * True if same hostname is parsed on both 367 | * 368 | * @param $u1 string url 369 | * @param $u2 string url 370 | * @return bool 371 | * @link http://php.net/manual/en/function.parse-url.php 372 | */ 373 | public static function same_hostname( $u1, $u2 ) { 374 | return wp_parse_url( $u1, PHP_URL_HOST ) === wp_parse_url( $u2, PHP_URL_HOST ); 375 | } 376 | 377 | /** 378 | * Returns array per parse_url standard with pathname key added. 379 | * 380 | * @param $url 381 | * @return mixed 382 | * @link http://php.net/manual/en/function.parse-url.php 383 | */ 384 | public static function parse_url( $url ) { 385 | $r = wp_parse_url( $url ); 386 | $r['pathname'] = empty( $r['path'] ) ? '/' : $r['path']; 387 | return $r; 388 | } 389 | 390 | 391 | /** 392 | * See if urls match for each component of parsed urls. Return true if so. 
393 | * 394 | * @param $url1 395 | * @param $url2 396 | * @return bool 397 | * @see parseUrl() 398 | */ 399 | public static function urls_match( $url1, $url2 ) { 400 | return ( normalize_url( $url1 ) === normalize_url( $url2 ) ); 401 | } 402 | } 403 | -------------------------------------------------------------------------------- /lib/html5/HTML5/Parser/Scanner.php: -------------------------------------------------------------------------------- 1 | errors = UTF8Utils::checkForIllegalCodepoints($data); 57 | 58 | $data = $this->replaceLinefeeds($data); 59 | 60 | $this->data = $data; 61 | $this->char = 0; 62 | $this->EOF = strlen($data); 63 | } 64 | 65 | /** 66 | * Check if upcomming chars match the given sequence. 67 | * 68 | * This will read the stream for the $sequence. If it's 69 | * found, this will return true. If not, return false. 70 | * Since this unconsumes any chars it reads, the caller 71 | * will still need to read the next sequence, even if 72 | * this returns true. 73 | * 74 | * Example: $this->scanner->sequenceMatches('') will 75 | * see if the input stream is at the start of a 76 | * '' string. 77 | * 78 | * @param string $sequence 79 | * @param bool $caseSensitive 80 | * 81 | * @return bool 82 | */ 83 | public function sequenceMatches($sequence, $caseSensitive = true) 84 | { 85 | $portion = substr($this->data, $this->char, strlen($sequence)); 86 | 87 | return $caseSensitive ? $portion === $sequence : 0 === strcasecmp($portion, $sequence); 88 | } 89 | 90 | /** 91 | * Get the current position. 92 | * 93 | * @return int The current intiger byte position. 94 | */ 95 | public function position() 96 | { 97 | return $this->char; 98 | } 99 | 100 | /** 101 | * Take a peek at the next character in the data. 102 | * 103 | * @return string The next character. 
104 | */ 105 | public function peek() 106 | { 107 | if (($this->char + 1) < $this->EOF) { 108 | return $this->data[$this->char + 1]; 109 | } 110 | 111 | return false; 112 | } 113 | 114 | /** 115 | * Get the next character. 116 | * Note: This advances the pointer. 117 | * 118 | * @return string The next character. 119 | */ 120 | public function next() 121 | { 122 | ++$this->char; 123 | 124 | if ($this->char < $this->EOF) { 125 | return $this->data[$this->char]; 126 | } 127 | 128 | return false; 129 | } 130 | 131 | /** 132 | * Get the current character. 133 | * Note, this does not advance the pointer. 134 | * 135 | * @return string The current character. 136 | */ 137 | public function current() 138 | { 139 | if ($this->char < $this->EOF) { 140 | return $this->data[$this->char]; 141 | } 142 | 143 | return false; 144 | } 145 | 146 | /** 147 | * Silently consume N chars. 148 | * 149 | * @param int $count 150 | */ 151 | public function consume($count = 1) 152 | { 153 | $this->char += $count; 154 | } 155 | 156 | /** 157 | * Unconsume some of the data. 158 | * This moves the data pointer backwards. 159 | * 160 | * @param int $howMany The number of characters to move the pointer back. 161 | */ 162 | public function unconsume($howMany = 1) 163 | { 164 | if (($this->char - $howMany) >= 0) { 165 | $this->char -= $howMany; 166 | } 167 | } 168 | 169 | /** 170 | * Get the next group of that contains hex characters. 171 | * Note, along with getting the characters the pointer in the data will be 172 | * moved as well. 173 | * 174 | * @return string The next group that is hex characters. 175 | */ 176 | public function getHex() 177 | { 178 | return $this->doCharsWhile(static::CHARS_HEX); 179 | } 180 | 181 | /** 182 | * Get the next group of characters that are ASCII Alpha characters. 183 | * Note, along with getting the characters the pointer in the data will be 184 | * moved as well. 185 | * 186 | * @return string The next group of ASCII alpha characters. 
187 | */ 188 | public function getAsciiAlpha() 189 | { 190 | return $this->doCharsWhile(static::CHARS_ALPHA); 191 | } 192 | 193 | /** 194 | * Get the next group of characters that are ASCII Alpha characters and numbers. 195 | * Note, along with getting the characters the pointer in the data will be 196 | * moved as well. 197 | * 198 | * @return string The next group of ASCII alpha characters and numbers. 199 | */ 200 | public function getAsciiAlphaNum() 201 | { 202 | return $this->doCharsWhile(static::CHARS_ALNUM); 203 | } 204 | 205 | /** 206 | * Get the next group of numbers. 207 | * Note, along with getting the characters the pointer in the data will be 208 | * moved as well. 209 | * 210 | * @return string The next group of numbers. 211 | */ 212 | public function getNumeric() 213 | { 214 | return $this->doCharsWhile('0123456789'); 215 | } 216 | 217 | /** 218 | * Consume whitespace. 219 | * Whitespace in HTML5 is: formfeed, tab, newline, space. 220 | * 221 | * @return int The length of the matched whitespaces. 222 | */ 223 | public function whitespace() 224 | { 225 | if ($this->char >= $this->EOF) { 226 | return false; 227 | } 228 | 229 | $len = strspn($this->data, "\n\t\f ", $this->char); 230 | 231 | $this->char += $len; 232 | 233 | return $len; 234 | } 235 | 236 | /** 237 | * Returns the current line that is being consumed. 238 | * 239 | * @return int The current line number. 240 | */ 241 | public function currentLine() 242 | { 243 | if (empty($this->EOF) || 0 === $this->char) { 244 | return 1; 245 | } 246 | 247 | // Add one to $this->char because we want the number for the next 248 | // byte to be processed. 249 | return substr_count($this->data, "\n", 0, min($this->char, $this->EOF)) + 1; 250 | } 251 | 252 | /** 253 | * Read chars until something in the mask is encountered. 
254 | * 255 | * @param string $mask 256 | * 257 | * @return mixed 258 | */ 259 | public function charsUntil($mask) 260 | { 261 | return $this->doCharsUntil($mask); 262 | } 263 | 264 | /** 265 | * Read chars as long as the mask matches. 266 | * 267 | * @param string $mask 268 | * 269 | * @return int 270 | */ 271 | public function charsWhile($mask) 272 | { 273 | return $this->doCharsWhile($mask); 274 | } 275 | 276 | /** 277 | * Returns the current column of the current line that the tokenizer is at. 278 | * 279 | * Newlines are column 0. The first char after a newline is column 1. 280 | * 281 | * @return int The column number. 282 | */ 283 | public function columnOffset() 284 | { 285 | // Short circuit for the first char. 286 | if (0 === $this->char) { 287 | return 0; 288 | } 289 | 290 | // strrpos is weird, and the offset needs to be negative for what we 291 | // want (i.e., the last \n before $this->char). This needs to not have 292 | // one (to make it point to the next character, the one we want the 293 | // position of) added to it because strrpos's behaviour includes the 294 | // final offset byte. 295 | $backwardFrom = $this->char - 1 - strlen($this->data); 296 | $lastLine = strrpos($this->data, "\n", $backwardFrom); 297 | 298 | // However, for here we want the length up until the next byte to be 299 | // processed, so add one to the current byte ($this->char). 300 | if (false !== $lastLine) { 301 | $findLengthOf = substr($this->data, $lastLine + 1, $this->char - 1 - $lastLine); 302 | } else { 303 | // After a newline. 304 | $findLengthOf = substr($this->data, 0, $this->char); 305 | } 306 | 307 | return UTF8Utils::countChars($findLengthOf); 308 | } 309 | 310 | /** 311 | * Get all characters until EOF. 312 | * 313 | * This consumes characters until the EOF. 314 | * 315 | * @return int The number of characters remaining. 
316 | */ 317 | public function remainingChars() 318 | { 319 | if ($this->char < $this->EOF) { 320 | $data = substr($this->data, $this->char); 321 | $this->char = $this->EOF; 322 | 323 | return $data; 324 | } 325 | 326 | return ''; // false; 327 | } 328 | 329 | /** 330 | * Replace linefeed characters according to the spec. 331 | * 332 | * @param $data 333 | * 334 | * @return string 335 | */ 336 | private function replaceLinefeeds($data) 337 | { 338 | /* 339 | * U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED (LF) characters are treated specially. 340 | * Any CR characters that are followed by LF characters must be removed, and any CR characters not 341 | * followed by LF characters must be converted to LF characters. Thus, newlines in HTML DOMs are 342 | * represented by LF characters, and there are never any CR characters in the input to the tokenization 343 | * stage. 344 | */ 345 | $crlfTable = array( 346 | "\0" => "\xEF\xBF\xBD", 347 | "\r\n" => "\n", 348 | "\r" => "\n", 349 | ); 350 | 351 | return strtr($data, $crlfTable); 352 | } 353 | 354 | /** 355 | * Read to a particular match (or until $max bytes are consumed). 356 | * 357 | * This operates on byte sequences, not characters. 358 | * 359 | * Matches as far as possible until we reach a certain set of bytes 360 | * and returns the matched substring. 361 | * 362 | * @param string $bytes Bytes to match. 363 | * @param int $max Maximum number of bytes to scan. 364 | * 365 | * @return mixed Index or false if no match is found. You should use strong 366 | * equality when checking the result, since index could be 0. 
367 | */ 368 | private function doCharsUntil($bytes, $max = null) 369 | { 370 | if ($this->char >= $this->EOF) { 371 | return false; 372 | } 373 | 374 | if (0 === $max || $max) { 375 | $len = strcspn($this->data, $bytes, $this->char, $max); 376 | } else { 377 | $len = strcspn($this->data, $bytes, $this->char); 378 | } 379 | 380 | $string = (string) substr($this->data, $this->char, $len); 381 | $this->char += $len; 382 | 383 | return $string; 384 | } 385 | 386 | /** 387 | * Returns the string so long as $bytes matches. 388 | * 389 | * Matches as far as possible with a certain set of bytes 390 | * and returns the matched substring. 391 | * 392 | * @param string $bytes A mask of bytes to match. If ANY byte in this mask matches the 393 | * current char, the pointer advances and the char is part of the 394 | * substring. 395 | * @param int $max The max number of chars to read. 396 | * 397 | * @return string 398 | */ 399 | private function doCharsWhile($bytes, $max = null) 400 | { 401 | if ($this->char >= $this->EOF) { 402 | return false; 403 | } 404 | 405 | if (0 === $max || $max) { 406 | $len = strspn($this->data, $bytes, $this->char, $max); 407 | } else { 408 | $len = strspn($this->data, $bytes, $this->char); 409 | } 410 | 411 | $string = (string) substr($this->data, $this->char, $len); 412 | $this->char += $len; 413 | 414 | return $string; 415 | } 416 | } 417 | -------------------------------------------------------------------------------- /includes/class-parse-this-html.php: -------------------------------------------------------------------------------- 1 | query( '//meta[(@name or @property or @itemprop) and @content]' ) as $tag ) { 24 | $meta_name = self::limit_string( $tag->getAttribute( 'property' ) ); 25 | if ( ! $meta_name ) { 26 | $meta_name = self::limit_string( $tag->getAttribute( 'name' ) ); 27 | } 28 | if ( ! 
$meta_name ) { 29 | $meta_name = self::limit_string( $tag->getAttribute( 'itemprop' ) ); 30 | } 31 | $meta_value = $tag->getAttribute( 'content' ); 32 | 33 | // Sanity check. $key is usually things like 'title', 'description', 'keywords', etc. 34 | if ( strlen( $meta_name ) > 200 ) { 35 | continue; 36 | } 37 | // Decode known JSON encoded properties 38 | if ( 'parsely-metadata' === $meta_name ) { 39 | $json = json_decode( $meta_value, true ); 40 | if ( is_array( $json ) ) { 41 | $meta_value = $json; 42 | } 43 | } 44 | 45 | // Parsely-page is deprecated but convert it to the new parsely format. 46 | if ( 'parsely-page' === $meta_name ) { 47 | $json = json_decode( $meta_value, true ); 48 | if ( is_array( $json ) ) { 49 | foreach ( $json as $key => $value ) { 50 | $key = str_replace( '_', '-', $key ); 51 | $meta = self::set( $meta, 'parsely-' . $key, $value ); 52 | } 53 | continue; 54 | } 55 | } 56 | $meta = self::set( $meta, $meta_name, $meta_value ); 57 | } 58 | 59 | $meta['title'] = trim( $xpath->query( '//title' )->item( 0 )->textContent ); 60 | $meta = self::parse_meta( $meta ); 61 | if ( isset( $meta['og'] ) ) { 62 | $meta['og'] = self::parse_meta( $meta['og'] ); 63 | } 64 | $jf2 = self::meta_to_jf2( $meta ); 65 | 66 | if ( ! isset( $jf2['video'] ) ) { 67 | // Fetch and gather