├── .editorconfig ├── .styleci.yml ├── .travis.yml ├── README.md ├── bin └── chrome.js ├── composer.json └── src ├── ChromeHeadless.php └── Exceptions └── ChromeException.php /.editorconfig: -------------------------------------------------------------------------------- 1 | ; This file is for unifying the coding style for different editors and IDEs. 2 | ; More information at http://editorconfig.org 3 | 4 | root = true 5 | 6 | [*] 7 | charset = utf-8 8 | indent_size = 4 9 | indent_style = space 10 | end_of_line = lf 11 | insert_final_newline = true 12 | trim_trailing_whitespace = true 13 | 14 | [*.md] 15 | trim_trailing_whitespace = false -------------------------------------------------------------------------------- /.styleci.yml: -------------------------------------------------------------------------------- 1 | preset: laravel -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | dist: trusty 3 | addons: 4 | chrome: stable 5 | 6 | language: php 7 | 8 | php: 9 | - 7.1 10 | - 7.2 11 | 12 | before_script: 13 | - . $HOME/.nvm/nvm.sh 14 | - nvm install stable 15 | - nvm use stable 16 | - npm install puppeteer 17 | - travis_retry composer self-update 18 | - travis_retry composer update --no-interaction --prefer-dist 19 | 20 | script: 21 | - composer test -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A Chrome Headless wrapper for PHP 2 | [![Build Status](https://img.shields.io/travis/helloiamlukas/chrome-php/master.svg?style=flat-square)](https://travis-ci.org/helloiamlukas/chrome-php) [![StyleCI](https://styleci.io/repos/128383656/shield?branch=master)](https://styleci.io/repos/128383656) 3 | 4 | Get the DOM of any webpage by using headless Chrome. 
Inspired by [Browsershot](https://github.com/spatie/browsershot). 5 | 6 | ## Requirements 7 | 8 | This package requires the [Puppeteer Chrome Headless Node library](https://github.com/GoogleChrome/puppeteer). 9 | If you want to install it on Ubuntu 16.04, you can do it like this: 10 | ```bash 11 | sudo apt-get update 12 | curl -sL https://deb.nodesource.com/setup_8.x | sudo -E bash - 13 | sudo apt-get install -y nodejs gconf-service libasound2 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgcc1 libgconf-2-4 libgdk-pixbuf2.0-0 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release xdg-utils wget 14 | sudo npm install --global --unsafe-perm puppeteer 15 | sudo chmod -R o+rx /usr/lib/node_modules/puppeteer/.local-chromium 16 | ``` 17 | ## Installation 18 | 19 | To add this package to your project, you can install it via Composer by running: 20 | 21 | ```bash 22 | composer require helloiamlukas/chrome-php 23 | ``` 24 | 25 | ## Usage 26 | 27 | Here is a quick example of how to use this package: 28 | 29 | ```php 30 | use ChromeHeadless\ChromeHeadless; 31 | 32 | $html = ChromeHeadless::url('https://example.com')->getHtml(); 33 | ``` 34 | 35 | Instead of getting the DOM as a string, you can also use the `getDOMCrawler()` method, which will return a `Symfony\Component\DomCrawler\Crawler` instance. 36 | 37 | ```php 38 | use ChromeHeadless\ChromeHeadless; 39 | 40 | $dom = ChromeHeadless::url('https://example.com')->getDOMCrawler(); 41 | 42 | $title = $dom->filter('title')->text(); 43 | ``` 44 | 45 | This makes it easy to filter the DOM for specific elements. Check the full documentation [here](https://symfony.com/doc/current/components/dom_crawler.html). 
46 | 47 | ### Timeout 48 | 49 | You can specify a timeout after which the process will be killed. The timeout should be given in seconds. 50 | 51 | ```php 52 | ChromeHeadless::url('https://example.com') 53 | ->setTimeout(10) 54 | ->getDOMCrawler(); 55 | ``` 56 | 57 | If the process runs out of time, a `Symfony\Component\Process\Exception\ProcessTimedOutException` will be thrown. 58 | 59 | ### Custom Chrome Path 60 | 61 | You can specify a custom path to your Chrome installation. 62 | 63 | ```php 64 | ChromeHeadless::url('https://example.com') 65 | ->setChromePath('/path/to/chrome') 66 | ->getDOMCrawler(); 67 | ``` 68 | 69 | ### Custom User Agent 70 | 71 | You can specify a custom user agent. By default, the standard Chrome Headless user agent will be used. 72 | 73 | ```php 74 | ChromeHeadless::url('https://example.com') 75 | ->setUserAgent('nice-user-agent') 76 | ->getDOMCrawler(); 77 | ``` 78 | 79 | ### Custom Headers 80 | 81 | You can specify custom headers which will be used for the request. 82 | 83 | ```php 84 | ChromeHeadless::url('https://example.com') 85 | ->setHeaders([ 86 | 'DNT' => 1 // DO NOT TRACK 87 | ]) 88 | ->getDOMCrawler(); 89 | ``` 90 | 91 | ### Blacklist 92 | 93 | You can specify a list of regular expressions for files that should not be loaded when you request a website. These expressions will be checked against the URL of the file. 94 | 95 | ```php 96 | ChromeHeadless::url('https://example.com') 97 | ->setBlacklist([ 98 | 'www.google-analytics.com', 99 | 'analytics.js' 100 | ]) 101 | ->getDOMCrawler(); 102 | ``` 103 | 104 | ### Viewport 105 | 106 | You can specify a custom viewport that will be used when you make a request. By default, the Chrome Headless standard of 800x600px will be used. 
107 | 108 | ```php 109 | ChromeHeadless::url('https://example.com') 110 | ->setViewport([ 111 | 'width' => 1920, 112 | 'height' => 1080 113 | ]) 114 | ->getDOMCrawler(); 115 | ``` 116 | 117 | ## Testing 118 | 119 | You can run the tests by using 120 | 121 | ```bash 122 | composer test 123 | ``` 124 | -------------------------------------------------------------------------------- /bin/chrome.js: -------------------------------------------------------------------------------- 1 | const puppeteer = require('puppeteer'); 2 | 3 | const options = JSON.parse(process.argv[2]); 4 | 5 | const runChrome = async () => { 6 | let browser; 7 | let page; 8 | 9 | try { 10 | browser = await puppeteer.launch({ 11 | ignoreHTTPSErrors: true, 12 | executablePath: options.path 13 | }); 14 | 15 | page = await browser.newPage(); 16 | 17 | if (options.userAgent) 18 | await page.setUserAgent(options.userAgent); 19 | 20 | if (options.viewport) 21 | await page.setViewport(options.viewport); 22 | 23 | if (options.headers) 24 | await page.setExtraHTTPHeaders(options.headers); 25 | 26 | if (options.blacklist && options.blacklist.length) { 27 | await page.setRequestInterception(true); 28 | page.on('request', req => { 29 | const block = ['www.google-analytics.com', '/gtag/js', 'ga.js', 'analytics.js']; 30 | if (block.find(regex => req.url().match(regex))) return req.abort(); 31 | req.continue(); 32 | }); 33 | } 34 | 35 | 36 | /* 37 | The following code is taken from 38 | https://github.com/intoli/intoli-article-materials/tree/master/articles/not-possible-to-block-chrome-headless 39 | */ 40 | await page.evaluateOnNewDocument(() => { 41 | // Pass the Chrome Test. 
42 | window.navigator.chrome = JSON.parse('{"app":{"isInstalled":false},"webstore":{"onInstallStageChanged":{},"onDownloadProgress":{}},"runtime":{"PlatformOs":{"MAC":"mac","WIN":"win","ANDROID":"android","CROS":"cros","LINUX":"linux","OPENBSD":"openbsd"},"PlatformArch":{"ARM":"arm","X86_32":"x86-32","X86_64":"x86-64"},"PlatformNaclArch":{"ARM":"arm","X86_32":"x86-32","X86_64":"x86-64"},"RequestUpdateCheckStatus":{"THROTTLED":"throttled","NO_UPDATE":"no_update","UPDATE_AVAILABLE":"update_available"},"OnInstalledReason":{"INSTALL":"install","UPDATE":"update","CHROME_UPDATE":"chrome_update","SHARED_MODULE_UPDATE":"shared_module_update"},"OnRestartRequiredReason":{"APP_UPDATE":"app_update","OS_UPDATE":"os_update","PERIODIC":"periodic"}}}'); 43 | 44 | // Pass the Permissions Test. 45 | const originalQuery = window.navigator.permissions.query; 46 | window.navigator.permissions.query = (parameters) => ( 47 | parameters.name === 'notifications' ? 48 | Promise.resolve({state: Notification.permission}) : 49 | originalQuery(parameters) 50 | ); 51 | 52 | // Pass the Webdriver Test. 53 | Object.defineProperty(navigator, 'webdriver', { 54 | get: () => false, 55 | }); 56 | 57 | // Pass the Plugins Length Test. 58 | // Overwrite the `plugins` property to use a custom getter. 59 | Object.defineProperty(navigator, 'plugins', { 60 | // This just needs to have `length > 0` for the current test, 61 | // but we could mock the plugins too if necessary. 62 | get: () => [1, 2, 3, 4, 5], 63 | }); 64 | 65 | // Pass the Languages Test. 66 | // Overwrite the `plugins` property to use a custom getter. 
67 | Object.defineProperty(navigator, 'languages', { 68 | get: () => ['en-US', 'en'], 69 | }); 70 | }); 71 | /* */ 72 | 73 | const response = await page.goto(options.url, {}); 74 | 75 | if (response.status() >= 400) { 76 | throw new Error('HTTP Response: ' + response.status()); 77 | } 78 | 79 | const output = await page.evaluate(() => document.documentElement.outerHTML); 80 | 81 | console.log(output); 82 | 83 | await browser.close(); 84 | } 85 | catch (exception) { 86 | if (browser) await browser.close(); 87 | console.log(exception); 88 | process.exit(0); 89 | } 90 | } 91 | ; 92 | 93 | runChrome(); -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "helloiamlukas/chrome-php", 3 | "description": "A PHP Wrapper for Chrome Headless. Get the DOM of any webpage.", 4 | "homepage": "https://github.com/helloiamlukas/chrome-php", 5 | "keywords": 6 | [ 7 | "webpage", 8 | "dom", 9 | "chrome", 10 | "headless" 11 | ], 12 | "license": "MIT", 13 | "require": { 14 | "php": "^7.1", 15 | "symfony/css-selector": "^4.0", 16 | "symfony/dom-crawler": "^4.0", 17 | "symfony/process": "^4.0" 18 | }, 19 | "require-dev": { 20 | "phpunit/phpunit": "^6.1|^7.0" 21 | }, 22 | "autoload": { 23 | "psr-4": { 24 | "ChromeHeadless\\": "src" 25 | } 26 | }, 27 | "autoload-dev": { 28 | "psr-4": { 29 | "ChromeHeadless\\Test\\": "tests" 30 | } 31 | }, 32 | "scripts": { 33 | "test": "vendor/bin/phpunit" 34 | }, 35 | "config": { 36 | "sort-packages": true 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/ChromeHeadless.php: -------------------------------------------------------------------------------- 1 | url = $url; 83 | } 84 | 85 | /** 86 | * Set the url of the request and get a new ChromeHeadless instance. 
87 | * 88 | * @param string $url 89 | * @return static 90 | */ 91 | public static function url(string $url) 92 | { 93 | return (new static)->setUrl($url); 94 | } 95 | 96 | /** 97 | * Set the url. 98 | * 99 | * @param string $url 100 | * @return $this 101 | */ 102 | public function setUrl(string $url) 103 | { 104 | $this->url = $url; 105 | 106 | return $this; 107 | } 108 | 109 | /** 110 | * Set the timeout. 111 | * 112 | * @param float $timeout Timeout in seconds. 113 | * @return $this 114 | */ 115 | public function setTimeout(float $timeout) 116 | { 117 | $this->timeout = $timeout; 118 | 119 | return $this; 120 | } 121 | 122 | /** 123 | * Set the content. 124 | * 125 | * @param string $html 126 | * @return $this 127 | * @throws \ChromeHeadless\Exceptions\ChromeException 128 | */ 129 | public function setHtml(string $html) 130 | { 131 | if (strpos($html, 'Error:') === 0) { 132 | throw new ChromeException($this->url, $html); 133 | } 134 | 135 | $this->html = $html; 136 | 137 | return $this; 138 | } 139 | 140 | /** 141 | * Set the chrome path. 142 | * 143 | * @param string $path 144 | * @return $this 145 | */ 146 | public function setChromePath(string $path) 147 | { 148 | $this->chrome_path = $path; 149 | 150 | return $this; 151 | } 152 | 153 | /** 154 | * Set the user agent. 155 | * 156 | * @param string $user_agent 157 | * @return $this 158 | */ 159 | public function setUserAgent(string $user_agent) 160 | { 161 | $this->user_agent = $user_agent; 162 | 163 | return $this; 164 | } 165 | 166 | /** 167 | * Set the viewport. 168 | * 169 | * @param mixed $viewport 170 | * @return $this 171 | */ 172 | public function setViewport($viewport) 173 | { 174 | $this->viewport = $viewport; 175 | 176 | return $this; 177 | } 178 | 179 | /** 180 | * Set additional request headers. 181 | * 182 | * @param mixed $headers 183 | * @return $this 184 | */ 185 | public function setHeaders($headers) 186 | { 187 | $this->headers = $headers; 188 | 189 | return $this; 190 | } 191 | 192 | /** 193 | * Set a blacklist of files that should not be loaded. 
194 | * 195 | * @param array $blacklist 196 | * @return $this 197 | */ 198 | public function setBlacklist(array $blacklist) 199 | { 200 | $this->blacklist = $blacklist; 201 | 202 | return $this; 203 | } 204 | 205 | /** 206 | * Get the DOM of the website as a Crawler instance. 207 | * 208 | * @return Crawler 209 | * @throws \ChromeHeadless\Exceptions\ChromeException 210 | */ 211 | public function getDOMCrawler() 212 | { 213 | $this->makeRequest(); 214 | 215 | $this->dom = new Crawler($this->html); 216 | 217 | return $this->dom; 218 | } 219 | 220 | /** 221 | * Get the DOM of the website as string. 222 | * 223 | * @return string 224 | * @throws \ChromeHeadless\Exceptions\ChromeException 225 | */ 226 | public function getHtml() 227 | { 228 | $this->makeRequest(); 229 | 230 | return $this->html; 231 | } 232 | 233 | /** 234 | * Make the request. 235 | * 236 | * @throws \ChromeHeadless\Exceptions\ChromeException 237 | */ 238 | protected function makeRequest() 239 | { 240 | $command = $this->createCommand(); 241 | 242 | $chrome = new Process($command); 243 | 244 | if (! is_null($this->timeout)) { 245 | $chrome->setTimeout($this->timeout); 246 | } 247 | 248 | $chrome->run(); 249 | 250 | if (! $chrome->isSuccessful()) { 251 | throw new ProcessFailedException($chrome); 252 | } 253 | 254 | $this->setHtml($chrome->getOutput()); 255 | } 256 | 257 | /** 258 | * Generate the command. 259 | * 260 | * @return string 261 | */ 262 | public function createCommand() 263 | { 264 | $options = [ 265 | 'url' => $this->url, 266 | 'path' => $this->chrome_path, 267 | ]; 268 | 269 | if (! empty($this->user_agent)) { 270 | $options['userAgent'] = $this->user_agent; 271 | } 272 | 273 | if (! empty($this->viewport)) { 274 | $options['viewport'] = $this->viewport; 275 | } 276 | 277 | if (! empty($this->headers)) { 278 | $options['headers'] = $this->headers; 279 | } 280 | 281 | if (! 
empty($this->blacklist)) { 282 | $options['blacklist'] = $this->blacklist; 283 | } 284 | 285 | $command = [ 286 | 'NODE_PATH=`npm root -g`', 287 | 'node', 288 | __DIR__.'/../bin/chrome.js', 289 | escapeshellarg(json_encode($options)), 290 | ]; 291 | 292 | return implode(' ', $command); 293 | } 294 | } 295 | -------------------------------------------------------------------------------- /src/Exceptions/ChromeException.php: -------------------------------------------------------------------------------- 1 |