├── LICENSE.md ├── README.md ├── composer.json └── src ├── Cfscrape.php └── CfscrapeRuntimeException.php /LICENSE.md: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | 3 | Copyright (c) 2020 klgd 4 | 5 | > Permission is hereby granted, free of charge, to any person obtaining a copy 6 | > of this software and associated documentation files (the "Software"), to deal 7 | > in the Software without restriction, including without limitation the rights 8 | > to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | > copies of the Software, and to permit persons to whom the Software is 10 | > furnished to do so, subject to the following conditions: 11 | > 12 | > The above copyright notice and this permission notice shall be included in 13 | > all copies or substantial portions of the Software. 14 | > 15 | > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | > IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | > FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | > AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | > LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | > OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | > THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Cloudflare Scrape 2 | 3 | [![Latest Version on Packagist][ico-version]][link-packagist] 4 | [![Software License][ico-license]](LICENSE.md) 5 | [![Build Status][ico-travis]][link-travis] 6 | [![Total Downloads][ico-downloads]][link-downloads] 7 | 8 | A simple PHP module to bypass Cloudflare's anti-bot page (also known as "I'm Under Attack Mode", or IUAM). 9 | 10 | Thanks to [https://github.com/Anorov/cloudflare-scrape](https://github.com/Anorov/cloudflare-scrape) 11 | 12 | 13 | ## 依赖 14 | 15 | php >= 7.2, < 8.0 (`~7.2` in composer.json) 16 | 17 | v8js扩展 https://github.com/phpv8/v8js 18 | 19 | 20 | ## 安装 21 | 22 | ``` bash 23 | $ composer require cfscrape/cfscrape 24 | ``` 25 | 26 | ## 使用 27 | 28 | ### 获取响应 29 | 30 | ``` php 31 | $scraper = \Cfscrape\Cfscrape::createScraper(); 32 | // 延时 33 | $scraper->setDelay(10); 34 | // 自定义UA 35 | $scraper->setUserAgent('custom-ua'); 36 | // \Psr\Http\Message\ResponseInterface 37 | $response = $scraper->get('http://somesite.com'); 38 | ``` 39 | ### 获取Cookie 40 | 41 | ```php 42 | // $cookies is an array 43 | // [ 44 | // 'cf_clearance' => 'c8f913c707b818b47aa328d81cab57c349b1eee5-1426733163-3600', 45 | // '__cfduid' => 'dd8ec03dfdbcb8c2ea63e920f1335c1001426733158' 46 | // ] 47 | [$cookies, $userAgent] = \Cfscrape\Cfscrape::getTokens('http://somesite.com'); 48 | 49 | // $cookies is a string 50 | // cf_clearance=c8f913c707b818b47aa328d81cab57c349b1eee5-1426733163-3600; __cfduid=dd8ec03dfdbcb8c2ea63e920f1335c1001426733158 51 | [$cookies, $userAgent] = \Cfscrape\Cfscrape::getCookieString('http://somesite.com'); 52 | ``` 53 | 54 | ## License 55 | 56 | The MIT License (MIT). Please see [License File](LICENSE.md) for more information. 
57 | 58 | [ico-version]: https://img.shields.io/packagist/v/cfscrape/cfscrape.svg?style=flat-square 59 | [ico-license]: https://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat-square 60 | [ico-travis]: https://img.shields.io/travis/cfscrape/cfscrape/master.svg?style=flat-square 61 | [ico-downloads]: https://img.shields.io/packagist/dt/cfscrape/cfscrape.svg?style=flat-square 62 | 63 | [link-packagist]: https://packagist.org/packages/cfscrape/cfscrape 64 | [link-travis]: https://travis-ci.com/cfscrape/cfscrape 65 | [link-downloads]: https://packagist.org/packages/cfscrape/cfscrape 66 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "cfscrape/cfscrape", 3 | "type": "library", 4 | "description": "A PHP module to bypass Cloudflare's anti-bot page.", 5 | "keywords": [ 6 | "Cloudflare", 7 | "ScoLib", 8 | "cfscrape" 9 | ], 10 | "homepage": "https://github.com/ScoLib/cfscrape", 11 | "license": "MIT", 12 | "authors": [ 13 | { 14 | "name": "klgd", 15 | "email": "slice1213@gmail.com", 16 | "homepage": "https://github.com/klgd", 17 | "role": "Developer" 18 | } 19 | ], 20 | "require": { 21 | "php": "~7.2", 22 | "ext-v8js": "*", 23 | "guzzlehttp/guzzle": "^7.0" 24 | }, 25 | "require-dev": { 26 | "phpunit/phpunit" : ">=8.0", 27 | "squizlabs/php_codesniffer": "^3.0" 28 | }, 29 | "autoload": { 30 | "psr-4": { 31 | "Cfscrape\\": "src" 32 | } 33 | }, 34 | "autoload-dev": { 35 | "psr-4": { 36 | "Tests\\": "tests" 37 | } 38 | }, 39 | "scripts": { 40 | "test": "phpunit", 41 | "check-style": "phpcs -p --standard=PSR2 --runtime-set ignore_errors_on_exit 1 --runtime-set ignore_warnings_on_exit 1 src tests", 42 | "fix-style": "phpcbf -p --standard=PSR2 --runtime-set ignore_errors_on_exit 1 --runtime-set ignore_warnings_on_exit 1 src tests" 43 | }, 44 | "extra": { 45 | "branch-alias": { 46 | "dev-master": "1.0-dev" 47 | } 48 | }, 49 | 
"config": { 50 | "sort-packages": true 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/Cfscrape.php: -------------------------------------------------------------------------------- 1 | headers = $this->getDefaultHeaders(); 63 | } 64 | 65 | /** 66 | * @return \GuzzleHttp\Cookie\CookieJar 67 | */ 68 | public function getCookies(): \GuzzleHttp\Cookie\CookieJar 69 | { 70 | return $this->cookies; 71 | } 72 | 73 | /** 74 | * @param \GuzzleHttp\Cookie\CookieJar $cookies 75 | * @return Cfscrape 76 | */ 77 | public function setCookies(\GuzzleHttp\Cookie\CookieJar $cookies): Cfscrape 78 | { 79 | $this->cookies = $cookies; 80 | 81 | return $this; 82 | } 83 | 84 | /** 85 | * @return mixed 86 | */ 87 | public function getDelay() 88 | { 89 | if ($this->delay) { 90 | return $this->delay; 91 | } 92 | 93 | return; 94 | } 95 | 96 | /** 97 | * @param mixed $delay 98 | * @return Cfscrape 99 | */ 100 | public function setDelay(int $delay) 101 | { 102 | $this->delay = $delay; 103 | 104 | return $this; 105 | } 106 | 107 | /** 108 | * @return mixed 109 | */ 110 | public function getUserAgent() 111 | { 112 | if ($this->headers) { 113 | return $this->headers['User-Agent']; 114 | } 115 | 116 | return; 117 | } 118 | 119 | /** 120 | * @param mixed $userAgent 121 | * @return Cfscrape 122 | */ 123 | public function setUserAgent($userAgent) 124 | { 125 | $this->headers['User-Agent'] = $userAgent; 126 | 127 | return $this; 128 | } 129 | 130 | /** 131 | * @param $url 132 | * @param array $option 133 | * @return mixed|\Psr\Http\Message\ResponseInterface|null 134 | * @throws \GuzzleHttp\Exception\GuzzleException 135 | */ 136 | public function get($url, $option = []) 137 | { 138 | $option['allow_redirects'] = true; 139 | 140 | return $this->request('GET', $url, $option); 141 | } 142 | 143 | /** 144 | * @param $method 145 | * @param $url 146 | * @param array $option 147 | * @return mixed|\Psr\Http\Message\ResponseInterface|null 148 | * @throws 
\GuzzleHttp\Exception\GuzzleException 149 | */ 150 | public function request($method, $url, $option = []) 151 | { 152 | try { 153 | $option['headers'] = array_merge($this->headers, $option['headers'] ?? []); 154 | $client = new Client(); 155 | $response = $client->request($method, $url, $option); 156 | } catch (RequestException $exception) { 157 | $response = $exception->getResponse(); 158 | if ($this->isCloudflareCaptchaChallenge($response)) { 159 | throw new CfscrapeRuntimeException(sprintf( 160 | 'Cloudflare captcha challenge presented for %s (cfscrape cannot solve captchas)', 161 | $url 162 | )); 163 | } 164 | 165 | if ($this->isCloudflareIuamChallenge($response)) { 166 | $response = $this->solveCloudflareChallenge($response, $exception->getRequest(), $option); 167 | } 168 | } 169 | 170 | return $response; 171 | } 172 | 173 | /** 174 | * @param \Psr\Http\Message\ResponseInterface $response 175 | * @return bool 176 | */ 177 | protected function isCloudflareCaptchaChallenge(ResponseInterface $response) 178 | { 179 | return $response->getStatusCode() == 403 180 | && strpos($response->getHeaderLine('Server'), 'cloudflare') !== false 181 | && strpos((string)$response->getBody(), '/cdn-cgi/l/chk_captcha') !== false; 182 | } 183 | 184 | /** 185 | * @param \Psr\Http\Message\ResponseInterface $response 186 | * @return bool 187 | */ 188 | protected function isCloudflareIuamChallenge(ResponseInterface $response) 189 | { 190 | return in_array($response->getStatusCode(), [503, 429]) 191 | && strpos($response->getHeaderLine('Server'), 'cloudflare') !== false 192 | && strpos((string)$response->getBody(), 'jschl_vc') !== false 193 | && strpos((string)$response->getBody(), 'jschl_answer') !== false; 194 | } 195 | 196 | /** 197 | * @param \Psr\Http\Message\ResponseInterface $response 198 | * @param \Psr\Http\Message\RequestInterface $request 199 | * @param $oriOption 200 | * @return mixed|\Psr\Http\Message\ResponseInterface|null 201 | * @throws 
\GuzzleHttp\Exception\GuzzleException 202 | */ 203 | protected function solveCloudflareChallenge( 204 | ResponseInterface $response, 205 | RequestInterface $request, 206 | $oriOption 207 | ) { 208 | $body = (string)$response->getBody(); 209 | $parseUrl = $request->getUri(); 210 | $domain = $parseUrl->getHost(); 211 | 212 | $challengeForm = $this->substr($body, '
'); 213 | 214 | $action = explode('?', $this->substr($challengeForm, 'action="', '"')); 215 | 216 | $method = $this->substr($challengeForm, 'method="', '"'); 217 | $submitUrl = sprintf( 218 | '%s://%s%s', 219 | $parseUrl->getScheme(), 220 | $domain, 221 | $action[0] 222 | ); 223 | 224 | $oriMethod = $request->getMethod(); 225 | 226 | $option = $oriOption; 227 | 228 | $option['form_params'] = []; 229 | $option['query'] = []; 230 | $option['headers'] = []; 231 | $option['headers']['Referer'] = (string)$request->getUri(); 232 | 233 | if (isset($action[1])) { 234 | parse_str($action[1], $option['query']); 235 | } 236 | 237 | preg_match_all('/[^-] \|\<\/input\>)/', $challengeForm, $matches); 238 | 239 | $k = $method == 'POST' ? 'form_params' : 'query'; 240 | 241 | foreach ($matches[0] as $match) { 242 | preg_match('/name=\"(.*?)\"/', $match, $name); 243 | if ($name[1] != 'jschl_answer') { 244 | preg_match('/value=\"(.*?)\"/', $match, $value); 245 | $option[$k][$name[1]] = $value[1]; 246 | } 247 | } 248 | 249 | foreach (['jschl_vc', 'pass'] as $item) { 250 | if (!isset($option[$k][$item])) { 251 | throw new \InvalidArgumentException(sprintf( 252 | '%s is missing from challenge form', 253 | $item 254 | )); 255 | } 256 | } 257 | 258 | $answer = $this->solveChallenge($body, $domain); 259 | 260 | $option[$k]['jschl_answer'] = $answer; 261 | $option['allow_redirects'] = false; 262 | 263 | preg_match('/(?:[^{<>]*},\s*(\d{4,}))/', $body, $match); 264 | $delay = $this->getDelay() ?: ($match[1] / 1000 ?? 8); 265 | sleep($delay); 266 | 267 | $redirect = $this->request($method, $submitUrl, $option); 268 | if ($redirectUrl = $redirect->getHeaderLine('Location')) { 269 | $redirectLocation = parse_url($redirectUrl); 270 | if (empty($redirectLocation['host'])) { 271 | $redirectUrl = sprintf( 272 | '%s://%s%s', 273 | $parseUrl->getScheme(), 274 | $domain, 275 | $redirectLocation['path'] 276 | ); 277 | $redirectLocation['query'] && $redirectUrl .= '?' . 
$redirectLocation['query']; 278 | $redirectLocation['fragment'] && $redirectUrl .= '#' . $redirectLocation['fragment']; 279 | } 280 | 281 | return $this->request($method, $redirectUrl, $oriOption); 282 | } elseif ($setCookie = $redirect->getHeader('Set-Cookie')) { 283 | $cookieJar = new CookieJar(); 284 | foreach ($setCookie as $item) { 285 | $cookieJar->setCookie(SetCookie::fromString($item)); 286 | } 287 | if ($cookieJar->getCookieByName('cf_clearance')) { 288 | $this->setCookies($cookieJar); 289 | 290 | return $this->request($oriMethod, $submitUrl, ['cookies' => $cookieJar]); 291 | } else { 292 | return $this->request($method, $submitUrl, $oriOption); 293 | } 294 | } else { 295 | return $this->request($oriMethod, $submitUrl, $option); 296 | } 297 | } 298 | 299 | /** 300 | * @param string $body 301 | * @param string $domain 302 | * @return mixed 303 | */ 304 | protected function solveChallenge(string $body, string $domain) 305 | { 306 | try { 307 | $challenge = $this->substr($body, 'setTimeout(function(){', 'f.action += location.hash;'); 308 | $challenge = str_replace('setInterval(function(){}, 100),', "", $challenge); 309 | 310 | $kHtml = $this->substr($body, '
', '
substr($body, "; k = '", "';"); 317 | preg_match_all( 318 | '|]?>([^<]+)
|', 319 | $kHtml, 320 | $matches 321 | ); 322 | 323 | $innerHTML = json_encode(array_combine($matches[1], $matches[2])); 324 | 325 | $challenge = <<executeString($challenge); 346 | } 347 | 348 | /** 349 | * @param string $subject 350 | * @param string $start 351 | * @param string $end 352 | * @return mixed|string 353 | */ 354 | protected function substr(string $subject, string $start, string $end) 355 | { 356 | $r = explode($start, $subject, 2); 357 | if (isset($r[1])) { 358 | return explode($end, $r[1])[0]; 359 | } 360 | 361 | return ''; 362 | } 363 | 364 | /** 365 | * @return \Cfscrape\Cfscrape 366 | */ 367 | public static function createScraper() 368 | { 369 | $scraper = new self(); 370 | 371 | return $scraper; 372 | } 373 | 374 | /** 375 | * @param string $url 376 | * @param string|null $userAgent 377 | * @param int $delay 378 | * @return array 379 | * @throws \GuzzleHttp\Exception\GuzzleException 380 | */ 381 | public static function getTokens(string $url, string $userAgent = null, int $delay = 0) 382 | { 383 | $scraper = self::createScraper(); 384 | 385 | if ($userAgent) { 386 | $scraper->setUserAgent($userAgent); 387 | } 388 | 389 | if ($delay) { 390 | $scraper->setDelay($delay); 391 | } 392 | 393 | $response = $scraper->get($url); 394 | 395 | $cookies = $scraper->getCookies(); 396 | 397 | return [ 398 | [ 399 | '__cfduid' => $cookies->getCookieByName('__cfduid')->getValue(), 400 | 'cf_clearance' => $cookies->getCookieByName('cf_clearance')->getValue(), 401 | ], 402 | $scraper->getUserAgent() 403 | ]; 404 | } 405 | 406 | /** 407 | * @param string $url 408 | * @param string|null $userAgent 409 | * @param int $delay 410 | * @return array 411 | * @throws \GuzzleHttp\Exception\GuzzleException 412 | */ 413 | public static function getCookieString(string $url, string $userAgent = null, int $delay = 0) 414 | { 415 | [$tokens, $userAgent] = self::getTokens($url, $userAgent, $delay); 416 | return [http_build_query($tokens, '', '; '), $userAgent]; 417 | } 418 | 419 | 
protected function getDefaultHeaders() 420 | { 421 | return [ 422 | 'Connection' => 'keep-alive', 423 | 'Upgrade-Insecure-Requests' => '1', 424 | 'User-Agent' => $this->getDefaultUserAgent(), 425 | // phpcs:ignore 426 | 'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 427 | 'Accept-Language' => 'en-US,en;q=0.9', 428 | 'Accept-Encoding' => 'gzip, deflate', 429 | ]; 430 | } 431 | 432 | protected function getDefaultUserAgent() 433 | { 434 | $userAgents = $this->userAgents; 435 | shuffle($userAgents); 436 | 437 | return $userAgents[0]; 438 | } 439 | } 440 | -------------------------------------------------------------------------------- /src/CfscrapeRuntimeException.php: -------------------------------------------------------------------------------- 1 |