├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── composer.json └── src ├── Core ├── Browser │ ├── AbstractBrowser.php │ ├── Browser.php │ └── BrowserInterface.php ├── Captcha │ ├── AsyncCaptchaSolvingCallback.php │ ├── AsyncCaptchaSolvingInterface.php │ ├── CaptchaResponse.php │ └── CaptchaSolverInterface.php ├── Cookie │ ├── ArrayCookieJar.php │ ├── Cookie.php │ ├── CookieJarInterface.php │ └── SetCookieString.php ├── Dom │ ├── Css.php │ ├── DocumentWrapper.php │ ├── DomElement.php │ ├── DomNodeInterface.php │ ├── DomNodeList.php │ ├── DomNodeListInterface.php │ ├── DomXpath.php │ ├── EmptyDomNodeList.php │ ├── InternalDocumentWrapper.php │ ├── NullDomNode.php │ ├── OtherDomNode.php │ └── WebPage.php ├── Http │ ├── HttpClientInterface.php │ ├── Proxy.php │ ├── ProxyInterface.php │ ├── SearchEngineResponse.php │ └── StackingHttpClient.php ├── Media │ ├── AbstractMedia.php │ ├── Base64.php │ ├── Binary.php │ ├── File.php │ ├── MediaFactory.php │ ├── MediaInterface.php │ └── Stream.php ├── Psr7 │ └── RequestBuilder.php ├── Serp │ ├── BaseResult.php │ ├── CompositeResultSet.php │ ├── IndexedResultSet.php │ ├── ItemPosition.php │ ├── ProxyResult.php │ ├── ResultDataInterface.php │ ├── ResultSet.php │ └── ResultSetInterface.php ├── Url.php ├── Url │ ├── AlterableUrlInterface.php │ ├── AlterableUrlTrait.php │ ├── QueryParam.php │ ├── UrlArchiveInterface.php │ └── UrlArchiveTrait.php └── UrlArchive.php ├── Exception.php └── Exception ├── CaptchaSolver ├── CaptchaNotSolvableException.php ├── CaptchaResolutionFailed.php └── UnknownCaptchaTypeException.php ├── InvalidCookieException.php ├── RequestError ├── CaptchaException.php ├── InvalidResponseException.php ├── NetworkErrorException.php ├── PageNotFoundException.php └── RequestErrorException.php └── TimeoutException.php /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # CHANGELOG 2 | 3 | 4 | ## 0.3.1 5 | 6 | *2018-xx-xx* NOT 
RELEASED YET 7 | 8 | 9 | ## 0.3.0 10 | 11 | > 2018-04-04 12 | 13 | * new features 14 | * **bc break** added method ``DomNodeInterface::getChildren`` 15 | * **bc break** added method ``DomNodeInterface::getLastChild`` 16 | * added interface ``DomNodeListInterface`` 17 | * added class ``EmptyDomNodeList`` 18 | * added class ``InternalDocumentWrapper`` 19 | * class ``DocumentWrapper`` now extends ``InternalDocumentWrapper`` 20 | * class ``NullDomNode`` now extends ``\DOMNode`` 21 | * method ``DomXpath`` is now able to handle ``NullDomNode`` as instances context 22 | 23 | 24 | ## 0.2.6 25 | 26 | > 2017-12-11 27 | 28 | * new features 29 | * **bc break** added method ``DomNodeInterface::hasAnyClass`` 30 | * added method ``DomNodeList::hasAnyClass`` 31 | 32 | ## 0.2.5 33 | 34 | > 2017-12-10 35 | 36 | * new features 37 | * **bc break** added method ``DomNodeInterface::hasClasses`` 38 | * added method ``DomNodeList::hasClasses`` 39 | 40 | 41 | ## 0.2.4 42 | 43 | > 2017-11-25 44 | 45 | * new features 46 | * data value can now depend on other results [aa59e55b10c28645decc5312b9c93681f5fe0691](https://github.com/serp-spider/core/commit/aa59e55b10c28645decc5312b9c93681f5fe0691) 47 | * BaseResult::getData is now able to dump resultSetInterface [d6e1b3627a50a5cce56d5320b56accabd107d851](https://github.com/serp-spider/core/commit/d6e1b3627a50a5cce56d5320b56accabd107d851) 48 | 49 | ## 0.2.3 50 | 51 | > 2017-08-08 52 | 53 | * bug fix 54 | * url query params was generating bad value for null array value [590ee240e9032ec1538fc6ffe5ad394cb9fac8d7](https://github.com/serp-spider/core/commit/590ee240e9032ec1538fc6ffe5ad394cb9fac8d7) 55 | 56 | 57 | ## 0.2.2 58 | 59 | > 2017-07-26 60 | 61 | * Addition 62 | * browser class is now able to set default headers for every requests [serp-spider/search-engine-google#73](https://github.com/serp-spider/search-engine-google/issues/73) 63 | 64 | ## 0.2.1 65 | 66 | > 2017-06-13 67 | 68 | * breaking change: 69 | * method ResultDataInterface::getData() 
will now return sub results as parsed arrays instead of objects (e047801) 70 | 71 | * bug fix 72 | * getDataValue failed to parse string value with the name of an existing php function (649c214) 73 | 74 | ## 0.2.0 75 | 76 | > 2017-05-01 77 | 78 | * New dependency: ``"symfony/css-selector": "^2|^3"`` 79 | 80 | * breaking changes 81 | * url interface was refactored [#22](https://github.com/serp-spider/core/pull/22) 82 | * Internal structure is better (no construct in the interface) 83 | * now ``port`` and ``user:pass auth`` are supported [#18](https://github.com/serp-spider/core/issues/18) 84 | * resolve and resolveAsString are now 2 distinct methods. [#19](https://github.com/serp-spider/core/issues/19) 85 | * resolve does not support string anymore [d56cbc39e710735296bbdd675431f7b3e87f534c](https://github.com/serp-spider/core/commit/d56cbc39e710735296bbdd675431f7b3e87f534c#diff-2bb04ebe8ec8dc8575afdd6a7a0bc0f6L325) 86 | * new method ``UrlArchiveInterface::getAuthority`` 87 | * url resolution is now compatible with rfc3986 88 | * query params now accept empty value [7233b7d1b67ed2a061746c210171b121ac931bb9](https://github.com/serp-spider/core/commit/7233b7d1b67ed2a061746c210171b121ac931bb9#diff-ea6d1c5de04976abd5f773367a57da23R79) 89 | * fix a bug with query params that are number only [#25](https://github.com/serp-spider/core/pull/25) 90 | * url parser is now able to parse array values from query string [#23](https://github.com/serp-spider/core/issues/23) 91 | * cookie expiration time was not on the same standard everywhere 92 | 93 | * Additions 94 | * Css parser was moved from google package to core [2f7d022d6da4905519a02d65c2f262aefc8b6bbf](https://github.com/serp-spider/core/commit/2f7d022d6da4905519a02d65c2f262aefc8b6bbf) 95 | * ``Dom`` component that offers better parsing of the dom (replacement for the ``googleDom`` class from google package) [view 
commits](https://github.com/serp-spider/core/compare/2f7d022d6da4905519a02d65c2f262aefc8b6bbf...22749d020c953e987dedc452566b4973923bf439) 96 | * ``RequestBuilder`` class that allows to construct PSR7 request from installed packages (``zendframework/zend-diactoros`` or ``guzzlehttp/psr7``) 97 | [98ab9f56bcef0ac36bae2b43cd965d14522a3294](https://github.com/serp-spider/core/commit/98ab9f56bcef0ac36bae2b43cd965d14522a3294) 98 | * Addition of ``BrowserInterface``, ``AbstractBrowser`` and ``Browser`` [#26](https://github.com/serp-spider/core/pull/26) 99 | * Addition of ``StackingHttpClient``: a http client implementation for unit test purposes [#26](https://github.com/serp-spider/core/pull/26) 100 | 101 | ------------------ 102 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | TLDL; Saying "hi", "please", "thanks" and being polite and showing empathy has never hurt anyone and makes everyone happy. 4 | We are humans not robots, so let's make the open source experience an enjoyable one for humans that take part in it. 5 | 6 | ## Our Pledge 7 | 8 | In the interest of fostering an open and welcoming environment, 9 | we as contributors and maintainers pledge to making participation in our project and our community 10 | a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, 11 | gender identity and expression, level of experience, nationality, personal appearance, race, religion, 12 | or sexual identity and orientation. 
13 | 14 | ## Our Standards 15 | 16 | Examples of behavior that contributes to creating a positive environment include: 17 | 18 | * Using welcoming and inclusive language 19 | * Being respectful of differing viewpoints and experiences 20 | * Gracefully accepting constructive criticism 21 | * Focusing on what is best for the community 22 | * Showing empathy towards other community members 23 | 24 | Examples of unacceptable behavior by participants include: 25 | 26 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 30 | * Other conduct which could reasonably be considered inappropriate in a professional setting 31 | 32 | ## Our Responsibilities 33 | 34 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 35 | 36 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 37 | 38 | ## Scope 39 | 40 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 
41 | 42 | ## Enforcement 43 | 44 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at sghzal@gmail.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 45 | 46 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 47 | 48 | ## Attribution 49 | 50 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 51 | 52 | [homepage]: http://contributor-covenant.org 53 | [version]: http://contributor-covenant.org/version/1/4/ 54 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | CONTRIBUTING 2 | ============ 3 | 4 | Any contribution is welcome. 5 | 6 | Issues 7 | ------ 8 | 9 | - Is your issue related to SERP parsing? 10 | 11 | When you report the issue, try to include as many details as possible about your current search. 12 | SERPs are dependent on many factors and we need to know all of them. 13 | 14 | Tests 15 | ----- 16 | 17 | All contributions must be tested, following the current test structure as much as possible: 18 | 19 | Look at the current tests in ``test/suites`` for more details. Think about adding a ``@covers`` annotation. 20 | 21 | If your test fixes an issue, first reproduce this issue in the test suite, then add a comment to 22 | your test stating that it fixes the given issue.
23 | 24 | Coding Standards 25 | ----------------- 26 | 27 | The code follows the PSR-2 coding standards. 28 | 29 | Tools 30 | ----- 31 | 32 | - Run the test suite: ``composer test`` 33 | - Check coding standards: ``composer cscheck`` 34 | - Auto fix coding standards: ``composer csfix`` 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015-today, Soufiane GHZAL and contributors 2 | 3 | Usage of the works is permitted provided that this instrument is retained with the works, so that any entity that uses the works is notified of this instrument. 4 | 5 | DISCLAIMER: THE WORKS ARE WITHOUT WARRANTY. 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | SERPS 2 | ===== 3 | 4 | **S**earch **E**ngine **R**esult **P**age **S**crapper 5 | 6 | [![Build Status](https://travis-ci.org/serp-spider/core.svg?branch=master)](https://travis-ci.org/serp-spider/core) 7 | [![Test Coverage](https://codeclimate.com/github/serp-spider/core/badges/coverage.svg)](https://codeclimate.com/github/serp-spider/core/coverage) 8 | [![Gitter](https://img.shields.io/gitter/room/nwjs/nw.js.svg)](https://gitter.im/serp-spider/help) 9 | 10 | Install 11 | ------- 12 | 13 | Install it using [composer](https://getcomposer.org/) with the package 14 | [serps/core](https://packagist.org/packages/serps/core) : 15 | 16 | ``composer require 'serps/core'`` 17 | 18 | Documentation 19 | ------------- 20 | 21 | Browse the website and documentation at https://serp-spider.github.io/documentation/ 22 | 23 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "name": "serps/core", 4 | "description": "Search engine
scrapper", 5 | "type": "library", 6 | "keywords": ["google", "yahoo", "bing", "search engine", "scrapper"], 7 | "homepage": "https://github.com/serp-spider/core", 8 | "license": "Fair", 9 | 10 | "authors": [ 11 | { 12 | "name": "Soufiane GHZAL", 13 | "homepage": "https://github.com/gsouf" 14 | } 15 | ], 16 | 17 | "autoload":{ 18 | "psr-4" : { 19 | "Serps\\": "src/" 20 | } 21 | }, 22 | 23 | "autoload-dev":{ 24 | "psr-4": { 25 | "Serps\\Test\\TDD\\": "test/suites/TDD" 26 | } 27 | }, 28 | 29 | "require": { 30 | "php": ">=5.5", 31 | "symfony/css-selector": "^2|^3|^4", 32 | "psr/http-message": "^1.0.0" 33 | }, 34 | 35 | "require-dev":{ 36 | "phpunit/phpunit": "~4.1", 37 | "squizlabs/php_codesniffer": "~2.5", 38 | "zendframework/zend-diactoros": "1.3.3", 39 | "guzzlehttp/psr7": "^1.3.0" 40 | }, 41 | 42 | "scripts": { 43 | "phpunit": "test/bin/test.bash", 44 | "test": [ 45 | "@phpunit", 46 | "@cscheck" 47 | ], 48 | "csfix": "test/bin/phpcbf.bash", 49 | "cscheck": "test/bin/phpcs.bash emacs" 50 | }, 51 | 52 | "extra": { 53 | "branch-alias": { 54 | "dev-master": "0.3.0-dev" 55 | } 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/Core/Browser/AbstractBrowser.php: -------------------------------------------------------------------------------- 1 | prepareRequest($request); 24 | return $this->getHttpClient()->sendRequest($request, $this->getProxy(), $this->getCookieJar()); 25 | } 26 | 27 | /** 28 | * @return null|string 29 | */ 30 | public function getAcceptLanguage() 31 | { 32 | return $this->getDefaultHeaderValue('ACCEPT-LANGUAGE'); 33 | } 34 | 35 | /** 36 | * @return null|string 37 | */ 38 | public function getUserAgent() 39 | { 40 | return $this->getDefaultHeaderValue('USER-AGENT'); 41 | } 42 | 43 | /** 44 | * Adds a default header to be sent with every request 45 | * @param $headerName 46 | * @param $headerValue 47 | */ 48 | public function setDefaultHeader($headerName, $headerValue) 49 | { 50 | 
$this->defaultHeaders[$headerName] = $headerValue; 51 | $this->defaultHeadersUC[strtoupper($headerName)] = $headerName; 52 | } 53 | 54 | /** 55 | * Check if the header name is defined as a default header (the lookup is case-insensitive) 56 | * @param string $headerName 57 | * @return bool 58 | */ 59 | public function hasDefaultHeader($headerName) 60 | { 61 | return isset($this->defaultHeadersUC[strtoupper($headerName)]); 62 | } 63 | 64 | /** 65 | * Get the value of the given default header (case-insensitive lookup), or null if it is not set 66 | * @param string $headerName 67 | * @return mixed|null the value registered for that header, or null when it is not set 68 | */ 69 | public function getDefaultHeaderValue($headerName) 70 | { 71 | $headerName = strtoupper($headerName); 72 | if (isset($this->defaultHeadersUC[$headerName])) { 73 | return $this->defaultHeaders[$this->defaultHeadersUC[$headerName]]; 74 | } else { 75 | return null; 76 | } 77 | } 78 | 79 | /** 80 | * @inheritdoc 81 | */ 82 | public function prepareRequest(RequestInterface $request) 83 | { 84 | $headers = $this->getDefaultHeaders(); 85 | foreach ($headers as $name => $value) { 86 | $request = $request->withHeader($name, $value); 87 | } 88 | return $request; 89 | } 90 | 91 | /** 92 | * @inheritdoc 93 | */ 94 | public function getDefaultHeaders() 95 | { 96 | return $this->defaultHeaders; 97 | } 98 | 99 | /** 100 | * @inheritdoc 101 | */ 102 | public function requestFromUrl(UrlArchiveInterface $url) 103 | { 104 | $headers = $this->getDefaultHeaders(); 105 | 106 | $request = RequestBuilder::buildRequest( 107 | (string) $url, 108 | 'GET', 109 | $headers, 110 | 'php://memory' 111 | ); 112 | 113 | return $request; 114 | } 115 | 116 | /** 117 | * @inheritdoc 118 | */ 119 | public function navigateToUrl(UrlArchiveInterface $url) 120 | { 121 | $request = $this->requestFromUrl($url); 122 | return $this->sendRequest($request); 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /src/Core/Browser/Browser.php: -------------------------------------------------------------------------------- 1 |
httpClient = $httpClient; 50 | $this->setAcceptLanguage($acceptLanguage ? $acceptLanguage : 'en-US,en;q=0.8'); 51 | $this->setUserAgent($userAgent ? $userAgent : 'serps'); 52 | $this->cookieJar = $cookieJar; 53 | $this->proxy = $proxy; 54 | } 55 | 56 | /** 57 | * @return null|CookieJarInterface 58 | */ 59 | public function getCookieJar() 60 | { 61 | return $this->cookieJar; 62 | } 63 | 64 | /** 65 | * @return null|ProxyInterface 66 | */ 67 | public function getProxy() 68 | { 69 | return $this->proxy; 70 | } 71 | 72 | /** 73 | * @return HttpClientInterface 74 | */ 75 | public function getHttpClient() 76 | { 77 | return $this->httpClient; 78 | } 79 | 80 | /** 81 | * @param null|string $acceptLanguage 82 | */ 83 | public function setAcceptLanguage($acceptLanguage) 84 | { 85 | $this->setDefaultHeader('Accept-Language', $acceptLanguage); 86 | } 87 | 88 | /** 89 | * @param null|string $userAgent 90 | */ 91 | public function setUserAgent($userAgent) 92 | { 93 | $this->setDefaultHeader('User-Agent', $userAgent); 94 | } 95 | 96 | /** 97 | * @param null|CookieJarInterface $cookieJar 98 | */ 99 | public function setCookieJar(CookieJarInterface $cookieJar = null) 100 | { 101 | $this->cookieJar = $cookieJar; 102 | } 103 | 104 | /** 105 | * @param null|ProxyInterface $proxy 106 | */ 107 | public function setProxy(ProxyInterface $proxy = null) 108 | { 109 | $this->proxy = $proxy; 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /src/Core/Browser/BrowserInterface.php: -------------------------------------------------------------------------------- 1 | getCaptchaCb = $getCaptcha; 22 | $this->tryForDefaultDelay = $tryForDefaultDelay; 23 | } 24 | 25 | public function getCaptcha() 26 | { 27 | if (null === $this->captchaDone) { 28 | $captcha = call_user_func($this->getCaptchaCb); 29 | if (false !== $captcha) { 30 | $this->captchaDone = $captcha; 31 | } 32 | return $captcha; 33 | } else { 34 | return $this->captchaDone; 35 | } 36 | } 
37 | 38 | /** 39 | * try to get the captcha for the given time 40 | * @param int $time max time to wait in second 41 | * @param int $interval interval between 2 test in second 42 | * @return null|mixed 43 | */ 44 | public function tryFor($time, $interval = null) 45 | { 46 | if (null == $interval) { 47 | $interval = $this->tryForDefaultDelay; 48 | } 49 | $tryUntil = microtime(true) + $time; 50 | while ($tryUntil > microtime(true)) { 51 | if ($c = $this->getCaptcha()) { 52 | return $c; 53 | } 54 | usleep($interval * 1000000); 55 | } 56 | return null; 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/Core/Captcha/AsyncCaptchaSolvingInterface.php: -------------------------------------------------------------------------------- 1 | strictMode = $strictMode; 31 | } 32 | 33 | /** 34 | * @return mixed 35 | */ 36 | public function getStrictMode() 37 | { 38 | return $this->strictMode; 39 | } 40 | 41 | /** 42 | * @param bool $strictMode pass to true to throw an exception when an invalid cookie is added 43 | */ 44 | public function setStrictMode($strictMode) 45 | { 46 | $this->strictMode = $strictMode; 47 | } 48 | 49 | 50 | /** 51 | * @inheritdoc 52 | */ 53 | public function set(Cookie $cookie) 54 | { 55 | $result = $cookie->validate(); 56 | if ($result !== true) { 57 | if ($this->strictMode) { 58 | throw new InvalidCookieException($result); 59 | } else { 60 | $this->removeCookieIfEmpty($cookie); 61 | return false; 62 | } 63 | } 64 | 65 | // Resolve conflicts with previously set cookies 66 | foreach ($this->cookies as $i => $c) { 67 | // Two cookies are identical, when their path, domain and name are identical 68 | if ($c->getPath() != $cookie->getPath() || 69 | $c->getDomain() != $cookie->getDomain() || 70 | $c->getName() != $cookie->getName() 71 | ) { 72 | continue; 73 | } 74 | // The previously set cookie is a discard cookie and this one is not so allow the new cookie to be set 75 | if (!$cookie->getDiscard() && 
$c->getDiscard()) { 76 | unset($this->cookies[$i]); 77 | continue; 78 | } 79 | // If the new cookie's expiration is further into the future, then replace the old cookie 80 | if ($cookie->getExpires() > $c->getExpires()) { 81 | unset($this->cookies[$i]); 82 | continue; 83 | } 84 | // If the value has changed, we better change it 85 | if ($cookie->getValue() !== $c->getValue()) { 86 | unset($this->cookies[$i]); 87 | continue; 88 | } 89 | // The cookie exists, so no need to continue 90 | return false; 91 | } 92 | $this->cookies[] = $cookie; 93 | return true; 94 | } 95 | 96 | /** 97 | * If a cookie already exists and the server asks to set it again with a null value, the 98 | * cookie must be deleted. 99 | * 100 | * @param Cookie $cookie 101 | */ 102 | private function removeCookieIfEmpty(Cookie $cookie) 103 | { 104 | $cookieValue = $cookie->getValue(); 105 | if ($cookieValue === null || $cookieValue === '') { 106 | $this->remove($cookie->getDomain(), $cookie->getPath(), $cookie->getName()); 107 | } 108 | } 109 | 110 | /** 111 | * @inheritdoc 112 | */ 113 | public function remove($domain = null, $path = null, $name = null) 114 | { 115 | $cookies = $this->all($domain, $path, $name, false, false); 116 | $this->cookies = array_filter($this->cookies, function (Cookie $cookie) use ($cookies) { 117 | return !in_array($cookie, $cookies, true); 118 | }); 119 | } 120 | 121 | /** 122 | * @inheritdoc 123 | */ 124 | public function removeTemporary() 125 | { 126 | $this->cookies = array_filter($this->cookies, function (Cookie $cookie) { 127 | return !$cookie->getDiscard() && $cookie->getExpires(); 128 | }); 129 | return $this; 130 | } 131 | 132 | /** 133 | * @inheritdoc 134 | */ 135 | public function removeExpired() 136 | { 137 | $currentTime = time(); 138 | $this->cookies = array_filter($this->cookies, function (Cookie $cookie) use ($currentTime) { 139 | return !$cookie->getExpires() || $currentTime < $cookie->getExpires(); 140 | }); 141 | return $this; 142 | } 143 | 144 | /** 145 
| * @inheritdoc 146 | */ 147 | public function getMatchingCookies(RequestInterface $request) 148 | { 149 | // Find cookies that match this request 150 | $cookies = $this->all($request->getUri()->getHost(), $request->getUri()->getPath()); 151 | // Remove ineligible cookies 152 | foreach ($cookies as $index => $cookie) { 153 | if ($cookie->getSecure() && $request->getUri()->getScheme() != 'https') { 154 | unset($cookies[$index]); 155 | } 156 | }; 157 | return $cookies; 158 | } 159 | 160 | /** 161 | * @inheritdoc 162 | */ 163 | public function all($domain = null, $path = null, $name = null, $skipDiscardable = false, $skipExpired = true) 164 | { 165 | return array_values(array_filter($this->cookies, function (Cookie $cookie) use ( 166 | $domain, 167 | $path, 168 | $name, 169 | $skipDiscardable, 170 | $skipExpired 171 | ) { 172 | return false === (($name && $cookie->getName() != $name) || 173 | ($skipExpired && $cookie->isExpired()) || 174 | ($skipDiscardable && ($cookie->getDiscard() || !$cookie->getExpires())) || 175 | ($path && !$cookie->matchesPath($path)) || 176 | ($domain && !$cookie->matchesDomain($domain))); 177 | })); 178 | } 179 | 180 | public function export() 181 | { 182 | $cookies = $this->all(null, null, null, false, false); 183 | $data = []; 184 | foreach ($cookies as $cookie) { 185 | $data[] = $cookie->export(); 186 | } 187 | return $data; 188 | } 189 | 190 | public function import($data) 191 | { 192 | foreach ($data as $cookieData) { 193 | $this->set(new Cookie($cookieData['name'], $cookieData['value'], $cookieData['flags'])); 194 | } 195 | } 196 | } 197 | -------------------------------------------------------------------------------- /src/Core/Cookie/Cookie.php: -------------------------------------------------------------------------------- 1 | name = $name; 22 | $this->value = $value; 23 | $this->flags = $flags; 24 | } 25 | 26 | protected function getFlag($flag, $default = null) 27 | { 28 | return isset($this->flags[$flag]) ? 
29 | $this->flags[$flag] 30 | : $default; 31 | } 32 | 33 | 34 | /** 35 | * @return mixed 36 | */ 37 | public function getName() 38 | { 39 | return $this->name; 40 | } 41 | 42 | /** 43 | * @return mixed 44 | */ 45 | public function getValue() 46 | { 47 | return $this->value; 48 | } 49 | 50 | 51 | 52 | /** 53 | * @return mixed 54 | */ 55 | public function getPath() 56 | { 57 | return $this->getFlag('path', '/'); 58 | } 59 | 60 | public function getDomain() 61 | { 62 | return $this->getFlag('domain'); 63 | } 64 | 65 | public function getExpires() 66 | { 67 | return $this->getFlag('expires'); 68 | } 69 | /** Tells whether the given request path path-matches this cookie's path (RFC 6265 section 5.1.4, case-sensitive); returns a bool. */ 70 | public function matchesPath($path) 71 | { 72 | 73 | $cookiePath = $this->getPath(); 74 | 75 | // RFC6265 http://tools.ietf.org/search/rfc6265#section-5.1.4 76 | // A request-path path-matches a given cookie-path if at least one of 77 | // the following conditions holds (paths are compared case-sensitively): 78 | // o The cookie-path and the request-path are identical. 79 | if ($path == $cookiePath) { 80 | return true; 81 | } 82 | $pos = strpos($path, $cookiePath); 83 | if ($pos === 0) { 84 | // o The cookie-path is a prefix of the request-path, and the last 85 | // character of the cookie-path is %x2F ("/"). 86 | if (substr($cookiePath, -1, 1) === '/') { 87 | return true; 88 | } 89 | // o The cookie-path is a prefix of the request-path, and the first 90 | // character of the request-path that is not included in the cookie- 91 | // path is a %x2F ("/") character. 92 | if (substr($path, strlen($cookiePath), 1) === '/') { 93 | return true; 94 | } 95 | } 96 | return false; 97 | } 98 | 99 | /** 100 | * Check if the cookie matches a domain value 101 | * 102 | * @param string $domain Domain to check against 103 | * 104 | * @return bool 105 | */ 106 | public function matchesDomain($domain) 107 | { 108 | // Remove the leading '.' as per spec in RFC 6265: http://tools.ietf.org/html/rfc6265#section-5.2.3 109 | $cookieDomain = ltrim($this->getDomain(), '.'); 110 | // Domain not set or exact match.
111 | if (!$cookieDomain || !strcasecmp($domain, $cookieDomain)) { 112 | return true; 113 | } 114 | // Matching the subdomain according to RFC 6265: http://tools.ietf.org/html/rfc6265#section-5.1.3 115 | if (filter_var($domain, FILTER_VALIDATE_IP)) { 116 | return false; 117 | } 118 | return (bool) preg_match('/\.' . preg_quote($cookieDomain, '/') . '$/i', $domain); 119 | } 120 | 121 | /** 122 | * Check if the cookie is expired 123 | * 124 | * @return bool 125 | */ 126 | public function isExpired() 127 | { 128 | return $this->getExpires() && time() > $this->getExpires(); 129 | } 130 | 131 | /** 132 | * Check if the cookie is valid according to RFC 6265 133 | * 134 | * @return bool|string Returns true if valid or an error message if invalid 135 | */ 136 | public function validate() 137 | { 138 | // Names must not be empty, but can be 0 139 | $name = $this->getName(); 140 | if (empty($name) && !is_numeric($name)) { 141 | return 'The cookie name must not be empty'; 142 | } 143 | // Check if any of the invalid characters are present in the cookie name 144 | if (strpbrk($name, self::getInvalidCharacters()) !== false) { 145 | return 'The cookie name must not contain invalid characters: ' . 
$name; 146 | } 147 | // Value must not be empty, but can be 0 148 | $value = $this->getValue(); 149 | if (empty($value) && !is_numeric($value)) { 150 | return 'The cookie value must not be empty'; 151 | } 152 | // Domains must not be empty, but can be 0 153 | // A "0" is not a valid internet domain, but may be used as server name in a private network 154 | $domain = $this->getDomain(); 155 | if (empty($domain) && !is_numeric($domain)) { 156 | return 'The cookie domain must not be empty'; 157 | } 158 | return true; 159 | } 160 | 161 | /** 162 | * @var string ASCII codes not valid for use in a cookie name 163 | * 164 | * Cookie names are defined as 'token', according to RFC 2616, Section 2.2 165 | * A valid token may contain any CHAR except CTLs (ASCII 0 - 31 or 127) 166 | * or any of the following separators 167 | */ 168 | protected static $invalidCharString; 169 | 170 | /** 171 | * Gets the string of characters that are invalid in a cookie name (built on first call, then cached) 172 | * 173 | * @return string 174 | */ 175 | protected static function getInvalidCharacters() 176 | { 177 | if (!self::$invalidCharString) { 178 | self::$invalidCharString = implode('', array_map('chr', array_merge( 179 | range(0, 32), 180 | [34, 40, 41, 44, 47], 181 | [58, 59, 60, 61, 62, 63, 64, 91, 92, 93, 123, 125, 127] 182 | ))); 183 | } 184 | return self::$invalidCharString; 185 | } 186 | 187 | public function getDiscard() 188 | { 189 | return $this->getFlag('discard', false); 190 | } 191 | 192 | public function getSecure() 193 | { 194 | return $this->getFlag('secure', false); 195 | } 196 | 197 | public function getHttpOnly() 198 | { 199 | return $this->getFlag('http_only', false); 200 | } 201 | 202 | /** 203 | * Formats the cookie to be set in the Cookie header 204 | * 205 | * @return string the formatted cookie with this format: name=value 206 | */ 207 | public function formatForCookieHeader() 208 | { 209 | return $this->getName() . '=' .
$this->getValue(); 210 | } 211 | 212 | 213 | /** 214 | * Formats the cookie into an array to make it exportable. 215 | * @return array the cookie data under the keys name, value and flags 216 | */ 217 | public function export() 218 | { 219 | return [ 220 | 'name' => $this->getName(), 221 | 'value' => $this->getValue(), 222 | 'flags' => [ 223 | 'path' => $this->getPath(), 224 | 'domain' => $this->getDomain(), 225 | 'expires' => $this->getExpires(), 226 | 'discard' => $this->getDiscard(), 227 | 'secure' => $this->getSecure(), 228 | 'http_only' => $this->getHttpOnly(), 229 | ] 230 | ]; 231 | } 232 | } 233 | -------------------------------------------------------------------------------- /src/Core/Cookie/CookieJarInterface.php: -------------------------------------------------------------------------------- 1 | 'Path', 17 | 'max_age' => 'Max-Age', 18 | 'expires' => 'Expires', 19 | 'version' => 'Version', 20 | 'secure' => 'Secure', 21 | 'port' => 'Port', 22 | 'discard' => 'Discard', 23 | 'comment' => 'Comment', 24 | 'comment_url' => 'Comment-Url', 25 | 'http_only' => 'HttpOnly' 26 | ]; 27 | 28 | public static function parse($string, $host, $path, $decode = false) 29 | { 30 | 31 | // Explode the cookie string using a series of semicolons 32 | $pieces = array_filter(array_map('trim', explode(';', $string))); 33 | 34 | if (empty($pieces) || !strpos($pieces[0], '=')) { 35 | return false; 36 | } 37 | 38 | $firstPiece = array_shift($pieces); 39 | $cookieParts = explode('=', $firstPiece, 2); 40 | $cookieName = $cookieParts[0]; 41 | $cookieValue = $cookieParts[1]; 42 | 43 | // Create the default return array 44 | $data = array_merge(array_fill_keys(array_keys(self::$cookieParts), null), [ 45 | 'path' => null, 46 | 'http_only' => false, 47 | 'discard' => false 48 | ]); 49 | 50 | // Add the cookie pieces into the parsed data array; known attribute names are matched case-insensitively and stored under their flag key, unknown ones are kept as received 51 | foreach ($pieces as $part) { 52 | $cookieParts = explode('=', $part, 2); 53 | $key = trim($cookieParts[0]); foreach (self::$cookieParts + ['domain' => 'Domain'] as $flag => $attributeName) { if (strcasecmp($attributeName, $key) === 0) { $key = $flag; break; } } 54 | if
(count($cookieParts) == 1) { 55 | // Can be a single value (e.g. secure, httpOnly) 56 | $value = true; 57 | } else { 58 | // Be sure to strip wrapping quotes 59 | $value = trim($cookieParts[1], " \n\r\t\0\x0B\""); 60 | if ($decode) { 61 | $value = urldecode($value); 62 | } 63 | } 64 | $data[$key] = $value; foreach (self::$cookieParts + ['domain' => 'Domain'] as $canonical => $header) { if (!strcasecmp($header, $key)) { $data[$canonical] = $value; } } // attribute names are case-insensitive: also store known ones under the canonical key read below 65 | } 66 | 67 | if (is_string($data['expires'])) { 68 | $data['expires'] = strtotime($data['expires']); 69 | } 70 | 71 | // Calculate the expires date 72 | if (!$data['expires'] && $data['max_age']) { 73 | $data['expires'] = time() + (int) $data['max_age']; 74 | } 75 | // Check path attribute according RFC6265 http://tools.ietf.org/search/rfc6265#section-5.2.4 76 | // "If the attribute-value is empty or if the first character of the 77 | // attribute-value is not %x2F ("/"): 78 | // Let cookie-path be the default-path. 79 | // Otherwise: 80 | // Let cookie-path be the attribute-value." 81 | if (!$data['path'] || substr($data['path'], 0, 1) !== '/') { 82 | $data['path'] = self::getDefaultPath($path); 83 | } 84 | 85 | if (!isset($data['domain'])) { 86 | $data['domain'] = $host; 87 | } 88 | 89 | return new Cookie($cookieName, $cookieValue, $data); 90 | } 91 | 92 | /** 93 | * Get default cookie path according to RFC 6265 94 | * http://tools.ietf.org/search/rfc6265#section-5.1.4 Paths and Path-Match 95 | * 96 | * @param string $path Request uri-path 97 | * 98 | * @return string 99 | */ 100 | protected static function getDefaultPath($path) 101 | { 102 | // "The user agent MUST use an algorithm equivalent to the following algorithm 103 | // to compute the default-path of a cookie:" 104 | // "2. If the uri-path is empty or if the first character of the uri-path is not 105 | // a %x2F ("/") character, output %x2F ("/") and skip the remaining steps. 106 | if (empty($path) || substr($path, 0, 1) !== '/') { 107 | return '/'; 108 | } 109 | // "3. If the uri-path contains no more than one %x2F ("/") character, output 110 | // %x2F ("/") and skip the remaining step." 
111 | if ($path === '/') { 112 | return $path; 113 | } 114 | $rightSlashPos = strrpos($path, '/'); 115 | if ($rightSlashPos === 0) { 116 | return '/'; 117 | } 118 | // "4. Output the characters of the uri-path from the first character up to, 119 | // but not including, the right-most %x2F ("/")." 120 | return substr($path, 0, $rightSlashPos); 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /src/Core/Dom/Css.php: -------------------------------------------------------------------------------- 1 | = 2.8 32 | self::$converter = new CssSelectorConverter(); 33 | } else { 34 | // Version < 2.8 35 | self::$converter = new CssSelector(); 36 | } 37 | } 38 | return self::$converter; 39 | } 40 | 41 | /** 42 | * @param $css 43 | * @return string the xpath representation of the css string 44 | */ 45 | public static function toXPath($css) 46 | { 47 | return self::getConverter()->toXPath($css); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/Core/Dom/DocumentWrapper.php: -------------------------------------------------------------------------------- 1 | ' . 
$domString; 27 | } 28 | 29 | // Load DOM 30 | $dom = new \DOMDocument(); 31 | $previousUseIE = libxml_use_internal_errors(true); 32 | $dom->loadHTML($domString); 33 | libxml_use_internal_errors($previousUseIE); 34 | libxml_clear_errors(); 35 | 36 | $dom->registerNodeClass(\DOMElement::class, DomElement::class); 37 | parent::__construct($dom); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/Core/Dom/DomElement.php: -------------------------------------------------------------------------------- 1 | getAttribute('class'); 17 | 18 | if ($classList) { 19 | return explode(' ', $classList); 20 | } 21 | 22 | return []; 23 | } 24 | 25 | /** 26 | * @inheritdoc 27 | */ 28 | public function hasClass($className) 29 | { 30 | return in_array($className, $this->getClassList()); 31 | } 32 | 33 | /** 34 | * @inheritdoc 35 | */ 36 | public function hasAnyClass(array $classNames) 37 | { 38 | $classList = $this->getClassList(); 39 | 40 | foreach ($classNames as $className) { 41 | if (in_array($className, $classList)) { 42 | return true; 43 | } 44 | } 45 | 46 | return false; 47 | } 48 | 49 | /** 50 | * @inheritdoc 51 | */ 52 | public function hasClasses(array $classNames) 53 | { 54 | $classList = $this->getClassList(); 55 | 56 | if (!empty($classList)) { 57 | foreach ($classNames as $className) { 58 | if (!in_array($className, $classList)) { 59 | return false; 60 | } 61 | } 62 | 63 | return true; 64 | } 65 | 66 | return empty($classNames); 67 | } 68 | 69 | /** 70 | * @inheritdoc 71 | */ 72 | public function getTagName() 73 | { 74 | return $this->tagName; 75 | } 76 | 77 | /** 78 | * @inheritdoc 79 | */ 80 | public function getNodeValue() 81 | { 82 | return $this->nodeValue; 83 | } 84 | 85 | /** 86 | * @inheritdoc 87 | */ 88 | public function getChildren() 89 | { 90 | return new DomNodeList($this->childNodes, new InternalDocumentWrapper($this->ownerDocument)); 91 | } 92 | 93 | /** 94 | * @inheritdoc 95 | */ 96 | public function 
getLastChild() 97 | { 98 | return InternalDocumentWrapper::toDomNodeInterface($this->lastChild); 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /src/Core/Dom/DomNodeInterface.php: -------------------------------------------------------------------------------- 1 | nodeList = $list; 31 | $this->documentWrapper = $doc; 32 | } 33 | 34 | /** 35 | * @inheritdoc 36 | */ 37 | public function hasClass($className) 38 | { 39 | for ($i = 0; $i < $this->nodeList->length; $i++) { 40 | if ($this->getNodeAt($i)->hasClass($className)) { 41 | return true; 42 | } 43 | } 44 | 45 | return false; 46 | } 47 | 48 | /** 49 | * @inheritdoc 50 | */ 51 | public function hasClasses(array $classNames) 52 | { 53 | for ($i = 0; $i < $this->nodeList->length; $i++) { 54 | if ($this->getNodeAt($i)->hasClasses($classNames)) { 55 | return true; 56 | } 57 | } 58 | 59 | return false; 60 | } 61 | 62 | /** 63 | * @inheritdoc 64 | */ 65 | public function hasAnyClass(array $classNames) 66 | { 67 | for ($i = 0; $i < $this->nodeList->length; $i++) { 68 | if ($this->getNodeAt($i)->hasAnyClass($classNames)) { 69 | return true; 70 | } 71 | } 72 | 73 | return false; 74 | } 75 | 76 | /** 77 | * @inheritdoc 78 | */ 79 | public function item($index) 80 | { 81 | $item = $this->nodeList->item($index); 82 | 83 | if (!$item) { 84 | return null; 85 | } 86 | 87 | if (!$item instanceof DomNodeInterface) { 88 | return new OtherDomNode($item); 89 | } 90 | 91 | return $item; 92 | } 93 | 94 | /** 95 | * @inheritdoc 96 | */ 97 | public function getNodeAt($index) 98 | { 99 | $item = $this->nodeList->item($index); 100 | 101 | return InternalDocumentWrapper::toDomNodeInterface($item); 102 | } 103 | 104 | /** 105 | * @inheritdoc 106 | */ 107 | public function __get($name) 108 | { 109 | if ($name === 'length') { 110 | return $this->count(); 111 | } 112 | } 113 | 114 | /** 115 | * @inheritdoc 116 | */ 117 | public function count() 118 | { 119 | return $this->nodeList->length; 
120 | } 121 | 122 | /** 123 | * @inheritdoc 124 | */ 125 | public function current() 126 | { 127 | return $this->nodeList->item($this->itCur); 128 | } 129 | 130 | /** 131 | * @inheritdoc 132 | */ 133 | public function next() 134 | { 135 | $this->itCur++; 136 | } 137 | 138 | /** 139 | * @inheritdoc 140 | */ 141 | public function key() 142 | { 143 | return $this->itCur; 144 | } 145 | 146 | /** 147 | * @inheritdoc 148 | */ 149 | public function valid() 150 | { 151 | return $this->itCur < $this->count(); 152 | } 153 | 154 | /** 155 | * @inheritdoc 156 | */ 157 | public function rewind() 158 | { 159 | $this->itCur = 0; 160 | } 161 | } 162 | -------------------------------------------------------------------------------- /src/Core/Dom/DomNodeListInterface.php: -------------------------------------------------------------------------------- 1 | documentWrapper = $doc; 26 | parent::__construct($doc->getDom()); 27 | } 28 | 29 | /** 30 | * @inheritdoc 31 | */ 32 | public function query($expr, BaseDomNode $context = null, $registerNodeNS = null) 33 | { 34 | 35 | // NullDomNode is an addition to SERPS and must be handled differently 36 | if ($context instanceof NullDomNode) { 37 | if ($expr[0] == '/') { 38 | $context = null; 39 | } else { 40 | return new EmptyDomNodeList(); 41 | } 42 | } 43 | 44 | $nodeList = parent::query($expr, $context); 45 | return new DomNodeList($nodeList, $this->documentWrapper); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/Core/Dom/EmptyDomNodeList.php: -------------------------------------------------------------------------------- 1 | count(); 58 | } 59 | } 60 | 61 | /** 62 | * @inheritdoc 63 | */ 64 | public function count() 65 | { 66 | return 0; 67 | } 68 | 69 | /** 70 | * @inheritdoc 71 | */ 72 | public function current() 73 | { 74 | return null; 75 | } 76 | 77 | /** 78 | * @inheritdoc 79 | */ 80 | public function next() 81 | { 82 | // nothing 83 | } 84 | 85 | /** 86 | * @inheritdoc 87 | 
*/ 88 | public function key() 89 | { 90 | return null; 91 | } 92 | 93 | /** 94 | * @inheritdoc 95 | */ 96 | public function valid() 97 | { 98 | return false; 99 | } 100 | 101 | /** 102 | * @inheritdoc 103 | */ 104 | public function rewind() 105 | { 106 | // nothing 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /src/Core/Dom/InternalDocumentWrapper.php: -------------------------------------------------------------------------------- 1 | dom = $dom; 46 | } 47 | 48 | /** 49 | * get the object xpath to query it 50 | * @return DomXpath 51 | */ 52 | public function getXpath() 53 | { 54 | if (null === $this->xpath) { 55 | $this->xpath = new DomXpath($this); 56 | } 57 | return $this->xpath; 58 | } 59 | 60 | /** 61 | * @return \DOMDocument 62 | */ 63 | public function getDom() 64 | { 65 | return $this->dom; 66 | } 67 | 68 | /** 69 | * Runs a xpath query against the wrapped dom object 70 | * 71 | * That's a shortcut for \DOMXPath::query() 72 | * 73 | * @link http://php.net/manual/en/domxpath.query.php 74 | * 75 | * @param string $query the xpath query 76 | * @param \DOMNode|null $node the context node for the query, leave it null to query the root 77 | * @return DomNodeList 78 | */ 79 | public function xpathQuery($query, $node = null) 80 | { 81 | return $this->getXpath()->query($query, $node); 82 | } 83 | 84 | /** 85 | * Runs a css query against the wrapped dom object. 
Internally the css will translate to xpath 86 | * 87 | * @link http://php.net/manual/en/domxpath.query.php 88 | * 89 | * @param string $query the css query 90 | * @param \DOMNode|null $node the context node for the query, leave it null to query the root 91 | * @return DomNodeList 92 | */ 93 | public function cssQuery($query, $node = null) 94 | { 95 | return $this->getXpath()->query(Css::toXPath($query), $node); 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/Core/Dom/NullDomNode.php: -------------------------------------------------------------------------------- 1 | domNode = $domNode; 22 | } 23 | 24 | /** 25 | * @inheritdoc 26 | */ 27 | public function hasClass($className) 28 | { 29 | return false; 30 | } 31 | 32 | /** 33 | * @inheritdoc 34 | */ 35 | public function hasClasses(array $classNames) 36 | { 37 | return false; 38 | } 39 | 40 | /** 41 | * @inheritdoc 42 | */ 43 | public function hasAnyClass(array $classNames) 44 | { 45 | return false; 46 | } 47 | 48 | /** 49 | * @inheritdoc 50 | */ 51 | public function getAttribute($name) 52 | { 53 | return null; 54 | } 55 | 56 | /** 57 | * @inheritdoc 58 | */ 59 | public function getTagName() 60 | { 61 | return null; 62 | } 63 | 64 | /** 65 | * @inheritdoc 66 | */ 67 | public function getNodeValue() 68 | { 69 | return $this->domNode->nodeValue; 70 | } 71 | 72 | /** 73 | * @inheritdoc 74 | */ 75 | public function getChildren() 76 | { 77 | return new EmptyDomNodeList(); 78 | } 79 | 80 | /** 81 | * @inheritdoc 82 | */ 83 | public function getLastChild() 84 | { 85 | if (property_exists($this, 'lastChild')) { 86 | return InternalDocumentWrapper::toDomNodeInterface($this->lastChild); 87 | } else { 88 | return new NullDomNode(); 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/Core/Dom/WebPage.php: -------------------------------------------------------------------------------- 1 | url = $url; 26 | } 27 | 28 
| /** 29 | * @return UrlArchiveInterface 30 | */ 31 | public function getUrl() 32 | { 33 | return $this->url; 34 | } 35 | 36 | /** 37 | * Get data from a given html form. Form data can be fill with given data 38 | * 39 | * @param \DOMElement $formNode the form DOMElement 40 | * @param array $formData optional data to replace the default data from the html 41 | * @param bool $strict by default the data returned will we be processed from form element inputs and 42 | * given data that are not present as an input in the form will be ignored. Pass this argument to false in order 43 | * to return all data from the original data 44 | * @param bool $submit by default this method will search for the first submit and get data from it (if the 45 | * submit is named) 46 | * @return array 47 | */ 48 | public function formGetData(\DOMElement $formNode, array $formData = [], $strict = true, $submit = true) 49 | { 50 | 51 | $items = $this->xpathQuery('(//input | //textarea | //select)', $formNode); 52 | $consumed = []; 53 | $queryItems = []; 54 | 55 | 56 | foreach ($items as $item) { 57 | /* @var \DOMElement $item */ 58 | 59 | if ($item->hasAttribute('disabled')) { 60 | continue; 61 | } 62 | switch ($item->tagName) { 63 | case 'input': 64 | $query = $this->parseInput($item, $formData, $consumed); 65 | break; 66 | case 'textarea': 67 | $query = $this->parseTextArea($item, $formData, $consumed); 68 | break; 69 | case 'select': 70 | $query = $this->parseSelect($item, $formData, $consumed); 71 | break; 72 | default: 73 | $query = null; 74 | break; 75 | } 76 | 77 | if ($query) { 78 | $queryItems[] = $query; 79 | } 80 | } 81 | 82 | if (!$strict) { 83 | foreach ($formData as $queryName => $queryValue) { 84 | if (!in_array($queryName, $consumed)) { 85 | $queryItems[] = http_build_query([$queryName => $queryValue]); 86 | } 87 | } 88 | } 89 | 90 | if (true === $submit) { 91 | // when the submit button is pressed, and if the submit has a name, it will add an item in the query string 92 | 
$items = $this->cssQuery('input[type="submit"], button', $formNode); 93 | 94 | foreach ($items as $item) { 95 | if ($item->hasAttribute('disabled')) { 96 | continue; 97 | } 98 | $name = $item->hasAttribute('name') ? $item->getAttribute('name') : false; 99 | 100 | if ($item->tagName == 'input') { 101 | if (!$name) { 102 | break; 103 | } 104 | $value = $item->hasAttribute('value') 105 | ? $item->getAttribute('value') 106 | : 'Submit'; // chrome uses "Submit" as default value 107 | $queryItems[] = http_build_query([$name => $value]); 108 | break; 109 | } else { 110 | if ($item->hasAttribute('type') 111 | && in_array($item->getAttribute('type'), ['button', 'reset']) 112 | ) { 113 | // buttons with type = 'reset' or 'button' are not valid to submit the form 114 | continue; 115 | } else { 116 | if (!$name) { 117 | break; 118 | } 119 | $value = $item->hasAttribute('value') 120 | ? $item->getAttribute('value') 121 | : ''; 122 | $queryItems[] = http_build_query([$name => $value]); 123 | break; 124 | } 125 | } 126 | } 127 | } 128 | 129 | return $queryItems; 130 | } 131 | 132 | /** 133 | * Build a request from the given form. The form data, the form method and action will be considered 134 | * @param \DOMElement $formNode 135 | * @param array $formData @see formGetData 136 | * @param bool $strict @see formGetData 137 | * @param bool $submit @see formGetData 138 | * @param RequestInterface|null $request an optional request instance to fill with the form preset. 
139 | * If this parameter is omitted the method will try to find a request builder 140 | * from zendframework/zend-diactoros or guzzlehttp/psr7 141 | * @return RequestInterface 142 | */ 143 | public function requestFromForm( 144 | \DOMElement $formNode, 145 | array $formData = [], 146 | $strict = true, 147 | $submit = true, 148 | RequestInterface $request = null 149 | ) { 150 | 151 | 152 | if (null == $request) { 153 | $request = RequestBuilder::buildRequest(); 154 | } 155 | 156 | 157 | 158 | $formAction = $formNode->getAttribute('action'); 159 | if (!$formAction) { 160 | $formAction = ''; 161 | } 162 | $formUrl = $this->getUrl()->resolve($formAction); 163 | 164 | $method = $formNode->getAttribute('method'); 165 | if (!$method) { 166 | $method = 'get'; 167 | } else { 168 | $method = strtolower($method); 169 | } 170 | 171 | $queryItems = $this->formGetData($formNode, $formData, $strict, $submit); 172 | 173 | $url = $request->getUri() 174 | ->withScheme($this->nullToEmpty($formUrl->getScheme())) 175 | ->withUserInfo($this->nullToEmpty($formUrl->getUser()), $this->nullToEmpty($formUrl->getPass())) 176 | ->withHost($this->nullToEmpty($formUrl->getHost())) 177 | ->withPort($formUrl->getPort()) 178 | ->withPath($this->nullToEmpty($formUrl->getPath())) 179 | ->withFragment($this->nullToEmpty($formUrl->getHash())); 180 | 181 | if (!in_array(strtolower($method), ['post', 'put'])) { 182 | $url = $url->withQuery(implode('&', $queryItems)); 183 | } else { 184 | $body = $request->getBody(); 185 | $body->rewind(); 186 | $body->write(implode('&', $queryItems)); 187 | $request = $request->withBody($body); 188 | } 189 | 190 | // TODO enctype 191 | return $request->withUri($url)->withMethod($method); 192 | } 193 | 194 | private function nullToEmpty($data) 195 | { 196 | return $data === null ? 
'' : $data; 197 | } 198 | 199 | private function parseInput(\DOMElement $input, array $formData, array &$consumed) 200 | { 201 | $name = strtolower($input->getAttribute('name')); 202 | 203 | if (!$name) { 204 | return false; 205 | } 206 | 207 | if (isset($formData[$name])) { 208 | $consumed[] = $name; 209 | return http_build_query([$name => $formData[$name]]); 210 | } 211 | 212 | $inputType = strtolower($input->getAttribute('type')); 213 | switch ($inputType) { 214 | case 'file': 215 | // TODO ? 216 | break; 217 | case 'submit': 218 | // Submit are not parsed they are processed after because only 1 submit will be used 219 | break; 220 | case 'radio': 221 | case 'checkbox': 222 | if ($input->hasAttribute('checked')) { 223 | return http_build_query([$name => $input->getAttribute('value')]); 224 | } 225 | break; 226 | default: 227 | if ($input->hasAttribute('value')) { 228 | return http_build_query([$name => $input->getAttribute('value')]); 229 | } else { 230 | return urlencode($name); 231 | } 232 | break; 233 | } 234 | } 235 | 236 | private function parseSelect(\DOMElement $select, array $formData, array &$consumed) 237 | { 238 | 239 | $name = strtolower($select->getAttribute('name')); 240 | 241 | if (isset($formData[$name])) { 242 | $consumed[] = $name; 243 | return http_build_query([$name => $formData[$name]]); 244 | } 245 | 246 | $isMultiple = $select->hasAttribute('multiple'); 247 | if ($isMultiple) { 248 | $values = []; 249 | } 250 | 251 | 252 | $options = $this->cssQuery('option', $select); 253 | foreach ($options as $option) { 254 | /* @var \DOMElement $option */ 255 | if ($option->hasAttribute('disabled')) { 256 | continue; 257 | } 258 | if ($option->hasAttribute('selected')) { 259 | $optionValue = $option->hasAttribute('value') ? 
$option->getAttribute('value') : $option->nodeValue; 260 | if ($isMultiple) { 261 | $values[] = http_build_query([$name => $optionValue]); 262 | } else { 263 | return http_build_query([$name => $optionValue]); 264 | } 265 | } 266 | } 267 | 268 | 269 | if ($isMultiple) { 270 | return implode('&', $values); 271 | } else { 272 | return urlencode($name); 273 | } 274 | } 275 | 276 | private function parseTextArea(\DOMElement $textarea, array $formData, array &$consumed) 277 | { 278 | $name = strtolower($textarea->getAttribute('name')); 279 | 280 | if (isset($formData[$name])) { 281 | $consumed[] = $name; // record the field name so non-strict formGetData() does not append it a second time 282 | return http_build_query([$name => $formData[$name]]); 283 | } 284 | 285 | return http_build_query([$name => $textarea->nodeValue]); 286 | } 287 | } 288 | -------------------------------------------------------------------------------- /src/Core/Http/HttpClientInterface.php: -------------------------------------------------------------------------------- 1 | host = $host; 29 | $this->port = $port; 30 | $this->type = strtoupper($type); 31 | $this->user = $user; 32 | $this->password = $password; 33 | } 34 | 35 | public function getHost() 36 | { 37 | return $this->host; 38 | } 39 | 40 | public function getPort() 41 | { 42 | return $this->port; 43 | } 44 | 45 | public function getType() 46 | { 47 | return $this->type; 48 | } 49 | 50 | public function getUser() 51 | { 52 | return $this->user; 53 | } 54 | 55 | public function getPassword() 56 | { 57 | return $this->password; 58 | } 59 | 60 | public static function createFromString($proxy) 61 | { 62 | 63 | if (preg_match('#^[a-zA-Z0-9]+://#', $proxy)) { 64 | list($type, $proxy) = explode('://', $proxy, 2); 65 | } else { 66 | $type = 'HTTP'; 67 | } 68 | 69 | $proxyPieces = explode('@', $proxy); 70 | if (count($proxyPieces) == 2) { 71 | $authPieces = explode(':', $proxyPieces[0]); 72 | if (count($authPieces) > 2) { 73 | throw new Exception('Bad proxy string. 
Expected format: [user[:passsword]@]host:port'); 74 | } 75 | if (!isset($authPieces[1])) { 76 | $authPieces[1] = null; 77 | } 78 | $hostPieces = explode(':', $proxyPieces[1]); 79 | if (count($hostPieces) !== 2) { 80 | throw new Exception('Bad proxy string. Expected format: [user[:passsword]@]host:port'); 81 | } 82 | } elseif (count($proxyPieces) == 1) { 83 | $authPieces = [null, null]; 84 | $hostPieces = explode(':', $proxyPieces[0]); 85 | } else { 86 | throw new Exception('Bad proxy string. Expected format: [user[:passsword]@]host:port'); 87 | } 88 | $options['login'] = $authPieces[0]; 89 | $options['password'] = $authPieces[1]; 90 | return new self($hostPieces[0], $hostPieces[1], $authPieces[0], $authPieces[1], $type); 91 | } 92 | 93 | public function __toString() 94 | { 95 | $proxy = $this->getHost() . ':' . $this->getPort(); 96 | if ($user = $this->getUser()) { 97 | if ($password = $this->getPassword()) { 98 | $user .= ':' . $password; 99 | } 100 | $proxy = $user . '@' . $proxy; 101 | } 102 | 103 | if ($this->type) { 104 | $proxy = strtolower($this->getType()) . '://' . 
$proxy; 105 | } 106 | 107 | return $proxy; 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /src/Core/Http/ProxyInterface.php: -------------------------------------------------------------------------------- 1 | $v) { 45 | $this->headerNames[strtoupper($k)] = $k; 46 | } 47 | $this->httpResponseHeaders = $httpResponseHeaders; 48 | 49 | $this->httpResponseStatus = $httpResponseStatus; 50 | $this->pageEvaluated = (bool)$pageEvaluated; 51 | $this->pageContent = $pageContent; 52 | $this->initialUrl = $initialUrl; 53 | $this->effectiveUrl = $effectiveUrl; 54 | $this->proxy = $proxy; 55 | } 56 | 57 | /** 58 | * Get the header value or null if it does not exist 59 | * @param $headerName 60 | * @return null 61 | */ 62 | public function getHeader($headerName) 63 | { 64 | if ($this->hasHeader($headerName)) { 65 | return $this->httpResponseHeaders[$this->headerNames[strtoupper($headerName)]]; 66 | } 67 | return null; 68 | } 69 | 70 | /** 71 | * Check if the given header was in the http response 72 | * @param $headerName 73 | * @return bool 74 | */ 75 | public function hasHeader($headerName) 76 | { 77 | return isset($this->headerNames[strtoupper($headerName)]); 78 | } 79 | 80 | /** 81 | * all http response headers 82 | * @return array 83 | */ 84 | public function getHeaders() 85 | { 86 | return $this->httpResponseHeaders; 87 | } 88 | 89 | /** 90 | * the http response status code 91 | * @return int 92 | */ 93 | public function getHttpResponseStatus() 94 | { 95 | return $this->httpResponseStatus; 96 | } 97 | 98 | /** 99 | * Will return true if the page/javascript were evaluated, in this case dom might be updated 100 | * @return bool 101 | */ 102 | public function isPageEvaluated() 103 | { 104 | return $this->pageEvaluated; 105 | } 106 | 107 | /** 108 | * @return string 109 | */ 110 | public function getPageContent() 111 | { 112 | return $this->pageContent; 113 | } 114 | 115 | /** 116 | * the url that initiated the request 117 
| * @return UrlArchive 118 | */ 119 | public function getInitialUrl() 120 | { 121 | return $this->initialUrl; 122 | } 123 | 124 | /** 125 | * the final url of the request. In case of a redirection, that will be the final url of the redirection 126 | * @return UrlArchive 127 | */ 128 | public function getEffectiveUrl() 129 | { 130 | return $this->effectiveUrl; 131 | } 132 | 133 | /** 134 | * The proxy that was used. Will be null if no proxy was used 135 | * @return ProxyInterface|null 136 | */ 137 | public function getProxy() 138 | { 139 | return $this->proxy; 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /src/Core/Http/StackingHttpClient.php: -------------------------------------------------------------------------------- 1 | requestStack = []; 25 | } 26 | 27 | public function count() 28 | { 29 | return count($this->requestStack); 30 | } 31 | 32 | /** 33 | * @return [] 34 | */ 35 | public function getStack() 36 | { 37 | return $this->requestStack; 38 | } 39 | 40 | public function sendRequest( 41 | RequestInterface $request, 42 | ProxyInterface $proxy = null, 43 | CookieJarInterface $cookieJar = null 44 | ) { 45 | 46 | $requestData = new \stdClass(); 47 | 48 | $requestData->request = $request; 49 | $requestData->cookieJar = $cookieJar; 50 | $requestData->proxy = $proxy; 51 | 52 | $this->requestStack[] = $requestData; 53 | 54 | return new SearchEngineResponse( 55 | [], 56 | 200, 57 | '', 58 | false, 59 | UrlArchive::fromString($request->getUri()), 60 | UrlArchive::fromString($request->getUri()), 61 | $proxy 62 | ); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/Core/Media/AbstractMedia.php: -------------------------------------------------------------------------------- 1 | asString(); 14 | } 15 | 16 | public function asStream() 17 | { 18 | $stream = fopen('php://memory', 'r+'); 19 | fwrite($stream, $this->asString()); 20 | rewind($stream); 21 | return 
$stream; 22 | } 23 | 24 | public function asBase64() 25 | { 26 | return base64_encode($this->asString()); 27 | } 28 | 29 | public function saveFile($fileName) 30 | { 31 | file_put_contents($fileName, $this->asString()); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/Core/Media/Base64.php: -------------------------------------------------------------------------------- 1 | data = $data; 22 | } 23 | 24 | public function asBase64() 25 | { 26 | return $this->data; 27 | } 28 | 29 | public function asString() 30 | { 31 | return base64_decode($this->data); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/Core/Media/Binary.php: -------------------------------------------------------------------------------- 1 | data = $data; 22 | } 23 | 24 | public function asString() 25 | { 26 | return $this->data; 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/Core/Media/File.php: -------------------------------------------------------------------------------- 1 | file = $file; 23 | $this->useCache = $useCache; 24 | } 25 | 26 | private function getFileContent() 27 | { 28 | return @file_get_contents($this->file); 29 | } 30 | 31 | public function asString() 32 | { 33 | if ($this->useCache) { 34 | if (!$this->cache) { 35 | $this->cache = $this->getFileContent(); 36 | } 37 | return $this->cache; 38 | } else { 39 | return $this->getFileContent(); 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/Core/Media/MediaFactory.php: -------------------------------------------------------------------------------- 1 | stream = $stream; 25 | } 26 | 27 | public function asStream() 28 | { 29 | $newStream = fopen('php://memory', 'r+'); 30 | rewind($this->stream); 31 | stream_copy_to_stream($this->stream, $newStream); 32 | rewind($newStream); 33 | return $newStream; 34 | } 35 | 
36 | public function asString() 37 | { 38 | rewind($this->stream); 39 | return stream_get_contents($this->stream); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/Core/Psr7/RequestBuilder.php: -------------------------------------------------------------------------------- 1 | write($bodyStr); 49 | } 50 | 51 | return new ZendDiactorosRequest( 52 | $url ? (string) $url : '', 53 | $method ? $method : 'GET', 54 | $body ? $body : 'php://temp', 55 | $headers ? $headers : [] 56 | ); 57 | } 58 | 59 | protected static function requestFromGuzzlePsr7($url = null, $method = null, array $headers = null, $body = null) 60 | { 61 | return new GuzzlePsr7Request( 62 | $method ? $method : 'GET', 63 | $url ? (string) $url : '', 64 | $headers ? $headers : [], 65 | $body ? $body : '' 66 | ); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/Core/Serp/BaseResult.php: -------------------------------------------------------------------------------- 1 | types = is_array($types) ? $types : [$types]; 20 | $this->data = $data; 21 | } 22 | 23 | public function getTypes() 24 | { 25 | return $this->types; 26 | } 27 | 28 | /** 29 | * @param array ...$type 30 | * @return bool 31 | */ 32 | public function is($types) 33 | { 34 | $types = func_get_args(); 35 | 36 | $testedTypes = $this->getTypes(); 37 | 38 | foreach ($types as $type) { 39 | if (in_array($type, $testedTypes)) { 40 | return true; 41 | } 42 | } 43 | return false; 44 | } 45 | 46 | public function getDataValue($name) 47 | { 48 | $data = isset($this->data[$name]) ? 
$this->data[$name] : null; 49 | if ($data instanceof Closure) { 50 | $data = call_user_func($data, $this); 51 | $this->data[$name] = $data; 52 | return $this->getDataValue($name); 53 | } 54 | return $data; 55 | } 56 | 57 | 58 | public function __get($name) 59 | { 60 | return $this->getDataValue($name); 61 | } 62 | 63 | public function getData() 64 | { 65 | $data = []; 66 | foreach ($this->data as $k => $v) { 67 | $datum = $this->getDataValue($k); 68 | if (is_array($datum) || (is_object($datum) && $datum instanceof ResultSetInterface)) { 69 | // make sur datum is an array 70 | $baseDatum = $datum; 71 | $datum = []; 72 | 73 | foreach ($baseDatum as $subK => $subV) { 74 | if (is_object($subV) && $subV instanceof ResultDataInterface) { 75 | $datum[$subK] = $subV->getData(); 76 | } else { 77 | $datum[$subK] = $subV; 78 | } 79 | } 80 | } elseif (is_object($datum) && $datum instanceof ResultDataInterface) { 81 | $datum = $datum->getData(); 82 | } 83 | $data[$k] = $datum; 84 | } 85 | return $data; 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/Core/Serp/CompositeResultSet.php: -------------------------------------------------------------------------------- 1 | resultSets = $resultSets; 23 | } 24 | 25 | /** 26 | * @return ResultSetInterface[] 27 | */ 28 | public function getResultSets() 29 | { 30 | return $this->resultSets; 31 | } 32 | 33 | /** 34 | * Adds a result set to the internal list 35 | * @param ResultSetInterface $resultSet 36 | */ 37 | public function addResultSet(ResultSetInterface $resultSet) 38 | { 39 | $this->resultSets[] = $resultSet; 40 | } 41 | 42 | /** 43 | * @return ResultDataInterface[] 44 | */ 45 | public function getItems() 46 | { 47 | 48 | $items = []; 49 | 50 | foreach ($this->resultSets as $resultSet) { 51 | $items = array_merge($items, $resultSet->getItems()); 52 | } 53 | 54 | return $items; 55 | } 56 | 57 | /** 58 | * Get all the results matching one of the given types 59 | * @param array 
...$types 60 | * @return ResultDataInterface[] 61 | */ 62 | public function getResultsByType($types) 63 | { 64 | $types = func_get_args(); 65 | $items = new CompositeResultSet(); 66 | foreach ($this->resultSets as $resultSet) { 67 | $items->addResultSet( 68 | call_user_func_array([$resultSet, 'getResultsByType'], $types) 69 | ); 70 | } 71 | return $items; 72 | } 73 | 74 | /** 75 | * Checks if the ResultSet has one of the given types 76 | * @param string ...$types 77 | * @return bool 78 | */ 79 | public function hasType($types) 80 | { 81 | $types = func_get_args(); 82 | foreach ($this->resultSets as $resultSet) { 83 | if (call_user_func_array([$resultSet, 'hasType'], $types)) { 84 | return true; 85 | } 86 | } 87 | return false; 88 | } 89 | 90 | public function count() 91 | { 92 | $count = 0; 93 | foreach ($this->resultSets as $resultSet) { 94 | $count += count($resultSet); 95 | } 96 | 97 | return $count; 98 | } 99 | 100 | public function getIterator() 101 | { 102 | return new \ArrayIterator($this->getItems()); 103 | } 104 | 105 | 106 | 107 | public function offsetExists($offset) 108 | { 109 | return key_exists($offset, $this->getItems()); 110 | } 111 | 112 | public function offsetGet($offset) 113 | { 114 | return $this->getItems()[$offset]; 115 | } 116 | 117 | public function offsetSet($offset, $value) 118 | { 119 | throw new \Exception('Cannot set items in resultset, please use addResultSet()'); 120 | } 121 | 122 | public function offsetUnset($offset) 123 | { 124 | throw new \Exception('Deleting item is forbidden'); 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /src/Core/Serp/IndexedResultSet.php: -------------------------------------------------------------------------------- 1 | startingAt = $startingAt - 1; 26 | } 27 | 28 | /** 29 | * @param ResultDataInterface $item 30 | */ 31 | public function addItem(ResultDataInterface $item) 32 | { 33 | $itemCount = count($this->items) + 1; 34 | parent::addItem(new 
ItemPosition( 35 | $itemCount, 36 | $itemCount + $this->startingAt, 37 | $item 38 | )); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/Core/Serp/ItemPosition.php: -------------------------------------------------------------------------------- 1 | positionOnPage = $positionOnPage; 23 | $this->realPosition = $realPosition; 24 | 25 | parent::__construct($itemData); 26 | } 27 | 28 | /** 29 | * @return int the position of the item on the page (starting at 1) 30 | */ 31 | public function getOnPagePosition() 32 | { 33 | return $this->positionOnPage; 34 | } 35 | 36 | /** 37 | * @return int the general position of the item (starting at 1) 38 | */ 39 | public function getRealPosition() 40 | { 41 | return $this->realPosition; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/Core/Serp/ProxyResult.php: -------------------------------------------------------------------------------- 1 | itemData = $item; 19 | } 20 | 21 | 22 | /** 23 | * @param array ...$type 24 | * @return bool 25 | */ 26 | public function is($types) 27 | { 28 | $types = func_get_args(); 29 | return call_user_func_array([$this->itemData, 'is'], $types); 30 | } 31 | 32 | public function getDataValue($name) 33 | { 34 | return $this->itemData->getDataValue($name); 35 | } 36 | 37 | public function __get($name) 38 | { 39 | return $this->itemData->getDataValue($name); 40 | } 41 | 42 | public function getData() 43 | { 44 | return $this->itemData->getData(); 45 | } 46 | 47 | 48 | public function getTypes() 49 | { 50 | return $this->itemData->getTypes(); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/Core/Serp/ResultDataInterface.php: -------------------------------------------------------------------------------- 1 | items = array_merge($this->items, $items); 19 | } 20 | 21 | /** 22 | * @param ResultDataInterface $item 23 | */ 24 | public function 
addItem(ResultDataInterface $item) 25 | { 26 | $this->items[] = $item; 27 | } 28 | 29 | /** 30 | * @return ResultDataInterface[] 31 | */ 32 | public function getItems() 33 | { 34 | return $this->items; 35 | } 36 | 37 | /** 38 | * Get all the results matching one of the given types 39 | * @param array ...$types 40 | * @return ItemPosition[] 41 | */ 42 | public function getResultsByType($types) 43 | { 44 | $types = func_get_args(); 45 | $items = new ResultSet(); 46 | foreach ($this->items as $item) { 47 | if (call_user_func_array([$item, 'is'], $types)) { 48 | $items->addItem($item); 49 | } 50 | } 51 | return $items; 52 | } 53 | 54 | /** 55 | * Checks if the ResultSet has one of the given types 56 | * @param string ...$types 57 | * @return bool 58 | */ 59 | public function hasType($types) 60 | { 61 | $types = func_get_args(); 62 | 63 | foreach ($this->items as $item) { 64 | if (call_user_func_array([$item, 'is'], $types)) { 65 | return true; 66 | } 67 | } 68 | return false; 69 | } 70 | 71 | public function count() 72 | { 73 | return count($this->items); 74 | } 75 | 76 | public function getIterator() 77 | { 78 | return new \ArrayIterator($this->items); 79 | } 80 | 81 | public function offsetExists($offset) 82 | { 83 | return key_exists($offset, $this->items); 84 | } 85 | 86 | public function offsetGet($offset) 87 | { 88 | return $this->items[$offset]; 89 | } 90 | 91 | public function offsetSet($offset, $value) 92 | { 93 | throw new \Exception('Cannot set items in resultset, please use addItem()'); 94 | } 95 | 96 | public function offsetUnset($offset) 97 | { 98 | throw new \Exception('Deleting item is forbidden'); 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /src/Core/Serp/ResultSetInterface.php: -------------------------------------------------------------------------------- 1 | initWithDefaults( 27 | $scheme, 28 | $host, 29 | $path, 30 | $query, 31 | $hash, 32 | $port, 33 | $user, 34 | $pass 35 | ); 36 | } 37 | 
38 | /** 39 | * @inheritdoc 40 | */ 41 | public static function build( 42 | $scheme = null, 43 | $host = null, 44 | $path = null, 45 | array $query = [], 46 | $hash = null, 47 | $port = null, 48 | $user = null, 49 | $pass = null 50 | ) { 51 | return new static( 52 | $scheme, 53 | $host, 54 | $path, 55 | $query, 56 | $hash, 57 | $port, 58 | $user, 59 | $pass 60 | ); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/Core/Url/AlterableUrlInterface.php: -------------------------------------------------------------------------------- 1 | hash = $hash; 22 | return $this; 23 | } 24 | 25 | /** 26 | * Set the path. 27 | * ``/some/path`` in ``http://www.example.com/some/path`` 28 | * @param string $path 29 | * @return $this 30 | */ 31 | public function setPath($path) 32 | { 33 | $this->path = $path; 34 | return $this; 35 | } 36 | 37 | /** 38 | * Set the port 39 | * @param int $port 40 | */ 41 | public function setPort($port) 42 | { 43 | $this->port = $port; 44 | } 45 | 46 | /** 47 | * Set the user for auth 48 | * @param string $user 49 | */ 50 | public function setUser($user) 51 | { 52 | $this->user = $user; 53 | } 54 | 55 | /** 56 | * Set the pass for auth 57 | * @param string $pass 58 | */ 59 | public function setPass($pass) 60 | { 61 | $this->pass = $pass; 62 | } 63 | 64 | 65 | /** 66 | * Set the scheme. 67 | * ``http`` in ``http://www.example.com`` 68 | * @param string $scheme 69 | * @return $this 70 | */ 71 | public function setScheme($scheme) 72 | { 73 | $this->scheme = $scheme; return $this; 74 | } 75 | 76 | 77 | /** 78 | * Set the hostname. 79 | * ``www.example.com`` in ``http://www.example.com`` 80 | * @param string $host the hostname 81 | * @return $this 82 | */ 83 | public function setHost($host) 84 | { 85 | $this->host = $host; 86 | return $this; 87 | } 88 | 89 | 90 | /** 91 | * Add a parameter to the URL.
92 | * Parameter will come after the ``?`` e.g: ``http://example.com?param=value&param2=value`` 93 | * @param string $name name of the parameter 94 | * @param string $value value of the parameter 95 | * @param bool $raw by default params are encoded to be url (``foo bar`` becomes ``foo+bar``) pass it to true 96 | * to disable this encoding 97 | * @return $this 98 | */ 99 | public function setParam($name, $value, $raw = false) 100 | { 101 | $this->query[$name] = new QueryParam($name, $value, $raw); 102 | return $this; 103 | } 104 | 105 | /** 106 | * Remove current params and replace them with given params 107 | * @param array $params 108 | */ 109 | public function setParams(array $params) 110 | { 111 | $this->query = []; 112 | foreach ($params as $k => $v) { 113 | if (is_object($v)) { 114 | if ($v instanceof QueryParam) { 115 | $this->query[$v->getName()] = clone $v; 116 | } else { 117 | throw new \InvalidArgumentException('invalid query param item'); 118 | } 119 | } else { 120 | $this->query[$k] = new QueryParam($k, $v); 121 | } 122 | } 123 | } 124 | 125 | /** 126 | * Remove the given parameter 127 | * @param string $name name of the parameter to remove 128 | * @return $this 129 | */ 130 | public function removeParam($name) 131 | { 132 | if (isset($this->query[$name])) { 133 | unset($this->query[$name]); 134 | } 135 | return $this; 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /src/Core/Url/QueryParam.php: -------------------------------------------------------------------------------- 1 | raw = $raw; 36 | $this->name = $name; 37 | $this->value = $value; 38 | } 39 | 40 | /** 41 | * @return string 42 | */ 43 | public function getName() 44 | { 45 | return $this->name; 46 | } 47 | 48 | /** 49 | * @return string 50 | */ 51 | public function getValue() 52 | { 53 | if (!$this->isRaw()) { 54 | if (is_string($this->value)) { 55 | return urlencode($this->value); 56 | } else { 57 | return $this->value; 58 | } 59 | } 60 |
return $this->value; 61 | } 62 | 63 | public function getRawValue() 64 | { 65 | return $this->value; 66 | } 67 | 68 | /** 69 | * @return boolean 70 | */ 71 | public function isRaw() 72 | { 73 | return $this->raw; 74 | } 75 | 76 | 77 | /** 78 | * Generate the parameter to be appended to the url 79 | * @return string the parameter on this format: ``name=value`` 80 | */ 81 | public function generate() 82 | { 83 | return $this->queryItemToString($this->getValue()); 84 | } 85 | 86 | private function queryItemToString($value) 87 | { 88 | 89 | if (is_string($value)) { 90 | if (strlen($value) > 0) { 91 | return $this->getName() . '=' . $value; 92 | } 93 | } elseif (is_numeric($value)) { 94 | return $this->getName() . '=' . $value; 95 | } elseif (is_array($value)) { 96 | if (empty($value)) { 97 | return $this->getName(); 98 | } else { 99 | return $this->arrayToStringRecursive($this->getName(), $value); 100 | } 101 | } 102 | 103 | return (string) $this->getName(); 104 | } 105 | 106 | private function arrayToStringRecursive($currentKey, $dataArray) 107 | { 108 | $data = []; 109 | foreach ($dataArray as $k => $v) { 110 | $key = "{$currentKey}[{$k}]"; 111 | if (is_array($v)) { 112 | $str = $this->arrayToStringRecursive($key, $v); 113 | } else { 114 | if (is_null($v)) { 115 | $str = $key; 116 | } else { 117 | if (!$this->isRaw()) { 118 | $v = urlencode($v); 119 | } 120 | $str = $key . '=' .
$v; 121 | } 122 | } 123 | $data[] = $str; 124 | } 125 | return implode('&', $data); 126 | } 127 | 128 | public function __toString() 129 | { 130 | return $this->generate(); 131 | } 132 | 133 | public function __clone() 134 | { 135 | return new self($this->getName(), $this->getRawValue(), $this->isRaw()); 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /src/Core/Url/UrlArchiveInterface.php: -------------------------------------------------------------------------------- 1 | host = $host; 54 | $this->scheme = $scheme; 55 | $this->path = $path ; 56 | $this->hash = $hash; 57 | $this->port = $port; 58 | $this->user = $user; 59 | $this->pass = $pass; 60 | 61 | $this->query = []; 62 | foreach ($query as $k => $v) { 63 | if (is_object($v)) { 64 | if ($v instanceof QueryParam) { 65 | $this->query[$v->getName()] = clone $v; 66 | } else { 67 | throw new \InvalidArgumentException('invalid query param item'); 68 | } 69 | } else { 70 | $this->query[$k] = new QueryParam($k, $v); 71 | } 72 | } 73 | } 74 | 75 | /** 76 | * Alternative to the builtin parse_str php function that will replace periods with underscores 77 | * 78 | * Inspired by @elyobo solution (see link) 79 | * 80 | * @link https://gist.github.com/elyobo/6200838 81 | */ 82 | public static function parseStr($str) 83 | { 84 | 85 | if (!$str) { 86 | return []; 87 | } 88 | 89 | $foundKeys = []; 90 | $finalKeys = []; 91 | 92 | 93 | $source = preg_replace_callback( 94 | '/ 95 | # Match at start of string or & 96 | (?:^|(?<=&)) 97 | # Exclude cases where the period is in brackets, e.g. 
foo[bar.blarg] 98 | [^=&\[]* 99 | # Affected cases: periods and spaces 100 | (?:\.|%20| ) 101 | # Keep matching until assignment, next variable, end of string or 102 | # start of an array 103 | [^=&\[]* 104 | /x', 105 | function ($key) use (&$foundKeys) { 106 | $foundKeys[] = $key = base64_encode(urldecode($key[0])); 107 | return urlencode($key); 108 | }, 109 | $str 110 | ); 111 | 112 | parse_str($source, $data); 113 | 114 | foreach ($data as $key => $val) { 115 | // Only unprocess encoded keys 116 | 117 | if (!in_array($key, $foundKeys)) { 118 | $finalKeys[$key] = $val; 119 | } else { 120 | $key = base64_decode($key); 121 | $finalKeys[$key] = $val; 122 | } 123 | } 124 | 125 | return $finalKeys; 126 | } 127 | 128 | /** 129 | * @param array $urlItems url components keyed like the result of parse_url() (scheme, host, path, query, fragment, port, user, pass) 130 | * @return static 131 | */ 132 | public static function fromArray(array $urlItems) 133 | { 134 | 135 | if (isset($urlItems['query'])) { 136 | $query = self::parseStr($urlItems['query']); 137 | } else { 138 | $query = []; 139 | } 140 | 141 | return static::build( 142 | isset($urlItems['scheme']) ? $urlItems['scheme'] : null, 143 | isset($urlItems['host']) ? $urlItems['host'] : null, 144 | isset($urlItems['path']) ? $urlItems['path'] : null, 145 | $query, 146 | isset($urlItems['fragment']) ? $urlItems['fragment'] : null, 147 | isset($urlItems['port']) ? $urlItems['port'] : null, 148 | isset($urlItems['user']) ? $urlItems['user'] : null, 149 | isset($urlItems['pass']) ? $urlItems['pass'] : null 150 | ); 151 | } 152 | 153 | protected static function parseUrl($url) 154 | { 155 | // Normally a URI must be ASCII. However, often it's not and 156 | // parse_url might corrupt these strings. 157 | // 158 | // For that reason we take any non-ascii characters from the uri and 159 | // uriencode them first.
160 | // 161 | // code from https://github.com/fruux/sabre-uri 162 | $url = preg_replace_callback( 163 | '/[^[:ascii:]]/u', 164 | function ($matches) { 165 | return rawurlencode($matches[0]); 166 | }, 167 | $url 168 | ); 169 | 170 | return parse_url($url); 171 | } 172 | 173 | /** 174 | * Builds an url instance from an url string 175 | * @param string $url the url to parse 176 | * @return static 177 | */ 178 | public static function fromString($url) 179 | { 180 | $urlItems = self::parseUrl($url); 181 | return static::fromArray($urlItems); 182 | } 183 | 184 | /** 185 | * @return QueryParam[] 186 | */ 187 | public function getParams() 188 | { 189 | return $this->query; 190 | } 191 | 192 | public function getUser() 193 | { 194 | return $this->user; 195 | } 196 | 197 | public function getPass() 198 | { 199 | return $this->pass; 200 | } 201 | 202 | public function getPort() 203 | { 204 | return $this->port; 205 | } 206 | 207 | 208 | /** 209 | * Get the hash. 210 | * ``foo`` in ``http://www.example.com#foo`` 211 | * @return string 212 | */ 213 | public function getHash() 214 | { 215 | return $this->hash; 216 | } 217 | 218 | /** 219 | * Get the path. 220 | * ``some/path`` in ``http://www.example.com/some/path`` 221 | * @return string 222 | */ 223 | public function getPath() 224 | { 225 | return $this->path; 226 | } 227 | 228 | /** 229 | * Get the scheme. 230 | * ``http`` in ``http://www.example.com`` 231 | * @return string 232 | */ 233 | public function getScheme() 234 | { 235 | return $this->scheme; 236 | } 237 | 238 | /** 239 | * Get the hostname.
240 | * ``www.example.com`` in ``http://www.example.com`` 241 | * @return string 242 | */ 243 | public function getHost() 244 | { 245 | return $this->host; 246 | } 247 | 248 | /** 249 | * @param string $name 250 | * @param mixed $default 251 | * @return mixed 252 | */ 253 | public function getParamValue($name, $default = null) 254 | { 255 | if (isset($this->query[$name])) { 256 | return $this->query[$name]->getValue(); 257 | } 258 | return $default; 259 | } 260 | 261 | /** 262 | * @param string $name 263 | * @param mixed $default 264 | * @return mixed 265 | */ 266 | public function getParamRawValue($name, $default = null) 267 | { 268 | if (isset($this->query[$name])) { 269 | return $this->query[$name]->getRawValue(); 270 | } 271 | return $default; 272 | } 273 | 274 | /** 275 | * @param string $name 276 | * @return bool 277 | */ 278 | public function hasParam($name) 279 | { 280 | return isset($this->query[$name]); 281 | } 282 | 283 | public function getAuthority() 284 | { 285 | $authority = ''; 286 | 287 | if ($host = $this->getHost()) { 288 | if ($user = $this->getUser()) { 289 | $authority .= $user; 290 | if ($pass = $this->getPass()) { 291 | $authority .= ':' . $pass; 292 | } 293 | $authority .= '@'; 294 | } 295 | 296 | $authority .= $this->getHost(); 297 | 298 | if ($port = $this->getPort()) { 299 | if (!(80 == $port && 'http' === $this->getScheme()) 300 | && !(443 == $port && 'https' === $this->getScheme() ) 301 | ) { 302 | $authority .= ':' . $port; 303 | } 304 | } 305 | } 306 | 307 | return $authority; 308 | } 309 | 310 | /** 311 | * Get the full uri: ``http://www.example.com/path?param=value#hash`` 312 | * @return string 313 | */ 314 | public function buildUrl() 315 | { 316 | $scheme = $this->getScheme(); 317 | if ($scheme) { 318 | $uri = $scheme . ':'; 319 | } else { 320 | $uri = ''; 321 | } 322 | 323 | if ($authority = $this->getAuthority()) { 324 | $uri .= '//' . $authority; 325 | } 326 | 327 | if ($path = $this->getPath()) { 328 | $uri .= '/' . 
ltrim($path, '/'); 329 | } 330 | 331 | if ($query = $this->getQueryString()) { 332 | $uri .= '?' . $query; 333 | } 334 | 335 | if ($hash = $this->getHash()) { 336 | $uri .= '#' . $this->getHash(); 337 | } 338 | 339 | 340 | return $uri; 341 | } 342 | 343 | public function __toString() 344 | { 345 | return $this->buildUrl(); 346 | } 347 | 348 | /** 349 | * Get the query string. 350 | * ``foo=bar&bar=foo`` in ``http://www.example.com?foo=bar&bar=foo`` 351 | * @return string 352 | */ 353 | public function getQueryString() 354 | { 355 | return implode('&', $this->query); 356 | } 357 | 358 | /** 359 | * Resolve the given url, and returns it as an alterable url, that's made for minor performance update between 360 | * resolve and resolveAsString (thus resolveAsString does not need double transformation of the url) 361 | * This method must remain private 362 | * 363 | * @param string $a class to resolve to, must be an alterableUrl class 364 | * @param string $url url to resolve 365 | * @return AlterableUrlInterface 366 | */ 367 | private function resolveAsAlterableUrl($url, $as) 368 | { 369 | $delta = call_user_func([$as, 'fromString'], $url); 370 | 371 | if (!($scheme = $delta->getScheme())) { 372 | $delta->setScheme($this->getScheme()); 373 | 374 | $path = $delta->getPath(); 375 | if (empty($delta->getAuthority())) { 376 | $delta->setUser($this->getUser()); 377 | $delta->setPass($this->getPass()); 378 | $delta->setHost($this->getHost()); 379 | $delta->setPort($this->getPort()); 380 | 381 | if (empty($path)) { 382 | $path = $this->getPath(); 383 | if (empty($delta->getParams())) { 384 | $delta->setParams($this->getParams()); 385 | } 386 | } elseif ('/' !== $path[0]) { 387 | $path = $this->getPath(); 388 | if (strpos($path, '/') !== false) { 389 | $path = substr($path, 0, strrpos($path, '/')); 390 | } 391 | $path .= '/' . $delta->getPath(); 392 | } 393 | } 394 | 395 | // Removing .. and . 
396 | $pathParts = explode('/', $path); 397 | $newPathParts = []; 398 | foreach ($pathParts as $pathPart) { 399 | switch ($pathPart) { 400 | //case '' : 401 | case '.': 402 | break; 403 | case '..': 404 | array_pop($newPathParts); 405 | break; 406 | default: 407 | $newPathParts[] = $pathPart; 408 | break; 409 | } 410 | } 411 | $path = implode('/', $newPathParts); 412 | 413 | // If ends with . or .. we want to preserve / at end 414 | $lastItem = end($pathParts); 415 | if ('.' === $lastItem || '..' === $lastItem) { 416 | $path .= '/'; 417 | } 418 | 419 | $delta->setPath($path); 420 | 421 | // In every cases we want to keep $delta hash 422 | } 423 | 424 | return $delta; 425 | } 426 | 427 | /** 428 | * @see UrlArchiveInterface::resolve 429 | */ 430 | public function resolve($url, $as = null) 431 | { 432 | if (null === $as) { 433 | $as = static::class; 434 | $implements = class_implements($as, true); 435 | } else { 436 | if (!is_string($as)) { 437 | throw new \InvalidArgumentException( 438 | 'Invalid argument for UrlArchive::resolve(), the class name must be a string' 439 | ); 440 | } elseif (!class_exists($as, true)) { 441 | throw new \InvalidArgumentException($as . ' class does not exist'); 442 | } 443 | 444 | // Check if the given class implements urlArchive 445 | $implements = class_implements($as, true); 446 | 447 | if (!in_array(UrlArchiveInterface::class, $implements)) { 448 | throw new \InvalidArgumentException( 449 | 'Invalid argument for ' . __CLASS__ . '::' . __METHOD__ . ', the specified class must implement' 450 | . 
UrlArchiveInterface::class 451 | ); 452 | } 453 | } 454 | 455 | // If not resolved as an alterable url we need to use an alterable url and to transform it latter 456 | if (!in_array(AlterableUrlInterface::class, $implements)) { 457 | return $this->resolveAsAlterableUrl($url, Url::class)->cloneAs($as); 458 | } else { 459 | return $this->resolveAsAlterableUrl($url, Url::class); 460 | } 461 | } 462 | 463 | public function resolveAsString($url) 464 | { 465 | return $this->resolveAsAlterableUrl($url, Url::class)->buildUrl(); 466 | } 467 | 468 | /** 469 | * @param null $as 470 | * @return UrlArchiveInterface 471 | */ 472 | public function cloneAs($as = null) 473 | { 474 | 475 | if (null === $as) { 476 | $as = static::class; 477 | } else { 478 | if (!is_string($as)) { 479 | throw new \InvalidArgumentException('Invalid argument for ' . static::class . '::' . __METHOD__); 480 | } elseif (!class_exists($as, true)) { 481 | throw new \InvalidArgumentException($as . ' class does not exist'); 482 | } 483 | 484 | // Check if the given class implements urlArchive 485 | $implements = class_implements($as, true); 486 | 487 | if (!in_array(UrlArchiveInterface::class, $implements)) { 488 | throw new \InvalidArgumentException( 489 | 'Invalid argument for ' . __CLASS__ . '::' . __METHOD__ . '(), the specified class must implement' 490 | . 
UrlArchiveInterface::class 491 | ); 492 | } 493 | } 494 | 495 | return call_user_func( 496 | [$as, 'build'], 497 | $this->getScheme(), 498 | $this->getHost(), 499 | $this->getPath(), 500 | $this->getParams(), 501 | $this->getHash(), 502 | $this->getPort(), 503 | $this->getUser(), 504 | $this->getPass() 505 | ); 506 | } 507 | 508 | public function __clone() 509 | { 510 | return $this->cloneAs(); 511 | } 512 | } 513 | -------------------------------------------------------------------------------- /src/Core/UrlArchive.php: -------------------------------------------------------------------------------- 1 | initWithDefaults( 25 | $scheme, 26 | $host, 27 | $path, 28 | $query, 29 | $hash, 30 | $port, 31 | $user, 32 | $pass 33 | ); 34 | } 35 | 36 | /** 37 | * @inheritdoc 38 | */ 39 | public static function build( 40 | $scheme = null, 41 | $host = null, 42 | $path = null, 43 | array $query = [], 44 | $hash = null, 45 | $port = null, 46 | $user = null, 47 | $pass = null 48 | ) { 49 | return new static( 50 | $scheme, 51 | $host, 52 | $path, 53 | $query, 54 | $hash, 55 | $port, 56 | $user, 57 | $pass 58 | ); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/Exception.php: -------------------------------------------------------------------------------- 1 | captchaType = $captchaType; 29 | 30 | $captchaSolverClass = get_class($captchaSolver); 31 | $message = "Captcha of type $captchaType is not solvable by $captchaSolverClass."; 32 | if ($additionalMessage) { 33 | $message .= ' ' . 
$additionalMessage; 34 | } 35 | 36 | parent::__construct($message, $code, $previous); 37 | } 38 | 39 | /** 40 | * @return string the captcha type that is unknown 41 | */ 42 | public function getCaptchaType() 43 | { 44 | return $this->captchaType; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/Exception/InvalidCookieException.php: -------------------------------------------------------------------------------- 1 | captcha = $captchaResponse; 21 | } 22 | 23 | /** 24 | * @return CaptchaResponse 25 | */ 26 | public function getCaptcha() 27 | { 28 | return $this->captcha; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/Exception/RequestError/InvalidResponseException.php: -------------------------------------------------------------------------------- 1 | searchEngineResponse = $response; 23 | parent::__construct($message, $code, $previous); 24 | } 25 | 26 | /** 27 | * @return string 28 | */ 29 | public function getHttpStatusCode() 30 | { 31 | return $this->searchEngineResponse->getHttpResponseStatus(); 32 | } 33 | 34 | /** 35 | * @return SearchEngineResponse 36 | */ 37 | public function getResponse() 38 | { 39 | return $this->searchEngineResponse; 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/Exception/RequestError/NetworkErrorException.php: -------------------------------------------------------------------------------- 1 |