├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── composer.json ├── phpunit.xml.dist ├── serp-scraper.jpg ├── src ├── Exceptions │ ├── Exception.php │ ├── InvalidArgumentException.php │ ├── RuntimeException.php │ └── UnsupportedEngineException.php ├── Helpers │ ├── FileSystemHelper.php │ ├── KeywordValidator.php │ ├── SerpScraperHelper.php │ ├── SerpUrlGenerator.php │ └── TestHelper.php ├── Scrapers │ ├── AskScraper.php │ ├── BingScraper.php │ ├── GoogleScraper.php │ ├── SerpScraper.php │ └── YahooScraper.php └── SerpScraperBuilder.php └── tests ├── SerpScraperBuilderTest.php ├── SerpScraperHelpersTest.php └── SerpScraperTest.php /.gitignore: -------------------------------------------------------------------------------- 1 | vendor/ 2 | composer.phar 3 | composer.lock 4 | fetcher_cache/ 5 | **/fetcher_cache/ 6 | serializer_cache/ 7 | **/serializer_cache/ 8 | out/ 9 | **/out/ 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: php 2 | 3 | php: 4 | - 5.4 5 | - 5.5 6 | - 5.6 7 | - hhvm 8 | 9 | before_script: 10 | - composer self-update 11 | - composer install --prefer-source --no-interaction --dev 12 | 13 | script: 14 | - mkdir -p build/logs 15 | - phpunit --coverage-clover build/logs/clover.xml 16 | 17 | after_script: 18 | - php vendor/bin/coveralls -v 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Francesco Pezzella 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or 
sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/franzip/serp-scraper.svg?branch=master)](https://travis-ci.org/franzip/serp-scraper) 2 | [![Coverage Status](https://coveralls.io/repos/franzip/serp-scraper/badge.svg)](https://coveralls.io/r/franzip/serp-scraper) 3 | 4 | # SerpScraper 5 | A library to extract, serialize and store data scraped on Search Engine result pages. 
6 | 7 | ## Installing via Composer (recommended) 8 | 9 | Install composer in your project: 10 | ``` 11 | curl -s http://getcomposer.org/installer | php 12 | ``` 13 | 14 | Create a composer.json file in your project root: 15 | ``` 16 | { 17 | "require": { 18 | "franzip/serp-scraper": "0.1.*@dev" 19 | } 20 | } 21 | ``` 22 | 23 | Install via composer 24 | ``` 25 | php composer.phar install 26 | ``` 27 | 28 | ## Supported Search Engines 29 | 30 | * Google 31 | * Bing 32 | * Ask 33 | * Yahoo 34 | 35 | ## Supported Serialization formats 36 | 37 | * JSON 38 | * XML 39 | * YAML 40 | 41 | ## Legal Disclaimer 42 | 43 | Under no circumstances shall I be considered liable to any user for direct, 44 | indirect, incidental, consequential, special, or exemplary damages, arising 45 | from or relating to the user's use or misuse of this software. 46 | Consult the following Terms of Service before using SerpScraper: 47 | 48 | * [Google](https://www.google.com/accounts/TOS) 49 | * [Bing](http://windows.microsoft.com/en-us/windows/microsoft-services-agreement) 50 | * [Ask](http://about.ask.com/terms-of-service) 51 | * [Yahoo](https://info.yahoo.com/legal/us/yahoo/utos/en-us/) 52 | 53 | ## How it works in a nutshell 54 | 55 | ![SerpScraper Diagram](./serp-scraper.jpg?raw=true "SerpScraper Diagram") 56 | 57 | ## Description 58 | 59 | The legal status of scraping seems to be quite disputed. Anyway, this library tries 60 | to avoid unnecessary HTTP overhead by using three strategies: 61 | 62 | - Throttling: [an internal object](https://github.com/franzip/throttler) takes care of capping the number of 63 | allowed HTTP requests to a default of 15 per hour. Once that limit has been reached, 64 | it will not be possible to scrape more content until the timeframe expires. 65 | 66 | - Caching: [the library used to retrieve data](https://github.com/franzip/serp-fetcher) caches every fetched page. The 67 | default cache expiration is set to 24 hours.
68 | 69 | - Delaying: a simple and quite naive approach is used here. Multiple HTTP requests 70 | will be spaced out by a default delay of 500 microseconds (0.5 ms). 71 | 72 | ## Constructor details 73 | 74 | This is the abstract constructor, used by all the concrete implementations: 75 | 76 | ```php 77 | SerpScraper($keywords, $outDir = 'out', $fetcherCacheDir = 'fetcher_cache', 78 | $serializerCacheDir = 'serializer_cache', $cacheTTL = 24, 79 | $requestDelay = 500); 80 | ``` 81 | 82 | 1. `$keywords` - array 83 | - The keywords you want to scrape. Cannot be an empty array. 84 | 2. `$outDir` - string 85 | - Path to the folder to be used to store serialized pages. 86 | 3. `$fetcherCacheDir` - string 87 | - Path to the folder to be used to store [SerpFetcher](https://github.com/franzip/serp-fetcher) cache. 88 | 4. `$serializerCacheDir` - string 89 | - Path to the folder to be used to store [SerpPageSerializer](https://github.com/franzip/serp-page-serializer) cache. 90 | 5. `$cacheTTL` - integer 91 | - Expiration time of the [SerpFetcher](https://github.com/franzip/serp-fetcher) cache, expressed in hours. 92 | 6. `$requestDelay` - integer 93 | - Delay to use between multiple HTTP requests, expressed in microseconds. 94 | 95 | ## Building a Scraper (using Factory) 96 | 97 | Specify the vendor as the first argument. You can specify custom settings using an 98 | array as the second argument (see the SerpScraper constructor above). 99 | 100 | ```php 101 | use Franzip\SerpScraper\SerpScraperBuilder; 102 | 103 | $googleScraper = SerpScraperBuilder::create('Google', array(array('keyword1', 104 | 'keyword2', 105 | ...))); 106 | 107 | $askScraper = SerpScraperBuilder::create('Ask', array(array('key1', 'key2'))); 108 | $bingScraper = SerpScraperBuilder::create('Bing', array(array('baz', 'foo'))); 109 | ...
110 | ``` 111 | 112 | ## Building a Scraper (with explicit constructors) 113 | 114 | ```php 115 | use Franzip\SerpScraper\Scrapers\GoogleScraper; 116 | use Franzip\SerpScraper\Scrapers\AskScraper; 117 | use Franzip\SerpScraper\Scrapers\BingScraper; 118 | use Franzip\SerpScraper\Scrapers\YahooScraper; 119 | 120 | $googleScraper = new GoogleScraper($keywords = array('foo', 'bar'), 121 | $outDir = 'google_results'); 122 | $askScraper = new AskScraper($keywords = array('foo', 'bar'), 123 | $outDir = 'ask_results'); 124 | ... 125 | ``` 126 | 127 | ## scrape() and scrapeAll() 128 | 129 | You can scrape a single tracked keyword with ```scrape()```, or scrape all the 130 | tracked keywords using ```scrapeAll()```. 131 | 132 | ```scrape()``` signature: 133 | ```php 134 | $serpScraper->scrape($keyword, $pagesToScrape = 1, $toRemove = false, 135 | $timezone = 'UTC', $throttling = true); 136 | ``` 137 | 138 | Usage example: 139 | 140 | ```php 141 | // Scrape the first 5 pages for the keyword 'foo', remove it from the tracked 142 | // keywords, use the Los Angeles timezone and don't use throttling. 143 | $serpScraper->scrape('foo', 5, true, 'America/Los_Angeles', false); 144 | ``` 145 | 146 | ```scrapeAll()``` signature: 147 | 148 | ```php 149 | $serpScraper->scrapeAll($pagesToScrape = 1, $toRemove = false, $timezone = 'UTC', 150 | $throttling = true); 151 | ``` 152 | 153 | Usage example: 154 | 155 | ```php 156 | // Scrape the first 5 pages for all the tracked keywords, remove them all from 157 | // tracked keywords, use the Berlin timezone and don't use throttling. 158 | $serpScraper->scrapeAll(5, true, 'Europe/Berlin', false); 159 | // keywords array has been emptied 160 | var_dump($serpScraper->getKeywords()); 161 | // array() 162 | ``` 163 | 164 | ## serialize() and getFetchedPages() 165 | 166 | Serialize all the results fetched so far. Supported formats are: JSON, XML and 167 | YAML. 168 | You can access the fetched array by calling ```getFetchedPages()```.
169 | 170 | ```serialize()``` signature: 171 | ```php 172 | $serpScraper->serialize($format, $toRemove = false); 173 | ``` 174 | 175 | Usage example: 176 | 177 | ```php 178 | $serpScraper->serialize($format, $toRemove = false); 179 | // serialize to JSON the stuff retrieved so far 180 | $serpScraper->serialize('json'); 181 | // serialize to XML the stuff retrieved so far 182 | $serpScraper->serialize('xml'); 183 | // fetched pages are still there 184 | var_dump($serpScraper->getFetchedPages()); 185 | // array( 186 | // object(Franzip\SerpPageSerializer\Models\SerializableSerpPage) (1), 187 | // ... 188 | // ) 189 | 190 | // now serialize to YAML the stuff retrieved so far and empty the fetched data 191 | $serpScraper->serialize('yml', true); 192 | // fetched array is now empty 193 | var_dump($serpScraper->getFetchedPages()); 194 | // array() 195 | ``` 196 | 197 | ## save() and getSerializedPages() 198 | 199 | Write to files the serialized results so far. 200 | The format used as filename is the following: 201 | *vendor_keyword_pagenumber_time.format* | *google_foo_3_12032015.json* 202 | 203 | ```save()``` signature: 204 | ```php 205 | $serpScraper->save($toRemove = false) 206 | ``` 207 | 208 | Usage example: 209 | 210 | ```php 211 | // write serialized results so far to the specified output folder 212 | $serpScraper->save(); 213 | // serialized pages are still there 214 | var_dump($serpScraper->getSerializedPages()); 215 | // array( 216 | // object(Franzip\SerpPageSerializer\Models\SerializedSerpPage) (1), 217 | // ... 218 | // ) 219 | 220 | // write serialized results so far to the specified output folder and remove 221 | // them from the serialized array 222 | $serpScraper->save(true); 223 | // serialized array is now empty 224 | var_dump($serpScraper->getSerializedPages()); 225 | // array() 226 | ``` 227 | 228 | ## Adding/Removing keywords. 
229 | 230 | ```php 231 | $serpScraper->addKeyword('bar'); 232 | $serpScraper->addKeywords(array('foo', 'bar', ...)); 233 | $serpScraper->removeKeyword('bar'); 234 | ``` 235 | 236 | ## Cache flushing 237 | 238 | You can call ```flushCache()``` anytime. This will remove all the cached files 239 | used by the ```SerpFetcher``` component and will also remove all the entries 240 | from the fetched and serialized arrays. 241 | 242 | ```php 243 | 244 | $serpScraper->flushCache(); 245 | var_dump($serpScraper->getFetchedPages()); 246 | // array() 247 | var_dump($serpScraper->getSerializedPages()); 248 | // array() 249 | ``` 250 | 251 | ## Basic usage 252 | 253 | ```php 254 | use Franzip\SerpScraper\SerpScraperBuilder; 255 | 256 | $googleScraper = SerpScraperBuilder::create('Google', array(array('keyword1', 257 | 'keyword2', 258 | 'keyword3'))); 259 | // scrape the first page for 'keyword1' 260 | $googleScraper->scrape('keyword1'); 261 | // scrape the first 5 pages for 'keyword2' 262 | $googleScraper->scrape('keyword2', 5); 263 | // serialize to JSON what has been scraped so far 264 | $googleScraper->serialize('json'); 265 | // 266 | ... 267 | ``` 268 | 269 | ## Using multiple output folders 270 | 271 | You can use different output folders as you see fit. In this case, the same 272 | keywords will be scraped once but the results will be written to different folders, 273 | based on their serialization format. 274 | Since the results are cached, the ```serialize()``` method will use the same 275 | data over and over again. 276 | 277 | ```php 278 | use Franzip\SerpScraper\SerpScraperBuilder; 279 | 280 | $googleScraper = SerpScraperBuilder::create('Google', 281 | array(array('foo', 'baz', ...))); 282 | 283 | // output folders 284 | $xmlDir = 'google_results/xml'; 285 | $jsonDir = 'google_results/json'; 286 | $yamlDir = 'google_results/yaml'; 287 | 288 | ... 289 | // scraping action happens here...
290 | 291 | // write xml results first 292 | $googleScraper->serialize('xml'); 293 | $googleScraper->setOutDir($xmlDir); 294 | $googleScraper->save(); 295 | // now json 296 | $googleScraper->serialize('json'); 297 | $googleScraper->setOutDir($jsonDir); 298 | $googleScraper->save(); 299 | // write yaml results, we can now empty the fetched array 300 | $googleScraper->serialize('yml', true); 301 | $googleScraper->setOutDir($yamlDir); 302 | $googleScraper->save(); 303 | 304 | ``` 305 | 306 | ## TODOs 307 | 308 | - [ ] Avoid request delay on cache hit. 309 | - [ ] Validate YAML results in the tests (couldn't find a suitable library so far). 310 | - [ ] Improve docs with better organization and more examples. 311 | - [ ] Refactor messy tests. 312 | 313 | ## License 314 | [MIT](http://opensource.org/licenses/MIT/ "MIT") Public License. 315 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "franzip/serp-scraper", 3 | "type": "library", 4 | "description": "A library to extract, serialize and store data scraped on Search Engine result pages.", 5 | "license": "MIT", 6 | "homepage": "http://github.com/franzip/serp-scraper", 7 | "keywords": ["scrape", "scraper", "serp", "data", "harvesting", "search-engine", "google"], 8 | "authors": [{ 9 | "name": "Francesco Pezzella", 10 | "email": "franzpezzella@gmail.com" 11 | }], 12 | "minimum-stability": "dev", 13 | "require": { 14 | "php": ">=5.4.0", 15 | "franzip/throttler": "0.2.*@dev", 16 | "franzip/serp-fetcher": "0.2.*@dev", 17 | "franzip/serp-page-serializer": "0.2.*@dev" 18 | }, 19 | "require-dev": { 20 | "phpunit/phpunit": "4.0.*", 21 | "seld/jsonlint": "dev-master", 22 | "satooshi/php-coveralls": "dev-master" 23 | }, 24 | "autoload": { 25 | "psr-4": { 26 | "Franzip\\SerpScraper\\": "src/" 27 | } 28 | }, 29 | "extra": { 30 | "branch-alias": { 31 | "dev-master": "0.1.x-dev" 32
| } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /phpunit.xml.dist: -------------------------------------------------------------------------------- 1 | 2 | 12 | 13 | 14 | ./src 15 | 16 | ./vendor 17 | ./tests 18 | 19 | 20 | 21 | 22 | 23 | tests/ 24 | SerpScraperHelpersTest.php 25 | 26 | 27 | tests/ 28 | SerpScraperBuilderTest.php 29 | 30 | 31 | tests/ 32 | SerpScraperTest.php 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /serp-scraper.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/franzip/serp-scraper/56bfeec1b2b1a744e16ff657d2f0abf67a21db5d/serp-scraper.jpg -------------------------------------------------------------------------------- /src/Exceptions/Exception.php: -------------------------------------------------------------------------------- 1 | 0; 64 | } 65 | 66 | /** 67 | * Check for valid scraping args. 68 | * @param array $keywords 69 | * @param int $pagesToScrape 70 | * @param bool $toRemove 71 | * @param string $timezone 72 | * @param bool $throttling 73 | * @param array $trackingKeywords 74 | * @return bool 75 | */ 76 | public static function validScrapeArgs($keywords, $pagesToScrape, $toRemove, 77 | $timezone, $throttling, $trackingKeywords) 78 | { 79 | return \Franzip\SerpScraper\Helpers\KeywordValidator::validKeywords($keywords) 80 | && self::keywordsAllTracked($keywords, $trackingKeywords) 81 | && is_int($pagesToScrape) 82 | && $pagesToScrape > 0 && is_bool($toRemove) 83 | && in_array($timezone, \DateTimeZone::listIdentifiers()) 84 | && is_bool($throttling); 85 | } 86 | 87 | /** 88 | * Check that a keyword is being tracked. 
89 | * @param string $keyword 90 | * @param array $trackingKeywords 91 | * @return bool 92 | */ 93 | public static function keywordPresent($keyword, $trackingKeywords) 94 | { 95 | return in_array($keyword, $trackingKeywords); 96 | } 97 | 98 | /** 99 | * Check that all supplied keywords are being tracked. 100 | * @param array $keywords 101 | * @param array $trackingKeywords 102 | * @return bool 103 | */ 104 | public static function keywordsAllTracked($keywords, $trackingKeywords) 105 | { 106 | return !in_array(false, array_map(function($keyword) use ($trackingKeywords) { 107 | return in_array($keyword, $trackingKeywords); 108 | }, $keywords)); 109 | } 110 | 111 | /** 112 | * Extract relevant data from a SerializableSerpPage. 113 | * @param SerializableSerpPage $serializablePage 114 | * @return array 115 | */ 116 | public static function extractSerializablePageData($serializablePage) 117 | { 118 | $keyword = $serializablePage->getKeyword(); 119 | $pageNumber = $serializablePage->getPageNumber(); 120 | $age = $serializablePage->getAge()->format('Y-m-d');; 121 | return array($keyword, $pageNumber, $age); 122 | } 123 | 124 | private function __construct() {} 125 | } 126 | -------------------------------------------------------------------------------- /src/Helpers/SerpUrlGenerator.php: -------------------------------------------------------------------------------- 1 | isDir() && !$fileinfo->isDot() 45 | && !in_array($fileinfo->getFileName(), $dontDelete)) { 46 | self::rrmdir($fileinfo->getFilename()); 47 | } 48 | } 49 | } 50 | 51 | /** 52 | * Allow testing private methods. 53 | * @param string $name 54 | * @param string $className 55 | * @return callable 56 | */ 57 | public static function getMethod($name, $className) 58 | { 59 | $classQualifiedName = Builder::SCRAPER_CLASS_PREFIX . $className . 
Builder::SCRAPER_CLASS_SUFFIX; 60 | $class = new \ReflectionClass($classQualifiedName); 61 | $method = $class->getMethod($name); 62 | $method->setAccessible(true); 63 | return $method; 64 | } 65 | 66 | private function __construct() {} 67 | } 68 | -------------------------------------------------------------------------------- /src/Scrapers/AskScraper.php: -------------------------------------------------------------------------------- 1 | 7 | * @link https://github.com/franzip/serp-scraper 8 | * @copyright Copyright 2015 Francesco Pezzella 9 | * @license http://www.opensource.org/licenses/mit-license.php MIT License 10 | * @package SerpScraper 11 | */ 12 | 13 | namespace Franzip\SerpScraper\Scrapers; 14 | 15 | /** 16 | * SerpScraper implementation for Ask search engine. 17 | * 18 | * @package SerpScraper 19 | */ 20 | class AskScraper extends SerpScraper 21 | { 22 | 23 | } 24 | -------------------------------------------------------------------------------- /src/Scrapers/BingScraper.php: -------------------------------------------------------------------------------- 1 | 7 | * @link https://github.com/franzip/serp-scraper 8 | * @copyright Copyright 2015 Francesco Pezzella 9 | * @license http://www.opensource.org/licenses/mit-license.php MIT License 10 | * @package SerpScraper 11 | */ 12 | 13 | namespace Franzip\SerpScraper\Scrapers; 14 | 15 | /** 16 | * SerpScraper implementation for Bing search engine. 
17 | * 18 | * @package SerpScraper 19 | */ 20 | class BingScraper extends SerpScraper 21 | { 22 | 23 | } 24 | -------------------------------------------------------------------------------- /src/Scrapers/GoogleScraper.php: -------------------------------------------------------------------------------- 1 | 7 | * @link https://github.com/franzip/serp-scraper 8 | * @copyright Copyright 2015 Francesco Pezzella 9 | * @license http://www.opensource.org/licenses/mit-license.php MIT License 10 | * @package SerpScraper 11 | */ 12 | 13 | namespace Franzip\SerpScraper\Scrapers; 14 | 15 | /** 16 | * SerpScraper implementation for Google search engine. 17 | * 18 | * @package SerpScraper 19 | */ 20 | class GoogleScraper extends SerpScraper 21 | { 22 | 23 | } 24 | -------------------------------------------------------------------------------- /src/Scrapers/SerpScraper.php: -------------------------------------------------------------------------------- 1 | 7 | * @link https://github.com/franzip/serp-scraper 8 | * @copyright Copyright 2015 Francesco Pezzella 9 | * @license http://www.opensource.org/licenses/mit-license.php MIT License 10 | * @package SerpScraper 11 | */ 12 | 13 | namespace Franzip\SerpScraper\Scrapers; 14 | use Franzip\SerpScraper\Helpers\SerpScraperHelper; 15 | use Franzip\SerpScraper\Helpers\FileSystemHelper; 16 | use Franzip\SerpScraper\Helpers\SerpUrlGenerator; 17 | use Franzip\SerpScraper\Helpers\KeywordValidator; 18 | use Franzip\SerpFetcher\SerpFetcherBuilder; 19 | use Franzip\SerpPageSerializer\Models\SerializableSerpPage; 20 | use Franzip\SerpPageSerializer\SerpPageSerializer; 21 | use Franzip\Throttler\Throttler; 22 | 23 | /** 24 | * Abstract class describing a SerpScraper. 25 | * The whole implementation is here and the concrete children classes have the sole 26 | * purpose to isolate different search engine scrapers and to allow dependencies 27 | * to work properly. 
28 | * Once some keywords have been added through the constructor or through the addKeyword() 29 | * and addKeywords() methods, the instance will be ready to scrape. 30 | * 31 | * Caching/Throttling/Delaying 32 | * 33 | * Since the legal status of scraping seems to be quite disputed and nobody likes 34 | * jerks, this implementation tries to avoid HTTP overhead using three simple 35 | * strategies. 36 | * The first is caching: the internal component of the class that 37 | * takes care of fetching data from the Internet (SerpFetcher) uses caching, so 38 | * scraping the same page over and over again will result in a single 39 | * HTTP request (cache expiration is set to 24 hours by default). 40 | * The second is throttling: an internal component of the class takes care of 41 | * capping HTTP requests (default cap is 15 requests per hour). Scraped data 42 | * retrieved from the cache are not counted. 43 | * The third is delaying: by default, a 500 microsecond (0.5 ms) delay takes place between 44 | * the HTTP requests needed to retrieve data.
45 | * 46 | *_____________________________________________________________________________ 47 | * Scrape | Serialize | Store | 48 | *________________________|_________________________|_________________________| 49 | * | | | 50 | * scrape()|scrapeAll() | serialize() | save() | 51 | * | | | 52 | *________________________|_________________________|_________________________| 53 | * Input | Input | Input | 54 | *________________________|_________________________|_________________________| 55 | * | | | 56 | * HTTP request|Cache Hit | SerializableSerpPage | SerializedSerpPage | 57 | * | array | array | 58 | *________________________|_________________________|_________________________| 59 | * Output | Output | Output | 60 | *________________________|_________________________|_________________________| 61 | * | | | 62 | * SerializableSerpPage | SerializedSerpPage | JSON|XML|YAML | 63 | * array | array | files | 64 | *________________________|_________________________|_________________________| 65 | * 66 | * 67 | * Scraping 68 | * 69 | * It is possible to scrape a single keyword with the scrape() method, or to scrape 70 | * all the added keywords alltogether with the scrapeAll() method. 71 | * The scraped data will be available as SerializableSerpPage objects in the 72 | * $fetched array. 73 | * 74 | * Serializing 75 | * 76 | * Once scraped some keywords, it is possible to serialize the fetched data through 77 | * the serialize() method (only JSON, XML and YAML are supported). This method 78 | * will provide to serialize all the data sitting in the $fetched array, and will 79 | * populate the $serialized array with SerializedSerpPage objects. 80 | * 81 | * Writing serialized data to file 82 | * 83 | * Storing data to disk is easy. The save() method takes care of writing the 84 | * content of all SerializedSerpPage objects to different files format (XML, 85 | * JSON, YAML) in the specified output folder. 
86 | * 87 | * 88 | * @package SerpScraper 89 | */ 90 | abstract class SerpScraper 91 | { 92 | // namespacing constants 93 | const SCRAPERS_PREFIX = 'Franzip\SerpScraper\Scrapers\\'; 94 | const SCRAPERS_SUFFIX = 'Scraper'; 95 | // default results per page 96 | const DEFAULT_RESULTS_PER_PAGE = 10; 97 | // default timezone 98 | const DEFAULT_TIMEZONE = 'UTC'; 99 | 100 | // Throttler 101 | // Allow 15 requests per hour (cache hits are not considered) 102 | const DEFAULT_THROTTLER_NAME = 'http_requests'; 103 | const DEFAULT_THROTTLER_THRESHOLD = 15; 104 | const DEFAULT_THROTTLER_METRIC = 'hrs'; 105 | const DEFAULT_THROTTLER_METRIC_FACTOR = 1; 106 | const DEFAULT_THROTTLER_COMPONENT_THRESHOLD = null; 107 | 108 | // SerpFetcher 109 | // SerpFetcher result array keys 110 | const SERP_FETCHER_URLS = 'urls'; 111 | const SERP_FETCHER_SNIPPETS = 'snippets'; 112 | const SERP_FETCHER_TITLES = 'titles'; 113 | // default fetcher cache dir 114 | const DEFAULT_FETCHER_CACHE_DIR = 'fetcher_cache'; 115 | // default fetcher cache time to live in hours 116 | const DEFAULT_FETCHER_CACHE_TTL = 24; 117 | 118 | // SerpPageSerializer 119 | // default serializer cache dir 120 | const DEFAULT_SERIALIZER_CACHE_DIR = 'serializer_cache'; 121 | 122 | // SerpScraper 123 | // default number of pages to scrape 124 | const DEFAULT_PAGES_TO_SCRAPE = 1; 125 | // default output dir 126 | const DEFAULT_OUTPUT_DIR = 'out'; 127 | // default request delay in microseconds 128 | const DEFAULT_REQUEST_DELAY = 500; 129 | 130 | // supported serialization format 131 | private static $supportedSerializationFormat = array('json', 'xml', 'yml'); 132 | 133 | // dependencies to inject 134 | // Throttler component 135 | private $throttler; 136 | // SerpFetcher component 137 | private $fetcher; 138 | // SerpPageSerializer component 139 | private $serializer; 140 | 141 | // instance variables 142 | // output folder 143 | private $outDir; 144 | // fetcher cache folder 145 | private $fetcherCacheDir; 146 | // serializer 
cache folder 147 | private $serializerCacheDir; 148 | // cache expiration time 149 | private $cacheTTL; 150 | // request delay time in microseconds 151 | private $requestDelay; 152 | // keywords to scrape 153 | private $keywords; 154 | // store fetched objects 155 | private $fetched; 156 | // store serialized pages 157 | private $serialized; 158 | 159 | /** 160 | * Create a SerpScraper object. 161 | * @param array $keywords 162 | * @param string $outDir 163 | * @param string $fetcherCacheDir 164 | * @param string $serializerCacheDir 165 | * @param int $cacheTTL 166 | * @param int $requestDelay 167 | */ 168 | public function __construct($keywords, 169 | $outDir = self::DEFAULT_OUTPUT_DIR, 170 | $fetcherCacheDir = self::DEFAULT_FETCHER_CACHE_DIR, 171 | $serializerCacheDir = self::DEFAULT_SERIALIZER_CACHE_DIR, 172 | $cacheTTL = self::DEFAULT_FETCHER_CACHE_TTL, 173 | $requestDelay = self::DEFAULT_REQUEST_DELAY) 174 | { 175 | // perform validation 176 | SerpScraperHelper::checkArgs($keywords, $outDir, $fetcherCacheDir, 177 | $serializerCacheDir, $cacheTTL, $requestDelay); 178 | // instance variables 179 | $this->outDir = $outDir; 180 | $this->fetcherCacheDir = $fetcherCacheDir; 181 | $this->serializerCacheDir = $serializerCacheDir; 182 | $this->cacheTTL = $cacheTTL; 183 | $this->requestDelay = $requestDelay; 184 | $this->keywords = array(); 185 | $this->fetched = array(); 186 | $this->serialized = array(); 187 | // normalize user input keywords 188 | for ($i = 0; $i < count($keywords); $i++) { 189 | array_push($this->keywords, KeywordValidator::processKeyword($keywords[$i])); 190 | } 191 | // set up folders 192 | FileSystemHelper::setUpDir($outDir); 193 | FileSystemHelper::setUpDir($serializerCacheDir); 194 | // deps injection 195 | $this->throttler = new Throttler(self::DEFAULT_THROTTLER_NAME, 196 | self::DEFAULT_THROTTLER_THRESHOLD, 197 | self::DEFAULT_THROTTLER_METRIC, 198 | self::DEFAULT_THROTTLER_METRIC_FACTOR, 199 | self::DEFAULT_THROTTLER_COMPONENT_THRESHOLD, 200 
| $this->keywords); 201 | // turn on throttling 202 | $this->throttler->start(); 203 | // instatiate the right fetcher at runtime (will also setup fetcher cache dir) 204 | $this->fetcher = SerpFetcherBuilder::create(self::runTimeClassName(), 205 | array($this->fetcherCacheDir, 206 | $this->cacheTTL)); 207 | $this->serializer = new SerpPageSerializer($serializerCacheDir); 208 | } 209 | 210 | /** 211 | * Scrape a single keyword. This will yield as many SerializableSerpPage 212 | * objects as there are to scrape. Those objects will be stored in the 213 | * fetched array queue, ready to be serialized. 214 | * It is also possible to empty the keywords array by setting $toRemove to 215 | * true, set a specific $timezone and turn throttling off by setting $throttling 216 | * to false. 217 | * @param string $keyword 218 | * @param int $pagesToScrape 219 | * @param bool $toRemove 220 | * @param string $timezone 221 | * @param bool $throttling 222 | * @return bool 223 | */ 224 | public function scrape($keyword, 225 | $pagesToScrape = self::DEFAULT_PAGES_TO_SCRAPE, 226 | $toRemove = false, 227 | $timezone = self::DEFAULT_TIMEZONE, 228 | $throttling = true) 229 | { 230 | // allow scrapeAll() to reuse scrape() 231 | if (is_string($keyword)) 232 | $keyword = array($keyword); 233 | // perform validations 234 | if (!SerpScraperHelper::validScrapeArgs($keyword, $pagesToScrape, 235 | $toRemove, $timezone, 236 | $throttling, $this->keywords)) 237 | return false; 238 | // map keywords to array of urls ready to scrape 239 | $urlsToScrape = $this->mapKeywordsToUrls($pagesToScrape, $keyword); 240 | // check for legal operation only if throttling 241 | if ($throttling && !$this->allowedScrapeOperation($pagesToScrape, $urlsToScrape)) 242 | return false; 243 | // avoid DateTime() annoying notices 244 | date_default_timezone_set($timezone); 245 | // loop over the keywords to scrape 246 | for ($i = 0; $i < count($keyword); $i++) { 247 | // get the current keyword 248 | $key = $keyword[$i]; 
249 | // scrape $pagesToScrape pages for each keyword 250 | for ($j = 0; $j < $pagesToScrape; $j++) { 251 | $pageUrl = $urlsToScrape[$key][$j]; 252 | $fetched = $this->fetchPage($key, $pageUrl); 253 | $entries = $this->makeEntries($fetched); 254 | $engine = strtolower(self::runTimeClassName()); 255 | $pageNumber = $j + 1; 256 | $age = new \DateTime(); 257 | $age->setTimeStamp(time()); 258 | // construct a SerializableSerpPage and store it 259 | $serializablePage = new SerializableSerpPage($engine, $key, $pageUrl, 260 | $pageNumber, $age, 261 | $entries); 262 | array_push($this->fetched, $serializablePage); 263 | // delay inbetween requests 264 | usleep(self::DEFAULT_REQUEST_DELAY); 265 | } 266 | // remove the key from the queue if specified 267 | if ($toRemove) 268 | $this->removeKeyword($key); 269 | } 270 | 271 | return true; 272 | } 273 | 274 | /** 275 | * Scrape all the tracked keywords. This method reuses the scrape() method. 276 | * @param int $pagesToScrape 277 | * @param bool $toRemove 278 | * @param string $timezone 279 | * @param bool $throttling 280 | * @return bool 281 | */ 282 | public function scrapeAll($pagesToScrape = self::DEFAULT_PAGES_TO_SCRAPE, 283 | $toRemove = false, 284 | $timezone = self::DEFAULT_TIMEZONE, 285 | $throttling = true) 286 | { 287 | return $this->scrape($this->keywords, $pagesToScrape, $toRemove, 288 | $timezone, $throttling); 289 | } 290 | 291 | /** 292 | * Write all the serialized objects in the output dir. 293 | * If the related flag $toRemove is on, the serialized queue will be emptied. 
294 | * @param bool $toRemove 295 | * @return bool 296 | */
    public function save($toRemove = false)
    {
        // fail if there are no results to write to disk
        if (empty($this->serialized))
            return false;
        $allWritten = true;
        // loop over serialized objects
        foreach ($this->serialized as $key => $serializedObject) {
            // generate filenames
            $filename = FileSystemHelper::generateFileName($key);
            $path     = $this->getOutDir() . DIRECTORY_SEPARATOR . $filename;
            // file_put_contents() returns false on failure: remember it, so
            // the caller is not told that everything was saved
            if (file_put_contents($path, $serializedObject->getContent()) === false)
                $allWritten = false;
        }
        // only drop the queue once every entry is safely on disk, otherwise
        // the results that could not be written would be lost
        if ($toRemove && $allWritten)
            $this->serialized = array();

        return $allWritten;
    }

    /**
     * Perform serialization on the SerializableSerpPage array.
     * Every fetched page is serialized in the given format and stored in the
     * serialized queue under a key generated from engine, keyword, page
     * number, age and format. The entries stay there until they are written
     * to files with save() or read with getSerializedPages().
     * If the related flag $toRemove is on, the fetched queue will be emptied.
     * @param  string $format
     * @param  bool   $toRemove
     * @return bool   false if there is nothing to serialize or the format is
     *                not supported, true otherwise
     */
    public function serialize($format, $toRemove = false)
    {
        // fail if there's nothing to serialize or unsupported serialization format
        if (empty($this->fetched) || !self::supportedFormat($format))
            return false;
        // the engine is the same for every page of this scraper
        $engine = self::runTimeClassName();
        // loop over the SerializableSerpPage array
        foreach ($this->fetched as $serializablePage) {
            // extract data to generate array key
            list($keyword, $pageNumber, $age) = SerpScraperHelper::extractSerializablePageData($serializablePage);
            // generate array key
            $arrKey = FileSystemHelper::generateArrKey($engine, $keyword,
                                                       $pageNumber, $age,
                                                       $format);
            $this->serialized[$arrKey] = $this->serializer->serialize($serializablePage,
                                                                      $format);
        }
        // empty the fetched array if specified
        if ($toRemove)
            $this->fetched = array();

        return true;
    }

    /**
     * Flush the underlying Fetcher object cache, along with the fetched and
     * serialized queues.
     */
    public function flushCache()
    {
        $this->fetcher->flushCache();
        $this->fetched    = array();
        $this->serialized = array();
    }

    /**
     * Return the underlying Fetcher object.
     * @return AskFetcher|BingFetcher|GoogleFetcher|YahooFetcher
     */
    public function getFetcher()
    {
        return $this->fetcher;
    }

    /**
     * Return the underlying Throttler object.
     * @return Throttler
     */
    public function getThrottler()
    {
        return $this->throttler;
    }

    /**
     * Return the underlying SerpPageSerializer object.
     * @return SerpPageSerializer
     */
    public function getSerializer()
    {
        return $this->serializer;
    }

    /**
     * Return the serialized serp pages (the queue filled by serialize()).
     * @return array
     */
    public function getSerializedPages()
    {
        return $this->serialized;
    }

    /**
     * Return the fetched serp pages (the queue filled by scrape()).
     * @return array
     */
    public function getFetchedPages()
    {
        return $this->fetched;
    }

    /**
     * Get the path to the folder used to store output.
     * @return string
     */
    public function getOutDir()
    {
        return $this->outDir;
    }

417 | /** 418 | * Set the path to the folder used to store output.
419 | * @param string $dir 420 | * @return bool 421 | */
    public function setOutDir($dir)
    {
        // reject anything that is not a usable directory name
        if (!SerpScraperHelper::validateDirName($dir))
            return false;
        // the output folder cannot be shared with one of the cache folders
        if (!FileSystemHelper::preventCacheCollision($dir, $this->fetcherCacheDir,
                                                     $this->serializerCacheDir))
            return false;

        $this->outDir = $dir;
        FileSystemHelper::setUpDir($dir);
        return true;
    }

    /**
     * Get the path to the folder used to store the fetcher cache.
     * @return string
     */
    public function getFetcherCacheDir()
    {
        return $this->fetcherCacheDir;
    }

    /**
     * Set the path to the folder used to store the fetcher cache.
     * The new folder is validated, checked against the output and serializer
     * cache folders, stored and set up on disk.
     * NOTE(review): the Fetcher object is not told about the new folder
     * here - confirm whether it should be.
     * @param  string $dir
     * @return bool
     */
    public function setFetcherCacheDir($dir)
    {
        if (!SerpScraperHelper::validateDirName($dir))
            return false;
        if (!FileSystemHelper::preventCacheCollision($this->outDir, $dir,
                                                     $this->serializerCacheDir))
            return false;

        $this->fetcherCacheDir = $dir;
        FileSystemHelper::setUpDir($dir);
        return true;
    }

    /**
     * Get the path to the folder used to store the serializer cache.
     * @return string
     */
    public function getSerializerCacheDir()
    {
        return $this->serializerCacheDir;
    }

    /**
     * Get the cache expiration time, in hours.
     * @return int
     */
    public function getCacheTTL()
    {
        return $this->cacheTTL;
    }

    /**
     * Set the cache expiration time, in hours. A valid value is stored on
     * the scraper and forwarded to the underlying Fetcher object.
     * @param  int $hours
     * @return bool
     */
    public function setCacheTTL($hours)
    {
        if (!SerpScraperHelper::validateExpirationTime($hours))
            return false;

        $this->cacheTTL = $hours;
        $this->fetcher->setCacheTTL($hours);
        return true;
    }

493 | /** 494 | * Get the delay used between each request, in microseconds.
495 | * @return int 496 | */
    public function getRequestDelay()
    {
        return $this->requestDelay;
    }

    /**
     * Set the delay used between each request, in microseconds.
     * The value goes through the same validation used for the cache
     * expiration time.
     * @param  int $microseconds
     * @return bool
     */
    public function setRequestDelay($microseconds)
    {
        if (SerpScraperHelper::validateExpirationTime($microseconds)) {
            $this->requestDelay = $microseconds;
            return true;
        }
        return false;
    }

    /**
     * Get the array with keywords to scrape.
     * @return array
     */
    public function getKeywords()
    {
        return $this->keywords;
    }

    /**
     * Add a keyword to scrape. Update Throttler object accordingly.
     * The keyword is cleaned up before it is compared with the tracked
     * ones, since the cleaned up form is what gets stored: this way a
     * keyword that differs from a tracked one only by the characters
     * removed during clean up is not added twice.
     * @param  string $keyword
     * @return bool   true if the keyword has been added, false if it is
     *                invalid or already tracked
     */
    public function addKeyword($keyword)
    {
        if (!KeywordValidator::isValid($keyword))
            return false;
        $cleanKeyword = KeywordValidator::processKeyword($keyword);
        // look for duplicates using the form that is actually stored
        if (SerpScraperHelper::keywordPresent($cleanKeyword, $this->keywords))
            return false;

        array_push($this->keywords, $cleanKeyword);
        // the Throttler is stopped while the new component is registered
        $this->throttler->stop();
        $this->throttler->addComponents($cleanKeyword);
        $this->throttler->resume();
        return true;
    }

    /**
     * Add multiple keywords to scrape. Each keyword goes through
     * addKeyword(); the result of the single additions is not reported.
     * @param  array $keywords
     * @return bool  false if $keywords is empty or fails validation
     */
    public function addKeywords($keywords)
    {
        if (KeywordValidator::validKeywords($keywords) && !empty($keywords)) {
            foreach ($keywords as $keyword) {
                $this->addKeyword($keyword);
            }
            return true;
        }
        return false;
    }

560 | /** 561 | * Remove a keyword from the queue. 562 | * This will not remove the keyword from the underlying Throttler object, since 563 | * it's possible to add the keyword back again and generate new requests hits 564 | * that still need to be throttled.
565 | * @param string $keyword 566 | * @return bool 567 | */
    public function removeKeyword($keyword)
    {
        // nothing to do for a keyword that is not tracked
        if (!SerpScraperHelper::keywordPresent($keyword, $this->keywords))
            return false;

        $position = array_search($keyword, $this->keywords);
        unset($this->keywords[$position]);
        // keep the keywords array sequential after the removal
        $this->keywords = array_values($this->keywords);
        return true;
    }

    /**
     * Fetch a SERP page. The Throttler component bound to the keyword is
     * updated only when the url is not served by the Fetcher cache.
     * @param  string $keyword
     * @param  string $url
     * @return array
     */
    private function fetchPage($keyword, $url)
    {
        $servedFromCache = $this->fetcher->cacheHit($url);
        if (!$servedFromCache) {
            $this->throttler->updateComponent($keyword);
        }
        return $this->fetcher->fetch($url);
    }

    /**
     * Turn the page returned by SerpFetcher->fetch() into the list of
     * url/snippet/title entries expected by the SerializableSerpPage
     * constructor. Padded entries are left out.
     * @param  array $fetchedPage
     * @return array
     */
    private function makeEntries($fetchedPage)
    {
        $urls     = $fetchedPage[self::SERP_FETCHER_URLS];
        $snippets = $fetchedPage[self::SERP_FETCHER_SNIPPETS];
        $titles   = $fetchedPage[self::SERP_FETCHER_TITLES];
        $result   = array();
        for ($position = 0; $position < self::DEFAULT_RESULTS_PER_PAGE; $position++) {
            // construct an entry
            $candidate = array('url'     => $urls[$position],
                               'snippet' => $snippets[$position],
                               'title'   => $titles[$position]);
            // don't add padded entries
            if ($this->paddedEntry($candidate))
                continue;
            $result[] = $candidate;
        }
        return $result;
    }

614 | /** 615 | * Detect padded entries.
616 | * @param array $entry 617 | * @return bool 618 | */
    private function paddedEntry($entry)
    {
        // an entry is padding only when all of its three fields are padding
        return $entry['url'] == \Franzip\SerpFetcher\Fetchers\SerpFetcher::DEFAULT_PAD_ENTRY
               && $entry['title'] == \Franzip\SerpFetcher\Fetchers\SerpFetcher::DEFAULT_PAD_ENTRY
               && $entry['snippet'] == \Franzip\SerpFetcher\Fetchers\SerpFetcher::DEFAULT_PAD_ENTRY;
    }

    /**
     * Map keywords to the urls ready to be scraped.
     * The result is keyed by keyword; each value is the list of the
     * $pagesToScrape urls generated for that keyword, in page order.
     * @param  int          $pagesToScrape
     * @param  string|array $keywords
     * @return array
     */
    private function mapKeywordsToUrls($pagesToScrape, $keywords)
    {
        $urls = array();
        if (is_string($keywords))
            $keywords = array($keywords);
        $engine = self::runTimeClassName();
        foreach ($keywords as $keyword) {
            // key the map by the keyword being mapped: indexing the tracked
            // keywords by position here would create entries for unrelated
            // keywords (or hit an undefined offset) whenever the two arrays
            // are not aligned
            $urls[$keyword] = array();
            for ($j = 0; $j < $pagesToScrape; $j++) {
                $urls[$keyword][] = SerpUrlGenerator::makeUrl($engine, $keyword, $j);
            }
        }
        return $urls;
    }

    /**
     * Check whether a scrape operation is to allow.
     * When the Throttler time window has expired the Throttler instance is
     * refreshed and the operation is allowed; otherwise the requests needed
     * by the operation are counted and checked against the thresholds.
     * @param  int   $pagesToScrape not used by the check itself
     * @param  array $urlsToCheck   urls keyed by keyword, as returned by
     *                              mapKeywordsToUrls()
     * @return bool
     */
    private function allowedScrapeOperation($pagesToScrape, $urlsToCheck)
    {
        if ($this->throttler->timeExpired()) {
            $this->throttler->refreshInstance();
            return true;
        }
        list($globalHitCount, $componentHitCount) = $this->hitCounter($urlsToCheck);
        return $this->hitChecker($globalHitCount, $componentHitCount);
    }

663 | /** 664 | * Compute global and per-keyword HTTP requests needed to complete a 665 | * scraping operation. Cached hit will be ignored.
666 | * @param array $urlsArr 667 | * @return array 668 | */
    private function hitCounter($urlsArr)
    {
        $total = 0;
        // every tracked keyword starts with zero hits
        $perKeyword = array_fill_keys($this->keywords, 0);
        foreach ($urlsArr as $keyword => $urls) {
            foreach ($urls as $url) {
                // increase hits only on HTTP requests, ignore cache hit
                if ($this->getFetcher()->cacheHit($url))
                    continue;
                $total              += 1;
                $perKeyword[$keyword] += 1;
            }
        }
        return array($total, $perKeyword);
    }

    /**
     * Check that the global and per-keyword hit counts are within the
     * Throttler thresholds. The per-keyword check is performed only when
     * the Throttler has a component threshold set.
     * NOTE(review): the global check fails when the threshold is reached,
     * the per-keyword one only when it is exceeded - confirm this is meant.
     * @param  int   $globalHitCount
     * @param  array $componentHitCount
     * @return bool
     */
    private function hitChecker($globalHitCount, $componentHitCount)
    {
        $withinGlobal = ($globalHitCount + $this->throttler->getCounter()) < $this->throttler->getGlobalThreshold();
        $perKeywordLimit = $this->getThrottler()->getComponentThreshold();
        // no per-component throttling: the global check decides
        if ($perKeywordLimit === null)
            return $withinGlobal;

        $alreadyCounted = $this->getThrottler()->getComponents();
        foreach ($componentHitCount as $keyword => $pending) {
            // a single keyword over its threshold is enough to refuse
            if ($pending + $alreadyCounted[$keyword] > $perKeywordLimit)
                return false;
        }
        return $withinGlobal;
    }

    /**
     * Identify the search engine at runtime: the scrapers prefix and suffix
     * are stripped from the name of the calling class.
     * @return string
     */
    private static function runTimeClassName()
    {
        $toStrip = array(self::SCRAPERS_PREFIX, self::SCRAPERS_SUFFIX);
        return str_replace($toStrip, '', get_called_class());
    }

724 | /** 725 | * Check for supported serialization format.
726 | * @param string $format 727 | * @return bool 728 | */
    private static function supportedFormat($format)
    {
        return in_array(strtolower($format), self::$supportedSerializationFormat);
    }
}

--------------------------------------------------------------------------------
/src/Scrapers/YahooScraper.php:
--------------------------------------------------------------------------------
1 | 7 | * @link https://github.com/franzip/serp-scraper 8 | * @copyright Copyright 2015 Francesco Pezzella 9 | * @license http://www.opensource.org/licenses/mit-license.php MIT License 10 | * @package SerpScraper 11 | */

namespace Franzip\SerpScraper\Scrapers;

/**
 * SerpScraper implementation for Yahoo search engine.
 * The class has no members of its own: everything comes from SerpScraper.
 *
 * @package SerpScraper
 */
class YahooScraper extends SerpScraper
{

}

--------------------------------------------------------------------------------
/src/SerpScraperBuilder.php:
--------------------------------------------------------------------------------
1 | 7 | * @link https://github.com/franzip/serp-scraper 8 | * @copyright Copyright 2015 Francesco Pezzella 9 | * @license http://www.opensource.org/licenses/mit-license.php MIT License 10 | * @package SerpScraper 11 | */

namespace Franzip\SerpScraper;

/**
 * SerpScraper Factory.
 *
 * @package SerpScraper
 */
class SerpScraperBuilder
{
    // namespace constants
    const SCRAPER_CLASS_PREFIX = '\\Franzip\\SerpScraper\\Scrapers\\';
    const SCRAPER_CLASS_SUFFIX = 'Scraper';
    // implemented scrapers
    private static $supportedEngines = array('google', 'yahoo', 'bing', 'ask');

    /**
     * Return a SerpScraper implementation for a given search engine.
     * The engine name is case insensitive. $args, when given, is the list
     * of arguments handed to the Scraper constructor.
     * @param  string     $engine
     * @param  null|array $args
     * @return mixed
     * @throws \Franzip\SerpScraper\Exceptions\UnsupportedEngineException
     *         if $engine is not a string or not a supported engine
     */
    public static function create($engine, $args = null)
    {
        // check the type before normalizing: strtolower() on a non-string
        // would fail on its own instead of reaching the exception below
        if (is_string($engine)) {
            $engine = strtolower($engine);
            if (self::validEngine($engine)) {
                return self::createWithArgs($engine, isset($args) ? $args : array());
            }
        }
        throw new \Franzip\SerpScraper\Exceptions\UnsupportedEngineException('Unknown or unsupported Search Engine.');
    }

    /**
     * Use reflection to instantiate the right Scraper at runtime.
     * @param  string $engine lowercase engine name
     * @param  array  $args   constructor arguments
     * @return mixed
     */
    private static function createWithArgs($engine, $args)
    {
        $className = self::SCRAPER_CLASS_PREFIX . ucfirst($engine) . self::SCRAPER_CLASS_SUFFIX;
        $reflection = new \ReflectionClass($className);
        return $reflection->newInstanceArgs($args);
    }

    /**
     * Check if there is a SerpScraper implementation for the given search engine.
     * @param  string $engine
     * @return bool
     */
    private static function validEngine($engine)
    {
        return is_string($engine) && in_array($engine, self::$supportedEngines, true);
    }

    /**
     * Make the class static.
     */
    private function __construct() {}
}

--------------------------------------------------------------------------------
/tests/SerpScraperBuilderTest.php:
--------------------------------------------------------------------------------
1 | invalidEngines = $invalidEngines; 16 | } 17 | 18 | /** 19 | * @expectedException \Franzip\SerpScraper\Exceptions\UnsupportedEngineException 20 | * @expectedExceptionMessage Unknown or unsupported Search Engine.
21 | */
    public function testInvalidEngineArgument1()
    {
        Builder::create($this->invalidEngines[0]);
    }

    /**
     * @expectedException        \Franzip\SerpScraper\Exceptions\UnsupportedEngineException
     * @expectedExceptionMessage Unknown or unsupported Search Engine.
     */
    public function testInvalidEngineArgument2()
    {
        Builder::create($this->invalidEngines[1]);
    }

    /**
     * @expectedException        \Franzip\SerpScraper\Exceptions\UnsupportedEngineException
     * @expectedExceptionMessage Unknown or unsupported Search Engine.
     */
    public function testInvalidEngineArgument3()
    {
        Builder::create($this->invalidEngines[2]);
    }

    /**
     * @expectedException        \Franzip\SerpScraper\Exceptions\UnsupportedEngineException
     * @expectedExceptionMessage Unknown or unsupported Search Engine.
     */
    public function testInvalidEngineArgument4()
    {
        Builder::create($this->invalidEngines[3]);
    }
}

/**
 * Every test builds a scraper with one invalid constructor argument and
 * expects the matching InvalidArgumentException.
 */
class BuilderInvalidArgsTest extends PHPUnit_Framework_TestCase
{
    protected $invalidKeywords;
    protected $invalidTime;
    protected $invalidDirs;
    protected $engines;

    protected function setUp()
    {
        $this->invalidDirs     = array(1, '', false, 21);
        $this->invalidTime     = array('', 'foo', ' ', false);
        $this->invalidKeywords = array(array(''), array(' '), array(false), array(2),
                                       array(str_repeat('foo', 100)), array('foo' => 'bar'),
                                       array('foo', 'baz', 'fobaz', 'bar' => 'baz'),
                                       array('foo', 'baz', 0));
        $this->engines         = array('gOOgLe', 'aSk', 'BIng', 'yAHOo');
    }

    protected function tearDown()
    {
        TestHelper::cleanMess();
    }

    /**
     * @expectedException        \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $keywords: please supply a sequential non-empty array of strings.
     */
    public function testEmptyKeywordsArr()
    {
        Builder::create($this->engines[0], array(array()));
    }

    /**
     * @expectedException        \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $outDir: please supply a non empty string.
     */
    public function testInvalidOutDir()
    {
        Builder::create($this->engines[0], array(array('foo'), $this->invalidDirs[0]));
    }

    /**
     * @expectedException        \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $outDir: please supply a non empty string.
     */
    public function testInvalidOutDir1()
    {
        Builder::create($this->engines[1], array(array('foo'), $this->invalidDirs[1]));
    }

    /**
     * @expectedException        \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $fetcherCacheDir: please supply a non empty string.
     */
    public function testInvalidCacheDir()
    {
        Builder::create($this->engines[2], array(array('foo'), 'foo', $this->invalidDirs[2]));
    }

    /**
     * @expectedException        \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $fetcherCacheDir: please supply a non empty string.
     */
    public function testInvalidCacheDir2()
    {
        Builder::create($this->engines[3], array(array('foo'), 'bar', $this->invalidDirs[3]));
    }

    /**
     * @expectedException        \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $serializerCacheDir: please supply a non empty string.
     */
    public function testInvalidCacheDir3()
    {
        Builder::create($this->engines[3], array(array('foo'), 'bar', 'baz', $this->invalidDirs[0]));
    }

    /**
     * @expectedException        \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $serializerCacheDir: please supply a non empty string.
     */
    public function testInvalidCacheDir4()
    {
        Builder::create($this->engines[0], array(array('foo'), 'baz', 'bar', $this->invalidDirs[1]));
    }

    /**
     * @expectedException        \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $outDir, $fetcherCacheDir, $serializerCacheDir: cannot share the same folder for different caches. Please supply different folders path for different caches.
     */
    public function testEqualCacheDir()
    {
        Builder::create($this->engines[0], array(array('foo'), 'baz', 'bar', 'bar'));
    }

    /**
     * @expectedException        \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $cacheTTL: please supply a positive integer.
     */
    public function testInvalidCacheTTL()
    {
        Builder::create($this->engines[1], array(array('foo'), 'baz', 'bar', 'foo', $this->invalidTime[0]));
    }

    /**
     * @expectedException        \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $cacheTTL: please supply a positive integer.
     */
    public function testInvalidCacheTTL1()
    {
        Builder::create($this->engines[2], array(array('foo'), 'baz', 'bar', 'foo', $this->invalidTime[1]));
    }

    /**
     * @expectedException        \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $cacheTTL: please supply a positive integer.
     */
    public function testInvalidCacheTTL2()
    {
        Builder::create($this->engines[2], array(array('foo'), 'baz', 'bar', 'foo', $this->invalidTime[2]));
    }

    /**
     * @expectedException        \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $cacheTTL: please supply a positive integer.
     */
    public function testInvalidCacheTTL3()
    {
        Builder::create($this->engines[3], array(array('foo'), 'baz', 'bar', 'foo', $this->invalidTime[3]));
    }

190 | /** 191 | * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException 192 | * @expectedExceptionMessage Invalid SerializableSerpPage $requestDelay: please supply a positive integer.
193 | */
    public function testInvalidDelay()
    {
        Builder::create($this->engines[0], array(array('foo'), 'baz', 'bar', 'foobar', 24, $this->invalidTime[0]));
    }

    /**
     * @expectedException        \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $requestDelay: please supply a positive integer.
     */
    public function testInvalidDelay1()
    {
        Builder::create($this->engines[0], array(array('foo'), 'baz', 'bar', 'foobar', 24, $this->invalidTime[1]));
    }

    /**
     * @expectedException        \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $requestDelay: please supply a positive integer.
     */
    public function testInvalidDelay2()
    {
        Builder::create($this->engines[0], array(array('foo'), 'baz', 'bar', 'foobar', 24, $this->invalidTime[2]));
    }

    /**
     * @expectedException        \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $requestDelay: please supply a positive integer.
     */
    public function testInvalidDelay3()
    {
        Builder::create($this->engines[0], array(array('foo'), 'baz', 'bar', 'foobar', 24, $this->invalidTime[3]));
    }

    /**
     * @expectedException        \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $keywords: please supply a sequential non-empty array of strings.
     */
    public function testInvalidKeywords()
    {
        Builder::create($this->engines[0], array($this->invalidKeywords[0], 'baz', 'bar', 'bad', 24, 500));
    }

    /**
     * @expectedException        \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $keywords: please supply a sequential non-empty array of strings.
     */
    public function testInvalidKeywords1()
    {
        Builder::create($this->engines[0], array($this->invalidKeywords[1], 'baz', 'bar', 'bad', 24, 500));
    }

    /**
     * @expectedException        \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $keywords: please supply a sequential non-empty array of strings.
     */
    public function testInvalidKeywords2()
    {
        Builder::create($this->engines[1], array($this->invalidKeywords[2], 'baz', 'bar', 'bad', 24, 500));
    }

    /**
     * @expectedException        \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $keywords: please supply a sequential non-empty array of strings.
     */
    public function testInvalidKeywords3()
    {
        Builder::create($this->engines[0], array($this->invalidKeywords[3], 'baz', 'bar', 'bad', 24, 500));
    }

    /**
     * @expectedException        \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $keywords: please supply a sequential non-empty array of strings.
     */
    public function testInvalidKeywords4()
    {
        Builder::create($this->engines[0], array($this->invalidKeywords[4], 'baz', 'bar', 'bad', 24, 500));
    }

    /**
     * @expectedException        \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $keywords: please supply a sequential non-empty array of strings.
     */
    public function testInvalidKeywords5()
    {
        Builder::create($this->engines[0], array($this->invalidKeywords[5], 'baz', 'bar', 'bad', 24, 500));
    }

    /**
     * @expectedException        \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $keywords: please supply a sequential non-empty array of strings.
     */
    public function testInvalidKeywords6()
    {
        Builder::create($this->engines[0], array($this->invalidKeywords[6], 'baz', 'bar', 'bad', 24, 500));
    }

289 | /** 290 | * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException 291 | * @expectedExceptionMessage Invalid SerializableSerpPage $keywords: please supply a sequential non-empty array of strings.
292 | */
    public function testInvalidKeywords7()
    {
        Builder::create($this->engines[0], array($this->invalidKeywords[7], 'baz', 'bar', 'bad', 24, 500));
    }
}

/**
 * Check, for each engine, the type of the built scraper and of its
 * Throttler and Fetcher, and that the three folders exist on disk.
 */
class BuilderTypesTest extends PHPUnit_Framework_TestCase
{
    protected $engines;

    protected function setUp()
    {
        $this->engines = array('gOOgLe', 'aSk', 'BIng', 'yAHOo');
    }

    protected function tearDown()
    {
        TestHelper::cleanMess();
    }

    public function testGoogleScraper()
    {
        $scraper = Builder::create($this->engines[0],
                                   array(array('foo'), 'foobar', 'baz',
                                         'bazbar', 48, 200));
        $this->assertEquals(get_parent_class($scraper),
                            'Franzip\SerpScraper\Scrapers\SerpScraper');
        $this->assertInstanceOf('Franzip\SerpScraper\Scrapers\GoogleScraper',
                                $scraper);
        $this->assertInstanceOf('Franzip\Throttler\Throttler',
                                $scraper->getThrottler());
        $this->assertInstanceOf('Franzip\SerpFetcher\Fetchers\GoogleFetcher',
                                $scraper->getFetcher());
        // the folders given to the builder must have been created
        foreach (array('foobar', 'baz', 'bazbar') as $dir) {
            $this->assertTrue(file_exists($dir) && is_dir($dir));
        }
    }

    public function testAskScraper()
    {
        $scraper = Builder::create($this->engines[1],
                                   array(array('foo'), 'bad', 'foo', 'foobad',
                                         72, 100));
        $this->assertEquals(get_parent_class($scraper),
                            'Franzip\SerpScraper\Scrapers\SerpScraper');
        $this->assertInstanceOf('Franzip\SerpScraper\Scrapers\AskScraper',
                                $scraper);
        $this->assertInstanceOf('Franzip\Throttler\Throttler',
                                $scraper->getThrottler());
        $this->assertInstanceOf('Franzip\SerpFetcher\Fetchers\AskFetcher',
                                $scraper->getFetcher());
        foreach (array('bad', 'foobad', 'foo') as $dir) {
            $this->assertTrue(file_exists($dir) && is_dir($dir));
        }
    }

    public function testBingScraper()
    {
        $scraper = Builder::create($this->engines[2], array(array('baz')));
        $this->assertEquals(get_parent_class($scraper),
                            'Franzip\SerpScraper\Scrapers\SerpScraper');
        $this->assertInstanceOf('Franzip\SerpScraper\Scrapers\BingScraper',
                                $scraper);
        $this->assertInstanceOf('Franzip\Throttler\Throttler',
                                $scraper->getThrottler());
        $this->assertInstanceOf('Franzip\SerpFetcher\Fetchers\BingFetcher',
                                $scraper->getFetcher());
        // no folder given: the default ones must have been created
        $defaultDirs = array($scraper::DEFAULT_OUTPUT_DIR,
                             $scraper::DEFAULT_FETCHER_CACHE_DIR,
                             $scraper::DEFAULT_SERIALIZER_CACHE_DIR);
        foreach ($defaultDirs as $dir) {
            $this->assertTrue(file_exists($dir) && is_dir($dir));
        }
    }

    public function testYahooScraper()
    {
        $scraper = Builder::create($this->engines[3], array(array('baz')));
        $this->assertEquals(get_parent_class($scraper),
                            'Franzip\SerpScraper\Scrapers\SerpScraper');
        $this->assertInstanceOf('Franzip\SerpScraper\Scrapers\YahooScraper',
                                $scraper);
        $this->assertInstanceOf('Franzip\Throttler\Throttler',
                                $scraper->getThrottler());
        $this->assertInstanceOf('Franzip\SerpFetcher\Fetchers\YahooFetcher',
                                $scraper->getFetcher());
        $defaultDirs = array($scraper::DEFAULT_OUTPUT_DIR,
                             $scraper::DEFAULT_FETCHER_CACHE_DIR,
                             $scraper::DEFAULT_SERIALIZER_CACHE_DIR);
        foreach ($defaultDirs as $dir) {
            $this->assertTrue(file_exists($dir) && is_dir($dir));
        }
    }
}

--------------------------------------------------------------------------------
/tests/SerpScraperHelpersTest.php:
--------------------------------------------------------------------------------
1 | invalidKeywords = $invalidKeywords; 19 | $this->invalidEngines = $invalidEngines; 20 | }

    protected function tearDown()
    {
        TestHelper::cleanMess();
    }

    /**
     * @expectedException        \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage $keyword must be a valid string (max 180 characters).
     */
    public function testEmptyString()
    {
        $keyword = $this->invalidKeywords[0];
        Validator::processKeyword($keyword);
    }

    /**
     * @expectedException        \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage $keyword must be a valid string (max 180 characters).
     */
    public function testWhiteSpaces()
    {
        $keyword = $this->invalidKeywords[1];
        Validator::processKeyword($keyword);
    }

    /**
     * @expectedException        \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage $keyword must be a valid string (max 180 characters).
     */
    public function testInt()
    {
        $keyword = $this->invalidKeywords[2];
        Validator::processKeyword($keyword);
    }

    /**
     * @expectedException        \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage $keyword must be a valid string (max 180 characters).
     */
    public function testBool()
    {
        $keyword = $this->invalidKeywords[3];
        Validator::processKeyword($keyword);
    }

63 | /** 64 | * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException 65 | * @expectedExceptionMessage $keyword must be a valid string (max 180 characters).
66 | */ 67 | public function testNull() 68 | { 69 | Validator::processKeyword($this->invalidKeywords[4]); 70 | } 71 | 72 | /** 73 | * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException 74 | * @expectedExceptionMessage $keyword must be a valid string (max 180 characters). 75 | */ 76 | public function testLongString() 77 | { 78 | Validator::processKeyword($this->invalidKeywords[5]); 79 | } 80 | 81 | /** 82 | * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException 83 | * @expectedExceptionMessage $keyword must be a valid string (max 180 characters). 84 | */ 85 | public function testNewLine() 86 | { 87 | Validator::processKeyword($this->invalidKeywords[6]); 88 | } 89 | 90 | /** 91 | * @expectedException \Franzip\SerpScraper\Exceptions\UnsupportedEngineException 92 | * @expectedExceptionMessage Unknown or unsupported Search Engine. 93 | */ 94 | public function testInvalidEngine1() 95 | { 96 | Generator::makeUrl($this->invalidEngines[0], 'foobar', 0); 97 | } 98 | 99 | /** 100 | * @expectedException \Franzip\SerpScraper\Exceptions\UnsupportedEngineException 101 | * @expectedExceptionMessage Unknown or unsupported Search Engine. 102 | */ 103 | public function testInvalidEngine2() 104 | { 105 | Generator::makeUrl($this->invalidEngines[1], 'foobar', 0); 106 | } 107 | 108 | /** 109 | * @expectedException \Franzip\SerpScraper\Exceptions\UnsupportedEngineException 110 | * @expectedExceptionMessage Unknown or unsupported Search Engine. 111 | */ 112 | public function testInvalidEngine3() 113 | { 114 | Generator::makeUrl($this->invalidEngines[2], 'foobar', 0); 115 | } 116 | 117 | /** 118 | * @expectedException \Franzip\SerpScraper\Exceptions\UnsupportedEngineException 119 | * @expectedExceptionMessage Unknown or unsupported Search Engine. 
120 | */ 121 | public function testInvalidEngine4() 122 | { 123 | Generator::makeUrl($this->invalidEngines[3], 'foobar', 0); 124 | } 125 | } 126 | 127 | class CleaningKeywordsTest extends PHPUnit_Framework_TestCase 128 | { 129 | protected $keywords; 130 | 131 | protected function setUp() 132 | { 133 | $cleanKey = array('foo', 'bar', 'barfoo12', str_repeat('foo', 30)); 134 | $keyToClean = array('foo, bar', ' foo ', "\tbar foo bar foo ", 135 | "\t\t foo \t bar", "\t\t\t \s//\\+?<>", 136 | "\ + / ? $ \t\t '<' \" >"); 137 | $this->keywords = array('cleanKey' => $cleanKey, 138 | 'keyToClean' => $keyToClean); 139 | } 140 | 141 | public function testClean() 142 | { 143 | $this->assertEquals(Validator::processKeyword($this->keywords['cleanKey'][0]), 'foo'); 144 | $this->assertEquals(Validator::processKeyword($this->keywords['cleanKey'][1]), 'bar'); 145 | $this->assertEquals(Validator::processKeyword($this->keywords['cleanKey'][2]), 'barfoo12'); 146 | $this->assertEquals(Validator::processKeyword($this->keywords['cleanKey'][3]), str_repeat('foo', 30)); 147 | } 148 | 149 | public function testDirty() 150 | { 151 | $this->assertEquals(Validator::processKeyword($this->keywords['keyToClean'][0]), 'foo, bar'); 152 | $this->assertEquals(Validator::processKeyword($this->keywords['keyToClean'][1]), 'foo'); 153 | $this->assertEquals(Validator::processKeyword($this->keywords['keyToClean'][2]), 'bar foo bar foo'); 154 | $this->assertEquals(Validator::processKeyword($this->keywords['keyToClean'][3]), 'foo bar'); 155 | $this->assertEquals(Validator::processKeyword($this->keywords['keyToClean'][4]), '\s//\+?<>'); 156 | $this->assertEquals(Validator::processKeyword($this->keywords['keyToClean'][5]), "\ + / ? 
$ '<' \" >"); 157 | } 158 | } 159 | 160 | class UrlGeneratorTest extends PHPUnit_Framework_TestCase 161 | { 162 | protected $settings; 163 | 164 | protected function setUp() 165 | { 166 | $engines = array('google', 'bing', 'yahoo', 'ask'); 167 | $offsets = array(0, 1, 2, 3, 4, 5, 6); 168 | $keywords = array(Validator::processKeyword('foobar'), 169 | Validator::processKeyword('foo'), 170 | Validator::processKeyword('foo baz ? \/ <>'), 171 | Validator::processKeyword('\n\t/ <a>')); /* single-quoted, so \n and \t stay literal backslash sequences; testUrls expects this keyword URL-encoded as %5Cn%5Ct%2F+%3Ca%3E, i.e. including the <a> */ 172 | $this->settings = array('engines' => $engines, 173 | 'offsets' => $offsets, 174 | 'keywords' => $keywords); 175 | } 176 | 177 | public function testUrls() 178 | { 179 | $this->assertEquals(Generator::makeUrl($this->settings['engines'][0], 180 | $this->settings['keywords'][0], 181 | $this->settings['offsets'][0]), 182 | "http://www.google.com/search?q=foobar&start=0"); 183 | $this->assertEquals(Generator::makeUrl($this->settings['engines'][1], 184 | $this->settings['keywords'][1], 185 | $this->settings['offsets'][1]), 186 | "http://www.bing.com/search?q=foo&first=11"); 187 | $this->assertEquals(Generator::makeUrl($this->settings['engines'][2], 188 | $this->settings['keywords'][2], 189 | $this->settings['offsets'][2]), 190 | "https://search.yahoo.com/search?p=foo+baz+%3F+%5C%2F+%3C%3E&b=21"); 191 | $this->assertEquals(Generator::makeUrl($this->settings['engines'][3], 192 | $this->settings['keywords'][3], 193 | $this->settings['offsets'][3]), 194 | 'http://us.ask.com/web?q=%5Cn%5Ct%2F+%3Ca%3E&page=4'); 195 | $this->assertEquals(Generator::makeUrl($this->settings['engines'][0], 196 | $this->settings['keywords'][2], 197 | $this->settings['offsets'][4]), 198 | "http://www.google.com/search?q=foo+baz+%3F+%5C%2F+%3C%3E&start=40"); 199 | $this->assertEquals(Generator::makeUrl($this->settings['engines'][1], 200 | $this->settings['keywords'][2], 201 | $this->settings['offsets'][5]), 202 | "http://www.bing.com/search?q=foo+baz+%3F+%5C%2F+%3C%3E&first=51"); 203 | 
$this->assertEquals(Generator::makeUrl($this->settings['engines'][2], 204 | $this->settings['keywords'][3], 205 | $this->settings['offsets'][6]), 206 | "https://search.yahoo.com/search?p=%5Cn%5Ct%2F+%3Ca%3E&b=61"); 207 | } 208 | } 209 | -------------------------------------------------------------------------------- /tests/SerpScraperTest.php: -------------------------------------------------------------------------------- 1 | engines = $engines; 16 | } 17 | 18 | protected function tearDown() 19 | { 20 | TestHelper::cleanMess(); 21 | } 22 | 23 | public function testWithDefaultArgs() 24 | { 25 | $googleScraper = Builder::create($this->engines[0], array(array('foo'))); 26 | $this->assertEquals($googleScraper->getOutDir(), 'out'); 27 | $this->assertEquals($googleScraper->getFetcherCacheDir(), 'fetcher_cache'); 28 | $this->assertEquals($googleScraper->getSerializerCacheDir(), 'serializer_cache'); 29 | $this->assertEquals($googleScraper->getCacheTTL(), 24); 30 | $this->assertEquals($googleScraper->getRequestDelay(), 500); 31 | $this->assertEquals($googleScraper->getKeywords(), array('foo')); 32 | $this->assertEquals($googleScraper->getThrottler()->getName(), 'http_requests'); 33 | $this->assertEquals($googleScraper->getThrottler()->getGlobalThreshold(), 15); 34 | $this->assertEquals($googleScraper->getThrottler()->getMetric(), 'hrs'); 35 | $this->assertEquals($googleScraper->getThrottler()->getMetricFactor(), 1); 36 | $this->assertNull($googleScraper->getThrottler()->getComponentThreshold()); 37 | $this->assertEquals($googleScraper->getThrottler()->getComponents(), 38 | array('foo' => 0)); 39 | $this->assertFalse($googleScraper->setOutDir(2)); 40 | $this->assertTrue($googleScraper->setOutDir('foobar1')); 41 | $this->assertFalse($googleScraper->setFetcherCacheDir(3)); 42 | $this->assertFalse($googleScraper->setFetcherCacheDir('foobar1')); 43 | $this->assertFalse($googleScraper->setOutDir('fetcher_cache')); 44 | 
$this->assertTrue($googleScraper->setFetcherCacheDir('foobar2')); 45 | $this->assertTrue(file_exists('foobar1') && is_dir('foobar1')); 46 | $this->assertTrue(file_exists('foobar2') && is_dir('foobar2')); 47 | $this->assertFalse($googleScraper->setCacheTTL('bar')); 48 | $this->assertTrue($googleScraper->setCacheTTL(200)); 49 | $this->assertEquals($googleScraper->getFetcher()->getCacheTTL(), 200); 50 | $this->assertFalse($googleScraper->setRequestDelay('foo')); 51 | $this->assertTrue($googleScraper->setRequestDelay(100)); 52 | $this->assertEquals($googleScraper->getOutDir(), 'foobar1'); 53 | $this->assertEquals($googleScraper->getFetcherCacheDir(), 'foobar2'); 54 | $this->assertEquals($googleScraper->getCacheTTL(), 200); 55 | $this->assertEquals($googleScraper->getRequestDelay(), 100); 56 | $this->assertFalse($googleScraper->addKeyword(3)); 57 | $this->assertTrue($googleScraper->addKeyword('foobaz')); 58 | $this->assertTrue($googleScraper->addKeyword('baz')); 59 | $this->assertEquals($googleScraper->getKeywords(), 60 | array('foo', 'foobaz', 'baz')); 61 | $this->assertEquals($googleScraper->getThrottler()->getComponents(), 62 | array('foo' => 0, 'foobaz' => 0, 'baz' => 0)); 63 | $this->assertFalse($googleScraper->addKeywords(array())); 64 | $this->assertFalse($googleScraper->addKeywords(array('foo' => 1))); 65 | $this->assertTrue($googleScraper->addKeywords(array('foo', 'baz', 'bar'))); 66 | $this->assertEquals($googleScraper->getKeywords(), 67 | array('foo', 'foobaz', 'baz', 'bar')); 68 | $this->assertTrue($googleScraper->removeKeyword('foobaz')); 69 | $this->assertTrue($googleScraper->removeKeyword('foo')); 70 | $this->assertEquals($googleScraper->getKeywords(), array('baz', 'bar')); 71 | $this->assertEquals($googleScraper->getThrottler()->getComponents(), 72 | array('foo' => 0, 'foobaz' => 0, 'baz' => 0, 'bar' => 0)); 73 | 74 | $askScraper = Builder::create($this->engines[1], array(array('foo'))); 75 | $this->assertEquals($askScraper->getOutDir(), 'out'); 76 | 
$this->assertEquals($askScraper->getFetcherCacheDir(), 'fetcher_cache'); 77 | $this->assertEquals($askScraper->getSerializerCacheDir(), 'serializer_cache'); 78 | $this->assertEquals($askScraper->getCacheTTL(), 24); 79 | $this->assertEquals($askScraper->getRequestDelay(), 500); 80 | $this->assertEquals($askScraper->getKeywords(), array('foo')); 81 | $this->assertEquals($askScraper->getThrottler()->getName(), 'http_requests'); 82 | $this->assertEquals($askScraper->getThrottler()->getGlobalThreshold(), 15); 83 | $this->assertEquals($askScraper->getThrottler()->getMetric(), 'hrs'); 84 | $this->assertEquals($askScraper->getThrottler()->getMetricFactor(), 1); 85 | $this->assertNull($askScraper->getThrottler()->getComponentThreshold()); 86 | $this->assertEquals($askScraper->getThrottler()->getComponents(), 87 | array('foo' => 0)); 88 | $this->assertFalse($askScraper->setOutDir(2)); 89 | $this->assertTrue($askScraper->setOutDir('foobar3')); 90 | $this->assertFalse($askScraper->setFetcherCacheDir(3)); 91 | $this->assertTrue($askScraper->setFetcherCacheDir('foobar4')); 92 | $this->assertTrue(file_exists('foobar3') && is_dir('foobar3')); 93 | $this->assertTrue(file_exists('foobar4') && is_dir('foobar4')); 94 | $this->assertFalse($askScraper->setCacheTTL('bar')); 95 | $this->assertTrue($askScraper->setCacheTTL(200)); 96 | $this->assertEquals($askScraper->getFetcher()->getCacheTTL(), 200); 97 | $this->assertFalse($askScraper->setRequestDelay('foo')); 98 | $this->assertTrue($askScraper->setRequestDelay(100)); 99 | $this->assertEquals($askScraper->getOutDir(), 'foobar3'); 100 | $this->assertEquals($askScraper->getFetcherCacheDir(), 'foobar4'); 101 | $this->assertEquals($askScraper->getCacheTTL(), 200); 102 | $this->assertEquals($askScraper->getRequestDelay(), 100); 103 | $this->assertFalse($askScraper->addKeyword(3)); 104 | $this->assertTrue($askScraper->addKeyword('foobaz')); 105 | $this->assertTrue($askScraper->addKeyword('baz')); 106 | 
$this->assertEquals($askScraper->getKeywords(), 107 | array('foo', 'foobaz', 'baz')); 108 | $this->assertEquals($askScraper->getThrottler()->getComponents(), 109 | array('foo' => 0, 'foobaz' => 0, 'baz' => 0)); 110 | $this->assertFalse($askScraper->addKeywords(array('foo', 'bar' => 1))); 111 | $this->assertTrue($askScraper->addKeywords(array('bar', 'foobar', 'barfoo'))); 112 | $this->assertTrue($askScraper->removeKeyword('foobaz')); 113 | $this->assertTrue($askScraper->removeKeyword('foo')); 114 | $this->assertEquals($askScraper->getKeywords(), 115 | array('baz', 'bar', 'foobar', 'barfoo')); 116 | $this->assertEquals($askScraper->getThrottler()->getComponents(), 117 | array('foo' => 0, 'foobaz' => 0, 'baz' => 0, 118 | 'bar' => 0, 'foobar' => 0, 'barfoo' => 0)); 119 | 120 | $bingScraper = Builder::create($this->engines[2], array(array('foo'))); 121 | $this->assertEquals($bingScraper->getOutDir(), 'out'); 122 | $this->assertEquals($bingScraper->getFetcherCacheDir(), 'fetcher_cache'); 123 | $this->assertEquals($bingScraper->getSerializerCacheDir(), 'serializer_cache'); 124 | $this->assertEquals($bingScraper->getCacheTTL(), 24); 125 | $this->assertEquals($bingScraper->getRequestDelay(), 500); 126 | $this->assertEquals($bingScraper->getKeywords(), array('foo')); 127 | $this->assertEquals($bingScraper->getThrottler()->getName(), 'http_requests'); 128 | $this->assertEquals($bingScraper->getThrottler()->getGlobalThreshold(), 15); 129 | $this->assertEquals($bingScraper->getThrottler()->getMetric(), 'hrs'); 130 | $this->assertEquals($bingScraper->getThrottler()->getMetricFactor(), 1); 131 | $this->assertNull($bingScraper->getThrottler()->getComponentThreshold()); 132 | $this->assertEquals($bingScraper->getThrottler()->getComponents(), 133 | array('foo' => 0)); 134 | $this->assertFalse($bingScraper->setOutDir(2)); 135 | $this->assertTrue($bingScraper->setOutDir('foobar5')); 136 | $this->assertFalse($bingScraper->setFetcherCacheDir(3)); 137 | 
$this->assertTrue($bingScraper->setFetcherCacheDir('foobar6')); 138 | $this->assertTrue(file_exists('foobar5') && is_dir('foobar5')); 139 | $this->assertTrue(file_exists('foobar6') && is_dir('foobar6')); 140 | $this->assertFalse($bingScraper->setCacheTTL('bar')); 141 | $this->assertTrue($bingScraper->setCacheTTL(200)); 142 | $this->assertEquals($bingScraper->getFetcher()->getCacheTTL(), 200); 143 | $this->assertFalse($bingScraper->setRequestDelay('foo')); 144 | $this->assertTrue($bingScraper->setRequestDelay(100)); 145 | $this->assertEquals($bingScraper->getOutDir(), 'foobar5'); 146 | $this->assertEquals($bingScraper->getFetcherCacheDir(), 'foobar6'); 147 | $this->assertEquals($bingScraper->getCacheTTL(), 200); 148 | $this->assertEquals($bingScraper->getRequestDelay(), 100); 149 | $this->assertFalse($bingScraper->addKeyword(3)); 150 | $this->assertTrue($bingScraper->addKeyword('foobaz')); 151 | $this->assertTrue($bingScraper->addKeyword('baz')); 152 | $this->assertEquals($bingScraper->getKeywords(), 153 | array('foo', 'foobaz', 'baz')); 154 | $this->assertEquals($bingScraper->getThrottler()->getComponents(), 155 | array('foo' => 0, 'foobaz' => 0, 'baz' => 0)); 156 | $this->assertTrue($bingScraper->addKeywords(array('barfoo'))); 157 | $this->assertTrue($bingScraper->removeKeyword('foobaz')); 158 | $this->assertTrue($bingScraper->removeKeyword('foo')); 159 | $this->assertEquals($bingScraper->getKeywords(), array('baz', 'barfoo')); 160 | $this->assertEquals($bingScraper->getThrottler()->getComponents(), 161 | array('foo' => 0, 'foobaz' => 0, 'baz' => 0, 'barfoo' => 0)); 162 | 163 | $yahooScraper = Builder::create($this->engines[3], array(array('foo'))); 164 | $this->assertEquals($yahooScraper->getOutDir(), 'out'); 165 | $this->assertEquals($yahooScraper->getFetcherCacheDir(), 'fetcher_cache'); 166 | $this->assertEquals($yahooScraper->getSerializerCacheDir(), 'serializer_cache'); 167 | $this->assertEquals($yahooScraper->getCacheTTL(), 24); 168 | 
$this->assertEquals($yahooScraper->getRequestDelay(), 500); 169 | $this->assertEquals($yahooScraper->getKeywords(), array('foo')); 170 | $this->assertEquals($yahooScraper->getThrottler()->getName(), 'http_requests'); 171 | $this->assertEquals($yahooScraper->getThrottler()->getGlobalThreshold(), 15); 172 | $this->assertEquals($yahooScraper->getThrottler()->getMetric(), 'hrs'); 173 | $this->assertEquals($yahooScraper->getThrottler()->getMetricFactor(), 1); 174 | $this->assertNull($yahooScraper->getThrottler()->getComponentThreshold()); 175 | $this->assertEquals($yahooScraper->getThrottler()->getComponents(), 176 | array('foo' => 0)); 177 | $this->assertFalse($yahooScraper->setOutDir(2)); 178 | $this->assertTrue($yahooScraper->setOutDir('foobar7')); 179 | $this->assertFalse($yahooScraper->setFetcherCacheDir(3)); 180 | $this->assertTrue($yahooScraper->setFetcherCacheDir('foobar8')); 181 | $this->assertTrue(file_exists('foobar7') && is_dir('foobar7')); 182 | $this->assertTrue(file_exists('foobar8') && is_dir('foobar8')); 183 | $this->assertFalse($yahooScraper->setCacheTTL('bar')); 184 | $this->assertTrue($yahooScraper->setCacheTTL(200)); 185 | $this->assertFalse($yahooScraper->setRequestDelay('foo')); 186 | $this->assertTrue($yahooScraper->setRequestDelay(100)); 187 | $this->assertEquals($yahooScraper->getOutDir(), 'foobar7'); 188 | $this->assertEquals($yahooScraper->getFetcherCacheDir(), 'foobar8'); 189 | $this->assertEquals($yahooScraper->getCacheTTL(), 200); 190 | $this->assertEquals($yahooScraper->getFetcher()->getCacheTTL(), 200); 191 | $this->assertEquals($yahooScraper->getRequestDelay(), 100); 192 | $this->assertFalse($yahooScraper->addKeyword(3)); 193 | $this->assertFalse($yahooScraper->addKeyword('foo')); 194 | $this->assertTrue($yahooScraper->addKeyword('baz')); 195 | $this->assertEquals($yahooScraper->getKeywords(), array('foo', 'baz')); 196 | $this->assertEquals($yahooScraper->getThrottler()->getComponents(), 197 | array('foo' => 0, 'baz' => 0)); 198 | 
$this->assertFalse($yahooScraper->removeKeyword('foobaz')); 199 | $this->assertTrue($yahooScraper->removeKeyword('foo')); 200 | $this->assertEquals($yahooScraper->getKeywords(), array('baz')); 201 | $this->assertEquals($yahooScraper->getThrottler()->getComponents(), 202 | array('foo' => 0, 'baz' => 0)); 203 | } 204 | 205 | public function testWithCustomArgs() 206 | { 207 | $googleScraper = Builder::create($this->engines[0], 208 | array(array('foobam ', ' foobaz', 'baz'), 209 | 'foo', 'bar', 'baz', 48, 1000)); 210 | $this->assertEquals($googleScraper->getOutDir(), 'foo'); 211 | $this->assertEquals($googleScraper->getFetcherCacheDir(), 'bar'); 212 | $this->assertEquals($googleScraper->getSerializerCacheDir(), 'baz'); 213 | $this->assertTrue(file_exists('foo') && is_dir('foo')); 214 | $this->assertTrue(file_exists('bar') && is_dir('bar')); 215 | $this->assertTrue(file_exists('baz') && is_dir('baz')); 216 | $this->assertEquals($googleScraper->getCacheTTL(), 48); 217 | $this->assertEquals($googleScraper->getRequestDelay(), 1000); 218 | $this->assertEquals($googleScraper->getKeywords(), 219 | array('foobam', 'foobaz', 'baz')); 220 | $this->assertEquals($googleScraper->getThrottler()->getName(), 'http_requests'); 221 | $this->assertEquals($googleScraper->getThrottler()->getGlobalThreshold(), 15); 222 | $this->assertEquals($googleScraper->getThrottler()->getMetric(), 'hrs'); 223 | $this->assertEquals($googleScraper->getThrottler()->getMetricFactor(), 1); 224 | $this->assertNull($googleScraper->getThrottler()->getComponentThreshold()); 225 | $this->assertEquals($googleScraper->getThrottler()->getComponents(), 226 | array('foobam' => 0, 'foobaz' => 0, 'baz' => 0)); 227 | $this->assertFalse($googleScraper->setOutDir(2)); 228 | $this->assertTrue($googleScraper->setOutDir('foobar1')); 229 | $this->assertFalse($googleScraper->setFetcherCacheDir(3)); 230 | $this->assertTrue($googleScraper->setFetcherCacheDir('foobar2')); 231 | $this->assertTrue(file_exists('foobar1') && 
is_dir('foobar1')); 232 | $this->assertTrue(file_exists('foobar2') && is_dir('foobar2')); 233 | $this->assertFalse($googleScraper->setCacheTTL('bar')); 234 | $this->assertTrue($googleScraper->setCacheTTL(200)); 235 | $this->assertFalse($googleScraper->setRequestDelay('foo')); 236 | $this->assertTrue($googleScraper->setRequestDelay(100)); 237 | $this->assertEquals($googleScraper->getOutDir(), 'foobar1'); 238 | $this->assertEquals($googleScraper->getFetcherCacheDir(), 'foobar2'); 239 | $this->assertEquals($googleScraper->getCacheTTL(), 200); 240 | $this->assertEquals($googleScraper->getRequestDelay(), 100); 241 | $this->assertFalse($googleScraper->addKeyword(3)); 242 | $this->assertTrue($googleScraper->addKeyword("\t foo")); 243 | $this->assertFalse($googleScraper->addKeyword('baz')); 244 | $this->assertEquals($googleScraper->getKeywords(), 245 | array('foobam', 'foobaz', 'baz', 'foo')); 246 | $this->assertEquals($googleScraper->getThrottler()->getComponents(), 247 | array('foo' => 0, 'foobam' => 0, 'foobaz' => 0, 'baz' => 0)); 248 | $this->assertTrue($googleScraper->removeKeyword('foobaz')); 249 | $this->assertTrue($googleScraper->removeKeyword('foo')); 250 | $this->assertEquals($googleScraper->getKeywords(), array('foobam', 'baz')); 251 | $this->assertTrue($googleScraper->addKeywords(array('foobaz', 'foo'))); 252 | $this->assertEquals($googleScraper->getKeywords(), 253 | array('foobam', 'baz', 'foobaz', 'foo')); 254 | 255 | $askScraper = Builder::create($this->engines[1], array(array('foobaz'), 256 | 'fooz', 'barz', 'bazz', 72, 1500)); 257 | $this->assertEquals($askScraper->getOutDir(), 'fooz'); 258 | $this->assertEquals($askScraper->getFetcherCacheDir(), 'barz'); 259 | $this->assertEquals($askScraper->getSerializerCacheDir(), 'bazz'); 260 | $this->assertTrue(file_exists('fooz') && is_dir('fooz')); 261 | $this->assertTrue(file_exists('barz') && is_dir('barz')); 262 | $this->assertTrue(file_exists('bazz') && is_dir('bazz')); 263 | 
$this->assertEquals($askScraper->getCacheTTL(), 72); 264 | $this->assertEquals($askScraper->getRequestDelay(), 1500); 265 | $this->assertEquals($askScraper->getKeywords(), array('foobaz')); 266 | $this->assertEquals($askScraper->getThrottler()->getName(), 'http_requests'); 267 | $this->assertEquals($askScraper->getThrottler()->getGlobalThreshold(), 15); 268 | $this->assertEquals($askScraper->getThrottler()->getMetric(), 'hrs'); 269 | $this->assertEquals($askScraper->getThrottler()->getMetricFactor(), 1); 270 | $this->assertNull($askScraper->getThrottler()->getComponentThreshold()); 271 | $this->assertEquals($askScraper->getThrottler()->getComponents(), 272 | array('foobaz' => 0)); 273 | 274 | $bingScraper = Builder::create($this->engines[2], 275 | array( 276 | array('foobam', 'foobaz'), 277 | 'foobar', 'barfoo', 'bazfoo', 148, 278 | 100)); 279 | $this->assertEquals($bingScraper->getOutDir(), 'foobar'); 280 | $this->assertEquals($bingScraper->getFetcherCacheDir(), 'barfoo'); 281 | $this->assertEquals($bingScraper->getSerializerCacheDir(), 'bazfoo'); 282 | $this->assertTrue(file_exists('foobar') && is_dir('foobar')); 283 | $this->assertTrue(file_exists('barfoo') && is_dir('barfoo')); 284 | $this->assertTrue(file_exists('bazfoo') && is_dir('bazfoo')); 285 | $this->assertEquals($bingScraper->getCacheTTL(), 148); 286 | $this->assertEquals($bingScraper->getRequestDelay(), 100); 287 | $this->assertEquals($bingScraper->getKeywords(), array('foobam', 'foobaz')); 288 | $this->assertEquals($bingScraper->getThrottler()->getName(), 'http_requests'); 289 | $this->assertEquals($bingScraper->getThrottler()->getGlobalThreshold(), 15); 290 | $this->assertEquals($bingScraper->getThrottler()->getMetric(), 'hrs'); 291 | $this->assertEquals($bingScraper->getThrottler()->getMetricFactor(), 1); 292 | $this->assertNull($bingScraper->getThrottler()->getComponentThreshold()); 293 | $this->assertEquals($bingScraper->getThrottler()->getComponents(), 294 | array('foobam' => 0, 'foobaz' => 0)); 
295 | 296 | $yahooScraper = Builder::create($this->engines[3], /* slot 3 is where the other tests in this file take their Yahoo engine from; slot 1 is the one used for the Ask scraper */ 297 | array( 298 | array(' foobam', ' foobaz'), 299 | 'foo', 'bar', 'baz', 48, 1000)); 300 | $this->assertEquals($yahooScraper->getOutDir(), 'foo'); 301 | $this->assertEquals($yahooScraper->getFetcherCacheDir(), 'bar'); 302 | $this->assertEquals($yahooScraper->getSerializerCacheDir(), 'baz'); 303 | $this->assertEquals($yahooScraper->getCacheTTL(), 48); 304 | $this->assertEquals($yahooScraper->getRequestDelay(), 1000); 305 | $this->assertEquals($yahooScraper->getKeywords(), array('foobam', 'foobaz')); 306 | $this->assertEquals($yahooScraper->getThrottler()->getName(), 'http_requests'); 307 | $this->assertEquals($yahooScraper->getThrottler()->getGlobalThreshold(), 15); 308 | $this->assertEquals($yahooScraper->getThrottler()->getMetric(), 'hrs'); 309 | $this->assertEquals($yahooScraper->getThrottler()->getMetricFactor(), 1); 310 | $this->assertNull($yahooScraper->getThrottler()->getComponentThreshold()); 311 | $this->assertEquals($yahooScraper->getThrottler()->getComponents(), 312 | array('foobam' => 0, 'foobaz' => 0)); 313 | } 314 | } 315 | 316 | class ScrapingTest extends PHPUnit_Framework_TestCase 317 | { 318 | protected $engines; 319 | 320 | protected function setUp() 321 | { 322 | $engines = array('gOOgLe', 'aSk', 'BIng', 'yAHOo'); 323 | $this->engines = $engines; 324 | } 325 | 326 | protected function tearDown() 327 | { 328 | TestHelper::cleanMess(); 329 | } 330 | 331 | public function testKeyToUrlMapping() 332 | { 333 | $googleScraper = Builder::create($this->engines[0], 334 | array(array('foo', 'baz'))); 335 | $mapKeywordsToUrls = TestHelper::getMethod('mapKeywordsToUrls', 'Google'); 336 | $this->assertEquals( 337 | $mapKeywordsToUrls->invokeArgs($googleScraper, array(1, 'foo')), 338 | array( 339 | "foo" => array("http://www.google.com/search?q=foo&start=0") 340 | ) 341 | ); 342 | 343 | $this->assertEquals( 344 | $mapKeywordsToUrls->invokeArgs($googleScraper, array(1, 
$googleScraper->getKeywords())), 345 | array( 346 | "foo" => array("http://www.google.com/search?q=foo&start=0"), 347 | "baz" => array("http://www.google.com/search?q=baz&start=0") 348 | ) 349 | ); 350 | 351 | $this->assertEquals( 352 | $mapKeywordsToUrls->invokeArgs($googleScraper, array(2, $googleScraper->getKeywords())), 353 | array( 354 | "foo" => array("http://www.google.com/search?q=foo&start=0", 355 | "http://www.google.com/search?q=foo&start=10"), 356 | "baz" => array("http://www.google.com/search?q=baz&start=0", 357 | "http://www.google.com/search?q=baz&start=10") 358 | ) 359 | ); 360 | 361 | $askScraper = Builder::create($this->engines[1], array(array('foobar', 'baz'))); 362 | $mapKeywordsToUrls = TestHelper::getMethod('mapKeywordsToUrls', 'Ask'); 363 | $this->assertEquals($mapKeywordsToUrls->invokeArgs($askScraper, array(1, 'foobar')), 364 | array( 365 | "foobar" => array("http://us.ask.com/web?q=foobar&page=1") 366 | ) 367 | ); 368 | 369 | $this->assertEquals($mapKeywordsToUrls->invokeArgs($askScraper, array(1, $askScraper->getKeywords())), 370 | array( 371 | "foobar" => array("http://us.ask.com/web?q=foobar&page=1"), 372 | "baz" => array("http://us.ask.com/web?q=baz&page=1") 373 | ) 374 | ); 375 | 376 | $this->assertEquals($mapKeywordsToUrls->invokeArgs($askScraper, array(2, $askScraper->getKeywords())), 377 | array("foobar" => array("http://us.ask.com/web?q=foobar&page=1", 378 | "http://us.ask.com/web?q=foobar&page=2"), 379 | "baz" => array("http://us.ask.com/web?q=baz&page=1", 380 | "http://us.ask.com/web?q=baz&page=2") 381 | ) 382 | ); 383 | 384 | $bingScraper = Builder::create($this->engines[2], array(array('bazfoo', 'foobaz'))); 385 | $mapKeywordsToUrls = TestHelper::getMethod('mapKeywordsToUrls', 'Bing'); 386 | $this->assertEquals($mapKeywordsToUrls->invokeArgs($bingScraper, array(1, 'bazfoo')), 387 | array( 388 | "bazfoo" => array("http://www.bing.com/search?q=bazfoo&first=1") 389 | ) 390 | ); 391 | 392 | 
$this->assertEquals($mapKeywordsToUrls->invokeArgs($bingScraper, array(1, $bingScraper->getKeywords())), 393 | array( 394 | "bazfoo" => array("http://www.bing.com/search?q=bazfoo&first=1"), 395 | "foobaz" => array("http://www.bing.com/search?q=foobaz&first=1") 396 | ) 397 | ); 398 | 399 | $this->assertEquals($mapKeywordsToUrls->invokeArgs($bingScraper, array(2, $bingScraper->getKeywords())), 400 | array( 401 | "bazfoo" => array("http://www.bing.com/search?q=bazfoo&first=1", 402 | "http://www.bing.com/search?q=bazfoo&first=11"), 403 | "foobaz" => array("http://www.bing.com/search?q=foobaz&first=1", 404 | "http://www.bing.com/search?q=foobaz&first=11") 405 | ) 406 | ); 407 | 408 | $yahooScraper = Builder::create($this->engines[3], array(array('foo'))); 409 | $mapKeywordsToUrls = TestHelper::getMethod('mapKeywordsToUrls', 'Yahoo'); 410 | $this->assertEquals($mapKeywordsToUrls->invokeArgs($yahooScraper, array(1, 'foo')), 411 | array( 412 | "foo" => array("https://search.yahoo.com/search?p=foo&b=1") 413 | ) 414 | ); 415 | 416 | $this->assertEquals($mapKeywordsToUrls->invokeArgs($yahooScraper, array(5, $yahooScraper->getKeywords())), 417 | array( 418 | "foo" => array("https://search.yahoo.com/search?p=foo&b=1", 419 | "https://search.yahoo.com/search?p=foo&b=11", 420 | "https://search.yahoo.com/search?p=foo&b=21", 421 | "https://search.yahoo.com/search?p=foo&b=31", 422 | "https://search.yahoo.com/search?p=foo&b=41") 423 | ) 424 | ); 425 | } 426 | 427 | public function testHitCounter() 428 | { 429 | $googleScraper = Builder::create($this->engines[0], array(array('foo', 'baz'))); 430 | $mapKeywordsToUrls = TestHelper::getMethod('mapKeywordsToUrls', 'Google'); 431 | $urlsToScrape = $mapKeywordsToUrls->invokeArgs($googleScraper, 432 | array(1, $googleScraper->getKeywords())); 433 | $hitCounter = TestHelper::getMethod('hitCounter', 'Google'); 434 | list($globalHitCount, $componentHitCount) = $hitCounter->invokeArgs($googleScraper, 435 | array($urlsToScrape)); 436 | 
$this->assertEquals($globalHitCount, 2); 437 | $this->assertEquals($componentHitCount, array("foo" => 1, "baz" => 1)); 438 | $this->assertTrue($googleScraper->addKeyword('foobaz')); 439 | $urlsToScrape = $mapKeywordsToUrls->invokeArgs($googleScraper, 440 | array(2, $googleScraper->getKeywords())); 441 | list($globalHitCount, $componentHitCount) = $hitCounter->invokeArgs($googleScraper, 442 | array($urlsToScrape)); 443 | $this->assertEquals($globalHitCount, 6); 444 | $this->assertEquals($componentHitCount, array("foo" => 2, "baz" => 2, "foobaz" => 2)); 445 | $this->assertTrue($googleScraper->removeKeyword("foo")); 446 | $this->assertTrue($googleScraper->removeKeyword("baz")); 447 | $urlsToScrape = $mapKeywordsToUrls->invokeArgs($googleScraper, 448 | array(3, $googleScraper->getKeywords())); 449 | list($globalHitCount, $componentHitCount) = $hitCounter->invokeArgs($googleScraper, 450 | array($urlsToScrape)); 451 | $this->assertEquals($globalHitCount, 3); 452 | $this->assertEquals($componentHitCount, array("foobaz" => 3)); 453 | 454 | $askScraper = Builder::create($this->engines[1], array(array('foobar', 'baz'))); 455 | $mapKeywordsToUrls = TestHelper::getMethod('mapKeywordsToUrls', 'Ask'); 456 | $urlsToScrape = $mapKeywordsToUrls->invokeArgs($askScraper, 457 | array(1, $askScraper->getKeywords())); 458 | $hitCounter = TestHelper::getMethod('hitCounter', 'Ask'); 459 | list($globalHitCount, $componentHitCount) = $hitCounter->invokeArgs($askScraper, 460 | array($urlsToScrape)); 461 | $this->assertEquals($globalHitCount, 2); 462 | $this->assertEquals($componentHitCount, array("foobar" => 1, "baz" => 1)); 463 | $this->assertTrue($askScraper->addKeyword('foobaz')); 464 | $urlsToScrape = $mapKeywordsToUrls->invokeArgs($askScraper, 465 | array(2, $askScraper->getKeywords())); 466 | list($globalHitCount, $componentHitCount) = $hitCounter->invokeArgs($askScraper, 467 | array($urlsToScrape)); 468 | $this->assertEquals($globalHitCount, 6); 469 | 
$this->assertEquals($componentHitCount, array("foobar" => 2, "baz" => 2, "foobaz" => 2)); 470 | $this->assertTrue($askScraper->removeKeyword("foobar")); 471 | $this->assertTrue($askScraper->removeKeyword("baz")); 472 | $urlsToScrape = $mapKeywordsToUrls->invokeArgs($askScraper, 473 | array(3, $askScraper->getKeywords())); 474 | list($globalHitCount, $componentHitCount) = $hitCounter->invokeArgs($askScraper, 475 | array($urlsToScrape)); 476 | $this->assertEquals($globalHitCount, 3); 477 | $this->assertEquals($componentHitCount, array("foobaz" => 3)); 478 | 479 | $bingScraper = Builder::create($this->engines[2], array(array('bazfoo', 'foobaz'))); 480 | $mapKeywordsToUrls = TestHelper::getMethod('mapKeywordsToUrls', 'Bing'); 481 | $urlsToScrape = $mapKeywordsToUrls->invokeArgs($bingScraper, 482 | array(1, $bingScraper->getKeywords())); 483 | $hitCounter = TestHelper::getMethod('hitCounter', 'Bing'); 484 | list($globalHitCount, $componentHitCount) = $hitCounter->invokeArgs($bingScraper, 485 | array($urlsToScrape)); 486 | $this->assertEquals($globalHitCount, 2); 487 | $this->assertEquals($componentHitCount, array("bazfoo" => 1, "foobaz" => 1)); 488 | $this->assertTrue($bingScraper->addKeyword('foo')); 489 | $urlsToScrape = $mapKeywordsToUrls->invokeArgs($bingScraper, 490 | array(2, $bingScraper->getKeywords())); 491 | list($globalHitCount, $componentHitCount) = $hitCounter->invokeArgs($bingScraper, 492 | array($urlsToScrape)); 493 | $this->assertEquals($globalHitCount, 6); 494 | $this->assertEquals($componentHitCount, array("bazfoo" => 2, "foo" => 2, "foobaz" => 2)); 495 | $this->assertTrue($bingScraper->removeKeyword("foo")); 496 | $this->assertTrue($bingScraper->removeKeyword("bazfoo")); 497 | $urlsToScrape = $mapKeywordsToUrls->invokeArgs($bingScraper, 498 | array(3, $bingScraper->getKeywords())); 499 | list($globalHitCount, $componentHitCount) = $hitCounter->invokeArgs($bingScraper, 500 | array($urlsToScrape)); 501 | $this->assertEquals($globalHitCount, 3); 502 | 
$this->assertEquals($componentHitCount, array("foobaz" => 3));

        // Yahoo: same hit-counting scenario, starting from a single keyword.
        $yahooScraper = Builder::create($this->engines[3], array(array('foo')));
        $mapKeywordsToUrls = TestHelper::getMethod('mapKeywordsToUrls', 'Yahoo');
        $hitCounter = TestHelper::getMethod('hitCounter', 'Yahoo');
        $urlsToScrape = $mapKeywordsToUrls->invokeArgs($yahooScraper,
                                                       array(1, $yahooScraper->getKeywords()));
        list($globalHitCount, $componentHitCount) = $hitCounter->invokeArgs($yahooScraper,
                                                                            array($urlsToScrape));
        $this->assertEquals($globalHitCount, 1);
        $this->assertEquals($componentHitCount, array("foo" => 1));
        $this->assertTrue($yahooScraper->addKeyword('foobaz'));
        $urlsToScrape = $mapKeywordsToUrls->invokeArgs($yahooScraper,
                                                       array(2, $yahooScraper->getKeywords()));
        list($globalHitCount, $componentHitCount) = $hitCounter->invokeArgs($yahooScraper,
                                                                            array($urlsToScrape));
        $this->assertEquals($globalHitCount, 4);
        $this->assertEquals($componentHitCount, array("foo" => 2, "foobaz" => 2));
        $this->assertTrue($yahooScraper->removeKeyword("foobaz"));
        $urlsToScrape = $mapKeywordsToUrls->invokeArgs($yahooScraper,
                                                       array(3, $yahooScraper->getKeywords()));
        list($globalHitCount, $componentHitCount) = $hitCounter->invokeArgs($yahooScraper,
                                                                            array($urlsToScrape));
        $this->assertEquals($globalHitCount, 3);
        $this->assertEquals($componentHitCount, array("foo" => 3));
    }

    /**
     * Maps the scraper's current keywords to URLs for the given page count,
     * counts the resulting hits and feeds both counts to the hit checker.
     *
     * The three method handles are the objects returned by
     * TestHelper::getMethod() (presumably accessible \ReflectionMethod
     * instances - only invokeArgs() is relied upon here).
     *
     * @param  object $scraper           Scraper instance built by Builder::create().
     * @param  object $mapKeywordsToUrls Handle for the scraper's mapKeywordsToUrls method.
     * @param  object $hitCounter        Handle for the scraper's hitCounter method.
     * @param  object $hitChecker        Handle for the scraper's hitChecker method.
     * @param  int    $pages             Page count passed to mapKeywordsToUrls.
     * @return mixed  Whatever hitChecker returns (asserted as true/false by callers).
     */
    private function invokeHitChecker($scraper, $mapKeywordsToUrls, $hitCounter, $hitChecker, $pages)
    {
        $urlsToScrape = $mapKeywordsToUrls->invokeArgs($scraper,
                                                       array($pages, $scraper->getKeywords()));
        list($globalHitCount, $componentHitCount) = $hitCounter->invokeArgs($scraper,
                                                                            array($urlsToScrape));

        return $hitChecker->invokeArgs($scraper, array($globalHitCount, $componentHitCount));
    }

    /**
     * Checks hitChecker against the default thresholds and against thresholds
     * changed through the throttler, for Google, Ask and Bing.
     *
     * NOTE(review): unlike the sibling tests there is no Yahoo case here.
     */
    public function testHitChecker()
    {
        // Google, two keywords: 1 and 7 pages are accepted, 8 is rejected.
        $googleScraper = Builder::create($this->engines[0], array(array('foo', 'baz')));
        $mapKeywordsToUrls = TestHelper::getMethod('mapKeywordsToUrls', 'Google');
        $hitCounter = TestHelper::getMethod('hitCounter', 'Google');
        $hitChecker = TestHelper::getMethod('hitChecker', 'Google');
        $this->assertTrue($this->invokeHitChecker($googleScraper, $mapKeywordsToUrls,
                                                  $hitCounter, $hitChecker, 1));
        $this->assertTrue($this->invokeHitChecker($googleScraper, $mapKeywordsToUrls,
                                                  $hitCounter, $hitChecker, 7));
        $this->assertFalse($this->invokeHitChecker($googleScraper, $mapKeywordsToUrls,
                                                   $hitCounter, $hitChecker, 8));
        // The throttler is stopped before its thresholds are changed.
        $googleScraper->getThrottler()->stop();
        $this->assertTrue($googleScraper->getThrottler()->setComponentThreshold(3));
        $this->assertTrue($this->invokeHitChecker($googleScraper, $mapKeywordsToUrls,
                                                  $hitCounter, $hitChecker, 3));
        $this->assertFalse($this->invokeHitChecker($googleScraper, $mapKeywordsToUrls,
                                                   $hitCounter, $hitChecker, 4));

        // Ask, three keywords, global threshold raised to 100.
        $askScraper = Builder::create($this->engines[1], array(array('foo', 'baz', 'foobaz')));
        $mapKeywordsToUrls = TestHelper::getMethod('mapKeywordsToUrls', 'Ask');
        $hitCounter = TestHelper::getMethod('hitCounter', 'Ask');
        $hitChecker = TestHelper::getMethod('hitChecker', 'Ask');
        $askScraper->getThrottler()->stop();
        $this->assertTrue($askScraper->getThrottler()->setGlobalThreshold(100));
        $this->assertTrue($this->invokeHitChecker($askScraper, $mapKeywordsToUrls,
                                                  $hitCounter, $hitChecker, 30));
        $this->assertFalse($this->invokeHitChecker($askScraper, $mapKeywordsToUrls,
                                                   $hitCounter, $hitChecker, 40));
        $this->assertTrue($askScraper->getThrottler()->setComponentThreshold(20));
        $this->assertTrue($this->invokeHitChecker($askScraper, $mapKeywordsToUrls,
                                                  $hitCounter, $hitChecker, 15));
        $this->assertFalse($this->invokeHitChecker($askScraper, $mapKeywordsToUrls,
                                                   $hitCounter, $hitChecker, 21));

        // Bing, three keywords, default thresholds: 4 pages accepted, 5 rejected.
        $bingScraper = Builder::create($this->engines[2], array(array('foo', 'baz', 'foobaz')));
        $mapKeywordsToUrls = TestHelper::getMethod('mapKeywordsToUrls', 'Bing');
        $hitCounter = TestHelper::getMethod('hitCounter', 'Bing');
        $hitChecker = TestHelper::getMethod('hitChecker', 'Bing');
        $this->assertTrue($this->invokeHitChecker($bingScraper, $mapKeywordsToUrls,
                                                  $hitCounter, $hitChecker, 4));
        $this->assertFalse($this->invokeHitChecker($bingScraper, $mapKeywordsToUrls,
                                                   $hitCounter, $hitChecker, 5));
    }

    /**
     * Asserts that scrape()/scrapeAll() return false for every argument
     * combination below. The scraper is expected to hold the keywords
     * 'foo' and 'baz'; both are removed before the final scrapeAll() call.
     *
     * @param object $scraper Scraper instance built by Builder::create().
     */
    private function assertScrapingIsFiltered($scraper)
    {
        $this->assertFalse($scraper->scrape(0));
        $this->assertFalse($scraper->scrape(''));
        $this->assertFalse($scraper->scrape('foobaz'));
        $this->assertFalse($scraper->scrape('foo', 's'));
        $this->assertFalse($scraper->scrape('foo', 0));
        $this->assertFalse($scraper->scrape('foo', 1, 'UTC', 1));
        $this->assertFalse($scraper->scrape('foo', 16));
        $this->assertFalse($scraper->scrape('foo', 100));
        // The throttler is stopped before its thresholds are changed.
        $scraper->getThrottler()->stop();
        $this->assertTrue($scraper->getThrottler()->setGlobalThreshold(10));
        $this->assertFalse($scraper->scrape('foo', 11));
        $this->assertFalse($scraper->scrape('foo', 10));
        $this->assertTrue($scraper->getThrottler()->setComponentThreshold(4));
        $this->assertFalse($scraper->scrape('foo', 5));
        $this->assertFalse($scraper->scrapeAll(8));
        $this->assertFalse($scraper->scrapeAll('foo'));
        $this->assertFalse($scraper->scrapeAll(2, 'foo'));
        $this->assertTrue($scraper->removeKeyword("foo"));
        $this->assertTrue($scraper->removeKeyword("baz"));
        // No keywords are left at this point.
        $this->assertFalse($scraper->scrapeAll(1));
    }

    /**
     * Runs the same set of rejected scrape()/scrapeAll() calls against each
     * of the four engines.
     */
    public function testScrapingFilter()
    {
        $googleScraper = Builder::create($this->engines[0], array(array('foo', 'baz')));
        $this->assertScrapingIsFiltered($googleScraper);

        $askScraper = Builder::create($this->engines[1], array(array('foo', 'baz')));
        $this->assertScrapingIsFiltered($askScraper);

        $bingScraper = Builder::create($this->engines[2], array(array('foo', 'baz')));
        $this->assertScrapingIsFiltered($bingScraper);

        $yahooScraper = Builder::create($this->engines[3], array(array('foo', 'baz')));
        $this->assertScrapingIsFiltered($yahooScraper);
    }

    /**
     * Checks that scrape()/scrapeAll() return false when called with no
     * keywords stored, with a keyword that was never added, or with the
     * non-matching argument values used below.
     */
    public function testScrapeArgs()
    {
        $googleScraper = Builder::create($this->engines[0], array(array('foo')));
        $this->assertTrue($googleScraper->removeKeyword('foo'));
        $this->assertFalse($googleScraper->scrape(1));
        $this->assertTrue($googleScraper->addKeywords(array("foo", "bar", "baz")));
        $this->assertFalse($googleScraper->scrape('foobar'));
        $this->assertFalse($googleScraper->scrape('foo', 'baz'));
        $this->assertFalse($googleScraper->scrape('foo', 2, 'UTC', 'baz'));
        $this->assertFalse($googleScraper->scrape('foo', 2, 'UTC', true, 'baz'));
        $this->assertFalse($googleScraper->scrapeAll('foobar'));
        $this->assertFalse($googleScraper->scrapeAll(2, 'foobar'));
        $this->assertFalse($googleScraper->scrapeAll(2, true, 'foobar'));

        $askScraper = Builder::create($this->engines[1], array(array('foo')));
        $this->assertTrue($askScraper->removeKeyword('foo'));
        $this->assertFalse($askScraper->scrape(2));
        $this->assertTrue($askScraper->addKeywords(array("foo", "bar", "baz")));
        $this->assertFalse($askScraper->scrape('foobar'));
        $this->assertFalse($askScraper->scrape('foo', 'baz'));
        $this->assertFalse($askScraper->scrape('foo', 2, 'UTC', 'baz'));
        $this->assertFalse($askScraper->scrape('foo', 2, true, 'baz'));
        $this->assertFalse($askScraper->scrapeAll('foobar'));
        $this->assertFalse($askScraper->scrapeAll(2, 'foobar'));
        $this->assertFalse($askScraper->scrapeAll(2, true, 'foobar'));

        $bingScraper = Builder::create($this->engines[2], array(array('foo')));
        $this->assertTrue($bingScraper->removeKeyword('foo'));
        $this->assertFalse($bingScraper->scrape(3));
        $this->assertTrue($bingScraper->addKeywords(array("foo", "bar", "baz")));
        $this->assertFalse($bingScraper->scrape('foobar'));
        $this->assertFalse($bingScraper->scrape('foo', 'baz'));
        $this->assertFalse($bingScraper->scrape('foo', 2, 'UTC', 'baz'));
        $this->assertFalse($bingScraper->scrape('foo', 2, true, 'baz'));
        $this->assertFalse($bingScraper->scrapeAll('foobar'));
        $this->assertFalse($bingScraper->scrapeAll(2, 'foobar'));
        $this->assertFalse($bingScraper->scrapeAll(2, true, 'foobar'));

        // Yahoo lives at index 3, as in every other test of this class; this
        // section previously built a second Bing scraper from index 2.
        $yahooScraper = Builder::create($this->engines[3], array(array('foo')));
        $this->assertTrue($yahooScraper->removeKeyword('foo'));
        $this->assertFalse($yahooScraper->scrape(4));
        $this->assertTrue($yahooScraper->addKeywords(array("foo", "bar", "baz")));
        $this->assertFalse($yahooScraper->scrape('foobar'));
        $this->assertFalse($yahooScraper->scrape('foo', 'baz'));
        $this->assertFalse($yahooScraper->scrape('foo', 2, 'UTC', 'baz'));
        $this->assertFalse($yahooScraper->scrape('foo', 2, true, 'baz'));
        $this->assertFalse($yahooScraper->scrapeAll('foobar'));
        $this->assertFalse($yahooScraper->scrapeAll(2, 'foobar'));
        $this->assertFalse($yahooScraper->scrapeAll(2, true, 'foobar'));
    }

    /**
     * Builds the list of file names for the pages currently held as
     * serialized by the scraper, by passing each key of getSerializedPages()
     * through FileSystemHelper::generateFileName.
     *
     * Must be called before save(): the Ask and Yahoo scenarios assert that
     * getSerializedPages() is empty after save(true).
     *
     * @param  object $scraper Scraper instance built by Builder::create().
     * @return array  File names, in the order of the serialized pages.
     */
    private function serializedFileNames($scraper)
    {
        return array_map('Franzip\SerpScraper\Helpers\FileSystemHelper::generateFileName',
                         array_keys($scraper->getSerializedPages()));
    }

    /**
     * Asserts that the JSON linter reports no error (lint() returns null)
     * for every listed file in the output directory.
     *
     * @param \Seld\JsonLint\JsonParser $parser    Linter used for the check.
     * @param string                    $outDir    Directory the files were saved to.
     * @param array                     $fileNames File names relative to $outDir.
     */
    private function assertSavedJsonIsValid($parser, $outDir, array $fileNames)
    {
        foreach ($fileNames as $fileName) {
            $json = file_get_contents($outDir . DIRECTORY_SEPARATOR . $fileName);
            $this->assertNull($parser->lint($json));
        }
    }

    /**
     * Asserts that \XMLReader reports every listed file in the output
     * directory as valid, with the VALIDATE parser property enabled.
     *
     * @param string $outDir    Directory the files were saved to.
     * @param array  $fileNames File names relative to $outDir.
     */
    private function assertSavedXmlIsValid($outDir, array $fileNames)
    {
        foreach ($fileNames as $fileName) {
            $xml = new \XMLReader();
            $xml->open($outDir . DIRECTORY_SEPARATOR . $fileName);
            $xml->setParserProperty(\XMLReader::VALIDATE, true);
            $this->assertTrue($xml->isValid());
        }
    }

    /**
     * End-to-end scenario per engine: rejected calls, scraping keyword by
     * keyword and in bulk, serializing to JSON and XML, saving, and
     * validating the files written to the output directory.
     *
     * The page and keyword counts asserted below show that a successful
     * scrape with the third argument set to true removes the keyword, and
     * that serialize(..., true) empties the fetched pages.
     */
    public function testScrape()
    {
        $parser = new \Seld\JsonLint\JsonParser();

        // Google: JSON first, then XML.
        $googleScraper = Builder::create($this->engines[0],
                                         array(array('foo', 'baz'), 'google'));
        $outDir = $googleScraper->getOutDir();
        $this->assertFalse($googleScraper->scrape('bar'));
        $this->assertFalse($googleScraper->scrape('baz', 100));
        $this->assertFalse($googleScraper->scrape('baz', 1, 'baz'));
        $this->assertFalse($googleScraper->scrape('baz', 1, true, 'foobad'));
        $this->assertFalse($googleScraper->scrape('baz', 1, true, 'UTC', 'faz'));
        // Nothing has been fetched yet, so there is nothing to serialize.
        $this->assertFalse($googleScraper->serialize('json'));
        $this->assertTrue($googleScraper->scrape('foo', 2, true, 'Europe/Berlin'));
        $this->assertCount(2, $googleScraper->getFetchedPages());
        $this->assertCount(1, $googleScraper->getKeywords());
        $this->assertTrue($googleScraper->scrape('baz', 2, true));
        $this->assertCount(4, $googleScraper->getFetchedPages());
        $this->assertCount(0, $googleScraper->getKeywords());
        $this->assertFalse($googleScraper->scrapeAll());
        $this->assertTrue($googleScraper->addKeywords(array('foobaz', 'foobar')));
        $this->assertTrue($googleScraper->scrapeAll(2, true, 'America/Los_Angeles'));
        $this->assertCount(8, $googleScraper->getFetchedPages());
        $this->assertCount(0, $googleScraper->getKeywords());
        $this->assertFalse($googleScraper->serialize('baz'));
        $this->assertTrue($googleScraper->serialize('json', true));
        $this->assertCount(0, $googleScraper->getFetchedPages());
        $this->assertCount(8, $googleScraper->getSerializedPages());
        $toCheck = $this->serializedFileNames($googleScraper);
        $this->assertTrue($googleScraper->save(true));
        $this->assertSavedJsonIsValid($parser, $outDir, $toCheck);
        $this->assertTrue($googleScraper->addKeywords(array('foo bad')));
        $this->assertTrue($googleScraper->scrapeAll(3, true));
        $this->assertCount(3, $googleScraper->getFetchedPages());
        $this->assertTrue($googleScraper->serialize('xml', true));
        $this->assertCount(0, $googleScraper->getFetchedPages());
        $this->assertCount(3, $googleScraper->getSerializedPages());
        $toCheck = $this->serializedFileNames($googleScraper);
        $this->assertTrue($googleScraper->save(true));
        $this->assertSavedXmlIsValid($outDir, $toCheck);

        // Ask: XML first, then JSON.
        $askScraper = Builder::create($this->engines[1],
                                      array(array('foo', 'baz'), 'ask'));
        $outDir = $askScraper->getOutDir();
        $this->assertFalse($askScraper->scrape('bar'));
        $this->assertFalse($askScraper->scrape('baz', 100));
        $this->assertFalse($askScraper->scrape('baz', 1, 'baz'));
        $this->assertFalse($askScraper->scrape('baz', 1, true, 'foobad'));
        $this->assertFalse($askScraper->scrape('baz', 1, true, 'UTC', 'faz'));
        $this->assertTrue($askScraper->scrape('foo', 2, true, 'Europe/Rome'));
        $this->assertCount(2, $askScraper->getFetchedPages());
        $this->assertCount(1, $askScraper->getKeywords());
        $this->assertTrue($askScraper->scrape('baz', 2, true));
        $this->assertCount(4, $askScraper->getFetchedPages());
        $this->assertCount(0, $askScraper->getKeywords());
        $this->assertFalse($askScraper->scrapeAll());
        $this->assertTrue($askScraper->addKeywords(array('foobaz', 'foobar')));
        $this->assertTrue($askScraper->scrapeAll(2, true, 'America/Los_Angeles'));
        $this->assertCount(8, $askScraper->getFetchedPages());
        $this->assertCount(0, $askScraper->getKeywords());
        $this->assertFalse($askScraper->serialize('baz'));
        $this->assertTrue($askScraper->serialize('xml', true));
        $this->assertCount(0, $askScraper->getFetchedPages());
        $this->assertCount(8, $askScraper->getSerializedPages());
        $toCheck = $this->serializedFileNames($askScraper);
        $this->assertTrue($askScraper->save(true));
        $this->assertCount(0, $askScraper->getSerializedPages());
        $this->assertSavedXmlIsValid($outDir, $toCheck);
        $this->assertTrue($askScraper->addKeywords(array('foobaz')));
        $this->assertTrue($askScraper->scrapeAll(3, true));
        $this->assertTrue($askScraper->serialize('json', true));
        $toCheck = $this->serializedFileNames($askScraper);
        $this->assertTrue($askScraper->save(true));
        $this->assertSavedJsonIsValid($parser, $outDir, $toCheck);

        // Bing: JSON first, then XML.
        $bingScraper = Builder::create($this->engines[2],
                                       array(array('foo', 'baz'), 'bing'));
        $outDir = $bingScraper->getOutDir();
        $this->assertFalse($bingScraper->scrape('bar'));
        $this->assertFalse($bingScraper->scrape('baz', 100));
        $this->assertFalse($bingScraper->scrape('baz', 1, 'baz'));
        $this->assertFalse($bingScraper->scrape('baz', 1, true, 'foobad'));
        $this->assertFalse($bingScraper->scrape('baz', 1, true, 'UTC', 'faz'));
        // Nothing has been fetched yet, so there is nothing to serialize.
        $this->assertFalse($bingScraper->serialize('json'));
        $this->assertTrue($bingScraper->scrape('foo', 2, true, 'Europe/Berlin'));
        $this->assertCount(2, $bingScraper->getFetchedPages());
        $this->assertCount(1, $bingScraper->getKeywords());
        $this->assertTrue($bingScraper->scrape('baz', 2, true));
        $this->assertCount(4, $bingScraper->getFetchedPages());
        $this->assertCount(0, $bingScraper->getKeywords());
        $this->assertFalse($bingScraper->scrapeAll());
        $this->assertTrue($bingScraper->addKeywords(array('foobaz', 'foobar')));
        $this->assertTrue($bingScraper->scrapeAll(2, true, 'America/Los_Angeles'));
        $this->assertCount(8, $bingScraper->getFetchedPages());
        $this->assertCount(0, $bingScraper->getKeywords());
        $this->assertFalse($bingScraper->serialize('baz'));
        $this->assertTrue($bingScraper->serialize('json', true));
        $this->assertCount(0, $bingScraper->getFetchedPages());
        $this->assertCount(8, $bingScraper->getSerializedPages());
        $toCheck = $this->serializedFileNames($bingScraper);
        $this->assertTrue($bingScraper->save(true));
        $this->assertSavedJsonIsValid($parser, $outDir, $toCheck);
        $this->assertTrue($bingScraper->addKeywords(array('foo bad')));
        $this->assertTrue($bingScraper->scrapeAll(2, true));
        $this->assertCount(2, $bingScraper->getFetchedPages());
        $this->assertTrue($bingScraper->serialize('xml', true));
        $this->assertCount(0, $bingScraper->getFetchedPages());
        $this->assertCount(2, $bingScraper->getSerializedPages());
        $toCheck = $this->serializedFileNames($bingScraper);
        $this->assertTrue($bingScraper->save(true));
        $this->assertSavedXmlIsValid($outDir, $toCheck);

        // Yahoo: XML first, then JSON.
        $yahooScraper = Builder::create($this->engines[3],
                                        array(array('foo', 'baz'), 'yahoo'));
        $outDir = $yahooScraper->getOutDir();
        $this->assertFalse($yahooScraper->scrape('bar'));
        $this->assertFalse($yahooScraper->scrape('baz', 100));
        $this->assertFalse($yahooScraper->scrape('baz', 1, 'baz'));
        $this->assertFalse($yahooScraper->scrape('baz', 1, true, 'foobad'));
        $this->assertFalse($yahooScraper->scrape('baz', 1, true, 'UTC', 'faz'));
        $this->assertTrue($yahooScraper->scrape('foo', 2, true, 'Europe/Rome'));
        $this->assertCount(2, $yahooScraper->getFetchedPages());
        $this->assertCount(1, $yahooScraper->getKeywords());
        $this->assertTrue($yahooScraper->scrape('baz', 2, true));
        $this->assertCount(4, $yahooScraper->getFetchedPages());
        $this->assertCount(0, $yahooScraper->getKeywords());
        $this->assertFalse($yahooScraper->scrapeAll());
        $this->assertTrue($yahooScraper->addKeywords(array('foobaz', 'foobar')));
        $this->assertTrue($yahooScraper->scrapeAll(2, true, 'America/Los_Angeles'));
        $this->assertCount(8, $yahooScraper->getFetchedPages());
        $this->assertCount(0, $yahooScraper->getKeywords());
        $this->assertFalse($yahooScraper->serialize('baz'));
        $this->assertTrue($yahooScraper->serialize('xml', true));
        $this->assertCount(0, $yahooScraper->getFetchedPages());
        $this->assertCount(8, $yahooScraper->getSerializedPages());
        $toCheck = $this->serializedFileNames($yahooScraper);
        $this->assertTrue($yahooScraper->save(true));
        $this->assertCount(0, $yahooScraper->getSerializedPages());
        $this->assertSavedXmlIsValid($outDir, $toCheck);
        $this->assertTrue($yahooScraper->addKeywords(array('foobaz')));
        $this->assertTrue($yahooScraper->scrapeAll(3, true));
        $this->assertTrue($yahooScraper->serialize('json', true));
        $toCheck = $this->serializedFileNames($yahooScraper);
        $this->assertTrue($yahooScraper->save(true));
        $this->assertSavedJsonIsValid($parser, $outDir, $toCheck);
    }
}

--------------------------------------------------------------------------------