├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── composer.json
├── phpunit.xml.dist
├── serp-scraper.jpg
├── src
├── Exceptions
│ ├── Exception.php
│ ├── InvalidArgumentException.php
│ ├── RuntimeException.php
│ └── UnsupportedEngineException.php
├── Helpers
│ ├── FileSystemHelper.php
│ ├── KeywordValidator.php
│ ├── SerpScraperHelper.php
│ ├── SerpUrlGenerator.php
│ └── TestHelper.php
├── Scrapers
│ ├── AskScraper.php
│ ├── BingScraper.php
│ ├── GoogleScraper.php
│ ├── SerpScraper.php
│ └── YahooScraper.php
└── SerpScraperBuilder.php
└── tests
├── SerpScraperBuilderTest.php
├── SerpScraperHelpersTest.php
└── SerpScraperTest.php
/.gitignore:
--------------------------------------------------------------------------------
1 | vendor/
2 | composer.phar
3 | composer.lock
4 | fetcher_cache/
5 | **/fetcher_cache/
6 | serializer_cache/
7 | **/serializer_cache/
8 | out/
9 | **/out/
10 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: php
2 |
3 | php:
4 | - 5.4
5 | - 5.5
6 | - 5.6
7 | - hhvm
8 |
9 | before_script:
10 | - composer self-update
11 | - composer install --prefer-source --no-interaction --dev
12 |
13 | script:
14 | - mkdir -p build/logs
15 | - phpunit --coverage-clover build/logs/clover.xml
16 |
17 | after_script:
18 | - php vendor/bin/coveralls -v
19 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Francesco Pezzella
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [Build Status](https://travis-ci.org/franzip/serp-scraper)
2 | [Coverage Status](https://coveralls.io/r/franzip/serp-scraper)
3 |
4 | # SerpScraper
5 | A library to extract, serialize and store data scraped on Search Engine result pages.
6 |
7 | ## Installing via Composer (recommended)
8 |
9 | Install composer in your project:
10 | ```
11 | curl -s http://getcomposer.org/installer | php
12 | ```
13 |
14 | Create a composer.json file in your project root:
15 | ```
16 | {
17 | "require": {
18 | "franzip/serp-scraper": "0.1.*@dev"
19 | }
20 | }
21 | ```
22 |
23 | Install via composer
24 | ```
25 | php composer.phar install
26 | ```
27 |
28 | ## Supported Search Engines
29 |
30 | * Google
31 | * Bing
32 | * Ask
33 | * Yahoo
34 |
35 | ## Supported Serialization format
36 |
37 | * JSON
38 | * XML
39 | * YAML
40 |
41 | ## Legal Disclaimer
42 |
43 | Under no circumstances shall I be considered liable to any user for direct,
44 | indirect, incidental, consequential, special, or exemplary damages, arising
45 | from or relating to the user's use or misuse of this software.
46 | Consult the following Terms of Service before using SerpScraper:
47 |
48 | * [Google](https://www.google.com/accounts/TOS)
49 | * [Bing](http://windows.microsoft.com/en-us/windows/microsoft-services-agreement)
50 | * [Ask](http://about.ask.com/terms-of-service)
51 | * [Yahoo](https://info.yahoo.com/legal/us/yahoo/utos/en-us/)
52 |
53 | ## How it works in a nutshell
54 |
55 | ![How SerpScraper works](serp-scraper.jpg)
56 |
57 | ## Description
58 |
59 | Scraping legal status seems to be quite disputed. Anyway, this library tries
60 | to avoid unnecessary HTTP overhead by using three strategies:
61 |
62 | - Throttling: [an internal object](https://github.com/franzip/throttler) takes care of capping the amount of
63 | allowed HTTP requests to a default of 15 per hour. Once that limit has been reached,
64 | it will not be possible to scrape more content until the timeframe expires.
65 |
66 | - Caching: [the library used to retrieve data](https://github.com/franzip/serp-fetcher) caches every fetched page. The
67 | default cache expiration is set to 24 hours.
68 |
69 | - Delaying: a simple and quite naive approach is used here. Multiple HTTP requests
70 | will be spaced out by a default delay of 500 microseconds (0.5 ms).
71 |
72 | ## Constructor details
73 |
74 | This is the abstract constructor, used by all the concrete implementations:
75 |
76 | ```php
77 | SerpScraper($keywords, $outDir = 'out', $fetcherCacheDir = 'fetcher_cache',
78 | $serializerCacheDir = 'serializer_cache', $cacheTTL = 24,
79 | $requestDelay = 500);
80 | ```
81 |
82 | 1. `$keywords` - array
83 | - The keywords you want to scrape. Cannot be an empty array.
84 | 2. `$outDir` - string
85 | - Path to the folder to be used to store serialized pages.
86 | 3. `$fetcherCacheDir` - string
87 | - Path to the folder to be used to store [SerpFetcher](https://github.com/franzip/serp-fetcher) cache.
88 | 4. `$serializerCacheDir` - string
89 | - Path to the folder to be used to store [SerpPageSerializer](https://github.com/franzip/serp-page-serializer) cache.
90 | 5. `$cacheTTL` - integer
91 | - Time expiration of the [SerpFetcher](https://github.com/franzip/serp-fetcher) cache expressed in hours.
92 | 6. `$requestDelay` - integer
93 | - Delay to use between multiple HTTP requests, expressed in microseconds.
94 |
95 | ## Building a Scraper (using Factory)
96 |
97 | Specify the vendor as first argument. You can specify custom settings using an
98 | array as second argument (see the SerpScraper constructor above).
99 |
100 | ```php
101 | use Franzip\SerpScraper\SerpScraperBuilder;
102 |
103 | $googleScraper = SerpScraperBuilder::create('Google', array(array('keyword1',
104 | 'keyword2',
105 | ...)));
106 |
107 | $askScraper = SerpScraperBuilder::create('Ask', array(array('key1', 'key2')));
108 | $bingScraper = SerpScraperBuilder::create('Bing', array(array('baz', 'foo')));
109 | ...
110 | ```
111 |
112 | ## Building a Scraper (with explicit constructors)
113 |
114 | ```php
115 | use Franzip\SerpScraper\Scrapers\GoogleScraper;
116 | use Franzip\SerpScraper\Scrapers\AskScraper;
117 | use Franzip\SerpScraper\Scrapers\BingScraper;
118 | use Franzip\SerpScraper\Scrapers\YahooScraper;
119 |
120 | $googleScraper = new GoogleScraper($keywords = array('foo', 'bar'),
121 | $outDir = 'google_results');
122 | $askScraper = new AskScraper($keywords = array('foo', 'bar'),
123 | $outDir = 'ask_results');
124 | ...
125 | ```
126 |
127 | ## scrape() and scrapeAll()
128 |
129 | You can scrape a single tracked keyword with ```scrape()```, or scrape all the
130 | tracked keywords using ```scrapeAll()```.
131 |
132 | ```scrape()``` signature:
133 | ```php
134 | $serpScraper->scrape($keyword, $pagesToScrape = 1, $toRemove = false,
135 | $timezone = 'UTC', $throttling = true);
136 | ```
137 |
138 | Usage example:
139 |
140 | ```php
141 | // Scrape the first 5 pages for the keyword 'foo', remove it from the tracked
142 | // keywords, use the Los Angeles timezone and don't use throttling.
143 | $serpScraper->scrape('foo', 5, true, 'America/Los_Angeles', false);
144 | ```
145 |
146 | ```scrapeAll()``` signature:
147 |
148 | ```php
149 | $serpScraper->scrapeAll($pagesToScrape = 1, $toRemove = false, $timezone = 'UTC',
150 | $throttling = true);
151 | ```
152 |
153 | Usage example:
154 |
155 | ```php
156 | // Scrape the first 5 pages for all the tracked keywords, remove them all from
157 | // tracked keywords, use the Berlin timezone and don't use throttling.
158 | $serpScraper->scrapeAll(5, true, 'Europe/Berlin', false);
159 | // keywords array has been emptied
160 | var_dump($serpScraper->getKeywords());
161 | // array()
162 | ```
163 |
164 | ## serialize() and getFetchedPages()
165 |
166 | Serialize all the results fetched so far. Supported formats are: JSON, XML and
167 | YAML.
168 | You can access the fetched array by calling ```getFetchedPages()```.
169 |
170 | ```serialize()``` signature:
171 | ```php
172 | $serpScraper->serialize($format, $toRemove = false);
173 | ```
174 |
175 | Usage example:
176 |
177 | ```php
178 | $serpScraper->serialize($format, $toRemove = false);
179 | // serialize to JSON the stuff retrieved so far
180 | $serpScraper->serialize('json');
181 | // serialize to XML the stuff retrieved so far
182 | $serpScraper->serialize('xml');
183 | // fetched pages are still there
184 | var_dump($serpScraper->getFetchedPages());
185 | // array(
186 | // object(Franzip\SerpPageSerializer\Models\SerializableSerpPage) (1),
187 | // ...
188 | // )
189 |
190 | // now serialize to YAML the stuff retrieved so far and empty the fetched data
191 | $serpScraper->serialize('yml', true);
192 | // fetched array is now empty
193 | var_dump($serpScraper->getFetchedPages());
194 | // array()
195 | ```
196 |
197 | ## save() and getSerializedPages()
198 |
199 | Write to files the serialized results so far.
200 | The format used as filename is the following:
201 | *vendor_keyword_pagenumber_time.format* | *google_foo_3_12032015.json*
202 |
203 | ```save()``` signature:
204 | ```php
205 | $serpScraper->save($toRemove = false)
206 | ```
207 |
208 | Usage example:
209 |
210 | ```php
211 | // write serialized results so far to the specified output folder
212 | $serpScraper->save();
213 | // serialized pages are still there
214 | var_dump($serpScraper->getSerializedPages());
215 | // array(
216 | // object(Franzip\SerpPageSerializer\Models\SerializedSerpPage) (1),
217 | // ...
218 | // )
219 |
220 | // write serialized results so far to the specified output folder and remove
221 | // them from the serialized array
222 | $serpScraper->save(true);
223 | // serialized array is now empty
224 | var_dump($serpScraper->getSerializedPages());
225 | // array()
226 | ```
227 |
228 | ## Adding/Removing keywords.
229 |
230 | ```php
231 | $serpScraper->addKeyword('bar');
232 | $serpScraper->addKeywords(array('foo', 'bar', ...));
233 | $serpScraper->removeKeyword('bar');
234 | ```
235 |
236 | ## Cache flushing
237 |
238 | You can call ```flushCache()``` anytime. This will remove all the cached files
239 | used by the ```SerpFetcher``` component and will also remove all the entries
240 | from the fetched and serialized arrays.
241 |
242 | ```php
243 |
244 | $serpScraper->flushCache();
245 | var_dump($serpScraper->getFetchedPages());
246 | // array()
247 | var_dump($serpScraper->getSerializedPages());
248 | // array()
249 | ```
250 |
251 | ## Basic usage
252 |
253 | ```php
254 | use Franzip\SerpScraper\SerpScraperBuilder;
255 |
256 | $googleScraper = SerpScraperBuilder::create('Google', array(array('keyword1',
257 | 'keyword2',
258 | 'keyword3')));
259 | // scrape the first page for 'keyword1'
260 | $googleScraper->scrape('keyword1');
261 | // scrape the first 5 pages for 'keyword2'
262 | $googleScraper->scrape('keyword2', 5);
263 | // serialize to JSON what has been scraped so far
264 | $googleScraper->serialize('json');
265 | //
266 | ...
267 | ```
268 |
269 | ## Using multiple output folders
270 |
271 | You can use different output folders as you see fit. In this case, the same
272 | keywords will be scraped once but the results will be written to different folders,
273 | based on their serialization format.
274 | Since the results are cached, the ```serialize()``` method will use the same
275 | data over and over again.
276 |
277 | ```php
278 | use Franzip\SerpScraper\SerpScraperBuilder;
279 |
280 | $googleScraper = SerpScraperBuilder::create('Google',
281 | array(array('foo', 'baz', ...)));
282 |
283 | // output folders
284 | $xmlDir = 'google_results/xml';
285 | $jsonDir = 'google_results/json';
286 | $yamlDir = 'google_results/yaml';
287 |
288 | ...
289 | // scraping action happens here...
290 |
291 | // write xml results first
292 | $googleScraper->serialize('xml');
293 | $googleScraper->setOutDir($xmlDir);
294 | $googleScraper->save();
295 | // now json
296 | $googleScraper->serialize('json');
297 | $googleScraper->setOutDir($jsonDir);
298 | $googleScraper->save();
299 | // write yaml results; the fetched array is no longer needed, so empty it
300 | $googleScraper->serialize('yml', true);
301 | $googleScraper->setOutDir($yamlDir);
302 | $googleScraper->save();
303 |
304 | ```
305 |
306 | ## TODOs
307 |
308 | - [ ] Avoid request delay on cache hit.
309 | - [ ] Validate YAML results in the tests (couldn't find a suitable library so far).
310 | - [ ] Improve docs with better organization and more examples.
311 | - [ ] Refactoring messy tests.
312 |
313 | ## License
314 | [MIT](http://opensource.org/licenses/MIT/ "MIT") Public License.
315 |
--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "franzip/serp-scraper",
3 | "type": "library",
4 | "description": "A library to extract, serialize and store data scraped on Search Engine result pages.",
5 | "license": "MIT",
6 | "homepage": "http://github.com/franzip/serp-scraper",
7 | "keywords": ["scrape", "scraper", "serp", "data", "harvesting", "search-engine", "google"],
8 | "authors": [{
9 | "name": "Francesco Pezzella",
10 | "email": "franzpezzella@gmail.com"
11 | }],
12 | "minimum-stability": "dev",
13 | "require": {
14 | "php": ">=5.4.0",
15 | "franzip/throttler": "0.2.*@dev",
16 | "franzip/serp-fetcher": "0.2.*@dev",
17 | "franzip/serp-page-serializer": "0.2.*@dev"
18 | },
19 | "require-dev": {
20 | "phpunit/phpunit": "4.0.*",
21 | "seld/jsonlint": "dev-master",
22 | "satooshi/php-coveralls": "dev-master"
23 | },
24 | "autoload": {
25 | "psr-4": {
26 | "Franzip\\SerpScraper\\": "src/"
27 | }
28 | },
29 | "extra": {
30 | "branch-alias": {
31 | "dev-master": "0.1.x-dev"
32 | }
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/phpunit.xml.dist:
--------------------------------------------------------------------------------
1 |
2 |
12 |
13 |
14 | ./src
15 |
16 | ./vendor
17 | ./tests
18 |
19 |
20 |
21 |
22 |
23 | tests/
24 | SerpScraperHelpersTest.php
25 |
26 |
27 | tests/
28 | SerpScraperBuilderTest.php
29 |
30 |
31 | tests/
32 | SerpScraperTest.php
33 |
34 |
35 |
36 |
--------------------------------------------------------------------------------
/serp-scraper.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/franzip/serp-scraper/56bfeec1b2b1a744e16ff657d2f0abf67a21db5d/serp-scraper.jpg
--------------------------------------------------------------------------------
/src/Exceptions/Exception.php:
--------------------------------------------------------------------------------
1 | 0;
64 | }
65 |
/**
 * Check for valid scraping args.
 *
 * Every condition below must hold for the arguments to be accepted:
 * the keywords pass KeywordValidator::validKeywords(), all of them are
 * currently tracked, $pagesToScrape is a positive integer, $toRemove and
 * $throttling are booleans, and $timezone is a string listed by
 * \DateTimeZone::listIdentifiers().
 *
 * @param  array  $keywords         Keywords requested for scraping.
 * @param  int    $pagesToScrape    Number of pages to scrape (must be > 0).
 * @param  bool   $toRemove         Keyword removal flag.
 * @param  string $timezone         Timezone identifier, e.g. 'Europe/Berlin'.
 * @param  bool   $throttling       Throttling flag.
 * @param  array  $trackingKeywords Keywords currently tracked by the scraper.
 * @return bool   True only when every argument is valid.
 */
public static function validScrapeArgs($keywords, $pagesToScrape, $toRemove,
                                       $timezone, $throttling, $trackingKeywords)
{
    return \Franzip\SerpScraper\Helpers\KeywordValidator::validKeywords($keywords)
           && self::keywordsAllTracked($keywords, $trackingKeywords)
           && is_int($pagesToScrape)
           && $pagesToScrape > 0
           && is_bool($toRemove)
           // Strict string match: a loose in_array() would accept values such
           // as true, which compare equal to any non-empty identifier.
           && is_string($timezone)
           && in_array($timezone, \DateTimeZone::listIdentifiers(), true)
           && is_bool($throttling);
}
86 |
/**
 * Check that a keyword is being tracked.
 *
 * The comparison is strict, so numeric-looking keywords are only matched
 * when they are spelled identically ('1e3' does not match '1000').
 *
 * @param  string $keyword          Keyword to look for.
 * @param  array  $trackingKeywords Keywords currently tracked.
 * @return bool   True when $keyword is one of $trackingKeywords.
 */
public static function keywordPresent($keyword, $trackingKeywords)
{
    return in_array($keyword, $trackingKeywords, true);
}
97 |
/**
 * Check that all supplied keywords are being tracked.
 *
 * Keywords are compared strictly, so numeric-looking strings must be
 * spelled identically to match. An empty $keywords array yields true.
 *
 * @param  array $keywords         Keywords to check.
 * @param  array $trackingKeywords Keywords currently tracked.
 * @return bool  False as soon as one keyword is not tracked, true otherwise.
 */
public static function keywordsAllTracked($keywords, $trackingKeywords)
{
    foreach ($keywords as $keyword) {
        // stop at the first untracked keyword
        if (!in_array($keyword, $trackingKeywords, true)) {
            return false;
        }
    }

    return true;
}
110 |
/**
 * Pull out of a SerializableSerpPage the pieces of data used by callers.
 *
 * @param  SerializableSerpPage $serializablePage Page to read from.
 * @return array Three positional items: the page keyword, the page number
 *               and the page age formatted as 'Y-m-d'.
 */
public static function extractSerializablePageData($serializablePage)
{
    return array(
        $serializablePage->getKeyword(),
        $serializablePage->getPageNumber(),
        $serializablePage->getAge()->format('Y-m-d'),
    );
}
123 |
/**
 * Private constructor: this helper only exposes static methods, so it
 * cannot be instantiated from outside the class.
 */
private function __construct() {}
125 | }
126 |
--------------------------------------------------------------------------------
/src/Helpers/SerpUrlGenerator.php:
--------------------------------------------------------------------------------
1 | isDir() && !$fileinfo->isDot()
45 | && !in_array($fileinfo->getFileName(), $dontDelete)) {
46 | self::rrmdir($fileinfo->getFilename());
47 | }
48 | }
49 | }
50 |
/**
 * Expose a non-public method for testing through reflection.
 *
 * The class to reflect is resolved by wrapping $className between
 * Builder::SCRAPER_CLASS_PREFIX and Builder::SCRAPER_CLASS_SUFFIX.
 *
 * @param  string $name      Name of the method to expose.
 * @param  string $className Class name fragment placed between the prefix
 *                           and the suffix.
 * @return \ReflectionMethod Reflected method with accessibility enabled.
 */
public static function getMethod($name, $className)
{
    $reflectedClass = new \ReflectionClass(
        Builder::SCRAPER_CLASS_PREFIX . $className . Builder::SCRAPER_CLASS_SUFFIX
    );

    $reflectedMethod = $reflectedClass->getMethod($name);
    $reflectedMethod->setAccessible(true);

    return $reflectedMethod;
}
65 |
/**
 * Private constructor: the helper is used through its static methods and
 * cannot be instantiated from outside the class.
 */
private function __construct() {}
67 | }
68 |
--------------------------------------------------------------------------------
/src/Scrapers/AskScraper.php:
--------------------------------------------------------------------------------
1 |
7 | * @link https://github.com/franzip/serp-scraper
8 | * @copyright Copyright 2015 Francesco Pezzella
9 | * @license http://www.opensource.org/licenses/mit-license.php MIT License
10 | * @package SerpScraper
11 | */
12 |
13 | namespace Franzip\SerpScraper\Scrapers;
14 |
/**
 * SerpScraper implementation for the Ask search engine.
 *
 * The class body is intentionally empty: everything is inherited from the
 * abstract SerpScraper parent, and this subclass only identifies the engine.
 *
 * @package SerpScraper
 */
class AskScraper extends SerpScraper
{

}
24 |
--------------------------------------------------------------------------------
/src/Scrapers/BingScraper.php:
--------------------------------------------------------------------------------
1 |
7 | * @link https://github.com/franzip/serp-scraper
8 | * @copyright Copyright 2015 Francesco Pezzella
9 | * @license http://www.opensource.org/licenses/mit-license.php MIT License
10 | * @package SerpScraper
11 | */
12 |
13 | namespace Franzip\SerpScraper\Scrapers;
14 |
/**
 * SerpScraper implementation for the Bing search engine.
 *
 * The class body is intentionally empty: everything is inherited from the
 * abstract SerpScraper parent, and this subclass only identifies the engine.
 *
 * @package SerpScraper
 */
class BingScraper extends SerpScraper
{

}
24 |
--------------------------------------------------------------------------------
/src/Scrapers/GoogleScraper.php:
--------------------------------------------------------------------------------
1 |
7 | * @link https://github.com/franzip/serp-scraper
8 | * @copyright Copyright 2015 Francesco Pezzella
9 | * @license http://www.opensource.org/licenses/mit-license.php MIT License
10 | * @package SerpScraper
11 | */
12 |
13 | namespace Franzip\SerpScraper\Scrapers;
14 |
/**
 * SerpScraper implementation for the Google search engine.
 *
 * The class body is intentionally empty: everything is inherited from the
 * abstract SerpScraper parent, and this subclass only identifies the engine.
 *
 * @package SerpScraper
 */
class GoogleScraper extends SerpScraper
{

}
24 |
--------------------------------------------------------------------------------
/src/Scrapers/SerpScraper.php:
--------------------------------------------------------------------------------
1 |
7 | * @link https://github.com/franzip/serp-scraper
8 | * @copyright Copyright 2015 Francesco Pezzella
9 | * @license http://www.opensource.org/licenses/mit-license.php MIT License
10 | * @package SerpScraper
11 | */
12 |
13 | namespace Franzip\SerpScraper\Scrapers;
14 | use Franzip\SerpScraper\Helpers\SerpScraperHelper;
15 | use Franzip\SerpScraper\Helpers\FileSystemHelper;
16 | use Franzip\SerpScraper\Helpers\SerpUrlGenerator;
17 | use Franzip\SerpScraper\Helpers\KeywordValidator;
18 | use Franzip\SerpFetcher\SerpFetcherBuilder;
19 | use Franzip\SerpPageSerializer\Models\SerializableSerpPage;
20 | use Franzip\SerpPageSerializer\SerpPageSerializer;
21 | use Franzip\Throttler\Throttler;
22 |
23 | /**
24 | * Abstract class describing a SerpScraper.
25 | * The whole implementation is here and the concrete children classes have the sole
26 | * purpose to isolate different search engine scrapers and to allow dependencies
27 | * to work properly.
28 | * Once added some keywords through the constructor or through the addKeyword()
29 | * and addKeywords() method, the instance will be ready to scrape.
30 | *
31 | * Caching/Throttling/Delaying
32 | *
33 | * Since scraping legal status seems to be quite disputed and nobody likes
34 | * jerks, this implementation tries to avoid HTTP overhead using three simple
35 | * strategies.
36 | * The first is caching: the internal component of the class that
37 | * takes care of fetching data from the Internet (SerpFetcher) uses caching, so
38 | * scraping the same page over and over again will result in a single
39 | * HTTP request (cache expiration is set to 24 hours by default).
40 | * The second is throttling: an internal component of the class takes care of
41 | * capping HTTP requests (default cap is 15 requests per hour). Scraped data
42 | * retrieved from the cache are not counted.
43 | * The third is delaying: by default, a 500 microsecond (0.5 ms) delay takes
44 | * place between each HTTP request needed to retrieve data.
45 | *
46 | *_____________________________________________________________________________
47 | * Scrape | Serialize | Store |
48 | *________________________|_________________________|_________________________|
49 | * | | |
50 | * scrape()|scrapeAll() | serialize() | save() |
51 | * | | |
52 | *________________________|_________________________|_________________________|
53 | * Input | Input | Input |
54 | *________________________|_________________________|_________________________|
55 | * | | |
56 | * HTTP request|Cache Hit | SerializableSerpPage | SerializedSerpPage |
57 | * | array | array |
58 | *________________________|_________________________|_________________________|
59 | * Output | Output | Output |
60 | *________________________|_________________________|_________________________|
61 | * | | |
62 | * SerializableSerpPage | SerializedSerpPage | JSON|XML|YAML |
63 | * array | array | files |
64 | *________________________|_________________________|_________________________|
65 | *
66 | *
67 | * Scraping
68 | *
69 | * It is possible to scrape a single keyword with the scrape() method, or to scrape
70 | * all the added keywords together with the scrapeAll() method.
71 | * The scraped data will be available as SerializableSerpPage objects in the
72 | * $fetched array.
73 | *
74 | * Serializing
75 | *
76 | * Once some keywords have been scraped, the fetched data can be serialized through
77 | * the serialize() method (only JSON, XML and YAML are supported). This method
78 | * serializes all the data sitting in the $fetched array and
79 | * populates the $serialized array with SerializedSerpPage objects.
80 | *
81 | * Writing serialized data to file
82 | *
83 | * Storing data to disk is easy. The save() method takes care of writing the
84 | * content of all SerializedSerpPage objects to different files format (XML,
85 | * JSON, YAML) in the specified output folder.
86 | *
87 | *
88 | * @package SerpScraper
89 | */
90 | abstract class SerpScraper
91 | {
92 | // namespacing constants
93 | const SCRAPERS_PREFIX = 'Franzip\SerpScraper\Scrapers\\';
94 | const SCRAPERS_SUFFIX = 'Scraper';
95 | // default results per page
96 | const DEFAULT_RESULTS_PER_PAGE = 10;
97 | // default timezone
98 | const DEFAULT_TIMEZONE = 'UTC';
99 |
100 | // Throttler
101 | // Allow 15 requests per hour (cache hits are not considered)
102 | const DEFAULT_THROTTLER_NAME = 'http_requests';
103 | const DEFAULT_THROTTLER_THRESHOLD = 15;
104 | const DEFAULT_THROTTLER_METRIC = 'hrs';
105 | const DEFAULT_THROTTLER_METRIC_FACTOR = 1;
106 | const DEFAULT_THROTTLER_COMPONENT_THRESHOLD = null;
107 |
108 | // SerpFetcher
109 | // SerpFetcher result array keys
110 | const SERP_FETCHER_URLS = 'urls';
111 | const SERP_FETCHER_SNIPPETS = 'snippets';
112 | const SERP_FETCHER_TITLES = 'titles';
113 | // default fetcher cache dir
114 | const DEFAULT_FETCHER_CACHE_DIR = 'fetcher_cache';
115 | // default fetcher cache time to live in hours
116 | const DEFAULT_FETCHER_CACHE_TTL = 24;
117 |
118 | // SerpPageSerializer
119 | // default serializer cache dir
120 | const DEFAULT_SERIALIZER_CACHE_DIR = 'serializer_cache';
121 |
122 | // SerpScraper
123 | // default number of pages to scrape
124 | const DEFAULT_PAGES_TO_SCRAPE = 1;
125 | // default output dir
126 | const DEFAULT_OUTPUT_DIR = 'out';
127 | // default request delay in microseconds
128 | const DEFAULT_REQUEST_DELAY = 500;
129 |
130 | // supported serialization format
131 | private static $supportedSerializationFormat = array('json', 'xml', 'yml');
132 |
133 | // dependencies to inject
134 | // Throttler component
135 | private $throttler;
136 | // SerpFetcher component
137 | private $fetcher;
138 | // SerpPageSerializer component
139 | private $serializer;
140 |
141 | // instance variables
142 | // output folder
143 | private $outDir;
144 | // fetcher cache folder
145 | private $fetcherCacheDir;
146 | // serializer cache folder
147 | private $serializerCacheDir;
148 | // cache expiration time
149 | private $cacheTTL;
150 | // request delay time in microseconds
151 | private $requestDelay;
152 | // keywords to scrape
153 | private $keywords;
154 | // store fetched objects
155 | private $fetched;
156 | // store serialized pages
157 | private $serialized;
158 |
/**
 * Create a SerpScraper object.
 *
 * Arguments are validated through SerpScraperHelper::checkArgs(), each
 * keyword is normalized with KeywordValidator::processKeyword(), the output
 * and serializer cache folders are set up, and the Throttler, SerpFetcher
 * and SerpPageSerializer components are instantiated. Throttling is started
 * immediately.
 *
 * @param array  $keywords           Keywords to track.
 * @param string $outDir             Output folder for saved files.
 * @param string $fetcherCacheDir    Cache folder handed to the SerpFetcher.
 * @param string $serializerCacheDir Cache folder handed to the SerpPageSerializer.
 * @param int    $cacheTTL           Fetcher cache time to live, in hours.
 * @param int    $requestDelay       Delay between requests, in microseconds.
 */
public function __construct($keywords,
                            $outDir             = self::DEFAULT_OUTPUT_DIR,
                            $fetcherCacheDir    = self::DEFAULT_FETCHER_CACHE_DIR,
                            $serializerCacheDir = self::DEFAULT_SERIALIZER_CACHE_DIR,
                            $cacheTTL           = self::DEFAULT_FETCHER_CACHE_TTL,
                            $requestDelay       = self::DEFAULT_REQUEST_DELAY)
{
    // perform validation
    SerpScraperHelper::checkArgs($keywords, $outDir, $fetcherCacheDir,
                                 $serializerCacheDir, $cacheTTL, $requestDelay);
    // instance variables
    $this->outDir             = $outDir;
    $this->fetcherCacheDir    = $fetcherCacheDir;
    $this->serializerCacheDir = $serializerCacheDir;
    $this->cacheTTL           = $cacheTTL;
    $this->requestDelay       = $requestDelay;
    $this->keywords           = array();
    $this->fetched            = array();
    $this->serialized         = array();
    // normalize user input keywords; foreach does not assume the array is
    // indexed 0..n-1, so sparse or string keys cannot hit an undefined offset
    foreach ($keywords as $keyword) {
        $this->keywords[] = KeywordValidator::processKeyword($keyword);
    }
    // set up folders
    FileSystemHelper::setUpDir($outDir);
    FileSystemHelper::setUpDir($serializerCacheDir);
    // deps injection
    $this->throttler = new Throttler(self::DEFAULT_THROTTLER_NAME,
                                     self::DEFAULT_THROTTLER_THRESHOLD,
                                     self::DEFAULT_THROTTLER_METRIC,
                                     self::DEFAULT_THROTTLER_METRIC_FACTOR,
                                     self::DEFAULT_THROTTLER_COMPONENT_THRESHOLD,
                                     $this->keywords);
    // turn on throttling
    $this->throttler->start();
    // instantiate the right fetcher at runtime (will also setup fetcher cache dir)
    $this->fetcher = SerpFetcherBuilder::create(self::runTimeClassName(),
                                                array($this->fetcherCacheDir,
                                                      $this->cacheTTL));
    $this->serializer = new SerpPageSerializer($serializerCacheDir);
}
209 |
/**
 * Scrape one keyword (or, when called by scrapeAll(), an array of keywords).
 *
 * For every keyword, $pagesToScrape pages are fetched; each one is wrapped
 * in a SerializableSerpPage and appended to the fetched array, ready to be
 * serialized. After each page the method sleeps for the request delay
 * configured on this instance.
 *
 * When $toRemove is true, each keyword is removed from the tracked keywords
 * once its pages have been scraped. $timezone is installed as the default
 * timezone before the page timestamps are created. Setting $throttling to
 * false skips the allowedScrapeOperation() check.
 *
 * @param  string|array $keyword       A tracked keyword, or an array of them.
 * @param  int          $pagesToScrape Number of pages to scrape per keyword.
 * @param  bool         $toRemove      Remove scraped keywords from tracking.
 * @param  string       $timezone      Timezone identifier.
 * @param  bool         $throttling    Whether to enforce throttling.
 * @return bool False when the arguments are invalid or, with throttling on,
 *              when the operation is not allowed; true otherwise.
 */
public function scrape($keyword,
                       $pagesToScrape = self::DEFAULT_PAGES_TO_SCRAPE,
                       $toRemove      = false,
                       $timezone      = self::DEFAULT_TIMEZONE,
                       $throttling    = true)
{
    // allow scrapeAll() to reuse scrape()
    if (is_string($keyword))
        $keyword = array($keyword);
    // perform validations
    if (!SerpScraperHelper::validScrapeArgs($keyword, $pagesToScrape,
                                            $toRemove, $timezone,
                                            $throttling, $this->keywords))
        return false;
    // map keywords to array of urls ready to scrape
    $urlsToScrape = $this->mapKeywordsToUrls($pagesToScrape, $keyword);
    // check for legal operation only if throttling
    if ($throttling && !$this->allowedScrapeOperation($pagesToScrape, $urlsToScrape))
        return false;
    // avoid DateTime() annoying notices
    date_default_timezone_set($timezone);
    // loop over the keywords to scrape; foreach does not assume the array
    // is indexed 0..n-1
    foreach ($keyword as $key) {
        // scrape $pagesToScrape pages for each keyword
        for ($j = 0; $j < $pagesToScrape; $j++) {
            $pageUrl    = $urlsToScrape[$key][$j];
            $fetched    = $this->fetchPage($key, $pageUrl);
            $entries    = $this->makeEntries($fetched);
            $engine     = strtolower(self::runTimeClassName());
            $pageNumber = $j + 1;
            $age        = new \DateTime();
            $age->setTimeStamp(time());
            // construct a SerializableSerpPage and store it
            $serializablePage = new SerializableSerpPage($engine, $key, $pageUrl,
                                                         $pageNumber, $age,
                                                         $entries);
            array_push($this->fetched, $serializablePage);
            // delay between requests: honour the delay passed to the
            // constructor instead of always using the class default
            usleep($this->requestDelay);
        }
        // remove the key from the queue if specified
        if ($toRemove)
            $this->removeKeyword($key);
    }

    return true;
}
273 |
/**
 * Scrape every tracked keyword in one call by delegating to scrape().
 *
 * @param  int    $pagesToScrape Number of pages to scrape per keyword.
 * @param  bool   $toRemove      Forwarded to scrape().
 * @param  string $timezone      Forwarded to scrape().
 * @param  bool   $throttling    Forwarded to scrape().
 * @return bool   Whatever scrape() returns for the tracked keywords.
 */
public function scrapeAll($pagesToScrape = self::DEFAULT_PAGES_TO_SCRAPE,
                          $toRemove      = false,
                          $timezone      = self::DEFAULT_TIMEZONE,
                          $throttling    = true)
{
    // hand the whole tracked list over to scrape()
    $trackedKeywords = $this->keywords;

    return $this->scrape($trackedKeywords, $pagesToScrape, $toRemove,
                         $timezone, $throttling);
}
290 |
/**
 * Write all the serialized objects in the output dir.
 *
 * Each entry of the serialized array is written to a file whose name is
 * derived from the entry key through FileSystemHelper::generateFileName().
 * A failed write does not stop the remaining files from being written, but
 * it is reported through the return value.
 *
 * If the related flag $toRemove is on, the serialized queue is emptied,
 * but only when every file has been written, so unsaved results are kept.
 *
 * @param  bool $toRemove Empty the serialized queue after a successful save.
 * @return bool False when there is nothing to save or when at least one
 *              file could not be written; true otherwise.
 */
public function save($toRemove = false)
{
    // fail if there are no results to write to disk
    if (empty($this->serialized))
        return false;

    $allWritten = true;
    // loop over serialized objects
    foreach ($this->serialized as $key => $serializedObject) {
        // generate filenames
        $filename = FileSystemHelper::generateFileName($key);
        // write files in the output dir
        $written = file_put_contents($this->getOutDir() . DIRECTORY_SEPARATOR . $filename,
                                     $serializedObject->getContent());
        // file_put_contents() returns false on failure (0 is a valid byte count)
        if ($written === false)
            $allWritten = false;
    }

    if ($toRemove && $allWritten)
        $this->serialized = array();

    return $allWritten;
}
316 |
/**
 * Serialize every page of the fetched queue.
 *
 * Each serialized result is stored in the serialized queue under a key
 * built by FileSystemHelper::generateArrKey() from the engine name, the
 * keyword, the page number, the page age and the format.
 * If $toRemove is on, the fetched queue is emptied afterwards.
 *
 * @param  string $format   Serialization format.
 * @param  bool   $toRemove Empty the fetched queue once done.
 * @return bool   False when there is nothing to serialize or the format is
 *                not supported, true otherwise.
 */
public function serialize($format, $toRemove = false)
{
    // nothing to do
    if (empty($this->fetched))
        return false;
    // unknown serialization format
    if (!self::supportedFormat($format))
        return false;

    // the engine name is the same for every page
    $engineName = self::runTimeClassName();

    foreach ($this->fetched as $page) {
        list($keyword, $pageNumber, $age) = SerpScraperHelper::extractSerializablePageData($page);
        $arrKey = FileSystemHelper::generateArrKey($engineName, $keyword,
                                                   $pageNumber, $age, $format);
        $this->serialized[$arrKey] = $this->serializer->serialize($page, $format);
    }

    if ($toRemove) {
        $this->fetched = array();
    }

    return true;
}
351 |
/**
 * Flush the cache of the underlying Fetcher object, then empty both the
 * fetched and the serialized queues.
 */
public function flushCache()
{
    $this->fetcher->flushCache();
    $this->fetched = $this->serialized = array();
}
362 |
/**
 * Give access to the Fetcher object used to retrieve the SERP pages.
 * @return AskFetcher|BingFetcher|GoogleFetcher|YahooFetcher
 */
public function getFetcher()
{
    return $this->fetcher;
}
371 |
/**
 * Give access to the Throttler object used by this scraper.
 * @return Throttler
 */
public function getThrottler()
{
    return $this->throttler;
}
380 |
/**
 * Give access to the SerpPageSerializer object used by this scraper.
 * @return SerpPageSerializer
 */
public function getSerializer()
{
    return $this->serializer;
}
389 |
/**
 * Get the serialized queue, i.e. the results produced by serialize().
 * @return array
 */
public function getSerializedPages()
{
    return $this->serialized;
}
398 |
/**
 * Get the fetched queue, i.e. the pages waiting to be serialized.
 * @return array
 */
public function getFetchedPages()
{
    return $this->fetched;
}
407 |
/**
 * Get the path to the output folder.
 * @return string
 */
public function getOutDir()
{
    return $this->outDir;
}
416 |
/**
 * Change the output folder.
 *
 * The new path is accepted only if it passes
 * SerpScraperHelper::validateDirName() and
 * FileSystemHelper::preventCacheCollision() against the two cache folders.
 * On success the path is stored and handed to FileSystemHelper::setUpDir().
 *
 * @param  string $dir
 * @return bool   True if the folder has been changed, false otherwise.
 */
public function setOutDir($dir)
{
    $accepted = SerpScraperHelper::validateDirName($dir)
        && FileSystemHelper::preventCacheCollision($dir, $this->fetcherCacheDir,
                                                   $this->serializerCacheDir);
    if (!$accepted) {
        return false;
    }

    $this->outDir = $dir;
    FileSystemHelper::setUpDir($dir);

    return true;
}
433 |
/**
 * Get the path to the fetcher cache folder.
 * @return string
 */
public function getFetcherCacheDir()
{
    return $this->fetcherCacheDir;
}
442 |
/**
 * Change the fetcher cache folder.
 *
 * The new path is accepted only if it passes
 * SerpScraperHelper::validateDirName() and
 * FileSystemHelper::preventCacheCollision() against the output folder and
 * the serializer cache folder. On success the path is stored and handed to
 * FileSystemHelper::setUpDir().
 *
 * @param  string $dir
 * @return bool   True if the folder has been changed, false otherwise.
 */
public function setFetcherCacheDir($dir)
{
    $accepted = SerpScraperHelper::validateDirName($dir)
        && FileSystemHelper::preventCacheCollision($this->outDir, $dir,
                                                   $this->serializerCacheDir);
    if (!$accepted) {
        return false;
    }

    $this->fetcherCacheDir = $dir;
    FileSystemHelper::setUpDir($dir);

    return true;
}
459 |
/**
 * Get the path to the serializer cache folder.
 * @return string
 */
public function getSerializerCacheDir()
{
    return $this->serializerCacheDir;
}
468 |
/**
 * Get the cache lifetime, expressed in hours.
 * @return int
 */
public function getCacheTTL()
{
    return $this->cacheTTL;
}
477 |
/**
 * Change the cache lifetime, expressed in hours.
 *
 * The value is accepted only if it passes
 * SerpScraperHelper::validateExpirationTime(); it is then stored and
 * propagated to the underlying Fetcher object.
 *
 * @param  int  $hours
 * @return bool True if the lifetime has been changed, false otherwise.
 */
public function setCacheTTL($hours)
{
    if (!SerpScraperHelper::validateExpirationTime($hours)) {
        return false;
    }

    $this->cacheTTL = $hours;
    $this->fetcher->setCacheTTL($hours);

    return true;
}
492 |
/**
 * Get the pause observed between two requests, in microseconds.
 * @return int
 */
public function getRequestDelay()
{
    return $this->requestDelay;
}
501 |
/**
 * Change the pause observed between two requests, in microseconds.
 *
 * The value goes through the same check used for the cache lifetime,
 * SerpScraperHelper::validateExpirationTime().
 *
 * @param  int  $microseconds
 * @return bool True if the delay has been changed, false otherwise.
 */
public function setRequestDelay($microseconds)
{
    if (!SerpScraperHelper::validateExpirationTime($microseconds)) {
        return false;
    }

    $this->requestDelay = $microseconds;

    return true;
}
515 |
/**
 * Get the queue of keywords to scrape.
 * @return array
 */
public function getKeywords()
{
    return $this->keywords;
}
524 |
/**
 * Add a keyword to the queue and register it in the Throttler object.
 *
 * The keyword is rejected when KeywordValidator::isValid() fails or when
 * SerpScraperHelper::keywordPresent() reports it as already queued.
 * The value actually stored is the one returned by
 * KeywordValidator::processKeyword(). The Throttler is stopped while the
 * new component is added and resumed right after.
 *
 * @param  string $keyword
 * @return bool   True if the keyword has been added, false otherwise.
 */
public function addKeyword($keyword)
{
    $accepted = KeywordValidator::isValid($keyword)
        && !SerpScraperHelper::keywordPresent($keyword, $this->keywords);
    if (!$accepted) {
        return false;
    }

    $processed = KeywordValidator::processKeyword($keyword);
    array_push($this->keywords, $processed);

    $this->throttler->stop();
    $this->throttler->addComponents($processed);
    $this->throttler->resume();

    return true;
}
543 |
/**
 * Add several keywords to the queue, one addKeyword() call per element.
 *
 * The outcome of the single addKeyword() calls is not reported: once the
 * array as a whole has been accepted the method returns true.
 *
 * @param  array $keywords
 * @return bool  False when the array is rejected by
 *               KeywordValidator::validKeywords() or is empty, true otherwise.
 */
public function addKeywords($keywords)
{
    if (!KeywordValidator::validKeywords($keywords) || empty($keywords)) {
        return false;
    }

    $total = count($keywords);
    for ($idx = 0; $idx < $total; $idx++) {
        $this->addKeyword($keywords[$idx]);
    }

    return true;
}
559 |
/**
 * Remove a keyword from the queue and reindex the queue.
 *
 * This will not remove the keyword from the underlying Throttler object,
 * since it's possible to add the keyword back again and generate new
 * request hits that still need to be throttled.
 *
 * @param  string $keyword
 * @return bool   True if the keyword has been removed, false if it was not
 *                found in the queue.
 */
public function removeKeyword($keyword)
{
    if (!SerpScraperHelper::keywordPresent($keyword, $this->keywords))
        return false;

    // strict search: a loose comparison could match a different entry
    $toDel = array_search($keyword, $this->keywords, true);
    // keywordPresent() is not guaranteed to agree with array_search()
    // (its matching rules are defined elsewhere); without this guard
    // unset($this->keywords[false]) would silently drop index 0
    if ($toDel === false)
        return false;

    unset($this->keywords[$toDel]);
    // keep the queue a sequential list
    $this->keywords = array_values($this->keywords);

    return true;
}
578 |
/**
 * Fetch a SERP page through the underlying Fetcher object.
 *
 * When the url is not reported as a cache hit by the Fetcher, the Throttler
 * component bound to the keyword is updated before fetching.
 *
 * @param  string $keyword
 * @param  string $url
 * @return array  The value returned by the Fetcher's fetch().
 */
private function fetchPage($keyword, $url)
{
    $cached = $this->fetcher->cacheHit($url);
    if (!$cached) {
        $this->throttler->updateComponent($keyword);
    }

    return $this->fetcher->fetch($url);
}
592 |
/**
 * Turn a fetched page into the list of entries expected by the
 * SerializableSerpPage constructor.
 *
 * The urls, snippets and titles of the fetched page are read position by
 * position, for DEFAULT_RESULTS_PER_PAGE positions, and grouped in
 * 'url'/'snippet'/'title' arrays. Padded entries are left out.
 *
 * @param  array $fetchedPage
 * @return array
 */
private function makeEntries($fetchedPage)
{
    $result = array();

    for ($pos = 0; $pos < self::DEFAULT_RESULTS_PER_PAGE; $pos++) {
        $candidate = array();
        $candidate['url']     = $fetchedPage[self::SERP_FETCHER_URLS][$pos];
        $candidate['snippet'] = $fetchedPage[self::SERP_FETCHER_SNIPPETS][$pos];
        $candidate['title']   = $fetchedPage[self::SERP_FETCHER_TITLES][$pos];

        // padding is not a real result
        if ($this->paddedEntry($candidate)) {
            continue;
        }
        $result[] = $candidate;
    }

    return $result;
}
613 |
/**
 * Tell whether an entry is just padding, i.e. its url, title and snippet
 * all equal SerpFetcher::DEFAULT_PAD_ENTRY.
 *
 * @param  array $entry
 * @return bool
 */
private function paddedEntry($entry)
{
    $pad = \Franzip\SerpFetcher\Fetchers\SerpFetcher::DEFAULT_PAD_ENTRY;

    foreach (array('url', 'title', 'snippet') as $field) {
        // one real value is enough to rule out padding
        if ($entry[$field] != $pad) {
            return false;
        }
    }

    return true;
}
625 |
/**
 * Map keywords to the urls ready to be scraped.
 *
 * The result is keyed by keyword; each value is the list of urls built by
 * SerpUrlGenerator::makeUrl() for the page offsets 0 .. $pagesToScrape - 1.
 * A single keyword given as a string is treated as a one-element array.
 *
 * @param  int          $pagesToScrape
 * @param  string|array $keywords
 * @return array
 */
private function mapKeywordsToUrls($pagesToScrape, $keywords)
{
    $urls = array();
    if (is_string($keywords))
        $keywords = array($keywords);

    $engine = self::runTimeClassName();
    foreach ($keywords as $keyword) {
        // key the bucket by the keyword being mapped, not by the tracked
        // queue: the two differ whenever a subset (or a single keyword) is
        // scraped
        $urls[$keyword] = array();
        for ($page = 0; $page < $pagesToScrape; $page++) {
            $urls[$keyword][] = SerpUrlGenerator::makeUrl($engine, $keyword, $page);
        }
    }

    return $urls;
}
646 |
/**
 * Tell whether a scrape operation is allowed by the Throttler.
 *
 * If the Throttler time has expired its instance is refreshed and the
 * operation is allowed. Otherwise the requests needed by the operation are
 * counted with hitCounter() and validated with hitChecker().
 *
 * @param  int   $pagesToScrape Not used by the check.
 * @param  array $urlsToCheck   Urls to scrape, grouped by keyword.
 * @return bool
 */
private function allowedScrapeOperation($pagesToScrape, $urlsToCheck)
{
    if (!$this->throttler->timeExpired()) {
        list($globalHits, $hitsPerKeyword) = $this->hitCounter($urlsToCheck);

        return $this->hitChecker($globalHits, $hitsPerKeyword);
    }

    $this->throttler->refreshInstance();

    return true;
}
662 |
/**
 * Compute global and per-keyword HTTP requests needed to complete a
 * scraping operation. Urls reported as cache hits by the Fetcher are
 * ignored.
 *
 * Every tracked keyword gets a counter, starting from 0. Keywords that
 * appear in $urlsArr without being tracked get a counter as well.
 *
 * @param  array $urlsArr Urls to scrape, grouped by keyword.
 * @return array Two elements: the global hit count (int) and the
 *               per-keyword hit counts (array, keyed by keyword).
 */
private function hitCounter($urlsArr)
{
    $globalHitCount    = 0;
    $componentHitCount = array();
    // initialize per-component hit array
    foreach ($this->keywords as $trackedKeyword) {
        $componentHitCount[$trackedKeyword] = 0;
    }

    $fetcher = $this->getFetcher();
    foreach ($urlsArr as $keyword => $urls) {
        // the keyword may not be part of the tracked queue: give it a
        // counter instead of incrementing an undefined index
        if (!isset($componentHitCount[$keyword]))
            $componentHitCount[$keyword] = 0;
        foreach ($urls as $url) {
            // increase hits only on HTTP requests, ignore cache hit
            if (!$fetcher->cacheHit($url)) {
                $globalHitCount += 1;
                $componentHitCount[$keyword] += 1;
            }
        }
    }

    return array($globalHitCount, $componentHitCount);
}
688 |
/**
 * Validate the requests needed by a scraping operation against the
 * Throttler thresholds.
 *
 * Global check: the needed hits plus the Throttler counter must stay below
 * the global threshold.
 * Per-keyword check (only when the component threshold is not null): the
 * needed hits plus the hits already tracked for the keyword must not exceed
 * the component threshold.
 *
 * @param  int   $globalHitCount
 * @param  array $componentHitCount Hit counts keyed by keyword.
 * @return bool  True only if both checks pass.
 */
private function hitChecker($globalHitCount, $componentHitCount)
{
    $projectedTotal = $globalHitCount + $this->throttler->getCounter();
    $globalCheck    = $projectedTotal < $this->throttler->getGlobalThreshold();

    $perKeywordLimit = $this->getThrottler()->getComponentThreshold();
    // per-keyword throttling is off: the global check decides
    if ($perKeywordLimit === null) {
        return $globalCheck;
    }

    $trackedHits = $this->getThrottler()->getComponents();
    foreach ($componentHitCount as $keyword => $neededHits) {
        if ($neededHits + $trackedHits[$keyword] > $perKeywordLimit) {
            // one keyword over the limit is enough to deny the operation
            return false;
        }
    }

    return $globalCheck;
}
713 |
/**
 * Work out the search engine name from the class the method is called on:
 * SCRAPERS_PREFIX and SCRAPERS_SUFFIX are stripped from the name returned
 * by get_called_class().
 *
 * @return string
 */
private static function runTimeClassName()
{
    $calledClass = get_called_class();
    $toStrip     = array(self::SCRAPERS_PREFIX, self::SCRAPERS_SUFFIX);

    return str_replace($toStrip, '', $calledClass);
}
723 |
/**
 * Tell whether a serialization format is supported. The lowercased format
 * is looked up in the $supportedSerializationFormat list.
 *
 * @param  string $format
 * @return bool
 */
private static function supportedFormat($format)
{
    $normalized = strtolower($format);

    return in_array($normalized, self::$supportedSerializationFormat);
}
733 | }
734 |
--------------------------------------------------------------------------------
/src/Scrapers/YahooScraper.php:
--------------------------------------------------------------------------------
1 |
7 | * @link https://github.com/franzip/serp-scraper
8 | * @copyright Copyright 2015 Francesco Pezzella
9 | * @license http://www.opensource.org/licenses/mit-license.php MIT License
10 | * @package SerpScraper
11 | */
12 |
13 | namespace Franzip\SerpScraper\Scrapers;
14 |
/**
 * Scraper for the Yahoo search engine.
 *
 * The class adds nothing to SerpScraper: everything is inherited.
 *
 * @package SerpScraper
 */
class YahooScraper extends SerpScraper
{
}
24 |
--------------------------------------------------------------------------------
/src/SerpScraperBuilder.php:
--------------------------------------------------------------------------------
1 |
7 | * @link https://github.com/franzip/serp-scraper
8 | * @copyright Copyright 2015 Francesco Pezzella
9 | * @license http://www.opensource.org/licenses/mit-license.php MIT License
10 | * @package SerpScraper
11 | */
12 |
13 | namespace Franzip\SerpScraper;
14 |
/**
 * SerpScraper Factory.
 *
 * Static-only class: the scraper class name is built from the engine name
 * and instantiated through reflection.
 *
 * @package SerpScraper
 */
class SerpScraperBuilder
{
    // namespace constants
    const SCRAPER_CLASS_PREFIX = '\\Franzip\\SerpScraper\\Scrapers\\';
    const SCRAPER_CLASS_SUFFIX = 'Scraper';
    // implemented scrapers
    private static $supportedEngines = array('google', 'yahoo', 'bing', 'ask');

    /**
     * Return a SerpScraper implementation for a given search engine.
     * The engine name is case insensitive.
     *
     * @param  string     $engine
     * @param  null|array $args   Constructor arguments for the scraper; null
     *                            means no arguments.
     * @return mixed
     * @throws \Franzip\SerpScraper\Exceptions\UnsupportedEngineException
     *         When $engine is not a string or is not a supported engine.
     */
    public static function create($engine, $args = null)
    {
        // check the type before lowercasing: strtolower() on a non-string
        // would raise a warning/TypeError instead of the documented exception
        if (is_string($engine)) {
            $engine = strtolower($engine);
            if (self::validEngine($engine)) {
                return self::createWithArgs($engine, isset($args) ? $args : array());
            }
        }
        throw new \Franzip\SerpScraper\Exceptions\UnsupportedEngineException('Unknown or unsupported Search Engine.');
    }

    /**
     * Use reflection to instantiate the right Scraper at runtime.
     *
     * @param  string $engine Lowercase name of a supported engine.
     * @param  array  $args   Constructor arguments.
     * @return mixed
     */
    private static function createWithArgs($engine, $args)
    {
        $className = self::SCRAPER_CLASS_PREFIX . ucfirst($engine) . self::SCRAPER_CLASS_SUFFIX;
        $reflector = new \ReflectionClass($className);

        return $reflector->newInstanceArgs($args);
    }

    /**
     * Check if there is a SerpScraper implementation for the given search engine.
     *
     * @param  string $engine
     * @return bool
     */
    private static function validEngine($engine)
    {
        return is_string($engine) && in_array($engine, self::$supportedEngines, true);
    }

    /**
     * Make the class static.
     */
    private function __construct() {}
}
72 |
--------------------------------------------------------------------------------
/tests/SerpScraperBuilderTest.php:
--------------------------------------------------------------------------------
1 | invalidEngines = $invalidEngines;
16 | }
17 |
18 | /**
19 | * @expectedException \Franzip\SerpScraper\Exceptions\UnsupportedEngineException
20 | * @expectedExceptionMessage Unknown or unsupported Search Engine.
21 | */
22 | public function testInvalidEngineArgument1()
23 | {
24 | $foo = Builder::create($this->invalidEngines[0]);
25 | }
26 |
27 | /**
28 | * @expectedException \Franzip\SerpScraper\Exceptions\UnsupportedEngineException
29 | * @expectedExceptionMessage Unknown or unsupported Search Engine.
30 | */
31 | public function testInvalidEngineArgument2()
32 | {
33 | $foo = Builder::create($this->invalidEngines[1]);
34 | }
35 |
36 | /**
37 | * @expectedException \Franzip\SerpScraper\Exceptions\UnsupportedEngineException
38 | * @expectedExceptionMessage Unknown or unsupported Search Engine.
39 | */
40 | public function testInvalidEngineArgument3()
41 | {
42 | $foo = Builder::create($this->invalidEngines[2]);
43 | }
44 |
45 | /**
46 | * @expectedException \Franzip\SerpScraper\Exceptions\UnsupportedEngineException
47 | * @expectedExceptionMessage Unknown or unsupported Search Engine.
48 | */
49 | public function testInvalidEngineArgument4()
50 | {
51 | $foo = Builder::create($this->invalidEngines[3]);
52 | }
53 | }
54 |
/**
 * The Builder must reject invalid constructor arguments with an
 * InvalidArgumentException, whatever the engine.
 *
 * $engines layout: 0 => google, 1 => ask, 2 => bing, 3 => yahoo.
 */
class BuilderInvalidArgsTest extends PHPUnit_Framework_TestCase
{
    protected $invalidKeywords;
    protected $invalidTime;
    protected $invalidDirs;
    protected $engines;

    protected function setUp()
    {
        $this->invalidDirs     = array(1, '', false, 21);
        $this->invalidTime     = array('', 'foo', ' ', false);
        $this->invalidKeywords = array(array(''), array(' '), array(false), array(2),
                                       array(str_repeat('foo', 100)), array('foo' => 'bar'),
                                       array('foo', 'baz', 'fobaz', 'bar' => 'baz'),
                                       array('foo', 'baz', 0));
        $this->engines         = array('gOOgLe', 'aSk', 'BIng', 'yAHOo');
    }

    protected function tearDown()
    {
        TestHelper::cleanMess();
    }

    /**
     * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $keywords: please supply a sequential non-empty array of strings.
     */
    public function testEmptyKeywordsArr()
    {
        $googleScraper = Builder::create($this->engines[0], array(array()));
    }

    /**
     * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $outDir: please supply a non empty string.
     */
    public function testInvalidOutDir()
    {
        $googleScraper = Builder::create($this->engines[0], array(array('foo'), $this->invalidDirs[0]));
    }

    /**
     * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $outDir: please supply a non empty string.
     */
    public function testInvalidOutDir1()
    {
        $askScraper = Builder::create($this->engines[1], array(array('foo'), $this->invalidDirs[1]));
    }

    /**
     * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $fetcherCacheDir: please supply a non empty string.
     */
    public function testInvalidCacheDir()
    {
        $bingScraper = Builder::create($this->engines[2], array(array('foo'), 'foo', $this->invalidDirs[2]));
    }

    /**
     * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $fetcherCacheDir: please supply a non empty string.
     */
    public function testInvalidCacheDir2()
    {
        $yahooScraper = Builder::create($this->engines[3], array(array('foo'), 'bar', $this->invalidDirs[3]));
    }

    /**
     * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $serializerCacheDir: please supply a non empty string.
     */
    public function testInvalidCacheDir3()
    {
        $yahooScraper = Builder::create($this->engines[3], array(array('foo'), 'bar', 'baz', $this->invalidDirs[0]));
    }

    /**
     * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $serializerCacheDir: please supply a non empty string.
     */
    public function testInvalidCacheDir4()
    {
        $googleScraper = Builder::create($this->engines[0], array(array('foo'), 'baz', 'bar', $this->invalidDirs[1]));
    }

    /**
     * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $outDir, $fetcherCacheDir, $serializerCacheDir: cannot share the same folder for different caches. Please supply different folders path for different caches.
     */
    public function testEqualCacheDir()
    {
        $googleScraper = Builder::create($this->engines[0], array(array('foo'), 'baz', 'bar', 'bar'));
    }

    /**
     * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $cacheTTL: please supply a positive integer.
     */
    public function testInvalidCacheTTL()
    {
        $askScraper = Builder::create($this->engines[1], array(array('foo'), 'baz', 'bar', 'foo', $this->invalidTime[0]));
    }

    /**
     * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $cacheTTL: please supply a positive integer.
     */
    public function testInvalidCacheTTL1()
    {
        $bingScraper = Builder::create($this->engines[2], array(array('foo'), 'baz', 'bar', 'foo', $this->invalidTime[1]));
    }

    /**
     * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $cacheTTL: please supply a positive integer.
     */
    public function testInvalidCacheTTL2()
    {
        $bingScraper = Builder::create($this->engines[2], array(array('foo'), 'baz', 'bar', 'foo', $this->invalidTime[2]));
    }

    /**
     * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $cacheTTL: please supply a positive integer.
     */
    public function testInvalidCacheTTL3()
    {
        $yahooScraper = Builder::create($this->engines[3], array(array('foo'), 'baz', 'bar', 'foo', $this->invalidTime[3]));
    }

    /**
     * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $requestDelay: please supply a positive integer.
     */
    public function testInvalidDelay()
    {
        $googleScraper = Builder::create($this->engines[0], array(array('foo'), 'baz', 'bar', 'foobar', 24, $this->invalidTime[0]));
    }

    /**
     * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $requestDelay: please supply a positive integer.
     */
    public function testInvalidDelay1()
    {
        $googleScraper = Builder::create($this->engines[0], array(array('foo'), 'baz', 'bar', 'foobar', 24, $this->invalidTime[1]));
    }

    /**
     * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $requestDelay: please supply a positive integer.
     */
    public function testInvalidDelay2()
    {
        $googleScraper = Builder::create($this->engines[0], array(array('foo'), 'baz', 'bar', 'foobar', 24, $this->invalidTime[2]));
    }

    /**
     * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $requestDelay: please supply a positive integer.
     */
    public function testInvalidDelay3()
    {
        $googleScraper = Builder::create($this->engines[0], array(array('foo'), 'baz', 'bar', 'foobar', 24, $this->invalidTime[3]));
    }

    /**
     * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $keywords: please supply a sequential non-empty array of strings.
     */
    public function testInvalidKeywords()
    {
        $googleScraper = Builder::create($this->engines[0], array($this->invalidKeywords[0], 'baz', 'bar', 'bad', 24, 500));
    }

    /**
     * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $keywords: please supply a sequential non-empty array of strings.
     */
    public function testInvalidKeywords1()
    {
        $googleScraper = Builder::create($this->engines[0], array($this->invalidKeywords[1], 'baz', 'bar', 'bad', 24, 500));
    }

    /**
     * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $keywords: please supply a sequential non-empty array of strings.
     */
    public function testInvalidKeywords2()
    {
        $askScraper = Builder::create($this->engines[1], array($this->invalidKeywords[2], 'baz', 'bar', 'bad', 24, 500));
    }

    /**
     * Bing scraper: engine index 2.
     *
     * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $keywords: please supply a sequential non-empty array of strings.
     */
    public function testInvalidKeywords3()
    {
        $bingScraper = Builder::create($this->engines[2], array($this->invalidKeywords[3], 'baz', 'bar', 'bad', 24, 500));
    }

    /**
     * Yahoo scraper: engine index 3.
     *
     * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $keywords: please supply a sequential non-empty array of strings.
     */
    public function testInvalidKeywords4()
    {
        $yahooScraper = Builder::create($this->engines[3], array($this->invalidKeywords[4], 'baz', 'bar', 'bad', 24, 500));
    }

    /**
     * Yahoo scraper: engine index 3.
     *
     * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $keywords: please supply a sequential non-empty array of strings.
     */
    public function testInvalidKeywords5()
    {
        $yahooScraper = Builder::create($this->engines[3], array($this->invalidKeywords[5], 'baz', 'bar', 'bad', 24, 500));
    }

    /**
     * Yahoo scraper: engine index 3.
     *
     * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $keywords: please supply a sequential non-empty array of strings.
     */
    public function testInvalidKeywords6()
    {
        $yahooScraper = Builder::create($this->engines[3], array($this->invalidKeywords[6], 'baz', 'bar', 'bad', 24, 500));
    }

    /**
     * Yahoo scraper: engine index 3.
     *
     * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
     * @expectedExceptionMessage Invalid SerializableSerpPage $keywords: please supply a sequential non-empty array of strings.
     */
    public function testInvalidKeywords7()
    {
        $yahooScraper = Builder::create($this->engines[3], array($this->invalidKeywords[7], 'baz', 'bar', 'bad', 24, 500));
    }
}
298 |
/**
 * The Builder must return the right scraper type for each engine, wired to
 * the matching Fetcher and to a Throttler, and the scraper folders must
 * exist after the build.
 *
 * $engines layout: 0 => google, 1 => ask, 2 => bing, 3 => yahoo.
 */
class BuilderTypesTest extends PHPUnit_Framework_TestCase
{
    protected $engines;

    protected function setUp()
    {
        $this->engines = array('gOOgLe', 'aSk', 'BIng', 'yAHOo');
    }

    protected function tearDown()
    {
        TestHelper::cleanMess();
    }

    public function testGoogleScraper()
    {
        $googleScraper = Builder::create($this->engines[0],
                                         array(array('foo'), 'foobar', 'baz',
                                               'bazbar', 48, 200));
        $this->assertScraperWiring($googleScraper,
                                   'Franzip\SerpScraper\Scrapers\GoogleScraper',
                                   'Franzip\SerpFetcher\Fetchers\GoogleFetcher');
        $this->assertDirsCreated(array('foobar', 'baz', 'bazbar'));
    }

    public function testAskScraper()
    {
        $askScraper = Builder::create($this->engines[1],
                                      array(array('foo'), 'bad', 'foo', 'foobad',
                                            72, 100));
        $this->assertScraperWiring($askScraper,
                                   'Franzip\SerpScraper\Scrapers\AskScraper',
                                   'Franzip\SerpFetcher\Fetchers\AskFetcher');
        $this->assertDirsCreated(array('bad', 'foobad', 'foo'));
    }

    public function testBingScraper()
    {
        $bingScraper = Builder::create($this->engines[2], array(array('baz')));
        $this->assertScraperWiring($bingScraper,
                                   'Franzip\SerpScraper\Scrapers\BingScraper',
                                   'Franzip\SerpFetcher\Fetchers\BingFetcher');
        // no folders supplied: the defaults must have been set up
        $this->assertDirsCreated(array($bingScraper::DEFAULT_OUTPUT_DIR,
                                       $bingScraper::DEFAULT_FETCHER_CACHE_DIR,
                                       $bingScraper::DEFAULT_SERIALIZER_CACHE_DIR));
    }

    public function testYahooScraper()
    {
        $yahooScraper = Builder::create($this->engines[3], array(array('baz')));
        $this->assertScraperWiring($yahooScraper,
                                   'Franzip\SerpScraper\Scrapers\YahooScraper',
                                   'Franzip\SerpFetcher\Fetchers\YahooFetcher');
        // no folders supplied: the defaults must have been set up
        $this->assertDirsCreated(array($yahooScraper::DEFAULT_OUTPUT_DIR,
                                       $yahooScraper::DEFAULT_FETCHER_CACHE_DIR,
                                       $yahooScraper::DEFAULT_SERIALIZER_CACHE_DIR));
    }

    /**
     * Check the type of a scraper and of its collaborators.
     * Expected values come first, as PHPUnit assertions require, so that
     * failure messages report expected and actual the right way round.
     *
     * @param object $scraper
     * @param string $scraperClass Expected scraper class name.
     * @param string $fetcherClass Expected fetcher class name.
     */
    private function assertScraperWiring($scraper, $scraperClass, $fetcherClass)
    {
        $this->assertEquals('Franzip\SerpScraper\Scrapers\SerpScraper',
                            get_parent_class($scraper));
        $this->assertInstanceOf($scraperClass, $scraper);
        $this->assertInstanceOf('Franzip\Throttler\Throttler',
                                $scraper->getThrottler());
        $this->assertInstanceOf($fetcherClass, $scraper->getFetcher());
    }

    /**
     * Check that every given path exists and is a directory.
     *
     * @param array $dirs
     */
    private function assertDirsCreated(array $dirs)
    {
        foreach ($dirs as $dir) {
            $this->assertTrue(file_exists($dir) && is_dir($dir));
        }
    }
}
388 |
--------------------------------------------------------------------------------
/tests/SerpScraperHelpersTest.php:
--------------------------------------------------------------------------------
1 | invalidKeywords = $invalidKeywords;
19 | $this->invalidEngines = $invalidEngines;
20 | }
21 |
22 | protected function tearDown() // PHPUnit hook, invoked after each test method
23 | {
24 | TestHelper::cleanMess(); // presumably removes files/directories left behind by the test — helper body not visible here, confirm
25 | }
26 |
27 | /** processKeyword() must reject fixture invalidKeywords[0] (an empty string, going by the test name; fixture definition not visible here).
28 | * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
29 | * @expectedExceptionMessage $keyword must be a valid string (max 180 characters).
30 | */
31 | public function testEmptyString()
32 | {
33 | Validator::processKeyword($this->invalidKeywords[0]);
34 | }
35 |
36 | /** processKeyword() must reject fixture invalidKeywords[1] (a whitespace-only string, going by the test name; fixture definition not visible here).
37 | * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
38 | * @expectedExceptionMessage $keyword must be a valid string (max 180 characters).
39 | */
40 | public function testWhiteSpaces()
41 | {
42 | Validator::processKeyword($this->invalidKeywords[1]);
43 | }
44 |
45 | /** processKeyword() must reject fixture invalidKeywords[2] (an integer, going by the test name; fixture definition not visible here).
46 | * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
47 | * @expectedExceptionMessage $keyword must be a valid string (max 180 characters).
48 | */
49 | public function testInt()
50 | {
51 | Validator::processKeyword($this->invalidKeywords[2]);
52 | }
53 |
54 | /** processKeyword() must reject fixture invalidKeywords[3] (a boolean, going by the test name; fixture definition not visible here).
55 | * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
56 | * @expectedExceptionMessage $keyword must be a valid string (max 180 characters).
57 | */
58 | public function testBool()
59 | {
60 | Validator::processKeyword($this->invalidKeywords[3]);
61 | }
62 |
63 | /** processKeyword() must reject fixture invalidKeywords[4] (null, going by the test name; fixture definition not visible here).
64 | * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
65 | * @expectedExceptionMessage $keyword must be a valid string (max 180 characters).
66 | */
67 | public function testNull()
68 | {
69 | Validator::processKeyword($this->invalidKeywords[4]);
70 | }
71 |
72 | /** processKeyword() must reject fixture invalidKeywords[5] (presumably a string over the 180-character limit named in the message; fixture definition not visible here).
73 | * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
74 | * @expectedExceptionMessage $keyword must be a valid string (max 180 characters).
75 | */
76 | public function testLongString()
77 | {
78 | Validator::processKeyword($this->invalidKeywords[5]);
79 | }
80 |
81 | /** processKeyword() must reject fixture invalidKeywords[6] (a newline-only string, going by the test name; fixture definition not visible here).
82 | * @expectedException \Franzip\SerpScraper\Exceptions\InvalidArgumentException
83 | * @expectedExceptionMessage $keyword must be a valid string (max 180 characters).
84 | */
85 | public function testNewLine()
86 | {
87 | Validator::processKeyword($this->invalidKeywords[6]);
88 | }
89 |
90 | /** makeUrl() must refuse the engine name in fixture invalidEngines[0] (fixture values not visible here).
91 | * @expectedException \Franzip\SerpScraper\Exceptions\UnsupportedEngineException
92 | * @expectedExceptionMessage Unknown or unsupported Search Engine.
93 | */
94 | public function testInvalidEngine1()
95 | {
96 | Generator::makeUrl($this->invalidEngines[0], 'foobar', 0);
97 | }
98 |
99 | /** makeUrl() must refuse the engine name in fixture invalidEngines[1] (fixture values not visible here).
100 | * @expectedException \Franzip\SerpScraper\Exceptions\UnsupportedEngineException
101 | * @expectedExceptionMessage Unknown or unsupported Search Engine.
102 | */
103 | public function testInvalidEngine2()
104 | {
105 | Generator::makeUrl($this->invalidEngines[1], 'foobar', 0);
106 | }
107 |
108 | /** makeUrl() must refuse the engine name in fixture invalidEngines[2] (fixture values not visible here).
109 | * @expectedException \Franzip\SerpScraper\Exceptions\UnsupportedEngineException
110 | * @expectedExceptionMessage Unknown or unsupported Search Engine.
111 | */
112 | public function testInvalidEngine3()
113 | {
114 | Generator::makeUrl($this->invalidEngines[2], 'foobar', 0);
115 | }
116 |
117 | /** makeUrl() must refuse the engine name in fixture invalidEngines[3] (fixture values not visible here).
118 | * @expectedException \Franzip\SerpScraper\Exceptions\UnsupportedEngineException
119 | * @expectedExceptionMessage Unknown or unsupported Search Engine.
120 | */
121 | public function testInvalidEngine4()
122 | {
123 | Generator::makeUrl($this->invalidEngines[3], 'foobar', 0);
124 | }
125 | }
126 |
127 | class CleaningKeywordsTest extends PHPUnit_Framework_TestCase // keyword normalisation performed by Validator::processKeyword()
128 | {
129 | protected $keywords; // 'cleanKey' => inputs expected back unchanged, 'keyToClean' => inputs expected back normalised
130 |
131 | protected function setUp()
132 | {
133 | $this->keywords = array(
134 | 'cleanKey' => array('foo', 'bar', 'barfoo12', str_repeat('foo', 30)),
135 | 'keyToClean' => array('foo, bar', ' foo ', "\tbar foo bar foo ",
136 | "\t\t foo \t bar", "\t\t\t \s//\\+?<>",
137 | "\ + / ? $ \t\t '<' \" >"),
138 | );
139 | }
140 |
141 | public function testClean()
142 | {
143 | // an already clean keyword must come back exactly as it went in
144 | foreach ($this->keywords['cleanKey'] as $keyword) {
145 | $this->assertEquals(Validator::processKeyword($keyword), $keyword);
146 | }
147 | }
148 |
149 | public function testDirty()
150 | {
151 | $expected = array('foo, bar', 'foo', 'bar foo bar foo', 'foo bar',
152 | '\s//\+?<>', "\ + / ? $ '<' \" >");
153 | $dirty = $this->keywords['keyToClean'];
154 | foreach ($expected as $index => $cleaned) {
155 | $this->assertEquals(Validator::processKeyword($dirty[$index]), $cleaned);
156 | }
157 | }
158 | }
159 |
160 | class UrlGeneratorTest extends PHPUnit_Framework_TestCase // checks the search URL Generator::makeUrl() builds per engine, keyword and page offset
161 | {
162 | protected $settings; // 'engines', 'offsets' and 'keywords' lists, picked by index in testUrls()
163 |
164 | protected function setUp()
165 | {
166 | $engines = array('google', 'bing', 'yahoo', 'ask');
167 | $offsets = array(0, 1, 2, 3, 4, 5, 6);
168 | $keywords = array(Validator::processKeyword('foobar'),
169 | Validator::processKeyword('foo'),
170 | Validator::processKeyword('foo baz ? \/ <>'),
171 | Validator::processKeyword('\n\t/ <a>')); // single-quoted, so the backslashes are literal; the "<a>" is what testUrls() expects to see encoded as %3Ca%3E
172 | $this->settings = array('engines' => $engines,
173 | 'offsets' => $offsets,
174 | 'keywords' => $keywords);
175 | }
176 |
177 | public function testUrls() // each assertion pairs one engine, one keyword and one offset with the exact URL expected
178 | {
179 | $this->assertEquals(Generator::makeUrl($this->settings['engines'][0],
180 | $this->settings['keywords'][0],
181 | $this->settings['offsets'][0]),
182 | "http://www.google.com/search?q=foobar&start=0");
183 | $this->assertEquals(Generator::makeUrl($this->settings['engines'][1],
184 | $this->settings['keywords'][1],
185 | $this->settings['offsets'][1]),
186 | "http://www.bing.com/search?q=foo&first=11");
187 | $this->assertEquals(Generator::makeUrl($this->settings['engines'][2],
188 | $this->settings['keywords'][2],
189 | $this->settings['offsets'][2]),
190 | "https://search.yahoo.com/search?p=foo+baz+%3F+%5C%2F+%3C%3E&b=21");
191 | $this->assertEquals(Generator::makeUrl($this->settings['engines'][3],
192 | $this->settings['keywords'][3],
193 | $this->settings['offsets'][3]),
194 | 'http://us.ask.com/web?q=%5Cn%5Ct%2F+%3Ca%3E&page=4');
195 | $this->assertEquals(Generator::makeUrl($this->settings['engines'][0],
196 | $this->settings['keywords'][2],
197 | $this->settings['offsets'][4]),
198 | "http://www.google.com/search?q=foo+baz+%3F+%5C%2F+%3C%3E&start=40");
199 | $this->assertEquals(Generator::makeUrl($this->settings['engines'][1],
200 | $this->settings['keywords'][2],
201 | $this->settings['offsets'][5]),
202 | "http://www.bing.com/search?q=foo+baz+%3F+%5C%2F+%3C%3E&first=51");
203 | $this->assertEquals(Generator::makeUrl($this->settings['engines'][2],
204 | $this->settings['keywords'][3],
205 | $this->settings['offsets'][6]),
206 | "https://search.yahoo.com/search?p=%5Cn%5Ct%2F+%3Ca%3E&b=61");
207 | }
208 | }
209 |
--------------------------------------------------------------------------------
/tests/SerpScraperTest.php:
--------------------------------------------------------------------------------
1 | engines = $engines;
16 | }
17 |
18 | protected function tearDown() // PHPUnit hook, invoked after each test method
19 | {
20 | TestHelper::cleanMess(); // presumably removes the directories created by the scrapers under test — helper body not visible here, confirm
21 | }
22 |
23 | public function testWithDefaultArgs() // for each engine: check the defaults, then exercise the setters and keyword add/remove
24 | {
25 | $googleScraper = Builder::create($this->engines[0], array(array('foo')));
26 | $this->assertEquals($googleScraper->getOutDir(), 'out');
27 | $this->assertEquals($googleScraper->getFetcherCacheDir(), 'fetcher_cache');
28 | $this->assertEquals($googleScraper->getSerializerCacheDir(), 'serializer_cache');
29 | $this->assertEquals($googleScraper->getCacheTTL(), 24);
30 | $this->assertEquals($googleScraper->getRequestDelay(), 500);
31 | $this->assertEquals($googleScraper->getKeywords(), array('foo'));
32 | $this->assertEquals($googleScraper->getThrottler()->getName(), 'http_requests');
33 | $this->assertEquals($googleScraper->getThrottler()->getGlobalThreshold(), 15);
34 | $this->assertEquals($googleScraper->getThrottler()->getMetric(), 'hrs');
35 | $this->assertEquals($googleScraper->getThrottler()->getMetricFactor(), 1);
36 | $this->assertNull($googleScraper->getThrottler()->getComponentThreshold());
37 | $this->assertEquals($googleScraper->getThrottler()->getComponents(),
38 | array('foo' => 0));
39 | $this->assertFalse($googleScraper->setOutDir(2));
40 | $this->assertTrue($googleScraper->setOutDir('foobar1'));
41 | $this->assertFalse($googleScraper->setFetcherCacheDir(3));
42 | $this->assertFalse($googleScraper->setFetcherCacheDir('foobar1')); // 'foobar1' was just made the output dir; presumably a directory cannot serve two roles
43 | $this->assertFalse($googleScraper->setOutDir('fetcher_cache')); // same conflict the other way round: 'fetcher_cache' is still the fetcher cache dir here
44 | $this->assertTrue($googleScraper->setFetcherCacheDir('foobar2'));
45 | $this->assertTrue(file_exists('foobar1') && is_dir('foobar1'));
46 | $this->assertTrue(file_exists('foobar2') && is_dir('foobar2'));
47 | $this->assertFalse($googleScraper->setCacheTTL('bar'));
48 | $this->assertTrue($googleScraper->setCacheTTL(200));
49 | $this->assertEquals($googleScraper->getFetcher()->getCacheTTL(), 200); // the new TTL must also be visible on the fetcher
50 | $this->assertFalse($googleScraper->setRequestDelay('foo'));
51 | $this->assertTrue($googleScraper->setRequestDelay(100));
52 | $this->assertEquals($googleScraper->getOutDir(), 'foobar1');
53 | $this->assertEquals($googleScraper->getFetcherCacheDir(), 'foobar2');
54 | $this->assertEquals($googleScraper->getCacheTTL(), 200);
55 | $this->assertEquals($googleScraper->getRequestDelay(), 100);
56 | $this->assertFalse($googleScraper->addKeyword(3));
57 | $this->assertTrue($googleScraper->addKeyword('foobaz'));
58 | $this->assertTrue($googleScraper->addKeyword('baz'));
59 | $this->assertEquals($googleScraper->getKeywords(),
60 | array('foo', 'foobaz', 'baz'));
61 | $this->assertEquals($googleScraper->getThrottler()->getComponents(),
62 | array('foo' => 0, 'foobaz' => 0, 'baz' => 0));
63 | $this->assertFalse($googleScraper->addKeywords(array()));
64 | $this->assertFalse($googleScraper->addKeywords(array('foo' => 1)));
65 | $this->assertTrue($googleScraper->addKeywords(array('foo', 'baz', 'bar'))); // only 'bar' is new; the list asserted next shows duplicates are not re-added
66 | $this->assertEquals($googleScraper->getKeywords(),
67 | array('foo', 'foobaz', 'baz', 'bar'));
68 | $this->assertTrue($googleScraper->removeKeyword('foobaz'));
69 | $this->assertTrue($googleScraper->removeKeyword('foo'));
70 | $this->assertEquals($googleScraper->getKeywords(), array('baz', 'bar'));
71 | $this->assertEquals($googleScraper->getThrottler()->getComponents(), // removed keywords are expected to stay registered as throttler components
72 | array('foo' => 0, 'foobaz' => 0, 'baz' => 0, 'bar' => 0));
73 |
74 | $askScraper = Builder::create($this->engines[1], array(array('foo')));
75 | $this->assertEquals($askScraper->getOutDir(), 'out');
76 | $this->assertEquals($askScraper->getFetcherCacheDir(), 'fetcher_cache');
77 | $this->assertEquals($askScraper->getSerializerCacheDir(), 'serializer_cache');
78 | $this->assertEquals($askScraper->getCacheTTL(), 24);
79 | $this->assertEquals($askScraper->getRequestDelay(), 500);
80 | $this->assertEquals($askScraper->getKeywords(), array('foo'));
81 | $this->assertEquals($askScraper->getThrottler()->getName(), 'http_requests');
82 | $this->assertEquals($askScraper->getThrottler()->getGlobalThreshold(), 15);
83 | $this->assertEquals($askScraper->getThrottler()->getMetric(), 'hrs');
84 | $this->assertEquals($askScraper->getThrottler()->getMetricFactor(), 1);
85 | $this->assertNull($askScraper->getThrottler()->getComponentThreshold());
86 | $this->assertEquals($askScraper->getThrottler()->getComponents(),
87 | array('foo' => 0));
88 | $this->assertFalse($askScraper->setOutDir(2));
89 | $this->assertTrue($askScraper->setOutDir('foobar3'));
90 | $this->assertFalse($askScraper->setFetcherCacheDir(3));
91 | $this->assertTrue($askScraper->setFetcherCacheDir('foobar4'));
92 | $this->assertTrue(file_exists('foobar3') && is_dir('foobar3'));
93 | $this->assertTrue(file_exists('foobar4') && is_dir('foobar4'));
94 | $this->assertFalse($askScraper->setCacheTTL('bar'));
95 | $this->assertTrue($askScraper->setCacheTTL(200));
96 | $this->assertEquals($askScraper->getFetcher()->getCacheTTL(), 200);
97 | $this->assertFalse($askScraper->setRequestDelay('foo'));
98 | $this->assertTrue($askScraper->setRequestDelay(100));
99 | $this->assertEquals($askScraper->getOutDir(), 'foobar3');
100 | $this->assertEquals($askScraper->getFetcherCacheDir(), 'foobar4');
101 | $this->assertEquals($askScraper->getCacheTTL(), 200);
102 | $this->assertEquals($askScraper->getRequestDelay(), 100);
103 | $this->assertFalse($askScraper->addKeyword(3));
104 | $this->assertTrue($askScraper->addKeyword('foobaz'));
105 | $this->assertTrue($askScraper->addKeyword('baz'));
106 | $this->assertEquals($askScraper->getKeywords(),
107 | array('foo', 'foobaz', 'baz'));
108 | $this->assertEquals($askScraper->getThrottler()->getComponents(),
109 | array('foo' => 0, 'foobaz' => 0, 'baz' => 0));
110 | $this->assertFalse($askScraper->addKeywords(array('foo', 'bar' => 1))); // a list mixing a string with a non-string entry is refused as a whole
111 | $this->assertTrue($askScraper->addKeywords(array('bar', 'foobar', 'barfoo')));
112 | $this->assertTrue($askScraper->removeKeyword('foobaz'));
113 | $this->assertTrue($askScraper->removeKeyword('foo'));
114 | $this->assertEquals($askScraper->getKeywords(),
115 | array('baz', 'bar', 'foobar', 'barfoo'));
116 | $this->assertEquals($askScraper->getThrottler()->getComponents(),
117 | array('foo' => 0, 'foobaz' => 0, 'baz' => 0,
118 | 'bar' => 0, 'foobar' => 0, 'barfoo' => 0));
119 |
120 | $bingScraper = Builder::create($this->engines[2], array(array('foo')));
121 | $this->assertEquals($bingScraper->getOutDir(), 'out');
122 | $this->assertEquals($bingScraper->getFetcherCacheDir(), 'fetcher_cache');
123 | $this->assertEquals($bingScraper->getSerializerCacheDir(), 'serializer_cache');
124 | $this->assertEquals($bingScraper->getCacheTTL(), 24);
125 | $this->assertEquals($bingScraper->getRequestDelay(), 500);
126 | $this->assertEquals($bingScraper->getKeywords(), array('foo'));
127 | $this->assertEquals($bingScraper->getThrottler()->getName(), 'http_requests');
128 | $this->assertEquals($bingScraper->getThrottler()->getGlobalThreshold(), 15);
129 | $this->assertEquals($bingScraper->getThrottler()->getMetric(), 'hrs');
130 | $this->assertEquals($bingScraper->getThrottler()->getMetricFactor(), 1);
131 | $this->assertNull($bingScraper->getThrottler()->getComponentThreshold());
132 | $this->assertEquals($bingScraper->getThrottler()->getComponents(),
133 | array('foo' => 0));
134 | $this->assertFalse($bingScraper->setOutDir(2));
135 | $this->assertTrue($bingScraper->setOutDir('foobar5'));
136 | $this->assertFalse($bingScraper->setFetcherCacheDir(3));
137 | $this->assertTrue($bingScraper->setFetcherCacheDir('foobar6'));
138 | $this->assertTrue(file_exists('foobar5') && is_dir('foobar5'));
139 | $this->assertTrue(file_exists('foobar6') && is_dir('foobar6'));
140 | $this->assertFalse($bingScraper->setCacheTTL('bar'));
141 | $this->assertTrue($bingScraper->setCacheTTL(200));
142 | $this->assertEquals($bingScraper->getFetcher()->getCacheTTL(), 200);
143 | $this->assertFalse($bingScraper->setRequestDelay('foo'));
144 | $this->assertTrue($bingScraper->setRequestDelay(100));
145 | $this->assertEquals($bingScraper->getOutDir(), 'foobar5');
146 | $this->assertEquals($bingScraper->getFetcherCacheDir(), 'foobar6');
147 | $this->assertEquals($bingScraper->getCacheTTL(), 200);
148 | $this->assertEquals($bingScraper->getRequestDelay(), 100);
149 | $this->assertFalse($bingScraper->addKeyword(3));
150 | $this->assertTrue($bingScraper->addKeyword('foobaz'));
151 | $this->assertTrue($bingScraper->addKeyword('baz'));
152 | $this->assertEquals($bingScraper->getKeywords(),
153 | array('foo', 'foobaz', 'baz'));
154 | $this->assertEquals($bingScraper->getThrottler()->getComponents(),
155 | array('foo' => 0, 'foobaz' => 0, 'baz' => 0));
156 | $this->assertTrue($bingScraper->addKeywords(array('barfoo')));
157 | $this->assertTrue($bingScraper->removeKeyword('foobaz'));
158 | $this->assertTrue($bingScraper->removeKeyword('foo'));
159 | $this->assertEquals($bingScraper->getKeywords(), array('baz', 'barfoo'));
160 | $this->assertEquals($bingScraper->getThrottler()->getComponents(),
161 | array('foo' => 0, 'foobaz' => 0, 'baz' => 0, 'barfoo' => 0));
162 |
163 | $yahooScraper = Builder::create($this->engines[3], array(array('foo')));
164 | $this->assertEquals($yahooScraper->getOutDir(), 'out');
165 | $this->assertEquals($yahooScraper->getFetcherCacheDir(), 'fetcher_cache');
166 | $this->assertEquals($yahooScraper->getSerializerCacheDir(), 'serializer_cache');
167 | $this->assertEquals($yahooScraper->getCacheTTL(), 24);
168 | $this->assertEquals($yahooScraper->getRequestDelay(), 500);
169 | $this->assertEquals($yahooScraper->getKeywords(), array('foo'));
170 | $this->assertEquals($yahooScraper->getThrottler()->getName(), 'http_requests');
171 | $this->assertEquals($yahooScraper->getThrottler()->getGlobalThreshold(), 15);
172 | $this->assertEquals($yahooScraper->getThrottler()->getMetric(), 'hrs');
173 | $this->assertEquals($yahooScraper->getThrottler()->getMetricFactor(), 1);
174 | $this->assertNull($yahooScraper->getThrottler()->getComponentThreshold());
175 | $this->assertEquals($yahooScraper->getThrottler()->getComponents(),
176 | array('foo' => 0));
177 | $this->assertFalse($yahooScraper->setOutDir(2));
178 | $this->assertTrue($yahooScraper->setOutDir('foobar7'));
179 | $this->assertFalse($yahooScraper->setFetcherCacheDir(3));
180 | $this->assertTrue($yahooScraper->setFetcherCacheDir('foobar8'));
181 | $this->assertTrue(file_exists('foobar7') && is_dir('foobar7'));
182 | $this->assertTrue(file_exists('foobar8') && is_dir('foobar8'));
183 | $this->assertFalse($yahooScraper->setCacheTTL('bar'));
184 | $this->assertTrue($yahooScraper->setCacheTTL(200));
185 | $this->assertFalse($yahooScraper->setRequestDelay('foo'));
186 | $this->assertTrue($yahooScraper->setRequestDelay(100));
187 | $this->assertEquals($yahooScraper->getOutDir(), 'foobar7');
188 | $this->assertEquals($yahooScraper->getFetcherCacheDir(), 'foobar8');
189 | $this->assertEquals($yahooScraper->getCacheTTL(), 200);
190 | $this->assertEquals($yahooScraper->getFetcher()->getCacheTTL(), 200);
191 | $this->assertEquals($yahooScraper->getRequestDelay(), 100);
192 | $this->assertFalse($yahooScraper->addKeyword(3));
193 | $this->assertFalse($yahooScraper->addKeyword('foo')); // 'foo' is already in the list, so adding it again is expected to fail
194 | $this->assertTrue($yahooScraper->addKeyword('baz'));
195 | $this->assertEquals($yahooScraper->getKeywords(), array('foo', 'baz'));
196 | $this->assertEquals($yahooScraper->getThrottler()->getComponents(),
197 | array('foo' => 0, 'baz' => 0));
198 | $this->assertFalse($yahooScraper->removeKeyword('foobaz')); // 'foobaz' was never added to this scraper
199 | $this->assertTrue($yahooScraper->removeKeyword('foo'));
200 | $this->assertEquals($yahooScraper->getKeywords(), array('baz'));
201 | $this->assertEquals($yahooScraper->getThrottler()->getComponents(),
202 | array('foo' => 0, 'baz' => 0));
203 | }
204 |
205 | public function testWithCustomArgs() // builds each engine's scraper with explicit dirs, cache TTL and request delay, then checks getters, setters and keyword handling
206 | {
207 | $googleScraper = Builder::create($this->engines[0],
208 | array(array('foobam ', ' foobaz', 'baz'),
209 | 'foo', 'bar', 'baz', 48, 1000));
210 | $this->assertEquals($googleScraper->getOutDir(), 'foo');
211 | $this->assertEquals($googleScraper->getFetcherCacheDir(), 'bar');
212 | $this->assertEquals($googleScraper->getSerializerCacheDir(), 'baz');
213 | $this->assertTrue(file_exists('foo') && is_dir('foo'));
214 | $this->assertTrue(file_exists('bar') && is_dir('bar'));
215 | $this->assertTrue(file_exists('baz') && is_dir('baz'));
216 | $this->assertEquals($googleScraper->getCacheTTL(), 48);
217 | $this->assertEquals($googleScraper->getRequestDelay(), 1000);
218 | $this->assertEquals($googleScraper->getKeywords(), // surrounding whitespace in the constructor keywords is expected to be trimmed
219 | array('foobam', 'foobaz', 'baz'));
220 | $this->assertEquals($googleScraper->getThrottler()->getName(), 'http_requests');
221 | $this->assertEquals($googleScraper->getThrottler()->getGlobalThreshold(), 15);
222 | $this->assertEquals($googleScraper->getThrottler()->getMetric(), 'hrs');
223 | $this->assertEquals($googleScraper->getThrottler()->getMetricFactor(), 1);
224 | $this->assertNull($googleScraper->getThrottler()->getComponentThreshold());
225 | $this->assertEquals($googleScraper->getThrottler()->getComponents(),
226 | array('foobam' => 0, 'foobaz' => 0, 'baz' => 0));
227 | $this->assertFalse($googleScraper->setOutDir(2));
228 | $this->assertTrue($googleScraper->setOutDir('foobar1'));
229 | $this->assertFalse($googleScraper->setFetcherCacheDir(3));
230 | $this->assertTrue($googleScraper->setFetcherCacheDir('foobar2'));
231 | $this->assertTrue(file_exists('foobar1') && is_dir('foobar1'));
232 | $this->assertTrue(file_exists('foobar2') && is_dir('foobar2'));
233 | $this->assertFalse($googleScraper->setCacheTTL('bar'));
234 | $this->assertTrue($googleScraper->setCacheTTL(200));
235 | $this->assertFalse($googleScraper->setRequestDelay('foo'));
236 | $this->assertTrue($googleScraper->setRequestDelay(100));
237 | $this->assertEquals($googleScraper->getOutDir(), 'foobar1');
238 | $this->assertEquals($googleScraper->getFetcherCacheDir(), 'foobar2');
239 | $this->assertEquals($googleScraper->getCacheTTL(), 200);
240 | $this->assertEquals($googleScraper->getRequestDelay(), 100);
241 | $this->assertFalse($googleScraper->addKeyword(3));
242 | $this->assertTrue($googleScraper->addKeyword("\t foo"));
243 | $this->assertFalse($googleScraper->addKeyword('baz')); // 'baz' is already in the list
244 | $this->assertEquals($googleScraper->getKeywords(),
245 | array('foobam', 'foobaz', 'baz', 'foo'));
246 | $this->assertEquals($googleScraper->getThrottler()->getComponents(),
247 | array('foo' => 0, 'foobam' => 0, 'foobaz' => 0, 'baz' => 0));
248 | $this->assertTrue($googleScraper->removeKeyword('foobaz'));
249 | $this->assertTrue($googleScraper->removeKeyword('foo'));
250 | $this->assertEquals($googleScraper->getKeywords(), array('foobam', 'baz'));
251 | $this->assertTrue($googleScraper->addKeywords(array('foobaz', 'foo')));
252 | $this->assertEquals($googleScraper->getKeywords(),
253 | array('foobam', 'baz', 'foobaz', 'foo'));
254 |
255 | $askScraper = Builder::create($this->engines[1], array(array('foobaz'),
256 | 'fooz', 'barz', 'bazz', 72, 1500));
257 | $this->assertEquals($askScraper->getOutDir(), 'fooz');
258 | $this->assertEquals($askScraper->getFetcherCacheDir(), 'barz');
259 | $this->assertEquals($askScraper->getSerializerCacheDir(), 'bazz');
260 | $this->assertTrue(file_exists('fooz') && is_dir('fooz'));
261 | $this->assertTrue(file_exists('barz') && is_dir('barz'));
262 | $this->assertTrue(file_exists('bazz') && is_dir('bazz'));
263 | $this->assertEquals($askScraper->getCacheTTL(), 72);
264 | $this->assertEquals($askScraper->getRequestDelay(), 1500);
265 | $this->assertEquals($askScraper->getKeywords(), array('foobaz'));
266 | $this->assertEquals($askScraper->getThrottler()->getName(), 'http_requests');
267 | $this->assertEquals($askScraper->getThrottler()->getGlobalThreshold(), 15);
268 | $this->assertEquals($askScraper->getThrottler()->getMetric(), 'hrs');
269 | $this->assertEquals($askScraper->getThrottler()->getMetricFactor(), 1);
270 | $this->assertNull($askScraper->getThrottler()->getComponentThreshold());
271 | $this->assertEquals($askScraper->getThrottler()->getComponents(),
272 | array('foobaz' => 0));
273 |
274 | $bingScraper = Builder::create($this->engines[2],
275 | array(
276 | array('foobam', 'foobaz'),
277 | 'foobar', 'barfoo', 'bazfoo', 148,
278 | 100));
279 | $this->assertEquals($bingScraper->getOutDir(), 'foobar');
280 | $this->assertEquals($bingScraper->getFetcherCacheDir(), 'barfoo');
281 | $this->assertEquals($bingScraper->getSerializerCacheDir(), 'bazfoo');
282 | $this->assertTrue(file_exists('foobar') && is_dir('foobar'));
283 | $this->assertTrue(file_exists('barfoo') && is_dir('barfoo'));
284 | $this->assertTrue(file_exists('bazfoo') && is_dir('bazfoo'));
285 | $this->assertEquals($bingScraper->getCacheTTL(), 148);
286 | $this->assertEquals($bingScraper->getRequestDelay(), 100);
287 | $this->assertEquals($bingScraper->getKeywords(), array('foobam', 'foobaz'));
288 | $this->assertEquals($bingScraper->getThrottler()->getName(), 'http_requests');
289 | $this->assertEquals($bingScraper->getThrottler()->getGlobalThreshold(), 15);
290 | $this->assertEquals($bingScraper->getThrottler()->getMetric(), 'hrs');
291 | $this->assertEquals($bingScraper->getThrottler()->getMetricFactor(), 1);
292 | $this->assertNull($bingScraper->getThrottler()->getComponentThreshold());
293 | $this->assertEquals($bingScraper->getThrottler()->getComponents(),
294 | array('foobam' => 0, 'foobaz' => 0));
295 |
296 | $yahooScraper = Builder::create($this->engines[3], // index 3 is the Yahoo entry; index 1 (used for $askScraper above) would build a second Ask scraper
297 | array(
298 | array(' foobam', ' foobaz'),
299 | 'foo', 'bar', 'baz', 48, 1000));
300 | $this->assertEquals($yahooScraper->getOutDir(), 'foo');
301 | $this->assertEquals($yahooScraper->getFetcherCacheDir(), 'bar');
302 | $this->assertEquals($yahooScraper->getSerializerCacheDir(), 'baz');
303 | $this->assertEquals($yahooScraper->getCacheTTL(), 48);
304 | $this->assertEquals($yahooScraper->getRequestDelay(), 1000);
305 | $this->assertEquals($yahooScraper->getKeywords(), array('foobam', 'foobaz'));
306 | $this->assertEquals($yahooScraper->getThrottler()->getName(), 'http_requests');
307 | $this->assertEquals($yahooScraper->getThrottler()->getGlobalThreshold(), 15);
308 | $this->assertEquals($yahooScraper->getThrottler()->getMetric(), 'hrs');
309 | $this->assertEquals($yahooScraper->getThrottler()->getMetricFactor(), 1);
310 | $this->assertNull($yahooScraper->getThrottler()->getComponentThreshold());
311 | $this->assertEquals($yahooScraper->getThrottler()->getComponents(),
312 | array('foobam' => 0, 'foobaz' => 0));
313 | }
314 | }
315 |
316 | class ScrapingTest extends PHPUnit_Framework_TestCase
317 | {
318 | protected $engines;
319 |
320 | protected function setUp()
321 | {
322 | // engine names in mixed case, in the order the tests index them: google, ask, bing, yahoo
323 | $this->engines = array('gOOgLe', 'aSk', 'BIng', 'yAHOo');
324 | }
325 |
326 | protected function tearDown() // PHPUnit hook, invoked after each test method
327 | {
328 | TestHelper::cleanMess(); // presumably removes the directories created by the scrapers under test — helper body not visible here, confirm
329 | }
330 |
331 | public function testKeyToUrlMapping() // checks, per engine, the keyword => list-of-URLs map returned by mapKeywordsToUrls()
332 | {
333 | $googleScraper = Builder::create($this->engines[0],
334 | array(array('foo', 'baz')));
335 | $mapKeywordsToUrls = TestHelper::getMethod('mapKeywordsToUrls', 'Google'); // handle used with invokeArgs() below; presumably a reflection wrapper around a non-public method — helper not visible here
336 | $this->assertEquals(
337 | $mapKeywordsToUrls->invokeArgs($googleScraper, array(1, 'foo')), // arguments: number of result pages, then a single keyword or a keyword list
338 | array(
339 | "foo" => array("http://www.google.com/search?q=foo&start=0")
340 | )
341 | );
342 |
343 | $this->assertEquals(
344 | $mapKeywordsToUrls->invokeArgs($googleScraper, array(1, $googleScraper->getKeywords())),
345 | array(
346 | "foo" => array("http://www.google.com/search?q=foo&start=0"),
347 | "baz" => array("http://www.google.com/search?q=baz&start=0")
348 | )
349 | );
350 |
351 | $this->assertEquals(
352 | $mapKeywordsToUrls->invokeArgs($googleScraper, array(2, $googleScraper->getKeywords())), // two pages => two URLs per keyword
353 | array(
354 | "foo" => array("http://www.google.com/search?q=foo&start=0",
355 | "http://www.google.com/search?q=foo&start=10"),
356 | "baz" => array("http://www.google.com/search?q=baz&start=0",
357 | "http://www.google.com/search?q=baz&start=10")
358 | )
359 | );
360 |
361 | $askScraper = Builder::create($this->engines[1], array(array('foobar', 'baz')));
362 | $mapKeywordsToUrls = TestHelper::getMethod('mapKeywordsToUrls', 'Ask');
363 | $this->assertEquals($mapKeywordsToUrls->invokeArgs($askScraper, array(1, 'foobar')),
364 | array(
365 | "foobar" => array("http://us.ask.com/web?q=foobar&page=1")
366 | )
367 | );
368 |
369 | $this->assertEquals($mapKeywordsToUrls->invokeArgs($askScraper, array(1, $askScraper->getKeywords())),
370 | array(
371 | "foobar" => array("http://us.ask.com/web?q=foobar&page=1"),
372 | "baz" => array("http://us.ask.com/web?q=baz&page=1")
373 | )
374 | );
375 |
376 | $this->assertEquals($mapKeywordsToUrls->invokeArgs($askScraper, array(2, $askScraper->getKeywords())),
377 | array("foobar" => array("http://us.ask.com/web?q=foobar&page=1",
378 | "http://us.ask.com/web?q=foobar&page=2"),
379 | "baz" => array("http://us.ask.com/web?q=baz&page=1",
380 | "http://us.ask.com/web?q=baz&page=2")
381 | )
382 | );
383 |
384 | $bingScraper = Builder::create($this->engines[2], array(array('bazfoo', 'foobaz')));
385 | $mapKeywordsToUrls = TestHelper::getMethod('mapKeywordsToUrls', 'Bing');
386 | $this->assertEquals($mapKeywordsToUrls->invokeArgs($bingScraper, array(1, 'bazfoo')),
387 | array(
388 | "bazfoo" => array("http://www.bing.com/search?q=bazfoo&first=1")
389 | )
390 | );
391 |
392 | $this->assertEquals($mapKeywordsToUrls->invokeArgs($bingScraper, array(1, $bingScraper->getKeywords())),
393 | array(
394 | "bazfoo" => array("http://www.bing.com/search?q=bazfoo&first=1"),
395 | "foobaz" => array("http://www.bing.com/search?q=foobaz&first=1")
396 | )
397 | );
398 |
399 | $this->assertEquals($mapKeywordsToUrls->invokeArgs($bingScraper, array(2, $bingScraper->getKeywords())),
400 | array(
401 | "bazfoo" => array("http://www.bing.com/search?q=bazfoo&first=1",
402 | "http://www.bing.com/search?q=bazfoo&first=11"),
403 | "foobaz" => array("http://www.bing.com/search?q=foobaz&first=1",
404 | "http://www.bing.com/search?q=foobaz&first=11")
405 | )
406 | );
407 |
408 | $yahooScraper = Builder::create($this->engines[3], array(array('foo')));
409 | $mapKeywordsToUrls = TestHelper::getMethod('mapKeywordsToUrls', 'Yahoo');
410 | $this->assertEquals($mapKeywordsToUrls->invokeArgs($yahooScraper, array(1, 'foo')),
411 | array(
412 | "foo" => array("https://search.yahoo.com/search?p=foo&b=1")
413 | )
414 | );
415 |
416 | $this->assertEquals($mapKeywordsToUrls->invokeArgs($yahooScraper, array(5, $yahooScraper->getKeywords())), // five pages => five URLs, the b= offset growing by 10 per page
417 | array(
418 | "foo" => array("https://search.yahoo.com/search?p=foo&b=1",
419 | "https://search.yahoo.com/search?p=foo&b=11",
420 | "https://search.yahoo.com/search?p=foo&b=21",
421 | "https://search.yahoo.com/search?p=foo&b=31",
422 | "https://search.yahoo.com/search?p=foo&b=41")
423 | )
424 | );
425 | }
426 |
/**
 * Verifies the totals produced by the scrapers' hitCounter method.
 *
 * For every engine the keyword list is grown and shrunk; after each change
 * the keywords are mapped to URLs for a given number of pages and the
 * resulting global count and per-keyword counts are compared with the
 * expected figures. In these fixtures the global count always equals
 * keywords x pages and each keyword's count equals the page number.
 *
 * mapKeywordsToUrls and hitCounter are reached through the reflection
 * handles returned by TestHelper::getMethod.
 */
public function testHitCounter()
{
    // Maps the scraper's current keywords to URLs for $pages pages and returns
    // hitCounter's result, which the callers unpack as
    // (global hit count, per-keyword hit counts).
    $countHits = function ($scraper, $urlMapper, $counter, $pages) {
        $urlsToScrape = $urlMapper->invokeArgs($scraper, array($pages, $scraper->getKeywords()));

        return $counter->invokeArgs($scraper, array($urlsToScrape));
    };

    // Assertions use PHPUnit's (expected, actual) argument order so that a
    // failure message reports the right value as "expected".

    // --- Google ---
    $googleScraper = Builder::create($this->engines[0], array(array('foo', 'baz')));
    $urlMapper = TestHelper::getMethod('mapKeywordsToUrls', 'Google');
    $counter = TestHelper::getMethod('hitCounter', 'Google');
    list($globalHitCount, $componentHitCount) = $countHits($googleScraper, $urlMapper, $counter, 1);
    $this->assertEquals(2, $globalHitCount);
    $this->assertEquals(array("foo" => 1, "baz" => 1), $componentHitCount);
    $this->assertTrue($googleScraper->addKeyword('foobaz'));
    list($globalHitCount, $componentHitCount) = $countHits($googleScraper, $urlMapper, $counter, 2);
    $this->assertEquals(6, $globalHitCount);
    $this->assertEquals(array("foo" => 2, "baz" => 2, "foobaz" => 2), $componentHitCount);
    $this->assertTrue($googleScraper->removeKeyword("foo"));
    $this->assertTrue($googleScraper->removeKeyword("baz"));
    list($globalHitCount, $componentHitCount) = $countHits($googleScraper, $urlMapper, $counter, 3);
    $this->assertEquals(3, $globalHitCount);
    $this->assertEquals(array("foobaz" => 3), $componentHitCount);

    // --- Ask ---
    $askScraper = Builder::create($this->engines[1], array(array('foobar', 'baz')));
    $urlMapper = TestHelper::getMethod('mapKeywordsToUrls', 'Ask');
    $counter = TestHelper::getMethod('hitCounter', 'Ask');
    list($globalHitCount, $componentHitCount) = $countHits($askScraper, $urlMapper, $counter, 1);
    $this->assertEquals(2, $globalHitCount);
    $this->assertEquals(array("foobar" => 1, "baz" => 1), $componentHitCount);
    $this->assertTrue($askScraper->addKeyword('foobaz'));
    list($globalHitCount, $componentHitCount) = $countHits($askScraper, $urlMapper, $counter, 2);
    $this->assertEquals(6, $globalHitCount);
    $this->assertEquals(array("foobar" => 2, "baz" => 2, "foobaz" => 2), $componentHitCount);
    $this->assertTrue($askScraper->removeKeyword("foobar"));
    $this->assertTrue($askScraper->removeKeyword("baz"));
    list($globalHitCount, $componentHitCount) = $countHits($askScraper, $urlMapper, $counter, 3);
    $this->assertEquals(3, $globalHitCount);
    $this->assertEquals(array("foobaz" => 3), $componentHitCount);

    // --- Bing ---
    $bingScraper = Builder::create($this->engines[2], array(array('bazfoo', 'foobaz')));
    $urlMapper = TestHelper::getMethod('mapKeywordsToUrls', 'Bing');
    $counter = TestHelper::getMethod('hitCounter', 'Bing');
    list($globalHitCount, $componentHitCount) = $countHits($bingScraper, $urlMapper, $counter, 1);
    $this->assertEquals(2, $globalHitCount);
    $this->assertEquals(array("bazfoo" => 1, "foobaz" => 1), $componentHitCount);
    $this->assertTrue($bingScraper->addKeyword('foo'));
    list($globalHitCount, $componentHitCount) = $countHits($bingScraper, $urlMapper, $counter, 2);
    $this->assertEquals(6, $globalHitCount);
    $this->assertEquals(array("bazfoo" => 2, "foo" => 2, "foobaz" => 2), $componentHitCount);
    $this->assertTrue($bingScraper->removeKeyword("foo"));
    $this->assertTrue($bingScraper->removeKeyword("bazfoo"));
    list($globalHitCount, $componentHitCount) = $countHits($bingScraper, $urlMapper, $counter, 3);
    $this->assertEquals(3, $globalHitCount);
    $this->assertEquals(array("foobaz" => 3), $componentHitCount);

    // --- Yahoo (starts from a single keyword) ---
    $yahooScraper = Builder::create($this->engines[3], array(array('foo')));
    $urlMapper = TestHelper::getMethod('mapKeywordsToUrls', 'Yahoo');
    $counter = TestHelper::getMethod('hitCounter', 'Yahoo');
    list($globalHitCount, $componentHitCount) = $countHits($yahooScraper, $urlMapper, $counter, 1);
    $this->assertEquals(1, $globalHitCount);
    $this->assertEquals(array("foo" => 1), $componentHitCount);
    $this->assertTrue($yahooScraper->addKeyword('foobaz'));
    list($globalHitCount, $componentHitCount) = $countHits($yahooScraper, $urlMapper, $counter, 2);
    $this->assertEquals(4, $globalHitCount);
    $this->assertEquals(array("foo" => 2, "foobaz" => 2), $componentHitCount);
    $this->assertTrue($yahooScraper->removeKeyword("foobaz"));
    list($globalHitCount, $componentHitCount) = $countHits($yahooScraper, $urlMapper, $counter, 3);
    $this->assertEquals(3, $globalHitCount);
    $this->assertEquals(array("foo" => 3), $componentHitCount);
}
528 |
/**
 * Exercises hitChecker against the throttler thresholds.
 *
 * For Google, Ask and Bing the keywords are mapped to URLs for a growing
 * number of pages, the hits are counted, and hitChecker is expected to
 * accept the counts up to a limit and reject them beyond it. The Google and
 * Ask sections also change the thresholds through the throttler (after
 * stopping it) and probe the new limits.
 */
public function testHitChecker()
{
    // Reflection handles for one engine, in the order
    // (mapKeywordsToUrls, hitCounter, hitChecker).
    $reflect = function ($engineName) {
        return array(
            TestHelper::getMethod('mapKeywordsToUrls', $engineName),
            TestHelper::getMethod('hitCounter', $engineName),
            TestHelper::getMethod('hitChecker', $engineName),
        );
    };

    // Runs map -> count -> check for $pages pages and returns hitChecker's verdict.
    $isAllowed = function ($scraper, array $handles, $pages) {
        list($urlMapper, $counter, $checker) = $handles;
        $urls = $urlMapper->invokeArgs($scraper, array($pages, $scraper->getKeywords()));
        list($globalHits, $hitsPerKeyword) = $counter->invokeArgs($scraper, array($urls));

        return $checker->invokeArgs($scraper, array($globalHits, $hitsPerKeyword));
    };

    // --- Google: two keywords, thresholds as created by the builder ---
    $google = Builder::create($this->engines[0], array(array('foo', 'baz')));
    $handles = $reflect('Google');
    $this->assertTrue($isAllowed($google, $handles, 1));
    $this->assertTrue($isAllowed($google, $handles, 7));
    $this->assertFalse($isAllowed($google, $handles, 8));
    // Lower the per-keyword threshold to 3: three pages pass, four do not.
    $google->getThrottler()->stop();
    $this->assertTrue($google->getThrottler()->setComponentThreshold(3));
    $this->assertTrue($isAllowed($google, $handles, 3));
    $this->assertFalse($isAllowed($google, $handles, 4));

    // --- Ask: three keywords, global threshold raised to 100 ---
    $ask = Builder::create($this->engines[1], array(array('foo', 'baz', 'foobaz')));
    $handles = $reflect('Ask');
    $ask->getThrottler()->stop();
    $this->assertTrue($ask->getThrottler()->setGlobalThreshold(100));
    $this->assertTrue($isAllowed($ask, $handles, 30));
    $this->assertFalse($isAllowed($ask, $handles, 40));
    // Per-keyword threshold of 20: 15 pages pass, 21 do not.
    $this->assertTrue($ask->getThrottler()->setComponentThreshold(20));
    $this->assertTrue($isAllowed($ask, $handles, 15));
    $this->assertFalse($isAllowed($ask, $handles, 21));

    // --- Bing: three keywords, thresholds as created by the builder ---
    $bing = Builder::create($this->engines[2], array(array('foo', 'baz', 'foobaz')));
    $handles = $reflect('Bing');
    $this->assertTrue($isAllowed($bing, $handles, 4));
    $this->assertFalse($isAllowed($bing, $handles, 5));
}
615 |
/**
 * Checks that scrape() and scrapeAll() return false for every rejected call.
 *
 * The same sequence is run against the scrapers built from engine slots
 * 0 to 3 (labelled Google, Ask, Bing and Yahoo by the variable names used
 * elsewhere in this class): invalid or unknown keywords, invalid page
 * counts, page counts above the throttler thresholds, and scrapeAll() on an
 * emptied keyword list.
 */
public function testScrapingFilter()
{
    foreach (array(0, 1, 2, 3) as $engineIndex) {
        $scraper = Builder::create($this->engines[$engineIndex], array(array('foo', 'baz')));

        // Keyword argument that is not a tracked keyword.
        $this->assertFalse($scraper->scrape(0));
        $this->assertFalse($scraper->scrape(''));
        $this->assertFalse($scraper->scrape('foobaz'));

        // Tracked keyword, but the remaining arguments are rejected.
        $this->assertFalse($scraper->scrape('foo', 's'));
        $this->assertFalse($scraper->scrape('foo', 0));
        $this->assertFalse($scraper->scrape('foo', 1, 'UTC', 1));
        $this->assertFalse($scraper->scrape('foo', 16));
        $this->assertFalse($scraper->scrape('foo', 100));

        // Tighten the global threshold to 10, then the per-keyword one to 4.
        $scraper->getThrottler()->stop();
        $this->assertTrue($scraper->getThrottler()->setGlobalThreshold(10));
        $this->assertFalse($scraper->scrape('foo', 11));
        $this->assertFalse($scraper->scrape('foo', 10));
        $this->assertTrue($scraper->getThrottler()->setComponentThreshold(4));
        $this->assertFalse($scraper->scrape('foo', 5));

        // scrapeAll() is subject to the same filtering.
        $this->assertFalse($scraper->scrapeAll(8));
        $this->assertFalse($scraper->scrapeAll('foo'));
        $this->assertFalse($scraper->scrapeAll(2, 'foo'));

        // With no keywords left there is nothing to scrape.
        $this->assertTrue($scraper->removeKeyword("foo"));
        $this->assertTrue($scraper->removeKeyword("baz"));
        $this->assertFalse($scraper->scrapeAll(1));
    }
}
706 |
/**
 * Checks that scrape() and scrapeAll() return false for malformed arguments.
 *
 * Each engine's scraper is first emptied of keywords (so any scrape() call
 * must fail), then refilled with "foo", "bar" and "baz" and called with an
 * untracked keyword and with wrongly-typed values in the optional argument
 * positions.
 *
 * The Yahoo section previously built its scraper from $this->engines[2],
 * the slot used by the Bing section, so Yahoo was never exercised here. It
 * now uses $this->engines[3] like the other tests in this class.
 */
public function testScrapeArgs()
{
    // --- Google ---
    $googleScraper = Builder::create($this->engines[0], array(array('foo')));
    $this->assertTrue($googleScraper->removeKeyword('foo'));
    // No keywords left, and 1 is not a keyword anyway.
    $this->assertFalse($googleScraper->scrape(1));
    $this->assertTrue($googleScraper->addKeywords(array("foo", "bar", "baz")));
    $this->assertFalse($googleScraper->scrape('foobar'));
    $this->assertFalse($googleScraper->scrape('foo', 'baz'));
    $this->assertFalse($googleScraper->scrape('foo', 2, 'UTC', 'baz'));
    $this->assertFalse($googleScraper->scrape('foo', 2, 'UTC', true, 'baz'));
    $this->assertFalse($googleScraper->scrapeAll('foobar'));
    $this->assertFalse($googleScraper->scrapeAll(2, 'foobar'));
    $this->assertFalse($googleScraper->scrapeAll(2, true, 'foobar'));

    // --- Ask ---
    $askScraper = Builder::create($this->engines[1], array(array('foo')));
    $this->assertTrue($askScraper->removeKeyword('foo'));
    $this->assertFalse($askScraper->scrape(2));
    $this->assertTrue($askScraper->addKeywords(array("foo", "bar", "baz")));
    $this->assertFalse($askScraper->scrape('foobar'));
    $this->assertFalse($askScraper->scrape('foo', 'baz'));
    $this->assertFalse($askScraper->scrape('foo', 2, 'UTC', 'baz'));
    $this->assertFalse($askScraper->scrape('foo', 2, true, 'baz'));
    $this->assertFalse($askScraper->scrapeAll('foobar'));
    $this->assertFalse($askScraper->scrapeAll(2, 'foobar'));
    $this->assertFalse($askScraper->scrapeAll(2, true, 'foobar'));

    // --- Bing ---
    $bingScraper = Builder::create($this->engines[2], array(array('foo')));
    $this->assertTrue($bingScraper->removeKeyword('foo'));
    $this->assertFalse($bingScraper->scrape(3));
    $this->assertTrue($bingScraper->addKeywords(array("foo", "bar", "baz")));
    $this->assertFalse($bingScraper->scrape('foobar'));
    $this->assertFalse($bingScraper->scrape('foo', 'baz'));
    $this->assertFalse($bingScraper->scrape('foo', 2, 'UTC', 'baz'));
    $this->assertFalse($bingScraper->scrape('foo', 2, true, 'baz'));
    $this->assertFalse($bingScraper->scrapeAll('foobar'));
    $this->assertFalse($bingScraper->scrapeAll(2, 'foobar'));
    $this->assertFalse($bingScraper->scrapeAll(2, true, 'foobar'));

    // --- Yahoo (engine slot 3, not 2) ---
    $yahooScraper = Builder::create($this->engines[3], array(array('foo')));
    $this->assertTrue($yahooScraper->removeKeyword('foo'));
    $this->assertFalse($yahooScraper->scrape(4));
    $this->assertTrue($yahooScraper->addKeywords(array("foo", "bar", "baz")));
    $this->assertFalse($yahooScraper->scrape('foobar'));
    $this->assertFalse($yahooScraper->scrape('foo', 'baz'));
    $this->assertFalse($yahooScraper->scrape('foo', 2, 'UTC', 'baz'));
    $this->assertFalse($yahooScraper->scrape('foo', 2, true, 'baz'));
    $this->assertFalse($yahooScraper->scrapeAll('foobar'));
    $this->assertFalse($yahooScraper->scrapeAll(2, 'foobar'));
    $this->assertFalse($yahooScraper->scrapeAll(2, true, 'foobar'));
}
757 |
/**
 * End-to-end check of scrape -> serialize -> save for all four engines.
 *
 * For each engine the test rejects a handful of bad calls, scrapes the
 * keywords one by one and in bulk, serializes the fetched pages to JSON or
 * XML, saves them, and finally verifies every saved file: JSON files with
 * the JsonLint parser, XML files with XMLReader.
 *
 * NOTE(review): the successful scrape() calls presumably fetch real result
 * pages, so this test likely needs network access - confirm.
 */
public function testScrape()
{
    $parser = new \Seld\JsonLint\JsonParser();

    // File names the scraper is about to write, derived from the keys of the
    // serialized pages. Must be called before save(): the Ask and Yahoo
    // sections assert that save() leaves no serialized pages behind.
    $pendingFileNames = function ($scraper) {
        return array_map(
            'Franzip\SerpScraper\Helpers\FileSystemHelper::generateFileName',
            array_keys($scraper->getSerializedPages())
        );
    };

    // Every listed file must exist in $dir and pass the JSON linter
    // (lint() returns null when it finds no error).
    $assertJsonFiles = function ($dir, array $fileNames) use ($parser) {
        foreach ($fileNames as $fileName) {
            $path = $dir . DIRECTORY_SEPARATOR . $fileName;
            // Fail on the missing file itself rather than on linting `false`.
            $this->assertFileExists($path);
            $this->assertNull($parser->lint(file_get_contents($path)));
        }
    };

    // Every listed file must open in XMLReader and be reported as valid.
    // The reader is closed before asserting so the handle is released even
    // when the validity assertion fails.
    $assertXmlFiles = function ($dir, array $fileNames) {
        foreach ($fileNames as $fileName) {
            $xml = new \XMLReader();
            $this->assertTrue($xml->open($dir . DIRECTORY_SEPARATOR . $fileName));
            $xml->setParserProperty(\XMLReader::VALIDATE, true);
            $isValid = $xml->isValid();
            $xml->close();
            $this->assertTrue($isValid);
        }
    };

    // --- Google: JSON first, then XML ---
    $googleScraper = Builder::create($this->engines[0],
                                     array(array('foo', 'baz'), 'google'));
    $outDir = $googleScraper->getOutDir();
    $this->assertFalse($googleScraper->scrape('bar'));
    $this->assertFalse($googleScraper->scrape('baz', 100));
    $this->assertFalse($googleScraper->scrape('baz', 1, 'baz'));
    $this->assertFalse($googleScraper->scrape('baz', 1, true, 'foobad'));
    $this->assertFalse($googleScraper->scrape('baz', 1, true, 'UTC', 'faz'));
    // Nothing fetched yet, so there is nothing to serialize.
    $this->assertFalse($googleScraper->serialize('json'));
    // Each successful scrape adds two pages and drops the keyword.
    $this->assertTrue($googleScraper->scrape('foo', 2, true, 'Europe/Berlin'));
    $this->assertCount(2, $googleScraper->getFetchedPages());
    $this->assertCount(1, $googleScraper->getKeywords());
    $this->assertTrue($googleScraper->scrape('baz', 2, true));
    $this->assertCount(4, $googleScraper->getFetchedPages());
    $this->assertCount(0, $googleScraper->getKeywords());
    $this->assertFalse($googleScraper->scrapeAll());
    $this->assertTrue($googleScraper->addKeywords(array('foobaz', 'foobar')));
    $this->assertTrue($googleScraper->scrapeAll(2, true, 'America/Los_Angeles'));
    $this->assertCount(8, $googleScraper->getFetchedPages());
    $this->assertCount(0, $googleScraper->getKeywords());
    $this->assertFalse($googleScraper->serialize('baz'));
    $this->assertTrue($googleScraper->serialize('json', true));
    $this->assertCount(0, $googleScraper->getFetchedPages());
    $this->assertCount(8, $googleScraper->getSerializedPages());
    $toCheck = $pendingFileNames($googleScraper);
    $this->assertTrue($googleScraper->save(true));
    $assertJsonFiles($outDir, $toCheck);
    $this->assertTrue($googleScraper->addKeywords(array('foo bad')));
    $this->assertTrue($googleScraper->scrapeAll(3, true));
    $this->assertCount(3, $googleScraper->getFetchedPages());
    $this->assertTrue($googleScraper->serialize('xml', true));
    $this->assertCount(0, $googleScraper->getFetchedPages());
    $this->assertCount(3, $googleScraper->getSerializedPages());
    $toCheck = $pendingFileNames($googleScraper);
    $this->assertTrue($googleScraper->save(true));
    $assertXmlFiles($outDir, $toCheck);

    // --- Ask: XML first, then JSON ---
    $askScraper = Builder::create($this->engines[1],
                                  array(array('foo', 'baz'), 'ask'));
    $outDir = $askScraper->getOutDir();
    $this->assertFalse($askScraper->scrape('bar'));
    $this->assertFalse($askScraper->scrape('baz', 100));
    $this->assertFalse($askScraper->scrape('baz', 1, 'baz'));
    $this->assertFalse($askScraper->scrape('baz', 1, true, 'foobad'));
    $this->assertFalse($askScraper->scrape('baz', 1, true, 'UTC', 'faz'));
    $this->assertTrue($askScraper->scrape('foo', 2, true, 'Europe/Rome'));
    $this->assertCount(2, $askScraper->getFetchedPages());
    $this->assertCount(1, $askScraper->getKeywords());
    $this->assertTrue($askScraper->scrape('baz', 2, true));
    $this->assertCount(4, $askScraper->getFetchedPages());
    $this->assertCount(0, $askScraper->getKeywords());
    $this->assertFalse($askScraper->scrapeAll());
    $this->assertTrue($askScraper->addKeywords(array('foobaz', 'foobar')));
    $this->assertTrue($askScraper->scrapeAll(2, true, 'America/Los_Angeles'));
    $this->assertCount(8, $askScraper->getFetchedPages());
    $this->assertCount(0, $askScraper->getKeywords());
    $this->assertFalse($askScraper->serialize('baz'));
    $this->assertTrue($askScraper->serialize('xml', true));
    $this->assertCount(0, $askScraper->getFetchedPages());
    $this->assertCount(8, $askScraper->getSerializedPages());
    $toCheck = $pendingFileNames($askScraper);
    $this->assertTrue($askScraper->save(true));
    $this->assertCount(0, $askScraper->getSerializedPages());
    $assertXmlFiles($outDir, $toCheck);
    $this->assertTrue($askScraper->addKeywords(array('foobaz')));
    $this->assertTrue($askScraper->scrapeAll(3, true));
    $this->assertTrue($askScraper->serialize('json', true));
    $toCheck = $pendingFileNames($askScraper);
    $this->assertTrue($askScraper->save(true));
    $assertJsonFiles($outDir, $toCheck);

    // --- Bing: JSON first, then XML ---
    $bingScraper = Builder::create($this->engines[2],
                                   array(array('foo', 'baz'), 'bing'));
    $outDir = $bingScraper->getOutDir();
    $this->assertFalse($bingScraper->scrape('bar'));
    $this->assertFalse($bingScraper->scrape('baz', 100));
    $this->assertFalse($bingScraper->scrape('baz', 1, 'baz'));
    $this->assertFalse($bingScraper->scrape('baz', 1, true, 'foobad'));
    $this->assertFalse($bingScraper->scrape('baz', 1, true, 'UTC', 'faz'));
    $this->assertFalse($bingScraper->serialize('json'));
    $this->assertTrue($bingScraper->scrape('foo', 2, true, 'Europe/Berlin'));
    $this->assertCount(2, $bingScraper->getFetchedPages());
    $this->assertCount(1, $bingScraper->getKeywords());
    $this->assertTrue($bingScraper->scrape('baz', 2, true));
    $this->assertCount(4, $bingScraper->getFetchedPages());
    $this->assertCount(0, $bingScraper->getKeywords());
    $this->assertFalse($bingScraper->scrapeAll());
    $this->assertTrue($bingScraper->addKeywords(array('foobaz', 'foobar')));
    $this->assertTrue($bingScraper->scrapeAll(2, true, 'America/Los_Angeles'));
    $this->assertCount(8, $bingScraper->getFetchedPages());
    $this->assertCount(0, $bingScraper->getKeywords());
    $this->assertFalse($bingScraper->serialize('baz'));
    $this->assertTrue($bingScraper->serialize('json', true));
    $this->assertCount(0, $bingScraper->getFetchedPages());
    $this->assertCount(8, $bingScraper->getSerializedPages());
    $toCheck = $pendingFileNames($bingScraper);
    $this->assertTrue($bingScraper->save(true));
    $assertJsonFiles($outDir, $toCheck);
    $this->assertTrue($bingScraper->addKeywords(array('foo bad')));
    $this->assertTrue($bingScraper->scrapeAll(2, true));
    $this->assertCount(2, $bingScraper->getFetchedPages());
    $this->assertTrue($bingScraper->serialize('xml', true));
    $this->assertCount(0, $bingScraper->getFetchedPages());
    $this->assertCount(2, $bingScraper->getSerializedPages());
    $toCheck = $pendingFileNames($bingScraper);
    $this->assertTrue($bingScraper->save(true));
    $assertXmlFiles($outDir, $toCheck);

    // --- Yahoo: XML first, then JSON ---
    $yahooScraper = Builder::create($this->engines[3],
                                    array(array('foo', 'baz'), 'yahoo'));
    $outDir = $yahooScraper->getOutDir();
    $this->assertFalse($yahooScraper->scrape('bar'));
    $this->assertFalse($yahooScraper->scrape('baz', 100));
    $this->assertFalse($yahooScraper->scrape('baz', 1, 'baz'));
    $this->assertFalse($yahooScraper->scrape('baz', 1, true, 'foobad'));
    $this->assertFalse($yahooScraper->scrape('baz', 1, true, 'UTC', 'faz'));
    $this->assertTrue($yahooScraper->scrape('foo', 2, true, 'Europe/Rome'));
    $this->assertCount(2, $yahooScraper->getFetchedPages());
    $this->assertCount(1, $yahooScraper->getKeywords());
    $this->assertTrue($yahooScraper->scrape('baz', 2, true));
    $this->assertCount(4, $yahooScraper->getFetchedPages());
    $this->assertCount(0, $yahooScraper->getKeywords());
    $this->assertFalse($yahooScraper->scrapeAll());
    $this->assertTrue($yahooScraper->addKeywords(array('foobaz', 'foobar')));
    $this->assertTrue($yahooScraper->scrapeAll(2, true, 'America/Los_Angeles'));
    $this->assertCount(8, $yahooScraper->getFetchedPages());
    $this->assertCount(0, $yahooScraper->getKeywords());
    $this->assertFalse($yahooScraper->serialize('baz'));
    $this->assertTrue($yahooScraper->serialize('xml', true));
    $this->assertCount(0, $yahooScraper->getFetchedPages());
    $this->assertCount(8, $yahooScraper->getSerializedPages());
    $toCheck = $pendingFileNames($yahooScraper);
    $this->assertTrue($yahooScraper->save(true));
    $this->assertCount(0, $yahooScraper->getSerializedPages());
    $assertXmlFiles($outDir, $toCheck);
    $this->assertTrue($yahooScraper->addKeywords(array('foobaz')));
    $this->assertTrue($yahooScraper->scrapeAll(3, true));
    $this->assertTrue($yahooScraper->serialize('json', true));
    $toCheck = $pendingFileNames($yahooScraper);
    $this->assertTrue($yahooScraper->save(true));
    $assertJsonFiles($outDir, $toCheck);
}
944 | }
945 |
--------------------------------------------------------------------------------