├── README.md ├── composer.json └── src └── GoogleScraper.php /README.md: -------------------------------------------------------------------------------- 1 | # Google-Scraper 2 | This class can retrieve search results from Google. 3 | 4 | 5 | ## Install the package using composer 6 | 7 | ``` 8 | composer require samay/google-scraper 9 | ``` 10 | 11 | ## Usage 12 | 13 | ``` 14 | <?php 15 | 16 | require 'vendor/autoload.php'; 17 | 18 | use Scraper\GoogleScraper; 19 | 20 | $obj = new GoogleScraper(); 21 | 22 | $arr = $obj->getUrlList(urlencode('car'),''); 23 | 24 | print_r($arr); 25 | ``` 26 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "samay/google-scraper", 3 | "description": "This class can retrieve search results from Google.", 4 | "type": "library", 5 | "license": "MIT", 6 | "authors": [ 7 | { 8 | "name": "Samay Bhavsar", 9 | "email": "samay@samay.info", 10 | "homepage": "https://github.com/samaybhavsar/google-scraper" 11 | } 12 | ], 13 | "minimum-stability": "stable", 14 | "require": {}, 15 | "autoload": { 16 | "psr-4": { 17 | "Scraper\\": "src/" 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/GoogleScraper.php: -------------------------------------------------------------------------------- 1 | 4 | @Version : 1.3 5 | * 6 | * This source file is subject to the MIT license that is bundled 7 | * with this source code in the file LICENSE. 
8 | */ 9 | 10 | namespace Scraper; 11 | 12 | class GoogleScraper 13 | { 14 | var $keyword = "testing"; 15 | var $urlList = array(); 16 | var $time1 = 4000000; 17 | var $time2 = 8000000; 18 | var $proxy = ""; 19 | var $cookie = ""; 20 | var $header = ""; 21 | var $ei = ""; 22 | var $ved = ""; 23 | 24 | 25 | function __construct() { 26 | $this->cookie = tempnam ("/tmp", "cookie"); 27 | $this->headers[] = "Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"; 28 | $this->headers[] = "Connection: keep-alive"; 29 | $this->headers[] = "Keep-Alive: 115"; 30 | $this->headers[] = "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7"; 31 | $this->headers[] = "Accept-Language: en-us,en;q=0.5"; 32 | $this->headers[] = "Pragma: "; 33 | } 34 | 35 | function getpagedata($url) 36 | { 37 | $ch = curl_init(); 38 | curl_setopt($ch, CURLOPT_URL, $url); 39 | curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); 40 | curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); 41 | curl_setopt($ch, CURLOPT_USERAGENT,'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'); 42 | curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate'); 43 | curl_setopt($ch, CURLOPT_HTTPHEADER, $this->headers); 44 | curl_setopt($ch, CURLOPT_COOKIEFILE, $this->cookie); 45 | curl_setopt($ch, CURLOPT_COOKIEJAR, $this->cookie); 46 | curl_setopt($ch, CURLOPT_PROXY, $this->proxy); 47 | curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30); 48 | $data=curl_exec($ch); 49 | curl_close($ch); 50 | return $data; 51 | } 52 | 53 | function pause() { 54 | usleep(rand($this->time1,$this->time2)); 55 | } 56 | 57 | function initGoogle() { 58 | $data=$this->getpagedata('http://www.google.com'); // Open google.com ( Might redirect to country specific site e.g. 
www.google.co.in) 59 | $this->pause(); 60 | $data = $this->getpagedata('http://www.google.com/ncr'); // Moves back to google.com 61 | preg_match('/type="submit" data-ved="(.*?)"/', $data, $matches); 62 | $this->ved = $matches[1]; 63 | } 64 | 65 | function fetchUrlList() 66 | { 67 | for($i=0;$i<=250;$i=$i+10) { 68 | $data = $this->getpagedata('http://www.google.com/search?source=hp&q='.$this->keyword.'&ei='.$this->ei.'&btnK=Google+Search&ved='.$this->ved.'&start='.$i); 69 | 70 | preg_match('/;ei=(.*?)&/', $data, $matches); 71 | if(empty($matches)) 72 | { 73 | preg_match('/;sei=(.*?)"/', $data, $matches); 74 | $this->ei=urlencode($matches[1]); 75 | if(empty($matches)) 76 | { 77 | exit(); 78 | } 79 | } else { 80 | $this->ei=urlencode($matches[1]); 81 | } 82 | 83 | if ($data) { 84 | if(preg_match("/sorry.google.com/", $data)) { 85 | echo "You are blocked"; 86 | exit; 87 | } else { 88 | preg_match_all('@