├── README ├── testdata_num.txt ├── testdata_urls.txt ├── cliargs.php ├── parallelcurl.php └── buzzprofilecrawl.php /README: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petewarden/buzzprofilecrawl/HEAD/README -------------------------------------------------------------------------------- /testdata_num.txt: -------------------------------------------------------------------------------- 1 | 114482883275883376711 2 | 110180527922237151490 3 | 100722937543579674532 4 | 111343326546693401371 5 | 100023042519328780763 6 | 107298427506852802199 7 | 115624021056418704315 8 | 109213761823894503198 9 | 117827376991976059856 10 | 112036812458594362599 11 | -------------------------------------------------------------------------------- /testdata_urls.txt: -------------------------------------------------------------------------------- 1 | http://www.google.com/profiles/115863474911002159675 2 | http://www.google.com/profiles/102563990701473154621 3 | http://www.google.com/profiles/100448244799026528799 4 | http://www.google.com/profiles/100503905368015720339 5 | http://www.google.com/profiles/117913250525659179200 6 | http://www.google.com/profiles/103228038539082456359 7 | http://www.google.com/profiles/104251208055103372299 8 | http://www.google.com/profiles/100359947051490088968 9 | http://www.google.com/profiles/104287775546672278556 10 | http://www.google.com/profiles/113509451750336198219 11 | http://www.google.com/profiles/104945601995852006424 12 | http://www.google.com/profiles/106664725926862859359 -------------------------------------------------------------------------------- /cliargs.php: -------------------------------------------------------------------------------- 1 | ' => array( 8 | // 'short' => '', 9 | // 'type' => <'switch' | 'optional' | 'required'>, 10 | // 'description' => '', 11 | // 'default' => '', 12 | // ), 13 | // ... 
// Command line argument parsing helpers.
//
// Each entry of the argument description array is keyed by the long argument name and
// holds a 'short' name, a 'type', a 'description' and (for optional arguments) a 'default'.
//
// If the type is 'switch', then the result is a boolean that will be false if it's
// not present, or true if it is
//
// If the type is 'optional', then the result will be the default if it's not present
//
// If the type is 'required', then the script will print out the usage and exit if it's
// not found
//
// To use, call cliargs_get_options() with the array of argument descriptions.
// The result will be an array with the argument names as keys to the found values
//
// by Pete Warden, http://petewarden.typepad.com but freely reusable with no restrictions

/**
 * Prints one usage line per described argument and then terminates the script.
 *
 * @param array $cliargs Argument descriptions keyed by long name; each entry must
 *                       provide 'short', 'type' and 'description'.
 * @return void This function never returns; it always calls exit().
 */
function cliargs_print_usage_and_exit($cliargs)
{
    print "Usage:\n";

    foreach ($cliargs as $long => $arginfo)
    {
        $short = $arginfo['short'];
        $type = $arginfo['type'];
        $description = $arginfo['description'];

        $required = ($type === 'required');
        $takes_value = ($required || $type === 'optional');

        print "-$short/--$long ";

        // Make it visible which arguments expect a value after them
        if ($takes_value)
            print "<value> ";

        print ": $description";

        if ($required)
            print " (required)";

        print "\n";
    }

    exit();
}

/**
 * Tells whether $source begins with $prefix (byte-wise, case-sensitive).
 *
 * @param string $source String to inspect.
 * @param string $prefix Prefix to look for; an empty prefix always matches.
 * @return bool
 */
function cliargs_strstartswith($source, $prefix)
{
    return strncmp($source, $prefix, strlen($prefix)) == 0;
}

/**
 * Parses the script's command line (the global $argv/$argc) against $cliargs.
 *
 * Accepted forms are "--long value", "--long=value", "-s value" and "-s=value";
 * switches take no value. Argument names are matched case-insensitively, values and
 * positional arguments are returned exactly as typed.
 *
 * On an unknown argument, a missing value or a missing required argument, a message
 * and the usage are printed and the script exits. An 'optional' description without a
 * 'default' entry terminates the script with die().
 *
 * @param array $cliargs Argument descriptions keyed by long name.
 * @return array Found values keyed by long name, plus an 'unnamed' list holding the
 *               positional arguments in order.
 */
function cliargs_get_options($cliargs)
{
    global $argv;
    global $argc;

    $options = array('unnamed' => array());

    for ($index = 1; $index < $argc; $index += 1)
    {
        $rawarg = $argv[$index];

        // Anything not introduced by a dash is positional; keep it untouched
        if (!cliargs_strstartswith($rawarg, '-'))
        {
            $options['unnamed'][] = $rawarg;
            continue;
        }

        // Split once only, so that a value may itself contain '='
        $argparts = explode('=', $rawarg, 2);

        // Only the name is case-insensitive; the value must keep its case
        $namepart = strtolower($argparts[0]);

        if (cliargs_strstartswith($namepart, '--'))
        {
            $longname = (string)substr($namepart, 2);
        }
        else
        {
            // Translate a short name to its long name; unknown ones fall through
            // unchanged and are reported below
            $shortname = (string)substr($namepart, 1);
            $longname = $shortname;
            foreach ($cliargs as $name => $info)
            {
                if ($shortname === $info['short'])
                {
                    $longname = $name;
                    break;
                }
            }
        }

        if (!isset($cliargs[$longname]))
        {
            print "Unknown argument '$longname'\n";
            cliargs_print_usage_and_exit($cliargs);
        }

        $argtype = $cliargs[$longname]['type'];
        if ($argtype === 'switch')
        {
            $value = true;
        }
        else if (isset($argparts[1]))
        {
            $value = $argparts[1];
        }
        else if (($index + 1) < $argc)
        {
            // The value is the next command line token; consume it
            $index += 1;
            $value = $argv[$index];
        }
        else
        {
            print "Missing value after '$longname'\n";
            cliargs_print_usage_and_exit($cliargs);
        }

        $options[$longname] = $value;
    }

    // Fill in whatever was not given on the command line
    foreach ($cliargs as $longname => $arginfo)
    {
        if (isset($options[$longname]))
            continue;

        $type = $arginfo['type'];

        if ($type === 'required')
        {
            print("Missing required value for '$longname'\n");
            cliargs_print_usage_and_exit($cliargs);
        }
        else if ($type === 'optional')
        {
            // array_key_exists so that an explicit null default is accepted
            if (!array_key_exists('default', $arginfo))
                die('Missing default value for '.$longname);

            $options[$longname] = $arginfo['default'];
        }
        else if ($type === 'switch')
        {
            $options[$longname] = false;
        }
    }

    return $options;
}
// This class is designed to make it easy to run multiple curl requests in parallel.
// Typical use:
//
// $parallelcurl = new ParallelCurl(10);
// $parallelcurl->startRequest('http://example.com', 'on_request_done', array('something'));
//
// The first argument is the address that should be fetched
// The second is the callback function that will be run once the request is done
// The third is a 'cookie', that can contain arbitrary data to be passed to the callback
//
// This startRequest call will return immediately, as long as less than the maximum number of
// requests are outstanding. Once the request is done, the callback function will be called, eg:
//
// on_request_done($content, 'http://example.com', $ch, array('something'));
//
// The callback should take four arguments. The first is a string containing the content found at
// the URL. The second is the original URL requested, the third is the curl handle of the request that
// can be queried to get the results, and the fourth is the arbitrary 'cookie' value that you
// associated with this object. This cookie contains user-defined data.
//
// Since you may have requests outstanding at the end of your script, you *MUST* call
//
// $parallelcurl->finishAllRequests();
//
// before you exit. If you don't, the final requests may be left unprocessed!
//
// By Pete Warden, freely reusable, see http://petewarden.typepad.com for more

class ParallelCurl {

    // Maximum number of requests allowed in flight before startRequest() blocks
    public $max_requests;
    // Options applied to every request, in the format of curl_setopt_array()
    public $options;

    // Requests in flight, keyed by handle id; each entry holds 'url', 'callback', 'user_data'
    public $outstanding_requests;
    // The curl multi handle that all requests are attached to
    public $multi_handle;

    public function __construct($in_max_requests = 10, $in_options = array()) {
        $this->max_requests = $in_max_requests;
        $this->options = $in_options;

        $this->outstanding_requests = array();
        $this->multi_handle = curl_multi_init();
    }

    // Sets how many requests can be outstanding at once before we block and wait for one to
    // finish before starting the next one
    public function setMaxRequests($in_max_requests) {
        $this->max_requests = $in_max_requests;
    }

    // Sets the options to pass to curl, using the format of curl_setopt_array()
    public function setOptions($in_options) {
        $this->options = $in_options;
    }

    // Start a fetch from the $url address, calling the $callback function passing the optional
    // $user_data value. The callback should accept 4 arguments: the fetched content, the url,
    // the curl handle and the user data, eg on_request_done($content, $url, $ch, $user_data);
    // Blocks first if the maximum number of requests is already outstanding.
    public function startRequest($url, $callback, $user_data = array()) {

        $this->waitForOutstandingRequestsToDropBelow($this->max_requests);

        $ch = curl_init();
        curl_setopt_array($ch, $this->options);
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);

        curl_multi_add_handle($this->multi_handle, $ch);

        $this->outstanding_requests[$this->handleKey($ch)] = array(
            'url' => $url,
            'callback' => $callback,
            'user_data' => $user_data,
        );

        $this->checkForCompletedRequests();
    }

    // You *MUST* call this function at the end of your script. It waits for any running requests
    // to complete, and calls their callback functions
    public function finishAllRequests() {
        $this->waitForOutstandingRequestsToDropBelow(1);
    }

    // Returns an integer usable as an array key for a curl handle. Handles are resources
    // up to PHP 7 (the cast yields the resource id) and objects from PHP 8 on, where they
    // cannot be used as array keys directly.
    private function handleKey($ch) {
        return is_object($ch) ? spl_object_id($ch) : (int)$ch;
    }

    // Checks to see if any of the outstanding requests have finished, and runs the callback
    // of each one that has. Does not block.
    private function checkForCompletedRequests() {

        // Always give curl a chance to make progress first. Asking curl_multi_select()
        // beforehand can report -1 before any transfer has been started, which would
        // leave the requests idle forever.
        do {
            $mrc = curl_multi_exec($this->multi_handle, $active);
        } while ($mrc == CURLM_CALL_MULTI_PERFORM);

        // Now grab the information about the completed requests
        while ($info = curl_multi_info_read($this->multi_handle)) {

            $ch = $info['handle'];
            $key = $this->handleKey($ch);

            if (!isset($this->outstanding_requests[$key])) {
                die("Error - handle wasn't found in requests: '$key' in ".
                    print_r($this->outstanding_requests, true));
            }

            $request = $this->outstanding_requests[$key];

            // Drop the bookkeeping entry before running the callback, so that a callback
            // which starts a new request does not count this finished one as outstanding
            unset($this->outstanding_requests[$key]);

            $content = curl_multi_getcontent($ch);
            $callback = $request['callback'];

            $callback($content, $request['url'], $ch, $request['user_data']);

            curl_multi_remove_handle($this->multi_handle, $ch);

            // Release the handle where it is still a resource (PHP 7 and earlier);
            // object handles are freed automatically
            if (is_resource($ch))
                curl_close($ch);
        }

    }

    // Blocks until there's less than the specified number of requests outstanding
    private function waitForOutstandingRequestsToDropBelow($max)
    {
        // A limit below one could never be satisfied and would loop forever
        $limit = max(1, (int)$max);

        while (count($this->outstanding_requests) >= $limit)
        {
            $this->checkForCompletedRequests();

            if (count($this->outstanding_requests) >= $limit)
            {
                // Wait for network activity (at most a second) instead of sleeping blindly;
                // back off briefly if select itself fails so we never spin
                if (curl_multi_select($this->multi_handle, 1.0) === -1)
                    usleep(100000);
            }
        }
    }

}
26 | 27 | The output is in the form <user id> <JSON profile data>, eg: 28 | 29 | 106664725926862859359 {"user_name":"searchbrowser","name":"Pete Warden","portrait_url":"\/s2\/photos\/public\/AIbEiAIAAABDCN_Y_J-1nfe-XCILdmNhcmRfcGhvdG8qKDdkYTYyODgxMTAzYjg0OGUzODAzNjM1OTUxMzgxMWVhNjY3MzdlZDgwAUQ6MaRMKXz3oZLOOF-uOVBoUoqx","location":"Boulder, CO","location_born":"Cambridge, UK","employment_history":["Apple"],"education_history":["University of Manchester"],"links":["http:\/\/petewarden.typepad.com\/"],"title":"Software Engineer","organization":"Mailana Inc","location_history":["Dundee, Scotland","Los Angeles, CA"]} 30 | 31 | (c) Pete Warden http://petewarden.typepad.com/ Jan 8th 2010 32 | 33 | Redistribution and use in source and binary forms, with or without modification, are 34 | permitted provided that the following conditions are met: 35 | 36 | 1. Redistributions of source code must retain the above copyright notice, this 37 | list of conditions and the following disclaimer. 38 | 2. Redistributions in binary form must reproduce the above copyright notice, this 39 | list of conditions and the following disclaimer in the documentation and/or 40 | other materials provided with the distribution. 41 | 3. The name of the author may not be used to endorse or promote products derived 42 | from this software without specific prior written permission. 43 | 44 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, 45 | INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 46 | FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 47 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 48 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 49 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 50 | WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 51 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY 52 | OF SUCH DAMAGE. 53 | 54 | */ 55 | 56 | require_once('cliargs.php'); 57 | require_once('parallelcurl.php'); 58 | 59 | define('SOURCE_USER_ID_RE', '@http://www.google.com/profiles/([0-9]+)@'); 60 | define('SOURCE_USER_NAME_RE', '@http://www.google.com/profiles/([^/]+)@'); 61 | 62 | // These are the REs used to extract the information from the raw HTML. Most of the 63 | // elements are defined by the hCard microformat, for more details see 64 | // http://microformats.org/wiki/hcard 65 | define('NAME_RE', '@([^<]*)@'); 66 | define('PORTRAIT_RE', '@([^<]*)@'); 68 | define('TITLE_RE', '@

([^<]*) at@'); 69 | define('SCHOOL_RE', '@([^<]*)@'); 70 | define('LOCATION_BORN_RE', '@

Where I grew up
([^<]*)
@'); 71 | define('WORK_HISTORY_RE', '@
Companies I've worked for
([^<]*)
@'); 72 | define('EDUCATION_HISTORY_RE', '@
Schools I've attended
([^<]*)
@'); 73 | define('OTHER_NAME_RE', '@
Other names
([^<]*)
@'); 74 | define('LINKS_RE', '@