├── cache └── .gitignore ├── README.markdown └── index.php /cache/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | Important People 2 | ================ 3 | 4 | This script grabs a set of Twitter search results and retrieves the author's follower and update counts. Results are color coded and can be downloaded as a CSV. 5 | 6 | BTW: I wrote this in about half an hour, so there's very little (read: no) error checking. If the fail whale were to rear its ugly head, bad things may happen. 7 | 8 | Based on an idea from [@mulls](http://twitter.com/mulls). 9 | -------------------------------------------------------------------------------- /index.php: -------------------------------------------------------------------------------- 1 | 50) $pages_to_fetch = 50; 11 | if($pages_to_fetch < 1) $pages_to_fetch = 1; 12 | 13 | if(isset($_GET['q'])) 14 | { 15 | $tweets = array(); 16 | $users = array(); 17 | $q = urlencode($_GET['q']); 18 | $page = 1; 19 | $url = "http://search.twitter.com/search.atom?q=$q&rpp=100&page=$page"; 20 | 21 | $pages = 0; 22 | while($url && $pages++ < $pages_to_fetch) 23 | { 24 | // Note: we're caching search results. Feel free to turn off if you want real-time results. 
25 | $xmlstr = geturl($url); 26 | $xml = simplexml_load_string($xmlstr); 27 | foreach($xml->entry as $tweet) 28 | { 29 | $user = array_shift(explode(' ', (string) $tweet->author->name)); 30 | if((strtolower($user) != strtolower($_GET['q'])) && (strtolower("@$user") != strtolower($_GET['q']))) 31 | { 32 | $tweets[] = array('msg' => (string) $tweet->content, 33 | 'user' => $user, 34 | 'link' => (string) $tweet->link[0]['href'], 35 | 'dt' => (string) $tweet->published); 36 | 37 | // Only fetch server-side user data when doing a csv dump 38 | if(isset($_GET['csv']) && !isset($users[$user])) 39 | $users[$user] = user_info($user); 40 | } 41 | } 42 | 43 | // Look for another page of results 44 | $url = false; 45 | foreach($xml->link as $link) 46 | { 47 | if((string) $link['rel'] == 'next') 48 | $url = (string) $link['href']; 49 | } 50 | } 51 | 52 | if(isset($_GET['csv'])) 53 | { 54 | header("Cache-Control: must-revalidate, post-check=0, pre-check=0"); 55 | header("Content-Disposition: attachment; filename=" . $_GET['q'] . ".csv"); 56 | header("Content-Type: text/csv"); 57 | 58 | echo "message,username,permalink,timestamp,following,followers,updates\n"; 59 | $fp = fopen('php://output', 'w'); 60 | foreach($tweets as $t) 61 | { 62 | unset($users[$t['user']]['username']); 63 | $arr = array_merge($t, $users[$t['user']]); 64 | fputcsv($fp, $arr); 65 | } 66 | fclose($fp); 67 | exit; 68 | } 69 | } 70 | 71 | // Check our current API limit 72 | // $xmlstr = geturl('http://twitter.com/account/rate_limit_status.xml'); 73 | // $xml = simplexml_load_string($xmlstr); 74 | // $api_calls_remaining = (string) $xml->{'remaining-hits'}; 75 | // $reset_time = local_time((string) $xml->{'reset-time'}); 76 | 77 | // END OF THE MAIN SCRIPT 78 | // JUST HELPER FUNCTIONS BELOW 79 | 80 | function geturl($url, $cache = true) 81 | { 82 | // Please be nice and cache your requests! 83 | if($cache) 84 | { 85 | $fn = 'cache/' . 
md5($url); 86 | if(file_exists($fn) && filemtime($fn) > (time() - 3600) && !isset($_GET['cb'])) 87 | return file_get_contents($fn); 88 | } 89 | 90 | $ch = curl_init(); 91 | curl_setopt($ch, CURLOPT_URL, $url); 92 | curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); 93 | curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5); 94 | $data = curl_exec($ch); 95 | curl_close($ch); 96 | 97 | if($cache && !file_put_contents($fn, $data)) 98 | die("Caching is turned on, but it doesn't appear that your cache directory is writable. Tried to write to '$fn'."); 99 | 100 | return $data; 101 | } 102 | 103 | // Let's face it. Twitter's API is often shitty and they limit you to 100 requests per hour. 104 | // With that in mind, I offer three solutions for grabbing user data, which can be controlled 105 | // via the $stupid_twitter parameter. They are... 106 | // 107 | // 1) Scrape the data directly from Twitter's website 108 | // 2) Pull the data using YQL from Yahoo!, which handles caching for us in case twitter.com goes down 109 | // 3) Use Twitter's API directly 110 | // 111 | // I'd suggest using YQL as it's more reliable than the API and a bit nicer on their servers than raw scraping. 
112 | function user_info($username, $stupid_twitter = 'yql') 113 | { 114 | if($stupid_twitter === 'scrape') // Scrape public website 115 | { 116 | // Please leave the cache turned on so we're at least scraping Twitter "nicely" 117 | $html = geturl("http://twitter.com/$username"); 118 | $followers = match('/follower_count.*?([0-9,]+)/ms', $html, 1); 119 | $updates = match('/update_count.*?([0-9,]+)/ms', $html, 1); 120 | 121 | $followers = preg_replace('/[^0-9]/', '', $followers); 122 | $updates = preg_replace('/[^0-9]/', '', $updates); 123 | 124 | return array('followers' => $followers, 'updates' => $updates, 'username' => $username); 125 | } 126 | elseif($stupid_twitter === 'yql') // Go through YQL 127 | { 128 | // YQL: select * from html where url="http://twitter.com/tylerhall/" and xpath='//span[contains(@class, "_count")]' 129 | $user = urlencode($username); 130 | $url = "http://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20html%20where%20url%3D%22http%3A%2F%2Ftwitter.com%2F" . $user . "%2F%22%20and%20xpath%3D'%2F%2Fspan%5Bcontains(%40class%2C%20%22_count%22)%5D'&format=xml"; 131 | $xmlstr = geturl($url); 132 | $uxml = simplexml_load_string($xmlstr); 133 | 134 | $following = preg_replace('/[^0-9]/', '', (string) $uxml->results->span[0]); 135 | $followers = preg_replace('/[^0-9]/', '', (string) $uxml->results->span[1]); 136 | $updates = preg_replace('/[^0-9]/', '', (string) $uxml->results->span[2]); 137 | return array('following' => $following, 'followers' => $followers, 'updates' => $updates, 'username' => $username); 138 | } 139 | else // Use Twitter's API 140 | { 141 | $xmlstr = geturl("http://twitter.com/users/show/" . urlencode($user) . 
".xml"); 142 | $uxml = simplexml_load_string($xmlstr); 143 | return array('followers' => (string) $uxml->followers_count, 'updates' => (string) $uxml->statuses_count, 'username' => $username); 144 | } 145 | } 146 | 147 | // Simple wrapper around preg_match() 148 | function match($regex, $str, $i = 0) 149 | { 150 | if(preg_match($regex, $str, $match) == 1) 151 | return $match[$i]; 152 | else 153 | return false; 154 | } 155 | 156 | // Convert a timestamp or date formatted string to a local time 157 | // I'm sure there's some obscure PHP function I just don't know about 158 | // that already does this. Right? 159 | function local_time($dt, $format = 'm/d/Y g:ia') 160 | { 161 | if(ctype_digit($dt) !== true) 162 | $dt = strtotime($dt); 163 | 164 | $arr = localtime($dt, true); 165 | $local_time = date($format, mktime($arr['tm_hour'], $arr['tm_min'], $arr['tm_sec'], $arr['tm_mon'] + 1, $arr['tm_mday'], $arr['tm_year'])); 166 | return $local_time; 167 | } 168 | ?> 169 | 171 | 172 | 173 |
174 | 175 | 176 |During a conversation at work, @mulls wanted a quick way to see who are the most influential people tweeting about a specific topic. @chadauld and I came up with the simple metric of ranking users by their follower count. And this is the result.
317 | 318 | 319 |Users we consider influential are colored bright yellow. Mid-range users are pink, gradating down to users that no one cares about in white, and ordered by date — most recent on top.
320 | 321 |For the math nerds in the house, we deem important users to be those where
322 |followers > μ + σ / 2323 |
and un-important users are where
324 |followers < μ - σ / 4325 | 326 |
That said, given more time, we should probably develop a better metric. Take into account their updates per day, or how many followers *their* followers have. Something like that.
327 | 328 |Anyway, because Twitter's API occasionally doesn't respond quickly enough, we load the follower counts on the client side by making callbacks to Yahoo!'s YQL service, which pulls the data for us and also serves as a proxy for when Twitter goes down.
329 | 330 |This was written in a couple hours, so don't hate too much. (We know it doesn't currently work in Internet Explorer.) Feel free to download and run it on your own box.
331 | 332 |Update 4/18: We've added an experimental new feature which attempts to highlight (er...ignore) users that we detect are spammy. Our reason for doing this is to try and filter out users that are merely contributing to Twitter's noise. You'll see that these users are greyed out. And here's the formula we're using to detect them:
333 |(followers > μ + σ / 4) && (following > followers * 0.75)334 | 335 | 336 |
Do you find this hack useful? We'd love to get your feedback as we're thinking of incorporating this functionality directly into Sideline. Let us know! Feedback is welcome either at @tylerhall, @chadauld, or @ysideline.
339 | 340 || Users | 361 |Followers | 362 |Following | 363 |Updates | 364 |Date ↓ | 365 |Message | 366 |
|---|---|---|---|---|---|
| 372 | | - | 373 |- | 374 |- | 375 |376 | | 377 | |