├── .DS_Store ├── README.md ├── content.sql ├── images └── .DS_Store ├── libs ├── .DS_Store └── class_curl_multi.php └── splider.php /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/woondroo/curl_multi/8063374155fd40ab47ff78abe92de4a09266236e/.DS_Store -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | curl_multi 2 | ========== 3 | 4 | PHP多线程数据采集 5 | 6 | 详细说明:http://blog.eiodesign.com/archives/86 -------------------------------------------------------------------------------- /content.sql: -------------------------------------------------------------------------------- 1 | -- 2 | -- 数据库: `www_curlmulti` 3 | -- 4 | 5 | -- -------------------------------------------------------- 6 | 7 | -- 8 | -- 表的结构 `content` 9 | -- 10 | 11 | CREATE TABLE IF NOT EXISTS `content` ( 12 | `id` int(9) NOT NULL AUTO_INCREMENT, 13 | `meta_title` varchar(500) CHARACTER SET latin1 NOT NULL, 14 | `meta_keywords` text CHARACTER SET latin1 NOT NULL, 15 | `meta_description` text CHARACTER SET latin1 NOT NULL, 16 | `product_name` varchar(500) CHARACTER SET latin1 NOT NULL, 17 | `product_image` varchar(500) CHARACTER SET latin1 NOT NULL, 18 | `product_price` varchar(500) CHARACTER SET latin1 NOT NULL, 19 | `product_description` text CHARACTER SET latin1 NOT NULL, 20 | `product_url` varchar(500) CHARACTER SET latin1 NOT NULL, 21 | UNIQUE KEY `id` (`id`) 22 | ) ENGINE=MyISAM DEFAULT CHARSET=utf8 AUTO_INCREMENT=1 ; 23 | -------------------------------------------------------------------------------- /images/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/woondroo/curl_multi/8063374155fd40ab47ff78abe92de4a09266236e/images/.DS_Store -------------------------------------------------------------------------------- 
/libs/.DS_Store: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/woondroo/curl_multi/8063374155fd40ab47ff78abe92de4a09266236e/libs/.DS_Store
--------------------------------------------------------------------------------
/libs/class_curl_multi.php:
--------------------------------------------------------------------------------
<?php
/**
 * Parallel HTTP fetcher built on PHP's curl_multi API.
 *
 * Usage:
 *   $mhr = new MultiHttpRequest(array('http://a/', 'http://b/'));
 *   $pages = $mhr->start();
 *
 * The class can be extended if needed — e.g. for pages that require a
 * login, perform a curl login first, persist the cookie to a file, and
 * send that cookie with every request to scrape as an authenticated user.
 */
class MultiHttpRequest {
    // List of URLs fetched in parallel by start().
    public $urls = array();
    // Forwarded to CURLOPT_HEADER: 1 to include response headers in results.
    public $curlopt_header = 0;
    // HTTP method name, uppercased. NOTE(review): currently informational
    // only — add_handle() never applies it, so every request goes out as GET.
    public $method = "GET";

    function __construct($urls = false) {
        $this->urls = $urls;
    }

    // Fluent setter for the URL list.
    function set_urls($urls) {
        $this->urls = $urls;
        return $this;
    }

    // Fluent toggle: should responses include their headers?
    function is_return_header($b) {
        $this->curlopt_header = $b;
        return $this;
    }

    // Fluent setter for the HTTP method.
    // FIX: the original assigned to a misspelled property ($this->medthod),
    // so the configured method silently vanished.
    function set_method($m) {
        $this->method = strtoupper($m);
        return $this;
    }

    /**
     * Fetch all configured URLs concurrently.
     *
     * @return array|false Response bodies keyed like $this->urls,
     *                     or false when no URLs were supplied.
     */
    function start() {
        if (!is_array($this->urls) or count($this->urls) == 0) {
            return false;
        }
        $curl = $text = array();
        $handle = curl_multi_init();
        foreach ($this->urls as $k => $v) {
            $curl[$k] = $this->add_handle($handle, $v);
        }

        $this->exec_handle($handle);

        foreach ($this->urls as $k => $v) {
            $text[$k] = curl_multi_getcontent($curl[$k]);
            curl_multi_remove_handle($handle, $curl[$k]);
            curl_close($curl[$k]); // FIX: original never released the easy handles
        }
        curl_multi_close($handle);

        return $text;
    }

    // Create one easy handle for $url and register it on the multi handle.
    private function add_handle($handle, $url) {
        $curl = curl_init();
        curl_setopt($curl, CURLOPT_URL, $url);
        curl_setopt($curl, CURLOPT_HEADER, $this->curlopt_header);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
        curl_multi_add_handle($handle, $curl);
        return $curl;
    }

    // Drive the multi handle until every transfer has completed.
    // FIX: the original do/while spun on curl_multi_exec() at 100% CPU;
    // curl_multi_select() blocks (up to 1 s) until there is socket activity.
    private function exec_handle($handle) {
        $running = null;
        do {
            curl_multi_exec($handle, $running);
            if ($running > 0) {
                curl_multi_select($handle, 1.0);
            }
        } while ($running > 0);
    }

    // Fetch a single URL synchronously (10-second connect timeout).
    public function get_content($url) {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        $result = curl_exec($ch);
        curl_close($ch); // FIX: original leaked this handle
        return $result;
    }
}
--------------------------------------------------------------------------------
/splider.php:
--------------------------------------------------------------------------------
.*?.*?.*?

'; 30 | 31 | //内容页面信息字段表达式 32 | $detail_rules = array( 33 | 'meta_title'=>'(.*?)', 34 | 'meta_keywords'=>'', 35 | 'meta_description'=>'', 36 | 'product_name'=>'

(.*?)

', 37 | 'product_image'=>'
.*?.*?.*?
', 38 | 'product_price'=>'Our Price : (.*?)', 39 | 'product_description'=>'
(.*?)
', 40 | ); 41 | 42 | //实例 43 | $mp = new MultiHttpRequest(); 44 | 45 | //调试使用记录采集条目 46 | $j = 1; 47 | 48 | //每次并发几个链接 49 | $limit = 10; 50 | 51 | // 分页时被跳过的页数 52 | $last_page = 0; 53 | 54 | //开始采集 55 | foreach ($list as $link) { 56 | 57 | //解析列表页数 58 | preg_match_all('/\[(.*)\]/i',$link,$_page); 59 | if($_page[1][0]==''){ 60 | continue; 61 | } 62 | $pages = explode('-',$_page[1][0]); 63 | if(count($pages) != 2){ 64 | continue; 65 | } 66 | 67 | $urls = array(); 68 | 69 | for($i=$pages[0];$i<=$pages[1] || $last_page === 1;$i++){ 70 | if(count($urls) < $limit && $last_page === 0){ 71 | $urls[] = preg_replace('/\[(.*)\]/i',$i,$link); 72 | if($i < $pages[1]){ 73 | continue; 74 | } 75 | } 76 | 77 | // var_dump($urls);echo '
'; 78 | $last_page = 0; 79 | 80 | //采集列表内容 81 | $mp->set_urls($urls); 82 | $contents = $mp->start(); 83 | 84 | foreach ($contents as $content) { 85 | $content = _prefilter($content); 86 | //debug 87 | // exit($content); 88 | 89 | //匹配内容 90 | preg_match_all('/'.str_replace('/', '\/', addslashes($list_rules)).'/i',$content,$pregArr); 91 | 92 | $detail_urls = array(); 93 | foreach($pregArr[1] as $detail_key=>$detail_value){ 94 | $data = array(); 95 | if(count($detail_urls) < $limit ){ 96 | $content_url = $base.$detail_value; 97 | $detail_urls[$content_url] = $content_url; 98 | if($pregArr[1][$detail_key+1] != ''){ 99 | continue; 100 | } 101 | } 102 | 103 | // var_dump($detail_urls);echo '
'; 104 | //continue; 105 | $mp->set_urls($detail_urls); 106 | 107 | $details = $mp->start(); 108 | //图片路径临时存放 109 | $images_urls = array(); 110 | 111 | //采集内容页面 112 | foreach ($details as $url_key=>$detail) { 113 | $detail = _prefilter($detail); 114 | //debug 115 | // exit($detail); 116 | 117 | foreach ($detail_rules as $key => $value) { 118 | 119 | preg_match_all('/'.str_replace('/', '\/', addslashes($value)).'/i',$detail,$detailArr); 120 | 121 | //处理特殊这段信息 122 | switch ($key) { 123 | case 'product_image': 124 | $data[$key] = "images/".md5($detailArr[1][0]).".jpg"; 125 | if(!file_exists($data[$key])){ 126 | $images_urls[$data[$key]] = $base.$detailArr[1][0]; 127 | } 128 | break; 129 | case 'product_description': 130 | $data[$key] = trim(strip_tags($detailArr[1][0])); 131 | break; 132 | default: 133 | $data[$key] = $detailArr[1][0]; 134 | break; 135 | } 136 | 137 | } 138 | 139 | 140 | //产品url 141 | $data['product_url'] = $url_key; 142 | 143 | //转义采集后的数据 144 | foreach ($data as $_k => $_v) { 145 | $data[$_k] = addslashes($_v); 146 | } 147 | 148 | //入库 149 | $r = mysql_query(" 150 | insert into `content` values( 151 | null, 152 | '{$data['meta_title']}', 153 | '{$data['meta_keywords']}', 154 | '{$data['meta_description']}', 155 | '{$data['product_name']}', 156 | '{$data['product_image']}', 157 | '{$data['product_price']}', 158 | '{$data['product_description']}', 159 | '{$data['product_url']}')"); 160 | 161 | //打印log 162 | _flush($j++."|".$r."|".$data['product_name']."
"); 163 | //_flush($data); 164 | } 165 | //远程图片本地化 166 | $mp->set_urls($images_urls); 167 | $images = $mp->start(); 168 | foreach ((array)$images as $image_key => $image_value) { 169 | if (!empty($image_key)) { 170 | _flush("store image:".$image_key."
"); 171 | file_put_contents($image_key,$image_value); 172 | } 173 | } 174 | //清空内容url并加入本次循环url。不然本次会被跳过 175 | $content_url = $base.$detail_value; 176 | $detail_urls = array($content_url=>$content_url); 177 | } 178 | } 179 | //清空内容url并加入本次循环url。不然本次会被跳过 180 | if ($i == $pages[1] && ($pages[1] - $i) % $limit > 0) { 181 | $last_page = 1; 182 | } 183 | $urls = array(preg_replace('/\[(.*)\]/i',$i,$link)); 184 | } 185 | } 186 | 187 | // 输出日志 188 | function _flush($msg) { 189 | print_r ($msg); 190 | ob_flush(); 191 | flush(); 192 | } 193 | 194 | // 对抓去到的内容做简单过滤(过滤空白字符,便于正则匹配) 195 | function _prefilter($output) { 196 | $output=preg_replace("/\/\/[\S\f\t\v ]*?;[\r|\n]/", "", $output); 197 | $output=preg_replace("/\<\!\-\-[\s\S]*?\-\-\>/", "", $output); 198 | $output=preg_replace("/\>[\s]+\<", $output); 199 | $output=preg_replace("/;[\s]+/", ";", $output); 200 | $output=preg_replace("/[\s]+\}/", "}", $output); 201 | $output=preg_replace("/}[\s]+/", "}", $output); 202 | $output=preg_replace("/\{[\s]+/", "{", $output); 203 | $output=preg_replace("/([\s]){2,}/", "$1", $output); 204 | $output=preg_replace("/[\s]+\=[\s]+/", "=", $output); 205 | return $output; 206 | } 207 | ?> --------------------------------------------------------------------------------