├── .DS_Store ├── content.sql ├── images └── .DS_Store ├── libs └── class_curl_multi.php └── splider.php /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyhiving/MultiHttpRequest/fe20c9376a5ff02948cfe0a29983b1f15fe69e8d/.DS_Store -------------------------------------------------------------------------------- /content.sql: -------------------------------------------------------------------------------- 1 | -- phpMyAdmin SQL Dump 2 | -- version 3.5.0 3 | -- http://www.phpmyadmin.net 4 | -- 5 | -- 主机: localhost 6 | -- 生成日期: 2012 年 05 月 22 日 10:48 7 | -- 服务器版本: 5.1.44 8 | -- PHP 版本: 5.3.1 9 | 10 | SET SQL_MODE="NO_AUTO_VALUE_ON_ZERO"; 11 | SET time_zone = "+00:00"; 12 | 13 | 14 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 15 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 16 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 17 | /*!40101 SET NAMES utf8 */; 18 | 19 | -- 20 | -- 数据库: `test` 21 | -- 22 | 23 | -- -------------------------------------------------------- 24 | 25 | -- 26 | -- 表的结构 `content` 27 | -- 28 | 29 | CREATE TABLE IF NOT EXISTS `content` ( 30 | `id` int(9) NOT NULL AUTO_INCREMENT, 31 | `meta_title` varchar(255) CHARACTER SET latin1 NOT NULL, 32 | `meta_keywords` varchar(255) CHARACTER SET latin1 NOT NULL, 33 | `meta_description` varchar(255) CHARACTER SET latin1 NOT NULL, 34 | `product_name` varchar(255) CHARACTER SET latin1 NOT NULL, 35 | `product_image` varchar(255) CHARACTER SET latin1 NOT NULL, 36 | `product_price` varchar(255) CHARACTER SET latin1 NOT NULL, 37 | `product_description` text CHARACTER SET latin1 NOT NULL, 38 | `product_url` varchar(255) CHARACTER SET latin1 NOT NULL, 39 | UNIQUE KEY `id` (`id`) 40 | ) ENGINE=MyISAM DEFAULT CHARSET=utf8 AUTO_INCREMENT=1 ; 41 | 42 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 43 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 44 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 45 | -------------------------------------------------------------------------------- /images/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyhiving/MultiHttpRequest/fe20c9376a5ff02948cfe0a29983b1f15fe69e8d/images/.DS_Store -------------------------------------------------------------------------------- /libs/class_curl_multi.php: -------------------------------------------------------------------------------- 1 | start(); 9 | 10 | * ======================== 11 | */ 12 | class MultiHttpRequest { 13 | public $urls = array(); 14 | public $curlopt_header = 0; 15 | public $method = "GET"; 16 | 17 | function __construct($urls = false) { 18 | $this->urls = $urls; 19 | } 20 | 21 | function set_urls($urls) { 22 | $this->urls = $urls; 23 | return $this; 24 | } 25 | 26 | function is_return_header($b) { 27 | $this->curlopt_header = $b; 28 | return $this; 29 | } 30 | 31 | function set_method($m) { 32 | $this->medthod = strtoupper($m); 33 | return $this; 34 | } 35 | 36 | function start() { 37 | if(!is_array($this->urls) or count($this->urls) == 0){ 38 | return false; 39 | } 40 | $curl = $text = array(); 41 | $handle = curl_multi_init(); 42 | foreach($this->urls as $k=>$v){ 43 | $curl[$k] = $this->add_handle($handle, $v); 44 | } 45 | 46 | $this->exec_handle($handle); 47 | foreach($this->urls as $k=>$v){ 48 | //curl_multi_getcontent($curl[$k]); 49 | //echo $curl[$k]."\n"; 50 | $text[$k] = curl_multi_getcontent($curl[$k]); 51 | //echo $text[$k], "\n\n"; 52 | curl_multi_remove_handle($handle, $curl[$k]); 53 | } 54 | curl_multi_close($handle); 55 | 56 | return $text; 57 | } 58 | 59 | private function add_handle($handle, $url) { 60 | $curl = curl_init(); 61 | curl_setopt($curl, CURLOPT_URL, $url); 62 | 63 | curl_setopt($curl, CURLOPT_HEADER, $this->curlopt_header); 64 | curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1); 65 | curl_multi_add_handle($handle, $curl); 66 | return $curl; 67 | } 68 | 69 | private function exec_handle($handle) { 70 | $flag = null; 71 | do { 72 | curl_multi_exec($handle, $flag); 73 | } while ($flag > 0); 74 | } 75 | 76 | public function get_content($url){ 77 | $ch = curl_init(); 78 | curl_setopt ($ch, CURLOPT_URL, $url); 79 | curl_setopt ($ch, CURLOPT_RETURNTRANSFER, 1); 80 | curl_setopt ($ch, CURLOPT_CONNECTTIMEOUT,10); 81 | return curl_exec($ch); 82 | } 83 | } -------------------------------------------------------------------------------- /splider.php: -------------------------------------------------------------------------------- 1 | .*'; 34 | 35 | //内容页面信息字段表达式 36 | $detail_rules = array( 37 | 'meta_title'=>'(.*)', 38 | 'meta_keywords'=>'', 39 | 'meta_description'=>'', 40 | 'product_name'=>'

(.*)

', 41 | 'product_image'=>'
.*.*.*
', 42 | 'product_price'=>'Our Price : (.*)', 43 | 'product_description'=>'
(.*)
', 44 | ); 45 | 46 | //实例 47 | $mp = new MultiHttpRequest(); 48 | 49 | //调试使用记录采集条目 50 | $j = 1; 51 | 52 | //每次并发几个链接 53 | $limit = 10; 54 | 55 | //开始采集 56 | foreach ($list as $link) { 57 | 58 | //解析列表页数 59 | preg_match_all('#\[(.*)\]#isU',$link,$_page); 60 | if($_page[1][0]==''){ 61 | continue; 62 | } 63 | $pages = explode('-',$_page[1][0]); 64 | if(count($pages) != 2){ 65 | continue; 66 | } 67 | 68 | $urls = array(); 69 | 70 | for($i=$pages[0];$i<=$pages[1];$i++){ 71 | if(count($urls) < $limit){ 72 | $urls[] = preg_replace('#\[(.*)\]#isU',$i,$link); 73 | if($i != $pages[1]){ 74 | continue; 75 | } 76 | } 77 | //采集列表内容 78 | $mp->set_urls($urls); 79 | $contents = $mp->start(); 80 | 81 | foreach ($contents as $content) { 82 | 83 | $content = _prefilter($content); 84 | //debug 85 | //exit($content); 86 | 87 | //匹配内容 88 | preg_match_all('#'.addslashes($list_rules).'#isU',$content,$pregArr); 89 | 90 | $detail_urls = array(); 91 | foreach($pregArr[1] as $detail_key=>$detail_value){ 92 | $data = array(); 93 | if(count($detail_urls) < $limit ){ 94 | $detail_urls[] = $base.$detail_value; 95 | if($pregArr[1][$detail_key+1] != ''){ 96 | continue; 97 | } 98 | } 99 | 100 | //print_r($detail_urls); 101 | //continue; 102 | $mp->set_urls($detail_urls); 103 | 104 | $details = $mp->start(); 105 | //图片路径临时存放 106 | $images_urls = array(); 107 | 108 | //采集内容页面 109 | foreach ($details as $detail) { 110 | $detail = _prefilter($detail); 111 | //debug 112 | exit($detail); 113 | 114 | foreach ($detail_rules as $key => $value) { 115 | 116 | preg_match_all('#'.addslashes($value).'#isU',$detail,$detailArr); 117 | //处理特殊这段信息 118 | switch ($key) { 119 | case 'product_image': 120 | $data[$key] = "images/".md5($detailArr[1][0]).".jpg"; 121 | if(!file_exists($data[$key])){ 122 | $images_urls[$data[$key]] = $base.$detailArr[1][0]; 123 | //file_put_contents($data[$key],$mp->get_content($base.$detailArr[1][0])); 124 | } 125 | break; 126 | case 'product_description': 127 | $data[$key] = trim(strip_tags($detailArr[1][0])); 128 | break; 129 | default: 130 | $data[$key] = $detailArr[1][0]; 131 | break; 132 | } 133 | 134 | } 135 | 136 | 137 | //产品url 138 | $data['product_url'] = _title($data['product_name']); 139 | //转义采集后的数据 140 | foreach ($data as $_k => $_v) { 141 | $data[$_k] = addslashes($_v); 142 | } 143 | //入库 144 | $r = mysql_query(" 145 | insert into `content` values( 146 | null, 147 | '{$data['meta_title']}', 148 | '{$data['meta_keywords']}', 149 | '{$data['meta_description']}', 150 | '{$data['product_name']}', 151 | '{$data['product_image']}', 152 | '{$data['product_price']}', 153 | '{$data['product_description']}', 154 | '{$data['product_url']}')"); 155 | //打印log 156 | _flush($j++."|".$r."|".$data['product_name']."\n"); 157 | //_flush($data); 158 | } 159 | //远程图片本地化 160 | $mp->set_urls($images_urls); 161 | $images = $mp->start(); 162 | foreach ((array)$images as $image_key => $image_value) { 163 | _flush($image_key."\n"); 164 | file_put_contents($image_key,$image_value); 165 | } 166 | //清空内容url并加入本次循环url。不然本次会被跳过 167 | $detail_urls = array($base.$detail_value); 168 | } 169 | } 170 | //清空内容url并加入本次循环url。不然本次会被跳过 171 | $urls = array(preg_replace('#\[(.*)\]#isU',$i,$link)); 172 | } 173 | } 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | //========================================================= 182 | function _flush($msg) 183 | { 184 | print_r ($msg); 185 | ob_flush(); 186 | flush(); 187 | } 188 | 189 | 190 | function _title($title, $separator = '-' ) 191 | { 192 | $title = preg_replace('![^'.preg_quote($separator).'\pL\pN\s]+!u', '', strtolower($title)); 193 | $title = preg_replace('!['.preg_quote($separator).'\s]+!u', $separator, $title); 194 | return trim($title, $separator); 195 | } 196 | 197 | 198 | function _prefilter($output) { 199 | $output=preg_replace("/\/\/[\S\f\t\v ]*?;[\r|\n]/","",$output); 200 | $output=preg_replace("/\<\!\-\-[\s\S]*?\-\-\>/","",$output); 201 | $output=preg_replace("/\>[\s]+\<",$output); 202 | $output=preg_replace("/;[\s]+/",";",$output); 203 | $output=preg_replace("/[\s]+\}/","}",$output); 204 | $output=preg_replace("/}[\s]+/","}",$output); 205 | $output=preg_replace("/\{[\s]+/","{",$output); 206 | $output=preg_replace("/([\s]){2,}/","$1",$output); 207 | $output=preg_replace("/[\s]+\=[\s]+/","=",$output); 208 | return $output; 209 | } 210 | 211 | 212 | 213 | 214 | ?> --------------------------------------------------------------------------------