├── .DS_Store ├── README.md ├── content.sql ├── images └── .DS_Store ├── libs ├── .DS_Store └── class_curl_multi.php └── splider.php /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/woondroo/curl_multi/8063374155fd40ab47ff78abe92de4a09266236e/.DS_Store -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | curl_multi 2 | ========== 3 | 4 | PHP多线程数据采集 5 | 6 | 详细说明:http://blog.eiodesign.com/archives/86 -------------------------------------------------------------------------------- /content.sql: -------------------------------------------------------------------------------- 1 | -- 2 | -- 数据库: `www_curlmulti` 3 | -- 4 | 5 | -- -------------------------------------------------------- 6 | 7 | -- 8 | -- 表的结构 `content` 9 | -- 10 | 11 | CREATE TABLE IF NOT EXISTS `content` ( 12 | `id` int(9) NOT NULL AUTO_INCREMENT, 13 | `meta_title` varchar(500) CHARACTER SET latin1 NOT NULL, 14 | `meta_keywords` text CHARACTER SET latin1 NOT NULL, 15 | `meta_description` text CHARACTER SET latin1 NOT NULL, 16 | `product_name` varchar(500) CHARACTER SET latin1 NOT NULL, 17 | `product_image` varchar(500) CHARACTER SET latin1 NOT NULL, 18 | `product_price` varchar(500) CHARACTER SET latin1 NOT NULL, 19 | `product_description` text CHARACTER SET latin1 NOT NULL, 20 | `product_url` varchar(500) CHARACTER SET latin1 NOT NULL, 21 | UNIQUE KEY `id` (`id`) 22 | ) ENGINE=MyISAM DEFAULT CHARSET=utf8 AUTO_INCREMENT=1 ; 23 | -------------------------------------------------------------------------------- /images/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/woondroo/curl_multi/8063374155fd40ab47ff78abe92de4a09266236e/images/.DS_Store -------------------------------------------------------------------------------- 
/libs/.DS_Store: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/woondroo/curl_multi/8063374155fd40ab47ff78abe92de4a09266236e/libs/.DS_Store
--------------------------------------------------------------------------------
/libs/class_curl_multi.php:
--------------------------------------------------------------------------------
<?php
/**
 * Parallel HTTP fetcher built on PHP's curl_multi API.
 *
 * Usage:
 *   $mhr = new MultiHttpRequest(array('http://a/', 'http://b/'));
 *   $pages = $mhr->start();
 *
 * The class can be extended if needed — e.g. for pages that require a
 * login, perform a curl login first, persist the cookie to a file, and
 * send that cookie with every request to scrape as an authenticated user.
 */
class MultiHttpRequest {
    // List of URLs fetched in parallel by start().
    public $urls = array();
    // Forwarded to CURLOPT_HEADER: 1 to include response headers in results.
    public $curlopt_header = 0;
    // HTTP method name, uppercased. NOTE(review): currently informational
    // only — add_handle() never applies it, so every request goes out as GET.
    public $method = "GET";

    function __construct($urls = false) {
        $this->urls = $urls;
    }

    // Fluent setter for the URL list.
    function set_urls($urls) {
        $this->urls = $urls;
        return $this;
    }

    // Fluent toggle: should responses include their headers?
    function is_return_header($b) {
        $this->curlopt_header = $b;
        return $this;
    }

    // Fluent setter for the HTTP method.
    // FIX: the original assigned to a misspelled property ($this->medthod),
    // so the configured method silently vanished.
    function set_method($m) {
        $this->method = strtoupper($m);
        return $this;
    }

    /**
     * Fetch all configured URLs concurrently.
     *
     * @return array|false Response bodies keyed like $this->urls,
     *                     or false when no URLs were supplied.
     */
    function start() {
        if (!is_array($this->urls) or count($this->urls) == 0) {
            return false;
        }
        $curl = $text = array();
        $handle = curl_multi_init();
        foreach ($this->urls as $k => $v) {
            $curl[$k] = $this->add_handle($handle, $v);
        }

        $this->exec_handle($handle);

        foreach ($this->urls as $k => $v) {
            $text[$k] = curl_multi_getcontent($curl[$k]);
            curl_multi_remove_handle($handle, $curl[$k]);
            curl_close($curl[$k]); // FIX: original never released the easy handles
        }
        curl_multi_close($handle);

        return $text;
    }

    // Create one easy handle for $url and register it on the multi handle.
    private function add_handle($handle, $url) {
        $curl = curl_init();
        curl_setopt($curl, CURLOPT_URL, $url);
        curl_setopt($curl, CURLOPT_HEADER, $this->curlopt_header);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
        curl_multi_add_handle($handle, $curl);
        return $curl;
    }

    // Drive the multi handle until every transfer has completed.
    // FIX: the original do/while spun on curl_multi_exec() at 100% CPU;
    // curl_multi_select() blocks (up to 1 s) until there is socket activity.
    private function exec_handle($handle) {
        $running = null;
        do {
            curl_multi_exec($handle, $running);
            if ($running > 0) {
                curl_multi_select($handle, 1.0);
            }
        } while ($running > 0);
    }

    // Fetch a single URL synchronously (10-second connect timeout).
    public function get_content($url) {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        $result = curl_exec($ch);
        curl_close($ch); // FIX: original leaked this handle
        return $result;
    }
}
--------------------------------------------------------------------------------
/splider.php:
--------------------------------------------------------------------------------
.*?.*?.*?

'; 30 | 31 | //内容页面信息字段表达式 32 | $detail_rules = array( 33 | 'meta_title'=>'(.*?)', 34 | 'meta_keywords'=>'', 35 | 'meta_description'=>'', 36 | 'product_name'=>'

(.*?)

', 37 | 'product_image'=>'
.*?.*?.*?
', 38 | 'product_price'=>'Our Price : (.*?)', 39 | 'product_description'=>'
(.*?)
', 40 | ); 41 | 42 | //实例 43 | $mp = new MultiHttpRequest(); 44 | 45 | //调试使用记录采集条目 46 | $j = 1; 47 | 48 | //每次并发几个链接 49 | $limit = 10; 50 | 51 | // 分页时被跳过的页数 52 | $last_page = 0; 53 | 54 | //开始采集 55 | foreach ($list as $link) { 56 | 57 | //解析列表页数 58 | preg_match_all('/\[(.*)\]/i',$link,$_page); 59 | if($_page[1][0]==''){ 60 | continue; 61 | } 62 | $pages = explode('-',$_page[1][0]); 63 | if(count($pages) != 2){ 64 | continue; 65 | } 66 | 67 | $urls = array(); 68 | 69 | for($i=$pages[0];$i<=$pages[1] || $last_page === 1;$i++){ 70 | if(count($urls) < $limit && $last_page === 0){ 71 | $urls[] = preg_replace('/\[(.*)\]/i',$i,$link); 72 | if($i < $pages[1]){ 73 | continue; 74 | } 75 | } 76 | 77 | // var_dump($urls);echo '
'; 78 | $last_page = 0; 79 | 80 | //采集列表内容 81 | $mp->set_urls($urls); 82 | $contents = $mp->start(); 83 | 84 | foreach ($contents as $content) { 85 | $content = _prefilter($content); 86 | //debug 87 | // exit($content); 88 | 89 | //匹配内容 90 | preg_match_all('/'.str_replace('/', '\/', addslashes($list_rules)).'/i',$content,$pregArr); 91 | 92 | $detail_urls = array(); 93 | foreach($pregArr[1] as $detail_key=>$detail_value){ 94 | $data = array(); 95 | if(count($detail_urls) < $limit ){ 96 | $content_url = $base.$detail_value; 97 | $detail_urls[$content_url] = $content_url; 98 | if($pregArr[1][$detail_key+1] != ''){ 99 | continue; 100 | } 101 | } 102 | 103 | // var_dump($detail_urls);echo '
'; 104 | //continue; 105 | $mp->set_urls($detail_urls); 106 | 107 | $details = $mp->start(); 108 | //图片路径临时存放 109 | $images_urls = array(); 110 | 111 | //采集内容页面 112 | foreach ($details as $url_key=>$detail) { 113 | $detail = _prefilter($detail); 114 | //debug 115 | // exit($detail); 116 | 117 | foreach ($detail_rules as $key => $value) { 118 | 119 | preg_match_all('/'.str_replace('/', '\/', addslashes($value)).'/i',$detail,$detailArr); 120 | 121 | //处理特殊这段信息 122 | switch ($key) { 123 | case 'product_image': 124 | $data[$key] = "images/".md5($detailArr[1][0]).".jpg"; 125 | if(!file_exists($data[$key])){ 126 | $images_urls[$data[$key]] = $base.$detailArr[1][0]; 127 | } 128 | break; 129 | case 'product_description': 130 | $data[$key] = trim(strip_tags($detailArr[1][0])); 131 | break; 132 | default: 133 | $data[$key] = $detailArr[1][0]; 134 | break; 135 | } 136 | 137 | } 138 | 139 | 140 | //产品url 141 | $data['product_url'] = $url_key; 142 | 143 | //转义采集后的数据 144 | foreach ($data as $_k => $_v) { 145 | $data[$_k] = addslashes($_v); 146 | } 147 | 148 | //入库 149 | $r = mysql_query(" 150 | insert into `content` values( 151 | null, 152 | '{$data['meta_title']}', 153 | '{$data['meta_keywords']}', 154 | '{$data['meta_description']}', 155 | '{$data['product_name']}', 156 | '{$data['product_image']}', 157 | '{$data['product_price']}', 158 | '{$data['product_description']}', 159 | '{$data['product_url']}')"); 160 | 161 | //打印log 162 | _flush($j++."|".$r."|".$data['product_name']."
"); 163 | //_flush($data); 164 | } 165 | //远程图片本地化 166 | $mp->set_urls($images_urls); 167 | $images = $mp->start(); 168 | foreach ((array)$images as $image_key => $image_value) { 169 | if (!empty($image_key)) { 170 | _flush("store image:".$image_key."
"); 171 | file_put_contents($image_key,$image_value); 172 | } 173 | } 174 | //清空内容url并加入本次循环url。不然本次会被跳过 175 | $content_url = $base.$detail_value; 176 | $detail_urls = array($content_url=>$content_url); 177 | } 178 | } 179 | //清空内容url并加入本次循环url。不然本次会被跳过 180 | if ($i == $pages[1] && ($pages[1] - $i) % $limit > 0) { 181 | $last_page = 1; 182 | } 183 | $urls = array(preg_replace('/\[(.*)\]/i',$i,$link)); 184 | } 185 | } 186 | 187 | // 输出日志 188 | function _flush($msg) { 189 | print_r ($msg); 190 | ob_flush(); 191 | flush(); 192 | } 193 | 194 | // 对抓去到的内容做简单过滤(过滤空白字符,便于正则匹配) 195 | function _prefilter($output) { 196 | $output=preg_replace("/\/\/[\S\f\t\v ]*?;[\r|\n]/", "", $output); 197 | $output=preg_replace("/\<\!\-\-[\s\S]*?\-\-\>/", "", $output); 198 | $output=preg_replace("/\>[\s]+\<", $output); 199 | $output=preg_replace("/;[\s]+/", ";", $output); 200 | $output=preg_replace("/[\s]+\}/", "}", $output); 201 | $output=preg_replace("/}[\s]+/", "}", $output); 202 | $output=preg_replace("/\{[\s]+/", "{", $output); 203 | $output=preg_replace("/([\s]){2,}/", "$1", $output); 204 | $output=preg_replace("/[\s]+\=[\s]+/", "=", $output); 205 | return $output; 206 | } 207 | ?> --------------------------------------------------------------------------------