├── .DS_Store ├── README.md ├── content.sql ├── images └── .DS_Store ├── libs ├── .DS_Store └── class_curl_multi.php └── splider.php /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/woondroo/curl_multi/8063374155fd40ab47ff78abe92de4a09266236e/.DS_Store -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | curl_multi 2 | ========== 3 | 4 | PHP多线程数据采集 5 | 6 | 详细说明:http://blog.eiodesign.com/archives/86 -------------------------------------------------------------------------------- /content.sql: -------------------------------------------------------------------------------- 1 | -- 2 | -- 数据库: `www_curlmulti` 3 | -- 4 | 5 | -- -------------------------------------------------------- 6 | 7 | -- 8 | -- 表的结构 `content` 9 | -- 10 | 11 | CREATE TABLE IF NOT EXISTS `content` ( 12 | `id` int(9) NOT NULL AUTO_INCREMENT, 13 | `meta_title` varchar(500) CHARACTER SET latin1 NOT NULL, 14 | `meta_keywords` text CHARACTER SET latin1 NOT NULL, 15 | `meta_description` text CHARACTER SET latin1 NOT NULL, 16 | `product_name` varchar(500) CHARACTER SET latin1 NOT NULL, 17 | `product_image` varchar(500) CHARACTER SET latin1 NOT NULL, 18 | `product_price` varchar(500) CHARACTER SET latin1 NOT NULL, 19 | `product_description` text CHARACTER SET latin1 NOT NULL, 20 | `product_url` varchar(500) CHARACTER SET latin1 NOT NULL, 21 | UNIQUE KEY `id` (`id`) 22 | ) ENGINE=MyISAM DEFAULT CHARSET=utf8 AUTO_INCREMENT=1 ; 23 | -------------------------------------------------------------------------------- /images/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/woondroo/curl_multi/8063374155fd40ab47ff78abe92de4a09266236e/images/.DS_Store -------------------------------------------------------------------------------- 
/**
 * Fetches several URLs concurrently through PHP's curl_multi API.
 *
 * Usage:
 *     $mp = new MultiHttpRequest($urls);
 *     $pages = $mp->start();
 *
 * The class can be extended. For example, to collect data that is only
 * available to logged-in users, perform the login with curl, save the
 * cookie to a file, and send that cookie with every request.
 */
class MultiHttpRequest {
    /** @var array|false URLs to fetch; their array keys are reused as keys of the result. */
    public $urls = array();

    /** @var int|bool Value handed to CURLOPT_HEADER (truthy = include response headers in the text). */
    public $curlopt_header = 0;

    /** @var string HTTP method applied to the requests issued by start(). */
    public $method = "GET";

    /**
     * @param array|false $urls URLs to fetch; may also be supplied later via set_urls().
     */
    public function __construct($urls = false) {
        $this->urls = $urls;
    }

    /**
     * Replaces the list of URLs to fetch.
     *
     * @param array $urls
     * @return $this
     */
    public function set_urls($urls) {
        $this->urls = $urls;
        return $this;
    }

    /**
     * Chooses whether response headers are included in the returned text.
     *
     * @param int|bool $b Value forwarded to CURLOPT_HEADER.
     * @return $this
     */
    public function is_return_header($b) {
        $this->curlopt_header = $b;
        return $this;
    }

    /**
     * Sets the HTTP method (stored upper-cased).
     *
     * The previous implementation assigned to a misspelled property
     * ("medthod"), so the configured method was silently discarded.
     *
     * @param string $m HTTP method name, case-insensitive.
     * @return $this
     */
    public function set_method($m) {
        $this->method = strtoupper($m);
        return $this;
    }

    /**
     * Fetches every configured URL in parallel.
     *
     * @return array|false Response text keyed like $urls, or false when
     *                     $urls is not an array or is empty.
     */
    public function start() {
        if (!is_array($this->urls) || count($this->urls) == 0) {
            return false;
        }

        $curl = $text = array();
        $handle = curl_multi_init();
        foreach ($this->urls as $k => $v) {
            $curl[$k] = $this->add_handle($handle, $v);
        }

        $this->exec_handle($handle);

        // $curl has exactly the keys of $urls, in the same order.
        foreach ($curl as $k => $ch) {
            $text[$k] = curl_multi_getcontent($ch);
            curl_multi_remove_handle($handle, $ch);
        }
        curl_multi_close($handle);

        return $text;
    }

    /**
     * Creates an easy handle for $url and registers it on the multi handle.
     *
     * @param resource|object $handle Multi handle from curl_multi_init().
     * @param string          $url
     * @return resource|object The new easy handle.
     */
    private function add_handle($handle, $url) {
        $curl = curl_init();
        curl_setopt($curl, CURLOPT_URL, $url);
        curl_setopt($curl, CURLOPT_HEADER, $this->curlopt_header);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);

        // $method is public, so normalise again in case it was assigned directly.
        $method = strtoupper((string) $this->method);
        if ($method === 'HEAD') {
            // A HEAD response has no body; NOBODY stops curl from waiting for one.
            curl_setopt($curl, CURLOPT_NOBODY, 1);
        } elseif ($method !== '' && $method !== 'GET') {
            curl_setopt($curl, CURLOPT_CUSTOMREQUEST, $method);
        }

        curl_multi_add_handle($handle, $curl);
        return $curl;
    }

    /**
     * Drives the multi handle until no transfer is running any more.
     *
     * Waits on socket activity between iterations instead of spinning, and
     * stops when curl_multi_exec() reports an error so a broken stack cannot
     * loop forever.
     *
     * @param resource|object $handle Multi handle from curl_multi_init().
     * @return void
     */
    private function exec_handle($handle) {
        $running = null;
        do {
            $status = curl_multi_exec($handle, $running);
            if ($status === CURLM_OK && $running > 0 && curl_multi_select($handle, 1.0) === -1) {
                // select() can fail immediately on some platforms; back off
                // briefly so the loop does not degrade into a busy-wait.
                usleep(1000);
            }
        } while ($status === CURLM_CALL_MULTI_PERFORM || ($status === CURLM_OK && $running > 0));
    }

    /**
     * Fetches a single URL synchronously (10 second connect timeout).
     *
     * @param string $url
     * @return string|bool Response body, or false when curl_exec() fails.
     */
    public function get_content($url) {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        return curl_exec($ch);
    }
}
'; 30 | 31 | //内容页面信息字段表达式 32 | $detail_rules = array( 33 | 'meta_title'=>'