├── README.md ├── config ├── inc_config.php └── inc_mimetype.php ├── core ├── cache.php ├── db.php ├── init.php ├── log.php ├── phpspider.php ├── requests.php ├── selector.php ├── util.php └── worker.php ├── data ├── imooc.csv └── log │ └── error.log ├── demo └── imooc.php ├── library ├── cls_curl.php ├── cls_query.php ├── cls_redis.php ├── phpQuery.php └── rolling_curl.php ├── user.php ├── user.sql ├── user_index.php ├── user_info.php └── user_info_progress.php /README.md: -------------------------------------------------------------------------------- 1 | # phpspider -- PHP蜘蛛爬虫框架 2 | 《快收藏,怪客教你十分钟使用php进阶爬虫大师,一小时“偷”慕课网10万用户信息 》所使用的程序 3 | 4 | - 主程序 5 | - demo/imooc.php 6 | - 运行命令 7 | - php demo/imooc.php 8 | - 守护进程运行 9 | - nohup php demo/imooc.php & 10 | - 网址 11 | - http://www.scienceswork.com 12 | -------------------------------------------------------------------------------- /config/inc_config.php: -------------------------------------------------------------------------------- 1 | '127.0.0.1', 5 | 'port' => 3306, 6 | 'user' => 'root', 7 | 'pass' => 'root', 8 | 'name' => 'demo', 9 | ); 10 | 11 | $GLOBALS['config']['redis'] = array( 12 | 'host' => '127.0.0.1', 13 | 'port' => 6379, 14 | 'pass' => '', 15 | 'prefix' => 'phpspider', 16 | 'timeout' => 30, 17 | ); 18 | 19 | include "inc_mimetype.php"; 20 | -------------------------------------------------------------------------------- /config/inc_mimetype.php: -------------------------------------------------------------------------------- 1 | 'binary', 9 | //'text/xml' => 'xml', 10 | //'text/html' => 'html', 11 | //'text/htm' => 'htm', 12 | //'text/plain' => 'txt', 13 | 'image/png' => 'png', 14 | 'image/jpeg' => 'jpg', 15 | 'image/gif' => 'gif', 16 | 'image/tiff' => 'tiff', 17 | 'image/x-jpg' => 'jpg', 18 | 'image/x-icon' => 'icon', 19 | 'image/x-img' => 'img', 20 | 'application/pdf' => 'pdf', 21 | 'audio/mp3' => 'mp3', 22 | 'video/avi' => 'avi', 23 | 'video/mp4' => 'mp4', 24 | 'application/x-msdownload' => 'exe', 25 
| 'application/vnd.iphone' => 'ipa', 26 | 'application/x-bittorrent' => 'torrent', 27 | 'application/vnd.android.package-archive' => 'apk', 28 | ); 29 | -------------------------------------------------------------------------------- /core/cache.php: -------------------------------------------------------------------------------- 1 | 10 | // +---------------------------------------------------------------------- 11 | 12 | //---------------------------------- 13 | // PHPSpider缓存类文件 14 | //---------------------------------- 15 | 16 | class cache 17 | { 18 | // 多进程下面不能用单例模式 19 | //protected static $_instance; 20 | /** 21 | * 获取实例 22 | * 23 | * @return void 24 | * @author seatle 25 | * @created time :2016-04-10 22:55 26 | */ 27 | public static function init() 28 | { 29 | if(extension_loaded('Redis')) 30 | { 31 | $_instance = new Redis(); 32 | } 33 | else 34 | { 35 | $errmsg = "extension redis is not installed"; 36 | log::add($errmsg, "Error"); 37 | return null; 38 | } 39 | // 这里不能用pconnect,会报错:Uncaught exception 'RedisException' with message 'read error on connection' 40 | $_instance->connect($GLOBALS['config']['redis']['host'], $GLOBALS['config']['redis']['port'], $GLOBALS['config']['redis']['timeout']); 41 | 42 | // 验证 43 | if ($GLOBALS['config']['redis']['pass']) 44 | { 45 | if ( !$_instance->auth($GLOBALS['config']['redis']['pass']) ) 46 | { 47 | $errmsg = "Redis Server authentication failed!!"; 48 | log::add($errmsg, "Error"); 49 | return null; 50 | } 51 | } 52 | 53 | // 不序列化的话不能存数组,用php的序列化方式其他语言又不能读取,所以这里自己用json序列化了,性能还比php的序列化好1.4倍 54 | //$_instance->setOption(Redis::OPT_SERIALIZER, Redis::SERIALIZER_NONE); // don't serialize data 55 | //$_instance->setOption(Redis::OPT_SERIALIZER, Redis::SERIALIZER_PHP); // use built-in serialize/unserialize 56 | //$_instance->setOption(Redis::OPT_SERIALIZER, Redis::SERIALIZER_IGBINARY); // use igBinary serialize/unserialize 57 | 58 | $_instance->setOption(Redis::OPT_PREFIX, $GLOBALS['config']['redis']['prefix'] . 
/**
 * Static MySQL access layer for PHPSpider, built on the procedural mysqli API.
 *
 * One connection is kept per process. The PID that opened the connection is
 * remembered so that a process with a different PID (a forked worker that
 * inherited the handle) reconnects instead of sharing the socket.
 *
 * NOTE(review): values are escaped with stripslashes() + addslashes() rather than
 * mysqli_real_escape_string(). That keeps the SQL builders usable without a live
 * connection ($return_sql = true), but it strips literal backslashes from data and
 * is not charset-aware. Prefer prepared statements for untrusted input.
 */
class db
{
    /** Active connection settings: host, port, user, pass, name. */
    private static $config = array();
    /** Current mysqli handle, or null/false when not connected. */
    private static $conn;
    /** Result of the most recent query(). */
    private static $rsid;
    /** Number of successfully executed queries. */
    private static $query_count = 0;
    /** Consecutive connection failures; the process exits at 5. */
    private static $conn_fail = 0;
    /** PID of the process that opened the current connection. */
    private static $worker_pid = 0;
    /** How many times query() reconnects after a "server has gone away" error. */
    private static $max_reconnect = 3;

    /**
     * Make sure a usable connection exists and return it.
     *
     * @param array $config Optional settings; when empty, the previously used
     *                      settings (or $GLOBALS['config']['db']) are kept.
     * @return mysqli|null The connection handle (the original returned nothing).
     */
    public static function _init_mysql($config = array())
    {
        if (!empty($config))
        {
            self::$config = $config;
        }
        elseif (empty(self::$config))
        {
            // Fall back to the global settings only when nothing was chosen before,
            // otherwise an explicitly selected database would be silently replaced.
            self::$config = $GLOBALS['config']['db'];
        }

        if (self::$conn)
        {
            $curr_pid = function_exists('posix_getpid') ? posix_getpid() : 0;
            // A handle created under another PID must not be shared between processes.
            if (self::$worker_pid != $curr_pid)
            {
                self::reset_connect();
            }
            return self::$conn;
        }

        try
        {
            self::$conn = @mysqli_connect(self::$config['host'], self::$config['user'], self::$config['pass'], self::$config['name'], self::$config['port']);
        }
        catch (mysqli_sql_exception $e)
        {
            // PHP >= 8.1 throws by default; fold that into the errno-based path below.
            self::$conn = false;
        }

        if (!self::$conn || mysqli_connect_errno())
        {
            self::$conn = null;
            self::$conn_fail++;
            $errmsg = 'Mysql Connect failed['.self::$conn_fail.']: ' . mysqli_connect_error();
            echo util::colorize(date("H:i:s") . " {$errmsg}\n\n", 'fail');
            log::add($errmsg, "Error");
            // Give up after five consecutive failures.
            if (self::$conn_fail >= 5)
            {
                exit(250);
            }
            return self::_init_mysql($config);
        }

        self::$conn_fail = 0;
        self::$worker_pid = function_exists('posix_getpid') ? posix_getpid() : 0;
        mysqli_query(self::$conn, " SET character_set_connection=utf8, character_set_results=utf8, character_set_client=binary, sql_mode='' ");
        return self::$conn;
    }

    /**
     * Close the current connection (if any) and connect again.
     *
     * In a multi-process setup a child must call this once if the parent already
     * connected; two processes sharing one handle produce errors such as
     * "Error while reading greeting packet".
     *
     * @param array $config Optional new settings; empty keeps the current ones.
     * @return void
     */
    public static function reset_connect($config = array())
    {
        self::close_connection();
        self::_init_mysql($config);
    }

    /**
     * Close and forget the current handle; safe to call when not connected.
     */
    private static function close_connection()
    {
        if (self::$conn instanceof mysqli)
        {
            @mysqli_close(self::$conn);
        }
        self::$conn = null;
    }

    /**
     * Return the given result handle, or the most recent one when none is given.
     */
    protected static function _get_rsid($rsid = '')
    {
        return empty($rsid) ? self::$rsid : $rsid;
    }

    /**
     * Switch autocommit on or off for the current connection.
     *
     * @param bool $mode
     * @return bool
     */
    public static function autocommit($mode = false)
    {
        // The original assigned the (void) result of _init_mysql() to self::$conn,
        // which dropped the connection right before using it.
        self::_init_mysql();
        return mysqli_autocommit(self::$conn, $mode);
    }

    /**
     * Start a transaction by turning autocommit off.
     */
    public static function begin_tran()
    {
        return self::autocommit(false);
    }

    /**
     * Commit the current transaction; false when there is no connection.
     */
    public static function commit()
    {
        return (self::$conn instanceof mysqli) ? mysqli_commit(self::$conn) : false;
    }

    /**
     * Roll back the current transaction; false when there is no connection.
     */
    public static function rollback()
    {
        return (self::$conn instanceof mysqli) ? mysqli_rollback(self::$conn) : false;
    }

    /**
     * Run a SQL statement.
     *
     * A lost connection (errno 2006 / 2013) triggers a reconnect and a retry, at
     * most self::$max_reconnect times. Any other failure is logged together with
     * a backtrace.
     *
     * @param string $sql
     * @return mysqli_result|bool Result handle, true for statements without a
     *                            result set, false on failure.
     */
    public static function query($sql)
    {
        $sql = trim($sql);

        for ($attempt = 0; ; $attempt++)
        {
            self::_init_mysql();
            try
            {
                self::$rsid = @mysqli_query(self::$conn, $sql);
            }
            catch (mysqli_sql_exception $e)
            {
                // PHP >= 8.1 default report mode; errno/error are still set on the handle.
                self::$rsid = false;
            }

            if (self::$rsid !== false)
            {
                self::$query_count++;
                return self::$rsid;
            }

            // Reconnect only after an actual failure instead of pinging before every query.
            $errno = mysqli_errno(self::$conn);
            $lost = ($errno == 2013 || $errno == 2006);
            if (!$lost || $attempt >= self::$max_reconnect)
            {
                break;
            }
            log::add(mysqli_error(self::$conn), "Error");
            self::close_connection();
        }

        log::add("Query SQL: ".$sql, "Warning");
        log::add("Error SQL: ".mysqli_error(self::$conn), "Warning");

        $backtrace = debug_backtrace();
        // Drop the frame of query() itself.
        array_shift($backtrace);
        log::add(self::format_backtrace($backtrace));

        return false;
    }

    /**
     * Render debug_backtrace() frames as one log entry.
     */
    private static function format_backtrace($frames)
    {
        $err = "debug_backtrace:\n";
        foreach ($frames as $i => $l)
        {
            foreach (array('class', 'type', 'function', 'file', 'line') as $k)
            {
                if (!isset($l[$k]))
                {
                    $l[$k] = '';
                }
            }
            $err .= "[$i] in function {$l['class']}{$l['type']}{$l['function']} ";
            if ($l['file']) $err .= " in {$l['file']} ";
            if ($l['line']) $err .= " on line {$l['line']} ";
            $err .= "\n";
        }
        return $err;
    }

    /**
     * Fetch the next row as an associative array.
     *
     * @param mysqli_result|string $rsid Result handle; empty uses the last query.
     * @return array|null Null when there are no more rows or no result set.
     */
    public static function fetch($rsid = '')
    {
        $rsid = self::_get_rsid($rsid);
        if (!($rsid instanceof mysqli_result))
        {
            return null;
        }
        return mysqli_fetch_array($rsid, MYSQLI_ASSOC);
    }

    /**
     * Return the first row of a query; " limit 1 " is appended when the SQL has no LIMIT.
     *
     * @param string   $sql
     * @param callable $func Optional callback applied to the row.
     * @return mixed Row array, callback result, or null on failure / no row.
     */
    public static function get_one($sql, $func = '')
    {
        if (!preg_match("/limit/i", $sql))
        {
            $sql = preg_replace("/[,;]$/i", '', trim($sql)) . " limit 1 ";
        }
        $rsid = self::query($sql);
        if ($rsid === false)
        {
            return null;
        }
        $row = self::fetch($rsid);
        self::free($rsid);
        if (!empty($func))
        {
            return call_user_func($func, $row);
        }
        return $row;
    }

    /**
     * Return every row of a query.
     *
     * @param string   $sql
     * @param callable $func Optional callback applied to the full row list.
     * @return mixed Rows, callback result, false when there are no rows, null on failure.
     */
    public static function get_all($sql, $func = '')
    {
        $rsid = self::query($sql);
        if ($rsid === false)
        {
            return null;
        }
        // Initialised so an empty result set hands the callback an array, not an undefined variable.
        $rows = array();
        while ( $row = self::fetch($rsid) )
        {
            $rows[] = $row;
        }
        self::free($rsid);
        if (!empty($func))
        {
            return call_user_func($func, $rows);
        }
        return empty($rows) ? false : $rows;
    }

    /**
     * Release a result set; ignores anything that is not a result handle.
     */
    public static function free($rsid)
    {
        if ($rsid instanceof mysqli_result)
        {
            mysqli_free_result($rsid);
        }
        return null;
    }

    /**
     * Auto-increment id generated by the last statement.
     */
    public static function insert_id()
    {
        return mysqli_insert_id(self::$conn);
    }

    /**
     * Number of rows affected by the last statement.
     */
    public static function affected_rows()
    {
        return mysqli_affected_rows(self::$conn);
    }

    /**
     * Escape one scalar for use inside a double-quoted SQL literal.
     * stripslashes() runs first so already-escaped input is not escaped twice.
     */
    private static function escape_value($value)
    {
        return addslashes(stripslashes((string) $value));
    }

    /**
     * Insert one row with "Insert Ignore".
     *
     * @param string $table
     * @param array  $data       column => value
     * @param bool   $return_sql Return the SQL instead of executing it.
     * @return mixed SQL string, insert id, or false on failure.
     */
    public static function insert($table = '', $data = null, $return_sql = false)
    {
        $items_sql = $values_sql = "";
        foreach ((array) $data as $k => $v)
        {
            $v = self::escape_value($v);
            $items_sql .= "`$k`,";
            $values_sql .= "\"$v\",";
        }
        $sql = "Insert Ignore Into `{$table}` (" . substr($items_sql, 0, -1) . ") Values (" . substr($values_sql, 0, -1) . ")";
        if ($return_sql)
        {
            return $sql;
        }
        return self::query($sql) ? mysqli_insert_id(self::$conn) : false;
    }

    /**
     * Insert several rows in one statement.
     *
     * Columns that do not exist in the table are dropped. The column list is
     * taken from the first row, so every row must carry the same keys.
     *
     * @param string $table
     * @param array  $set        List of column => value arrays.
     * @param bool   $return_sql Return the SQL instead of executing it.
     * @return mixed SQL string, insert id, query result, or false.
     */
    public static function insert_batch($table = '', $set = NULL, $return_sql = FALSE)
    {
        if (empty($table) || empty($set) || !is_array($set))
        {
            return false;
        }
        $set = self::strsafe($set);
        $fields = self::get_fields($table);

        $keys_sql = $vals_sql = array();
        // The original tested "$i == 0 && $k == 0", which only matched string keys
        // through PHP < 8 loose comparison and left the column list empty on PHP 8.
        $first_row = true;
        foreach ($set as $val)
        {
            ksort($val);
            $vals = array();
            foreach ($val as $k => $v)
            {
                // Skip keys that are not table columns.
                if (!in_array((string) $k, $fields, true))
                {
                    continue;
                }
                if ($first_row)
                {
                    $keys_sql[] = "`$k`";
                }
                $vals[] = "\"$v\"";
            }
            $vals_sql[] = implode(",", $vals);
            $first_row = false;
        }
        if (empty($keys_sql))
        {
            return false;
        }

        $sql = "Insert Ignore Into `{$table}`(".implode(", ", $keys_sql).") Values (".implode("), (", $vals_sql).")";

        if ($return_sql) return $sql;

        $rt = self::query($sql);
        if ($rt === false)
        {
            return false;
        }
        $insert_id = self::insert_id();
        return empty($insert_id) ? $rt : $insert_id;
    }

    /**
     * Update several rows in one statement using "Case ... When ... Then".
     *
     * @param string       $table
     * @param array        $set        List of column => value arrays, each containing $index.
     * @param string       $index      Column that identifies a row.
     * @param string|array $where      Extra condition(s), joined with And.
     * @param bool         $return_sql Return the SQL instead of executing it.
     * @return mixed SQL string, affected row count, query result, or false.
     */
    public static function update_batch($table = '', $set = NULL, $index = NULL, $where = NULL, $return_sql = FALSE)
    {
        if (empty($table) || empty($set) || !is_array($set) || is_null($index))
        {
            // Return instead of exit so the caller keeps running.
            return false;
        }
        $set = self::strsafe($set);
        $fields = self::get_fields($table);

        $ids = array();
        $final = array();
        foreach ($set as $val)
        {
            ksort($val);
            // Keyed by hash so a repeated index value appears once.
            $key = md5($val[$index]);
            $ids[$key] = $val[$index];

            foreach (array_keys($val) as $field)
            {
                if ((string) $field !== (string) $index)
                {
                    $final[$field][$key] = 'When `'.$index.'` = "'.$val[$index].'" Then "'.$val[$field].'"';
                }
            }
        }

        // Normalise $where to a list; appending to a non-array value is an error on PHP >= 7.1.
        if (empty($where))
        {
            $conditions = array();
        }
        elseif (is_array($where))
        {
            $conditions = $where;
        }
        else
        {
            $conditions = array($where);
        }
        $conditions[] = $index.' In ("'.implode('","', $ids).'")';

        $cases = array();
        foreach ($final as $k => $v)
        {
            // Skip keys that are not table columns.
            if (!in_array((string) $k, $fields, true))
            {
                continue;
            }
            $cases[] = '`'.$k.'` = Case '."\n".implode("\n", $v)."\n".'Else `'.$k.'` End';
        }
        if (empty($cases))
        {
            // Nothing to set: the statement would be syntactically invalid.
            return false;
        }

        $sql = "Update `".$table."` Set ".implode(', ', $cases)." Where ".implode(" And ", $conditions);

        if ($return_sql) return $sql;

        $rt = self::query($sql);
        if ($rt === false)
        {
            return false;
        }
        // The original stored this in $insert_id and then read an undefined $affected_rows.
        $affected_rows = self::affected_rows();
        return empty($affected_rows) ? $rt : $affected_rows;
    }

    /**
     * Update rows of a table.
     *
     * @param string       $table
     * @param array        $data       column => value
     * @param string|array $where      Condition(s), joined with And; empty entries are dropped.
     * @param bool         $return_sql Return the SQL instead of executing it.
     * @return mixed SQL string, affected row count, or false.
     */
    public static function update($table = '', $data = array(), $where = null, $return_sql = false)
    {
        if (empty($data) || !is_array($data))
        {
            // "UPDATE t SET" without assignments is not valid SQL.
            return false;
        }
        $assignments = array();
        foreach ($data as $k => $v)
        {
            $v = self::escape_value($v);
            $assignments[] = "`{$k}` = \"{$v}\"";
        }

        $conditions = array();
        foreach ((is_array($where) ? $where : array($where)) as $cond)
        {
            // Drop empty entries, otherwise array("") would emit a bare Where.
            if (!empty($cond))
            {
                $conditions[] = $cond;
            }
        }
        $where_sql = empty($conditions) ? "" : " Where " . implode(" And ", $conditions);

        $sql = "UPDATE `{$table}` SET " . implode(",", $assignments) . $where_sql;
        if ($return_sql)
        {
            return $sql;
        }
        return self::query($sql) ? mysqli_affected_rows(self::$conn) : false;
    }

    /**
     * Delete rows of a table. An empty condition is refused so a table is never wiped by accident.
     *
     * @param string       $table
     * @param string|array $where      Condition(s), joined with And.
     * @param bool         $return_sql Return the SQL instead of executing it.
     * @return mixed SQL string, affected row count, or false.
     */
    public static function delete($table = '', $where = null, $return_sql = false)
    {
        if (empty($where))
        {
            return false;
        }
        $where = 'Where ' . (!is_array($where) ? $where : implode(' And ', $where));
        $sql = "Delete From `{$table}` {$where}";
        if ($return_sql)
        {
            return $sql;
        }
        return self::query($sql) ? mysqli_affected_rows(self::$conn) : false;
    }

    /**
     * Reconnect when the connection is missing or no longer answers.
     */
    public static function ping()
    {
        if (!(self::$conn instanceof mysqli) || !@mysqli_ping(self::$conn))
        {
            self::close_connection();
            self::_init_mysql();
        }
    }

    /**
     * Escape a scalar, or every leaf of a (nested) array, for use in a SQL literal.
     *
     * @param mixed $array
     * @return mixed Same shape as the input, with escaped strings.
     */
    public static function strsafe($array)
    {
        if (!is_array($array))
        {
            return self::escape_value($array);
        }
        $safe = array();
        foreach ($array as $key => $val)
        {
            $safe[$key] = is_array($val) ? self::strsafe($val) : self::escape_value($val);
        }
        return $safe;
    }

    /**
     * List the columns of a table, excluding auto-increment columns.
     * Used by insert_batch() and update_batch() to drop unknown keys.
     *
     * @param string $table
     * @return array Column names; empty when the table cannot be described.
     */
    public static function get_fields($table)
    {
        $rows = self::get_all("Desc `{$table}`");
        $fields = array();
        // get_all() yields false/null instead of an array when nothing came back.
        if (!is_array($rows))
        {
            return $fields;
        }
        foreach ($rows as $v)
        {
            if ($v['Extra'] != 'auto_increment')
            {
                $fields[] = $v['Field'];
            }
        }
        return $fields;
    }

    /**
     * Tell whether a table matching the given name (a LIKE pattern) exists.
     *
     * @param string $table_name
     * @return bool
     */
    public static function table_exists($table_name)
    {
        $sql = "SHOW TABLES LIKE '" . addslashes((string) $table_name) . "'";
        $rsid = self::query($sql);
        if ($rsid === false)
        {
            return false;
        }
        $table = self::fetch($rsid);
        self::free($rsid);
        return !empty($table);
    }
}
/**
 * PHPSpider logger.
 *
 * The level helpers (note/info/warn/debug/error) write to self::$log_file and,
 * when self::$log_show is on, also echo the line. add() is the older entry
 * point that always appends to PATH_DATA/log/error.log.
 */
class log
{
    /** Echo every line in addition to writing it. */
    public static $log_show = false;
    /** Level filter: when set, only levels contained in this string are logged. */
    public static $log_type = false;
    /** Target file of the level helpers. */
    public static $log_file = "data/phpspider.log";
    /** Text emitted before a message (ANSI colour start). */
    public static $out_sta = "";
    /** Text emitted after a message (ANSI colour reset). */
    public static $out_end = "";

    /**
     * Reset the colour wrappers, then apply $start/$end unless running on Windows.
     * Passing null leaves the output uncoloured.
     */
    private static function paint($start = null, $end = null)
    {
        self::$out_sta = self::$out_end = "";
        if ($start === null)
        {
            return;
        }
        if (!util::is_win())
        {
            self::$out_sta = $start;
            self::$out_end = $end;
        }
    }

    /**
     * Plain message: no timestamp, no level tag, never filtered.
     */
    public static function note($msg)
    {
        self::paint();
        self::msg($msg, 'note');
    }

    /**
     * Informational message, uncoloured.
     */
    public static function info($msg)
    {
        self::paint();
        self::msg($msg, 'info');
    }

    /**
     * Warning, yellow on non-Windows terminals.
     */
    public static function warn($msg)
    {
        self::paint("\033[33m", "\033[0m");
        self::msg($msg, 'warn');
    }

    /**
     * Debug message, cyan on non-Windows terminals.
     */
    public static function debug($msg)
    {
        self::paint("\033[36m", "\033[0m");
        self::msg($msg, 'debug');
    }

    /**
     * Error message, red on non-Windows terminals.
     */
    public static function error($msg)
    {
        self::paint("\033[31m", "\033[0m");
        self::msg($msg, 'error');
    }

    /**
     * Format a message for the given level and write it out.
     *
     * @param string $msg
     * @param string $log_type note|info|warn|debug|error
     * @return false|null False when the level is filtered out by self::$log_type.
     */
    public static function msg($msg, $log_type)
    {
        $is_note = ($log_type == 'note');

        // Notes bypass the level filter.
        if (!$is_note && self::$log_type && strpos(self::$log_type, $log_type) === false)
        {
            return false;
        }

        $line = $is_note
            ? self::$out_sta. $msg . "\n".self::$out_end
            : self::$out_sta.date("Y-m-d H:i:s")." [{$log_type}] " . $msg .self::$out_end. "\n";

        if (self::$log_show)
        {
            echo $line;
        }
        file_put_contents(self::$log_file, $line, FILE_APPEND | LOCK_EX);
    }

    /**
     * Append an entry to PATH_DATA/log/error.log.
     *
     * @param string $msg
     * @param string $log_type Note|Warning|Error; when empty the message is written as-is.
     * @return void
     */
    public static function add($msg, $log_type = '')
    {
        $entry = $msg;
        if ($log_type != '')
        {
            $entry = date("Y-m-d H:i:s")." [{$log_type}] " . $msg . "\n";
        }
        if (self::$log_show)
        {
            echo $entry;
        }
        file_put_contents(PATH_DATA."/log/error.log", $entry, FILE_APPEND | LOCK_EX);
    }

}
/**
 * Store several cookies given as one "name=value; name2=value2" string.
 *
 * Each segment is split at its first "=", so values may contain "=".
 * Names and values are trimmed; segments without "=" or with an empty name
 * are skipped (the original stored them under an empty key, and kept the
 * leading space of every name after the first).
 *
 * @param string $cookies Cookie string, segments separated by ";".
 * @param string $domain  When given, the cookies apply to this domain only.
 * @return bool False when the string is empty, true otherwise.
 */
public static function set_cookies($cookies, $domain = '')
{
    $cookies = trim((string) $cookies);
    if ($cookies === '')
    {
        return false;
    }

    foreach (explode(";", $cookies) as $cookie)
    {
        $pos = strpos($cookie, '=');
        if ($pos === false)
        {
            continue;
        }
        $key = trim(substr($cookie, 0, $pos));
        if ($key === '')
        {
            continue;
        }
        $value = trim((string) substr($cookie, $pos + 1));

        if (!empty($domain))
        {
            self::$domain_cookies[$domain][$key] = $value;
        }
        else
        {
            self::$cookies[$key] = $value;
        }
    }
    return true;
}
/**
 * Remove stored cookies.
 *
 * @param string $domain When empty, the global cookies are cleared; otherwise
 *                       only the cookies stored for that domain are removed.
 * @return bool False when a domain is given but has no cookies, true otherwise.
 */
public static function del_cookies($domain = '')
{
    if (empty($domain))
    {
        // A static property cannot be unset (PHP raises a fatal error); reset it instead.
        self::$cookies = array();
        return true;
    }
    if (!isset(self::$domain_cookies[$domain]))
    {
        return false;
    }
    unset(self::$domain_cookies[$domain]);
    return true;
}
/**
 * Extract the body from the raw response in self::$raw and, when an output
 * encoding is configured, convert it to that encoding.
 *
 * Cookies found in the raw response are recorded for $domain as a side effect.
 *
 * NOTE(review): the detected charset is written to self::$input_encoding, so
 * detection runs only while that property is null. Confirm the caller resets
 * it between requests to pages with different charsets.
 *
 * @param string $domain Domain the response cookies are stored under.
 * @return string Response body.
 */
public static function get_response_body($domain)
{
    $header = $body = '';

    if (!empty(self::$raw))
    {
        self::get_response_cookies($domain);

        // The response starts with one or more header blocks (e.g. "100 Continue"
        // before the final status on POST). Only leading blocks are removed, so
        // body text that merely looks like a status line is kept.
        $parts = explode("\r\n\r\n", self::$raw);
        while (!empty($parts) && preg_match("#^HTTP/.*? \d+ #", $parts[0]))
        {
            $block = array_shift($parts);
            if (!preg_match("#^HTTP/.*? 100 Continue#", $block))
            {
                $header = $block;
            }
        }
        $body = implode("\r\n\r\n", $parts);
    }

    // Detect the page encoding unless the caller fixed it (utf-8, gb2312, ...).
    if (self::$input_encoding == null)
    {
        $encoding = '';
        // Stop at whitespace or ";" so "charset=utf-8;" does not keep the separator.
        if (preg_match("/charset=([^\s;]*)/i", $header, $out) && !empty($out[1]))
        {
            $encoding = str_replace(array('"', '\''), '', strtolower(trim($out[1])));
        }
        if (empty($encoding))
        {
            // Not announced in the response header: guess from the body bytes.
            $encoding = strtolower(self::get_encoding($body));
            if (empty($encoding) || $encoding == "ascii")
            {
                $encoding = 'gbk';
            }
        }
        self::$input_encoding = $encoding;
    }

    // XPath only handles utf-8; iso-8859-1 input is left untouched on purpose.
    $needs_conversion = self::$output_encoding
        && strcasecmp(self::$input_encoding, self::$output_encoding) !== 0
        && self::$input_encoding != 'iso-8859-1';

    if ($needs_conversion)
    {
        try
        {
            $converted = @mb_convert_encoding($body, self::$output_encoding, self::$input_encoding);
        }
        catch (\Error $e)
        {
            // PHP 8 throws for an unknown encoding name; keep the body as received.
            $converted = false;
        }

        if (is_string($converted))
        {
            // Rewrite the charset declared in the page so it matches the converted bytes.
            // NOTE(review): pattern and replacement are reconstructed; the expression in
            // this copy of the file was damaged and did not compile.
            $rewritten = preg_replace(
                "/<meta([^>]*)charset=([^>]*)>/is",
                '<meta charset="' . self::$output_encoding . '">',
                $converted
            );
            $body = ($rewritten === null) ? $converted : $rewritten;
        }
    }
    return $body;
}
/**
 * Parse a raw header block into a name => value map.
 *
 * Each line is split at its first ":" only, so values that contain colons
 * (URLs, dates) stay intact. Lines without a name or a value, such as the
 * status line "HTTP/1.1 200 OK", are skipped. A later header with the same
 * name replaces an earlier one.
 *
 * @param string $html Header block, one header per line.
 * @return array Header name => value (the original built this map but never returned it).
 */
public static function get_response_headers($html)
{
    $headers = array();
    foreach (explode("\n", (string) $html) as $line)
    {
        $pair = explode(":", $line, 2);
        if (count($pair) < 2)
        {
            continue;
        }
        $key = trim($pair[0]);
        $val = trim($pair[1]);
        // Compare against '' so a legitimate "0" value (Content-Length: 0) is kept.
        if ($key === '' || $val === '')
        {
            continue;
        }
        $headers[$key] = $val;
    }
    return $headers;
}
self::$ch, CURLOPT_CONNECTTIMEOUT, self::$timeout ); 414 | curl_setopt( self::$ch, CURLOPT_HEADER, false ); 415 | curl_setopt( self::$ch, CURLOPT_USERAGENT, "phpspider-requests/".self::VERSION ); 416 | curl_setopt( self::$ch, CURLOPT_TIMEOUT, self::$timeout + 5); 417 | // 在多线程处理场景下使用超时选项时,会忽略signals对应的处理函数,但是无耐的是还有小概率的crash情况发生 418 | curl_setopt( self::$ch, CURLOPT_NOSIGNAL, true); 419 | } 420 | return self::$ch; 421 | } 422 | 423 | /** 424 | * get 425 | * 426 | * 427 | */ 428 | public static function get($url, $fields = array()) 429 | { 430 | self::init (); 431 | return self::request($url, 'get', $fields); 432 | } 433 | 434 | /** 435 | * $fields 有三种类型:1、数组;2、http query;3、json 436 | * 1、array('name'=>'yangzetao') 2、http_build_query(array('name'=>'yangzetao')) 3、json_encode(array('name'=>'yangzetao')) 437 | * 前两种是普通的post,可以用$_POST方式获取 438 | * 第三种是post stream( json rpc,其实就是webservice ),虽然是post方式,但是只能用流方式 http://input 后者 $HTTP_RAW_POST_DATA 获取 439 | * 440 | * @param mixed $url 441 | * @param array $fields 442 | * @param mixed $proxies 443 | * @static 444 | * @access public 445 | * @return void 446 | */ 447 | public static function post($url, $fields = array()) 448 | { 449 | self::init (); 450 | return self::request($url, 'POST', $fields); 451 | } 452 | 453 | public static function put($url, $fields = array()) 454 | { 455 | self::init (); 456 | return self::request($url, 'PUT', $fields); 457 | } 458 | 459 | public static function delete($url, $fields = array()) 460 | { 461 | self::init (); 462 | return self::request($url, 'DELETE', $fields); 463 | } 464 | 465 | public static function head($url, $fields = array()) 466 | { 467 | self::init (); 468 | return self::request($url, 'HEAD', $fields); 469 | } 470 | 471 | public static function options($url, $fields = array()) 472 | { 473 | self::init (); 474 | return self::request($url, 'OPTIONS', $fields); 475 | } 476 | 477 | public static function patch($url, $fields = array()) 478 | { 479 | self::init (); 480 | return 
self::request($url, 'PATCH', $fields); 481 | } 482 | 483 | public static function request($url, $method = 'GET', $fields) 484 | { 485 | $method = strtoupper($method); 486 | if(!self::_is_url($url)) 487 | { 488 | self::$error = "You have requested URL ({$url}) is not a valid HTTP address"; 489 | return false; 490 | } 491 | 492 | // 如果是 get 方式,直接拼凑一个 url 出来 493 | if ($method == 'GET' && !empty($fields)) 494 | { 495 | $url = $url . (strpos($url,"?")===false ? "?" : "&") . http_build_query($fields); 496 | } 497 | 498 | $parse_url = parse_url($url); 499 | if (empty($parse_url) || empty($parse_url['host']) || !in_array($parse_url['scheme'], array('http', 'https'))) 500 | { 501 | self::$error = "No connection adapters were found for '{$url}'"; 502 | return false; 503 | } 504 | $scheme = $parse_url['scheme']; 505 | $domain = $parse_url['host']; 506 | 507 | // 随机绑定 hosts,做负载均衡 508 | if (self::$hosts) 509 | { 510 | if (isset(self::$hosts[$domain])) 511 | { 512 | $hosts = self::$hosts[$domain]; 513 | $key = rand(0, count($hosts)-1); 514 | $ip = $hosts[$key]; 515 | $url = str_replace($domain, $ip, $url); 516 | self::$headers['Host'] = $domain; 517 | } 518 | } 519 | 520 | curl_setopt( self::$ch, CURLOPT_URL, $url ); 521 | 522 | if ($method != 'GET') 523 | { 524 | // 如果是 post 方式 525 | if ($method == 'POST') 526 | { 527 | curl_setopt( self::$ch, CURLOPT_POST, true ); 528 | } 529 | else 530 | { 531 | self::$headers['X-HTTP-Method-Override'] = $method; 532 | curl_setopt( self::$ch, CURLOPT_CUSTOMREQUEST, $method ); 533 | } 534 | if (!empty($fields)) 535 | { 536 | if (is_array($fields)) 537 | { 538 | $fields = http_build_query($fields); 539 | } 540 | // 不能直接传数组,不知道是什么Bug,会非常慢 541 | curl_setopt( self::$ch, CURLOPT_POSTFIELDS, $fields ); 542 | //curl_setopt( self::$ch, CURLOPT_POSTFIELDS, $fields ); 543 | } 544 | } 545 | 546 | $cookies = self::get_cookies(); 547 | $domain_cookies = self::get_cookies($domain); 548 | $cookies = array_merge($cookies, $domain_cookies); 549 | // 
是否设置了cookie 550 | if (!empty($cookies)) 551 | { 552 | foreach ($cookies as $key=>$value) 553 | { 554 | $cookie_arr[] = $key."=".$value; 555 | } 556 | $cookies = implode("; ", $cookie_arr); 557 | curl_setopt( self::$ch, CURLOPT_COOKIE, $cookies ); 558 | } 559 | 560 | if (!empty(self::$useragents)) 561 | { 562 | $key = rand(0, count(self::$useragents) - 1); 563 | self::$headers['User-Agent'] = self::$useragents[$key]; 564 | } 565 | 566 | if (!empty(self::$client_ips)) 567 | { 568 | $key = rand(0, count(self::$client_ips) - 1); 569 | self::$headers["CLIENT-IP"] = self::$client_ips[$key]; 570 | self::$headers["X-FORWARDED-FOR"] = self::$client_ips[$key]; 571 | } 572 | 573 | if (self::$headers) 574 | { 575 | $headers = array(); 576 | foreach (self::$headers as $k=>$v) 577 | { 578 | $headers[] = $k.": ".$v; 579 | } 580 | curl_setopt( self::$ch, CURLOPT_HTTPHEADER, $headers ); 581 | } 582 | 583 | curl_setopt( self::$ch, CURLOPT_ENCODING, 'gzip' ); 584 | 585 | // 关闭验证 586 | if ("https" == substr($url, 0, 5)) 587 | { 588 | curl_setopt(self::$ch, CURLOPT_SSL_VERIFYPEER, false); 589 | curl_setopt(self::$ch, CURLOPT_SSL_VERIFYHOST, false); 590 | } 591 | 592 | if (self::$proxies) 593 | { 594 | if (!empty(self::$proxies[$scheme])) 595 | { 596 | curl_setopt( self::$ch, CURLOPT_PROXY, self::$proxies[$scheme] ); 597 | } 598 | } 599 | 600 | // header + body,header 里面有 cookie 601 | curl_setopt( self::$ch, CURLOPT_HEADER, true ); 602 | 603 | self::$raw = curl_exec ( self::$ch ); 604 | //var_dump($data); 605 | self::$info = curl_getinfo( self::$ch ); 606 | self::$status_code = self::$info['http_code']; 607 | if (self::$raw === false) 608 | { 609 | self::$error = ' Curl error: ' . 
curl_error( self::$ch ); 610 | } 611 | 612 | // 关闭句柄 613 | curl_close( self::$ch ); 614 | 615 | // 请求成功之后才把URL存起来 616 | self::$url = $url; 617 | self::$content = self::get_response_body($domain); 618 | //$data = substr($data, 10); 619 | //$data = gzinflate($data); 620 | return self::$content; 621 | } 622 | 623 | } 624 | 625 | 626 | -------------------------------------------------------------------------------- /core/selector.php: -------------------------------------------------------------------------------- 1 | 10 | // +---------------------------------------------------------------------- 11 | 12 | //---------------------------------- 13 | // PHPSpider选择器类文件 14 | //---------------------------------- 15 | 16 | class selector 17 | { 18 | /** 19 | * 版本号 20 | * @var string 21 | */ 22 | const VERSION = '1.0.0'; 23 | public static $dom = null; 24 | public static $dom_auth = null; 25 | public static $xpath = null; 26 | public static $error = null; 27 | 28 | public static function select($html, $selector, $selector_type = 'xpath') 29 | { 30 | if (empty($html) || empty($selector)) 31 | { 32 | return false; 33 | } 34 | 35 | $selector_type = strtolower($selector_type); 36 | if ($selector_type == 'xpath') 37 | { 38 | return self::_xpath_select($html, $selector); 39 | } 40 | elseif ($selector_type == 'regex') 41 | { 42 | return self::_regex_select($html, $selector); 43 | } 44 | elseif ($selector_type == 'css') 45 | { 46 | return self::_css_select($html, $selector); 47 | } 48 | } 49 | 50 | public static function remove($html, $selector, $selector_type = 'xpath') 51 | { 52 | if (empty($html) || empty($selector)) 53 | { 54 | return false; 55 | } 56 | 57 | $remove_html = ""; 58 | $selector_type = strtolower($selector_type); 59 | if ($selector_type == 'xpath') 60 | { 61 | $remove_html = self::_xpath_select($html, $selector, true); 62 | } 63 | elseif ($selector_type == 'regex') 64 | { 65 | $remove_html = self::_regex_select($html, $selector, true); 66 | } 67 | elseif 
($selector_type == 'css') 68 | { 69 | $remove_html = self::_css_select($html, $selector, true); 70 | } 71 | $html = str_replace($remove_html, "", $html); 72 | return $html; 73 | } 74 | 75 | /** 76 | * xpath选择器 77 | * 78 | * @param mixed $html 79 | * @param mixed $selector 80 | * @return void 81 | * @author seatle 82 | * @created time :2016-10-26 12:53 83 | */ 84 | private static function _xpath_select($html, $selector, $remove = false) 85 | { 86 | if (!is_object(self::$dom)) 87 | { 88 | self::$dom = new DOMDocument(); 89 | } 90 | 91 | // 如果加载的不是之前的HTML内容,替换一下验证标识 92 | if (self::$dom_auth != md5($html)) 93 | { 94 | self::$dom_auth = md5($html); 95 | @self::$dom->loadHTML(''.$html); 96 | self::$xpath = new DOMXpath(self::$dom); 97 | } 98 | 99 | //libxml_use_internal_errors(true); 100 | //self::$dom->loadHTML(''.$html); 101 | //$errors = libxml_get_errors(); 102 | //if (!empty($errors)) 103 | //{ 104 | //print_r($errors); 105 | //exit; 106 | //} 107 | 108 | $elements = @self::$xpath->query($selector); 109 | if ($elements === false) 110 | { 111 | self::$error = "the selector in the xpath(\"{$selector}\") syntax errors"; 112 | return false; 113 | } 114 | 115 | $result = array(); 116 | if (!is_null($elements)) 117 | { 118 | foreach ($elements as $element) 119 | { 120 | // 如果是删除操作,取一整块代码 121 | if ($remove) 122 | { 123 | $content = self::$dom->saveXml($element); 124 | } 125 | else 126 | { 127 | $nodeName = $element->nodeName; 128 | $nodeType = $element->nodeType; // 1.Element 2.Attribute 3.Text 129 | //$nodeAttr = $element->getAttribute('src'); 130 | //$nodes = util::node_to_array(self::$dom, $element); 131 | //echo $nodes['@src']."\n"; 132 | // 如果是img标签,直接取src值 133 | if ($nodeType == 1 && in_array($nodeName, array('img'))) 134 | { 135 | $content = $element->getAttribute('src'); 136 | } 137 | // 如果是标签属性,直接取节点值 138 | elseif ($nodeType == 2 || $nodeType == 3 || $nodeType == 4) 139 | { 140 | $content = $element->nodeValue; 141 | } 142 | else 143 | { 144 | // 
保留nodeValue里的html符号,给children二次提取 145 | $content = self::$dom->saveXml($element); 146 | //$content = trim(self::$dom->saveHtml($element)); 147 | $content = preg_replace(array("#^<{$nodeName}.*>#isU","#$#isU"), array('', ''), $content); 148 | } 149 | } 150 | $result[] = $content; 151 | } 152 | } 153 | if (empty($result)) 154 | { 155 | return false; 156 | } 157 | // 如果只有一个元素就直接返回string,否则返回数组 158 | return count($result) > 1 ? $result : $result[0]; 159 | } 160 | 161 | /** 162 | * 正则选择器 163 | * 164 | * @param mixed $html 165 | * @param mixed $selector 166 | * @return void 167 | * @author seatle 168 | * @created time :2016-10-26 12:53 169 | */ 170 | private static function _regex_select($html, $selector, $remove = false) 171 | { 172 | if(@preg_match_all($selector, $html, $out) === false) 173 | { 174 | self::$error = "the selector in the regex(\"{$selector}\") syntax errors"; 175 | return false; 176 | } 177 | $count = count($out); 178 | $result = array(); 179 | // 一个都没有匹配到 180 | if ($count == 0) 181 | { 182 | return false; 183 | } 184 | // 只匹配一个,就是只有一个 () 185 | elseif ($count == 2) 186 | { 187 | // 删除的话取匹配到的所有内容 188 | if ($remove) 189 | { 190 | $result = $out[0]; 191 | } 192 | else 193 | { 194 | $result = $out[1]; 195 | } 196 | } 197 | else 198 | { 199 | for ($i = 1; $i < $count; $i++) 200 | { 201 | // 如果只有一个元素,就直接返回好了 202 | $result[] = count($out[$i]) > 1 ? $out[$i] : $out[$i][0]; 203 | } 204 | } 205 | if (empty($result)) 206 | { 207 | return false; 208 | } 209 | 210 | return count($result) > 1 ? 
$result : $result[0]; 211 | } 212 | 213 | /** 214 | * css选择器 215 | * 216 | * @param mixed $html 217 | * @param mixed $selector 218 | * @return void 219 | * @author seatle 220 | * @created time :2016-10-26 12:53 221 | */ 222 | private static function _css_select($html, $selector, $remove = false) 223 | { 224 | // 如果加载的不是之前的HTML内容,替换一下验证标识 225 | if (self::$dom_auth != md5($html)) 226 | { 227 | self::$dom_auth = md5($html); 228 | phpQuery::loadDocumentHTML($html); 229 | } 230 | if ($remove) 231 | { 232 | return pq($selector)->remove(); 233 | } 234 | else 235 | { 236 | return pq($selector)->html(); 237 | } 238 | } 239 | 240 | public static function find_all($html, $selector) 241 | { 242 | } 243 | } 244 | -------------------------------------------------------------------------------- /core/util.php: -------------------------------------------------------------------------------- 1 | 10 | // +---------------------------------------------------------------------- 11 | 12 | //---------------------------------- 13 | // PHPSpider实用函数集合类文件 14 | //---------------------------------- 15 | 16 | class util 17 | { 18 | /** 19 | * 文件锁 20 | * 如果没有锁,就加一把锁并且执行逻辑,然后删除锁 21 | * if (!util::lock('statistics_offer')) 22 | * { 23 | * util::lock('statistics_offer'); 24 | * ... 
25 | * util::unlock('statistics_offer'); 26 | * } 27 | * 否则输出锁存在 28 | * else 29 | * { 30 | * echo "process has been locked\n"; 31 | * } 32 | * 33 | * @param mixed $lock_name 34 | * @param int $lock_timeout 35 | * @return void 36 | * @author seatle 37 | * @created time :2016-02-18 14:28 38 | */ 39 | public static function lock($lock_name, $lock_timeout = 600) 40 | { 41 | $lock = util::get_file(PATH_DATA."/lock/{$lock_name}.lock"); 42 | if ($lock) 43 | { 44 | $time = time() - $lock; 45 | // 还没到10分钟,说明进程还活着 46 | if ($time < $lock_timeout) 47 | { 48 | return true; 49 | } 50 | unlink(PATH_DATA."/lock/{$lock_name}.lock"); 51 | } 52 | util::put_file(PATH_DATA."/lock/{$lock_name}.lock", time()); 53 | return false; 54 | } 55 | 56 | public static function unlock($lock_name) 57 | { 58 | unlink(PATH_DATA."/lock/{$lock_name}.lock"); 59 | } 60 | 61 | public static function time2second($time, $is_log = true) 62 | { 63 | if(is_numeric($time)) 64 | { 65 | $value = array( 66 | "years" => 0, "days" => 0, "hours" => 0, 67 | "minutes" => 0, "seconds" => 0, 68 | ); 69 | if($time >= 31556926) 70 | { 71 | $value["years"] = floor($time/31556926); 72 | $time = ($time%31556926); 73 | } 74 | if($time >= 86400) 75 | { 76 | $value["days"] = floor($time/86400); 77 | $time = ($time%86400); 78 | } 79 | if($time >= 3600) 80 | { 81 | $value["hours"] = floor($time/3600); 82 | $time = ($time%3600); 83 | } 84 | if($time >= 60) 85 | { 86 | $value["minutes"] = floor($time/60); 87 | $time = ($time%60); 88 | } 89 | $value["seconds"] = floor($time); 90 | //return (array) $value; 91 | //$t = $value["years"] ."y ". $value["days"] ."d ". $value["hours"] ."h ". $value["minutes"] ."m ".$value["seconds"]."s"; 92 | if ($is_log) 93 | { 94 | $t = $value["days"] ."d ". $value["hours"] ."h ". $value["minutes"] ."m ".$value["seconds"]."s"; 95 | } 96 | else 97 | { 98 | $t = $value["days"] ." days ". $value["hours"] ." hours ". $value["minutes"] ." 
minutes";
            }
            return $t;

        }
        else
        {
            return false;
        }
    }

    /**
     * List the dates between two days as 'Y-m-d' strings.
     *
     * @param string $day_sta Start day, parsed with strtotime()
     * @param mixed $day_end End day; true (the default) means today
     * @param int $range Step in seconds between two entries (86400 = one day)
     * @return array
     */
    public static function get_days($day_sta, $day_end = true, $range = 86400)
    {
        if ($day_end === true) $day_end = date('Y-m-d');

        return array_map(function ($time) {
            return date('Y-m-d', $time);
        }, range(strtotime($day_sta), strtotime($day_end), $range));
    }

    /**
     * Count the lines of a file.
     *
     * @param mixed $filepath
     * @return int Number of lines; 0 when the file cannot be opened
     * @author seatle
     * @created time :2016-03-31 21:54
     */
    public static function get_file_line($filepath)
    {
        $line = 0;
        $fp = fopen($filepath, 'r');
        if (!$fp)
        {
            return 0;
        }
        // Compare against false explicitly: an empty line or a line holding only "0"
        // is falsy and would otherwise end the loop before the end of the file
        while (fgets($fp) !== false)
        {
            $line++;
        }
        fclose($fp);
        return $line;
    }

    /**
     * Get the shard number for a value.
     *
     * @param mixed $item_value Unique index value
     * @param int $table_num Number of tables
     * @return string
     * @author seatle
     * @created time :2015-10-22 23:25
     */
    public static function get_table_num($item_value, $table_num = 100)
    {
        // sha1: returns a 40-character hexadecimal string
        $item_value = sha1(strtolower($item_value));
        // base_convert: turns the hex digits into decimal so the modulo can be taken
        // str_pad: left-pads with zeros; 3 digits when there are more than 100 tables, otherwise 2
        $step = $table_num > 100 ? 
3 : 2; 159 | $item_value = str_pad(base_convert(substr($item_value, -2), 16, 10) % $table_num, $step, "0", STR_PAD_LEFT); 160 | return $item_value; 161 | } 162 | 163 | /** 164 | * 获得表面 165 | * 166 | * @param mixed $table_name 表名 167 | * @param mixed $item_value 唯一索引 168 | * @param int $table_num 表数量 169 | * @return void 170 | * @author seatle 171 | * @created time :2015-10-22 23:25 172 | */ 173 | public static function get_table_name($table_name, $item_value, $table_num = 100) 174 | { 175 | //sha1:返回一个40字符长度的16进制数字 176 | $item_value = sha1(strtolower($item_value)); 177 | //base_convert:进制建转换,下面是把16进制转成10进制,方便做除法运算 178 | //str_pad:把字符串填充为指定的长度,下面是在左边加0,共3位 179 | $step = $table_num > 100 ? 3 : 2; 180 | $item_value = str_pad(base_convert(substr($item_value, -2), 16, 10) % $table_num, $step, "0", STR_PAD_LEFT); 181 | return $table_name."_".$item_value; 182 | } 183 | 184 | // 获得当前使用内存 185 | public static function memory_get_usage() 186 | { 187 | $memory = memory_get_usage(); 188 | return self::format_bytes($memory); 189 | } 190 | 191 | // 获得最高使用内存 192 | public static function memory_get_peak_usage() 193 | { 194 | $memory = memory_get_peak_usage(); 195 | return self::format_bytes($memory); 196 | } 197 | 198 | // 转换大小单位 199 | public static function format_bytes($size) 200 | { 201 | $unit = array('b', 'kb', 'mb', 'gb', 'tb', 'pb'); 202 | return @round($size / pow(1024, ($i = floor(log($size, 1024)))), 2) . ' ' . 
$unit[$i];
    }

    /**
     * Approximate size of an array, measured as the byte length of its print_r() dump.
     *
     * @param mixed $arr Array to measure
     * @return string Size formatted by format_bytes()
     */
    public static function array_size($arr)
    {
        ob_start();
        print_r($arr);
        $mem = ob_get_contents();
        ob_end_clean();
        // Drop the indentation print_r() adds so it does not count towards the size
        $mem = preg_replace("/\n +/", "", $mem);
        $mem = strlen($mem);
        return self::format_bytes($mem);
    }

    /**
     * Random string of decimal digits.
     *
     * @param int $num Number of digits
     * @return string
     * @author seatle
     * @created time :2016-09-18 10:17
     */
    public static function rand_num($num = 7)
    {
        $rand = "";
        for ($i = 0; $i < $num; $i ++)
        {
            $rand .= mt_rand(0, 9);
        }
        return $rand;
    }

    /**
     * Random string of lower-case letters and digits.
     *
     * Not suitable for secrets: the generator is not cryptographically secure.
     *
     * @param int $num Number of characters
     * @return string String of exactly $num characters
     * @author seatle
     * @created time :2016-09-18 10:17
     */
    public static function rand_str($num = 10)
    {
        $chars = 'abcdefghijklmnopqrstuvwxyz0123456789';
        // The upper bound of mt_rand() is inclusive, so the last valid index is length - 1;
        // using the length itself picks an out-of-range offset and shortens the result
        $max = strlen($chars) - 1;
        $string = "";
        for ($i = 0; $i < $num; $i ++)
        {
            $string .= $chars[mt_rand(0, $max)];
        }
        return $string;
    }

    /**
     * Convert Chinese characters to pinyin.
     *
     * @param mixed $str Chinese text
     * @param int $ishead
     * @param int $isclose
     * @static
     * @access public
     * @return string
     */
    public static function pinyin($str, $ishead = 0, $isclose = 1)
    {
        // $str = iconv("utf-8", "gbk//ignore", $str);
        $str = mb_convert_encoding($str, "gbk", "utf-8");
        global $pinyins;
        $restr = '';
        $str = trim($str);
        $slen = strlen($str);
        if ($slen < 2)
        {
            return $str;
        }
        if (count($pinyins) == 0)
        {
            $fp = fopen(PATH_DATA . '/pinyin.dat', 'r');
            while (!feof($fp))
            {
                $line = trim(fgets($fp));
                $pinyins[$line[0] . 
$line[1]] = substr($line, 3, strlen($line) - 3);
            }
            fclose($fp);
        }
        for ($i = 0; $i < $slen; $i ++)
        {
            if (ord($str[$i]) > 0x80)
            {
                // Bytes above 0x80 are handled as the lead byte of a two-byte GBK character
                $c = $str[$i] . $str[$i + 1];
                $i ++;
                if (isset($pinyins[$c]))
                {
                    if ($ishead == 0)
                    {
                        $restr .= $pinyins[$c];
                    }
                    else
                    {
                        $restr .= $pinyins[$c][0];
                    }
                }
                else
                {
                    // $restr .= "_";
                }
            }
            else if (preg_match("/[a-z0-9]/i", $str[$i]))
            {
                $restr .= $str[$i];
            }
            else
            {
                // $restr .= "_";
            }
        }
        if ($isclose == 0)
        {
            unset($pinyins);
        }
        return $restr;
    }

    /**
     * Get the index letter for a string.
     *
     * An ASCII letter or digit in first position is returned unchanged. Otherwise
     * the string is converted from UTF-8 to GBK and the first two bytes are
     * looked up in a table of code ranges, each mapped to an upper-case letter.
     *
     * @param mixed $s0 UTF-8 text
     * @return mixed The first character, an upper-case letter, or 0 when nothing matches
     * @author seatle
     * @created time :2016-09-18 10:17
     */
    public static function letter_first($s0)
    {
        $s0 = (string) $s0;
        if ($s0 === '')
        {
            return 0;
        }
        // Square-bracket offsets: the {0} form was removed in PHP 8
        $firstchar_ord = ord(strtoupper($s0[0]));
        // 65..90 is 'A'..'Z' (91 is '['), 48..57 is '0'..'9'
        if (($firstchar_ord >= 65 && $firstchar_ord <= 90) || ($firstchar_ord >= 48 && $firstchar_ord <= 57))
        {
            return $s0[0];
        }
        // $s = iconv("utf-8", "gbk//ignore", $s0);
        $s = mb_convert_encoding($s0, "gbk", "utf-8");
        if (strlen($s) < 2)
        {
            // A single byte cannot be a two-byte GBK character
            return 0;
        }
        $asc = ord($s[0]) * 256 + ord($s[1]) - 65536;
        // Inclusive code range for each letter
        $ranges = array(
            'A' => array(-20319, -20284),
            'B' => array(-20283, -19776),
            'C' => array(-19775, -19219),
            'D' => array(-19218, -18711),
            'E' => array(-18710, -18527),
            'F' => array(-18526, -18240),
            'G' => array(-18239, -17923),
            'H' => array(-17922, -17418),
            'J' => array(-17417, -16475),
            'K' => array(-16474, -16213),
            'L' => array(-16212, -15641),
            'M' => array(-15640, -15166),
            'N' => array(-15165, -14923),
            'O' => array(-14922, -14915),
            'P' => array(-14914, -14631),
            'Q' => array(-14630, -14150),
            'R' => array(-14149, -14091),
            'S' => array(-14090, -13319),
            'T' => array(-13318, -12839),
            'W' => array(-12838, -12557),
            'X' => array(-12556, -11848),
            'Y' => array(-11847, -11056),
            'Z' => array(-11055, -10247),
        );
        foreach ($ranges as $letter => $range)
        {
            if ($asc >= $range[0] && $asc <= $range[1])
            {
                return $letter;
            }
        }
        return 0; // null
    }

    /**
     * Timestamp of 23:59:59 on the day that lies $day days before today.
     *
     * @param mixed $day Number of days to go back
     * @return int
     * @author seatle
     * @created time :2016-09-18 10:17
     */
    public static function getxtime($day)
    {
        $day = intval($day);
        return mktime(23, 59, 59, date("m"), date("d") - $day, date("y"));
    }

    /**
     * Read a file or URL.
     *
     * cURL is tried first when available; if it yields nothing, the read is
     * retried with file_get_contents(). Content that evaluates to false (empty
     * string, "0") is reported as false.
     *
     * @param string $url File path or URL
     * @param int $timeout Timeout in seconds
     * @return mixed Content, or false
     */
    public static function get_file($url, $timeout = 10)
    {
        if (function_exists('curl_init'))
        {
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_HEADER, 0);
            curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
            $content = curl_exec($ch);
            curl_close($ch);
            if ($content) return $content;
        }
        $ctx = stream_context_create(array('http' => array('timeout' => $timeout)));
        $content = @file_get_contents($url, 0, $ctx);
        if ($content) return $content;
        return false;
    }

    /**
     * Write a file; the parent directory is created recursively when it does not exist.
     */
    public static function put_file($file, $content, $flag = 0)
    {
        $pathinfo = pathinfo($file);
        if (!empty($pathinfo['dirname']))
        {
            if (file_exists($pathinfo['dirname']) === false)
            {
                if (@mkdir($pathinfo['dirname'], 0777, true) === false)
                {
                    return false;
                }
}
        }
        if ($flag === FILE_APPEND)
        {
            // Concurrent php-fpm writers to one file can lose data; the locked variant is kept below for reference
            //return @file_put_contents($file, $content, FILE_APPEND|LOCK_EX);
            return @file_put_contents($file, $content, FILE_APPEND);
        }
        else
        {
            return @file_put_contents($file, $content, LOCK_EX);
        }
    }

    /**
     * Make sure a directory exists, creating it recursively when it does not.
     *
     * @param mixed $path Directory path
     * @static
     * @access public
     * @return bool or string The path, or false when it could not be created
     */
    public static function path_exists($path)
    {
        // Appending a dummy file name makes pathinfo() report $path itself as 'dirname'
        $pathinfo = pathinfo($path . '/tmp.txt');
        if (!empty($pathinfo['dirname']))
        {
            if (file_exists($pathinfo['dirname']) === false)
            {
                if (mkdir($pathinfo['dirname'], 0777, true) === false)
                {
                    return false;
                }
            }
        }
        return $path;
    }

    /**
     * Delete a directory and everything below it.
     *
     * Symbolic links are removed as links; the directory a link points to is
     * never descended into.
     *
     * @param mixed $dir Directory to delete
     * @return bool true when the directory was removed, false otherwise
     * @author seatle
     * @created time :2016-09-18 10:17
     */
    public static function deldir($dir)
    {
        if (!is_dir($dir))
        {
            return false;
        }
        $dh = opendir($dir);
        if ($dh === false)
        {
            return false;
        }
        // Empty the directory first. The comparison with false matters: an entry
        // named "0" is falsy and would otherwise end the loop early
        while (($file = readdir($dh)) !== false)
        {
            if ($file == "." || $file == "..")
            {
                continue;
            }
            $fullpath = $dir."/".$file;
            if (is_dir($fullpath) && !is_link($fullpath))
            {
                self::deldir($fullpath);
            }
            else
            {
                unlink($fullpath);
            }
        }

        closedir($dh);
        // Then remove the directory itself
        return rmdir($dir);
    }

    /**
     * Change permissions recursively.
     *
     * @param mixed $path Directory or file
     * @param mixed $filemode Permission mode
     * @return bool
     */
    public static function chmodr($path, $filemode)
    {
        if (!is_dir($path))
        {
            return @chmod($path, $filemode);
        }

        $dh = opendir($path);
        while (($file = readdir($dh)) !== false)
        {
            if ($file != '.' && $file != '..')
            {
                $fullpath = $path . '/' . 
$file; 519 | if (is_link($fullpath)) 520 | { 521 | return FALSE; 522 | } 523 | elseif (!is_dir($fullpath) && !@chmod($fullpath, $filemode)) 524 | { 525 | return FALSE; 526 | } 527 | elseif (!self::chmodr($fullpath, $filemode)) 528 | { 529 | return FALSE; 530 | } 531 | } 532 | } 533 | 534 | closedir($dh); 535 | 536 | if (@chmod($path, $filemode)) 537 | { 538 | return TRUE; 539 | } 540 | else 541 | { 542 | return FALSE; 543 | } 544 | } 545 | 546 | /** 547 | * 数组格式化为CSV 548 | * 549 | * @param mixed $data 550 | * @return void 551 | * @author seatle 552 | * @created time :2016-07-29 11:32 553 | */ 554 | public static function format_csv($data) 555 | { 556 | foreach ($data as $k=>$v) 557 | { 558 | $v = str_replace(",", "", $v); 559 | $v = str_replace(",", "", $v); 560 | $data[$k] = $v; 561 | } 562 | return implode(",", $data); 563 | } 564 | 565 | /** 566 | * 判断是否为utf8字符串 567 | * @parem $str 568 | * @return bool 569 | */ 570 | public static function is_utf8($str) 571 | { 572 | if ($str === mb_convert_encoding(mb_convert_encoding($str, "UTF-32", "UTF-8"), "UTF-8", "UTF-32")) 573 | { 574 | return true; 575 | } 576 | else 577 | { 578 | return false; 579 | } 580 | } 581 | 582 | /** 583 | * 获取文件编码 584 | * @param $string 585 | * @return string 586 | */ 587 | public static function get_encoding($string) 588 | { 589 | $encoding = mb_detect_encoding($string, array('UTF-8', 'GBK', 'GB2312', 'LATIN1', 'ASCII', 'BIG5')); 590 | return strtolower($encoding); 591 | } 592 | 593 | /** 594 | * 转换数组值的编码格式 595 | * @param array $arr 596 | * @param string $toEncoding 597 | * @param string $fromEncoding 598 | * @return array 599 | */ 600 | public static function array_iconv($arr, $from_encoding, $to_encoding) 601 | { 602 | eval('$arr = '.iconv($from_encoding, $to_encoding.'//IGNORE', var_export($arr,TRUE)).';'); 603 | return $arr; 604 | } 605 | 606 | /** 607 | * 从普通时间返回Linux时间截(strtotime中文处理版) 608 | * @parem string $dtime 609 | * @return int 610 | */ 611 | public static function 
cn_strtotime($dtime) 612 | { 613 | if (!preg_match("/[^0-9]/", $dtime)) 614 | { 615 | return $dtime; 616 | } 617 | $dtime = trim($dtime); 618 | $dt = Array(1970, 1, 1, 0, 0, 0); 619 | $dtime = preg_replace("/[\r\n\t]|日|秒/", " ", $dtime); 620 | $dtime = str_replace("年", "-", $dtime); 621 | $dtime = str_replace("月", "-", $dtime); 622 | $dtime = str_replace("时", ":", $dtime); 623 | $dtime = str_replace("分", ":", $dtime); 624 | $dtime = trim(preg_replace("/[ ]{1,}/", " ", $dtime)); 625 | $ds = explode(" ", $dtime); 626 | $ymd = explode("-", $ds[0]); 627 | if (!isset($ymd[1])) 628 | { 629 | $ymd = explode(".", $ds[0]); 630 | } 631 | if (isset($ymd[0])) 632 | { 633 | $dt[0] = $ymd[0]; 634 | } 635 | if (isset($ymd[1])) $dt[1] = $ymd[1]; 636 | if (isset($ymd[2])) $dt[2] = $ymd[2]; 637 | if (strlen($dt[0]) == 2) $dt[0] = '20' . $dt[0]; 638 | if (isset($ds[1])) 639 | { 640 | $hms = explode(":", $ds[1]); 641 | if (isset($hms[0])) $dt[3] = $hms[0]; 642 | if (isset($hms[1])) $dt[4] = $hms[1]; 643 | if (isset($hms[2])) $dt[5] = $hms[2]; 644 | } 645 | foreach ($dt as $k => $v) 646 | { 647 | $v = preg_replace("/^0{1,}/", '', trim($v)); 648 | if ($v == '') 649 | { 650 | $dt[$k] = 0; 651 | } 652 | } 653 | $mt = mktime($dt[3], $dt[4], $dt[5], $dt[1], $dt[2], $dt[0]); 654 | if (!empty($mt)) 655 | { 656 | return $mt; 657 | } 658 | else 659 | { 660 | return strtotime($dtime); 661 | } 662 | } 663 | 664 | public static function cn_substr($string, $length = 80, $etc = '...', $count_words = true) 665 | { 666 | mb_internal_encoding("UTF-8"); 667 | if ($length == 0) return ''; 668 | if (strlen($string) <= $length) return $string; 669 | preg_match_all("/[\x01-\x7f]|[\xc2-\xdf][\x80-\xbf]|\xe0[\xa0-\xbf][\x80-\xbf]|[\xe1-\xef][\x80-\xbf][\x80-\xbf]|\xf0[\x90-\xbf][\x80-\xbf][\x80-\xbf]|[\xf1-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf]/", $string, $info); 670 | if ($count_words) 671 | { 672 | $j = 0; 673 | $wordscut = ""; 674 | for ($i = 0; $i < count($info[0]); $i ++) 675 | { 676 | $wordscut .= 
$info[0][$i]; 677 | if (ord($info[0][$i]) >= 128) 678 | { 679 | $j = $j + 2; 680 | } 681 | else 682 | { 683 | $j = $j + 1; 684 | } 685 | if ($j >= $length) 686 | { 687 | return $wordscut . $etc; 688 | } 689 | } 690 | return join('', $info[0]); 691 | } 692 | return join("", array_slice($info[0], 0, $length)) . $etc; 693 | } 694 |
    /**
     * Get the lower-cased extension of a file name.
     *
     * The extension is whatever follows the last '.'; a name without any dot
     * is returned whole (lower-cased), exactly as explode()/array_pop() yield it.
     *
     * @param mixed $file_name file name
     * @static
     *
     * @access public
     * @return string
     */
    public static function get_extension($file_name)
    {
        $parts = explode('.', $file_name);
        return strtolower(array_pop($parts));
    }

    /**
     * Resolve the address a URL redirects to.
     *
     * Only a 301/302 first response is treated as a redirect. When several
     * Location headers were collected, the last one is returned. Any other
     * outcome (empty URL, failed lookup, no redirect, missing Location)
     * returns the input URL unchanged.
     *
     * @param string $url
     * @return string
     */
    public static function getrealurl($url)
    {
        if (empty($url))
        {
            return $url;
        }
        $header = get_headers($url, 1);
        // $header[1] only exists when a second status line (i.e. a followed redirect) was received
        if (empty($header[0]) || empty($header[1]))
        {
            return $url;
        }
        // Compare against false explicitly instead of relying on the match offset being non-zero
        $is_redirect = strpos($header[0], '301') !== false || strpos($header[0], '302') !== false;
        if (!$is_redirect || empty($header['Location']))
        {
            return $url;
        }
        if (is_array($header['Location']))
        {
            return $header['Location'][count($header['Location']) - 1];
        }
        return $header['Location'];
    }

    /**
     * Inflate data that a server compressed with "Content-Encoding: gzip".
     *
     * The gzip header is skipped by hand: 10 fixed bytes plus the optional
     * sections announced by the flag byte at offset 3. The remainder is handed
     * to gzinflate(). If inflating fails, the input is returned untouched.
     *
     * @param string $data
     * @return string
     */
    public static function gzdecode($data)
    {
        $flags = ord(substr($data, 3, 1));
        $headerlen = 10;
        $extralen = 0;
        $filenamelen = 0;
        if ($flags & 4) // Extra field: 2-byte little-endian length followed by the payload
        {
            $extralen = unpack('v', substr($data, 10, 2));
            $extralen = $extralen[1];
            $headerlen += 2 + $extralen;
        }
        if ($flags & 8) // Filename (NUL terminated)
            $headerlen = strpos($data, chr(0), $headerlen) + 1;
        if ($flags & 16) // Comment (NUL terminated)
            $headerlen = strpos($data, chr(0), $headerlen) + 1;
        if ($flags & 2) // Header CRC
            $headerlen += 2;
        $unpacked = @gzinflate(substr($data, $headerlen));
        if ($unpacked === FALSE) $unpacked = $data;
        return $unpacked;
    }

    /**
     * Convert a numeric amount to Chinese currency wording (元 / 角 / 分).
     *
     * The value is rounded to two decimals first. Only a positive integer part
     * is spelled out; the unit table covers four 4-digit groups.
     *
     * @param string|integer|float $num  amount to convert
     * @param boolean              $sim  TRUE for the simple numerals (一二三…);
     *                                   the default FALSE uses the financial
     *                                   capitals (壹贰叁…)
     * @return string the wording, or an error message when $num is not numeric
     */
    public static function number2chinese($num, $sim = FALSE)
    {
        if (!is_numeric($num)) return '含有非数字非小数点字符!';
        $char = $sim ? array('零', '一', '二', '三', '四', '五', '六', '七', '八', '九') : array('零', '壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖');
        $unit = $sim ? array('', '十', '百', '千', '', '万', '亿', '兆') : array('', '拾', '佰', '仟', '', '萬', '億', '兆');
        $retval = '';

        $num = sprintf("%01.2f", $num);

        list ($num, $dec) = explode('.', $num);

        // Fractional part: first digit is 角, second digit is 分
        if ($dec[0] > 0)
        {
            $retval .= "{$char[$dec[0]]}角";
        }
        if ($dec[1] > 0)
        {
            $retval .= "{$char[$dec[1]]}分";
        }

        // Integer part, walked from the lowest digit upwards
        if ($num > 0)
        {
            $retval = "元" . $retval;
            $f = 1; // 1 while only trailing zeros have been seen
            $str = strrev((string) intval($num));
            $out = array();
            for ($i = 0, $c = strlen($str); $i < $c; $i ++)
            {
                if ($str[$i] > 0)
                {
                    $f = 0;
                }
                if ($f == 1 && $str[$i] == 0)
                {
                    $out[$i] = "";
                }
                else
                {
                    $out[$i] = $char[$str[$i]];
                }
                $out[$i] .= $str[$i] != '0' ? $unit[$i % 4] : '';
                // Collapse a run of consecutive zeros into a single 零
                if ($i > 1 and $str[$i] + $str[$i - 1] == 0)
                {
                    $out[$i] = '';
                }
                // Every fourth digit closes a group and gets the group unit (万 / 亿 / 兆)
                if ($i % 4 == 0)
                {
                    $out[$i] .= $unit[4 + (int) floor($i / 4)];
                }
            }
            $retval = join('', array_reverse($out)) . $retval;
        }
        return $retval;
    }

    /**
     * Wrap a string in an ANSI SGR colour sequence for terminal output.
     *
     * @param string $str    text to colour
     * @param string $status one of succ|error|warn|note|debug; anything else
     *                       (including the default "info") is left uncoloured
     * @return string the text followed by a reset sequence
     */
    public static function colorize($str, $status = "info")
    {
        $out = "";
        switch ($status)
        {
            case 'succ':
                $out = "\033[32m"; // Green
                break;
            case "error":
                $out = "\033[31m"; // Red
                break;
            case "warn":
                $out = "\033[33m"; // Yellow
                break;
            case "note":
                $out = "\033[34m"; // Blue
                break;
            case "debug":
                $out = "\033[36m"; // Cyan
                break;
            default:
                $out = "\033[0m"; // info: reset, no colour
                break;
        }
        return $out.$str."\033[0m";
    }

    /**
     * Recursively convert a DOM node into a nested array.
     *
     * Attributes are stored under '@name' keys. A child holding exactly one
     * text node is stored as its string value, any other child is converted
     * recursively. Children sharing a local name overwrite each other.
     *
     * @param DOMDocument $dom
     * @param DOMNode     $node
     * @return array|string|false false for invalid arguments and for nodes
     *                            without a local name
     */
    public static function node_to_array($dom, $node)
    {
        if (!($dom instanceof DOMDocument) || !($node instanceof DOMNode))
        {
            return false;
        }

        $array = array();
        // Discard nodes without a local name
        $localName = trim( $node->localName );
        if( empty($localName))
        {
            return false;
        }
        // NOTE(review): text nodes appear to have no local name, so they would
        // already be rejected above and this branch never taken - confirm.
        if( XML_TEXT_NODE == $node->nodeType )
        {
            return $node->nodeValue;
        }
        // Only element nodes carry an attribute map; it is null on other node types
        if ($node->hasAttributes())
        {
            foreach ($node->attributes as $attr)
            {
                $array['@'.$attr->localName] = $attr->nodeValue;
            }
        }
        foreach ($node->childNodes as $childNode)
        {
            if ( (isset($childNode->childNodes->length) && 1 == $childNode->childNodes->length) &&
                XML_TEXT_NODE == $childNode->firstChild->nodeType )
            {
                $array[$childNode->localName] = $childNode->nodeValue;
            }
            else
            {
                if( false !== ($a = self::node_to_array( $dom, $childNode)))
                {
                    $array[$childNode->localName] = $a;
                }
            }
        }
        return $array;
    }

    /**
     * Tell whether PHP is running on Windows (PHP_OS starts with "WIN").
     *
     * @return bool
     */
    public static function is_win()
    {
        return strtoupper(substr(PHP_OS,0,3))==="WIN";
    }
}

-------------------------------------------------------------------------------- /core/worker.php: -------------------------------------------------------------------------------- 1 | 10 | //
+---------------------------------------------------------------------- 11 | 12 |
//----------------------------------
// Worker: multi-process management class
//----------------------------------

class worker
{
    // Number of worker processes to fork
    public $count = 0;
    // Worker id; workers are numbered from 1, 0 is used by the master process
    public $worker_id = 0;
    // PID of the current (master or worker) process
    public $worker_pid = 0;
    // User the worker processes should run as
    public $user = '';
    // Process title
    public $title = '';
    // Whether each worker runs only once (no re-fork after it exits)
    public $run_once = true;
    // Whether to print log output
    public $log_show = false;
    // Callback invoked when the master process starts
    public $on_start = false;
    // Callback invoked when the master process stops
    public $on_stop = false;
    // Callback invoked when a worker process starts
    public $on_worker_start = false;
    // Callback invoked when a worker process stops
    public $on_worker_stop = false;
    // PID of the master process
    protected static $_master_pid = 0;
    // worker_id => PID of each worker; 0 marks a worker that has exited
    protected static $_worker_pids = array();
    // Start time of the master / worker process
    public $time_start = 0;
    // Run state of the master / worker process [starting|running|shutdown|reload]
    protected static $_status = "starting";


    public function __construct()
    {
        self::$_master_pid = posix_getpid();
        // Enable ticks; per the original author the parent only receives signals with this in place
        declare(ticks = 1);
        $this->install_signal();
    }

    /**
     * Install the signal handlers.
     * @return void
     */
    protected function install_signal()
    {
        // stop
        pcntl_signal(SIGINT, array($this, 'signal_handler'), false);
        // reload
        pcntl_signal(SIGUSR1, array($this, 'signal_handler'), false);
        // status
        pcntl_signal(SIGUSR2, array($this, 'signal_handler'), false);
        // ignore
        pcntl_signal(SIGPIPE, SIG_IGN, false);
    }

    /**
     * Uninstall the signal handlers (the signals are ignored afterwards).
     * @return void
     */
    protected function uninstall_signal()
    {
        // uninstall stop signal handler
        pcntl_signal(SIGINT, SIG_IGN, false);
        // uninstall reload signal handler
        pcntl_signal(SIGUSR1, SIG_IGN, false);
        // uninstall status signal handler
        pcntl_signal(SIGUSR2, SIG_IGN, false);
    }

    /**
     * Signal handler. Public because it is invoked from outside the class.
     * @param int $signal
     */
    public function signal_handler($signal) {
        switch ($signal) {
            // stop
            case SIGINT:
                // Runs in the master and in every worker
                $this->stop_all();
                break;
            // reload
            case SIGUSR1:
                echo "reload\n";
                break;
            // show status
            case SIGUSR2:
                echo "status\n";
                break;
        }
    }

    /**
     * Run the master: invoke on_start, fork $count workers, then monitor them.
     */
    public function run()
    {
        $this->time_start = microtime(true);
        $this->worker_id = 0;
        $this->worker_pid = posix_getpid();
        $this->set_process_title($this->title);

        // Set before forking so the workers inherit the value
        if ($this->log_show)
        {
            log::$log_show = true;
        }

        if ($this->on_start)
        {
            call_user_func($this->on_start, $this);
        }

        // Worker ids start at 1; 0 is used by the master process
        for ($i = 1; $i <= $this->count; $i++)
        {
            $this->fork_one_worker($i);
        }
        $this->monitor_workers();
    }

    /**
     * Fork one worker process.
     *
     * In the master the child's PID is recorded under $worker_id. In the child
     * the on_worker_start callback runs, then stop(), then the process exits
     * with status 0. A failed fork is logged and terminates the process.
     *
     * @param int $worker_id slot number of the worker (1-based)
     */
    public function fork_one_worker($worker_id)
    {
        $pid = pcntl_fork();

        // Master: remember the child's PID
        if($pid > 0)
        {
            self::$_worker_pids[$worker_id] = $pid;
        }
        // Child: run the worker
        elseif(0 === $pid)
        {
            $this->time_start = microtime(true);
            $this->worker_id = $worker_id;
            $this->worker_pid = posix_getpid();
            $this->set_process_title($this->title);
            $this->set_process_user($this->user);
            // Drop the PID table copied over from the master
            self::$_worker_pids = array();

            // The worker is now running
            self::$_status = "running";

            // Shutdown hook (registered in the child) used to detect abnormal exits
            register_shutdown_function(array($this, 'check_errors'));

            // Worker start callback, if configured
            if ($this->on_worker_start)
            {
                call_user_func($this->on_worker_start, $this);
            }

            // Stop this worker instance
            $this->stop();
            // 0 signals a normal exit
            exit(0);
        }
        else
        {
            log::add("fork one worker fail", "Error");
            exit;
        }
    }

    /**
     * Try to switch the current process to the given user.
     *
     * Does nothing when the name is empty or the current user is not root.
     *
     * @param string $user_name
     */
    protected static function set_process_user($user_name)
    {
        // Empty user name, or the current user is not root
        if(empty($user_name) || posix_getuid() !== 0)
        {
            return;
        }
        $user_info = posix_getpwnam($user_name);
        // posix_getpwnam() returns false for an unknown user; bail out instead of indexing it
        if($user_info === false)
        {
            log::add('Can not run woker as '.$user_name." , user does not exist", "Error");
            return;
        }
        if($user_info['uid'] != posix_getuid() || $user_info['gid'] != posix_getgid())
        {
            if(!posix_setgid($user_info['gid']) || !posix_setuid($user_info['uid']))
            {
                log::add('Can not run woker as '.$user_name." , You shuld be root", "Error");
            }
        }
    }

    /**
     * Set the title of the current process (as shown by `ps aux`).
     * Requires the proctitle extension or cli_set_process_title();
     * silently does nothing when neither is available.
     * @param string $title
     * @return void
     */
    protected function set_process_title($title)
    {
        if (!empty($title))
        {
            // proctitle extension
            if(extension_loaded('proctitle') && function_exists('setproctitle'))
            {
                @setproctitle($title);
            }
            // built-in function
            elseif (function_exists('cli_set_process_title'))
            {
                cli_set_process_title($title);
            }
        }
    }

    /**
     * Monitor the workers and react when one of them exits.
     *
     * Never returns: the process exits once every worker has stopped, or when
     * pcntl_wait() returns without a child PID.
     * @return void
     */
    public function monitor_workers()
    {
        // The master is now running
        self::$_status = "running";
        while(1)
        {
            // Block until a child exits or a signal interrupts the wait
            $status = 0;
            $pid = pcntl_wait($status, WUNTRACED);

            // A child exited
            if($pid > 0)
            {
                // Abnormal exit (e.g. killed)
                if($status !== 0)
                {
                    log::add("worker {$pid} exit with status $status", "Warning");
                }

                // Map the PID back to its worker id; skip children that are not tracked
                $worker_id = array_search($pid, self::$_worker_pids, true);
                if ($worker_id === false)
                {
                    continue;
                }
                // The slot is kept (set to 0 rather than unset) so the worker can be restarted
                self::$_worker_pids[$worker_id] = 0;

                // Start a replacement worker
                if (!$this->run_once)
                {
                    $this->fork_one_worker($worker_id);
                }

                // Once every worker has stopped, run the master stop callback and exit
                $all_worker_stop = true;
                foreach (self::$_worker_pids as $_worker_pid)
                {
                    // A single live PID means we are not done yet
                    if ($_worker_pid != 0)
                    {
                        $all_worker_stop = false;
                    }
                }
                if ($all_worker_stop)
                {
                    if ($this->on_stop)
                    {
                        call_user_func($this->on_stop, $this);
                    }
                    exit(0);
                }
            }
            // pcntl_wait() returned without a child PID
            else
            {
                if ($this->on_stop)
                {
                    call_user_func($this->on_stop, $this);
                }
                exit(0);
            }
        }
    }

    /**
     * Shut everything down. Triggered by a signal, not by normal completion.
     *
     * The master forwards SIGINT to every live worker; a worker runs stop()
     * and exits.
     * @return void
     */
    public function stop_all()
    {
        // Both master and workers switch to the shutdown state
        self::$_status = "shutdown";
        // Master process
        if(self::$_master_pid === posix_getpid())
        {
            foreach (self::$_worker_pids as $worker_pid)
            {
                // Slots of exited workers hold 0, and kill(0, sig) targets the whole
                // process group (the master included), so only signal real PIDs.
                if ($worker_pid > 0)
                {
                    posix_kill($worker_pid, SIGINT);
                }
            }
        }
        // Worker process
        else
        {
            // NOTE: exits immediately; in-flight work is not drained
            $this->stop();
            exit(0);
        }
    }

    /**
     * Stop the current worker instance.
     * Called both on normal completion and on signal-driven shutdown.
     * @return void
     */
    public function stop()
    {
        if ($this->on_worker_stop)
        {
            call_user_func($this->on_worker_stop, $this);
        }
        // The worker is now shut down
        self::$_status = "shutdown";
    }

    /**
     * Shutdown hook: log an error if the worker exits without having been stopped.
     * @return void
     */
    public function check_errors()
    {
        // Status was never set to "shutdown": the exit was unexpected
        if(self::$_status != "shutdown")
        {
            $error_msg = "WORKER EXIT UNEXPECTED ";
            $errors = error_get_last();
            if($errors && ($errors['type'] === E_ERROR ||
                $errors['type'] === E_PARSE ||
                $errors['type'] === E_CORE_ERROR ||
                $errors['type'] === E_COMPILE_ERROR ||
                $errors['type'] === E_RECOVERABLE_ERROR ))
            {
                $error_msg .= $this->get_error_type($errors['type']) . " {$errors['message']} in {$errors['file']} on line {$errors['line']}";
            }
            log::add($error_msg, 'Error');
        }
    }

    /**
     * Map a PHP error level to its constant name.
     * @param integer $type
     * @return string the constant name, or "" for an unknown level
     */
    protected function get_error_type($type)
    {
        switch($type)
        {
            case E_ERROR: // 1
                return 'E_ERROR';
            case E_WARNING: // 2
                return 'E_WARNING';
            case E_PARSE: // 4
                return 'E_PARSE';
            case E_NOTICE: // 8
                return 'E_NOTICE';
            case E_CORE_ERROR: // 16
                return 'E_CORE_ERROR';
            case E_CORE_WARNING: // 32
                return 'E_CORE_WARNING';
            case E_COMPILE_ERROR: // 64
                return 'E_COMPILE_ERROR';
            case E_COMPILE_WARNING: // 128
                return 'E_COMPILE_WARNING';
            case E_USER_ERROR: // 256
                return 'E_USER_ERROR';
            case E_USER_WARNING: // 512
                return 'E_USER_WARNING';
            case E_USER_NOTICE: // 1024
                return 'E_USER_NOTICE';
            case 2048: // E_STRICT; literal used because the constant is deprecated in PHP 8.4
                return 'E_STRICT';
            case E_RECOVERABLE_ERROR: // 4096
                return 'E_RECOVERABLE_ERROR';
            case E_DEPRECATED: // 8192
                return 'E_DEPRECATED';
            case E_USER_DEPRECATED: // 16384
                return 'E_USER_DEPRECATED';
        }
        return "";
    }
}
-------------------------------------------------------------------------------- /data/imooc.csv: -------------------------------------------------------------------------------- 1 | Man_For_Mars,保密 ,142小时21分,,775,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/53a5613c000185d301800180-200-200.jpg,http://www.imooc.com/u/200000 2 | (路明)业务员施宇, 男 , 7分,,20,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/53a5616a0001eb8001000100-200-200.jpg,http://www.imooc.com/u/200001 3 | 胡莎莎,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4d850001c10602000200-200-200.jpg,http://www.imooc.com/u/200002 4 | 好多好多,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e564d0001308602000200-200-200.jpg,http://www.imooc.com/u/200003 5 | 哼哼,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4c2300012ab002200220-200-200.jpg,http://www.imooc.com/u/200004 6 | 带你回忆的回忆,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e52b90001456f02000200-200-200.jpg,http://www.imooc.com/u/200005 7 | 老师坏人,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/5333a0aa000121d702000200-200-200.jpg,http://www.imooc.com/u/200006 8 | 如今的你怎么样,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4cc800016ffd02200220-200-200.jpg,http://www.imooc.com/u/200007 9 | 奋斗的人儿,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4d3d0001ed7802000200-200-200.jpg,http://www.imooc.com/u/200008 10 | 母猪都会上树,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4bec0001ae5302000200-200-200.jpg,http://www.imooc.com/u/200009 11 | 性感不是骚,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/5333a1d100010c2602000200-200-200.jpg,http://www.imooc.com/u/200010 12 | q0621184934, 男 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/53a5633e0001115601000100-200-200.jpg,http://www.imooc.com/u/200011 13 | 守候着最后一片星辰, 男 , 2分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/53a5633e0001e09d01000100-200-200.jpg,http://www.imooc.com/u/200012 14 | 尚观VaVa, 男 ,
1小时31分,,30,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/53a5634a0001d60f01000100-200-200.jpg,http://www.imooc.com/u/200013 15 | 长腿妹妹,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4cc800016ffd02200220-200-200.jpg,http://www.imooc.com/u/200014 16 | 胖了个去, 男 ,34小时46分,,530,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/53a56351000148da01800180-200-200.jpg,http://www.imooc.com/u/200015 17 | 天空中的棉花寂寞了谁的心灵,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4cf4000151f602000200-200-200.jpg,http://www.imooc.com/u/200016 18 | van▪cleef, 男 ,266小时46分,,6583,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/53a563d100017fc101000100-200-200.jpg,http://www.imooc.com/u/200017 19 | 橱窗里的童话,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/5333a1360001954902000200-200-200.jpg,http://www.imooc.com/u/200018 20 | 麻蛋,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/5333a2750001728602200220-200-200.jpg,http://www.imooc.com/u/200019 21 | 需要都来不及,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4d850001c10602000200-200-200.jpg,http://www.imooc.com/u/200020 22 | 王泥煤,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/5333a1360001954902000200-200-200.jpg,http://www.imooc.com/u/200021 23 | 咋见,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4cc800016ffd02200220-200-200.jpg,http://www.imooc.com/u/200022 24 | 夏天又悄悄过去,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/5333a2b70001a5a802000200-200-200.jpg,http://www.imooc.com/u/200023 25 | 你带给的快乐,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/5333a0c40001088802000200-200-200.jpg,http://www.imooc.com/u/200024 26 | 葛沛沛,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4d5b0001d57502200203-200-200.jpg,http://www.imooc.com/u/200025 27 | 终于看开爱回不来,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/5333a17a0001592502000200-200-200.jpg,http://www.imooc.com/u/200026 28 | 米老鸭110, 男 , 3小时27分,,335,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/53a564e500015d2404800480-200-200.jpg,http://www.imooc.com/u/200027 29 | 浪漫的一个你,保密 , 
0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/5333a207000118af02200220-200-200.jpg,http://www.imooc.com/u/200028 30 | 哭着求着我回来,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4d660001312002000200-200-200.jpg,http://www.imooc.com/u/200029 31 | 一点一滴的回忆着,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4c1500010baf02200220-200-200.jpg,http://www.imooc.com/u/200030 32 | 槑_0002, 男 , 2分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/53a5658d0001139801000100-200-200.jpg,http://www.imooc.com/u/200031 33 | q0621190057, 男 , 3小时54分,,285,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/53a565e90001115601000100-200-200.jpg,http://www.imooc.com/u/200032 34 | 只想和你在一起,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4c640001354402000200-200-200.jpg,http://www.imooc.com/u/200033 35 | 圣诞节,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e51f30001edf702000200-200-200.jpg,http://www.imooc.com/u/200034 36 | 春光喜气,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/5333a2a10001064f02000200-200-200.jpg,http://www.imooc.com/u/200035 37 | user200036, 男 , 3分,,10,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/53a566630001115601000100-200-200.jpg,http://www.imooc.com/u/200036 38 | 锁住关于你的一切,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/5333a2d100019c1d02000200-200-200.jpg,http://www.imooc.com/u/200037 39 | 并蒂的马蹄莲,女,19分,,70,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/53a5666f0001dc4401800180-200-200.jpg,http://www.imooc.com/u/200038 40 | 速度与激情,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e51840001ca2502000200-200-200.jpg,http://www.imooc.com/u/200039 41 | 梦想总是那么遥不可及,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4cde000148e602000200-200-200.jpg,http://www.imooc.com/u/200040 42 | 步步难做,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/5333a207000118af02200220-200-200.jpg,http://www.imooc.com/u/200041 43 | 3654729479,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4d710001322402000200-200-200.jpg,http://www.imooc.com/u/200042 44 | 奶牛沙豆绿, 男 , 
8小时17分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/53a566bb0001a5cc01000100-200-200.jpg,http://www.imooc.com/u/200043 45 | 放飞的风筝_0001,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4c420001b2e502000200-200-200.jpg,http://www.imooc.com/u/200044 46 | 瞬间吓尿,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/5333a154000110c002200220-200-200.jpg,http://www.imooc.com/u/200045 47 | 天空满天星星,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/5333a0d9000196ff02000200-200-200.jpg,http://www.imooc.com/u/200046 48 | 我知道怎么说我就怎么说了,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/5333a1bc00014e8302000200-200-200.jpg,http://www.imooc.com/u/200047 49 | 在半路回头,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4d5b0001d57502200203-200-200.jpg,http://www.imooc.com/u/200048 50 | 淡了谈了飞了,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/5333a0c40001088802000200-200-200.jpg,http://www.imooc.com/u/200049 51 | 梧桐树_0001, 男 ,20分,,25,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/53a5677a0001e6cc01000100-200-200.jpg,http://www.imooc.com/u/200050 52 | 越长大越孤单_0001,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4cc800016ffd02200220-200-200.jpg,http://www.imooc.com/u/200051 53 | 天突然偷偷的亮了,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4cf4000151f602000200-200-200.jpg,http://www.imooc.com/u/200052 54 | 汪洋中的狼,保密 ,18分,,10,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4d00000171e602000200-200-200.jpg,http://www.imooc.com/u/200053 55 | 我终于真的失去了,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/5333a10b000190e402200220-200-200.jpg,http://www.imooc.com/u/200054 56 | 娃哈哈代言人,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4cde000148e602000200-200-200.jpg,http://www.imooc.com/u/200055 57 | 不得不爱你,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/5333a28f00019e7702100210-200-200.jpg,http://www.imooc.com/u/200056 58 | 我们可以天长地久,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4cbd00011ecc01000100-200-200.jpg,http://www.imooc.com/u/200057 59 | 听云客, 男 
,25分,,30,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/53a568480001b93700400040-200-200.jpg,http://www.imooc.com/u/200058 60 | 模模糊糊,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/53339fdf00019de902200220-200-200.jpg,http://www.imooc.com/u/200059 61 | choie,保密 ,12小时34分,,144,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/images/unknow-160.png,http://www.imooc.com/u/200060 62 | 懂得你也容易,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4d2600013fe202000200-200-200.jpg,http://www.imooc.com/u/200061 63 | 海况天空,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4c420001b2e502000200-200-200.jpg,http://www.imooc.com/u/200062 64 | GoVim, 男 , 5小时 6分,,35,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/53a5690c0001d8d601800180-200-200.jpg,http://www.imooc.com/u/200063 65 | q0621191441, 男 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/53a569210001115601000100-200-200.jpg,http://www.imooc.com/u/200064 66 | 凉菜减肥法,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/5333a1bc00014e8302000200-200-200.jpg,http://www.imooc.com/u/200065 67 | .,女, 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/53a569550001850801000100-200-200.jpg,http://www.imooc.com/u/200066 68 | 清浅_0,女,16小时55分,,118,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/53a569640001b67201800180-200-200.jpg,http://www.imooc.com/u/200067 69 | 从来都没有真正的了解,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/5333a1660001394602000200-200-200.jpg,http://www.imooc.com/u/200068 70 | Tiffny漏,女, 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/53a5696a000167ea01000100-200-200.jpg,http://www.imooc.com/u/200069 71 | 夏木木1992,女, 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/53a569920001c7cf01800180-200-200.jpg,http://www.imooc.com/u/200070 72 | 没认真过怎么会伤心,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4d3d0001ed7802000200-200-200.jpg,http://www.imooc.com/u/200071 73 | q0621191644, 男 , 4分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/53a5699c0001115601000100-200-200.jpg,http://www.imooc.com/u/200072 74 | q0621191705, 男 
,40分,,20,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/53a569b10001115601000100-200-200.jpg,http://www.imooc.com/u/200073 75 | 杨少康,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4c5600017c5b02010200-200-200.jpg,http://www.imooc.com/u/200074 76 | 在那遥远的地方,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4c0500010c7602000200-200-200.jpg,http://www.imooc.com/u/200075 77 | 吴三妹,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4cd40001c4fb02000200-200-200.jpg,http://www.imooc.com/u/200076 78 | 你为何这么屌,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4d3200016b4e02000200-200-200.jpg,http://www.imooc.com/u/200077 79 | 历史与发展, 男 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/53a56a420001b53a01000100-200-200.jpg,http://www.imooc.com/u/200078 80 | XIAOXIAO_0002,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4d850001c10602000200-200-200.jpg,http://www.imooc.com/u/200079 81 | leo650, 男 ,59小时20分,,6261,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e52b90001456f02000200-200-200.jpg,http://www.imooc.com/u/200080 82 | 那一束才是真的,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4c640001354402000200-200-200.jpg,http://www.imooc.com/u/200081 83 | 等待就是浪费青春_0001,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/5333a207000118af02200220-200-200.jpg,http://www.imooc.com/u/200082 84 | 苍天之魂, 男 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/53a56aa700012a6d01000100-200-200.jpg,http://www.imooc.com/u/200083 85 | 不能给你啊,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4d470001a00a02000200-200-200.jpg,http://www.imooc.com/u/200084 86 | 看怎么样就怎么样,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/5333a1a90001c8d802000200-200-200.jpg,http://www.imooc.com/u/200085 87 | 登陆无效,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/5333a154000110c002200220-200-200.jpg,http://www.imooc.com/u/200086 88 | 手机用户2908419433,女, 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/53a56afd0001c51301800180-200-200.jpg,http://www.imooc.com/u/200087 89 | 星期天_0001,保密 , 
0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/5333a1d100010c2602000200-200-200.jpg,http://www.imooc.com/u/200088 90 | 碰到钉子了,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/5333a2750001728602200220-200-200.jpg,http://www.imooc.com/u/200089 91 | 有一天我会让你知道,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4c1500010baf02200220-200-200.jpg,http://www.imooc.com/u/200090 92 | 我们都好孩子,保密 , 0分,,,这位同学很懒木有签名的说~,1,,http://img.mukewang.com/533e4d5b0001d57502200203-200-200.jpg,http://www.imooc.com/u/200091 93 | -------------------------------------------------------------------------------- /demo/imooc.php: -------------------------------------------------------------------------------- 1 | '慕课网', 11 | // 同时工作的爬虫任务数 12 | 'tasknum' => 8, 13 | // 采集失败后尝试的次数 14 | 'max_try' => 5, 15 | // 导出的数据格式 16 | 'export' => array( 17 | 'type' => 'db', 18 | 'table' => 'imooc', 19 | ), 20 | // 爬虫采集的域名 21 | 'domains' => array( 22 | 'imooc.com', 23 | 'www.imooc.com', 24 | ), 25 | // 爬虫入口 26 | 'scan_urls' => array( 27 | 'http://www.imooc.com/comment/295', 28 | ), 29 | // 内容页url 30 | 'content_url_regexes' => array( 31 | 'http://www.imooc.com/u/\d+$', 32 | ), 33 | // 列表页url 34 | 'list_url_regexes' => array( 35 | 'http://www.imooc.com/course/comment/id/.*?', 36 | 'http://www.imooc.com/comment/\d+', 37 | 'http://www.imooc.com/wenda/.*?', 38 | ), 39 | // 定义内容以抽取规则 40 | 'fields' => array( 41 | // 童鞋名称 42 | array( 43 | 'name' => 'name', 44 | 'selector' => '//*[@id="main"]/div[1]/div/h3/span', 45 | ), 46 | // 性别 47 | array( 48 | 'name' => 'sex', 49 | 'selector' => '//*[@id="main"]/div[1]/div/p[1]/span[1]/@title', 50 | ), 51 | // 学习时长 52 | array( 53 | 'name' => 'time', 54 | 'selector' => '//*[@class="u-info-learn"]/em', 55 | ), 56 | // 积分 57 | array( 58 | 'name' => 'integral', 59 | 'selector' => '//*[@id="main"]/div[1]/div/p[1]/span[3]/em', 60 | ), 61 | // 经验 62 | array( 63 | 'name' => 'experience', 64 | 'selector' => '//*[@class="u-info-mp"]/em', 65 | ), 66 | // 描述 67 | array( 68 | 'name' => 
'description', 69 | 'selector' => '//*[@id="main"]/div[1]/div/p[2]', 70 | ), 71 | // 关注 72 | array( 73 | 'name' => 'follow', 74 | 'selector' => '//*[@id="main"]/div[1]/div/div/div[1]/a/em', 75 | ), 76 | // 粉丝 77 | array( 78 | 'name' => 'fans', 79 | 'selector' => '//*[@id="main"]/div[1]/div/div/div[2]/a/em', 80 | ), 81 | // 头像 82 | array( 83 | 'name' => 'img', 84 | 'selector' => '//*[@id="main"]/div[2]/div[1]/div/img/@src', 85 | ), 86 | // 用户url 87 | array( 88 | 'name' => 'url', 89 | 'selector' => '//*[@id="main"]/div[2]/div[1]/div/img', // 随便填 90 | ) 91 | ), 92 | ); 93 | 94 | 95 | $spider = new phpspider($configs); 96 | 97 | $spider->on_start = function ($spider) { 98 | for ($i = 1; $i <= 200100; $i++) { 99 | $url = "http://www.imooc.com/u/{$i}"; 100 | $spider->add_url($url); 101 | } 102 | }; 103 | 104 | $spider->on_extract_field = function ($fieldname, $data, $page) { 105 | if ($fieldname == 'url') { 106 | $data = $page['url']; 107 | } 108 | 109 | return $data; 110 | }; 111 | 112 | $spider->start(); 113 | -------------------------------------------------------------------------------- /library/cls_curl.php: -------------------------------------------------------------------------------- 1 | 10 | * @copyright seatle 11 | * @link http://www.epooll.com/ 12 | * @license http://www.opensource.org/licenses/mit-license.php MIT License 13 | */ 14 | 15 | class cls_curl 16 | { 17 | protected static $timeout = 10; 18 | protected static $ch = null; 19 | protected static $useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36'; 20 | protected static $http_raw = false; 21 | protected static $cookie = null; 22 | protected static $cookie_jar = null; 23 | protected static $cookie_file = null; 24 | protected static $referer = null; 25 | protected static $ip = null; 26 | protected static $proxy = null; 27 | protected static $headers = array(); 28 | protected static $hosts = array(); 29 | protected static 
$gzip = false; 30 |
    // curl_getinfo() result of the most recent request
    protected static $info = array();

    /**
     * Set the connect timeout in seconds (the total timeout is this value + 5).
     *
     * @param int $timeout
     * @return void
     */
    public static function set_timeout($timeout)
    {
        self::$timeout = $timeout;
    }

    /**
     * Set the proxy passed to CURLOPT_PROXY.
     *
     * @param mixed $proxy
     * @return void
     */
    public static function set_proxy($proxy)
    {
        self::$proxy = $proxy;
    }

    /**
     * Set the Referer header.
     *
     * @param string $referer
     * @return void
     */
    public static function set_referer($referer)
    {
        self::$referer = $referer;
    }

    /**
     * Set the User-Agent header.
     *
     * @param string $useragent
     * @return void
     */
    public static function set_useragent($useragent)
    {
        self::$useragent = $useragent;
    }

    /**
     * Set the cookie string passed to CURLOPT_COOKIE.
     *
     * @param string $cookie
     * @return void
     */
    public static function set_cookie($cookie)
    {
        self::$cookie = $cookie;
    }

    /**
     * Set the cookie jar passed to CURLOPT_COOKIEJAR.
     *
     * @param string $cookie_jar
     * @return void
     */
    public static function set_cookie_jar($cookie_jar)
    {
        self::$cookie_jar = $cookie_jar;
    }

    /**
     * Set the cookie file passed to CURLOPT_COOKIEFILE.
     *
     * @param string $cookie_file
     * @return void
     */
    public static function set_cookie_file($cookie_file)
    {
        self::$cookie_file = $cookie_file;
    }

    /**
     * Choose whether the response headers are returned together with the body.
     *
     * @param mixed $http_raw
     * @return void
     */
    public static function set_http_raw($http_raw)
    {
        self::$http_raw = $http_raw;
    }

    /**
     * Set the IP sent in the CLIENT-IP and X-FORWARDED-FOR request headers.
     *
     * @param string $ip
     * @return void
     */
    public static function set_ip($ip)
    {
        self::$ip = $ip;
    }

    /**
     * Set extra request headers.
     *
     * @param array $headers list of "Name: value" strings
     * @return void
     */
    public static function set_headers($headers)
    {
        self::$headers = $headers;
    }

    /**
     * Set the IPs a request's host name is randomly replaced with.
     *
     * @param array $hosts list of IPs
     * @return void
     */
    public static function set_hosts($hosts)
    {
        self::$hosts = $hosts;
    }

    /**
     * Enable or disable gzip transfer encoding.
     *
     * @param bool $gzip
     * @return void
     */
    public static function set_gzip($gzip)
    {
        self::$gzip = $gzip;
    }

    /**
     * Create the cURL handle if there is none.
     *
     * http_request() closes and clears the handle after every request, so the
     * check is against null. is_resource() would not do: PHP 8 handles are
     * CurlHandle objects, never resources.
     *
     * @return resource|CurlHandle
     */
    public static function init()
    {
        if (self::$ch === null)
        {
            self::$ch = curl_init ();
            curl_setopt( self::$ch, CURLOPT_RETURNTRANSFER, true );
            curl_setopt( self::$ch, CURLOPT_CONNECTTIMEOUT, self::$timeout );
            curl_setopt( self::$ch, CURLOPT_HEADER, false );
            curl_setopt( self::$ch, CURLOPT_USERAGENT, self::$useragent );
            curl_setopt( self::$ch, CURLOPT_TIMEOUT, self::$timeout + 5);
            // Per the original author: with timeouts in a multi-threaded setup libcurl must not
            // use signals; a small chance of crashes reportedly remains
            curl_setopt( self::$ch, CURLOPT_NOSIGNAL, true);
        }
        return self::$ch;
    }

    /**
     * Perform a GET request.
     *
     * @param string $url
     * @param array  $fields appended to the URL as a query string
     * @return string|bool response body, or false on failure
     */
    public static function get($url, $fields = array())
    {
        self::init ();
        return self::http_request($url, 'get', $fields);
    }

    /**
     * Perform a POST request.
     *
     * $fields may be 1) an array, 2) an http_build_query() string or
     * 3) a JSON string. The first two are ordinary form posts; a JSON body is
     * only available to the receiver as the raw request body.
     *
     * @param string       $url
     * @param array|string $fields
     * @static
     * @access public
     * @return string|bool response body, or false on failure
     */
    public static function post($url, $fields = array())
    {
        self::init ();
        return self::http_request($url, 'post', $fields);
    }

    /**
     * Send a request using the configured options.
     *
     * The handle is closed after every request. Request headers are assembled
     * per call and never written back to the static header list.
     *
     * @param string       $url
     * @param string       $type   'get' or 'post' (case-insensitive)
     * @param array|string $fields query parameters (get) or body (post)
     * @return string|bool response body, or false on failure
     */
    public static function http_request($url, $type = 'get', $fields = array())
    {
        // get()/post() already did this; it also makes a direct call safe
        self::init();

        // GET: append the fields to the URL
        if (strtolower($type) == 'get' && !empty($fields))
        {
            $url = $url . (strpos($url,"?")===false ? "?" : "&") . http_build_query($fields);
        }

        // Work on a copy so Host / CLIENT-IP headers do not pile up across requests
        $headers = self::$headers;

        // Bind a random IP from the hosts list (simple load balancing)
        if (self::$hosts)
        {
            $parse_url = parse_url($url);
            if (!empty($parse_url['host']))
            {
                $host = $parse_url['host'];
                $key = rand(0, count(self::$hosts)-1);
                $ip = self::$hosts[$key];
                // Replace only the authority part, not every occurrence of the host string
                $scheme_end = strpos($url, '://');
                $pos = strpos($url, $host, $scheme_end === false ? 0 : $scheme_end + 3);
                if ($pos !== false)
                {
                    $url = substr_replace($url, $ip, $pos, strlen($host));
                }
                $headers = array_merge( array('Host:'.$host), $headers );
            }
        }
        curl_setopt( self::$ch, CURLOPT_URL, $url );
        // POST
        if (strtolower($type) == 'post')
        {
            curl_setopt( self::$ch, CURLOPT_POST, true );
            curl_setopt( self::$ch, CURLOPT_POSTFIELDS, $fields );
        }
        if (self::$useragent)
        {
            curl_setopt( self::$ch, CURLOPT_USERAGENT, self::$useragent );
        }
        if (self::$cookie)
        {
            curl_setopt( self::$ch, CURLOPT_COOKIE, self::$cookie );
        }
        if (self::$cookie_jar)
        {
            curl_setopt( self::$ch, CURLOPT_COOKIEJAR, self::$cookie_jar );
        }
        if (self::$cookie_file)
        {
            curl_setopt( self::$ch, CURLOPT_COOKIEFILE, self::$cookie_file );
        }
        if (self::$referer)
        {
            curl_setopt( self::$ch, CURLOPT_REFERER, self::$referer );
        }
        if (self::$ip)
        {
            $headers = array_merge( array('CLIENT-IP:'.self::$ip, 'X-FORWARDED-FOR:'.self::$ip), $headers );
        }
        if ($headers)
        {
            curl_setopt( self::$ch, CURLOPT_HTTPHEADER, $headers );
        }
        if (self::$gzip)
        {
            curl_setopt( self::$ch, CURLOPT_ENCODING, 'gzip' );
        }
        if (self::$proxy)
        {
            curl_setopt( self::$ch, CURLOPT_PROXY, self::$proxy );
        }
        if (self::$http_raw)
        {
            curl_setopt( self::$ch, CURLOPT_HEADER, true );
        }

        // A failed transfer yields false; callers can inspect get_info() / get_http_code()
        $data = curl_exec ( self::$ch );
        self::$info = curl_getinfo(self::$ch);

        // Close the handle and forget it so init() creates a fresh one next time
        curl_close( self::$ch );
        self::$ch = null;
        return $data;
    }

    /**
     * Get the curl_getinfo() data of the most recent request.
     *
     * @return array
     */
    public static function get_info()
    {
        return self::$info;
    }

    /**
     * Get the HTTP status code of the most recent request.
     *
     * @return int|null null when no request has been made yet
     */
    public static function get_http_code()
    {
        return isset(self::$info['http_code']) ? self::$info['http_code'] : null;
    }
}

function classic_curl($urls, $delay) 309 | { 310 | $queue = curl_multi_init(); 311 | $map = array(); 312 | 313 | foreach ($urls as $url) 314 | { 315 | // create cURL resources 316 | $ch = curl_init(); 317 | 318 | // 设置 URL 和 其他参数 319 | curl_setopt($ch, CURLOPT_URL, $url); 320 | curl_setopt($ch, CURLOPT_TIMEOUT, 1); 321 | curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); 322 | curl_setopt($ch, CURLOPT_HEADER, 0); 323 | curl_setopt($ch, CURLOPT_NOSIGNAL, true); 324 | 325 | // 把当前 curl resources 加入到 curl_multi_init 队列 326 | curl_multi_add_handle($queue, $ch); 327 | $map[$url] = $ch; 328 | } 329 | 330 | $active = null; 331 | 332 | // execute the handles 333 | do { 334 | $mrc = curl_multi_exec($queue, $active); 335 | } while ($mrc == CURLM_CALL_MULTI_PERFORM); 336 | 337 | while ($active > 0 && $mrc == CURLM_OK) { 338 | while (curl_multi_exec($queue, $active) === CURLM_CALL_MULTI_PERFORM); 339 | // 这里 curl_multi_select 一直返回 -1,所以这里就死循环了,CPU就100%了 340 | if (curl_multi_select($queue, 0.5) != -1) 341 | { 342 | do { 343 | $mrc = curl_multi_exec($queue, $active); 344 | } while ($mrc == CURLM_CALL_MULTI_PERFORM); 345 | } 346 | } 347 | 348 | $responses = array(); 349 | foreach ($map as $url=>$ch) { 350 | //$responses[$url] = callback(curl_multi_getcontent($ch), $delay); 351 | $responses[$url] = callback(curl_multi_getcontent($ch), $delay, $url); 352 | curl_multi_remove_handle($queue, $ch); 353 | curl_close($ch); 354
| } 355 | 356 | curl_multi_close($queue); 357 | return $responses; 358 | } 359 | 360 | function rolling_curl($urls, $delay) 361 | { 362 | $queue = curl_multi_init(); 363 | $map = array(); 364 | 365 | foreach ($urls as $url) { 366 | $ch = curl_init(); 367 | 368 | curl_setopt($ch, CURLOPT_URL, $url); 369 | curl_setopt($ch, CURLOPT_TIMEOUT, 10); 370 | curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); 371 | curl_setopt($ch, CURLOPT_HEADER, 0); 372 | curl_setopt($ch, CURLOPT_NOSIGNAL, true); 373 | $cookie = '_za=36643642-e546-4d60-a771-8af8dcfbd001; q_c1=a57a2b9f10964f909b8d8969febf3ab2|1437705596000|1437705596000; _xsrf=f0304fba4e44e1d008ec308d59bab029; cap_id="YWY1YmRmODlmZGVmNDc3MWJlZGFkZDg3M2E0M2Q5YjM=|1437705596|963518c454bb6f10d96775021c098c84e1e46f5a"; z_c0="QUFCQVgtRWZBQUFYQUFBQVlRSlZUVjR6NEZVUTgtRkdjTVc5UDMwZXRJZFdWZ2JaOWctNVhnPT0=|1438164574|aed6ef3707f246a7b64da4f1e8c089395d77ff2b"; __utma=51854390.1105113342.1437990174.1438160686.1438164116.10; __utmc=51854390; __utmz=51854390.1438134939.8.5.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/people/yangzetao; __utmv=51854390.100-1|2=registration_date=20131030=1^3=entry_date=20131030=1'; 374 | curl_setopt($ch, CURLOPT_COOKIE, $cookie); 375 | $useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36'; 376 | curl_setopt( $ch, CURLOPT_USERAGENT, $useragent ); 377 | curl_setopt($ch, CURLOPT_ENCODING, 'gzip'); 378 | 379 | curl_multi_add_handle($queue, $ch); 380 | $map[(string) $ch] = $url; 381 | } 382 | 383 | $responses = array(); 384 | do { 385 | while (($code = curl_multi_exec($queue, $active)) == CURLM_CALL_MULTI_PERFORM) ; 386 | 387 | if ($code != CURLM_OK) { break; } 388 | 389 | // a request was just completed -- find out which one 390 | while ($done = curl_multi_info_read($queue)) { 391 | 392 | // get the info and content returned on the request 393 | $info = curl_getinfo($done['handle']); 394 | $error = 
curl_error($done['handle']); 395 | $results = callback(curl_multi_getcontent($done['handle']), $delay, $map[(string) $done['handle']]); 396 | $responses[$map[(string) $done['handle']]] = compact('info', 'error', 'results'); 397 | 398 | // remove the curl handle that just completed 399 | curl_multi_remove_handle($queue, $done['handle']); 400 | curl_close($done['handle']); 401 | } 402 | 403 | // Block for data in / output; error handling is done by curl_multi_exec 404 | if ($active > 0) { 405 | curl_multi_select($queue, 0.5); 406 | } 407 | 408 | } while ($active); 409 | 410 | curl_multi_close($queue); 411 | return $responses; 412 | } 413 | 414 | function callback($data, $delay, $url) { 415 | //echo $data; 416 | //echo date("Y-m-d H:i:s", time()) . " --- " . $url . "\n"; 417 | if (!empty($data)) 418 | { 419 | file_put_contents("./html2/".md5($url).".html", $data); 420 | } 421 | // usleep模拟现实中比较负责的数据处理逻辑(如提取, 分词, 写入文件或数据库等) 422 | //usleep(1); 423 | //return compact('data', 'matches'); 424 | } 425 | 426 | -------------------------------------------------------------------------------- /library/cls_query.php: -------------------------------------------------------------------------------- 1 | 29 | * @created time :2015-08-08 15:52 30 | */ 31 | private static function get_nodes($query) 32 | { 33 | // 把一到多个空格 替换成 一个空格 34 | // 把 > 和 ~ 符号两边的空格去掉,因为没有用这两个符号,所以这里可以不这么做 35 | // ul>li.className 36 | $query = trim( 37 | preg_replace('@\s+@', ' ', 38 | preg_replace('@\s*(>|\\+|~)\s*@', '\\1', $query) 39 | ) 40 | ); 41 | 42 | $nodes = array(); 43 | if (! $query) 44 | { 45 | return $nodes; 46 | } 47 | 48 | $query_arr = explode(" ", $query); 49 | foreach ($query_arr as $k=>$v) 50 | { 51 | $path = $k == 0 ? $v : $path.' 
'.$v; 52 | $node = array("path"=>(string)$path, "name"=>"", "id"=>"", "class"=>"", "other"=>array()); 53 | // 如果存在内容选择器 54 | if (preg_match('@(.*?)\[(.*?)=[\'|"](.*?)[\'|"]\]@', $v, $matches) && !empty($matches[2]) && !empty($matches[3])) 55 | { 56 | // 把选择器过滤掉 [rel='topic'] 57 | $v = $matches[1]; 58 | $node['other'] = array( 59 | 'key'=>$matches[2], 60 | 'val'=>$matches[3], 61 | ); 62 | } 63 | 64 | // 如果存在 id 65 | $id_arr = explode("#", $v); 66 | $class_arr = explode(".", $v); 67 | if (count($id_arr) === 2) 68 | { 69 | $node['name'] = $id_arr[0]; 70 | $node['id'] = $id_arr[1]; 71 | } 72 | // 如果存在 class 73 | elseif (count($class_arr) === 2) 74 | { 75 | $node['name'] = $class_arr[0]; 76 | $node['class'] = $class_arr[1]; 77 | } 78 | // 如果没有样式 79 | else 80 | { 81 | $node['name'] = $v; 82 | } 83 | $nodes[] = $node; 84 | } 85 | //print_r($nodes); 86 | //exit; 87 | return $nodes; 88 | } 89 | 90 | public static function get_datas($nodes, $attr = "html") 91 | { 92 | if (empty(self::$content)) 93 | { 94 | return false; 95 | } 96 | 97 | $node_datas = array(); 98 | $count = count($nodes); 99 | // 循环所有节点 100 | foreach ($nodes as $i=>$node) 101 | { 102 | $is_last = $count == $i+1 ? true : false; 103 | // 第一次 104 | if ($i == 0) 105 | { 106 | $datas = array(); 107 | $datas = self::get_node_datas($node, self::$content, $attr, $is_last); 108 | // 如果第一次都取不到数据,直接跳出循环 109 | if(!$datas) 110 | { 111 | break; 112 | } 113 | $node_datas[$nodes[$i]['path']] = $datas; 114 | } 115 | else 116 | { 117 | $datas = array(); 118 | // 循环上一个节点的数组 119 | foreach ($node_datas[$nodes[$i-1]['path']] as $v) 120 | { 121 | $datas = array_merge( $datas, self::get_node_datas($node, trim($v), $attr, $is_last) ); 122 | } 123 | $node_datas[$nodes[$i]['path']] = $datas; 124 | // 删除上一个节点,防止内存溢出,或者缓存到本地,再次使用?! 
125 | unset($node_datas[$nodes[$i-1]['path']]); 126 | } 127 | } 128 | //print_r($datas);exit; 129 | // 从数组中弹出最后一个元素 130 | $node_datas = array_pop($node_datas); 131 | //print_r($node_datas); 132 | //exit; 133 | return $node_datas; 134 | } 135 | 136 | /** 137 | * 从节点中获取内容 138 | * $regex = '@]+http-equiv\\s*=\\s*(["|\'])Content-Type\\1([^>]+?)>@i'; 139 | * 140 | * @param mixed $node 141 | * @param mixed $content 142 | * @return void 143 | * @author seatle 144 | * @created time :2015-08-08 15:52 145 | */ 146 | private static function get_node_datas($node, $content, $attr = "html", $is_last = false) 147 | { 148 | $node_datas = $datas = array(); 149 | 150 | if (!empty($node['id'])) 151 | { 152 | if ($node['name']) 153 | $regex = '@<'.$node['name'].'[^>]+id\\s*=\\s*["|\']+?'.$node['id'].'\\s*[^>]+?>(.*?)@is'; 154 | else 155 | $regex = '@id\\s*=\\s*["|\']+?'.$node['id'].'\\s*[^>]+?>(.*?)<@is'; 156 | } 157 | elseif (!empty($node['class'])) 158 | { 159 | if ($node['name']) 160 | $regex = '@<'.$node['name'].'[^>]+class\\s*=\\s*["|\']+?'.$node['class'].'\\s*[^>]+?>(.*?)@is'; 161 | else 162 | $regex = '@class\\s*=\\s*["|\']+?'.$node['class'].'\\s*[^>]+?>(.*?)<@is'; 163 | } 164 | else 165 | { 166 | // 这里为是么是*,0次到多次,因为有可能是
  • 167 | $regex = '@<'.$node['name'].'[^>]*?>(.*?)@is'; 168 | } 169 | self::log("regex --- " . $regex);; 170 | preg_match_all($regex, $content, $matches); 171 | $all_datas = empty($matches[0]) ? array() : $matches[0]; 172 | $html_datas = empty($matches[1]) ? array() : $matches[1]; 173 | 174 | // 过滤掉选择器对不上的 175 | foreach ($all_datas as $i=>$data) 176 | { 177 | // 如果有设置其他选择器,验证一下选择器 178 | if (!empty($node['other'])) 179 | { 180 | $regex = '@'.$node['other']['key'].'=[\'|"]'.$node['other']['val'].'[\'|"]@is'; 181 | self::log("regex other --- " . $regex); 182 | // 过滤器对不上的,跳过 183 | if (!preg_match($regex, $data, $matches)) 184 | { 185 | continue; 186 | } 187 | } 188 | // 获取节点的html内容 189 | if ($attr != "html" && $is_last) 190 | { 191 | $regex = '@'.$attr.'=[\'|"](.*?)[\'|"]@is'; 192 | preg_match($regex, $data, $matches); 193 | $node_datas[] = empty($matches[1]) ? '' : trim($matches[1]); 194 | } 195 | // 获取节点属性名的值 196 | else 197 | { 198 | $node_datas[] = trim($html_datas[$i]); 199 | } 200 | } 201 | //echo " 11111 ========================================= \n"; 202 | //print_r($node_datas); 203 | //echo " 22222 ========================================= \n\n\n"; 204 | return $node_datas; 205 | } 206 | 207 | /** 208 | * 记录日志 209 | * @param string $msg 210 | * @return void 211 | */ 212 | private static function log($msg) 213 | { 214 | $msg = "[".date("Y-m-d H:i:s")."] " . $msg . "\n"; 215 | if (self::$debug) 216 | { 217 | echo $msg; 218 | } 219 | } 220 | 221 | } 222 | 223 | //$xpath = "ul.top-nav-dropdown li"; 224 | //$xpath = "i.zg-icon"; 225 | //print_r($nodes); 226 | //exit; 227 | // [^>]+ 不是>的字符重复一次到多次, ? 
表示不贪婪 228 | // \s 表示空白字符 229 | // * 表示0次或者多次 230 | // + 表示1次或者多次 231 | // 232 | // 后向引用,表示表达式中,从左往右数,第一个左括号对应的括号内的内容。 233 | // \\0 表示整个表达式 234 | // \\1表示第1个表达式 235 | // \\2表示第2个表达式 236 | // $regex = '@]+http-equiv\\s*=\\s*(["|\'])Content-Type\\1([^>]+?)>@i'; 237 | //preg_match_all($regex, $content, $matches); 238 | //print_r($matches); 239 | //exit; 240 | 241 | // 用法 242 | //$content = file_get_contents("./test.html"); 243 | //$query = "ul#top-nav-profile-dropdown li a"; 244 | //$query = "div#zh-profile-following-topic a.link[href='/topic/19550937']"; 245 | //cls_query::init($content); 246 | //$list = cls_query::query($query, "href"); 247 | //print_r($list); 248 | 249 | -------------------------------------------------------------------------------- /library/cls_redis.php: -------------------------------------------------------------------------------- 1 | 10 | // +---------------------------------------------------------------------- 11 | 12 | //---------------------------------- 13 | // PHPSpider Redis操作类文件 14 | //---------------------------------- 15 | 16 | class cls_redis 17 | { 18 | /** 19 | * redis链接标识符号 20 | */ 21 | protected static $redis = NULL; 22 | 23 | /** 24 | * redis配置数组 25 | */ 26 | protected static $configs = array(); 27 | 28 | /** 29 | * 默认redis前缀 30 | */ 31 | public static $prefix = "phpspider"; 32 | 33 | public static $error = ""; 34 | 35 | public static function init() 36 | { 37 | if (!extension_loaded("redis")) 38 | { 39 | self::$error = "The redis extension was not found"; 40 | return false; 41 | } 42 | 43 | // 获取配置 44 | $configs = empty(self::$configs) ? 
self::_get_default_config() : self::$configs; 45 | if (empty($configs)) 46 | { 47 | self::$error = "You not set a config array for connect\nPlease check the configuration file config/inc_config.php"; 48 | return false; 49 | } 50 | 51 | // 如果当前链接标识符为空,或者ping不同,就close之后重新打开 52 | //if ( empty(self::$redis) || !self::ping() ) 53 | if ( !self::$redis ) 54 | { 55 | self::$redis = new Redis(); 56 | if (!self::$redis->connect($configs['host'], $configs['port'], $configs['timeout'])) 57 | { 58 | self::$error = "Unable to connect to redis server\nPlease check the configuration file config/inc_config.php"; 59 | self::$redis = null; 60 | return false; 61 | } 62 | 63 | // 验证 64 | if ($configs['pass']) 65 | { 66 | if ( !self::$redis->auth($configs['pass']) ) 67 | { 68 | self::$error = "Redis Server authentication failed\nPlease check the configuration file config/inc_config.php"; 69 | self::$redis = null; 70 | return false; 71 | } 72 | } 73 | 74 | $prefix = empty($configs['prefix']) ? self::$prefix : $configs['prefix']; 75 | self::$redis->setOption(Redis::OPT_PREFIX, $prefix . 
":"); 76 | self::$redis->setOption(Redis::OPT_READ_TIMEOUT, -1); 77 | } 78 | 79 | return self::$redis; 80 | } 81 | 82 | public static function close() 83 | { 84 | if ( !empty(self::$redis) ) 85 | { 86 | self::$redis->close(); 87 | self::$redis = null; 88 | } 89 | } 90 | 91 | public static function set_connect($config = array()) 92 | { 93 | // 先断开原来的连接 94 | if ( !empty(self::$redis) ) 95 | { 96 | self::$redis->close(); 97 | self::$redis = null; 98 | } 99 | 100 | if (!empty($config)) 101 | { 102 | self::$configs = $config; 103 | } 104 | else 105 | { 106 | if (empty(self::$configs)) 107 | { 108 | throw new Exception("You not set a config array for connect!"); 109 | } 110 | } 111 | } 112 | 113 | public static function set_connect_default($config = '') 114 | { 115 | if (empty($config)) 116 | { 117 | $config = self::_get_default_config(); 118 | } 119 | self::set_connect($config); 120 | } 121 | 122 | /** 123 | * 获取默认配置 124 | */ 125 | protected static function _get_default_config() 126 | { 127 | if (empty($GLOBALS['config']['redis'])) 128 | { 129 | return array(); 130 | } 131 | self::$configs = $GLOBALS['config']['redis']; 132 | return self::$configs; 133 | } 134 | 135 | /** 136 | * set 137 | * 138 | * @param mixed $key 键 139 | * @param mixed $value 值 140 | * @param int $expire 过期时间,单位:秒 141 | * @return void 142 | * @author seatle 143 | * @created time :2015-12-13 01:05 144 | */ 145 | public static function set($key, $value, $expire = 0) 146 | { 147 | self::init(); 148 | try 149 | { 150 | if ( self::$redis ) 151 | { 152 | if ($expire > 0) 153 | { 154 | return self::$redis->setex($key, $expire, $value); 155 | } 156 | else 157 | { 158 | return self::$redis->set($key, $value); 159 | } 160 | } 161 | } 162 | catch (Exception $e) 163 | { 164 | $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n"; 165 | log::warn($msg); 166 | if ($e->getCode() == 0) 167 | { 168 | self::$redis->close(); 169 | self::$redis = null; 170 | usleep(100000); 
171 | return self::set($key, $value, $expire); 172 | } 173 | } 174 | return NULL; 175 | } 176 | 177 | 178 | /** 179 | * set 180 | * 181 | * @param mixed $key 键 182 | * @param mixed $value 值 183 | * @param int $expire 过期时间,单位:秒 184 | * @return void 185 | * @author seatle 186 | * @created time :2015-12-13 01:05 187 | */ 188 | public static function setnx($key, $value, $expire = 0) 189 | { 190 | self::init(); 191 | try 192 | { 193 | if ( self::$redis ) 194 | { 195 | if ($expire > 0) 196 | { 197 | return self::$redis->set($key, $value, array('nx', 'ex' => $expire)); 198 | //self::$redis->multi(); 199 | //self::$redis->setNX($key, $value); 200 | //self::$redis->expire($key, $expire); 201 | //self::$redis->exec(); 202 | //return true; 203 | } 204 | else 205 | { 206 | return self::$redis->setnx($key, $value); 207 | } 208 | } 209 | } 210 | catch (Exception $e) 211 | { 212 | $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n"; 213 | log::warn($msg); 214 | if ($e->getCode() == 0) 215 | { 216 | self::$redis->close(); 217 | self::$redis = null; 218 | usleep(100000); 219 | return self::setnx($key, $value, $expire); 220 | } 221 | } 222 | return NULL; 223 | } 224 | 225 | /** 226 | * 锁 227 | * 默认锁1秒 228 | * 229 | * @param mixed $name 锁的标识名 230 | * @param mixed $value 锁的值,貌似没啥意义 231 | * @param int $expire 当前锁的最大生存时间(秒),必须大于0,超过生存时间系统会自动强制释放锁 232 | * @param int $interval 获取锁失败后挂起再试的时间间隔(微秒) 233 | * @return void 234 | * @author seatle 235 | * @created time :2016-10-30 23:56 236 | */ 237 | public static function lock($name, $value = 1, $expire = 5, $interval = 100000) 238 | { 239 | if ($name == null) return false; 240 | 241 | self::init(); 242 | try 243 | { 244 | if ( self::$redis ) 245 | { 246 | $key = "Lock:{$name}"; 247 | while (true) 248 | { 249 | // 因为 setnx 没有 expire 设置,所以还是用set 250 | //$result = self::$redis->setnx($key, $value); 251 | $result = self::$redis->set($key, $value, array('nx', 'ex' => $expire)); 252 | if ($result != 
false) 253 | { 254 | return true; 255 | } 256 | 257 | usleep($interval); 258 | } 259 | return false; 260 | } 261 | } 262 | catch (Exception $e) 263 | { 264 | $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n"; 265 | log::warn($msg); 266 | if ($e->getCode() == 0) 267 | { 268 | self::$redis->close(); 269 | self::$redis = null; 270 | // 睡眠100毫秒 271 | usleep(100000); 272 | return self::lock($name, $value, $expire, $interval); 273 | } 274 | } 275 | return false; 276 | } 277 | 278 | public static function unlock($name) 279 | { 280 | $key = "Lock:{$name}"; 281 | return self::del($key); 282 | } 283 | 284 | /** 285 | * get 286 | * 287 | * @param mixed $key 288 | * @return void 289 | * @author seatle 290 | * @created time :2015-12-13 01:05 291 | */ 292 | public static function get($key) 293 | { 294 | self::init(); 295 | try 296 | { 297 | if ( self::$redis ) 298 | { 299 | return self::$redis->get($key); 300 | } 301 | } 302 | catch (Exception $e) 303 | { 304 | $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n"; 305 | log::warn($msg); 306 | if ($e->getCode() == 0) 307 | { 308 | self::$redis->close(); 309 | self::$redis = null; 310 | usleep(100000); 311 | return self::get($key); 312 | } 313 | } 314 | return NULL; 315 | } 316 | 317 | /** 318 | * del 删除数据 319 | * 320 | * @param mixed $key 321 | * @return void 322 | * @author seatle 323 | * @created time :2015-12-13 01:05 324 | */ 325 | public static function del($key) 326 | { 327 | self::init(); 328 | try 329 | { 330 | if ( self::$redis ) 331 | { 332 | return self::$redis->del($key); 333 | } 334 | } 335 | catch (Exception $e) 336 | { 337 | $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n"; 338 | log::warn($msg); 339 | if ($e->getCode() == 0) 340 | { 341 | self::$redis->close(); 342 | self::$redis = null; 343 | usleep(100000); 344 | return self::del($key); 345 | } 346 | } 347 | return 
NULL;
    }

    /**
     * type: return the data type of the value stored at $key.
     *
     * phpredis reports the type as an integer (Redis::REDIS_* constants);
     * this method translates it to a readable name.
     *
     * @param mixed $key
     * @return string|null 'string', 'set', 'list', 'zset' or 'hash'; NULL when
     *                     the key does not exist, the type is not recognised,
     *                     or no connection is available.
     * @author seatle
     * @created time :2015-12-13 01:05
     */
    public static function type($key)
    {
        self::init();

        try
        {
            if ( self::$redis )
            {
                // Built only once a connection exists, so the Redis class is
                // guaranteed to be loaded when its constants are referenced.
                // Redis::REDIS_NOT_FOUND (0) is deliberately absent: a missing
                // key yields NULL.
                $types = array(
                    Redis::REDIS_STRING => 'string',
                    Redis::REDIS_SET    => 'set',
                    Redis::REDIS_LIST   => 'list',
                    Redis::REDIS_ZSET   => 'zset',
                    Redis::REDIS_HASH   => 'hash',
                );

                $type = self::$redis->type($key);
                if (isset($types[$type]))
                {
                    return $types[$type];
                }
            }
        }
        catch (Exception $e)
        {
            $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n";
            log::warn($msg);
            if ($e->getCode() == 0)
            {
                // Drop the connection, wait 100 ms, then retry on a fresh one
                self::$redis->close();
                self::$redis = null;
                usleep(100000);
                return self::type($key);
            }
        }
        return NULL;
    }

    /**
     * incr: increase the string value stored at $key by $integer;
     * when $integer is 0 (or empty) the value is increased by 1.
     *
     * @param mixed $key
     * @param int $integer
     * @return mixed Result of the phpredis call, or NULL when no connection is available.
     * @author seatle
     * @created time :2015-12-18 11:28
     */
    public static function incr($key, $integer = 0)
    {
        self::init();
        try
        {
            if ( self::$redis )
            {
                if (empty($integer))
                {
                    return self::$redis->incr($key);
                }
                else
                {
                    return self::$redis->incrby($key, $integer);
                }
            }
        }
        catch (Exception $e)
        {
            $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n";
            log::warn($msg);
            if ($e->getCode() == 0)
            {
                // Drop the connection, wait 100 ms, then retry on a fresh one
                self::$redis->close();
                self::$redis = null;
                usleep(100000);
                return self::incr($key, $integer);
            }
        }
        return NULL;
    }

    /**
     * decr: decrease the string value stored at $key by $integer;
     * when $integer is 0 (or empty) the value is decreased by 1.
     *
     * @param mixed $key
     * @param int $integer
     * @return void
     * @author seatle
     * @created time :2015-12-18 11:28
*/
    public static function decr($key, $integer = 0)
    {
        self::init();
        try
        {
            if (self::$redis)
            {
                // An empty step means "decrement by one"
                return empty($integer)
                    ? self::$redis->decr($key)
                    : self::$redis->decrby($key, $integer);
            }
        }
        catch (Exception $e)
        {
            $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n";
            log::warn($msg);
            if ($e->getCode() == 0)
            {
                // Reset the connection, pause 100 ms and try again
                self::$redis->close();
                self::$redis = null;
                usleep(100000);
                return self::decr($key, $integer);
            }
        }
        return NULL;
    }

    /**
     * append: append $value to the string stored at $key.
     *
     * @param mixed $key
     * @param mixed $value
     * @return mixed Result of the phpredis call, or NULL when no connection is available.
     * @author seatle
     * @created time :2015-12-18 11:28
     */
    public static function append($key, $value)
    {
        self::init();
        try
        {
            if (self::$redis)
            {
                $result = self::$redis->append($key, $value);
                return $result;
            }
        }
        catch (Exception $e)
        {
            $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n";
            log::warn($msg);
            if ($e->getCode() == 0)
            {
                // Reset the connection, pause 100 ms and try again
                self::$redis->close();
                self::$redis = null;
                usleep(100000);
                return self::append($key, $value);
            }
        }
        return NULL;
    }

    /**
     * substr: return the part of the string stored at $key between $start and $end.
     *
     * @param mixed $key
     * @param mixed $start
     * @param mixed $end
     * @return mixed Result of the phpredis call, or NULL when no connection is available.
     * @author seatle
     * @created time :2015-12-18 11:28
     */
    public static function substr($key, $start, $end)
    {
        self::init();
        try
        {
            if (self::$redis)
            {
                $result = self::$redis->substr($key, $start, $end);
                return $result;
            }
        }
        catch (Exception $e)
        {
            $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n";
            log::warn($msg);
            if ($e->getCode() == 0)
            {
                // Reset the connection, pause 100 ms and try again
                self::$redis->close();
                self::$redis = null;
                usleep(100000);
                return self::substr($key, $start, $end);
            }
        }
        return NULL;
    }

    /**
     * select: switch to the database with the given index.
     *
     * @param mixed $index
     * @return mixed Result of the phpredis call, or NULL when no connection is available.
     * @author seatle
     * @created time :2015-12-18 11:28
     */
    public static function select($index)
    {
        self::init();
        try
        {
            if (self::$redis)
            {
                $result = self::$redis->select($index);
                return $result;
            }
        }
        catch (Exception $e)
        {
            $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n";
            log::warn($msg);
            if ($e->getCode() == 0)
            {
                // Reset the connection, pause 100 ms and try again
                self::$redis->close();
                self::$redis = null;
                usleep(100000);
                return self::select($index);
            }
        }
        return NULL;
    }

    /**
     * dbsize: return the number of keys in the current database.
     *
     * @return mixed Result of the phpredis call, or NULL when no connection is available.
     * @author seatle
     * @created time :2015-12-18 11:28
     */
    public static function dbsize()
    {
        self::init();
        try
        {
            if (self::$redis)
            {
                $result = self::$redis->dbsize();
                return $result;
            }
        }
        catch (Exception $e)
        {
            $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n";
            log::warn($msg);
            if ($e->getCode() == 0)
            {
                // Reset the connection, pause 100 ms and try again
                self::$redis->close();
                self::$redis = null;
                usleep(100000);
                return self::dbsize();
            }
        }
        return NULL;
    }

    /**
     * flushdb: delete every key in the currently selected database.
     *
     * @return mixed Result of the phpredis call, or NULL when no connection is available.
     * @author seatle
     * @created time :2015-12-18 11:28
     */
    public static function flushdb()
    {
        self::init();
        try
        {
            if (self::$redis)
            {
                $result = self::$redis->flushdb();
                return $result;
            }
        }
        catch (Exception $e)
        {
            $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n";
            log::warn($msg);
            if ($e->getCode() == 0)
            {
                // Reset the connection, pause 100 ms and try again
                self::$redis->close();
                self::$redis = null;
                usleep(100000);
                return self::flushdb();
            }
        }
        return NULL;
    }

    /**
     * flushall: delete every key in every database.
     *
     * @return mixed Result of the phpredis call, or NULL when no connection is available.
     * @author seatle
     * @created time :2015-12-18 11:28
     */
    public static function flushall()
    {
        self::init();
        try
        {
            if (self::$redis)
            {
                $result = self::$redis->flushall();
                return $result;
            }
        }
        catch (Exception $e)
        {
            $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n";
            log::warn($msg);
            if ($e->getCode() == 0)
            {
                // Reset the connection, pause 100 ms and try again
                self::$redis->close();
                self::$redis = null;
                usleep(100000);
                return self::flushall();
            }
        }
        return NULL;
    }

    /**
     * save: write the dataset to disk.
     *
     * @param mixed $is_bgsave when truthy, save asynchronously (bgsave) instead of blocking (save)
     * @return mixed Result of the phpredis call, or NULL when no connection is available.
     * @author seatle
     * @created time :2015-12-18 11:28
     */
    public static function save($is_bgsave = false)
    {
        self::init();
        try
        {
            if (self::$redis)
            {
                return $is_bgsave
                    ? self::$redis->bgsave()
                    : self::$redis->save();
            }
        }
        catch (Exception $e)
        {
            $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n";
            log::warn($msg);
            if ($e->getCode() == 0)
            {
                // Reset the connection, pause 100 ms and try again
                self::$redis->close();
                self::$redis = null;
                usleep(100000);
                return self::save($is_bgsave);
            }
        }
        return NULL;
    }

    /**
     * info: server information and statistics.
     *
     * @return void
     * @author seatle
     * @created time :2015-12-18 11:28
*/
    public static function info()
    {
        self::init();
        try
        {
            if ( self::$redis )
            {
                return self::$redis->info();
            }
        }
        catch (Exception $e)
        {
            $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n";
            log::warn($msg);
            if ($e->getCode() == 0)
            {
                // Drop the connection, wait 100 ms, then retry on a fresh one
                self::$redis->close();
                self::$redis = null;
                usleep(100000);
                return self::info();
            }
        }
        return NULL;
    }

    /**
     * slowlog: access the Redis slow query log.
     *
     * @param string $command slowlog sub-command passed to phpredis (default 'get')
     * @param int    $len     optional length argument; omitted from the call when empty
     * @return mixed Result of the phpredis call, or NULL when no connection is available.
     * @author seatle
     * @created time :2015-12-18 11:28
     */
    public static function slowlog($command = 'get', $len = 0)
    {
        self::init();
        try
        {
            if ( self::$redis )
            {
                // The call must go through the shared connection self::$redis;
                // there is no local $redis variable in this scope.
                if (!empty($len))
                {
                    return self::$redis->slowlog($command, $len);
                }
                else
                {
                    return self::$redis->slowlog($command);
                }
            }
        }
        catch (Exception $e)
        {
            $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n";
            log::warn($msg);
            if ($e->getCode() == 0)
            {
                // Drop the connection, wait 100 ms, then retry on a fresh one
                self::$redis->close();
                self::$redis = null;
                usleep(100000);
                return self::slowlog($command, $len);
            }
        }
        return NULL;
    }

    /**
     * lastsave: return the Unix timestamp of the last successful save to disk.
     *
     * @return mixed Result of the phpredis call, or NULL when no connection is available.
     * @author seatle
     * @created time :2015-12-18 11:28
     */
    public static function lastsave()
    {
        self::init();
        try
        {
            if ( self::$redis )
            {
                return self::$redis->lastsave();
            }
        }
        catch (Exception $e)
        {
            $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n";
            log::warn($msg);
            if ($e->getCode() == 0)
            {
                // Drop the connection, wait 100 ms, then retry on a fresh one
                self::$redis->close();
                self::$redis = null;
                usleep(100000);
                return self::lastsave();
            }
814 | } 815 | return NULL; 816 | } 817 | 818 | /** 819 | * lpush 将数据从左边压入 820 | * 821 | * @param mixed $key 822 | * @param mixed $value 823 | * @return void 824 | * @author seatle 825 | * @created time :2015-12-13 01:05 826 | */ 827 | public static function lpush($key, $value) 828 | { 829 | self::init(); 830 | try 831 | { 832 | if ( self::$redis ) 833 | { 834 | return self::$redis->lpush($key, $value); 835 | } 836 | } 837 | catch (Exception $e) 838 | { 839 | $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n"; 840 | log::warn($msg); 841 | if ($e->getCode() == 0) 842 | { 843 | self::$redis->close(); 844 | self::$redis = null; 845 | usleep(100000); 846 | return self::lpush($key, $value); 847 | } 848 | } 849 | return NULL; 850 | } 851 | 852 | /** 853 | * rpush 将数据从右边压入 854 | * 855 | * @param mixed $key 856 | * @param mixed $value 857 | * @return void 858 | * @author seatle 859 | * @created time :2015-12-13 01:05 860 | */ 861 | public static function rpush($key, $value) 862 | { 863 | self::init(); 864 | try 865 | { 866 | if ( self::$redis ) 867 | { 868 | return self::$redis->rpush($key, $value); 869 | } 870 | } 871 | catch (Exception $e) 872 | { 873 | $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n"; 874 | log::warn($msg); 875 | if ($e->getCode() == 0) 876 | { 877 | self::$redis->close(); 878 | self::$redis = null; 879 | usleep(100000); 880 | return self::rpush($key, $value); 881 | } 882 | } 883 | return NULL; 884 | } 885 | 886 | /** 887 | * lpop 从左边弹出数据, 并删除数据 888 | * 889 | * @param mixed $key 890 | * @return void 891 | * @author seatle 892 | * @created time :2015-12-13 01:05 893 | */ 894 | public static function lpop($key) 895 | { 896 | self::init(); 897 | try 898 | { 899 | if ( self::$redis ) 900 | { 901 | return self::$redis->lpop($key); 902 | } 903 | } 904 | catch (Exception $e) 905 | { 906 | $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message 
'".$e->getMessage()."'\n"; 907 | log::warn($msg); 908 | if ($e->getCode() == 0) 909 | { 910 | self::$redis->close(); 911 | self::$redis = null; 912 | usleep(100000); 913 | return self::lpop($key); 914 | } 915 | } 916 | return NULL; 917 | } 918 | 919 | /** 920 | * rpop 从右边弹出数据, 并删除数据 921 | * 922 | * @param mixed $key 923 | * @return void 924 | * @author seatle 925 | * @created time :2015-12-13 01:05 926 | */ 927 | public static function rpop($key) 928 | { 929 | self::init(); 930 | try 931 | { 932 | if ( self::$redis ) 933 | { 934 | return self::$redis->rpop($key); 935 | } 936 | } 937 | catch (Exception $e) 938 | { 939 | $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n"; 940 | log::warn($msg); 941 | if ($e->getCode() == 0) 942 | { 943 | self::$redis->close(); 944 | self::$redis = null; 945 | usleep(100000); 946 | return self::rpop($key); 947 | } 948 | } 949 | return NULL; 950 | } 951 | 952 | /** 953 | * lsize 队列长度,同llen 954 | * 955 | * @param mixed $key 956 | * @return void 957 | * @author seatle 958 | * @created time :2015-12-13 01:05 959 | */ 960 | public static function lsize($key) 961 | { 962 | self::init(); 963 | try 964 | { 965 | if ( self::$redis ) 966 | { 967 | return self::$redis->lSize($key); 968 | } 969 | } 970 | catch (Exception $e) 971 | { 972 | $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n"; 973 | log::warn($msg); 974 | if ($e->getCode() == 0) 975 | { 976 | self::$redis->close(); 977 | self::$redis = null; 978 | usleep(100000); 979 | return self::lsize($key); 980 | } 981 | } 982 | return NULL; 983 | } 984 | 985 | /** 986 | * lget 获取数据 987 | * 988 | * @param mixed $key 989 | * @param int $index 990 | * @return void 991 | * @author seatle 992 | * @created time :2015-12-13 01:05 993 | */ 994 | public static function lget($key, $index = 0) 995 | { 996 | self::init(); 997 | try 998 | { 999 | if ( self::$redis ) 1000 | { 1001 | return 
self::$redis->lget($key, $index); 1002 | } 1003 | } 1004 | catch (Exception $e) 1005 | { 1006 | $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n"; 1007 | log::warn($msg); 1008 | if ($e->getCode() == 0) 1009 | { 1010 | self::$redis->close(); 1011 | self::$redis = null; 1012 | usleep(100000); 1013 | return self::lget($key, $index); 1014 | } 1015 | } 1016 | return NULL; 1017 | } 1018 | 1019 | /** 1020 | * lRange 获取范围数据 1021 | * 1022 | * @param mixed $key 1023 | * @param mixed $start 1024 | * @param mixed $end 1025 | * @return void 1026 | * @author seatle 1027 | * @created time :2015-12-13 01:05 1028 | */ 1029 | public static function lrange($key, $start, $end) 1030 | { 1031 | self::init(); 1032 | try 1033 | { 1034 | if ( self::$redis ) 1035 | { 1036 | return self::$redis->lRange($key, $start, $end); 1037 | } 1038 | } 1039 | catch (Exception $e) 1040 | { 1041 | $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n"; 1042 | log::warn($msg); 1043 | if ($e->getCode() == 0) 1044 | { 1045 | self::$redis->close(); 1046 | self::$redis = null; 1047 | usleep(100000); 1048 | return self::lrange($key, $start, $end); 1049 | } 1050 | } 1051 | return NULL; 1052 | } 1053 | 1054 | /** 1055 | * rlist 从右边弹出 $length 长度数据,并删除数据 1056 | * 1057 | * @param mixed $key 1058 | * @param mixed $length 1059 | * @return void 1060 | * @author seatle 1061 | * @created time :2015-12-13 01:05 1062 | */ 1063 | public static function rlist($key, $length) 1064 | { 1065 | $queue_length = self::lsize($key); 1066 | // 如果队列中有数据 1067 | if ($queue_length > 0) 1068 | { 1069 | $list = array(); 1070 | $count = ($queue_length >= $length) ? 
$length : $queue_length; 1071 | for ($i = 0; $i < $count; $i++) 1072 | { 1073 | $data = self::rpop($key); 1074 | if ($data === false) 1075 | { 1076 | continue; 1077 | } 1078 | 1079 | $list[] = $data; 1080 | } 1081 | return $list; 1082 | } 1083 | else 1084 | { 1085 | // 没有数据返回NULL 1086 | return NULL; 1087 | } 1088 | } 1089 | 1090 | /** 1091 | * keys 1092 | * 1093 | * @param mixed $key 1094 | * @return void 1095 | * @author seatle 1096 | * @created time :2015-12-13 01:05 1097 | * 查找符合给定模式的key。 1098 | * KEYS *命中数据库中所有key。 1099 | * KEYS h?llo命中hello, hallo and hxllo等。 1100 | * KEYS h*llo命中hllo和heeeeello等。 1101 | * KEYS h[ae]llo命中hello和hallo,但不命中hillo。 1102 | * 特殊符号用"\"隔开 1103 | * 因为这个类加了OPT_PREFIX前缀,所以并不能真的列出redis所有的key,需要的话,要把前缀去掉 1104 | */ 1105 | public static function keys($key) 1106 | { 1107 | self::init(); 1108 | try 1109 | { 1110 | if ( self::$redis ) 1111 | { 1112 | return self::$redis->keys($key); 1113 | } 1114 | } 1115 | catch (Exception $e) 1116 | { 1117 | $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n"; 1118 | log::warn($msg); 1119 | if ($e->getCode() == 0) 1120 | { 1121 | self::$redis->close(); 1122 | self::$redis = null; 1123 | usleep(100000); 1124 | return self::keys($key); 1125 | } 1126 | } 1127 | return NULL; 1128 | } 1129 | 1130 | /** 1131 | * ttl 返回某个KEY的过期时间 1132 | * 正数:剩余多少秒 1133 | * -1:永不超时 1134 | * -2:key不存在 1135 | * @param mixed $key 1136 | * @return void 1137 | * @author seatle 1138 | * @created time :2015-12-13 01:05 1139 | */ 1140 | public static function ttl($key) 1141 | { 1142 | self::init(); 1143 | try 1144 | { 1145 | if ( self::$redis ) 1146 | { 1147 | return self::$redis->ttl($key); 1148 | } 1149 | } 1150 | catch (Exception $e) 1151 | { 1152 | $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n"; 1153 | log::warn($msg); 1154 | if ($e->getCode() == 0) 1155 | { 1156 | self::$redis->close(); 1157 | self::$redis = null; 1158 | usleep(100000); 
1159 | return self::ttl($key); 1160 | } 1161 | } 1162 | return NULL; 1163 | } 1164 | 1165 | /** 1166 | * expire 为某个key设置过期时间,同setTimeout 1167 | * 1168 | * @param mixed $key 1169 | * @param mixed $expire 1170 | * @return void 1171 | * @author seatle 1172 | * @created time :2015-12-13 01:05 1173 | */ 1174 | public static function expire($key, $expire) 1175 | { 1176 | self::init(); 1177 | try 1178 | { 1179 | if ( self::$redis ) 1180 | { 1181 | return self::$redis->expire($key, $expire); 1182 | } 1183 | } 1184 | catch (Exception $e) 1185 | { 1186 | $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n"; 1187 | log::warn($msg); 1188 | if ($e->getCode() == 0) 1189 | { 1190 | self::$redis->close(); 1191 | self::$redis = null; 1192 | usleep(100000); 1193 | return self::expire($key, $expire); 1194 | } 1195 | } 1196 | return NULL; 1197 | } 1198 | 1199 | /** 1200 | * exists key值是否存在 1201 | * 1202 | * @param mixed $key 1203 | * @return void 1204 | * @author seatle 1205 | * @created time :2015-12-13 01:05 1206 | */ 1207 | public static function exists($key) 1208 | { 1209 | self::init(); 1210 | try 1211 | { 1212 | if ( self::$redis ) 1213 | { 1214 | return self::$redis->exists($key); 1215 | } 1216 | } 1217 | catch (Exception $e) 1218 | { 1219 | $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n"; 1220 | log::warn($msg); 1221 | if ($e->getCode() == 0) 1222 | { 1223 | self::$redis->close(); 1224 | self::$redis = null; 1225 | usleep(100000); 1226 | return self::exists($key); 1227 | } 1228 | } 1229 | return false; 1230 | } 1231 | 1232 | /** 1233 | * ping 检查当前redis是否存在且是否可以连接上 1234 | * 1235 | * @return void 1236 | * @author seatle 1237 | * @created time :2015-12-13 01:05 1238 | */ 1239 | //protected static function ping() 1240 | //{ 1241 | //if ( empty (self::$redis) ) 1242 | //{ 1243 | //return false; 1244 | //} 1245 | //return self::$redis->ping() == '+PONG'; 1246 | //} 1247 | 1248 | public 
static function encode($value) 1249 | { 1250 | return json_encode($value, JSON_UNESCAPED_UNICODE); 1251 | } 1252 | 1253 | public static function decode($value) 1254 | { 1255 | return json_decode($value, true); 1256 | } 1257 | } 1258 | 1259 | 1260 | -------------------------------------------------------------------------------- /library/rolling_curl.php: -------------------------------------------------------------------------------- 1 | 10 | * @copyright seatle 11 | * @link http://www.epooll.com/ 12 | * @license http://www.opensource.org/licenses/mit-license.php MIT License 13 | */ 14 | 15 | class rolling_curl 16 | { 17 | /** 18 | * @var float 19 | * 20 | * 同时运行任务数 21 | * 例如:有8个请求,则会被分成两批,第一批5个请求,第二批3个请求 22 | * 注意:采集知乎的时候,5个是比较稳定的,7个以上就开始会超时了,多进程就没有这样的问题,因为多进程很少几率会发生并发 23 | */ 24 | public $window_size = 5; 25 | 26 | /** 27 | * @var float 28 | * 29 | * Timeout is the timeout used for curl_multi_select. 30 | */ 31 | private $timeout = 10; 32 | 33 | /** 34 | * @var string|array 35 | * 36 | * 应用在每个请求的回调函数 37 | */ 38 | public $callback; 39 | 40 | /** 41 | * @var array 42 | * 43 | * 设置默认的请求参数 44 | */ 45 | protected $options = array( 46 | CURLOPT_SSL_VERIFYPEER => 0, 47 | CURLOPT_RETURNTRANSFER => 1, 48 | // 注意:TIMEOUT = CONNECTTIMEOUT + 数据获取时间,所以 TIMEOUT 一定要大于 CONNECTTIMEOUT,否则 CONNECTTIMEOUT 设置了就没意义 49 | // "Connection timed out after 30001 milliseconds" 50 | CURLOPT_CONNECTTIMEOUT => 30, 51 | CURLOPT_TIMEOUT => 60, 52 | CURLOPT_RETURNTRANSFER => 1, 53 | CURLOPT_HEADER => 0, 54 | // 在多线程处理场景下使用超时选项时,会忽略signals对应的处理函数,但是无耐的是还有小概率的crash情况发生 55 | CURLOPT_NOSIGNAL => 1, 56 | CURLOPT_USERAGENT => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36", 57 | ); 58 | 59 | /** 60 | * @var array 61 | */ 62 | private $headers = array(); 63 | 64 | /** 65 | * @var Request[] 66 | * 67 | * 请求队列 68 | */ 69 | private $requests = array(); 70 | 71 | /** 72 | * @var RequestMap[] 73 | * 74 | * Maps handles to request 
indexes 75 | */ 76 | private $requestMap = array(); 77 | 78 | public function __construct() 79 | { 80 | } 81 | 82 | /** 83 | * set timeout 84 | * 85 | * @param init $timeout 86 | * @return 87 | */ 88 | public function set_timeout($timeout) 89 | { 90 | $this->options[CURLOPT_TIMEOUT] = $timeout; 91 | } 92 | 93 | /** 94 | * set proxy 95 | * 96 | */ 97 | public function set_proxy($proxy) 98 | { 99 | $this->options[CURLOPT_PROXY] = $proxy; 100 | } 101 | 102 | /** 103 | * set referer 104 | * 105 | */ 106 | public function set_referer($referer) 107 | { 108 | $this->options[CURLOPT_REFERER] = $referer; 109 | } 110 | 111 | /** 112 | * 设置 user_agent 113 | * 114 | * @param string $useragent 115 | * @return void 116 | */ 117 | public function set_useragent($useragent) 118 | { 119 | $this->options[CURLOPT_USERAGENT] = $useragent; 120 | } 121 | 122 | /** 123 | * 设置COOKIE 124 | * 125 | * @param string $cookie 126 | * @return void 127 | */ 128 | public function set_cookie($cookie) 129 | { 130 | $this->options[CURLOPT_COOKIE] = $cookie; 131 | } 132 | 133 | /** 134 | * 设置COOKIE JAR 135 | * 136 | * @param string $cookie_jar 137 | * @return void 138 | */ 139 | public function set_cookiejar($cookiejar) 140 | { 141 | $this->options[CURLOPT_COOKIEJAR] = $cookiejar; 142 | } 143 | 144 | /** 145 | * 设置COOKIE FILE 146 | * 147 | * @param string $cookie_file 148 | * @return void 149 | */ 150 | public function set_cookiefile($cookiefile) 151 | { 152 | $this->options[CURLOPT_COOKIEFILE] = $cookiefile; 153 | } 154 | 155 | /** 156 | * 获取内容的时候是不是连header也一起获取 157 | * 158 | * @param mixed $http_raw 159 | * @return void 160 | * @author seatle 161 | * @created time :2016-09-18 10:17 162 | */ 163 | public function set_http_raw($http_raw = false) 164 | { 165 | $this->options[CURLOPT_HEADER] = $http_raw; 166 | } 167 | 168 | /** 169 | * 设置IP 170 | * 171 | * @param string $ip 172 | * @return void 173 | */ 174 | public function set_ip($ip) 175 | { 176 | $headers = array( 177 | 'CLIENT-IP'=>$ip, 178 | 
'X-FORWARDED-FOR'=>$ip,
        );
        // The new values go on the left of the array union so that calling
        // set_ip() again replaces the previously configured address.
        $this->headers = $headers + $this->headers;
    }

    /**
     * Add default headers that are sent with every request.
     *
     * Headers passed here override defaults of the same name that were set
     * earlier.
     *
     * @param array $headers header name => header value
     * @return void
     */
    public function set_headers($headers)
    {
        $this->headers = $headers + $this->headers;
    }

    /**
     * Set the default "Host" header sent with every request.
     *
     * @param string $hosts value of the Host header
     * @return void
     */
    public function set_hosts($hosts)
    {
        $headers = array(
            'Host'=>$hosts,
        );
        $this->headers = $headers + $this->headers;
    }

    /**
     * Ask curl to request gzip-encoded responses.
     *
     * A falsy value leaves the current setting untouched.
     *
     * @param bool $gzip
     * @return void
     */
    public function set_gzip($gzip)
    {
        if ($gzip)
        {
            $this->options[CURLOPT_ENCODING] = 'gzip';
        }
    }

    /**
     * Queue a request; nothing is sent until execute() is called.
     *
     * @param string       $url
     * @param string       $method  "GET" or "POST" (case-insensitive)
     * @param array|string $fields  query fields (GET) or body (POST)
     * @param array        $headers per-request headers, name => value
     * @param array        $options per-request curl options, CURLOPT_* => value
     * @return bool always true
     */
    public function request($url, $method = "GET", $fields = array(), $headers = array(), $options = array())
    {
        $this->requests[] = array('url'=>$url,'method'=>$method,'fields'=>$fields,'headers'=>$headers,'options'=>$options);
        return true;
    }

    /**
     * Build the curl option array for one queued request.
     *
     * Per-request options and headers take precedence over the defaults
     * configured on this object. For GET requests the fields are appended to
     * the URL as a query string; for POST requests they are sent as the body.
     *
     * @param array $request entry created by request()
     * @return array CURLOPT_* => value, ready for curl_setopt_array()
     */
    public function get_options($request)
    {
        $options = $this->options;
        $headers = $this->headers;
        $url     = $request['url'];
        $method  = strtolower($request['method']);

        if (ini_get('safe_mode') == 'Off' || !ini_get('safe_mode'))
        {
            $options[CURLOPT_FOLLOWLOCATION] = 1;
            $options[CURLOPT_MAXREDIRS] = 5;
        }

        // GET: append the fields to the URL. The original code built this URL
        // but then passed $request['url'] to curl, so the fields were lost.
        if ($method == 'get' && !empty($request['fields']))
        {
            $query = is_array($request['fields']) ? http_build_query($request['fields']) : $request['fields'];
            // Use "&" when the URL already carries a query string.
            $url .= (strpos($url, '?') === false ? "?" : "&") . $query;
        }
        // POST: the fields become the request body.
        if ($method == 'post')
        {
            $options[CURLOPT_POST] = 1;
            $options[CURLOPT_POSTFIELDS] = $request['fields'];
        }

        // Per-request curl options override the defaults.
        if ($request['options'])
        {
            $options = $request['options'] + $options;
        }

        // Per-request headers override the defaults.
        if ($request['headers'])
        {
            $headers = $request['headers'] + $headers;
        }

        // Disabled: bind a random host IP for load balancing.
        //if (self::$hosts)
        //{
            //$parse_url = parse_url($url);
            //$host = $parse_url['host'];
            //$key = rand(0, count(self::$hosts)-1);
            //$ip = self::$hosts[$key];
            //$url = str_replace($host, $ip, $url);
            //self::$headers = array_merge( array('Host:'.$host), self::$headers );
        //}

        // curl expects a list of "Name:value" strings, not a map.
        $headers_tmp = array();
        foreach ($headers as $k=>$v)
        {
            $headers_tmp[] = $k.":".$v;
        }
        $headers = $headers_tmp;

        $options[CURLOPT_URL] = $url;
        $options[CURLOPT_HTTPHEADER] = $headers;

        return $options;
    }

    /**
     * Queue a GET request.
     *
     * @param string $url
     * @param array  $fields  query-string fields
     * @param array  $headers
     * @param array  $options
     * @return bool
     */
    public function get($url, $fields = array(), $headers = array(), $options = array())
    {
        return $this->request($url, 'get', $fields, $headers, $options);
    }

    /**
     * Queue a POST request.
     *
     * $fields may take three forms:
     *   1. an array:              array('name'=>'yangzetao')
     *   2. an HTTP query string:  http_build_query(array('name'=>'yangzetao'))
     *   3. a JSON string:         json_encode(array('name'=>'yangzetao'))
     * The first two are ordinary form posts that the receiving script can read
     * from $_POST. The third is a raw POST stream (JSON-RPC style); the
     * receiver has to read it from the raw request body (php://input or
     * $HTTP_RAW_POST_DATA).
     *
     * @param string       $url
     * @param array|string $fields
     * @param array        $headers
     * @param array        $options
     * @return bool
     */
    public function
post($url, $fields = array(), $headers = array(), $options = array())
    {
        return $this->request($url, 'post', $fields, $headers, $options);
    }

    /**
     * Send every queued request.
     *
     * @param int $window_size Max number of simultaneous connections
     * @return string|bool false when nothing is queued; for a single request
     *                     the result of single_curl(); true after a batch of
     *                     two or more requests has been processed
     */
    public function execute($window_size = null)
    {
        $count = count($this->requests);
        if ($count == 0)
        {
            return false;
        }
        // Exactly one request: no need for the multi interface.
        if ($count == 1)
        {
            return $this->single_curl();
        }
        // Several requests: roll them through a window of parallel connections.
        return $this->rolling_curl($window_size);
    }

    /**
     * Run the first queued request on its own curl handle.
     *
     * When a callable callback is configured it receives
     * ($output, $info, $request, $error), the same arguments as in a batch.
     *
     * @return string|bool the curl_exec() result when no callback is set,
     *                     otherwise true
     */
    private function single_curl()
    {
        $ch = curl_init();
        // Take the request off the queue so a later execute() does not repeat it.
        $request = array_shift($this->requests);
        $options = $this->get_options($request);
        curl_setopt_array($ch, $options);
        $output = curl_exec($ch);
        $info = curl_getinfo($ch);
        $error = null;
        if ($output === false)
        {
            $error = curl_error( $ch );
        }
        // Release the handle. Only resource-based handles (PHP < 8) need an
        // explicit close; object handles are freed automatically.
        if (is_resource($ch))
        {
            curl_close($ch);
        }

        if ($this->callback)
        {
            if (is_callable($this->callback))
            {
                call_user_func($this->callback, $output, $info, $request, $error);
            }
        }
        else
        {
            return $output;
        }
        return true;
    }

    /**
     * Process the whole request queue with at most $window_size transfers
     * in flight at any time.
     *
     * Whenever a transfer finishes, the configured callback (if callable) is
     * invoked with ($output, $info, $request, $error) and the next queued
     * request takes its slot. The queue is emptied before returning.
     *
     * @param int|null $window_size optional override for $this->window_size
     * @return bool always true
     */
    private function rolling_curl($window_size = null)
    {
        // An explicit window size replaces the configured one.
        if ($window_size)
        {
            $this->window_size = $window_size;
        }

        // Never open more connections than there are requests, and always at
        // least one. The clamp is kept local so that a small batch does not
        // permanently shrink the configured window for later batches.
        $window = max(1, min((int) $this->window_size, count($this->requests)));

        $master = curl_multi_init();

        // Start the first batch.
        for ($i = 0; $i < $window; $i++)
        {
            $this->add_handle($master, $i);
        }

        do {
            while (($execrun = curl_multi_exec($master, $running)) == CURLM_CALL_MULTI_PERFORM) ;

            if ($execrun != CURLM_OK) { break; }

            // Set when a queued request was started during this pass. Without
            // it the loop would stop as soon as $running reached 0, even
            // though new handles had just been added and never executed.
            $refilled = false;

            // Collect every transfer that has finished.
            while ($done = curl_multi_info_read($master))
            {
                $handle = $done['handle'];
                $info = curl_getinfo($handle);
                $output = curl_multi_getcontent($handle);
                $error = curl_error($handle);

                // Look up which request this handle belonged to, and drop the
                // map entry whether or not a callback is configured.
                $key = $this->handle_key($handle);
                $request = isset($this->requestMap[$key]) ? $this->requests[$this->requestMap[$key]] : null;
                unset($this->requestMap[$key]);

                $callback = $this->callback;
                if (is_callable($callback))
                {
                    call_user_func($callback, $output, $info, $request, $error);
                }

                // One transfer finished, so start the next queued request to
                // keep the window full.
                if (isset($this->requests[$i]))
                {
                    $this->add_handle($master, $i);
                    $i++;
                    $refilled = true;
                }

                // Detach and release the finished handle.
                curl_multi_remove_handle($master, $handle);
                if (is_resource($handle))
                {
                    curl_close($handle);
                }
            }

            // Block until there is activity instead of spinning at 100% CPU.
            if ($running)
            {
                curl_multi_select($master, $this->timeout);
            }

        } while ($running || $refilled);

        curl_multi_close($master);

        // Empty the queue so that reusing this object for another batch does
        // not send the same URLs again. The properties are reset, not unset,
        // so execute() can still count them afterwards.
        $this->requests = array();
        $this->requestMap = array();
        return true;
    }

    /**
     * Create a curl handle for the queued request at $index, attach it to the
     * multi handle and remember which request it belongs to.
     *
     * @param mixed $master multi handle from curl_multi_init()
     * @param int   $index  index into $this->requests
     * @return void
     */
    private function add_handle($master, $index)
    {
        $ch = curl_init();
        curl_setopt_array($ch, $this->get_options($this->requests[$index]));
        curl_multi_add_handle($master, $ch);
        $this->requestMap[$this->handle_key($ch)] = $index;
    }

    /**
     * Return a string key that identifies a curl handle.
     *
     * Resource handles can be cast to a string; object handles cannot, so an
     * object hash is used for those instead.
     *
     * @param mixed $ch curl handle
     * @return string
     */
    private function handle_key($ch)
    {
        return is_resource($ch) ? (string) $ch : 'curl#' . spl_object_hash($ch);
    }

/** 460 | * @return void 461 | */ 462 | public function __destruct() 463 | { 464 | unset($this->window_size, $this->callback, $this->options, $this->headers, $this->requests); 465 | } 466 | } 467 | -------------------------------------------------------------------------------- /user.php: -------------------------------------------------------------------------------- 1 | 10 | * @copyright seatle 11 | * @link http://www.epooll.com/ 12 | * @license http://www.opensource.org/licenses/mit-license.php MIT License 13 | */ 14 | 15 | /** 16 | * 获取用户详细信息 17 | * 18 | * @param string $username 19 | * @return void 20 | * @author seatle 21 | * @created time :2015-07-28 09:46 22 | */ 23 | function get_user_about($content) 24 | { 25 | $data = array(); 26 | 27 | if (empty($content)) 28 | { 29 | return $data; 30 | } 31 | 32 | // 一句话介绍 33 | preg_match('##', $content, $out); 34 | $data['headline'] = empty($out[1]) ? '' : $out[1]; 35 | 36 | // 头像 37 | //preg_match('#龙威廉#', $content, $out); 38 | preg_match('#.*?#', $content, $out); 39 | $data['headimg'] = empty($out[1]) ? '' : $out[1]; 40 | 41 | // 居住地 42 | preg_match('##', $content, $out); 43 | $data['location'] = empty($out[1]) ? '' : $out[1]; 44 | 45 | // 所在行业 46 | preg_match('##', $content, $out); 47 | $data['business'] = empty($out[1]) ? '' : $out[1]; 48 | 49 | // 性别 50 | preg_match('##', $content, $out); 51 | $gender = empty($out[1]) ? 'other' : $out[1]; 52 | if ($gender == 'female') 53 | $data['gender'] = 0; 54 | elseif ($gender == 'male') 55 | $data['gender'] = 1; 56 | else 57 | $data['gender'] = 2; 58 | 59 | // 公司或组织名称 60 | preg_match('##', $content, $out); 61 | $data['employment'] = empty($out[1]) ? '' : $out[1]; 62 | 63 | // 职位 64 | preg_match('##', $content, $out); 65 | $data['position'] = empty($out[1]) ? '' : $out[1]; 66 | 67 | // 学校或教育机构名 68 | preg_match('##', $content, $out); 69 | $data['education'] = empty($out[1]) ? 
'' : $out[1]; 70 | 71 | // 专业方向 72 | preg_match('##', $content, $out); 73 | $data['education_extra'] = empty($out[1]) ? '' : $out[1]; 74 | 75 | // 新浪微博 76 | preg_match('#\s(.*?)\s#s', $content, $out); 81 | $data['description'] = empty($out[1]) ? '' : trim(strip_tags($out[1])); 82 | 83 | // 关注了、关注者 84 | preg_match('#关注了
    \s(.*?)#', $content, $out); 85 | $data['followees'] = empty($out[1]) ? 0 : intval($out[1]); 86 | preg_match('#关注者
    \s(.*?)#', $content, $out); 87 | $data['followers'] = empty($out[1]) ? 0 : intval($out[1]); 88 | 89 | // 关注专栏 90 | preg_match('#(.*?) 个专栏#', $content, $out); 91 | $data['followed'] = empty($out[1]) ? 0 : intval($out[1]); 92 | 93 | // 关注话题 94 | preg_match('#(.*?) 个话题#', $content, $out); 95 | $data['topics'] = empty($out[1]) ? 0 : intval($out[1]); 96 | 97 | // 关注专栏 98 | preg_match('#个人主页被 (.*?) 人浏览#', $content, $out); 99 | $data['pv'] = empty($out[1]) ? 0 : intval($out[1]); 100 | 101 | // 提问、回答、专栏文章、收藏、公共编辑 102 | preg_match('#提问\s(.*?)#', $content, $out); 103 | $data['asks'] = empty($out[1]) ? 0 : intval($out[1]); 104 | preg_match('#回答\s(.*?)#', $content, $out); 105 | $data['answers'] = empty($out[1]) ? 0 : intval($out[1]); 106 | preg_match('#专栏文章\s(.*?)#', $content, $out); 107 | $data['posts'] = empty($out[1]) ? 0 : intval($out[1]); 108 | preg_match('#收藏\s(.*?)#', $content, $out); 109 | $data['collections'] = empty($out[1]) ? 0 : intval($out[1]); 110 | preg_match('#公共编辑\s(.*?)#', $content, $out); 111 | $data['logs'] = empty($out[1]) ? 0 : intval($out[1]); 112 | 113 | // 赞同、感谢、收藏、分享 114 | preg_match('#(.*?) 赞同#', $content, $out); 115 | $data['votes'] = empty($out[1]) ? 0 : intval($out[1]); 116 | preg_match('#(.*?) 感谢#', $content, $out); 117 | $data['thanks'] = empty($out[1]) ? 0 : intval($out[1]); 118 | preg_match('#(.*?) 收藏#', $content, $out); 119 | $data['favs'] = empty($out[1]) ? 0 : intval($out[1]); 120 | preg_match('#(.*?) 分享#', $content, $out); 121 | $data['shares'] = empty($out[1]) ? 0 : intval($out[1]); 122 | return $data; 123 | } 124 | 125 | function get_user($content) 126 | { 127 | $data = array(); 128 | 129 | if (empty($content)) 130 | { 131 | return $data; 132 | } 133 | // 从用户主页获取用户最后一条动态信息 134 | preg_match('#
    (.*?)
    #s', $content, $out); 137 | $data['last_message'] = empty($out[1]) ? 0 : trim(str_replace("\n", " ", strip_tags($out[1]))); 138 | return $data; 139 | } 140 | 141 | /** 142 | * 保存用户信息 143 | * 144 | * @param object $worker 145 | * @return array 146 | * @author seatle 147 | * @created time :2015-08-02 12:30 148 | */ 149 | function save_user_info($worker = null) 150 | { 151 | // 先给一条记录上锁 152 | $progress_id = posix_getpid(); 153 | $time = time(); 154 | 155 | // 不要按照uptime排序然后又去更新uptime,锁的索引值会变成整张表,如果所有进程都锁住整张表,就会出现死锁 156 | //$sql = "Update `user` Set `info_progress_id`='{$progress_id}', `info_uptime`='{$time}' Order By `info_uptime` Asc Limit 1"; 157 | //$sql = "Update `user` Set `info_progress_id`='{$progress_id}' Order By `info_uptime` Asc Limit 1"; 158 | //db::query($sql); 159 | // 因为uptime在下面修改,所以这里还是正序 160 | //$sql = "Select `username` From `user` Where `info_progress_id`='{$progress_id}' Order By `info_uptime` Asc Limit 1"; 161 | //$row = db::get_one($sql); 162 | //if (!empty($row['username'])) 163 | $username = get_user_queue('info'); 164 | if (!empty($username)) 165 | { 166 | $username = addslashes($username); 167 | $worker->log("采集用户信息 --- " . $username . " --- 开始\n"); 168 | $data = get_user_info($username); 169 | if (!empty($data)) 170 | { 171 | $worker->log("采集用户信息 --- " . $username . " --- 成功\n"); 172 | // 更新采集时间, 让队列每次都取到不同的用户,形成采集死循环 173 | $data['info_uptime'] = $time; 174 | $data['info_progress_id'] = $progress_id; 175 | $data['last_message_week'] = empty($data['last_message_time']) ? 7 : intval(date("w")); 176 | $data['last_message_hour'] = empty($data['last_message_time']) ? 24 : intval(date("H")); 177 | $sql = db::update('user', $data, "`username`='{$username}'", true); 178 | db::query($sql); 179 | } 180 | else 181 | { 182 | $worker->log("采集用户信息 --- " . $username . 
" --- 失败\n"); 183 | // 更新采集时间, 让队列每次都取到不同的用户,形成采集死循环 184 | $sql = "Update `user` Set `info_uptime`='{$time}',`info_progress_id`='{$progress_id}' Where `username`='{$username}'"; 185 | db::query($sql); 186 | } 187 | } 188 | else 189 | { 190 | $worker->log("采集用户 --- 队列不存在"); 191 | } 192 | } 193 | 194 | /** 195 | * 获取用户采集队列 196 | * 197 | * @param string $key 198 | * @param int $count 199 | * @return void 200 | * @author seatle 201 | * @created time :2015-08-03 19:36 202 | */ 203 | function get_user_queue($key = 'list', $count = 10000) 204 | { 205 | // 如果队列为空, 从数据库取一些 206 | if (!cache::get_instance()->lsize($key)) 207 | { 208 | $sql = "Select `username`, `{$key}_uptime` From `user` Order By `{$key}_uptime` Asc Limit {$count}"; 209 | $rows = db::get_all($sql); 210 | foreach ($rows as $row) 211 | { 212 | //echo $row['username'] . " --- " . date("Y-m-d H:i:s", $row[$key.'_uptime']) . "\n"; 213 | cache::get_instance()->lpush($key, $row['username']); 214 | } 215 | } 216 | // 从队列中取出一条数据 217 | return cache::get_instance()->lpop($key); 218 | } 219 | 220 | /** 221 | * 保存用户索引 222 | * 223 | * @return void 224 | * @author seatle 225 | * @created time :2015-08-02 12:30 226 | */ 227 | function save_user_index($worker = null) 228 | { 229 | // 先给一条记录上锁, 采用队列之后就不需要了,这个多进程下还是有问题 230 | $progress_id = posix_getpid(); 231 | $time = time(); 232 | 233 | // 会和下面的更新采集时间发送死锁,因为Order By 会扫描整张表,虽然desc出来的rows为1,也不知道为什么 234 | //$sql = "Update `user` Set `index_progress_id`='{$progress_id}' Order By `index_uptime` Asc Limit 1"; 235 | // 效率太低 236 | //$sql = "Update `user` Set `index_progress_id`='15895' Where `index_uptime` = (Select Min(`index_uptime`) From (Select tmp.* From user tmp) a limit 1);"; 237 | // 语法错误 238 | //$sql = "Update `user` Set `index_progress_id`='{$progress_id}' Where `index_uptime` = (Select Min(`index_uptime`) From `user`)"; 239 | //db::query($sql); 240 | 241 | 242 | //$sql = "Select `username`, `depth` From `user` Where `index_progress_id`='{$progress_id}' Order By 
`index_uptime` Asc Limit 1"; 243 | //$row = db::get_one($sql); 244 | //if (!empty($row['username'])) 245 | $username = get_user_queue('index'); 246 | if (!empty($username)) 247 | { 248 | $username = addslashes($username); 249 | // 先把用户深度拿出来,下面要增加1给新用户 250 | $sql = "Select `depth` From `user` Where `username`='{$username}'"; 251 | $row = db::get_one($sql); 252 | $depth = $row['depth']; 253 | 254 | // 更新采集时间, 让队列每次都取到不同的用户 255 | $sql = "Update `user` Set `index_uptime`='{$time}',`index_progress_id`='{$progress_id}' Where `username`='{$username}'"; 256 | db::query($sql); 257 | 258 | $worker->log("采集用户列表 --- " . $username . " --- 开始"); 259 | // $user_rows = get_user_index($username); 260 | // $user_type followees 、followers 261 | // 获取关注了 262 | $followees_user = get_user_index($username, 'followees', $worker); 263 | $worker->log("采集用户列表 --- " . $username . " --- 关注了 --- 成功"); 264 | // 获取关注者 265 | $followers_user = get_user_index($username, 'followers', $worker); 266 | $worker->log("采集用户列表 --- " . $username . " --- 关注者 --- 成功"); 267 | // 合并 关注了 和 关注者 268 | $user_rows = array_merge($followers_user, $followees_user); 269 | 270 | if (!empty($user_rows)) 271 | { 272 | $worker->log("采集用户列表 --- " . $username . " --- 成功"); 273 | 274 | foreach ($user_rows as $user_row) 275 | { 276 | // 子用户 277 | $c_username = addslashes($user_row['username']); 278 | $sql = "Select Count(*) As count From `user` Where `username`='{$c_username}'"; 279 | $row = db::get_one($sql); 280 | // 如果用户不存在 281 | if (!$row['count']) 282 | { 283 | $user_row['depth'] = $depth+1; 284 | $user_row['parent_username'] = $username; 285 | $user_row['addtime'] = $user_row['index_uptime'] = $user_row['info_uptime'] = time(); 286 | if (db::insert('user', $user_row)) 287 | { 288 | $worker->log("入库用户 --- " . $c_username . " --- 成功"); 289 | } 290 | else 291 | { 292 | $worker->log("入库用户 --- " . $c_username . " --- 失败"); 293 | } 294 | } 295 | } 296 | } 297 | else 298 | { 299 | $worker->log("采集用户列表 --- " . $username . 
" --- 失败"); 300 | } 301 | } 302 | else 303 | { 304 | $worker->log("采集用户 --- 队列不存在"); 305 | } 306 | } 307 | 308 | /** 309 | * 获取用户 310 | * 311 | * @param string $username 312 | * @param string $user_type followees 、followers 313 | * @return void 314 | * @author seatle 315 | * @created time :2015-07-28 09:46 316 | */ 317 | function get_user_index($username, $user_type = 'followees', $worker) 318 | { 319 | $url = "http://www.zhihu.com/people/{$username}/{$user_type}"; 320 | set_cookie(); 321 | cls_curl::set_gzip(true); 322 | $content = cls_curl::get($url); 323 | 324 | if (empty($content)) 325 | { 326 | return array(); 327 | } 328 | 329 | $users = array(); 330 | 331 | // 用户不足20个的时候,从ajax取不到用户,所以首页这里还是要取一下 332 | preg_match_all('#

    (.*?)

    #', $content, $out); 333 | $count = count($out[1]); 334 | for ($i = 0; $i < $count; $i++) 335 | { 336 | $d_username = empty($out[1][$i]) ? '' : $out[1][$i]; 337 | $d_nickname = empty($out[2][$i]) ? '' : $out[2][$i]; 338 | if (!empty($d_username) && !empty($d_nickname)) 339 | { 340 | $users[$d_username] = array( 341 | 'username'=>$d_username, 342 | 'nickname'=>$d_nickname, 343 | ); 344 | } 345 | } 346 | 347 | $keyword = $user_type == 'followees' ? '关注了' : '关注者'; 348 | $worker->log("采集用户 --- " . $username . " --- {$keyword} --- 主页 --- 成功\n"); 349 | 350 | preg_match('#'.$keyword.'
    \s(.*?)#', $content, $out); 351 | $user_count = empty($out[1]) ? 0 : intval($out[1]); 352 | 353 | preg_match('##', $content, $out); 354 | $_xsrf = empty($out[1]) ? '' : trim($out[1]); 355 | 356 | preg_match('#
    #', $content, $out); 357 | $url_params = empty($out[1]) ? '' : json_decode(html_entity_decode($out[1]), true); 358 | if (!empty($_xsrf) && !empty($url_params) && is_array($url_params)) 359 | { 360 | $url = "http://www.zhihu.com/node/" . $url_params['nodename']; 361 | $params = $url_params['params']; 362 | 363 | $j = 1; 364 | for ($i = 0; $i < $user_count; $i=$i+20) 365 | { 366 | $params['offset'] = $i; 367 | $post_data = array( 368 | 'method'=>'next', 369 | 'params'=>json_encode($params), 370 | '_xsrf'=>$_xsrf, 371 | ); 372 | $content = cls_curl::post($url, $post_data); 373 | if (empty($content)) 374 | { 375 | $worker->log("采集用户 --- " . $username . " --- {$keyword} --- 第{$j}页 --- 失败\n"); 376 | continue; 377 | } 378 | $rows = json_decode($content, true); 379 | if (empty($rows['msg']) || !is_array($rows['msg'])) 380 | { 381 | $worker->log("采集用户 --- " . $username . " --- {$keyword} --- 第{$j}页 --- 失败\n"); 382 | continue; 383 | } 384 | $worker->log("采集用户 --- " . $username . " --- {$keyword} --- 第{$j}页 --- 成功\n"); 385 | 386 | foreach ($rows['msg'] as $row) 387 | { 388 | preg_match_all('#

    (.*?)

    #', $row, $out); 389 | $d_username = empty($out[1][0]) ? '' : $out[1][0]; 390 | $d_nickname = empty($out[2][0]) ? '' : $out[2][0]; 391 | if (!empty($d_username) && !empty($d_nickname)) 392 | { 393 | $users[$d_username] = array( 394 | 'username'=>$d_username, 395 | 'nickname'=>$d_nickname, 396 | ); 397 | } 398 | } 399 | $j++; 400 | } 401 | } 402 | return $users; 403 | } 404 | 405 | -------------------------------------------------------------------------------- /user.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE `user` ( 2 | `id` int(11) unsigned NOT NULL AUTO_INCREMENT COMMENT 'ID', 3 | `username` varchar(60) NOT NULL DEFAULT '' COMMENT '用户名', 4 | `parent_username` varchar(60) DEFAULT NULL COMMENT '来自那个用户', 5 | `depth` int(11) DEFAULT '0' COMMENT '深度', 6 | `list_progress_id` int(11) DEFAULT '0' COMMENT '采集用户索引进程ID', 7 | `list_uptime` int(10) DEFAULT NULL COMMENT '抓取索引时间', 8 | `list_server_id` tinyint(1) DEFAULT '0', 9 | `info_progress_id` int(11) DEFAULT '0' COMMENT '采集用户信息进程ID', 10 | `info_uptime` int(10) DEFAULT NULL COMMENT '抓取信息时间', 11 | `info_server_id` tinyint(1) DEFAULT '0', 12 | `nickname` varchar(32) NOT NULL COMMENT '真实姓名', 13 | `headimg` varchar(248) DEFAULT NULL COMMENT '头像', 14 | `location` varchar(32) DEFAULT NULL COMMENT '居住地', 15 | `business` varchar(32) DEFAULT NULL COMMENT '所在行业', 16 | `gender` tinyint(1) DEFAULT '2' COMMENT '性别 0:女 1:男 2:其他 ', 17 | `employment` varchar(32) DEFAULT NULL COMMENT '公司或组织名称', 18 | `position` varchar(32) DEFAULT NULL COMMENT '职位', 19 | `education` varchar(32) DEFAULT NULL COMMENT '学校或教育机构名', 20 | `education_extra` int(11) DEFAULT NULL COMMENT '专业方向', 21 | `weibo` varchar(248) DEFAULT NULL COMMENT '微博地址', 22 | `headline` varchar(248) DEFAULT NULL COMMENT '一句话介绍', 23 | `description` varchar(248) DEFAULT NULL COMMENT '个人简介', 24 | `last_message` varchar(248) DEFAULT NULL COMMENT '最后信息', 25 | `last_message_time` int(10) DEFAULT '0' COMMENT '最后信息时间', 26 | 
`last_message_week` tinyint(1) DEFAULT '7' COMMENT '最后信息星期,最大6,这里7做排除', 27 | `last_message_hour` tinyint(3) DEFAULT '24' COMMENT '最后信息小时,最大23,这里24做排除', 28 | `followees` int(11) DEFAULT '0' COMMENT '关注了', 29 | `followers` int(11) DEFAULT '0' COMMENT '关注者', 30 | `followed` int(11) DEFAULT '0' COMMENT '关注专栏', 31 | `topics` int(11) DEFAULT '0' COMMENT '关注话题', 32 | `pv` int(11) DEFAULT '0' COMMENT '主页被多少人浏览', 33 | `asks` int(11) DEFAULT '0' COMMENT '提问', 34 | `answers` int(11) DEFAULT '0' COMMENT '回答', 35 | `posts` int(11) DEFAULT '0' COMMENT '专栏文章', 36 | `collections` int(11) DEFAULT '0' COMMENT '收藏了', 37 | `logs` int(11) DEFAULT '0' COMMENT '公共编辑', 38 | `votes` int(11) DEFAULT '0' COMMENT '赞同', 39 | `thanks` int(11) DEFAULT '0' COMMENT '感谢', 40 | `favs` int(11) DEFAULT '0' COMMENT '收藏者', 41 | `shares` int(11) DEFAULT '0' COMMENT '分享', 42 | `birthday` date DEFAULT NULL COMMENT '生日', 43 | `status` tinyint(1) DEFAULT '0' COMMENT '状态', 44 | `addtime` int(10) DEFAULT NULL COMMENT '添加时间', 45 | PRIMARY KEY (`id`), 46 | UNIQUE KEY `username` (`username`), 47 | KEY `location` (`location`), 48 | KEY `gender` (`gender`), 49 | KEY `business` (`business`), 50 | KEY `info_progress_id` (`info_progress_id`), 51 | KEY `education` (`education`), 52 | KEY `uptime` (`info_uptime`), 53 | KEY `index_time` (`list_uptime`), 54 | KEY `index_progress_id` (`list_progress_id`), 55 | KEY `last_message_week` (`last_message_week`), 56 | KEY `last_message_hour` (`last_message_hour`), 57 | KEY `list_server_id` (`list_server_id`), 58 | KEY `info_server_id` (`info_server_id`) 59 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 60 |
--------------------------------------------------------------------------------
/user_index.php:
--------------------------------------------------------------------------------
// NOTE(review): the opening of user_index.php appears to have been lost in
// extraction (the dump numbering jumps from line 1 to line 17). Only the
// fragment below survives; it is kept verbatim rather than reconstructed.
';print_r($data);echo '';
//exit("\n");

// Spawn 8 worker processes; each one calls save_user_index() $count times.
$w = new worker();
$w->count = 8;
$w->is_once = true;

$count = 100000; // number of iterations per worker process
$w->on_worker_start = function($worker) use ($count) {

    for ($i = 0; $i < $count; $i++)
    {
        save_user_index($worker);
    }
};

$w->run();
--------------------------------------------------------------------------------
/user_info.php:
--------------------------------------------------------------------------------
// NOTE(review): the opening of user_info.php appears to have been lost in
// extraction (the dump numbering jumps from line 1 to line 13). The fragment
// below is kept verbatim; presumably it was `$curl->set_cookie($cookie);`
// as in user_info_progress.php -- confirm against the real file.
set_cookie($cookie);
$curl->set_gzip(true);

/**
 * Per-response callback: stores the scraped profile data for one user.
 *
 * @param mixed $response Response body; empty when the request failed.
 * @param mixed $info     Unused here.
 * @param array $request  Request description; only $request['url'] is read.
 * @param mixed $error    Error details, JSON-encoded into the timeout log.
 */
$curl->callback = function($response, $info, $request, $error) {

    // Recover the username from the request URL. The queueing loop at the
    // bottom of this script builds these URLs from addslashes()-escaped
    // usernames, which is why the value is interpolated into SQL as-is.
    if (!preg_match("@http://www.zhihu.com/people/(.*?)/@i", $request['url'], $out) || $out[1] === '')
    {
        // Without a username there is no row to update; bail out instead of
        // running UPDATE ... WHERE `username`=''.
        return;
    }
    $username = $out[1];
    $where = "`username`='{$username}'";

    // The about page is requested as ".../people/{username}/about". Testing the
    // suffix (rather than searching the whole URL for "about") keeps the plain
    // profile page of a user whose name contains "about" from being misrouted.
    $is_about = substr($request['url'], -6) === '/about';

    // Always bump the crawl bookkeeping, even on failure, so the queue hands
    // out a different user next time and the crawl keeps cycling.
    $server_data = array(
        'info_uptime'      => time(),
        'info_progress_id' => posix_getpid(),
        'info_server_id'   => 1,
    );

    // Record a failed fetch/parse: bump the bookkeeping columns, then append
    // one line to the given log file.
    $mark_failed = function($log_file, $message) use ($server_data, $where) {
        db::update('user', $server_data, $where);
        file_put_contents($log_file, date("Y-m-d H:i:s") . ' ' . $message, FILE_APPEND);
    };

    if (empty($response))
    {
        $mark_failed("./data/error_timeout.log", $username.' --- '.json_encode($error)."\n");
        // Do not exit() here: that would terminate the whole program.
        return;
    }

    // Response for the "about" page.
    if ($is_about)
    {
        $data = get_user_about($response);
        if (empty($data))
        {
            $mark_failed("./data/error_emptydata.log", $username." about data not exists --- \n");
            return;
        }

        db::update('user', array_merge($data, $server_data), $where);
        return;
    }

    // Response for the main profile page.
    $data = get_user($response);
    if (empty($data))
    {
        $mark_failed("./data/error_emptydata.log", $username." info data not exists --- \n");
        return;
    }

    // 7 and 24 are out-of-range sentinels (weekday max is 6, hour max is 23)
    // used when the user has no last-message timestamp.
    $has_last_message = !empty($data['last_message_time']);
    $data['last_message_week'] = $has_last_message ? intval(date("w", $data['last_message_time'])) : 7;
    $data['last_message_hour'] = $has_last_message ? intval(date("H", $data['last_message_time'])) : 24;
    db::update('user', array_merge($data, $server_data), $where);
};

// Queue up to 10 users (two requests each) per batch, then run the batch.
for ($j = 0; $j < 1; $j++)
{
    $queued = 0;
    for ($i = 0; $i < 10; $i++)
    {
        $username = get_user_queue('info');
        // Same guard as user_info_progress.php: an empty queue must not turn
        // into requests for ".../people//about".
        if (empty($username))
        {
            break;
        }
        $username = addslashes($username);
        $curl->get("http://www.zhihu.com/people/{$username}/about");
        $curl->get("http://www.zhihu.com/people/{$username}/");
        $queued++;
    }

    if ($queued > 0)
    {
        $curl->execute();
    }
    // Sleep 100 ms; going faster risks being treated as a DDoS.
    usleep(100000);
}
--------------------------------------------------------------------------------
/user_info_progress.php:
--------------------------------------------------------------------------------
// NOTE(review): the opening of user_info_progress.php appears to have been lost
// in extraction (the dump numbering jumps from line 1 to line 12). The fragment
// below is kept verbatim; presumably it was `$w->count = 10;` -- confirm.
count = 10;
$w->is_once = true;
$w->log_show = false;

$count = 1; // number of iterations per worker process

/**
 * Worker entry point: pulls users off the 'info' queue and stores their
 * profile page and about page data.
 *
 * @param object $worker Worker instance; only its log() method is used.
 */
$w->on_worker_start = function($worker) use ($count) {

    $cookie = trim(file_get_contents("cookie.txt"));
    $curl = new rolling_curl();
    $curl->set_cookie($cookie);
    $curl->set_gzip(true);

    // Crawl bookkeeping written with every update so the queue hands out a
    // different user next time and the crawl keeps cycling.
    $server_data = array(
        'info_uptime'      => time(),
        'info_progress_id' => posix_getpid(),
        'info_server_id'   => 2,
    );

    // Record a failed fetch/parse: append one line to the log file, then bump
    // the bookkeeping columns for that user.
    $mark_failed = function($username, array $server_data, $log_file, $suffix) {
        file_put_contents($log_file, date("Y-m-d H:i:s") . ' ' . $username . $suffix, FILE_APPEND);
        db::update('user', $server_data, "`username`='{$username}'");
    };

    for ($i = 0; $i < $count; $i++)
    {
        $username = get_user_queue('info');
        if (empty($username))
        {
            return;
        }
        $username = addslashes($username);
        $where = "`username`='{$username}'";
        // Refresh the timestamp per user so later iterations are not stamped
        // with the time the worker started.
        $server_data['info_uptime'] = time();
        $worker->log("采集用户信息 --- " . $username . " --- 开始\n");

        // Profile page: last-message time and content ========================
        $curl->get("http://www.zhihu.com/people/{$username}/");
        $content = $curl->execute();

        // NOTE(review): failures below return, ending this worker's run rather
        // than moving on to the next user. Identical to `continue` while
        // $count is 1 -- confirm the intent before raising $count.
        if (empty($content))
        {
            $mark_failed($username, $server_data, "./data/error_timeout.log", "\n");
            return;
        }

        $data = get_user($content);
        if (empty($data))
        {
            $mark_failed($username, $server_data, "./data/error_emptydata.log", " info data not exists --- \n");
            return;
        }

        // 7 and 24 are out-of-range sentinels (weekday max is 6, hour max is
        // 23) used when the user has no last-message timestamp.
        $has_last_message = !empty($data['last_message_time']);
        $data['last_message_week'] = $has_last_message ? intval(date("w", $data['last_message_time'])) : 7;
        $data['last_message_hour'] = $has_last_message ? intval(date("H", $data['last_message_time'])) : 24;
        db::update('user', array_merge($data, $server_data), $where);

        // About page: detailed user information ===============================
        $curl->get("http://www.zhihu.com/people/{$username}/about");
        $content = $curl->execute();

        if (empty($content))
        {
            $mark_failed($username, $server_data, "./data/error_timeout.log", "\n");
            return;
        }

        $data = get_user_about($content);
        if (empty($data))
        {
            $mark_failed($username, $server_data, "./data/error_emptydata.log", " about data not exists --- \n");
            return;
        }

        db::update('user', array_merge($data, $server_data), $where);
    }

};
// NOTE(review): no `$w->run();` call is visible after this closure, unlike
// user_index.php -- confirm whether the workers are started elsewhere.
--------------------------------------------------------------------------------