├── .gitignore ├── README.md ├── autoloader.php ├── composer.json ├── core ├── cache.php ├── constants.php ├── db.php ├── init.php ├── log.php ├── phpspider.bak20170807.php ├── phpspider.php ├── queue.php ├── requests.php ├── selector.php ├── util.php └── worker.php ├── demo ├── 13384.php ├── 52mnw.php ├── mafengwo.php ├── mafengwo.sql ├── qiushibaike.php ├── qiushibaike.sql ├── qiushibaike_css_selector.php ├── qiushibaike_task.php └── test_requests.php ├── gitadd.sh ├── hacked-emails ├── banners.txt └── hacked_emails.php ├── library ├── cls_curl.php ├── cls_query.php ├── cls_redis.php ├── cls_redis_client.php ├── cls_redis_server.php ├── phpquery.php └── rolling_curl.php ├── test.go ├── test.php └── worker.php /.gitignore: -------------------------------------------------------------------------------- 1 | .* 2 | /vendor/ 3 | composer.lock 4 | data/ 5 | demo/ 6 | cache/ 7 | client.php 8 | *.bak.php 9 | 10 | 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # phpspider -- PHP蜘蛛爬虫框架 2 | 《我用爬虫一天时间“偷了”知乎一百万用户,只为证明PHP是世界上最好的语言 》所使用的程序 3 | 4 | phpspider是一个爬虫开发框架。使用本框架,你不用了解爬虫的底层技术实现,爬虫被网站屏蔽、有些网站需要登录或验证码识别才能爬取等问题。简单几行PHP代码,就可以创建自己的爬虫,利用框架封装的多进程Worker类库,代码更简洁,执行效率更高速度更快。 5 | 6 | demo目录下有一些特定网站的爬取规则,只要你安装了PHP环境,代码就可以在命令行下直接跑。 对爬虫感兴趣的开发者可以加QQ群一起讨论:147824717。 7 | 8 | 下面以糗事百科为例, 来看一下我们的爬虫长什么样子: 9 | 10 | ``` 11 | $configs = array( 12 | 'name' => '糗事百科', 13 | 'domains' => array( 14 | 'qiushibaike.com', 15 | 'www.qiushibaike.com' 16 | ), 17 | 'scan_urls' => array( 18 | 'http://www.qiushibaike.com/' 19 | ), 20 | 'content_url_regexes' => array( 21 | "http://www.qiushibaike.com/article/\d+" 22 | ), 23 | 'list_url_regexes' => array( 24 | "http://www.qiushibaike.com/8hr/page/\d+\?s=\d+" 25 | ), 26 | 'fields' => array( 27 | array( 28 | // 抽取内容页的文章内容 29 | 'name' => "article_content", 30 | 'selector' => "//*[@id='single-next-link']", 31 | 'required' => 
true 32 | ), 33 | array( 34 | // 抽取内容页的文章作者 35 | 'name' => "article_author", 36 | 'selector' => "//div[contains(@class,'author')]//h2", 37 | 'required' => true 38 | ), 39 | ), 40 | ); 41 | $spider = new phpspider($configs); 42 | $spider->start(); 43 | ``` 44 | 爬虫的整体框架就是这样, 首先定义了一个$configs数组, 里面设置了待爬网站的一些信息, 然后通过调用```$spider = new phpspider($configs);```和```$spider->start();```来配置并启动爬虫. 45 | 46 | #### 运行界面如下: 47 | 48 | ![](http://www.epooll.com/zhihu/pachong.gif) 49 | 50 | 更多详细内容,移步到: 51 | 52 | [开发文档](http://doc.phpspider.org) 53 | -------------------------------------------------------------------------------- /autoloader.php: -------------------------------------------------------------------------------- 1 | 10 | * @copyright seatle 11 | * @link http://www.phpspider.org/ 12 | * @license http://www.opensource.org/licenses/mit-license.php MIT License 13 | */
namespace phpspider;

/**
 * Namespace-to-file class loader.
 *
 * Classes under the `phpspider\` namespace are looked up relative to this
 * file's directory. Any other class is looked up under the configured root
 * path first and, failing that, relative to this directory's parent.
 */
class autoloader
{
    /**
     * Root directory searched for classes outside the phpspider namespace.
     *
     * @var string
     */
    protected static $_autoload_root_path = '';

    /**
     * Set the root directory used for classes outside the phpspider namespace.
     *
     * @param string $root_path
     * @return void
     */
    public static function set_root_path($root_path)
    {
        self::$_autoload_root_path = $root_path;
    }

    /**
     * Locate and load the file that should declare class $name.
     *
     * @param string $name Fully qualified class name.
     * @return boolean True when a file was loaded and now declares $name.
     */
    public static function load_by_namespace($name)
    {
        $relative = str_replace('\\', DIRECTORY_SEPARATOR, $name);

        if (strpos($name, 'phpspider\\') === 0)
        {
            // Drop the leading "phpspider" segment; the rest maps onto this directory.
            $file = __DIR__ . substr($relative, strlen('phpspider')) . '.php';
        }
        else
        {
            $file = null;
            if (self::$_autoload_root_path)
            {
                $file = self::$_autoload_root_path . DIRECTORY_SEPARATOR . $relative . '.php';
            }
            // No root path configured, or nothing found there: try the parent directory.
            if (empty($file) || !is_file($file))
            {
                $file = __DIR__ . DIRECTORY_SEPARATOR . '..' . DIRECTORY_SEPARATOR . "$relative.php";
            }
        }

        if (!is_file($file))
        {
            return false;
        }

        require_once($file);

        // The second argument keeps class_exists() from re-entering the autoloader.
        return class_exists($name, false);
    }
}

spl_autoload_register('\phpspider\autoloader::load_by_namespace', true, true);
-------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "owner888/phpspider", 3 | "type": "library", 4 | "keywords": [ 5 | "framework", 6 | "phpspider" 7 | ], 8 | "homepage": "http://www.phpspider.org", 9 | "license": "MIT", 10 | "description": "The PHPSpider Framework.", 11 | "authors": [ 12 | { 13 | "name": "Seatle Yang", 14 | "email": "seatle@foxmail.com", 15 | "homepage": "http://www.phpspider.org", 16 | "role": "Developer" 17 | } 18 | ], 19 | "support": { 20 | "email": "seatle@foxmail.com", 21 | "issues": "https://github.com/owner888/phpspider/issues", 22 | "forum": "http://wenda.phpspider.org/", 23 | "wiki": "http://doc.phpspider.org/", 24 | "source": "https://github.com/owner888/phpspider" 25 | }, 26 | "require": { 27 | "php": ">=5.5.0" 28 | }, 29 | "suggest": { 30 | "ext-pcntl、ext-redis": "For better performance. 
" 31 | }, 32 | "autoload": { 33 | "psr-4": { 34 | "phpspider\\": "./" 35 | } 36 | }, 37 | "minimum-stability": "dev" 38 | } 39 | -------------------------------------------------------------------------------- /core/cache.php: -------------------------------------------------------------------------------- 1 | 10 | // +---------------------------------------------------------------------- 11 | 12 | //---------------------------------- 13 | // PHPSpider缓存类文件 14 | //----------------------------------

/**
 * Factory for the Redis connection used as the spider's cache.
 *
 * No instance is memoised: per the original author's note, a singleton
 * cannot be used when running multiple processes.
 */
class cache
{
    //protected static $_instance;

    /**
     * Open and configure a new Redis connection from $GLOBALS['config']['redis'].
     *
     * Reads the keys host, port, timeout (optional), pass (optional) and prefix.
     *
     * @return Redis|null The connected client, or null when the extension is
     *                    missing, the connection fails or authentication fails.
     * @author seatle
     * @created time :2016-04-10 22:55
     */
    public static function init()
    {
        if (!extension_loaded('Redis'))
        {
            $errmsg = "extension redis is not installed";
            log::add($errmsg, "Error");
            return null;
        }

        $redis_config = $GLOBALS['config']['redis'];
        $timeout = isset($redis_config['timeout']) ? $redis_config['timeout'] : 0;

        $_instance = new Redis();

        // pconnect must not be used here, it fails with:
        // Uncaught exception 'RedisException' with message 'read error on connection'
        try
        {
            $connected = $_instance->connect($redis_config['host'], $redis_config['port'], $timeout);
            $reason = '';
        }
        catch (Exception $e)
        {
            $connected = false;
            $reason = $e->getMessage();
        }
        // A failed connect used to be ignored, so the error only surfaced on the first command.
        if (!$connected)
        {
            $errmsg = "Redis Server connect failed!! " . $reason;
            log::add($errmsg, "Error");
            return null;
        }

        // Authenticate only when a password is configured.
        if (!empty($redis_config['pass']))
        {
            if ( !$_instance->auth($redis_config['pass']) )
            {
                $errmsg = "Redis Server authentication failed!!";
                log::add($errmsg, "Error");
                return null;
            }
        }

        // Original author's note: values are JSON-encoded by the caller instead of
        // using a Redis serializer, so that other languages can read them.
        //$_instance->setOption(Redis::OPT_SERIALIZER, Redis::SERIALIZER_NONE);     // don't serialize data
        //$_instance->setOption(Redis::OPT_SERIALIZER, Redis::SERIALIZER_PHP);      // use built-in serialize/unserialize
        //$_instance->setOption(Redis::OPT_SERIALIZER, Redis::SERIALIZER_IGBINARY); // use igBinary serialize/unserialize

        $_instance->setOption(Redis::OPT_PREFIX, $redis_config['prefix'] . ":");

        return $_instance;
    }
}
-------------------------------------------------------------------------------- /core/constants.php: -------------------------------------------------------------------------------- 1 | 10 | // +---------------------------------------------------------------------- 11 | 12 | //---------------------------------- 13 | // PHPSpider公共入口文件 14 | //---------------------------------- 15 | 16 | //namespace phpspider\core;

// Display errors.
ini_set('display_errors', 'on');
// Reporting all.
error_reporting(E_ALL);

// Never time out.
ini_set('max_execution_time', 0);
set_time_limit(0);

// Raise the memory limit to 1024M, but only when the current limit is really
// lower. The value is converted to bytes first: a plain intval() reads "-1"
// (unlimited) and "2G" as small numbers and would lower the limit.
$phpspider_memory_limit = trim((string) ini_get('memory_limit'));
if ($phpspider_memory_limit !== '' && $phpspider_memory_limit !== '-1')
{
    $phpspider_memory_bytes = (float) $phpspider_memory_limit;
    switch (strtolower(substr($phpspider_memory_limit, -1)))
    {
        // Intentional fall-through: each suffix multiplies by a further 1024.
        case 'g':
            $phpspider_memory_bytes *= 1024;
        case 'm':
            $phpspider_memory_bytes *= 1024;
        case 'k':
            $phpspider_memory_bytes *= 1024;
    }
    if ($phpspider_memory_bytes < 1024 * 1024 * 1024)
    {
        ini_set('memory_limit', '1024M');
    }
    unset($phpspider_memory_bytes);
}
unset($phpspider_memory_limit);

if( PHP_SAPI != 'cli' )
{
    exit("You must run the CLI environment\n");
}

// Date.timezone
if (!ini_get('date.timezone'))
{
    date_default_timezone_set('Asia/Shanghai');
}

// Core library directories. Guarded because init.php defines the same names.
defined('CORE') || define('CORE', dirname(__FILE__));
defined('PATH_ROOT') || define('PATH_ROOT', CORE."/../");
defined('PATH_DATA') || define('PATH_DATA', CORE."/../data");
defined('PATH_LIBRARY') || define('PATH_LIBRARY', CORE."/../library");

// System configuration (disabled by the original author).
//if( file_exists( PATH_ROOT."/config/inc_config.php" ) )
//{
    //require PATH_ROOT."/config/inc_config.php";
//}
-------------------------------------------------------------------------------- /core/db.php: -------------------------------------------------------------------------------- 1 | 10 | // +---------------------------------------------------------------------- 11 | 12 | //---------------------------------- 13 | // PHPSpider数据库类文件 14 | //---------------------------------- 15 | 16 | namespace phpspider\core; 17 | 18 | class db 19 | { 20 | private static $configs = array(); 21 | private static $rsid; 22 | private static $links = array(); 23 | private static $link_name = 'default'; 
24 | private static $autocommiting = false; 25 | 26 | public static function _init() 27 | { 28 | // 获取配置 29 | $config = self::$link_name == 'default' ? self::_get_default_config() : self::$configs[self::$link_name]; 30 | 31 | // 创建连接 32 | if (empty(self::$links[self::$link_name]) || empty(self::$links[self::$link_name]['conn'])) 33 | { 34 | // 第一次连接,初始化fail和pid 35 | if (empty(self::$links[self::$link_name])) 36 | { 37 | self::$links[self::$link_name]['fail'] = 0; 38 | self::$links[self::$link_name]['pid'] = function_exists('posix_getpid') ? posix_getpid() : 0; 39 | //echo "progress[".self::$links[self::$link_name]['pid']."] create db connect[".self::$link_name."]\n"; 40 | } 41 | self::$links[self::$link_name]['conn'] = mysqli_connect($config['host'], $config['user'], $config['pass'], $config['name'], $config['port']); 42 | if(mysqli_connect_errno()) 43 | { 44 | self::$links[self::$link_name]['fail']++; 45 | $errmsg = 'Mysql Connect failed['.self::$links[self::$link_name]['fail'].']: ' . mysqli_connect_error(); 46 | echo util::colorize(date("H:i:s") . " {$errmsg}\n\n", 'fail'); 47 | log::add($errmsg, "Error"); 48 | // 连接失败5次,中断进程 49 | if (self::$links[self::$link_name]['fail'] >= 5) 50 | { 51 | exit(250); 52 | } 53 | self::_init($config); 54 | } 55 | else 56 | { 57 | mysqli_query(self::$links[self::$link_name]['conn'], " SET character_set_connection=utf8, character_set_results=utf8, character_set_client=binary, sql_mode='' "); 58 | } 59 | } 60 | else 61 | { 62 | $curr_pid = function_exists('posix_getpid') ? posix_getpid() : 0; 63 | // 如果父进程已经生成资源就释放重新生成,因为多进程不能共享连接资源 64 | if (self::$links[self::$link_name]['pid'] != $curr_pid) 65 | { 66 | self::clear_link(); 67 | } 68 | } 69 | } 70 | 71 | /** 72 | * 重新设置连接 73 | * 传空的话就等于关闭数据库再连接 74 | * 在多进程环境下如果主进程已经调用过了,子进程一定要调用一次 clear_link,否则会报错: 75 | * Error while reading greeting packet. 
PID=19615,这是两个进程互抢一个连接句柄引起的 76 | * 77 | * @param array $config 78 | * @return void 79 | * @author seatle 80 | * @created time :2016-03-29 00:51 81 | */ 82 | public static function clear_link() 83 | { 84 | if(self::$links) 85 | { 86 | foreach(self::$links as $k=>$v) 87 | { 88 | @mysqli_close($v['conn']); 89 | unset(self::$links[$k]); 90 | } 91 | } 92 | // 注意,只会连接最后一个,不过貌似也够用了啊 93 | self::_init(); 94 | } 95 | 96 | /** 97 | * 改变链接为指定配置的链接(如果不同时使用多个数据库,不会涉及这个操作) 98 | * @parem $link_name 链接标识名 99 | * @parem $config 多次使用时, 这个数组只需传递一次 100 | * config 格式与 $GLOBALS['config']['db'] 一致 101 | * @return void 102 | */ 103 | public static function set_connect($link_name, $config = array()) 104 | { 105 | self::$link_name = $link_name; 106 | if (!empty($config)) 107 | { 108 | self::$configs[self::$link_name] = $config; 109 | } 110 | else 111 | { 112 | if (empty(self::$configs[self::$link_name])) 113 | { 114 | throw new Exception("You not set a config array for connect!"); 115 | } 116 | } 117 | } 118 | 119 | 120 | /** 121 | * 还原为默认连接(如果不同时使用多个数据库,不会涉及这个操作) 122 | * @parem $config 指定配置(默认使用inc_config.php的配置) 123 | * @return void 124 | */ 125 | public static function set_connect_default() 126 | { 127 | $config = self::_get_default_config(); 128 | self::set_connect('default', $config); 129 | } 130 | 131 | 132 | /** 133 | * 获取默认配置 134 | */ 135 | protected static function _get_default_config() 136 | { 137 | if (empty(self::$configs['default'])) 138 | { 139 | if (!is_array($GLOBALS['config']['db'])) 140 | { 141 | exit('db.php _get_default_config()' . '没有mysql配置'); 142 | } 143 | self::$configs['default'] = $GLOBALS['config']['db']; 144 | } 145 | return self::$configs['default']; 146 | } 147 | 148 | /** 149 | * 返回查询游标 150 | * @return rsid 151 | */ 152 | protected static function _get_rsid($rsid = '') 153 | { 154 | return $rsid == '' ? 
self::$rsid : $rsid; 155 | } 156 | 157 | public static function autocommit($mode = false) 158 | { 159 | if ( self::$autocommiting ) 160 | { 161 | return true; 162 | } 163 | 164 | self::$autocommiting = true; 165 | 166 | self::_init(); 167 | return mysqli_autocommit(self::$links[self::$link_name]['conn'], $mode); 168 | } 169 | 170 | public static function begin_tran() 171 | { 172 | return self::autocommit(false); 173 | } 174 | 175 | public static function commit() 176 | { 177 | mysqli_commit(self::$links[self::$link_name]['conn']); 178 | self::autocommit(true); 179 | return true; 180 | } 181 | 182 | 183 | public static function rollback() 184 | { 185 | mysqli_rollback(self::$links[self::$link_name]['conn']); 186 | self::autocommit(true); 187 | return true; 188 | } 189 | 190 | public static function query($sql) 191 | { 192 | $sql = trim($sql); 193 | 194 | // 初始化数据库 195 | self::_init(); 196 | self::$rsid = @mysqli_query(self::$links[self::$link_name]['conn'], $sql); 197 | 198 | if (self::$rsid === false) 199 | { 200 | // 不要每次都ping,浪费流量浪费性能,执行出错了才重新连接 201 | $errno = mysqli_errno(self::$links[self::$link_name]['conn']); 202 | if ($errno == 2013 || $errno == 2006) 203 | { 204 | $errmsg = mysqli_error(self::$links[self::$link_name]['conn']); 205 | log::add($errmsg, "Error"); 206 | 207 | @mysqli_close(self::$links[self::$link_name]['conn']); 208 | self::$links[self::$link_name]['conn'] = null; 209 | return self::query($sql); 210 | } 211 | 212 | $errmsg = "Query SQL: ".$sql; 213 | log::add($errmsg, "Warning"); 214 | $errmsg = "Error SQL: ".mysqli_error(self::$links[self::$link_name]['conn']); 215 | log::add($errmsg, "Warning"); 216 | 217 | $backtrace = debug_backtrace(); 218 | array_shift($backtrace); 219 | $narr = array('class', 'type', 'function', 'file', 'line'); 220 | $err = "debug_backtrace:\n"; 221 | foreach($backtrace as $i => $l) 222 | { 223 | foreach($narr as $k) 224 | { 225 | if( !isset($l[$k]) ) 226 | { 227 | $l[$k] = ''; 228 | } 229 | } 230 | $err .= "[$i] in 
function {$l['class']}{$l['type']}{$l['function']} "; 231 | if($l['file']) $err .= " in {$l['file']} "; 232 | if($l['line']) $err .= " on line {$l['line']} "; 233 | $err .= "\n"; 234 | } 235 | log::add($err); 236 | 237 | return false; 238 | } 239 | else 240 | { 241 | return self::$rsid; 242 | } 243 | } 244 | 245 | public static function fetch($rsid = '') 246 | { 247 | $rsid = self::_get_rsid($rsid); 248 | $row = mysqli_fetch_array($rsid, MYSQLI_ASSOC); 249 | return $row; 250 | } 251 | 252 | public static function get_one($sql) 253 | { 254 | if (!preg_match("/limit/i", $sql)) 255 | { 256 | $sql = preg_replace("/[,;]$/i", '', trim($sql)) . " limit 1 "; 257 | } 258 | $rsid = self::query($sql); 259 | if ($rsid === false) 260 | { 261 | return array(); 262 | } 263 | $row = self::fetch($rsid); 264 | self::free($rsid); 265 | return $row; 266 | } 267 | 268 | public static function get_all($sql) 269 | { 270 | $rsid = self::query($sql); 271 | if ($rsid === false) 272 | { 273 | return array(); 274 | } 275 | while ( $row = self::fetch($rsid) ) 276 | { 277 | $rows[] = $row; 278 | } 279 | self::free($rsid); 280 | return empty($rows) ? false : $rows; 281 | } 282 | 283 | public static function free($rsid) 284 | { 285 | return mysqli_free_result($rsid); 286 | } 287 | 288 | public static function insert_id() 289 | { 290 | return mysqli_insert_id(self::$links[self::$link_name]['conn']); 291 | } 292 | 293 | public static function affected_rows() 294 | { 295 | return mysqli_affected_rows(self::$links[self::$link_name]['conn']); 296 | } 297 | 298 | public static function insert($table = '', $data = null, $return_sql = false) 299 | { 300 | $items_sql = $values_sql = ""; 301 | foreach ($data as $k => $v) 302 | { 303 | $v = stripslashes($v); 304 | $v = addslashes($v); 305 | $items_sql .= "`$k`,"; 306 | $values_sql .= "\"$v\","; 307 | } 308 | $sql = "Insert Ignore Into `{$table}` (" . substr($items_sql, 0, -1) . ") Values (" . substr($values_sql, 0, -1) . 
")"; 309 | if ($return_sql) 310 | { 311 | return $sql; 312 | } 313 | else 314 | { 315 | if (self::query($sql)) 316 | { 317 | return mysqli_insert_id(self::$links[self::$link_name]['conn']); 318 | } 319 | else 320 | { 321 | return false; 322 | } 323 | } 324 | } 325 | 326 | public static function insert_batch($table = '', $set = NULL, $return_sql = FALSE) 327 | { 328 | if (empty($table) || empty($set)) 329 | { 330 | return false; 331 | } 332 | $set = self::strsafe($set); 333 | $fields = self::get_fields($table); 334 | 335 | $keys_sql = $vals_sql = array(); 336 | foreach ($set as $i=>$val) 337 | { 338 | ksort($val); 339 | $vals = array(); 340 | foreach ($val as $k => $v) 341 | { 342 | // 过滤掉数据库没有的字段 343 | if (!in_array($k, $fields)) 344 | { 345 | continue; 346 | } 347 | // 如果是第一个数组,把key当做插入条件 348 | if ($i == 0 && $k == 0) 349 | { 350 | $keys_sql[] = "`$k`"; 351 | } 352 | $vals[] = "\"$v\""; 353 | } 354 | $vals_sql[] = implode(",", $vals); 355 | } 356 | 357 | $sql = "Insert Ignore Into `{$table}`(".implode(", ", $keys_sql).") Values (".implode("), (", $vals_sql).")"; 358 | 359 | if ($return_sql) return $sql; 360 | 361 | $rt = self::query($sql); 362 | $insert_id = self::insert_id(); 363 | $return = empty($insert_id) ? 
$rt : $insert_id; 364 | return $return; 365 | } 366 | 367 | public static function update_batch($table = '', $set = NULL, $index = NULL, $where = NULL, $return_sql = FALSE) 368 | { 369 | if (empty($table) || is_null($set) || is_null($index)) 370 | { 371 | // 不要用exit,会中断程序 372 | return false; 373 | } 374 | $set = self::strsafe($set); 375 | $fields = self::get_fields($table); 376 | 377 | $ids = array(); 378 | foreach ($set as $val) 379 | { 380 | ksort($val); 381 | // 去重,其实不去也可以,因为相同的when只会执行第一个,后面的就直接跳过不执行了 382 | $key = md5($val[$index]); 383 | $ids[$key] = $val[$index]; 384 | 385 | foreach (array_keys($val) as $field) 386 | { 387 | if ($field != $index) 388 | { 389 | $final[$field][$key] = 'When `'.$index.'` = "'.$val[$index].'" Then "'.$val[$field].'"'; 390 | } 391 | } 392 | } 393 | //$ids = array_values($ids); 394 | 395 | // 如果不是数组而且不为空,就转数组 396 | if (!is_array($where) && !empty($where)) 397 | { 398 | $where = array($where); 399 | } 400 | $where[] = $index.' In ("'.implode('","', $ids).'")'; 401 | $where = empty($where) ? "" : " Where ".implode(" And ", $where); 402 | 403 | $sql = "Update `".$table."` Set "; 404 | $cases = ''; 405 | 406 | foreach ($final as $k => $v) 407 | { 408 | // 过滤掉数据库没有的字段 409 | if (!in_array($k, $fields)) 410 | { 411 | continue; 412 | } 413 | $cases .= '`'.$k.'` = Case '."\n"; 414 | foreach ($v as $row) 415 | { 416 | $cases .= $row."\n"; 417 | } 418 | 419 | $cases .= 'Else `'.$k.'` End, '; 420 | } 421 | 422 | $sql .= substr($cases, 0, -2); 423 | 424 | // 其实不带 Where In ($index) 的条件也可以的 425 | $sql .= $where; 426 | 427 | if ($return_sql) return $sql; 428 | 429 | $rt = self::query($sql); 430 | $insert_id = self::affected_rows(); 431 | $return = empty($affected_rows) ? 
$rt : $affected_rows; 432 | return $return; 433 | } 434 | 435 | public static function update($table = '', $data = array(), $where = null, $return_sql = false) 436 | { 437 | $sql = "UPDATE `{$table}` SET "; 438 | foreach ($data as $k => $v) 439 | { 440 | $v = stripslashes($v); 441 | $v = addslashes($v); 442 | $sql .= "`{$k}` = \"{$v}\","; 443 | } 444 | if (!is_array($where)) 445 | { 446 | $where = array($where); 447 | } 448 | // 删除空字段,不然array("")会成为WHERE 449 | foreach ($where as $k => $v) 450 | { 451 | if (empty($v)) 452 | { 453 | unset($where[$k]); 454 | } 455 | } 456 | $where = empty($where) ? "" : " Where " . implode(" And ", $where); 457 | $sql = substr($sql, 0, -1) . $where; 458 | if ($return_sql) 459 | { 460 | return $sql; 461 | } 462 | else 463 | { 464 | if (self::query($sql)) 465 | { 466 | return mysqli_affected_rows(self::$links[self::$link_name]['conn']); 467 | } 468 | else 469 | { 470 | return false; 471 | } 472 | } 473 | } 474 | 475 | public static function delete($table = '', $where = null, $return_sql = false) 476 | { 477 | // 小心全部被删除了 478 | if (empty($where)) 479 | { 480 | return false; 481 | } 482 | $where = 'Where ' . (!is_array($where) ? 
$where : implode(' And ', $where)); 483 | $sql = "Delete From `{$table}` {$where}"; 484 | if ($return_sql) 485 | { 486 | return $sql; 487 | } 488 | else 489 | { 490 | if (self::query($sql)) 491 | { 492 | return mysqli_affected_rows(self::$links[self::$link_name]['conn']); 493 | } 494 | else 495 | { 496 | return false; 497 | } 498 | } 499 | } 500 | 501 | public static function ping() 502 | { 503 | if (!mysqli_ping(self::$links[self::$link_name]['conn'])) 504 | { 505 | @mysqli_close(self::$links[self::$link_name]['conn']); 506 | self::$links[self::$link_name]['conn'] = null; 507 | self::_init(); 508 | } 509 | } 510 | 511 | public static function strsafe($array) 512 | { 513 | $arrays = array(); 514 | if(is_array($array)===true) 515 | { 516 | foreach ($array as $key => $val) 517 | { 518 | if(is_array($val)===true) 519 | { 520 | $arrays[$key] = self::strsafe($val); 521 | } 522 | else 523 | { 524 | //先去掉转义,避免下面重复转义了 525 | $val = stripslashes($val); 526 | //进行转义 527 | $val = addslashes($val); 528 | //处理addslashes没法处理的 _ % 字符 529 | //$val = strtr($val, array('_'=>'\_', '%'=>'\%')); 530 | $arrays[$key] = $val; 531 | } 532 | } 533 | return $arrays; 534 | } 535 | else 536 | { 537 | $array = stripslashes($array); 538 | $array = addslashes($array); 539 | //$array = strtr($array, array('_'=>'\_', '%'=>'\%')); 540 | return $array; 541 | } 542 | } 543 | 544 | // 这个是给insert、update、insert_batch、update_batch用的 545 | public static function get_fields($table) 546 | { 547 | // $sql = "SHOW COLUMNS FROM $table"; //和下面的语句效果一样 548 | $rows = self::get_all("Desc `{$table}`"); 549 | $fields = array(); 550 | foreach ($rows as $k => $v) 551 | { 552 | // 过滤自增主键 553 | // if ($v['Key'] != 'PRI') 554 | if ($v['Extra'] != 'auto_increment') 555 | { 556 | $fields[] = $v['Field']; 557 | } 558 | } 559 | return $fields; 560 | } 561 | 562 | public static function table_exists($table_name) 563 | { 564 | $sql = "SHOW TABLES LIKE '" . $table_name . 
"'"; 565 | $rsid = self::query($sql); 566 | $table = self::fetch($rsid); 567 | if (empty($table)) 568 | { 569 | return false; 570 | } 571 | return true; 572 | } 573 | } 574 | 575 | 576 | 577 | 578 | 579 | 580 | -------------------------------------------------------------------------------- /core/init.php: -------------------------------------------------------------------------------- 1 | 10 | // +---------------------------------------------------------------------- 11 | 12 | //---------------------------------- 13 | // PHPSpider公共入口文件 14 | //---------------------------------- 15 | 16 | // 严格开发模式 17 | error_reporting( E_ALL ); 18 | //ini_set('display_errors', 1); 19 | 20 | // 永不超时 21 | ini_set('max_execution_time', 0); 22 | set_time_limit(0); 23 | // 内存限制,如果外面设置的内存比 /etc/php/php-cli.ini 大,就不要设置了 24 | if (intval(ini_get("memory_limit")) < 1024) 25 | { 26 | ini_set('memory_limit', '1024M'); 27 | } 28 | 29 | if( PHP_SAPI != 'cli' ) 30 | { 31 | exit("You must run the CLI environment\n"); 32 | } 33 | 34 | // 设置时区 35 | date_default_timezone_set('Asia/Shanghai'); 36 | 37 | // 引入PATH_DATA 38 | require_once __DIR__ . 
'/constants.php';

// Core library directories. constants.php, required just above, defines the
// same four names; define only what is missing so no "already defined"
// error is raised.
defined('CORE') || define('CORE', dirname(__FILE__));
defined('PATH_ROOT') || define('PATH_ROOT', CORE."/../");
defined('PATH_DATA') || define('PATH_DATA', CORE."/../data");
defined('PATH_LIBRARY') || define('PATH_LIBRARY', CORE."/../library");

// System configuration (optional file).
if( file_exists( PATH_ROOT."/config/inc_config.php" ) )
{
    require PATH_ROOT."/config/inc_config.php";
}
require CORE.'/log.php';
require CORE.'/requests.php';
require CORE.'/selector.php';
require CORE.'/util.php';
require CORE.'/db.php';
require CORE.'/cache.php';
require CORE."/worker.php";
require CORE."/phpspider.php";

// Create the data directories at start-up.
util::path_exists(PATH_DATA);
util::path_exists(PATH_DATA."/lock");
util::path_exists(PATH_DATA."/log");
util::path_exists(PATH_DATA."/cache");
util::path_exists(PATH_DATA."/status");

/**
 * Autoload callback: look the class up in the library directory.
 *
 * The library directory is prepended to the include path once, instead of
 * replacing the whole include path on every call as before.
 *
 * @param string $classname
 * @return void
 */
function autoload($classname) {
    static $include_path_ready = false;
    if (!$include_path_ready)
    {
        set_include_path(PATH_ROOT.'/library/' . PATH_SEPARATOR . get_include_path());
        $include_path_ready = true;
    }
    spl_autoload($classname); //replaces include/require
}

spl_autoload_extensions('.php');
spl_autoload_register('autoload');

/**
 * Former class autoloader, disabled by the original author.
 * @return void
 */
//function __autoload( $classname )
//{
    //$classname = preg_replace("/[^0-9a-z_]/i", '', $classname);
    //if( class_exists ( $classname ) ) {
        //return true;
    //}
    //$classfile = $classname.'.php';
    //try
    //{
        //if ( file_exists ( PATH_LIBRARY.'/'.$classfile ) )
        //{
            //require PATH_LIBRARY.'/'.$classfile;
        //}
        //else
        //{
            //throw new Exception ( 'Error: Cannot find the '.$classname );
        //}
    //}
    //catch ( Exception $e )
    //{
        //log::error($e->getMessage().'|'.$classname);
        //exit();
    //}
//}
-------------------------------------------------------------------------------- /core/log.php: -------------------------------------------------------------------------------- 1 | 10 | // 
+---------------------------------------------------------------------- 11 | 12 | //---------------------------------- 13 | // PHPSpider日志类文件 14 | //----------------------------------

namespace phpspider\core;
// Pull in PATH_DATA.
require_once __DIR__ . '/constants.php';

/**
 * Static logger.
 *
 * note/info/warn/debug/error write to self::$log_file; warn, debug and error
 * are wrapped in ANSI colour codes when util::is_win() is false. add() is the
 * older entry point and always writes to PATH_DATA/log/error.log.
 */
class log
{
    // Echo every message in addition to writing it.
    public static $log_show = false;
    // When set, only levels contained in this string are written ('note' always is).
    public static $log_type = false;
    public static $log_file = "data/phpspider.log";
    // Text put before / after the current message (colour on / colour off).
    public static $out_sta = "";
    public static $out_end = "";

    public static function note($msg)
    {
        self::write_level($msg, 'note');
    }

    public static function info($msg)
    {
        self::write_level($msg, 'info');
    }

    public static function warn($msg)
    {
        self::write_level($msg, 'warn', "\033[33m");
    }

    public static function debug($msg)
    {
        self::write_level($msg, 'debug', "\033[36m");
    }

    public static function error($msg)
    {
        self::write_level($msg, 'error', "\033[31m");
    }

    /**
     * Reset the colour wrappers, apply $color where colours are supported,
     * then hand the message to msg().
     *
     * @param string $msg
     * @param string $level
     * @param string $color ANSI escape sequence, '' for no colour.
     * @return void
     */
    private static function write_level($msg, $level, $color = '')
    {
        self::$out_sta = self::$out_end = "";
        if ($color !== '' && !util::is_win())
        {
            self::$out_sta = $color;
            self::$out_end = "\033[0m";
        }

        self::msg($msg, $level);
    }

    /**
     * Format a message and append it to self::$log_file.
     *
     * @param string $msg
     * @param string $log_type
     * @return false|null False when the level is filtered out by self::$log_type.
     */
    public static function msg($msg, $log_type)
    {
        $is_note = ($log_type == 'note');

        if (!$is_note && self::$log_type && strpos(self::$log_type, $log_type) === false)
        {
            return false;
        }

        // Notes carry no timestamp or level tag.
        $line = $is_note
            ? self::$out_sta. $msg . "\n".self::$out_end
            : self::$out_sta.date("Y-m-d H:i:s")." [{$log_type}] " . $msg .self::$out_end. "\n";

        if(self::$log_show)
        {
            echo $line;
        }
        file_put_contents(self::$log_file, $line, FILE_APPEND | LOCK_EX);
    }

    /**
     * Append a message to PATH_DATA/log/error.log. XXX
     *
     * @param string $msg
     * @param string $log_type Note|Warning|Error; when empty, $msg is written unchanged.
     * @return void
     */
    public static function add($msg, $log_type = '')
    {
        if ($log_type != '')
        {
            $msg = date("Y-m-d H:i:s")." [{$log_type}] " . $msg . "\n";
        }
        if(self::$log_show)
        {
            echo $msg;
        }
        //file_put_contents(PATH_DATA."/log/".strtolower($log_type).".log", $msg, FILE_APPEND | LOCK_EX);
        file_put_contents(PATH_DATA."/log/error.log", $msg, FILE_APPEND | LOCK_EX);
    }

}
-------------------------------------------------------------------------------- /core/requests.php: -------------------------------------------------------------------------------- 1 | 10 | // +---------------------------------------------------------------------- 11 | 12 | // +---------------------------------------------------------------------- 13 | // | GET请求 14 | // | requests::get('http://www.test.com'); 15 | // | SERVER 16 | // | $_GET 17 | // +---------------------------------------------------------------------- 18 | // | POST请求 19 | // | $data = array('name'=>'request'); 20 | // | requests::post('http://www.test.com', $data); 21 | // | SERVER 22 | // | $_POST 23 | // +---------------------------------------------------------------------- 24 | // | POST RESTful请求 25 | // | $data = array('name'=>'request'); 26 | // | $data_string = json_encode($data); 27 | // | requests::set_header("Content-Type", "application/json"); 28 | // | requests::post('http://www.test.com', $data_string); 29 | // | SERVER 30 | // | file_get_contents('php://input') 31 | // +---------------------------------------------------------------------- 32 | // | POST 文件上传 33 | // | $data = array('file1'=>''./data/phpspider.log''); 34 | // | 
requests::post('http://www.test.com', null, $data); 35 | // | SERVER 36 | // | $_FILES 37 | // +---------------------------------------------------------------------- 38 | // | 代理 39 | // | requests::set_proxy(array('223.153.69.150:42354')); 40 | // | $html = requests::get('https://www.test.com'); 41 | // +---------------------------------------------------------------------- 42 | 43 | //---------------------------------- 44 | // PHPSpider请求类文件 45 | //----------------------------------

namespace phpspider\core;

// Fallback for environments without curl_file_create(): builds the legacy
// "@file;filename=...;type=..." upload string.
if (!function_exists('curl_file_create'))
{
    function curl_file_create($filename, $mimetype = '', $postname = '')
    {
        return "@$filename;filename="
            . ($postname ?: basename($filename))
            . ($mimetype ? ";type=$mimetype" : '');
    }
}

class requests
{
    const VERSION = '2.0.1';

    protected static $ch = null;

    /**** Public variables ****/

    /* user definable vars */

    public static $timeout = 15;
    public static $encoding = null;
    public static $input_encoding = null;
    public static $output_encoding = null;
    public static $cookies = array(); // array of cookies to pass
    // $cookies['username'] = "seatle";
    public static $rawheaders = array(); // array of raw headers to send
    public static $domain_cookies = array(); // array of cookies for domain to pass
    public static $hosts = array(); // random host binding for make request faster
    public static $headers = array(); // headers returned from server sent here
    public static $useragents = array("requests/2.0.0"); // random agent we masquerade as
    public static $client_ips = array(); // random ip we masquerade as
    public static $proxies = array(); // random proxy ip
    public static $raw = ""; // head + body content returned from server sent here
    public static $head = ""; // head content
    public static $content = ""; // The body before encoding
    public static $text = ""; // The body after encoding
    public static $info = array(); // curl info
    public static $history = 302; // http request status before redirect. ex:30x
    public static $status_code = 0; // http request status
    public static $error = ""; // error messages sent here

    /**
     * Set the request timeout.
     * Per the original note, an array sets the connect and read timeouts separately.
     *
     * @param int|array $timeout
     * @return void
     */
    public static function set_timeout($timeout)
    {
        self::$timeout = $timeout;
    }

    /**
     * Set the proxy list. Per the original note, one entry is picked at
     * random per request when several are given.
     *
     * @param mixed $proxy A single proxy or a list, e.g.
     *                     array(
     *                         'socks5://user1:pass2@host:port',
     *                         'socks5://user2:pass2@host:port'
     *                     )
     * @return void
     * @author seatle
     * @created time :2016-09-18 10:17
     */
    public static function set_proxy($proxy)
    {
        self::$proxies = is_array($proxy) ? $proxy : array($proxy);
    }

    /**
     * Remove all proxies. Needed because some links require a proxy and
     * others must go without one.
     *
     * @return void
     * @author seatle
     * @created time :2018-07-16 17:59
     */
    public static function del_proxy()
    {
        self::$proxies = array();
    }

    /**
     * Set a custom request header. The headers can be read back from
     * requests::$rawheaders, e.g. requests::$rawheaders['Content-Type'].
     *
     * @param string $key
     * @param string $value
     * @return void
     */
    public static function set_header($key, $value)
    {
        self::$rawheaders[$key] = $value;
    }

    /**
     * Set one cookie, globally or for a single domain.
     *
     * @param string $key
     * @param string $value
     * @param string $domain Store the cookie for this domain only when given.
     * @return boolean False when $key is empty.
     */
    public static function set_cookie($key, $value, $domain = '')
    {
        if (empty($key))
        {
            return false;
        }
        if (!empty($domain))
        {
            self::$domain_cookies[$domain][$key] = $value;
        }
        else
        {
            self::$cookies[$key] = $value;
        }
        return true;
    }

    /**
     * Set several cookies from a "k1=v1; k2=v2" string.
     *
     * Whitespace around each pair is trimmed (previously "a=1; b=2" stored
     * the key " b"), empty segments are skipped, and a value of "0" is kept
     * instead of being turned into an empty string.
     *
     * @param string $cookies
     * @param string $domain Store the cookies for this domain only when given.
     * @return boolean False when $cookies contains no cookie at all.
     * @author seatle
     * @created time :2017-08-03 18:06
     */
    public static function set_cookies($cookies, $domain = '')
    {
        $stored = false;

        foreach (explode(';', (string) $cookies) as $cookie)
        {
            $cookie = trim($cookie);
            if ($cookie === '')
            {
                continue;
            }

            $cookie_arr = explode('=', $cookie, 2);
            $key = trim($cookie_arr[0]);
            if ($key === '')
            {
                continue;
            }
            $value = isset($cookie_arr[1]) ? trim($cookie_arr[1]) : '';

            if (!empty($domain))
            {
                self::$domain_cookies[$domain][$key] = $value;
            }
            else
            {
                self::$cookies[$key] = $value;
            }
            $stored = true;
        }
        return $stored;
    }

    /**
     * Get a single cookie value.
     *
     * @param mixed  $name   Cookie name.
     * @param string $domain Read the global cookies (those set through
     *                       set_cookie without a domain) when empty.
     * @return mixed The value, or '' when the cookie is not set.
     * @author seatle
     * @created time :2017-08-03 18:06
     */
    public static function get_cookie($name, $domain = '')
    {
        if (!empty($domain) && !isset(self::$domain_cookies[$domain]))
        {
            return '';
        }
        $cookies = empty($domain) ? self::$cookies : self::$domain_cookies[$domain];
        return isset($cookies[$name]) ? $cookies[$name] : '';
    }

/** 225 | * 获取Cookie数组 226 | * 227 | * @param string $domain 不传则取全局cookie,就是手动set_cookie的cookie 228 | * @return void 229 | * @author seatle 230 | * @created time :2017-08-03 18:06 231 | */ 232 | public static function get_cookies($domain = '') 233 | { 234 | if (!empty($domain) && !isset(self::$domain_cookies[$domain])) 235 | { 236 | return array(); 237 | } 238 | return empty($domain) ? 
self::$cookies : self::$domain_cookies[$domain]; 239 | } 240 | 241 | /** 242 | * 删除Cookie 243 | * 244 | * @param string $domain 不传则删除全局Cookie 245 | * @return void 246 | * @author seatle 247 | * @created time :2017-08-03 18:06 248 | */ 249 | public static function del_cookie($key, $domain = '') 250 | { 251 | if (empty($key)) 252 | { 253 | return false; 254 | } 255 | 256 | if (!empty($domain) && !isset(self::$domain_cookies[$domain])) 257 | { 258 | return false; 259 | } 260 | 261 | if (!empty($domain)) 262 | { 263 | if (isset(self::$domain_cookies[$domain][$key])) 264 | { 265 | unset(self::$domain_cookies[$domain][$key]); 266 | } 267 | } 268 | else 269 | { 270 | if (isset(self::$cookies[$key])) 271 | { 272 | unset(self::$cookies[$key]); 273 | } 274 | } 275 | return true; 276 | } 277 | 278 | /** 279 | * 删除Cookie 280 | * 281 | * @param string $domain 不传则删除全局Cookie 282 | * @return void 283 | * @author seatle 284 | * @created time :2017-08-03 18:06 285 | */ 286 | public static function del_cookies($domain = '') 287 | { 288 | if (!empty($domain) && !isset(self::$domain_cookies[$domain])) 289 | { 290 | return false; 291 | } 292 | if ( empty($domain) ) 293 | { 294 | self::$cookies = array(); 295 | } 296 | else 297 | { 298 | if (isset(self::$domain_cookies[$domain])) 299 | { 300 | unset(self::$domain_cookies[$domain]); 301 | } 302 | } 303 | return true; 304 | } 305 | 306 | /** 307 | * 设置随机的user_agent 308 | * 309 | * @param string $useragent 310 | * @return void 311 | */ 312 | public static function set_useragent($useragent) 313 | { 314 | self::$useragents = is_array($useragent) ? 
$useragent : array($useragent); 315 | } 316 | 317 | /** 318 | * set referer 319 | * 320 | */ 321 | public static function set_referer($referer) 322 | { 323 | self::$rawheaders['Referer'] = $referer; 324 | } 325 | 326 | /** 327 | * 设置伪造IP 328 | * 传入数组则为随机IP 329 | * @param string $ip 330 | * @return void 331 | */ 332 | public static function set_client_ip($ip) 333 | { 334 | self::$client_ips = is_array($ip) ? $ip : array($ip); 335 | } 336 | 337 | /** 338 | * 删除伪造IP 339 | * 340 | * @return void 341 | * @author seatle 342 | * @created time :2018-07-16 17:59 343 | */ 344 | public static function del_client_ip() 345 | { 346 | self::$client_ips = array(); 347 | } 348 | 349 | /** 350 | * 设置中文请求 351 | * 352 | * @param string $lang 353 | * @return void 354 | */ 355 | public static function set_accept_language($lang = 'zh-CN') 356 | { 357 | self::$rawheaders['Accept-Language'] = $lang; 358 | } 359 | 360 | /** 361 | * 设置Hosts 362 | * 负载均衡到不同的服务器,如果对方使用CDN,采用这个是最好的了 363 | * 364 | * @param string $hosts 365 | * @return void 366 | */ 367 | public static function set_hosts($host, $ips = array()) 368 | { 369 | $ips = is_array($ips) ? 
$ips : array($ips); 370 | self::$hosts[$host] = $ips; 371 | } 372 | 373 | /** 374 | * 分割返回的header和body 375 | * header用来判断编码和获取Cookie 376 | * body用来判断编码,得到编码前和编码后的内容 377 | * 378 | * @return void 379 | * @author seatle 380 | * @created time :2017-08-03 18:06 381 | */ 382 | public static function split_header_body() 383 | { 384 | $head = $body = ''; 385 | $head = substr(self::$raw, 0, self::$info['header_size']); 386 | $body = substr(self::$raw, self::$info['header_size']); 387 | // http header 388 | self::$head = $head; 389 | // The body before encoding 390 | self::$content = $body; 391 | 392 | //$http_headers = array(); 393 | //// 解析HTTP数据流 394 | //if (!empty(self::$raw)) 395 | //{ 396 | //self::get_response_cookies($domain); 397 | //// body里面可能有 \r\n\r\n,但是第一个一定是HTTP Header,去掉后剩下的就是body 398 | //$array = explode("\r\n\r\n", self::$raw); 399 | //foreach ($array as $k=>$v) 400 | //{ 401 | //// post 方法会有两个http header:HTTP/1.1 100 Continue、HTTP/1.1 200 OK 402 | //if (preg_match("#^HTTP/.*? 100 Continue#", $v)) 403 | //{ 404 | //unset($array[$k]); 405 | //continue; 406 | //} 407 | //if (preg_match("#^HTTP/.*? 
\d+ #", $v)) 408 | //{ 409 | //$header = $v; 410 | //unset($array[$k]); 411 | //$http_headers = self::get_response_headers($v); 412 | //} 413 | //} 414 | //$body = implode("\r\n\r\n", $array); 415 | //} 416 | 417 | // 设置了输出编码的转码,注意: xpath只支持utf-8,iso-8859-1 不要转,他本身就是utf-8 418 | $body = self::encoding($body); //自动转码 419 | // 转码后 420 | self::$encoding = self::$output_encoding; 421 | 422 | // The body after encoding 423 | self::$text = $body; 424 | return array($head, $body); 425 | } 426 | 427 | /** 428 | * 获得域名相对应的Cookie 429 | * 430 | * @param mixed $header 431 | * @param mixed $domain 432 | * @return void 433 | * @author seatle 434 | * @created time :2017-08-03 18:06 435 | */ 436 | public static function get_response_cookies($header, $domain) 437 | { 438 | // 解析Cookie并存入 self::$cookies 方便调用 439 | preg_match_all("/.*?Set\-Cookie: ([^\r\n]*)/i", $header, $matches); 440 | $cookies = empty($matches[1]) ? array() : $matches[1]; 441 | 442 | // 解析到Cookie 443 | if (!empty($cookies)) 444 | { 445 | $cookies = implode(';', $cookies); 446 | $cookies = explode(';', $cookies); 447 | foreach ($cookies as $cookie) 448 | { 449 | $cookie_arr = explode('=', $cookie, 2); 450 | // 过滤 httponly、secure 451 | if (count($cookie_arr) < 2) 452 | { 453 | continue; 454 | } 455 | $cookie_name = !empty($cookie_arr[0]) ? 
trim($cookie_arr[0]) : ''; 456 | if (empty($cookie_name)) 457 | { 458 | continue; 459 | } 460 | // 过滤掉domain路径 461 | if (in_array(strtolower($cookie_name), array('path', 'domain', 'expires', 'max-age'))) 462 | { 463 | continue; 464 | } 465 | self::$domain_cookies[$domain][trim($cookie_arr[0])] = trim($cookie_arr[1]); 466 | } 467 | } 468 | } 469 | 470 | /** 471 | * 获得response header 472 | * 此方法占时没有用到 473 | * 474 | * @param mixed $header 475 | * @return void 476 | * @author seatle 477 | * @created time :2017-08-03 18:06 478 | */ 479 | public static function get_response_headers($header) 480 | { 481 | $headers = array(); 482 | $header_lines = explode("\n", $header); 483 | if (!empty($header_lines)) 484 | { 485 | foreach ($header_lines as $line) 486 | { 487 | $header_arr = explode(':', $line, 2); 488 | $key = empty($header_arr[0]) ? '' : trim($header_arr[0]); 489 | $val = empty($header_arr[1]) ? '' : trim($header_arr[1]); 490 | if (empty($key) || empty($val)) 491 | { 492 | continue; 493 | } 494 | $headers[$key] = $val; 495 | } 496 | } 497 | self::$headers = $headers; 498 | return self::$headers; 499 | } 500 | 501 | /** 502 | * 获取编码 503 | * @param $string 504 | * @return string 505 | */ 506 | public static function get_encoding($string) 507 | { 508 | $encoding = mb_detect_encoding($string, array('UTF-8', 'GBK', 'GB2312', 'LATIN1', 'ASCII', 'BIG5', 'ISO-8859-1')); 509 | return strtolower($encoding); 510 | } 511 | 512 | /** 513 | * 移除页面head区域代码 514 | * @param $html 515 | * @return mixed 516 | */ 517 | private static function _remove_head($html) 518 | { 519 | return preg_replace('/.+<\/head>/is', '', $html); 520 | } 521 | 522 | /** 523 | * 简单的判断一下参数是否为一个URL链接 524 | * @param string $str 525 | * @return boolean 526 | */ 527 | private static function _is_url($url) 528 | { 529 | //$pattern = '/^http(s)?:\\/\\/.+/'; 530 | $pattern = "/\b(([\w-]+:\/\/?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|\/)))/"; 531 | if (preg_match($pattern, $url)) 532 | { 533 | return true; 534 | 
} 535 | return false; 536 | } 537 | 538 | /** 539 | * 初始化 CURL 540 | * 541 | */ 542 | public static function init() 543 | { 544 | if (!is_resource ( self::$ch )) 545 | { 546 | self::$ch = curl_init (); 547 | curl_setopt( self::$ch, CURLOPT_RETURNTRANSFER, true ); 548 | curl_setopt( self::$ch, CURLOPT_HEADER, false ); 549 | curl_setopt( self::$ch, CURLOPT_USERAGENT, "phpspider-requests/".self::VERSION ); 550 | // 如果设置了两个时间,就分开设置 551 | if (is_array(self::$timeout)) 552 | { 553 | curl_setopt( self::$ch, CURLOPT_CONNECTTIMEOUT, self::$timeout[0] ); 554 | curl_setopt( self::$ch, CURLOPT_TIMEOUT, self::$timeout[1]); 555 | } 556 | else 557 | { 558 | curl_setopt(self::$ch, CURLOPT_CONNECTTIMEOUT, ceil(self::$timeout / 2)); 559 | curl_setopt(self::$ch, CURLOPT_TIMEOUT, self::$timeout); 560 | } 561 | curl_setopt(self::$ch, CURLOPT_MAXREDIRS, 5); //maximum number of redirects allowed 562 | // 在多线程处理场景下使用超时选项时,会忽略signals对应的处理函数,但是无耐的是还有小概率的crash情况发生 563 | curl_setopt( self::$ch, CURLOPT_NOSIGNAL, true); 564 | } 565 | return self::$ch; 566 | } 567 | 568 | /** 569 | * get 请求 570 | */ 571 | public static function get($url, $fields = array(), $allow_redirects = true, $cert = NULL) 572 | { 573 | self::init (); 574 | return self::request($url, 'get', $fields, NULL, $allow_redirects, $cert); 575 | } 576 | 577 | /** 578 | * post 请求 579 | * $fields 有三种类型:1、数组;2、http query;3、json 580 | * 1、array('name'=>'yangzetao') 581 | * 2、http_build_query(array('name'=>'yangzetao')) 582 | * 3、json_encode(array('name'=>'yangzetao')) 583 | * 前两种是普通的post,可以用$_POST方式获取 584 | * 第三种是post stream( json rpc,其实就是webservice ) 585 | * 虽然是post方式,但是只能用流方式 http://input 后者 $HTTP_RAW_POST_DATA 获取 586 | * 587 | * @param mixed $url 588 | * @param array $fields 589 | * @param mixed $proxies 590 | * @static 591 | * @access public 592 | * @return void 593 | */ 594 | public static function post($url, $fields = array(), $files = array(), $allow_redirects = true, $cert = NULL) 595 | { 596 | self::init (); 597 | return 
self::request($url, 'POST', $fields, $files, $allow_redirects, $cert); 598 | } 599 | 600 | public static function put($url, $fields = array(), $allow_redirects = true, $cert = NULL) 601 | { 602 | self::init (); 603 | return self::request($url, 'PUT', $fields, $allow_redirects, $cert); 604 | } 605 | 606 | public static function delete($url, $fields = array(), $allow_redirects = true, $cert = NULL) 607 | { 608 | self::init (); 609 | return self::request($url, 'DELETE', $fields, $allow_redirects, $cert); 610 | } 611 | 612 | // 响应HTTP头域里的元信息 613 | // 此方法被用来获取请求实体的元信息而不需要传输实体主体(entity-body) 614 | // 此方法经常被用来测试超文本链接的有效性,可访问性,和最近的改变。. 615 | public static function head($url, $fields = array(), $allow_redirects = true, $cert = NULL) 616 | { 617 | self::init (); 618 | self::request($url, 'HEAD', $fields, $allow_redirects, $cert); 619 | } 620 | 621 | public static function options($url, $fields = array(), $allow_redirects = true, $cert = NULL) 622 | { 623 | self::init (); 624 | return self::request($url, 'OPTIONS', $fields, $allow_redirects, $cert); 625 | } 626 | 627 | public static function patch($url, $fields = array(), $allow_redirects = true, $cert = NULL) 628 | { 629 | self::init (); 630 | return self::request($url, 'PATCH', $fields, $allow_redirects, $cert); 631 | } 632 | 633 | /** 634 | * request 635 | * 636 | * @param mixed $url 请求URL 637 | * @param string $method 请求方法 638 | * @param array $fields 表单字段 639 | * @param array $files 上传文件 640 | * @param mixed $cert CA证书 641 | * @return void 642 | * @author seatle 643 | * @created time :2017-08-03 18:06 644 | */ 645 | public static function request($url, $method = 'GET', $fields = array(), $files = array(), $allow_redirects = true, $cert = NULL) 646 | { 647 | $method = strtoupper($method); 648 | if(!self::_is_url($url)) 649 | { 650 | self::$error = "You have requested URL ({$url}) is not a valid HTTP address"; 651 | return false; 652 | } 653 | 654 | // 如果是 get 方式,直接拼凑一个 url 出来 655 | if ($method == 'GET' && !empty($fields)) 
656 | { 657 | $url = $url.(strpos($url, '?') === false ? '?' : '&').http_build_query($fields); 658 | } 659 | 660 | $parse_url = parse_url($url); 661 | if (empty($parse_url) || empty($parse_url['host']) || !in_array($parse_url['scheme'], array('http', 'https'))) 662 | { 663 | self::$error = "No connection adapters were found for '{$url}'"; 664 | return false; 665 | } 666 | $scheme = $parse_url['scheme']; 667 | $domain = $parse_url['host']; 668 | 669 | // 随机绑定 hosts,做负载均衡 670 | if (self::$hosts) 671 | { 672 | if (isset(self::$hosts[$domain])) 673 | { 674 | $hosts = self::$hosts[$domain]; 675 | $key = rand(0, count($hosts)-1); 676 | $ip = $hosts[$key]; 677 | $url = str_replace($domain, $ip, $url); 678 | self::$rawheaders['Host'] = $domain; 679 | } 680 | } 681 | 682 | curl_setopt( self::$ch, CURLOPT_URL, $url ); 683 | 684 | if ($method != 'GET') 685 | { 686 | // 如果是 post 方式 687 | if ($method == 'POST') 688 | { 689 | //curl_setopt( self::$ch, CURLOPT_POST, true ); 690 | $tmpheaders = array_change_key_case(self::$rawheaders, CASE_LOWER); 691 | // 有些RESTful服务只接受JSON形态的数据 692 | // CURLOPT_POST会把上傳的文件类型设为 multipart/form-data 693 | // 把CURLOPT_POSTFIELDS的内容按multipart/form-data 的形式编码 694 | // CURLOPT_CUSTOMREQUEST可以按指定内容上传 695 | if ( isset($tmpheaders['content-type']) && $tmpheaders['content-type'] == 'application/json' ) 696 | { 697 | curl_setopt( self::$ch, CURLOPT_CUSTOMREQUEST, $method ); 698 | } 699 | else 700 | { 701 | curl_setopt( self::$ch, CURLOPT_POST, true ); 702 | } 703 | 704 | $file_fields = array(); 705 | if (!empty($files)) 706 | { 707 | foreach ($files as $postname => $file) 708 | { 709 | $filepath = realpath($file); 710 | // 如果文件不存在 711 | if (!file_exists($filepath)) 712 | { 713 | continue; 714 | } 715 | 716 | $filename = basename($filepath); 717 | $type = self::get_mimetype($filepath); 718 | $file_fields[$postname] = curl_file_create($filepath, $type, $filename); 719 | // curl -F "name=seatle&file=@/absolute/path/to/image.png" htt://localhost/uploadfile.php 
720 | //$cfile = '@'.realpath($filename).";type=".$type.";filename=".$filename; 721 | } 722 | } 723 | } 724 | else 725 | { 726 | self::$rawheaders['X-HTTP-Method-Override'] = $method; 727 | curl_setopt( self::$ch, CURLOPT_CUSTOMREQUEST, $method ); 728 | } 729 | 730 | if ( $method == 'POST' ) 731 | { 732 | // 不是上传文件的,用http_build_query, 能实现更好的兼容性,更小的请求数据包 733 | if ( empty($file_fields) ) 734 | { 735 | // post方式 736 | if ( is_array($fields) ) 737 | { 738 | $fields = http_build_query($fields); 739 | } 740 | } 741 | else 742 | { 743 | // 有post数据 744 | if ( is_array($fields) && !empty($fields) ) 745 | { 746 | // 某些server可能会有问题 747 | $fields = array_merge($fields, $file_fields); 748 | } 749 | else 750 | { 751 | $fields = $file_fields; 752 | } 753 | } 754 | 755 | // 不能直接传数组,不知道是什么Bug,会非常慢 756 | curl_setopt( self::$ch, CURLOPT_POSTFIELDS, $fields ); 757 | } 758 | } 759 | 760 | $cookies = self::get_cookies(); 761 | $domain_cookies = self::get_cookies($domain); 762 | $cookies = array_merge($cookies, $domain_cookies); 763 | // 是否设置了cookie 764 | if (!empty($cookies)) 765 | { 766 | foreach ($cookies as $key=>$value) 767 | { 768 | $cookie_arr[] = $key.'='.$value; 769 | } 770 | $cookies = implode('; ', $cookie_arr); 771 | curl_setopt(self::$ch, CURLOPT_COOKIE, $cookies); 772 | } 773 | 774 | if (!empty(self::$useragents)) 775 | { 776 | $key = rand(0, count(self::$useragents) - 1); 777 | self::$rawheaders['User-Agent'] = self::$useragents[$key]; 778 | } 779 | 780 | if (!empty(self::$client_ips)) 781 | { 782 | $key = rand(0, count(self::$client_ips) - 1); 783 | self::$rawheaders['CLIENT-IP'] = self::$client_ips[$key]; 784 | self::$rawheaders['X-FORWARDED-FOR'] = self::$client_ips[$key]; 785 | } 786 | 787 | if (self::$rawheaders) 788 | { 789 | $http_headers = array(); 790 | foreach (self::$rawheaders as $k=>$v) 791 | { 792 | $http_headers[] = $k.': '.$v; 793 | } 794 | curl_setopt( self::$ch, CURLOPT_HTTPHEADER, $http_headers ); 795 | } 796 | 797 | curl_setopt( self::$ch, 
CURLOPT_ENCODING, 'gzip' ); 798 | 799 | // 关闭验证 800 | if ($scheme == 'https') 801 | { 802 | curl_setopt(self::$ch, CURLOPT_SSL_VERIFYPEER, false); 803 | curl_setopt(self::$ch, CURLOPT_SSL_VERIFYHOST, false); 804 | } 805 | 806 | if (self::$proxies) 807 | { 808 | $key = rand(0, count(self::$proxies) - 1); 809 | $proxy = self::$proxies[$key]; 810 | curl_setopt( self::$ch, CURLOPT_PROXY, $proxy ); 811 | } 812 | 813 | // header + body,header 里面有 cookie 814 | curl_setopt( self::$ch, CURLOPT_HEADER, true ); 815 | // 请求跳转后的内容 816 | if ($allow_redirects) 817 | { 818 | curl_setopt( self::$ch, CURLOPT_FOLLOWLOCATION, true); 819 | } 820 | 821 | self::$raw = curl_exec ( self::$ch ); 822 | // 真实url 823 | //$location = curl_getinfo( self::$ch, CURLINFO_EFFECTIVE_URL); 824 | self::$info = curl_getinfo( self::$ch ); 825 | //print_r(self::$info); 826 | self::$status_code = self::$info['http_code']; 827 | if (self::$raw === false) 828 | { 829 | self::$error = 'Curl error: ' . curl_error( self::$ch ); 830 | //trigger_error(self::$error, E_USER_WARNING); 831 | } 832 | 833 | // 关闭句柄 834 | curl_close( self::$ch ); 835 | 836 | // 请求成功之后才把URL存起来 837 | list($header, $text) = self::split_header_body(); 838 | self::$history = self::get_history($header); 839 | self::$headers = self::get_response_headers($header); 840 | self::get_response_cookies($header, $domain); 841 | //$data = substr($data, 10); 842 | //$data = gzinflate($data); 843 | return $text; 844 | } 845 | 846 | public static function get_history($header) 847 | { 848 | $status_code = 0; 849 | $lines = explode("\n", $header); 850 | foreach ($lines as $line) 851 | { 852 | $line = trim($line); 853 | if (preg_match("#^HTTP/.*? (\d+) Found#", $line, $out)) 854 | { 855 | $status_code = empty($out[1]) ? 
0 : intval($out[1]); 856 | } 857 | } 858 | return $status_code; 859 | } 860 | 861 | // 获取 mimetype 862 | public static function get_mimetype($filepath) 863 | { 864 | $fp = finfo_open(FILEINFO_MIME); 865 | $mime = finfo_file($fp, $filepath); 866 | finfo_close($fp); 867 | $arr = explode(';', $mime); 868 | $type = empty($arr[0]) ? '' : $arr[0]; 869 | return $type; 870 | } 871 | 872 | /** 873 | * 拼凑文件和表单 874 | * 占时没有用到 875 | * 876 | * @param mixed $post_fields 877 | * @param mixed $file_fields 878 | * @return void 879 | * @author seatle 880 | * @created time :2017-08-03 18:06 881 | */ 882 | public static function get_postfile_form($post_fields, $file_fields) 883 | { 884 | // 构造post数据 885 | $data = ''; 886 | $delimiter = '-------------' . uniqid(); 887 | // 表单数据 888 | foreach ($post_fields as $name => $content) 889 | { 890 | $data .= '--'.$delimiter."\r\n"; 891 | $data .= 'Content-Disposition: form-data; name = "'.$name.'"'; 892 | $data .= "\r\n\r\n"; 893 | $data .= $content; 894 | $data .= "\r\n"; 895 | } 896 | 897 | foreach ($file_fields as $input_name => $file) 898 | { 899 | $data .= '--'.$delimiter."\r\n"; 900 | $data .= 'Content-Disposition: form-data; name = "'.$input_name.'";'. 901 | ' filename="'.$file['filename'].'"'."\r\n"; 902 | $data .= "Content-Type: {$file['type']}\r\n"; 903 | $data .= "\r\n"; 904 | $data .= $file['content']; 905 | $data .= "\r\n"; 906 | } 907 | 908 | // 结束符 909 | $data .= '--'.$delimiter."--\r\n"; 910 | 911 | //return array( 912 | //CURLOPT_HTTPHEADER => array( 913 | //'Content-Type:multipart/form-data;boundary=' . $delimiter, 914 | //'Content-Length:' . 
strlen($data) 915 | //), 916 | //CURLOPT_POST => true, 917 | //CURLOPT_POSTFIELDS => $data, 918 | //); 919 | return array($delimiter, $data); 920 | } 921 | 922 | /** 923 | * html encoding transform 924 | * 925 | * @param string $html 926 | * @param string $in 927 | * @param string $out 928 | * @param string $content 929 | * @param string $mode 930 | * auto|iconv|mb_convert_encoding 931 | * @return string 932 | */ 933 | public static function encoding($html, $in = null, $out = null, $mode = 'auto') 934 | { 935 | $valid = array( 936 | 'auto', 937 | 'iconv', 938 | 'mb_convert_encoding', 939 | ); 940 | if (isset(self::$output_encoding)) 941 | { 942 | $out = self::$output_encoding; 943 | } 944 | if ( ! isset($out)) 945 | { 946 | $out = 'UTF-8'; 947 | } 948 | if ( ! in_array($mode, $valid)) 949 | { 950 | throw new Exception('invalid mode, mode='.$mode); 951 | } 952 | $if = function_exists('mb_convert_encoding'); 953 | $if = $if && ($mode == 'auto' || $mode == 'mb_convert_encoding'); 954 | if (function_exists('iconv') && ($mode == 'auto' || $mode == 'iconv')) 955 | { 956 | $func = 'iconv'; 957 | } 958 | elseif ($if) 959 | { 960 | $func = 'mb_convert_encoding'; 961 | } 962 | else 963 | { 964 | throw new Exception('charsetTrans failed, no function'); 965 | } 966 | 967 | $pattern = '/(]*?charset=([\"\']?))([a-z\d_\-]*)(\2[^>]*?>)/is'; 968 | if ( ! 
isset($in)) 969 | { 970 | $n = preg_match($pattern, $html, $in); 971 | if ($n > 0) 972 | { 973 | $in = $in[3]; 974 | } 975 | else 976 | { 977 | $in = null; 978 | } 979 | if (empty($in) and function_exists('mb_detect_encoding')) 980 | { 981 | $in = mb_detect_encoding($html, array('UTF-8', 'GBK', 'GB2312', 'LATIN1', 'ASCII', 'BIG5', 'ISO-8859-1')); 982 | } 983 | } 984 | 985 | if (isset($in)) 986 | { 987 | if ($in == 'ISO-8859-1') 988 | { 989 | $in = 'UTF-8'; 990 | } 991 | $old = error_reporting(error_reporting() & ~E_NOTICE); 992 | $html = call_user_func($func, $in, $out.'//IGNORE', $html); 993 | error_reporting($old); 994 | $html = preg_replace($pattern, "\\1$out\\4", $html, 1); 995 | } 996 | return $html; 997 | } 998 | } 999 | -------------------------------------------------------------------------------- /core/selector.php: -------------------------------------------------------------------------------- 1 | 10 | // +---------------------------------------------------------------------- 11 | 12 | //---------------------------------- 13 | // PHPSpider选择器类文件 14 | //---------------------------------- 15 | 16 | namespace phpspider\core; 17 | 18 | use phpspider\library\phpquery; 19 | use DOMDocument; 20 | use DOMXpath; 21 | use Exception; 22 | 23 | class selector 24 | { 25 | /** 26 | * 版本号 27 | * @var string 28 | */ 29 | const VERSION = '1.0.2'; 30 | public static $dom = null; 31 | public static $dom_auth = ''; 32 | public static $xpath = null; 33 | public static $error = null; 34 | 35 | public static function select($html, $selector, $selector_type = 'xpath') 36 | { 37 | if (empty($html) || empty($selector)) 38 | { 39 | return false; 40 | } 41 | 42 | $selector_type = strtolower($selector_type); 43 | if ($selector_type == 'xpath') 44 | { 45 | return self::_xpath_select($html, $selector); 46 | } 47 | elseif ($selector_type == 'regex') 48 | { 49 | return self::_regex_select($html, $selector); 50 | } 51 | elseif ($selector_type == 'css') 52 | { 53 | return 
self::_css_select($html, $selector); 54 | } 55 | } 56 | 57 | public static function remove($html, $selector, $selector_type = 'xpath') 58 | { 59 | if (empty($html) || empty($selector)) 60 | { 61 | return false; 62 | } 63 | 64 | $remove_html = ""; 65 | $selector_type = strtolower($selector_type); 66 | if ($selector_type == 'xpath') 67 | { 68 | $remove_html = self::_xpath_select($html, $selector, true); 69 | } 70 | elseif ($selector_type == 'regex') 71 | { 72 | $remove_html = self::_regex_select($html, $selector, true); 73 | } 74 | elseif ($selector_type == 'css') 75 | { 76 | $remove_html = self::_css_select($html, $selector, true); 77 | } 78 | $html = str_replace($remove_html, "", $html); 79 | return $html; 80 | } 81 | 82 | /** 83 | * xpath选择器 84 | * 85 | * @param mixed $html 86 | * @param mixed $selector 87 | * @return void 88 | * @author seatle 89 | * @created time :2016-10-26 12:53 90 | */ 91 | private static function _xpath_select($html, $selector, $remove = false) 92 | { 93 | if (!is_object(self::$dom)) 94 | { 95 | self::$dom = new DOMDocument(); 96 | } 97 | 98 | // 如果加载的不是之前的HTML内容,替换一下验证标识 99 | if (self::$dom_auth != md5($html)) 100 | { 101 | self::$dom_auth = md5($html); 102 | @self::$dom->loadHTML(''.$html); 103 | self::$xpath = new DOMXpath(self::$dom); 104 | } 105 | 106 | //libxml_use_internal_errors(true); 107 | //self::$dom->loadHTML(''.$html); 108 | //$errors = libxml_get_errors(); 109 | //if (!empty($errors)) 110 | //{ 111 | //print_r($errors); 112 | //exit; 113 | //} 114 | 115 | $elements = @self::$xpath->query($selector); 116 | if ($elements === false) 117 | { 118 | self::$error = "the selector in the xpath(\"{$selector}\") syntax errors"; 119 | // 不应该返回false,因为isset(false)为true,更不能通过 !$values 去判断,因为!0为true,所以这里只能返回null 120 | //return false; 121 | return null; 122 | } 123 | 124 | $result = array(); 125 | if (!is_null($elements)) 126 | { 127 | foreach ($elements as $element) 128 | { 129 | // 如果是删除操作,取一整块代码 130 | if ($remove) 131 | { 132 | $content = 
self::$dom->saveXml($element); 133 | } 134 | else 135 | { 136 | $nodeName = $element->nodeName; 137 | $nodeType = $element->nodeType; // 1.Element 2.Attribute 3.Text 138 | //$nodeAttr = $element->getAttribute('src'); 139 | //$nodes = util::node_to_array(self::$dom, $element); 140 | //echo $nodes['@src']."\n"; 141 | // 如果是img标签,直接取src值 142 | if ($nodeType == 1 && in_array($nodeName, array('img'))) 143 | { 144 | $content = $element->getAttribute('src'); 145 | } 146 | // 如果是标签属性,直接取节点值 147 | elseif ($nodeType == 2 || $nodeType == 3 || $nodeType == 4) 148 | { 149 | $content = $element->nodeValue; 150 | } 151 | else 152 | { 153 | // 保留nodeValue里的html符号,给children二次提取 154 | $content = self::$dom->saveXml($element); 155 | //$content = trim(self::$dom->saveHtml($element)); 156 | $content = preg_replace(array("#^<{$nodeName}.*>#isU","#$#isU"), array('', ''), $content); 157 | } 158 | } 159 | $result[] = $content; 160 | } 161 | } 162 | if (empty($result)) 163 | { 164 | return null; 165 | } 166 | // 如果只有一个元素就直接返回string,否则返回数组 167 | return count($result) > 1 ? 
$result : $result[0]; 168 | } 169 | 170 | /** 171 | * css选择器 172 | * 173 | * @param mixed $html 174 | * @param mixed $selector 175 | * @return void 176 | * @author seatle 177 | * @created time :2016-10-26 12:53 178 | */ 179 | private static function _css_select($html, $selector, $remove = false) 180 | { 181 | $selector = self::css_to_xpath($selector); 182 | //echo $selector."\n"; 183 | //exit("\n"); 184 | return self::_xpath_select($html, $selector, $remove); 185 | // 如果加载的不是之前的HTML内容,替换一下验证标识 186 | //if (self::$dom_auth['css'] != md5($html)) 187 | //{ 188 | //self::$dom_auth['css'] = md5($html); 189 | //phpQuery::loadDocumentHTML($html); 190 | //} 191 | //if ($remove) 192 | //{ 193 | //return phpQuery::pq($selector)->remove(); 194 | //} 195 | //else 196 | //{ 197 | //return phpQuery::pq($selector)->html(); 198 | //} 199 | } 200 | 201 | /** 202 | * 正则选择器 203 | * 204 | * @param mixed $html 205 | * @param mixed $selector 206 | * @return void 207 | * @author seatle 208 | * @created time :2016-10-26 12:53 209 | */ 210 | private static function _regex_select($html, $selector, $remove = false) 211 | { 212 | if(@preg_match_all($selector, $html, $out) === false) 213 | { 214 | self::$error = "the selector in the regex(\"{$selector}\") syntax errors"; 215 | return null; 216 | } 217 | $count = count($out); 218 | $result = array(); 219 | // 一个都没有匹配到 220 | if ($count == 0) 221 | { 222 | return null; 223 | } 224 | // 只匹配一个,就是只有一个 () 225 | elseif ($count == 2) 226 | { 227 | // 删除的话取匹配到的所有内容 228 | if ($remove) 229 | { 230 | $result = $out[0]; 231 | } 232 | else 233 | { 234 | $result = $out[1]; 235 | } 236 | } 237 | else 238 | { 239 | for ($i = 1; $i < $count; $i++) 240 | { 241 | // 如果只有一个元素,就直接返回好了 242 | $result[] = count($out[$i]) > 1 ? $out[$i] : $out[$i][0]; 243 | } 244 | } 245 | if (empty($result)) 246 | { 247 | return null; 248 | } 249 | 250 | return count($result) > 1 ? 
/**
 * Translate a CSS selector string into an XPath expression.
 *
 * The selector is tokenised by parse_selector() and every token is mapped
 * to an XPath fragment:
 *   tag / *      -> the tag name itself
 *   #id          -> [@id='id']                (exact match)
 *   .class       -> [contains(@class,'class')] (substring match)
 *   [attr]       -> [@attr]
 *   [attr=value] -> [@attr='value']
 *   [attr^=v], [attr*=v], [attr$=v] -> [@attr] (only the presence of the
 *                   attribute is tested; the value is not used here)
 *   '>'          -> /
 *   ' '          -> //
 * Sibling (~, +) and pseudo-class (:) tokens are accepted but contribute
 * nothing to the resulting expression.
 *
 * An unrecognised token terminates the script through exit().
 *
 * @param string $selectors CSS selector
 * @return string XPath expression
 */
public static function css_to_xpath($selectors)
{
    $queries = self::parse_selector($selectors);
    // true when the previous token was a combinator ('>' or ' ')
    $delimiter_before = false;
    $xquery = '';
    foreach ($queries as $s)
    {
        $is_delimiter = false;
        $first = isset($s[0]) ? $s[0] : '';

        // TAG
        if (preg_match('@^[\w|\||-]+$@', $s) || $s == '*')
        {
            $xquery .= $s;
        }
        // ID: exact match
        else if ($first == '#')
        {
            if ($delimiter_before)
            {
                $xquery .= '*';
            }
            $xquery .= "[@id='".substr($s, 1)."']";
        }
        // CLASS: substring match
        else if ($first == '.')
        {
            if ($delimiter_before)
            {
                $xquery .= '*';
            }
            $xquery .= "[contains(@class,'".substr($s, 1)."')]";
        }
        // ATTRIBUTES
        else if ($first == '[')
        {
            if ($delimiter_before)
            {
                $xquery .= '*';
            }
            // strip side brackets
            $attr = trim($s, '][');
            if (strpos($attr, '=') !== false)
            {
                // Split on the FIRST '=' only, so values that contain '='
                // themselves (e.g. query strings) are kept intact.
                list($attr, $value) = explode('=', $attr, 2);
                $value = trim($value, "'\"");
                if ($attr !== '' && self::is_regexp($attr))
                {
                    // cut the trailing operator character (^, * or $)
                    $attr = substr($attr, 0, -1);
                    $xquery .= "[@{$attr}]";
                }
                else
                {
                    // XPath 1.0 has no escape sequences: a value holding an
                    // apostrophe must be wrapped in double quotes instead.
                    $quote = (strpos($value, "'") !== false) ? '"' : "'";
                    $xquery .= "[@{$attr}={$quote}{$value}{$quote}]";
                }
            }
            else
            {
                $xquery .= "[@{$attr}]";
            }
        }
        // ~ general sibling, + adjacent sibling, : pseudo class -> ignored
        else if ($first == '~' || $first == '+' || $first == ':')
        {
        }
        // DIRECT DESCENDANTS
        else if ($s == '>')
        {
            $xquery .= '/';
            $is_delimiter = true;
        }
        // ALL DESCENDANTS
        else if ($s == ' ')
        {
            $xquery .= '//';
            $is_delimiter = true;
        }
        // ERRORS
        else
        {
            exit("Unrecognized token '$s'");
        }
        $delimiter_before = $is_delimiter;
    }
    return $xquery;
}
| // COMMA 427 | else if ( $c == ',' ) 428 | { 429 | $i++; 430 | while( isset($query[$i]) && $query[$i] == ' ') 431 | { 432 | $i++; 433 | } 434 | } 435 | // CLASSES 436 | else if ($c == '.') 437 | { 438 | while( isset($query[$i]) && (self::is_char($query[$i]) || in_array($query[$i], $class_chars))) 439 | { 440 | $tmp .= $query[$i]; 441 | $i++; 442 | } 443 | $queries[] = $tmp; 444 | } 445 | // ~ General Sibling Selector 446 | else if ($c == '~') 447 | { 448 | $space_allowed = true; 449 | $tmp .= $query[$i++]; 450 | while( isset($query[$i]) 451 | && (self::is_char($query[$i]) 452 | || in_array($query[$i], $class_chars) 453 | || $query[$i] == '*' 454 | || ($query[$i] == ' ' && $space_allowed) 455 | )) 456 | { 457 | if ($query[$i] != ' ') 458 | { 459 | $space_allowed = false; 460 | } 461 | $tmp .= $query[$i]; 462 | $i++; 463 | } 464 | $queries[] = $tmp; 465 | } 466 | // + Adjacent sibling selectors 467 | else if ($c == '+') 468 | { 469 | $space_allowed = true; 470 | $tmp .= $query[$i++]; 471 | while( isset($query[$i]) 472 | && (self::is_char($query[$i]) 473 | || in_array($query[$i], $class_chars) 474 | || $query[$i] == '*' 475 | || ($space_allowed && $query[$i] == ' ') 476 | )) 477 | { 478 | if ($query[$i] != ' ') 479 | $space_allowed = false; 480 | $tmp .= $query[$i]; 481 | $i++; 482 | } 483 | $queries[] = $tmp; 484 | } 485 | // ATTRS 486 | else if ($c == '[') 487 | { 488 | $stack = 1; 489 | $tmp .= $c; 490 | while( isset($query[++$i])) 491 | { 492 | $tmp .= $query[$i]; 493 | if ( $query[$i] == '[') 494 | { 495 | $stack++; 496 | } 497 | else if ( $query[$i] == ']') 498 | { 499 | $stack--; 500 | if (! 
/**
 * Tell whether an attribute name ends with a fuzzy-match operator.
 *
 *   ^  prefix match
 *   *  substring match
 *   $  suffix match
 *
 * The last BYTE is inspected: the three operators are single-byte ASCII,
 * so this stays correct when the attribute name contains multibyte
 * characters, and an empty string is simply reported as "no operator".
 *
 * @param string $pattern attribute name as written in the selector, e.g. "href^"
 * @return bool
 */
protected static function is_regexp($pattern)
{
    $pattern = (string) $pattern;
    if ($pattern === '')
    {
        return false;
    }
    return in_array(substr($pattern, -1), array('^', '*', '$'), true);
}
/**
 * File based lock.
 *
 * Returns true when a lock younger than $lock_timeout seconds already
 * exists (the caller must NOT proceed), and false when no valid lock was
 * found; in the latter case a new lock stamped with the current time is
 * written before returning. Typical usage:
 *
 *   if (util::lock('statistics_offer'))
 *   {
 *       echo "process has been locked\n";
 *   }
 *   else
 *   {
 *       ...
 *       util::unlock('statistics_offer');
 *   }
 *
 * NOTE(review): check-then-write is not atomic, so two processes starting
 * at the same instant may both obtain the lock.
 *
 * @param mixed $lock_name    lock identifier, used as the file name
 * @param int   $lock_timeout seconds after which an existing lock is stale
 * @return bool true = already locked, false = lock acquired by this call
 * @author seatle
 * @created time :2016-02-18 14:28
 */
public static function lock($lock_name, $lock_timeout = 600)
{
    $lock_file = PATH_DATA."/lock/{$lock_name}.lock";

    // Read the timestamp straight from disk; a missing or non-numeric
    // lock file is treated as "no lock".
    $locked_at = 0;
    if (is_file($lock_file))
    {
        $locked_at = (int) @file_get_contents($lock_file);
    }

    if ($locked_at > 0)
    {
        // Lock is still fresh: the owning process is assumed to be alive
        if (time() - $locked_at < $lock_timeout)
        {
            return true;
        }
        // Stale lock: remove it and take over
        @unlink($lock_file);
    }
    util::put_file($lock_file, time());
    return false;
}
/**
 * Count the lines of a file.
 *
 * Every line is counted, including empty lines and lines whose content is
 * "0"; a final line without a trailing newline counts as a line too.
 *
 * @param mixed $filepath path of the file to read
 * @return int number of lines, 0 when the file cannot be opened
 * @author seatle
 * @created time :2016-03-31 21:54
 */
public static function get_file_line($filepath)
{
    $fp = fopen($filepath, 'r');
    if (!$fp)
    {
        return 0;
    }

    $line = 0;
    // Compare against false explicitly: an empty line is a falsy string
    // and must not terminate the loop. fgets() without a length returns
    // a whole line regardless of its size.
    while (fgets($fp) !== false)
    {
        $line++;
    }
    fclose($fp);
    return $line;
}
/**
 * Build the name of the sharded table an item belongs to.
 *
 * The shard number is derived from the last two hex digits of the SHA-1
 * of the lower-cased item value, taken modulo $table_num and left-padded
 * with zeros (3 digits when $table_num is greater than 100, otherwise 2).
 *
 * @param mixed $table_name base table name
 * @param mixed $item_value unique key of the item
 * @param int   $table_num  number of tables
 * @return string "<table_name>_<shard>"
 * @author seatle
 * @created time :2015-10-22 23:25
 */
public static function get_table_name($table_name, $item_value, $table_num = 100)
{
    // SHA-1 yields 40 hex characters; only the last two are used
    $digest = sha1(strtolower($item_value));
    $bucket = hexdec(substr($digest, -2)) % $table_num;
    $width = ($table_num > 100) ? 3 : 2;
    $suffix = str_pad($bucket, $width, "0", STR_PAD_LEFT);
    return $table_name."_".$suffix;
}
/**
 * Generate a random string of lower-case letters and digits.
 *
 * Not suitable for security-sensitive tokens: mt_rand() is not a
 * cryptographically secure generator.
 *
 * @param int $num length of the string to generate
 * @return string exactly $num characters from [a-z0-9]
 * @author seatle
 * @created time :2016-09-18 10:17
 */
public static function rand_str($num = 10)
{
    $chars = 'abcdefghijklmnopqrstuvwxyz0123456789';
    // The upper bound of mt_rand() is inclusive, so the highest valid
    // index is length - 1; using the length itself would occasionally
    // pick a position past the end and drop a character.
    $max_index = strlen($chars) - 1;
    $string = "";
    for ($i = 0; $i < $num; $i ++)
    {
        $string .= $chars[mt_rand(0, $max_index)];
    }
    return $string;
}
/**
 * Return the index letter for a string.
 *
 * When the string starts with an ASCII letter or digit, that character is
 * returned unchanged (original case). Otherwise the string is converted
 * from UTF-8 to GBK and the code of its first double-byte character is
 * looked up in the table below, which maps GBK code ranges to the initial
 * letter of the pinyin reading.
 *
 * @param mixed $s0 UTF-8 string
 * @return string|int the first character, a letter A-Z, or 0 when the
 *                    first character falls outside every known range
 * @author seatle
 * @created time :2016-09-18 10:17
 * @updated time :2021-07-02 04:08
 */
public static function letter_first($s0)
{
    $first_char = substr($s0, 0, 1);
    $first_ord = ord(strtoupper($first_char));
    // 'A'..'Z' is 65..90 (91 is '['), '0'..'9' is 48..57
    if (($first_ord >= 65 && $first_ord <= 90) || ($first_ord >= 48 && $first_ord <= 57))
    {
        return $first_char;
    }

    $s = mb_convert_encoding($s0, "gbk", "utf-8");
    $asc = ord(substr($s, 0, 1)) * 256 + ord(substr($s, 1, 1)) - 65536;

    // Below the first range: not a known character
    if ($asc < -20319)
    {
        return 0;
    }

    // Inclusive upper bound of each letter's range; the ranges are
    // contiguous, so the first bound that is >= $asc gives the letter.
    $upper_bounds = array(
        'A' => -20284, 'B' => -19776, 'C' => -19219, 'D' => -18711,
        'E' => -18527, 'F' => -18240, 'G' => -17923, 'H' => -17418,
        'J' => -16475, 'K' => -16213, 'L' => -15641, 'M' => -15166,
        'N' => -14923, 'O' => -14915, 'P' => -14631, 'Q' => -14150,
        'R' => -14091, 'S' => -13319, 'T' => -12839, 'W' => -12557,
        'X' => -11848, 'Y' => -11056, 'Z' => -10247,
    );
    foreach ($upper_bounds as $letter => $upper)
    {
        if ($asc <= $upper)
        {
            return $letter;
        }
    }
    return 0; // above the last range
}
/**
 * Make sure a directory exists, creating it recursively when needed.
 *
 * @param mixed $path directory path
 * @static
 * @access public
 * @return bool|string the given $path, or false when the directory had
 *                     to be created and mkdir() failed
 */
public static function path_exists($path)
{
    // Appending a dummy file name and taking its dirname normalises $path
    $dir = pathinfo($path . '/tmp.txt', PATHINFO_DIRNAME);

    if (empty($dir) || file_exists($dir) !== false)
    {
        return $path;
    }
    return (mkdir($dir, 0777, true) === false) ? false : $path;
}
/**
 * Recursively change the permissions of a path.
 *
 * A plain file is chmod-ed directly. For a directory every entry is
 * processed first and the directory itself last. The operation stops at
 * the first failure, and encountering a symbolic link inside a directory
 * is treated as a failure. The directory handle is always closed before
 * returning.
 *
 * @param mixed $path     file or directory
 * @param mixed $filemode permission bits, e.g. 0755
 * @return bool true when every chmod succeeded
 */
public static function chmodr($path, $filemode)
{
    if (!is_dir($path))
    {
        return @chmod($path, $filemode);
    }

    $dh = @opendir($path);
    if ($dh === false)
    {
        return FALSE;
    }

    $ok = TRUE;
    while (($file = readdir($dh)) !== false)
    {
        if ($file == '.' || $file == '..')
        {
            continue;
        }
        $fullpath = $path . '/' . $file;
        // The recursive call handles both plain files (single chmod) and
        // sub-directories, so no separate file branch is required.
        if (is_link($fullpath) || !self::chmodr($fullpath, $filemode))
        {
            $ok = FALSE;
            break;
        }
    }
    // Close the handle on every path, including the failure ones
    closedir($dh);

    if (!$ok)
    {
        return FALSE;
    }
    return @chmod($path, $filemode) ? TRUE : FALSE;
}
/**
 * Convert the character encoding of every string inside an array.
 *
 * String keys and string values are converted at every nesting level;
 * values of any other type are returned untouched. Characters that cannot
 * be represented in the target encoding are dropped (//IGNORE). When
 * iconv() fails for a string, that string is kept as it was.
 *
 * The conversion walks the data directly instead of eval()-ing a
 * re-encoded var_export() dump: in encodings such as Big5 or GBK a
 * multibyte character may end with byte 0x5C ('\'), which corrupts the
 * generated PHP literal and lets data be interpreted as code.
 *
 * @param array  $arr           data to convert (a plain string is accepted too)
 * @param string $from_encoding source encoding
 * @param string $to_encoding   target encoding
 * @return array converted copy of $arr
 */
public static function array_iconv($arr, $from_encoding, $to_encoding)
{
    $target = $to_encoding.'//IGNORE';
    $convert = function ($value) use (&$convert, $from_encoding, $target)
    {
        if (is_string($value))
        {
            $converted = @iconv($from_encoding, $target, $value);
            return ($converted === false) ? $value : $converted;
        }
        if (is_array($value))
        {
            $result = array();
            foreach ($value as $k => $v)
            {
                // integer keys pass through unchanged
                $result[$convert($k)] = $convert($v);
            }
            return $result;
        }
        return $value;
    };
    return $convert($arr);
}
$dt[0]; 643 | if (isset($ds[1])) 644 | { 645 | $hms = explode(":", $ds[1]); 646 | if (isset($hms[0])) $dt[3] = $hms[0]; 647 | if (isset($hms[1])) $dt[4] = $hms[1]; 648 | if (isset($hms[2])) $dt[5] = $hms[2]; 649 | } 650 | foreach ($dt as $k => $v) 651 | { 652 | $v = preg_replace("/^0{1,}/", '', trim($v)); 653 | if ($v == '') 654 | { 655 | $dt[$k] = 0; 656 | } 657 | } 658 | $mt = mktime($dt[3], $dt[4], $dt[5], $dt[1], $dt[2], $dt[0]); 659 | if (!empty($mt)) 660 | { 661 | return $mt; 662 | } 663 | else 664 | { 665 | return strtotime($dtime); 666 | } 667 | } 668 | 669 | public static function cn_substr($string, $length = 80, $etc = '...', $count_words = true) 670 | { 671 | mb_internal_encoding("UTF-8"); 672 | if ($length == 0) return ''; 673 | if (strlen($string) <= $length) return $string; 674 | preg_match_all("/[\x01-\x7f]|[\xc2-\xdf][\x80-\xbf]|\xe0[\xa0-\xbf][\x80-\xbf]|[\xe1-\xef][\x80-\xbf][\x80-\xbf]|\xf0[\x90-\xbf][\x80-\xbf][\x80-\xbf]|[\xf1-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf]/", $string, $info); 675 | if ($count_words) 676 | { 677 | $j = 0; 678 | $wordscut = ""; 679 | for ($i = 0; $i < count($info[0]); $i ++) 680 | { 681 | $wordscut .= $info[0][$i]; 682 | if (ord($info[0][$i]) >= 128) 683 | { 684 | $j = $j + 2; 685 | } 686 | else 687 | { 688 | $j = $j + 1; 689 | } 690 | if ($j >= $length) 691 | { 692 | return $wordscut . $etc; 693 | } 694 | } 695 | return join('', $info[0]); 696 | } 697 | return join("", array_slice($info[0], 0, $length)) . 
/**
 * Return the lower-cased extension of a file name.
 *
 * Only the last path component is considered, so a dot inside a directory
 * name is not mistaken for an extension, and a name without any dot
 * yields an empty string.
 *
 * @param mixed $file_name file name or path
 * @static
 *
 * @access public
 * @return string extension without the leading dot, '' when there is none
 */
public static function get_extension($file_name)
{
    return strtolower(pathinfo($file_name, PATHINFO_EXTENSION));
}
(!is_numeric($num)) return '含有非数字非小数点字符!'; 782 | $char = $sim ? array('零', '一', '二', '三', '四', '五', '六', '七', '八', '九') : array('零', '壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖'); 783 | $unit = $sim ? array('', '十', '百', '千', '', '万', '亿', '兆') : array('', '拾', '佰', '仟', '', '萬', '億', '兆'); 784 | $retval = ''; 785 | 786 | $num = sprintf("%01.2f", $num); 787 | 788 | list ($num, $dec) = explode('.', $num); 789 | 790 | // 小数部分 791 | if ($dec['0'] > 0) 792 | { 793 | $retval .= "{$char[$dec['0']]}角"; 794 | } 795 | if ($dec['1'] > 0) 796 | { 797 | $retval .= "{$char[$dec['1']]}分"; 798 | } 799 | 800 | // 整数部分 801 | if ($num > 0) 802 | { 803 | $retval = "元" . $retval; 804 | $f = 1; 805 | $str = strrev(intval($num)); 806 | for ($i = 0, $c = strlen($str); $i < $c; $i ++) 807 | { 808 | if ($str[$i] > 0) 809 | { 810 | $f = 0; 811 | } 812 | if ($f == 1 && $str[$i] == 0) 813 | { 814 | $out[$i] = ""; 815 | } 816 | else 817 | { 818 | $out[$i] = $char[$str[$i]]; 819 | } 820 | $out[$i] .= $str[$i] != '0' ? $unit[$i % 4] : ''; 821 | if ($i > 1 and $str[$i] + $str[$i - 1] == 0) 822 | { 823 | $out[$i] = ''; 824 | } 825 | if ($i % 4 == 0) 826 | { 827 | $out[$i] .= $unit[4 + floor($i / 4)]; 828 | } 829 | } 830 | $retval = join('', array_reverse($out)) . 
/**
 * Inverse of http_build_query(): split a query string into parameters.
 *
 * Names and values are returned exactly as they appear in the input; no
 * URL-decoding is performed. Each pair is split on its first '=' only, so
 * a value may itself contain '='. A parameter given without '=' is
 * returned with a null value, and empty segments (as in "a=1&&b=2") are
 * skipped.
 *
 * @param string $query    a full URL, or a bare query string when $is_query is true
 * @param bool   $is_query true when $query already is the query string
 * @return array map of parameter name => value
 * @author seatle
 * @created time :2016-05-16 17:29
 */
public static function http_split_query($query, $is_query = false)
{
    if (!$is_query)
    {
        $parse_arr = parse_url($query);
        if (empty($parse_arr['query']))
        {
            return array();
        }
        $query = $parse_arr['query'];
    }

    $params = array();
    foreach (explode("&", $query) as $pair)
    {
        if ($pair === '')
        {
            continue;
        }
        $parts = explode("=", $pair, 2);
        $params[$parts[0]] = isset($parts[1]) ? $parts[1] : null;
    }
    return $params;
}
/**
 * Signal dispatcher. Registered as a callback through pcntl_signal(),
 * which is why it has to be public.
 *
 *   SIGINT  -> stop_all()
 *   SIGUSR1 -> prints "reload"
 *   SIGUSR2 -> prints "status"
 * Any other signal is ignored.
 *
 * @param int $signal signal number
 */
public function signal_handler($signal)
{
    // stop
    if ($signal == SIGINT)
    {
        $this->stop_all();
        return;
    }
    // reload
    if ($signal == SIGUSR1)
    {
        echo "reload\n";
        return;
    }
    // show status
    if ($signal == SIGUSR2)
    {
        echo "status\n";
    }
}
stream_socket_pair(STREAM_PF_UNIX, STREAM_SOCK_STREAM, STREAM_IPPROTO_IP); 149 | $pid = pcntl_fork(); 150 | 151 | // 主进程记录子进程pid 152 | if($pid > 0) 153 | { 154 | self::$_worker_pids[$worker_id] = $pid; 155 | } 156 | // 子进程运行 157 | elseif(0 === $pid) 158 | { 159 | $this->time_start = microtime(true); 160 | $this->worker_id = $worker_id; 161 | $this->worker_pid = posix_getpid(); 162 | $this->set_process_title($this->title); 163 | $this->set_process_user($this->user); 164 | // 清空master进程克隆过来的worker进程ID 165 | self::$_worker_pids = array(); 166 | //$this->uninstall_signal(); 167 | 168 | // 设置worker进程的运行状态为运行中 169 | self::$_status = "running"; 170 | 171 | // 注册进程退出回调,用来检查是否有错误(子进程里面注册) 172 | register_shutdown_function(array($this, 'check_errors')); 173 | 174 | // 如果设置了worker进程启动回调函数 175 | if ($this->on_worker_start) 176 | { 177 | call_user_func($this->on_worker_start, $this); 178 | } 179 | 180 | // 停止当前worker实例 181 | $this->stop(); 182 | // 这里用0表示正常退出 183 | exit(0); 184 | } 185 | else 186 | { 187 | log::add("fork one worker fail", "Error"); 188 | exit; 189 | } 190 | } 191 | 192 | /** 193 | * 尝试设置运行当前进程的用户 194 | * 195 | * @param $user_name 196 | */ 197 | protected static function set_process_user($user_name) 198 | { 199 | // 用户名为空 或者 当前用户不是root用户 200 | if(empty($user_name) || posix_getuid() !== 0) 201 | { 202 | return; 203 | } 204 | $user_info = posix_getpwnam($user_name); 205 | if($user_info['uid'] != posix_getuid() || $user_info['gid'] != posix_getgid()) 206 | { 207 | if(!posix_setgid($user_info['gid']) || !posix_setuid($user_info['uid'])) 208 | { 209 | log::add('Can not run woker as '.$user_name." 
, You shuld be root", "Error"); 210 | } 211 | } 212 | } 213 | 214 | /** 215 | * 设置当前进程的名称,在ps aux命令中有用 216 | * 注意 需要php>=5.5或者安装了protitle扩展 217 | * @param string $title 218 | * @return void 219 | */ 220 | protected function set_process_title($title) 221 | { 222 | if (!empty($title)) 223 | { 224 | // 需要扩展 225 | if(extension_loaded('proctitle') && function_exists('setproctitle')) 226 | { 227 | @setproctitle($title); 228 | } 229 | // >=php 5.5 230 | elseif (function_exists('cli_set_process_title')) 231 | { 232 | cli_set_process_title($title); 233 | } 234 | } 235 | } 236 | 237 | /** 238 | * 监控所有子进程的退出事件及退出码 239 | * @return void 240 | */ 241 | public function monitor_workers() 242 | { 243 | // 设置master进程的运行状态为运行中 244 | self::$_status = "running"; 245 | while(1) 246 | { 247 | // pcntl_signal_dispatch 子进程无法接受到信号 248 | // 如果有信号到来,尝试触发信号处理函数 249 | //pcntl_signal_dispatch(); 250 | // 挂起进程,直到有子进程退出或者被信号打断 251 | $status = 0; 252 | $pid = pcntl_wait($status, WUNTRACED); 253 | // 如果有信号到来,尝试触发信号处理函数 254 | //pcntl_signal_dispatch(); 255 | 256 | // 子进程退出信号 257 | if($pid > 0) 258 | { 259 | //echo "worker[".$pid."] stop\n"; 260 | //$this->stop(); 261 | 262 | // 如果不是正常退出,是被kill等杀掉的 263 | if($status !== 0) 264 | { 265 | log::add("worker {$pid} exit with status $status", "Warning"); 266 | } 267 | 268 | // key 和 value 互换 269 | $worker_pids = array_flip(self::$_worker_pids); 270 | // 通过 pid 得到 worker_id 271 | $worker_id = $worker_pids[$pid]; 272 | // 这里不unset掉,是为了进程重启 273 | self::$_worker_pids[$worker_id] = 0; 274 | //unset(self::$_worker_pids[$pid]); 275 | 276 | // 再生成一个worker 277 | if (!$this->run_once) 278 | { 279 | $this->fork_one_worker($worker_id); 280 | } 281 | 282 | // 如果所有子进程都退出了,触发主进程退出函数 283 | $all_worker_stop = true; 284 | foreach (self::$_worker_pids as $_worker_pid) 285 | { 286 | // 只要有一个worker进程还存在进程ID,就不算退出 287 | if ($_worker_pid != 0) 288 | { 289 | $all_worker_stop = false; 290 | } 291 | } 292 | if ($all_worker_stop) 293 | { 294 | if ($this->on_stop) 295 | { 296 | 
call_user_func($this->on_stop, $this); 297 | } 298 | exit(0); 299 | } 300 | } 301 | // 其他信号 302 | else 303 | { 304 | // worker进程接受到master进行信号退出的,会到这里来 305 | if ($this->on_stop) 306 | { 307 | call_user_func($this->on_stop, $this); 308 | } 309 | exit(0); 310 | } 311 | } 312 | } 313 | 314 | /** 315 | * 执行关闭流程(所有进程) 316 | * 事件触发,非正常程序执行完毕 317 | * @return void 318 | */ 319 | public function stop_all() 320 | { 321 | // 设置master、worker进程的运行状态为关闭状态 322 | self::$_status = "shutdown"; 323 | // master进程 324 | if(self::$_master_pid === posix_getpid()) 325 | { 326 | // 循环给worker进程发送关闭信号 327 | foreach (self::$_worker_pids as $worker_pid) 328 | { 329 | posix_kill($worker_pid, SIGINT); 330 | } 331 | } 332 | // worker进程 333 | else 334 | { 335 | // 接收到master进程发送的关闭信号之后退出,这里应该考虑业务的完整性,不能强行exit 336 | $this->stop(); 337 | exit(0); 338 | } 339 | } 340 | 341 | /** 342 | * 停止当前worker实例 343 | * 正常运行结束和接受信号退出,都会调用这个方法 344 | * @return void 345 | */ 346 | public function stop() 347 | { 348 | if ($this->on_worker_stop) 349 | { 350 | call_user_func($this->on_worker_stop, $this); 351 | } 352 | // 设置worker进程的运行状态为关闭 353 | self::$_status = "shutdown"; 354 | } 355 | 356 | /** 357 | * 检查错误,PHP exit之前会执行 358 | * @return void 359 | */ 360 | public function check_errors() 361 | { 362 | // 如果当前worker进程不是正常退出 363 | if(self::$_status != "shutdown") 364 | { 365 | $error_msg = "WORKER EXIT UNEXPECTED "; 366 | $errors = error_get_last(); 367 | if($errors && ($errors['type'] === E_ERROR || 368 | $errors['type'] === E_PARSE || 369 | $errors['type'] === E_CORE_ERROR || 370 | $errors['type'] === E_COMPILE_ERROR || 371 | $errors['type'] === E_RECOVERABLE_ERROR )) 372 | { 373 | $error_msg .= $this->get_error_type($errors['type']) . 
" {$errors['message']} in {$errors['file']} on line {$errors['line']}"; 374 | } 375 | log::add($error_msg, 'Error'); 376 | } 377 | } 378 | 379 | /** 380 | * 获取错误类型对应的意义 381 | * @param integer $type 382 | * @return string 383 | */ 384 | protected function get_error_type($type) 385 | { 386 | switch($type) 387 | { 388 | case E_ERROR: // 1 // 389 | return 'E_ERROR'; 390 | case E_WARNING: // 2 // 391 | return 'E_WARNING'; 392 | case E_PARSE: // 4 // 393 | return 'E_PARSE'; 394 | case E_NOTICE: // 8 // 395 | return 'E_NOTICE'; 396 | case E_CORE_ERROR: // 16 // 397 | return 'E_CORE_ERROR'; 398 | case E_CORE_WARNING: // 32 // 399 | return 'E_CORE_WARNING'; 400 | case E_COMPILE_ERROR: // 64 // 401 | return 'E_COMPILE_ERROR'; 402 | case E_COMPILE_WARNING: // 128 // 403 | return 'E_COMPILE_WARNING'; 404 | case E_USER_ERROR: // 256 // 405 | return 'E_USER_ERROR'; 406 | case E_USER_WARNING: // 512 // 407 | return 'E_USER_WARNING'; 408 | case E_USER_NOTICE: // 1024 // 409 | return 'E_USER_NOTICE'; 410 | case E_STRICT: // 2048 // 411 | return 'E_STRICT'; 412 | case E_RECOVERABLE_ERROR: // 4096 // 413 | return 'E_RECOVERABLE_ERROR'; 414 | case E_DEPRECATED: // 8192 // 415 | return 'E_DEPRECATED'; 416 | case E_USER_DEPRECATED: // 16384 // 417 | return 'E_USER_DEPRECATED'; 418 | } 419 | return ""; 420 | } 421 | } 422 | -------------------------------------------------------------------------------- /demo/13384.php: -------------------------------------------------------------------------------- 1 | '13384美女图', 18 | 'tasknum' => 1, 19 | //'multiserver' => true, 20 | 'log_show' => true, 21 | //'save_running_state' => false, 22 | 'domains' => array( 23 | 'www.13384.com' 24 | ), 25 | 'scan_urls' => array( 26 | "http://www.13384.com/qingchunmeinv/", 27 | "http://www.13384.com/xingganmeinv/", 28 | "http://www.13384.com/mingxingmeinv/", 29 | "http://www.13384.com/siwameitui/", 30 | "http://www.13384.com/meinvmote/", 31 | "http://www.13384.com/weimeixiezhen/", 32 | ), 33 | 
'list_url_regexes' => array( 34 | "http://www.13384.com/qingchunmeinv/index_\d+.html", 35 | "http://www.13384.com/xingganmeinv/index_\d+.html", 36 | "http://www.13384.com/mingxingmeinv/index_\d+.html", 37 | "http://www.13384.com/siwameitui/index_\d+.html", 38 | "http://www.13384.com/meinvmote/index_\d+.html", 39 | "http://www.13384.com/weimeixiezhen/index_\d+.html", 40 | ), 41 | 'content_url_regexes' => array( 42 | "http://www.13384.com/qingchunmeinv/\d+.html", 43 | "http://www.13384.com/xingganmeinv/\d+.html", 44 | "http://www.13384.com/mingxingmeinv/\d+.html", 45 | "http://www.13384.com/siwameitui/\d+.html", 46 | "http://www.13384.com/meinvmote/\d+.html", 47 | "http://www.13384.com/weimeixiezhen/\d+.html", 48 | ), 49 | //'export' => array( 50 | //'type' => 'db', 51 | //'table' => 'meinv_content', 52 | //), 53 | 'db_config' => array( 54 | 'host' => '127.0.0.1', 55 | 'port' => 3306, 56 | 'user' => 'root', 57 | 'pass' => 'root', 58 | 'name' => 'qiushibaike', 59 | ), 60 | 'fields' => array( 61 | // 标题 62 | array( 63 | 'name' => "name", 64 | 'selector' => "//div[@id='Article']//h1", 65 | 'required' => true, 66 | ), 67 | // 分类 68 | array( 69 | 'name' => "category", 70 | 'selector' => "//div[contains(@class,'crumbs')]//span//a", 71 | 'required' => true, 72 | ), 73 | // 发布时间 74 | array( 75 | 'name' => "addtime", 76 | 'selector' => "//p[contains(@class,'sub-info')]//span", 77 | 'required' => true, 78 | ), 79 | // API URL 80 | array( 81 | 'name' => "url", 82 | 'selector' => "//p[contains(@class,'sub-info')]//span", 83 | 'required' => true, 84 | ), 85 | // 图片 86 | array( 87 | 'name' => "image", 88 | 'selector' => "//*[@id='big-pic']//a//img", 89 | 'required' => true, 90 | ), 91 | // 内容 92 | array( 93 | 'name' => "content", 94 | 'selector' => "//div[@id='pages']//a//@href", 95 | 'repeated' => true, 96 | 'required' => true, 97 | 'children' => array( 98 | array( 99 | // 抽取出其他分页的url待用 100 | 'name' => 'content_page_url', 101 | 'selector' => "//text()" 102 | ), 103 | array( 104 | 
// 抽取其他分页的内容 105 | 'name' => 'page_content', 106 | // 发送 attached_url 请求获取其他的分页数据 107 | // attached_url 使用了上面抓取的 content_page_url 108 | 'source_type' => 'attached_url', 109 | 'attached_url' => 'content_page_url', 110 | 'selector' => "//*[@id='big-pic']//a//img" 111 | ), 112 | ), 113 | ), 114 | ), 115 | ); 116 | 117 | $spider = new phpspider($configs); 118 | 119 | 120 | $spider->on_start = function($phpspider) 121 | { 122 | $db_config = $phpspider->get_config("db_config"); 123 | //print_r($db_config); 124 | //exit; 125 | // 数据库连接 126 | db::set_connect('default', $db_config); 127 | db::_init(); 128 | }; 129 | 130 | $spider->on_extract_field = function($fieldname, $data, $page) 131 | { 132 | if ($fieldname == 'url') 133 | { 134 | $data = $page['request']['url']; 135 | } 136 | elseif ($fieldname == 'name') 137 | { 138 | $data = trim(preg_replace("#\(.*?\)#", "", $data)); 139 | } 140 | if ($fieldname == 'addtime') 141 | { 142 | $data = strtotime(substr($data, 0, 19)); 143 | } 144 | elseif ($fieldname == 'content') 145 | { 146 | $contents = $data; 147 | $array = array(); 148 | foreach ($contents as $content) 149 | { 150 | $url = $content['page_content']; 151 | // md5($url) 过滤重复的URL 152 | $array[md5($url)] = $url; 153 | 154 | //// 以纳秒为单位生成随机数 155 | //$filename = uniqid().".jpg"; 156 | //// 在data目录下生成图片 157 | //$filepath = PATH_ROOT."/images/{$filename}"; 158 | //// 用系统自带的下载器wget下载 159 | //exec("wget -q {$url} -O {$filepath}"); 160 | //$array[] = $filename; 161 | } 162 | $data = implode(",", $array); 163 | } 164 | return $data; 165 | }; 166 | 167 | $category = array( 168 | '丝袜美女' => 'siwameitui', 169 | '唯美写真' => 'weimeixiezhen', 170 | '性感美女' => 'xingganmeinv', 171 | '明星美女' => 'mingxingmeinv', 172 | '清纯美女' => 'qingchunmeinv', 173 | '美女模特' => 'meinvmote', 174 | ); 175 | 176 | $spider->on_extract_page = function($page, $data) use ($category) 177 | { 178 | if (!isset($category[$data['category']])) 179 | { 180 | return false; 181 | } 182 | 183 | $data['dir'] = 
$category[$data['category']]; 184 | $data['content'] = $data['image'].','.$data['content']; 185 | $data['image'] = str_replace("ocnt0imhl.bkt.clouddn.com", "file.13384.com", $data['image']); 186 | $data['image'] = $data['image']."?imageView2/1/w/320/h/420"; 187 | $data['content'] = str_replace("ocnt0imhl.bkt.clouddn.com", "file.13384.com", $data['content']); 188 | $sql = "Select Count(*) As `count` From `meinv_content` Where `name`='{$data['name']}'"; 189 | $row = db::get_one($sql); 190 | if (!$row['count']) 191 | { 192 | db::insert("meinv_content", $data); 193 | } 194 | return $data; 195 | }; 196 | 197 | $spider->start(); 198 | 199 | -------------------------------------------------------------------------------- /demo/52mnw.php: -------------------------------------------------------------------------------- 1 | '52mnw美女图', 10 | //'tasknum' => 8, 11 | 'log_show' => true, 12 | 'save_running_state' => false, 13 | 'domains' => array( 14 | 'm.52mnw.cn' 15 | ), 16 | 'scan_urls' => array( 17 | "http://m.52mnw.cn/ikaimi/morepic.php?classid=6,7,8,10,11,15&line=10&order=newstime&page=1", 18 | ), 19 | 'list_url_regexes' => array( 20 | ), 21 | 'content_url_regexes' => array( 22 | "http://m.52mnw.cn/photo/\d+.html", 23 | ), 24 | 'export' => array( 25 | 'type' => 'db', 26 | 'table' => 'meinv_content', 27 | ), 28 | 'db_config' => array( 29 | 'host' => '127.0.0.1', 30 | 'port' => 3306, 31 | 'user' => 'root', 32 | 'pass' => 'root', 33 | 'name' => 'qiushibaike', 34 | ), 35 | 'fields' => array( 36 | // 标题 37 | array( 38 | 'name' => "name", 39 | 'selector' => "//title", 40 | 'required' => true, 41 | ), 42 | // 分类 43 | array( 44 | 'name' => "category", 45 | 'selector' => "//div[contains(@class,'header')]//span", 46 | 'required' => true, 47 | ), 48 | // 发布时间 49 | array( 50 | 'name' => "addtime", 51 | 'selector' => "//div[contains(@class,'content-msg')]", 52 | //'required' => true, 53 | ), 54 | // 图片 55 | array( 56 | 'name' => "image", 57 | 'selector' => 
"//li[contains(@class,'swiper-slide')]//img/@lazysrc", 58 | 'required' => true, 59 | 'repeated' => true, 60 | ), 61 | ), 62 | ); 63 | 64 | $spider = new phpspider($configs); 65 | 66 | $spider->on_start = function($phpspider) 67 | { 68 | for ($i = 2; $i <= 932; $i++) 69 | { 70 | $url = "http://m.52mnw.cn/ikaimi/morepic.php?classid=6,7,8,10,11,15&line=10&order=newstime&page={$i}"; 71 | $phpspider->add_scan_url($url); 72 | } 73 | }; 74 | 75 | $spider->on_extract_field = function($fieldname, $data, $page) 76 | { 77 | if ($fieldname == 'name') 78 | { 79 | $data = str_replace("-我爱美女网手机版", "", $data); 80 | } 81 | elseif ($fieldname == 'addtime') 82 | { 83 | $data = time(); 84 | } 85 | return $data; 86 | }; 87 | 88 | $categorys = array( 89 | '性感美女' => array( 90 | 'dir' => 'xingganmeinv', 91 | 'name' => '性感美女', 92 | ), 93 | '女星写真' => array( 94 | 'dir' => 'mingxingmeinv', 95 | 'name' => '明星美女', 96 | ), 97 | '高清美女' => array( 98 | 'dir' => 'qingchunmeinv', 99 | 'name' => '清纯美女', 100 | ), 101 | '模特美女' => array( 102 | 'dir' => 'meinvmote', 103 | 'name' => '美女模特', 104 | ), 105 | '丝袜美腿' => array( 106 | 'dir' => 'siwameitui', 107 | 'name' => '丝袜美女', 108 | ), 109 | '唯美写真' => array( 110 | 'dir' => 'weimeixiezhen', 111 | 'name' => '唯美写真', 112 | ), 113 | ); 114 | $spider->on_extract_page = function($page, $data) use ($categorys) 115 | { 116 | if (!isset($categorys[$data['category']])) 117 | { 118 | return false; 119 | } 120 | $data['dir'] = $categorys[$data['category']]['dir']; 121 | $data['category'] = $categorys[$data['category']]['name']; 122 | $data['content'] = implode(",", $data['image']); 123 | $data['image'] = $data['image'][0]; 124 | 125 | //$data['dir'] = $category[$data['category']]; 126 | //$data['content'] = $data['image'].','.$data['content']; 127 | //$sql = "Select Count(*) As `count` From `meinv_content` Where `name`='{$data['name']}'"; 128 | //$row = db::get_one($sql); 129 | //if (!$row['count']) 130 | //{ 131 | //db::insert("meinv_content", $data); 132 | //} 133 | 
return $data; 134 | }; 135 | 136 | $spider->start(); 137 | -------------------------------------------------------------------------------- /demo/mafengwo.php: -------------------------------------------------------------------------------- 1 | '马蜂窝', 11 | 'tasknum' => 1, 12 | //'save_running_state' => true, 13 | 'log_show' => true, 14 | 'domains' => array( 15 | 'www.mafengwo.cn' 16 | ), 17 | 'scan_urls' => array( 18 | "http://www.mafengwo.cn/travel-scenic-spot/mafengwo/10088.html", // 随便定义一个入口,要不然会报没有入口url错误,但是这里其实没用 19 | ), 20 | 'list_url_regexes' => array( 21 | "http://www.mafengwo.cn/mdd/base/list/pagedata_citylist\?page=\d+", // 城市列表页 22 | "http://www.mafengwo.cn/gonglve/ajax.php\?act=get_travellist\&mddid=\d+", // 文章列表页 23 | ), 24 | 'content_url_regexes' => array( 25 | "http://www.mafengwo.cn/i/\d+.html", 26 | ), 27 | //'export' => array( 28 | //'type' => 'db', 29 | //'table' => 'mafengwo_content', 30 | //), 31 | 'fields' => array( 32 | // 标题 33 | array( 34 | 'name' => "name", 35 | 'selector' => "//h1[contains(@class,'headtext')]", 36 | //'selector' => "//div[@id='Article']//h1", 37 | 'required' => true, 38 | ), 39 | // 分类 40 | array( 41 | 'name' => "city", 42 | 'selector' => "//div[contains(@class,'relation_mdd')]//a", 43 | 'required' => true, 44 | ), 45 | // 出发时间 46 | array( 47 | 'name' => "date", 48 | 'selector' => "//li[contains(@class,'time')]", 49 | 'required' => true, 50 | ), 51 | ), 52 | ); 53 | 54 | $spider = new phpspider($configs); 55 | 56 | $spider->on_start = function($phpspider) 57 | { 58 | requests::set_header('Referer','http://www.mafengwo.cn/mdd/citylist/21536.html'); 59 | }; 60 | 61 | $spider->on_scan_page = function($page, $content, $phpspider) 62 | { 63 | //for ($i = 0; $i < 298; $i++) 64 | //测试的时候先采集一个国家,要不然等的时间太长 65 | for ($i = 0; $i < 1; $i++) 66 | { 67 | // 全国热点城市 68 | $url = "http://www.mafengwo.cn/mdd/base/list/pagedata_citylist?page={$i}"; 69 | $options = array( 70 | 'method' => 'post', 71 | 'params' => array( 72 | 'mddid'=>21536, 
73 | 'page'=>$i, 74 | ) 75 | ); 76 | $phpspider->add_url($url, $options); 77 | } 78 | }; 79 | 80 | $spider->on_list_page = function($page, $content, $phpspider) 81 | { 82 | // 如果是城市列表页 83 | if (preg_match("#pagedata_citylist#", $page['request']['url'])) 84 | { 85 | $data = json_decode($content, true); 86 | $html = $data['list']; 87 | preg_match_all('# 'post', 95 | 'params' => array( 96 | 'mddid'=>$v, 97 | 'pageid'=>'mdd_index', 98 | 'sort'=>1, 99 | 'cost'=>0, 100 | 'days'=>0, 101 | 'month'=>0, 102 | 'tagid'=>0, 103 | 'page'=>1, 104 | ) 105 | ); 106 | $phpspider->add_url($url, $options); 107 | } 108 | } 109 | } 110 | // 如果是文章列表页 111 | else 112 | { 113 | $data = json_decode($content, true); 114 | $html = $data['list']; 115 | // 遇到第一页的时候,获取分页数,把其他分页全部入队列 116 | if ($page['request']['params']['page'] == 1) 117 | { 118 | $data_page = trim($data['page']); 119 | if (!empty($data_page)) 120 | { 121 | preg_match('#(.*?)页#', $data_page, $out); 122 | for ($i = 0; $i < $out[1]; $i++) 123 | { 124 | $v = $page['request']['params']['mddid']; 125 | $url = "http://www.mafengwo.cn/gonglve/ajax.php?act=get_travellist&mddid={$v}&page={$i}"; 126 | $options = array( 127 | 'method' => 'post', 128 | 'params' => array( 129 | 'mddid'=>$v, 130 | 'pageid'=>'mdd_index', 131 | 'sort'=>1, 132 | 'cost'=>0, 133 | 'days'=>0, 134 | 'month'=>0, 135 | 'tagid'=>0, 136 | 'page'=>$i, 137 | ) 138 | ); 139 | $phpspider->add_url($url, $options); 140 | } 141 | } 142 | } 143 | 144 | // 获取内容页 145 | preg_match_all('##', $html, $out); 146 | if (!empty($out[1])) 147 | { 148 | foreach ($out[1] as $v) 149 | { 150 | $url = "http://www.mafengwo.cn/i/{$v}.html"; 151 | $phpspider->add_url($url); 152 | } 153 | } 154 | 155 | } 156 | }; 157 | 158 | $spider->on_extract_field = function($fieldname, $data, $page) 159 | { 160 | if ($fieldname == 'date') 161 | { 162 | $data = trim(str_replace(array("出发时间","/"),"", strip_tags($data))); 163 | } 164 | return $data; 165 | }; 166 | 167 | $spider->start(); 168 | 
-------------------------------------------------------------------------------- /demo/mafengwo.sql: -------------------------------------------------------------------------------- 1 | # ************************************************************ 2 | # Sequel Pro SQL dump 3 | # Version 4541 4 | # 5 | # http://www.sequelpro.com/ 6 | # https://github.com/sequelpro/sequelpro 7 | # 8 | # Host: 203.195.132.34 (MySQL 5.6.33-0ubuntu0.14.04.1) 9 | # Database: phpspider 10 | # Generation Time: 2016-10-11 08:48:31 +0000 11 | # ************************************************************ 12 | 13 | 14 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 15 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 16 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 17 | /*!40101 SET NAMES utf8 */; 18 | /*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; 19 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; 20 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 21 | 22 | 23 | # Dump of table mafengwo_content3 24 | # ------------------------------------------------------------ 25 | 26 | DROP TABLE IF EXISTS `mafengwo_content`; 27 | 28 | CREATE TABLE `mafengwo_content` ( 29 | `id` int(11) unsigned NOT NULL AUTO_INCREMENT COMMENT 'ID', 30 | `city` varchar(20) DEFAULT NULL COMMENT '城市', 31 | `name` varchar(50) DEFAULT NULL COMMENT '标题', 32 | `date` date DEFAULT NULL COMMENT '出发日期', 33 | `up` int(11) DEFAULT NULL COMMENT '顶', 34 | `pv` int(11) DEFAULT NULL COMMENT '浏览次数', 35 | `fav` int(11) DEFAULT NULL COMMENT '收藏', 36 | `share` int(11) DEFAULT NULL COMMENT '分享', 37 | `pic` int(11) DEFAULT NULL COMMENT '图片数目', 38 | PRIMARY KEY (`id`) 39 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 40 | 41 | 42 | 43 | 44 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 45 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 46 | /*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; 47 | /*!40101 SET 
CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 48 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 49 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 50 | -------------------------------------------------------------------------------- /demo/qiushibaike.php: -------------------------------------------------------------------------------- 1 | '糗事百科', 17 | 'log_show' => true, 18 | 'tasknum' => 1, 19 | 'save_running_state' => true, 20 | 'domains' => array( 21 | 'qiushibaike.com', 22 | 'www.qiushibaike.com' 23 | ), 24 | 'scan_urls' => array( 25 | 'http://www.qiushibaike.com/' 26 | ), 27 | 'list_url_regexes' => array( 28 | "http://www.qiushibaike.com/8hr/page/\d+\?s=\d+" 29 | ), 30 | 'content_url_regexes' => array( 31 | "http://www.qiushibaike.com/article/\d+", 32 | ), 33 | 'max_try' => 5, 34 | //'proxies' => array( 35 | //'http://H784U84R444YABQD:57A8B0B743F9B4D2@proxy.abuyun.com:9010' 36 | //), 37 | //'export' => array( 38 | //'type' => 'csv', 39 | //'file' => '../data/qiushibaike.csv', 40 | //), 41 | //'export' => array( 42 | //'type' => 'sql', 43 | //'file' => '../data/qiushibaike.sql', 44 | //'table' => 'content', 45 | //), 46 | //'export' => array( 47 | //'type' => 'db', 48 | //'table' => 'content', 49 | //), 50 | //'db_config' => array( 51 | //'host' => '127.0.0.1', 52 | //'port' => 3306, 53 | //'user' => 'root', 54 | //'pass' => 'root', 55 | //'name' => 'qiushibaike', 56 | //), 57 | 'queue_config' => array( 58 | 'host' => '127.0.0.1', 59 | 'port' => 6379, 60 | 'pass' => 'foobared', 61 | 'db' => 5, 62 | 'prefix' => 'phpspider', 63 | 'timeout' => 30, 64 | ), 65 | 'fields' => array( 66 | array( 67 | 'name' => "article_title", 68 | 'selector' => "//*[@id='single-next-link']//div[contains(@class,'content')]/text()[1]", 69 | 'required' => true, 70 | ), 71 | array( 72 | 'name' => "article_author", 73 | 'selector' => "//div[contains(@class,'author')]//h2", 74 | 'required' => true, 75 | ), 76 | array( 77 | 'name' => 
"article_headimg", 78 | 'selector' => "//div[contains(@class,'author')]//a[1]", 79 | 'required' => true, 80 | ), 81 | array( 82 | 'name' => "article_content", 83 | 'selector' => "//*[@id='single-next-link']//div[contains(@class,'content')]", 84 | 'required' => true, 85 | ), 86 | array( 87 | 'name' => "article_publish_time", 88 | 'selector' => "//div[contains(@class,'author')]//h2", 89 | 'required' => true, 90 | ), 91 | array( 92 | 'name' => "url", 93 | 'selector' => "//div[contains(@class,'author')]//h2", // 这里随便设置,on_extract_field回调里面会替换 94 | 'required' => true, 95 | ), 96 | ), 97 | ); 98 | 99 | $spider = new phpspider($configs); 100 | 101 | $spider->on_handle_img = function($fieldname, $img) 102 | { 103 | $regex = '/src="(https?:\/\/.*?)"/i'; 104 | preg_match($regex, $img, $rs); 105 | if (!$rs) 106 | { 107 | return $img; 108 | } 109 | 110 | $url = $rs[1]; 111 | $img = $url; 112 | 113 | //$pathinfo = pathinfo($url); 114 | //$fileext = $pathinfo['extension']; 115 | //if (strtolower($fileext) == 'jpeg') 116 | //{ 117 | //$fileext = 'jpg'; 118 | //} 119 | //// 以纳秒为单位生成随机数 120 | //$filename = uniqid().".".$fileext; 121 | //// 在data目录下生成图片 122 | //$filepath = PATH_ROOT."/images/{$filename}"; 123 | //// 用系统自带的下载器wget下载 124 | //exec("wget -q {$url} -O {$filepath}"); 125 | 126 | //// 替换成真是图片url 127 | //$img = str_replace($url, $filename, $img); 128 | return $img; 129 | }; 130 | 131 | $spider->on_extract_field = function($fieldname, $data, $page) 132 | { 133 | if ($fieldname == 'article_title') 134 | { 135 | if (strlen($data) > 10) 136 | { 137 | $data = mb_strcut($data, 0, 10, 'UTF-8')."..."; 138 | $data = trim($data); 139 | } 140 | } 141 | elseif ($fieldname == 'article_publish_time') 142 | { 143 | // 用当前采集时间戳作为发布时间 144 | $data = time(); 145 | } 146 | // 把当前内容页URL替换上面的field 147 | elseif ($fieldname == 'url') 148 | { 149 | $data = $page['url']; 150 | } 151 | return $data; 152 | }; 153 | 154 | $spider->start(); 155 | 156 | 157 | 
-------------------------------------------------------------------------------- /demo/qiushibaike.sql: -------------------------------------------------------------------------------- 1 | # ************************************************************ 2 | # Sequel Pro SQL dump 3 | # Version 4541 4 | # 5 | # http://www.sequelpro.com/ 6 | # https://github.com/sequelpro/sequelpro 7 | # 8 | # Host: 127.0.0.1 (MySQL 5.7.14) 9 | # Database: demo 10 | # Generation Time: 2016-10-20 16:55:11 +0000 11 | # ************************************************************ 12 | 13 | 14 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 15 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 16 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 17 | /*!40101 SET NAMES utf8 */; 18 | /*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; 19 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; 20 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 21 | 22 | 23 | # Dump of table content 24 | # ------------------------------------------------------------ 25 | 26 | DROP TABLE IF EXISTS `content`; 27 | 28 | CREATE TABLE `content` ( 29 | `id` int(11) unsigned NOT NULL AUTO_INCREMENT, 30 | `depth` int(11) DEFAULT NULL, 31 | `url` varchar(200) DEFAULT NULL, 32 | `article_title` varchar(20) DEFAULT NULL, 33 | `article_headimg` varchar(150) DEFAULT NULL, 34 | `article_author` varchar(20) DEFAULT NULL, 35 | `article_content` text, 36 | `article_publish_time` int(10) DEFAULT NULL, 37 | PRIMARY KEY (`id`) 38 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 39 | 40 | 41 | 42 | 43 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 44 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 45 | /*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; 46 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 47 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 48 | /*!40101 SET 
COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 49 | -------------------------------------------------------------------------------- /demo/qiushibaike_css_selector.php: -------------------------------------------------------------------------------- 1 | '糗事百科CSS选择器示例', 10 | //'tasknum' => 8, 11 | 'interval' => 350, 12 | 'log_show' => true, 13 | 'domains' => array( 14 | 'qiushibaike.com', 15 | 'www.qiushibaike.com' 16 | ), 17 | 'scan_urls' => array( 18 | 'http://www.qiushibaike.com/article/117568316', 19 | 'http://www.qiushibaike.com/' 20 | ), 21 | 'list_url_regexes' => array( 22 | "http://www.qiushibaike.com/8hr/page/\d+\?s=\d+" 23 | ), 24 | 'content_url_regexes' => array( 25 | "http://www.qiushibaike.com/article/\d+", 26 | ), 27 | 'proxies' => array( 28 | 'http://H784U84R444YABQD:57A8B0B743F9B4D2@proxy.abuyun.com:9010' 29 | ), 30 | 'max_try' => 5, 31 | //'export' => array( 32 | //'type' => 'db', 33 | //'table' => 'content', 34 | //), 35 | 'fields' => array( 36 | array( 37 | 'name' => "article_title", 38 | 'selector' => "#single-next-link > div.content", 39 | 'selector_type' => 'css', 40 | 'required' => true, 41 | ), 42 | array( 43 | 'name' => "article_author", 44 | 'selector' => "div.author > a > h2", 45 | 'selector_type' => 'css', 46 | 'required' => true, 47 | ), 48 | //array( 49 | //'name' => "article_headimg", 50 | //'selector' => "//div.author > a:eq(0)", 51 | //'selector_type' => 'css', 52 | //'required' => true, 53 | //), 54 | //array( 55 | //'name' => "article_content", 56 | //'selector' => "#single-next-link > div.content", 57 | //'selector_type' => 'css', 58 | //'required' => true, 59 | //), 60 | //array( 61 | //'name' => "article_publish_time", 62 | //'selector' => "div.author > a > h2", // 这里随便设置,on_extract_field回调里面会替换 63 | //'selector_type' => 'css', 64 | //'required' => true, 65 | //), 66 | //array( 67 | //'name' => "url", 68 | //'selector' => "div.author > a > h2", // 这里随便设置,on_extract_field回调里面会替换 69 | //'selector_type' => 'css', 70 | 
//'required' => true, 71 | //), 72 | //array( 73 | //'name' => "depth", 74 | //'selector' => "div.author > a > h2", // 这里随便设置,on_extract_field回调里面会替换 75 | //'selector_type' => 'css', 76 | //'required' => true, 77 | //), 78 | ), 79 | ); 80 | 81 | $spider = new phpspider($configs); 82 | 83 | $spider->on_handle_img = function($fieldname, $img) 84 | { 85 | $regex = '/src="(https?:\/\/.*?)"/i'; 86 | preg_match($regex, $img, $rs); 87 | if (!$rs) 88 | { 89 | return $img; 90 | } 91 | 92 | $url = $rs[1]; 93 | $img = $url; 94 | 95 | //$pathinfo = pathinfo($url); 96 | //$fileext = $pathinfo['extension']; 97 | //if (strtolower($fileext) == 'jpeg') 98 | //{ 99 | //$fileext = 'jpg'; 100 | //} 101 | //// 以纳秒为单位生成随机数 102 | //$filename = uniqid().".".$fileext; 103 | //// 在data目录下生成图片 104 | //$filepath = PATH_ROOT."/images/{$filename}"; 105 | //// 用系统自带的下载器wget下载 106 | //exec("wget -q {$url} -O {$filepath}"); 107 | 108 | //// 替换成真是图片url 109 | //$img = str_replace($url, $filename, $img); 110 | return $img; 111 | }; 112 | 113 | $spider->on_extract_field = function($fieldname, $data, $page) 114 | { 115 | if ($fieldname == 'article_title') 116 | { 117 | $data = trim($data); 118 | if (strlen($data) > 10) 119 | { 120 | // 下面方法截取中文会有乱码 121 | //$data = substr($data, 0, 10)."..."; 122 | $data = mb_substr($data, 0, 10, 'UTF-8')."..."; 123 | } 124 | } 125 | elseif ($fieldname == 'article_publish_time') 126 | { 127 | // 用当前采集时间戳作为发布时间 128 | $data = time(); 129 | } 130 | // 把当前内容页URL替换上面的field 131 | elseif ($fieldname == 'url') 132 | { 133 | $data = $page['url']; 134 | } 135 | // 把当前内容页depth替换上面的field 136 | elseif ($fieldname == 'depth') 137 | { 138 | $data = $page['request']['depth']; 139 | } 140 | return $data; 141 | }; 142 | 143 | $spider->start(); 144 | 145 | 146 | -------------------------------------------------------------------------------- /demo/qiushibaike_task.php: -------------------------------------------------------------------------------- 1 | '糗事百科测试样例', 11 | //'log_show' => 
true, 12 | //'log_type' => 'error,debug', 13 | 'multiserver' => true, 14 | 'serverid' => 1, 15 | 'tasknum' => 3, 16 | //'save_running_state' => true, 17 | //'input_encoding' => 'utf-8', 18 | //'max_depth' => 3, 19 | 'domains' => array( 20 | 'qiushibaike.com', 21 | 'www.qiushibaike.com' 22 | ), 23 | 'scan_urls' => array( 24 | 'http://www.qiushibaike.com/', 25 | ), 26 | 'list_url_regexes' => array( 27 | "http://www.qiushibaike.com/8hr/page/\d+\?s=\d+" 28 | ), 29 | 'content_url_regexes' => array( 30 | "http://www.qiushibaike.com/article/\d+", 31 | ), 32 | 'max_try' => 5, 33 | 'proxies' => array( 34 | 'http://H784U84R444YABQD:57A8B0B743F9B4D2@proxy.abuyun.com:9010' 35 | ), 36 | //'export' => array( 37 | //'type' => 'csv', 38 | //'file' => '../data/qiushibaike.csv', 39 | //), 40 | //'export' => array( 41 | //'type' => 'sql', 42 | //'file' => '../data/qiushibaike.sql', 43 | //'table' => 'content', 44 | //), 45 | //'export' => array( 46 | //'type' => 'db', 47 | //'table' => 'content', 48 | //), 49 | //'db_config' => array( 50 | //'host' => '127.0.0.1', 51 | //'port' => 3306, 52 | //'user' => 'root', 53 | //'pass' => 'root', 54 | //'name' => 'qiushibaike', 55 | //), 56 | 'queue_config' => array( 57 | 'host' => '127.0.0.1', 58 | 'port' => 6379, 59 | 'pass' => '', 60 | 'db' => 5, 61 | 'prefix' => 'phpspider', 62 | 'timeout' => 30, 63 | ), 64 | 'fields' => array( 65 | array( 66 | 'name' => "article_title", 67 | 'selector' => "//*[@id='single-next-link']//div[contains(@class,'content')]/text()[1]", 68 | 'required' => true, 69 | ), 70 | array( 71 | 'name' => "article_author", 72 | 'selector' => "//div[contains(@class,'author')]//h2", 73 | 'required' => true, 74 | ), 75 | array( 76 | 'name' => "article_headimg", 77 | 'selector' => "//div[contains(@class,'author')]//a[1]", 78 | 'required' => true, 79 | ), 80 | array( 81 | 'name' => "article_content", 82 | 'selector' => "//*[@id='single-next-link']//div[contains(@class,'content')]", 83 | 'required' => true, 84 | ), 85 | array( 86 
| 'name' => "article_publish_time", 87 | 'selector' => "//div[contains(@class,'author')]//h2", 88 | 'required' => true, 89 | ), 90 | array( 91 | 'name' => "url", 92 | 'selector' => "//div[contains(@class,'author')]//h2", // 这里随便设置,on_extract_field回调里面会替换 93 | 'required' => true, 94 | ), 95 | array( 96 | 'name' => "depth", 97 | 'selector' => "//div[contains(@class,'author')]//h2", // 这里随便设置,on_extract_field回调里面会替换 98 | 'required' => true, 99 | ), 100 | array( 101 | 'name' => "taskid", 102 | 'selector' => "//div[contains(@class,'author')]//h2", // 这里随便设置,on_extract_field回调里面会替换 103 | 'required' => true, 104 | ), 105 | ), 106 | ); 107 | 108 | $spider = new phpspider($configs); 109 | 110 | $spider->on_start = function($phpspider) 111 | { 112 | }; 113 | 114 | $spider->on_handle_img = function($fieldname, $img) 115 | { 116 | $regex = '/src="(https?:\/\/.*?)"/i'; 117 | preg_match($regex, $img, $rs); 118 | if (!$rs) 119 | { 120 | return $img; 121 | } 122 | 123 | $url = $rs[1]; 124 | $img = $url; 125 | 126 | //$pathinfo = pathinfo($url); 127 | //$fileext = $pathinfo['extension']; 128 | //if (strtolower($fileext) == 'jpeg') 129 | //{ 130 | //$fileext = 'jpg'; 131 | //} 132 | //// 以纳秒为单位生成随机数 133 | //$filename = uniqid().".".$fileext; 134 | //// 在data目录下生成图片 135 | //$filepath = PATH_ROOT."/images/{$filename}"; 136 | //// 用系统自带的下载器wget下载 137 | //shell_exec("wget -q {$url} -O {$filepath}"); 138 | 139 | //// 替换成真是图片url 140 | //$img = str_replace($url, $filename, $img); 141 | return $img; 142 | }; 143 | 144 | $spider->on_extract_field = function($fieldname, $data, $page) 145 | { 146 | $encoding = util::get_encoding($page['raw']); 147 | if ($encoding == 'iso-8859-1') 148 | { 149 | //$data = mb_convert_encoding($data, "LATIN1", "UTF-8"); 150 | //用 UTF-8 编码的数据解码为 ISO-8859-1 编码 151 | $data = utf8_decode($data); 152 | } 153 | 154 | if ($fieldname == 'article_title') 155 | { 156 | $data = trim($data); 157 | if (strlen($data) > 10) 158 | { 159 | $data = mb_strcut($data, 0, 10, 
'UTF-8')."..."; 160 | } 161 | } 162 | elseif ($fieldname == 'article_publish_time') 163 | { 164 | // 用当前采集时间戳作为发布时间 165 | $data = time(); 166 | } 167 | // 把当前内容页URL替换上面的field 168 | elseif ($fieldname == 'url') 169 | { 170 | $data = $page['url']; 171 | } 172 | elseif ($fieldname == 'depth') 173 | { 174 | $data = $page['request']['depth']; 175 | } 176 | elseif ($fieldname == 'taskid') 177 | { 178 | $data = $page['request']['taskid']; 179 | } 180 | return $data; 181 | }; 182 | 183 | $spider->start(); 184 | -------------------------------------------------------------------------------- /demo/test_requests.php: -------------------------------------------------------------------------------- 1 | 2, 17 | 'chat_id' => 10160267, 18 | 'text' => 'Hello', 19 | ); 20 | $data = json_encode($data); 21 | requests::set_header("Content-Type", "application/json"); 22 | $html = requests::post($url, $data); 23 | var_dump($html); 24 | exit; 25 | 26 | 27 | //$url = "https://api.telegram.org/bot631221524:AAHmiCfIDNfJdae1WXXNNQvhC7t2qSSjqPE/setWebhook"; 28 | $url = "https://api.potato.im:8443/{$bot_token}/setWebhook"; 29 | 30 | $data = array('url'=>'https://www.quivernote.com/bot.php'); 31 | $data = json_encode($data); 32 | requests::set_header("Content-Type", "application/json"); 33 | $html = requests::post($url, $data); 34 | var_dump($html); 35 | 36 | 37 | exit; 38 | $html = requests::get('http://lishi.zhuixue.net/xiachao/576024.html'); 39 | //echo $html; 40 | $data = selector::select($html, "//div[@class='list']"); 41 | print_r($data); 42 | exit; 43 | 44 | //$html =<< 46 | //aaa 47 | //bbb 48 | //ccc 49 | //

ddd

50 | // 51 | //STR; 52 | 53 | //// 获取id为demo的div内容 54 | ////$data = selector::select($html, "//div[contains(@id,'demo')]"); 55 | //$data = selector::select($html, "#demo", "css"); 56 | //print_r($data); 57 | 58 | requests::set_proxy(array('223.153.69.150:42354')); 59 | $html = requests::get('https://www.quivernote.com/test.php'); 60 | var_dump($html); 61 | exit; 62 | $html = requests::get('http://www.qiushibaike.com/article/118914171'); 63 | //echo $html; 64 | //exit; 65 | $data = selector::select($html, "div.author", "css"); 66 | echo $data; 67 | -------------------------------------------------------------------------------- /gitadd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ ! -d "$1" ] && [ ! -f "$1" ]; then 3 | echo "file $1 not exists" 4 | exit 5 | fi 6 | filename=$1 7 | 8 | comment="add file" 9 | if [[ $2 != "" ]]; then 10 | comment=$2 11 | fi 12 | 13 | echo "start update..." 14 | git pull 15 | echo "start add new file..." 16 | git add $filename 17 | echo "start commit..." 18 | git commit -m "$comment" $filename 19 | git push -u origin master 20 | echo "git commit complete..." 21 | -------------------------------------------------------------------------------- /hacked-emails/banners.txt: -------------------------------------------------------------------------------- 1 | 2 | _-o#&&*''''?d:>b\_ 3 | _o/"`'' '',, dMF9MMMMMHo_ 4 | .o&#' `"MbHMMMMMMMMMMMHo. 5 | .o"" ' vodM*$&&HMMMMMMMMMM?. 6 | ,' $M&ood,~'`(&##MMMMMMH\ 7 | / ,MMMMMMM#b?#bobMMMMHMMML 8 | & ?MMMMMMMMMMMMMMMMM7MMM$R*Hk 9 | ?$. :MMMMMMMMMMMMMMMMMMM/HMMM|`*L 10 | | |MMMMMMMMMMMMMMMMMMMMbMH' T, 11 | $H#: `*MMMMMMMMMMMMMMMMMMMMb#]' `? 12 | ]MMH# ""*""""*#MMMMMMMMMMMMM' - 13 | MMMMMb_ |MMMMMMMMMMMP' : 14 | HMMMMMMMHo `MMMMMMMMMT . 15 | ?MMMMMMMMP 9MMMMMMMM] - 16 | -?MMMMMMM |MMMMMMMMM?,d- ' {Name} 17 | :|MMMMMM- `MMMMMMMT .M|. : {Description} 18 | .9MMM[ &MMMMM*' `' . {Loaded} 19 | :9MMk `MMM#" - 20 | &M] ` .- 21 | `&. . 22 | `~, . 
./ 23 | . _ .- 24 | '`--._,dd###pp=""' 25 | 26 | $$$$$AnyShIt$$$$$$ 27 | 28 | _v->#H#P? "':o<>\_ 29 | .,dP` `'' "'-o.+H6&MMMHo_ 30 | oHMH9' `?&bHMHMMMMMMHo. 31 | oMP"' ' ooMP*#&HMMMMMMM?. 32 | ,M* - `*MSdob//`^&##MMMH\ 33 | d*' .,MMMMMMH#o>#ooMMMMMb 34 | HM- :HMMMMMMMMMMMMMMM&HM[R\ 35 | d"Z\. 9MMMMMMMMMMMMMMMMM[HMM|: 36 | -H - MMMMMMMMMMMMMMMMMMMbMP' : 37 | :??Mb# `9MMMMMMMMMMMMMMMMMMH#! . 38 | : MMMMH#, "*""""`#HMMMMMMMMMMH - 39 | ||MMMMMM6\. [MMMMMMMMMH' : 40 | :|MMMMMMMMMMHo `9MMMMMMMM' . 41 | . HMMMMMMMMMMP' !MMMMMMMM ` 42 | - `#MMMMMMMMM HMMMMMMM*,/ : 43 | : ?MMMMMMMF HMMMMMM',P' : {Name} 44 | . HMMMMR' [MMMMP' ^' - {Description} 45 | : `HMMMT iMMH' .' {Loaded} 46 | -.`HMH . 47 | -:*H . ' 48 | -`\,, . .- 49 | ' . _ .-` 50 | '`~\.__,obb#q==~''' 51 | 52 | $$$$$AnyShIt$$$$$$ 53 | 54 | _ood>H&H&Z?#M#b-\. 55 | .\HMMMMMR?`\M6b."`' ''``v. 56 | .. .MMMMMMMMMMHMMM#&. ``~o. 57 | . ,HMMMMMMMMMMMM*"'-` &b. 58 | . .MMMMMMMMMMMMH' `"&\ 59 | - RMMMMM#H##R' 4Mb 60 | - |7MMM' ?:: `|MMb 61 | / HMM__#|`"\>?v.. `MMML 62 | . `"'#Hd| ` 9MMM: 63 | - |\,\?HH#bbL `9MMb 64 | : !MMMMMMMH#b, `""T 65 | . . ,MMMMMMMMMMMbo. | 66 | : 4MMMMMMMMMMMMMMMHo | 67 | : ?MMMMMMMMMMMMMMM? : 68 | -. `#MMMMMMMMMMMM: .- 69 | : |MMMMMMMMMM? . 70 | - JMMMMMMMT' : {Name} 71 | `. MMMMMMH' - {Description} 72 | -. |MMM#*` - {Loaded} 73 | . HMH' . ' 74 | -. #H:. .- 75 | ` . .\ .- 76 | '-..-+oodHL_,--/-` 77 | 78 | 79 | $$$$$AnyShIt$$$$$$ 80 | 81 | .,:,#&6dHHHb&##o\_ 82 | .oHHMMMMMMMMMMMMMMMMMH*\,. 83 | oHMMMMMMMMMMMMMMMMMMMMMMHb:'-. 84 | .dMMMMMMMMMMMMMMMMMMMMMMMMMH|\/' . 85 | ,&HMMMMMMMMMMMMMMMMMMMMMMM/"&.,d. -. 86 | dboMMHMMMMMMMMMMMMMMMMMMMMMML `' . 87 | HMHMMM$Z***MMMMMMMMMMMMMMMMMM|.- . 88 | dMM]MMMM#' `9MMMH?"`MMMMR'T' _ : 89 | |MMMbM#'' |MM" ``MMMH. <_ . 90 | dMMMM#& *&. .?`*" .'&: . 91 | MMMMMH- `' -v/H .dD "' ' : 92 | MMMM* `*M: 4MM*::-!v,_ : 93 | MMMM `*?::" "'``"?9Mb::. : 94 | &MMM, `"'"'|"._ "?`| - : 95 | `MMM].H ,#dM[_H ..: 96 | 9MMi`M: . .ooHMMMMMMM, .. 
97 | 9Mb `- 1MMMMMMMMMM| : {Name} 98 | ?M |MM#*#MMMM* . {Description} 99 | -. ` |#"' ,' {Loaded} 100 | . -" v` 101 | -. .- 102 | - . . ` 103 | '-*#d#HHMMMMHH#"-' 104 | 105 | $$$$$AnyShIt$$$$$$ 106 | 107 | .-:?,Z?:&$dHH##b\_ 108 | ,:bqRMMMMMMMMMMMMMMMMMHo. 109 | .?HHHMMMMMMMMMMMMMMMMMMMMMMMHo. 110 | -o/*M9MMMMMMMMMMMMMMMMMMMMMMMMMMMv 111 | .:H\b\'|?#HHMMMMMMMMMMMMMMMMMMMMMM6?Z\ 112 | .?MMMHbdbbodMMMMHMMMMMMMMMMMMMMMMMMMM\': 113 | :MMMMMMMMMMM7MMMMb?6P**#MMMMMMMMMMMMMMM_ : 114 | \MMMMMMMMMMMMb^MMMMMM? `*MMMM*"`MMMR<' . - 115 | .1MMMMMMMMMMMMMb]M#"" 9MR' `?MMb \. : 116 | -MMMMMMMMMMMMMMMH##|` *&. |`*' .\ . 117 | -?""*MMMMMMMMMMMMM' ' |?b ,]" : 118 | : MMMMMMMMMMH' `M_|M]r\? 119 | . `MMMMMMMMM' `$_:`'"H 120 | - TMMMMMMMM, '"``:: 121 | : [MMMMMMMM| oH| .#M- 122 | : `9MMMMMM' .MP . ,oMMT 123 | . HMMMMP' `' ,MMMP {Name} 124 | - `MMH' HH9* {Description} 125 | '. ` ` .' {Loaded} 126 | - . ' 127 | ` . - .- 128 | ` . .- 129 | ' -==pHMMH##HH#""" 130 | -------------------------------------------------------------------------------- /hacked-emails/hacked_emails.php: -------------------------------------------------------------------------------- 1 | 10 | * @copyright seatle 11 | * @link http://www.epooll.com/ 12 | * @license http://www.opensource.org/licenses/mit-license.php MIT License 13 | */ 14 | 15 | class cls_curl 16 | { 17 | protected static $timeout = 10; 18 | protected static $ch = null; 19 | protected static $useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36'; 20 | protected static $http_raw = false; 21 | protected static $cookie = null; 22 | protected static $cookie_jar = null; 23 | protected static $cookie_file = null; 24 | protected static $referer = null; 25 | protected static $ip = null; 26 | protected static $proxy = null; 27 | protected static $headers = array(); 28 | protected static $hosts = array(); 29 | protected static $gzip = false; 30 | protected static $info = array(); 31 
/**
 * Set the timeout, in seconds, applied to new cURL handles.
 *
 * init() uses this value as the connect timeout and this value plus five
 * as the overall transfer timeout.
 *
 * @param int $timeout
 * @return void
 */
public static function set_timeout($timeout)
{
    self::$timeout = $timeout;
}

/**
 * Set the proxy handed to CURLOPT_PROXY on every request.
 *
 * @param mixed $proxy
 * @return void
 * @author seatle
 * @created time :2016-09-18 10:17
 */
public static function set_proxy($proxy)
{
    self::$proxy = $proxy;
}

/**
 * Set the Referer sent with every request.
 *
 * @param string $referer
 * @return void
 */
public static function set_referer($referer)
{
    self::$referer = $referer;
}

/**
 * Set the User-Agent string.
 *
 * @param string $useragent
 * @return void
 */
public static function set_useragent($useragent)
{
    self::$useragent = $useragent;
}

/**
 * Set the raw Cookie header value (CURLOPT_COOKIE).
 *
 * @param string $cookie
 * @return void
 */
public static function set_cookie($cookie)
{
    self::$cookie = $cookie;
}

/**
 * Set the file cURL writes cookies to (CURLOPT_COOKIEJAR).
 *
 * @param string $cookie_jar
 * @return void
 */
public static function set_cookie_jar($cookie_jar)
{
    self::$cookie_jar = $cookie_jar;
}

/**
 * Set the file cURL reads cookies from (CURLOPT_COOKIEFILE).
 *
 * @param string $cookie_file
 * @return void
 */
public static function set_cookie_file($cookie_file)
{
    self::$cookie_file = $cookie_file;
}

/**
 * Choose whether the response headers are returned together with the body
 * (when truthy, requests are made with CURLOPT_HEADER enabled).
 *
 * @param mixed $http_raw
 * @return void
 * @author seatle
 * @created time :2016-09-18 10:17
 */
public static function set_http_raw($http_raw)
{
    self::$http_raw = $http_raw;
}

/**
 * Set the client IP advertised through the CLIENT-IP and X-FORWARDED-FOR
 * request headers.
 *
 * @param string $ip
 * @return void
 */
public static function set_ip($ip)
{
    self::$ip = $ip;
}

/**
 * Set the extra request header lines (passed to CURLOPT_HTTPHEADER).
 *
 * @param array $headers list of "Name: value" lines
 * @return void
 */
public static function set_headers($headers)
{
    self::$headers = $headers;
}

/**
 * Set the list of hosts/IPs requests are spread over. For each request one
 * entry is picked at random, substituted for the host in the URL, and the
 * original host name is sent as the Host header.
 *
 * @param array $hosts
 * @return void
 */
public static function set_hosts($hosts)
{
    self::$hosts = $hosts;
}

/**
 * Enable or disable gzip response encoding (CURLOPT_ENCODING = 'gzip').
 *
 * @param mixed $gzip
 * @return void
 */
public static function set_gzip($gzip)
{
    self::$gzip = $gzip;
}

/**
 * Return the shared cURL handle, creating and configuring it when no live
 * handle exists.
 *
 * @return mixed the cURL handle
 */
public static function init()
{
    // A live handle is reused untouched.
    if (is_resource(self::$ch))
    {
        return self::$ch;
    }

    self::$ch = curl_init();
    curl_setopt_array(self::$ch, array(
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_CONNECTTIMEOUT => self::$timeout,
        CURLOPT_HEADER         => false,
        CURLOPT_USERAGENT      => self::$useragent,
        CURLOPT_TIMEOUT        => self::$timeout + 5,
        // With timeouts in a multi-threaded setting libcurl must not rely on
        // signal handlers; the original author notes that a small chance of
        // crashes remains even so.
        CURLOPT_NOSIGNAL       => true,
    ));

    return self::$ch;
}

/**
 * Issue a GET request.
 *
 * @param string $url
 * @param array  $fields query parameters appended to $url
 * @return mixed whatever http_request() returns
 */
public static function get($url, $fields = array())
{
    self::init();

    return self::http_request($url, 'get', $fields);
}

/**
 * Issue a POST request.
 *
 * $fields may take three forms:
 *   1. an array:              array('name'=>'yangzetao')
 *   2. a query string:        http_build_query(array('name'=>'yangzetao'))
 *   3. a JSON document:       json_encode(array('name'=>'yangzetao'))
 * Per the original author, the first two are ordinary form posts readable
 * through $_POST on the receiving side, while the third is a raw body stream
 * (JSON-RPC style) that the receiver has to read from the input stream /
 * $HTTP_RAW_POST_DATA.
 *
 * @param mixed $url
 * @param mixed $fields
 * @static
 * @access public
 * @return mixed whatever http_request() returns
 */
public static function post($url, $fields = array())
{
    self::init();

    return self::http_request($url, 'post', $fields);
}

/**
 * Execute one HTTP request on the shared cURL handle.
 *
 * The handle is released after every request. All configured options
 * (user agent, cookies, referer, headers, proxy, gzip, ...) are applied
 * to the handle before it is executed.
 *
 * @param string $url    target URL
 * @param string $type   'get' or 'post' (case-insensitive)
 * @param mixed  $fields GET: array appended to the URL as a query string;
 *                       POST: value handed to CURLOPT_POSTFIELDS
 * @return string|false  response data (including headers when http_raw is
 *                       enabled), or false when cURL reports a failure
 */
public static function http_request($url, $type = 'get', $fields = array())
{
    // The handle is dropped at the end of each request; create one on demand
    // so a direct call without a preceding init() also works.
    if (empty(self::$ch))
    {
        self::init();
    }

    $method = strtolower($type);

    // For GET, append the fields to the URL as a query string.
    if ($method == 'get' && !empty($fields))
    {
        $url = $url . (strpos($url,"?")===false ? "?" : "&") . http_build_query($fields);
    }

    // Work on a per-request copy: merging into self::$headers made the
    // Host / CLIENT-IP lines pile up across successive requests.
    $headers = (array) self::$headers;

    // Bind the request to a randomly chosen host (simple load balancing).
    if (self::$hosts)
    {
        $host = parse_url($url, PHP_URL_HOST);
        if (!empty($host))
        {
            $ip = self::$hosts[array_rand(self::$hosts)];
            // Swap only the first occurrence (the authority part); the host
            // name may legitimately reappear in the path or query string.
            $pos = strpos($url, $host);
            if ($pos !== false)
            {
                $url = substr_replace($url, $ip, $pos, strlen($host));
            }
            $headers = array_merge( array('Host:'.$host), $headers );
        }
    }
    curl_setopt( self::$ch, CURLOPT_URL, $url );

    if ($method == 'post')
    {
        curl_setopt( self::$ch, CURLOPT_POST, true );
        curl_setopt( self::$ch, CURLOPT_POSTFIELDS, $fields );
    }
    if (self::$useragent)
    {
        curl_setopt( self::$ch, CURLOPT_USERAGENT, self::$useragent );
    }
    if (self::$cookie)
    {
        curl_setopt( self::$ch, CURLOPT_COOKIE, self::$cookie );
    }
    if (self::$cookie_jar)
    {
        curl_setopt( self::$ch, CURLOPT_COOKIEJAR, self::$cookie_jar );
    }
    if (self::$cookie_file)
    {
        curl_setopt( self::$ch, CURLOPT_COOKIEFILE, self::$cookie_file );
    }
    if (self::$referer)
    {
        curl_setopt( self::$ch, CURLOPT_REFERER, self::$referer );
    }
    if (self::$ip)
    {
        $headers = array_merge( array('CLIENT-IP:'.self::$ip, 'X-FORWARDED-FOR:'.self::$ip), $headers );
    }
    if ($headers)
    {
        curl_setopt( self::$ch, CURLOPT_HTTPHEADER, $headers );
    }
    if (self::$gzip)
    {
        curl_setopt( self::$ch, CURLOPT_ENCODING, 'gzip' );
    }
    if (self::$proxy)
    {
        curl_setopt( self::$ch, CURLOPT_PROXY, self::$proxy );
    }
    if (self::$http_raw)
    {
        curl_setopt( self::$ch, CURLOPT_HEADER, true );
    }

    // On failure $data is false; it is returned as-is so the caller decides
    // how to react (the original deliberately printed nothing here).
    $data = curl_exec ( self::$ch );
    self::$info = curl_getinfo(self::$ch);

    // Release the handle and forget it so the next request starts clean.
    curl_close( self::$ch );
    self::$ch = null;

    return $data;
}

/**
 * Return the curl_getinfo() data of the most recent request.
 *
 * @return array empty until a request has been made
 */
public static function get_info()
{
    return self::$info;
}

/**
 * Return the HTTP status code of the most recent request.
 *
 * @return int|null null when no request has been made yet
 */
public static function get_http_code()
{
    return isset(self::$info['http_code']) ? self::$info['http_code'] : null;
}
}

/**
 * Fetch all URLs in parallel and hand each body to callback() once every
 * transfer has finished.
 *
 * @param array $urls  URLs to fetch
 * @param mixed $delay passed through to callback() unchanged
 * @return array map of URL => return value of callback()
 */
function classic_curl($urls, $delay)
{
    $queue = curl_multi_init();
    $map = array();

    foreach ($urls as $url)
    {
        // Results are keyed by URL; a repeated URL would overwrite its map
        // entry and leave the earlier handle unreleased.
        if (isset($map[$url]))
        {
            continue;
        }

        $ch = curl_init();

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_TIMEOUT, 1);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_NOSIGNAL, true);

        // Register the handle with the multi queue.
        curl_multi_add_handle($queue, $ch);
        $map[$url] = $ch;
    }

    $active = null;

    do {
        do {
            $mrc = curl_multi_exec($queue, $active);
        } while ($mrc == CURLM_CALL_MULTI_PERFORM);

        if ($mrc != CURLM_OK)
        {
            break;
        }

        // curl_multi_select() can return -1 immediately; without a pause
        // the loop degenerates into a busy wait that pins a CPU core.
        if ($active > 0 && curl_multi_select($queue, 0.5) == -1)
        {
            usleep(1000);
        }
    } while ($active > 0);

    $responses = array();
    foreach ($map as $url=>$ch) {
        $responses[$url] = callback(curl_multi_getcontent($ch), $delay, $url);
        curl_multi_remove_handle($queue, $ch);
        curl_close($ch);
    }

    curl_multi_close($queue);
    return $responses;
}

/**
 * Fetch all URLs in parallel, handing each body to callback() as soon as its
 * transfer completes instead of waiting for the slowest one.
 *
 * @param array $urls  URLs to fetch
 * @param mixed $delay passed through to callback() unchanged
 * @return array map of URL => array('info' => ..., 'error' => ..., 'results' => ...)
 */
function rolling_curl($urls, $delay)
{
    $queue = curl_multi_init();
    $map = array();

    foreach ($urls as $url) {
        $ch = curl_init();

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_TIMEOUT, 10);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_NOSIGNAL, true);
        // NOTE(review): hard-coded session cookie committed to source --
        // looks like a real login session; consider moving it to configuration.
        $cookie = '_za=36643642-e546-4d60-a771-8af8dcfbd001; q_c1=a57a2b9f10964f909b8d8969febf3ab2|1437705596000|1437705596000; _xsrf=f0304fba4e44e1d008ec308d59bab029; cap_id="YWY1YmRmODlmZGVmNDc3MWJlZGFkZDg3M2E0M2Q5YjM=|1437705596|963518c454bb6f10d96775021c098c84e1e46f5a"; z_c0="QUFCQVgtRWZBQUFYQUFBQVlRSlZUVjR6NEZVUTgtRkdjTVc5UDMwZXRJZFdWZ2JaOWctNVhnPT0=|1438164574|aed6ef3707f246a7b64da4f1e8c089395d77ff2b"; __utma=51854390.1105113342.1437990174.1438160686.1438164116.10; __utmc=51854390; __utmz=51854390.1438134939.8.5.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/people/yangzetao; __utmv=51854390.100-1|2=registration_date=20131030=1^3=entry_date=20131030=1';
        curl_setopt($ch, CURLOPT_COOKIE, $cookie);
        $useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36';
        curl_setopt( $ch, CURLOPT_USERAGENT, $useragent );
        curl_setopt($ch, CURLOPT_ENCODING, 'gzip');

        curl_multi_add_handle($queue, $ch);
        // Key by the integer handle id: a string cast only works while the
        // handle is a resource, not once cURL handles are objects.
        $map[(int) $ch] = $url;
    }

    $responses = array();
    do {
        while (($code = curl_multi_exec($queue, $active)) == CURLM_CALL_MULTI_PERFORM) ;

        if ($code != CURLM_OK) { break; }

        // One or more transfers just completed -- find out which.
        while ($done = curl_multi_info_read($queue)) {
            $handle = $done['handle'];
            $url = $map[(int) $handle];

            // Collect transfer info, error text and the processed content.
            $info = curl_getinfo($handle);
            $error = curl_error($handle);
            $results = callback(curl_multi_getcontent($handle), $delay, $url);
            $responses[$url] = compact('info', 'error', 'results');

            // Release the finished handle.
            curl_multi_remove_handle($queue, $handle);
            curl_close($handle);
        }

        // Block until there is activity; back off briefly when select()
        // fails straight away so the loop does not spin.
        if ($active > 0) {
            if (curl_multi_select($queue, 0.5) == -1) {
                usleep(1000);
            }
        }

    } while ($active);

    curl_multi_close($queue);
    return $responses;
}

/**
 * Per-response handler used by classic_curl() and rolling_curl(): stores a
 * non-empty body under ./html2/<md5 of url>.html.
 *
 * @param mixed  $data  response body
 * @param mixed  $delay unused here
 * @param string $url   URL the body was fetched from
 * @return void nothing is returned, so callers record null for each URL
 */
function callback($data, $delay, $url) {
    //echo $data;
    //echo date("Y-m-d H:i:s", time()) . " --- " . $url . "\n";
    if (!empty($data))
    {
        file_put_contents("./html2/".md5($url).".html", $data);
    }
    // A usleep() here would simulate heavier real-world processing
    // (extraction, tokenising, writing to files or a database, ...).
    //usleep(1);
    //return compact('data', 'matches');
}

-------------------------------------------------------------------------------- /library/cls_query.php: -------------------------------------------------------------------------------- 1 | 29 | * @created time :2015-08-08 15:52 30 | */ 31 | private static function get_nodes($query) 32 | { 33 | // 把一到多个空格 替换成 一个空格 34 | // 把 > 和 ~ 符号两边的空格去掉,因为没有用这两个符号,所以这里可以不这么做 35 | // ul>li.className 36 | $query = trim( 37 | preg_replace('@\s+@', ' ', 38 | preg_replace('@\s*(>|\\+|~)\s*@', '\\1', $query) 39 | ) 40 | ); 41 | 42 | $nodes = array(); 43 | if (! $query) 44 | { 45 | return $nodes; 46 | } 47 | 48 | $query_arr = explode(" ", $query); 49 | foreach ($query_arr as $k=>$v) 50 | { 51 | $path = $k == 0 ? $v : $path.' 
'.$v; 52 | $node = array("path"=>(string)$path, "name"=>"", "id"=>"", "class"=>"", "other"=>array()); 53 | // 如果存在内容选择器 54 | if (preg_match('@(.*?)\[(.*?)=[\'|"](.*?)[\'|"]\]@', $v, $matches) && !empty($matches[2]) && !empty($matches[3])) 55 | { 56 | // 把选择器过滤掉 [rel='topic'] 57 | $v = $matches[1]; 58 | $node['other'] = array( 59 | 'key'=>$matches[2], 60 | 'val'=>$matches[3], 61 | ); 62 | } 63 | 64 | // 如果存在 id 65 | $id_arr = explode("#", $v); 66 | $class_arr = explode(".", $v); 67 | if (count($id_arr) === 2) 68 | { 69 | $node['name'] = $id_arr[0]; 70 | $node['id'] = $id_arr[1]; 71 | } 72 | // 如果存在 class 73 | elseif (count($class_arr) === 2) 74 | { 75 | $node['name'] = $class_arr[0]; 76 | $node['class'] = $class_arr[1]; 77 | } 78 | // 如果没有样式 79 | else 80 | { 81 | $node['name'] = $v; 82 | } 83 | $nodes[] = $node; 84 | } 85 | //print_r($nodes); 86 | //exit; 87 | return $nodes; 88 | } 89 | 90 | public static function get_datas($nodes, $attr = "html") 91 | { 92 | if (empty(self::$content)) 93 | { 94 | return false; 95 | } 96 | 97 | $node_datas = array(); 98 | $count = count($nodes); 99 | // 循环所有节点 100 | foreach ($nodes as $i=>$node) 101 | { 102 | $is_last = $count == $i+1 ? true : false; 103 | // 第一次 104 | if ($i == 0) 105 | { 106 | $datas = array(); 107 | $datas = self::get_node_datas($node, self::$content, $attr, $is_last); 108 | // 如果第一次都取不到数据,直接跳出循环 109 | if(!$datas) 110 | { 111 | break; 112 | } 113 | $node_datas[$nodes[$i]['path']] = $datas; 114 | } 115 | else 116 | { 117 | $datas = array(); 118 | // 循环上一个节点的数组 119 | foreach ($node_datas[$nodes[$i-1]['path']] as $v) 120 | { 121 | $datas = array_merge( $datas, self::get_node_datas($node, trim($v), $attr, $is_last) ); 122 | } 123 | $node_datas[$nodes[$i]['path']] = $datas; 124 | // 删除上一个节点,防止内存溢出,或者缓存到本地,再次使用?! 
125 | unset($node_datas[$nodes[$i-1]['path']]); 126 | } 127 | } 128 | //print_r($datas);exit; 129 | // 从数组中弹出最后一个元素 130 | $node_datas = array_pop($node_datas); 131 | //print_r($node_datas); 132 | //exit; 133 | return $node_datas; 134 | } 135 | 136 | /** 137 | * 从节点中获取内容 138 | * $regex = '@]+http-equiv\\s*=\\s*(["|\'])Content-Type\\1([^>]+?)>@i'; 139 | * 140 | * @param mixed $node 141 | * @param mixed $content 142 | * @return void 143 | * @author seatle 144 | * @created time :2015-08-08 15:52 145 | */ 146 | private static function get_node_datas($node, $content, $attr = "html", $is_last = false) 147 | { 148 | $node_datas = $datas = array(); 149 | 150 | if (!empty($node['id'])) 151 | { 152 | if ($node['name']) 153 | $regex = '@<'.$node['name'].'[^>]+id\\s*=\\s*["|\']+?'.$node['id'].'\\s*[^>]+?>(.*?)@is'; 154 | else 155 | $regex = '@id\\s*=\\s*["|\']+?'.$node['id'].'\\s*[^>]+?>(.*?)<@is'; 156 | } 157 | elseif (!empty($node['class'])) 158 | { 159 | if ($node['name']) 160 | $regex = '@<'.$node['name'].'[^>]+class\\s*=\\s*["|\']+?'.$node['class'].'\\s*[^>]+?>(.*?)@is'; 161 | else 162 | $regex = '@class\\s*=\\s*["|\']+?'.$node['class'].'\\s*[^>]+?>(.*?)<@is'; 163 | } 164 | else 165 | { 166 | // 这里为是么是*,0次到多次,因为有可能是
  • 167 | $regex = '@<'.$node['name'].'[^>]*?>(.*?)@is'; 168 | } 169 | self::log("regex --- " . $regex);; 170 | preg_match_all($regex, $content, $matches); 171 | $all_datas = empty($matches[0]) ? array() : $matches[0]; 172 | $html_datas = empty($matches[1]) ? array() : $matches[1]; 173 | 174 | // 过滤掉选择器对不上的 175 | foreach ($all_datas as $i=>$data) 176 | { 177 | // 如果有设置其他选择器,验证一下选择器 178 | if (!empty($node['other'])) 179 | { 180 | $regex = '@'.$node['other']['key'].'=[\'|"]'.$node['other']['val'].'[\'|"]@is'; 181 | self::log("regex other --- " . $regex); 182 | // 过滤器对不上的,跳过 183 | if (!preg_match($regex, $data, $matches)) 184 | { 185 | continue; 186 | } 187 | } 188 | // 获取节点的html内容 189 | if ($attr != "html" && $is_last) 190 | { 191 | $regex = '@'.$attr.'=[\'|"](.*?)[\'|"]@is'; 192 | preg_match($regex, $data, $matches); 193 | $node_datas[] = empty($matches[1]) ? '' : trim($matches[1]); 194 | } 195 | // 获取节点属性名的值 196 | else 197 | { 198 | $node_datas[] = trim($html_datas[$i]); 199 | } 200 | } 201 | //echo " 11111 ========================================= \n"; 202 | //print_r($node_datas); 203 | //echo " 22222 ========================================= \n\n\n"; 204 | return $node_datas; 205 | } 206 | 207 | /** 208 | * 记录日志 209 | * @param string $msg 210 | * @return void 211 | */ 212 | private static function log($msg) 213 | { 214 | $msg = "[".date("Y-m-d H:i:s")."] " . $msg . "\n"; 215 | if (self::$debug) 216 | { 217 | echo $msg; 218 | } 219 | } 220 | 221 | } 222 | 223 | //$xpath = "ul.top-nav-dropdown li"; 224 | //$xpath = "i.zg-icon"; 225 | //print_r($nodes); 226 | //exit; 227 | // [^>]+ 不是>的字符重复一次到多次, ? 
表示不贪婪 228 | // \s 表示空白字符 229 | // * 表示0次或者多次 230 | // + 表示1次或者多次 231 | // 232 | // 后向引用,表示表达式中,从左往右数,第一个左括号对应的括号内的内容。 233 | // \\0 表示整个表达式 234 | // \\1表示第1个表达式 235 | // \\2表示第2个表达式 236 | // $regex = '@]+http-equiv\\s*=\\s*(["|\'])Content-Type\\1([^>]+?)>@i'; 237 | //preg_match_all($regex, $content, $matches); 238 | //print_r($matches); 239 | //exit; 240 | 241 | // 用法 242 | //$content = file_get_contents("./test.html"); 243 | //$query = "ul#top-nav-profile-dropdown li a"; 244 | //$query = "div#zh-profile-following-topic a.link[href='/topic/19550937']"; 245 | //cls_query::init($content); 246 | //$list = cls_query::query($query, "href"); 247 | //print_r($list); 248 | 249 | -------------------------------------------------------------------------------- /library/cls_redis_client.php: -------------------------------------------------------------------------------- 1 | 9 | * @created time :2018-01-03 10 | */
/**
 * Minimal Redis client speaking the Redis wire protocol over a plain TCP
 * stream. Any method call is forwarded as a command: $redis->set('k', 'v')
 * sends "SET k v" and returns the decoded reply.
 */
class cls_redis_client
{
    /** @var resource|false connected socket, or false when not connected */
    private $redis_socket = false;

    /**
     * Connect to the server.
     *
     * @param string $host
     * @param int    $port
     * @param int    $timeout connect timeout in seconds
     * @throws Exception when the connection cannot be established
     */
    public function __construct($host='127.0.0.1', $port=6379, $timeout = 3)
    {
        $this->redis_socket = stream_socket_client("tcp://".$host.":".$port, $errno, $errstr, $timeout);
        if ( !$this->redis_socket )
        {
            throw new Exception("{$errno} - {$errstr}");
        }
    }

    /**
     * Close the connection, if one was opened.
     */
    public function __destruct()
    {
        // The socket is still false when the constructor failed; closing a
        // non-resource would raise an error of its own from the destructor.
        if (is_resource($this->redis_socket))
        {
            fclose($this->redis_socket);
        }
    }

    /**
     * Send the command $name with arguments $args and return the reply.
     *
     * @param string $name command name
     * @param array  $args command arguments
     * @return mixed decoded reply, see read_response()
     * @throws Exception on write failure or an error reply
     */
    public function __call($name, $args)
    {
        $crlf = "\r\n";
        array_unshift($args, $name);

        // Encode as an array of bulk strings: *<argc> then $<len> <arg> each.
        $command = '*' . count($args) . $crlf;
        foreach ($args as $arg)
        {
            $arg = (string) $arg;
            $command .= '$' . strlen($arg) . $crlf . $arg . $crlf;
        }

        // fwrite() on a socket may accept only part of the buffer; keep
        // writing until the whole command is out.
        $total = strlen($command);
        $sent = 0;
        while ($sent < $total)
        {
            $written = fwrite($this->redis_socket, substr($command, $sent));
            if ($written === false || $written <= 0)
            {
                throw new Exception('Failed to write entire command to stream');
            }
            $sent += $written;
        }

        return $this->read_response();
    }

    /**
     * Read and decode one reply from the server.
     *
     * @return mixed true for "+OK", string for other status replies and bulk
     *               strings, int for integer replies, array for multi-bulk
     *               replies, null for a null bulk / null multi-bulk reply
     * @throws Exception on an error reply, a read failure or an
     *                   unrecognised reply type
     */
    private function read_response()
    {
        $line = fgets($this->redis_socket);
        if ($line === false)
        {
            // Connection closed or read error.
            throw new Exception('Failed to read response from stream');
        }

        $reply = trim($line);
        switch (substr($reply, 0, 1))
        {
            /* Error reply */
            case '-':
                throw new Exception(trim(substr($reply, 1)));

            /* Status reply */
            case '+':
                $response = substr($reply, 1);
                if ($response === 'OK')
                {
                    $response = true;
                }
                break;

            /* Bulk reply */
            case '$':
                $size = intval(substr($reply, 1));
                if ($size < 0)
                {
                    // "$-1": the value does not exist.
                    $response = null;
                    break;
                }
                // Read the payload plus its trailing CRLF. An empty string
                // ("$0") is a real value and must not be reported as null.
                $remaining = $size + 2;
                $buffer = '';
                while ($remaining > 0)
                {
                    $chunk = fread($this->redis_socket, $remaining > 1024 ? 1024 : $remaining);
                    if ($chunk === false || $chunk === '')
                    {
                        // '' signals EOF; looping on it would never end.
                        throw new Exception('Failed to read response from stream');
                    }
                    $buffer .= $chunk;
                    $remaining -= strlen($chunk);
                }
                $response = (string) substr($buffer, 0, $size);
                break;

            /* Multi-bulk reply */
            case '*':
                $count = intval(substr($reply, 1));
                if ($count < 0)
                {
                    return null;
                }
                $response = array();
                for ($i = 0; $i < $count; $i++)
                {
                    $response[] = $this->read_response();
                }
                break;

            /* Integer reply */
            case ':':
                $response = intval(substr($reply, 1));
                break;

            default:
                // Plain Exception: RedisException only exists when the
                // phpredis extension happens to be loaded.
                throw new Exception("Unknown response: {$reply}");
        }
        return $response;
    }
}

// Usage:
//$redis = new cls_redis_client();
//var_dump($redis->auth("foobared"));
//var_dump($redis->set("name",'abc'));
//var_dump($redis->get("name"));

// --------------------------------------------------------------------------------
// /library/cls_redis_server.php:
// --------------------------------------------------------------------------------

/**
 * Toy pre-forking server that understands RESP requests.
 *
 * run() forks $process_num workers that all accept() on the same listening
 * socket; each decoded request is handed to the $onMessage callback.
 *
 * NOTE(review): every worker is a separate process, so $redis_kv_data is NOT
 * shared between them — a SET handled by one worker is invisible to the others.
 *
 * @created time :2018-01-03
 */
class cls_redis_server
{
    /** @var resource|false Listening socket. */
    private $socket = false;
    /** @var int Number of worker processes to keep alive. */
    private $process_num = 3;
    /** @var array In-memory key/value store (per process). */
    public $redis_kv_data = array();
    /** @var callable|null Invoked as ($conn, $request) for every decoded request. */
    public $onMessage = null;
    /** @var array Worker pids, indexed by pid (was an undeclared dynamic property). */
    public $pids = array();

    /**
     * Create the listening socket; terminates the script when binding fails.
     *
     * @param string $host
     * @param int    $port
     */
    public function __construct($host="0.0.0.0", $port=6379)
    {
        $this->socket = stream_socket_server("tcp://".$host.":".$port,$errno, $errstr);
        if (!$this->socket) die($errstr."--".$errno);
        echo "listen $host $port \r\n";
    }

    /**
     * Decode one RESP value from the connection.
     *
     * @param resource $conn Client connection.
     * @return mixed array for "*", string for "$" (NULL for a null bulk), the raw
     *               line without its type byte for anything else, NULL at EOF.
     * @throws Exception('RECEIVING') When the peer goes away in the middle of a bulk string.
     */
    private function parse_resp(&$conn)
    {
        // Read one line (terminated by \r\n)
        $line = fgets($conn);
        if($line === '' || $line === false)
        {
            return null;
        }
        // First byte is the type marker
        $type = $line[0];
        // Drop the type byte and the line terminator (byte-wise: RESP lengths are bytes)
        $line = rtrim(substr($line, 1), "\r\n");
        switch ( $type )
        {
            case "*":
                // Number of elements that follow
                $count = (int) $line;
                $data = array();
                for ($i = 0; $i < $count; $i++)
                {
                    $data[] = $this->parse_resp($conn);
                }
                return $data;
            case "$":
                $length = (int) $line;
                if ($length < 0)
                {
                    return null;
                }
                // Payload plus the trailing \r\n
                $remaining = $length + 2;
                $data = '';
                while ($remaining > 0)
                {
                    // A socket read may legitimately return fewer bytes than requested,
                    // so keep reading; only EOF / failure aborts.
                    $block = fread($conn, $remaining);
                    if ($block === false || $block === '')
                    {
                        throw new Exception('RECEIVING');
                    }
                    $data .= $block;
                    $remaining -= strlen($block);
                }
                return substr($data, 0, $length);
        }
        return $line;
    }

    /**
     * Worker body: accept connections forever and dispatch decoded requests.
     *
     * @return void
     */
    private function serve_forever()
    {
        while ( true )
        {
            echo "PID ".posix_getpid()." waiting...\n";
            // Block until a client connects
            $conn = stream_socket_accept($this->socket, -1);
            if ( !$conn )
            {
                continue;
            }
            //"*3\r\n$3\r\nSET\r\n$5\r\nmykey\r\n$7\r\nmyvalue\r\n"
            while( true )
            {
                try
                {
                    $request = $this->parse_resp($conn);
                }
                catch (Exception $e)
                {
                    // Broken request: drop this client, keep the worker alive.
                    $request = null;
                }
                if ( !is_array($request) && !$request )
                {
                    fclose($conn);
                    break;
                }
                if ($this->onMessage)
                {
                    call_user_func($this->onMessage, $conn, $request);
                }
            }
        }
    }

    /**
     * Fork one worker and remember its pid in the parent.
     *
     * @return void
     */
    private function start_worker_process()
    {
        $pid = pcntl_fork();
        if ($pid === -1)
        {
            echo "fork error \r\n";
            exit;
        }
        if ($pid === 0)
        {
            $this->serve_forever();
            // Never reached; guarantees a child can not fall into the parent's supervisor loop.
            exit(0);
        }
        $this->pids[$pid] = $pid;
    }

    /**
     * Start the workers and respawn any that exit. Never returns.
     *
     * @return void
     */
    public function run()
    {
        for($i = 1; $i <= $this->process_num; $i++)
        {
            $this->start_worker_process();
        }

        while( true )
        {
            foreach ($this->pids as $pid)
            {
                if($pid)
                {
                    $res = pcntl_waitpid($pid, $status,WNOHANG);

                    if ( $res == -1 || $res > 0 )
                    {
                        // Forget the dead pid first so a recycled pid can not erase the new worker.
                        unset($this->pids[$pid]);
                        $this->start_worker_process();
                    }
                }
            }
            sleep(1);
        }
    }

}

$server = new cls_redis_server();
// Handles SET and GET; every other request is acknowledged with +OK.
$server->onMessage = function($conn, $info) use($server)
{
    if ( !is_array($info) || !isset($info[0]) )
    {
        fwrite($conn,"+OK\r\n");
        return;
    }

    $command = strtoupper($info[0]);
    if ( $command == "SET" )
    {
        if ( !isset($info[1], $info[2]) )
        {
            fwrite($conn, "-ERR wrong number of arguments for 'set' command\r\n");
            return;
        }
        $server->redis_kv_data[$info[1]] = $info[2];
        fwrite($conn, "+OK\r\n");
    }
    else if ( $command == "GET" )
    {
        if ( !isset($info[1]) )
        {
            fwrite($conn, "-ERR wrong number of arguments for 'get' command\r\n");
            return;
        }
        $key = $info[1];
        if ( isset($server->redis_kv_data[$key]) )
        {
            $val = $server->redis_kv_data[$key];
            fwrite($conn, "$".strlen($val)."\r\n".$val."\r\n");
        }
        else
        {
            // Null bulk string: lets clients tell "missing" apart from an empty value.
            fwrite($conn, "$-1\r\n");
        }
    }
    else
    {
        fwrite($conn,"+OK\r\n");
    }
};
$server->run();

// --------------------------------------------------------------------------------
// /library/rolling_curl.php:
// --------------------------------------------------------------------------------

/**
 * Queue HTTP requests and run them concurrently through curl_multi, keeping at
 * most $window_size transfers in flight ("rolling" window).
 *
 * @copyright seatle
 * @link http://www.epooll.com/
 * @license http://www.opensource.org/licenses/mit-license.php MIT License
 */

class rolling_curl
{
    /**
     * @var int
     *
     * Number of transfers that run at the same time.
     * Example: with 8 requests, 5 start immediately and the remaining 3 are
     * started one by one as earlier transfers finish.
     * Author's note: when crawling Zhihu 5 was stable, 7 and above started to
     * time out.
     */
    public $window_size = 5;

    /**
     * @var float
     *
     * Timeout (seconds) passed to curl_multi_select().
     */
    private $timeout = 10;

    /**
     * @var string|array|callable
     *
     * Callback applied to every finished request:
     * callback($output, $info, $request, $error).
     */
    public $callback;

    /**
     * @var array
     *
     * Default cURL options for every request.
     * NOTE(review): CURLOPT_SSL_VERIFYPEER is disabled by default, so TLS
     * certificates are not verified unless a caller overrides it.
     */
    protected $options = array(
        CURLOPT_SSL_VERIFYPEER => 0,
        CURLOPT_RETURNTRANSFER => 1,
        // TIMEOUT covers connect time plus transfer time, so it must be larger than
        // CONNECTTIMEOUT, otherwise CONNECTTIMEOUT is meaningless.
        // "Connection timed out after 30001 milliseconds"
        CURLOPT_CONNECTTIMEOUT => 30,
        CURLOPT_TIMEOUT => 60,
        CURLOPT_HEADER => 0,
        // Needed when timeouts are used in multi-transfer scenarios so libcurl does not
        // rely on signal handlers (the author still observed rare crashes).
        CURLOPT_NOSIGNAL => 1,
        CURLOPT_USERAGENT => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36",
    );

    /**
     * @var array Default headers as name => value.
     */
    private $headers = array();

    /**
     * @var array[] Request queue; each entry has url, method, fields, headers, options.
     */
    private $requests = array();

    /**
     * @var array Maps a curl handle key to the index of its request in $requests.
     */
    private $requestMap = array();

    public function __construct()
    {
    }

    /**
     * Set the total transfer timeout (CURLOPT_TIMEOUT).
     *
     * @param int $timeout Seconds.
     * @return void
     */
    public function set_timeout($timeout)
    {
        $this->options[CURLOPT_TIMEOUT] = $timeout;
    }

    /**
     * Set the proxy (CURLOPT_PROXY).
     *
     * @param string $proxy
     * @return void
     */
    public function set_proxy($proxy)
    {
        $this->options[CURLOPT_PROXY] = $proxy;
    }

    /**
     * Set the Referer (CURLOPT_REFERER).
     *
     * @param string $referer
     * @return void
     */
    public function set_referer($referer)
    {
        $this->options[CURLOPT_REFERER] = $referer;
    }

    /**
     * Set the User-Agent.
     *
     * @param string $useragent
     * @return void
     */
    public function set_useragent($useragent)
    {
        $this->options[CURLOPT_USERAGENT] = $useragent;
    }

    /**
     * Set the Cookie header value.
     *
     * @param string $cookie
     * @return void
     */
    public function set_cookie($cookie)
    {
        $this->options[CURLOPT_COOKIE] = $cookie;
    }

    /**
     * Set the cookie jar file (CURLOPT_COOKIEJAR).
     *
     * @param string $cookiejar
     * @return void
     */
    public function set_cookiejar($cookiejar)
    {
        $this->options[CURLOPT_COOKIEJAR] = $cookiejar;
    }

    /**
     * Set the cookie file (CURLOPT_COOKIEFILE).
     *
     * @param string $cookiefile
     * @return void
     */
    public function set_cookiefile($cookiefile)
    {
        $this->options[CURLOPT_COOKIEFILE] = $cookiefile;
    }

    /**
     * Whether response headers are returned together with the body (CURLOPT_HEADER).
     *
     * @param mixed $http_raw
     * @return void
     * @author seatle
     * @created time :2016-09-18 10:17
     */
    public function set_http_raw($http_raw = false)
    {
        $this->options[CURLOPT_HEADER] = $http_raw;
    }

    /**
     * Send the given IP in the CLIENT-IP and X-FORWARDED-FOR headers.
     *
     * @param string $ip
     * @return void
     */
    public function set_ip($ip)
    {
        $this->set_headers(array(
            'CLIENT-IP'=>$ip,
            'X-FORWARDED-FOR'=>$ip,
        ));
    }

    /**
     * Add default headers; a header set again replaces its earlier value.
     *
     * @param array $headers name => value
     * @return void
     */
    public function set_headers($headers)
    {
        // New values on the left: with "+", the left operand wins on duplicate keys.
        $this->headers = $headers + $this->headers;
    }

    /**
     * Set the Host header.
     *
     * @param string $hosts
     * @return void
     */
    public function set_hosts($hosts)
    {
        $this->set_headers(array(
            'Host'=>$hosts,
        ));
    }

    /**
     * Ask for gzip-encoded responses (CURLOPT_ENCODING) when $gzip is truthy.
     *
     * @param bool $gzip
     * @return void
     */
    public function set_gzip($gzip)
    {
        if ($gzip)
        {
            $this->options[CURLOPT_ENCODING] = 'gzip';
        }
    }

    /**
     * Append a request to the queue.
     *
     * @param string       $url
     * @param string       $method  "GET" or "POST" (case-insensitive)
     * @param array|string $fields  Query parameters (GET) or body (POST)
     * @param array        $headers Per-request headers, name => value
     * @param array        $options Per-request cURL options
     * @return bool Always true.
     */
    public function request($url, $method = "GET", $fields = array(), $headers = array(), $options = array())
    {
        $this->requests[] = array('url'=>$url,'method'=>$method,'fields'=>$fields,'headers'=>$headers,'options'=>$options);
        return true;
    }

    /**
     * Build the cURL option array for one queued request.
     *
     * Per-request options and headers take precedence over the defaults.
     *
     * @param array $request Entry created by request().
     * @return array Options suitable for curl_setopt_array().
     */
    public function get_options($request)
    {
        $options = $this->options;
        $headers = $this->headers;

        if (ini_get('safe_mode') == 'Off' || !ini_get('safe_mode'))
        {
            $options[CURLOPT_FOLLOWLOCATION] = 1;
            $options[CURLOPT_MAXREDIRS] = 5;
        }

        $url = $request['url'];
        $method = strtolower($request['method']);

        // GET: append the fields to the URL as a query string
        if ($method == 'get' && !empty($request['fields']))
        {
            $query = is_array($request['fields'])
                ? http_build_query($request['fields'])
                : (string) $request['fields'];
            $url .= (strpos($url, '?') === false ? '?' : '&') . $query;
        }
        // POST: send the fields as the request body
        if ($method == 'post')
        {
            $options[CURLOPT_POST] = 1;
            $options[CURLOPT_POSTFIELDS] = $request['fields'];
        }

        // append custom options for this specific request
        if (!empty($request['options']))
        {
            $options = $request['options'] + $options;
        }

        if (!empty($request['headers']))
        {
            $headers = $request['headers'] + $headers;
        }

        // cURL expects a list of "Name:value" lines
        $header_lines = array();
        foreach ($headers as $name => $value)
        {
            // Integer keys mean the entry is already a complete header line.
            $header_lines[] = is_int($name) ? $value : $name.":".$value;
        }

        // Use the URL built above (the GET query string was previously dropped here).
        $options[CURLOPT_URL] = $url;
        $options[CURLOPT_HTTPHEADER] = $header_lines;

        return $options;
    }

    /**
     * Queue a GET request.
     *
     * @param string $url
     * @param array  $fields
     * @param array  $headers
     * @param array  $options
     * @return bool
     */
    public function get($url, $fields = array(), $headers = array(), $options = array())
    {
        return $this->request($url, 'get', $fields, $headers, $options);
    }

    /**
     * Queue a POST request.
     *
     * $fields may be: 1) an array, 2) an http query string, 3) a JSON string.
     * 1) array('name'=>'yangzetao') 2) http_build_query(array('name'=>'yangzetao')) 3) json_encode(array('name'=>'yangzetao'))
     * The first two are ordinary form posts readable through $_POST; the third is a
     * raw body (e.g. JSON-RPC) that the receiver must read from the input stream.
     *
     * @param string       $url
     * @param array|string $fields
     * @param array        $headers
     * @param array        $options
     * @return bool
     */
    public function post($url, $fields = array(), $headers = array(), $options = array())
    {
        return $this->request($url, 'post', $fields, $headers, $options);
    }

    /**
     * Run every queued request.
     *
     * @param int $window_size Max number of simultaneous connections
     * @return string|bool false when the queue is empty; for a single request without
     *                     callback the response body; true otherwise.
     */
    public function execute($window_size = null)
    {
        $count = count($this->requests);
        if ($count == 0)
        {
            return false;
        }
        // Exactly one request
        if ($count == 1)
        {
            return $this->single_curl();
        }
        // Rolling window; window_size is the max number of simultaneous connections
        return $this->rolling_curl($window_size);
    }

    /**
     * Run the only queued request with a plain curl handle.
     *
     * @return string|bool Response body when no callback is set (false on cURL failure), true otherwise.
     */
    private function single_curl()
    {
        $ch = curl_init();
        // Take the request off the queue
        $request = array_shift($this->requests);
        $options = $this->get_options($request);
        curl_setopt_array($ch, $options);
        $output = curl_exec($ch);
        $info = curl_getinfo($ch);
        $error = null;
        if ($output === false)
        {
            $error = curl_error( $ch );
        }
        curl_close($ch);

        // A callback is not really needed for one request, but it keeps the behaviour
        // identical to the multi-request path.
        if ($this->callback)
        {
            if (is_callable($this->callback))
            {
                call_user_func($this->callback, $output, $info, $request, $error);
            }
        }
        else
        {
            return $output;
        }
        return true;
    }

    /**
     * Key that identifies a curl easy handle in $requestMap.
     *
     * Handles are resources before PHP 8 and CurlHandle objects from PHP 8 on;
     * objects can not be cast to string.
     *
     * @param resource|object $handle
     * @return string
     */
    private function handle_key($handle)
    {
        return is_object($handle) ? spl_object_hash($handle) : (string) $handle;
    }

    /**
     * Create an easy handle for request #$index and attach it to the multi handle.
     *
     * @param resource|object $master Multi handle.
     * @param int             $index  Index into $this->requests.
     * @return bool true when the handle was attached.
     */
    private function add_request_handle($master, $index)
    {
        $ch = curl_init();
        curl_setopt_array($ch, $this->get_options($this->requests[$index]));
        if (curl_multi_add_handle($master, $ch) !== 0)
        {
            curl_close($ch);
            return false;
        }
        $this->requestMap[$this->handle_key($ch)] = $index;
        return true;
    }

    /**
     * Run the queue through curl_multi with a rolling window.
     *
     * @param int|null $window_size Overrides $this->window_size when truthy.
     * @return bool Always true.
     */
    private function rolling_curl($window_size = null)
    {
        // An explicit window size replaces the configured one
        if ($window_size)
            $this->window_size = $window_size;

        // Effective window for this batch only: never more than the number of requests
        // and never less than one. Kept local so a small batch does not permanently
        // shrink the configured window.
        $window = (int) min($this->window_size, count($this->requests));
        if ($window < 1)
            $window = 1;

        $this->requestMap = array();
        $master = curl_multi_init();

        // Start the first batch
        $next = 0;    // index of the next request to start
        $active = 0;  // handles currently attached to $master
        while ($next < $window)
        {
            if ($this->add_request_handle($master, $next))
            {
                $active++;
            }
            $next++;
        }

        while ($active > 0)
        {
            do {
                $status = curl_multi_exec($master, $running);
            } while ($status == CURLM_CALL_MULTI_PERFORM);

            if ($status != CURLM_OK) { break; }

            // Collect every transfer that finished
            $finished = 0;
            while ($done = curl_multi_info_read($master))
            {
                $finished++;
                $handle = $done['handle'];

                // Info, body and error of the finished transfer
                $info = curl_getinfo($handle);
                $output = curl_multi_getcontent($handle);
                $error = curl_error($handle);

                $key = $this->handle_key($handle);
                $request = isset($this->requestMap[$key]) ? $this->requests[$this->requestMap[$key]] : null;
                unset($this->requestMap[$key]);

                if (is_callable($this->callback))
                {
                    call_user_func($this->callback, $output, $info, $request, $error);
                }

                // One finished, start the next one to keep the window full. The queue is
                // read live so requests queued by the callback are picked up as well.
                if (isset($this->requests[$next]))
                {
                    if ($this->add_request_handle($master, $next))
                    {
                        $active++;
                    }
                    $next++;
                }

                // Detach and free the finished handle
                curl_multi_remove_handle($master, $handle);
                curl_close($handle);
                $active--;
            }

            if ($running)
            {
                // Block until there is activity instead of spinning at 100% CPU
                if (curl_multi_select($master, $this->timeout) === -1)
                {
                    usleep(1000);
                }
            }
            elseif ($finished === 0)
            {
                // Nothing running and nothing to collect: avoid spinning forever.
                break;
            }
            // Looping on $active (not $running) makes sure handles added after the last
            // curl_multi_exec() call still get executed.
        }

        curl_multi_close($master);

        // Empty the queue so a second execute() on this instance does not re-run old
        // requests. Assign instead of unset(): the property must keep existing.
        $this->requests = array();
        $this->requestMap = array();
        return true;
    }

    /**
     * @return void
     */
    public function __destruct()
    {
        unset($this->window_size, $this->callback, $this->options, $this->headers, $this->requests);
    }
}

// --------------------------------------------------------------------------------
// /test.go:
// --------------------------------------------------------------------------------
// (Go source, kept as comment lines so this block stays valid PHP. The stray `fmt`
// statement that followed Println in the original does not compile and was removed.)
//
// package main
//
// import "fmt"
//
// func main() {
// 	fmt.Println("vim-go")
// }

// --------------------------------------------------------------------------------
// /test.php:
// --------------------------------------------------------------------------------

// NOTE(review): the first lines of this file were lost in extraction; only the tail
// `'http://www.test.com');` of the $data assignment survived. The array key below is
// a reconstruction — confirm against the original file.
$data = array('url' => 'http://www.test.com');
$data = http_build_query($data);
// Create a stream
$opts = [
    //"http" => [
        //"method" => "POST",
        //"header" => "Content-Type: multipart/form-data\r\n",
        //"content" => $data,
    //],
    // NOTE(review): TLS peer verification is disabled here.
    "ssl" => array(
        "verify_peer"=>false,
        "verify_peer_name"=>false,
    ),
];

$context = stream_context_create($opts);

// Open the file using the HTTP headers set above
// NOTE(review): the URL embeds what looks like an API token; it should not live in version control.
$file = file_get_contents('https://api.potato.im:8443/10100386:Z0dT3Oalvu5IGC71OrvGs3hT/setWebhook', false, $context);

var_dump($file);

// --------------------------------------------------------------------------------
// /worker.php:
// --------------------------------------------------------------------------------

// NOTE(review): the opening lines of this file were lost in extraction; the creation
// of $gmworker is reconstructed from the surviving `addServer(...)` call — confirm.
$gmworker = new GearmanWorker();
$gmworker->addServer('10.10.10.238');
$gmworker->addFunction("reverse", "reverse_fn");

print "Waiting for job...\n";
while($gmworker->work())
{
    if ($gmworker->returnCode() != GEARMAN_SUCCESS)
    {
        echo "return_code: " . $gmworker->returnCode() . "\n";
        break;
    }
    //break;
}

/**
 * Gearman job handler: prints the workload and returns it reversed.
 *
 * @param GearmanJob $job
 * @return string
 */
function reverse_fn($job)
{
    sleep(3);
    echo $job->workload()."\n";
    return strrev($job->workload());
}


echo "hello\n";

// --------------------------------------------------------------------------------