├── Crawler ├── Crawler.php ├── PauseData.php └── Queue.php ├── LICENSE ├── README.md ├── SiteConfig ├── 1.php ├── 2.php ├── 3.php ├── 4.php └── 5.php ├── composer.json ├── config.php ├── index.php └── test.php /Crawler/Crawler.php: -------------------------------------------------------------------------------- 1 | configs = $configs; 27 | $this->initSite(); 28 | $this->initPage(); 29 | $this->initDb(); 30 | $this->pauseDrive = PauseFactory::Create( 'Xpath' ); 31 | 32 | 33 | 34 | 35 | $temp1 = parse_url($this->configs['scanUrls'][$this->scanUrlsIndex]); 36 | $this->configs['baseUrl'] = $temp1['scheme'].'://'.$temp1['host'].'/'; 37 | $this->configs['baseUrlPath'] = $temp1['scheme'].'://'.$temp1['host'].$temp1['path']; 38 | unset($temp1); 39 | 40 | 41 | $this->queueObj = new Queue($this->configs['dbConfig']['redis']['host'],$this->configs['dbConfig']['redis']['port'],$this->configs['id'],$this->configs['action']); 42 | if ($this->queueObj->getLength()==0) { 43 | $this->addQueue($this->configs['scanUrls'][$this->scanUrlsIndex],['type'=>1]); 44 | } 45 | 46 | 47 | } 48 | 49 | /** 50 | * [启动] 51 | * @return [type] [description] 52 | */ 53 | public function start(){ 54 | 55 | 56 | $this->configs['beforeCrawl']($this->site); //beforeCrawl回调 57 | 58 | 59 | while (true) { 60 | //执行开关 61 | if (file_exists($this->configs['siteConfigDir'].$this->configs['id'].'stop.txt') ) { 62 | break; 63 | } 64 | 65 | if ($this->configs['debug'] && $this->curlTimes > $this->configs['debugNum']) { 66 | break; 67 | } 68 | //执行开关 69 | 70 | if ($this->queueObj->getLength()==0) { 71 | break; 72 | } 73 | for ($i = 1; $i <= $this->configs['dbConfig']['maxProcess']; ++$i) 74 | { 75 | $pid = pcntl_fork(); 76 | if ($pid == -1) 77 | { 78 | echo "fork child process failed\n"; 79 | exit(0); 80 | } 81 | if (!$pid) 82 | { 83 | 84 | //判断队列是否为空,下一个ScanUrl 85 | if ($this->queueObj->getLength()>0) { 86 | $queueArray = $this->removeQueue($this->configs['thread']); 87 | }else{ 88 | //nextScanUrl 89 | 
$this->scanUrlsIndex++; 90 | if ($this->configs['scanUrls'][$this->scanUrlsIndex]) { 91 | $queueArray[] = ['url'=>$this->configs['scanUrls'][$this->scanUrlsIndex]]; 92 | }else{ 93 | break 2; 94 | //die('done'); 95 | } 96 | } 97 | //判断队列是否为空,下一个ScanUrl 98 | 99 | // 100 | $this->configs['onChangeProxy']($this->site); //onChangeProxy回调 101 | $this->curl($queueArray); 102 | unset($queueArray); 103 | 104 | $this->curlObj->success(function($instance) { 105 | $this->log('curl:'.$instance->url); 106 | 107 | $this->page['url'] = $instance->url; 108 | 109 | $this->page['raw'] = $instance->response; 110 | $this->page['raw'] = $this->convertUtf8($this->page['raw']).$instance->requestHeaders['contextData']; //加上附加数据 111 | 112 | 113 | if ($instance->requestHeaders['contextData']) { 114 | $this->log('contextData:'.$instance->requestHeaders['contextData'],3); 115 | } 116 | 117 | $this->page['request'] = $instance;//还没改造 118 | 119 | 120 | $this->configs['isAntiSpider']($this->page['url'],$this->page['raw']); //isAntiSpider回调 121 | $this->configs['afterDownloadPage']($this->page,$this->site); //afterDownloadPage回调 122 | 123 | 124 | //scanUrls 125 | if (in_array($this->page['url'], $this->configs['scanUrls'])) { 126 | $this->log('in scanUrls:'.$this->page['url']); 127 | 128 | $r1 = $this->configs['onProcessScanPage']($this->page,$this->page['raw'],$this->site); //onProcessScanPage 回调 129 | //$this->log('preg_match:'.$value.' 
-> '.$this->page['url']); 130 | if ($r1 == true) { 131 | $this->parseAllUrl($this->page['raw']); 132 | } 133 | 134 | } 135 | 136 | 137 | 138 | //列表页 139 | foreach ($this->configs['helperUrlRegexes'] as $key => $value) { 140 | if (preg_match("|".$value."$|", $this->page['url'])) { 141 | $this->log('in helperUrl:'.$this->page['url']); 142 | 143 | if (strstr($instance->responseHeaders['content-type'],'application/json')) { //列表页是json数据 144 | $this->page['raw'] = serialize($this->page['raw']); 145 | } 146 | $r1 = $this->configs['onProcessHelperPage']($this->page,$this->page['raw'],$this->site); // 147 | 148 | 149 | if (!$this->configs['contentUrlRegexes']) { 150 | //当内容页正则为空时,可以直接解析列表页 151 | //$this->log('xxxxxxxxxxxxxxx:'.$this->page['url']); 152 | $this->parseData($this->page['url'],$this->page['raw']); 153 | } 154 | if ($r1 == true) { 155 | $this->parseAllUrl($this->page['raw']); 156 | } 157 | break; 158 | } else { 159 | 160 | } 161 | } 162 | //列表页 163 | 164 | 165 | //内容页 166 | foreach ($this->configs['contentUrlRegexes'] as $key => $value) { 167 | if (preg_match("|".$value."$|", $this->page['url'])) { 168 | $this->log('in contentUrl:'.$this->page['url']); 169 | $r1 = $this->configs['onProcessContentPage']($this->page,$this->page['raw'],$this->site); //onProcessContentPage 回调 170 | //$this->log('preg_match:'.$value.' -> '.$this->page['url']); 171 | $this->parseData($this->page['url'],$this->page['raw']); 172 | if ($r1 == true) { 173 | $this->parseAllUrl($this->page['raw']); 174 | } 175 | break; 176 | } else { 177 | 178 | } 179 | } 180 | //内容页 181 | unset($instance); 182 | 183 | //ob_flush(); 184 | //flush(); 185 | //usleep(1000000); 186 | }); 187 | 188 | $this->curlObj->error(function($instance) { 189 | $this->log('call to "' . $instance->url . '" was unsuccessful.' . "\n".'error code: ' . $instance->errorCode . "\n".'error message: ' . $instance->errorMessage . 
"\n",3); 190 | unset($instance); 191 | }); 192 | 193 | $this->curlObj->start(); 194 | exit($i); 195 | 196 | } 197 | usleep(1); 198 | } 199 | while (pcntl_waitpid(0, $status) != -1) 200 | { 201 | $status = pcntl_wexitstatus($status); 202 | if (pcntl_wifexited($status)) 203 | { 204 | //echo ""; 205 | } 206 | echo "$status finished\n"; 207 | } 208 | 209 | 210 | 211 | } 212 | 213 | 214 | 215 | 216 | } 217 | 218 | /** 219 | * [解析所有链接] 220 | * @param [string] $content [description] 221 | * @param [type] $r [description] 222 | * @return [type] [description] 223 | */ 224 | public function parseAllUrl($content){ 225 | 226 | 227 | //echo $content; 228 | $urlData = $this->pauseDrive->pause($content,"//a/@href"); 229 | if ($urlData) { 230 | foreach ($urlData as $key => $value) { 231 | 232 | //$this->log('parseAllUrl url:'.$value); 233 | $temp1 = parse_url($value); 234 | if (!$temp1['host'] || in_array($temp1['host'], $this->configs['domains']) ) { 235 | $this->urlRegexes($value); 236 | } 237 | unset($temp1); 238 | 239 | 240 | } 241 | unset($urlData); 242 | } 243 | 244 | } 245 | 246 | 247 | /** 248 | * [筛选helper/content规则才能进队列] 249 | * @param [type] $url [description] 250 | * @return [type] [description] 251 | */ 252 | public function urlRegexes($url){ 253 | 254 | 255 | foreach ($this->configs['helperUrlRegexes'] as $key => $value) { 256 | if (preg_match("|^".$value."$|", $url)) { 257 | $this->addQueue($url,['type'=>2]); //进队列 258 | break; 259 | } else { 260 | 261 | } 262 | } 263 | 264 | 265 | 266 | foreach ($this->configs['contentUrlRegexes'] as $key => $value) { 267 | if (preg_match("|^".$value."$|", $url)) { 268 | 269 | $this->addQueue($url,['type'=>3]); //进队列 270 | break; 271 | } else { 272 | 273 | } 274 | } 275 | } 276 | 277 | 278 | /** 279 | * [进队列] 280 | * @param [type] $url [description] 281 | * @param [type] $opt [description] 282 | */ 283 | 284 | public function addQueue($url,$opt=[]){ 285 | //print_r($this->configs['dbConfig']);die; 286 | $this->queueObj = 
$this->queueObj?$this->queueObj:new Queue($this->configs['dbConfig']['redis']['host'],$this->configs['dbConfig']['redis']['port'],$this->configs['id'],$this->configs['action']);

        // Turn relative links into absolute URLs before they are queued.
        $parts = parse_url($url);
        if (!is_array($parts)) {
            $parts = [];    // parse_url() returns false for seriously malformed URLs
        }
        if (empty($parts['scheme'])) {
            if (substr($url, 0, 2) == '//') {
                // Protocol-relative link ("//host/path"): only the scheme is missing,
                // so borrow it from the base URL instead of gluing the host onto it.
                $url = parse_url($this->configs['baseUrl'], PHP_URL_SCHEME).':'.$url;
            } elseif (empty($parts['path'])) {
                $url = $this->configs['baseUrlPath'].$url;      // "?xxxx"
            } else {
                if (substr($url, 0, 1) == '/') {                // "/1.html"
                    $url = substr($url, 1);
                }
                $url = $this->configs['baseUrl'].$url;          // "1.html"
            }
        }
        // else: already an absolute URL, queue it unchanged

        $this->queueObj->addLast(["url"=>$url,"opt"=>$opt]);
    }


    /**
     * [Parse page data] Extract the configured fields from a page and store the record.
     *
     * Runs parseFields() over $content, drops the whole record when the
     * 'skipAllPage998' marker is present, removes the keys reported by the
     * 'transient' lookup, fires the afterExtractPage callback and inserts the
     * record as JSON into $this->tableName.
     *
     * @param  string $url     URL the content came from (stored with the record)
     * @param  string $content page body to extract from
     * @return void
     */
    public function parseData($url,$content){

        $this->skip = ''; // reset the skip state left over from the previous page
        $fieldContent = $this->parseFields($this->configs['fields'],$content);

        // A 'skipAllPage998' key anywhere in the result marks the record as
        // skipped (page.skip() or a missing required field). The variable is
        // set to null rather than unset so the calls below never receive an
        // undefined variable.
        if (!is_array($fieldContent) || $this->array_search_key('skipAllPage998', $fieldContent)) {
            $fieldContent = null;
        }

        // array_search_key() returns the values stored under 'transient' keys
        // of the field config; each one is used as a key name to remove.
        $delKey = $this->array_search_key('transient', $this->configs['fields']);
        foreach ($delKey as $value) {
            $this->log('delKey:'.(string)$value,1);
            $this->array_remove_key($fieldContent, (string)$value);
        }

        $this->configs['afterExtractPage']($this->page,$fieldContent); // afterExtractPage callback

        if ($this->getDbInstance() && $fieldContent) {
            $this->getDbInstance()->insert($this->tableName, [
                "site" => $this->configs['id'],
                "url" => $url,
                "data" => json_encode($fieldContent,JSON_UNESCAPED_UNICODE)
            ]);
        }

        if ($this->configs['debug'] ) {
            $this->log('fieldContent:');
            $this->log($fieldContent,1);
        }else{
            $this->log('quequeLeft:'.$this->queueObj->getLength(),1);
        }
    }

    /**
     * [Recursively parse fields] Apply every field definition to $content.
     *
     * For each field the selector is evaluated with the driver named by
     * 'selectorType' ($this->pauseDrive when it is 'Xpath' or missing). Fields
     * with 'children' are treated as repeated items and each item is parsed
     * recursively with the child definitions.
     *
     * @param  array  $fields  field definitions (name, selector, selectorType, required, children, ...)
     * @param  string $content content the selectors run against
     * @return array  extracted values keyed by field name; contains the key
     *                'skipAllPage998' when the whole record must be dropped
     */
    public function parseFields($fields,$content){

        $con = [];      // always return an array, even for an empty field list
        $img = null;    // placeholder passed to the image callbacks (never filled in here)

        foreach ($fields as $k => $value) { // first level
            $pf = (empty($value['selectorType']) || $value['selectorType'] == 'Xpath')
                ? $this->pauseDrive
                : PauseFactory::Create($value['selectorType']);

            if (!empty($value['sourceType']) && $value['sourceType'] == 'AttachedUrl') {
                // Substitute {fieldName} placeholders with values extracted so far.
                preg_match_all("|\{([^}]+)\}|", $value['attachedUrl'],$match);
                foreach ($match[0] as $matchKey => $matchValue) {
                    $replacement = isset($con[$match[1][$matchKey]]) ? $con[$match[1][$matchKey]] : '';
                    $value['attachedUrl'] = str_replace($matchValue, $replacement, $value['attachedUrl']);
                }

                $curlOneContent = $this->curlOne($value['attachedUrl']);
                if ($curlOneContent) {
                    // NOTE(review): this replaces $content for every following
                    // field as well, not only the current one - confirm intended.
                    $content = $curlOneContent;

                    $this->configs['afterDownloadAttachedPage']($this->page,$this->site); // afterDownloadAttachedPage callback
                }
            }

            $con[$value['name']] = $pf->pause($content,$value['selector']);

            // required / page.skip() handling: a special key marks the record
            // so parseData() can find it and drop the whole record.
            if (!empty($value['required']) && !$con[$value['name']]) {
                $con['skipAllPage998'] = 1;
                $this->log('required:'.$value['name'],3);
            }
            if ($this->skip == 'skipAllPage998') {
                $con['skipAllPage998'] = 1;
            }

            $this->configs['beforeHandleImg']($value['name'],$img); // beforeHandleImg
            $this->configs['beforeCacheImg']($value['name'],$img); // beforeCacheImg
            $this->configs['afterExtractField']($value['name'],$con[$value['name']],$this->page); // afterExtractField

            if (!empty($value['children'])) {

                $contentRepeat = $con[$value['name']];
                $con[$value['name']] = [];

                // The selector drivers return a bare value instead of a list
                // when exactly one node matches; wrap it so a single repeated
                // item is still parsed instead of being silently dropped.
                if (!is_array($contentRepeat) && $contentRepeat !== null && $contentRepeat !== '' && $contentRepeat !== false) {
                    $contentRepeat = [$contentRepeat];
                }

                if (is_array($contentRepeat)) {
                    foreach ($contentRepeat as $k2 => $repeatedContent) { // every repeated item
                        $con[$value['name']][$k2] = $this->parseFields($value['children'],(string)$repeatedContent);

                        // Drop items for which nothing was extracted.
                        if (!$con[$value['name']][$k2]) {
                            $this->log('required2:'.$value['name'],3);
                            unset($con[$value['name']][$k2]);
                        }

                        // page.skip('<field name>') drops only the current item.
                        if ($this->skip == $value['name']) {
                            $this->log('skip:'.$value['name'],3);
                            unset($con[$value['name']][$k2]);
                            $this->skip = '';
                        }
                    }
                }
            }
        }

        return $con;
    }


    /**
     * [Dequeue] Take up to $c entries from the head of the queue.
     *
     * @param  integer $c maximum number of entries to take
     * @return array   the dequeued entries; empty when the queue is empty
     */
    public function removeQueue($c=1){

        $queueArray = [];
        $available = $this->queueObj->getLength();
        $c = $c > $available ? $available : $c;

        for ($i = 0; $i < $c; $i++) {
            $item = $this->queueObj->removeFirst();
            if (!is_array($item)) {
                // Another forked worker emptied the queue between getLength()
                // and this pop; stop instead of queueing a non-entry.
                break;
            }
            $queueArray[] = $item;
        }

        return $queueArray;
    }


    /**
     * Return the MultiCurl handle of the current process, creating it on first use.
     * Instances are cached per PID (getmypid()).
     */
    public function getCurlInstance()
    {
        static $instances = array();
        $key = getmypid();
        if (empty($instances[$key]))
        {
            $instances[$key] = new MultiCurl();
        }
        return $instances[$key];
    }


    /**
* [Batch request] Configure the per-process MultiCurl handle and queue one request per entry.
     *
     * Applies the site-wide headers, cookies and user agent, then adds a GET
     * or POST for every queue entry. Nothing is sent here; start() runs the
     * transfers by calling $this->curlObj->start().
     *
     * @param array $queueArray entries shaped like ['url' => string, 'opt' => array]
     */

    public function curl($queueArray){

        $this->curlObj = $this->getCurlInstance();

        if (!empty($this->site['header'])) {
            foreach ($this->site['header'] as $key => $value) {
                $this->curlObj->setHeader($key, $value);
            }
        }

        if (!empty($this->site['cookie'])) {
            foreach ($this->site['cookie'] as $key => $value) {
                $this->curlObj->setCookie($key, $value);
            }
        }

        // Default browser user agent, overridden by the site-specific one if set.
        $this->curlObj->setUserAgent('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36');
        if (!empty($this->site['userAgent'])) {
            $this->curlObj->setUserAgent($this->site['userAgent']);
        }

        // NOTE(review): TLS certificate verification is switched off for every request.
        $this->curlObj->setOpt(CURLOPT_SSL_VERIFYHOST, false);
        $this->curlObj->setOpt(CURLOPT_SSL_VERIFYPEER, false);

        if (!is_array($queueArray)) {
            return; // nothing was dequeued
        }

        foreach ($queueArray as $v) {
            if (!is_array($v) || !isset($v['url'])) {
                continue; // not a queue entry (e.g. a failed pop)
            }
            // Scan-URL entries are queued without an 'opt' key.
            $opt = (isset($v['opt']) && is_array($v['opt'])) ? $v['opt'] : [];

            if (!empty($opt['header'])) {
                foreach ($opt['header'] as $key => $value) {
                    $this->curlObj->setHeader($key, $value);
                }
            }
            if (!empty($opt['contextData'])) {
                // contextData travels as a request header; the success callback
                // in start() reads it back from $instance->requestHeaders.
                $this->curlObj->setHeader('contextData', $opt['contextData']);
            }

            $data = isset($opt['data']) ? $opt['data'] : null;
            if (isset($opt['method']) && $opt['method'] == 'POST') {
                $this->curlObj->addPost($v['url'], $data);
            }else{
                $this->curlObj->addGet($v['url'], $data);
            }
        }

    }

    /**
     * [Single request] Fetch one URL synchronously with its own Curl handle.
     *
     * On success the body is also stored in $this->page['raw'].
     *
     * @param  string $url     URL to request
     * @param  array  $options optional 'headers', 'method' ('POST' for a POST), 'data', 'contextData'
     * @return string|false    response body (with contextData appended), or false on a curl error
     */
    public function curlOne($url,$options=[]){

        $this->log('curlOne:'.$url);
        $curl = new Curl();
        // NOTE(review): TLS certificate verification is switched off here too.
        $curl->setOpt(CURLOPT_SSL_VERIFYHOST, false);
        $curl->setOpt(CURLOPT_SSL_VERIFYPEER, false);

        if (!empty($this->site['header'])) {
            foreach ($this->site['header'] as $key => $value) {
                $curl->setHeader($key, $value);
            }
        }

        if (!empty($this->site['cookie'])) {
            foreach ($this->site['cookie'] as $key => $value) {
                $curl->setCookie($key, $value);
            }
        }

        // The default user agent must go on the local handle. The original set
        // it on $this->curlObj, which is still null when this method runs
        // before curl() and never affected this request anyway.
        $curl->setUserAgent('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36');
        if (!empty($this->site['userAgent'])) {
            $curl->setUserAgent($this->site['userAgent']);
        }

        // NOTE(review): curl() reads $opt['header'] while this method reads
        // $options['headers'] - confirm which spelling callers are meant to use.
        if (!empty($options['headers'])) {
            foreach ($options['headers'] as $key => $value) {
                $curl->setHeader($key, $value);
            }
        }

        if (isset($options['method']) && $options['method'] == 'POST') {
            $curl->post($url, isset($options['data']) ? $options['data'] : null);
        }else{
            $curl->get($url);
        }

        $this->curlTimes++;
        $this->log('curlOneTimes:'.$this->curlTimes);

        if ($curl->error) {
            $this->log( 'Url: ' . $url .'Error: ' . $curl->errorCode . ': ' . $curl->errorMessage ,3);
            return false;
        }

        $body = $curl->response;
        if ( strstr( $curl->responseHeaders['content-type'],'application/json') ) {
            // JSON responses are re-encoded to a string before being stored.
            $body = json_encode($body);
        }

        $this->page['raw'] = $body;
        $this->page['request'] = $curl->request; // not reworked yet

        if (!empty($options['contextData'])) {
            $this->page['raw'] .= $options['contextData'];
        }
        return $this->page['raw'];
    }

    /**
     * [Convert content to UTF-8]
     *
     * When a source charset is configured, converts $content from it to UTF-8
     * and rewrites charset declarations inside the content to match.
     *
     * @param  string $content raw page body
     * @return string converted body (unchanged when no charset is configured)
     */
    private function convertUtf8($content){
        if (!empty($this->configs['charset'])) {
            $content = mb_convert_encoding($content, "utf-8",$this->configs['charset']);
            // [\w-]+ so hyphenated names such as iso-8859-1 are replaced as a
            // whole; the optional quote also covers <meta charset="gbk">.
            $content = preg_replace('|charset\s*=\s*(["\']?)[\w-]+\1|i', 'charset=$1UTF-8$1', $content);
        }
        return $content;
    }

    /**
     * [Remove a key from an array, recursively]
     *
     * Entries whose value is an array are descended into instead of being
     * removed, so the key to delete must not hold an array itself.
     *
     * @param  array|null &$arr array to modify in place (non-arrays are ignored)
     * @param  string     $k    key to remove
     * @return void
     */
    public function array_remove_key(&$arr, $k){
        if (!is_array($arr)) {
            return;
        }
        foreach (array_keys($arr) as $key) {
            if (is_array($arr[$key])) {
                $this->array_remove_key($arr[$key], $k);
            } elseif (trim((string)$key) === (string)$k) {
                // Remove the key that actually matched (it may differ from $k
                // by surrounding whitespace).
                unset($arr[$key]);
            }
        }
    }


    /**
     * Search the leaves of a nested array.
     *
     * In 'key' mode the values of all leaves whose key is identical to $search
     * are returned; in any other mode the keys of all leaves whose value is
     * identical to $search are returned.
     *
     * @param  mixed  $search key or value to look for (compared with ===)
     * @param  array  $array  nested array to search
     * @param  string $mode   'key' (default) or 'value'
     * @return array  matches in iteration order; empty when nothing matches
     */
    public function array_search_key( $search, array $array, $mode = 'key'){
        $res = array();
        $byKey = ($mode == 'key');
        $leaves = new RecursiveIteratorIterator(new RecursiveArrayIterator($array));
        foreach ($leaves as $key => $value) {
            if ($search === ($byKey ? $key : $value)) {
                $res[] = $byKey ? $value : $key;
            }
        }
        return $res;
    }



    /**
     * [initSite] Populate $this->site with the helper closures passed to the site callbacks.
     *
     * @return void
     */
    public
function initSite(){
        $this->site['scanUrls'] = [];
        $this->site['helperUrls'] = [];
        $this->site['contentUrls'] = [];

        // Pre-declare the slots that curl() and curlOne() read, so they are
        // defined even when no callback ever sets them.
        foreach (['header', 'cookie'] as $slot) {
            if (!isset($this->site[$slot])) {
                $this->site[$slot] = [];
            }
        }
        if (!isset($this->site['userAgent'])) {
            $this->site['userAgent'] = '';
        }

        // Add one request header sent with every request.
        $this->site['addHeader'] = function ($key,$value) {
            $this->site['header'][$key] = $value;
        };

        // Add a single cookie.
        $this->site['addCookie'] = function ($key,$value) {
            $this->site['cookie'][$key] = $value;
        };

        // Add cookies from a "name=value; name2=value2" string.
        $this->site['addCookies'] = function ($cookies) {
            foreach (explode(';', $cookies) as $pair) {
                $pair = trim($pair);
                if ($pair === '') {
                    continue; // trailing ';' or empty segment
                }
                // Split on the first '=' only: cookie values may contain '='.
                $kv = explode('=', $pair, 2);
                $this->site['cookie'][$kv[0]] = isset($kv[1]) ? urldecode($kv[1]) : '';
            }
        };

        // Queue a URL for crawling.
        $this->site['addUrl'] = function ($url,$options=[]) {
            $this->addQueue($url,$options);
        };

        // Fetch a URL immediately and return its body.
        $this->site['requestUrl'] = function ($url,$options=[]) {
            return $this->curlOne($url,$options);
        };

        $this->site['setUserAgent'] = function ($userAgent) {
            $this->site['userAgent'] = $userAgent;
        };
    }

    /**
     * [initPage] Install the page helper closures.
     *
     * $this->page['skip']($fieldName) records a skip request in $this->skip;
     * without an argument it requests that the whole page be skipped
     * ('skipAllPage998').
     *
     * @return void
     */
    public function initPage(){

        $this->page['skip'] = function ($fieldName='') {
            $this->skip = $fieldName ? $fieldName : 'skipAllPage998';

            $this->log('$this->skip:'.$this->skip,3);
        };

    }


    /**
     * Return the database connection of the current process, creating it on first use.
     * Instances are cached per PID (getmypid()).
     */
    public function getDbInstance()
    {
        static $instances = array();
        $key = getmypid();
        if (empty($instances[$key]))
        {
            $instances[$key] = new \medoo($this->configs['dbConfig']['db']);
        }
        return $instances[$key];
    }

    /**
     * Prepare the result table for a 'start' or 'restart' run.
     *
     * Sets $this->tableName to 'jmz_data<site id>', drops the table first on
     * 'restart', creates it when missing, and deletes this site's rows when
     * dbConfig.insertType is 2. Does nothing for any other action.
     *
     * @return void
     */
    public function initDb(){

        if ($this->configs['action'] != 'start' && $this->configs['action'] != 'restart') {
            return;
        }

        $this->db = new \medoo($this->configs['dbConfig']['db']);
        $this->tableName = 'jmz_data'.$this->configs['id'];

        if ($this->configs['action'] == 'restart') {
            // IF EXISTS: a restart before the first run must not fail.
            $this->db->query("DROP TABLE IF EXISTS `".$this->tableName."`;");
        }

        // One idempotent statement. The original counted rows on an undefined
        // $tableName, so it always re-ran CREATE plus two ALTERs, and the ALTERs
        // fail on an existing table. The schema is the same as before, with the
        // keys and AUTO_INCREMENT declared inline.
        $this->db->query("CREATE TABLE IF NOT EXISTS `".$this->tableName."` (
            `id` int(11) NOT NULL AUTO_INCREMENT,
            `create_date` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
            `update_date` datetime DEFAULT NULL,
            `site` int(5) NOT NULL DEFAULT '0',
            `url` varchar(200) NOT NULL,
            `data` longtext,
            `other` text,
            `flag` int(3) NOT NULL DEFAULT '1',
            PRIMARY KEY (`id`),
            UNIQUE KEY `site` (`site`,`url`)
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8;");

        if ($this->configs['dbConfig']['insertType'] == 2) {
            // insertType 2: start from a clean slate for this site.
            $this->db->delete($this->tableName, ["site" => $this->configs['id']]);
        }
    }

    /**
     * [log] Print a timestamped message; the level selects the console colour
     * (1, 2 and 3 are coloured, anything else is plain).
     *
     * @param  string|array $str   message; arrays are JSON-encoded
     * @param  integer      $level colour level
     * @return void
     */
    public function log($str,$level=0){

        $str = is_array($str)?json_encode($str,JSON_UNESCAPED_UNICODE):$str;

        echo date("Y-m-d H:i:s")." 
"; 848 | if ($level == 1) { 849 | echo "\e[1;33m $str \e[0m \n"; 850 | }elseif ($level == 2) { 851 | echo "\e[1;34m $str \e[0m \n"; 852 | }elseif ($level == 3) { 853 | echo "\e[1;31m $str \e[0m \n"; 854 | }else{ 855 | echo "$str \n"; 856 | } 857 | 858 | 859 | unset($str); 860 | 861 | 862 | } 863 | 864 | 865 | } -------------------------------------------------------------------------------- /Crawler/PauseData.php: -------------------------------------------------------------------------------- 1 | loadHTML($content); 18 | libxml_clear_errors(); 19 | $xPath = new DOMXPath($dom); 20 | $elements = $xPath->query($selector); 21 | if ($elements->length>1) { 22 | for ($i=0; $i < $elements->length; $i++) { 23 | if ($opt==true) { 24 | $ret[] = $elements->item($i)->nodeValue; 25 | }else{ 26 | $ret[] = $elements->item($i)->nodeValue?$dom->saveHtml($elements->item($i)):''; 27 | } 28 | 29 | } 30 | }else{ 31 | if ($opt==true) { 32 | $ret = $elements->item(0)->nodeValue; 33 | }else{ 34 | $ret = $elements->item($i)->nodeValue?$dom->saveHtml($elements->item(0)):''; 35 | } 36 | 37 | } 38 | return $ret; 39 | } 40 | } 41 | 42 | 43 | class PauseXPath implements PauseData 44 | { 45 | public function __construct( ) { } 46 | 47 | public function pause($content,$selector,$opt=false){ 48 | $document = new Document(); 49 | $document->loadHtml($content); 50 | $lists = $document->xpath($selector); 51 | 52 | if ($lists) { 53 | if ($lists[1]) { 54 | foreach($lists as $k =>$list) { 55 | //print_r($selector);//die; 56 | 57 | if (is_string($list)) { 58 | $r[] = trim($list); 59 | }else{ 60 | $r[] = trim($list->innerHtml()); 61 | } 62 | } 63 | }else{ 64 | 65 | if (is_string($lists[0])) { 66 | $r = trim($lists[0]); 67 | }else{ 68 | 69 | /*echo "
\n"; 70 | print_r($selector);print_r($lists); 71 | echo "
\n";*/ 72 | $r = trim($lists[0]->innerHtml()); 73 | } 74 | 75 | 76 | } 77 | } 78 | 79 | 80 | 81 | return $r; 82 | } 83 | } 84 | 85 | 86 | class PauseJsonPath implements PauseData 87 | { 88 | public function __construct( ) { } 89 | 90 | public function pause($content,$selector,$opt=false){ 91 | //echo 'pause:
'; 92 | //print_r($content); 93 | //print_r($selector); 94 | //echo '
';//die; 95 | 96 | if (is_array($content)) { 97 | $content = json_encode($content); 98 | } 99 | $content = json_decode($content,true); 100 | 101 | if (strpos($selector, '.')) { 102 | $temp1 = explode('.', $selector); 103 | foreach ($temp1 as $key => $value) { 104 | $content = $content[$value]; 105 | } 106 | }else{ 107 | $content = $content[$selector]; 108 | } 109 | 110 | return $content; 111 | } 112 | } 113 | 114 | /* Regex driver: $selector is a complete PCRE pattern and capture group 1 holds the wanted value. */ 115 | class PauseRegex implements PauseData 116 | { 117 | public function __construct( ) { } 118 | 119 | public function pause($content,$selector,$opt=PREG_PATTERN_ORDER){ 120 | /* With the default PREG_PATTERN_ORDER: returns all group-1 captures as an array when there are two or more, the single capture itself when there is exactly one, and null when nothing was captured. $opt is passed to preg_match_all() as its flags. */ 121 | $result = []; 122 | if (!preg_match_all($selector,$content,$result,$opt) || empty($result[1])) { return null; } /* no match, a failed pattern, or no capture group: return null instead of reading undefined offsets */ 123 | return isset($result[1][1])?$result[1]:$result[1][0]; 124 | } 125 | } 126 | /* CSS-selector driver; Document is presumably DiDom\Document, the DOM library required in composer.json - TODO confirm. */ 127 | class PauseCssPath implements PauseData 128 | { 129 | 130 | public function __construct( ) { } 131 | 132 | public function pause($content,$selector,$opt=false){ 133 | $r = null; /* Returns the inner HTML of every element matching $selector as an array, or null when nothing matches; $opt is unused. */ 134 | $document = new Document(); 135 | $document->loadHtml($content); 136 | $lists = $document->find($selector); 137 | 138 | if ($lists) { 139 | /* a truthy result list is never empty, so every match is collected the same way */ 
140 | $r = []; 141 | foreach($lists as $list) { 142 | $r[] = $list->innerHtml(); 143 | } 144 | 145 | 146 | 147 | } 148 | 149 | 150 | 151 | return $r; 152 | } 153 | } 154 | /* Instantiates the parser driver class named 'Pause'.$method (e.g. 'Regex' gives PauseRegex). */ 155 | class PauseFactory 156 | { 157 | public static function Create( $method ) 158 | { 159 | $class = 'Pause'.$method; 160 | if (!class_exists($class) || !is_subclass_of($class, 'PauseData')) { throw new InvalidArgumentException('Unknown parser driver: '.$method); } /* test.php passes a request parameter here, so only PauseData implementations may be built */ 161 | return new $class( ); 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /Crawler/Queue.php: -------------------------------------------------------------------------------- 1 | host = $host; 13 | $this->port = $port; 14 | $this->key = $this->key.$id; 15 | 16 | if ($action == 'restart') { 17 | $this->getInstance()->del($this->key); 18 | $this->getInstance()->del($this->key.'AllUrl'); 19 | } 20 | 21 | 22 | } 23 | 24 | /** 25 | * Append an item to the tail of the queue. URLs that were already queued once are skipped unless $value['opt']['reserve'] is truthy. 26 | * @param array $value ['url'=>$url,'opt'=>$opt] @return mixed the rpush result, or null when the URL was skipped as a duplicate 27 | */ 28 | public function addLast($value) 29 | { 30 | if (!empty($value['opt']['reserve'])) { /* reserve: bypass de-duplication; !empty() also covers items that carry no 'opt' */ 31 | return $this->getInstance()->rpush($this->key, serialize($value)); 32 | } 33 | 34 | /* NOTE(review): the seen-URL map is read, modified and written back as one serialized blob, so concurrently running worker processes can overwrite each other's additions; a Redis set would make this atomic, but that changes the stored format - confirm before switching. */ 35 | $stored = $this->getInstance()->get($this->key.'AllUrl'); 36 | $allUrl = $stored ? unserialize($stored) : []; 37 | if (!is_array($allUrl)) { $allUrl = []; } /* key missing or unreadable: start from an empty map instead of indexing into false */ 38 | 39 | if (empty($allUrl[$value["url"]])) { 40 | $allUrl[$value["url"]]=1; 41 | $this->getInstance()->set($this->key.'AllUrl', serialize($allUrl)); 42 | return $this->getInstance()->rpush($this->key, serialize($value)); 43 | } 44 | 45 | 46 | 47 | } 48 | /** Pop the item at the head of the queue and unserialize it. **/ 49 | public function removeFirst() 50 | { 51 | return unserialize($this->getInstance()->lpop($this->key)); 52 | } 53 | 54 | /** Number of items currently in the queue list. **/ 55 | public function getLength() 56 | { 57 | return $this->getInstance()->llen($this->key); 58 | } 59 | /** Returns the Redis connection of the current process, opening it on first use; connections are cached per process id. **/ 60 | public function getInstance() 61 | { 62 | static $instances = array(); 63 | $key = getmypid(); 64 | if (empty($instances[$key])) 65 | { 66 | $instances[$key] = new Redis(); 67 | //echo 111; 68 | 
//print_r($instances[$key]); 69 | 70 | $instances[$key]->connect($this->host, $this->port); 71 | } 72 | 73 | return $instances[$key]; 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 鸠摩智(多进程版)简介 2 | ------------- 3 | 前几天看到 http://doc.shenjianshou.cn/ 觉得不错,就省下几天守望开车时间照着文档用php实现了一遍,下面是对比shenjianshou的不足,改进和使用上的区别 4 | 5 | > **不足:** 6 | 7 | > - 没有js渲染 8 | > - 没有验证码识别 9 | > - 暂时没有自动换代理 10 | > - 暂时没有图片本地化/托管云 11 | > - 暂时没有模拟登录 12 | > - 暂时没有录入数据库 (已加) 13 | 14 | 15 | > **特点/改进:** 16 | 17 | > - 多进程,现在只能在Linux下使用了,单进程很容易内存耗尽- - 18 | > - 使用curl multi"多线程",可以自定义"线程"数,速度刷刷刷的 19 | > - 支持css选择器,xpath,正则3种选择器 20 | 21 | > **使用上的区别:** 22 | 23 | > - 查看SiteConfig目录下 数字.php 具体配置,因为用php搞的,各种传递参数啥的都不一样,但看起来大体一样; 24 | > - 用Xpath取回来的是innerHtml 25 | > - jsonpath没怎么弄 26 | > - contentUrlRegexes helperUrlRegexes 规则里没带域名 27 | 28 | #### 如何安装 29 | ``` 30 | git clone https://github.com/ketle/jiumozhi.git 31 | cd jiumozhi 32 | composer install 33 | ``` 34 | 35 | #### 如何开始 36 | ``` 37 | 配置config.php下db,redis设置 38 | Usage: php index.php <1-n> <test|start|restart|stop|clean> 39 | 比如: php index.php 1 test 40 | 手动停止请Ctrl+c 41 | SiteConfig目录下自带了5个例子; 42 | 第一个例子入库2599条数据 时间从2016-08-22 22:15:13 - 2016-08-22 22:17:48 才用时2分半 43 | ``` 44 | 45 | #### 为啥叫"鸠摩智" 46 | 拍脑袋想出来的,哈哈哈 -------------------------------------------------------------------------------- /SiteConfig/1.php: -------------------------------------------------------------------------------- 1 | 'article_title','alias'=>'标题','selector'=>"//h2[@class='entry-name']/text()",'selectorType'=>'XPath','required'=>1], 18 | ['name'=>'article_con','alias'=>'标题','selector'=>"//div[@class='entry-content']",'selectorType'=>'XPath','required'=>1], 19 | 20 | ]; 21 | $configs['beforeCrawl'] = function (&$site) { 22 | //$site['addHeader']("Referer", "http://buluo.qq.com/p/index.html"); 23 | //$site['addCookies']("last_item_date:10733=1467003228; mykeywords=a%3A1%3A%7Bi%3A0%3Bs%3A6%3A%22%E4%BD%90%E7%BD%97%22%3B%7D; PHPSESSID=2mq1jhshc6ssi2rc3j3iontku7; 
GINFO=uid%3D3519430%26nickname%3Dketle%26group_id%3D0%26avatar_t%3D%26main_group_id%3D0%26common_group_id%3D59; GKEY=0c3d0734c04ae6f2b72632d0553eb116"); 24 | 25 | }; 26 | $configs['nextScanUrl'] = function (&$url) { 27 | return ; 28 | }; 29 | $configs['onChangeProxy'] = function (&$site) { 30 | return ; 31 | }; 32 | $configs['isAntiSpider'] = function (&$url,&$content) { 33 | return ; 34 | }; 35 | $configs['afterDownloadPage'] = function (&$page,&$site) { 36 | 37 | return ; 38 | }; 39 | $configs['afterDownloadAttachedPage'] = function (&$page,&$site) { 40 | return ; 41 | }; 42 | $configs['onProcessScanPage'] = function (&$page,&$content,&$site) { 43 | /*echo 'sss:'; 44 | print_r($site['scanUrls']); 45 | print_r($site['helperUrls']); 46 | print_r($site['contentUrls']);*/ 47 | 48 | //$site['addUrl']('http://fuliba.net/%e6%88%91%e6%9c%89%e4%b8%80%e4%b8%aa%e6%a2%a6%e6%83%b3%ef%bc%9a%e9%9d%a2%e6%9c%9d%e5%8f%b8%e6%9c%ba%ef%bc%8c%e4%ba%ba%e6%bb%a1%e8%bd%a6%e5%bc%80.html'); 49 | 50 | $content = str_replace('', '', $content); //奇怪的网站,一会儿utf8,一会儿gb2312,导致xpath不正常; 51 | 52 | 53 | return true; 54 | }; 55 | $configs['onProcessHelperPage'] = function (&$page,&$content,&$site) { 56 | $content = str_replace('', '', $content); 57 | return true; 58 | }; 59 | $configs['onProcessContentPage'] = function (&$page,&$content,&$site) { 60 | $content = str_replace('', '', $content); 61 | return true; 62 | }; 63 | 64 | $configs['beforeHandleImg'] = function (&$fieldName,&$img) { 65 | return ; 66 | }; 67 | $configs['beforeCacheImg'] = function (&$fieldName,&$url) { 68 | return ; 69 | }; 70 | $configs['afterExtractField'] = function (&$fieldName,&$data,&$page) { 71 | //if ($fieldName == 'article_title' && trim($data) == '翻山新科技,i42.li,配置仅需两步') { 72 | //print_r($data);die; 73 | //$page['skip'](); 74 | //} 75 | return ; 76 | }; 77 | $configs['beforeCacheImg'] = function (&$page,&$data) { 78 | return ; 79 | }; 80 | $configs['afterExtractPage'] = function (&$page,&$data) { 81 | return ; 82 | }; 
-------------------------------------------------------------------------------- /SiteConfig/2.php: -------------------------------------------------------------------------------- 1 | 'article_title', 20 | 'alias'=>'标题', 21 | 'selector'=>'//*[@id="pt"]/div/a[5]/text()', 22 | 'selectorType'=>'XPath', 23 | 'required'=>1 24 | ], 25 | [ 26 | 'name'=>'article_content', 27 | 'alias'=>'内容', 28 | 'selector'=>"/html/body[@id='nv_forum']/div[@id='wp']/div[@id='ct']/div[@id='pgt']/div[@class='pgt']/div[@class='pg']/a", 29 | 'selectorType'=>'XPath', 30 | 'repeated'=>1, 31 | 'children'=> 32 | [ 33 | [ 34 | 'name'=>'page', 35 | 'alias'=>'分页', 36 | 'selector'=>'//text()', 37 | 'selectorType'=>'XPath', 38 | 'required'=>1, 39 | 'transient'=>'page' //临时变量,要删掉 40 | ], 41 | [ 42 | 'name'=>'article_content2', 43 | 'alias'=>'内容', 44 | 'selector'=>'//div[@class="im286table"]', 45 | 'selectorType'=>'XPath', 46 | 'sourceType'=>'AttachedUrl', 47 | 'attachedUrl'=>'{page}', 48 | 'repeated'=>1, 49 | 'children'=> 50 | [ 51 | [ 52 | 'name'=>'author', 53 | 'alias'=>'作者', 54 | 'selector'=>'//a[@class="xw1"]', 55 | 'selectorType'=>'XPath', 56 | 'required'=>1, 57 | 'transient'=>'author' //临时变量,要删掉 58 | ], 59 | [ 60 | 'name'=>'content', 61 | 'alias'=>'内容', 62 | 'selector'=>'//*[contains(@id,"postmessage_")]', 63 | 'selectorType'=>'XPath', 64 | 'required'=>1 65 | ], 66 | ] 67 | ], 68 | ] 69 | ], 70 | ]; 71 | $configs['beforeCrawl'] = function (&$site) { 72 | //$site['addHeader']("Referer", "http://buluo.qq.com/p/index.html"); 73 | //$site['addCookies']("last_item_date:10733=1467003228; mykeywords=a%3A1%3A%7Bi%3A0%3Bs%3A6%3A%22%E4%BD%90%E7%BD%97%22%3B%7D; PHPSESSID=2mq1jhshc6ssi2rc3j3iontku7; GINFO=uid%3D3519430%26nickname%3Dketle%26group_id%3D0%26avatar_t%3D%26main_group_id%3D0%26common_group_id%3D59; GKEY=0c3d0734c04ae6f2b72632d0553eb116"); 74 | 75 | }; 76 | $configs['nextScanUrl'] = function (&$url) { 77 | return ; 78 | }; 79 | $configs['onChangeProxy'] = function (&$site) { 80 | return ; 81 | }; 
82 | $configs['isAntiSpider'] = function (&$url,&$content) { 83 | return ; 84 | }; 85 | $configs['afterDownloadPage'] = function (&$page,&$site) { 86 | //print_r($site['header']) ; 87 | 88 | if ($site) { 89 | //echo 111; 90 | } 91 | return ; 92 | }; 93 | $configs['afterDownloadAttachedPage'] = function (&$page,&$site) { 94 | return ; 95 | }; 96 | $configs['onProcessScanPage'] = function (&$page,&$content,&$site) { 97 | 98 | //$site['addUrl']('http://www.im286.net/thread-17437914-1.html'); 99 | return true; 100 | }; 101 | $configs['onProcessHelperPage'] = function (&$page,&$content,&$site) { 102 | return true; 103 | }; 104 | $configs['onProcessContentPage'] = function (&$page,&$content,&$site) { 105 | return true; 106 | }; 107 | 108 | $configs['beforeHandleImg'] = function (&$fieldName,&$img) { 109 | return ; 110 | }; 111 | $configs['beforeCacheImg'] = function (&$fieldName,&$url) { 112 | return ; 113 | }; 114 | $configs['afterExtractField'] = function (&$fieldName,&$data,&$page) { 115 | if ($fieldName == 'article_content') { 116 | //echo "
article_content:"; 117 | //print_r($page['url']);print_r($data) ; 118 | 119 | if ($data) { 120 | # code... 121 | array_unshift($data, 1); 122 | array_pop($data); 123 | 124 | foreach ($data as &$value) { 125 | 126 | $value = str_replace('-1.html','-'.intval(str_replace('...', '', $value)) .'.html',$page['url']); 127 | } 128 | //print_r($data) ; 129 | } 130 | //die; 131 | } 132 | 133 | if ($fieldName == 'author') { 134 | //echo "
article_content:"; 135 | //print_r($page['url']);print_r($data) ; 136 | 137 | if ($data == '下乡客') { 138 | $page['skip'](); 139 | //print_r($data) ; 140 | } 141 | //die; 142 | } 143 | 144 | 145 | 146 | 147 | 148 | 149 | //print_r($fieldName);echo "
";//die; 150 | return ; 151 | }; 152 | $configs['beforeCacheImg'] = function (&$page,&$data) { 153 | return ; 154 | }; 155 | $configs['afterExtractPage'] = function (&$page,&$data) { 156 | return ; 157 | }; -------------------------------------------------------------------------------- /SiteConfig/3.php: -------------------------------------------------------------------------------- 1 | 'article_title','alias'=>'标题','selector'=>'//*[@id="BContent2"]/h1/text()','selectorType'=>'XPath','required'=>1], 19 | ['name'=>'article_content','alias'=>'内容','selector'=>"//div[@class='body']",'selectorType'=>'XPath','required'=>1], 20 | ['name'=>'article_date','alias'=>'时间','selector'=>"//*[@id='contextData']",'selectorType'=>'XPath'], 21 | ]; 22 | $configs['beforeCrawl'] = function (&$site) { 23 | //$site['addHeader']("Referer", "http://buluo.qq.com/p/index.html"); 24 | //$site['addCookies']("last_item_date:10733=1467003228; mykeywords=a%3A1%3A%7Bi%3A0%3Bs%3A6%3A%22%E4%BD%90%E7%BD%97%22%3B%7D; PHPSESSID=2mq1jhshc6ssi2rc3j3iontku7; GINFO=uid%3D3519430%26nickname%3Dketle%26group_id%3D0%26avatar_t%3D%26main_group_id%3D0%26common_group_id%3D59; GKEY=0c3d0734c04ae6f2b72632d0553eb116"); 25 | 26 | }; 27 | $configs['nextScanUrl'] = function (&$url) { 28 | return ; 29 | }; 30 | $configs['onChangeProxy'] = function (&$site) { 31 | return ; 32 | }; 33 | $configs['isAntiSpider'] = function (&$url,&$content) { 34 | return ; 35 | }; 36 | $configs['afterDownloadPage'] = function (&$page,&$site) { 37 | //print_r($site['header']) ; 38 | 39 | if ($site) { 40 | //echo 111; 41 | } 42 | return ; 43 | }; 44 | $configs['afterDownloadAttachedPage'] = function (&$page,&$site) { 45 | return ; 46 | }; 47 | $configs['onProcessScanPage'] = function (&$page,&$content,&$site) { 48 | $pf = PauseFactory::Create( 'Xpath' ); 49 | $match = $pf->pause($content,"/html/body/div[@id='container']/div[@id='B']/div[@id='BContent2']/div[@id='ResBox']/div[@id='RB']/h2"); 50 | //print_r($match); 51 | 52 | if ($match) { 
53 | foreach ($match as $key => $value) { 54 | 55 | preg_match('|id-(\d+).html" target="_blank">(.*)(.*)|', $value,$match2); 56 | //print_r($match2); 57 | 58 | $url = 'http://news.sise.com.cn/show.php?id-'.$match2[1].'.html'; 59 | $site['addUrl']($url,['contextData'=>'
'.trim($match2[3]).'
']); 60 | } 61 | } 62 | 63 | return false; 64 | }; 65 | $configs['onProcessHelperPage'] = function (&$page,&$content,&$site) { 66 | 67 | $pf = PauseFactory::Create( 'Xpath' ); 68 | $match = $pf->pause($content,"/html/body/div[@id='container']/div[@id='B']/div[@id='BContent2']/div[@id='ResBox']/div[@id='RB']/h2"); 69 | //print_r($match); 70 | 71 | if ($match) { 72 | foreach ($match as $key => $value) { 73 | 74 | preg_match('|id-(\d+).html" target="_blank">(.*)(.*)|', $value,$match2); 75 | //print_r($match2); 76 | 77 | $url = 'http://news.sise.com.cn/show.php?id-'.$match2[1].'.html'; 78 | $site['addUrl']($url,['contextData'=>'
'.trim($match2[3]).'
']); 79 | } 80 | } 81 | 82 | return false; 83 | }; 84 | $configs['onProcessContentPage'] = function (&$page,&$content,&$site) { 85 | //echo $content;die; 86 | return true; 87 | }; 88 | 89 | $configs['beforeHandleImg'] = function (&$fieldName,&$img) { 90 | return ; 91 | }; 92 | $configs['beforeCacheImg'] = function (&$fieldName,&$url) { 93 | return ; 94 | }; 95 | $configs['afterExtractField'] = function (&$fieldName,&$data,&$page) { 96 | return ; 97 | }; 98 | $configs['beforeCacheImg'] = function (&$page,&$data) { 99 | return ; 100 | }; 101 | $configs['afterExtractPage'] = function (&$page,&$data) { 102 | return ; 103 | }; -------------------------------------------------------------------------------- /SiteConfig/4.php: -------------------------------------------------------------------------------- 1 | 'products', 20 | 'alias'=>'内容', 21 | 'selector'=>'//div[contains(@class,"p2p_product")]', 22 | 'selectorType'=>'XPath', 23 | 'repeated'=>1, 24 | 'children'=> 25 | [ 26 | [ 27 | 'name'=>'product_name', 28 | 'alias'=>'作者', 29 | 'selector'=>'//h3/a | //h3/span', 30 | 'selectorType'=>'XPath', 31 | 'required'=>1 32 | ], 33 | [ 34 | 'name'=>'product_info', 35 | 'alias'=>'内容', 36 | 'selector'=>'//h3/a | //h3/span', 37 | 'selectorType'=>'XPath', 38 | ], 39 | ] 40 | ], 41 | ]; 42 | $configs['beforeCrawl'] = function (&$site) { 43 | //$site['addHeader']("Referer", "http://buluo.qq.com/p/index.html"); 44 | //$site['addCookies']("last_item_date:10733=1467003228; mykeywords=a%3A1%3A%7Bi%3A0%3Bs%3A6%3A%22%E4%BD%90%E7%BD%97%22%3B%7D; PHPSESSID=2mq1jhshc6ssi2rc3j3iontku7; GINFO=uid%3D3519430%26nickname%3Dketle%26group_id%3D0%26avatar_t%3D%26main_group_id%3D0%26common_group_id%3D59; GKEY=0c3d0734c04ae6f2b72632d0553eb116"); 45 | 46 | }; 47 | $configs['nextScanUrl'] = function (&$url) { 48 | return ; 49 | }; 50 | $configs['onChangeProxy'] = function (&$site) { 51 | return ; 52 | }; 53 | $configs['isAntiSpider'] = function (&$url,&$content) { 54 | return ; 55 | }; 56 | 
$configs['afterDownloadPage'] = function (&$page,&$site) { 57 | //print_r($site['header']) ; 58 | 59 | if ($site) { 60 | //echo 111; 61 | } 62 | return ; 63 | }; 64 | $configs['afterDownloadAttachedPage'] = function (&$page,&$site) { 65 | return ; 66 | }; 67 | $configs['onProcessScanPage'] = function (&$page,&$content,&$site) { 68 | return true; 69 | }; 70 | $configs['onProcessHelperPage'] = function (&$page,&$content,&$site) { 71 | return true; 72 | }; 73 | $configs['onProcessContentPage'] = function (&$page,&$content,&$site) { 74 | return true; 75 | }; 76 | 77 | $configs['beforeHandleImg'] = function (&$fieldName,&$img) { 78 | return ; 79 | }; 80 | $configs['beforeCacheImg'] = function (&$fieldName,&$url) { 81 | return ; 82 | }; 83 | $configs['afterExtractField'] = function (&$fieldName,&$data,&$page) { 84 | return ; 85 | }; 86 | $configs['beforeCacheImg'] = function (&$page,&$data) { 87 | return ; 88 | }; 89 | $configs['afterExtractPage'] = function (&$page,&$data) { 90 | return ; 91 | }; -------------------------------------------------------------------------------- /SiteConfig/5.php: -------------------------------------------------------------------------------- 1 | 'article_title', 19 | 'alias'=>'标题', 20 | 'selector'=>'//*[@id="log-send-article"]/div[2]/h1', 21 | 'selectorType'=>'XPath', 22 | 'required'=>1 23 | ],[ 24 | 'name'=>'article_title_img', 25 | 'alias'=>'标题图片', 26 | 'selector'=>'//*[@class="article-img-box"]/img/@src', 27 | 'selectorType'=>'XPath' 28 | ], 29 | [ 30 | 'name'=>'article_content', 31 | 'alias'=>'内容', 32 | 'selector'=>"//div[@id='article_content']", 33 | 'selectorType'=>'XPath' 34 | 35 | ], 36 | ]; 37 | $configs['beforeCrawl'] = function (&$site) { 38 | $site['addHeader']("Referer", "http://www.huxiu.com/"); 39 | $site['addCookies']("gr_user_id=e321f323-cfeb-4b1a-bc4b-a9644977d262; kr_stat_uuid=CDwxJ24517819; Hm_lvt_e8ec47088ed7458ec32cde3617b23ee3=1471093100; 
Hm_lvt_713123c60a0e86982326bae1a51083e1=1471069347,1471069528,1471069554,1471103123; _alicdn_sec=57b299062ae8a8c0d74782d6ebbefdff188ab528; aliyungf_tc=AQAAAD5P9Q8NZQsA90fnekPdEbGb5/qw"); 40 | 41 | }; 42 | $configs['nextScanUrl'] = function (&$url) { 43 | return ; 44 | }; 45 | $configs['onChangeProxy'] = function (&$site) { 46 | return ; 47 | }; 48 | $configs['isAntiSpider'] = function (&$url,&$content) { 49 | return ; 50 | }; 51 | $configs['afterDownloadPage'] = function (&$page,&$site) { 52 | //print_r($site['header']) ; 53 | 54 | if ($site) { 55 | //echo 111; 56 | } 57 | return ; 58 | }; 59 | $configs['afterDownloadAttachedPage'] = function (&$page,&$site) { 60 | return ; 61 | }; 62 | 63 | $global = []; 64 | $configs['onProcessScanPage'] = function (&$page,&$content,&$site) { 65 | global $global; 66 | $global['page'] = 2; 67 | //echo $content; 68 | 69 | $pf = PauseFactory::Create( 'Xpath' ); 70 | $match = $pf->pause($content,'//div[contains(@class,"get-mod-more")]/@data-cur_page'); 71 | $pagex = $match; 72 | 73 | $match = $pf->pause($content,'//div[contains(@class,"get-mod-more")]/@data-last_dateline'); 74 | $last_dateline = $match; 75 | 76 | $pf2 = PauseFactory::Create( 'Regex' ); 77 | $match = $pf2->pause($content,"|var huxiu_hash_code='(\w+)'|"); 78 | $huxiu_hash_code = $match; 79 | 80 | $url = 'https://www.huxiu.com/v2_action/article_list?page='.$global['page']; 81 | $options['method'] = 'POST'; 82 | $options['data'] = ['huxiu_hash_code'=>$huxiu_hash_code,'page'=>$global['page'],'last_dateline'=>$last_dateline]; 83 | 84 | //print_r($options);die; 85 | $site['addUrl']($url,$options); 86 | $global['huxiu_hash_code'] = $huxiu_hash_code; 87 | //die; 88 | // 89 | //echo $url; 90 | 91 | return true; 92 | }; 93 | 94 | 95 | $configs['onProcessHelperPage'] = function (&$page,&$content,&$site) { 96 | //echo $content; 97 | global $global; 98 | $content = json_decode($content,true); 99 | echo 'page:'.$global['page']."
\n"; 100 | //print_r($content); 101 | $global['last_dateline'] = $content['last_dateline']; 102 | $content = $content['data']; 103 | 104 | //echo $content;die; 105 | 106 | 107 | $global['page']++; 108 | $url = 'https://www.huxiu.com/v2_action/article_list?page='.$global['page']; 109 | $options['method'] = 'POST'; 110 | $options['data'] = ['huxiu_hash_code'=>$global['huxiu_hash_code'],'page'=>$global['page'],'last_dateline'=>$global['last_dateline']]; 111 | 112 | print_r($options);//die; 113 | echo "
\n"; 114 | $site['addUrl']($url,$options); 115 | 116 | 117 | 118 | //die; 119 | /*die; 120 | 121 | $content2 = unserialize($content); 122 | $content = json_decode($content,true); 123 | $content2 = json_decode( json_encode( $content2),true); 124 | print_r($content2);die; 125 | foreach ($content2['data']['items'] as $key => $value) { 126 | //echo $value['id']."
"; 127 | $lastId = $value['id']; 128 | $site['addUrl']('http://36kr.com/p/'.$lastId.'.html'); //内容页 129 | } 130 | $site['addUrl']('http://36kr.com/api/info-flow/main_site/posts?column_id=&b_id='.$lastId.'&per_page=20&_='.time()); //列表页json 131 | //die;*/ 132 | 133 | return true; 134 | }; 135 | $configs['onProcessContentPage'] = function (&$page,&$content,&$site) { 136 | 137 | return false; 138 | }; 139 | 140 | $configs['beforeHandleImg'] = function (&$fieldName,&$img) { 141 | return ; 142 | }; 143 | $configs['beforeCacheImg'] = function (&$fieldName,&$url) { 144 | return ; 145 | }; 146 | $configs['afterExtractField'] = function (&$fieldName,&$data,&$page) { 147 | return ; 148 | }; 149 | $configs['beforeCacheImg'] = function (&$page,&$data) { 150 | return ; 151 | }; 152 | $configs['afterExtractPage'] = function (&$page,&$data) { 153 | return ; 154 | }; -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "require": { 3 | "catfan/Medoo": "^1.1", 4 | "php-curl-class/php-curl-class": "^5.0", 5 | "imangazaliev/didom": "^1.7" 6 | }, 7 | "autoload": { 8 | "classmap": [ 9 | "Crawler" 10 | ] 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /config.php: -------------------------------------------------------------------------------- 1 | 2,//最大进程数,通常cpu core*2 ; 4 | 'insertType' => 1,//1:点击'开始'后 判断数据库是否有该条数据,新增才插入; 2:先删除该site所有数据,全新插入; 5 | 'db' => [ 6 | 'database_type' => 'mysql', 7 | 'database_name' => 'jiumozhi', 8 | 'server' => '127.0.0.1', 9 | 'port' => '3306', 10 | 'username' => 'root', 11 | 'password' => 'root123', 12 | 'charset' => 'utf8' 13 | ], 14 | 'redis' => [ 15 | 'host' => '127.0.0.1', 16 | 'port' => '6379' 17 | ] 18 | ]; -------------------------------------------------------------------------------- /index.php: 
-------------------------------------------------------------------------------- 1 | \n"); 16 | } 17 | array_shift($argv); 18 | 19 | //print_r($argv);die; 20 | 21 | $cliIncludeFile = $argv[0]; 22 | $cliAct = $argv[1]; 23 | 24 | $siteConfigDir = './SiteConfig/'; 25 | 26 | if (!$cliIncludeFile) { 27 | echo "
"; 28 | foreach (new DirectoryIterator($siteConfigDir) as $fileInfo) { 29 | if($fileInfo->isDot()) continue; 30 | if ($fileInfo->getExtension() == 'php') { 31 | include($siteConfigDir.$fileInfo->getFilename()); 32 | echo ''.$configs['title'].'  测试  33 | 开始  34 | 停止
'; 35 | //print_r($configs); 36 | 37 | 38 | } 39 | 40 | } 41 | }else{ 42 | 43 | $t1 = microtime(true); 44 | 45 | $includeFile = $siteConfigDir.$cliIncludeFile.'.php'; 46 | include($includeFile); 47 | $configs['siteConfigDir'] = $siteConfigDir; 48 | 49 | //print_r($configs); 50 | if ($cliAct == 'test') { 51 | $configs['dbConfig'] = include './config.php'; 52 | $instances = new Redis(); 53 | $instances->connect($configs['dbConfig']['redis']['host'], $configs['dbConfig']['redis']['port']); 54 | 55 | $instances->del('jiumozhiQueue'.$configs['id']); 56 | $instances->del('jiumozhiQueue'.$configs['id'].'AllUrl'); 57 | echo "清理redis队列完毕\n"; 58 | 59 | 60 | 61 | @unlink($configs['siteConfigDir'].$configs['id'].'stop.txt'); 62 | $configs['debug'] = 1; 63 | $configs['debugNum'] = 36; 64 | $configs['action'] = 'test'; 65 | $crawler = new Crawler($configs); 66 | $crawler->start(); 67 | }elseif ($cliAct == 'start') { 68 | 69 | $configs['dbConfig'] = include './config.php'; 70 | $configs['action'] = 'start'; 71 | //print_r($dbConfig);die; 72 | 73 | 74 | @unlink($configs['siteConfigDir'].$configs['id'].'stop.txt'); 75 | $crawler = new Crawler($configs); 76 | $crawler->start(); 77 | }elseif ($cliAct == 'restart') { 78 | 79 | $configs['dbConfig'] = include './config.php'; 80 | $configs['action'] = 'restart'; 81 | //print_r($dbConfig);die; 82 | 83 | @unlink($configs['siteConfigDir'].$configs['id'].'stop.txt'); 84 | $crawler = new Crawler($configs); 85 | $crawler->start(); 86 | }elseif ($cliAct == 'stop') { 87 | file_put_contents($configs['siteConfigDir'].$configs['id'].'stop.txt', ''); 88 | echo "已经停了吧 - -\n";die; 89 | }elseif ($cliAct == 'clean') { 90 | $configs['dbConfig'] = include './config.php'; 91 | $instances = new Redis(); 92 | $instances->connect($configs['dbConfig']['redis']['host'], $configs['dbConfig']['redis']['port']); 93 | 94 | $instances->del('jiumozhiQueue'.$configs['id']); 95 | $instances->del('jiumozhiQueue'.$configs['id'].'AllUrl'); 96 | echo "清理redis队列完毕\n"; 97 | 
98 | } 99 | 100 | 101 | $t2 = microtime(true); 102 | echo "耗时".round($t2-$t1,3)."秒\n"; 103 | } 104 | 105 | 106 | //$crawler = new Crawler($configs); 107 | //$crawler->start(); 108 | -------------------------------------------------------------------------------- /test.php: -------------------------------------------------------------------------------- 1 |
"; 7 | 8 | $content = file_get_contents(trim($_POST['url'])); 9 | //echo $content; 10 | 11 | $content = mb_convert_encoding($content, "utf-8",trim($_POST['charset'])); 12 | $content = preg_replace('|charset\s*=\s*(\w+)|i', 'charset=UTF-8', $content); 13 | $pauseDrive = PauseFactory::Create( trim($_POST['drive']) ); 14 | //echo $content; 15 | $data = $pauseDrive->pause($content,trim($_POST['selector'])); 16 | 17 | //print_r($pauseDrive); 18 | print_r($data); 19 | echo "

"; 20 | } 21 | 22 | ?> 23 | 24 |
25 | 26 | 27 | url:
28 | charset:
29 | selector:
30 | drive: 31 | 32 |
--------------------------------------------------------------------------------