├── Crawler
├── Crawler.php
├── PauseData.php
└── Queue.php
├── LICENSE
├── README.md
├── SiteConfig
├── 1.php
├── 2.php
├── 3.php
├── 4.php
└── 5.php
├── composer.json
├── config.php
├── index.php
└── test.php
/Crawler/Crawler.php:
--------------------------------------------------------------------------------
1 | configs = $configs;
27 | $this->initSite();
28 | $this->initPage();
29 | $this->initDb();
30 | $this->pauseDrive = PauseFactory::Create( 'Xpath' );
31 |
32 |
33 |
34 |
35 | $temp1 = parse_url($this->configs['scanUrls'][$this->scanUrlsIndex]);
36 | $this->configs['baseUrl'] = $temp1['scheme'].'://'.$temp1['host'].'/';
37 | $this->configs['baseUrlPath'] = $temp1['scheme'].'://'.$temp1['host'].$temp1['path'];
38 | unset($temp1);
39 |
40 |
41 | $this->queueObj = new Queue($this->configs['dbConfig']['redis']['host'],$this->configs['dbConfig']['redis']['port'],$this->configs['id'],$this->configs['action']);
42 | if ($this->queueObj->getLength()==0) {
43 | $this->addQueue($this->configs['scanUrls'][$this->scanUrlsIndex],['type'=>1]);
44 | }
45 |
46 |
47 | }
48 |
/**
 * Run the crawl until the queue is empty or a stop condition is hit.
 *
 * Each round forks up to configs['dbConfig']['maxProcess'] workers. A worker
 * pops a batch of queued URLs (or falls back to the next scan URL), downloads
 * them through the multi-curl handle, runs the configured callbacks and
 * extractors on every response, and exits with its 1-based slot number.
 * The parent reaps all workers before starting the next round.
 *
 * @return void
 */
public function start(){

    $this->configs['beforeCrawl']($this->site); // beforeCrawl callback

    while (true) {
        // Stop switch: a "<id>stop.txt" file inside siteConfigDir ends the crawl.
        if (file_exists($this->configs['siteConfigDir'].$this->configs['id'].'stop.txt') ) {
            break;
        }

        // Debug switch: stop once the request counter exceeds debugNum.
        if ($this->configs['debug'] && $this->curlTimes > $this->configs['debugNum']) {
            break;
        }

        if ($this->queueObj->getLength()==0) {
            break;
        }

        for ($i = 1; $i <= $this->configs['dbConfig']['maxProcess']; ++$i) {
            $pid = pcntl_fork();
            if ($pid == -1) {
                echo "fork child process failed\n";
                // A failed fork is an error: report it with a non-zero exit code.
                exit(1);
            }

            if (!$pid) {
                // ---- worker process ----
                $queueArray = [];

                if ($this->queueObj->getLength()>0) {
                    $queueArray = $this->removeQueue($this->configs['thread']);
                    if (!is_array($queueArray)) {
                        $queueArray = [];
                    }
                } else {
                    // Queue is empty: move on to the next scan URL, if there is one.
                    $this->scanUrlsIndex++;
                    if (!empty($this->configs['scanUrls'][$this->scanUrlsIndex])) {
                        $queueArray[] = ['url'=>$this->configs['scanUrls'][$this->scanUrlsIndex]];
                    } else {
                        // Nothing left to do. A worker must terminate here instead of
                        // breaking out of the loops and returning into the caller's code.
                        exit(0);
                    }
                }

                $this->configs['onChangeProxy']($this->site); // onChangeProxy callback
                $this->curl($queueArray);
                unset($queueArray);

                $this->curlObj->success(function($instance) {
                    $this->log('curl:'.$instance->url);

                    $this->page['url'] = $instance->url;

                    // Convert the body to UTF-8, then append the context data that
                    // curl() attached to the request as a "contextData" header.
                    $this->page['raw'] = $this->convertUtf8($instance->response).$instance->requestHeaders['contextData'];

                    if ($instance->requestHeaders['contextData']) {
                        $this->log('contextData:'.$instance->requestHeaders['contextData'],3);
                    }

                    $this->page['request'] = $instance; // not reworked yet

                    $this->configs['isAntiSpider']($this->page['url'],$this->page['raw']); // isAntiSpider callback
                    $this->configs['afterDownloadPage']($this->page,$this->site); // afterDownloadPage callback

                    // Scan (entry) pages
                    if (in_array($this->page['url'], $this->configs['scanUrls'])) {
                        $this->log('in scanUrls:'.$this->page['url']);

                        $r1 = $this->configs['onProcessScanPage']($this->page,$this->page['raw'],$this->site); // onProcessScanPage callback
                        if ($r1 == true) {
                            $this->parseAllUrl($this->page['raw']);
                        }
                    }

                    // List (helper) pages: only the first matching pattern is handled.
                    foreach ($this->configs['helperUrlRegexes'] as $key => $value) {
                        if (preg_match("|".$value."$|", $this->page['url'])) {
                            $this->log('in helperUrl:'.$this->page['url']);

                            if (strstr($instance->responseHeaders['content-type'],'application/json')) { // list page is JSON data
                                $this->page['raw'] = serialize($this->page['raw']);
                            }
                            $r1 = $this->configs['onProcessHelperPage']($this->page,$this->page['raw'],$this->site); // onProcessHelperPage callback

                            if (!$this->configs['contentUrlRegexes']) {
                                // With no content-page patterns configured, the list page itself is extracted.
                                $this->parseData($this->page['url'],$this->page['raw']);
                            }
                            if ($r1 == true) {
                                $this->parseAllUrl($this->page['raw']);
                            }
                            break;
                        }
                    }

                    // Content pages: only the first matching pattern is handled.
                    foreach ($this->configs['contentUrlRegexes'] as $key => $value) {
                        if (preg_match("|".$value."$|", $this->page['url'])) {
                            $this->log('in contentUrl:'.$this->page['url']);
                            $r1 = $this->configs['onProcessContentPage']($this->page,$this->page['raw'],$this->site); // onProcessContentPage callback
                            $this->parseData($this->page['url'],$this->page['raw']);
                            if ($r1 == true) {
                                $this->parseAllUrl($this->page['raw']);
                            }
                            break;
                        }
                    }
                });

                $this->curlObj->error(function($instance) {
                    $this->log('call to "' . $instance->url . '" was unsuccessful.' . "\n".'error code: ' . $instance->errorCode . "\n".'error message: ' . $instance->errorMessage . "\n",3);
                });

                $this->curlObj->start();
                exit($i);
            }

            usleep(1);
        }

        // Reap every worker of this round. The raw wait status must be inspected
        // with pcntl_wifexited() before it is decoded with pcntl_wexitstatus().
        while (pcntl_waitpid(0, $status) != -1) {
            if (pcntl_wifexited($status)) {
                echo pcntl_wexitstatus($status)." finished\n";
            } else {
                echo "child terminated abnormally\n";
            }
        }
    }
}
217 |
/**
 * Extract every <a href> from a page and pass in-scope links to urlRegexes().
 *
 * A link is in scope when it has no host (relative link) or when its host is
 * listed in configs['domains'].
 *
 * @param string $content Page markup to scan.
 * @return void
 */
public function parseAllUrl($content){
    $urlData = $this->pauseDrive->pause($content,"//a/@href");
    if (!$urlData) {
        return;
    }

    // The XPath driver returns a bare string when exactly one node matches;
    // cast to an array so a page with a single link does not lose it.
    foreach ((array) $urlData as $href) {
        $parts = parse_url($href);
        // parse_url() returns false for seriously malformed URLs; treat those as host-less.
        $host = (is_array($parts) && isset($parts['host'])) ? $parts['host'] : '';

        if (!$host || in_array($host, $this->configs['domains'])) {
            $this->urlRegexes($href);
        }
    }
}
245 |
246 |
/**
 * Queue a URL only if it matches a list-page (helper) or content-page pattern.
 *
 * Patterns are anchored on both ends. For each pattern group the first match
 * wins; a URL matching both groups is submitted once per group
 * (type 2 = helper page, type 3 = content page).
 *
 * @param string $url Candidate URL as found in the page.
 * @return void
 */
public function urlRegexes($url){
    // queue entry type => config key holding that type's patterns
    $groups = [2 => 'helperUrlRegexes', 3 => 'contentUrlRegexes'];

    foreach ($groups as $type => $configKey) {
        foreach ($this->configs[$configKey] as $pattern) {
            if (!preg_match("|^".$pattern."$|", $url)) {
                continue;
            }
            $this->addQueue($url,['type'=>$type]); // enqueue
            break;
        }
    }
}
276 |
277 |
/**
 * Resolve a URL against the current scan URL and append it to the queue.
 *
 * Resolution rules for URLs without a scheme:
 *  - "//host/path" (protocol-relative): the base URL's scheme is prepended;
 *  - "?query" (no path): appended to configs['baseUrlPath'];
 *  - "/1.html" or "1.html": appended to configs['baseUrl'].
 * URLs that already carry a scheme are queued unchanged.
 *
 * @param string $url URL to enqueue (absolute or relative).
 * @param array  $opt Per-request options stored alongside the URL.
 * @return void
 */
public function addQueue($url,$opt=[]){
    $this->queueObj = $this->queueObj?$this->queueObj:new Queue($this->configs['dbConfig']['redis']['host'],$this->configs['dbConfig']['redis']['port'],$this->configs['id'],$this->configs['action']);

    $parts = parse_url($url);
    if (!is_array($parts)) {
        // parse_url() returns false for malformed input; fall through as a relative URL.
        $parts = [];
    }

    if (empty($parts['scheme'])) {
        if (!empty($parts['host'])) {
            // Host but no scheme: previously this was glued onto baseUrl and
            // produced "http://base//host/path". Borrow the base URL's scheme instead.
            $baseScheme = parse_url($this->configs['baseUrl'], PHP_URL_SCHEME);
            $baseScheme = $baseScheme ? $baseScheme : 'http';
            $url = $baseScheme.(substr($url, 0, 2) == '//' ? ':' : '://').$url;
        } elseif (empty($parts['path'])) {
            $url = $this->configs['baseUrlPath'].$url; // "?xxxx"
        } else {
            if (substr($url, 0,1) == '/') { // "/1.html"
                $url = substr($url,1);
            }
            $url = $this->configs['baseUrl'].$url; // "1.html"
        }
    }

    $this->queueObj->addLast(["url"=>$url,"opt"=>$opt]);
}
311 |
312 |
/**
 * Extract the configured fields from a page and store the result.
 *
 * Steps: run parseFields(); drop the whole record when the special
 * "skipAllPage998" marker is present anywhere in it; strip fields flagged as
 * transient in the field configuration; call the afterExtractPage callback;
 * insert the JSON-encoded record into the site's table when it is non-empty.
 *
 * @param string $url     URL of the page being extracted.
 * @param string $content Page content handed to the selectors.
 * @return void
 */
public function parseData($url,$content){

    $this->skip = ''; // reset the skip state for this page
    $fieldContent = $this->parseFields($this->configs['fields'],$content);

    // page.skip(): discard the whole record. array_search_key() is typed to
    // accept arrays only, so guard against a non-array extraction result.
    $skipFlag = is_array($fieldContent) ? $this->array_search_key('skipAllPage998', $fieldContent) : [];
    if ($skipFlag) {
        // Keep the variable defined (as null): it is still passed to the
        // callback and read by the checks below.
        $fieldContent = null;
    }

    // Remove fields marked as transient in the field configuration.
    $delKey = $this->array_search_key('transient', $this->configs['fields']);
    if ($delKey) {
        foreach ($delKey as $value) {
            $this->log('delKey:'.(string)$value,1);
            $this->array_remove_key($fieldContent, (string)$value);
        }
    }

    $this->configs['afterExtractPage']($this->page,$fieldContent); // afterExtractPage callback

    if ($this->getDbInstance() && $fieldContent) {
        $this->getDbInstance()->insert($this->tableName, [
            "site" => $this->configs['id'],
            "url" => $url,
            "data" => json_encode($fieldContent,JSON_UNESCAPED_UNICODE)
        ]);
    }

    if ($this->configs['debug'] ) {
        $this->log('fieldContent:');
        $this->log($fieldContent,1);
    }else{
        $this->log('quequeLeft:'.$this->queueObj->getLength(),1);
    }
}
371 |
/**
 * Recursively extract a list of field definitions from a piece of content.
 *
 * For every field: pick the selector driver, optionally replace the content
 * with an attached page (sourceType "AttachedUrl", with "{name}" placeholders
 * filled from fields extracted earlier in the same list), run the selector,
 * apply the required / page.skip() rules, fire the per-field callbacks, and
 * recurse into "children" for each extracted item.
 *
 * A record that must be discarded is marked with the key "skipAllPage998".
 *
 * @param array  $fields  Field definitions (name, selector, selectorType, ...).
 * @param string $content Content the selectors run against.
 * @return array Extracted values keyed by field name (empty when $fields is empty).
 */
public function parseFields($fields,$content){

    $con = [];   // result; always an array so callers can search it safely
    $img = null; // placeholder handed by reference to the image callbacks

    foreach ($fields as $value) {
        // Reuse the shared XPath driver regardless of how the config spells
        // "XPath"; any other type gets its own driver instance.
        $selectorType = isset($value['selectorType']) ? $value['selectorType'] : '';
        $pf = strcasecmp((string)$selectorType, 'Xpath') === 0
            ? $this->pauseDrive
            : PauseFactory::Create($selectorType);

        if (!empty($value['sourceType']) && $value['sourceType'] == 'AttachedUrl') {
            // Fill "{field}" placeholders with values extracted earlier in this list.
            preg_match_all("|\{([^}]+)\}|", $value['attachedUrl'],$match);
            if ($match[0]) {
                foreach ($match[0] as $matchKey =>$matchValue) {
                    $sourceName = $match[1][$matchKey];
                    $replacement = isset($con[$sourceName]) ? $con[$sourceName] : '';
                    $value['attachedUrl'] = str_replace($matchValue, $replacement, $value['attachedUrl'] );
                }
            }

            $curlOneContent = $this->curlOne($value['attachedUrl']);
            if ($curlOneContent) {
                // Note: the attached page also becomes the content for the fields that follow.
                $content = $curlOneContent;

                $this->configs['afterDownloadAttachedPage']($this->page,$this->site); // afterDownloadAttachedPage callback
            }
        }

        $con[$value['name']] = $pf->pause($content,$value['selector']);

        // A required field with no value discards the whole record.
        if (!empty($value['required']) && !$con[$value['name']]) {
            $con['skipAllPage998'] = 1; // special key; parseData() drops records containing it
            $this->log('required:'.$value['name'],3);
        }

        $this->configs['beforeHandleImg']($value['name'],$img); // beforeHandleImg callback
        $this->configs['beforeCacheImg']($value['name'],$img); // beforeCacheImg callback
        $this->configs['afterExtractField']($value['name'],$con[$value['name']],$this->page); // afterExtractField callback

        // Check page.skip() after the callbacks: checking before them meant a
        // skip requested while handling the last field was never noticed.
        if ($this->skip && $this->skip == 'skipAllPage998') {
            $con['skipAllPage998'] = 1;
        }

        if (!empty($value['children'])) {

            $contentRepeat = $con[$value['name']];
            $con[$value['name']] = [];

            // Selector drivers may return a bare string for a single match;
            // wrap it so a list with exactly one item is still processed.
            if (is_string($contentRepeat) && $contentRepeat !== '') {
                $contentRepeat = [$contentRepeat];
            }

            if (is_array($contentRepeat)) {
                foreach ($contentRepeat as $k2 => $repeatedContent) { // one repeated item, e.g. one post
                    $con[$value['name']][$k2] = $this->parseFields($value['children'],(string)$repeatedContent);

                    // Drop items whose extraction came back empty.
                    if (!$con[$value['name']][$k2]) {
                        $this->log('required2:'.$value['name'],3);
                        unset($con[$value['name']][$k2]);
                    }

                    // page.skip('<this field>') drops just this item.
                    if ($this->skip == $value['name']) {
                        $this->log('skip:'.$value['name'],3);
                        unset($con[$value['name']][$k2]);
                        $this->skip = '';
                    }
                }
            }
        }
    }

    return $con;
}
475 |
476 |
/**
 * Pop up to $c entries from the head of the queue.
 *
 * @param integer $c Maximum number of entries to pop.
 * @return array Popped entries; empty when the queue had nothing to give.
 */
public function removeQueue($c=1){

    $queueArray = [];
    $available = $this->queueObj->getLength();
    $limit = $c > $available ? $available : $c;

    for ($i = 0; $i < $limit; $i++) {
        $item = $this->queueObj->removeFirst();
        // Several forked workers pop from the same queue, so it can run dry
        // between the length check and the pop; removeFirst() then yields false.
        if ($item === false || $item === null) {
            break;
        }
        $queueArray[] = $item;
    }

    return $queueArray;
}
495 |
496 |
497 |
/**
 * Return the MultiCurl handle for the current process, creating it on first use.
 *
 * Handles are cached per process id.
 *
 * @return MultiCurl
 */
public function getCurlInstance()
{
    static $pool = array();

    $pid = getmypid();
    if (!empty($pool[$pid])) {
        return $pool[$pid];
    }

    $pool[$pid] = new MultiCurl();
    return $pool[$pid];
}
508 |
509 |
/**
 * Prepare a batch of requests on the per-process multi-curl handle.
 *
 * Applies the site-wide headers, cookies and user agent, disables SSL host and
 * peer verification, then registers one GET or POST per queue entry.
 * The requests are only registered here; the caller starts them.
 *
 * @param array $queueArray Queue entries shaped like ['url' => ..., 'opt' => [...]];
 *                          opt may carry header, contextData, method and data.
 * @return void
 */
public function curl($queueArray){

    $multi = $this->getCurlInstance();
    $this->curlObj = $multi;

    // Site-wide headers and cookies
    if ($this->site['header']) {
        foreach ($this->site['header'] as $name => $headerValue) {
            $multi->setHeader($name, $headerValue);
        }
    }
    if ($this->site['cookie']) {
        foreach ($this->site['cookie'] as $name => $cookieValue) {
            $multi->setCookie($name, $cookieValue);
        }
    }

    // Default user agent, overridden by a site-specific one when configured
    $multi->setUserAgent('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36');
    if ($this->site['userAgent']) {
        $multi->setUserAgent($this->site['userAgent']);
    }

    $multi->setOpt(CURLOPT_SSL_VERIFYHOST, false);
    $multi->setOpt(CURLOPT_SSL_VERIFYPEER, false);

    foreach ($queueArray as $entry) {
        $options = $entry['opt'];

        if ($options['header'] ) {
            foreach ($options['header'] as $name => $headerValue) {
                $multi->setHeader($name, $headerValue);
            }
        }

        if ($options['contextData'] ) {
            // Context data travels as a request header so the success callback can read it back.
            $multi->setHeader('contextData', $options['contextData']);
        }

        if ($options['method'] == 'POST') {
            $multi->addPost($entry['url'],$options['data']);
            continue;
        }
        $multi->addGet($entry['url'],$options['data']);
    }
}
582 |
/**
 * Perform a single blocking request and return the response body.
 *
 * Site-wide headers, cookies and user agent are applied first, then the
 * per-call options. On success the body is also stored in $this->page['raw'].
 *
 * @param string $url     URL to request.
 * @param array  $options Optional keys: headers (array), method ('POST' for a
 *                        POST, anything else means GET), data (POST payload),
 *                        contextData (string appended to the body).
 * @return string|false Response body (JSON responses are re-encoded to a
 *                      string), or false when the request failed.
 */
public function curlOne($url,$options=[]){

    $this->log('curlOne:'.$url);
    $curl = new Curl();
    $curl->setOpt(CURLOPT_SSL_VERIFYHOST, false);
    $curl->setOpt(CURLOPT_SSL_VERIFYPEER, false);

    if ($this->site['header']) {
        foreach ($this->site['header'] as $key => $value) {
            $curl->setHeader($key, $value);
        }
    }

    if ($this->site['cookie']) {
        foreach ($this->site['cookie'] as $key => $value) {
            $curl->setCookie($key, $value);
        }
    }

    // The default user agent belongs on this request's own handle. It used to
    // be set on $this->curlObj, which is the multi handle and may not exist yet.
    $curl->setUserAgent('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36');
    if ($this->site['userAgent']) {
        $curl->setUserAgent($this->site['userAgent']);
    }

    if (!empty($options['headers'])) {
        foreach ($options['headers'] as $key => $value) {
            $curl->setHeader($key, $value);
        }
    }

    if (isset($options['method']) && $options['method'] == 'POST') {
        $curl->post($url, isset($options['data']) ? $options['data'] : null);
    }else{
        $curl->get($url);
    }

    $this->curlTimes++;
    $this->log('curlOneTimes:'.$this->curlTimes);

    if ($curl->error) {
        $this->log( 'Url: ' . $url .'Error: ' . $curl->errorCode . ': ' . $curl->errorMessage ,3);
        return false;
    }

    if ( strstr( $curl->responseHeaders['content-type'],'application/json') ) {
        $curl->response = json_encode( $curl->response);
    }

    $this->page['raw'] = $curl->response;
    $this->page['request'] = $curl->request; // not reworked yet

    if (!empty($options['contextData'])) {
        $this->page['raw'] .= $options['contextData'];
    }

    return $this->page['raw'];
}
662 |
/**
 * Convert page content to UTF-8 when a source charset is configured.
 *
 * Any "charset=..." declaration inside the content is rewritten to
 * "charset=UTF-8" so later parsing agrees with the converted bytes.
 *
 * @param string $content Raw page content.
 * @return string Converted content, or the input untouched when no charset is set.
 */
private function convertUtf8($content){
    $sourceCharset = $this->configs['charset'];
    if (!$sourceCharset) {
        return $content;
    }

    $converted = mb_convert_encoding($content, "utf-8", $sourceCharset);

    return preg_replace('|charset\s*=\s*(\w+)|i', 'charset=UTF-8', $converted);
}
676 |
/**
 * Recursively remove every entry whose key equals $k (after trimming the key).
 *
 * Entries whose value is an array are descended into rather than removed, so
 * a key that holds an array cannot be deleted by this method.
 *
 * @param array|null $arr Array to modify in place; anything else is ignored.
 * @param string     $k   Key to remove.
 * @return void
 */
public function array_remove_key(&$arr, $k){
    if (!$arr || !is_array($arr)) {
        return;
    }

    // Iterate over a snapshot of the keys: entries are removed while looping,
    // and no by-reference loop variable is left dangling afterwards.
    foreach (array_keys($arr) as $key) {
        if (is_array($arr[$key])) {
            $this->array_remove_key($arr[$key], $k);
        } elseif (trim((string)$key) === (string)$k) {
            // Remove the entry under its real key. Using $k here missed keys
            // that only match once trimmed.
            unset($arr[$key]);
        }
    }
}
696 |
697 |
/**
 * Search the leaves of a nested array and collect every match.
 *
 * In 'key' mode (default) the leaf key is compared with $search and the
 * matching values are returned; in 'value' mode the leaf value is compared
 * and the matching keys are returned. Comparison is strict.
 *
 * @param mixed  $search Key or value to look for.
 * @param array  $array  Nested array to search.
 * @param string $mode   'key' or 'value'.
 * @return array All matches, in traversal order (empty when nothing matches).
 */
public function array_search_key( $search, array $array, $mode = 'key'){
    $res = array();
    $leaves = new RecursiveIteratorIterator(new RecursiveArrayIterator($array));

    foreach ($leaves as $key => $value) {
        // $search used to be unset after the first hit, so later matches were
        // silently lost; keep it and collect every match.
        if ($mode == 'key') {
            if ($search === $key) {
                $res[] = $value;
            }
        } elseif ($mode == 'value' && $search === $value) {
            $res[] = $key;
        }
    }

    return $res;
}
714 |
715 |
716 |
/**
 * Build the $site helper passed to user callbacks.
 *
 * Registers closures for adding headers and cookies, queueing URLs, issuing a
 * single request and setting the user agent.
 *
 * @return void
 */
public function initSite(){
    $this->site['scanUrls'] = [];
    $this->site['helperUrls'] = [];
    $this->site['contentUrls'] = [];

    $this->site['addHeader'] = function ($key,$value) {
        $this->site['header'][$key] = $value;
    };

    $this->site['addCookie'] = function ($key,$value) {
        $this->site['cookie'][$key] = $value;
    };

    // Accepts a "name=value; name2=value2" cookie string.
    $this->site['addCookies'] = function ($cookies) {
        foreach (explode(';', $cookies) as $pair) {
            $pair = trim($pair);
            if ($pair === '') {
                continue; // trailing ";" or empty segment
            }
            // Split on the first "=" only, so values that contain "=" stay intact.
            $nameAndValue = explode('=', $pair, 2);
            $this->site['cookie'][$nameAndValue[0]] = isset($nameAndValue[1]) ? urldecode($nameAndValue[1]) : '';
        }
    };

    $this->site['addUrl'] = function ($url,$options=[]) {
        $this->addQueue($url,$options);
    };

    $this->site['requestUrl'] = function ($url,$options=[]) {
        return $this->curlOne($url,$options);
    };

    $this->site['setUserAgent'] = function ($userAgent) {
        $this->site['userAgent'] = $userAgent;
    };
}
760 |
/**
 * Build the $page helper passed to user callbacks.
 *
 * Registers page['skip']: called with a field name it marks that field for
 * skipping; called without one it marks the whole page ("skipAllPage998").
 *
 * @return void
 */
public function initPage(){

    $this->page['skip'] = function ($fieldName='') {
        $this->skip = $fieldName ? $fieldName : 'skipAllPage998';

        $this->log('$this->skip:'.$this->skip,3);
    };

}
776 |
777 |
/**
 * Return the database handle for the current process, creating it on first use.
 *
 * Handles are cached per process id.
 *
 * @return \medoo
 */
public function getDbInstance()
{
    static $pool = array();

    $pid = getmypid();
    if (!empty($pool[$pid])) {
        return $pool[$pid];
    }

    $pool[$pid] = new \medoo($this->configs['dbConfig']['db']);
    return $pool[$pid];
}
788 |
/**
 * Prepare the result table for this site.
 *
 * Only runs for the 'start' and 'restart' actions. 'restart' drops the table
 * first. The table is (re)created when it holds no rows, and existing rows
 * for this site are deleted when dbConfig['insertType'] is 2.
 *
 * @return void
 */
public function initDb(){

    if ($this->configs['action'] != 'start' && $this->configs['action'] != 'restart') {
        return;
    }

    $this->db = new \medoo($this->configs['dbConfig']['db']);

    $this->tableName = 'jmz_data'.$this->configs['id'];

    if ($this->configs['action'] == 'restart') {
        $this->db->query("drop TABLE `".$this->tableName."`;");
    }

    // Count rows in the site's own table. This used to read an undefined
    // local $tableName, so the count never reflected the real table.
    $count = $this->db->count($this->tableName, []);

    if (!$count) {
        $this->db->query("CREATE TABLE IF NOT EXISTS `".$this->tableName."` (
            `id` int(11) NOT NULL,
            `create_date` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
            `update_date` datetime DEFAULT NULL,
            `site` int(5) NOT NULL DEFAULT '0',
            `url` varchar(200) NOT NULL,
            `data` longtext,
            `other` text,
            `flag` int(3) NOT NULL DEFAULT '1'
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8;


            ALTER TABLE `".$this->tableName."`
            ADD PRIMARY KEY (`id`),
            ADD UNIQUE KEY `site` (`site`,`url`);


            ALTER TABLE `".$this->tableName."`
            MODIFY `id` int(11) NOT NULL AUTO_INCREMENT;");
    }

    if ($this->configs['dbConfig']['insertType'] == 2) {
        $this->db->delete($this->tableName, ["site" => $this->configs['id']]);
    }
}
836 |
/**
 * Print a timestamped log line, optionally coloured with ANSI escapes.
 *
 * @param string|array $str   Message; arrays are JSON-encoded first.
 * @param integer      $level 1 = yellow, 2 = blue, 3 = red, anything else = plain.
 * @return void
 */
public function log($str,$level=0){

    if (is_array($str)) {
        $str = json_encode($str,JSON_UNESCAPED_UNICODE);
    }

    switch ($level) {
        case 1:
            $line = "\e[1;33m $str \e[0m \n";
            break;
        case 2:
            $line = "\e[1;34m $str \e[0m \n";
            break;
        case 3:
            $line = "\e[1;31m $str \e[0m \n";
            break;
        default:
            $line = "$str \n";
    }

    echo date("Y-m-d H:i:s")." ";
    echo $line;
}
863 |
864 |
865 | }
--------------------------------------------------------------------------------
/Crawler/PauseData.php:
--------------------------------------------------------------------------------
1 | loadHTML($content);
18 | libxml_clear_errors();
19 | $xPath = new DOMXPath($dom);
20 | $elements = $xPath->query($selector);
21 | if ($elements->length>1) {
22 | for ($i=0; $i < $elements->length; $i++) {
23 | if ($opt==true) {
24 | $ret[] = $elements->item($i)->nodeValue;
25 | }else{
26 | $ret[] = $elements->item($i)->nodeValue?$dom->saveHtml($elements->item($i)):'';
27 | }
28 |
29 | }
30 | }else{
31 | if ($opt==true) {
32 | $ret = $elements->item(0)->nodeValue;
33 | }else{
34 | $ret = $elements->item($i)->nodeValue?$dom->saveHtml($elements->item(0)):'';
35 | }
36 |
37 | }
38 | return $ret;
39 | }
40 | }
41 |
42 |
/**
 * Selector driver that evaluates an XPath expression against HTML content.
 */
class PauseXPath implements PauseData
{
    public function __construct( ) { }

    /**
     * Run an XPath selector and return the trimmed matches.
     *
     * @param string $content  HTML to query.
     * @param string $selector XPath expression.
     * @param mixed  $opt      Unused by this driver.
     * @return array|string|null A list of strings when several nodes match, a
     *                           single string when exactly one matches, null
     *                           when nothing matches.
     */
    public function pause($content,$selector,$opt=false){
        $document = new Document();
        $document->loadHtml($content);
        $lists = $document->xpath($selector);

        if (!$lists) {
            return null;
        }

        // Decide "single vs. many" by the number of matches. Testing the
        // truthiness of $lists[1] misread a falsy second match ("" or "0") as
        // a single result and dropped everything after the first match.
        if (count($lists) > 1) {
            $r = [];
            foreach ($lists as $list) {
                $r[] = $this->nodeToString($list);
            }
            return $r;
        }

        return $this->nodeToString($lists[0]);
    }

    /**
     * Turn one match into a trimmed string: strings as-is, nodes via innerHtml().
     */
    private function nodeToString($node){
        return is_string($node) ? trim($node) : trim($node->innerHtml());
    }
}
84 |
85 |
/**
 * Selector driver that walks a dot-separated key path through JSON content.
 */
class PauseJsonPath implements PauseData
{
    public function __construct( ) { }

    /**
     * Look up a value in JSON content.
     *
     * @param string|array $content  JSON text, or an array (re-encoded first).
     * @param string       $selector Key, or dot-separated key path such as "data.items".
     * @param mixed        $opt      Unused by this driver.
     * @return mixed The value at the path, or null when any step is missing.
     */
    public function pause($content,$selector,$opt=false){
        if (is_array($content)) {
            $content = json_encode($content);
        }
        $content = json_decode($content,true);

        // A selector is a path only when it has a "." after the first
        // character; otherwise it is used as a single key.
        $path = strpos($selector, '.') ? explode('.', $selector) : [$selector];

        foreach ($path as $segment) {
            // Stop at the first missing step instead of indexing into null or a string.
            if (!is_array($content) || !isset($content[$segment])) {
                return null;
            }
            $content = $content[$segment];
        }

        return $content;
    }
}
113 |
114 |
/**
 * Selector driver that extracts the first capture group of a regular expression.
 */
class PauseRegex implements PauseData
{
    public function __construct( ) { }

    /**
     * Run preg_match_all() and return what the first capture group caught.
     *
     * @param string  $content  Text to search.
     * @param string  $selector Full PCRE pattern, delimiters included.
     * @param integer $opt      Flags passed to preg_match_all().
     * @return array|string|null The list in $result[1] when it has a second
     *                           entry, its single entry when it has only one,
     *                           null when there is none.
     */
    public function pause($content,$selector,$opt=PREG_PATTERN_ORDER){
        $result = [];
        preg_match_all($selector,$content,$result,$opt);

        if (isset($result[1][1])) {
            return $result[1];
        }

        // No match, or a pattern without a capture group: return null
        // explicitly rather than reading an undefined offset.
        return isset($result[1][0]) ? $result[1][0] : null;
    }
}
126 |
/**
 * Selector driver that evaluates a CSS selector against HTML content.
 */
class PauseCssPath implements PauseData
{

    public function __construct( ) { }

    /**
     * Run a CSS selector and return the inner HTML of every match.
     *
     * Unlike the XPath driver, a single match is still returned as a
     * one-element list; this keeps the driver's existing return shape.
     *
     * @param string $content  HTML to query.
     * @param string $selector CSS selector.
     * @param mixed  $opt      Unused by this driver.
     * @return array|null Inner HTML of each match, or null when nothing matches.
     */
    public function pause($content,$selector,$opt=false){

        $document = new Document();
        $document->loadHtml($content);
        $lists = $document->find($selector);

        if (!$lists) {
            return null;
        }

        // The former "count($lists) > 0 ... else" split had an unreachable
        // else branch: a non-empty result always took the loop.
        $r = [];
        foreach ($lists as $list) {
            $r[] = $list->innerHtml();
        }

        return $r;
    }
}
154 |
/**
 * Creates selector drivers by name.
 */
class PauseFactory
{
    /**
     * Instantiate the class named "Pause" followed by $method.
     *
     * @param string $method Driver suffix, e.g. "XPath" for PauseXPath.
     * @return object New driver instance.
     */
    public static function Create( $method )
    {
        $className = "Pause{$method}";

        return new $className();
    }
}
164 |
--------------------------------------------------------------------------------
/Crawler/Queue.php:
--------------------------------------------------------------------------------
1 | host = $host;
13 | $this->port = $port;
14 | $this->key = $this->key.$id;
15 |
16 | if ($action == 'restart') {
17 | $this->getInstance()->del($this->key);
18 | $this->getInstance()->del($this->key.'AllUrl');
19 | }
20 |
21 |
22 | }
23 |
/**
 * Append an entry to the tail of the queue, skipping URLs queued before.
 *
 * Seen URLs are kept as one serialized array under "<key>AllUrl". Entries
 * whose opt['reserve'] is truthy bypass the duplicate check and are always
 * appended.
 *
 * @param array $value ['url' => $url, 'opt' => $opt]
 * @return mixed Result of rpush, or null when the URL is a duplicate.
 */
public function addLast($value)
{
    $redis = $this->getInstance();
    $payload = serialize($value);

    if (!empty($value['opt']['reserve'])) { // skip de-duplication for this entry
        return $redis->rpush($this->key, $payload);
    }

    $seenKey = $this->key.'AllUrl';

    // The key does not exist before the first insert; start from an empty set
    // instead of indexing into the false that unserialize() would yield.
    $stored = $redis->get($seenKey);
    $allUrl = is_string($stored) ? unserialize($stored) : [];
    if (!is_array($allUrl)) {
        $allUrl = [];
    }

    if (!empty($allUrl[$value["url"]])) {
        return null; // already queued once
    }

    $allUrl[$value["url"]] = 1;
    $redis->set($seenKey, serialize($allUrl));

    return $redis->rpush($this->key, $payload);
}
/**
 * Pop the entry at the head of the queue and unserialize it.
 *
 * @return mixed The stored entry as returned by unserialize().
 */
public function removeFirst()
{
    $serialized = $this->getInstance()->lpop($this->key);

    return unserialize($serialized);
}
53 |
/**
 * Return the number of entries currently in the queue.
 *
 * @return mixed Result of llen on the queue key.
 */
public function getLength()
{
    $redis = $this->getInstance();

    return $redis->llen($this->key);
}
59 |
/**
 * Return the Redis connection for the current process, connecting on first use.
 *
 * Connections are cached per process id.
 *
 * @return Redis
 */
public function getInstance()
{
    static $connections = array();

    $pid = getmypid();
    if (!empty($connections[$pid])) {
        return $connections[$pid];
    }

    $connections[$pid] = new Redis();
    $connections[$pid]->connect($this->host, $this->port);

    return $connections[$pid];
}
75 | }
76 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2016
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 鸠摩智(多进程版)简介
2 | -------------
3 | 前几天看到 http://doc.shenjianshou.cn/ 觉得不错,就省下几天守望开车时间照着文档用php实现了一遍,下面是对比shenjianshou的不足,改进和使用上的区别
4 |
5 | > **不足:**
6 |
7 | > - 没有js渲染
8 | > - 没有验证码识别
9 | > - 暂时没有自动换代理
10 | > - 暂时没有图片本地化/托管云
11 | > - 暂时没有模拟登录
12 | > - 暂时没有录入数据库 (已加)
13 |
14 |
15 | > **特点/改进:**
16 |
17 | > - 多进程,现在只能在Linux下使用了,单进程很容易内存耗尽- -
18 | > - 使用curl multi"多线程",可以自定义"线程"数,速度刷刷刷的
19 | > - 支持css选择器,xpath,正则3种选择器
20 |
21 | > **使用上的区别:**
22 |
23 | > - 查看SiteConfig目录下 数字.php 具体配置,因为用php搞的,各种传递参数啥的都不一样,但看起来大体一样;
24 | > - 用Xpath取回来的是innerHtml
25 | > - jsonpath没怎么弄
26 | > - contentUrlRegexes helperUrlRegexes 规则里没带域名
27 |
28 | #### 如何安装
29 | ```
30 | git clone https://github.com/ketle/jiumozhi.git
31 | cd jiumozhi
32 | composer install
33 | ```
34 |
35 | #### 如何开始
36 | ```
37 | 配置config.php下db,redis设置
38 | Usage: php index.php <1-n>
39 | 比如: php index.php 1 test
40 | 手动停止请Ctrl+c
41 | SiteConfig目录下自带了5个例子;
42 | 第一个例子入库2599条数据 时间从2016-08-22 22:15:13 - 2016-08-22 22:17:48 才用时2分半
43 | ```
44 |
45 | #### 为啥叫"鸠摩智"
46 | 拍脑袋想出来的,哈哈哈
--------------------------------------------------------------------------------
/SiteConfig/1.php:
--------------------------------------------------------------------------------
1 | 'article_title','alias'=>'标题','selector'=>"//h2[@class='entry-name']/text()",'selectorType'=>'XPath','required'=>1],
18 | ['name'=>'article_con','alias'=>'标题','selector'=>"//div[@class='entry-content']",'selectorType'=>'XPath','required'=>1],
19 |
20 | ];
// Crawl life-cycle callbacks for this site. Each one is a no-op unless noted.

// Called once before crawling starts; use $site['addHeader'](...) or
// $site['addCookies']("name=value; other=value") here to prepare requests.
$configs['beforeCrawl'] = function (&$site) {
    //$site['addHeader']("Referer", "http://buluo.qq.com/p/index.html");
};
$configs['nextScanUrl'] = function (&$url) {
    return;
};
$configs['onChangeProxy'] = function (&$site) {
    return;
};
$configs['isAntiSpider'] = function (&$url,&$content) {
    return;
};
$configs['afterDownloadPage'] = function (&$page,&$site) {
    return;
};
$configs['afterDownloadAttachedPage'] = function (&$page,&$site) {
    return;
};

// The three page callbacks return true to have the page's links discovered.
// NOTE(review): the search string passed to str_replace() is empty, so these
// calls leave $content unchanged; the text that was meant to be stripped
// appears to be missing from this file.
$configs['onProcessScanPage'] = function (&$page,&$content,&$site) {
    // Extra URLs can be queued from here with $site['addUrl']($url).
    $content = str_replace('', '', $content); // odd site: sometimes utf8, sometimes gb2312, which breaks xpath
    return true;
};
$configs['onProcessHelperPage'] = function (&$page,&$content,&$site) {
    $content = str_replace('', '', $content);
    return true;
};
$configs['onProcessContentPage'] = function (&$page,&$content,&$site) {
    $content = str_replace('', '', $content);
    return true;
};

$configs['beforeHandleImg'] = function (&$fieldName,&$img) {
    return;
};
// Defined once only: a second definition further down used a different
// parameter list and silently replaced this one.
$configs['beforeCacheImg'] = function (&$fieldName,&$url) {
    return;
};
// Called after each field is extracted; call $page['skip']() here to drop the page.
$configs['afterExtractField'] = function (&$fieldName,&$data,&$page) {
    return;
};
$configs['afterExtractPage'] = function (&$page,&$data) {
    return;
};
--------------------------------------------------------------------------------
/SiteConfig/2.php:
--------------------------------------------------------------------------------
1 | 'article_title',
20 | 'alias'=>'标题',
21 | 'selector'=>'//*[@id="pt"]/div/a[5]/text()',
22 | 'selectorType'=>'XPath',
23 | 'required'=>1
24 | ],
25 | [
26 | 'name'=>'article_content',
27 | 'alias'=>'内容',
28 | 'selector'=>"/html/body[@id='nv_forum']/div[@id='wp']/div[@id='ct']/div[@id='pgt']/div[@class='pgt']/div[@class='pg']/a",
29 | 'selectorType'=>'XPath',
30 | 'repeated'=>1,
31 | 'children'=>
32 | [
33 | [
34 | 'name'=>'page',
35 | 'alias'=>'分页',
36 | 'selector'=>'//text()',
37 | 'selectorType'=>'XPath',
38 | 'required'=>1,
39 | 'transient'=>'page' //临时变量,要删掉
40 | ],
41 | [
42 | 'name'=>'article_content2',
43 | 'alias'=>'内容',
44 | 'selector'=>'//div[@class="im286table"]',
45 | 'selectorType'=>'XPath',
46 | 'sourceType'=>'AttachedUrl',
47 | 'attachedUrl'=>'{page}',
48 | 'repeated'=>1,
49 | 'children'=>
50 | [
51 | [
52 | 'name'=>'author',
53 | 'alias'=>'作者',
54 | 'selector'=>'//a[@class="xw1"]',
55 | 'selectorType'=>'XPath',
56 | 'required'=>1,
57 | 'transient'=>'author' //临时变量,要删掉
58 | ],
59 | [
60 | 'name'=>'content',
61 | 'alias'=>'内容',
62 | 'selector'=>'//*[contains(@id,"postmessage_")]',
63 | 'selectorType'=>'XPath',
64 | 'required'=>1
65 | ],
66 | ]
67 | ],
68 | ]
69 | ],
70 | ];
71 | /*
72 |  * Crawl lifecycle callbacks for this site. Every hook takes its arguments by
73 |  * reference; a hook whose body is a bare `return;` is a deliberate no-op.
74 |  */
75 | 
76 | // Invoked once before crawling starts. Extra request headers or cookies can be
77 | // registered here through $site['addHeader'](...) / $site['addCookies'](...).
78 | $configs['beforeCrawl'] = function (&$site) {
79 | };
80 | 
81 | // The hooks below are not customised for this site.
82 | $configs['nextScanUrl'] = function (&$url) {
83 |     return;
84 | };
85 | $configs['onChangeProxy'] = function (&$site) {
86 |     return;
87 | };
88 | $configs['isAntiSpider'] = function (&$url,&$content) {
89 |     return;
90 | };
91 | $configs['afterDownloadPage'] = function (&$page,&$site) {
92 |     return;
93 | };
94 | $configs['afterDownloadAttachedPage'] = function (&$page,&$site) {
95 |     return;
96 | };
97 | $configs['onProcessScanPage'] = function (&$page,&$content,&$site) {
98 |     return true;
99 | };
100 | $configs['onProcessHelperPage'] = function (&$page,&$content,&$site) {
101 |     return true;
102 | };
103 | $configs['onProcessContentPage'] = function (&$page,&$content,&$site) {
104 |     return true;
105 | };
106 | 
107 | $configs['beforeHandleImg'] = function (&$fieldName,&$img) {
108 |     return;
109 | };
110 | // A second assignment to 'beforeCacheImg' (taking $page, $data) used to follow
111 | // afterExtractField and silently replaced this handler. Both were no-ops, so
112 | // only this one is kept.
113 | $configs['beforeCacheImg'] = function (&$fieldName,&$url) {
114 |     return;
115 | };
116 | 
117 | /*
118 |  * Post-processes two fields after extraction:
119 |  *  - 'article_content': $data holds the extracted pager entries. The value 1 is
120 |  *    prepended, the last entry is discarded, and every remaining entry is
121 |  *    replaced by $page['url'] with its '-1.html' suffix rewritten to
122 |  *    '-<n>.html', where <n> is the entry with any '...' removed, cast to int.
123 |  *  - 'author': the page is skipped when the author is exactly '下乡客'.
124 |  */
125 | $configs['afterExtractField'] = function (&$fieldName,&$data,&$page) {
126 |     // is_array() guards array_unshift()/array_pop() against a scalar match.
127 |     if ($fieldName === 'article_content' && is_array($data) && $data) {
128 |         array_unshift($data, 1);
129 |         array_pop($data);
130 | 
131 |         // Write back by key rather than iterating by reference, so no
132 |         // reference to the last element is left behind after the loop.
133 |         foreach ($data as $index => $pageLabel) {
134 |             $pageNumber = intval(str_replace('...', '', $pageLabel));
135 |             $data[$index] = str_replace('-1.html', '-'.$pageNumber.'.html', $page['url']);
136 |         }
137 |     }
138 | 
139 |     // Strict comparison: a non-string value must never match the author name.
140 |     if ($fieldName === 'author' && $data === '下乡客') {
141 |         $page['skip']();
142 |     }
143 | 
144 |     return;
145 | };
146 | $configs['afterExtractPage'] = function (&$page,&$data) {
147 |     return;
148 | };
--------------------------------------------------------------------------------
/SiteConfig/3.php:
--------------------------------------------------------------------------------
1 | 'article_title','alias'=>'标题','selector'=>'//*[@id="BContent2"]/h1/text()','selectorType'=>'XPath','required'=>1],
19 | ['name'=>'article_content','alias'=>'内容','selector'=>"//div[@class='body']",'selectorType'=>'XPath','required'=>1],
20 | ['name'=>'article_date','alias'=>'时间','selector'=>"//*[@id='contextData']",'selectorType'=>'XPath'],
21 | ];
22 | /*
23 |  * Crawl lifecycle callbacks for this site. Every hook takes its arguments by
24 |  * reference; a hook whose body is a bare `return;` is a deliberate no-op.
25 |  */
26 | 
27 | // Invoked once before crawling starts. Extra request headers or cookies can be
28 | // registered here through $site['addHeader'](...) / $site['addCookies'](...).
29 | $configs['beforeCrawl'] = function (&$site) {
30 | };
31 | 
32 | // The hooks below are not customised for this site.
33 | $configs['nextScanUrl'] = function (&$url) {
34 |     return;
35 | };
36 | $configs['onChangeProxy'] = function (&$site) {
37 |     return;
38 | };
39 | $configs['isAntiSpider'] = function (&$url,&$content) {
40 |     return;
41 | };
42 | $configs['afterDownloadPage'] = function (&$page,&$site) {
43 |     return;
44 | };
45 | $configs['afterDownloadAttachedPage'] = function (&$page,&$site) {
46 |     return;
47 | };
48 | 
49 | /*
50 |  * Shared by the scan-page and helper-page hooks, which previously carried two
51 |  * identical copies of this code. Runs the XPath below over $content and, for
52 |  * every match that contains an article id, queues the article URL through
53 |  * $site['addUrl'] together with a 'contextData' option.
54 |  *
55 |  * NOTE(review): the last two regex groups are adjacent "(.*)(.*)", so group 3
56 |  * is always empty, and the 'contextData' string literal spans a line break.
57 |  * Markup (a closing tag in the pattern, a wrapper element around the trimmed
58 |  * value) looks to be missing from both. The field selector
59 |  * //*[@id='contextData'] suggests a wrapper with that id. Both literals are
60 |  * kept byte-for-byte here; confirm against the upstream file.
61 |  */
62 | $queueArticleLinks = function (&$content, &$site) {
63 |     $pf = PauseFactory::Create( 'Xpath' );
64 |     $match = $pf->pause($content,"/html/body/div[@id='container']/div[@id='B']/div[@id='BContent2']/div[@id='ResBox']/div[@id='RB']/h2");
65 |     if (!$match) {
66 |         return;
67 |     }
68 | 
69 |     foreach ($match as $value) {
70 |         // Entries without an article id are skipped; previously a URL was
71 |         // built from undefined offsets and queued anyway.
72 |         if (!preg_match('|id-(\d+).html" target="_blank">(.*)(.*)|', $value,$match2)) {
73 |             continue;
74 |         }
75 | 
76 |         $url = 'http://news.sise.com.cn/show.php?id-'.$match2[1].'.html';
77 |         $site['addUrl']($url,['contextData'=>''.trim($match2[3]).'
']);
78 |     }
79 | };
80 | 
81 | // Both list-type hooks queue the article links themselves and return false.
82 | $configs['onProcessScanPage'] = function (&$page,&$content,&$site) use ($queueArticleLinks) {
83 |     $queueArticleLinks($content, $site);
84 |     return false;
85 | };
86 | $configs['onProcessHelperPage'] = function (&$page,&$content,&$site) use ($queueArticleLinks) {
87 |     $queueArticleLinks($content, $site);
88 |     return false;
89 | };
90 | $configs['onProcessContentPage'] = function (&$page,&$content,&$site) {
91 |     return true;
92 | };
93 | 
94 | $configs['beforeHandleImg'] = function (&$fieldName,&$img) {
95 |     return;
96 | };
97 | // A second assignment to 'beforeCacheImg' (taking $page, $data) used to follow
98 | // afterExtractField and silently replaced this handler. Both were no-ops, so
99 | // only this one is kept.
100 | $configs['beforeCacheImg'] = function (&$fieldName,&$url) {
101 |     return;
102 | };
103 | $configs['afterExtractField'] = function (&$fieldName,&$data,&$page) {
104 |     return;
105 | };
106 | $configs['afterExtractPage'] = function (&$page,&$data) {
107 |     return;
108 | };
--------------------------------------------------------------------------------
/SiteConfig/4.php:
--------------------------------------------------------------------------------
1 | 'products',
20 | 'alias'=>'内容',
21 | 'selector'=>'//div[contains(@class,"p2p_product")]',
22 | 'selectorType'=>'XPath',
23 | 'repeated'=>1,
24 | 'children'=>
25 | [
26 | [
27 | 'name'=>'product_name',
28 | 'alias'=>'作者',
29 | 'selector'=>'//h3/a | //h3/span',
30 | 'selectorType'=>'XPath',
31 | 'required'=>1
32 | ],
33 | [
34 | 'name'=>'product_info',
35 | 'alias'=>'内容',
36 | 'selector'=>'//h3/a | //h3/span',
37 | 'selectorType'=>'XPath',
38 | ],
39 | ]
40 | ],
41 | ];
42 | /*
43 |  * Crawl lifecycle callbacks for this site. Every hook takes its arguments by
44 |  * reference; a hook whose body is a bare `return;` is a deliberate no-op.
45 |  */
46 | 
47 | // Invoked once before crawling starts. Extra request headers or cookies can be
48 | // registered here through $site['addHeader'](...) / $site['addCookies'](...).
49 | $configs['beforeCrawl'] = function (&$site) {
50 | };
51 | 
52 | // None of the remaining hooks is customised for this site.
53 | $configs['nextScanUrl'] = function (&$url) {
54 |     return;
55 | };
56 | $configs['onChangeProxy'] = function (&$site) {
57 |     return;
58 | };
59 | $configs['isAntiSpider'] = function (&$url,&$content) {
60 |     return;
61 | };
62 | $configs['afterDownloadPage'] = function (&$page,&$site) {
63 |     return;
64 | };
65 | $configs['afterDownloadAttachedPage'] = function (&$page,&$site) {
66 |     return;
67 | };
68 | $configs['onProcessScanPage'] = function (&$page,&$content,&$site) {
69 |     return true;
70 | };
71 | $configs['onProcessHelperPage'] = function (&$page,&$content,&$site) {
72 |     return true;
73 | };
74 | $configs['onProcessContentPage'] = function (&$page,&$content,&$site) {
75 |     return true;
76 | };
77 | 
78 | $configs['beforeHandleImg'] = function (&$fieldName,&$img) {
79 |     return;
80 | };
81 | // A second assignment to 'beforeCacheImg' (taking $page, $data) used to follow
82 | // afterExtractField and silently replaced this handler. Both were no-ops, so
83 | // only this one is kept.
84 | $configs['beforeCacheImg'] = function (&$fieldName,&$url) {
85 |     return;
86 | };
87 | $configs['afterExtractField'] = function (&$fieldName,&$data,&$page) {
88 |     return;
89 | };
90 | $configs['afterExtractPage'] = function (&$page,&$data) {
91 |     return;
92 | };
--------------------------------------------------------------------------------
/SiteConfig/5.php:
--------------------------------------------------------------------------------
1 | 'article_title',
19 | 'alias'=>'标题',
20 | 'selector'=>'//*[@id="log-send-article"]/div[2]/h1',
21 | 'selectorType'=>'XPath',
22 | 'required'=>1
23 | ],[
24 | 'name'=>'article_title_img',
25 | 'alias'=>'标题图片',
26 | 'selector'=>'//*[@class="article-img-box"]/img/@src',
27 | 'selectorType'=>'XPath'
28 | ],
29 | [
30 | 'name'=>'article_content',
31 | 'alias'=>'内容',
32 | 'selector'=>"//div[@id='article_content']",
33 | 'selectorType'=>'XPath'
34 |
35 | ],
36 | ];
37 | /*
38 |  * Crawl lifecycle callbacks for this site. Every hook takes its arguments by
39 |  * reference; a hook whose body is a bare `return;` is a deliberate no-op.
40 |  */
41 | 
42 | // Invoked once before crawling starts: registers the Referer header and the
43 | // cookie string sent with requests.
44 | // NOTE(review): the cookie values are hard-coded; confirm they are not
45 | // credentials that should live outside version control.
46 | $configs['beforeCrawl'] = function (&$site) {
47 |     $site['addHeader']("Referer", "http://www.huxiu.com/");
48 |     $site['addCookies']("gr_user_id=e321f323-cfeb-4b1a-bc4b-a9644977d262; kr_stat_uuid=CDwxJ24517819; Hm_lvt_e8ec47088ed7458ec32cde3617b23ee3=1471093100; Hm_lvt_713123c60a0e86982326bae1a51083e1=1471069347,1471069528,1471069554,1471103123; _alicdn_sec=57b299062ae8a8c0d74782d6ebbefdff188ab528; aliyungf_tc=AQAAAD5P9Q8NZQsA90fnekPdEbGb5/qw");
49 | };
50 | 
51 | // The hooks below are not customised for this site.
52 | $configs['nextScanUrl'] = function (&$url) {
53 |     return;
54 | };
55 | $configs['onChangeProxy'] = function (&$site) {
56 |     return;
57 | };
58 | $configs['isAntiSpider'] = function (&$url,&$content) {
59 |     return;
60 | };
61 | $configs['afterDownloadPage'] = function (&$page,&$site) {
62 |     return;
63 | };
64 | $configs['afterDownloadAttachedPage'] = function (&$page,&$site) {
65 |     return;
66 | };
67 | 
68 | // Pagination state shared by the scan-page and helper-page hooks. Keys used:
69 | // 'page', 'huxiu_hash_code', 'last_dateline'. The closures capture it by
70 | // reference instead of via `global`, so they no longer depend on this file
71 | // being included at global scope.
72 | $global = [];
73 | 
74 | /*
75 |  * Reads the paging attributes and the hash code from the scan page and queues
76 |  * the POST request for page 2 of the article list.
77 |  */
78 | $configs['onProcessScanPage'] = function (&$page,&$content,&$site) use (&$global) {
79 |     $global['page'] = 2;
80 | 
81 |     $pf = PauseFactory::Create( 'Xpath' );
82 |     // NOTE(review): the data-cur_page value is extracted but never used.
83 |     $pagex = $pf->pause($content,'//div[contains(@class,"get-mod-more")]/@data-cur_page');
84 |     $last_dateline = $pf->pause($content,'//div[contains(@class,"get-mod-more")]/@data-last_dateline');
85 | 
86 |     $pf2 = PauseFactory::Create( 'Regex' );
87 |     $huxiu_hash_code = $pf2->pause($content,"|var huxiu_hash_code='(\w+)'|");
88 | 
89 |     $url = 'https://www.huxiu.com/v2_action/article_list?page='.$global['page'];
90 |     $options = [
91 |         'method' => 'POST',
92 |         'data' => ['huxiu_hash_code'=>$huxiu_hash_code,'page'=>$global['page'],'last_dateline'=>$last_dateline],
93 |     ];
94 | 
95 |     $site['addUrl']($url,$options);
96 |     // Remembered for the follow-up requests built in onProcessHelperPage.
97 |     $global['huxiu_hash_code'] = $huxiu_hash_code;
98 | 
99 |     return true;
100 | };
101 | 
102 | /*
103 |  * Handles one JSON article-list response: $content is replaced by the 'data'
104 |  * member of the decoded response and, when the response carries a
105 |  * 'last_dateline', the next list page is queued as a POST request.
106 |  *
107 |  * A response that is not the expected JSON no longer queues a further page;
108 |  * previously every response, valid or not, queued the next one.
109 |  * NOTE(review): $global['page'] and $global['huxiu_hash_code'] are only set by
110 |  * onProcessScanPage. Confirm both hooks always run in the same process.
111 |  */
112 | $configs['onProcessHelperPage'] = function (&$page,&$content,&$site) use (&$global) {
113 |     $payload = json_decode($content,true);
114 |     echo 'page:'.$global['page']."
\n";
115 | 
116 |     $hasNextPage = is_array($payload) && isset($payload['last_dateline']);
117 |     $content = (is_array($payload) && isset($payload['data'])) ? $payload['data'] : null;
118 |     if (!$hasNextPage) {
119 |         return true;
120 |     }
121 | 
122 |     $global['last_dateline'] = $payload['last_dateline'];
123 |     $global['page']++;
124 | 
125 |     $url = 'https://www.huxiu.com/v2_action/article_list?page='.$global['page'];
126 |     $options = [
127 |         'method' => 'POST',
128 |         'data' => ['huxiu_hash_code'=>$global['huxiu_hash_code'],'page'=>$global['page'],'last_dateline'=>$global['last_dateline']],
129 |     ];
130 | 
131 |     // Debug trace of the queued request.
132 |     print_r($options);
133 |     echo "
\n";
134 |     $site['addUrl']($url,$options);
135 | 
136 |     return true;
137 | };
138 | $configs['onProcessContentPage'] = function (&$page,&$content,&$site) {
139 |     return false;
140 | };
141 | 
142 | $configs['beforeHandleImg'] = function (&$fieldName,&$img) {
143 |     return;
144 | };
145 | // A second assignment to 'beforeCacheImg' (taking $page, $data) used to follow
146 | // afterExtractField and silently replaced this handler. Both were no-ops, so
147 | // only this one is kept.
148 | $configs['beforeCacheImg'] = function (&$fieldName,&$url) {
149 |     return;
150 | };
151 | $configs['afterExtractField'] = function (&$fieldName,&$data,&$page) {
152 |     return;
153 | };
154 | $configs['afterExtractPage'] = function (&$page,&$data) {
155 |     return;
156 | };
--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
1 | {
2 | "require": {
3 | "catfan/medoo": "^1.1",
4 | "php-curl-class/php-curl-class": "^5.0",
5 | "imangazaliev/didom": "^1.7"
6 | },
7 | "autoload": {
8 | "classmap": [
9 | "Crawler"
10 | ]
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/config.php:
--------------------------------------------------------------------------------
1 | 2,//最大进程数,通常cpu core*2 ;
4 | 'insertType' => 1,//1:点击'开始'后 判断数据库是否有该条数据,新增才插入; 2:先删除该site所有数据,全新插入;
5 | 'db' => [
6 | 'database_type' => 'mysql',
7 | 'database_name' => 'jiumozhi',
8 | 'server' => '127.0.0.1',
9 | 'port' => '3306',
10 | 'username' => 'root',
11 | 'password' => 'root123',
12 | 'charset' => 'utf8'
13 | ],
14 | 'redis' => [
15 | 'host' => '127.0.0.1',
16 | 'port' => '6379'
17 | ]
18 | ];
--------------------------------------------------------------------------------
/index.php:
--------------------------------------------------------------------------------
1 | \n");
16 | }
17 | // Drop the script name: $argv[0] becomes the site-config name, $argv[1] the action.
18 | array_shift($argv);
19 | 
20 | // Either argument may be missing; default to null rather than reading an undefined offset.
21 | $cliIncludeFile = isset($argv[0]) ? $argv[0] : null;
22 | $cliAct = isset($argv[1]) ? $argv[1] : null;
23 | 
24 | $siteConfigDir = './SiteConfig/';
25 | 
26 | if (!$cliIncludeFile) {
27 |     // No site selected: print one entry per site config found in $siteConfigDir.
28 |     echo "
";
29 |     foreach (new DirectoryIterator($siteConfigDir) as $fileInfo) {
30 |         if ($fileInfo->isDot() || $fileInfo->getExtension() != 'php') {
31 |             continue;
32 |         }
33 |         include($siteConfigDir.$fileInfo->getFilename());
34 |         echo ''.$configs['title'].' 测试
35 | 开始
36 | 停止
';
37 |     }
38 | } else {
39 |     $t1 = microtime(true);
40 | 
41 |     // basename() keeps the include inside $siteConfigDir even when the
42 |     // argument contains directory separators.
43 |     $includeFile = $siteConfigDir.basename($cliIncludeFile).'.php';
44 |     if (!is_file($includeFile)) {
45 |         echo "site config not found: ".$includeFile."\n";
46 |         exit(1);
47 |     }
48 |     include($includeFile);
49 |     $configs['siteConfigDir'] = $siteConfigDir;
50 | 
51 |     // The crawler's main loop stops as soon as this file exists.
52 |     $stopFile = $configs['siteConfigDir'].$configs['id'].'stop.txt';
53 | 
54 |     // Removes a leftover stop flag so a new run is not aborted immediately.
55 |     $clearStopFlag = function () use ($stopFile) {
56 |         if (is_file($stopFile)) {
57 |             unlink($stopFile);
58 |         }
59 |     };
60 | 
61 |     // Deletes both Redis keys that hold this site's queue.
62 |     $clearQueue = function ($configs) {
63 |         $instances = new Redis();
64 |         $instances->connect($configs['dbConfig']['redis']['host'], $configs['dbConfig']['redis']['port']);
65 | 
66 |         $instances->del('jiumozhiQueue'.$configs['id']);
67 |         $instances->del('jiumozhiQueue'.$configs['id'].'AllUrl');
68 |         echo "清理redis队列完毕\n";
69 |     };
70 | 
71 |     switch ($cliAct) {
72 |         case 'test':
73 |             // Debug run: start from an empty queue and stop after debugNum requests.
74 |             $configs['dbConfig'] = include './config.php';
75 |             $clearQueue($configs);
76 | 
77 |             $clearStopFlag();
78 |             $configs['debug'] = 1;
79 |             $configs['debugNum'] = 36;
80 |             $configs['action'] = 'test';
81 |             $crawler = new Crawler($configs);
82 |             $crawler->start();
83 |             break;
84 | 
85 |         case 'start':
86 |         case 'restart':
87 |             // Both actions run the crawler; only the recorded action name differs.
88 |             $configs['dbConfig'] = include './config.php';
89 |             $configs['action'] = $cliAct;
90 | 
91 |             $clearStopFlag();
92 |             $crawler = new Crawler($configs);
93 |             $crawler->start();
94 |             break;
95 | 
96 |         case 'stop':
97 |             // Create the stop flag; a running crawler checks for it on every loop pass.
98 |             file_put_contents($stopFile, '');
99 |             echo "已经停了吧 - -\n";die;
100 | 
101 |         case 'clean':
102 |             $configs['dbConfig'] = include './config.php';
103 |             $clearQueue($configs);
104 |             break;
105 |     }
106 | 
107 |     $t2 = microtime(true);
108 |     echo "耗时".round($t2-$t1,3)."秒\n";
109 | }
110 | 
--------------------------------------------------------------------------------
/test.php:
--------------------------------------------------------------------------------
1 |
";
7 |
8 | // Only http(s) URLs are fetched, so the form cannot read local files or other stream wrappers.
9 | $url = trim($_POST['url']);
10 | $content = preg_match('#^https?://#i', $url) ? (string) file_get_contents($url) : '';
11 | 
12 | // Convert to UTF-8 and rewrite the declared charset so the markup agrees with its bytes.
13 | $content = mb_convert_encoding($content, "utf-8",trim($_POST['charset']));
14 | $content = preg_replace('|charset\s*=\s*(\w+)|i', 'charset=UTF-8', $content);
15 | $pauseDrive = PauseFactory::Create( trim($_POST['drive']) );
16 | $data = $pauseDrive->pause($content,trim($_POST['selector']));
17 | // Escaped on output: the extracted third-party markup is shown as text, never rendered.
18 | echo htmlspecialchars(print_r($data, true), ENT_QUOTES, 'UTF-8');
19 | echo "
";
20 | }
21 |
22 | ?>
23 |
24 |
--------------------------------------------------------------------------------