${contentText.replaceAll(/\\x0a|\\n/g, '')}
`;
420 | for (const pidx in picArr) {
421 | contentHtml = contentHtml + `

`;
422 | }
423 | contentHtml += '
';
424 | // 获取作者
425 | const nickNameEle = $('.wx_follow_nickname');
426 | let byline;
427 | if (nickNameEle.length > 1) {
428 | byline = nickNameEle.first().text();
429 | } else {
430 | byline = nickNameEle.text();
431 | }
432 |
433 | return {
434 | title: title.replaceAll(/\\x0a|\\n/g, ''),
435 | content: contentHtml,
436 | textContent: '',
437 | length: 0,
438 | excerpt: '',
439 | byline: byline,
440 | dir: '',
441 | siteName: '',
442 | lang: ''
443 | };
444 | }
445 |
446 | // Parses the short-text page format, e.g. https://mp.weixin.qq.com/s/vLuVL5owS5VdTDmMRzu3vQ
447 | // Pages of this format have no body content, only a title.
// The regex captures the single-quoted value assigned to `window.msg_title` in the page source.
448 | const shortTitleRegex = /window.msg_title\s?=\s?'(.*?)'/;
// Builds an article result object from a short-text page.
//   articleInfo: its `html` field is searched for the title (an empty string is used when it is unset).
//   $: selector function used to query the page (presumably a cheerio root - confirm against the caller).
// Returns null when no `window.msg_title` assignment is found; otherwise an object with
// title / content / byline filled in and the remaining fields left empty ('' or 0).
449 | function parseShortTextHtml(articleInfo: ArticleInfo, $) {
450 | const shortTitleMatch = shortTitleRegex.exec(articleInfo.html || '');
451 | let title;
452 | if (shortTitleMatch) {
453 | title = shortTitleMatch[1];
454 | } else {
455 | return null;
456 | }
457 |
458 | // Content: taken from the description meta tag, falling back to the title when it is missing or empty.
459 | let contentText = $('meta[name=description]').attr('content');
460 | if (!contentText) {
461 | contentText = title;
462 | }
// The replaceAll pattern removes the literal escape sequences `\x0a` and `\n` (backslash included) from the text.
// NOTE(review): in this copy the template literal below, and the `contentHtml +=` literal further down,
// contain nothing but a line break; they look as if they once held HTML markup - confirm against the original file.
463 | let contentHtml = `${contentText.replaceAll(/\\x0a|\\n/g, '')}
`;
464 | // Author: text of `.wx_follow_nickname`; only the first match is used when there are several.
465 | const nickNameEle = $('.wx_follow_nickname');
466 | let byline;
467 | if (nickNameEle.length > 1) {
468 | byline = nickNameEle.first().text();
469 | } else {
470 | byline = nickNameEle.text();
471 | }
472 |
473 | contentHtml += '
';
474 | return {
475 | title: title.replaceAll(/\\x0a|\\n/g, ''),
476 | content: contentHtml,
477 | textContent: '',
478 | length: 0,
479 | excerpt: '',
480 | byline: byline,
481 | dir: '',
482 | siteName: '',
483 | lang: ''
484 | };
485 | }
486 |
// Official-account name cached across calls. It is only consulted and filled in
// batch-web mode, so that pages where `#js_name` cannot be read still get a name.
let totalJsName;

/**
 * Parse the article's metadata and store it on `articleInfo`.
 *
 * Writes `articleInfo.metaInfo` (a new ArticleMeta) and, only when the article has
 * no cover yet, `articleInfo.cover` (from the page's `og:image` meta tag).
 *
 * @param articleInfo article being processed; `author`, `cover` and `datetime` are read
 * @param $meta selector function for the article page (presumably a cheerio root); it is
 *              called with CSS selectors and must also provide `html()`
 * @param byline author name parsed by Readability, used when no author is found otherwise
 */
function parseMeta(articleInfo: ArticleInfo, $meta: any, byline?: string) {
  // A `downloadOption.saveMeta` gate used to sit here; it is commented out, so
  // metadata is always parsed.
  // if (1 != downloadOption.saveMeta) {
  //   return;
  // }
  const authorName = articleInfo.author ? articleInfo.author : getEleText($meta('#js_author_name'));

  // Account name: in batch-web mode the first value read is reused for every later call.
  let jsName;
  if (dlEvent == DlEventEnum.BATCH_WEB) {
    if (!totalJsName) {
      totalJsName = getEleText($meta('#js_name'));
    }
    jsName = totalJsName;
  } else {
    jsName = getEleText($meta('#js_name'));
  }

  // Cover: look it up on the page only when the article does not already have one.
  // The assignment lives inside this branch so that an existing cover is kept
  // instead of being overwritten with `undefined`.
  if (!articleInfo.cover) {
    let cover: string | undefined;
    const coverEles = $meta('meta[property=og:image]');
    if (coverEles) {
      cover = coverEles.length > 1 ? coverEles.first().attr('content') : coverEles.attr('content');
    }
    articleInfo.cover = cover;
  }

  const copyrightFlg = $meta('#copyright_logo')?.text() ? true : false;
  const publicTime = articleInfo.datetime ? DateUtil.format(articleInfo.datetime, 'yyyy-MM-dd HH:mm') : '';
  const ipWording = service.matchIpWording($meta.html());

  const articleMeta = new ArticleMeta();
  articleMeta.copyrightFlg = copyrightFlg;
  articleMeta.author = authorName ? authorName : byline;
  articleMeta.jsName = jsName;
  articleMeta.publicTime = publicTime;
  articleMeta.ipWording = ipWording;
  articleInfo.metaInfo = articleMeta;
}
534 |
/**
 * Read the trimmed text of a selection.
 *
 * When the selection holds more than one element only the first one is read;
 * a missing selection yields an empty string.
 *
 * @param ele selection object exposing `length`, `first()` and `text()`
 * @returns the text passed through `StrUtil.trim`, or '' when `ele` is falsy
 */
function getEleText(ele: any): string {
  if (!ele) {
    return '';
  }
  const target = ele.length > 1 ? ele.first() : ele;
  return StrUtil.trim(target.text());
}
545 |
/**
 * Decide whether an article is filtered out by the configured filter rule.
 *
 * Checks run in this order and the first one that fires decides the result:
 * required title keywords, required author keywords, excluded title keywords,
 * excluded author keywords. With no rule configured nothing is filtered.
 *
 * @param articleInfo article whose `title` and `author` are tested
 * @returns `flgFilter` is true when the article is filtered out; `filterMsg` gives the reason
 */
function doFilter(articleInfo: ArticleInfo): { flgFilter: boolean; filterMsg: string } {
  const keep = (): { flgFilter: boolean; filterMsg: string } => ({ flgFilter: false, filterMsg: '' });
  const drop = (filterMsg: string): { flgFilter: boolean; filterMsg: string } => ({ flgFilter: true, filterMsg });

  const ruleJson = downloadOption.filterRule;
  if (!ruleJson) {
    return keep();
  }
  const rule: FilterRuleInfo = parseFilterInfo(ruleJson);

  // Required keywords: a non-empty include list must match.
  const titleRequired = rule.titleInclude.length > 0;
  if (titleRequired && !isInclude(articleInfo.title, rule.titleInclude)[0]) {
    return drop('标题未包含关键词');
  }
  const authorRequired = rule.authInclude.length > 0;
  if (authorRequired && !isInclude(articleInfo.author, rule.authInclude)[0]) {
    return drop('作者未包含关键词');
  }

  // Excluded keywords: the first hit is reported in the message.
  const titleBan = isInclude(articleInfo.title, rule.titleExclude);
  if (titleBan[0]) {
    return drop(`标题包含排除关键词 ${titleBan[1]}`);
  }
  const authorBan = isInclude(articleInfo.author, rule.authExclude);
  if (authorBan[0]) {
    return drop(`作者包含排除关键词 ${authorBan[1]}`);
  }

  return keep();
}
/**
 * Find the first keyword that occurs in a piece of text.
 *
 * @param content text to search; an undefined or empty text never matches
 * @param include keywords, tried in order as plain substrings
 * @returns `[true, keyword]` for the first keyword found, otherwise `[false, '']`
 */
function isInclude(content: string | undefined, include: string[]): [boolean, string] {
  const miss: [boolean, string] = [false, ''];
  if (!content) {
    return miss;
  }
  const text: string = content;
  const hitIdx = include.findIndex((keyword) => text.includes(keyword));
  return hitIdx === -1 ? miss : [true, include[hitIdx]];
}
588 |
/**
 * Parse the filter-rule JSON text into a FilterRuleInfo.
 *
 * Expected shape:
 *   { "title": { "include": [...], "exclude": [...] },
 *     "auth":  { "include": [...], "exclude": [...] } }
 *
 * A keyword list is copied onto the result only when it is a non-empty array.
 * A single non-empty string is accepted as a one-keyword list, so that it is not
 * later iterated character by character. Anything else (missing, empty, wrong
 * type) leaves the corresponding FilterRuleInfo field as constructed.
 * JSON that parses to `null` or a primitive yields an untouched FilterRuleInfo.
 *
 * @param filterRuleStr JSON text of the filter rule
 * @returns the populated FilterRuleInfo
 * @throws SyntaxError when `filterRuleStr` is not valid JSON (propagated from JSON.parse)
 */
function parseFilterInfo(filterRuleStr: string): FilterRuleInfo {
  const filterRuleInfo = new FilterRuleInfo();
  const filterRule = JSON.parse(filterRuleStr);
  // JSON.parse may legitimately return null or a primitive; there is nothing to read then.
  if (!filterRule || typeof filterRule !== 'object') {
    return filterRuleInfo;
  }

  // Normalise one include/exclude value to a usable keyword list, or undefined to skip it.
  const toKeywords = (raw: unknown): string[] | undefined => {
    if (typeof raw === 'string') {
      return raw ? [raw] : undefined;
    }
    return Array.isArray(raw) && raw.length > 0 ? raw : undefined;
  };

  if (filterRule.title) {
    const titleInclude = toKeywords(filterRule.title.include);
    if (titleInclude) {
      filterRuleInfo.titleInclude = titleInclude;
    }
    const titleExclude = toKeywords(filterRule.title.exclude);
    if (titleExclude) {
      filterRuleInfo.titleExclude = titleExclude;
    }
  }

  if (filterRule.auth) {
    const authInclude = toKeywords(filterRule.auth.include);
    if (authInclude) {
      filterRuleInfo.authInclude = authInclude;
    }
    const authExclude = toKeywords(filterRule.auth.exclude);
    if (authExclude) {
      filterRuleInfo.authExclude = authExclude;
    }
  }
  return filterRuleInfo;
}
615 |
/**
 * Download the featured comments of an article and, when enabled, the full
 * reply list of every comment whose replies were only partially returned.
 *
 * Does nothing when the article has no `html`, when `downloadOption.dlComment`
 * is not 1, or when `articleInfo.gzhInfo` is missing. Otherwise the results are
 * stored on `articleInfo.commentList` and `articleInfo.replyDetailMap` (both
 * stay undefined when nothing was fetched).
 *
 * Request failures are logged (and, for a non-200 status or a non-zero
 * `base_resp.ret`, reported through `resp`) instead of being rethrown.
 *
 * @param articleInfo article to fetch comments for; updated in place
 */
async function downloadComment(articleInfo: ArticleInfo) {
  if (!articleInfo.html) return;

  const gzhInfo = articleInfo.gzhInfo;
  // Comment download must be enabled and the account request info must be present.
  if (1 != downloadOption.dlComment || !gzhInfo) {
    return;
  }

  const commentId = service.matchCommentId(articleInfo.html);
  if (!commentId) {
    logger.error('获取精选评论参数失败');
    resp(NwrEnum.FAIL, '获取精选评论参数失败');
    return;
  }
  if (commentId == '0') {
    logger.info(`【${articleInfo.title}】没有评论`);
    resp(NwrEnum.FAIL, `【${articleInfo.title}】没有评论`);
    return;
  }

  const headers = {
    Host: gzhInfo.Host,
    Connection: 'keep-alive',
    'User-Agent': gzhInfo.UserAgent,
    Cookie: gzhInfo.Cookie,
    Referer: articleInfo.contentUrl
  };
  // Query parameters shared by the comment-list and comment-reply requests.
  const baseParams = {
    __biz: gzhInfo.biz,
    key: gzhInfo.key,
    uin: gzhInfo.uin,
    comment_id: commentId
  };

  // Featured comment list.
  let commentList;
  // Full reply lists, keyed by the comment's content_id.
  let replyDetailMap;

  try {
    const response = await axios.get(COMMENT_LIST_URL, { params: baseParams, headers: headers });
    if (response.status != 200) {
      logger.error(`获取精选评论失败,状态码:${response.status}`, articleInfo.contentUrl);
      resp(NwrEnum.FAIL, `获取精选评论失败,状态码:${response.status}`);
    } else {
      const resData = response.data;
      if (resData.base_resp.ret != 0) {
        logger.error(`【${articleInfo.title}】获取精选评论失败`, resData, response.config.url, response.config.params);
        resp(NwrEnum.FAIL, `【${articleInfo.title}】获取精选评论失败:${resData.errmsg}`);
      } else {
        if (resData.elected_comment && resData.elected_comment.length > 0) {
          commentList = resData.elected_comment;
        }
        logger.debug(`【${articleInfo.title}】精选评论`, commentList);
      }
    }
  } catch (error) {
    logger.error(`【${articleInfo.title}】获取精选评论失败`, error, articleInfo.contentUrl);
  }

  // Replies: fetch the full list only for comments that report more replies than were inlined.
  if (1 == downloadOption.dlCommentReply && commentList) {
    // NOTE(review): entries are written as plain properties on the Map object (not via
    // Map#set), so `replyDetailMap.get(id)` will not find them. Kept as-is because
    // readers outside this block may rely on bracket access - confirm before changing.
    replyDetailMap = new Map();
    for (const commentItem of commentList) {
      const replyInfo = commentItem.reply_new;
      // A comment without reply info is skipped rather than aborting the whole download.
      if (!replyInfo) {
        continue;
      }
      // A missing inline list counts as zero inlined replies.
      const inlinedCnt = replyInfo.reply_list ? replyInfo.reply_list.length : 0;
      if (!(replyInfo.reply_total_cnt > inlinedCnt)) {
        continue;
      }
      try {
        const response = await axios.get(COMMENT_REPLY_URL, {
          params: {
            ...baseParams,
            content_id: commentItem.content_id,
            max_reply_id: replyInfo.max_reply_id
          },
          headers: headers
        });
        if (response.status != 200) {
          logger.error(`获取评论回复失败,状态码:${response.status}`, response.config.url, response.config.params);
          resp(NwrEnum.FAIL, `获取评论回复失败,状态码:${response.status}`);
          continue;
        }
        const resData = response.data;
        if (resData.base_resp.ret != 0) {
          logger.error(`获取评论回复失败`, resData, response.config.url, response.config.params);
          resp(NwrEnum.FAIL, `获取评论回复失败:${resData.errmsg}`);
          continue;
        }
        replyDetailMap[commentItem.content_id] = resData.reply_list.reply_list;
      } catch (error) {
        logger.error('获取评论回复失败', error);
      }
    }
  }

  articleInfo.commentList = commentList;
  articleInfo.replyDetailMap = replyDetailMap;
}
720 |
721 | /*
722 | * 下载图片并替换src
723 | * $: cheerio对象
724 | * savePath: 保存文章的路径(已区分文章),例如: D://savePath//测试文章1
725 | * tmpPath: 缓存路径(已区分文章),例如:D://tmpPathPath//6588aec6b658b2c941f6d51d0b1691b9
726 | */
727 | async function downloadImgToHtml($, savePath: string, tmpPath: string, articleInfo: ArticleInfo): Promise<{ imgCount: number }> {
728 | const imgArr = $('img');
729 | const awaitArr: Promise