├── .dockerignore
├── .eslintignore
├── .eslintrc.js
├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── certificates
├── rootCA.crt
└── rootCA.key
├── client
├── app
│ ├── actions.js
│ ├── components
│ │ ├── Paginator.jsx
│ │ ├── edit.jsx
│ │ ├── loading.jsx
│ │ └── searchInput.jsx
│ ├── config.js
│ ├── containers
│ │ ├── doc.jsx
│ │ ├── posts.jsx
│ │ ├── profiles.jsx
│ │ └── search.jsx
│ ├── index.html
│ ├── index.jsx
│ ├── reducers.js
│ └── style
│ │ └── style.css
├── build
│ ├── 448c34a56d699c29117adc64c43affeb.woff2
│ ├── 674f50d287a8c48dc19ba404d20fe713.eot
│ ├── 89889688147bd7575d6327160d64e760.svg
│ ├── 912ec66d7572ff821749319396470bde.svg
│ ├── af7ae505a9eed503f8b8e6982036873e.woff2
│ ├── b06871f281fee6b241d60582ae9369b9.ttf
│ ├── bundle.js
│ ├── e18bbf611f2a2e43afc071aa2f4e1512.ttf
│ ├── f4769f9bdb7466be65088239c12046d1.eot
│ ├── fa2772327f55d8198301fdb8bcfc8158.woff
│ ├── fee66e712a8a08eef5805a46892932ad.woff
│ └── index.html
├── package-lock.json
├── package.json
└── webpack.config.js
├── config.js
├── docker-compose.yml
├── imgs
├── posts_screenshot.png
└── sponsor-me.jpeg
├── index.js
├── models
├── Comment.js
├── Post.js
├── Profile.js
├── ProfilePubRecord.js
├── index.js
└── plugins
│ └── paginator.js
├── package-lock.json
├── package.json
├── rule
├── basicAuth.js
├── getNextProfileLink.js
├── handleImg
│ ├── index.js
│ └── replaceImg.png
├── handlePostPage.js
├── handleProfileHistoryPage.js
├── index.js
├── insertProfileScript.html
├── postLink.js
└── savePostsData.js
├── scripts
└── checkWechatId.js
├── server
├── api
│ ├── conf.js
│ └── index.js
├── index.js
└── wrap.js
├── test
├── contentHandler.js
├── exportData.js
└── models
│ ├── Post.test.js
│ ├── Profile.test.js
│ └── ProfilePubRecord.test.js
└── utils
├── contentHandler.js
├── correctWechatId.js
├── exportData.js
├── helper.js
├── index.js
├── logger.js
├── merge.js
└── redis.js
/.dockerignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | client/app
3 | client/node_modules
4 | client/package-lock.json
5 | client/package.json
6 | client/webpack.config.js
7 | .git
8 | .gitignore
9 |
--------------------------------------------------------------------------------
/.eslintignore:
--------------------------------------------------------------------------------
1 | node_modules
2 |
--------------------------------------------------------------------------------
/.eslintrc.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | "env": {
3 | "browser": true,
4 | "commonjs": true,
5 | "es6": true,
6 | "node": true
7 | },
8 | "globals": {
9 | "describe": true,
10 | "it": true
11 | },
12 | "extends": ["eslint:recommended", "plugin:react/recommended"],
13 | "parserOptions": {
14 | "ecmaFeatures": {
15 | "experimentalObjectRestSpread": true,
16 | "jsx": true
17 | },
18 | "sourceType": "module",
19 | "ecmaVersion": 2017
20 | },
21 | "plugins": [
22 | "react"
23 | ],
24 | "rules": {
25 | "no-unused-vars": [
26 | 1
27 | ],
28 | "no-console": [
29 | 0
30 | ],
31 | "react/prop-types": [
32 | 0
33 | ],
34 | "react/no-danger": [
35 | 1
36 | ],
37 | "indent": [
38 | 1,
39 | 2,
40 | { "SwitchCase": 1 }
41 | ],
42 | "linebreak-style": [
43 | 2,
44 | "unix"
45 | ],
46 | "quotes": [
47 | 1,
48 | "single"
49 | ],
50 | "semi": [
51 | 2,
52 | "always"
53 | ],
54 | "require-yield": [0]
55 | }
56 | };
57 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Logs
2 | logs
3 | *.log
4 | npm-debug.log*
5 | yarn-debug.log*
6 | yarn-error.log*
7 |
8 | # Runtime data
9 | pids
10 | *.pid
11 | *.seed
12 | *.pid.lock
13 |
14 | # Directory for instrumented libs generated by jscoverage/JSCover
15 | lib-cov
16 |
17 | # Coverage directory used by tools like istanbul
18 | coverage
19 |
20 | # nyc test coverage
21 | .nyc_output
22 |
23 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
24 | .grunt
25 |
26 | # Bower dependency directory (https://bower.io/)
27 | bower_components
28 |
29 | # node-waf configuration
30 | .lock-wscript
31 |
32 | # Compiled binary addons (http://nodejs.org/api/addons.html)
33 | build/Release
34 |
35 | # Dependency directories
36 | node_modules/
37 | jspm_packages/
38 |
39 | # Typescript v1 declaration files
40 | typings/
41 |
42 | # Optional npm cache directory
43 | .npm
44 |
45 | # Optional eslint cache
46 | .eslintcache
47 |
48 | # Optional REPL history
49 | .node_repl_history
50 |
51 | # Output of 'npm pack'
52 | *.tgz
53 |
54 | # Yarn Integrity file
55 | .yarn-integrity
56 |
57 | # dotenv environment variables file
58 | .env
59 |
60 | private/
61 | my_config.js
62 | my_config.json
63 | .DS_Store
64 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM node:16
2 | WORKDIR /app
3 | COPY package.json package-lock.json /app/
4 | RUN npm install --only=prod
5 | COPY . /app
6 | # ubuntu 添加根证书相关操作
7 | RUN cd ~ \
8 | && mkdir .anyproxy \
9 | && cd .anyproxy \
10 | && mv /app/certificates ~/.anyproxy/ \
11 | && cp ~/.anyproxy/certificates/rootCA.crt /usr/local/share/ca-certificates/ \
12 | && update-ca-certificates
13 | # 修改时区
14 | RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
15 | EXPOSE 8101 8104 8102
16 | CMD ["node", "index.js"]
17 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 liqiang
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # wechat_spider 微信爬虫
2 |
3 | 基于 Node.js 的微信爬虫,通过中间人代理的原理,批量获取微信文章数据,包括阅读量、点赞量、在看数、评论和文章正文等数据。
4 |
5 | 使用代理模块 AnyProxy。代码已支持 AnyProxy 4 版本。
6 |
7 | 支持 Docker 部署。
8 |
9 | 项目可运行在个人电脑上,也可部署在服务器上。
10 |
11 | ## 开始
12 |
13 | ### 安装前准备
14 |
15 | - 安装 Node,推荐版本 16
16 | - 安装 MongoDB,最近版本即可
17 | - 安装 Redis,最近版本即可
18 |
19 | ### 安装
20 |
21 | ```bash
22 | git clone https://github.com/lqqyt2423/wechat_spider.git
23 | cd wechat_spider
24 | npm install
25 | ```
26 |
27 | 本项目基于代理模块 AnyProxy,解析微信 HTTPS 请求需在电脑和手机上都安装证书。可参考:[AnyProxy 文档](http://anyproxy.io/cn/#%E8%AF%81%E4%B9%A6%E9%85%8D%E7%BD%AE)。
28 |
29 | ### 通过 Docker 部署
30 |
31 | ```bash
32 | git clone https://github.com/lqqyt2423/wechat_spider.git
33 | cd wechat_spider
34 | # build image
35 | docker-compose build
36 | # 运行实例(mongo数据存储地址需通过环境变量MONGO_PATH传入)
37 | MONGO_PATH=/data/mongo docker-compose up
38 | # 终止运行
39 | docker-compose down
40 | ```
41 |
42 | - `Dockerfile` 中已经设置了在 `Linux` 环境的 Docker 中添加根证书的操作步骤,所以接下来仅需在手机上安装 https 证书即可。
43 | - 最终手机上设置的代理 ip 还是需要以自己电脑上的 ip 为准,需忽略 Docker 实例中打印的 ip 地址
44 | - 可编辑 `Dockerfile` 和 `docker-compose.yml` 改变部署规则
45 |
46 | ## 使用
47 |
48 | ```bash
49 | cd wechat_spider
50 | npm start
51 | ```
52 |
53 | 1. 确保电脑和手机连接同一 WIFI,`npm start` 之后,命令行输出`请配置代理: xx.xx.xx.xx:8101` 类似语句,手机设置代理为此 IP 和端口
54 | 2. 手机上测试打开任一公众号历史文章详情页和文章页,观察电脑命令行的输出,查看数据是否保存至 MongoDB
55 |
56 | > - 如需测试自动翻页,可先多次分别打开不同的公众号的历史详情页,等数据库中有了翻页的基础公众号信息之后,再随便进入历史页等待翻页跳转
57 | > - 翻页逻辑仅支持公众号历史页面跳公众号历史页面,微信文章页面跳微信文章页面,两个不同页面不能互相跳转
58 |
59 | ### 针对微信新版需注意
60 |
61 | 1. 历史页面可自行拼接后发送至微信中打开,拼接规则为:
62 |
63 | ```javascript
64 | var biz = 'MzI4NjQyMTM2Mw==';
65 | var history_page = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=' + biz + '&scene=124#wechat_redirect';
66 | // https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MzI4NjQyMTM2Mw==&scene=124#wechat_redirect
67 | ```
68 |
69 | 2. 进入微信文章页面先刷新一下
70 |
71 | ### 自定义配置
72 |
73 | 可编辑 `config.js` 文件进行自定义配置,文件中每个配置项都有详细的说明。
74 |
75 | 可配置项举例如下:
76 |
77 | - 控制是否开启文章或历史详情页自动跳转
78 | - 控制跳转时间间隔
79 | - 根据文章发布时间控制抓取范围
80 | - 是否保存文章正文内容
81 | - 是否保存文章评论
82 |
83 | 需注意,本项目修改了 AnyProxy 的默认端口。连接代理的端口改为 8101,AnyProxy 管理界面的端口改为 8102,且仅在 `NODE_ENV=development` 时才会开启 AnyProxy 的管理界面功能。如需修改,可编辑 `config.js`。
84 |
85 | ### 可视化界面
86 |
87 | 前端页面已打包好,启动项目后,如无修改默认 `server port` 配置,浏览器直接访问 `http://localhost:8104` 即可。检测数据有无抓取保存直接刷新此页面即可。
88 |
89 | 
90 |
91 | 前端页面由 `React` 编写,如需修改,可编辑 `client` 目录中的代码,修改后在 `client` 目录下执行 `npm run build` 重新打包。
92 |
93 | ### MongoDB 数据信息
94 |
95 | 数据库 database: wechat_spider
96 |
97 | 数据表 collections:
98 |
99 | - posts - 文章数据
100 | - profiles - 公众号数据
101 | - comments - 评论数据
102 |
103 | ### 从 MongoDB 导出数据
104 |
105 | #### 命令行直接导出数据
106 |
107 | ```bash
108 | mongoexport --db wechat_spider --collection posts --type=csv --fields title,link,publishAt,readNum,likeNum,likeNum2,msgBiz,msgMid,msgIdx,sourceUrl,cover,digest,isFail --out ~/Desktop/posts.csv
109 | ```
110 |
111 | #### 脚本导出
112 |
113 | 可参考文件 `/test/exportData.js` 。
114 |
115 | ## 感谢
116 |
117 | 感谢此文章提供思路:[微信公众号文章批量采集系统的构建](https://zhuanlan.zhihu.com/p/24302048)
118 |
119 | ### 赞助我
120 |
121 | 如果你觉得这个项目对你有帮助,不妨考虑给我买杯咖啡。
122 |
123 | 赞助时可备注来源 wechat spider,我会将你添加至下面的赞助列表中。
124 |
125 |
{
20 | this.setState({
21 | q: event.target.value
22 | });
23 | }}
24 | onKeyPress={event => {
25 | if (event.key == 'Enter') {
26 | onEnter(q);
27 | }
28 | }}
29 | hintText={hintText}
30 | fullWidth={fullWidth}
31 | />
32 | );
33 | }
34 | }
35 |
36 | SearchInput.propTypes = {
37 | onEnter: PropTypes.func.isRequired,
38 | value: PropTypes.string,
39 | hintText: PropTypes.string,
40 | fullWidth: PropTypes.bool
41 | };
42 |
--------------------------------------------------------------------------------
/client/app/config.js:
--------------------------------------------------------------------------------
1 | // const ENV = process.env.NODE_ENV || 'development';
2 |
3 | const config = {
4 | posts: '/api/posts',
5 | post: '/api/posts',
6 | profiles: '/api/profiles',
7 | profile: '/api/profiles',
8 | conf: '/api/conf',
9 | };
10 |
11 | export default config;
12 |
--------------------------------------------------------------------------------
/client/app/containers/doc.jsx:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import { connect } from 'react-redux';
3 | import {
4 | showMessage,
5 | fetchPost,
6 | updatePost,
7 | fetchProfile,
8 | updateProfile,
9 | fetchConf,
10 | updateConf,
11 | } from '../actions';
12 | import Loading from '../components/loading.jsx';
13 | import Edit from '../components/edit.jsx';
14 |
15 | class Doc extends React.Component {
16 | constructor(props) {
17 | super(props);
18 | }
19 |
20 | initDoc(props) {
21 | const { location, params, dispatch } = props;
22 | const { pathname } = location;
23 | if (pathname.includes('posts')) {
24 | this.fetchFn = fetchPost;
25 | this.updateFn = updatePost;
26 | this.stateName = 'post';
27 | this.statePath = 'posts';
28 | } else if (pathname.includes('profiles')) {
29 | this.fetchFn = fetchProfile;
30 | this.updateFn = updateProfile;
31 | this.stateName = 'profile';
32 | this.statePath = 'profiles';
33 | } else if (pathname.includes('conf')) {
34 | this.fetchFn = fetchConf;
35 | this.updateFn = updateConf;
36 | this.stateName = 'conf';
37 | this.statePath = 'conf';
38 | } else {
39 | throw new Error('invalide pathname');
40 | }
41 |
42 | const { id } = params;
43 | if (id) {
44 | dispatch(this.fetchFn(id));
45 | } else {
46 | dispatch(this.fetchFn());
47 | }
48 | }
49 |
50 | componentDidMount() {
51 | this.initDoc(this.props);
52 | }
53 |
54 | // eslint-disable-next-line
55 | componentWillReceiveProps(nextProps) {
56 | // 路由变化,重新请求接口
57 | if (nextProps.location.pathname !== this.props.location.pathname) {
58 | this.initDoc(nextProps);
59 | }
60 | }
61 |
62 | render() {
63 | if (!this.fetchFn) return ;
64 | const { isFetching, location, params, dispatch, history } = this.props;
65 | const { id } = params;
66 | const { pathname } = location;
67 | const isEdit = /edit$/.test(pathname);
68 | const doc = this.props[this.stateName].data;
69 | if (isFetching || !doc) return ;
70 |
71 | return (
72 |
73 | {
80 | let res;
81 | if (id) {
82 | res = await this.updateFn(id, doc);
83 | } else {
84 | res = await this.updateFn(doc);
85 | }
86 | if ([1, 2].includes(res.state)) dispatch(showMessage(res.message));
87 | if ([0, 1].includes(res.state)) {
88 | if (id) {
89 | dispatch(this.fetchFn(id));
90 | history.replace(`/${this.statePath}/${id}`);
91 | } else {
92 | dispatch(this.fetchFn());
93 | history.replace(`/${this.statePath}`);
94 | }
95 | }
96 | }}
97 | />
98 |
99 | );
100 | }
101 | }
102 |
103 | export default connect(state => state)(Doc);
104 |
--------------------------------------------------------------------------------
/client/app/containers/posts.jsx:
--------------------------------------------------------------------------------
import React from 'react';
import { connect } from 'react-redux';
import { fetchPosts, assembleUrl } from '../actions';
import Loading from '../components/loading.jsx';
// The component file is `Paginator.jsx` (capital P); the import path must
// match its case exactly or the build fails on case-sensitive filesystems.
import Paginator from '../components/Paginator.jsx';
import RaisedButton from 'material-ui/RaisedButton';
import moment from 'moment';
import { Link } from 'react-router';
import Search from './search.jsx';
10 |
/**
 * Describe how long after publication the counters were last updated,
 * using the coarsest unit that fits: days, then months, then years.
 *
 * @param {*} update - time of the last counter update (any moment() input).
 * @param {*} publish - publication time (any moment() input).
 * @returns {string} e.g. `10天`, `3月` or `2年`.
 */
function timeDiff(update, publish) {
  const updatedAt = moment(update);
  const publishedAt = moment(publish);
  const elapsed = unit => updatedAt.diff(publishedAt, unit);

  const days = elapsed('days');
  if (days < 31) {
    return `${days}天`;
  }

  const months = elapsed('months');
  if (months < 13) {
    return `${months}月`;
  }

  return `${elapsed('years')}年`;
}
21 |
22 | class Posts extends React.Component {
23 |
24 | constructor(props) {
25 | super(props);
26 | this.sortByTime = this.sortByTime.bind(this);
27 | this.judeMainDataShow = this.judeMainDataShow.bind(this);
28 | this.returnCurrentSearchArgs = this.returnCurrentSearchArgs.bind(this);
29 | }
30 |
31 | componentDidMount() {
32 | let { dispatch, location } = this.props;
33 | dispatch(fetchPosts(location.query));
34 | }
35 |
36 | // eslint-disable-next-line
37 | componentWillReceiveProps(nextProps) {
38 | if (nextProps.location.search !== this.props.location.search) {
39 | let { dispatch } = this.props;
40 | dispatch(fetchPosts(nextProps.location.query));
41 | }
42 | }
43 |
44 | returnCurrentSearchArgs() {
45 | const { location } = this.props;
46 | const { search } = location;
47 | const searchArgs = {};
48 | search.replace('?', '').split('&').forEach(item => {
49 | let key = item.split('=')[0];
50 | let value = item.replace(`${key}=`, '');
51 | if (key && value) searchArgs[key] = value;
52 | });
53 | return searchArgs;
54 | }
55 |
56 | sortByTime(sortType) {
57 | const { location, history } = this.props;
58 | const { search, pathname } = location;
59 | const searchArgs = this.returnCurrentSearchArgs();
60 | let iconClass = 'fa-sort';
61 | let nextSortType = `-${sortType}`;
62 | if (search && search.indexOf('?') === 0) {
63 | if (searchArgs.sortWay) {
64 | if (searchArgs.sortWay === sortType) {
65 | iconClass = 'fa-sort-asc';
66 | nextSortType = `-${sortType}`;
67 | }
68 | if (searchArgs.sortWay === `-${sortType}`) {
69 | iconClass = 'fa-sort-desc';
70 | nextSortType = sortType;
71 | }
72 | }
73 | }
74 | const nextQuery = Object.assign({}, searchArgs, {
75 | sortWay: nextSortType
76 | });
77 | const path = assembleUrl(pathname, nextQuery);
78 | return ( { history.push(path); }} className={`fa ${iconClass}`}>);
79 | }
80 |
81 | judeMainDataShow(key) {
82 | const searchArgs = this.returnCurrentSearchArgs();
83 | const mainDataVal = searchArgs.mainData;
84 | const primary = { primary: true };
85 | if (key === 'all' && !mainDataVal) return primary;
86 | if (key === 'yes' && mainDataVal === 'true') return primary;
87 | if (key === 'no' && mainDataVal === 'false') return primary;
88 | return null;
89 | }
90 |
91 | renderFilter() {
92 | const { location, history, posts } = this.props;
93 | const { pathname } = location;
94 | const searchArgs = this.returnCurrentSearchArgs();
95 | const style = {
96 | margin: '10px 15px 10px 0'
97 | };
98 | const { metadata } = posts;
99 | let count;
100 | if (metadata) count = metadata.count;
101 | return (
102 |
103 | {
104 | const nextQuery = { ...searchArgs };
105 | delete nextQuery.mainData;
106 | const path = assembleUrl(pathname, nextQuery);
107 | history.push(path);
108 | }} label="全部数据" style={style} />
109 | {
110 | const nextQuery = { ...searchArgs, mainData: 'true' };
111 | const path = assembleUrl(pathname, nextQuery);
112 | history.push(path);
113 | }} label="有阅读量" style={style} />
114 | {
115 | const nextQuery = { ...searchArgs, mainData: 'false' };
116 | const path = assembleUrl(pathname, nextQuery);
117 | history.push(path);
118 | }} label="无阅读量" style={style} />
119 | {!count ? '' : 共{count}条数据}
120 |
121 | );
122 | }
123 |
124 | render() {
125 | const { isFetching, posts, history, location } = this.props;
126 | const { search, pathname } = location;
127 | if (isFetching || !posts.data) return ;
128 | const { metadata, data } = posts;
129 |
130 | // show
131 | const showData = data.map(i => {
132 | i.title = i.title || '';
133 | let showTitle = i.title.substr(0, 25) || '暂无';
134 | if (i.link) {
135 | showTitle = {showTitle};
136 | } else {
137 | showTitle = {showTitle};
138 | }
139 | return {
140 | id: i.id,
141 | publishAt: i.publishAt ? moment(i.publishAt).format('YY-MM-DD HH:mm') : '暂无',
142 | showTitle,
143 | msgIdx: i.msgIdx || '0',
144 | readNum: i.readNum || '',
145 | likeNum: i.likeNum || '',
146 | updateNumAt: i.updateNumAt ? moment(i.updateNumAt).format('YY-MM-DD HH:mm') : '暂无',
147 | updateInterval: (i.updateNumAt && i.publishAt) ? timeDiff(i.updateNumAt, i.publishAt) : '',
148 | showProfile: {i.profile ? (
{i.profile.title}) : i.msgBiz}
149 | };
150 | });
151 |
152 | return (
153 |
154 | {this.renderFilter()}
155 |
161 |
162 |
163 |
164 | ID |
165 | 发布时间 {this.sortByTime('publishAt')} |
166 | 文章标题 |
167 | 位置 |
168 | 阅读数 |
169 | 点赞数 |
170 | 更新时间 {this.sortByTime('updateNumAt')} |
171 | 间隔 |
172 | 公众号 |
173 | 详情 |
174 |
175 |
176 |
177 | {
178 | showData.map(i => {
179 | return (
180 |
181 | {i.id} |
182 | {i.publishAt} |
183 | {i.showTitle} |
184 | {i.msgIdx} |
185 | {i.readNum} |
186 | {i.likeNum} |
187 | {i.updateNumAt} |
188 | {i.updateInterval} |
189 | {i.showProfile} |
190 | 详情 |
191 |
192 | );
193 | })
194 | }
195 |
196 |
197 |
198 |
199 | );
200 | }
201 | }
202 |
203 | export default connect(state => state)(Posts);
204 |
--------------------------------------------------------------------------------
/client/app/containers/profiles.jsx:
--------------------------------------------------------------------------------
import React from 'react';
import { connect } from 'react-redux';
import { fetchProfiles } from '../actions';
import Loading from '../components/loading.jsx';
import moment from 'moment';
// The component file is `Paginator.jsx` (capital P); the import path must
// match its case exactly or the build fails on case-sensitive filesystems.
import Paginator from '../components/Paginator.jsx';
import { Link } from 'react-router';
import Search from './search.jsx';
9 |
10 | class Profiles extends React.Component {
11 |
12 | constructor(props) {
13 | super(props);
14 | this.returnCurrentSearchArgs = this.returnCurrentSearchArgs.bind(this);
15 | }
16 |
17 | componentDidMount() {
18 | let { dispatch, location } = this.props;
19 | dispatch(fetchProfiles(location.query));
20 | }
21 |
22 | // eslint-disable-next-line
23 | componentWillReceiveProps(nextProps) {
24 | if (nextProps.location.search !== this.props.location.search) {
25 | let { dispatch } = this.props;
26 | dispatch(fetchProfiles(nextProps.location.query));
27 | }
28 | }
29 |
30 | returnCurrentSearchArgs() {
31 | const { location } = this.props;
32 | const { search } = location;
33 | const searchArgs = {};
34 | search.replace('?', '').split('&').forEach(item => {
35 | let key = item.split('=')[0];
36 | let value = item.replace(`${key}=`, '');
37 | if (key && value) searchArgs[key] = value;
38 | });
39 | return searchArgs;
40 | }
41 |
42 | render() {
43 | let { isFetching, profiles, history, location } = this.props;
44 | let { search, pathname } = location;
45 | if (isFetching || !profiles.data) return ;
46 | let metadata = profiles.metadata;
47 | return (
48 |
49 |
55 |
56 |
57 |
58 | ID |
59 | 更新时间 |
60 | 头像 |
61 | 公众号 |
62 | 最新 |
63 | 最旧 |
64 | 文章数 |
65 | 有数据 |
66 | 差 |
67 | MsgBiz |
68 | 详情 |
69 |
70 |
71 |
72 | {
73 | profiles.data.map(profile => {
74 | return (
75 |
76 | {profile.id} |
77 | {profile.openHistoryPageAt ? moment(profile.openHistoryPageAt).format('YY-MM-DD HH:mm') : ''} |
78 |  |
79 | {profile.title} |
80 | {profile.newestPostTime ? moment(profile.newestPostTime).format('YY-MM-DD'): ''} |
81 | {profile.oldestPostTime ? moment(profile.oldestPostTime).format('YY-MM-DD'): ''} |
82 | {profile.postsAllCount} |
83 | {profile.postsHasDataCount} |
84 | {profile.postsAllCount - profile.postsHasDataCount} |
85 | {profile.msgBiz} |
86 | 详情 |
87 |
88 | );
89 | })
90 | }
91 |
92 |
93 |
94 |
95 | );
96 | }
97 | }
98 |
99 | export default connect(state => state)(Profiles);
100 |
--------------------------------------------------------------------------------
/client/app/containers/search.jsx:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import { assembleUrl } from '../actions';
3 | import SearchInput from '../components/searchInput.jsx';
4 |
5 | class Search extends React.Component {
6 |
7 | constructor(props) {
8 | super(props);
9 | }
10 |
11 | render() {
12 | const { location, history, searchArgs, defaultText } = this.props;
13 | const { pathname } = location;
14 | let { q = '' } = searchArgs;
15 | q = decodeURIComponent(q);
16 | const nextQuery = { ...searchArgs };
17 |
18 | // 去掉分页query
19 | if (nextQuery.page) delete nextQuery.page;
20 | return (
21 |
24 | {
29 | if (q) nextQuery.q = q;
30 | if (!q && nextQuery.q) delete nextQuery.q;
31 | const path = assembleUrl(pathname, nextQuery);
32 | history.push(path);
33 | }}
34 | />
35 |
36 | );
37 | }
38 | }
39 |
40 | export default Search;
41 |
--------------------------------------------------------------------------------
/client/app/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | 微信爬虫数据管理
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/client/app/index.jsx:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import { render } from 'react-dom';
3 | import { Provider, connect } from 'react-redux';
4 | import { createStore, applyMiddleware } from 'redux';
5 | import reducer from './reducers';
6 | import { closeMessage } from './actions';
7 | import Dialog from 'material-ui/Dialog';
8 | import { Router, Route, IndexRoute } from 'react-router';
9 | import MuiThemeProvider from 'material-ui/styles/MuiThemeProvider';
10 | import AppBar from 'material-ui/AppBar';
11 | import Drawer from 'material-ui/Drawer';
12 | import { List, ListItem } from 'material-ui/List';
13 | import { createHistory, useBasename } from 'history';
14 | import 'bootstrap/dist/css/bootstrap.css';
15 | import 'font-awesome/css/font-awesome.min.css';
16 | import './style/style.css';
17 | const ENV = process.env.NODE_ENV || 'development';
18 | const BASE_URI = '/';
19 |
20 | import thunkMiddleware from 'redux-thunk';
21 | import createLogger from 'redux-logger';
22 |
23 | let reduxMiddlewares = [thunkMiddleware];
24 | if (ENV === 'development') {
25 | reduxMiddlewares.push(createLogger);
26 | }
27 | let store = createStore(
28 | reducer,
29 | applyMiddleware(...reduxMiddlewares)
30 | );
31 |
32 | import Posts from './containers/posts.jsx';
33 | import Profiles from './containers/profiles.jsx';
34 | import Doc from './containers/doc.jsx';
35 |
36 | class App extends React.Component {
37 |
38 | constructor(props) {
39 | super(props);
40 | }
41 |
42 | render() {
43 | const { history, message, dispatch } = this.props;
44 | return (
45 |
46 |
47 |
48 | { history.push('/'); }} />
49 |
50 | { history.push('/posts'); }} />
51 | { history.push('/profiles'); }} />
52 | { history.push('/conf'); }} />
53 |
54 |
55 |
56 |
57 | {this.props.children}
58 |
59 |
67 |
68 |
69 | );
70 | }
71 | }
72 |
73 | const connectedApp = connect(state => state)(App);
74 |
75 | const browserHistory = useBasename(createHistory)({
76 | basename: BASE_URI
77 | });
78 |
79 | render(
80 | (
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 | ),
97 | document.getElementById('app')
98 | );
99 |
--------------------------------------------------------------------------------
/client/app/reducers.js:
--------------------------------------------------------------------------------
1 | import {
2 | REQUEST_POSTS,
3 | RECEIVE_POSTS,
4 | REQUEST_POST,
5 | RECEIVE_POST,
6 | REQUEST_PROFILES,
7 | RECEIVE_PROFILES,
8 | REQUEST_PROFILE,
9 | RECEIVE_PROFILE,
10 | SHOW_MESSAGE,
11 | CLOSE_MESSAGE,
12 | REQUEST_CONF,
13 | RECEIVE_CONF,
14 | } from './actions';
15 |
16 | const initialState = {
17 | posts: {},
18 | post: {},
19 | profiles: {},
20 | profile: {},
21 | isFetching: false,
22 | message: {
23 | open: false,
24 | content: '',
25 | },
26 | // crawl server side config
27 | conf: {},
28 | };
29 |
/**
 * Root reducer for the admin UI store.
 *
 * - Any REQUEST_* action raises the shared `isFetching` flag.
 * - A RECEIVE_* action stores its payload under the matching state key and
 *   clears `isFetching`.
 * - SHOW_MESSAGE / CLOSE_MESSAGE open and reset the message dialog state.
 * - Every other action returns the state object unchanged.
 *
 * @param {Object} state - current store state (defaults to `initialState`).
 * @param {Object} action - dispatched action; `type` selects the transition.
 * @returns {Object} the next state; the input state is never mutated.
 */
function reducer(state = initialState, action) {
  const { type } = action;

  // Payloads travel on the action under the same key they occupy in state.
  const stored = slice => ({ ...state, isFetching: false, [slice]: action[slice] });
  const withMessage = (open, content) => ({ ...state, message: { open, content } });

  const fetchStarters = [
    REQUEST_POSTS,
    REQUEST_POST,
    REQUEST_PROFILES,
    REQUEST_PROFILE,
    REQUEST_CONF,
  ];
  if (fetchStarters.includes(type)) {
    return { ...state, isFetching: true };
  }

  if (type === RECEIVE_POSTS) return stored('posts');
  if (type === RECEIVE_POST) return stored('post');
  if (type === RECEIVE_PROFILES) return stored('profiles');
  if (type === RECEIVE_PROFILE) return stored('profile');
  if (type === SHOW_MESSAGE) return withMessage(true, action.content);
  if (type === CLOSE_MESSAGE) return withMessage(false, '');
  if (type === RECEIVE_CONF) return stored('conf');

  return state;
}
88 |
89 | export default reducer;
90 |
--------------------------------------------------------------------------------
/client/app/style/style.css:
--------------------------------------------------------------------------------
1 | body {
2 | padding-left: 100px;
3 | font-family: "SFMono-Regular", Consolas, "Liberation Mono", Menlo, Courier;
4 | }
5 |
6 | .wrapper {
7 | box-sizing: border-box;
8 | width: 100%;
9 | padding:0 20px;
10 | margin: 10px auto;
11 | }
--------------------------------------------------------------------------------
/client/build/448c34a56d699c29117adc64c43affeb.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lqqyt2423/wechat_spider/17bca49c5a2606f6b66c7ba194dd54ac180dadda/client/build/448c34a56d699c29117adc64c43affeb.woff2
--------------------------------------------------------------------------------
/client/build/674f50d287a8c48dc19ba404d20fe713.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lqqyt2423/wechat_spider/17bca49c5a2606f6b66c7ba194dd54ac180dadda/client/build/674f50d287a8c48dc19ba404d20fe713.eot
--------------------------------------------------------------------------------
/client/build/af7ae505a9eed503f8b8e6982036873e.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lqqyt2423/wechat_spider/17bca49c5a2606f6b66c7ba194dd54ac180dadda/client/build/af7ae505a9eed503f8b8e6982036873e.woff2
--------------------------------------------------------------------------------
/client/build/b06871f281fee6b241d60582ae9369b9.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lqqyt2423/wechat_spider/17bca49c5a2606f6b66c7ba194dd54ac180dadda/client/build/b06871f281fee6b241d60582ae9369b9.ttf
--------------------------------------------------------------------------------
/client/build/e18bbf611f2a2e43afc071aa2f4e1512.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lqqyt2423/wechat_spider/17bca49c5a2606f6b66c7ba194dd54ac180dadda/client/build/e18bbf611f2a2e43afc071aa2f4e1512.ttf
--------------------------------------------------------------------------------
/client/build/f4769f9bdb7466be65088239c12046d1.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lqqyt2423/wechat_spider/17bca49c5a2606f6b66c7ba194dd54ac180dadda/client/build/f4769f9bdb7466be65088239c12046d1.eot
--------------------------------------------------------------------------------
/client/build/fa2772327f55d8198301fdb8bcfc8158.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lqqyt2423/wechat_spider/17bca49c5a2606f6b66c7ba194dd54ac180dadda/client/build/fa2772327f55d8198301fdb8bcfc8158.woff
--------------------------------------------------------------------------------
/client/build/fee66e712a8a08eef5805a46892932ad.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lqqyt2423/wechat_spider/17bca49c5a2606f6b66c7ba194dd54ac180dadda/client/build/fee66e712a8a08eef5805a46892932ad.woff
--------------------------------------------------------------------------------
/client/build/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | 微信爬虫数据管理
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/client/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "wechat_spider_client",
3 | "version": "1.1.0",
4 | "description": "",
5 | "main": "webpack.config.js",
6 | "scripts": {
7 | "build": "NODE_ENV=production webpack",
8 | "start": "webpack-dev-server"
9 | },
10 | "author": "liqiang",
11 | "license": "ISC",
12 | "dependencies": {
13 | "bootstrap": "^3.3.7",
14 | "classnames": "^2.2.3",
15 | "font-awesome": "^4.7.0",
16 | "lodash": "^4.13.1",
17 | "lodash.assign": "^4.0.9",
18 | "lodash.camelcase": "^4.3.0",
19 | "lodash.clonedeep": "^4.5.0",
20 | "material-ui": "^0.19.2",
21 | "moment": "^2.21.0",
22 | "prop-types": "^15.5.10",
23 | "react": "^15.6.1",
24 | "react-dom": "^15.6.1",
25 | "react-redux": "^5.0.6",
26 | "react-router": "^2.0.1",
27 | "redux": "^3.7.2",
28 | "redux-logger": "^3.0.6",
29 | "redux-thunk": "^2.2.0"
30 | },
31 | "devDependencies": {
32 | "babel": "^6.23.0",
33 | "babel-core": "^6.25.0",
34 | "babel-loader": "^7.1.1",
35 | "babel-plugin-transform-runtime": "^6.23.0",
36 | "babel-preset-env": "^1.6.0",
37 | "babel-preset-es2015": "^6.24.1",
38 | "babel-preset-react": "^6.24.1",
39 | "babel-preset-stage-2": "^6.24.1",
40 | "css-loader": "^0.28.4",
41 | "file-loader": "^1.1.4",
42 | "html-webpack-plugin": "^2.29.0",
43 | "style-loader": "^0.18.2",
44 | "webpack": "^3.3.0",
45 | "webpack-dev-server": "^2.6.1"
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/client/webpack.config.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const path = require('path');
4 | const webpack = require('webpack');
5 | const HtmlWebpackPlugin = require('html-webpack-plugin');
6 |
7 | let NODE_ENV = process.env.NODE_ENV || 'development';
8 |
9 | const publicPath = '/';
10 |
11 | const babelLoader = {
12 | loader: 'babel-loader',
13 | options: {
14 | cacheDirectory: true,
15 | presets: ['es2015', 'react', 'stage-2'],
16 | plugins: [
17 | ['transform-runtime', {
18 | 'regenerator': true,
19 | }],
20 | ],
21 | }
22 | };
23 |
24 | const plugins = [
25 | new webpack.HotModuleReplacementPlugin(),
26 | new HtmlWebpackPlugin({
27 | title: 'react',
28 | template: './app/index.html'
29 | })
30 | ];
31 |
32 | if (NODE_ENV != 'development') {
33 | plugins.push(
34 | new webpack.DefinePlugin({
35 | 'process.env': {
36 | NODE_ENV: JSON.stringify('production')
37 | }
38 | }),
39 | new webpack.optimize.UglifyJsPlugin()
40 | );
41 | }
42 |
43 | module.exports = {
44 | entry: './app/index.jsx',
45 | output: {
46 | filename: 'bundle.js',
47 | path: path.resolve(__dirname, './build'),
48 | publicPath: publicPath
49 | },
50 | plugins: plugins,
51 | devtool: NODE_ENV == 'development' ? 'eval' : undefined,
52 | devServer: {
53 | hot: true,
54 | contentBase: './',
55 | historyApiFallback: true,
56 | proxy: {
57 | '/api': 'http://localhost:8104',
58 | '/favicon.png': 'http://localhost:8104'
59 | }
60 | },
61 | module: {
62 | rules: [
63 | {
64 | test: /\.js|jsx$/,
65 | use: [
66 | babelLoader
67 | ],
68 | exclude: /(node_modules|bower_components)/
69 | },
70 | {
71 | test: /\.css$/,
72 | use: [
73 | 'style-loader',
74 | 'css-loader'
75 | ]
76 | },
77 | {
78 | test: /\.(woff|woff2|eot|ttf|otf|svg)$/,
79 | use: [
80 | 'file-loader'
81 | ]
82 | }
83 | ]
84 | }
85 | };
86 |
--------------------------------------------------------------------------------
/config.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const merge = require('./utils/merge');
// Resolve the runtime environment once; the flags below are derived from it.
const env = process.env.NODE_ENV || 'development';
const isDev = env === 'development';
const isProd = env === 'production';

const config = {

  // Environment
  env,
  isDev,
  isProd,

  // Port used by the API and the data-management front end
  serverPort: 8104,

  // anyproxy settings
  anyproxy: {
    // Proxy port; has to be configured manually on the device before crawling
    port: 8101,
    // anyproxy's own web interface
    webInterface: {
      // Whether it is enabled
      enable: isDev,
      // Port it is served on
      webPort: 8102
    },
    // Bandwidth limit in kb/s (number)
    // undefined means no limit
    throttle: undefined,
    // Whether to force interception of all HTTPS traffic
    // anyproxy performs poorly, so this is off by default and only
    // WeChat-related domains are intercepted
    forceProxyHttps: false,
    // Whether to proxy websocket traffic
    wsIntercept: false,
    // Whether anyproxy stays silent on the command line (no request logs etc.)
    silent: !isDev
  },

  // MongoDB settings
  mongodb: {
    db: 'mongodb://127.0.0.1:27017/wechat_spider'
  },

  // Redis settings
  redis: {
    port: 6379,
    host: '127.0.0.1',

    // Key holding the list of post links to crawl
    POST_LIST_KEY: 'wechat_spider:post_list',
    // Key holding the list of profile history pages to crawl
    PROFILE_LIST_KEY: 'wechat_spider:profile_list'
  },

  // Custom crawl rules
  rule: {
    // Settings for post (article) pages
    page: {
      // Whether automatic page jumping is disabled
      // Jumping goes from one post page to the next; post pages and
      // profile history pages cannot jump to each other
      disable: false,
      // Interval between jumps, in seconds
      jumpInterval: 2,

      // Publish-time range of the posts to jump to
      minTime: new Date('2022/2/1'),
      maxTime: new Date('2022/8/1'),

      // Whether posts that were already crawled should be crawled again
      isCrawlExist: true,
      // When the option above is true, this controls the re-crawl window:
      // updateNumAt - publishAt < crawlExistInterval => crawl again
      // Defaults to 3 days, after which the numbers are considered stable
      crawlExistInterval: 1000 * 60 * 60 * 24 * 3,

      // Profile biz values to crawl [string]
      // Empty means no restriction
      targetBiz: [],

      // Whether to save the post content
      // Content takes a lot of space, especially as html
      isSavePostContent: isDev,
      // Format of the saved content: html/text
      saveContentType: 'text',
    },

    // Settings for the "view all history posts" page of a profile
    profile: {
      // Whether automatic page jumping is disabled
      // Post pages and profile history pages cannot jump to each other
      disable: false,
      // Interval between jumps, in seconds
      jumpInterval: 8,

      // The page scrolls down automatically
      // and stops once it reaches the time configured here,
      // then jumps to the next profile history page
      minTime: new Date('2022/2/1'),

      // Profiles already crawled after this time are skipped in this run
      maxUpdatedAt: new Date('2022/8/1'),

      // Profile biz values to crawl [string]
      // Empty means no restriction
      targetBiz: [],
    },

    // Feature: whether to crawl comments
    isCrawlComments: true,

    // Optimization: whether to replace all image requests
    isReplaceImg: !isDev,
    // Optimization: whether to replace the post body shown on the phone
    isReplacePostBody: !isDev,
  },

  // Basic authentication for the proxy
  // When enabled, a user name and password are required to use the proxy
  proxyBasicAuth: {
    enable: false,
    user: 'admin',
    password: '123456',
  },

};
129 |
// Docker deployment: reach mongo and redis through their service host names.
if (process.env.DEPLOY === 'docker') {
  config.mongodb.db = 'mongodb://mongo:27017/wechat_spider';
  config.redis.host = 'redis';
}


// Load the optional user configuration (./my_config.js) and merge it into
// the defaults. Only a missing file is ignored: a syntax or runtime error
// inside my_config.js now surfaces instead of silently falling back to the
// defaults, which previously made a broken custom config look "applied".
const myConfigPath = require('path').join(__dirname, 'my_config.js');
if (require('fs').existsSync(myConfigPath)) {
  merge(config, require(myConfigPath));
}
144 |
145 | module.exports = config;
146 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3'
2 | services:
3 | app:
4 | build: ./
5 | image: wechat-spider:latest
6 | restart: always
7 | environment:
8 | - NODE_ENV=production
9 | - DEPLOY=docker
10 | expose:
11 | - '8101'
12 | - '8104'
13 | - '8102'
14 | ports:
15 | - 8101:8101
16 | - 8104:8104
17 | - 8102:8102
18 | links:
19 | - redis
20 | - mongo
21 | redis:
22 | image: redis:latest
23 | restart: always
24 | mongo:
25 | image: mongo:latest
26 | restart: always
27 | volumes:
28 | - '${MONGO_PATH}:/data/db'
29 |
--------------------------------------------------------------------------------
/imgs/posts_screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lqqyt2423/wechat_spider/17bca49c5a2606f6b66c7ba194dd54ac180dadda/imgs/posts_screenshot.png
--------------------------------------------------------------------------------
/imgs/sponsor-me.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lqqyt2423/wechat_spider/17bca49c5a2606f6b66c7ba194dd54ac180dadda/imgs/sponsor-me.jpeg
--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const AnyProxy = require('anyproxy');
4 | const exec = require('child_process').exec;
5 | const ip = require('ip');
6 | const config = require('./config');
7 | const utils = require('./utils');
8 | const logger = require('./utils/logger');
9 |
10 | const {
11 | anyproxy: anyproxyConfig,
12 | serverPort,
13 | } = config;
14 |
// Guide the user through installing the HTTPS root certificate:
// generate it when it does not exist yet and open the folder that contains it.
const { certMgr } = AnyProxy.utils;
if (!certMgr.ifRootCAFileExists()) {
  certMgr.generateRootCA((error, keyPath) => {
    if (error) {
      logger.error(error);
      return;
    }
    const certDir = require('path').dirname(keyPath);
    logger.info('The cert is generated at %s', certDir);
    // `start .` on Windows, `open .` on every other platform
    const openCommand = /^win/.test(process.platform) ? 'start .' : 'open .';
    exec(openCommand, { cwd: certDir });
  });
}
32 |
const ipAddress = ip.address();

// The proxy server takes every anyproxy option from the config plus the crawl rules.
const proxyServer = new AnyProxy.ProxyServer({
  ...anyproxyConfig,

  // All crawl rules
  rule: require('./rule'),
});

proxyServer.on('ready', () => {
  // Print the configured proxy port rather than a hard-coded 8101, so the
  // hint stays correct when `anyproxy.port` is overridden.
  logger.info('请配置HTTP代理: %s:%s', ipAddress, anyproxyConfig.port);
});

proxyServer.on('error', (e) => {
  logger.error(e);
});

// Clear the cached crawl links in redis first, then start the proxy
utils.delCrawlLinkCache().then(() => {
  proxyServer.start();
}, e => {
  logger.error(e);
});
55 |
56 | // when finished
57 | // proxyServer.close();
58 |
// Start the API / data-management server. The log prints the configured
// port instead of a hard-coded 8104 so it stays correct when serverPort changes.
require('./server').listen(serverPort, () => {
  logger.info('数据管理页面: http://%s:%s', ipAddress, serverPort);
});
62 |
--------------------------------------------------------------------------------
/models/Comment.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const mongoose = require('mongoose');
4 | const Schema = mongoose.Schema;
5 |
// Shape of one entry in a comment's `replies` array.
const replyFields = {
  content: String,
  createTime: Date,
  likeNum: Number
};

// Schema of a comment; `postId` references the Post it belongs to.
const Comment = new Schema({
  postId: { type: 'ObjectId', ref: 'Post' },
  contentId: String,
  nickName: String,
  logoUrl: String,
  content: String,
  createTime: Date,
  likeNum: Number,
  replies: [replyFields]
});

Comment.plugin(require('motime'));

// contentId must be unique across comments
Comment.index({ contentId: 1 }, { unique: true });

mongoose.model('Comment', Comment);
26 |
--------------------------------------------------------------------------------
/models/Post.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const moment = require('moment');
4 | const mongoose = require('mongoose');
5 | const Schema = mongoose.Schema;
6 | const logger = require('../utils/logger');
7 |
// Data structure: post (article)
const postFields = {
  // Title
  title: String,
  // Link
  link: String,
  // Publish time
  publishAt: Date,
  // Read count
  readNum: Number,
  // Like count
  likeNum: Number,
  // "Wow" (在看) count
  likeNum2: Number,
  // Profile identifier
  msgBiz: String,
  // Presumably the identifier of one published message batch
  msgMid: String,
  // Position within the batch: first post, second post, ...
  msgIdx: String,
  // "Read the original" link
  sourceUrl: String,
  // Cover image link
  cover: String,
  // Digest
  digest: String,
  // Author
  author: String,
  // 11 means original content; ordinary posts should have the value 100
  copyrightStat: Number,
  // Whether crawling failed: post deleted or other unknown reasons
  isFail: Boolean,
  // Profile (official account) id
  wechatId: String,
  // Time the read/like counts were last updated
  updateNumAt: Date,

  // Post body as plain text
  content: String,
  // Post body as html
  html: String,
};

const Post = new Schema(postFields, { toJSON: { virtuals: true } });

Post.plugin(require('motime'));

// Virtual populate: resolve the owning Profile through the shared msgBiz
Post.virtual('profile', {
  ref: 'Profile',
  localField: 'msgBiz',
  foreignField: 'msgBiz',
  justOne: true
});

// Indexes, each entry is the argument list of one Post.index() call
[
  [{ publishAt: -1, msgIdx: 1 }],
  [{ publishAt: 1, msgIdx: 1 }],
  [{ updateNumAt: -1 }],
  [{ msgBiz: 1, publishAt: 1, msgIdx: 1 }],
  [{ msgBiz: 1, msgMid: 1, msgIdx: 1 }, { unique: true, sparse: true }],
  [{ link: 1 }],
].forEach(indexArgs => Post.index(...indexArgs));
67 |
// Insert or update a post (or an array of posts).
// Each post must carry msgBiz, msgMid and msgIdx; otherwise null is returned for it.
Post.statics.upsert = async function (post) {
  if (Array.isArray(post)) {
    const pending = post.map(item => this.upsert(item));
    return Promise.all(pending);
  }

  const { msgBiz, msgMid, msgIdx } = post;
  const hasIdentity = Boolean(msgBiz && msgMid && msgIdx);
  if (!hasIdentity) return null;

  const filter = { msgBiz, msgMid, msgIdx };
  const options = { upsert: true, new: true };
  return this.findOneAndUpdate(filter, post, options);
};
83 |
// Log id, title and publish time of one post or of every post in an array.
Post.statics.debugInfo = function (posts) {
  const list = Array.isArray(posts) ? posts : [posts];
  for (const post of list) {
    const { id, title, publishAt } = post;
    const publishText = publishAt ? moment(publishAt).format('YYYY-MM-DD HH:mm') : '';
    logger.info('[post] id: %s, title: %s, publishAt: %s', id, title, publishText);
  }
};
91 |
92 | mongoose.model('Post', Post);
93 |
--------------------------------------------------------------------------------
/models/Profile.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const moment = require('moment');
4 | const mongoose = require('mongoose');
5 | const Schema = mongoose.Schema;
6 | const logger = require('../utils/logger');
7 |
// Data structure: official-account profile
const profileFields = {
  // Profile title (name)
  title: String,
  // Profile id
  wechatId: String,
  // Profile description
  desc: String,
  // Profile identifier
  msgBiz: String,
  // Profile avatar
  headimg: String,
  // Original id, e.g. gh_70949f33590e
  // Follow QR code: https://open.weixin.qq.com/qr/code?username=gh_70949f33590e
  username: String,
  // Time the history page was last opened
  openHistoryPageAt: Date,

  // Province
  province: String,
  // City
  city: String,

  // 0:00 of the day on which the first post was published
  firstPublishAt: Date,
  // Most recent publish time
  latestPublishAt: Date,
  // Maximum number of publications per day
  maxDayPubCount: { type: Number, default: 1 },

  // Unrelated field, can be ignored
  property: String,
};

const Profile = new Schema(profileFields);

Profile.plugin(require('motime'));

// msgBiz is unique; the index is sparse, so documents without msgBiz are not indexed
Profile.index({ msgBiz: 1 }, { unique: true, sparse: true });
45 |
// Look the profile up by msgBiz and log its msgBiz and title
// (the title is logged as undefined when no profile or title is found).
Profile.statics.logInfo = async function (msgBiz) {
  if (!msgBiz) return;
  const found = await this.findOne({ msgBiz });
  const title = found && found.title ? found.title : undefined;
  logger.info('[profile] msgBiz: %s, title: %s', msgBiz, title);
};
54 |
// Log id, msgBiz and title of one profile or of every profile in an array.
Profile.statics.debugInfo = function (profiles) {
  const list = Array.isArray(profiles) ? profiles : [profiles];
  for (const { id, msgBiz, title } of list) {
    logger.info('[profile] id: %s, msgBiz: %s, title: %s', id, msgBiz, title);
  }
};
62 |
// Insert or update a profile (or an array of profiles).
// A profile without msgBiz is skipped and yields null.
Profile.statics.upsert = async function (profile) {
  if (Array.isArray(profile)) {
    const pending = profile.map(item => this.upsert(item));
    return Promise.all(pending);
  }
  const { msgBiz } = profile;
  if (!msgBiz) return null;

  // Look up by msgBiz first; fall back to a document that has the same
  // title but no msgBiz yet.
  let existing = await this.findOne({ msgBiz }).select('_id');
  if (!existing && profile.title) {
    const byTitle = { msgBiz: { $exists: false }, title: profile.title };
    existing = await this.findOne(byTitle).select('_id');
  }

  if (!existing) {
    return await this.create(profile);
  }
  return await this.findByIdAndUpdate(existing.id, profile, { new: true });
};
83 |
// Try to move the profile's latestPublishAt forward.
// `posts` is expected to belong to a single profile; the msgBiz of the first
// post selects the profile. Posts without publishAt are ignored, so a batch
// whose first post lacks a publish time can no longer store `undefined`
// (and log the current time) as the latest publish time.
Profile.statics.updateLatestPublishAt = async function (posts) {
  if (!posts || !posts.length) return;
  const msgBiz = posts[0].msgBiz;

  // Newest publish time within this batch
  let latestPublishAt = null;
  for (const post of posts) {
    if (!post || !post.publishAt) continue;
    if (!latestPublishAt || post.publishAt > latestPublishAt) latestPublishAt = post.publishAt;
  }
  if (!latestPublishAt) return;

  const profile = await this.findOne({ msgBiz });
  if (!profile) return;

  // Only ever move the stored value forward
  if (!profile.latestPublishAt || latestPublishAt > profile.latestPublishAt) {
    await this.findByIdAndUpdate(profile.id, { $set: { latestPublishAt } });
    logger.info('[profile updateLatestPublishAt] biz: %s, title: %s, at: %s', msgBiz, profile.title, moment(latestPublishAt).format('YYYY-MM-DD HH:mm'));
  }
};
100 |
// Recompute maxDayPubCount from ProfilePubRecord.
// Intended to be called periodically.
//
// Profiles that published more than once on some day within the last 90 days
// get the maximum pubCount of that period; every other profile whose stored
// value is still above 1 is reset to 1. The reset now also runs when the
// aggregation returns no rows — previously stale values were kept forever
// in that case.
Profile.statics.calcMaxDayPubCount = async function () {
  // Only the last 90 days are considered
  const compareDate = new Date(Date.now() - 1000 * 60 * 60 * 24 * 90);
  const res = await mongoose.model('ProfilePubRecord').aggregate([
    { $match: { pubCount: { $gt: 1 }, date: { $gt: compareDate } } },
    { $group: { _id: '$msgBiz', maxDayPubCount: { $max: '$pubCount' } } }
  ]);

  for (const item of res) {
    const { _id: msgBiz, maxDayPubCount } = item;
    await this.findOneAndUpdate({ msgBiz }, { $set: { maxDayPubCount } });
    logger.debug('[profile] set %s maxDayPubCount %s', msgBiz, maxDayPubCount);
  }

  // With an empty list `$nin: []` matches every profile, which is exactly
  // the reset wanted when nobody published more than once a day.
  const allMsgBizs = res.map(i => i._id);
  const result = await this.updateMany(
    { maxDayPubCount: { $gt: 1 }, msgBiz: { $nin: allMsgBizs } },
    { $set: { maxDayPubCount: 1 } }
  );
  logger.debug('[profile] 重置每天可发布多条的公众号: %s', result);
};
125 |
126 | mongoose.model('Profile', Profile);
127 |
--------------------------------------------------------------------------------
/models/ProfilePubRecord.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const mongoose = require('mongoose');
4 | const moment = require('moment');
5 | const Schema = mongoose.Schema;
6 | const config = require('../config');
7 | const redis = require('../utils/redis');
8 |
const { minTime: MIN_TIME, jumpInterval: PROFILE_JUMP_INTERVAL } = config.rule.profile;
const DAY_MS = 24 * 60 * 60 * 1000;

// Expiry (seconds) of the value kept in redis: jump interval plus 10 s of slack
const SAVE_LAST_MIN_EX = PROFILE_JUMP_INTERVAL + 10;
const SAVE_RECORD_KEY_PREFIX = 'wx:pub:record:';

// Publish record of a profile, one document per profile and day
const pubRecordFields = {
  // Profile
  msgBiz: { type: String, required: true },
  // Date; every day is included, defined as 0:00 of that day
  date: { type: Date, required: true },
  // Number of publications: 0 - none, 1 - once that day (most common), 2 - and so on
  pubCount: { type: Number, default: 0 },
  // Total number of posts published
  postCount: { type: Number, default: 0 },
};

const ProfilePubRecord = new Schema(pubRecordFields);

ProfilePubRecord.plugin(require('motime'));

// One record per (msgBiz, date)
ProfilePubRecord.index({ msgBiz: 1, date: 1 }, { unique: true });
31 |
// Record the publish history derived from a batch of posts.
// The caller must make sure the batch is split by day and that all posts
// belong to the same profile (the msgBiz of the first post is used).
ProfilePubRecord.statics.savePubRecords = async function (posts) {
  if (!posts || !posts.length) return;
  const msgBiz = posts[0].msgBiz;
  const redisKey = `${SAVE_RECORD_KEY_PREFIX}${msgBiz}`;

  // Per-day statistics of this batch, keyed by the string form of the day's 0:00 Date
  const statsByDay = new Map();
  for (const post of posts) {
    const publishAt = post.publishAt;
    const day = moment(publishAt).startOf('day').toDate();
    const dayKey = String(day);
    const stats = statsByDay.get(dayKey);
    if (stats) {
      stats.postCount += 1;
      stats.pubTimes.add(String(publishAt));
      continue;
    }
    statsByDay.set(dayKey, {
      postCount: 1,
      // distinct publish times of the day => number of publications
      pubTimes: new Set([String(publishAt)]),
      timestamp: day.getTime(),
    });
  }

  // Smallest and largest day covered by this batch
  const dayTimestamps = [...statsByDay.values()].map(stats => stats.timestamp);
  const minTimestamp = Math.min(...dayTimestamps);
  let maxTimestamp = Math.max(...dayTimestamps);

  // The minimum day stored by the previous call (kept in redis) extends the
  // range upwards, so the days between the two batches get written as well.
  // NOTE(review): the upper bound is inclusive, so the previous batch's oldest
  // day is written again using only this batch's data — confirm callers rely on that.
  const previousMin = Number(await redis('get', redisKey));
  if (previousMin && previousMin > maxTimestamp) maxTimestamp = previousMin;

  // Insert or update one record per day in [minTimestamp, maxTimestamp]
  let timestamp = minTimestamp;
  while (timestamp <= maxTimestamp) {
    const date = new Date(timestamp);
    const stats = statsByDay.get(String(date));
    await this.findOneAndUpdate(
      { msgBiz, date },
      {
        pubCount: stats ? stats.pubTimes.size : 0,
        postCount: stats ? stats.postCount : 0,
      },
      { upsert: true, new: true }
    );
    timestamp += DAY_MS;
  }

  // The minimum day of this batch becomes the upper bound of the next one
  await redis('set', redisKey, minTimestamp, 'EX', SAVE_LAST_MIN_EX);
};
82 |
// Given the desired minimum time, return the minimum time that still has to
// be crawled: the first day (from the start day up to today) that has no
// matching record, or today when every day is covered.
ProfilePubRecord.statics.getMinTargetTime = async function (msgBiz, minTime = MIN_TIME) {
  let startTime = moment(minTime).startOf('day').toDate();

  const records = await this.find({
    msgBiz,
    date: { $gte: startTime },
  }).sort('date');
  if (records.length === 0) return startTime;

  // Do not start before the day of the profile's first post
  const profile = await mongoose.model('Profile').findOne({ msgBiz });
  if (!profile) return startTime;
  const { firstPublishAt } = profile;
  if (firstPublishAt && firstPublishAt > startTime) {
    startTime = firstPublishAt;
  }

  const today = moment().startOf('day').toDate();
  const lastTimestamp = today.getTime();

  // Walk day by day alongside the sorted records; stop at the first gap
  let index = 0;
  let timestamp = startTime.getTime();
  while (timestamp <= lastTimestamp) {
    const record = records[index];
    const isCovered = Boolean(record) && record.date.getTime() === timestamp;
    if (!isCovered) return new Date(timestamp);
    index += 1;
    timestamp += DAY_MS;
  }
  return today;
};
109 |
110 | mongoose.model('ProfilePubRecord', ProfilePubRecord);
111 |
112 | // 查询哪些公众号每天可以发布多次
113 | // db.profilepubrecords.aggregate([
114 | // { $match: { pubCount: { $gt: 1 } } },
115 | // { $group: { _id: '$msgBiz', pubCount: { $max: '$pubCount' } } },
116 | // { $sort: { pubCount: -1 } },
117 | // { $lookup: {
118 | // from: 'profiles',
119 | // localField: '_id',
120 | // foreignField: 'msgBiz',
121 | // as: 'profile'
122 | // } }
123 | // ]).pretty()
124 |
--------------------------------------------------------------------------------
/models/index.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const mongoose = require('mongoose');
4 | const path = require('path');
5 | const logger = require('../utils/logger');
6 |
7 | mongoose.Promise = global.Promise;
8 |
9 | // 载入 mongoose 插件
10 | require('./plugins/paginator');
11 |
12 | const config = require('../config');
13 |
// Connect to MongoDB; a failed connection is logged and terminates the process.
const connectOptions = {
  useNewUrlParser: true,
  useUnifiedTopology: true,
};

mongoose.connect(config.mongodb.db, connectOptions, (err) => {
  if (!err) return;
  logger.warn('connect to mongodb failed');
  logger.error(err);
  process.exit(1);
});

// Query debugging: off in production, on in development, untouched otherwise
if (config.isProd || config.isDev) {
  mongoose.set('debug', Boolean(config.isDev));
}

// Load all models and re-export them by name
const MODEL_NAMES = ['Post', 'Profile', 'Comment', 'ProfilePubRecord'];
for (const modelName of MODEL_NAMES) {
  require(path.join(__dirname, modelName));
  exports[modelName] = mongoose.model(modelName);
}
42 |
--------------------------------------------------------------------------------
/models/plugins/paginator.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | /**
4 | * Pagination Plugin
5 | */
6 | const _ = require('lodash');
7 | const Query = require('mongoose').Query;
8 |
const DEFAULTS = {
  perPage: 10, // items per page
  page: 1, // initial page
  offset: 0, // number of leading documents to skip in addition to paging
  maxPerPage: 100 // upper bound for perPage
};
const DEFAULT_KEYS = Object.keys(DEFAULTS);

/**
 * Run the query for one page and return the documents plus paging metadata.
 *
 * Numeric options are sanitised: values that are not finite numbers fall
 * back to the defaults, `page` and `perPage` are at least 1, `offset` is at
 * least 0 and `perPage` never exceeds `maxPerPage`.
 *
 * @param {Object} [options] perPage, page, offset, maxPerPage and an optional
 *   precomputed `count` (skips the count query when truthy)
 * @param {Function} [callback] optional node-style callback `(err, result)`
 * @returns {Promise<Object>} the result object, or the callback's return
 *   value when a callback is supplied
 */
Query.prototype.paginate = async function (options, callback) {
  const opts = _.assign({}, DEFAULTS, options);

  // Convert the paging options to numbers, falling back to the defaults
  DEFAULT_KEYS.forEach(k => {
    const value = Number(opts[k]);
    opts[k] = Number.isFinite(value) ? value : DEFAULTS[k];
  });
  opts.maxPerPage = Math.max(1, Math.floor(opts.maxPerPage));
  opts.perPage = Math.min(Math.max(1, Math.floor(opts.perPage)), opts.maxPerPage);
  opts.page = Math.max(1, Math.floor(opts.page));
  opts.offset = Math.max(0, Math.floor(opts.offset));

  const hasCallback = typeof callback === 'function';

  const query = this;
  const model = query.model;
  const conditions = query._conditions;

  // A count passed in from outside saves the count query
  let count = Number(opts.count) || 0;

  let result;
  try {
    if (!count) count = await model.where(conditions).countDocuments();

    const skip = (opts.page - 1) * opts.perPage + opts.offset;
    const data = await query.skip(skip).limit(opts.perPage);

    const current = opts.page;

    // Documents that remain once the offset is taken into account
    const offsetCount = Math.max(count - opts.offset, 0);
    const totalPages = Math.ceil(offsetCount / opts.perPage);

    // No neighbours when there is nothing to page through; `next` is also
    // null when the requested page is already at or beyond the last page.
    const prev = offsetCount && current > 1 ? current - 1 : null;
    const next = offsetCount && current < totalPages ? current + 1 : null;

    result = {
      data: data || [],
      options: opts,
      current: current,
      next: next,
      prev: prev,
      totalPages: totalPages,
      count: count,

      // metadata that can be returned to the front end as is
      metadata: {
        count,
        totalPages,
        currentPage: current,
        perPage: opts.perPage,
      },
    };
  } catch (e) {
    if (hasCallback) {
      callback(e);
      return;
    }
    throw e;
  }

  // Invoked outside the try block so an exception thrown by the callback
  // itself does not trigger a second, error-style invocation.
  return hasCallback ? callback(null, result) : result;
};
87 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "wechat_spider",
3 | "version": "1.2.0",
4 | "description": "wechat spider by Man-in-the-middle attack",
5 | "main": "index.js",
6 | "scripts": {
7 | "start": "node index.js",
8 | "dev": "nodemon index.js --ignore client/ --ignore test/",
9 | "prod": "NODE_ENV=production npm start",
10 | "test": "mocha --recursive --exit --timeout 10000"
11 | },
12 | "author": "liqiang",
13 | "license": "MIT",
14 | "dependencies": {
15 | "anyproxy": "^4.1.3",
16 | "cheerio": "^1.0.0-rc.12",
17 | "express": "^4.18.1",
18 | "ip": "^1.1.8",
19 | "json2csv": "3.11.5",
20 | "lodash": "^4.17.21",
21 | "moment": "^2.29.4",
22 | "mongoose": "^6.5.2",
23 | "morgan": "^1.10.0",
24 | "motime": "^0.0.2",
25 | "redis": "^2.8.0",
26 | "request": "^2.88.2",
27 | "request-promise": "^4.2.6",
28 | "winston": "^3.8.1"
29 | },
30 | "devDependencies": {
31 | "eslint": "^5.16.0",
32 | "eslint-plugin-react": "^7.14.3",
33 | "mocha": "^10.0.0",
34 | "nodemon": "^2.0.19"
35 | },
36 | "repository": {
37 | "type": "git",
38 | "url": "git+https://github.com/lqqyt2423/wechat_spider.git"
39 | },
40 | "keywords": [
41 | "wechat",
42 | "spider"
43 | ],
44 | "bugs": {
45 | "url": "https://github.com/lqqyt2423/wechat_spider/issues"
46 | },
47 | "homepage": "https://github.com/lqqyt2423/wechat_spider#readme"
48 | }
49 |
--------------------------------------------------------------------------------
/rule/basicAuth.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const config = require('../config');
4 |
5 | const { proxyBasicAuth } = config;
6 | const { enable, user, password } = proxyBasicAuth;
7 |
// Response sent when proxy authentication is required or has failed.
const authRes = {
  response: {
    statusCode: 407,
    header: { 'Proxy-Authenticate': 'Basic realm="Access to internal site"' },
  },
};

/**
 * Check the proxy basic-auth credentials of a request.
 *
 * @param {Object} headers request headers
 * @returns {Object|undefined} undefined when basic auth is disabled, not
 *   fully configured, or the credentials match; the 407 response otherwise
 */
function basicAuth(headers) {
  if (!enable) return;
  if (!user || !password) return;

  // Header names are case-insensitive, so do not rely on the exact casing
  const source = headers || {};
  const headerName = Object.keys(source).find(name => name.toLowerCase() === 'proxy-authorization');
  const rawAuth = headerName ? source[headerName] : undefined;
  if (!rawAuth) return authRes;

  // Strip the (case-insensitive) scheme and decode the credentials.
  // Buffer.from replaces the deprecated `new Buffer()` constructor.
  const encoded = String(rawAuth).replace(/^\s*Basic\s+/i, '');
  const credentials = Buffer.from(encoded, 'base64').toString();
  if (credentials !== `${user}:${password}`) return authRes;
}
28 |
29 | module.exports = basicAuth;
30 |
--------------------------------------------------------------------------------
/rule/getNextProfileLink.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | // 获取下一个抓取历史记录的页面地址
4 |
5 | const moment = require('moment');
6 | const redis = require('../utils/redis');
7 | const config = require('../config');
8 | const models = require('../models');
9 |
10 | // 配置抓取方式
11 | const RULE_FN = normalMode;
12 | const CACHE_LIMIT = 100;
13 |
14 | const {
15 | rule: ruleConfig,
16 | redis: redisConfig,
17 | } = config;
18 | const { profile: profileConfig } = ruleConfig;
19 | const { PROFILE_LIST_KEY } = redisConfig;
20 |
21 |
// Normal mode: pick the profiles whose history page has not been opened
// since `maxUpdatedAt` (or never), oldest first, optionally restricted to
// and ordered by `targetBiz`. Resolves to an array of msgBiz strings.
async function normalMode() {
  const { maxUpdatedAt, targetBiz } = profileConfig;
  const hasTarget = Boolean(targetBiz && targetBiz.length > 0);

  const searchQuery = {
    msgBiz: { $exists: true },
    $or: [
      { openHistoryPageAt: { $lte: maxUpdatedAt } },
      { openHistoryPageAt: { $exists: false } }
    ]
  };

  if (hasTarget) searchQuery.msgBiz = { $in: targetBiz };

  const profiles = await models.Profile.find(searchQuery)
    .sort('openHistoryPageAt')
    .select('msgBiz')
    .limit(CACHE_LIMIT);
  if (!(profiles && profiles.length > 0)) return [];

  const bizs = profiles.map(p => p.msgBiz);

  if (hasTarget) {
    // Order by position in targetBiz. The ranks are looked up once, and the
    // comparator is a consistent numeric difference (the previous one never
    // returned 0 and scanned targetBiz twice per comparison).
    const rank = new Map();
    targetBiz.forEach((biz, index) => {
      if (!rank.has(biz)) rank.set(biz, index);
    });
    const rankOf = biz => (rank.has(biz) ? rank.get(biz) : -1);
    bizs.sort((a, b) => rankOf(a) - rankOf(b));
  }

  return bizs;
}
56 |
// Update mode

// The ideal crawl rules:
// 1. A profile that already published today is removed from today's crawl list,
//    except profiles that may publish several times a day; this needs fields
//    such as the daily publish frequency and the latest publish time
// 2. It should be somewhat smart: predict the profiles to crawl from the average
//    publish time, the average publish interval, the time the history page was
//    last opened, and so on

// Only rule 1 is implemented for now
async function updateMode() {
  const startOfToday = moment().startOf('day').toDate();

  // Candidates: may publish several times a day, has not published today,
  // or has no known latest publish time
  const candidateConditions = [
    { maxDayPubCount: { $gt: 1 } },
    { latestPublishAt: { $lt: startOfToday } },
    { latestPublishAt: { $exists: false } },
  ];
  const query = { msgBiz: { $exists: true }, $or: candidateConditions };

  const profiles = await models.Profile.find(query).sort('openHistoryPageAt')
    .select('msgBiz')
    .limit(CACHE_LIMIT);

  const bizs = [];
  for (const profile of profiles) bizs.push(profile.msgBiz);
  return bizs;
}
80 |
// Return the next profile history link: pop one from the redis list, or
// refill the list from the database first when it is empty. Resolves to
// undefined when the database yields no profile either.
async function getNextProfileLink() {
  const cachedLink = await redis('lpop', PROFILE_LIST_KEY);
  if (cachedLink) return cachedLink;

  const bizs = await RULE_FN();
  if (bizs.length === 0) return;

  // Put the links built from the database result into redis
  const links = bizs.map(bizToLink);
  await redis('rpush', PROFILE_LIST_KEY, links);

  // Asking again now yields the next link
  const nextLink = await getNextProfileLink();
  return nextLink;
}
96 |
97 | exports = module.exports = getNextProfileLink;
98 |
99 | // 必须有一个链接返回
100 | exports.must = async function () {
101 | const link = await getNextProfileLink();
102 | if (link) return link;
103 | return bizToLink('MjM5ODIyMTE0MA==');
104 | };
105 |
// Build the profile history page URL for a biz value (inserted as is, not URL-encoded).
function bizToLink(biz) {
  const prefix = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=';
  const suffix = '&scene=124#wechat_redirect';
  return prefix.concat(biz, suffix);
}
109 |
--------------------------------------------------------------------------------
/rule/handleImg/index.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const fs = require('fs');
4 | const path = require('path');
5 | const config = require('../../config');
6 |
7 | const img = fs.readFileSync(path.join(__dirname, 'replaceImg.png'));
8 |
9 | module.exports = function () {
10 | if (!config.rule.isReplaceImg) return;
11 | return {
12 | response: {
13 | statusCode: 200,
14 | header: { 'content-type': 'image/png' },
15 | body: img
16 | }
17 | };
18 | };
19 |
--------------------------------------------------------------------------------
/rule/handleImg/replaceImg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lqqyt2423/wechat_spider/17bca49c5a2606f6b66c7ba194dd54ac180dadda/rule/handleImg/replaceImg.png
--------------------------------------------------------------------------------
/rule/handlePostPage.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const url = require('url');
4 | const models = require('../models');
5 | const logger = require('../utils/logger');
6 | const config = require('../config');
7 | const { getNextPostLink, debugInfo } = require('./postLink');
8 | const { getPostDetail } = require('./savePostsData');
9 |
10 | const { rule: ruleConfig } = config;
11 | const {
12 | isReplacePostBody,
13 | isCrawlComments,
14 | page: pageConfig,
15 | } = ruleConfig;
16 |
// Read the read / like / "wow" counts from the response body and store them
// on the post identified by the __biz, mid and idx fields of the request body.
//
// Nothing is written when the response carries no `appmsgstat` or the
// request lacks one of the three identifiers; previously the former threw a
// TypeError and the latter upserted a post without identity.
const getReadAndLikeNum = async function (ctx) {
  const { req, res } = ctx;

  const body = res.response.body.toString();
  const data = JSON.parse(body);
  const stat = data && data.appmsgstat;
  if (!stat) {
    logger.warn('[getReadAndLikeNum] response has no appmsgstat, skipped');
    return;
  }
  const { read_num, like_num, old_like_num } = stat;
  const [readNum, likeNum2, likeNum] = [read_num, like_num, old_like_num];

  // Parse the form-encoded request body. Only the first "=" separates key
  // and value, so values that contain "=" themselves (e.g. base64 padding
  // in __biz) are kept intact.
  const { requestData } = req;
  const reqObj = Object.create(null);
  for (const pair of String(requestData).split('&')) {
    if (!pair) continue;
    const sep = pair.indexOf('=');
    const key = sep === -1 ? pair : pair.slice(0, sep);
    const rawValue = sep === -1 ? '' : pair.slice(sep + 1);
    try {
      reqObj[key] = decodeURIComponent(rawValue);
    } catch (e) {
      // Malformed percent-encoding: keep the raw value rather than fail
      reqObj[key] = rawValue;
    }
  }
  const { __biz, mid, idx } = reqObj;
  const [msgBiz, msgMid, msgIdx] = [__biz, mid, idx];
  if (!msgBiz || !msgMid || !msgIdx) {
    logger.warn('[getReadAndLikeNum] request has no __biz/mid/idx, skipped');
    return;
  }

  const post = await models.Post.findOneAndUpdate(
    { msgBiz, msgMid, msgIdx },
    { readNum, likeNum, likeNum2, updateNumAt: new Date() },
    { new: true, upsert: true }
  );

  logger.info('[获取文章阅读点赞] id: %s, title: %s, 阅读: %s, 赞: %s, 在看: %s', post.id, post.title, readNum, likeNum, likeNum2);
  logger.info(await debugInfo());
};
46 |
/**
 * Save the basic information of an article from an intercepted page.
 * (The same data could also be obtained with a direct HTTP request.)
 *
 * Passes the request URL and the stringified response body to
 * `getPostDetail`.
 *
 * @param {{req: {url: string}, res: {response: {body: *}}}} ctx
 * @returns {Promise<void>}
 */
const getPostBasicInfo = async function (ctx) {
  const { url: link } = ctx.req;
  const html = ctx.res.response.body.toString();

  await getPostDetail(link, html);
};
55 |
56 | // 注入控制代码至手机前端,实现功能:
57 | // 手机上文章正文显示自定义
58 | // 自动跳转至下一文章详情页
59 | const handlePostHtml = async function (ctx) {
60 | const { res } = ctx;
61 | const { response } = res;
62 | let body = response.body.toString();
63 |
64 | // 替换显示在手机上的正文 加速网络
65 | if (isReplacePostBody) {
66 | const info = await debugInfo();
67 | body = body.replace(/()((?:\s|\S)+?)(<\/div>\s+?
128 |
--------------------------------------------------------------------------------
/rule/postLink.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const models = require('../models');
4 | const redis = require('../utils/redis');
5 | const config = require('../config');
6 |
7 | const {
8 | rule: ruleConfig,
9 | redis: redisConfig,
10 | } = config;
11 |
12 | // 链接数组的缓存
13 | // POST_LIST_KEY: 初始跳转规则,优先级低,且每次程序重启会清空
14 | const { POST_LIST_KEY } = redisConfig;
15 |
16 | const {
17 | page: pageConfig,
18 | } = ruleConfig;
19 |
/**
 * Get the link of the next article to visit.
 *
 * Pops a link from the Redis list `POST_LIST_KEY`. When the list is empty it
 * is refilled from MongoDB with posts matching the configured time window
 * (or posts that have no `publishAt`), the optional `targetBiz` filter and
 * the "already crawled" rules, and then popped again.
 *
 * Posts whose `link` is empty are ignored: queueing them would make `lpop`
 * return a falsy value, which this function treats as "list empty", causing
 * an endless query / refill cycle.
 *
 * @returns {Promise<string|undefined>} the next link, or `undefined` when
 *   nothing is left to crawl
 */
async function getNextPostLink() {
  // Prefer links that are already queued in Redis.
  const queuedLink = await redis('lpop', POST_LIST_KEY);
  if (queuedLink) return queuedLink;

  // Nothing queued: look for candidates in the database.
  const { minTime, maxTime, isCrawlExist, targetBiz, crawlExistInterval } = pageConfig;

  const searchQuery = {
    isFail: null,
    link: { $exists: true },
    $or: [
      { publishAt: { $gte: minTime, $lte: maxTime } },
      { publishAt: null },
    ],
  };

  if (targetBiz && targetBiz.length > 0) searchQuery.msgBiz = { $in: targetBiz };

  // When re-crawling is disabled, only posts without counters qualify.
  if (!isCrawlExist) searchQuery.updateNumAt = null;

  const posts = await models.Post.find(searchQuery).select('link publishAt updateNumAt');

  const links = (posts || [])
    .filter(post => {
      if (!post.link) return false;
      if (!isCrawlExist) return true;

      const { publishAt, updateNumAt } = post;
      if (!updateNumAt || !publishAt) return true;

      // Skip posts whose counters were captured more than
      // `crawlExistInterval` ms after publication.
      const elapsed = new Date(updateNumAt).getTime() - new Date(publishAt).getTime();
      return !(elapsed > crawlExistInterval);
    })
    .map(post => post.link);

  // Still nothing: everything has been crawled.
  if (links.length === 0) return;

  // Queue the links in Redis, then pop the first one.
  await redis('rpush', POST_LIST_KEY, links);

  return getNextPostLink();
}
73 |
/**
 * Tell whether a link points to a WeChat article page.
 *
 * Three URL shapes are recognised: the `s?__biz` form, the older
 * `mp/appmsg/show` form, and the short form with a 22-character id.
 *
 * @param {string} link
 * @returns {boolean}
 */
function isPostPage(link) {
  const articlePatterns = [
    /mp\.weixin\.qq\.com\/s\?__biz/,
    /mp\/appmsg\/show/,
    /mp\.weixin\.qq\.com\/s\/(\w|-){22}/,
  ];
  return articlePatterns.some(pattern => pattern.test(link));
}
82 |
/**
 * Build a debug message reporting how many article links are still queued
 * in the Redis list `POST_LIST_KEY`.
 *
 * @returns {Promise<string>}
 */
async function debugInfo() {
  const remaining = await redis('llen', POST_LIST_KEY);
  const message = `剩余文章抓取长度: ${remaining}`;
  return message;
}
88 |
89 | module.exports = {
90 | getNextPostLink,
91 | isPostPage,
92 | debugInfo,
93 | };
94 |
--------------------------------------------------------------------------------
/rule/savePostsData.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const url = require('url');
4 | const moment = require('moment');
5 | const models = require('../models');
6 | const logger = require('../utils/logger');
7 | const redis = require('../utils/redis');
8 | const config = require('../config');
9 | const helper = require('../utils/helper');
10 | const ContentHandler = require('../utils/contentHandler');
11 |
12 | const {
13 | redis: redisConfig,
14 | rule: ruleConfig,
15 | } = config;
16 |
17 | const {
18 | page: pageConfig,
19 | } = ruleConfig;
20 |
21 | // 链接数组的缓存 每次重启程序后都会清空
22 | const { PROFILE_LIST_KEY } = redisConfig;
23 |
/**
 * Profile lookup helper that avoids redundant database queries.
 *
 * - Results (including "not found", stored as `null`) are cached per msgBiz
 *   in `profileMap`.
 * - Concurrent lookups of the same msgBiz share one in-flight query, tracked
 *   in `profileWaitingMap` (msgBiz -> pending Promise).
 * - A failed query rejects every caller waiting on it and is not cached, so
 *   a later call retries instead of waiting forever.
 */
class FindProfileHandler {
  constructor() {
    this.profileMap = new Map();
    this.profileWaitingMap = new Map();
  }

  /**
   * Find the profile document for a msgBiz.
   *
   * @param {string} biz - the account's msgBiz
   * @returns {Promise<Object|null>} the profile document, or `null` when
   *   none exists
   * @throws rejects with the error raised by `models.Profile.findOne`
   */
  async find(biz) {
    if (this.profileMap.has(biz)) return this.profileMap.get(biz);

    // Somebody is already querying this msgBiz: share that query.
    const inFlight = this.profileWaitingMap.get(biz);
    if (inFlight) return inFlight;

    // First caller: start the query. The async wrapper turns a synchronous
    // throw from findOne into a rejection as well.
    const lookup = (async () => (await models.Profile.findOne({ msgBiz: biz })) || null)();
    this.profileWaitingMap.set(biz, lookup);

    try {
      const doc = await lookup;
      this.profileMap.set(biz, doc);
      return doc;
    } finally {
      // Always release the slot, otherwise a failed query would block every
      // later lookup of this msgBiz.
      this.profileWaitingMap.delete(biz);
    }
  }
}
62 |
/**
 * Save the basic information of the posts in a history-page list.
 *
 * Every list entry contributes its main message plus any messages in
 * `multi_app_msg_item_list`, all sharing the entry's publish time. Messages
 * lacking a title or link are skipped; the rest are upserted by
 * (msgBiz, msgMid, msgIdx). Afterwards the saved posts are logged, handed to
 * `ProfilePubRecord.savePubRecords`, and the remaining profile queue length
 * is logged.
 *
 * @param {Array<Object>} postList - raw entries with `app_msg_ext_info` and
 *   `comm_msg_info`
 * @returns {Promise<Array<Object>>} the upserted Post documents
 */
async function savePostsData(postList) {
  // Flatten the list into { appMsg, publishAt } entries.
  const entries = [];
  for (const rawPost of postList) {
    const mainMsg = rawPost.app_msg_ext_info;
    if (!mainMsg) continue;

    const publishAt = new Date(rawPost.comm_msg_info.datetime * 1000);
    entries.push({ appMsg: mainMsg, publishAt });

    const subMsgs = mainMsg.multi_app_msg_item_list;
    if (subMsgs && subMsgs.length > 0) {
      for (const subMsg of subMsgs) {
        entries.push({ appMsg: subMsg, publishAt });
      }
    }
  }

  const upsertEntry = async ({ appMsg, publishAt }) => {
    if (!(appMsg.title && appMsg.content_url)) return;

    const link = helper.escape2Html(appMsg.content_url);
    const title = helper.escape2Html(appMsg.title);

    const { query: linkQuery } = url.parse(link, true);
    const identity = {
      msgBiz: linkQuery.__biz,
      msgMid: linkQuery.mid,
      msgIdx: linkQuery.idx,
    };

    const fields = {
      title,
      link,
      publishAt,
      cover: appMsg.cover,
      digest: appMsg.digest,
      sourceUrl: appMsg.source_url,
      author: appMsg.author,
      copyrightStat: appMsg.copyright_stat,
    };

    return models.Post.findOneAndUpdate(
      identity,
      { $set: fields },
      { new: true, upsert: true }
    );
  };

  const upserted = await Promise.all(entries.map(entry => upsertEntry(entry)));
  const savedPosts = upserted.filter(Boolean);

  if (savedPosts.length) {
    // Helper for looking up the profile.
    const findProfileHandler = new FindProfileHandler();
    const [{ msgBiz: firstBiz }] = savedPosts;
    const profile = await findProfileHandler.find(firstBiz);
    if (profile && profile.title) {
      logger.info('[profile] msgBiz: %s, title: %s', savedPosts[0].msgBiz, profile.title);
    }
  }

  for (const saved of savedPosts) {
    const publishedText = saved.publishAt ? moment(saved.publishAt).format('YYYY-MM-DD HH:mm') : '';
    logger.info('[抓取历史文章] 发布时间: %s, 标题: %s', publishedText, saved.title);
  }

  // Record the account's publishing history.
  await models.ProfilePubRecord.savePubRecords(savedPosts);

  const remainingProfiles = await redis('llen', PROFILE_LIST_KEY);
  logger.info('剩余公众号抓取长度: %s', remainingProfiles);

  return savedPosts;
}
128 |
129 |
/**
 * Fetch / parse an article page and persist what was learned from it.
 *
 * Steps, in order:
 *   1. Parse the page with ContentHandler; give up (with a warning) when the
 *      article cannot be identified.
 *   2. If the article is flagged as failed, mark the Post `isFail` and stop.
 *   3. Upsert the Post's basic fields.
 *   4. Upsert the owning Profile's basic fields.
 *   5. When `pageConfig.isSavePostContent` is set, store the text content
 *      and, if `pageConfig.saveContentType` is 'html', the HTML as well.
 *
 * Only truthy values are written, so existing data is never overwritten
 * with blanks.
 *
 * @param {string} link - article link (required)
 * @param {string} [body] - page HTML, when already available
 * @returns {Promise<void>}
 */
async function getPostDetail(link, body) {
  if (!link) return;

  const handler = new ContentHandler({ link, body });
  const detail = await handler.getDetail();
  if (!detail) {
    logger.warn('[getPostDetail] can not get identify, link: %s', link);
    return;
  }

  const { msgBiz, msgMid, msgIdx } = detail;
  // A fresh key object per query.
  const postKey = () => ({ msgBiz, msgMid, msgIdx });
  // Copy every truthy candidate value onto `target`.
  const assignTruthy = (target, candidates) => {
    Object.keys(candidates).forEach(name => {
      if (candidates[name]) target[name] = candidates[name];
    });
    return target;
  };

  if (detail.isFail) {
    await models.Post.findOneAndUpdate(postKey(), { isFail: true }, { upsert: true });
    return;
  }

  const {
    wechatId,
    username,
    title,
    publishAt,
    sourceUrl,
    cover,
    digest,
    headimg,
    nickname,
  } = detail;

  // Post: basic information.
  const postFields = assignTruthy(
    { msgBiz, msgMid, msgIdx, link },
    { title, wechatId, publishAt, sourceUrl, cover, digest }
  );
  await models.Post.findOneAndUpdate(postKey(), { $set: postFields }, { upsert: true });
  logger.info('[save post basic info] %s %s %s %s', msgBiz, msgMid, msgIdx, title);

  // Profile: basic information derived from the post.
  const profileFields = assignTruthy(
    { msgBiz },
    { title: nickname, wechatId, username, headimg }
  );
  await models.Profile.findOneAndUpdate({ msgBiz }, { $set: profileFields }, { upsert: true });
  logger.info('[save profile basic info from post] %s %s %s %s %s', msgBiz, nickname, wechatId, username, headimg);

  // Post: body content.
  if (!pageConfig.isSavePostContent) return;

  const html = pageConfig.saveContentType === 'html' ? await handler.toHtml() : undefined;
  const content = await handler.toText();
  if (!(content || html)) return;

  const contentFields = assignTruthy(postKey(), { content, html });
  await models.Post.findOneAndUpdate(postKey(), { $set: contentFields }, { upsert: true });
  logger.info('[save post content] %s %s %s %s', msgBiz, msgMid, msgIdx, title);
}
221 |
/**
 * Upsert one post or a list of posts, keyed by (msgBiz, msgMid, msgIdx).
 *
 * @param {Object|Array<Object>} posts - a single post or an array of posts
 * @returns {Promise<Object|null|Array<Object|null>|undefined>} mirrors the
 *   input shape: an array of results for an array, one result for a single
 *   post, `undefined` for a falsy argument. A post missing any of the three
 *   key fields yields `null` and is not written.
 */
async function upsertPosts(posts) {
  if (!posts) return;

  const wasList = Array.isArray(posts);
  const list = wasList ? posts : [posts];

  const upsertOne = async post => {
    const { msgBiz, msgMid, msgIdx } = post;
    if (!(msgBiz && msgMid && msgIdx)) return null;

    return models.Post.findOneAndUpdate(
      { msgBiz, msgMid, msgIdx },
      { $set: post },
      { new: true, upsert: true }
    );
  };

  const results = await Promise.all(list.map(post => upsertOne(post)));

  return wasList ? results : results[0];
}
243 |
244 | exports = module.exports = savePostsData;
245 | exports.getPostDetail = getPostDetail;
246 | exports.FindProfileHandler = FindProfileHandler;
247 | exports.upsertPosts = upsertPosts;
248 |
--------------------------------------------------------------------------------
/scripts/checkWechatId.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const models = require('../models');
4 | const CorrectWechatId = require('../utils/correctWechatId');
5 |
// Entry point: apply the manual fixes, run the automatic check, then exit.
// A failure in either step is printed and reported through a non-zero exit
// status instead of being left as an unhandled rejection.
(async function start() {
  await fixWechatId();
  await checkWechatId();
})().then(() => {
  process.exit();
}).catch(err => {
  console.error(err);
  process.exit(1);
});
11 |
/**
 * Reconcile wechatId values across all posts, per account (msgBiz).
 *
 * Accounts whose posts agree on exactly one non-empty wechatId that contains
 * no Chinese characters are corrected automatically through CorrectWechatId.
 * Every other account is printed to the console for manual fixing.
 *
 * @returns {Promise<void>}
 */
async function checkWechatId() {
  const posts = await models.Post.find({
    msgBiz: { $exists: true },
    wechatId: { $exists: true }
  }).select('msgBiz wechatId');

  // msgBiz -> distinct wechatId values, in order of first appearance.
  const idsByBiz = {};
  for (const { msgBiz, wechatId } of posts) {
    const known = idsByBiz[msgBiz];
    if (!known) {
      idsByBiz[msgBiz] = [wechatId];
    } else if (known.indexOf(wechatId) === -1) {
      known.push(wechatId);
    }
  }

  // Split the accounts into those with exactly one usable wechatId and the
  // rest.
  const resolvable = [];
  const needManualFix = [];
  for (const msgBiz of Object.keys(idsByBiz)) {
    const wechatIds = idsByBiz[msgBiz];
    const usableIds = wechatIds.filter(id => id);
    // A "wechatId" containing Chinese characters is not a real id.
    const isResolvable = usableIds.length === 1 && !/[\u4e00-\u9fa5]/.test(usableIds[0]);
    if (isResolvable) {
      resolvable.push({ msgBiz, wechatId: usableIds[0] });
    } else {
      needManualFix.push({ msgBiz, wechatIds });
    }
  }

  // Update the database records.
  for (const { msgBiz, wechatId } of resolvable) {
    const corrector = new CorrectWechatId({ msgBiz, wechatId });
    await corrector.checkPost();
    await corrector.checkProfile();
  }

  // The remaining accounts have to be resolved by hand.
  for (const { msgBiz, wechatIds } of needManualFix) {
    console.log('msgBiz:', msgBiz);
    console.log('wechatIds', wechatIds.filter(id => id).join(', '));
    console.log();
  }
}
71 |
/**
 * Apply the hand-maintained wechatId corrections.
 *
 * Each listed account is pushed through CorrectWechatId (posts first, then
 * the profile), one account at a time.
 *
 * @returns {Promise<void>}
 */
async function fixWechatId() {
  const manualFixes = [
    { wechatId: 'JinanUniversity', msgBiz: 'MjM5OTQwMTE0Mw==' },
    { wechatId: 'gzchfb', msgBiz: 'MzIyODgwMTA3Mg==' },
  ];

  // Update the database records.
  for (const { msgBiz, wechatId } of manualFixes) {
    const corrector = new CorrectWechatId({ msgBiz, wechatId });
    await corrector.checkPost();
    await corrector.checkProfile();
  }
}
89 |
--------------------------------------------------------------------------------
/server/api/conf.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const express = require('express');
4 | const merge = require('../../utils/merge');
5 | const wrap = require('../wrap');
6 | const config = require('../../config');
7 |
8 | const api = express();
9 |
// GET / - return the current in-memory configuration object.
// NOTE(review): the whole config object is sent back as-is; confirm it holds
// nothing that should stay private.
api.get('/', wrap(async (req, res) => {
  const payload = { data: config };
  res.json(payload);
}));

// PUT / - merge the request body into the in-memory configuration object.
// NOTE(review): the body is merged without validation; confirm that `merge`
// is safe for untrusted input and that this route is access-controlled.
api.put('/', wrap(async (req, res) => {
  const { body: patch } = req;
  merge(config, patch);
  res.json({ state: 1, message: '更新配置成功' });
}));
21 |
22 | module.exports = api;
23 |
--------------------------------------------------------------------------------
/server/api/index.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const _ = require('lodash');
4 | const express = require('express');
5 | const api = express();
6 | const config = require('../../config');
7 | const models = require('../../models');
8 | const utils = require('../../utils');
9 | const wrap = require('../wrap');
10 | const conf = require('./conf');
11 |
12 | api.use('/conf', conf);
13 |
/**
 * Build the empty paginated response used when a filter can match nothing.
 *
 * @param {number|string} page - requested page, echoed back as `currentPage`
 * @param {number|string} perPage - requested page size, echoed back
 * @returns {{metadata: Object, data: Array}}
 */
const nullRes = (page, perPage) => ({
  metadata: { count: 0, totalPages: 0, currentPage: page, perPage },
  data: [],
});
25 |
// posts api
// Query parameters:
//   target   - 'true' restricts results to the accounts listed in
//              config.rule.page.targetBiz
//   mainData - 'true' keeps only posts that have a readNum,
//              'false' keeps only posts that have none
//   msgBiz   - comma-separated msgBiz values to filter by
//   q        - title search term (case-insensitive, regex characters escaped)
//   sortWay  - -updateNumAt | updateNumAt | -publishAt | publishAt
//              (anything else falls back to -publishAt)
//   page, perPage - pagination, defaulting to 1 and 20
api.get('/posts', wrap(async (req, res) => {
  const { target, mainData, msgBiz, sortWay, q, page = 1, perPage = 20 } = req.query;

  const query = {};
  // msgBiz lists contributed by each filter; their intersection is applied.
  const bizsArr = [];

  if (q) query.title = new RegExp(_.escapeRegExp(q), 'i');
  if (target === 'true') {
    const targetBiz = config.rule.page.targetBiz;
    // Push the configured list itself (config has no top-level `targetBiz`).
    if (targetBiz && targetBiz.length) bizsArr.push(targetBiz);
  }
  if (mainData === 'true') query.readNum = { $exists: true };
  if (mainData === 'false') query.readNum = { $exists: false };
  if (msgBiz) bizsArr.push(msgBiz.split(','));

  if (bizsArr.length) {
    const msgBizs = _.intersection(...bizsArr);
    // Empty intersection: nothing can match, answer with an empty page.
    if (!msgBizs.length) {
      return res.json(nullRes(page, perPage));
    }
    query.msgBiz = { $in: msgBizs };
  }

  const sortOptions = new Map([
    ['-updateNumAt', { updateNumAt: -1 }],
    ['updateNumAt', { updateNumAt: 1 }],
    ['-publishAt', { publishAt: -1, msgIdx: 1 }],
    ['publishAt', { publishAt: 1, msgIdx: 1 }],
  ]);
  const sortWayResult = sortOptions.get(sortWay) || { publishAt: -1, msgIdx: 1 };

  let { metadata, data } = await models.Post.find(query)
    .sort(sortWayResult)
    .populate('profile')
    .paginate({ page, perPage });

  // Expose a fixed set of fields, with defaults for missing values.
  data = data.map(i => {
    let profile = null;
    if (i.profile) {
      profile = {
        title: i.profile.title || '',
        headimg: i.profile.headimg || '',
      };
    }
    return {
      id: i.id,
      title: i.title || '',
      link: i.link || '',
      publishAt: i.publishAt || null,
      msgBiz: i.msgBiz || '',
      msgIdx: i.msgIdx || '',
      readNum: i.readNum || 0,
      likeNum: i.likeNum || 0,
      likeNum2: i.likeNum2 || 0,
      updateNumAt: i.updateNumAt || null,
      profile,
    };
  });
  res.json({ metadata, data });
}));
107 |
// show post api
// Responds with the post document, or 404 when no post has the given id
// (previously a missing post surfaced as a TypeError / HTTP 500).
api.get('/posts/:id', wrap(async (req, res) => {
  const { id } = req.params;
  const post = await models.Post.findById(id);
  if (!post) {
    res.status(404).json({ state: 0, message: '文章不存在' });
    return;
  }
  res.json({ data: post.toObject() });
}));
114 |
// update post api
// Only the whitelisted fields below are taken from the request body.
// TODO: access control, validation
api.put('/posts/:id', wrap(async (req, res) => {
  const { id: postId } = req.params;
  const editableFields = [
    'title', 'link', 'publishAt', 'readNum', 'likeNum', 'likeNum2',
    'msgBiz', 'msgMid', 'msgIdx', 'sourceUrl', 'cover', 'digest',
    'isFail', 'wechatId', 'updateNumAt', 'content',
  ];
  const changes = utils.extract(req.body, editableFields);
  await models.Post.findByIdAndUpdate(postId, changes);
  res.json({ state: 1, message: '更新文章成功' });
}));
124 |
// profiles api
// Query parameters:
//   target - 'true' restricts results to the accounts listed in
//            config.rule.profile.targetBiz
//   q      - title search term (case-insensitive, regex characters escaped)
//   page, perPage - pagination, defaulting to 1 and 20
api.get('/profiles', wrap(async (req, res) => {
  const { target, q, page = 1, perPage = 20 } = req.query;

  const query = {};
  // msgBiz lists contributed by each filter; their intersection is applied.
  const bizsArr = [];

  if (q) query.title = new RegExp(_.escapeRegExp(q), 'i');
  if (target === 'true') {
    const targetBiz = config.rule.profile.targetBiz;
    // Push the configured list itself (config has no top-level `targetBiz`).
    if (targetBiz && targetBiz.length) bizsArr.push(targetBiz);
  }

  if (bizsArr.length) {
    const msgBizs = _.intersection(...bizsArr);
    // Empty intersection: nothing can match, answer with an empty page.
    if (!msgBizs.length) {
      return res.json(nullRes(page, perPage));
    }
    query.msgBiz = { $in: msgBizs };
  }

  let { metadata, data } = await models.Profile.find(query)
    .sort({ openHistoryPageAt: -1 })
    .paginate({ page, perPage });

  data = data.map(i => ({
    id: i.id,
    openHistoryPageAt: i.openHistoryPageAt || null,
    headimg: i.headimg || '',
    msgBiz: i.msgBiz || '',
    title: i.title || '',
  }));

  // Extra per-profile statistics (costly). The four queries of one profile
  // are independent, so they run in parallel; profiles are still handled
  // one after another.
  for (const item of data) {
    let postsAllCount = 0;
    let postsHasDataCount = 0;
    let newestPostTime = null;
    let oldestPostTime = null;

    if (item.msgBiz) {
      const { msgBiz } = item;
      const [allCount, hasDataCount, newest, oldest] = await Promise.all([
        models.Post.countDocuments({ msgBiz }),
        models.Post.countDocuments({ msgBiz, readNum: { $exists: true } }),
        models.Post.find({ msgBiz, publishAt: { $exists: true } }).sort({ publishAt: -1 }).limit(1),
        models.Post.find({ msgBiz, publishAt: { $exists: true } }).sort({ publishAt: 1 }).limit(1),
      ]);
      postsAllCount = allCount;
      postsHasDataCount = hasDataCount;
      newestPostTime = (newest[0] || {}).publishAt || null;
      oldestPostTime = (oldest[0] || {}).publishAt || null;
    }

    item.postsAllCount = postsAllCount;
    item.postsHasDataCount = postsHasDataCount;
    item.newestPostTime = newestPostTime;
    item.oldestPostTime = oldestPostTime;
  }

  res.json({ metadata, data });
}));
179 |
// single profile api
// Responds with the profile document, or 404 when no profile has the given
// id (previously a missing profile surfaced as a TypeError / HTTP 500).
api.get('/profiles/:id', wrap(async (req, res) => {
  const { id } = req.params;
  const profile = await models.Profile.findById(id);
  if (!profile) {
    res.status(404).json({ state: 0, message: '公众号不存在' });
    return;
  }
  res.json({ data: profile.toObject() });
}));
186 |
// profile update api
// Only the whitelisted fields below are taken from the request body.
api.put('/profiles/:id', wrap(async (req, res) => {
  const { id: profileId } = req.params;
  const editableFields = [
    'title', 'wechatId', 'desc', 'msgBiz', 'headimg',
    'openHistoryPageAt', 'province', 'city', 'firstPublishAt', 'property',
  ];
  const changes = utils.extract(req.body, editableFields);
  await models.Profile.findByIdAndUpdate(profileId, changes);
  res.json({ state: 1, message: '更新公众号成功' });
}));
195 |
196 | module.exports = api;
197 |
--------------------------------------------------------------------------------
/server/index.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const http = require('http');
4 | const express = require('express');
5 | const logger = require('morgan');
6 | const path = require('path');
7 | const app = express();
8 |
9 | const api = require('./api');
10 |
// Request logging and body parsing (JSON and URL-encoded forms).
const commonMiddleware = [
  logger('tiny'),
  express.json(),
  express.urlencoded({ extended: false }),
];
commonMiddleware.forEach(middleware => app.use(middleware));

app.use('/api', api);

// Front-end pages: serve the built client, and answer every other GET with
// its index.html.
const clientBuildDir = path.join(__dirname, '../client/build');
const clientIndexFile = path.join(__dirname, '../client/build/index.html');

app.use('/', express.static(clientBuildDir));
app.get('/*', (req, res) => {
  res.sendFile(clientIndexFile);
});
22 |
// Error handler. Express recognises error middleware by its four-parameter
// signature, so `next` must stay in the parameter list.
//
// Once headers have been sent, a new status / body can no longer be written
// (attempting to would throw), so the error is handed to Express's default
// handler instead.
app.use((error, req, res, next) => {
  console.log(error);
  if (res.headersSent) {
    next(error);
    return;
  }
  res.status(500).send(error.message);
});
30 |
31 | const server = http.createServer(app);
32 |
33 | module.exports = server;
34 |
--------------------------------------------------------------------------------
/server/wrap.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
/**
 * Wrap an API handler so that its failures reach Express's error handling.
 *
 * The handler may be async (a rejection is forwarded to `next`) or a plain
 * function (its return value is ignored; a synchronous throw is forwarded to
 * `next`). The handler is invoked with the same `this` as the wrapper.
 *
 * @param {Function} fn - handler `(req, res, next)`
 * @returns {Function} Express-compatible middleware
 */
function wrap(fn) {
  return function (req, res, next) {
    try {
      // Promise.resolve tolerates handlers that return a non-promise value.
      Promise.resolve(fn.call(this, req, res, next)).catch(next);
    } catch (err) {
      next(err);
    }
  };
}
9 |
10 | module.exports = wrap;
11 |
--------------------------------------------------------------------------------
/test/contentHandler.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const ContentHandler = require('../utils/contentHandler');
4 |
/**
 * Run every public ContentHandler operation against one article link and
 * print the results.
 *
 * @param {string} link - article URL
 * @returns {Promise<void>}
 */
async function test(link) {
  const handler = new ContentHandler({ link });
  console.log('\n==========\n');

  const identifying = await handler.getIdentifying();
  console.log('getIdentifying:', identifying);

  const detail = await handler.getDetail();
  console.log('getDetail', detail);

  const html = await handler.toHtml();
  console.log('toHtml', html);

  const text = await handler.toText();
  console.log('toText', text);
}
13 |
// Exercise the handler against two sample articles, one after the other.
(async () => {
  const sampleLinks = [
    // normal
    'https://mp.weixin.qq.com/s/ERQ09QilTRQESaCLoafzYA',
    // image
    'https://mp.weixin.qq.com/s/eUkqq_u_cBpqE8Bl5Yxk0w',
  ];

  for (const link of sampleLinks) {
    await test(link);
  }
})();
21 |
--------------------------------------------------------------------------------
/test/exportData.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const ExportData = require('../utils/exportData');
4 |
/**
 * Export the last 30 days of data for two sample accounts in each supported
 * format and print every result between banner lines.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const bizs = ['MjM5MjAyNDUyMA==', 'Mzk0MjE3NDE0Ng=='];
  const endAt = new Date();
  const thirtyDaysMs = 1000 * 60 * 60 * 24 * 30;
  const startAt = new Date(endAt.getTime() - thirtyDaysMs);

  const exportData = new ExportData({ msgBiz: bizs });

  // [banner label, ExportData method]
  const sections = [
    ['json', 'toJson'],
    ['csv', 'toCsv'],
    ['StaJson', 'toStaJson'],
    ['StaCsv', 'toStaCsv'],
  ];

  for (const [label, method] of sections) {
    const banner = `\n========== ${label} ==========\n`;
    console.info(banner);
    console.info(await exportData[method](startAt, endAt));
    console.info(banner);
  }
}
28 |
// Run the export, then terminate: status 0 on success; on failure print the
// error and exit with status 1.
(async () => {
  try {
    await run();
  } catch (err) {
    console.error(err);
    process.exit(1);
  }
  process.exit();
})();
35 |
--------------------------------------------------------------------------------
/test/models/Post.test.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const assert = require('assert');
4 | const models = require('../../models');
5 |
describe('Model Post', function() {
  // Upserting a list returns one document per input, and upserting the
  // first post again on its own resolves to the same document.
  it('statics upsert', async function() {
    const firstPost = {
      title: '返程高峰杀到,地铁二号线增加列车疏运广州南站返程客流',
      link: 'http://mp.weixin.qq.com/s?__biz=MzAwOTAwMzA5NQ==&mid=2710598126&idx=1&sn=c4cdc58d865883b4903af69bd686a4b6&chksm=bf0298d6887511c010386385db36ceb2dc817e88f41fc5b27b271c4d06ca6f4c954394e7ad31&scene=0#rd',
      publishAt: new Date('2018-10-05T08:05:39.000Z'),
      msgBiz: 'MzAwOTAwMzA5NQ==',
      msgMid: '2710598126',
      msgIdx: '1',
      cover: 'http://mmbiz.qpic.cn/mmbiz_jpg/Ly1RT34mP0IibHoS6FpmiauPtgpWseAw7fR3yjVF4hOfMichuW4Wicbc7BdVxkf3ftybFt1WNVGfdDm8iapLFAL00Ng/640?wxtype=jpeg&wxfrom=0',
      digest: '不想堵在路上看车展的赶紧看过来',
    };
    const secondPost = {
      title: '超强台风“康妮”急速大拐弯,剩下的国庆假期还能好好玩耍?',
      link: 'http://mp.weixin.qq.com/s?__biz=MzAwOTAwMzA5NQ==&mid=2710598126&idx=2&sn=373bd5a91cd1c596c879f38c1ad204cb&chksm=bf0298d6887511c0cab5efa29d494babda2ee06db456e7f1604a8f4095928cbb9bad4ba78ec0&scene=0#rd',
      publishAt: new Date('2018-10-05T08:05:39.000Z'),
      msgBiz: 'MzAwOTAwMzA5NQ==',
      msgMid: '2710598126',
      msgIdx: '2',
      cover: 'http://mmbiz.qpic.cn/mmbiz_jpg/Ly1RT34mP0IibHoS6FpmiauPtgpWseAw7fz3FDhdPuJyaebsbjz86R0OG8MHf757RSFz8ISLS8Y3n2TefdaBD4MA/300?wxtype=jpeg&wxfrom=0',
      digest: '超强台风“康妮”急速大拐弯',
    };
    const rawPosts = [firstPost, secondPost];

    const savedList = await models.Post.upsert(rawPosts);
    const savedSingle = await models.Post.upsert(firstPost);

    assert.equal(savedList.length, rawPosts.length);
    assert.equal(savedList[0].id, savedSingle.id);
  });
});
36 |
--------------------------------------------------------------------------------
/test/models/Profile.test.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const assert = require('assert');
4 | const models = require('../../models');
5 |
describe('Model Profile', function() {
  // Upserting the same profile on its own and twice in a list must resolve
  // to one and the same document.
  it('statics upsert', async function() {
    const rawProfile = {
      title: '勤天熹乐谷',
      desc: '勤天熹乐谷:南中国首席温泉度假综合体',
      msgBiz: 'MjM5MTI3OTgxOA==',
      province: 'Guangdong',
      city: 'Guangzhou',
    };
    const duplicated = [rawProfile, rawProfile];

    const savedSingle = await models.Profile.upsert(rawProfile);
    const savedList = await models.Profile.upsert(duplicated);

    assert.equal(savedList.length, duplicated.length);
    assert.equal(savedSingle.id, savedList[0].id);
  });
});
23 |
--------------------------------------------------------------------------------
/test/models/ProfilePubRecord.test.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const assert = require('assert');
4 | const models = require('../../models');
5 |
describe('Model ProfilePubRecord', function() {
  describe('statics getMinTargetTime', function() {
    const sampleBiz = 'MzI4NjQyMTM2Mw==';

    // Normal case.
    it('正常情况', async function() {
      const requested = new Date('2017/1/1');
      const resolved = await models.ProfilePubRecord.getMinTargetTime(sampleBiz, requested);
      assert(resolved >= new Date('2017/5/19'));
    });

    // The requested time is earlier than the first publish date.
    it('超过第一篇的发布日期的情况', async function() {
      const requested = new Date('2016/1/1');
      const resolved = await models.ProfilePubRecord.getMinTargetTime(sampleBiz, requested);
      console.log('minTime', resolved);
      assert(resolved >= new Date('2017/5/19'));
    });
  });
});
24 |
--------------------------------------------------------------------------------
/utils/contentHandler.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const url = require('url');
4 | const cheerio = require('cheerio');
5 | const rp = require('request-promise');
6 | const helper = require('./helper');
7 |
/**
 * Read the article identifiers from a link's query string.
 *
 * The link is passed through `helper.escape2Html` before parsing.
 *
 * @param {string} link
 * @returns {{msgBiz: *, msgMid: *, msgIdx: *}} the `__biz`, `mid` and `idx`
 *   query values; each is `undefined` when absent from the link
 */
function getIdentifyingFromLink(link) {
  const plainLink = helper.escape2Html(link);
  const { query } = url.parse(plainLink, true);

  return {
    msgBiz: query.__biz,
    msgMid: query.mid,
    msgIdx: query.idx,
  };
}
13 |
14 | module.exports = class ContentHandler {
15 |
16 | constructor(options = {}) {
17 | const { link, body } = options;
18 | if (!link && !body) throw new Error('至少传入link或body');
19 | this.link = link;
20 | this.body = body;
21 |
22 | this._identifying = null;
23 |
24 | // cheerio instance
25 | this.$html = null;
26 |
27 | this.text = '';
28 | this.html = '';
29 |
30 | // normal: 正常文章
31 | // image: 图片类型
32 | this._type = 'normal';
33 | }
34 |
35 | /**
36 | * 获取微信正文html
37 | * @api public
38 | */
39 | async toHtml() {
40 | if (this.html) return this.html;
41 | this.html = (await this.parseBodyToHtml()).html() || '';
42 | return this.html.trim();
43 | }
44 |
45 | /**
46 | * 获取微信正文text
47 | * @api public
48 | */
49 | async toText() {
50 | if (this.text) return this.text;
51 | this.text = (await this.parseBodyToHtml()).text().trim() || '';
52 | return this.text;
53 | }
54 |
55 | async get$Html() {
56 | if (this.$html) return this.$html;
57 | const body = await this.getBody();
58 | this.$html = cheerio.load(body, { decodeEntities: false });
59 | return this.$html;
60 | }
61 |
62 | async parseBodyToHtml() {
63 | const $ = await this.get$Html();
64 | return $('#js_content');
65 | }
66 |
67 | async getBody() {
68 | if (this.body) return this.body;
69 | this.body = await rp(this.link);
70 | return this.body;
71 | }
72 |
73 | // 解析 msgBiz, msgMid, msgIdx
74 | async getIdentifying() {
75 | if (this._identifying) return this._identifying;
76 |
77 | let msgBiz, msgMid, msgIdx;
78 |
79 | // 尝试解析链接
80 | if (this.link) {
81 | ({ msgBiz, msgMid, msgIdx } = getIdentifyingFromLink(this.link));
82 | }
83 |
84 | // 无链接或为短链接时,则需要解析正文
85 | if (!msgBiz || !msgMid || !msgIdx) {
86 | const $ = await this.get$Html();
87 | const urlMeta = $('meta[property="og:url"]').attr();
88 | if (urlMeta && urlMeta.content) {
89 | ({ msgBiz, msgMid, msgIdx } = getIdentifyingFromLink(urlMeta.content));
90 | }
91 | }
92 |
93 | this._identifying = { msgBiz, msgMid, msgIdx };
94 |
95 | return this._identifying;
96 | }
97 |
98 | // 获取文章详情数据
99 | async getDetail() {
100 | const { msgBiz, msgMid, msgIdx } = await this.getIdentifying();
101 | if (!msgBiz || !msgMid || !msgIdx) return null;
102 |
103 | const doc = { msgBiz, msgMid, msgIdx };
104 | const body = await this.getBody();
105 |
106 | // 判断此文是否失效
107 | if (body.includes('global_error_msg') || body.includes('icon_msg warn')) {
108 | doc.isFail = true;
109 | return doc;
110 | }
111 |
112 | // 从 html 中提取必要信息
113 | const getTarget = regexp => {
114 | let target = '';
115 | body.replace(regexp, (_, t) => {
116 | target = t;
117 | });
118 | return target;
119 | };
120 |
121 | let wechatId = getTarget(/(.+?)<\/span>/);
122 |
123 | if (!wechatId) {
124 | // 图片类型,单独处理
125 | if (body.includes('id="img_list"')) {
126 | this._type = 'image';
127 | return await this.getImageDetail(doc, body);
128 | }
129 | }
130 |
131 | const username = getTarget(/var user_name = "(.+?)"/);
132 | // 如果上面找到的微信id中包含中文字符 则证明此微信号没有设置微信id 则取微信给定的 username 初始字段
133 | if (wechatId && /[\u4e00-\u9fa5]/.test(wechatId)) {
134 | wechatId = username;
135 | }
136 | const title = getTarget(/var msg_title = '(.+?)'/);
137 | let publishAt = getTarget(/var ct = "(\d+)";/);
138 | if (publishAt) publishAt = new Date(parseInt(publishAt) * 1000);
139 | const sourceUrl = getTarget(/var msg_source_url = '(.*?)';/);
140 | const cover = getTarget(/var msg_cdn_url = "(.+?)";/);
141 | const digest = getTarget(/var msg_desc = htmlDecode\("(.+?)"\);/);
142 |
143 | // 公众号头像
144 | const headimg = getTarget(/var hd_head_img = "(.+?)"/);
145 | const nickname = getTarget(/var nickname = "(.+?)"/);
146 |
147 | return {
148 | ...doc,
149 | wechatId,
150 | username,
151 | title,
152 | publishAt,
153 | sourceUrl,
154 | cover,
155 | digest,
156 | headimg,
157 | nickname,
158 | };
159 | }
160 |
161 | // 图片类型,获取详情数据
162 | async getImageDetail(doc, body) {
163 | const getTarget = regexp => {
164 | let target = '';
165 | body.replace(regexp, (_, t) => {
166 | target = t;
167 | });
168 | return target;
169 | };
170 |
171 | const username = getTarget(/user_name: "(.+?)"/);
172 | const title = getTarget(/d.title = .*'(.+?)';/);
173 | let publishAt = getTarget(/d.ct = .*'(\d+)';/);
174 | if (publishAt) publishAt = new Date(parseInt(publishAt) * 1000);
175 |
176 | const $ = await this.get$Html();
177 | const cover = ($('#img_list img').attr() || {}).src;
178 |
179 | const headimg = getTarget(/d.hd_head_img = .*'(.+?)' \|\|/);
180 | const nickname = getTarget(/d.nick_name = .*'(.+?)';/);
181 |
182 | return {
183 | ...doc,
184 | username,
185 | title,
186 | publishAt,
187 | cover,
188 | headimg,
189 | nickname,
190 | };
191 | }
192 | };
193 |
--------------------------------------------------------------------------------
/utils/correctWechatId.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const models = require('../models');
4 |
5 | module.exports = class CorrectWechatId {
6 |
7 | constructor(options = {}) {
8 | const { msgBiz, wechatId } = options;
9 | if (!msgBiz || !wechatId) throw new Error('请传入正确参数');
10 |
11 | this.msgBiz = msgBiz;
12 | this.wechatId = wechatId;
13 | }
14 |
15 | async checkPost() {
16 | const res = await this.updateWechatId('Post');
17 | if (res.nModified) {
18 | console.log(`msgBiz: ${this.msgBiz}, wechatId: ${this.wechatId}`);
19 | console.log(`文章数据表中更新了${res.nModified}条记录`);
20 | console.log();
21 | }
22 | }
23 |
24 | async checkProfile() {
25 | const res = await this.updateWechatId('Profile');
26 | if (res.nModified) {
27 | console.log(`msgBiz: ${this.msgBiz}, wechatId: ${this.wechatId}`);
28 | console.log(`账号数据表中更新了${res.nModified}条记录`);
29 | console.log();
30 | }
31 | }
32 |
33 | async updateWechatId(modelName) {
34 | return await models[modelName].updateMany(
35 | { msgBiz: this.msgBiz, wechatId: { $ne: this.wechatId } },
36 | { wechatId: this.wechatId }
37 | );
38 | }
39 |
40 | };
41 |
--------------------------------------------------------------------------------
/utils/exportData.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const models = require('../models');
4 | const json2csv = require('json2csv');
5 | const moment = require('moment');
6 |
// Export column label -> field read from the populated profile.
// Key order matters: it determines the order of the exported columns.
const profileMap = {
  '公众号': 'title',
  '公众号ID': 'wechatId',
  '公众号属性': 'property',
};

// Export column label -> field read from the post document.
// Key order matters: it determines the order of the exported columns.
const postMap = {
  'msgBiz': 'msgBiz',
  '标题': 'title',
  '链接': 'link',
  '发布时间': 'publishAt',
  '发布位置': 'msgIdx',
  '阅读量': 'readNum',
  '点赞量': 'likeNum',
  '摘要': 'digest',
  '封面': 'cover',
  '内容': 'content',
  '阅读原文': 'sourceUrl',
};
26 |
27 | module.exports = class ExportData {
28 |
29 | constructor(options = {}) {
30 | const { msgBiz, bizToInfoMap } = options;
31 | this.msgBiz = [];
32 | this.bizToInfoMap = bizToInfoMap;
33 |
34 | if (msgBiz) this.msgBiz = this.msgBiz.concat(msgBiz);
35 | if (this.msgBiz.length === 0) throw new Error('请传入参数');
36 | }
37 |
38 | /**
39 | * 导出为json字符串
40 | * @param {Date} minDate
41 | * @param {Date} maxDate
42 | * @param {Object} options
43 | * @return {String}
44 | * @api public
45 | */
46 | async toJson(minDate, maxDate, options = {}) {
47 | const posts = await this.findPosts(minDate, maxDate);
48 | const keys = Object.keys(profileMap).concat(Object.keys(postMap));
49 |
50 | let replacer = null;
51 | const optionKeys = Object.keys(options);
52 | if (optionKeys.length > 0) {
53 | // 传入的key必须得在keys中存在
54 | const isContain = optionKeys.every(key => keys.indexOf(key) > -1);
55 | if (!isContain) throw new Error('确保格式化字段传入正确');
56 |
57 | // 确保value全为1或全为-1
58 | const onlyOrExcept = options[optionKeys[0]];
59 | const isAllow = optionKeys.every(key => options[key] === onlyOrExcept);
60 | if (!isAllow) throw new Error('确保value全为1或全为-1');
61 |
62 | // 更改replacer
63 | replacer = keys.filter(key => {
64 | if (onlyOrExcept === 1) return (optionKeys.indexOf(key) > -1);
65 | return (optionKeys.indexOf(key) === -1);
66 | });
67 | }
68 |
69 | return JSON.stringify(posts, replacer, 4);
70 | }
71 |
72 | /**
73 | * 导出为csv字符串
74 | * @param {Date} minDate
75 | * @param {Date} maxDate
76 | * @param {Object} options
77 | * @return {String}
78 | * @api public
79 | */
80 | async toCsv(minDate, maxDate, options = {}) {
81 | const json = await this.toJson(minDate, maxDate, options);
82 | const obj = JSON.parse(json);
83 | let csv = json2csv({ data: obj });
84 | csv = addBom(csv);
85 | return csv;
86 | }
87 |
88 | /**
89 | * 导出统计信息json字符串
90 | * @param {Date} minDate
91 | * @param {Date} maxDate
92 | * @return {String}
93 | * @api public
94 | */
95 | async toStaJson(minDate, maxDate) {
96 | const data = await this.calcStatistic(minDate, maxDate);
97 | return JSON.stringify(data, null, 4);
98 | }
99 |
100 | /**
101 | * 导出统计信息csv字符串
102 | * @param {Date} minDate
103 | * @param {Date} maxDate
104 | * @return {String}
105 | * @api public
106 | */
107 | async toStaCsv(minDate, maxDate) {
108 | const data = await this.calcStatistic(minDate, maxDate);
109 | let csv = json2csv({ data });
110 | csv = addBom(csv);
111 | return csv;
112 | }
113 |
114 | /**
115 | * 查找文章
116 | * @api private
117 | */
118 | async findPosts(minDate, maxDate) {
119 | const posts = await models.Post.find({
120 | msgBiz: { $in: this.msgBiz },
121 | publishAt: { $gte: minDate, $lt: maxDate },
122 | isFail: { $ne: true }
123 | }).sort({ msgBiz: 1, publishAt: 1, msgIdx: 1 }).populate('profile');
124 |
125 | // 按照 this.msgBiz 排序
126 | const postGroupMap = {};
127 | posts.forEach(i => {
128 | const { msgBiz } = i;
129 | postGroupMap[msgBiz] = postGroupMap[msgBiz] || [];
130 | postGroupMap[msgBiz].push(i);
131 | });
132 | const postGroupKeys = Object.keys(postGroupMap);
133 | postGroupKeys.sort((a, b) => {
134 | const i = this.msgBiz.indexOf(a) - this.msgBiz.indexOf(b);
135 | if (i < 0) return -1;
136 | if (i > 0) return 1;
137 | return 0;
138 | });
139 |
140 | let sortedPosts = [];
141 | postGroupKeys.forEach(key => {
142 | sortedPosts = sortedPosts.concat(postGroupMap[key]);
143 | });
144 |
145 | const handledPosts = sortedPosts.map(post => {
146 | const { profile, msgBiz } = post;
147 | const postObj = {};
148 |
149 | // 公众号信息
150 | Object.keys(profileMap).forEach(key => {
151 | const value = profile[profileMap[key]];
152 | if (value) postObj[key] = value;
153 | });
154 |
155 | // 传入的额外公众号信息
156 | if (this.bizToInfoMap && this.bizToInfoMap[msgBiz]) {
157 | const { property } = this.bizToInfoMap[msgBiz];
158 | postObj['公众号属性'] = property;
159 | }
160 |
161 | // 文章信息
162 | Object.keys(postMap).forEach(key => {
163 | const value = post[postMap[key]];
164 | if (value) postObj[key] = value;
165 |
166 | // 时间格式转换
167 | if (key === '发布时间' && Object.prototype.toString.call(value) == '[object Date]') postObj[key] = moment(value).format('YYYY-MM-DD HH:mm');
168 | });
169 |
170 | // 用0替换undefined
171 | if (!postObj.阅读量) postObj.阅读量 = 0;
172 | if (!postObj.点赞量) postObj.点赞量 = 0;
173 |
174 | return postObj;
175 | });
176 |
177 | return handledPosts;
178 | }
179 |
180 | /**
181 | * 计算统计信息
182 | * @api private
183 | */
184 | async calcStatistic(...args) {
185 | const json = await this.toJson(...args);
186 | const data = JSON.parse(json);
187 | let aggrObj = {};
188 | let aggrArray = [];
189 | data.forEach(item => {
190 | let key = item.msgBiz;
191 | if (key in aggrObj) {
192 | aggrObj[key].总阅读量 += item.阅读量 || 0;
193 | aggrObj[key].总点赞量 += item.点赞量 || 0;
194 | aggrObj[key].总发文数 += 1;
195 | if (item.发布位置 == '1') {
196 | aggrObj[key].头条总阅读量 += item.阅读量 || 0;
197 | aggrObj[key].头条总点赞量 += item.点赞量 || 0;
198 | aggrObj[key].推送次数 += 1;
199 | }
200 | if (item.阅读量 > aggrObj[key].单篇最高阅读量) {
201 | aggrObj[key].单篇最高阅读量 = item.阅读量;
202 | }
203 | } else {
204 | aggrObj[key] = {
205 | 分类: item.分类,
206 | 公众号属性: item.公众号属性,
207 | 机构名称: item.机构名称,
208 | 公众号: item.公众号,
209 | 公众号ID: item.公众号ID,
210 | 总阅读量: item.阅读量 || 0,
211 | 总点赞量: item.点赞量 || 0,
212 | 总发文数: 1,
213 | 头条总阅读量: 0,
214 | 头条总点赞量: 0,
215 | 推送次数: 0,
216 | 单篇最高阅读量: item.阅读量 || 0
217 | };
218 | if (item.发布位置 == '1') {
219 | aggrObj[key].头条总阅读量 = item.阅读量 || 0;
220 | aggrObj[key].头条总点赞量 = item.点赞量 || 0;
221 | aggrObj[key].推送次数 = 1;
222 | }
223 | }
224 | });
225 | Object.keys(aggrObj).forEach(key => {
226 | let item = aggrObj[key];
227 | let 公众号 = item.公众号;
228 | aggrArray.push({
229 | 公众号: 公众号,
230 | 公众号ID: item.公众号ID,
231 | 分类: item.分类,
232 | 公众号属性: item.公众号属性,
233 | 机构名称: item.机构名称,
234 | msgBiz: key,
235 | 总阅读量: item.总阅读量,
236 | 平均阅读量: Math.round(item.总阅读量 / item.总发文数),
237 | 头条总阅读量: item.头条总阅读量,
238 | 推送次数: item.推送次数,
239 | 总点赞量: item.总点赞量,
240 | 平均点赞量: Math.round(item.总点赞量 / item.总发文数),
241 | 头条总点赞量: item.头条总点赞量,
242 | 单篇最高阅读量: item.单篇最高阅读量,
243 | 总发文数: item.总发文数
244 | });
245 | });
246 | return aggrArray;
247 | }
248 | };
249 |
/**
 * Prefix CSV text with a byte order mark (U+FEFF).
 * @param {String} csv
 * @return {String}
 */
function addBom(csv) {
  const parts = ['\uFEFF', csv].map(piece => Buffer.from(piece));
  return Buffer.concat(parts).toString();
}
255 |
--------------------------------------------------------------------------------
/utils/helper.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
/**
 * Resolve after `ms` milliseconds.
 * @param {Number} ms delay passed to setTimeout
 * @return {Promise<undefined>}
 */
function sleep(ms) {
  return new Promise(wake => {
    setTimeout(wake, ms);
  });
}
8 |
/**
 * Pick a random integer between `min` and `max`, both inclusive.
 *
 * The bounds may be given in either order. For integer bounds every value in
 * the range is equally likely (rounding a scaled random number would give the
 * two endpoints only half the chance of the values in between). Non-integer
 * bounds keep the rounding formula.
 *
 * @param {Number} min
 * @param {Number} max
 * @return {Number} `max` itself when both bounds are equal
 */
function random(min, max) {
  if (max === min) return max;

  const lo = max < min ? max : min;
  const hi = max < min ? min : max;

  if (!Number.isInteger(lo) || !Number.isInteger(hi)) {
    return Math.round(Math.random() * (hi - lo) + lo);
  }

  // hi - lo + 1 equally sized buckets, one per integer in [lo, hi].
  return Math.floor(Math.random() * (hi - lo + 1)) + lo;
}
14 |
/**
 * Sleep for a random duration obtained from `random(min, max)`.
 * @param {Number} min
 * @param {Number} max
 * @return {Promise<undefined>}
 */
function rsleep(min, max) {
  return sleep(random(min, max));
}
19 |
/**
 * Decode the HTML entities &lt; &gt; &nbsp; &amp; and &quot; in a string.
 *
 * Entity names are matched case-insensitively. The string is scanned once,
 * so text produced by a replacement is not decoded again.
 *
 * @param {String} str
 * @return {String}
 */
function escape2Html(str) {
  const entityToChar = {
    lt: '<',
    gt: '>',
    nbsp: ' ',
    amp: '&',
    quot: '"',
  };
  // The pattern ignores case but the table is lower-case only, so normalise
  // the captured name before the lookup.
  return str.replace(/&(lt|gt|nbsp|amp|quot);/ig, (_, name) => entityToChar[name.toLowerCase()]);
}
31 |
32 | module.exports = {
33 | sleep,
34 | random,
35 | rsleep,
36 | escape2Html,
37 | };
38 |
--------------------------------------------------------------------------------
/utils/index.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const config = require('../config');
4 | const redis = require('./redis');
5 |
6 | const {
7 | redis: redisConfig,
8 | } = config;
9 |
10 | const { POST_LIST_KEY, PROFILE_LIST_KEY } = redisConfig;
11 |
/**
 * Copy the listed fields of `doc` into a new object, leaving out fields whose
 * value is undefined.
 * @param {Object} doc
 * @param {String[]} fields
 * @return {Object}
 */
function extract(doc, fields) {
  const picked = {};
  fields.forEach(field => {
    const value = doc[field];
    if (value === undefined) return;
    picked[field] = value;
  });
  return picked;
}
19 |
/**
 * Issue a redis `del` for the post list key and the profile list key.
 * @return {Promise<*>} the result of the redis call
 */
function delCrawlLinkCache() {
  const crawlListKeys = [POST_LIST_KEY, PROFILE_LIST_KEY];
  return redis('del', ...crawlListKeys);
}
23 |
24 | exports.extract = extract;
25 | exports.delCrawlLinkCache = delCrawlLinkCache;
26 |
--------------------------------------------------------------------------------
/utils/logger.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const winston = require('winston');
4 | const config = require('../config');
5 | const isDev = config.isDev;
6 |
// Render one record as "<timestamp> <level>: <message>", with the stack
// appended directly after the message when the record has one.
const renderRecord = info => {
  const { timestamp, level, message, stack } = info;
  // Object messages are serialised so they stay readable on a single line.
  const text = typeof message === 'object' ? JSON.stringify(message) : message;
  const line = `${timestamp} ${level}: ` + text;
  return stack ? line + stack : line;
};

const baseFormats = [
  winston.format.timestamp({
    format: 'YYYY-MM-DD HH:mm:ss'
  }),
  winston.format.errors({ stack: true }),
  winston.format.splat(),
  winston.format.printf(renderRecord),
];

// In development the colorize format is placed ahead of all other formats.
const loggerFormats = isDev
  ? [winston.format.colorize(), ...baseFormats]
  : baseFormats;

// Console-only logger; development uses the 'silly' level, otherwise 'http'.
const logger = winston.createLogger({
  level: isDev ? 'silly' : 'http',
  transports: [new winston.transports.Console()],
  format: winston.format.combine(...loggerFormats)
});
30 |
31 | module.exports = logger;
32 |
--------------------------------------------------------------------------------
/utils/merge.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
/**
 * Overwrite the values of `target` in place with the values `source`
 * provides for the same keys.
 *
 * Only keys that already exist on `target` are considered; extra keys on
 * `source` are ignored. When both sides hold a plain object for a key the
 * merge recurses, otherwise the source value replaces the target value. That
 * includes the case where the target slot is null, undefined or a primitive
 * and the source supplies an object, where recursing would either throw or
 * silently drop the override.
 *
 * @param {Object} target object to update
 * @param {Object} [source] overrides; a null/undefined source is a no-op
 * @return {undefined}
 */
const merge = (target, source) => {
  if (source === null || source === undefined) return;

  const isPlainObject = value => Object.prototype.toString.call(value) === '[object Object]';
  // Borrowed from Object.prototype so sources without that prototype work.
  const hasOwn = key => Object.prototype.hasOwnProperty.call(source, key);

  Object.keys(target).forEach(k => {
    if (!hasOwn(k)) return;

    const incoming = source[k];
    if (isPlainObject(incoming) && isPlainObject(target[k])) {
      merge(target[k], incoming);
    } else {
      target[k] = incoming;
    }
  });
};
15 |
16 | module.exports = merge;
17 |
--------------------------------------------------------------------------------
/utils/redis.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const redis = require('redis');
4 | const { promisify } = require('util');
5 | const config = require('../config');
6 |
// Connection defaults, used when the config leaves a value undefined.
const DEFAULT_REDIS_PORT = 6379;
const DEFAULT_REDIS_HOST = '127.0.0.1';

const {
  port = DEFAULT_REDIS_PORT,
  host = DEFAULT_REDIS_HOST,
} = config.redis;

// Single client created at module load and used by every call.
const redisClient = redis.createClient(port, host);

module.exports = asyncRedis;

/**
 * Run a redis command on the module's client and return a promise for its
 * result.
 * @param {String} cmd name of the client method to call, e.g. 'del'
 * @param {...*} args arguments passed to that method
 * @return {Promise<*>}
 */
function asyncRedis(cmd, ...args) {
  const command = promisify(redisClient[cmd]);
  return command.apply(redisClient, args);
}
16 |
--------------------------------------------------------------------------------