├── server
├── favicon.png
├── index.js
└── api.js
├── targetBiz.json
├── posts_screenshot.png
├── rule
├── replaceImg.png
├── index.js
└── wechatRule.js
├── client
├── app
│ ├── config.js
│ ├── style
│ │ └── style.css
│ ├── index.html
│ ├── components
│ │ ├── loading.jsx
│ │ ├── searchInput.jsx
│ │ └── Paginator.jsx
│ ├── containers
│ │ ├── search.jsx
│ │ ├── categories.jsx
│ │ ├── profile.jsx
│ │ ├── profiles.jsx
│ │ └── posts.jsx
│ ├── reducers.js
│ ├── index.jsx
│ └── actions.js
├── package.json
└── webpack.config.js
├── 开启步骤
├── auto_driver
├── config.py
├── tes.py
├── auto_operate_phone.py
└── upload_data.py
├── utils
├── redis.js
├── correctWechatId.js
├── contentHandler.js
└── exportData.js
├── models
├── Profile.js
├── Category.js
├── Comment.js
├── index.js
├── Post.js
└── plugins
│ └── paginator.js
├── LICENSE
├── package.json
├── .eslintrc.js
├── .gitignore
├── config.js
├── README.md
├── index_no_script.js
├── index.js
├── scripts
└── checkWechatId.js
└── ts.js
/server/favicon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mark2016goog/wechat_spider/HEAD/server/favicon.png
--------------------------------------------------------------------------------
/targetBiz.json:
--------------------------------------------------------------------------------
1 | [
2 | "MjM5MjAxNDM4MA==",
3 | "MjM5ODIyMTE0MA==",
4 | "MzA4NDEzNTMyMA=="
5 | ]
--------------------------------------------------------------------------------
/posts_screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mark2016goog/wechat_spider/HEAD/posts_screenshot.png
--------------------------------------------------------------------------------
/rule/replaceImg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mark2016goog/wechat_spider/HEAD/rule/replaceImg.png
--------------------------------------------------------------------------------
/client/app/config.js:
--------------------------------------------------------------------------------
1 | // const ENV = process.env.NODE_ENV || 'development';
2 |
3 | const config = {
4 | posts: '/api/posts',
5 | profiles: '/api/profiles',
6 | profile: '/api/profile',
7 | cates: '/api/categories'
8 | };
9 |
10 | export default config;
11 |
--------------------------------------------------------------------------------
/client/app/style/style.css:
--------------------------------------------------------------------------------
1 | body {
2 | padding-left: 100px;
3 | font-family: "SFMono-Regular", Consolas, "Liberation Mono", Menlo, Courier;
4 | }
5 |
6 | .wrapper {
7 | box-sizing: border-box;
8 | width: 100%;
9 | padding:0 20px;
10 | margin: 10px auto;
11 | }
--------------------------------------------------------------------------------
/开启步骤:
--------------------------------------------------------------------------------
1 | 1. appium 服务端开启 (长期开启)
2 | 2. node 微信爬虫开启 [npm start] (长期开启)
3 | 3. appium script 点击手机触发爬虫 (定时运行触发爬虫)
4 |
5 | @ windows有计划任务 会定时运行 auto_driver/auto_operate_phone.py 去操作手机打开微信爬取, 然后存数据到远程服务器db
6 |
7 | @ 第一次数据量可能会很大,可以先手动运行一下脚本(防止爬取到第二天9点 操作微信脚本 再次运行造成冲突),之后每天爬取的量少, 能保证24小时内爬完
8 |
--------------------------------------------------------------------------------
/auto_driver/config.py:
--------------------------------------------------------------------------------
1 | MONGO = {
2 | 'host': 'localhost',
3 | 'port': 27017
4 | }
5 |
6 | # 微信数据库信息
7 | WECHAT_DB_NAME = 'wechat_spider'
8 |
9 | # 微信集合名
10 | POST_COL = 'posts'
11 | PROFILE_COL = 'profiles'
12 | COMMENTS_COL = 'comments'
13 | CATE_COL = 'categories'
14 |
15 | REMOTE_HOST = 'http://192.168.1.6:5001'
--------------------------------------------------------------------------------
/client/app/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | 微信数据展示
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/utils/redis.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const redis = require('redis');
4 | const { promisify } = require('util');
5 | const config = require('../config');
6 |
7 | const { port = 6379, host = '127.0.0.1' } = config.redis;
8 |
9 | const redisClient = redis.createClient(port, host);
10 |
11 | module.exports = asyncRedis;
12 |
/**
 * Run a command on the shared redis client and get its reply as a Promise.
 *
 * @param {string} cmd - Name of the client method to invoke (looked up on `redisClient`).
 * @param {...*} args - Arguments forwarded to that method; the trailing
 *   node-style callback is supplied by `promisify`.
 * @returns {Promise<*>} Settles with whatever the command hands to its callback.
 */
function asyncRedis(cmd, ...args) {
  const command = promisify(redisClient[cmd]);
  // Invoke with the client as `this` so the method keeps access to its connection state.
  return command.apply(redisClient, args);
}
16 |
--------------------------------------------------------------------------------
/models/Profile.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const mongoose = require('mongoose');
4 | const Schema = mongoose.Schema;
5 |
6 | // 数据结构:公众号账号
7 | const Profile = new Schema({
8 | title: String,
9 | wechatId: String,
10 | desc: String,
11 | msgBiz: String,
12 | headimg: String,
13 | openHistoryPageAt: Date,
14 | // 无关的字段,可忽略
15 | property: String
16 | });
17 |
18 | Profile.plugin(require('motime'));
19 |
20 | Profile.index({ msgBiz: 1 });
21 |
22 | mongoose.model('Profile', Profile);
23 |
--------------------------------------------------------------------------------
/models/Category.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const mongoose = require('mongoose');
4 | const Schema = mongoose.Schema;
5 |
6 | // 数据结构:公众号分类
7 | const Category = new Schema({
8 | name: String,
9 | msgBizs: [String]
10 | }, { toJSON: { virtuals: true } });
11 |
12 | Category.plugin(require('motime'));
13 |
14 | Category.virtual('profiles', {
15 | ref: 'Profile',
16 | localField: 'msgBizs',
17 | foreignField: 'msgBiz'
18 | });
19 |
20 | Category.index({ name: 1 });
21 |
22 | mongoose.model('Category', Category);
23 |
--------------------------------------------------------------------------------
/models/Comment.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const mongoose = require('mongoose');
4 | const Schema = mongoose.Schema;
5 |
6 | const Comment = new Schema({
7 | postId: { type: 'ObjectId', ref: 'Post' },
8 | contentId: String,
9 | nickName: String,
10 | logoUrl: String,
11 | content: String,
12 | createTime: Date,
13 | likeNum: Number,
14 | replies: [{
15 | content: String,
16 | createTime: Date,
17 | likeNum: Number
18 | }]
19 | });
20 |
21 | Comment.plugin(require('motime'));
22 |
23 | Comment.index({ contentId: 1 });
24 |
25 | mongoose.model('Comment', Comment);
26 |
--------------------------------------------------------------------------------
/models/index.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const mongoose = require('mongoose');
4 | const path = require('path');
5 |
6 | mongoose.Promise = global.Promise;
7 |
8 | // 载入 mongoose 插件
9 | require('./plugins/paginator');
10 |
11 | const config = require('../config');
12 |
13 | mongoose.connect(config.mongodb.db);
14 |
15 | mongoose.set('debug', false);
16 |
// Load every model file so it registers its schema with mongoose, then
// expose the compiled model on this module's exports under the same name.
for (const name of ['Post', 'Profile', 'Category', 'Comment']) {
  const modelFile = path.join(__dirname, name);
  require(modelFile);
  exports[name] = mongoose.model(name);
}
27 |
--------------------------------------------------------------------------------
/client/app/components/loading.jsx:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import RefreshIndicator from 'material-ui/RefreshIndicator';
3 |
4 | const style = {
5 | container: {
6 | position: 'fixed',
7 | top: '50%',
8 | left: '50%'
9 | },
10 | refresh: {
11 | display: 'inline-block',
12 | position: 'relative'
13 | },
14 | };
15 |
16 | const Loading = () => (
17 |
18 |
25 |
26 | );
27 |
28 | export default Loading;
29 |
--------------------------------------------------------------------------------
/models/Post.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const mongoose = require('mongoose');
4 | const Schema = mongoose.Schema;
5 |
6 | // 数据结构:文章
7 | const Post = new Schema({
8 | title: String,
9 | link: String,
10 | publishAt: Date,
11 | readNum: Number,
12 | likeNum: Number,
13 | msgBiz: String,
14 | msgMid: String,
15 | msgIdx: String,
16 | sourceUrl: String,
17 | cover: String,
18 | digest: String,
19 | isFail: Boolean,
20 | wechatId: String,
21 | updateNumAt: Date,
22 | // 文章正文html代码
23 | content: String
24 | }, { toJSON: { virtuals: true } });
25 |
26 | Post.plugin(require('motime'));
27 |
28 | Post.virtual('profile', {
29 | ref: 'Profile',
30 | localField: 'msgBiz',
31 | foreignField: 'msgBiz',
32 | justOne: true
33 | });
34 |
35 | // 索引
36 | Post.index({ publishAt: -1, msgIdx: 1 });
37 | Post.index({ publishAt: 1, msgIdx: 1 });
38 | Post.index({ updateNumAt: -1 });
39 | Post.index({ updateNumAt: 1 });
40 | Post.index({ msgBiz: 1, publishAt: 1, msgIdx: 1 });
41 | Post.index({ msgBiz: 1, msgMid: 1, msgIdx: 1 });
42 |
43 | mongoose.model('Post', Post);
44 |
--------------------------------------------------------------------------------
/client/app/components/searchInput.jsx:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import PropTypes from 'prop-types';
3 | import TextField from 'material-ui/TextField';
4 |
5 | export default class SearchInput extends React.Component {
6 | constructor(props) {
7 | super(props);
8 | this.state = {
9 | q: props.value || ''
10 | };
11 | }
12 |
13 | render() {
14 | const { q } = this.state;
15 | const { onEnter, hintText = '', fullWidth = false } = this.props;
16 | return (
17 | {
20 | this.setState({
21 | q: event.target.value
22 | });
23 | }}
24 | onKeyPress={event => {
25 | if (event.key == 'Enter') {
26 | onEnter(q);
27 | }
28 | }}
29 | hintText={hintText}
30 | fullWidth={fullWidth}
31 | />
32 | );
33 | }
34 | }
35 |
36 | SearchInput.propTypes = {
37 | onEnter: PropTypes.func.isRequired,
38 | value: PropTypes.string,
39 | hintText: PropTypes.string,
40 | fullWidth: PropTypes.bool
41 | };
42 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 liqiang
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/client/app/containers/search.jsx:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import { assembleUrl } from '../actions';
3 | import SearchInput from '../components/searchInput.jsx';
4 |
5 | class Search extends React.Component {
6 |
7 | constructor(props) {
8 | super(props);
9 | }
10 |
11 | render() {
12 | const { location, history, searchArgs, defaultText } = this.props;
13 | const { pathname } = location;
14 | let { q = '' } = searchArgs;
15 | q = decodeURIComponent(q);
16 | const nextQuery = { ...searchArgs };
17 |
18 | // 去掉分页query
19 | if (nextQuery.page) delete nextQuery.page;
20 | return (
21 |
24 | {
29 | if (q) nextQuery.q = q;
30 | if (!q && nextQuery.q) delete nextQuery.q;
31 | const path = assembleUrl(pathname, nextQuery);
32 | history.push(path);
33 | }}
34 | />
35 |
36 | );
37 | }
38 | }
39 |
40 | export default Search;
41 |
--------------------------------------------------------------------------------
/utils/correctWechatId.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const models = require('../models');
4 |
5 | module.exports = class CorrectWechatId {
6 |
7 | constructor(options = {}) {
8 | const { msgBiz, wechatId } = options;
9 | if (!msgBiz || !wechatId) throw new Error('请传入正确参数');
10 |
11 | this.msgBiz = msgBiz;
12 | this.wechatId = wechatId;
13 | }
14 |
15 | async checkPost() {
16 | const res = await this.updateWechatId('Post');
17 | if (res.nModified) {
18 | console.log(`msgBiz: ${this.msgBiz}, wechatId: ${this.wechatId}`);
19 | console.log(`文章数据表中更新了${res.nModified}条记录`);
20 | console.log();
21 | }
22 | }
23 |
24 | async checkProfile() {
25 | const res = await this.updateWechatId('Profile');
26 | if (res.nModified) {
27 | console.log(`msgBiz: ${this.msgBiz}, wechatId: ${this.wechatId}`);
28 | console.log(`账号数据表中更新了${res.nModified}条记录`);
29 | console.log();
30 | }
31 | }
32 |
33 | async updateWechatId(modelName) {
34 | return await models[modelName].updateMany(
35 | { msgBiz: this.msgBiz, wechatId: { $ne: this.wechatId } },
36 | { wechatId: this.wechatId }
37 | );
38 | }
39 |
40 | };
41 |
--------------------------------------------------------------------------------
/utils/contentHandler.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const cheerio = require('cheerio');
4 | const rp = require('request-promise');
5 |
6 | module.exports = class ContentHandler {
7 |
8 | constructor(options = {}) {
9 | const { link, body } = options;
10 | if (!link && !body) throw new Error('至少传入link或body');
11 | this.link = link;
12 | this.body = body;
13 | this.text = '';
14 | this.html = '';
15 | }
16 |
17 | /**
18 | * 获取微信正文html
19 | * @api public
20 | */
21 | async toHtml() {
22 | if (this.html) return this.html;
23 | this.html = (await this.parseBodyToHtml()).html().trim() || '';
24 | return this.html;
25 | }
26 |
27 | /**
28 | * 获取微信正文text
29 | * @api public
30 | */
31 | async toText() {
32 | if (this.text) return this.text;
33 | this.text = (await this.parseBodyToHtml()).text().trim() || '';
34 | return this.text;
35 | }
36 |
37 | async parseBodyToHtml() {
38 | if (!this.body) await this.getBody();
39 | const $ = cheerio.load(this.body, { decodeEntities: false });
40 | return $('#js_content');
41 | }
42 |
43 | async getBody() {
44 | this.body = await rp(this.link);
45 | }
46 |
47 | };
48 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "wechat_spider",
3 | "version": "1.1.0",
4 | "description": "wechat spider by Man-in-the-middle attack",
5 | "main": "index.js",
6 | "scripts": {
7 | "start": "nodemon index.js --ignore client/",
8 | "build": "pm2 start index.js --name=\"wechat_spider\""
9 | },
10 | "author": "liqiang",
11 | "license": "MIT",
12 | "dependencies": {
13 | "anyproxy": "^4.0.5",
14 | "body-parser": "^1.18.3",
15 | "cheerio": "^1.0.0-rc.2",
16 | "cookie-parser": "^1.4.3",
17 | "ejs": "^2.5.7",
18 | "express": "^4.16.3",
19 | "ip": "^1.1.5",
20 | "json2csv": "^3.11.2",
21 | "moment": "^2.21.0",
22 | "mongoose": "^5.0.10",
23 | "morgan": "^1.8.2",
24 | "motime": "^0.0.2",
25 | "multer": "^1.3.0",
26 | "redis": "^2.8.0",
27 | "request": "^2.83.0",
28 | "request-promise": "^4.2.2"
29 | },
30 | "devDependencies": {
31 | "nodemon": "^1.11.0"
32 | },
33 | "repository": {
34 | "type": "git",
35 | "url": "git+https://github.com/lqqyt2423/wechat_spider.git"
36 | },
37 | "keywords": [
38 | "wechat",
39 | "spider"
40 | ],
41 | "bugs": {
42 | "url": "https://github.com/lqqyt2423/wechat_spider/issues"
43 | },
44 | "homepage": "https://github.com/lqqyt2423/wechat_spider#readme"
45 | }
46 |
--------------------------------------------------------------------------------
/client/app/reducers.js:
--------------------------------------------------------------------------------
1 | import {
2 | REQUEST_POSTS,
3 | RECEIVE_POSTS,
4 | REQUEST_PROFILES,
5 | RECEIVE_PROFILES,
6 | REQUEST_PROFILE,
7 | RECEIVE_PROFILE,
8 | REQUEST_CATES,
9 | RECEIVE_CATES
10 | } from './actions';
11 |
12 | const initialState = {
13 | posts: {},
14 | profiles: {},
15 | profile: {},
16 | cates: [],
17 | isFetching: false
18 | };
19 |
/**
 * Root reducer for the client store.
 *
 * REQUEST_* actions raise the `isFetching` flag; each RECEIVE_* action lowers
 * it and replaces the matching slice with the payload carried on the action.
 * Any other action returns the state object untouched.
 *
 * @param {Object} [state=initialState] - current store state.
 * @param {Object} action - dispatched action; `type` selects the transition.
 * @returns {Object} the next state (a new object whenever something changed).
 */
function reducer(state = initialState, action) {
  // Shared shape of every RECEIVE_* transition.
  const received = (slice) => ({ ...state, isFetching: false, ...slice });

  switch (action.type) {
    case REQUEST_POSTS:
    case REQUEST_PROFILES:
    case REQUEST_PROFILE:
    case REQUEST_CATES:
      return { ...state, isFetching: true };
    case RECEIVE_POSTS:
      return received({ posts: action.posts });
    case RECEIVE_PROFILES:
      return received({ profiles: action.profiles });
    case RECEIVE_PROFILE:
      return received({ profile: action.profile });
    case RECEIVE_CATES:
      return received({ cates: action.cates });
    default:
      return state;
  }
}
54 |
55 | export default reducer;
56 |
--------------------------------------------------------------------------------
/client/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "wechat_spider_client",
3 | "version": "1.1.0",
4 | "description": "",
5 | "main": "webpack.config.js",
6 | "scripts": {
7 | "build": "NODE_ENV=production webpack",
8 | "start": "webpack-dev-server"
9 | },
10 | "author": "liqiang",
11 | "license": "ISC",
12 | "dependencies": {
13 | "bootstrap": "^3.3.7",
14 | "classnames": "^2.2.3",
15 | "font-awesome": "^4.7.0",
16 | "lodash": "^4.13.1",
17 | "lodash.assign": "^4.0.9",
18 | "lodash.camelcase": "^4.3.0",
19 | "lodash.clonedeep": "^4.5.0",
20 | "material-ui": "^0.19.2",
21 | "moment": "^2.21.0",
22 | "prop-types": "^15.5.10",
23 | "react": "^15.6.1",
24 | "react-dom": "^15.6.1",
25 | "react-redux": "^5.0.6",
26 | "react-router": "^2.0.1",
27 | "redux": "^3.7.2",
28 | "redux-logger": "^3.0.6",
29 | "redux-thunk": "^2.2.0"
30 | },
31 | "devDependencies": {
32 | "babel": "^6.23.0",
33 | "babel-core": "^6.25.0",
34 | "babel-loader": "^7.1.1",
35 | "babel-preset-env": "^1.6.0",
36 | "babel-preset-es2015": "^6.24.1",
37 | "babel-preset-react": "^6.24.1",
38 | "babel-preset-stage-2": "^6.24.1",
39 | "css-loader": "^0.28.4",
40 | "file-loader": "^1.1.4",
41 | "html-webpack-plugin": "^2.29.0",
42 | "style-loader": "^0.18.2",
43 | "webpack": "^3.3.0",
44 | "webpack-dev-server": "^2.6.1"
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/.eslintrc.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | "env": {
3 | "browser": true,
4 | "commonjs": true,
5 | "es6": true,
6 | "node": true
7 | },
8 | "globals": {
9 | "_": true,
10 | "$": true,
11 | "WeixinJSBridge": true,
12 | "sequelize": true,
13 | "Sequelize": true,
14 | "models": true,
15 | "glue": true,
16 | "ovt": true
17 | },
18 | "extends": ["eslint:recommended", "plugin:react/recommended"],
19 | "parserOptions": {
20 | "ecmaFeatures": {
21 | "experimentalObjectRestSpread": true,
22 | "jsx": true
23 | },
24 | "sourceType": "module",
25 | "ecmaVersion": 2017
26 | },
27 | "plugins": [
28 | "react"
29 | ],
30 | "rules": {
31 | "no-unused-vars": [
32 | 1
33 | ],
34 | "no-console": [
35 | 0
36 | ],
37 | "react/prop-types": [
38 | 0
39 | ],
40 | "react/no-danger": [
41 | 1
42 | ],
43 | "indent": [
44 | 1,
45 | 2,
46 | { "SwitchCase": 1 }
47 | ],
48 | "linebreak-style": [
49 | 2,
50 | "unix"
51 | ],
52 | "quotes": [
53 | 1,
54 | "single"
55 | ],
56 | "semi": [
57 | 2,
58 | "always"
59 | ],
60 | "require-yield": [0]
61 | }
62 | };
63 |
--------------------------------------------------------------------------------
/client/app/containers/categories.jsx:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import { connect } from 'react-redux';
3 | import { fetchCates } from '../actions';
4 | import Loading from '../components/loading.jsx';
5 | import { Card, CardActions, CardTitle } from 'material-ui/Card';
6 | import FlatButton from 'material-ui/FlatButton';
7 |
8 |
9 | class Categories extends React.Component {
10 |
11 | constructor(props) {
12 | super(props);
13 | }
14 |
15 | componentDidMount() {
16 | let { dispatch } = this.props;
17 | dispatch(fetchCates());
18 | }
19 |
20 | render() {
21 | let { cates, isFetching, history } = this.props;
22 | if (isFetching || !cates.length) return ;
23 | return (
24 |
25 | {
26 | cates.map(cate => {
27 | return (
28 |
29 |
30 |
31 |
32 | { history.push(`/profiles?category=${cate._id}`); }} />
33 | { history.push(`/posts?category=${cate._id}`); }} />
34 |
35 |
36 |
37 | );
38 | })
39 | }
40 |
41 | );
42 | }
43 | }
44 |
45 | export default connect(state => state)(Categories);
46 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 | # pycharm
107 | .idea
--------------------------------------------------------------------------------
/client/app/containers/profile.jsx:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import { connect } from 'react-redux';
3 | import { fetchProfile } from '../actions';
4 | import Loading from '../components/loading.jsx';
5 | import Paper from 'material-ui/Paper';
6 | import Avatar from 'material-ui/Avatar';
7 | import moment from 'moment';
8 |
/**
 * Format a date-like value for display as 'YYYY-MM-DD HH:mm'.
 *
 * @param {*} date - anything accepted by `new Date(...)`.
 * @returns {*} the formatted string, or `date` itself unchanged when it is falsy.
 */
function f(date) {
  if (!date) return date;
  return moment(new Date(date)).format('YYYY-MM-DD HH:mm');
}
16 |
17 | class Profile extends React.Component {
18 |
19 | constructor(props) {
20 | super(props);
21 | }
22 |
23 | componentDidMount() {
24 | const { params, dispatch } = this.props;
25 | const { id } = params;
26 | dispatch(fetchProfile(id));
27 | }
28 |
29 | render() {
30 | const { isFetching, profile, params } = this.props;
31 | const { id } = params;
32 | if (isFetching) return ;
33 | if (id !== profile.id) return ;
34 | return (
35 |
41 |
42 |
47 | {profile.title}
48 |
49 |
53 |
微信ID:{profile.wechatId}
54 |
msgBiz:{profile.msgBiz}
55 |
创建时间:{f(profile.createdAt)}
56 |
更新时间:{f(profile.updatedAt)}
57 |
上次打开历史页面时间:{f(profile.openHistoryPageAt)}
58 |
属性:{profile.property}
59 |
60 |
61 | );
62 | }
63 | }
64 |
65 | export default connect(state => state)(Profile);
66 |
--------------------------------------------------------------------------------
/client/webpack.config.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const path = require('path');
4 | const webpack = require('webpack');
5 | const HtmlWebpackPlugin = require('html-webpack-plugin');
6 |
7 | let NODE_ENV = process.env.NODE_ENV || 'development';
8 |
9 | let publicPath = '/';
10 |
11 | const babelLoader = {
12 | loader: 'babel-loader',
13 | options: {
14 | cacheDirectory: true,
15 | presets: ['es2015', 'react', 'stage-2']
16 | }
17 | };
18 |
19 | const plugins = [
20 | new webpack.HotModuleReplacementPlugin(),
21 | new HtmlWebpackPlugin({
22 | title: 'react',
23 | template: './app/index.html'
24 | })
25 | ];
26 |
27 | if (NODE_ENV != 'development') {
28 | publicPath = '';
29 | plugins.push(
30 | new webpack.DefinePlugin({
31 | 'process.env': {
32 | NODE_ENV: JSON.stringify('production')
33 | }
34 | }),
35 | new webpack.optimize.UglifyJsPlugin()
36 | );
37 | }
38 |
39 | module.exports = {
40 | entry: './app/index.jsx',
41 | output: {
42 | filename: 'bundle.js',
43 | path: path.resolve(__dirname, './build'),
44 | publicPath: publicPath
45 | },
46 | plugins: plugins,
47 | devtool: NODE_ENV == 'development' ? 'eval' : undefined,
48 | devServer: {
49 | hot: true,
50 | contentBase: './',
51 | historyApiFallback: true,
52 | proxy: {
53 | '/api': 'http://localhost:8104',
54 | '/favicon.png': 'http://localhost:8104'
55 | }
56 | },
57 | module: {
58 | rules: [
59 | {
60 | test: /\.js|jsx$/,
61 | use: [
62 | babelLoader
63 | ],
64 | exclude: /(node_modules|bower_components)/
65 | },
66 | {
67 | test: /\.css$/,
68 | use: [
69 | 'style-loader',
70 | 'css-loader'
71 | ]
72 | },
73 | {
74 | test: /\.(woff|woff2|eot|ttf|otf|svg)$/,
75 | use: [
76 | 'file-loader'
77 | ]
78 | }
79 | ]
80 | }
81 | };
82 |
--------------------------------------------------------------------------------
/server/index.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const http = require('http');
4 | const express = require('express');
5 | const logger = require('morgan');
6 | const path = require('path');
7 | const app = express();
8 | const spiderConfig = require('../config');
9 | const models = require('../models');
10 | const { Category } = models;
11 |
12 | const api = require('./api');
13 |
14 | app.use(logger('dev'));
15 |
16 | app.use(express.json());
17 | app.use(express.urlencoded({ extended: false }));
18 |
19 | app.use('/api', api);
20 |
// Endpoint that restricts crawling to the accounts listed in one category.
// curl localhost:8104/spider -XPOST -H "Content-Type: application/json" -d '{ "categoryId": "5a50cacbb7c8a46b635878c6" }'
app.post('/spider', async (req, res, next) => {
  // Every validation failure is forwarded to the error middleware.
  const reject = (message) => next(new Error(message));

  try {
    const { categoryId } = req.body;
    if (!categoryId) return reject('请传入categoryId');

    const category = await Category.findOne({ _id: categoryId });
    if (!category) return reject('请传入正确的categoryId');

    const { msgBizs } = category;
    if (!msgBizs.length) return reject('请传入正确的categoryId');

    // Both crawl stages receive the same list of account biz ids.
    for (const stage of ['insertJsToNextProfile', 'insertJsToNextPage']) {
      spiderConfig[stage].targetBiz = msgBizs;
    }
    res.send('设置成功');
  } catch (err) {
    next(err);
  }
});
38 |
39 | // 前端页面
40 | // eslint-disable-next-line
41 | app.get('/favicon.png', (req, res, next) => {
42 | res.sendFile(path.join(__dirname, './favicon.png'));
43 | });
44 | app.use('/', express.static(path.join(__dirname, '../client/build')));
45 | // eslint-disable-next-line
46 | app.get('/*', (req, res, next) => {
47 | res.sendFile(path.join(__dirname, '../client/build/index.html'));
48 | });
49 |
// Error-handling middleware. Express recognises it by its four-parameter
// signature, so `next` must remain in the parameter list.
app.use((error, req, res, next) => {
  // Once headers are on the wire the status can no longer be changed and
  // calling send() would throw, so hand the error to Express's default
  // handler instead. (The previous `res.finished` check only covered
  // responses that had already ended.)
  if (res.headersSent) return next(error);
  res.status(500).send(error.message);
});
57 |
58 | const server = http.createServer(app);
59 |
60 | module.exports = server;
61 |
--------------------------------------------------------------------------------
/config.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const fs = require('fs');
4 |
// Central crawler configuration.
// NOTE: the month argument of `new Date(year, month, day)` is 0-based
// (0 = January), so every date below is written with that in mind.
const config = {
  // Counter of MongoDB reads performed when redis holds no pending account
  // list; per the original note it exists to avoid falling back to MongoDB
  // endlessly.
  read_mongo_count: 0,
  mongodb: {
    db: 'mongodb://127.0.0.1:27017/wechat_spider'
  },
  redis: {
    port: 6379,
    host: '127.0.0.1',
    POST_LIST_KEY: 'wechat_spider:post_list',
    PROFILE_LIST_KEY: 'wechat_spider:profile_list'
  },
  // Replace every image request with a local image to speed up browsing.
  isReplaceImg: true,
  // Replace the article body shown on the phone to speed up browsing.
  isReplacePostBody: true,
  insertJsToNextPage: {
    // Turn off automatic page jumping.
    disable: false,
    // Seconds between jumps.
    jumpInterval: 8,
    // Publish-time window of the articles to visit.
    // minTime is 2018-01-01 local time; it used to be spelled
    // `new Date(2017, 12, 1)`, which overflows to this same date.
    // NOTE(review): the original comment said this should sit two months
    // before insertJsToNextProfile.minTime (otherwise older article links may
    // be collected but never crawled, leaving posts without content), yet the
    // two values are one month apart — confirm the intended dates.
    minTime: new Date(2018, 0, 1),
    maxTime: new Date(2050, 6, 14),
    // Whether articles that already have data are crawled again.
    isCrawlExist: false,
    // When isCrawlExist is true: threshold (ms) on updateNumAt - publishAt.
    crawlExistInterval: 1000 * 60 * 60 * 24 * 3,
    // biz ids of the accounts to crawl.
    targetBiz: [],
    // Whether the article content is saved.
    isSavePostContent: true,
    // Format of the saved content: html/text.
    saveContentType: 'text',
  },
  insertJsToNextProfile: {
    // Turn off automatic page jumping.
    disable: false,
    // Only scroll, never jump.
    onlyScroll: true,
    // Seconds between jumps.
    jumpInterval: 5,
    // Move on to the next account once posts this old are reached
    // (2018-02-01 local time).
    minTime: new Date(2018, 1, 1),
    // Accounts updated more recently than this are skipped in this run.
    maxUpdatedAt: new Date(2050, 6, 7),
    // biz ids of the accounts to crawl.
    targetBiz: [],
    // Time the program started.
    beginTime: new Date()
  },
  // Whether comments are crawled.
  isCrawlComments: true
};
58 |
59 | // try {
60 | // // 引入外部biz文件
61 | // fs.accessSync('./targetBiz.json');
62 | // config.insertJsToNextProfile.targetBiz = require('./targetBiz.json');
63 | // config.insertJsToNextPage.targetBiz = require('./targetBiz.json');
64 | // } catch(e) {
65 | // // Do nothing
66 | // }
67 |
68 | module.exports = config;
69 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 声明: 此项目nodejs微信爬虫原地址为 https://github.com/lqqyt2423/wechat_spider
2 | 本人在此基础上做了以下修改:
3 | ### bug:
4 | - 修复循环爬取公众号时出现的死循环
5 | - 更新用于替换正文的正则表达式
6 |
7 | ### new:
8 | - 加入appium和python脚本, 利用redis作为通信设施, 达到python操作手机脚本 和 原nodejs 程序通信配合, 让整个爬虫全自动化运行
9 |
10 | ### ps:
11 | 本人不是很懂 nodejs,修复原作者 bug 以及新增功能所用的代码都很简单,重点在于实现思路;如果有问题或者 bug,欢迎指正
12 |
13 | ### ------以下为原作者readme------
14 |
15 | # wechat_spider 微信爬虫
16 |
17 | 基于Node 的微信爬虫,通过中间人代理的原理,批量获取微信文章数据,包括阅读量、点赞量和评论等数据。
18 |
19 | 使用代理模块AnyProxy。代码已支持AnyProxy 4版本。
20 |
21 | ## 开始
22 |
23 | ### 安装前准备
24 |
25 | - 安装Node,版本大于 8.8.1
26 | - 安装MongoDB,版本大于 3.4.6
27 | - 安装Redis
28 | - 安装Node 全局模块nodemon 和pm2
29 |
30 | ### 安装
31 |
32 | ```shell
33 | git clone https://github.com/lqqyt2423/wechat_spider.git
34 | cd wechat_spider
35 | npm install
36 | ```
37 |
38 | 本项目基于代理模块AnyProxy,解析微信HTTPS 请求需在电脑和手机上都安装证书。可参考:[AnyProxy 文档](http://anyproxy.io/cn/#%E8%AF%81%E4%B9%A6%E9%85%8D%E7%BD%AE)。
39 |
40 | ## 使用
41 |
42 | ```shell
43 | cd wechat_spider
44 | npm start
45 | ```
46 |
47 | 1. 确保电脑和手机连接同一WIFI ,`npm start` 之后,命令行输出`请配置代理: xx.xx.xx.xx:8101` 类似语句,手机设置代理为此IP 和端口
48 | 2. 手机上测试打开任一公众号历史文章详情页和文章页,观察电脑命令行的输出,查看数据是否保存至MongoDB
49 | 3. 自动翻页抓取数据需配置`config.js`
50 |
51 | ### 自定义配置
52 |
53 | 目前可支持的配置项举例如下:
54 |
55 | - 控制是否开启文章或历史详情页自动跳转
56 | - 控制跳转时间间隔
57 | - 根据文章发布时间控制抓取范围
58 | - 是否保存文章正文内容
59 | - 是否保存文章评论
60 |
61 | 可编辑`index.js` ,`config.js` 和`targetBiz.json` 进行自定义配置。文件中注释有详细说明。
62 |
63 | ### 可视化界面
64 |
65 | 前端页面已打包好,启动项目后,如无修改默认`server port` 配置,浏览器直接访问`http://localhost:8104` 即可。检测数据有无抓取保存直接刷新此页面即可。
66 |
67 | ![可视化界面截图](posts_screenshot.png)
68 |
69 | 前端页面由`React` 编写,如需修改,可编辑`client` 目录中的代码。
70 |
71 | ### MongoDB 数据信息
72 |
73 | 数据库database: wechat_spider
74 |
75 | 数据表collections:
76 |
77 | - posts - 文章数据
78 | - profiles - 公众号数据
79 | - comments - 评论数据
80 | - categories - 自定义的公众号分类
81 |
82 |
83 | ### 从MongoDB 导出数据
84 |
85 | ```shell
86 | mongoexport --db wechat_spider --collection posts --type=csv --fields title,link,publishAt,readNum,likeNum,msgBiz,msgMid,msgIdx,sourceUrl,cover,digest,isFail --out ~/Desktop/posts.csv
87 | ```
88 |
89 | 以上命令会导出数据至桌面的`posts.csv` 中。具体的个性化导出请参考MongoDB 文档或者自己编写。
90 |
91 | ## License
92 |
93 | [MIT](LICENSE)
94 |
--------------------------------------------------------------------------------
/rule/index.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const {
4 | getReadAndLikeNum,
5 | getPostBasicInfo,
6 | handlePostHtml,
7 | getComments,
8 | getProfileBasicInfo,
9 | getPostList,
10 | handleProfileHtml
11 | } = require('./wechatRule');
12 | const config = require('../config');
13 | const fs = require('fs');
14 | const path = require('path');
15 |
const { isReplaceImg } = config;

// Placeholder image returned instead of real images; only read from disk
// when the `isReplaceImg` switch is enabled in the config.
let imgBuf;
if (isReplaceImg) imgBuf = fs.readFileSync(path.join(__dirname, './replaceImg.png'));

// Response handlers, tried in this order for every proxied response.
// The first handler that resolves to a truthy value decides the result.
const sendResFns = [
  getReadAndLikeNum,
  getPostBasicInfo,
  handlePostHtml,
  getComments,
  getProfileBasicInfo,
  getPostList,
  handleProfileHtml
];

/**
 * Read a header value by name, ignoring the case of the header name.
 * HTTP header names are case-insensitive, so `Accept` and `accept` must
 * be treated alike.
 *
 * @param {Object} headers - header map of the outgoing request
 * @param {string} name - header name to look up
 * @returns {*} the header value, or undefined when absent
 */
function getHeader(headers, name) {
  if (!headers) return undefined;
  const wanted = name.toLowerCase();
  const key = Object.keys(headers).find(k => k.toLowerCase() === wanted);
  return key === undefined ? undefined : headers[key];
}

const rule = {
  // Module summary
  summary: 'The rule for wechat spider, written by liqiang.',

  // Intercept a request before it is sent
  *beforeSendRequest(requestDetail) {
    const { headers } = requestDetail.requestOptions;
    const accept = getHeader(headers, 'accept');

    // Short-circuit image requests with the local placeholder image
    if (isReplaceImg && /^image/.test(accept)) {
      return {
        response: {
          statusCode: 200,
          header: { 'content-type': 'image/png' },
          body: imgBuf
        }
      };
    }
  },

  // Process a response before it is handed back to the client
  *beforeSendResponse(requestDetail, responseDetail) {
    const ctx = { req: requestDetail, res: responseDetail };
    // Run the handlers one after another; once one yields a truthy result
    // the remaining handlers are skipped and that result is returned.
    return sendResFns
      .reduce((chain, fn) => chain.then(res => res || fn(ctx)), Promise.resolve())
      // A falsy final result means "no handler matched": resolve to undefined.
      .then(res => res || undefined);
  }

  // Whether to handle https requests. HTTPS parsing is already enabled
  // globally, so this hook stays commented out.
  // *beforeDealHttpsRequest(requestDetail) { /* ... */ },

  // Request error event
  // *onError(requestDetail, error) { /* ... */ },

  // Error while connecting to the https server
  // *onConnectError(requestDetail, error) { /* ... */ }
};
81 |
82 | module.exports = rule;
83 |
--------------------------------------------------------------------------------
/models/plugins/paginator.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | /**
4 | * Pagination Plugin
5 | */
6 | const util = require('util');
7 | const Query = require('mongoose').Query;
8 |
const defaults = {
  perPage: 20, // items per page
  page: 1, // initial page number
  offset: 0, // number of extra documents skipped before the first page
  maxPerPage: 100 // upper bound for perPage
};

/**
 * paginate
 *
 * Runs the query for one page and resolves with a pager object:
 * `{ data, options, current, next, prev, totalPages, count }`.
 *
 * @param {Object} [options] - overrides for `defaults`; values are coerced
 *   to numbers, and values that are not finite numbers fall back to the default
 * @param {Function} [callback] - optional node-style callback; when given,
 *   the promise settles with the callback's return value
 * @returns {Promise}
 */
Query.prototype.paginate = function(options, callback) {
  const opts = Object.assign({}, defaults, options);

  // Coerce the known options to numbers
  Object.keys(defaults).forEach(function(k) {
    const n = Number(opts[k]);
    opts[k] = Number.isFinite(n) ? n : defaults[k];
  });
  // A page below 1 or a negative offset would produce a negative skip.
  opts.page = Math.max(1, opts.page);
  opts.offset = Math.max(0, opts.offset);
  opts.perPage = Math.min(opts.perPage, opts.maxPerPage);

  const query = this;
  const model = query.model;
  const conditions = query._conditions;

  return new Promise(function(resolve, reject) {
    // Same rejection contract for both failure points below.
    const fail = function(err) {
      if (typeof callback === 'function') {
        reject(callback(err));
      } else {
        reject(err);
      }
    };

    model.count(conditions, function(countErr, count) {
      // Previously ignored: a failed count led to NaN page numbers.
      if (countErr) {
        fail(countErr);
        return;
      }

      const skip = (opts.page - 1) * opts.perPage + opts.offset;

      query.skip(skip).limit(opts.perPage).exec(function(err, data) {
        if (err) {
          fail(err);
          return;
        }

        const current = parseInt(opts.page, 10) || 1;
        const offsetCount = Math.max(count - opts.offset, 0);
        const totalPages = Math.ceil(offsetCount / opts.perPage);

        let prev = !count || current === 1 ? null : current - 1;
        let next = !count || current === totalPages ? null : current + 1;
        if (!offsetCount) {
          prev = next = null;
        }

        const pager = {
          data: data || [],
          options: opts,
          current: current,
          next: next,
          prev: prev,
          totalPages: totalPages,
          count: count
        };

        if (typeof callback === 'function') {
          resolve(callback(null, pager));
        } else {
          resolve(pager);
        }
      });
    });
  });
};
79 |
--------------------------------------------------------------------------------
/index_no_script.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const AnyProxy = require('anyproxy');
4 | const exec = require('child_process').exec;
5 | const ip = require('ip');
6 | const { log } = console;
7 | const config = require('./config');
8 | const redis = require('./utils/redis');
9 |
const { POST_LIST_KEY, PROFILE_LIST_KEY } = config.redis;

// Ports: AnyProxy listener, AnyProxy web interface (disabled below) and
// the dashboard/API server.
const PROXY_PORT = 8101;
const WEB_INTERFACE_PORT = 8102;
const SERVER_PORT = 8104;

// Guide the user through installing the HTTPS certificate: generate the
// root CA when it does not exist yet and open the folder that contains it.
if (!AnyProxy.utils.certMgr.ifRootCAFileExists()) {
  AnyProxy.utils.certMgr.generateRootCA((error, keyPath) => {
    if (error) {
      console.error('error when generating rootCA', error);
      return;
    }
    const certDir = require('path').dirname(keyPath);
    log('The cert is generated at', certDir);
    const isWin = /^win/.test(process.platform);
    exec(isWin ? 'start .' : 'open .', { cwd: certDir });
  });
}

const options = {
  port: PROXY_PORT,
  rule: require('./rule'),
  webInterface: {
    enable: false,
    webPort: WEB_INTERFACE_PORT
  },

  // No throttling by default
  // throttle: 10000,

  // Force parsing of all HTTPS traffic
  forceProxyHttps: true,

  // Do not proxy websockets
  wsIntercept: false,

  silent: true
};

const proxyServer = new AnyProxy.ProxyServer(options);

proxyServer.on('ready', () => {
  const ipAddress = ip.address();
  log(`请配置代理: ${ipAddress}:${PROXY_PORT}`);
  log(`可视化界面: http://localhost:${SERVER_PORT}\n`);
});
proxyServer.on('error', (e) => {
  throw e;
});

// Clear the cached list keys in redis, then start the proxy.
redis('del', POST_LIST_KEY, PROFILE_LIST_KEY).then(() => {
  proxyServer.start();
}).catch((e) => {
  // Without this handler a redis failure was an unhandled rejection and
  // the proxy silently never started.
  console.error('proxy was not started:', e);
});

// when finished
// proxyServer.close();

require('./server').listen(SERVER_PORT);

// Optional: launch the python script that drives the phone to the history
// page and thereby triggers the crawler.
//
// setTimeout(function() {
//   console.info('等3秒开始运行 python 脚本.');
//   exec('python auto_driver/operate_nokia.py', function(err, stdout) {
//     if (err) console.log('stderr', err);
//     if (stdout) console.log('stdout', stdout);
//   });
// }, 3000);
89 |
90 |
--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const AnyProxy = require('anyproxy');
4 | const exec = require('child_process').exec;
5 | const ip = require('ip');
6 | const { log } = console;
7 | const config = require('./config');
8 | const redis = require('./utils/redis');
9 |
const { POST_LIST_KEY, PROFILE_LIST_KEY } = config.redis;

// Ports: AnyProxy listener, AnyProxy web interface (disabled below) and
// the dashboard/API server.
const PROXY_PORT = 8101;
const WEB_INTERFACE_PORT = 8102;
const SERVER_PORT = 8104;

// Guide the user through installing the HTTPS certificate: generate the
// root CA when it does not exist yet and open the folder that contains it.
if (!AnyProxy.utils.certMgr.ifRootCAFileExists()) {
  AnyProxy.utils.certMgr.generateRootCA((error, keyPath) => {
    if (error) {
      console.error('error when generating rootCA', error);
      return;
    }
    const certDir = require('path').dirname(keyPath);
    log('The cert is generated at', certDir);
    const isWin = /^win/.test(process.platform);
    exec(isWin ? 'start .' : 'open .', { cwd: certDir });
  });
}

const options = {
  port: PROXY_PORT,
  rule: require('./rule'),
  webInterface: {
    enable: false,
    webPort: WEB_INTERFACE_PORT
  },

  // No throttling by default
  // throttle: 10000,

  // Force parsing of all HTTPS traffic
  forceProxyHttps: true,

  // Do not proxy websockets
  wsIntercept: false,

  silent: true
};

const proxyServer = new AnyProxy.ProxyServer(options);

proxyServer.on('ready', () => {
  const ipAddress = ip.address();
  log(`请配置代理: ${ipAddress}:${PROXY_PORT}`);
  log(`可视化界面: http://localhost:${SERVER_PORT}\n`);
});
proxyServer.on('error', (e) => {
  throw e;
});

// Clear the cached list keys in redis, then start the proxy.
redis('del', POST_LIST_KEY, PROFILE_LIST_KEY).then(() => {
  proxyServer.start();
}).catch((e) => {
  // Without this handler a redis failure was an unhandled rejection and
  // the proxy silently never started.
  console.error('proxy was not started:', e);
});

// Alternative that also clears the two "finished" flags on startup:
// redis('del', POST_LIST_KEY, PROFILE_LIST_KEY, 'profile_finish', 'post_finish').then(() => {
//   proxyServer.start();
// });

// when finished
// proxyServer.close();

require('./server').listen(SERVER_PORT);

// Optional: launch the python script that drives the phone to the history
// page and thereby triggers the crawler.
//
// setTimeout(function() {
//   console.info('等3秒开始运行 python 脚本.');
//   exec('python auto_driver/auto_operate_phone.py', function(err, stdout) {
//     if (err) console.log('stderr', err);
//     if (stdout) console.log('stdout', stdout);
//   });
// }, 3000);

setTimeout(function() {
  console.info('等待appium脚本运行触发微信爬虫...');
}, 3000);
99 |
100 |
--------------------------------------------------------------------------------
/auto_driver/tes.py:
--------------------------------------------------------------------------------
from time import sleep
from redis import *
from appium import webdriver
import os
import subprocess
import sys

desire_caps = {}
desire_caps['platformName'] = 'Android'
# Nokia 6
# desire_caps['platformVersion'] = '8.1.0'
# desire_caps['deviceName'] = 'PL2GAM1810904175'

# Xiaomi 6
# desire_caps['platformVersion'] = '8.0.0'
# desire_caps['deviceName'] = '1231acb'

# Redmi 5
desire_caps['platformVersion'] = '7.1.2'
desire_caps['deviceName'] = '2466979e7cf5'

desire_caps['appPackage'] = 'com.tencent.mm'
desire_caps['appActivity'] = '.ui.LauncherUI'
# Tapping an input box brings up the system input method, which may turn
# typed characters such as `234` into something else (e.g. `bei` on a
# nine-key Chinese keyboard). These two capabilities replace the system
# input method with appium's own; after the run the input method has to be
# switched back manually in the system settings.
desire_caps['unicodeKeyboard'] = True
desire_caps['resetKeyboard'] = True
# Do not reset the app
desire_caps['noReset'] = True
desire_caps["newCommandTimeout"] = 172800  # wait up to 2 days for the next command
# desire_caps['chromeOptions'] = {'androidProcess': 'com.tencent.mm:tools'}

# The address must match `server address` and `port` shown in the settings
# of the appium client on the PC.
driver = webdriver.Remote('http://127.0.0.1:4723/wd/hub', desire_caps)
sleep(6)

# 1. Tap "Contacts"
el_contact = driver.find_elements_by_android_uiautomator("new UiSelector().text(\"通讯录\")")[0]
el_contact.click()

# 2. Tap "Official accounts"
el_public = driver.find_elements_by_android_uiautomator("new UiSelector().text(\"公众号\")")[0]
el_public.click()

# 3. Open the first official account in the list
# driver.tap([[210, 334], [306, 399]], 20)
# el_public = driver.find_elements_by_android_uiautomator("new UiSelector().text(\"差评\")")[0]
el_public = driver.find_element_by_id("com.tencent.mm:id/v9").find_element_by_class_name("android.widget.LinearLayout")
el_public.click()

# 4. Open the message history
el_info = driver.find_element_by_accessibility_id("聊天信息")
el_info.click()

sleep(2)

# driver.swipe(530, 1900, 530, 200, 200)  # scroll to the bottom, 1080p

driver.swipe(350, 1260, 360, 480, 200)  # scroll to the bottom, 720p (Redmi 5)

history = driver.find_elements_by_android_uiautomator("new UiSelector().text(\"全部消息\")")[0]
history.click()


# 5. Tap one article to open its detail page
sleep(20)
# driver.tap([(530, 1300)], 200)  # position of the first article, 1080p
driver.tap([(360, 920)], 200)  # position of the first article

sleep(20)  # grace period
print('[NOTICE]' + ' Appium python script quit')

driver.quit()

print('[NOTICE]' + ' Upload Wechat info to Remote Server')
# Run the uploader that sits next to this file, with the interpreter that
# is running this script. Passing the path as a list element keeps paths
# containing spaces intact (the former `os.system('python %s')` did not).
current_file_path = os.path.dirname(os.path.abspath(__file__))
script_path = os.path.join(current_file_path, 'upload_data.py')
subprocess.call([sys.executable, script_path])
sleep(10)
print('[NOTICE]' + ' Upload Compelete')
79 |
--------------------------------------------------------------------------------
/scripts/checkWechatId.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const models = require('../models');
4 | const CorrectWechatId = require('../utils/correctWechatId');
5 |
// Entry point: apply the manual fixes, then the automatic check, and exit.
// A failure is reported and reflected in the exit code; without the
// try/catch a rejection skipped process.exit() entirely.
(async function start() {
  try {
    await fixWechatId();
    await checkWechatId();
  } catch (e) {
    console.error(e);
    process.exitCode = 1;
  }
  process.exit();
})();
11 |
/**
 * Derive the wechatId of each account (msgBiz) from its posts.
 *
 * Accounts whose posts carry exactly one non-empty wechatId without
 * Chinese characters are passed to CorrectWechatId to update posts and
 * profile. All other accounts are printed so they can be fixed manually.
 */
async function checkWechatId() {
  const posts = await models.Post.find({
    msgBiz: { $exists: true },
    wechatId: { $exists: true }
  }).select('msgBiz wechatId');

  // msgBiz -> set of distinct wechatId values seen on its posts
  const idsByBiz = new Map();
  posts.forEach(post => {
    const { msgBiz, wechatId } = post;
    if (!idsByBiz.has(msgBiz)) idsByBiz.set(msgBiz, new Set());
    idsByBiz.get(msgBiz).add(wechatId);
  });

  // One entry per account; empty wechatId values are dropped up front
  const bizArray = Array.from(idsByBiz, ([msgBiz, ids]) => {
    return { msgBiz: msgBiz, wechatIds: Array.from(ids).filter(id => id) };
  });

  const hasChinese = /[\u4e00-\u9fa5]/;
  // Accounts with exactly one usable wechatId
  const singleWechatIdArray = [];
  // Everything else has to be resolved manually
  const hasToFix = [];
  bizArray.forEach(item => {
    const { msgBiz, wechatIds } = item;
    // Ids containing Chinese characters are not accepted as a wechatId
    if (wechatIds.length === 1 && !hasChinese.test(wechatIds[0])) {
      singleWechatIdArray.push({ msgBiz: msgBiz, wechatId: wechatIds[0] });
    } else {
      hasToFix.push(item);
    }
  });

  // Update the stored records
  for (const item of singleWechatIdArray) {
    const { msgBiz, wechatId } = item;
    const correctRecord = new CorrectWechatId({ msgBiz, wechatId });
    await correctRecord.checkPost();
    await correctRecord.checkProfile();
  }

  hasToFix.forEach(item => {
    console.log('msgBiz:', item.msgBiz);
    console.log('wechatIds', item.wechatIds.join(', '));
    console.log();
  });
}
71 |
/**
 * Manual fixes: apply a hand-maintained list of [wechatId, msgBiz] pairs
 * through CorrectWechatId, one account after another.
 */
async function fixWechatId() {
  const manualFixes = [
    ['st_gov', 'MzI0MDU1NzcxNg=='],
    ['gh_255e073818cb', 'MzI0OTMyMzAyMg=='],
    ['gh_f6346759e760', 'MzI3MjU0ODk5OQ=='],
    ['gh_add771035570', 'MzI3MzExODg1MA=='],
    ['gh_309eed5521e7', 'MzIxODg2NjQ5MQ=='],
    ['gh_541d9f5f914f', 'MzIyMTczMjI1NA=='],
    ['gh_609018f1f8e5', 'MzU2MDExMzI5OQ=='],
    ['gh_6db4b6109707', 'MzU3MjI2NDgxNw=='],
    ['gh_22372daf9473', 'MzA5MDg2MTAxOA=='],
    ['gh_9af59cf5576e', 'MzIzOTE2MTUyNQ==']
  ];

  // Update the stored records; an empty list simply does nothing
  for (const [wechatId, msgBiz] of manualFixes) {
    const record = new CorrectWechatId({ msgBiz, wechatId });
    await record.checkPost();
    await record.checkProfile();
  }
}
97 |
--------------------------------------------------------------------------------
/client/app/index.jsx:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import { render } from 'react-dom';
3 | import { Provider, connect } from 'react-redux';
4 | import { createStore, applyMiddleware } from 'redux';
5 | import reducer from './reducers';
6 | import { Router, Route, IndexRoute } from 'react-router';
7 | import MuiThemeProvider from 'material-ui/styles/MuiThemeProvider';
8 | import AppBar from 'material-ui/AppBar';
9 | import Drawer from 'material-ui/Drawer';
10 | import { List, ListItem } from 'material-ui/List';
11 | import { createHistory, useBasename } from 'history';
12 | import 'bootstrap/dist/css/bootstrap.css';
13 | import 'font-awesome/css/font-awesome.min.css';
14 | import './style/style.css';
15 | const ENV = process.env.NODE_ENV || 'development';
16 | const BASE_URI = '/';
17 |
18 | import thunkMiddleware from 'redux-thunk';
19 | import createLogger from 'redux-logger';
20 |
21 | let reduxMiddlewares = [thunkMiddleware];
22 | if (ENV === 'development') {
23 | reduxMiddlewares.push(createLogger);
24 | }
25 | let store = createStore(
26 | reducer,
27 | applyMiddleware(...reduxMiddlewares)
28 | );
29 |
30 | import Posts from './containers/posts.jsx';
31 | import Profiles from './containers/profiles.jsx';
32 | import Profile from './containers/profile.jsx';
33 | import Categories from './containers/categories.jsx';
34 |
35 | class App extends React.Component {
36 |
37 | constructor(props) {
38 | super(props);
39 | }
40 |
41 | render() {
42 | let { history } = this.props;
43 | return (
44 |
45 |
46 |
47 | { history.push('/'); }} />
48 |
49 | { history.push('/posts'); }} />
50 | { history.push('/profiles'); }} />
51 | { history.push('/categories'); }} />
52 |
53 |
54 |
55 | {this.props.children}
56 |
57 |
58 |
59 | );
60 | }
61 | }
62 |
63 | const connectedApp = connect(state => state)(App);
64 |
65 | const browserHistory = useBasename(createHistory)({
66 | basename: BASE_URI
67 | });
68 |
69 | render(
70 | (
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 | ),
83 | document.getElementById('app')
84 | );
85 |
--------------------------------------------------------------------------------
/client/app/actions.js:
--------------------------------------------------------------------------------
1 | import config from './config';
2 |
/**
 * Build a URL from a path template and a params object.
 *
 * For every key in `params`: if the path contains the placeholder `:key`
 * it is replaced with the value. Otherwise, for GET requests only, the
 * pair is appended to the query string. Every key that was used is
 * deleted from `params`, so the caller's object is modified in place.
 *
 * @param {string} [path] - path template, e.g. `/profile/:id`
 * @param {Object} [params] - placeholder and query values
 * @param {string} [method] - HTTP method, defaults to GET
 * @returns {string} the assembled path
 */
export function assembleUrl(path, params, method) {
  let url = path || '';
  const args = params || {};
  const isGet = (method ? method.toLowerCase() : 'get') === 'get';

  Object.keys(args).forEach(function(key) {
    const filled = url.replace(`:${key}`, args[key]);

    if (filled !== url) {
      // The placeholder was present and has been substituted.
      delete args[key];
      url = filled;
      return;
    }

    // No placeholder: only GET requests carry the value in the query string.
    if (!isGet) return;

    const separator = url.indexOf('?') === -1 ? '?' : '&';
    url = `${url}${separator}${key}=${args[key]}`;
    delete args[key];
  });

  return url;
}
25 |
export const REQUEST_POSTS = 'REQUEST_POSTS';
export const RECEIVE_POSTS = 'RECEIVE_POSTS';

/** Action: a posts request has been started. */
export function requestPosts() {
  return { type: REQUEST_POSTS };
}

/** Action: the posts payload has arrived. */
export function receivePosts(posts) {
  return { type: RECEIVE_POSTS, posts };
}

/**
 * Thunk: request the posts list for `query` and dispatch the result.
 * The URL is assembled when the thunk is created, not when it runs.
 */
export function fetchPosts(query) {
  const url = assembleUrl(config.posts, query);
  return dispatch => {
    dispatch(requestPosts());
    return fetch(url)
      .then(res => res.json())
      .then(posts => {
        dispatch(receivePosts(posts));
      });
  };
}
52 |
export const REQUEST_PROFILES = 'REQUEST_PROFILES';
export const RECEIVE_PROFILES = 'RECEIVE_PROFILES';

/** Action: a profiles request has been started. */
export function requestProfiles() {
  return { type: REQUEST_PROFILES };
}

/** Action: the profiles payload has arrived. */
export function receiveProfiles(profiles) {
  return { type: RECEIVE_PROFILES, profiles };
}

/**
 * Thunk: request the profiles list for `query` and dispatch the result.
 * The URL is assembled when the thunk is created, not when it runs.
 */
export function fetchProfiles(query) {
  const url = assembleUrl(config.profiles, query);
  return dispatch => {
    dispatch(requestProfiles());
    return fetch(url)
      .then(res => res.json())
      .then(profiles => {
        dispatch(receiveProfiles(profiles));
      });
  };
}
79 |
export const REQUEST_PROFILE = 'REQUEST_PROFILE';
export const RECEIVE_PROFILE = 'RECEIVE_PROFILE';

/** Action: a request for the profile with the given id has been started. */
export function requestProfile(id) {
  return { type: REQUEST_PROFILE, id };
}

/** Action: the profile payload has arrived. */
export function receiveProfile(profile) {
  return { type: RECEIVE_PROFILE, profile };
}

/** Thunk: request a single profile by id and dispatch the result. */
export function fetchProfile(id) {
  return dispatch => {
    dispatch(requestProfile(id));
    const url = `${config.profile}/${id}`;
    return fetch(url)
      .then(res => res.json())
      .then(profile => {
        dispatch(receiveProfile(profile));
      });
  };
}
106 |
export const REQUEST_CATES = 'REQUEST_CATES';
export const RECEIVE_CATES = 'RECEIVE_CATES';

/** Action: a categories request has been started. */
export function requestCates() {
  return { type: REQUEST_CATES };
}

/** Action: the categories payload has arrived. */
export function receiveCates(cates) {
  return { type: RECEIVE_CATES, cates };
}

/** Thunk: request all categories and dispatch the result. */
export function fetchCates() {
  return dispatch => {
    dispatch(requestCates());
    return fetch(config.cates)
      .then(res => res.json())
      .then(cates => {
        dispatch(receiveCates(cates));
      });
  };
}
132 |
--------------------------------------------------------------------------------
/auto_driver/auto_operate_phone.py:
--------------------------------------------------------------------------------
from time import sleep
from redis import *
from appium import webdriver
import os
import subprocess
import sys


def wait_for_flag(client, key, poll_seconds=1):
    """Block until the redis value at `key` equals b'1', then delete the key.

    The flag is presumably set by the node crawler when a stage is done --
    TODO confirm against the node code.
    """
    while True:
        sleep(poll_seconds)  # poll redis once per interval
        if client.get(key) == b'1':
            client.delete(key)
            return


desire_caps = {}
desire_caps['platformName'] = 'Android'
# Nokia 6
# desire_caps['platformVersion'] = '8.1.0'
# desire_caps['deviceName'] = 'PL2GAM1810904175'

# Xiaomi 6
# desire_caps['platformVersion'] = '8.0.0'
# desire_caps['deviceName'] = '1231acb'

# Redmi 5
desire_caps['platformVersion'] = '7.1.2'
desire_caps['deviceName'] = '2466979e7cf5'

desire_caps['appPackage'] = 'com.tencent.mm'
desire_caps['appActivity'] = '.ui.LauncherUI'
# Tapping an input box brings up the system input method, which may turn
# typed characters such as `234` into something else (e.g. `bei` on a
# nine-key Chinese keyboard). These two capabilities replace the system
# input method with appium's own; after the run the input method has to be
# switched back manually in the system settings.
desire_caps['unicodeKeyboard'] = True
desire_caps['resetKeyboard'] = True
# Do not reset the app
desire_caps['noReset'] = True
desire_caps["newCommandTimeout"] = 172800  # wait up to 2 days for the next command
# desire_caps['chromeOptions'] = {'androidProcess': 'com.tencent.mm:tools'}

# The address must match `server address` and `port` shown in the settings
# of the appium client on the PC.
driver = webdriver.Remote('http://127.0.0.1:4723/wd/hub', desire_caps)

sleep(10)  # wait for the contacts tab to load

# 1. Tap "Contacts"
el_contact = driver.find_elements_by_android_uiautomator("new UiSelector().text(\"通讯录\")")[0]
el_contact.click()

# 2. Tap "Official accounts"
el_public = driver.find_elements_by_android_uiautomator("new UiSelector().text(\"公众号\")")[0]
el_public.click()

# 3. Open the first official account in the list
# driver.tap([[210, 334], [306, 399]], 20)
# el_public = driver.find_elements_by_android_uiautomator("new UiSelector().text(\"差评\")")[0]
el_public = driver.find_element_by_id("com.tencent.mm:id/v9").find_element_by_class_name("android.widget.LinearLayout")
el_public.click()

# 4. Open the message history
el_info = driver.find_element_by_accessibility_id("聊天信息")
el_info.click()


sleep(3)
# driver.swipe(530, 1900, 530, 200, 200)  # scroll to the bottom, 1080p
driver.swipe(350, 1260, 360, 480, 200)  # scroll to the bottom, 720p (Redmi 5)

history = driver.find_elements_by_android_uiautomator("new UiSelector().text(\"全部消息\")")[0]
history.click()

# Wait 10 seconds before starting to poll redis for the end of the
# account-list crawl (the old comment said 100 seconds; the code sleeps 10).
sleep(10)

# ------------------------------------------
# a. Poll redis until the account list has been crawled
#    (redis key: 'profile_finish')
sr = StrictRedis()
wait_for_flag(sr, 'profile_finish')
print('[NOTICE]' + ' Appium will click detail page')

# 5. Tap one article to open its detail page
sleep(10)
# driver.tap([(530, 1300)], 200)  # position of the first article, 1080p
print('[NOTICE]' + 'click a detail')
driver.tap([(360, 920)], 200)  # position of the first article, Redmi

# b. Poll redis again until all articles have been crawled
#    (redis key: 'post_finish')
wait_for_flag(sr, 'post_finish')
print('[NOTICE]' + ' Finish crawl')

sleep(10)  # grace period
print('[NOTICE]' + ' Appium python script quit')

driver.quit()

print('[NOTICE]' + ' Upload Wechat info to Remote Server')
# Run the uploader that sits next to this file, with the interpreter that
# is running this script. Passing the path as a list element keeps paths
# containing spaces intact (the former `os.system('python %s')` did not).
current_file_path = os.path.dirname(os.path.abspath(__file__))
script_path = os.path.join(current_file_path, 'upload_data.py')
subprocess.call([sys.executable, script_path])
print('[NOTICE]' + ' Upload Compelete')

print('[NOTICE]' + ' 60s later close')
sleep(60)
--------------------------------------------------------------------------------
/client/app/containers/profiles.jsx:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import { connect } from 'react-redux';
3 | import { fetchProfiles } from '../actions';
4 | import Loading from '../components/loading.jsx';
5 | import moment from 'moment';
6 | import Paginator from '../components/paginator.jsx';
7 | import { Link } from 'react-router';
8 | import Search from './search.jsx';
9 |
10 | class Profiles extends React.Component {
11 |
12 | constructor(props) {
13 | super(props);
14 | this.returnCurrentSearchArgs = this.returnCurrentSearchArgs.bind(this);
15 | }
16 |
17 | componentDidMount() {
18 | let { dispatch, location } = this.props;
19 | dispatch(fetchProfiles(location.query));
20 | }
21 |
22 | componentWillReceiveProps(nextProps) {
23 | if (nextProps.location.search !== this.props.location.search) {
24 | let { dispatch } = this.props;
25 | dispatch(fetchProfiles(nextProps.location.query));
26 | }
27 | }
28 |
29 | returnCurrentSearchArgs() {
30 | const { location } = this.props;
31 | const { search } = location;
32 | const searchArgs = {};
33 | search.replace('?', '').split('&').forEach(item => {
34 | let key = item.split('=')[0];
35 | let value = item.replace(`${key}=`, '');
36 | if (key && value) searchArgs[key] = value;
37 | });
38 | return searchArgs;
39 | }
40 |
41 | render() {
42 | let { isFetching, profiles, history, location } = this.props;
43 | let { search, pathname } = location;
44 | if (isFetching || !profiles.data) return ;
45 | let metadata = profiles.metadata;
46 | return (
47 |
48 |
54 |
55 |
56 |
57 | | 更新时间 |
58 | 头像 |
59 | 公众号 |
60 | 最新 |
61 | 最旧 |
62 | 文章数 |
63 | 有数据 |
64 | 差 |
65 | MsgBiz |
66 | Detail |
67 |
68 |
69 |
70 | {
71 | profiles.data.map(profile => {
72 | return (
73 |
74 | | {profile.openHistoryPageAt ? moment(profile.openHistoryPageAt).format('YY-MM-DD HH:mm') : ''} |
75 |  |
76 | {profile.title} |
77 | {profile.newestPostTime ? moment(profile.newestPostTime).format('YY-MM-DD'): ''} |
78 | {profile.oldestPostTime ? moment(profile.oldestPostTime).format('YY-MM-DD'): ''} |
79 | {profile.postsAllCount} |
80 | {profile.postsHasDataCount} |
81 | {profile.postsAllCount - profile.postsHasDataCount} |
82 | {profile.msgBiz} |
83 | detail |
84 |
85 | );
86 | })
87 | }
88 |
89 |
90 |
91 |
92 | );
93 | }
94 | }
95 |
96 | export default connect(state => state)(Profiles);
97 |
--------------------------------------------------------------------------------
/auto_driver/upload_data.py:
--------------------------------------------------------------------------------
from pymongo import *
from config import *
import datetime
import os
import requests
import json

# Offset added to every stored timestamp before upload ("convert to local
# time" in the original comments, i.e. +8 hours).
LOCAL_OFFSET = datetime.timedelta(hours=8)


def localize(doc, *fields):
    """Shift the given datetime fields of `doc` by LOCAL_OFFSET and turn them into strings.

    Works in place. Fields that are missing or do not hold a datetime are
    left untouched, so documents lacking an optional timestamp (for example
    a profile without `openHistoryPageAt`) no longer raise KeyError.
    """
    for field in fields:
        value = doc.get(field)
        if isinstance(value, datetime.datetime):
            doc[field] = str(value + LOCAL_OFFSET)


mongo_client = MongoClient(host=MONGO['host'], port=MONGO['port'])
wechat_db = mongo_client.get_database(WECHAT_DB_NAME)

posts_col = wechat_db.get_collection(POST_COL)  # posts collection
profile_col = wechat_db.get_collection(PROFILE_COL)  # official-account collection
comment_col = wechat_db.get_collection(COMMENTS_COL)  # comments collection

# db.getCollection('posts').find({'updateNumAt':{$gte:ISODate('2018-06-11 T00:00:00.00Z')}})
# 2018-06-12 09:56:39.989755
# 2017-04-11 T00:00:00.00Z

# First ask the remote side how much data it already holds. If any of the
# counts is 0 this is treated as the first upload and everything is sent;
# otherwise only documents updated today are sent.
count_dict = requests.get('%s/if_allupload_wechat' % REMOTE_HOST)

counts = json.loads(count_dict.content.decode('utf-8'))

print(counts)

if counts['post_count'] == 0 or counts['comment_count'] == 0 or counts['profile_count'] == 0:
    # Upload everything
    post_res = posts_col.find()
    comment_res = comment_col.find()
    profile_res = profile_col.find()

else:
    # NOTE(review): this is midnight of the machine's local date, compared
    # against the stored `updatedAt` values -- confirm the timezone handling.
    now = datetime.datetime.now()
    today_start = datetime.datetime(now.year, now.month, now.day)
    updated_today = {'updatedAt': {'$gte': today_start}}
    post_res = posts_col.find(updated_today)
    comment_res = comment_col.find(updated_today)
    # Profiles get a new updatedAt almost every day, so this is effectively
    # the same as profile_col.find() above.
    profile_res = profile_col.find(updated_today)

# a. Posts
post_list = []
for item in post_res:
    if 'content' in item:
        item['_id'] = str(item['_id'])
        localize(item, 'createdAt', 'publishAt', 'updatedAt', 'updateNumAt')
        post_list.append(item)
    else:
        print(item.get('title'), ' | 没有爬到正文')

# b. Comments
comment_list = []
for item in comment_res:
    item['_id'] = str(item['_id'])
    item['postId'] = str(item['postId'])
    # The ids of the replies of each comment have to be stringified as well
    for reply in item.get('replies') or []:
        reply['_id'] = str(reply['_id'])
        localize(reply, 'createTime')

    localize(item, 'createTime', 'createdAt', 'updatedAt')
    comment_list.append(item)

# c. Official accounts
profile_list = []
for item in profile_res:
    item['_id'] = str(item['_id'])
    localize(item, 'createdAt', 'openHistoryPageAt', 'updatedAt')
    profile_list.append(item)


# Shared secret checked by the receiving API so that a leaked endpoint
# cannot be used to inject data.
# SECURITY: the fallback below is committed to the repository; set
# WECHAT_UPLOAD_SECRET in the environment and rotate the key on the server.
secret_key = os.environ.get('WECHAT_UPLOAD_SECRET', '234sf3gsbx443gsgsrts34gsd43tsd')

final_dict = dict(
    secret_key=secret_key,
    post_list=post_list,
    comment_list=comment_list,
    profile_list=profile_list
)

print('公众号数量:{}\r\n文章数量:{}\r\n评论数量:{}\r\n'.format(len(profile_list), len(post_list), len(comment_list)))

json_data = json.dumps(final_dict, ensure_ascii=False)

# Send the newly crawled data to the server API
headers = {'Content-Type': 'application/json'}
response = requests.post('%s/receive_wechat' % REMOTE_HOST, headers=headers, data=json_data.encode('utf-8'))

print('已向远程主机发出数据')
if not response.ok:
    # Previously a rejected upload went unnoticed.
    print('[ERROR] remote host answered with HTTP %s' % response.status_code)
--------------------------------------------------------------------------------
/server/api.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const express = require('express');
4 | const api = express();
5 | const config = require('../config').insertJsToNextProfile;
6 | const models = require('../models');
7 | const { Category, Profile, Post } = models;
8 |
/**
 * Adapts a (possibly async) route handler to Express-style error handling.
 *
 * The handler is invoked with the same `this`, `req`, `res` and `next` that
 * the returned middleware receives. A rejected promise or a synchronous throw
 * is forwarded to `next`. Handlers that return a plain value (or nothing) are
 * accepted as well instead of failing on a missing `.catch`.
 *
 * @param {Function} fn - Handler called as `fn(req, res, next)`.
 * @returns {Function} Middleware with the `(req, res, next)` signature.
 */
function wrap(fn) {
  return function(req, res, next) {
    try {
      // Promise.resolve() tolerates handlers that do not return a promise.
      Promise.resolve(fn.call(this, req, res, next)).catch(next);
    } catch (err) {
      next(err);
    }
  };
}
14 |
/**
 * GET /posts — paginated list of posts that have a title.
 *
 * Query parameters:
 *   q        - case-insensitive substring to look for in the title
 *   target   - 'true' restricts results to `config.targetBiz`
 *   mainData - 'true' / 'false' keeps posts with / without `readNum`
 *   msgBiz   - restricts results to one account (overrides `target`)
 *   category - category id; when found, its `msgBizs` override `msgBiz`
 *   sortWay  - 'publishAt', '-publishAt', 'updateNumAt' or '-updateNumAt'
 * The whole query object is also handed to `paginate()`.
 *
 * Responds with `{ metadata, data }`; failures are forwarded to the error
 * middleware through `wrap`.
 */
api.get('/posts', wrap(async (req, res) => {
  const { target, mainData, msgBiz, category, sortWay, q } = req.query;

  const query = { title: { $exists: true } };

  if (q) {
    // Escape regex metacharacters so user input is matched literally; a raw
    // `new RegExp(q)` throws on searches such as "c++" and lets callers
    // submit arbitrary patterns.
    const escaped = String(q).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    query.title = new RegExp(escaped, 'i');
  }
  if (target === 'true') {
    query.msgBiz = { $in: config.targetBiz };
  }
  if (mainData === 'true') {
    query.readNum = { $exists: true };
  }
  if (mainData === 'false') {
    query.readNum = { $exists: false };
  }
  if (msgBiz) {
    query.msgBiz = msgBiz;
  }

  // Recognised sort keys; anything else falls back to newest-first.
  const sortWays = new Map([
    ['-updateNumAt', { updateNumAt: -1 }],
    ['updateNumAt', { updateNumAt: 1 }],
    ['-publishAt', { publishAt: -1, msgIdx: 1 }],
    ['publishAt', { publishAt: 1, msgIdx: 1 }]
  ]);
  const sortWayResult = sortWays.get(sortWay) || { publishAt: -1, msgIdx: 1 };

  if (category) {
    const found = await Category.findOne({ _id: category });
    // An unknown category id leaves the msgBiz filter untouched.
    if (found) query.msgBiz = { $in: found.msgBizs };
  }

  const result = await Post.find(query)
    .sort(sortWayResult)
    .populate('profile')
    .paginate(req.query);

  const data = result.data;
  const metadata = {
    options: result.options,
    perPage: result.options.perPage,
    currentPage: result.current,
    next: result.next,
    prev: result.prev,
    totalPages: result.totalPages,
    count: result.count
  };
  res.json({
    metadata,
    data
  });
}));
87 |
/**
 * GET /profiles — paginated list of accounts, newest history visit first.
 *
 * Query parameters:
 *   q        - case-insensitive substring to look for in the title
 *   target   - 'true' restricts results to `config.targetBiz`
 *   category - category id; when found, its `msgBizs` replace the msgBiz filter
 * The whole query object is also handed to `paginate()`.
 *
 * Each returned profile is augmented (on `_doc`) with `postsAllCount`,
 * `postsHasDataCount`, and, when at least one dated post exists,
 * `newestPostTime` and `oldestPostTime`.
 *
 * Responds with `{ metadata, data }`. Every query is awaited, so a failing
 * per-profile lookup reaches the error middleware instead of leaving the
 * request without a response.
 */
api.get('/profiles', wrap(async (req, res) => {
  const { target, category, q } = req.query;

  const query = {};
  if (target === 'true') {
    query.msgBiz = { $in: config.targetBiz };
  }
  if (q) {
    // Escape regex metacharacters so user input is matched literally.
    const escaped = String(q).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    query.title = new RegExp(escaped, 'i');
  }
  if (category) {
    const found = await Category.findOne({ _id: category });
    // An unknown category id leaves the msgBiz filter untouched.
    if (found) query.msgBiz = { $in: found.msgBizs };
  }

  const result = await Profile.find(query)
    .sort({ openHistoryPageAt: -1 })
    .paginate(req.query);

  const data = result.data;
  const metadata = {
    options: result.options,
    perPage: result.options.perPage,
    currentPage: result.current,
    next: result.next,
    prev: result.prev,
    totalPages: result.totalPages,
    count: result.count
  };

  await Promise.all(data.map(item => {
    const { msgBiz } = item;
    const dated = { msgBiz, publishAt: { $exists: true } };
    return Promise.all([
      Post.count({ msgBiz }).then(count => {
        item._doc.postsAllCount = count;
      }),
      Post.count({ msgBiz, readNum: { $exists: true } }).then(count => {
        item._doc.postsHasDataCount = count;
      }),
      Post.find(dated).sort({ publishAt: -1 }).limit(1).then(posts => {
        if (!posts.length) return;
        item._doc.newestPostTime = posts[0].publishAt;
      }),
      Post.find(dated).sort({ publishAt: 1 }).limit(1).then(posts => {
        if (!posts.length) return;
        item._doc.oldestPostTime = posts[0].publishAt;
      })
    ]);
  }));

  res.json({
    metadata,
    data
  });
}));
149 |
/**
 * GET /profile/:id — returns one profile as a plain object.
 *
 * The stored `_id` is exposed as `id` and `__v` is dropped. Responds with
 * 404 when no profile has the given id; other failures are forwarded to the
 * error middleware through `wrap`.
 */
api.get('/profile/:id', wrap(async (req, res) => {
  const { id } = req.params;
  const found = await Profile.findById(id);
  if (!found) {
    // findById resolves with null for an unknown id; calling toObject() on
    // it would surface as an unrelated TypeError.
    res.status(404).send('公众号不存在');
    return;
  }

  // eslint-disable-next-line
  const { _id, __v, ...newProfile } = found.toObject();
  res.json({ id: _id, ...newProfile });
}));
160 |
/**
 * PUT /profile/:id — sets the `property` field of one profile.
 *
 * Expects the new value in the `property` query parameter and answers with
 * the text `ok`. A missing parameter is reported as an error through `wrap`.
 */
api.put('/profile/:id', wrap(async (req, res) => {
  const profileId = req.params.id;
  const newProperty = req.query.property;

  if (!newProperty) {
    throw new Error('请传入property参数');
  }

  await Profile.findByIdAndUpdate(profileId, { property: newProperty });
  res.send('ok');
}));
169 |
/**
 * POST /categories — creates a category.
 *
 * Query parameters:
 *   name    - category name, must not already exist
 *   msgBizs - comma-separated list of account msgBiz values
 *
 * Answers 201 on success. Missing parameters and duplicate names are
 * reported as errors through `wrap`; a rejected request never also receives
 * the success response.
 */
api.post('/categories', wrap(async (req, res) => {
  const { name, msgBizs } = req.query;
  if (!name || !msgBizs) throw new Error('请传入正确的参数');

  const existing = await Category.findOne({ name });
  // Throwing stops the handler here; the previous promise chain kept going
  // and sent 201 after having already reported the error.
  if (existing) throw new Error('已存在同名称分类');

  const category = new Category({
    name,
    // String() also accepts a repeated `msgBizs` parameter (parsed as array).
    msgBizs: String(msgBizs).split(',')
  });
  await category.save();

  res.status(201).send('创建分类成功');
}));
187 |
/**
 * GET /categories — returns every category with its `profiles` populated.
 * Failures are forwarded to the error middleware.
 */
api.get('/categories', wrap(async (req, res) => {
  const allCategories = await Category.find({}).populate('profiles');
  res.json(allCategories);
}));
195 |
196 | module.exports = api;
197 |
--------------------------------------------------------------------------------
/client/app/components/Paginator.jsx:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import PropTypes from 'prop-types';
3 | import classnames from 'classnames';
4 | import assign from 'lodash.assign';
5 | import get from 'lodash/get';
6 | import set from 'lodash/set';
7 | import { assembleUrl } from '../actions';
8 |
9 | class Paginator extends React.Component {
10 | static get propTypes() {
11 | return {
12 | // action: PropTypes.func.isRequired,
13 | // dispatch: PropTypes.func.isRequired,
14 | action: PropTypes.func,
15 | dispatch: PropTypes.func,
16 | currentPage: PropTypes.number,
17 | perPage: PropTypes.number,
18 | totalPages: PropTypes.number,
19 | pager: PropTypes.number,
20 | query: PropTypes.object,
21 | onChange: PropTypes.func
22 | };
23 | }
24 |
25 | constructor(props) {
26 | super(props);
27 | this.state = {
28 | perPage: props.perPage || 10,
29 | Page: 1
30 | };
31 | }
32 |
33 | getValue(name) {
34 | return get(this.state, name);
35 | }
36 |
37 | handleChange(name, callback) {
38 | return (e) => {
39 | let self = this;
40 | if (e.target.type === 'file') {
41 | if (!e.target.files.length) return;
42 | let file = e.target.files[0];
43 | this.handleFileUpload(file).then(function(data) {
44 | let newState = assign({}, self.state);
45 | set(newState, name, data.data.id);
46 | self.setState(newState, callback);
47 | });
48 | } else {
49 | let newState = assign({}, self.state);
50 | set(newState, name, e.target.value);
51 | self.setState(newState, callback);
52 | }
53 | };
54 | }
55 |
56 | loadPage(page, overwrite) {
57 | return () => {
58 | let { query, onChange, pathname, search, history } = this.props;
59 | let { perPage } = this.state;
60 | if (perPage > 50) perPage = 50;
61 | if (search && search.indexOf('?') === 0) {
62 | let searchObj = {};
63 | search.replace('?', '').split('&').forEach(item => {
64 | let key = item.split('=')[0];
65 | let value = item.replace(`${key}=`, '');
66 | searchObj[key] = value;
67 | });
68 | query = assign({}, query, searchObj);
69 | }
70 | query = assign({}, query, { page, perPage });
71 | if (overwrite) query._overwrite = true;
72 |
73 | if (typeof onChange === 'function') onChange(page, perPage);
74 |
75 | // dispatch(action(query));
76 | let path = assembleUrl(pathname, query);
77 | history.push(path);
78 | };
79 | }
80 |
81 | renderPage(obj) {
82 | obj = obj || {};
83 | return (
84 |
85 |
86 | { obj.name || '' }
87 |
88 |
89 | );
90 | }
91 |
92 | handlePerPageChange() {
93 | const { currentPage } = this.props;
94 | if (this._isRefreshingOnPerPageChange) clearTimeout(this._isRefreshingOnPerPageChange);
95 |
96 | this._isRefreshingOnPerPageChange = setTimeout(() => {
97 | this.loadPage(currentPage, true)();
98 | }, 300);
99 | }
100 |
101 | changePage(e) {
102 | const { totalPages } = this.props;
103 | let Page = e.target.previousSibling.value||1;
104 | if(Page>totalPages) Page = totalPages;
105 | if(Page<1) Page = 1;
106 |
107 | this.loadPage(Page)();
108 | }
109 |
110 | handleChangePage(e) {
111 | const { totalPages } = this.props;
112 | let value = e.target.value;
113 | if(value > totalPages) value = totalPages;
114 | if(value < 1) value = '';
115 | this.setState({Page: value});
116 | }
117 |
118 | render() {
119 | let self = this;
120 | let { currentPage, totalPages, pager, count } = this.props;
121 | currentPage = currentPage || 1;
122 | totalPages = totalPages || 1;
123 | pager = pager || 5;
124 | let minPage = currentPage - pager;
125 | let maxPage = currentPage + pager;
126 |
127 | function renderMiddlePages() {
128 | let pages = [];
129 | for (let i = 1; i <= totalPages; i++) {
130 | if (i > minPage && i < maxPage) {
131 | pages.push(self.renderPage({ active: currentPage == i, page: i, name: i }));
132 | }
133 | }
134 |
135 | return pages;
136 | }
137 |
138 | if (totalPages == 1) return null;
139 |
140 | return (
141 |
172 | );
173 | }
174 | }
175 |
176 | export default Paginator;
177 |
--------------------------------------------------------------------------------
/client/app/containers/posts.jsx:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import { connect } from 'react-redux';
3 | import { fetchPosts, assembleUrl } from '../actions';
4 | import Loading from '../components/loading.jsx';
5 | import Paginator from '../components/paginator.jsx';
6 | import RaisedButton from 'material-ui/RaisedButton';
7 | import moment from 'moment';
8 | import { Link } from 'react-router';
9 | import Search from './search.jsx';
10 |
/**
 * Formats the distance between two moments with the coarsest fitting unit:
 * days below 31 days, months below 13 months, years otherwise.
 *
 * @param {*} update - Later point in time (anything `moment()` accepts).
 * @param {*} publish - Earlier point in time.
 * @returns {string} Amount followed by 天, 月 or 年.
 */
function timeDiff(update, publish) {
  const later = moment(update);
  const earlier = moment(publish);

  // [unit, exclusive upper bound, suffix], tried from finest to coarsest.
  const steps = [
    ['days', 31, '天'],
    ['months', 13, '月']
  ];
  for (const [unit, limit, suffix] of steps) {
    const amount = later.diff(earlier, unit);
    if (amount < limit) return `${amount}${suffix}`;
  }
  return `${later.diff(earlier, 'years')}年`;
}
21 |
22 | class Posts extends React.Component {
23 |
24 | constructor(props) {
25 | super(props);
26 | this.sortByTime = this.sortByTime.bind(this);
27 | this.judeMainDataShow = this.judeMainDataShow.bind(this);
28 | this.returnCurrentSearchArgs = this.returnCurrentSearchArgs.bind(this);
29 | }
30 |
31 | componentDidMount() {
32 | let { dispatch, location } = this.props;
33 | dispatch(fetchPosts(location.query));
34 | }
35 |
36 | componentWillReceiveProps(nextProps) {
37 | if (nextProps.location.search !== this.props.location.search) {
38 | let { dispatch } = this.props;
39 | dispatch(fetchPosts(nextProps.location.query));
40 | }
41 | }
42 |
43 | returnCurrentSearchArgs() {
44 | const { location } = this.props;
45 | const { search } = location;
46 | const searchArgs = {};
47 | search.replace('?', '').split('&').forEach(item => {
48 | let key = item.split('=')[0];
49 | let value = item.replace(`${key}=`, '');
50 | if (key && value) searchArgs[key] = value;
51 | });
52 | return searchArgs;
53 | }
54 |
55 | sortByTime(sortType) {
56 | const { location, history } = this.props;
57 | const { search, pathname } = location;
58 | const searchArgs = this.returnCurrentSearchArgs();
59 | let iconClass = 'fa-sort';
60 | let nextSortType = `-${sortType}`;
61 | if (search && search.indexOf('?') === 0) {
62 | if (searchArgs.sortWay) {
63 | if (searchArgs.sortWay === sortType) {
64 | iconClass = 'fa-sort-asc';
65 | nextSortType = `-${sortType}`;
66 | }
67 | if (searchArgs.sortWay === `-${sortType}`) {
68 | iconClass = 'fa-sort-desc';
69 | nextSortType = sortType;
70 | }
71 | }
72 | }
73 | const nextQuery = Object.assign({}, searchArgs, {
74 | sortWay: nextSortType
75 | });
76 | const path = assembleUrl(pathname, nextQuery);
77 | return ( { history.push(path); }} className={`fa ${iconClass}`}>);
78 | }
79 |
80 | judeMainDataShow(key) {
81 | const searchArgs = this.returnCurrentSearchArgs();
82 | const mainDataVal = searchArgs.mainData;
83 | const primary = { primary: true };
84 | if (key === 'all' && !mainDataVal) return primary;
85 | if (key === 'yes' && mainDataVal === 'true') return primary;
86 | if (key === 'no' && mainDataVal === 'false') return primary;
87 | return null;
88 | }
89 |
90 | renderFilter() {
91 | const { location, history, posts } = this.props;
92 | const { pathname } = location;
93 | const searchArgs = this.returnCurrentSearchArgs();
94 | const style = {
95 | margin: '10px 15px 10px 0'
96 | };
97 | const { metadata } = posts;
98 | let count;
99 | if (metadata) count = metadata.count;
100 | return (
101 |
102 | {
103 | const nextQuery = { ...searchArgs };
104 | delete nextQuery.mainData;
105 | const path = assembleUrl(pathname, nextQuery);
106 | history.push(path);
107 | }} label="全部数据" style={style} />
108 | {
109 | const nextQuery = { ...searchArgs, mainData: 'true' };
110 | const path = assembleUrl(pathname, nextQuery);
111 | history.push(path);
112 | }} label="有阅读量" style={style} />
113 | {
114 | const nextQuery = { ...searchArgs, mainData: 'false' };
115 | const path = assembleUrl(pathname, nextQuery);
116 | history.push(path);
117 | }} label="无阅读量" style={style} />
118 | { !count ? '' : 共{count}条数据 }
119 |
120 | );
121 | }
122 |
123 | render() {
124 | let { isFetching, posts, history, location } = this.props;
125 | let { search, pathname } = location;
126 | if (isFetching || !posts.data) return ;
127 | let metadata = posts.metadata;
128 | return (
129 |
130 | {this.renderFilter()}
131 |
137 |
138 |
139 |
140 | | 发布时间 {this.sortByTime('publishAt')} |
141 | 文章标题 |
142 | 位置 |
143 | 阅读数 |
144 | 点赞数 |
145 | 更新时间 {this.sortByTime('updateNumAt')} |
146 | 时间间隔 |
147 | 公众号 |
148 |
149 |
150 |
151 | {
152 | posts.data.map(post => {
153 | return (
154 |
155 | | {moment(post.publishAt).format('YY-MM-DD HH:mm')} |
156 | {post.title.substr(0, 25)} |
157 | {post.msgIdx} |
158 | {post.readNum >= 0 ? post.readNum : ''} |
159 | {post.likeNum >= 0 ? post.likeNum : ''} |
160 | {post.updateNumAt ? moment(post.updateNumAt).format('YY-MM-DD HH:mm') : ''} |
161 | {post.updateNumAt ? timeDiff(post.updateNumAt, post.publishAt) : ''} |
162 | {post.profile ? ( {post.profile.title}) : post.msgBiz} |
163 |
164 | );
165 | })
166 | }
167 |
168 |
169 |
170 |
171 | );
172 | }
173 | }
174 |
175 | export default connect(state => state)(Posts);
176 |
--------------------------------------------------------------------------------
/utils/exportData.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const models = require('../models');
4 | const json2csv = require('json2csv');
5 | const moment = require('moment');
6 |
7 | const profileMap = {
8 | 公众号: 'title',
9 | 公众号ID: 'wechatId',
10 | 公众号属性: 'property'
11 | };
12 |
13 | const postMap = {
14 | msgBiz: 'msgBiz',
15 | 标题: 'title',
16 | 链接: 'link',
17 | 发布时间: 'publishAt',
18 | 发布位置: 'msgIdx',
19 | 阅读量: 'readNum',
20 | 点赞量: 'likeNum',
21 | 摘要: 'digest',
22 | 封面: 'cover',
23 | 内容: 'content',
24 | 阅读原文: 'sourceUrl'
25 | };
26 |
27 | module.exports = class ExportData {
28 |
29 | constructor(options = {}) {
30 | const { msgBiz, category } = options;
31 | this.msgBiz = [];
32 | this.category = [];
33 | this.shouldGetMsgBiz = false;
34 | this.bizCategoryNameMap = {};
35 |
36 | if (msgBiz) this.msgBiz = this.msgBiz.concat(msgBiz);
37 | if (category) this.category = this.category.concat(category);
38 | if (this.msgBiz.length === 0 && this.category.length === 0) throw new Error('请传入参数');
39 | if (this.category.length > 0) this.shouldGetMsgBiz = true;
40 | }
41 |
42 | /**
43 | * 导出为json字符串
44 | * @param {Date} minDate
45 | * @param {Date} maxDate
46 | * @param {Object} options
47 | * @return {String}
48 | * @api public
49 | */
50 | async toJson(minDate, maxDate, options = {}) {
51 | const posts = await this.findPosts(minDate, maxDate);
52 | const keys = Object.keys(profileMap).concat(Object.keys(postMap));
53 |
54 | let replacer = null;
55 | const optionKeys = Object.keys(options);
56 | if (optionKeys.length > 0) {
57 | // 传入的key必须得在keys中存在
58 | const isContain = optionKeys.every(key => keys.indexOf(key) > -1);
59 | if (!isContain) throw new Error('确保格式化字段传入正确');
60 |
61 | // 确保value全为1或全为-1
62 | const onlyOrExcept = options[optionKeys[0]];
63 | const isAllow = optionKeys.every(key => options[key] === onlyOrExcept);
64 | if (!isAllow) throw new Error('确保value全为1或全为-1');
65 |
66 | // 更改replacer
67 | replacer = keys.filter(key => {
68 | if (onlyOrExcept === 1) return (optionKeys.indexOf(key) > -1);
69 | return (optionKeys.indexOf(key) === -1);
70 | });
71 | }
72 |
73 | return JSON.stringify(posts, replacer, 4);
74 | }
75 |
76 | /**
77 | * 导出为csv字符串
78 | * @param {Date} minDate
79 | * @param {Date} maxDate
80 | * @param {Object} options
81 | * @return {String}
82 | * @api public
83 | */
84 | async toCsv(minDate, maxDate, options = {}) {
85 | const json = await this.toJson(minDate, maxDate, options);
86 | const obj = JSON.parse(json);
87 | let csv = json2csv({ data: obj });
88 | csv = addBom(csv);
89 | return csv;
90 | }
91 |
92 | /**
93 | * 导出统计信息json字符串
94 | * @param {Date} minDate
95 | * @param {Date} maxDate
96 | * @return {String}
97 | * @api public
98 | */
99 | async toStaJson(minDate, maxDate) {
100 | const data = await this.calcStatistic(minDate, maxDate);
101 | return JSON.stringify(data, null, 4);
102 | }
103 |
104 | /**
105 | * 导出统计信息csv字符串
106 | * @param {Date} minDate
107 | * @param {Date} maxDate
108 | * @return {String}
109 | * @api public
110 | */
111 | async toStaCsv(minDate, maxDate) {
112 | const data = await this.calcStatistic(minDate, maxDate);
113 | let csv = json2csv({ data });
114 | csv = addBom(csv);
115 | return csv;
116 | }
117 |
118 | /**
119 | * 查找文章
120 | * @api private
121 | */
122 | async findPosts(minDate, maxDate) {
123 | if (this.shouldGetMsgBiz) await this.getMsgBiz();
124 | const posts = await models.Post.find({
125 | msgBiz: { $in: this.msgBiz },
126 | publishAt: { $gte: minDate, $lt: maxDate },
127 | isFail: { $ne: true }
128 | }).sort({ msgBiz: 1, publishAt: 1, msgIdx: 1 }).populate('profile');
129 |
130 | const handledPosts = posts.map(post => {
131 | const { profile, msgBiz } = post;
132 | const postObj = {};
133 |
134 | // 分类
135 | const category = this.bizCategoryNameMap[msgBiz];
136 | if (category) postObj.分类 = category;
137 |
138 | // 公众号信息
139 | Object.keys(profileMap).forEach(key => {
140 | const value = profile[profileMap[key]];
141 | if (value) postObj[key] = value;
142 | });
143 |
144 | // 文章信息
145 | Object.keys(postMap).forEach(key => {
146 | const value = post[postMap[key]];
147 | if (value) postObj[key] = value;
148 |
149 | // 时间格式转换
150 | if (key === '发布时间' && Object.prototype.toString.call(value) == '[object Date]') postObj[key] = moment(value).format('YYYY-MM-DD HH:mm');
151 | });
152 |
153 | // 用0替换undefined
154 | if (!postObj.阅读量) postObj.阅读量 = 0;
155 | if (!postObj.点赞量) postObj.点赞量 = 0;
156 |
157 | return postObj;
158 | });
159 |
160 | return handledPosts;
161 | }
162 |
163 | /**
164 | * 计算统计信息
165 | * @api private
166 | */
167 | async calcStatistic(...args) {
168 | const json = await this.toJson(...args);
169 | const data = JSON.parse(json);
170 | let aggrObj = {};
171 | let aggrArray = [];
172 | data.forEach(item => {
173 | let key = item.msgBiz;
174 | if (key in aggrObj) {
175 | aggrObj[key].总阅读量 += item.阅读量 || 0;
176 | aggrObj[key].总点赞量 += item.点赞量 || 0;
177 | aggrObj[key].总发文数 += 1;
178 | if (item.发布位置 == '1') {
179 | aggrObj[key].头条总阅读量 += item.阅读量 || 0;
180 | aggrObj[key].头条总点赞量 += item.点赞量 || 0;
181 | aggrObj[key].推送次数 += 1;
182 | }
183 | if (item.阅读量 > aggrObj[key].单篇最高阅读量) {
184 | aggrObj[key].单篇最高阅读量 = item.阅读量;
185 | }
186 | } else {
187 | aggrObj[key] = {
188 | 分类: item.分类,
189 | 公众号属性: item.公众号属性,
190 | 公众号: item.公众号,
191 | 公众号ID: item.公众号ID,
192 | 总阅读量: item.阅读量 || 0,
193 | 总点赞量: item.点赞量 || 0,
194 | 总发文数: 1,
195 | 头条总阅读量: 0,
196 | 头条总点赞量: 0,
197 | 推送次数: 0,
198 | 单篇最高阅读量: item.阅读量 || 0
199 | };
200 | if (item.发布位置 == '1') {
201 | aggrObj[key].头条总阅读量 = item.阅读量 || 0;
202 | aggrObj[key].头条总点赞量 = item.点赞量 || 0;
203 | aggrObj[key].推送次数 = 1;
204 | }
205 | }
206 | });
207 | Object.keys(aggrObj).forEach(key => {
208 | let item = aggrObj[key];
209 | let 公众号 = item.公众号;
210 | aggrArray.push({
211 | 公众号: 公众号,
212 | 公众号ID: item.公众号ID,
213 | 分类: item.分类,
214 | 公众号属性: item.公众号属性,
215 | msgBiz: key,
216 | 总阅读量: item.总阅读量,
217 | 平均阅读量: Math.round(item.总阅读量 / item.总发文数),
218 | 头条总阅读量: item.头条总阅读量,
219 | 推送次数: item.推送次数,
220 | 总点赞量: item.总点赞量,
221 | 平均点赞量: Math.round(item.总点赞量 / item.总发文数),
222 | 头条总点赞量: item.头条总点赞量,
223 | 单篇最高阅读量: item.单篇最高阅读量,
224 | 总发文数: item.总发文数
225 | });
226 | });
227 | return aggrArray;
228 | }
229 |
230 | /**
231 | * 通过category获取msgbizs
232 | * @api private
233 | */
234 | async getMsgBiz() {
235 | const categories = await models.Category.find({ _id: { $in: this.category } });
236 | if (!(categories && categories.length)) return;
237 |
238 | categories.forEach(category => {
239 | const { name, msgBizs } = category;
240 |
241 | // 找到所有的msgBiz都加入进来
242 | this.msgBiz = this.msgBiz.concat(msgBizs);
243 |
244 | // 添加msgBiz和分类名称的映射
245 | msgBizs.forEach(msgBiz => {
246 | this.bizCategoryNameMap[msgBiz] = name;
247 | });
248 | });
249 | }
250 |
251 | };
252 |
/**
 * Prepends a byte-order mark (U+FEFF) to CSV text.
 *
 * @param {String} csv
 * @return {String} the BOM followed by the CSV content
 */
function addBom(csv) {
  const parts = ['\uFEFF', csv].map(part => Buffer.from(part));
  return Buffer.concat(parts).toString();
}
258 |
--------------------------------------------------------------------------------
/rule/wechatRule.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const url = require('url');
4 | const moment = require('moment');
5 | const models = require('../models');
6 | const { log } = console;
7 | const config = require('../config');
8 | const cheerio = require('cheerio');
9 | const redis = require('../utils/redis');
10 |
11 | // 链接数组的缓存 每次重启程序后都会清空
12 | const { POST_LIST_KEY, PROFILE_LIST_KEY } = config.redis;
13 |
/**
 * Records the read and like counters of a post.
 *
 * Acts only on requests whose URL matches `mp/getappmsgext`. The counters
 * come from `appmsgstat` in the JSON response body; the post is identified
 * by `__biz`, `mid` and `idx` from the form-encoded request body. The post
 * is upserted with the counters and the current time as `updateNumAt`, then
 * the counters and the remaining length of the post queue are logged.
 *
 * Errors (unparsable body, missing `appmsgstat`, database or redis failure)
 * reject the returned promise.
 *
 * @param {Object} ctx - Context providing `req` (`url`, `requestData`) and
 *   `res` (`response.body`).
 */
const getReadAndLikeNum = async function(ctx) {
  const { req, res } = ctx;
  if (!/mp\/getappmsgext/.test(req.url)) return;

  const data = JSON.parse(res.response.body.toString());
  const { read_num: readNum, like_num: likeNum } = data.appmsgstat;

  // Split every pair on its FIRST "=" only: `__biz` is base64 and may end
  // in "==", which a plain split('=') would cut off.
  const reqObj = String(req.requestData).split('&').reduce((obj, pair) => {
    const sep = pair.indexOf('=');
    const key = sep === -1 ? pair : pair.slice(0, sep);
    const value = sep === -1 ? '' : pair.slice(sep + 1);
    obj[key] = decodeURIComponent(value);
    return obj;
  }, {});
  const { __biz: msgBiz, mid: msgMid, idx: msgIdx } = reqObj;

  const post = await models.Post.findOneAndUpdate(
    { msgBiz, msgMid, msgIdx },
    { readNum, likeNum, updateNumAt: new Date() },
    { new: true, upsert: true }
  );
  const { id, title } = post;
  if (title) {
    log('文章标题:', title);
  } else {
    log('文章id:', id);
  }
  log('阅读量:', readNum, '点赞量:', likeNum);
  log();

  const len = await redis('llen', POST_LIST_KEY);
  log('剩余文章抓取长度:', len);
  log();
};
59 |
60 | const getPostBasicInfo = async function(ctx) {
61 | if (!isPostPage(ctx)) return;
62 |
63 | const { req, res } = ctx;
64 | const link = req.url;
65 | const body = res.response.body.toString();
66 |
67 | const urlObj = url.parse(link, true);
68 | const { query } = urlObj;
69 | const { __biz, mid, idx } = query;
70 | const [msgBiz, msgMid, msgIdx] = [__biz, mid, idx];
71 |
72 | // 判断此文是否失效
73 | if (body.indexOf('global_error_msg') > -1 || body.indexOf('icon_msg warn') > -1) {
74 | await models.Post.findOneAndUpdate(
75 | { msgBiz, msgMid, msgIdx },
76 | { isFail: true },
77 | { upsert: true }
78 | );
79 | return;
80 | }
81 |
82 |
83 | // 若数据库中不存在此篇文章 则更新基础信息
84 | await models.Post.findOne({ msgBiz, msgMid, msgIdx }).then(post => {
85 | if (post && post.title && post.link && post.wechatId) return;
86 |
87 | const getTarget = regexp => {
88 | let target;
89 | body.replace(regexp, (_, t) => {
90 | target = t;
91 | });
92 | return target;
93 | };
94 |
95 | let wechatId = getTarget(/(.+?)<\/span>/);
96 | // 如果上面找到的微信id中包含中文字符 则证明此微信号没有设置微信id 则取微信给定的user_name初始字段
97 | if (wechatId && /[\u4e00-\u9fa5]/.test(wechatId)) {
98 | wechatId = getTarget(/var user_name = "(.+?)"/);
99 | }
100 |
101 | // 更新wechatId
102 | if (wechatId && post && (!post.wechatId) && post.title && post.link) {
103 | return models.Post.findOneAndUpdate(
104 | { msgBiz, msgMid, msgIdx },
105 | { wechatId },
106 | { upsert: true }
107 | );
108 | }
109 |
110 | const title = getTarget(/var msg_title = "(.+?)";/);
111 | let publishAt = getTarget(/var ct = "(\d+)";/);
112 | if (publishAt) publishAt = new Date(parseInt(publishAt) * 1000);
113 | const sourceUrl = getTarget(/var msg_source_url = '(.*?)';/);
114 | const cover = getTarget(/var msg_cdn_url = "(.+?)";/);
115 | const digest = getTarget(/var msg_desc = "(.+?)";/);
116 |
117 | return models.Post.findOneAndUpdate(
118 | { msgBiz, msgMid, msgIdx },
119 | { title, link, publishAt, sourceUrl, cover, digest, wechatId },
120 | { upsert: true }
121 | );
122 | });
123 |
124 | // 保存正文内容
125 | if (config.insertJsToNextPage.isSavePostContent) {
126 | const $ = cheerio.load(body, { decodeEntities: false });
127 | let content;
128 | if (config.insertJsToNextPage.saveContentType === 'html') {
129 | content = $('#js_content').html() || '';
130 | } else {
131 | content = $('#js_content').text() || '';
132 | }
133 | content = content.trim();
134 | await models.Post.findOneAndUpdate(
135 | { msgBiz, msgMid, msgIdx },
136 | { content },
137 | { upsert: true }
138 | );
139 | }
140 |
141 | };
142 |
143 | const handlePostHtml = async function(ctx) {
144 | if (!isPostPage(ctx)) return;
145 |
146 | const { res } = ctx;
147 | let body = res.response.body.toString();
148 |
149 | // 替换显示在手机上的正文 加速网络
150 | if (config.isReplacePostBody) {
151 | const len = await redis('llen', POST_LIST_KEY);
152 | body.replace(/((\s|\S)+?)<\/div>\s+?`;
391 | body = body.replace('','').replace('','');
392 | body = body.replace('