├── .gitignore
├── README.md
├── anyproxy
│   ├── __init__.py
│   ├── restart_anyproxy.py
│   └── rule_default.js
├── api.py
├── cfg
│   ├── __init__.py
│   ├── cfg.py
│   ├── cfg_prod.py
│   └── cfg_test.py
├── crawler
│   ├── __init__.py
│   ├── crawler.py
│   └── reset_crawl.py
├── ctl_prod.sh
├── ctl_test.sh
├── db
│   ├── __init__.py
│   ├── db.sql
│   ├── mysql_conn.py
│   └── mysql_operate.py
├── public_number
│   ├── __init__.py
│   └── get_public_number.py
├── requirements.txt
└── start.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
*.out
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
+ Detailed write-up of this project: https://www.jianshu.com/p/667f3668cd70
+ Source of the Android quick-macro (按键精灵) app used with this project: https://github.com/zjhpure/PublicNumberQuickMacro


## Prerequisites
+ A crawler server with a Python 3 environment; Ubuntu 16.04 is recommended because it ships with Python 3.
+ A proxy server with root access and an AnyProxy plus pm2 environment; npm and Node.js must be installed before AnyProxy, and pm2 is used to manage the AnyProxy process.
+ At least one Android phone/emulator (the more official accounts you crawl, the more devices you need; adjust the quick-macro's click frequency to match), with WeChat installed (version 6.6.6), logged in to a WeChat account (a verified account is recommended, otherwise it will quickly be blocked from logging in), and with the quick-macro installed.
+ A MySQL database.
+ A Redis database.
+ If machines are in short supply, the crawler server, proxy server, MySQL and Redis can all run on a single box.


## Before starting
+ Download the project: git clone https://github.com/zjhpure/crawler_public_number
+ Edit the redis connection settings (host, port and password) at the top of anyproxy/rule_default.js to point at your own redis.
+ Copy anyproxy/rule_default.js to /usr/local/lib/node_modules/anyproxy/lib on the proxy server.
+ Start AnyProxy on the proxy server: for the first start run sudo pm2 start anyproxy -x -- -i (the -i flag turns on HTTPS interception); later starts only need sudo pm2 start anyproxy.
+ In /usr/local/lib/node_modules/anyproxy on the proxy server, run sudo npm install redis to add Node's redis module.
+ Run sudo pm2 list on the proxy server to check whether AnyProxy started successfully.
+ You can also open <proxy-server-ip>:8002 in a browser to watch AnyProxy's traffic.
+ Copy anyproxy/restart_anyproxy.py to the proxy server.
+ Optionally run nohup python3 -u restart_anyproxy.py & there so that AnyProxy is restarted once a day around midnight; AnyProxy becomes sluggish after running for a long time.
+ On the Android device, set the Wi-Fi connection to use the AnyProxy proxy: the address is the proxy server's IP, the port is 8001.
+ On the <proxy-server-ip>:8002 page, click RootCA; a QR code and a download button appear.
+ Install the CA certificate either by clicking download on a computer and copying the file to the phone, by scanning the QR code with the phone's browser, or by opening <proxy-server-ip>:8002/fetchCrtFile in the phone's browser.
+ Configure the Android quick-macro with the address of the "get official account" API, which is defined in api.py in the project root.
+ Start the quick-macro on the device.
+ Open WeChat on the phone/emulator. For testing, follow two official accounts: pythonbuluo (Python程序员) and python-china (Python中文社区).
+ You can also add other official accounts to the public_number table yourself; each row needs the account's WeChat id, display name and biz value.
+ To get the biz value, open the account's message history, tap the button in the top-right corner, tap "copy link", and inspect the copied link somewhere convenient (a parsing sketch is at the end of this README).
+ The link looks like: https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MjM5NzU0MzU0Nw==&scene=123&pass_ticket={pass_ticket}
+ Here the biz value is MjM5NzU0MzU0Nw==
+ Edit cfg/cfg_test.py and cfg/cfg_prod.py to configure the test and production environments: the project's MySQL, redis, and the port of the quick-macro API.
+ Create the database tables with db/db.sql.
+ Copy the project to the test and production servers under /data/crawler/; any other location works too, as long as the ROOT variable in ctl_test.sh and ctl_prod.sh is changed to match.


## Starting the project

#### Test environment
+ Enter the project directory.
+ Run chmod +x ctl_test.sh to make ctl_test.sh executable.
+ Run sh ctl_test.sh start_api to start the quick-macro API.
+ Run sh ctl_test.sh start to start the crawler.
+ Start the quick-macro on the phone/emulator; without it, you can test by manually opening an official account's message history.
+ Once running, the project writes a nohup.out file in the project directory with its output.

#### Production environment
+ Enter the project directory.
+ Run chmod +x ctl_prod.sh to make ctl_prod.sh executable.
+ Run sh ctl_prod.sh start_api to start the quick-macro API.
+ Run sh ctl_prod.sh start to start the crawler.
+ Start the quick-macro on the phone/emulator; without it, you can test by manually opening an official account's message history.
+ Once running, the project writes a nohup.out file in the project directory with its output.

## Notes
+ Only the links to article content and covers are stored here.
+ For real production use, download the covers and article bodies yourself and upload them to cloud storage (e.g. Qiniu).
+ The Android quick-macro's flow: at a fixed interval it requests the macro API at ip:port/crawler/public_number/get_click_public_number, receives an official account name, and taps into that account's message history.
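
## Appendix: parsing the biz value
A minimal sketch of pulling __biz out of a copied history-page link, handy when filling the public_number table by hand. extract_biz is a hypothetical helper for illustration, not part of this project:

```python
# coding: utf-8
from urllib.parse import urlparse, parse_qs


def extract_biz(link):
    """Return the __biz query parameter of a history-page link, or None."""
    params = parse_qs(urlparse(link).query)
    return params.get('__biz', [None])[0]


if __name__ == '__main__':
    url = ('https://mp.weixin.qq.com/mp/profile_ext?action=home'
           '&__biz=MjM5NzU0MzU0Nw==&scene=123&pass_ticket=xxx')
    print(extract_biz(url))  # MjM5NzU0MzU0Nw==
```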
--------------------------------------------------------------------------------
/anyproxy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zjhpure/crawler_public_number/a39aaa91d5b35e2a033ea88059237961d46d2412/anyproxy/__init__.py
--------------------------------------------------------------------------------
/anyproxy/restart_anyproxy.py:
--------------------------------------------------------------------------------
# coding: utf-8
import datetime
import time
import os

from time import strftime


# Restart anyproxy every day around midnight; anyproxy becomes very sluggish
# when it runs for a long time without a restart.
class RestartAnyproxy(object):
    def run(self):
        while True:
            self.print_with_time('sleep 10s')
            time.sleep(10)
            # Get the current time
            now = str(datetime.datetime.now())
            # Extract the hour of the current time
            hour = now.split(' ')[1].split(':')[0]
            # Hour 00 means midnight has just passed
            if '00' == hour:
                os.system('sudo pm2 restart anyproxy')
                self.print_with_time('sleep 4000s')
                # Sleep a bit over an hour afterwards so the restart runs only once a day
                time.sleep(4000)

    @staticmethod
    def print_with_time(content):
        print(strftime('%Y-%m-%d %H:%M:%S'))
        print(content)


if __name__ == '__main__':
    restart_anyproxy = RestartAnyproxy()
    restart_anyproxy.run()
--------------------------------------------------------------------------------
/anyproxy/rule_default.js:
--------------------------------------------------------------------------------
function sendToRedis(x_wechat_key, x_wechat_uin, user_agent, cookie, url) {
  var redis = require("redis");
  // Edit the port, host and password below to point at your own redis
  var client = redis.createClient(6379, 'localhost', {});
  client.auth('123456');
  client.on("error", function (err) {
    console.log("error:" + err);
  });
  var now = Math.round(new Date().getTime() / 1000);
  console.log(now);
  client.rpush('click_public_number', x_wechat_key + '&&' + x_wechat_uin + '&&' + user_agent + '&&' + cookie + '&&' + now + '&&' + url, redis.print);
  client.quit();
}

'use strict';

module.exports = {

  summary: 'the default rule for AnyProxy',

  /**
   * Called before a request is sent on to the target server.
   *
   * @param {object} requestDetail
   * @returns
   */
  *beforeSendRequest(requestDetail) {
    return null;
  },

  /**
   * Called before the response is returned to the client. When the request is
   * an official-account history page, push its headers and url to redis for
   * the crawler, unless WeChat served the "operations too frequent" page.
   *
   * @param {object} requestDetail
   * @param {object} responseDetail
   */
  *beforeSendResponse(requestDetail, responseDetail) {
    var tempStr = "mp.weixin.qq.com/mp/profile_ext?action=home";
    var res = requestDetail.url.indexOf(tempStr);
    if (res > 0) {
      var body = responseDetail.response.body;
      // Text of the page WeChat serves when requests are too frequent; it must
      // stay in Chinese to match the response body.
      var regu = "操作频繁,请稍后再试";
      if (body.indexOf(regu) >= 0) {
        console.log('WeChat "operations too frequent" page');
      } else {
        var data = requestDetail.requestOptions;
        sendToRedis(data.headers['x-wechat-key'], data.headers['x-wechat-uin'], data.headers['User-Agent'], data.headers['Cookie'], requestDetail.url);
        console.log(data);
      }
    }
    return null;
  },
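
  // For reference: each entry pushed to the click_public_number list is one
  // string of six fields joined by '&&':
  //   x-wechat-key && x-wechat-uin && User-Agent && Cookie && unix-timestamp && url
  // crawler.py (operate_redis) pops these entries and replays the history-page
  // request with the same headers. HTTPS traffic only reaches this rule when
  // AnyProxy runs with interception enabled (the -i flag, see the README).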

  /**
   * Decide whether to intercept an HTTPS request. Returning false leaves
   * per-request interception off; the README starts AnyProxy with the -i flag,
   * which intercepts HTTPS globally instead.
   *
   * @param {any} requestDetail
   * @returns
   */
  *beforeDealHttpsRequest(requestDetail) {
    return false;
  },

  /**
   * Called when an error occurs while handling a request.
   *
   * @param {any} requestDetail
   * @param {any} error
   * @returns
   */
  *onError(requestDetail, error) {
    return null;
  },

  /**
   * Called when the connection to the target server fails.
   *
   * @param {any} requestDetail
   * @param {any} error
   * @returns
   */
  *onConnectError(requestDetail, error) {
    return null;
  },
};
--------------------------------------------------------------------------------
/api.py:
--------------------------------------------------------------------------------
# coding: utf-8
import json

from cfg.cfg import redis_db, api_port
from flask import Flask
from redis import StrictRedis

app = Flask(__name__)


@app.route('/')
def hello():
    return 'hello world!'


@app.route('/crawler/public_number/get_click_public_number', methods=['GET', 'POST'])
def get_click_public_number():
    # Connect to redis
    redis = StrictRedis(host=redis_db['host'], port=redis_db['port'], password=redis_db['password'])
    if redis.llen('public_number') > 0:
        # The list is not empty: pop an entry from the left so the quick-macro can click it
        info = str(redis.lpop('public_number'), encoding='utf-8')
        info = info.split('&&')
        print(info)
        # msg means "official account fetched"
        data = {"errcode": 0, "msg": "获取公众号成功",
                "result": {"publicNumberName": info[0],
                           "publicNumberWechatId": info[1],
                           "publicNumberBiz": info[2]}}
    else:
        # The list is empty, nothing for the quick-macro to click; msg means "no official account available"
        data = {"errcode": 1, "msg": "无公众号获取"}
    result = json.dumps(data, ensure_ascii=False)
    print(result)
    return result


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=api_port, debug=True)
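

# Example exchange with the quick-macro, for reference (values taken from the
# two test accounts seeded by db/db.sql; the JSON shape is what the handler
# above builds):
#   GET http://<crawler-server-ip>:10002/crawler/public_number/get_click_public_number
#   -> {"errcode": 0, "msg": "获取公众号成功",
#       "result": {"publicNumberName": "Python程序员",
#                  "publicNumberWechatId": "pythonbuluo",
#                  "publicNumberBiz": "MjM5NzU0MzU0Nw=="}}
#   or, when the redis list is empty:
#   -> {"errcode": 1, "msg": "无公众号获取"}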
--------------------------------------------------------------------------------
/cfg/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zjhpure/crawler_public_number/a39aaa91d5b35e2a033ea88059237961d46d2412/cfg/__init__.py
--------------------------------------------------------------------------------
/cfg/cfg.py:
--------------------------------------------------------------------------------
# coding: utf-8
api_port = 10002

redis_db = {
    'host': 'localhost',
    'port': 6379,
    'password': '123456'
}

mysql_db = {
    'host': 'localhost',
    'user': 'root',
    'password': '123456',
    'db': 'crawler_public_number',
    'port': 3306
}
--------------------------------------------------------------------------------
/cfg/cfg_prod.py:
--------------------------------------------------------------------------------
# coding: utf-8
api_port = 10002

redis_db = {
    'host': 'localhost',
    'port': 6379,
    'password': '123456'
}

mysql_db = {
    'host': 'localhost',
    'user': 'root',
    'password': '123456',
    'db': 'crawler_public_number',
    'port': 3306
}
--------------------------------------------------------------------------------
/cfg/cfg_test.py:
--------------------------------------------------------------------------------
# coding: utf-8
api_port = 10002

redis_db = {
    'host': 'localhost',
    'port': 6379,
    'password': '123456'
}

mysql_db = {
    'host': 'localhost',
    'user': 'root',
    'password': '123456',
    'db': 'crawler_public_number',
    'port': 3306
}
--------------------------------------------------------------------------------
/crawler/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zjhpure/crawler_public_number/a39aaa91d5b35e2a033ea88059237961d46d2412/crawler/__init__.py
--------------------------------------------------------------------------------
/crawler/crawler.py:
--------------------------------------------------------------------------------
# coding: utf-8
import datetime
import json
import random
import re
import time
import traceback
from time import strftime

import requests
from lxml import etree
from redis import StrictRedis
from cfg.cfg import redis_db
from db.mysql_operate import MysqlOperate


class PublicNumberSpider(object):
    headers = {
        'Host': 'mp.weixin.qq.com',
        'Connection': 'keep-alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip,deflate',
        'Accept-Language': 'zh-CN,en-US;q=0.8',
        'X-Requested-With': 'com.tencent.mm'
    }
    mysql_operate = MysqlOperate()

    def start_requests(self):
        while True:
            # Read from redis every 5-10 seconds
            n = random.randint(5, 10)
            self.print_with_time('sleep ' + str(n) + 's')
            time.sleep(n)
            # Fetch the request parameters and the biz value from redis
            x_wechat_key, x_wechat_uin, user_agent, cookie, url = self.operate_redis()
            # Note: this mutates the shared class-level headers dict, which
            # get_pub_article also relies on for its article requests
            headers = self.headers
            headers['x-wechat-key'] = x_wechat_key
            headers['x-wechat-uin'] = x_wechat_uin
            headers['User-Agent'] = user_agent
            headers['Cookie'] = cookie
            biz = url.split('&')[1].split('biz=')[1]
            # Check whether this account exists in the database and has not been crawled today
            result = self.mysql_operate.query_public_number_by_biz(public_number_biz=biz)
            self.print_with_time(result)
            if len(result) > 0:
                row = result[0]
                public_number_id = row[0]
                public_number_wechat_id = row[1]
                public_number_name = row[2]
                try:
                    self.print_with_time('public_number_wechat_id:' + public_number_wechat_id
                                         + ' public_number_name:' + public_number_name)
                    response = requests.request('GET', url, headers=headers)
                    meta = {'public_number_wechat_id': public_number_wechat_id,
                            'public_number_name': public_number_name,
                            'public_number_id': public_number_id}
                    article_num = self.parse(response, meta)
                    # Zero articles today may simply mean the account has not published yet
                    if article_num > 0:
                        # Mark the account as crawled today (1) as soon as one crawl succeeds.
                        # Most accounts can publish only once a day; only a few early accounts
                        # may publish several times, and newly registered ones should all be
                        # limited to once a day.
                        self.mysql_operate.update_public_number_today_is_crawl(
                            public_number_wechat_id=str(public_number_wechat_id), today_is_crawl=str(1))
                        # Record the crawl; 1 means success
                        self.mysql_operate.insert_crawl_record(public_number_id=public_number_id, crawl_status=1)
                except Exception as e:
                    self.print_with_time(e)
                    traceback.print_exc()
                    self.print_with_time(
                        'crawler failure, ' + 'public_number_wechat_id:' + public_number_wechat_id
                        + ', public_number_name:' + public_number_name)
                    # Record the crawl; 0 means failure
                    self.mysql_operate.insert_crawl_record(public_number_id=public_number_id, crawl_status=0)

    def parse(self, response, meta):
        # print(response.text)
        # The two literals below are page texts returned by WeChat and must stay
        # in Chinese to match the response body
        if '操作频繁,请稍后再试' in response.text:
            self.print_with_time('hit the "operations too frequent" page')
            return 0
        if '请在微信客户端打开链接。' in response.text:
            self.print_with_time('link expired, WeChat asks to open it in the client')
            return 0
        # Number of articles published today
        article_today_num = 0
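        # The history page embeds a JSON blob of the latest messages. An assumed
        # sketch of the subset of fields used below (not the full schema):
        #   {"list": [
        #       {"comm_msg_info": {"datetime": <unix timestamp>, ...},
        #        "app_msg_ext_info": {
        #            "content_url": "...", "cover": "...",
        #            "multi_app_msg_item_list": [{"content_url": "...", "cover": "..."}, ...],
        #            ...}},
        #       ...]}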
        # Parse the first page of the account's message history; the embedded
        # JSON arrives with HTML-escaped quotes
        msg_list = json.loads(re.findall(r'{"list":.*]}', response.text.replace('&quot;', '"'))[0])
        for sel in msg_list['list']:
            if 'app_msg_ext_info' in sel:
                # Publication time of the message
                ltime = time.localtime(sel['comm_msg_info']['datetime'])
                day = time.strftime('%Y-%m-%d', ltime)
                # Today
                today = datetime.date.today()
                # Yesterday
                yesterday = today - datetime.timedelta(days=1)
                # Only today's and yesterday's articles are crawled. Yesterday's are included
                # so that an article published just before midnight is not missed when the
                # crawl loop happens to straddle midnight.
                # Restricting to today and yesterday also keeps the number of visits down:
                # WeChat limits how often one WeChat account may open history pages per day,
                # and once over the limit it takes roughly 12 hours to recover.
                if str(yesterday) == day:
                    content_url = sel['app_msg_ext_info']['content_url']
                    # content_url is empty for articles that were published and then deleted
                    if content_url != '':
                        url_temp = content_url.split('/s?')[1]
                        url = 'https://mp.weixin.qq.com/s?' + re.sub('amp;', '', url_temp)
                        meta['cover'] = sel['app_msg_ext_info']['cover'].replace('\\', '')
                        self.print_with_time(url)
                        meta['url'] = url
                        response = requests.request('GET', url, headers=self.headers)
                        self.get_pub_article(response, meta)
                    for s in sel['app_msg_ext_info']['multi_app_msg_item_list']:
                        # content_url is empty for articles that were published and then deleted
                        if s['content_url'] != '':
                            muti_url_temp = s['content_url'].split('/s?')[1]
                            muti_url = 'https://mp.weixin.qq.com/s?' + re.sub('amp;', '', muti_url_temp)
                            meta['cover'] = s['cover'].replace('\\', '')
                            self.print_with_time(muti_url)
                            meta['url'] = muti_url
                            response = requests.request('GET', muti_url, headers=self.headers)
                            self.get_pub_article(response, meta)
                if str(today) == day:
                    content_url = sel['app_msg_ext_info']['content_url']
                    # content_url is empty for articles that were published and then deleted
                    if content_url != '':
                        url_temp = content_url.split('/s?')[1]
                        url = 'https://mp.weixin.qq.com/s?' + re.sub('amp;', '', url_temp)
                        meta['cover'] = sel['app_msg_ext_info']['cover'].replace('\\', '')
                        self.print_with_time(url)
                        meta['url'] = url
                        response = requests.request('GET', url, headers=self.headers)
                        self.get_pub_article(response, meta)
                        article_today_num = article_today_num + 1
                    for s in sel['app_msg_ext_info']['multi_app_msg_item_list']:
                        # content_url is empty for articles that were published and then deleted
                        if s['content_url'] != '':
                            muti_url_temp = s['content_url'].split('/s?')[1]
                            muti_url = 'https://mp.weixin.qq.com/s?' + re.sub('amp;', '', muti_url_temp)
                            meta['cover'] = s['cover'].replace('\\', '')
                            self.print_with_time(muti_url)
                            meta['url'] = muti_url
                            response = requests.request('GET', muti_url, headers=self.headers)
                            self.get_pub_article(response, meta)
                            article_today_num = article_today_num + 1
        return article_today_num

    def get_pub_article(self, response, meta):
        public_number_id = meta['public_number_id']
        public_number_wechat_id = meta['public_number_wechat_id']
        public_number_name = meta['public_number_name']
        # Turn the response text into an element tree; xpath needs node objects
        html = etree.HTML(response.text)
        # The article is a repost
        if len(html.xpath('//*[@class="original_page"]')) > 0:
            # Follow the link to the original article; this url is a redirect
            url = html.xpath('//*[@id="js_share_source"]/@href')[0].strip()
            self.print_with_time('pub_article redirect to: ' + url)
            response = requests.request('GET', url, headers=self.headers)
            self.get_pub_article(response, meta)
        # The article is original
        else:
            public_number_article_title = html.xpath('//*[@id="activity-name"]/text()')[0].strip()
            self.print_with_time('public_number_article_title:' + public_number_article_title)
            public_number_article_publish_time = html.xpath('//*[@id="post-date"]/text()')[0].strip()
            self.print_with_time('public_number_article_publish_time:' + public_number_article_publish_time)
            count = self.mysql_operate.query_public_number_article(public_number_wechat_id=public_number_wechat_id,
                                                                   public_number_article_title=public_number_article_title,
                                                                   public_number_article_publish_time=public_number_article_publish_time)
            self.print_with_time(
                'public_number_wechat_id:' + public_number_wechat_id + ' public_number_name:' + public_number_name)
            # An article counts as already crawled when its title and publication time are
            # both in the database; articles already crawled are not crawled again
            if count <= 0:
                # Article cover
                pub_article_cover = meta['cover']
                # Article link
                pub_article_content_url = meta['url']
                # Note: the cover and article body should be downloaded and uploaded to
                # cloud storage (e.g. Qiniu); that part is not covered here.
                # response.text holds the article body.
                # Store the article in the database
                self.mysql_operate.insert_public_number_article(public_number_wechat_id=public_number_wechat_id,
                                                                public_number_name=public_number_name,
                                                                public_number_article_title=public_number_article_title,
                                                                public_number_article_publish_time=public_number_article_publish_time,
                                                                public_number_article_content_url=pub_article_content_url,
                                                                public_number_article_cover=pub_article_cover)
                self.print_with_time(
                    'public_number_id:' + str(public_number_id)
                    + ' public_number_wechat_id:' + public_number_wechat_id
                    + ' public_number_name:' + public_number_name
                    + ' public_number_article_title:' + public_number_article_title
                    + ' public_number_article_publish_time:' + public_number_article_publish_time
                    + ' pub_article_content_url:' + pub_article_content_url
                    + ' pub_article_cover:' + pub_article_cover)

    def operate_redis(self):
        x_wechat_key = None
        x_wechat_uin = None
        user_agent = None
        cookie = None
        url = None
        flag = True
        while flag:
            self.print_with_time('prepare to connect redis')
            # Connect to redis
            redis = StrictRedis(host=redis_db['host'], port=redis_db['port'], password=redis_db['password'])
            # Pop from the left; blpop blocks until an entry is available
            info = str(redis.blpop('click_public_number')[1], encoding='utf-8')
            info = info.split('&&')
            self.print_with_time(info)
            # Timestamp stored in redis when anyproxy intercepted the history-page request
            t = info[4]
            # Current unix timestamp
            now = int(time.time())
            self.print_with_time('now: ' + str(now))
            # The parameters of a history-page request expire, so to avoid replaying a
            # stale request only entries at most 500 seconds old are accepted.
            # The url must also contain pass_ticket: some intercepted urls are incomplete
            # and have to be dropped. For example:
            # sometimes the url is: https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz={biz}&scene=124&
            # sometimes the url is: https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz={biz}&scene=124&devicetype=android-23&version=26060135&lang=zh_CN&nettype=WIFI&a8scene=3&pass_ticket={pass_ticket}&wx_header=1
            # The former kind is discarded.
            if now - int(t) <= 500 and 'pass_ticket' in info[5]:
                flag = False
                x_wechat_key = info[0]
                x_wechat_uin = info[1]
                user_agent = info[2]
                cookie = info[3]
                url = info[5]
                self.print_with_time('x_wechat_key: ' + x_wechat_key)
                self.print_with_time('x_wechat_uin: ' + x_wechat_uin)
                self.print_with_time('user_agent: ' + user_agent)
                self.print_with_time('cookie: ' + cookie)
                self.print_with_time('time: ' + t)
        self.print_with_time('get public_number headers by redis success')
        return x_wechat_key, x_wechat_uin, user_agent, cookie, url

    @staticmethod
    def print_with_time(content):
        print(strftime('%Y-%m-%d %H:%M:%S'))
        print(content)


if __name__ == '__main__':
    public_number_spider = PublicNumberSpider()
    public_number_spider.start_requests()
--------------------------------------------------------------------------------
/crawler/reset_crawl.py:
--------------------------------------------------------------------------------
# coding: utf-8
import datetime
import time

from time import strftime
from db.mysql_operate import MysqlOperate


# Every day around midnight, reset every account's "crawled today" flag to 0
class ResetCrawler(object):
    mysql_operate = MysqlOperate()

    def run(self):
        while True:
            self.print_with_time('sleep 10s')
            time.sleep(10)
            # Get the current time
            now = str(datetime.datetime.now())
            # Extract the hour of the current time
            hour = now.split(' ')[1].split(':')[0]
            # Hour 00 means midnight has just passed
            if '00' == hour:
                # Reset every account's "crawled today" flag to 0
                self.mysql_operate.reset_all_public_number_today_is_crawl()
                self.print_with_time('sleep 4000s')
                # Sleep a bit over an hour afterwards so the reset runs only once a day
                time.sleep(4000)

    @staticmethod
    def print_with_time(content):
        print(strftime('%Y-%m-%d %H:%M:%S'))
        print(content)


if __name__ == '__main__':
    reset_crawler = ResetCrawler()
    reset_crawler.run()
--------------------------------------------------------------------------------
/ctl_prod.sh:
--------------------------------------------------------------------------------
#! /bin/bash

ROOT=/data/crawler/crawler_public_number

assist() {
    echo """
    start        start the process with the production config
    stop         stop the process
    start_api    start the API process with the production config
    stop_api     stop the API process
    """
}

start() {
    rm -rf ${ROOT}/cfg/cfg.py
    cp ${ROOT}/cfg/cfg_prod.py ${ROOT}/cfg/cfg.py
    nohup python3 -u ${ROOT}/start.py &
}

stop() {
    pid=$(ps -ef | grep ${ROOT}/start.py | grep -v grep | awk '{print $2}')
    [ -n "${pid}" ] && kill -9 ${pid}
    echo "stopped: ${pid}"
}

start_api() {
    rm -rf ${ROOT}/cfg/cfg.py
    cp ${ROOT}/cfg/cfg_prod.py ${ROOT}/cfg/cfg.py
    nohup python3 -u ${ROOT}/api.py &
}

stop_api() {
    pid=$(ps -ef | grep ${ROOT}/api.py | grep -v grep | awk '{print $2}')
    [ -n "${pid}" ] && kill -9 ${pid}
    echo "stopped: ${pid}"
}

case $1 in
    start)
        start
        ;;
    stop)
        stop
        ;;
    start_api)
        start_api
        ;;
    stop_api)
        stop_api
        ;;
    *)
        assist
        ;;
esac
--------------------------------------------------------------------------------
/ctl_test.sh:
--------------------------------------------------------------------------------
#! /bin/bash

ROOT=/data/crawler/crawler_public_number

assist() {
    echo """
    start        start the process with the test config
    stop         stop the process
    start_api    start the API process with the test config
    stop_api     stop the API process
    """
}

start() {
    rm -rf ${ROOT}/cfg/cfg.py
    cp ${ROOT}/cfg/cfg_test.py ${ROOT}/cfg/cfg.py
    nohup python3 -u ${ROOT}/start.py &
}

stop() {
    pid=$(ps -ef | grep ${ROOT}/start.py | grep -v grep | awk '{print $2}')
    [ -n "${pid}" ] && kill -9 ${pid}
    echo "stopped: ${pid}"
}

start_api() {
    rm -rf ${ROOT}/cfg/cfg.py
    cp ${ROOT}/cfg/cfg_test.py ${ROOT}/cfg/cfg.py
    nohup python3 -u ${ROOT}/api.py &
}

stop_api() {
    pid=$(ps -ef | grep ${ROOT}/api.py | grep -v grep | awk '{print $2}')
    [ -n "${pid}" ] && kill -9 ${pid}
    echo "stopped: ${pid}"
}

case $1 in
    start)
        start
        ;;
    stop)
        stop
        ;;
    start_api)
        start_api
        ;;
    stop_api)
        stop_api
        ;;
    *)
        assist
        ;;
esac
--------------------------------------------------------------------------------
/db/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zjhpure/crawler_public_number/a39aaa91d5b35e2a033ea88059237961d46d2412/db/__init__.py
--------------------------------------------------------------------------------
/db/db.sql:
--------------------------------------------------------------------------------
drop table if exists `public_number`;
create table `public_number` (
  `id` int(10) not null auto_increment comment 'official account id',
  `public_number_wechat_id` varchar(255) not null comment 'official account wechat id',
  `public_number_name` varchar(255) not null comment 'official account name',
  `public_number_biz` varchar(255) not null comment 'official account biz value (the biz value uniquely identifies an account)',
  `today_is_crawl` tinyint(4) not null default '0' comment 'crawled today (0 -- not yet, 1 -- done)',
  `create_time` timestamp default current_timestamp comment 'creation time',
  `update_time` timestamp default current_timestamp on update current_timestamp comment 'update time',
  `is_del` tinyint(4) not null default '0' comment 'deleted (0 -- no, 1 -- yes)',
  primary key (`id`)
) engine=InnoDB auto_increment=1 default charset=utf8 comment='official accounts';
insert into `public_number`(public_number_wechat_id, public_number_name, public_number_biz)
    values('pythonbuluo', 'Python程序员', 'MjM5NzU0MzU0Nw=='),
          ('python-china', 'Python中文社区', 'MzAxMjUyNDQ5OA==');
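
-- Example of adding another account by hand (placeholder values; fill in the
-- real wechat id, display name and biz value of the account you follow):
-- insert into `public_number`(public_number_wechat_id, public_number_name, public_number_biz)
--     values('your_wechat_id', 'Your Account Name', 'YourBizValue==');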

drop table if exists `public_number_article`;
create table `public_number_article` (
  `id` int(10) not null auto_increment comment 'article id',
  `public_number_wechat_id` varchar(255) not null comment 'official account wechat id',
  `public_number_name` varchar(255) not null comment 'official account name',
  `public_number_article_cover` text not null comment 'article cover url',
  `public_number_article_publish_time` timestamp default current_timestamp comment 'article publication time',
  `public_number_article_title` varchar(255) not null comment 'article title',
  `public_number_article_content_url` text not null comment 'article content url',
  `create_time` timestamp default current_timestamp comment 'creation time',
  `update_time` timestamp default current_timestamp on update current_timestamp comment 'update time',
  `is_del` tinyint(4) not null default '0' comment 'deleted (0 -- no, 1 -- yes)',
  primary key (`id`)
) engine=InnoDB auto_increment=1 default charset=utf8 comment='official account articles';

drop table if exists `crawl_record`;
create table `crawl_record` (
  `id` int(10) not null auto_increment comment 'crawl record id',
  `public_number_id` int(10) not null comment 'official account id',
  `crawl_status` tinyint(4) not null default '-1' comment 'crawl status (0 -- failed, 1 -- succeeded)',
  `create_time` timestamp default current_timestamp comment 'creation time',
  `update_time` timestamp default current_timestamp on update current_timestamp comment 'update time',
  `is_del` tinyint(4) not null default '0' comment 'deleted (0 -- no, 1 -- yes)',
  primary key (`id`)
) engine=InnoDB auto_increment=1 default charset=utf8 comment='crawl records';
--------------------------------------------------------------------------------
/db/mysql_conn.py:
--------------------------------------------------------------------------------
# coding: utf-8
import pymysql
from time import strftime


class MysqlConn(object):
    conn = None
    cursor = None

    def __init__(self, host, user, password, db, port, charset):
        self.host = host
        self.user = user
        self.password = password
        self.db = db
        self.port = port
        self.charset = charset

    def connect(self):
        try:
            self.conn = pymysql.connect(host=self.host,
                                        user=self.user,
                                        password=self.password,
                                        db=self.db,
                                        port=self.port,
                                        charset=self.charset,
                                        autocommit=True)
            self.cursor = self.conn.cursor()
            self.print_with_time('mysql connect success')
        except Exception as e:
            self.print_with_time(e)
            self.print_with_time('mysql connect failure')

    def ping(self):
        try:
            self.conn.ping()
            self.print_with_time('mysql ping success')
        except Exception as e:
            self.print_with_time(e)
            self.print_with_time('mysql ping failure')
            try:
                self.connect()
                self.print_with_time('mysql reconnect success')
            except Exception as e:
                self.print_with_time(e)
                self.print_with_time('mysql reconnect failure')

    def close(self):
        try:
            self.conn.close()
            self.print_with_time('mysql close success')
        except Exception as e:
            self.print_with_time(e)
            self.print_with_time('mysql close failure')

    @staticmethod
    def print_with_time(content):
        print(strftime('%Y-%m-%d %H:%M:%S'))
        print(content)
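

# Typical usage, for reference (db/mysql_operate.py wraps this class with the
# values from cfg/cfg.py):
#     conn = MysqlConn(host='localhost', user='root', password='123456',
#                      db='crawler_public_number', port=3306, charset='utf8')
#     conn.connect()
#     conn.ping()  # reconnects if the connection has dropped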
--------------------------------------------------------------------------------
/db/mysql_operate.py:
--------------------------------------------------------------------------------
# coding: utf-8
from db.mysql_conn import MysqlConn
from cfg.cfg import mysql_db
from time import strftime


class MysqlOperate(object):
    def __init__(self):
        self.mysql_conn = MysqlConn(host=mysql_db['host'],
                                    user=mysql_db['user'],
                                    password=mysql_db['password'],
                                    db=mysql_db['db'],
                                    port=mysql_db['port'],
                                    charset='utf8')
        self.mysql_conn.connect()
        self.conn = self.mysql_conn.conn
        self.cursor = self.mysql_conn.cursor

    def query_public_number(self):
        self.mysql_conn.ping()
        sql_query = 'select * from public_number where is_del = 0 and today_is_crawl = 0'
        self.cursor.execute(sql_query)
        self.print_with_time('query public_number all')
        return self.cursor.fetchall()

    def query_public_number_by_biz(self, public_number_biz):
        self.mysql_conn.ping()
        # Parameterized so that quotes in the input cannot break the query
        sql_query = 'select * from public_number where is_del = 0 and today_is_crawl = 0 ' \
                    'and public_number_biz = %s'
        self.cursor.execute(sql_query, (public_number_biz,))
        self.print_with_time('query public_number by biz, biz = ' + public_number_biz)
        return self.cursor.fetchall()

    def update_public_number_today_is_crawl(self, public_number_wechat_id, today_is_crawl):
        self.mysql_conn.ping()
        sql = 'update public_number set today_is_crawl = %s ' \
              'where public_number_wechat_id = %s and is_del = 0'
        self.cursor.execute(sql, (today_is_crawl, public_number_wechat_id))
        self.print_with_time('update public_number today_is_crawl is ' + today_is_crawl)
        self.conn.commit()

    def reset_all_public_number_today_is_crawl(self):
        self.mysql_conn.ping()
        sql = 'update public_number set today_is_crawl = 0'
        self.cursor.execute(sql)
        self.print_with_time('reset all public_number today_is_crawl is 0')
        self.conn.commit()

    def insert_crawl_record(self, public_number_id, crawl_status):
        self.mysql_conn.ping()
        self.cursor.execute(
            'insert into crawl_record'
            '(id, public_number_id, crawl_status) '
            'values(%s, %s, %s)',
            (None, public_number_id, crawl_status))
        self.print_with_time('insert crawl_record')
        self.conn.commit()

    def query_public_number_article(self, public_number_wechat_id, public_number_article_title,
                                    public_number_article_publish_time):
        self.mysql_conn.ping()
        # Parameterized so that quotes in article titles cannot break the query
        sql_query = 'select * from public_number_article where public_number_wechat_id = %s ' \
                    'and public_number_article_title = %s ' \
                    'and public_number_article_publish_time = %s'
        count = self.cursor.execute(sql_query, (public_number_wechat_id, public_number_article_title,
                                                public_number_article_publish_time))
        self.print_with_time('query public_number article')
        return count

    def insert_public_number_article(self, public_number_wechat_id, public_number_name, public_number_article_title,
                                     public_number_article_publish_time, public_number_article_content_url,
                                     public_number_article_cover):
        self.mysql_conn.ping()
        self.cursor.execute(
            'insert into public_number_article'
            '(id, public_number_wechat_id, public_number_name, public_number_article_title, '
            'public_number_article_publish_time, public_number_article_content_url, public_number_article_cover) '
            'values(%s, %s, %s, %s, %s, %s, %s)',
            (None, public_number_wechat_id, public_number_name, public_number_article_title,
             public_number_article_publish_time, public_number_article_content_url, public_number_article_cover))
        self.print_with_time('insert public_number article')
        self.conn.commit()

    def close(self):
        self.mysql_conn.close()

    @staticmethod
    def print_with_time(content):
        print(strftime('%Y-%m-%d %H:%M:%S'))
        print(content)
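

# Typical usage, for reference (see crawler/crawler.py and
# public_number/get_public_number.py):
#     mysql_operate = MysqlOperate()
#     rows = mysql_operate.query_public_number()  # accounts not yet crawled today
#     mysql_operate.insert_crawl_record(public_number_id=1, crawl_status=1)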
--------------------------------------------------------------------------------
/public_number/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zjhpure/crawler_public_number/a39aaa91d5b35e2a033ea88059237961d46d2412/public_number/__init__.py
--------------------------------------------------------------------------------
/public_number/get_public_number.py:
--------------------------------------------------------------------------------
# coding: utf-8
import time

from time import strftime
from db.mysql_operate import MysqlOperate
from redis import StrictRedis
from cfg.cfg import redis_db


# Keep feeding accounts that still need crawling today into the public_number
# list in redis, which the quick-macro API pops from
class GetPublicNumber(object):
    mysql_operate = MysqlOperate()

    def query_public_number_count(self):
        # Connect to redis
        redis = StrictRedis(host=redis_db['host'], port=redis_db['port'], password=redis_db['password'])
        # Cap the pending list at n + 1 = 20 entries
        n = 19
        while True:
            self.print_with_time('sleep 10s')
            time.sleep(10)
            for row in self.mysql_operate.query_public_number():
                self.print_with_time('sleep 2s')
                time.sleep(2)
                public_number_wechat_id = row[1]
                public_number_name = row[2]
                public_number_biz = row[3]
                # Wait until the list has room before pushing
                while redis.llen('public_number') > n:
                    self.print_with_time('public_number size can not be more than ' + str(n + 1))
                    self.print_with_time('sleep 2s')
                    time.sleep(2)
                redis.rpush('public_number', public_number_name + '&&' + public_number_wechat_id
                            + '&&' + public_number_biz)
                print('public_number_wechat_id:' + public_number_wechat_id
                      + ' public_number_name:' + public_number_name
                      + ' public_number_biz:' + public_number_biz)

    @staticmethod
    def print_with_time(content):
        print(strftime('%Y-%m-%d %H:%M:%S'))
        print(content)


if __name__ == '__main__':
    get_public_number = GetPublicNumber()
    get_public_number.query_public_number_count()
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
certifi==2018.1.18
chardet==3.0.4
click==6.7
Flask==0.12.2
idna==2.6
itsdangerous==0.24
Jinja2==2.10
lxml==4.2.1
MarkupSafe==1.0
numpy==1.14.3
PyMySQL==0.8.0
redis==2.10.6
requests==2.18.4
urllib3==1.22
Werkzeug==0.14.1
--------------------------------------------------------------------------------
/start.py:
--------------------------------------------------------------------------------
# coding: utf-8
import multiprocessing

from public_number.get_public_number import GetPublicNumber
from crawler.reset_crawl import ResetCrawler
from crawler.crawler import PublicNumberSpider


def start_get_public_number():
    get_public_number = GetPublicNumber()
    get_public_number.query_public_number_count()


def start_reset_crawler():
    reset_crawler = ResetCrawler()
    reset_crawler.run()


def start_crawler():
    public_number_spider = PublicNumberSpider()
    public_number_spider.start_requests()


if __name__ == '__main__':
    multiprocessing.Process(target=start_reset_crawler, name='process: start_reset_crawler').start()
    multiprocessing.Process(target=start_get_public_number, name='process: start_get_public_number').start()
    multiprocessing.Process(target=start_crawler, name='process: start_crawler').start()
--------------------------------------------------------------------------------