├── .gitignore
├── README.md
├── anyproxy
│   ├── __init__.py
│   ├── restart_anyproxy.py
│   └── rule_default.js
├── api.py
├── cfg
│   ├── __init__.py
│   ├── cfg.py
│   ├── cfg_prod.py
│   └── cfg_test.py
├── crawler
│   ├── __init__.py
│   ├── crawler.py
│   └── reset_crawl.py
├── ctl_prod.sh
├── ctl_test.sh
├── db
│   ├── __init__.py
│   ├── db.sql
│   ├── mysql_conn.py
│   └── mysql_operate.py
├── public_number
│   ├── __init__.py
│   └── get_public_number.py
├── requirements.txt
└── start.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
*.out
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
+ Detailed write-up of this project: https://www.jianshu.com/p/667f3668cd70
+ Source of the Android quick-macro (按键精灵) app used with this project: https://github.com/zjhpure/PublicNumberQuickMacro


## Prerequisites
+ A crawler server with a Python 3 environment; Ubuntu 16.04 is recommended because it ships with Python 3.
+ A proxy server with root access and an AnyProxy plus pm2 environment; npm and Node.js must be installed before AnyProxy, and pm2 is used to manage the AnyProxy process.
+ At least one Android phone/emulator (the more official accounts you crawl, the more devices you need; adjust the quick-macro's click frequency to match), with WeChat installed (version 6.6.6), logged in to a WeChat account (a verified account is recommended, otherwise it will quickly be blocked from logging in), and with the quick-macro installed.
+ A MySQL database.
+ A Redis database.
+ If machines are in short supply, the crawler server, proxy server, MySQL and Redis can all run on a single box.


## Before starting
+ Download the project: git clone https://github.com/zjhpure/crawler_public_number
+ Edit the redis connection settings (host, port and password) at the top of anyproxy/rule_default.js to point at your own redis.
+ Copy anyproxy/rule_default.js to /usr/local/lib/node_modules/anyproxy/lib on the proxy server.
+ Start AnyProxy on the proxy server: for the first start run sudo pm2 start anyproxy -x -- -i (the -i flag turns on HTTPS interception); later starts only need sudo pm2 start anyproxy.
+ In /usr/local/lib/node_modules/anyproxy on the proxy server, run sudo npm install redis to add Node's redis module.
+ Run sudo pm2 list on the proxy server to check whether AnyProxy started successfully.
+ You can also open <proxy-server-ip>:8002 in a browser to watch AnyProxy's traffic.
+ Copy anyproxy/restart_anyproxy.py to the proxy server.
+ Optionally run nohup python3 -u restart_anyproxy.py & there so that AnyProxy is restarted once a day around midnight; AnyProxy becomes sluggish after running for a long time.
+ On the Android device, set the Wi-Fi connection to use the AnyProxy proxy: the address is the proxy server's IP, the port is 8001.
+ On the <proxy-server-ip>:8002 page, click RootCA; a QR code and a download button appear.
+ Install the CA certificate either by clicking download on a computer and copying the file to the phone, by scanning the QR code with the phone's browser, or by opening <proxy-server-ip>:8002/fetchCrtFile in the phone's browser.
+ Configure the Android quick-macro with the address of the "get official account" API, which is defined in api.py in the project root.
+ Start the quick-macro on the device.
+ Open WeChat on the phone/emulator. For testing, follow two official accounts: pythonbuluo (Python程序员) and python-china (Python中文社区).
+ You can also add other official accounts to the public_number table yourself; each row needs the account's WeChat id, display name and biz value.
+ To get the biz value, open the account's message history, tap the button in the top-right corner, tap "copy link", and inspect the copied link somewhere convenient (a parsing sketch is at the end of this README).
+ The link looks like: https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MjM5NzU0MzU0Nw==&scene=123&pass_ticket={pass_ticket}
+ Here the biz value is MjM5NzU0MzU0Nw==
+ Edit cfg/cfg_test.py and cfg/cfg_prod.py to configure the test and production environments: the project's MySQL, redis, and the port of the quick-macro API.
+ Create the database tables with db/db.sql.
+ Copy the project to the test and production servers under /data/crawler/; any other location works too, as long as the ROOT variable in ctl_test.sh and ctl_prod.sh is changed to match.


## Starting the project

#### Test environment
+ Enter the project directory.
+ Run chmod +x ctl_test.sh to make ctl_test.sh executable.
+ Run sh ctl_test.sh start_api to start the quick-macro API.
+ Run sh ctl_test.sh start to start the crawler.
+ Start the quick-macro on the phone/emulator; without it, you can test by manually opening an official account's message history.
+ Once running, the project writes a nohup.out file in the project directory with its output.

#### Production environment
+ Enter the project directory.
+ Run chmod +x ctl_prod.sh to make ctl_prod.sh executable.
+ Run sh ctl_prod.sh start_api to start the quick-macro API.
+ Run sh ctl_prod.sh start to start the crawler.
+ Start the quick-macro on the phone/emulator; without it, you can test by manually opening an official account's message history.
+ Once running, the project writes a nohup.out file in the project directory with its output.

## Notes
+ Only the links to article content and covers are stored here.
+ For real production use, download the covers and article bodies yourself and upload them to cloud storage (e.g. Qiniu).
+ The Android quick-macro's flow: at a fixed interval it requests the macro API at ip:port/crawler/public_number/get_click_public_number, receives an official account name, and taps into that account's message history.
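
## Appendix: parsing the biz value
A minimal sketch of pulling __biz out of a copied history-page link, handy when filling the public_number table by hand. extract_biz is a hypothetical helper for illustration, not part of this project:

```python
# coding: utf-8
from urllib.parse import urlparse, parse_qs


def extract_biz(link):
    """Return the __biz query parameter of a history-page link, or None."""
    params = parse_qs(urlparse(link).query)
    return params.get('__biz', [None])[0]


if __name__ == '__main__':
    url = ('https://mp.weixin.qq.com/mp/profile_ext?action=home'
           '&__biz=MjM5NzU0MzU0Nw==&scene=123&pass_ticket=xxx')
    print(extract_biz(url))  # MjM5NzU0MzU0Nw==
```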
--------------------------------------------------------------------------------
/anyproxy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zjhpure/crawler_public_number/a39aaa91d5b35e2a033ea88059237961d46d2412/anyproxy/__init__.py
--------------------------------------------------------------------------------
/anyproxy/restart_anyproxy.py:
--------------------------------------------------------------------------------
# coding: utf-8
import datetime
import time
import os

from time import strftime


# Restart anyproxy every day around midnight; anyproxy becomes very sluggish
# when it runs for a long time without a restart.
class RestartAnyproxy(object):
    def run(self):
        while True:
            self.print_with_time('sleep 10s')
            time.sleep(10)
            # Get the current time
            now = str(datetime.datetime.now())
            # Extract the hour of the current time
            hour = now.split(' ')[1].split(':')[0]
            # Hour 00 means midnight has just passed
            if '00' == hour:
                os.system('sudo pm2 restart anyproxy')
                self.print_with_time('sleep 4000s')
                # Sleep a bit over an hour afterwards so the restart runs only once a day
                time.sleep(4000)

    @staticmethod
    def print_with_time(content):
        print(strftime('%Y-%m-%d %H:%M:%S'))
        print(content)


if __name__ == '__main__':
    restart_anyproxy = RestartAnyproxy()
    restart_anyproxy.run()
--------------------------------------------------------------------------------
/anyproxy/rule_default.js:
--------------------------------------------------------------------------------
function sendToRedis(x_wechat_key, x_wechat_uin, user_agent, cookie, url) {
  var redis = require("redis");
  // Edit the port, host and password below to point at your own redis
  var client = redis.createClient(6379, 'localhost', {});
  client.auth('123456');
  client.on("error", function (err) {
    console.log("error:" + err);
  });
  var now = Math.round(new Date().getTime() / 1000);
  console.log(now);
  client.rpush('click_public_number', x_wechat_key + '&&' + x_wechat_uin + '&&' + user_agent + '&&' + cookie + '&&' + now + '&&' + url, redis.print);
  client.quit();
}

'use strict';

module.exports = {

  summary: 'the default rule for AnyProxy',

  /**
   * Called before a request is sent on to the target server.
   *
   * @param {object} requestDetail
   * @returns
   */
  *beforeSendRequest(requestDetail) {
    return null;
  },

  /**
   * Called before the response is returned to the client. When the request is
   * an official-account history page, push its headers and url to redis for
   * the crawler, unless WeChat served the "operations too frequent" page.
   *
   * @param {object} requestDetail
   * @param {object} responseDetail
   */
  *beforeSendResponse(requestDetail, responseDetail) {
    var tempStr = "mp.weixin.qq.com/mp/profile_ext?action=home";
    var res = requestDetail.url.indexOf(tempStr);
    if (res > 0) {
      var body = responseDetail.response.body;
      // Text of the page WeChat serves when requests are too frequent; it must
      // stay in Chinese to match the response body.
      var regu = "操作频繁,请稍后再试";
      if (body.indexOf(regu) >= 0) {
        console.log('WeChat "operations too frequent" page');
      } else {
        var data = requestDetail.requestOptions;
        sendToRedis(data.headers['x-wechat-key'], data.headers['x-wechat-uin'], data.headers['User-Agent'], data.headers['Cookie'], requestDetail.url);
        console.log(data);
      }
    }
    return null;
  },
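
  // For reference: each entry pushed to the click_public_number list is one
  // string of six fields joined by '&&':
  //   x-wechat-key && x-wechat-uin && User-Agent && Cookie && unix-timestamp && url
  // crawler.py (operate_redis) pops these entries and replays the history-page
  // request with the same headers. HTTPS traffic only reaches this rule when
  // AnyProxy runs with interception enabled (the -i flag, see the README).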

  /**
   * Decide whether to intercept an HTTPS request. Returning false leaves
   * per-request interception off; the README starts AnyProxy with the -i flag,
   * which intercepts HTTPS globally instead.
   *
   * @param {any} requestDetail
   * @returns
   */
  *beforeDealHttpsRequest(requestDetail) {
    return false;
  },

  /**
   * Called when an error occurs while handling a request.
   *
   * @param {any} requestDetail
   * @param {any} error
   * @returns
   */
  *onError(requestDetail, error) {
    return null;
  },

  /**
   * Called when the connection to the target server fails.
   *
   * @param {any} requestDetail
   * @param {any} error
   * @returns
   */
  *onConnectError(requestDetail, error) {
    return null;
  },
};
--------------------------------------------------------------------------------
/api.py:
--------------------------------------------------------------------------------
# coding: utf-8
import json

from cfg.cfg import redis_db, api_port
from flask import Flask
from redis import StrictRedis

app = Flask(__name__)


@app.route('/')
def hello():
    return 'hello world!'


@app.route('/crawler/public_number/get_click_public_number', methods=['GET', 'POST'])
def get_click_public_number():
    # Connect to redis
    redis = StrictRedis(host=redis_db['host'], port=redis_db['port'], password=redis_db['password'])
    if redis.llen('public_number') > 0:
        # The list is not empty: pop an entry from the left so the quick-macro can click it
        info = str(redis.lpop('public_number'), encoding='utf-8')
        info = info.split('&&')
        print(info)
        # msg means "official account fetched"
        data = {"errcode": 0, "msg": "获取公众号成功",
                "result": {"publicNumberName": info[0],
                           "publicNumberWechatId": info[1],
                           "publicNumberBiz": info[2]}}
    else:
        # The list is empty, nothing for the quick-macro to click; msg means "no official account available"
        data = {"errcode": 1, "msg": "无公众号获取"}
    result = json.dumps(data, ensure_ascii=False)
    print(result)
    return result


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=api_port, debug=True)
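

# Example exchange with the quick-macro, for reference (values taken from the
# two test accounts seeded by db/db.sql; the JSON shape is what the handler
# above builds):
#   GET http://<crawler-server-ip>:10002/crawler/public_number/get_click_public_number
#   -> {"errcode": 0, "msg": "获取公众号成功",
#       "result": {"publicNumberName": "Python程序员",
#                  "publicNumberWechatId": "pythonbuluo",
#                  "publicNumberBiz": "MjM5NzU0MzU0Nw=="}}
#   or, when the redis list is empty:
#   -> {"errcode": 1, "msg": "无公众号获取"}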
--------------------------------------------------------------------------------
/cfg/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zjhpure/crawler_public_number/a39aaa91d5b35e2a033ea88059237961d46d2412/cfg/__init__.py
--------------------------------------------------------------------------------
/cfg/cfg.py:
--------------------------------------------------------------------------------
# coding: utf-8
api_port = 10002

redis_db = {
    'host': 'localhost',
    'port': 6379,
    'password': '123456'
}

mysql_db = {
    'host': 'localhost',
    'user': 'root',
    'password': '123456',
    'db': 'crawler_public_number',
    'port': 3306
}
--------------------------------------------------------------------------------
/cfg/cfg_prod.py:
--------------------------------------------------------------------------------
# coding: utf-8
api_port = 10002

redis_db = {
    'host': 'localhost',
    'port': 6379,
    'password': '123456'
}

mysql_db = {
    'host': 'localhost',
    'user': 'root',
    'password': '123456',
    'db': 'crawler_public_number',
    'port': 3306
}
--------------------------------------------------------------------------------
/cfg/cfg_test.py:
--------------------------------------------------------------------------------
# coding: utf-8
api_port = 10002

redis_db = {
    'host': 'localhost',
    'port': 6379,
    'password': '123456'
}

mysql_db = {
    'host': 'localhost',
    'user': 'root',
    'password': '123456',
    'db': 'crawler_public_number',
    'port': 3306
}
--------------------------------------------------------------------------------
/crawler/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zjhpure/crawler_public_number/a39aaa91d5b35e2a033ea88059237961d46d2412/crawler/__init__.py
--------------------------------------------------------------------------------
/crawler/crawler.py:
--------------------------------------------------------------------------------
# coding: utf-8
import datetime
import json
import random
import re
import time
import traceback
from time import strftime

import requests
from lxml import etree
from redis import StrictRedis
from cfg.cfg import redis_db
from db.mysql_operate import MysqlOperate


class PublicNumberSpider(object):
    headers = {
        'Host': 'mp.weixin.qq.com',
        'Connection': 'keep-alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip,deflate',
        'Accept-Language': 'zh-CN,en-US;q=0.8',
        'X-Requested-With': 'com.tencent.mm'
    }
    mysql_operate = MysqlOperate()

    def start_requests(self):
        while True:
            # Read from redis every 5-10 seconds
            n = random.randint(5, 10)
            self.print_with_time('sleep ' + str(n) + 's')
            time.sleep(n)
            # Fetch the request parameters and the biz value from redis
            x_wechat_key, x_wechat_uin, user_agent, cookie, url = self.operate_redis()
            # Note: this mutates the shared class-level headers dict, which
            # get_pub_article also relies on for its article requests
            headers = self.headers
            headers['x-wechat-key'] = x_wechat_key
            headers['x-wechat-uin'] = x_wechat_uin
            headers['User-Agent'] = user_agent
            headers['Cookie'] = cookie
            biz = url.split('&')[1].split('biz=')[1]
            # Check whether this account exists in the database and has not been crawled today
            result = self.mysql_operate.query_public_number_by_biz(public_number_biz=biz)
            self.print_with_time(result)
            if len(result) > 0:
                row = result[0]
                public_number_id = row[0]
                public_number_wechat_id = row[1]
                public_number_name = row[2]
                try:
                    self.print_with_time('public_number_wechat_id:' + public_number_wechat_id
                                         + ' public_number_name:' + public_number_name)
                    response = requests.request('GET', url, headers=headers)
                    meta = {'public_number_wechat_id': public_number_wechat_id,
                            'public_number_name': public_number_name,
                            'public_number_id': public_number_id}
                    article_num = self.parse(response, meta)
                    # Zero articles today may simply mean the account has not published yet
                    if article_num > 0:
                        # Mark the account as crawled today (1) as soon as one crawl succeeds.
                        # Most accounts can publish only once a day; only a few early accounts
                        # may publish several times, and newly registered ones should all be
                        # limited to once a day.
                        self.mysql_operate.update_public_number_today_is_crawl(
                            public_number_wechat_id=str(public_number_wechat_id), today_is_crawl=str(1))
                        # Record the crawl; 1 means success
                        self.mysql_operate.insert_crawl_record(public_number_id=public_number_id, crawl_status=1)
                except Exception as e:
                    self.print_with_time(e)
                    traceback.print_exc()
                    self.print_with_time(
                        'crawler failure, ' + 'public_number_wechat_id:' + public_number_wechat_id
                        + ', public_number_name:' + public_number_name)
                    # Record the crawl; 0 means failure
                    self.mysql_operate.insert_crawl_record(public_number_id=public_number_id, crawl_status=0)

    def parse(self, response, meta):
        # print(response.text)
        # The two literals below are page texts returned by WeChat and must stay
        # in Chinese to match the response body
        if '操作频繁,请稍后再试' in response.text:
            self.print_with_time('hit the "operations too frequent" page')
            return 0
        if '请在微信客户端打开链接。' in response.text:
            self.print_with_time('link expired, WeChat asks to open it in the client')
            return 0
        # Number of articles published today
        article_today_num = 0
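        # The history page embeds a JSON blob of the latest messages. An assumed
        # sketch of the subset of fields used below (not the full schema):
        #   {"list": [
        #       {"comm_msg_info": {"datetime": <unix timestamp>, ...},
        #        "app_msg_ext_info": {
        #            "content_url": "...", "cover": "...",
        #            "multi_app_msg_item_list": [{"content_url": "...", "cover": "..."}, ...],
        #            ...}},
        #       ...]}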
        # Parse the first page of the account's message history; the embedded
        # JSON arrives with HTML-escaped quotes
        msg_list = json.loads(re.findall(r'{"list":.*]}', response.text.replace('&quot;', '"'))[0])
        for sel in msg_list['list']:
            if 'app_msg_ext_info' in sel:
                # Publication time of the message
                ltime = time.localtime(sel['comm_msg_info']['datetime'])
                day = time.strftime('%Y-%m-%d', ltime)
                # Today
                today = datetime.date.today()
                # Yesterday
                yesterday = today - datetime.timedelta(days=1)
                # Only today's and yesterday's articles are crawled. Yesterday's are included
                # so that an article published just before midnight is not missed when the
                # crawl loop happens to straddle midnight.
                # Restricting to today and yesterday also keeps the number of visits down:
                # WeChat limits how often one WeChat account may open history pages per day,
                # and once over the limit it takes roughly 12 hours to recover.
                if str(yesterday) == day:
                    content_url = sel['app_msg_ext_info']['content_url']
                    # content_url is empty for articles that were published and then deleted
                    if content_url != '':
                        url_temp = content_url.split('/s?')[1]
                        url = 'https://mp.weixin.qq.com/s?' + re.sub('amp;', '', url_temp)
                        meta['cover'] = sel['app_msg_ext_info']['cover'].replace('\\', '')
                        self.print_with_time(url)
                        meta['url'] = url
                        response = requests.request('GET', url, headers=self.headers)
                        self.get_pub_article(response, meta)
                    for s in sel['app_msg_ext_info']['multi_app_msg_item_list']:
                        # content_url is empty for articles that were published and then deleted
                        if s['content_url'] != '':
                            muti_url_temp = s['content_url'].split('/s?')[1]
                            muti_url = 'https://mp.weixin.qq.com/s?' + re.sub('amp;', '', muti_url_temp)
                            meta['cover'] = s['cover'].replace('\\', '')
                            self.print_with_time(muti_url)
                            meta['url'] = muti_url
                            response = requests.request('GET', muti_url, headers=self.headers)
                            self.get_pub_article(response, meta)
                if str(today) == day:
                    content_url = sel['app_msg_ext_info']['content_url']
                    # content_url is empty for articles that were published and then deleted
                    if content_url != '':
                        url_temp = content_url.split('/s?')[1]
                        url = 'https://mp.weixin.qq.com/s?' + re.sub('amp;', '', url_temp)
                        meta['cover'] = sel['app_msg_ext_info']['cover'].replace('\\', '')
                        self.print_with_time(url)
                        meta['url'] = url
                        response = requests.request('GET', url, headers=self.headers)
                        self.get_pub_article(response, meta)
                        article_today_num = article_today_num + 1
                    for s in sel['app_msg_ext_info']['multi_app_msg_item_list']:
                        # content_url is empty for articles that were published and then deleted
                        if s['content_url'] != '':
                            muti_url_temp = s['content_url'].split('/s?')[1]
                            muti_url = 'https://mp.weixin.qq.com/s?' + re.sub('amp;', '', muti_url_temp)
                            meta['cover'] = s['cover'].replace('\\', '')
                            self.print_with_time(muti_url)
                            meta['url'] = muti_url
                            response = requests.request('GET', muti_url, headers=self.headers)
                            self.get_pub_article(response, meta)
                            article_today_num = article_today_num + 1
        return article_today_num

    def get_pub_article(self, response, meta):
        public_number_id = meta['public_number_id']
        public_number_wechat_id = meta['public_number_wechat_id']
        public_number_name = meta['public_number_name']
        # Turn the response text into an element tree; xpath needs node objects
        html = etree.HTML(response.text)
        # The article is a repost
        if len(html.xpath('//*[@class="original_page"]')) > 0:
            # Follow the link to the original article; this url is a redirect
            url = html.xpath('//*[@id="js_share_source"]/@href')[0].strip()
            self.print_with_time('pub_article redirect to: ' + url)
            response = requests.request('GET', url, headers=self.headers)
            self.get_pub_article(response, meta)
        # The article is original
        else:
            public_number_article_title = html.xpath('//*[@id="activity-name"]/text()')[0].strip()
            self.print_with_time('public_number_article_title:' + public_number_article_title)
            public_number_article_publish_time = html.xpath('//*[@id="post-date"]/text()')[0].strip()
            self.print_with_time('public_number_article_publish_time:' + public_number_article_publish_time)
            count = self.mysql_operate.query_public_number_article(public_number_wechat_id=public_number_wechat_id,
                                                                   public_number_article_title=public_number_article_title,
                                                                   public_number_article_publish_time=public_number_article_publish_time)
            self.print_with_time(
                'public_number_wechat_id:' + public_number_wechat_id + ' public_number_name:' + public_number_name)
            # An article counts as already crawled when its title and publication time are
            # both in the database; articles already crawled are not crawled again
            if count <= 0:
                # Article cover
                pub_article_cover = meta['cover']
                # Article link
                pub_article_content_url = meta['url']
                # Note: the cover and article body should be downloaded and uploaded to
                # cloud storage (e.g. Qiniu); that part is not covered here.
                # response.text holds the article body.
                # Store the article in the database
                self.mysql_operate.insert_public_number_article(public_number_wechat_id=public_number_wechat_id,
                                                                public_number_name=public_number_name,
                                                                public_number_article_title=public_number_article_title,
                                                                public_number_article_publish_time=public_number_article_publish_time,
                                                                public_number_article_content_url=pub_article_content_url,
                                                                public_number_article_cover=pub_article_cover)
                self.print_with_time(
                    'public_number_id:' + str(public_number_id)
                    + ' public_number_wechat_id:' + public_number_wechat_id
                    + ' public_number_name:' + public_number_name
                    + ' public_number_article_title:' + public_number_article_title
                    + ' public_number_article_publish_time:' + public_number_article_publish_time
                    + ' pub_article_content_url:' + pub_article_content_url
                    + ' pub_article_cover:' + pub_article_cover)

    def operate_redis(self):
        x_wechat_key = None
        x_wechat_uin = None
        user_agent = None
        cookie = None
        url = None
        flag = True
        while flag:
            self.print_with_time('prepare to connect redis')
            # Connect to redis
            redis = StrictRedis(host=redis_db['host'], port=redis_db['port'], password=redis_db['password'])
            # Pop from the left; blpop blocks until an entry is available
            info = str(redis.blpop('click_public_number')[1], encoding='utf-8')
            info = info.split('&&')
            self.print_with_time(info)
            # Timestamp stored in redis when anyproxy intercepted the history-page request
            t = info[4]
            # Current unix timestamp
            now = int(time.time())
            self.print_with_time('now: ' + str(now))
            # The parameters of a history-page request expire, so to avoid replaying a
            # stale request only entries at most 500 seconds old are accepted.
            # The url must also contain pass_ticket: some intercepted urls are incomplete
            # and have to be dropped. For example:
            # sometimes the url is: https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz={biz}&scene=124&
            # sometimes the url is: https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz={biz}&scene=124&devicetype=android-23&version=26060135&lang=zh_CN&nettype=WIFI&a8scene=3&pass_ticket={pass_ticket}&wx_header=1
            # The former kind is discarded.
            if now - int(t) <= 500 and 'pass_ticket' in info[5]:
                flag = False
                x_wechat_key = info[0]
                x_wechat_uin = info[1]
                user_agent = info[2]
                cookie = info[3]
                url = info[5]
                self.print_with_time('x_wechat_key: ' + x_wechat_key)
                self.print_with_time('x_wechat_uin: ' + x_wechat_uin)
                self.print_with_time('user_agent: ' + user_agent)
                self.print_with_time('cookie: ' + cookie)
                self.print_with_time('time: ' + t)
        self.print_with_time('get public_number headers by redis success')
        return x_wechat_key, x_wechat_uin, user_agent, cookie, url

    @staticmethod
    def print_with_time(content):
        print(strftime('%Y-%m-%d %H:%M:%S'))
        print(content)


if __name__ == '__main__':
    public_number_spider = PublicNumberSpider()
    public_number_spider.start_requests()
--------------------------------------------------------------------------------
/crawler/reset_crawl.py:
--------------------------------------------------------------------------------
# coding: utf-8
import datetime
import time

from time import strftime
from db.mysql_operate import MysqlOperate


# Every day around midnight, reset every account's "crawled today" flag to 0
class ResetCrawler(object):
    mysql_operate = MysqlOperate()

    def run(self):
        while True:
            self.print_with_time('sleep 10s')
            time.sleep(10)
            # Get the current time
            now = str(datetime.datetime.now())
            # Extract the hour of the current time
            hour = now.split(' ')[1].split(':')[0]
            # Hour 00 means midnight has just passed
            if '00' == hour:
                # Reset every account's "crawled today" flag to 0
                self.mysql_operate.reset_all_public_number_today_is_crawl()
                self.print_with_time('sleep 4000s')
                # Sleep a bit over an hour afterwards so the reset runs only once a day
                time.sleep(4000)

    @staticmethod
    def print_with_time(content):
        print(strftime('%Y-%m-%d %H:%M:%S'))
        print(content)


if __name__ == '__main__':
    reset_crawler = ResetCrawler()
    reset_crawler.run()
--------------------------------------------------------------------------------
/ctl_prod.sh:
--------------------------------------------------------------------------------
#! /bin/bash

ROOT=/data/crawler/crawler_public_number

assist() {
    echo """
    start        start the process with the production config
    stop         stop the process
    start_api    start the API process with the production config
    stop_api     stop the API process
    """
}

start() {
    rm -rf ${ROOT}/cfg/cfg.py
    cp ${ROOT}/cfg/cfg_prod.py ${ROOT}/cfg/cfg.py
    nohup python3 -u ${ROOT}/start.py &
}

stop() {
    pid=$(ps -ef | grep ${ROOT}/start.py | grep -v grep | awk '{print $2}')
    [ -n "${pid}" ] && kill -9 ${pid}
    echo "stopped: ${pid}"
}

start_api() {
    rm -rf ${ROOT}/cfg/cfg.py
    cp ${ROOT}/cfg/cfg_prod.py ${ROOT}/cfg/cfg.py
    nohup python3 -u ${ROOT}/api.py &
}

stop_api() {
    pid=$(ps -ef | grep ${ROOT}/api.py | grep -v grep | awk '{print $2}')
    [ -n "${pid}" ] && kill -9 ${pid}
    echo "stopped: ${pid}"
}

case $1 in
    start)
        start
        ;;
    stop)
        stop
        ;;
    start_api)
        start_api
        ;;
    stop_api)
        stop_api
        ;;
    *)
        assist
        ;;
esac
--------------------------------------------------------------------------------
/ctl_test.sh:
--------------------------------------------------------------------------------
#! /bin/bash

ROOT=/data/crawler/crawler_public_number

assist() {
    echo """
    start        start the process with the test config
    stop         stop the process
    start_api    start the API process with the test config
    stop_api     stop the API process
    """
}

start() {
    rm -rf ${ROOT}/cfg/cfg.py
    cp ${ROOT}/cfg/cfg_test.py ${ROOT}/cfg/cfg.py
    nohup python3 -u ${ROOT}/start.py &
}

stop() {
    pid=$(ps -ef | grep ${ROOT}/start.py | grep -v grep | awk '{print $2}')
    [ -n "${pid}" ] && kill -9 ${pid}
    echo "stopped: ${pid}"
}

start_api() {
    rm -rf ${ROOT}/cfg/cfg.py
    cp ${ROOT}/cfg/cfg_test.py ${ROOT}/cfg/cfg.py
    nohup python3 -u ${ROOT}/api.py &
}

stop_api() {
    pid=$(ps -ef | grep ${ROOT}/api.py | grep -v grep | awk '{print $2}')
    [ -n "${pid}" ] && kill -9 ${pid}
    echo "stopped: ${pid}"
}

case $1 in
    start)
        start
        ;;
    stop)
        stop
        ;;
    start_api)
        start_api
        ;;
    stop_api)
        stop_api
        ;;
    *)
        assist
        ;;
esac
--------------------------------------------------------------------------------
/db/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zjhpure/crawler_public_number/a39aaa91d5b35e2a033ea88059237961d46d2412/db/__init__.py
--------------------------------------------------------------------------------
/db/db.sql:
--------------------------------------------------------------------------------
drop table if exists `public_number`;
create table `public_number` (
  `id` int(10) not null auto_increment comment 'official account id',
  `public_number_wechat_id` varchar(255) not null comment 'official account wechat id',
  `public_number_name` varchar(255) not null comment 'official account name',
  `public_number_biz` varchar(255) not null comment 'official account biz value (the biz value uniquely identifies an account)',
  `today_is_crawl` tinyint(4) not null default '0' comment 'crawled today (0 -- not yet, 1 -- done)',
  `create_time` timestamp default current_timestamp comment 'creation time',
  `update_time` timestamp default current_timestamp on update current_timestamp comment 'update time',
  `is_del` tinyint(4) not null default '0' comment 'deleted (0 -- no, 1 -- yes)',
  primary key (`id`)
) engine=InnoDB auto_increment=1 default charset=utf8 comment='official accounts';
insert into `public_number`(public_number_wechat_id, public_number_name, public_number_biz)
    values('pythonbuluo', 'Python程序员', 'MjM5NzU0MzU0Nw=='),
          ('python-china', 'Python中文社区', 'MzAxMjUyNDQ5OA==');
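
-- Example of adding another account by hand (placeholder values; fill in the
-- real wechat id, display name and biz value of the account you follow):
-- insert into `public_number`(public_number_wechat_id, public_number_name, public_number_biz)
--     values('your_wechat_id', 'Your Account Name', 'YourBizValue==');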

drop table if exists `public_number_article`;
create table `public_number_article` (
  `id` int(10) not null auto_increment comment 'article id',
  `public_number_wechat_id` varchar(255) not null comment 'official account wechat id',
  `public_number_name` varchar(255) not null comment 'official account name',
  `public_number_article_cover` text not null comment 'article cover url',
  `public_number_article_publish_time` timestamp default current_timestamp comment 'article publication time',
  `public_number_article_title` varchar(255) not null comment 'article title',
  `public_number_article_content_url` text not null comment 'article content url',
  `create_time` timestamp default current_timestamp comment 'creation time',
  `update_time` timestamp default current_timestamp on update current_timestamp comment 'update time',
  `is_del` tinyint(4) not null default '0' comment 'deleted (0 -- no, 1 -- yes)',
  primary key (`id`)
) engine=InnoDB auto_increment=1 default charset=utf8 comment='official account articles';

drop table if exists `crawl_record`;
create table `crawl_record` (
  `id` int(10) not null auto_increment comment 'crawl record id',
  `public_number_id` int(10) not null comment 'official account id',
  `crawl_status` tinyint(4) not null default '-1' comment 'crawl status (0 -- failed, 1 -- succeeded)',
  `create_time` timestamp default current_timestamp comment 'creation time',
  `update_time` timestamp default current_timestamp on update current_timestamp comment 'update time',
  `is_del` tinyint(4) not null default '0' comment 'deleted (0 -- no, 1 -- yes)',
  primary key (`id`)
) engine=InnoDB auto_increment=1 default charset=utf8 comment='crawl records';
--------------------------------------------------------------------------------
/db/mysql_conn.py:
--------------------------------------------------------------------------------
# coding: utf-8
import pymysql
from time import strftime


class MysqlConn(object):
    conn = None
    cursor = None

    def __init__(self, host, user, password, db, port, charset):
        self.host = host
        self.user = user
        self.password = password
        self.db = db
        self.port = port
        self.charset = charset

    def connect(self):
        try:
            self.conn = pymysql.connect(host=self.host,
                                        user=self.user,
                                        password=self.password,
                                        db=self.db,
                                        port=self.port,
                                        charset=self.charset,
                                        autocommit=True)
            self.cursor = self.conn.cursor()
            self.print_with_time('mysql connect success')
        except Exception as e:
            self.print_with_time(e)
            self.print_with_time('mysql connect failure')

    def ping(self):
        try:
            self.conn.ping()
            self.print_with_time('mysql ping success')
        except Exception as e:
            self.print_with_time(e)
            self.print_with_time('mysql ping failure')
            try:
                self.connect()
                self.print_with_time('mysql reconnect success')
            except Exception as e:
                self.print_with_time(e)
                self.print_with_time('mysql reconnect failure')

    def close(self):
        try:
            self.conn.close()
            self.print_with_time('mysql close success')
        except Exception as e:
            self.print_with_time(e)
            self.print_with_time('mysql close failure')

    @staticmethod
    def print_with_time(content):
        print(strftime('%Y-%m-%d %H:%M:%S'))
        print(content)
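

# Typical usage, for reference (db/mysql_operate.py wraps this class with the
# values from cfg/cfg.py):
#     conn = MysqlConn(host='localhost', user='root', password='123456',
#                      db='crawler_public_number', port=3306, charset='utf8')
#     conn.connect()
#     conn.ping()  # reconnects if the connection has dropped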
--------------------------------------------------------------------------------
/db/mysql_operate.py:
--------------------------------------------------------------------------------
# coding: utf-8
from db.mysql_conn import MysqlConn
from cfg.cfg import mysql_db
from time import strftime


class MysqlOperate(object):
    def __init__(self):
        self.mysql_conn = MysqlConn(host=mysql_db['host'],
                                    user=mysql_db['user'],
                                    password=mysql_db['password'],
                                    db=mysql_db['db'],
                                    port=mysql_db['port'],
                                    charset='utf8')
        self.mysql_conn.connect()
        self.conn = self.mysql_conn.conn
        self.cursor = self.mysql_conn.cursor

    def query_public_number(self):
        self.mysql_conn.ping()
        sql_query = 'select * from public_number where is_del = 0 and today_is_crawl = 0'
        self.cursor.execute(sql_query)
        self.print_with_time('query public_number all')
        return self.cursor.fetchall()

    def query_public_number_by_biz(self, public_number_biz):
        self.mysql_conn.ping()
        # Parameterized so that quotes in the input cannot break the query
        sql_query = 'select * from public_number where is_del = 0 and today_is_crawl = 0 ' \
                    'and public_number_biz = %s'
        self.cursor.execute(sql_query, (public_number_biz,))
        self.print_with_time('query public_number by biz, biz = ' + public_number_biz)
        return self.cursor.fetchall()

    def update_public_number_today_is_crawl(self, public_number_wechat_id, today_is_crawl):
        self.mysql_conn.ping()
        sql = 'update public_number set today_is_crawl = %s ' \
              'where public_number_wechat_id = %s and is_del = 0'
        self.cursor.execute(sql, (today_is_crawl, public_number_wechat_id))
        self.print_with_time('update public_number today_is_crawl is ' + today_is_crawl)
        self.conn.commit()

    def reset_all_public_number_today_is_crawl(self):
        self.mysql_conn.ping()
        sql = 'update public_number set today_is_crawl = 0'
        self.cursor.execute(sql)
        self.print_with_time('reset all public_number today_is_crawl is 0')
        self.conn.commit()

    def insert_crawl_record(self, public_number_id, crawl_status):
        self.mysql_conn.ping()
        self.cursor.execute(
            'insert into crawl_record'
            '(id, public_number_id, crawl_status) '
            'values(%s, %s, %s)',
            (None, public_number_id, crawl_status))
        self.print_with_time('insert crawl_record')
        self.conn.commit()

    def query_public_number_article(self, public_number_wechat_id, public_number_article_title,
                                    public_number_article_publish_time):
        self.mysql_conn.ping()
        # Parameterized so that quotes in article titles cannot break the query
        sql_query = 'select * from public_number_article where public_number_wechat_id = %s ' \
                    'and public_number_article_title = %s ' \
                    'and public_number_article_publish_time = %s'
        count = self.cursor.execute(sql_query, (public_number_wechat_id, public_number_article_title,
                                                public_number_article_publish_time))
        self.print_with_time('query public_number article')
        return count

    def insert_public_number_article(self, public_number_wechat_id, public_number_name, public_number_article_title,
                                     public_number_article_publish_time, public_number_article_content_url,
                                     public_number_article_cover):
        self.mysql_conn.ping()
        self.cursor.execute(
            'insert into public_number_article'
            '(id, public_number_wechat_id, public_number_name, public_number_article_title, '
            'public_number_article_publish_time, public_number_article_content_url, public_number_article_cover) '
            'values(%s, %s, %s, %s, %s, %s, %s)',
            (None, public_number_wechat_id, public_number_name, public_number_article_title,
             public_number_article_publish_time, public_number_article_content_url, public_number_article_cover))
        self.print_with_time('insert public_number article')
        self.conn.commit()

    def close(self):
        self.mysql_conn.close()

    @staticmethod
    def print_with_time(content):
        print(strftime('%Y-%m-%d %H:%M:%S'))
        print(content)
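

# Typical usage, for reference (see crawler/crawler.py and
# public_number/get_public_number.py):
#     mysql_operate = MysqlOperate()
#     rows = mysql_operate.query_public_number()  # accounts not yet crawled today
#     mysql_operate.insert_crawl_record(public_number_id=1, crawl_status=1)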
--------------------------------------------------------------------------------
/public_number/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zjhpure/crawler_public_number/a39aaa91d5b35e2a033ea88059237961d46d2412/public_number/__init__.py
--------------------------------------------------------------------------------
/public_number/get_public_number.py:
--------------------------------------------------------------------------------
# coding: utf-8
import time

from time import strftime
from db.mysql_operate import MysqlOperate
from redis import StrictRedis
from cfg.cfg import redis_db


# Keep feeding accounts that still need crawling today into the public_number
# list in redis, which the quick-macro API pops from
class GetPublicNumber(object):
    mysql_operate = MysqlOperate()

    def query_public_number_count(self):
        # Connect to redis
        redis = StrictRedis(host=redis_db['host'], port=redis_db['port'], password=redis_db['password'])
        # Cap the pending list at n + 1 = 20 entries
        n = 19
        while True:
            self.print_with_time('sleep 10s')
            time.sleep(10)
            for row in self.mysql_operate.query_public_number():
                self.print_with_time('sleep 2s')
                time.sleep(2)
                public_number_wechat_id = row[1]
                public_number_name = row[2]
                public_number_biz = row[3]
                # Wait until the list has room before pushing
                while redis.llen('public_number') > n:
                    self.print_with_time('public_number size can not be more than ' + str(n + 1))
                    self.print_with_time('sleep 2s')
                    time.sleep(2)
                redis.rpush('public_number', public_number_name + '&&' + public_number_wechat_id
                            + '&&' + public_number_biz)
                print('public_number_wechat_id:' + public_number_wechat_id
                      + ' public_number_name:' + public_number_name
                      + ' public_number_biz:' + public_number_biz)

    @staticmethod
    def print_with_time(content):
        print(strftime('%Y-%m-%d %H:%M:%S'))
        print(content)


if __name__ == '__main__':
    get_public_number = GetPublicNumber()
    get_public_number.query_public_number_count()
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
certifi==2018.1.18
chardet==3.0.4
click==6.7
Flask==0.12.2
idna==2.6
itsdangerous==0.24
Jinja2==2.10
lxml==4.2.1
MarkupSafe==1.0
numpy==1.14.3
PyMySQL==0.8.0
redis==2.10.6
requests==2.18.4
urllib3==1.22
Werkzeug==0.14.1
--------------------------------------------------------------------------------
/start.py:
--------------------------------------------------------------------------------
# coding: utf-8
import multiprocessing

from public_number.get_public_number import GetPublicNumber
from crawler.reset_crawl import ResetCrawler
from crawler.crawler import PublicNumberSpider


def start_get_public_number():
    get_public_number = GetPublicNumber()
    get_public_number.query_public_number_count()


def start_reset_crawler():
    reset_crawler = ResetCrawler()
    reset_crawler.run()


def start_crawler():
    public_number_spider = PublicNumberSpider()
    public_number_spider.start_requests()


if __name__ == '__main__':
    multiprocessing.Process(target=start_reset_crawler, name='process: start_reset_crawler').start()
    multiprocessing.Process(target=start_get_public_number, name='process: start_get_public_number').start()
    multiprocessing.Process(target=start_crawler, name='process: start_crawler').start()
--------------------------------------------------------------------------------