├── changelog.md ├── log.txt ├── requirements.txt ├── screenshot ├── wx.png └── zfb.png ├── wechatsogou ├── db.pyc ├── api.pyc ├── base.pyc ├── basic.pyc ├── tools.pyc ├── config.pyc ├── __init__.pyc ├── exceptions.pyc ├── filecache.pyc ├── ruokuaicode.pyc ├── __pycache__ │ ├── db.cpython-35.pyc │ ├── api.cpython-35.pyc │ ├── base.cpython-35.pyc │ ├── basic.cpython-35.pyc │ ├── config.cpython-35.pyc │ ├── tools.cpython-35.pyc │ ├── __init__.cpython-35.pyc │ ├── filecache.cpython-35.pyc │ ├── exceptions.cpython-35.pyc │ └── ruokuaicode.cpython-35.pyc ├── base.py ├── __init__.py ├── config.py ├── tools.py ├── exceptions.py ├── filecache.py ├── ruokuaicode.py ├── db.py ├── basic.py └── api.py ├── cache ├── 2029240f6d1128be89ddc32729463129 └── 8f0f136a8d509c9a5f221e61e813c820 ├── test.py ├── logging.conf ├── auto_add_mp_logging.conf ├── README.md ├── auto_add_mp.py ├── cookies.txt ├── updatewenzhang.py ├── updatemp.py ├── jubang.sql └── auto_add_mp_log.txt /changelog.md: -------------------------------------------------------------------------------- 1 | # 1.0.0 2 | 3 | - 重写项目 -------------------------------------------------------------------------------- /log.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/log.txt -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | bs4 2 | requests 3 | PyMySQL 4 | lxml 5 | pillow 6 | werkzeug 7 | -------------------------------------------------------------------------------- /screenshot/wx.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/screenshot/wx.png -------------------------------------------------------------------------------- /screenshot/zfb.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/screenshot/zfb.png -------------------------------------------------------------------------------- /wechatsogou/db.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/db.pyc -------------------------------------------------------------------------------- /wechatsogou/api.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/api.pyc -------------------------------------------------------------------------------- /wechatsogou/base.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/base.pyc -------------------------------------------------------------------------------- /wechatsogou/basic.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/basic.pyc -------------------------------------------------------------------------------- /wechatsogou/tools.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/tools.pyc -------------------------------------------------------------------------------- /wechatsogou/config.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/config.pyc -------------------------------------------------------------------------------- /wechatsogou/__init__.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/__init__.pyc -------------------------------------------------------------------------------- /wechatsogou/exceptions.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/exceptions.pyc -------------------------------------------------------------------------------- /wechatsogou/filecache.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/filecache.pyc -------------------------------------------------------------------------------- /wechatsogou/ruokuaicode.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/ruokuaicode.pyc -------------------------------------------------------------------------------- /cache/2029240f6d1128be89ddc32729463129: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/cache/2029240f6d1128be89ddc32729463129 -------------------------------------------------------------------------------- /cache/8f0f136a8d509c9a5f221e61e813c820: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/cache/8f0f136a8d509c9a5f221e61e813c820 -------------------------------------------------------------------------------- /wechatsogou/__pycache__/db.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/__pycache__/db.cpython-35.pyc -------------------------------------------------------------------------------- /wechatsogou/__pycache__/api.cpython-35.pyc: 
class WechatSogouBase(object):
    """Common base class for the Sogou-search-based WeChat official-account crawler API.

    Defines no attributes or methods of its own; it only serves as a shared
    ancestor for the package's classes.
    """
https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /wechatsogou/__pycache__/filecache.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/__pycache__/filecache.cpython-35.pyc -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #查找公众号最新文章 3 | import sys 4 | 5 | print(sys.version_info[0]) 6 | is_python3 = sys.version_info[0] > 2 7 | print(is_python3) -------------------------------------------------------------------------------- /wechatsogou/__pycache__/exceptions.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/__pycache__/exceptions.cpython-35.pyc -------------------------------------------------------------------------------- /wechatsogou/__pycache__/ruokuaicode.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/__pycache__/ruokuaicode.cpython-35.pyc -------------------------------------------------------------------------------- /wechatsogou/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from wechatsogou.api import WechatSogouApi 4 | from wechatsogou.db import mysql 5 | from wechatsogou.filecache import WechatCache 6 | 7 | __all__ = ['WechatSogouApi', 'WechatCache', 'mysql'] 8 | 9 | __version__ = "1.1.7" 10 | -------------------------------------------------------------------------------- /wechatsogou/config.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # 缓存配置 4 | cache_dir = 'cache' 5 | cache_session_name = 'requests_wechatsogou_session' 6 | 7 | # mysql数据库配置 8 | host = '127.0.0.1' 9 | user = 'sougou' # 数据库用户名 10 | passwd = '123456' # 数据库密码 11 | db = 'jubang' # 默认数据库 12 | charset = 'utf8mb4' 13 | prefix = '' # 默认数据表前缀,可以不用写 14 | 15 | # 打码平台配置ruokuai http://www.ruokuai.com/ 16 | # 注册并充值后,就可以直接使用,识别一个验证码大约0.008元 17 | # 搜狗微信有点变态,有时明明验证码是正确的,他非说是错误的,这是没有办法的事情,好在这个概率非常低 18 | dama_name = 'xxx' #用户名 19 | dama_pswd = 'xxx' #密码 20 | -------------------------------------------------------------------------------- /logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root 3 | 4 | [handlers] 5 | keys=rotateFileHandler 6 | 7 | [formatters] 8 | keys=simpleFormatter 9 | 10 | [logger_root] 11 | level=WARNING 12 | handlers=rotateFileHandler 13 | qualname=simpleExample 14 | propagate=0 15 | 16 | [handler_rotateFileHandler] 17 | class=handlers.RotatingFileHandler 18 | level=WARNING 19 | formatter=simpleFormatter 20 | args=('log.txt', 'a+', 200000, 9) 21 | 22 | [formatter_simpleFormatter] 23 | format=%(asctime)s - [%(filename)s:%(lineno)d] - %(levelname)s - %(message)s 24 | datefmt= -------------------------------------------------------------------------------- /auto_add_mp_logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root 3 | 4 | [handlers] 5 | keys=rotateFileHandler 6 | 7 | [formatters] 8 | keys=simpleFormatter 9 | 10 | [logger_root] 11 | level=WARNING 12 | handlers=rotateFileHandler 13 | qualname=simpleExample 14 | propagate=0 15 | 16 | [handler_rotateFileHandler] 17 | class=handlers.RotatingFileHandler 18 | level=WARNING 19 | formatter=simpleFormatter 20 | args=('auto_add_mp_log.txt', 'a+', 200000, 9) 21 | 22 | [formatter_simpleFormatter] 23 | format=%(asctime)s - 
# -*- coding: utf-8 -*-
"""Exception hierarchy for the Sogou-search-based WeChat official-account crawler."""


class WechatSogouException(Exception):
    """Base class for every exception raised by the crawler API."""


class WechatSogouVcodeException(WechatSogouException):
    """Raised when a verification code (captcha) is encountered."""


class WechatSogouJsonException(WechatSogouException):
    """Raised when a response is not standard JSON."""


class WechatSogouEndException(WechatSogouException):
    """Raised to signal that data processing has finished."""


class WechatSogouBreakException(WechatSogouException):
    """Raised to signal an interruption."""


class WechatSogouHistoryMsgException(WechatSogouException):
    """Raised for history-message handling.

    NOTE(review): the original docstring was a copy of
    WechatSogouEndException's ("data processing finished"); the purpose
    stated here is inferred from the class name only -- confirm.
    """


class ConfigException(WechatSogouException):
    """Raised for configuration errors."""


class WechatSogouRequestsException(WechatSogouException):
    """Raised when fetching a page fails.

    Attributes:
        errmsg: the human-readable error message (also ``args[0]``).
        status_code: the status code supplied by the caller.
    """

    def __init__(self, errmsg, status_code):
        """Store the message on the exception itself and keep the status code.

        The previous implementation instantiated (and discarded) a separate
        ``WechatSogouException(errmsg)`` instead of initialising this
        instance, so ``str(exc)`` rendered the ``(errmsg, status_code)``
        tuple rather than the message.
        """
        # Explicit two-argument super() keeps the module importable on Python 2.
        super(WechatSogouRequestsException, self).__init__(errmsg)
        self.errmsg = errmsg
        self.status_code = status_code

    def __reduce__(self):
        """Support pickle/copy: ``args`` holds only the message, so both
        constructor arguments have to be spelled out explicitly."""
        return (self.__class__, (self.errmsg, self.status_code), self.__dict__)
password, soft_id, soft_key): 11 | self.username = username 12 | self.password = md5(password.encode('utf-8')).hexdigest() 13 | self.soft_id = soft_id 14 | self.soft_key = soft_key 15 | self.base_params = { 16 | 'username': self.username, 17 | 'password': self.password, 18 | 'softid': self.soft_id, 19 | 'softkey': self.soft_key, 20 | } 21 | self.headers = { 22 | 'Connection': 'Keep-Alive', 23 | 'Expect': '100-continue', 24 | 'User-Agent': 'ben', 25 | } 26 | 27 | def create(self, im, im_type, timeout=60): 28 | """ 29 | im: 图片字节 30 | im_type: 题目类型 31 | """ 32 | params = { 33 | 'typeid': im_type, 34 | 'timeout': timeout, 35 | } 36 | params.update(self.base_params) 37 | files = {'image': ('a.jpg', im)} 38 | r = requests.post('http://api.ruokuai.com/create.json', data=params, files=files, headers=self.headers) 39 | return r.json() 40 | 41 | def report_error(self, im_id): 42 | """ 43 | im_id:报错题目的ID 44 | """ 45 | params = { 46 | 'id': im_id, 47 | } 48 | params.update(self.base_params) 49 | r = requests.post('http://api.ruokuai.com/reporterror.json', data=params, headers=self.headers) 50 | return r.json() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 基于搜狗微信搜索的微信公众号爬虫 2 | === 3 | 4 | [![pypi supported versions](https://img.shields.io/pypi/pyversions/kubernetes.svg)](https://github.com/jaryee/wechat_sogou_crawl) 5 | 6 | 2019-03-30 适应搜狗2019-03-29规则变化 7 | 8 | 2019-03-07 增加对py3的支持,同时支持py2和py3 9 | 10 | 2017-4-27搜狗微信取消了阅读、点攒及评论数据,所以无法通过搜狗获取这些数据了. 11 | 12 | # 项目简介 13 | 基于搜狗微信搜索的微信公众号爬虫 14 | 可以抓取指定公众号的文章信息 15 | 16 | # 赞助作者 17 | 俺是自由职业者,好汉们如果可能的话赞助一些让俺将开源事业进行到底,谢谢!!! 
18 | 19 | 20 | 21 | 22 | 兄弟我弄了个淘宝店,有时间的兄弟给捧个场啊,新店需要信誉积分,跪谢!只要一块钱,就能温暖你我他 23 | https://item.taobao.com/item.htm?spm=a230r.1.14.16.PRhaio&id=543333631871&ns=1&abbucket=6#detail 24 | 25 | 26 | 27 | 使用教程大家可以去我的微博查看: 28 | http://blog.csdn.net/niuxiaojia09/article/details/55260770 29 | 30 | 31 | 2017-1-20 增加如何使程序进入搜狗微信登录状态的说明,在Updatemp.py和UpdateWenzhang.py中都有操作说明 32 | 2017-3-21 在API.py中增加把文章本地化的函数,可以根据自己的需要把文章下载到本地 33 | 34 | # 项目使用 35 | 36 | 一、使用说明 37 | 38 | 1、在mysql数据库中创建数据库,数据库命名为Jubang,数据格式为utf8mb4,然后导入jubang.sql文件,创建对应的数据库表 39 | 40 | 2、修改config.py文件中对应的设置,打码平台配置ruokuai这个一定要设置,否则出现验证码就不能正常工作了 41 | 42 | 3、执行:pip install -r requirements.txt 安装所需要的第三方包 43 | 44 | 4、手动或自动在add_mp_list表中增加数据,然后运行auto_add_mp.py文件。 45 | 比如可以这样用:给auto_add_mp.py设定一个定时任务,5分钟或10分钟,然后前台页面文件让使用者添加待抓取的 46 | 公众号信息,然后定时任务执行时就可以把这些公众号加入待抓取列表了 47 | add_mp_list中 48 | name字段是模糊抓取,会根据输入的名称模糊加入10个公众号 49 | wx_hao字段是精确抓取,这个是公众号的微信号,只抓取一个 50 | 这两个字段可以任意填入一个就行 51 | 52 | 5、执行updatemp.py文件,文件说明看后面。使用中可以给该文件设定定时任务30分钟或其它间隔,每隔一定时间,运行该 53 | 文件就会抓取已添加的公众号是否有新文章发出来。 54 | 第一次使用会抓取公众号的最近10条群发数据 55 | 56 | 6、执行updatewenzhang.py文件,该文件是抓取文章阅读及点攒数的。最新的数据会写入wenzhang_info表中,并且会在表wenzhang_statistics中 57 | 添加增量记录,可以根据wenzhang_statistics表中的数据生成曲线图 58 | 使用中可以给该文件添加5分钟或其它时间的定时任务,这样就可以来生成对应的阅读曲线图了 59 | 60 | 二、文件说明 61 | 62 | 1、updatemp.py 63 | 该文件遍历待抓取列表(数据库表:mp_info),查询表中的公众号是否有新文章发布,如果有,就抓取新的文章信息并 64 | 放入数据库表wenzhang_info中 65 | 66 | 2、updatewenzhang.py 67 | 该文件遍历文章表,然后抓取24小时之内的文章阅读数据存入表wenzhang_info和表wenzhang_statistics中 68 | 69 | 3、 auto_add_mp.py 70 | 该文件将指定的公众号添加到待抓取列表中 71 | 该文件读取数据库表(add_mp_list)中的内容,然后将其中指定的公众号填入数据库表(mp_info)中 72 | 73 | 74 | 75 | # TODO 76 | - [x] 使用py2.7 77 | - [x] 获取指定公众号文章 78 | - [x] 文章详情页信息 79 | - [x] 验证码自动识别 80 | 81 | --- 82 | -------------------------------------------------------------------------------- /auto_add_mp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #添加指定公众号到爬虫数据库 3 | 4 | # 导入包 5 | from wechatsogou.tools import * 6 | from 
from wechatsogou import *
from PIL import Image
import datetime
import time
import sys,locale
import logging
import logging.config

# Logging (configured from auto_add_mp_logging.conf).
logging.config.fileConfig('auto_add_mp_logging.conf')
logger = logging.getLogger()

# Search API instance.
wechats = WechatSogouApi()

# Database handle, initially bound to the add_mp_list table.
# (Rebinding the name `mysql` shadows the imported class; kept as in the original.)
mysql = mysql('add_mp_list')

# Pending requests. NOTE(review): find(0) presumably returns every row -- confirm in db.py.
add_list = mysql.find(0)
succ_count = 0  # NOTE(review): never updated or reported anywhere below.


def _build_mp_row(info):
    """Map an account-info dict returned by the search API to an mp_info row."""
    return {
        'name': info['name'],
        'wx_hao': info['wechatid'],
        'company': info['renzhen'],
        'description': info['jieshao'],
        'logo_url': info['img'],
        'qr_url': info['qrcode'],
        'wz_url': info['url'],
        'last_qunfa_id': 0,
        'create_time': time.strftime("%Y-%m-%d %H:%M:%S"),
    }


def _as_text(value):
    """Decode byte strings as UTF-8; return text values untouched.

    The original called ``.decode("utf-8")`` unconditionally, which raises
    AttributeError for Python 3 ``str`` and aborted the insert that followed.
    """
    if isinstance(value, bytes):
        return value.decode("utf-8")
    return value


for add_item in add_list:
    try:
        print(add_item)
        if add_item['wx_hao']:
            # Exact match: add the single account with this WeChat id.
            print("add by wx_hao")
            # NOTE(review): SQL is assembled by string concatenation from a
            # value stored in add_mp_list; a quote in it breaks (or injects
            # into) the query. Switch to the db wrapper's escaping /
            # parameter mechanism once its behaviour is confirmed.
            mysql.where_sql = "wx_hao ='" + add_item['wx_hao'] + "'"
            mp_data = mysql.table('mp_info').find(1)
            if not mp_data:
                wechat_info = wechats.get_gzh_info(add_item['wx_hao'])
                time.sleep(1)
                if wechat_info != "":
                    mysql.table('mp_info').add(_build_mp_row(wechat_info))
            else:
                print(u"已经存在的公众号")
        elif add_item['name']:
            # Fuzzy match: add every account the name search returns.
            print("add by name")
            # NOTE(review): the search term is passed as UTF-8 bytes, as in
            # the original; confirm search_gzh_info accepts bytes on Python 3.
            wechat_infos = wechats.search_gzh_info(add_item['name'].encode('utf8'))
            time.sleep(1)
            for wx_item in wechat_infos:
                # Insert the account only if it is not already in mp_info.
                print(wx_item['name'])
                # NOTE(review): same string-built SQL concern as above; the
                # id here comes from scraped search results.
                mysql.where_sql = "wx_hao ='" + wx_item['wechatid'] + "'"
                print(mysql.where_sql)
                mp_data = mysql.table('mp_info').find(1)
                if not mp_data:
                    print(_as_text(wx_item['name']))
                    mysql.table('mp_info').add(_build_mp_row(wx_item))
                else:
                    print(u"已经存在的公众号")

        # Remove the processed request from the queue table.
        mysql.table('add_mp_list').where({'_id': add_item['_id']}).delete()
    except Exception:
        # Best effort: record the traceback, then move on to the next request.
        # `except Exception` (not a bare except) lets Ctrl-C stop the script.
        logger.exception("failed to process add_mp_list row %r", add_item)
        print(u"出错,继续")
        continue


print("success")
false, 67 | "session": false, 68 | "storeId": "0", 69 | "value": "00E11DBD73AB78605C75E06C33CEE997", 70 | "id": 5 71 | }, 72 | { 73 | "domain": ".weixin.sogou.com", 74 | "expirationDate": 2181949036.693053, 75 | "hostOnly": false, 76 | "httpOnly": false, 77 | "name": "SUID", 78 | "path": "/", 79 | "sameSite": "no_restriction", 80 | "secure": false, 81 | "session": false, 82 | "storeId": "0", 83 | "value": "6078AB737D29990A000000005C75E06B", 84 | "id": 6 85 | }, 86 | { 87 | "domain": "weixin.sogou.com", 88 | "expirationDate": 1553821036.664727, 89 | "hostOnly": true, 90 | "httpOnly": false, 91 | "name": "ABTEST", 92 | "path": "/", 93 | "sameSite": "no_restriction", 94 | "secure": false, 95 | "session": false, 96 | "storeId": "0", 97 | "value": "0|1551229035|v1", 98 | "id": 7 99 | }, 100 | { 101 | "domain": "weixin.sogou.com", 102 | "hostOnly": true, 103 | "httpOnly": false, 104 | "name": "JSESSIONID", 105 | "path": "/", 106 | "sameSite": "no_restriction", 107 | "secure": false, 108 | "session": true, 109 | "storeId": "0", 110 | "value": "aaataTUG3ggNN7aMO65Mw", 111 | "id": 8 112 | }, 113 | { 114 | "domain": "weixin.sogou.com", 115 | "hostOnly": true, 116 | "httpOnly": false, 117 | "name": "PHPSESSID", 118 | "path": "/", 119 | "sameSite": "no_restriction", 120 | "secure": false, 121 | "session": true, 122 | "storeId": "0", 123 | "value": "hipbln966cc23kddoj9qb54385", 124 | "id": 9 125 | }, 126 | { 127 | "domain": "weixin.sogou.com", 128 | "expirationDate": 1559869037, 129 | "hostOnly": true, 130 | "httpOnly": false, 131 | "name": "weixinIndexVisited", 132 | "path": "/", 133 | "sameSite": "no_restriction", 134 | "secure": false, 135 | "session": false, 136 | "storeId": "0", 137 | "value": "1", 138 | "id": 10 139 | } 140 | ] -------------------------------------------------------------------------------- /updatewenzhang.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #更新文章阅读数据,目前一篇文章只监控24小时 3 | 4 | # 
from wechatsogou.tools import *
from wechatsogou import *
from PIL import Image
import datetime
import time
import logging
import logging.config

# Logging (configured from logging.conf).
logging.config.fileConfig('logging.conf')
logger = logging.getLogger()


# Search API instance.
wechats = WechatSogouApi()

# To crawl with a logged-in Sogou WeChat session, supply external cookies:
#   1. Log in to Sogou WeChat in Chrome (with the EditThisCookie extension installed).
#   2. Copy the current cookies with EditThisCookie.
#   3. Paste them into cookies.txt in the code directory.
#   4. Enable the line below.
#wechats = WechatSogouApi(cookies_file={'file_name':'cookies.txt'}) # use external cookies


# Database handle. The ordering is set on the instance after construction,
# as updatemp.py does; the original assigned it on the class before
# instantiating it.
mysql = mysql('mp_info')
mysql.order_sql = " order by _id desc"

# All accounts stored in mp_info.
# NOTE(review): find(0) presumably returns every row -- confirm in db.py.
mp_list = mysql.find(0)


now_time = datetime.datetime.now()
# Only articles from the last day are refreshed; use days=-2 for two days.
yes_time = now_time + datetime.timedelta(days=-1)
# Loop-invariant epoch form of the cutoff, compared against each message's 'datetime'.
cutoff_timestamp = time.mktime(yes_time.timetuple())
succ_count = 1

for item in mp_list:
    try:
        # Cheap pre-check: skip accounts with no article newer than the cutoff.
        mysql.where_sql = "mp_id=%d and date_time >'%s'" % (item['_id'], yes_time)
        wz_time = mysql.table('wenzhang_info').find(1)
        if not wz_time:
            continue

        print(item['name'])
        # `in` replaces dict.has_key(), which does not exist on Python 3 and
        # made every iteration fall into the error handler there.
        if 'wz_url' in item:
            wz_url = item['wz_url']
        else:
            wechat_info = wechats.get_gzh_info(item['wx_hao'])
            if 'url' not in wechat_info:
                continue
            wz_url = wechat_info['url']

        wz_list = wechats.get_gzh_message(url=wz_url)
        if u'链接已过期' in wz_list:
            # The stored article-list link expired: look the account up again,
            # retry with the fresh link and persist the refreshed URLs.
            wechat_info = wechats.get_gzh_info(item['wx_hao'])
            print(wechat_info)
            if 'url' not in wechat_info:
                continue
            print('guo qi sz chong xin huo qu success')
            wz_url = wechat_info['url']
            wz_list = wechats.get_gzh_message(url=wz_url)
            mysql.where_sql = " _id=%s" % (item['_id'])
            mysql.table('mp_info').save({'wz_url': wechat_info['url'],
                                         'logo_url': wechat_info['img'],
                                         'qr_url': wechat_info['qrcode']})

        for wz_item in wz_list:
            # Stop at the first message older than the cutoff.
            if wz_item['datetime'] < cutoff_timestamp:
                break

            # type == '49' marks an article (image-and-text) message.
            if wz_item['type'] == '49':
                time.sleep(0.5)
                article_info = wechats.deal_article(url=wz_item['content_url'])
                mysql.where_sql = " mp_id=%d and qunfa_id=%d and msg_index=%d" % (
                    item['_id'], wz_item['qunfa_id'], wz_item['main'])
                wz_data = mysql.table('wenzhang_info').find(1)
                if not wz_data:
                    print(u"公众号有新文章了,请执行Updtaemp.py进行抓取")
                    continue

                print(succ_count)
                succ_count += 1

                # Totals currently stored for this article.
                read_count = wz_data['read_count']
                like_count = wz_data['like_count']
                comment_count = wz_data['comment_count']

                # Convert the freshly fetched counters once, so the %d
                # formatting below also works if they arrive as numeric
                # strings (the original only applied int() afterwards).
                stats = article_info['comment']
                new_read = int(stats['read_num'])
                new_like = int(stats['like_num'])
                new_comment = int(stats['elected_comment_total_cnt'])
                print("%d new_read:%d new_like:%d read:%d like:%d" % (
                    wz_data['_id'], new_read, new_like, read_count, like_count))

                # Record the increment since the previous run.
                mysql.table('wenzhang_statistics').add({
                    'wz_id': wz_data['_id'],
                    'create_time': time.strftime("%Y-%m-%d %H:%M:%S"),
                    'read_count': new_read - read_count,
                    'like_count': new_like - like_count,
                    'comment_count': new_comment - comment_count})

                # Store the new running totals on the article row.
                mysql.where_sql = " _id=%s" % (wz_data['_id'])
                mysql.table('wenzhang_info').save({'read_count': new_read,
                                                   'like_count': new_like,
                                                   'comment_count': new_comment})
    except KeyboardInterrupt:
        break
    except Exception:
        # Best effort: keep going with the next account, but leave a
        # traceback in the log instead of discarding the error silently.
        logger.exception("failed to update article statistics for account %r", item)
        print(u"出错,继续")
        continue

print('success')
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #查找公众号最新文章 3 | 4 | # 导入包 5 | from wechatsogou.tools import * 6 | from wechatsogou import * 7 | from PIL import Image 8 | import datetime 9 | import time 10 | import logging 11 | import logging.config 12 | import random 13 | 14 | # 日志 15 | logging.config.fileConfig('logging.conf') 16 | logger = logging.getLogger() 17 | 18 | # 搜索API实例 19 | wechats = WechatSogouApi() #不使用外部Cookie 20 | 21 | 22 | #如果想使用外部cookie,主要是为了实现搜狗微信登录状态 23 | #你需要安装chrom浏览器,然后给浏览器安装EditThisCooke这个插件 24 | #1、使用Chrom浏览器登录搜狗微信 25 | #2、使用EditThisCooke插件复制当前Cookie信息 26 | #3、把cookie信息复制到代码目录下的cookies.txt文件 27 | #4、开启下面这行语句 28 | #wechats = WechatSogouApi(cookies_file={'file_name':'cookies.txt'}) #使用外部cookie 29 | 30 | 31 | #数据库实例 32 | mysql = mysql('mp_info') 33 | 34 | #循环获取数据库中所有公众号 35 | mysql.order_sql = " order by _id desc" 36 | mp_list = mysql.find(0) 37 | 38 | succ_count = 0 39 | 40 | now_time = datetime.datetime.today() 41 | now_time = datetime.datetime(now_time.year, now_time.month, now_time.day, 0, 0, 0) 42 | #now_time = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(now_time)) 43 | 44 | for item in mp_list: 45 | try: 46 | time.sleep(random.randrange(1,3)) 47 | #查看一下该号今天是否已经发送文章 48 | last_qunfa_id = item['last_qunfa_id'] 49 | last_qunfa_time = item['last_qufa_time'] 50 | 51 | cur_qunfa_id = last_qunfa_id 52 | wz_url = item['wz_url'] 53 | 54 | print(item['name']) 55 | 56 | #获取最近文章信息 57 | wz_list = wechats.get_gzh_message(url=wz_url) 58 | if u'链接已过期' in wz_list: 59 | wechat_info = wechats.get_gzh_info(item['wx_hao']) 60 | if 'url' not in wechat_info : 61 | continue 62 | print('guo qi sz chong xin huo qu success') 63 | wz_url = wechat_info['url']; 64 | wz_list = wechats.get_gzh_message(url=wz_url) 65 | mysql.where_sql = " _id=%s" %(item['_id']) 66 | 
mysql.table('mp_info').where({'_id':item['_id']}).save({'wz_url':wechat_info['url'],'logo_url':wechat_info['img'],'qr_url':wechat_info['qrcode']}) 67 | #type==49表示是图文消息 68 | qunfa_time = '' 69 | for wz_item in wz_list : 70 | temp_qunfa_id = int(wz_item['qunfa_id']) 71 | if(last_qunfa_id >= temp_qunfa_id): 72 | print(u"没有更新文章") 73 | print(u"") 74 | break 75 | if(cur_qunfa_id < temp_qunfa_id): 76 | cur_qunfa_id = temp_qunfa_id 77 | qunfa_time = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(wz_item['datetime'])) 78 | succ_count += 1 79 | if wz_item['type'] == '49': 80 | #把文章写入数据库 81 | #更新文章条数 82 | print(succ_count) 83 | print(wz_item['content_url']) 84 | if not wz_item['content_url'] : 85 | continue 86 | 87 | sourceurl = wz_item['source_url'] 88 | if len(sourceurl) >= 300 : 89 | sourceurl = '' 90 | 91 | #如果想把文章下载到本地,请开启下面的语句,请确保已经安装:urllib2,httplib2,BeautifulSoup4 92 | #返回值为下载的html文件路径,可以自己保存到数据库 93 | #index_html_path = wechats.down_html(wz_item['content_url'],wz_item['title']) 94 | 95 | #获取文章正文 96 | wz_content = wechats.deal_article_content(url=wz_item['content_url']) 97 | 98 | mysql.table('wenzhang_info').add({'title':wz_item['title'], 99 | 'source_url':sourceurl, 100 | 'content_url':wz_item['content_url'], 101 | 'cover_url':wz_item['cover'], 102 | 'description':wz_item['digest'], 103 | 'date_time': time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(wz_item['datetime'])), 104 | 'mp_id':item['_id'], 105 | 'author':wz_item['author'], 106 | 'msg_index':wz_item['main'], 107 | 'copyright_stat':wz_item['copyright_stat'], 108 | 'qunfa_id':wz_item['qunfa_id'], 109 | 'type':wz_item['type'], 110 | 'like_count':0, 111 | 'read_count':0, 112 | 'comment_count':0, 113 | 'content':wz_content}) 114 | 115 | 116 | 117 | #更新最新推送ID 118 | if(last_qunfa_id < cur_qunfa_id): 119 | mysql.where_sql = " _id=%s" %(item['_id']) 120 | mysql.table('mp_info').save({'last_qunfa_id':cur_qunfa_id,'last_qufa_time':qunfa_time,'update_time':time.strftime("%Y-%m-%d 
%H:%M:%S",time.localtime(time.time()))}) 121 | except KeyboardInterrupt: 122 | break 123 | # except: #如果不想因为错误使程序退出,可以开启这两句代码 124 | # print(u"出错,继续") 125 | # continue 126 | 127 | print('success') -------------------------------------------------------------------------------- /jubang.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Navicat MySQL Data Transfer 3 | 4 | Source Server : 47.105.144.60 5 | Source Server Version : 50723 6 | Source Host : 47.105.144.60:3306 7 | Source Database : test 8 | 9 | Target Server Type : MYSQL 10 | Target Server Version : 50723 11 | File Encoding : 65001 12 | 13 | Date: 2019-03-07 20:26:19 14 | */ 15 | 16 | SET FOREIGN_KEY_CHECKS=0; 17 | 18 | -- ---------------------------- 19 | -- Table structure for `add_mp_list` 20 | -- ---------------------------- 21 | DROP TABLE IF EXISTS `add_mp_list`; 22 | CREATE TABLE `add_mp_list` ( 23 | `_id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增ID', 24 | `name` varchar(50) DEFAULT '' COMMENT '要添加的公众号名称', 25 | `wx_hao` varchar(50) DEFAULT '' COMMENT '公众号的微信号', 26 | PRIMARY KEY (`_id`) 27 | ) ENGINE=InnoDB AUTO_INCREMENT=5 DEFAULT CHARSET=utf8mb4; 28 | 29 | -- ---------------------------- 30 | -- Records of add_mp_list 31 | -- ---------------------------- 32 | 33 | -- ---------------------------- 34 | -- Table structure for `mp_info` 35 | -- ---------------------------- 36 | DROP TABLE IF EXISTS `mp_info`; 37 | CREATE TABLE `mp_info` ( 38 | `_id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增ID', 39 | `name` varchar(50) DEFAULT '' COMMENT '公众号名称', 40 | `wx_hao` varchar(20) DEFAULT '' COMMENT '公众号的微信号', 41 | `company` varchar(100) DEFAULT '' COMMENT '主体名称', 42 | `description` varchar(200) DEFAULT '' COMMENT '功能简介', 43 | `logo_url` varchar(200) DEFAULT '' COMMENT 'logo url', 44 | `qr_url` varchar(200) DEFAULT '' COMMENT '二维码URL', 45 | `create_time` datetime DEFAULT NULL COMMENT '加入牛榜时间', 46 | `update_time` datetime DEFAULT NULL COMMENT '最后更新时间', 47 | 
`rank_article_release_count` int(11) DEFAULT '0' COMMENT '群发次数', 48 | `rank_article_count` int(11) DEFAULT '0' COMMENT '群发篇数', 49 | `last_qunfa_id` int(30) DEFAULT '0' COMMENT '最后的群发ID', 50 | `last_qufa_time` datetime DEFAULT NULL COMMENT '最后一次群发的时间', 51 | `wz_url` varchar(300) DEFAULT '' COMMENT '最近文章URL', 52 | PRIMARY KEY (`_id`) 53 | ) ENGINE=InnoDB AUTO_INCREMENT=287 DEFAULT CHARSET=utf8mb4; 54 | 55 | -- ---------------------------- 56 | -- Records of mp_info 57 | -- ---------------------------- 58 | INSERT INTO `mp_info` VALUES ('266', '今日头条', 'headline_today', '北京字节跳动科技有限公司', '今日头条官方帐号', 'http://img01.sogoucdn.com/app/a/100520090/oIWsFt3Om27KzYpmW9LaBGPCUxaU', '', '2017-02-16 17:15:09', null, '0', '0', '0', null, 'http://mp.weixin.qq.com/profile?src=3×tamp=1487236535&ver=1&signature=nDdjBk7tfBptUPQVaSHn*uoQ9hysPGOoChQf5umkzBbz3PSaIHThKmZzsU23I7vU1tNr6R6t8eQS6lC586yDLQ=='); 59 | INSERT INTO `mp_info` VALUES ('276', '新榜', 'newrankcn', '上海看榜信息科技有限公司', '涨粉、变现、运营、观察,新榜给你不一样的新思路.新榜——内容创业服务平台,www.newrank.cn', 'http://img01.sogoucdn.com/app/a/100520090/oIWsFt3CUA6HniQM4e_i7zncqWkk', '', '2017-02-16 17:16:04', null, '0', '0', '0', null, 'http://mp.weixin.qq.com/profile?src=3×tamp=1487236590&ver=1&signature=A38golU5GzltuG*u78AoIZkLnJS--EsX4PCDJyq3coRVjU3ZoBZ9UUWZNyOHDzCFw1Q34XVteeqgSGthakK1Ig=='); 60 | INSERT INTO `mp_info` VALUES ('278', '娱乐新榜', 'yulexinbang', '北京快络科技有限公司', '娱乐新人第一自媒体平台,为导演找新人,为新人找发展.深度开挖新人潜力与特色,助力新人演艺事业快速起步.向导演制片等影视从业人员提供第一手新晋艺人资料,实现艺人资源与影视需求的完美对接.', '//img01.sogoucdn.com/app/a/100520090/oIWsFt8lrEWgjvNDVlT1S7wL5Nyw', '', '2017-02-16 17:16:04', null, '0', '0', '0', null, 'http://mp.weixin.qq.com/profile?src=3×tamp=1551960049&ver=1&signature=fd*NZOcIHHxSZQ6Y44LFP1WmzZvhuKe0sJd2PpGunRcPNotPrCVBSO7sVIDjNkOkF8MkVzv35-iroU38v0GQww=='); 61 | INSERT INTO `mp_info` VALUES ('286', '人民日报', 'rmrbwx', '人民日报社', '参与、沟通、记录时代.', '//img01.sogoucdn.com/app/a/100520090/oIWsFt8_jYUmdw1PQgNVhH9vOEvI', '', '2019-03-07 19:54:26', '2019-03-07 19:58:58', '0', '0', 
'1000008043', '2019-03-07 18:57:13', 'http://mp.weixin.qq.com/profile?src=3×tamp=1551959664&ver=1&signature=bSSQMK1LY77M4O22qTi37cbhjhwNV7C9V4aor9HLhAt-Wdr*jWO2gFh3jN4KhPmYamKHzx9fg9SuHxCB1nGehg=='); 62 | 63 | -- ---------------------------- 64 | -- Table structure for `wenzhang_info` 65 | -- ---------------------------- 66 | DROP TABLE IF EXISTS `wenzhang_info`; 67 | CREATE TABLE `wenzhang_info` ( 68 | `_id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增ID', 69 | `title` text COMMENT '文章标题', 70 | `source_url` text COMMENT '原文地址', 71 | `cover_url` text COMMENT '封面图URL', 72 | `description` text COMMENT '文章摘要', 73 | `date_time` datetime DEFAULT NULL COMMENT '文章推送时间', 74 | `mp_id` int(11) DEFAULT '0' COMMENT '对应的公众号ID', 75 | `read_count` int(11) DEFAULT '0' COMMENT '阅读数', 76 | `like_count` int(11) DEFAULT '0' COMMENT '点攒数', 77 | `comment_count` int(11) DEFAULT '0' COMMENT '评论数', 78 | `content_url` text COMMENT '文章临时地址', 79 | `author` varchar(50) DEFAULT '' COMMENT '作者', 80 | `msg_index` int(11) DEFAULT '0' COMMENT '一次群发中的图文顺序 1是头条 ', 81 | `copyright_stat` int(1) DEFAULT '0' COMMENT '11表示原创 其它表示非原创', 82 | `qunfa_id` int(30) DEFAULT '0' COMMENT '群发消息ID', 83 | `type` int(11) DEFAULT '0' COMMENT '消息类型', 84 | `content` longtext COMMENT '文章正文', 85 | PRIMARY KEY (`_id`) 86 | ) ENGINE=InnoDB AUTO_INCREMENT=6579 DEFAULT CHARSET=utf8mb4; 87 | 88 | -- ---------------------------- 89 | -- Records of wenzhang_info 90 | -- ---------------------------- 91 | 92 | -- ---------------------------- 93 | -- Table structure for `wenzhang_statistics` 94 | -- ---------------------------- 95 | DROP TABLE IF EXISTS `wenzhang_statistics`; 96 | CREATE TABLE `wenzhang_statistics` ( 97 | `_id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增ID', 98 | `wz_id` int(11) DEFAULT '0' COMMENT '对应的文章ID', 99 | `create_time` datetime DEFAULT NULL COMMENT '统计时间', 100 | `read_count` int(11) DEFAULT '0' COMMENT '新增阅读数', 101 | `like_count` int(11) DEFAULT '0' COMMENT '新增点攒数', 102 | `comment_count` int(11) DEFAULT '0' 
# -*- coding: utf-8 -*-


class DbException(Exception):
    """Base class for database errors raised by this module."""
    pass


class MysqlDbException(DbException):
    """Raised when the MySQL helper is given invalid arguments or is misused."""
    pass


# Types accepted wherever a SQL/identifier string is expected
# (``str``, plus ``unicode`` when running on Python 2).
_TEXT_TYPES = (str, type(u''))


def _escape(value):
    """Return ``value`` as text that is safe inside a single-quoted MySQL literal.

    Backslashes are doubled before quotes are escaped, so a value ending in a
    backslash cannot swallow the closing quote.
    """
    return str(value).replace('\\', '\\\\').replace("'", "\\'")


def _ident(name):
    """Return ``name`` quoted as a MySQL identifier (backticks, embedded ones doubled)."""
    return "`" + str(name).replace("`", "``") + "`"


class mysql(object):
    """Small chainable query builder on top of a pymysql connection.

    Examples::

        m = mysql('user')
        m.table('user').add({})                                    # insert
        m.table('user').where({}).save({})                         # update
        m.table('user').field(['id']).where({}).order({'id': 'desc'}).find()
        m.where({}).delete()                                       # delete

    The builder is stateful: ``tablename``, ``where_sql``, ``field_sql``,
    ``order_sql`` and ``limit_sql`` stay set until overwritten, and callers may
    assign ``where_sql`` / ``order_sql`` directly as raw SQL fragments.
    """

    def __init__(self, table='', prefix='', host='', user='', passwd='', db='', charset=''):
        """Open a connection.

        ``table`` selects the initial table (changeable later via ``table()``).
        ``prefix`` is the table-name prefix. Every empty argument falls back to
        the value in the package ``config`` module.
        """
        # Imported here so this block does not depend on module-level imports.
        from . import config

        self.host = host or config.host
        self.user = user or config.user
        self.passwd = passwd or config.passwd
        self.db = db or config.db
        self.charset = charset or config.charset

        if prefix:
            self.prefix = prefix + '_'
        elif config.prefix:
            self.prefix = config.prefix + '_'
        else:
            self.prefix = ''
        if table:
            self.tablename = self.prefix + table
        self.__conn()

    def __conn(self):
        """Connect to the database and create a dict-returning cursor."""
        import pymysql

        self.conn = pymysql.connect(host=self.host, user=self.user, passwd=self.passwd, db=self.db,
                                    charset=self.charset, cursorclass=pymysql.cursors.DictCursor)
        self.cur = self.conn.cursor()
        return self

    def __update(self, sqls):
        """Execute one statement (string) or several (list), commit, return ``lastrowid``.

        Raises ``MysqlDbException`` when ``sqls`` is neither a string nor a list.
        """
        if isinstance(sqls, _TEXT_TYPES):
            statements = [sqls]
        elif isinstance(sqls, list):
            statements = sqls
        else:
            raise MysqlDbException('更新语句参数错误 - Model.__update')
        for sql in statements:
            self.cur.execute(sql)
        self.conn.commit()

        return self.cur.lastrowid

    def __delete(self, sql):
        """Execute a delete statement and return the cursor's result."""
        return self.cur.execute(sql)

    def __query(self, sql):
        """Execute a select statement and return the cursor's result."""
        return self.cur.execute(sql)

    def __close(self):
        """Close the cursor and the connection."""
        self.cur.close()
        self.conn.close()

    def __del__(self):
        """Commit pending work and close the connection when the object is collected."""
        conn = getattr(self, 'conn', None)
        if conn is None:
            # The connection was never established; nothing to clean up.
            return
        try:
            conn.commit()
            self.__close()
        except Exception:
            # Exceptions cannot propagate out of __del__, and the connection
            # may already be closed or the interpreter shutting down.
            pass

    # ------------------------------------------------------------------
    # Public, chainable API
    # ------------------------------------------------------------------

    def table(self, table, prefix=''):
        """Select the table to operate on; returns ``self`` for chaining."""
        if prefix:
            prefix = prefix + '_'
        elif hasattr(self, 'prefix'):
            prefix = self.prefix
        else:
            prefix = ''
        self.tablename = prefix + table
        return self

    def limit(self, pre, count):
        """Set ``LIMIT pre,count``; returns ``self`` for chaining."""
        # The leading space keeps the clause separate from the preceding
        # where/order fragment when find() concatenates them.
        self.limit_sql = ' limit ' + str(pre) + ',' + str(count)
        return self

    def where(self, where):
        """Set the condition from a dict of ``column -> value`` (joined with AND, matched with LIKE).

        Raises ``MysqlDbException`` when ``where`` is not a dict.
        """
        if not isinstance(where, dict):
            raise MysqlDbException('请输入字典 - Model.where')
        self.where_sql = ' and '.join(
            "`" + str(k) + "` LIKE '" + _escape(v) + "'" for k, v in where.items())
        return self

    def field(self, field):
        """Set the selected columns from ``'*'``, a column name, or a list of names.

        Raises ``MysqlDbException`` for any other argument type.
        """
        if isinstance(field, _TEXT_TYPES):
            self.field_sql = "*" if field == '*' else _ident(field)
        elif isinstance(field, list):
            self.field_sql = ','.join(_ident(f) for f in field)
        else:
            raise MysqlDbException('field参数不是字符或者列表 - Model.field')
        return self

    def order(self, order):
        """Set the ordering from a dict ``{column: 'asc'|'desc'}``; only the first item is used.

        Raises ``MysqlDbException`` when ``order`` is not a dict.
        """
        if not isinstance(order, dict):
            raise MysqlDbException('排序参数不是字典 - Model.order')
        for column, direction in order.items():
            self.order_sql = " order by " + _ident(column) + " " + direction
            break
        return self

    def add(self, data):
        """Insert one row built from ``data`` (``column -> value``).

        Returns the new row id, or ``None`` when the insert violates an
        integrity constraint (e.g. a duplicate key), which is ignored on purpose.
        Raises ``MysqlDbException`` when no table has been selected.
        """
        if not hasattr(self, 'tablename'):
            raise MysqlDbException('缺少数据表 - Model.add')
        import pymysql

        columns = ','.join(_ident(k) for k in data)
        values = ','.join("'" + _escape(v) + "'" for v in data.values())
        sql = "insert into `" + self.tablename + "` (" + columns + ") values (" + values + ")"
        try:
            return self.__update(sql)
        except pymysql.err.IntegrityError:
            # Best-effort insert: rows that already exist are skipped.
            return None

    def save(self, data):
        """Update the rows matching the current condition with ``data``.

        Raises ``MysqlDbException`` when no (non-empty) condition or no table is set.
        """
        if not getattr(self, 'where_sql', ''):
            raise MysqlDbException('缺少where语句 - Model.save')
        if not hasattr(self, 'tablename'):
            raise MysqlDbException('缺少tablename - Model.save')
        assignments = ','.join(
            "`" + str(k) + "` = '" + _escape(v) + "'" for k, v in data.items())
        sql = "update `" + self.tablename + "` set " + assignments + " where " + self.where_sql + ";"
        self.__update(sql)

    def find(self, size=25):
        """Run the select built from the current state.

        ``size == 0`` returns all rows, ``size == 1`` a single row (or ``None``),
        any other value at most ``size`` rows.
        """
        condition = getattr(self, 'where_sql', '')
        where_sql = " where " + condition if condition else ""
        field_sql = getattr(self, 'field_sql', "*")
        order_sql = getattr(self, 'order_sql', "")
        limit_sql = getattr(self, 'limit_sql', "")
        sql = "select " + field_sql + " from `" + self.tablename + "`" + where_sql + order_sql + limit_sql
        self.__query(sql)
        if size == 0:
            return self.cur.fetchall()
        elif size == 1:
            return self.cur.fetchone()
        else:
            return self.cur.fetchmany(size)

    def delete(self):
        """Delete the rows matching the current condition and commit.

        NOTE: when no condition was ever set, every row of the table is deleted.
        """
        where_sql = " where " + self.where_sql if hasattr(self, 'where_sql') else ""
        sql = "delete from `" + self.tablename + "`" + where_sql
        affected = self.__delete(sql)
        # Commit right away, like add()/save(), instead of waiting for __del__.
        self.conn.commit()
        return affected


if __name__ == '__main__':
    pass
73 | 74 |
您的访问出错了返回首页>>
75 |
76 |
77 |

IP:36.110.68.16
访问时间:2016.10.20 10:35:37

78 |

用户您好,您的访问过于频繁,为确认本次访问为正常用户行为,需要您协助验证。

79 |

80 |
81 |

82 | 83 | 84 | 85 | 86 | 87 | 88 | 请输入图中的验证码 89 | 90 | 91 | 换一张 92 | 93 |

94 |
95 |

96 | 提交 97 | 提交后没解决问题?欢迎反馈 98 |

99 |
100 |
企业推广关于搜狗免责声明意见反馈
 © 2016 SOGOU - 京ICP证050897号 - 京公网安备110000000025号
101 | 102 | 103 | 104 | 105 | 2016-10-20 10:35:19,500 - [basic.py:219] - ERROR - verify code ocr: 解封成功,正在为您跳转来源地址... 106 | 2016-10-20 10:45:59,701 - [basic.py:158] - ERROR - 107 | 108 | 109 | 110 | 111 | 搜狗搜索 112 | 113 | 114 | 115 | 175 | 176 | 177 |
178 | 179 |
您的访问出错了返回首页>>
180 |
181 |
182 |

IP:36.110.68.19
访问时间:2016.10.20 10:46:22

183 |

用户您好,您的访问过于频繁,为确认本次访问为正常用户行为,需要您协助验证。

184 |

185 |
186 |

187 | 188 | 189 | 190 | 191 | 192 | 193 | 请输入图中的验证码 194 | 195 | 196 | 换一张 197 | 198 |

199 |
200 |

201 | 提交 202 | 提交后没解决问题?欢迎反馈 203 |

204 |
205 |
企业推广关于搜狗免责声明意见反馈
 © 2016 SOGOU - 京ICP证050897号 - 京公网安备110000000025号
206 | 207 | 208 | 209 | 210 | 2016-10-20 10:46:03,927 - [basic.py:219] - ERROR - verify code ocr: 解封成功,正在为您跳转来源地址... 211 | 2016-10-21 09:15:58,844 - [basic.py:158] - ERROR - 出现验证码。。。 212 | 2016-10-21 09:16:09,729 - [basic.py:219] - ERROR - verify code ocr: 解封成功,正在为您跳转来源地址... 213 | 2016-10-21 09:16:09,767 - [basic.py:158] - ERROR - 出现验证码。。。 214 | 2016-10-21 17:30:58,819 - [basic.py:158] - ERROR - 出现验证码。。。 215 | 2016-10-21 17:31:11,736 - [basic.py:210] - ERROR - verify code erro: 验证码输入错误, 请重新输入! 216 | -------------------------------------------------------------------------------- /wechatsogou/basic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | import requests 5 | import random 6 | import time 7 | import re 8 | import sys 9 | from lxml import etree 10 | from PIL import Image 11 | is_python3 = sys.version_info[0] > 2 12 | if is_python3 == False: 13 | import cookielib 14 | else: 15 | import http.cookiejar as cookielib 16 | import json 17 | 18 | try: 19 | from urllib.request import quote as quote 20 | except ImportError: 21 | from urllib import quote as quote 22 | import sys 23 | 24 | reload(sys) 25 | sys.setdefaultencoding('utf-8') 26 | 27 | try: 28 | import StringIO 29 | 30 | 31 | def readimg(content): 32 | return Image.open(StringIO.StringIO(content)) 33 | except ImportError: 34 | import tempfile 35 | 36 | 37 | def readimg(content): 38 | f = tempfile.TemporaryFile() 39 | f.write(content) 40 | return Image.open(f) 41 | 42 | try: 43 | import urlparse as url_parse 44 | except ImportError: 45 | import urllib.parse as url_parse 46 | 47 | from lxml import etree 48 | from PIL import Image 49 | 50 | from . 
import config
from .base import WechatSogouBase
from .exceptions import *
from .ruokuaicode import RClient
from .filecache import WechatCache

import logging

logger = logging.getLogger()


class WechatSogouBasic(WechatSogouBase):
    """Basic building blocks of the Sogou-based WeChat official-account crawler.

    The class owns a (cached) ``requests`` session, knows how to fetch Sogou /
    WeChat pages, how to get past the captcha pages of both sites with the
    configured OCR service, and how to turn the fetched pages into plain
    Python structures.
    """

    # Ordered (needle, replacement) pairs used to undo HTML escaping.
    # The order matters: 'amp;' is stripped after '&amp;' so that doubly
    # escaped entities such as '&amp;lt;' are resolved by the later pairs.
    _HTML_UNESCAPE_TAIL = (
        ('&amp;', '&'),
        ('&gt;', '>'),
        ('&lt;', '<'),
        ('&yen;', '¥'),
        ('amp;', ''),
        ('&lt;', '<'),
        ('&gt;', '>'),
        ('&nbsp;', ' '),
        ('\\', ''),
    )

    def __init__(self, **kwargs):
        """Create the session, load optional external cookies and the OCR client.

        Args:
            cookies_file: optional mapping; its ``'file_name'`` entry is the
                path of a JSON file containing a list of cookie dicts with
                the keys ``name``, ``value``, ``domain``, ``path``, ``secure``.
        """
        self._cache = WechatCache(config.cache_dir, 60 * 60)
        # Reuse the session stored in the file cache when there is one.
        cached_session = self._cache.get(config.cache_session_name)
        self._session = cached_session if cached_session else requests.session()

        self.cookies = ""
        cookies_file = kwargs.get('cookies_file')
        if cookies_file:
            # Use cookies exported from a browser.
            print(u"使用外部cookies文件加载")
            cookie_jar = cookielib.MozillaCookieJar()
            with open(cookies_file.get('file_name')) as handle:
                cookie_list = json.loads(handle.read())
            for cookie in cookie_list:
                print(cookie['name'])
                cookie_jar.set_cookie(self._build_cookie(
                    cookie['name'], cookie['value'], cookie['domain'],
                    cookie['path'], cookie['secure']))
            self._session.cookies.update(cookie_jar)

        self.dama_name = config.dama_name
        self.dama_pswd = config.dama_pswd
        # ``_ocr`` only exists when OCR credentials are configured; the
        # captcha helpers test for it with ``hasattr``.
        if self.dama_name != '' and self.dama_pswd != '':
            self._ocr = RClient(self.dama_name, self.dama_pswd, '70021', 'dcefe229cb9b4e1785b48fbc3525d011')

        self._agent = [
            "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0",
            "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586",
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
        ]

    @staticmethod
    def _build_cookie(name, value, domain, path, secure):
        """Return a session-only ``cookielib.Cookie`` with the given fields."""
        return cookielib.Cookie(
            version=0, name=name, value=value, port=None, port_specified=False,
            domain=domain, domain_specified=False, domain_initial_dot=False,
            path=path, path_specified=True, secure=secure, expires=None,
            discard=True, comment=None, comment_url=None,
            rest={'HttpOnly': None}, rfc2109=False)

    @staticmethod
    def _parse_literal(text):
        """Parse a JSON (or Python-literal) payload without executing it.

        Replaces the former ``eval`` calls on remote response bodies.

        Raises:
            ValueError / SyntaxError: the payload is neither JSON nor a literal.
        """
        import ast
        import json as json_module

        if isinstance(text, bytes):
            text = text.decode('utf-8')
        try:
            return json_module.loads(text)
        except ValueError:
            return ast.literal_eval(text)

    def _random_agent(self):
        """Return one of the configured User-Agent strings at random."""
        return random.choice(self._agent)

    def _antispider_referer(self):
        """Return the Sogou anti-spider page URL for the last blocked request."""
        return 'https://weixin.sogou.com/antispider/?from=%2f' + quote(
            self._vcode_url.replace('http://', ''))

    def _get_elem_text(self, elem):
        """Return all text inside an ``lxml.etree`` element.

        Args:
            elem: an lxml element.

        Returns:
            The stripped text fragments of ``elem`` joined without separator.
        """
        return ''.join(node.strip() for node in elem.itertext())

    def _get_encoding_from_reponse(self, r):
        """Return the encoding of a ``requests`` response.

        Args:
            r: response object returned by ``requests``.

        Returns:
            The first encoding declared in the body, else the one derived
            from the response headers.
        """
        encoding = requests.utils.get_encodings_from_content(r.text)
        return encoding[0] if encoding else requests.utils.get_encoding_from_headers(r.headers)

    def _get(self, url, rtype='get', **kwargs):
        """Issue a GET or POST through the shared session and return the body.

        Args:
            url: request URL.
            rtype: ``'get'`` for GET, anything else for POST.
            host: optional ``Host`` header (default ``weixin.sogou.com``).
            referer: optional ``Referer`` header.
            data, json: POST payloads (POST only).
            Remaining keyword arguments are forwarded to ``requests``.

        Returns:
            The response text, or ``'链接已过期'`` when the page reports an
            expired link.

        Raises:
            WechatSogouVcodeException: Sogou answered with its captcha page.
            WechatSogouRequestsException: the status code is not 200.
        """
        # pop() (instead of get + conditional del) so that a falsy value is
        # never forwarded to requests as an unknown keyword.
        referer = kwargs.pop('referer', None)
        host = kwargs.pop('host', None)
        headers = {
            "Host": host if host else 'weixin.sogou.com',
            "Upgrade-Insecure-Requests": '1',
            "User-Agent": self._random_agent(),
            "Accept": 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            "Referer": referer if referer else 'https://weixin.sogou.com/',
            "Accept-Encoding": 'gzip, deflate, br',
            "Accept-Language": 'zh,zh-TW;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6'
        }
        if rtype == 'get':
            r = self._session.get(url, headers=headers, verify=False, **kwargs)
        else:
            # Remove the payloads from kwargs: passing them both explicitly
            # and through **kwargs raised "multiple values" TypeErrors.
            data = kwargs.pop('data', None)
            json_body = kwargs.pop('json', None)
            r = self._session.post(url, data=data, json=json_body, headers=headers, verify=False, **kwargs)

        if u'链接已过期' in r.text:
            return '链接已过期'
        if r.status_code == requests.codes.ok:
            r.encoding = self._get_encoding_from_reponse(r)
            if u'用户您好,您的访问过于频繁,为确认本次访问为正常用户行为,需要您协助验证' in r.text or u'用户您好,我们的系统检测到您网络中存在异常访问请求' in r.text:
                # Remember the blocked URL; _jiefeng needs it to unblock.
                self._vcode_url = url
                logger.error(u'出现验证码。。。')
                print(u'用户您好,您的访问过于频繁,为确认本次访问为正常用户行为,需要您协助验证')
                raise WechatSogouVcodeException('weixin.sogou.com verification code')
        else:
            logger.error('requests status_code error %d' % (r.status_code))
            raise WechatSogouRequestsException('requests status_code error', r.status_code)
        return r.text

    def _jiefeng(self):
        """Solve the Sogou captcha with the OCR service to lift the block.

        Tries up to 10 times. Each attempt downloads a fresh captcha image,
        sends it to the OCR client and posts the recognised code to Sogou.
        On success the returned ``SNUID`` cookie is stored in the session and
        the session is written to the cache. Nothing is attempted when no OCR
        credentials are configured.
        """
        max_count = 0
        while max_count < 10:
            print(u"出现验证码,准备自动识别")
            max_count += 1
            logger.debug('vcode appear, using _jiefeng')
            codeurl = 'https://weixin.sogou.com/antispider/util/seccode.php?tc=' + str(time.time())[0:10]

            user_agent = self._random_agent()
            headers = {
                "Host": 'weixin.sogou.com',
                "Upgrade-Insecure-Requests": '1',
                "User-Agent": user_agent,
                "Accept": 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                "Referer": 'https://weixin.sogou.com/',
                "Accept-Encoding": 'gzip, deflate, sdch',
                "Accept-Language": 'zh-CN,zh;q=0.8'
            }

            coder = self._session.get(codeurl, headers=headers, timeout=10, verify=False)

            if not hasattr(self, '_ocr'):
                print(u"没有设置自动识别模块用户名、密码,无法执行")
                break

            result = self._ocr.create(coder.content, 3060)
            print(result)
            if 'Result' not in result:
                # OCR failed: fetch a new captcha and try again.
                print(u"若快识别失败,1秒后更换验证码再次尝试,尝试次数:%d" % (max_count))
                time.sleep(1)
                continue
            print(u"验证码识别成功 验证码:%s" % (result['Result']))

            img_code = result['Result']

            post_url = 'https://weixin.sogou.com/antispider/thank.php'
            post_data = {
                'c': img_code,
                'r': quote(self._vcode_url),
                'v': 5
            }
            headers = {
                "User-Agent": user_agent,
                'Host': 'weixin.sogou.com',
                'Referer': self._antispider_referer()
            }
            rr = self._session.post(post_url, post_data, headers=headers, verify=False)
            try:
                remsg = self._parse_literal(rr.content)
            except (ValueError, SyntaxError):
                # Unparseable answer: treat it like a rejected code.
                logger.error('verify code response is not parseable')
                remsg = {'code': -1}
            if remsg['code'] != 0:
                print(u"搜狗返回验证码错误,1秒后更换验证码再次启动尝试,尝试次数:%d" % (max_count))
                time.sleep(1)
                continue

            # Sogou added a further check: the id returned by thank.php has
            # to be sent back as the SNUID cookie.
            time.sleep(0.05)
            cookie_jar = cookielib.MozillaCookieJar()
            cookie_jar.set_cookie(self._build_cookie('SNUID', remsg['id'], 'sogou.com', '/', None))
            self._session.cookies.update(cookie_jar)

            pbsnuid = remsg['id']
            pbsuv = ''
            print(pbsnuid)
            print(pbsuv)
            pburl = 'http://pb.sogou.com/pv.gif?uigs_productid=webapp&type=antispider&subtype=0_seccodeInputSuccess&domain=weixin&suv=%s&snuid=%s&t=%s' % (pbsuv, pbsnuid, str(time.time())[0:10])

            headers = {
                "User-Agent": user_agent,
                'Host': 'pb.sogou.com',
                'Referer': self._antispider_referer()
            }
            try:
                # Best-effort tracking ping; its failure must not abort.
                self._session.get(pburl, headers=headers, timeout=10, verify=False)
            except requests.RequestException as err:
                logger.debug('pb.sogou.com ping failed: %s', err)

            time.sleep(0.5)

            print(u"搜狗返回验证码识别成功,继续执行")
            self._cache.set(config.cache_session_name, self._session)
            logger.error('verify code ocr: ' + remsg['msg'])
            break

    def _ocr_for_get_gzh_article_by_url_text(self, url):
        """Solve the mp.weixin.qq.com captcha shown on a recent-articles page.

        Args:
            url: the recent-articles URL, sent as ``Referer``.

        Tries up to 10 times; on success the session is written to the
        cache. Nothing is attempted when no OCR credentials are configured.
        """
        print(u"出现验证码,准备自动识别2")
        logger.debug('vcode appear, using _ocr_for_get_gzh_article_by_url_text')

        if not hasattr(self, '_ocr'):
            print(u"没有设置自动识别模块用户名、密码,无法执行")
            return

        max_count = 0
        while max_count < 10:
            max_count += 1
            # The "cert" value is the current time reshaped to 13 digits,
            # a dot, and up to 4 further digits.
            timestr = str(time.time()).replace('.', '')
            timever = timestr[0:13] + '.' + timestr[13:17]
            codeurl = 'http://mp.weixin.qq.com/mp/verifycode?cert=' + timever
            coder = self._session.get(codeurl, verify=False)
            logger.debug('vcode appear, using _ocr_for_get_gzh_article_by_url_text')
            result = self._ocr.create(coder.content, 2040)
            print(result)
            if 'Result' not in result:
                print(u"若快识别失败,1秒后更换验证码再次尝试,尝试次数:%d" % (max_count))
                time.sleep(1)
                continue
            print(u"若快识别成功 验证码:%s" % (result['Result']))

            img_code = result['Result']

            post_url = 'http://mp.weixin.qq.com/mp/verifycode'
            post_data = {
                'cert': timever,
                'input': img_code
            }
            headers = {
                "User-Agent": self._random_agent(),
                'Host': 'mp.weixin.qq.com',
                'Referer': url
            }
            rr = self._session.post(post_url, post_data, headers=headers, verify=False)
            try:
                remsg = self._parse_literal(rr.text)
            except (ValueError, SyntaxError):
                logger.error('verify code response is not parseable')
                remsg = {'ret': -1}
            if remsg['ret'] != 0:
                print(u"搜狗返回验证码错误,1秒后更换验证码再次启动尝试,尝试次数:%d" % (max_count))
                time.sleep(1)
                continue

            print(u"搜狗返回验证码识别成功,继续执行")
            self._cache.set(config.cache_session_name, self._session)
            logger.debug('ocr %s', remsg.get('errmsg'))
            break

    def _replace_html(self, s):
        """Undo HTML escaping in ``s`` and strip backslashes.

        Args:
            s: text.

        Returns:
            The unescaped text.
        """
        s = s.replace('&#39;', '\'')
        s = s.replace('&quot;', '"')
        for needle, replacement in self._HTML_UNESCAPE_TAIL:
            s = s.replace(needle, replacement)
        return s

    def _replace_dict(self, dicts):
        """Return a copy of ``dicts`` with keys and values unescaped."""
        return dict((self._replace_all(k), self._replace_all(v)) for k, v in dicts.items())

    def _replace_list(self, lists):
        """Return a copy of ``lists`` with every element unescaped."""
        return [self._replace_all(element) for element in lists]

    def _replace_all(self, data):
        """Recursively unescape every string inside dicts / lists.

        Values of any other type are returned unchanged.
        """
        try:
            text_types = (str, unicode)  # Python 2: also handle unicode
        except NameError:
            text_types = (str,)
        if isinstance(data, dict):
            return self._replace_dict(data)
        if isinstance(data, list):
            return self._replace_list(data)
        if isinstance(data, text_types):
            return self._replace_html(data)
        return data

    def _str_to_dict(self, json_str):
        """Parse ``json_str`` and unescape every string in the result."""
        return self._replace_all(self._parse_literal(json_str))

    def _replace_space(self, s):
        """Remove spaces and CRLF sequences from ``s``."""
        return s.replace(' ', '').replace('\r\n', '')

    def _get_url_param(self, url):
        """Return the query parameters of ``url`` (blank values kept)."""
        return url_parse.parse_qs(url_parse.urlparse(url).query, True)

    def _get_with_unblock(self, request_url):
        """GET ``request_url``; on a captcha, unblock once and retry.

        Returns:
            The page text, or ``""`` when the retry hits the captcha again.
        """
        try:
            return self._get(request_url)
        except WechatSogouVcodeException:
            try:
                self._jiefeng()
                return self._get(request_url, 'get', referer=self._antispider_referer())
            except WechatSogouVcodeException:
                return ""

    def _search_gzh_text(self, name, page=1):
        """Fetch the Sogou result page of an official-account search.

        Args:
            name: search keyword.
            page: result page number.

        Returns:
            Tuple ``(text, request_url)``; ``text`` is ``""`` when the
            captcha could not be passed.
        """
        request_url = 'https://weixin.sogou.com/weixin?type=1&s_from=input&query=' + quote(
            name) + '&ie=utf8&_sug_=n&_sug_type_=&page=' + str(page)

        text = self._get_with_unblock(request_url)

        try:
            # The result page embeds an extra URL that is requested once
            # with the search page as referer; its response is ignored.
            new_url = "https://weixin.sogou.com" + re.findall('var account_anti_url = "(.+?)";', text, re.S)[0]
            self._get(new_url, 'get', referer=request_url)
        except Exception:
            print("error")

        return text, request_url

    def _search_article_text(self, name, page=1):
        """Fetch the Sogou result page of an article search.

        Args:
            name: article search keyword.
            page: result page number.

        Returns:
            The page text, ``""`` when the captcha could not be passed.
        """
        request_url = 'https://weixin.sogou.com/weixin?query=' + quote(
            name) + '&_sug_type_=&_sug_=n&type=2&page=' + str(page) + '&ie=utf8'
        return self._get_with_unblock(request_url)

    def _get_gzh_article_by_url_text(self, url):
        """Fetch the recent-articles page of an official account.

        Args:
            url: recent-articles page address.

        Returns:
            The page text; ``"链接已过期"`` for a weixin.sogou.com URL;
            ``""`` when solving the captcha or the retry fails.
        """
        if "https://weixin.sogou.com" in url:
            return "链接已过期"

        text = self._get(url, 'get', host='mp.weixin.qq.com')

        if u'为了保护你的网络安全,请输入验证码' in text:
            print(u'为了保护你的网络安全,请输入验证码')
            try:
                self._ocr_for_get_gzh_article_by_url_text(url)
                text = self._get(url, 'get', host='mp.weixin.qq.com')
            except Exception:
                text = ""
        return text

    def _get_gzh_article_gzh_by_url_dict(self, text, url):
        """Extract the account profile from a recent-articles page.

        Args:
            text: recent-articles page text.
            url: recent-articles page address (copied into the result).

        Returns:
            Dict with the keys ``name``, ``wechatid``, ``jieshao``
            (introduction), ``renzhen`` (verification, empty if none),
            ``qrcode``, ``img`` (avatar) and ``url``.
        """
        page = etree.HTML(text)
        profile_info_area = page.xpath("//div[@class='profile_info_area']")[0]
        img = profile_info_area.xpath('div[1]/span/img/@src')[0]
        name = self._replace_space(profile_info_area.xpath('div[1]/div/strong/text()')[0])
        wechatid = profile_info_area.xpath('div[1]/div/p/text()')
        wechatid = wechatid[0].replace(u'微信号: ', '') if wechatid else ''
        jieshao = profile_info_area.xpath('ul/li[1]/div/text()')[0]
        renzhen = profile_info_area.xpath('ul/li[2]/div/text()')
        renzhen = renzhen[0] if renzhen else ''
        # A page without QR code yields '' instead of raising IndexError.
        qrcode = page.xpath('//*[@id="js_pc_qr_code_img"]/@src')
        qrcode = 'http://mp.weixin.qq.com/' + qrcode[0] if qrcode and qrcode[0] else ''
        return {
            'name': name,
            'wechatid': wechatid,
            'jieshao': jieshao,
            'renzhen': renzhen,
            'qrcode': qrcode,
            'img': img,
            'url': url
        }

    def _get_gzh_article_by_url_dict(self, text):
        """Extract the ``msgList`` structure from a recent-articles page.

        Args:
            text: recent-articles page text.

        Returns:
            The parsed message dict, or ``''`` when it cannot be found or
            parsed.
        """
        try:
            html = re.findall("var msgList = (.+?)};", text, re.S)[0] + '}'
            # '&quot;' is deliberately left alone here: turning it into '"'
            # would break the quoting of the surrounding literal.
            html = html.replace('&#39;', '\'')
            for needle, replacement in self._HTML_UNESCAPE_TAIL:
                html = html.replace(needle, replacement)
            return self._parse_literal(html)
        except Exception:
            return ''

    @staticmethod
    def _absolute_article_url(url):
        """Prefix a relative article URL with the mp.weixin.qq.com host."""
        if not url:
            return ''
        return 'http://mp.weixin.qq.com' + url if 'http://mp.weixin.qq.com' not in url else url

    @staticmethod
    def _fill_article_fields(target, info, url, msg_index):
        """Copy the article fields of ``info`` into ``target``."""
        target['main'] = msg_index
        target['title'] = info.get('title', '')
        target['digest'] = info.get('digest', '')
        target['fileid'] = info.get('fileid', '')
        target['content_url'] = url
        target['source_url'] = info.get('source_url', '')
        target['cover'] = info.get('cover', '')
        target['author'] = info.get('author', '')
        target['copyright_stat'] = info.get('copyright_stat', '')

    def _deal_gzh_article_dict(self, msgdict, **kwargs):
        """Flatten the group messages of an official account.

        Args:
            msgdict: message dict as returned by
                ``_get_gzh_article_by_url_dict``.
            biz, uin, key: optional values inserted into media URLs.

        Returns:
            List of dicts; each has ``qunfa_id``, ``datetime`` and ``type``
            plus type-specific fields (1 text, 3 image, 34 audio,
            49 articles, 62 video). An empty list for an empty ``msgdict``.
        """
        biz = kwargs.get('biz', '')
        uin = kwargs.get('uin', '')
        key = kwargs.get('key', '')
        items = list()
        if not msgdict:
            # Parsing failed upstream ('' is returned in that case).
            return items
        for listdic in msgdict['list']:
            item = dict()
            comm_msg_info = listdic['comm_msg_info']
            # Not usable for de-duplication: every message of one group
            # send carries the same id.
            item['qunfa_id'] = comm_msg_info.get('id', '')
            item['datetime'] = comm_msg_info.get('datetime', '')
            item['type'] = str(comm_msg_info.get('type', ''))
            if item['type'] == '1':
                # text
                item['content'] = comm_msg_info.get('content', '')
            elif item['type'] == '3':
                # image
                item['img_url'] = 'https://mp.weixin.qq.com/mp/getmediadata?__biz=' + biz + '&type=img&mode=small&msgid=' + \
                    str(item['qunfa_id']) + '&uin=' + uin + '&key=' + key
            elif item['type'] == '34':
                # audio
                item['play_length'] = listdic['voice_msg_ext_info'].get('play_length', '')
                item['fileid'] = listdic['voice_msg_ext_info'].get('fileid', '')
                item['audio_src'] = 'https://mp.weixin.qq.com/mp/getmediadata?__biz=' + biz + '&type=voice&msgid=' + \
                    str(item['qunfa_id']) + '&uin=' + uin + '&key=' + key
            elif item['type'] == '49':
                # articles: one main article plus optional further ones
                app_msg_ext_info = listdic['app_msg_ext_info']
                msg_index = 1
                self._fill_article_fields(
                    item, app_msg_ext_info,
                    self._absolute_article_url(app_msg_ext_info.get('content_url')), msg_index)
                items.append(item)
                if app_msg_ext_info.get('is_multi', 0) == 1:
                    for multidic in app_msg_ext_info['multi_app_msg_item_list']:
                        itemnew = dict()
                        itemnew['qunfa_id'] = item['qunfa_id']
                        itemnew['datetime'] = item['datetime']
                        itemnew['type'] = item['type']
                        msg_index += 1
                        self._fill_article_fields(
                            itemnew, multidic,
                            self._absolute_article_url(multidic.get('content_url')), msg_index)
                        items.append(itemnew)
                # The main item was appended above already.
                continue
            elif item['type'] == '62':
                # video
                item['cdn_videoid'] = listdic['video_msg_ext_info'].get('cdn_videoid', '')
                item['thumb'] = listdic['video_msg_ext_info'].get('thumb', '')
                item['video_src'] = 'https://mp.weixin.qq.com/mp/getcdnvideourl?__biz=' + biz + '&cdn_videoid=' + item[
                    'cdn_videoid'] + '&thumb=' + item['thumb'] + '&uin=' + uin + '&key=' + key
            items.append(item)
        return items

    def _get_gzh_article_text(self, url):
        """Fetch an article page.

        Args:
            url: article link.

        Returns:
            The article page text.
        """
        return self._get(url, 'get', host='mp.weixin.qq.com')

    def _deal_related(self, url, title):
        """Fetch the articles related to an article.

        Args:
            url: article link.
            title: article title.

        Returns:
            The parsed response dict.

        Raises:
            WechatSogouException: the response reports a non-zero ``ret``;
                the message is its ``errmsg`` or ``'ret:<code>'``.
        """
        related_req_url = 'http://mp.weixin.qq.com/mp/getrelatedmsg?' \
                          'url=' + quote(url) \
                          + '&title=' + title \
                          + '&uin=&key=&pass_ticket=&wxtoken=&devicetype=&clientversion=0&x5=0'
        related_text = self._get(related_req_url, 'get', host='mp.weixin.qq.com', referer=url)
        related_dict = self._parse_literal(related_text)
        base_resp = related_dict['base_resp']
        ret = base_resp['ret']
        errmsg = base_resp['errmsg'] if base_resp['errmsg'] else 'ret:' + str(ret)
        if ret != 0:
            raise WechatSogouException(errmsg)
        return related_dict

    def _uinkeybiz(self, keyword, uin=None, key=None, biz=None, pass_ticket=None, msgid=None):
        """Store or load the request credentials cached under ``keyword``.

        When ``uin`` is given, all five values are written to the cache
        with a lifetime argument of 36000; otherwise all five are read
        back from the cache.

        Returns:
            Tuple ``(uin, key, biz, pass_ticket, msgid)``.
        """
        if uin:
            self._cache.set(keyword + 'uin', uin, 36000)
            self._cache.set(keyword + 'key', key, 36000)
            self._cache.set(keyword + 'biz', biz, 36000)
            self._cache.set(keyword + 'pass_ticket', pass_ticket, 36000)
            self._cache.set(keyword + 'msgid', msgid, 36000)
        else:
            uin = self._cache.get(keyword + 'uin')
            key = self._cache.get(keyword + 'key')
            biz = self._cache.get(keyword + 'biz')
            pass_ticket = self._cache.get(keyword + 'pass_ticket')
            msgid = self._cache.get(keyword + 'msgid')
        return uin, key, biz, pass_ticket, msgid

    def _cache_history_session(self, keyword, session=None):
        """Store ``session`` under ``keyword``, or return the stored one.

        Returns:
            ``None`` after storing; otherwise the cached value.
        """
        if session:
            self._cache.set(keyword + 'session', session, 36000)
        else:
            return self._cache.get(keyword + 'session')

# --------------------------------------------------------------------------------
# /wechatsogou/api.py:
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import re
import requests
import time
from lxml import etree
from wechatsogou.tools import *
from .basic import WechatSogouBasic
from .exceptions import *
import json
import logging
import codecs,os
import random
from bs4 import BeautifulSoup
logger = logging.getLogger()
17 | 18 | class WechatSogouApi(WechatSogouBasic): 19 | """基于搜狗搜索的的微信公众号爬虫接口 接口类 20 | """ 21 | 22 | def __init__(self, **kwargs): 23 | super(WechatSogouApi, self).__init__(**kwargs) 24 | 25 | def get_k_h(self,url,text): 26 | """计算k和h""" 27 | try: 28 | k = random.randrange(1,100) 29 | normal = re.findall('a\+4\+parseInt\("(.*?)"', text, re.S)[0] 30 | h = url[34+int(normal)+k] 31 | except Exception as e: 32 | traceback.print_exc() 33 | return str(k),h 34 | 35 | def search_gzh_info(self, name, page=1): 36 | """搜索公众号 37 | 38 | Args: 39 | name: 搜索关键字 40 | page: 搜索的页数 41 | 42 | Returns: 43 | 列表,每一项均是{'name':name,'wechatid':wechatid,'jieshao':jieshao,'renzhen':renzhen,'qrcode':qrcodes,'img':img,'url':url} 44 | name: 公众号名称 45 | wechatid: 公众号id 46 | jieshao: 介绍 47 | renzhen: 认证,为空表示未认证 48 | qrcode: 二维码 暂无 49 | img: 头像图片 50 | url: 文章地址 51 | last_url: 最后一篇文章地址 暂无 52 | """ 53 | htmlText,request_url = self._search_gzh_text(name, page) 54 | 55 | try: 56 | page = etree.HTML(htmlText) 57 | except: 58 | return "" 59 | 60 | img = list() 61 | #头像 62 | info_imgs = page.xpath(u"//div[@class='img-box']//img") 63 | for info_img in info_imgs: 64 | img.append(info_img.attrib['src']) 65 | #文章列表 66 | url = list() 67 | info_urls = page.xpath(u"//div[@class='img-box']//a"); 68 | for info_url in info_urls: 69 | urlTemp = info_url.attrib['href'] 70 | realurl = "" 71 | if "https" not in urlTemp: 72 | urlTemp = "https://weixin.sogou.com" + urlTemp 73 | #urlTemp = "https://weixin.sogou.com/link?url=dn9a_-gY295K0Rci_xozVXfdMkSQTLW6EzDJysI4ql5MPrOUp16838dGRMI7NnPqd7f2zaZT8G5XX6CVLv7ghwwvDqyjOWdzJRR9kv142zmzO5mLYzdWtr0gCwKco-MoXapf6ecdCpf0FojXSUCaI0AbdUwNO9bh1Gmjh__CSkSsWFwwodqOp8Ow2hU_0OwS0h4lvHQbidemvuZ2FfgnOGRTpLLTNgHY&type=1&query=mh_syxx&k=56&h=V" 74 | try: 75 | 76 | #计算加密k 77 | k,h = self.get_k_h(urlTemp,htmlText) 78 | urlTemp = "%s&k=%s&h=%s" %(urlTemp,k,h) 79 | #转成正式的文章列表url 80 | print(u"先获取正式的文章列表url") 81 | text = self._get(urlTemp,referer=request_url) 82 | arr = text.split("url +="); 83 | for 
iterating_var in arr: 84 | realurl+=iterating_var.split("'")[1]; 85 | except WechatSogouVcodeException: 86 | realurl = "" 87 | 88 | 89 | url.append(realurl) 90 | 91 | #微信号 92 | wechatid = page.xpath(u"//label[@name='em_weixinhao']/text()"); 93 | 94 | #公众号名称 95 | name = list() 96 | name_list = page.xpath(u"//div[@class='txt-box']/p/a") 97 | for name_item in name_list: 98 | name.append(name_item.xpath('string(.)')) 99 | 100 | last_url = list() 101 | jieshao = list() 102 | renzhen = list() 103 | list_index = 0 104 | #介绍、认证、最近文章 105 | info_instructions = page.xpath(u"//ul[@class='news-list2']/li") 106 | for info_instruction in info_instructions: 107 | cache = self._get_elem_text(info_instruction) 108 | cache = cache.replace('red_beg', '').replace('red_end', '') 109 | cache_list = cache.split('\n') 110 | cache_re = re.split(u'功能介绍:|认证:|最近文章:', cache_list[0]) 111 | if(cache.find("最近文章") == -1) : 112 | last_url.insert(list_index,"") 113 | list_index += 1 114 | 115 | if(len(cache_re) > 1): 116 | jieshao.append(re.sub("document.write\(authname\('[0-9]'\)\)", "", cache_re[1])) 117 | if "authname" in cache_re[1]: 118 | renzhen.append(cache_re[2]) 119 | else: 120 | renzhen.append('') 121 | else: 122 | #没取到,都为空吧 123 | jieshao.append('') 124 | renzhen.append('') 125 | 126 | returns = list() 127 | for i in range(len(name)): 128 | returns.append( 129 | { 130 | 'name': name[i], 131 | 'wechatid': wechatid[i], 132 | 'jieshao': jieshao[i], 133 | 'renzhen': renzhen[i], 134 | 'qrcode': '', 135 | 'img': img[i], 136 | 'url': url[i], 137 | 'last_url': '' 138 | } 139 | ) 140 | return returns 141 | 142 | def get_gzh_info(self, wechatid): 143 | """获取公众号微信号wechatid的信息 144 | 145 | 因为wechatid唯一确定,所以第一个就是要搜索的公众号 146 | 147 | Args: 148 | wechatid: 公众号id 149 | 150 | Returns: 151 | 字典{'name':name,'wechatid':wechatid,'jieshao':jieshao,'renzhen':renzhen,'qrcode':qrcodes,'img':img,'url':url} 152 | name: 公众号名称 153 | wechatid: 公众号id 154 | jieshao: 介绍 155 | renzhen: 认证,为空表示未认证 156 | qrcode: 二维码 157 | img: 
头像图片 158 | url: 最近文章地址 159 | """ 160 | try: 161 | info = self.search_gzh_info(wechatid, 1) 162 | return info[0] if info else "" 163 | except: 164 | return "" 165 | 166 | 167 | def search_article_info(self, name, page=1): 168 | """搜索文章 169 | 170 | Args: 171 | name: 搜索文章关键字 172 | page: 搜索的页数 173 | 174 | Returns: 175 | 列表,每一项均是{'name','url','img','zhaiyao','gzhname','gzhqrcodes','gzhurl','time'} 176 | name: 文章标题 177 | url: 文章链接 178 | img: 文章封面图片缩略图,可转为高清大图 179 | zhaiyao: 文章摘要 180 | time: 文章推送时间,10位时间戳 181 | gzhname: 公众号名称 182 | gzhqrcodes: 公众号二维码 183 | gzhurl: 公众号最近文章地址 184 | page_count:共有多少页 185 | 186 | """ 187 | text = self._search_article_text(name, page) 188 | text = text.replace("amp;","") 189 | page = etree.HTML(text) 190 | #搜索到的总条数 191 | page_count = page.xpath(u"//div[@class='mun']/text()") 192 | page_count = page_count[0].replace(',','').replace('找到约','').replace('条结果','') 193 | 194 | #文章信息 195 | zhaiyao = list() 196 | #摘要 197 | zhaiyao_list = page.xpath(u"//ul[@class='news-list']/li//p[@class='txt-info']") 198 | for zhaiyao_item in zhaiyao_list: 199 | zhaiyao.append(zhaiyao_item.xpath('string(.)')) 200 | 201 | #标题 202 | name = list() 203 | info_names = page.xpath(u"//div[@class='txt-box']/h3/a") 204 | for info_name in info_names: 205 | name.append(info_name.xpath('string(.)')) 206 | 207 | #公众号名称 208 | gzhname = list() 209 | gzhwxhao = list() 210 | gzhqrcodes = list() 211 | gzhurl = list() 212 | info_gzhs = page.xpath(u"//div[@class='txt-box']/div[@class='s-p']/a") 213 | for info_gzh in info_gzhs: 214 | #gzhname.append(info_gzh.attrib['data-sourcename']) 215 | #gzhwxhao.append(info_gzh.attrib['data-username']) 216 | #gzhqrcodes.append(info_gzh.attrib['data-encqrcodeurl']) 217 | gzhurl.append(info_gzh.attrib['href']) 218 | 219 | #文章URL 220 | url = list() 221 | info_urls = page.xpath(u"//div[@class='txt-box']/h3/a") 222 | for info_url in info_urls: 223 | url.append(info_url.attrib['href']) 224 | 225 | #文章时间 226 | time = list() 227 | info_times = 
page.xpath(u"//div[@class='txt-box']/div[@class='s-p']") 228 | for info_time in info_times: 229 | time.append(info_time.attrib['t']) 230 | 231 | #封面 232 | img = list() 233 | info_imgs = page.xpath(u"//ul[@class='news-list']/li") 234 | for info_img in info_imgs: 235 | img_box = info_img.xpath(u"div[@class='img-box']/a/img") 236 | if len(img_box) > 0 : 237 | #普通封面的 238 | img.append(img_box[0].attrib['src']) 239 | else: 240 | #3张封面的 241 | img_box = info_img.xpath(u"div[@class='txt-box']/div[@class='img-d']/a/span/img") 242 | if len(img_box) > 0 : 243 | #拿第一个 244 | img.append(img_box[0].attrib['src']) 245 | else: 246 | #没拿到 247 | img.append("") 248 | 249 | returns = list() 250 | for i in range(len(url)): 251 | returns.append( 252 | { 253 | 'name': name[i], 254 | 'url': url[i], 255 | 'img': img[i], 256 | 'zhaiyao': zhaiyao[i], 257 | 'gzhname': list_or_empty(gzhname), 258 | 'gzhqrcodes': list_or_empty(gzhqrcodes), 259 | 'gzhurl': gzhurl[i], 260 | 'time': time[i], 261 | 'page_count':int(page_count) 262 | } 263 | ) 264 | return returns 265 | 266 | def get_gzh_message(self, **kwargs): 267 | """解析最近文章页 或 解析历史消息记录 268 | 269 | Args: 270 | ::param url 最近文章地址 271 | ::param wechatid 微信号 272 | ::param wechat_name 微信昵称(不推荐,因为不唯一) 273 | 274 | 最保险的做法是提供url或者wechatid 275 | 276 | Returns: 277 | gzh_messages 是 列表,每一项均是字典,一定含有字段qunfa_id,datetime,type 278 | 当type不同时,含有不同的字段,具体见文档 279 | """ 280 | url = kwargs.get('url', None) 281 | wechatid = kwargs.get('wechatid', None) 282 | wechat_name = kwargs.get('wechat_name', None) 283 | if url: 284 | text = self._get_gzh_article_by_url_text(url) 285 | elif wechatid: 286 | gzh_info = self.get_gzh_info(wechatid) 287 | url = gzh_info['url'] 288 | text = self._get_gzh_article_by_url_text(url) 289 | elif wechat_name: 290 | gzh_info = self.get_gzh_info(wechat_name) 291 | url = gzh_info['url'] 292 | text = self._get_gzh_article_by_url_text(url) 293 | else: 294 | raise WechatSogouException('get_gzh_recent_info need param text and url') 295 | 296 | if 
u'链接已过期' in text: 297 | return '链接已过期' 298 | return self._deal_gzh_article_dict(self._get_gzh_article_by_url_dict(text)) 299 | 300 | def get_gzh_message_and_info(self, **kwargs): 301 | """最近文章页 公众号信息 和 群发信息 302 | 303 | Args: 304 | ::param url 最近文章地址 305 | ::param wechatid 微信号 306 | ::param wechat_name 微信昵称(不推荐,因为不唯一) 307 | 308 | 最保险的做法是提供url或者wechatid 309 | 310 | Returns: 311 | 字典{'gzh_info':gzh_info, 'gzh_messages':gzh_messages} 312 | 313 | gzh_info 也是字典{'name':name,'wechatid':wechatid,'jieshao':jieshao,'renzhen':renzhen,'qrcode':qrcodes,'img':img,'url':url} 314 | name: 公众号名称 315 | wechatid: 公众号id 316 | jieshao: 介绍 317 | renzhen: 认证,为空表示未认证 318 | qrcode: 二维码 319 | img: 头像图片 320 | url: 最近文章地址 321 | 322 | gzh_messages 是 列表,每一项均是字典,一定含有字段qunfa_id,datetime,type 323 | 当type不同时,含有不同的字段,具体见文档 324 | """ 325 | url = kwargs.get('url', None) 326 | wechatid = kwargs.get('wechatid', None) 327 | wechat_name = kwargs.get('wechat_name', None) 328 | if url: 329 | text = self._get_gzh_article_by_url_text(url) 330 | elif wechatid: 331 | gzh_info = self.get_gzh_info(wechatid) 332 | url = gzh_info['url'] 333 | text = self._get_gzh_article_by_url_text(url) 334 | elif wechat_name: 335 | gzh_info = self.get_gzh_info(wechat_name) 336 | url = gzh_info['url'] 337 | text = self._get_gzh_article_by_url_text(url) 338 | else: 339 | raise WechatSogouException('get_gzh_recent_info need param text and url') 340 | 341 | return { 342 | 'gzh_info': self._get_gzh_article_gzh_by_url_dict(text, url), 343 | 'gzh_messages': self._deal_gzh_article_dict(self._get_gzh_article_by_url_dict(text)) 344 | } 345 | 346 | def deal_article_content(self, **kwargs): 347 | """获取文章内容 348 | 349 | Args: 350 | ::param url 文章页 url 351 | ::param text 文章页 文本 352 | 353 | Returns: 354 | content_html, content_rich, content_text 355 | content_html: 原始文章内容,包括html标签及样式 356 | content_rich: 包含图片(包括图片应展示的样式)的文章内容 357 | content_text: 包含图片(``格式)的文章内容 358 | """ 359 | url = kwargs.get('url', None) 360 | text = kwargs.get('text', None) 361 
| 362 | if text: 363 | pass 364 | elif url: 365 | text = self._get_gzh_article_text(url) 366 | else: 367 | raise WechatSogouException('deal_content need param url or text') 368 | 369 | #纯文字 370 | bsObj = BeautifulSoup(text) 371 | content_text = bsObj.find("div", {"class":"rich_media_content", "id":"js_content"}) 372 | if not content_text: #分享的文章 373 | content_text = bsObj.find("div", {"class":"share_media", "id":"js_share_content"}) 374 | 375 | content_html = "" 376 | if content_text: 377 | content_html = content_text.get_text() 378 | 379 | return content_html 380 | 381 | def deal_article_related(self, url, title): 382 | """获取文章相似文章 383 | 384 | Args: 385 | url: 文章链接 386 | title: 文章标题 387 | 388 | Returns: 389 | related_dict: 相似文章字典 390 | 391 | Raises: 392 | WechatSogouException: 错误信息errmsg 393 | """ 394 | return self._deal_related(url, title) 395 | 396 | def deal_article_comment(self, **kwargs): 397 | """获取文章评论 398 | 399 | Args: 400 | text: 文章文本 401 | 402 | Returns: 403 | comment_dict: 评论字典 404 | 405 | Raises: 406 | WechatSogouException: 错误信息errmsg 407 | """ 408 | url = kwargs.get('url', None) 409 | text = kwargs.get('text', None) 410 | 411 | if text: 412 | pass 413 | elif url: 414 | text = self._get_gzh_article_text(url) 415 | else: 416 | raise WechatSogouException('deal_content need param url or text') 417 | 418 | sg_data = re.findall(u'window.sg_data={(.*?)}', text, re.S) 419 | if not sg_data : 420 | return "" 421 | sg_data = '{' + sg_data[0].replace(u'\r\n', '').replace(' ', '') + '}' 422 | sg_data = re.findall(u'{src:"(.*?)",ver:"(.*?)",timestamp:"(.*?)",signature:"(.*?)"}', sg_data)[0] 423 | comment_req_url = 'http://mp.weixin.qq.com/mp/getcomment?src=' + sg_data[0] + '&ver=' + sg_data[ 424 | 1] + '×tamp=' + sg_data[2] + '&signature=' + sg_data[ 425 | 3] + '&uin=&key=&pass_ticket=&wxtoken=&devicetype=&clientversion=0&x5=0' 426 | comment_text = self._get(comment_req_url, 'get', host='mp.weixin.qq.com', referer='http://mp.weixin.qq.com') 427 | comment_dict = 
eval(comment_text) 428 | ret = comment_dict['base_resp']['ret'] 429 | errmsg = comment_dict['base_resp']['errmsg'] if comment_dict['base_resp']['errmsg'] else 'ret:' + str(ret) 430 | if ret != 0: 431 | logger.error(errmsg) 432 | raise WechatSogouException(errmsg) 433 | return comment_dict 434 | 435 | def deal_article_yuan(self, **kwargs): 436 | url = kwargs.get('url', None) 437 | text = kwargs.get('text', None) 438 | 439 | if text: 440 | pass 441 | elif url: 442 | text = self._get_gzh_article_text(url) 443 | else: 444 | raise WechatSogouException('deal_article_yuan need param url or text') 445 | try: 446 | yuan = re.findall('var msg_link = "(.*?)";', text)[0].replace('amp;', '') 447 | except IndexError as e: 448 | if '系统出错' not in text: 449 | logger.error(e) 450 | print(e) 451 | print(text) 452 | 453 | raise WechatSogouBreakException() 454 | return yuan 455 | 456 | def deal_article(self, url, title=None): 457 | """获取文章详情 458 | 459 | Args: 460 | url: 文章链接 461 | title: 文章标题 462 | 注意,title可以为空,则表示不根据title获取相似文章 463 | 464 | Returns: 465 | {'yuan':'','related':'','comment':'','content': {'content_html':'','content_rich':'','content_text':''} 466 | yuan: 文章固定地址 467 | related: 相似文章信息字典 468 | comment: 评论信息字典 469 | content: 文章内容 470 | """ 471 | text = self._get_gzh_article_text(url) 472 | 473 | yuan_url = url #self.deal_get_real_url(url) 2017-5-3搜狗升级获取永久链接方法 474 | 475 | comment = '' #2017-04-27搜狗微信取消评论数据self.deal_article_comment(text=text) 476 | content_html = self.deal_article_content(text=text) 477 | retu = { 478 | 'yuan': yuan_url, 479 | 'comment': comment, 480 | 'content_html': content_html 481 | } 482 | 483 | if title is not None: 484 | related = self.deal_article_related(url, title) 485 | retu['related'] = related 486 | return retu 487 | else: 488 | return retu 489 | 490 | def get_recent_article_url_by_index_single(self, kind=0, page=0): 491 | """获取首页推荐文章公众号最近文章地址 492 | 493 | Args: 494 | kind: 类别,从0开始,经检测,至少应检查0-19,不保证之间每个都有 495 | page: 页数,从0开始 496 | 497 | Returns: 
498 | recent_article_urls或者False 499 | recent_article_urls: 最近文章地址列表 500 | False: 该kind和page对应的页数没有文章 501 | """ 502 | if page == 0: 503 | page_str = 'pc_0' 504 | else: 505 | page_str = str(page) 506 | url = 'https://weixin.sogou.com/pcindex/pc/pc_' + str(kind) + '/' + page_str + '.html' 507 | try: 508 | text = self._get(url) 509 | page = etree.HTML(text) 510 | recent_article_urls = page.xpath('//li/div[@class="pos-wxrw"]/a/@href') 511 | reurls = [] 512 | for reurl in recent_article_urls: 513 | if 'mp.weixin.qq.com' in reurl: 514 | reurls.append(reurl) 515 | return reurls 516 | except WechatSogouRequestsException as e: 517 | if e.status_code == 404: 518 | return False 519 | 520 | def get_recent_article_url_by_index_all(self): 521 | """获取首页推荐文章公众号最近文章地址,所有分类,所有页数 522 | 523 | Returns: 524 | return_urls: 最近文章地址列表 525 | """ 526 | return_urls = [] 527 | for i in range(20): 528 | j = 0 529 | urls = self.get_recent_article_url_by_index_single(i, j) 530 | while urls: 531 | return_urls.extend(urls) 532 | j += 1 533 | urls = self.get_recent_article_url_by_index_single(i, j) 534 | return return_urls 535 | 536 | def get_sugg(self, keyword): 537 | """获取微信搜狗搜索关键词联想 538 | 539 | Args: 540 | keyword: 关键词 541 | 542 | Returns: 543 | sugg: 联想关键词列表 544 | 545 | Raises: 546 | WechatSogouException: get_sugg keyword error 关键词不是str或者不是可以str()的类型 547 | WechatSogouException: sugg refind error 返回分析错误 548 | """ 549 | try: 550 | keyword = str(keyword) if type(keyword) != str else keyword 551 | except Exception as e: 552 | logger.error('get_sugg keyword error', e) 553 | raise WechatSogouException('get_sugg keyword error') 554 | url = 'http://w.sugg.sogou.com/sugg/ajaj_json.jsp?key=' + keyword + '&type=wxpub&pr=web' 555 | text = self._get(url, 'get', host='w.sugg.sogou.com') 556 | try: 557 | sugg = re.findall(u'\["' + keyword + '",(.*?),\["', text)[0] 558 | sugg = eval(sugg) 559 | return sugg 560 | except Exception as e: 561 | logger.error('sugg refind error', e) 562 | raise 
WechatSogouException('sugg refind error') 563 | 564 | def deal_mass_send_msg(self, url, wechatid): 565 | """解析 历史消息 566 | 567 | ::param url是抓包获取的历史消息页 568 | """ 569 | session = requests.session() 570 | r = session.get(url, verify=False) 571 | #print(r) 572 | if r.status_code == requests.codes.ok: 573 | try: 574 | biz = re.findall('biz = \'(.*?)\',', r.text)[0] 575 | key = re.findall('key = \'(.*?)\',', r.text)[0] 576 | uin = re.findall('uin = \'(.*?)\',', r.text)[0] 577 | pass_ticket = self._get_url_param(url).get('pass_ticket', [''])[0] 578 | 579 | self._uinkeybiz(wechatid, uin, key, biz, pass_ticket, 0) 580 | self._cache_history_session(wechatid, session) 581 | 582 | except IndexError: 583 | logger.error('deal_mass_send_msg error. maybe you should get the mp url again') 584 | #raise WechatSogouHistoryMsgException('deal_mass_send_msg error. maybe you should get the mp url again') 585 | return 404 586 | else: 587 | logger.error('requests status_code error', r.status_code) 588 | raise WechatSogouRequestsException('requests status_code error', r.status_code) 589 | 590 | #获取历史消息 591 | def deal_mass_send_msg_page(self, wechatid, updatecache=True): 592 | url = 'http://mp.weixin.qq.com/mp/getmasssendmsg?' 
593 | uin, key, biz, pass_ticket, frommsgid = self._uinkeybiz(wechatid) 594 | #print([uin, key, biz, pass_ticket, frommsgid]) 595 | url = url + 'uin=' + uin + '&' 596 | url = url + 'key=' + key + '&' 597 | url = url + '__biz=' + biz + '&' 598 | url = url + 'pass_ticket=' + pass_ticket + '&' 599 | url = url + 'frommsgid=' + str(frommsgid) + '&' 600 | data = { 601 | 'f': 'json', 602 | 'count': '10', 603 | 'wxtoken': '', 604 | 'x5': '0' 605 | } 606 | for k, v in data.items(): 607 | url = url + k + '=' + v + '&' 608 | url = url[:-1] 609 | # print(url) 610 | 611 | try: 612 | session = self._cache_history_session(wechatid) 613 | r = session.get(url, headers={'Host': 'mp.weixin.qq.com'}, verify=False) 614 | #print(r.text) 615 | rdic = eval(r.text) 616 | if rdic['ret'] == 0: 617 | 618 | data_dict_from_str = self._str_to_dict(rdic['general_msg_list']) 619 | 620 | if rdic['is_continue'] == 0 and rdic['count'] == 0: 621 | raise WechatSogouEndException() 622 | 623 | msg_dict = self._deal_gzh_article_dict(data_dict_from_str) 624 | msg_dict_new = reversed(msg_dict) 625 | msgid = 0 626 | for m in msg_dict_new: 627 | if int(m['type']) == 49: 628 | msgid = m['qunfa_id'] 629 | break 630 | 631 | if updatecache: 632 | self._uinkeybiz(wechatid, rdic['uin_code'], rdic['key'], rdic['bizuin_code'], pass_ticket, msgid) 633 | 634 | return msg_dict 635 | else: 636 | logger.error('deal_mass_send_msg_page ret ' + str(rdic['ret']) + ' errmsg ' + rdic['errmsg']) 637 | raise WechatSogouHistoryMsgException( 638 | 'deal_mass_send_msg_page ret ' + str(rdic['ret']) + ' errmsg ' + rdic['errmsg']) 639 | except AttributeError: 640 | logger.error('deal_mass_send_msg_page error, please delete cache file') 641 | raise WechatSogouHistoryMsgException('deal_mass_send_msg_page error, please delete cache file') 642 | 643 | 644 | #获取阅读数据 645 | def deal_get_fwh_read(self, wechatid, updatecache,**kwargs): 646 | url = 'http://mp.weixin.qq.com/mp/getappmsgext?' 
647 | uin, key, biz, pass_ticket, frommsgid = self._uinkeybiz(wechatid) 648 | #print([uin, key, biz, pass_ticket, frommsgid]) 649 | url = url + 'uin=' + uin + '&' 650 | url = url + 'key=' + key + '&' 651 | url = url + '__biz=' + biz + '&' 652 | url = url + 'pass_ticket=' + pass_ticket + '&' 653 | url = url + 'frommsgid=' + str(frommsgid) + '&' 654 | url = url + 'mid=' + kwargs.get('mid', None) + '&' 655 | url = url + 'sn=' + kwargs.get('sn', None) + '&' 656 | url = url + 'idx=' + kwargs.get('idx', None) + '&' 657 | 658 | data = { 659 | 'f': 'json', 660 | 'count': '10', 661 | 'wxtoken': '', 662 | 'x5': '0' 663 | } 664 | for k, v in data.items(): 665 | url = url + k + '=' + v + '&' 666 | url = url[:-1] 667 | # print(url) 668 | 669 | try: 670 | session = self._cache_history_session(wechatid) 671 | print(url) 672 | r = session.post(url,headers={'Host': 'mp.weixin.qq.com', 673 | 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat'}, 674 | data={'is_only_read':1}, verify=False) 675 | 676 | 677 | if r.status_code == requests.codes.ok: 678 | try: 679 | rdic = json.loads(r.text) 680 | return rdic['appmsgstat'] 681 | 682 | except IndexError: 683 | logger.error('deal_mass_send_msg error. maybe you should get the mp url again') 684 | #raise WechatSogouHistoryMsgException('deal_mass_send_msg error. 
maybe you should get the mp url again') 685 | return 404 686 | else : 687 | logger.error('requests status_code error', r.status_code) 688 | raise WechatSogouRequestsException('requests status_code error', r.status_code) 689 | 690 | except AttributeError: 691 | logger.error('deal_mass_send_msg_page error, please delete cache file') 692 | raise WechatSogouHistoryMsgException('deal_mass_send_msg_page error, please delete cache file') 693 | 694 | #获取搜狗微信文章上的真实链接 695 | def deal_get_real_url(self, url): 696 | try: 697 | url = url + '&uin=MjExMTY2MjUzNg==' 698 | text = requests.get(url,allow_redirects=False) 699 | return text.headers['Location'] 700 | except: 701 | return "" 702 | 703 | #下载文章到本地 704 | def down_html(self, url,dir_name): 705 | try: 706 | url = url.replace('\\x26','&') 707 | url = url.replace('x26','&') 708 | 709 | print(url) 710 | h = httplib2.Http(timeout=30) 711 | html = self._get_gzh_article_text(url) 712 | content = html 713 | 714 | # 正则表达式javascript里的获取相关变量 715 | ct = re.findall('var ct = "(.*?)";', content)[0] 716 | msg_cdn_url = re.findall('var msg_cdn_url = "(.*?)";', content)[0] 717 | nickname = re.findall('var nickname = "(.*?)";', content)[0] 718 | if(nickname == ""): 719 | nickname = "not has name" 720 | if(ct == ""): 721 | ct = time.time() 722 | 723 | ctime = time.strftime("%Y%m%d%H%M%S", time.localtime(int(ct))) # int将字符串转成数字,不区分int和long, 这里将时间秒数转成日期格式 724 | # 建立文件夹 725 | #编码转换 726 | if isinstance(dir_name, unicode): 727 | dir_name = dir_name.encode('GB18030','ignore') 728 | else: 729 | dir_name = dir_name.decode('utf-8','ignore').encode('GB18030','ignore') 730 | 731 | #print 732 | if isinstance(nickname, unicode): 733 | nickname = nickname.encode('GB18030','ignore') 734 | else: 735 | if chardet.detect(nickname)['encoding'] == 'KOI8-R' : 736 | print("KOI8") 737 | nickname = nickname.decode('KOI8-R','ignore').encode('GB18030','ignore') 738 | else: 739 | print("GB18030") 740 | nickname = 
nickname.decode('utf-8','ignore').encode('GB18030','ignore') 741 | 742 | dir = 'WeiXinGZH/' + nickname + '/' + ctime + '/' + dir_name + '/' 743 | #dir = 'WeiXinGZH/' + dir_name + '/' 744 | dir = dir.decode('gb2312','ignore') 745 | dir = dir.replace("?", "") 746 | dir = dir.replace("\\", "") 747 | dir = dir.replace("*", "") 748 | dir = dir.replace(":", "") 749 | dir = dir.replace('\"', "") 750 | dir = dir.replace("<", "") 751 | dir = dir.replace(">", "") 752 | dir = dir.replace("|", "") 753 | 754 | 755 | try : 756 | os.makedirs(dir) # 建立相应的文件夹 757 | 758 | except : 759 | #不处理 760 | errormsg = 'none' 761 | 762 | # 下载封面 763 | url = msg_cdn_url 764 | print(u'正在下载文章:' + url) 765 | resp, contentface = h.request(url) 766 | 767 | file_name = dir + 'cover.jpg' 768 | codecs.open(file_name,mode='wb').write(contentface) 769 | 770 | # 下载其他图片 771 | soup = BeautifulSoup(content, 'html.parser') 772 | count = 0 773 | #logger.error(html) 774 | err_count = 0 775 | for link in soup.find_all('img') : 776 | try: 777 | err_count += 1 778 | if(err_count > 200) : 779 | break #防止陷阱 780 | 781 | if None != link.get('data-src') : 782 | count = count + 1 783 | orurl = link.get('data-src') 784 | url = orurl.split('?')[0] # 重新构造url,原来的url有一部分无法下载 785 | #print u'正在下载:' + url 786 | resp, content = h.request(url) 787 | 788 | matchurlvalue = re.search(r'wx_fmt=(?P[^&]*)', orurl) # 无参数的可能是gif,也有可能是jpg 789 | if None != matchurlvalue: 790 | wx_fmt = matchurlvalue.group('wx_fmt') # 优先通过wx_fmt参数的值判断文件类型 791 | else: 792 | wx_fmt = binascii.b2a_hex(content[0:4]) # 读取前4字节转化为16进制字符串 793 | 794 | #print wx_fmt 795 | phototype = { 'jpeg': '.jpg', 'gif' : '.gif', 'png' : '.png', 'jpg' : '.jpg', '47494638' : '.gif', 'ffd8ffe0' : '.jpg', 'ffd8ffe1' : '.jpg', 'ffd8ffdb' : '.jpg', 'ffd8fffe' : '.jpg', 'other' : '.jpg', '89504e47' : '.png' } # 方便写文件格式 796 | file_name = 'Picture' + str(count) + phototype[wx_fmt] 797 | file_path = dir + file_name 798 | open(file_path, 'wb').write(content) 799 | 800 | #图片替换成本地地址 801 | 
re_url = 'data-src="%s(.+?)"' % (url[:-5]) 802 | re_pic = 'src="%s"' % (file_name) 803 | html = re.sub(re_url, re_pic, html) 804 | except: 805 | continue 806 | 807 | with open("%sindex.html" % (dir), "wb") as code : 808 | code.write(html) 809 | 810 | print(u'文章下载完成') 811 | ret_path = os.path.abspath('.') 812 | ret_path = ret_path.replace('\\', "/") 813 | ret_path = "%s/%sindex.html" %(ret_path.decode('GB18030').encode('utf-8'),dir) 814 | #print(ret_path) 815 | #except: 816 | except WechatSogouHistoryMsgException: 817 | print(u'文章内容有异常编码,无法下载') 818 | return "" 819 | return ret_path 820 | 821 | 822 | --------------------------------------------------------------------------------