├── changelog.md
├── log.txt
├── requirements.txt
├── screenshot
    ├── wx.png
    └── zfb.png
├── wechatsogou
    ├── db.pyc
    ├── api.pyc
    ├── base.pyc
    ├── basic.pyc
    ├── tools.pyc
    ├── config.pyc
    ├── __init__.pyc
    ├── exceptions.pyc
    ├── filecache.pyc
    ├── ruokuaicode.pyc
    ├── __pycache__
    │   ├── db.cpython-35.pyc
    │   ├── api.cpython-35.pyc
    │   ├── base.cpython-35.pyc
    │   ├── basic.cpython-35.pyc
    │   ├── config.cpython-35.pyc
    │   ├── tools.cpython-35.pyc
    │   ├── __init__.cpython-35.pyc
    │   ├── filecache.cpython-35.pyc
    │   ├── exceptions.cpython-35.pyc
    │   └── ruokuaicode.cpython-35.pyc
    ├── base.py
    ├── __init__.py
    ├── config.py
    ├── tools.py
    ├── exceptions.py
    ├── filecache.py
    ├── ruokuaicode.py
    ├── db.py
    ├── basic.py
    └── api.py
├── cache
    ├── 2029240f6d1128be89ddc32729463129
    └── 8f0f136a8d509c9a5f221e61e813c820
├── test.py
├── logging.conf
├── auto_add_mp_logging.conf
├── README.md
├── auto_add_mp.py
├── cookies.txt
├── updatewenzhang.py
├── updatemp.py
├── jubang.sql
└── auto_add_mp_log.txt


/changelog.md:
--------------------------------------------------------------------------------
1 | # 1.0.0
2 | 
3 | - 重写项目


--------------------------------------------------------------------------------
/log.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/log.txt


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | bs4
2 | requests
3 | PyMySQL
4 | lxml
5 | pillow
6 | werkzeug
7 | 


--------------------------------------------------------------------------------
/screenshot/wx.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/screenshot/wx.png


--------------------------------------------------------------------------------
/screenshot/zfb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/screenshot/zfb.png


--------------------------------------------------------------------------------
/wechatsogou/db.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/db.pyc


--------------------------------------------------------------------------------
/wechatsogou/api.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/api.pyc


--------------------------------------------------------------------------------
/wechatsogou/base.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/base.pyc


--------------------------------------------------------------------------------
/wechatsogou/basic.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/basic.pyc


--------------------------------------------------------------------------------
/wechatsogou/tools.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/tools.pyc


--------------------------------------------------------------------------------
/wechatsogou/config.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/config.pyc


--------------------------------------------------------------------------------
/wechatsogou/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/__init__.pyc


--------------------------------------------------------------------------------
/wechatsogou/exceptions.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/exceptions.pyc


--------------------------------------------------------------------------------
/wechatsogou/filecache.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/filecache.pyc


--------------------------------------------------------------------------------
/wechatsogou/ruokuaicode.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/ruokuaicode.pyc


--------------------------------------------------------------------------------
/cache/2029240f6d1128be89ddc32729463129:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/cache/2029240f6d1128be89ddc32729463129


--------------------------------------------------------------------------------
/cache/8f0f136a8d509c9a5f221e61e813c820:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/cache/8f0f136a8d509c9a5f221e61e813c820


--------------------------------------------------------------------------------
/wechatsogou/__pycache__/db.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/__pycache__/db.cpython-35.pyc


--------------------------------------------------------------------------------
/wechatsogou/__pycache__/api.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/__pycache__/api.cpython-35.pyc


--------------------------------------------------------------------------------
/wechatsogou/__pycache__/base.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/__pycache__/base.cpython-35.pyc


--------------------------------------------------------------------------------
/wechatsogou/base.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
class WechatSogouBase(object):
    """Base class of the Sogou-search-based WeChat official-account crawler interfaces.

    It defines no behaviour of its own; it only gives the package's classes a
    common ancestor.
    """
7 | 


--------------------------------------------------------------------------------
/wechatsogou/__pycache__/basic.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/__pycache__/basic.cpython-35.pyc


--------------------------------------------------------------------------------
/wechatsogou/__pycache__/config.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/__pycache__/config.cpython-35.pyc


--------------------------------------------------------------------------------
/wechatsogou/__pycache__/tools.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/__pycache__/tools.cpython-35.pyc


--------------------------------------------------------------------------------
/wechatsogou/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/__pycache__/__init__.cpython-35.pyc


--------------------------------------------------------------------------------
/wechatsogou/__pycache__/filecache.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/__pycache__/filecache.cpython-35.pyc


--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
# Print the major version of the running interpreter and whether it is Python 3+.
import sys

major_version = sys.version_info[0]
print(major_version)
is_python3 = major_version > 2
print(is_python3)


--------------------------------------------------------------------------------
/wechatsogou/__pycache__/exceptions.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/__pycache__/exceptions.cpython-35.pyc


--------------------------------------------------------------------------------
/wechatsogou/__pycache__/ruokuaicode.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaryee/wechat_sogou_crawl/HEAD/wechatsogou/__pycache__/ruokuaicode.cpython-35.pyc


--------------------------------------------------------------------------------
/wechatsogou/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from wechatsogou.api import WechatSogouApi
 4 | from wechatsogou.db import mysql
 5 | from wechatsogou.filecache import WechatCache
 6 | 
# Names exported by ``from wechatsogou import *``.
__all__ = ['WechatSogouApi', 'WechatCache', 'mysql']

# Package version string.
__version__ = "1.1.7"
10 | 


--------------------------------------------------------------------------------
/wechatsogou/config.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
# Cache settings
cache_dir = 'cache'
cache_session_name = 'requests_wechatsogou_session'

# MySQL database settings
host = '127.0.0.1'
user = 'sougou'   # database user name
passwd = '123456'   # database password
db = 'jubang'  # default database
charset = 'utf8mb4'
prefix = ''  # default table-name prefix; may be left empty

# Captcha-solving platform (ruokuai) settings  http://www.ruokuai.com/
# After registering and topping up the account it can be used directly; recognising one captcha costs about 0.008 yuan.
# Sogou WeChat occasionally rejects a captcha answer even though it is correct; nothing can be done about that, but it happens very rarely.
dama_name = 'xxx'    # user name
dama_pswd = 'xxx'  # password
20 | 


--------------------------------------------------------------------------------
/logging.conf:
--------------------------------------------------------------------------------
 1 | [loggers]
 2 | keys=root
 3 | 
 4 | [handlers]
 5 | keys=rotateFileHandler
 6 | 
 7 | [formatters]
 8 | keys=simpleFormatter
 9 | 
10 | [logger_root]
11 | level=WARNING
12 | handlers=rotateFileHandler
13 | qualname=simpleExample
14 | propagate=0
15 | 
16 | [handler_rotateFileHandler]
17 | class=handlers.RotatingFileHandler
18 | level=WARNING
19 | formatter=simpleFormatter
20 | args=('log.txt', 'a+', 200000, 9)
21 | 
22 | [formatter_simpleFormatter]
23 | format=%(asctime)s - [%(filename)s:%(lineno)d] - %(levelname)s - %(message)s
24 | datefmt=


--------------------------------------------------------------------------------
/auto_add_mp_logging.conf:
--------------------------------------------------------------------------------
 1 | [loggers]
 2 | keys=root
 3 | 
 4 | [handlers]
 5 | keys=rotateFileHandler
 6 | 
 7 | [formatters]
 8 | keys=simpleFormatter
 9 | 
10 | [logger_root]
11 | level=WARNING
12 | handlers=rotateFileHandler
13 | qualname=simpleExample
14 | propagate=0
15 | 
16 | [handler_rotateFileHandler]
17 | class=handlers.RotatingFileHandler
18 | level=WARNING
19 | formatter=simpleFormatter
20 | args=('auto_add_mp_log.txt', 'a+', 200000, 9)
21 | 
22 | [formatter_simpleFormatter]
23 | format=%(asctime)s - [%(filename)s:%(lineno)d] - %(levelname)s - %(message)s
24 | datefmt=


--------------------------------------------------------------------------------
/wechatsogou/tools.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import json
 4 | 
 5 | def prdict(content):
 6 |     msg = json.dumps(content, indent=1, ensure_ascii=False)
 7 |     print(msg)
 8 | 
 9 | def list_or_empty(content, contype=None):
10 |     if isinstance(content, list):
11 |         if content:
12 |             return contype(content[0]) if contype else content[0]
13 |         else:
14 |             if contype:
15 |                 if contype == int:
16 |                     return 0
17 |                 elif contype == str:
18 |                     return ''
19 |                 elif contype == list:
20 |                     return []
21 |                 else:
22 |                     raise Exception('only cna deal int str list')
23 |             else:
24 |                 return ''
25 |     else:
26 |         raise Exception('need list')


--------------------------------------------------------------------------------
/wechatsogou/exceptions.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | class WechatSogouException(Exception):
 4 |     """基于搜狗搜索的的微信公众号爬虫接口  异常基类
 5 |     """
 6 |     pass
 7 | 
 8 | 
 9 | class WechatSogouVcodeException(WechatSogouException):
10 |     """基于搜狗搜索的的微信公众号爬虫接口 出现验证码 异常类
11 |     """
12 |     pass
13 | 
14 | 
15 | class WechatSogouJsonException(WechatSogouException):
16 |     """基于搜狗搜索的的微信公众号爬虫接口 非标准json数据 异常类
17 |     """
18 |     pass
19 | 
20 | 
21 | class WechatSogouEndException(WechatSogouException):
22 |     """基于搜狗搜索的的微信公众号爬虫接口 数据处理完成 异常类
23 |     """
24 |     pass
25 | 
26 | class WechatSogouBreakException(WechatSogouException):
27 |     """基于搜狗搜索的的微信公众号爬虫接口 中断 异常类
28 |     """
29 |     pass
30 | 
31 | class WechatSogouHistoryMsgException(WechatSogouException):
32 |     """基于搜狗搜索的的微信公众号爬虫接口 数据处理完成 异常类
33 |     """
34 |     pass
35 | 
36 | class ConfigException(WechatSogouException):
37 |     """基于搜狗搜索的的微信公众号爬虫接口 配置错误 异常类
38 |     """
39 |     pass
40 | 
41 | class WechatSogouRequestsException(WechatSogouException):
42 |     """基于搜狗搜索的的微信公众号爬虫接口 抓取 异常类
43 |     """
44 | 
45 |     def __init__(self, errmsg, status_code):
46 |         WechatSogouException(errmsg)
47 |         self.status_code = status_code
48 | 


--------------------------------------------------------------------------------
/wechatsogou/filecache.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from werkzeug.contrib.cache import FileSystemCache
 4 | 
 5 | from .base import WechatSogouBase
 6 | 
 7 | class WechatCache(WechatSogouBase):
 8 |     """基于文件的缓存
 9 | 
10 |     """
11 | 
12 |     def __init__(self, cache_dir='cache', default_timeout=300):
13 |         """初始化
14 | 
15 |         cache_dir是缓存目录
16 |         """
17 |         self.cache = FileSystemCache(cache_dir, default_timeout=default_timeout)
18 | 
19 |     def clear(self):
20 |         """清空缓存
21 |         """
22 |         return self.cache.clear()
23 | 
24 |     def get(self, key):
25 |         """获取缓存
26 | 
27 |         获取键值key的缓存值
28 |         如果没有对应缓存，返回None
29 |         """
30 |         return self.cache.get(key)
31 | 
32 |     def add(self, key, value, timeout=None):
33 |         """增加缓存
34 | 
35 |         如果键值key对应的缓存不存在，那么增加值value到键值key，过期时间timeout，默认300秒
36 |         否则返回False（即不能覆盖设置缓存）
37 |         """
38 |         return self.cache.add(key, value, timeout)
39 | 
40 |     def set(self, key, value, timeout=None):
41 |         """设置缓存
42 | 
43 |         设置键值key的缓存为value,过期时间300秒
44 |         """
45 |         return self.cache.set(key, value, timeout)
46 | 
47 |     def delete(self, key):
48 |         """删除缓存
49 | 
50 |         删除键值key存储的缓存
51 |         """
52 |         return self.cache.delete(key)
53 | 
54 | 
if __name__ == '__main__':
    # Manual smoke test: store a requests session in the cache and read it back.
    import requests

    cache = WechatCache()
    r = requests.session()

    stored = cache.set('1', r)
    print(stored)

    fetched = cache.get('1')
    fetched_type = type(cache.get('1'))
    print(fetched, fetched_type)
62 | 


--------------------------------------------------------------------------------
/wechatsogou/ruokuaicode.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import requests
 4 | from hashlib import md5
 5 | 
 6 | from .base import WechatSogouBase
 7 | 
 8 | class RClient(WechatSogouBase):
 9 | 
10 |     def __init__(self, username, password, soft_id, soft_key):
11 |         self.username = username
12 |         self.password = md5(password.encode('utf-8')).hexdigest()
13 |         self.soft_id = soft_id
14 |         self.soft_key = soft_key
15 |         self.base_params = {
16 |             'username': self.username,
17 |             'password': self.password,
18 |             'softid': self.soft_id,
19 |             'softkey': self.soft_key,
20 |         }
21 |         self.headers = {
22 |             'Connection': 'Keep-Alive',
23 |             'Expect': '100-continue',
24 |             'User-Agent': 'ben',
25 |         }
26 | 
27 |     def create(self, im, im_type, timeout=60):
28 |         """
29 |         im: 图片字节
30 |         im_type: 题目类型
31 |         """
32 |         params = {
33 |             'typeid': im_type,
34 |             'timeout': timeout,
35 |         }
36 |         params.update(self.base_params)
37 |         files = {'image': ('a.jpg', im)}
38 |         r = requests.post('http://api.ruokuai.com/create.json', data=params, files=files, headers=self.headers)
39 |         return r.json()
40 | 
41 |     def report_error(self, im_id):
42 |         """
43 |         im_id:报错题目的ID
44 |         """
45 |         params = {
46 |             'id': im_id,
47 |         }
48 |         params.update(self.base_params)
49 |         r = requests.post('http://api.ruokuai.com/reporterror.json', data=params, headers=self.headers)
50 |         return r.json()


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 基于搜狗微信搜索的微信公众号爬虫
 2 | ===
 3 | 
 4 | [![pypi supported versions](https://img.shields.io/pypi/pyversions/kubernetes.svg)](https://github.com/jaryee/wechat_sogou_crawl)
 5 | 
 6 | 2019-03-30 适应搜狗2019-03-29规则变化
 7 | 
 8 | 2019-03-07 增加对py3的支持，同时支持py2和py3
 9 | 
10 | 2017-4-27 搜狗微信取消了阅读、点赞及评论数据，所以无法通过搜狗获取这些数据了。
11 | 
12 | # 项目简介
13 | 基于搜狗微信搜索的微信公众号爬虫
14 | 可以抓取指定公众号的文章信息
15 | 
16 | # 赞助作者
17 | 俺是自由职业者，好汉们如果可能的话赞助一些让俺将开源事业进行到底，谢谢！！！
18 | 
19 | <img src="https://github.com/jaryee/wechat_sogou_crawl/blob/master/screenshot/wx.png" width="250" />
20 | <img src="https://github.com/jaryee/wechat_sogou_crawl/blob/master/screenshot/zfb.png" width="250" />
21 | 
22 | 兄弟我弄了个淘宝店，有时间的兄弟给捧个场啊，新店需要信誉积分，跪谢！只要一块钱，就能温暖你我他
23 | https://item.taobao.com/item.htm?spm=a230r.1.14.16.PRhaio&id=543333631871&ns=1&abbucket=6#detail
24 | 
25 | 
26 | 
27 | 使用教程大家可以去我的博客查看：
28 | http://blog.csdn.net/niuxiaojia09/article/details/55260770
29 | 
30 | 
31 | 2017-1-20 增加如何使程序进入搜狗微信登录状态的说明，在updatemp.py和updatewenzhang.py中都有操作说明
32 | 2017-3-21 在api.py中增加把文章本地化的函数，可以根据自己的需要把文章下载到本地
33 | 
34 | # 项目使用
35 | 
36 | 一、使用说明
37 | 
38 | 1、在mysql数据库中创建数据库，数据库命名为jubang（与config.py中的db设置一致），字符集为utf8mb4，然后导入jubang.sql文件，创建对应的数据库表
39 | 
40 | 2、修改config.py文件中对应的设置，打码平台配置ruokuai这个一定要设置，否则出现验证码就不能正常工作了
41 | 
42 | 3、执行：pip install -r requirements.txt  安装所需要的第三方包
43 | 
44 | 4、手动或自动在add_mp_list表中增加数据，然后运行auto_add_mp.py文件。
45 |    比如可以这样用：给auto_add_mp.py设定一个定时任务，5分钟或10分钟，然后前台页面文件让使用者添加待抓取的
46 |    公众号信息，然后定时任务执行时就可以把这些公众号加入待抓取列表了
47 |    add_mp_list中
48 |    name字段是模糊抓取，会根据输入的名称模糊加入10个公众号
49 |    wx_hao字段是精确抓取，这个是公众号的微信号，只抓取一个
50 |    这两个字段可以任意填入一个就行
51 | 
52 | 5、执行updatemp.py文件，文件说明看后面。使用中可以给该文件设定定时任务30分钟或其它间隔，每隔一定时间，运行该
53 |    文件就会抓取已添加的公众号是否有新文章发出来。
54 |    第一次使用会抓取公众号的最近10条群发数据
55 | 
56 | 6、执行updatewenzhang.py文件，该文件是抓取文章阅读及点赞数的。最新的数据会写入wenzhang_info表中，并且会在表wenzhang_statistics中
57 |    添加增量记录，可以根据wenzhang_statistics表中的数据生成曲线图
58 |    使用中可以给该文件添加5分钟或其它时间的定时任务，这样就可以来生成对应的阅读曲线图了
59 | 
60 | 二、文件说明
61 | 
62 | 1、updatemp.py
63 | 该文件遍历待抓取列表（数据库表：mp_info），查询表中的公众号是否有新文章发布，如果有，就抓取新的文章信息并
64 | 放入数据库表wenzhang_info中
65 | 
66 | 2、updatewenzhang.py
67 | 该文件遍历文章表，然后抓取24小时之内的文章阅读数据存入表wenzhang_info和表wenzhang_statistics中
68 | 
69 | 3、 auto_add_mp.py
70 | 该文件将指定的公众号添加到待抓取列表中
71 | 该文件读取数据库表（add_mp_list）中的内容，然后将其中指定的公众号填入数据库表（mp_info）中
72 | 
73 | 
74 | 
75 | # TODO
76 | - [x] 使用py2.7
77 | - [x] 获取指定公众号文章
78 | - [x] 文章详情页信息
79 | - [x] 验证码自动识别
80 | 
81 | ---
82 | 


--------------------------------------------------------------------------------
/auto_add_mp.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #添加指定公众号到爬虫数据库
 3 | 
 4 | # 导入包
 5 | from wechatsogou.tools import *
 6 | from wechatsogou import *
 7 | from PIL import Image
 8 | import datetime
 9 | import time
10 | import sys,locale
11 | import logging
12 | import logging.config
13 | 
# Logging
logging.config.fileConfig('auto_add_mp_logging.conf')
logger = logging.getLogger()

# Search API instance
wechats = WechatSogouApi()

# Database handle, initially bound to the add_mp_list table.
# NOTE: this rebinds the imported name ``mysql`` from the class to an instance.
mysql = mysql('add_mp_list')


def _insert_mp(info):
    """Insert the official account described by ``info`` into the mp_info table."""
    mysql.table('mp_info').add({
        'name': info['name'],
        'wx_hao': info['wechatid'],
        'company': info['renzhen'],
        'description': info['jieshao'],
        'logo_url': info['img'],
        'qr_url': info['qrcode'],
        'wz_url': info['url'],
        'last_qunfa_id': 0,
        'create_time': time.strftime("%Y-%m-%d %H:%M:%S"),
    })


add_list = mysql.find(0)

for add_item in add_list:
    try:
        print(add_item)
        if add_item['wx_hao']:
            # Exact match: add the single account with this WeChat id.
            print("add by wx_hao")
            # NOTE(review): the WHERE clause is built by string concatenation;
            # a value containing a quote breaks or alters the query. Switch to
            # the db layer's escaping/parameter API once confirmed.
            mysql.where_sql = "wx_hao ='" + add_item['wx_hao'] + "'"
            mp_data = mysql.table('mp_info').find(1)
            if not mp_data:
                wechat_info = wechats.get_gzh_info(add_item['wx_hao'])
                time.sleep(1)
                if wechat_info != "":
                    _insert_mp(wechat_info)
            else:
                print(u"已经存在的公众号")
        elif add_item['name']:
            # Fuzzy match: add every account returned by the name search.
            print("add by name")
            wechat_infos = wechats.search_gzh_info(add_item['name'].encode('utf8'))
            time.sleep(1)
            for wx_item in wechat_infos:
                print(wx_item['name'])
                # Skip accounts that are already stored (same review note on
                # string-built SQL as above; the id comes from crawled data).
                mysql.where_sql = "wx_hao ='" + wx_item['wechatid'] + "'"
                print(mysql.where_sql)
                mp_data = mysql.table('mp_info').find(1)
                if not mp_data:
                    display_name = wx_item['name']
                    # Only byte strings need decoding; calling .decode() on a
                    # Python 3 str raised AttributeError and aborted the insert.
                    if isinstance(display_name, bytes):
                        display_name = display_name.decode("utf-8")
                    print(display_name)
                    _insert_mp(wx_item)
                else:
                    print(u"已经存在的公众号")

        # Remove the processed request; it is kept for a retry when anything
        # above raised.
        mysql.table('add_mp_list').where({'_id': add_item['_id']}).delete()
    except KeyboardInterrupt:
        # Let Ctrl-C stop the run instead of being swallowed as an item error.
        break
    except Exception:
        logger.exception("failed to process add_mp_list item %r", add_item)
        print(u"出错，继续")
        continue


print("success")
86 | 
87 |     
88 | 
89 | 


--------------------------------------------------------------------------------
/cookies.txt:
--------------------------------------------------------------------------------
  1 | [
  2 | {
  3 |     "domain": ".sogou.com",
  4 |     "expirationDate": 1585149164.37005,
  5 |     "hostOnly": false,
  6 |     "httpOnly": false,
  7 |     "name": "IPLOC",
  8 |     "path": "/",
  9 |     "sameSite": "no_restriction",
 10 |     "secure": false,
 11 |     "session": false,
 12 |     "storeId": "0",
 13 |     "value": "CN1100",
 14 |     "id": 1
 15 | },
 16 | {
 17 |     "domain": ".sogou.com",
 18 |     "expirationDate": 1595289600,
 19 |     "hostOnly": false,
 20 |     "httpOnly": false,
 21 |     "name": "sct",
 22 |     "path": "/",
 23 |     "sameSite": "no_restriction",
 24 |     "secure": false,
 25 |     "session": false,
 26 |     "storeId": "0",
 27 |     "value": "39",
 28 |     "id": 2
 29 | },
 30 | {
 31 |     "domain": ".sogou.com",
 32 |     "expirationDate": 1585148845,
 33 |     "hostOnly": false,
 34 |     "httpOnly": false,
 35 |     "name": "SNUID",
 36 |     "path": "/",
 37 |     "sameSite": "no_restriction",
 38 |     "secure": false,
 39 |     "session": false,
 40 |     "storeId": "0",
 41 |     "value": "327715CDBFBA3ABE12793907BFD60EC1",
 42 |     "id": 3
 43 | },
 44 | {
 45 |     "domain": ".sogou.com",
 46 |     "expirationDate": 2181949036.801871,
 47 |     "hostOnly": false,
 48 |     "httpOnly": false,
 49 |     "name": "SUID",
 50 |     "path": "/",
 51 |     "sameSite": "no_restriction",
 52 |     "secure": false,
 53 |     "session": false,
 54 |     "storeId": "0",
 55 |     "value": "6078AB732320940A000000005C75E06B",
 56 |     "id": 4
 57 | },
 58 | {
 59 |     "domain": ".sogou.com",
 60 |     "expirationDate": 1866589037.181813,
 61 |     "hostOnly": false,
 62 |     "httpOnly": false,
 63 |     "name": "SUV",
 64 |     "path": "/",
 65 |     "sameSite": "no_restriction",
 66 |     "secure": false,
 67 |     "session": false,
 68 |     "storeId": "0",
 69 |     "value": "00E11DBD73AB78605C75E06C33CEE997",
 70 |     "id": 5
 71 | },
 72 | {
 73 |     "domain": ".weixin.sogou.com",
 74 |     "expirationDate": 2181949036.693053,
 75 |     "hostOnly": false,
 76 |     "httpOnly": false,
 77 |     "name": "SUID",
 78 |     "path": "/",
 79 |     "sameSite": "no_restriction",
 80 |     "secure": false,
 81 |     "session": false,
 82 |     "storeId": "0",
 83 |     "value": "6078AB737D29990A000000005C75E06B",
 84 |     "id": 6
 85 | },
 86 | {
 87 |     "domain": "weixin.sogou.com",
 88 |     "expirationDate": 1553821036.664727,
 89 |     "hostOnly": true,
 90 |     "httpOnly": false,
 91 |     "name": "ABTEST",
 92 |     "path": "/",
 93 |     "sameSite": "no_restriction",
 94 |     "secure": false,
 95 |     "session": false,
 96 |     "storeId": "0",
 97 |     "value": "0|1551229035|v1",
 98 |     "id": 7
 99 | },
100 | {
101 |     "domain": "weixin.sogou.com",
102 |     "hostOnly": true,
103 |     "httpOnly": false,
104 |     "name": "JSESSIONID",
105 |     "path": "/",
106 |     "sameSite": "no_restriction",
107 |     "secure": false,
108 |     "session": true,
109 |     "storeId": "0",
110 |     "value": "aaataTUG3ggNN7aMO65Mw",
111 |     "id": 8
112 | },
113 | {
114 |     "domain": "weixin.sogou.com",
115 |     "hostOnly": true,
116 |     "httpOnly": false,
117 |     "name": "PHPSESSID",
118 |     "path": "/",
119 |     "sameSite": "no_restriction",
120 |     "secure": false,
121 |     "session": true,
122 |     "storeId": "0",
123 |     "value": "hipbln966cc23kddoj9qb54385",
124 |     "id": 9
125 | },
126 | {
127 |     "domain": "weixin.sogou.com",
128 |     "expirationDate": 1559869037,
129 |     "hostOnly": true,
130 |     "httpOnly": false,
131 |     "name": "weixinIndexVisited",
132 |     "path": "/",
133 |     "sameSite": "no_restriction",
134 |     "secure": false,
135 |     "session": false,
136 |     "storeId": "0",
137 |     "value": "1",
138 |     "id": 10
139 | }
140 | ]


--------------------------------------------------------------------------------
/updatewenzhang.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #更新文章阅读数据，目前一篇文章只监控24小时
  3 | 
  4 | # 导入包
  5 | from wechatsogou.tools import *
  6 | from wechatsogou import *
  7 | from PIL import Image
  8 | import datetime
  9 | import time
 10 | import logging
 11 | import logging.config
 12 | 
 13 | # 日志
 14 | logging.config.fileConfig('logging.conf')
 15 | logger = logging.getLogger()
 16 | 
 17 | 
 18 | # 搜索API实例
 19 | wechats = WechatSogouApi()
 20 | 
 21 | #如果想使用外部cookie，主要是为了实现搜狗微信登录状态
 22 | #你需要安装chrom浏览器，然后给浏览器安装EditThisCooke这个插件
 23 | #1、使用Chrom浏览器登录搜狗微信
 24 | #2、使用EditThisCooke插件复制当前Cookie信息
 25 | #3、把cookie信息复制到代码目录下的cookies.txt文件
 26 | #4、开启下面这行语句
 27 | #wechats = WechatSogouApi(cookies_file={'file_name':'cookies.txt'})  #使用外部cookie
 28 | 
 29 | 
 30 | #数据库实例
 31 | mysql.order_sql = " order by _id desc"
 32 | mysql = mysql('mp_info')
 33 | 
 34 | #循环获取数据库中所有公众号
 35 | mp_list = mysql.find(0)
 36 | 
 37 | 
 38 | now_time = datetime.datetime.now()
 39 | yes_time = now_time + datetime.timedelta(days=-1) #只更新1天之内的数据，可以修改days=-2就是2天
 40 | succ_count = 1
 41 | 
 42 | for item in mp_list:
 43 |     try:
 44 |         #为了效率，首先查看该公众号是否有24小时之内的文章
 45 |         mysql.where_sql = "mp_id=%d and date_time >'%s'" %(item['_id'],yes_time)
 46 |         wz_time = mysql.table('wenzhang_info').find(1)
 47 |         if not wz_time :
 48 |             continue
 49 | 
 50 |         print(item['name'])
 51 |         #print('1')
 52 |         wz_url = ""
 53 |         if item.has_key('wz_url') :
 54 |             wz_url = item['wz_url']
 55 |         else :
 56 |             wechat_info = wechats.get_gzh_info(item['wx_hao'])
 57 |             if not wechat_info.has_key('url') :
 58 |                 continue
 59 |             wz_url = wechat_info['url'];
 60 | 
 61 |         #print('2')
 62 |         wz_list = wechats.get_gzh_message(url=wz_url)
 63 |         if u'链接已过期' in wz_list:
 64 |             wechat_info = wechats.get_gzh_info(item['wx_hao'])
 65 |             print(wechat_info)
 66 |             if not wechat_info.has_key('url') :
 67 |                 continue
 68 |             print('guo qi sz chong xin huo qu success')
 69 |             wz_url = wechat_info['url'];
 70 |             wz_list = wechats.get_gzh_message(url=wz_url)
 71 |             mysql.where_sql = " _id=%s" %(item['_id'])
 72 |             mysql.table('mp_info').save({'wz_url':wechat_info['url'],'logo_url':wechat_info['img'],'qr_url':wechat_info['qrcode']})
 73 |         #type==49表示是图文消息
 74 |         #print('3')
 75 |         for wz_item in wz_list :
 76 |             #只监控24小时之内的文章
 77 |             if(wz_item['datetime'] < time.mktime(yes_time.timetuple())):
 78 |                 break
 79 | 
 80 |             if wz_item['type'] == '49':
 81 |                 #获取文章数据
 82 |                 time.sleep(0.5)
 83 |                 article_info = wechats.deal_article(url=wz_item['content_url'])
 84 |                 mysql.where_sql = " mp_id=%d and qunfa_id=%d and msg_index=%d" %(item['_id'],wz_item['qunfa_id'],wz_item['main'])
 85 |                 #print(mysql.where_sql)
 86 |                 wz_data = mysql.table('wenzhang_info').find(1)
 87 |                 if not wz_data :
 88 |                     print(u"公众号有新文章了，请执行Updtaemp.py进行抓取")
 89 |                     continue
 90 | 
 91 |                 #获取当前的数据
 92 |                 print(succ_count)
 93 |                 succ_count += 1
 94 |                 read_count = wz_data['read_count']
 95 |                 like_count = wz_data['like_count']
 96 |                 comment_count = wz_data['comment_count']
 97 |                 print("%d new_read:%d  new_like:%d read:%d  like:%d" %(wz_data['_id'], article_info['comment']['read_num'],article_info['comment']['like_num'],read_count,like_count))
 98 |                 #把文章写入数据库
 99 |                 mysql.table('wenzhang_statistics').add({'wz_id':wz_data['_id'],
100 |                                                 'create_time':time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(time.time())),
101 |                                                 'read_count':int(article_info['comment']['read_num'])-read_count,
102 |                                                 'like_count':int(article_info['comment']['like_num'])-like_count,
103 |                                                 'comment_count': int(article_info['comment']['elected_comment_total_cnt'])-comment_count})
104 |                 #print('5')
105 |             #更新文章总阅读数
106 |             mysql.where_sql = " _id=%s" %(wz_data['_id'])
107 |             mysql.table('wenzhang_info').save({'read_count':int(article_info['comment']['read_num']),
108 |                                                                             'like_count':int(article_info['comment']['like_num']),
109 |                                                                             'comment_count': int(article_info['comment']['elected_comment_total_cnt'])})
110 |     except KeyboardInterrupt:
111 |         break
112 |     except: #如果不想因为错误使程序退出，可以开启这两句代码
113 |         print(u"出错，继续")
114 |         continue
115 |                 
116 | print('success')
117 | 
118 | 


--------------------------------------------------------------------------------
/updatemp.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #查找公众号最新文章
  3 | 
  4 | # 导入包
  5 | from wechatsogou.tools import *
  6 | from wechatsogou import *
  7 | from PIL import Image
  8 | import datetime
  9 | import time
 10 | import logging
 11 | import logging.config
 12 | import random
 13 | 
 14 | # 日志
 15 | logging.config.fileConfig('logging.conf')
 16 | logger = logging.getLogger()
 17 | 
 18 | # 搜索API实例
 19 | wechats = WechatSogouApi() #不使用外部Cookie
 20 | 
 21 | 
 22 | #如果想使用外部cookie，主要是为了实现搜狗微信登录状态
 23 | #你需要安装chrom浏览器，然后给浏览器安装EditThisCooke这个插件
 24 | #1、使用Chrom浏览器登录搜狗微信
 25 | #2、使用EditThisCooke插件复制当前Cookie信息
 26 | #3、把cookie信息复制到代码目录下的cookies.txt文件
 27 | #4、开启下面这行语句
 28 | #wechats = WechatSogouApi(cookies_file={'file_name':'cookies.txt'})  #使用外部cookie
 29 | 
 30 | 
 31 | #数据库实例
 32 | mysql = mysql('mp_info')
 33 | 
 34 | #循环获取数据库中所有公众号
 35 | mysql.order_sql = " order by _id desc"
 36 | mp_list = mysql.find(0)
 37 | 
 38 | succ_count = 0
 39 | 
 40 | now_time = datetime.datetime.today()
 41 | now_time = datetime.datetime(now_time.year, now_time.month, now_time.day, 0, 0, 0)
 42 | #now_time = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(now_time))
 43 | 
 44 | for item in mp_list:
 45 |     try:
 46 |         time.sleep(random.randrange(1,3))
 47 |         #查看一下该号今天是否已经发送文章
 48 |         last_qunfa_id = item['last_qunfa_id']
 49 |         last_qunfa_time = item['last_qufa_time']
 50 | 
 51 |         cur_qunfa_id = last_qunfa_id
 52 |         wz_url = item['wz_url']
 53 |             
 54 |         print(item['name'])
 55 |         
 56 |         #获取最近文章信息
 57 |         wz_list = wechats.get_gzh_message(url=wz_url)
 58 |         if u'链接已过期' in wz_list:
 59 |             wechat_info = wechats.get_gzh_info(item['wx_hao'])
 60 |             if 'url' not in wechat_info :
 61 |                 continue
 62 |             print('guo qi sz chong xin huo qu success')
 63 |             wz_url = wechat_info['url'];
 64 |             wz_list = wechats.get_gzh_message(url=wz_url)
 65 |             mysql.where_sql = " _id=%s" %(item['_id'])
 66 |             mysql.table('mp_info').where({'_id':item['_id']}).save({'wz_url':wechat_info['url'],'logo_url':wechat_info['img'],'qr_url':wechat_info['qrcode']})
 67 |         #type==49表示是图文消息
 68 |         qunfa_time = ''
 69 |         for wz_item in wz_list :
 70 |             temp_qunfa_id = int(wz_item['qunfa_id'])
 71 |             if(last_qunfa_id >= temp_qunfa_id):
 72 |                 print(u"没有更新文章")
 73 |                 print(u"")
 74 |                 break
 75 |             if(cur_qunfa_id < temp_qunfa_id):
 76 |                 cur_qunfa_id = temp_qunfa_id
 77 |                 qunfa_time = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(wz_item['datetime']))
 78 |             succ_count += 1
 79 |             if wz_item['type'] == '49':
 80 |                 #把文章写入数据库
 81 |                 #更新文章条数
 82 |                 print(succ_count)
 83 |                 print(wz_item['content_url'])
 84 |                 if not wz_item['content_url'] :
 85 |                     continue
 86 |                 
 87 |                 sourceurl = wz_item['source_url']
 88 |                 if len(sourceurl) >= 300 :
 89 |                     sourceurl = ''
 90 | 
 91 |                 #如果想把文章下载到本地，请开启下面的语句,请确保已经安装：urllib2，httplib2，BeautifulSoup4
 92 |                 #返回值为下载的html文件路径，可以自己保存到数据库
 93 |                 #index_html_path = wechats.down_html(wz_item['content_url'],wz_item['title'])
 94 | 
 95 |                 #获取文章正文
 96 |                 wz_content = wechats.deal_article_content(url=wz_item['content_url'])
 97 | 
 98 |                 mysql.table('wenzhang_info').add({'title':wz_item['title'],
 99 |                                                 'source_url':sourceurl,
100 |                                                 'content_url':wz_item['content_url'],
101 |                                                 'cover_url':wz_item['cover'],
102 |                                                 'description':wz_item['digest'],
103 |                                                 'date_time': time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(wz_item['datetime'])),
104 |                                                 'mp_id':item['_id'],
105 |                                                 'author':wz_item['author'],
106 |                                                 'msg_index':wz_item['main'],
107 |                                                 'copyright_stat':wz_item['copyright_stat'],
108 |                                                 'qunfa_id':wz_item['qunfa_id'],
109 |                                                 'type':wz_item['type'],
110 |                                                 'like_count':0,
111 |                                                 'read_count':0,
112 |                                                 'comment_count':0,
113 |                                                 'content':wz_content})
114 | 
115 |                 
116 | 
117 |         #更新最新推送ID
118 |         if(last_qunfa_id < cur_qunfa_id):
119 |             mysql.where_sql = " _id=%s" %(item['_id'])
120 |             mysql.table('mp_info').save({'last_qunfa_id':cur_qunfa_id,'last_qufa_time':qunfa_time,'update_time':time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(time.time()))})
121 |     except KeyboardInterrupt:
122 |         break
123 |     # except: #如果不想因为错误使程序退出，可以开启这两句代码
124 |     #     print(u"出错，继续")
125 |     #     continue
126 |             
127 | print('success')


--------------------------------------------------------------------------------
/jubang.sql:
--------------------------------------------------------------------------------
  1 | /*
  2 | Navicat MySQL Data Transfer
  3 | 
  4 | Source Server         : 47.105.144.60
  5 | Source Server Version : 50723
  6 | Source Host           : 47.105.144.60:3306
  7 | Source Database       : test
  8 | 
  9 | Target Server Type    : MYSQL
 10 | Target Server Version : 50723
 11 | File Encoding         : 65001
 12 | 
 13 | Date: 2019-03-07 20:26:19
 14 | */
 15 | 
 16 | SET FOREIGN_KEY_CHECKS=0;
 17 | 
 18 | -- ----------------------------
 19 | -- Table structure for `add_mp_list`
 20 | -- ----------------------------
 21 | DROP TABLE IF EXISTS `add_mp_list`;
 22 | CREATE TABLE `add_mp_list` (
 23 |   `_id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增ID',
 24 |   `name` varchar(50) DEFAULT '' COMMENT '要添加的公众号名称',
 25 |   `wx_hao` varchar(50) DEFAULT '' COMMENT '公众号的微信号',
 26 |   PRIMARY KEY (`_id`)
 27 | ) ENGINE=InnoDB AUTO_INCREMENT=5 DEFAULT CHARSET=utf8mb4;
 28 | 
 29 | -- ----------------------------
 30 | -- Records of add_mp_list
 31 | -- ----------------------------
 32 | 
 33 | -- ----------------------------
 34 | -- Table structure for `mp_info`
 35 | -- ----------------------------
 36 | DROP TABLE IF EXISTS `mp_info`;
 37 | CREATE TABLE `mp_info` (
 38 |   `_id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增ID',
 39 |   `name` varchar(50) DEFAULT '' COMMENT '公众号名称',
 40 |   `wx_hao` varchar(20) DEFAULT '' COMMENT '公众号的微信号',
 41 |   `company` varchar(100) DEFAULT '' COMMENT '主体名称',
 42 |   `description` varchar(200) DEFAULT '' COMMENT '功能简介',
 43 |   `logo_url` varchar(200) DEFAULT '' COMMENT 'logo url',
 44 |   `qr_url` varchar(200) DEFAULT '' COMMENT '二维码URL',
 45 |   `create_time` datetime DEFAULT NULL COMMENT '加入牛榜时间',
 46 |   `update_time` datetime DEFAULT NULL COMMENT '最后更新时间',
 47 |   `rank_article_release_count` int(11) DEFAULT '0' COMMENT '群发次数',
 48 |   `rank_article_count` int(11) DEFAULT '0' COMMENT '群发篇数',
 49 |   `last_qunfa_id` int(30) DEFAULT '0' COMMENT '最后的群发ID',
 50 |   `last_qufa_time` datetime DEFAULT NULL COMMENT '最后一次群发的时间',
 51 |   `wz_url` varchar(300) DEFAULT '' COMMENT '最近文章URL',
 52 |   PRIMARY KEY (`_id`)
 53 | ) ENGINE=InnoDB AUTO_INCREMENT=287 DEFAULT CHARSET=utf8mb4;
 54 | 
 55 | -- ----------------------------
 56 | -- Records of mp_info
 57 | -- ----------------------------
 58 | INSERT INTO `mp_info` VALUES ('266', '今日头条', 'headline_today', '北京字节跳动科技有限公司', '今日头条官方帐号', 'http://img01.sogoucdn.com/app/a/100520090/oIWsFt3Om27KzYpmW9LaBGPCUxaU', '', '2017-02-16 17:15:09', null, '0', '0', '0', null, 'http://mp.weixin.qq.com/profile?src=3&timestamp=1487236535&ver=1&signature=nDdjBk7tfBptUPQVaSHn*uoQ9hysPGOoChQf5umkzBbz3PSaIHThKmZzsU23I7vU1tNr6R6t8eQS6lC586yDLQ==');
 59 | INSERT INTO `mp_info` VALUES ('276', '新榜', 'newrankcn', '上海看榜信息科技有限公司', '涨粉、变现、运营、观察,新榜给你不一样的新思路.新榜——内容创业服务平台,www.newrank.cn', 'http://img01.sogoucdn.com/app/a/100520090/oIWsFt3CUA6HniQM4e_i7zncqWkk', '', '2017-02-16 17:16:04', null, '0', '0', '0', null, 'http://mp.weixin.qq.com/profile?src=3&timestamp=1487236590&ver=1&signature=A38golU5GzltuG*u78AoIZkLnJS--EsX4PCDJyq3coRVjU3ZoBZ9UUWZNyOHDzCFw1Q34XVteeqgSGthakK1Ig==');
 60 | INSERT INTO `mp_info` VALUES ('278', '娱乐新榜', 'yulexinbang', '北京快络科技有限公司', '娱乐新人第一自媒体平台,为导演找新人,为新人找发展.深度开挖新人潜力与特色,助力新人演艺事业快速起步.向导演制片等影视从业人员提供第一手新晋艺人资料,实现艺人资源与影视需求的完美对接.', '//img01.sogoucdn.com/app/a/100520090/oIWsFt8lrEWgjvNDVlT1S7wL5Nyw', '', '2017-02-16 17:16:04', null, '0', '0', '0', null, 'http://mp.weixin.qq.com/profile?src=3&timestamp=1551960049&ver=1&signature=fd*NZOcIHHxSZQ6Y44LFP1WmzZvhuKe0sJd2PpGunRcPNotPrCVBSO7sVIDjNkOkF8MkVzv35-iroU38v0GQww==');
 61 | INSERT INTO `mp_info` VALUES ('286', '人民日报', 'rmrbwx', '人民日报社', '参与、沟通、记录时代.', '//img01.sogoucdn.com/app/a/100520090/oIWsFt8_jYUmdw1PQgNVhH9vOEvI', '', '2019-03-07 19:54:26', '2019-03-07 19:58:58', '0', '0', '1000008043', '2019-03-07 18:57:13', 'http://mp.weixin.qq.com/profile?src=3&timestamp=1551959664&ver=1&signature=bSSQMK1LY77M4O22qTi37cbhjhwNV7C9V4aor9HLhAt-Wdr*jWO2gFh3jN4KhPmYamKHzx9fg9SuHxCB1nGehg==');
 62 | 
 63 | -- ----------------------------
 64 | -- Table structure for `wenzhang_info`
 65 | -- ----------------------------
 66 | DROP TABLE IF EXISTS `wenzhang_info`;
 67 | CREATE TABLE `wenzhang_info` (
 68 |   `_id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增ID',
 69 |   `title` text COMMENT '文章标题',
 70 |   `source_url` text COMMENT '原文地址',
 71 |   `cover_url` text COMMENT '封面图URL',
 72 |   `description` text COMMENT '文章摘要',
 73 |   `date_time` datetime DEFAULT NULL COMMENT '文章推送时间',
 74 |   `mp_id` int(11) DEFAULT '0' COMMENT '对应的公众号ID',
 75 |   `read_count` int(11) DEFAULT '0' COMMENT '阅读数',
 76 |   `like_count` int(11) DEFAULT '0' COMMENT '点攒数',
 77 |   `comment_count` int(11) DEFAULT '0' COMMENT '评论数',
 78 |   `content_url` text COMMENT '文章临时地址',
 79 |   `author` varchar(50) DEFAULT '' COMMENT '作者',
 80 |   `msg_index` int(11) DEFAULT '0' COMMENT '一次群发中的图文顺序 1是头条 ',
 81 |   `copyright_stat` int(1) DEFAULT '0' COMMENT '11表示原创 其它表示非原创',
 82 |   `qunfa_id` int(30) DEFAULT '0' COMMENT '群发消息ID',
 83 |   `type` int(11) DEFAULT '0' COMMENT '消息类型',
 84 |   `content` longtext COMMENT '文章正文',
 85 |   PRIMARY KEY (`_id`)
 86 | ) ENGINE=InnoDB AUTO_INCREMENT=6579 DEFAULT CHARSET=utf8mb4;
 87 | 
 88 | -- ----------------------------
 89 | -- Records of wenzhang_info
 90 | -- ----------------------------
 91 | 
 92 | -- ----------------------------
 93 | -- Table structure for `wenzhang_statistics`
 94 | -- ----------------------------
 95 | DROP TABLE IF EXISTS `wenzhang_statistics`;
 96 | CREATE TABLE `wenzhang_statistics` (
 97 |   `_id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增ID',
 98 |   `wz_id` int(11) DEFAULT '0' COMMENT '对应的文章ID',
 99 |   `create_time` datetime DEFAULT NULL COMMENT '统计时间',
100 |   `read_count` int(11) DEFAULT '0' COMMENT '新增阅读数',
101 |   `like_count` int(11) DEFAULT '0' COMMENT '新增点攒数',
102 |   `comment_count` int(11) DEFAULT '0' COMMENT '新增评论数',
103 |   PRIMARY KEY (`_id`)
104 | ) ENGINE=InnoDB AUTO_INCREMENT=4006 DEFAULT CHARSET=utf8mb4;
105 | 
106 | -- ----------------------------
107 | -- Records of wenzhang_statistics
108 | -- ----------------------------
109 | 


--------------------------------------------------------------------------------
/wechatsogou/db.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | import pymysql
  4 | from . import config
  5 | 
  6 | 
  7 | class DbException(Exception):
  8 |     """数据库 异常 基类
  9 |     """
 10 |     pass
 11 | 
 12 | 
 13 | class MysqlDbException(DbException):
 14 |     """数据库 myslq 异常类
 15 |     """
 16 |     pass
 17 | 
 18 | 
 19 | class mysql():
 20 |     """数据库类
 21 | 
 22 |     例子
 23 |     m = M('user')
 24 |     m.table('user').add({}) # 插入
 25 |     m.table('user').where({}).save({}) # 更新
 26 |     m.table('user').field(['id']).where({}).order({'id':'desc'}).find() # 读取，asc，desc
 27 |     m.where({}).delete() # 删除
 28 |     """
 29 | 
 30 |     def __init__(self,table='', prefix='', host='',user='',passwd='',db='',charset=''):
 31 |         """初始化
 32 | 
 33 |         table是初始化选择的表，后面可以使用table()函数更改
 34 |         prefix是数据表前缀，一般配置在config中
 35 |         """
 36 |         self.host = config.host
 37 |         self.user = config.user
 38 |         self.passwd = config.passwd
 39 |         self.db = config.db
 40 |         self.charset = config.charset
 41 | 
 42 |         if host:
 43 |             self.host = host
 44 |         if user:
 45 |             self.user = user
 46 |         if passwd:
 47 |             self.passwd = passwd
 48 |         if db:
 49 |             self.db = db
 50 |         if charset:
 51 |             self.charset = charset
 52 |         if prefix:
 53 |             self.prefix = prefix + '_'
 54 |         elif config.prefix:
 55 |             self.prefix = config.prefix + '_'
 56 |         else:
 57 |             self.prefix = ''
 58 |         if table:
 59 |             self.tablename = self.prefix + table
 60 |         self.__conn()
 61 | 
 62 |     def __conn(self):
 63 |         """连接数据库函数
 64 |         """
 65 |         self.conn = pymysql.connect(host=self.host, user=self.user, passwd=self.passwd, db=self.db,
 66 |                                     charset=self.charset, cursorclass=pymysql.cursors.DictCursor)
 67 |         self.cur = self.conn.cursor()
 68 |         return self
 69 | 
 70 |     def __update(self, sqls):
 71 |         """更新语句，可执行update,insert语句
 72 |         """
 73 |         if type(sqls) is str:
 74 |             sta = self.cur.execute(sqls)
 75 |         elif type(sqls) is list:
 76 |             for sql in sqls:
 77 |                 sta = self.cur.execute(sql)
 78 |         else:
 79 |             raise MysqlDbException('更新语句参数错误 - Model.__update')
 80 |         self.conn.commit()
 81 | 
 82 |         return self.cur.lastrowid
 83 | 
 84 |     def __delete(self, sql):
 85 |         """删除语句
 86 |         """
 87 |         return self.cur.execute(sql)
 88 | 
 89 |     def __query(self, sql):
 90 |         """查询语句
 91 |         """
 92 |         return self.cur.execute(sql)
 93 | 
 94 |     def __close(self):
 95 |         """关闭所有连接
 96 |         """
 97 |         self.cur.close()
 98 |         self.conn.close()
 99 | 
100 |     def __del__(self):
101 |         """析构函数
102 |         """
103 |         self.conn.commit()
104 |         self.__close()
105 | 
106 |     """
107 |     以下是封装的提供使用的
108 |     """
109 | 
110 |     def table(self, table, prefix=''):
111 |         """设置数据表, 链式操作
112 |         """
113 |         if prefix:
114 |             prefix = prefix + '_'
115 |         elif hasattr(self, 'prefix'):
116 |             prefix = self.prefix
117 |         else:
118 |             prefix = ''
119 |         self.tablename = prefix + table
120 |         return self
121 | 
122 |     def limit(self, pre, count):
123 |         self.limit_sql = 'limit ' + str(pre) + ',' + str(count)
124 |         return self
125 | 
126 |     def where(self, where):
127 |         """设置条件, 链式操作
128 |         """
129 |         if type(where) is str:
130 |             raise MysqlDbException('请输入字典 - Model.where')
131 |             # self.where_sql = where
132 |         elif type(where) is dict:
133 |             where_sql = ''
134 |             for k, v in where.items():
135 |                 where_sql += "`" + str(k) + "` LIKE '" + str(v) + "' and "
136 |             self.where_sql = where_sql[:-5]
137 |         return self
138 | 
139 |     def field(self, field):
140 |         """设置操作的字段
141 |         """
142 |         if type(field) is str:
143 |             if field == '*':
144 |                 self.field_sql = "*"
145 |             else:
146 |                 self.field_sql = "`" + field + "`"
147 |         elif type(field) is list:
148 |             field_dian = []
149 |             for f in field:
150 |                 field_dian.append("`" + f + "`")
151 |             self.field_sql = ','.join(field_dian)
152 |         else:
153 |             raise MysqlDbException('field参数不是字符或者列表 - Model.field')
154 |         return self
155 | 
156 |     def order(self, order):
157 |         """排序
158 |         """
159 |         if type(order) is dict:
160 |             for k, v in order.items():
161 |                 self.order_sql = " order by `" + k + "` " + v
162 |                 break
163 |         else:
164 |             raise MysqlDbException('排序参数不是字典 - Model.order')
165 |         return self
166 | 
167 |     def add(self, data):
168 |         """插入数据
169 |         """
170 |         ks = ''
171 |         vs = ''
172 |         for k, v in data.items():
173 |             ks += "`" + str(k).replace('\'', '\\\'') + "`,"
174 |             vs += "'" + str(v).replace('\'', '\\\'') + "',"
175 |         if hasattr(self, 'tablename'):
176 |             sql = "insert into `" + self.tablename + "` (" + ks[:-1] + ") values (" + vs[:-1] + ")"
177 |             try:
178 |                 return self.__update(sql)
179 |             except pymysql.err.IntegrityError:
180 |                 pass
181 |         else:
182 |             raise MysqlDbException('缺少数据表 - Model.add')
183 | 
184 |     def save(self, data):
185 |         """更新数据
186 |         """
187 |         if not hasattr(self, 'where_sql'):
188 |             raise MysqlDbException('缺少where语句 - Model.save')
189 |         if not hasattr(self, 'tablename'):
190 |             raise MysqlDbException('缺少tablename - Model.save')
191 |         data_sql = ''
192 |         for k, v in data.items():
193 |             data_sql += "`" + str(k) + "` = '" + str(v) + "',"
194 |         sql = "update `" + self.tablename + "` set " + data_sql[:-1] + " where " + self.where_sql + ";"
195 |         self.__update(sql)
196 | 
197 |     def find(self, size=25):
198 |         """查询数据
199 |         """
200 |         where_sql = " where " + self.where_sql if hasattr(self, 'where_sql') else ""
201 |         field_sql = self.field_sql if hasattr(self, 'field_sql') else "*"
202 |         order_sql = self.order_sql if hasattr(self, 'order_sql') else ""
203 |         limit_sql = self.limit_sql if hasattr(self, 'limit_sql') else ""
204 |         sql = "select " + field_sql + " from `" + self.tablename + "`" + where_sql + order_sql + limit_sql
205 |         self.__query(sql)
206 |         if size == 0:
207 |             return self.cur.fetchall()
208 |         elif size == 1:
209 |             return self.cur.fetchone()
210 |         else:
211 |             return self.cur.fetchmany(size)
212 | 
213 |     def delete(self):
214 |         """删除语句
215 |         """
216 |         where_sql = " where " + self.where_sql if hasattr(self, 'where_sql') else ""
217 |         sql = "delete from `" + self.tablename + "`" + where_sql
218 |         return self.__delete(sql)
219 | 
220 | 
221 | if __name__ == '__main__':
222 |     pass
223 | 


--------------------------------------------------------------------------------
/auto_add_mp_log.txt:
--------------------------------------------------------------------------------
  1 | 2016-10-20 10:35:14,977 - [basic.py:158] - ERROR - <!DOCTYPE HTML>
  2 | <html>
  3 | <head>
  4 |     <meta charset="utf-8">
  5 |     <link rel="shortcut icon" href="//www.sogou.com/images/logo2014/new/favicon.ico" type="image/x-icon">
  6 |     <title>搜狗搜索</title>
  7 |     <link rel="stylesheet" href="static/css/anti.min.css?v=1"/>
  8 |     <script src="//dl.web.sogoucdn.com/common/lib/jquery/jquery-1.11.0.min.js"></script>
  9 |     <script src="static/js/antispider.min.js?v=2"></script>
 10 |     <script>
 11 |         var domain = getDomain();
 12 |         window.imgCode = -1;
 13 | 
 14 |         (function() {
 15 |             function checkSNUID() {
 16 |                 var cookieArr = document.cookie.split('; '),
 17 |                     count = 0;
 18 | 
 19 |                 for(var i = 0, len = cookieArr.length; i < len; i++) {
 20 |                     if (cookieArr[i].indexOf('SNUID=') > -1) {
 21 |                         count++;
 22 |                     }
 23 |                 }
 24 | 
 25 |                 return count > 1;
 26 |             }
 27 | 
 28 |             if(checkSNUID()) {
 29 |                 var date = new Date(), expires;
 30 |                 date.setTime(date.getTime() -100000);
 31 | 
 32 |                 expires = date.toGMTString();
 33 | 
 34 |                 document.cookie = 'SNUID=1;path=/;expires=' + expires;
 35 |                 document.cookie = 'SNUID=1;path=/;expires=' + expires + ';domain=.www.sogou.com';
 36 |                 document.cookie = 'SNUID=1;path=/;expires=' + expires + ';domain=.weixin.sogou.com';
 37 |                 document.cookie = 'SNUID=1;path=/;expires=' + expires + ';domain=.sogou.com';
 38 |                 document.cookie = 'SNUID=1;path=/;expires=' + expires + ';domain=.snapshot.sogoucdn.com';
 39 | 
 40 |                 sendLog('delSNUID');
 41 |             }
 42 | 
 43 |             if(getCookie('seccodeRight') === 'success') {
 44 |                 sendLog('verifyLoop');
 45 | 
 46 |                 setCookie('seccodeRight', 1, getUTCString(-1), location.hostname, '/');
 47 |             }
 48 | 
 49 |             if(getCookie('refresh')) {
 50 |                 sendLog('refresh');
 51 |             }
 52 |         })();
 53 | 
 54 |         function setImgCode(code) {
 55 |             try {
 56 |                 var t = new Date().getTime() - imgRequestTime.getTime();
 57 |                 sendLog('imgCost',"cost="+t);
 58 |             } catch (e) {
 59 |             }
 60 |             window.imgCode = code;
 61 |         }
 62 |         sendLog('index');
 63 | 
 64 |         function changeImg2() {
 65 |         	if(window.event) {
 66 |         		window.event.returnValue=false
 67 |         	}
 68 |         }
 69 |     </script>
 70 | </head>
 71 | <body>
 72 | <div class="header">
 73 |     <div class="logo"><a href="/"><img width="180" height="60" src="//www.sogou.com/images/logo2014/error180x60.png"></a></div>
 74 |     <div class="other"><span class="s1">您的访问出错了</span><span class="s2"><a href="/">返回首页&gt;&gt;</a></span></div>
 75 | </div>
 76 | <div class="content-box">
 77 |     <p class="ip-time-p">IP:36.110.68.16<br>访问时间：2016.10.20 10:35:37</p>
 78 |     <p class="p2">用户您好，您的访问过于频繁，为确认本次访问为正常用户行为，需要您协助验证。</p>
 79 |     <p class="p3"><label for="seccodeInput">验证码：</label></p>
 80 |     <form name="authform" method="POST" id="seccodeForm" action="/">
 81 |         <p class="p4">
 82 |             <input type=text name="c" value="" placeholder="请输入验证码" id="seccodeInput">
 83 |             <input type="hidden" name="tc" id="tc" value="">
 84 |             <input type="hidden" name="r" id="from" value="%2Fweixin%3Fquery%3Dqiaoqiaohuli%26_sug_type_%3D%26_sug_%3Dn%26type%3D1%26page%3D1%26ie%3Dutf8" >
 85 |             <input type="hidden" name="m" value="0" >            <span class="s1">
 86 |                 <script>imgRequestTime=new Date();</script>
 87 |                 <a onclick="changeImg2();" href="javascript:void(0)">
 88 |                     <img id="seccodeImage" onload="setImgCode(1)" onerror="setImgCode(0)" src="util/seccode.php?tc=1476930937" width="100" height="40" alt="请输入图中的验证码" title="请输入图中的验证码">
 89 |                 </a>
 90 |             </span>
 91 |             <a href="javascript:void(0);" id="change-img" onclick="changeImg2();" style="padding-left:50px;">换一张</a>
 92 |             <span class="s2" id="error-tips" style="display: none;"></span>
 93 |         </p>
 94 |     </form>
 95 |     <p class="p5">
 96 |         <a href="javascript:void(0);" id="submit">提交</a>
 97 |         <span>提交后没解决问题？欢迎<a href="http://fankui.help.sogou.com/index.php/web/web/index?type=10&anti_time=1476930937&domain=weixin.sogou.com" target="_blank">反馈</a>。</span>
 98 |     </p>
 99 | </div>
100 | <div id="ft"><a href="http://fuwu.sogou.com/" target="_blank">企业推广</a><a href="http://corp.sogou.com/" target="_blank">关于搜狗</a><a href="/docs/terms.htm?v=1" target="_blank">免责声明</a><a href="http://fankui.help.sogou.com/index.php/web/web/index?type=10&anti_time=1476930937&domain=weixin.sogou.com" target="_blank">意见反馈</a><br>&nbsp;&copy;&nbsp;2016<span id="footer-year"></span>&nbsp;SOGOU&nbsp;-&nbsp;<a href="http://www.miibeian.gov.cn" target="_blank" class="g">京ICP证050897号</a>&nbsp;-&nbsp;京公网安备1100<span class="ba">00000025号</span></div>
101 | <script src="static/js/index.min.js?v=0.1.3"></script>
102 | </body>
103 | </html><!--zly-->
104 | 
105 | 2016-10-20 10:35:19,500 - [basic.py:219] - ERROR - verify code ocr: 解封成功，正在为您跳转来源地址...
106 | 2016-10-20 10:45:59,701 - [basic.py:158] - ERROR - <!DOCTYPE HTML>
107 | <html>
108 | <head>
109 |     <meta charset="utf-8">
110 |     <link rel="shortcut icon" href="//www.sogou.com/images/logo2014/new/favicon.ico" type="image/x-icon">
111 |     <title>搜狗搜索</title>
112 |     <link rel="stylesheet" href="static/css/anti.min.css?v=1"/>
113 |     <script src="//dl.web.sogoucdn.com/common/lib/jquery/jquery-1.11.0.min.js"></script>
114 |     <script src="static/js/antispider.min.js?v=2"></script>
115 |     <script>
116 |         var domain = getDomain();
117 |         window.imgCode = -1;
118 | 
119 |         (function() {
120 |             function checkSNUID() {
121 |                 var cookieArr = document.cookie.split('; '),
122 |                     count = 0;
123 | 
124 |                 for(var i = 0, len = cookieArr.length; i < len; i++) {
125 |                     if (cookieArr[i].indexOf('SNUID=') > -1) {
126 |                         count++;
127 |                     }
128 |                 }
129 | 
130 |                 return count > 1;
131 |             }
132 | 
133 |             if(checkSNUID()) {
134 |                 var date = new Date(), expires;
135 |                 date.setTime(date.getTime() -100000);
136 | 
137 |                 expires = date.toGMTString();
138 | 
139 |                 document.cookie = 'SNUID=1;path=/;expires=' + expires;
140 |                 document.cookie = 'SNUID=1;path=/;expires=' + expires + ';domain=.www.sogou.com';
141 |                 document.cookie = 'SNUID=1;path=/;expires=' + expires + ';domain=.weixin.sogou.com';
142 |                 document.cookie = 'SNUID=1;path=/;expires=' + expires + ';domain=.sogou.com';
143 |                 document.cookie = 'SNUID=1;path=/;expires=' + expires + ';domain=.snapshot.sogoucdn.com';
144 | 
145 |                 sendLog('delSNUID');
146 |             }
147 | 
148 |             if(getCookie('seccodeRight') === 'success') {
149 |                 sendLog('verifyLoop');
150 | 
151 |                 setCookie('seccodeRight', 1, getUTCString(-1), location.hostname, '/');
152 |             }
153 | 
154 |             if(getCookie('refresh')) {
155 |                 sendLog('refresh');
156 |             }
157 |         })();
158 | 
159 |         function setImgCode(code) {
160 |             try {
161 |                 var t = new Date().getTime() - imgRequestTime.getTime();
162 |                 sendLog('imgCost',"cost="+t);
163 |             } catch (e) {
164 |             }
165 |             window.imgCode = code;
166 |         }
167 |         sendLog('index');
168 | 
169 |         function changeImg2() {
170 |         	if(window.event) {
171 |         		window.event.returnValue=false
172 |         	}
173 |         }
174 |     </script>
175 | </head>
176 | <body>
177 | <div class="header">
178 |     <div class="logo"><a href="/"><img width="180" height="60" src="//www.sogou.com/images/logo2014/error180x60.png"></a></div>
179 |     <div class="other"><span class="s1">您的访问出错了</span><span class="s2"><a href="/">返回首页&gt;&gt;</a></span></div>
180 | </div>
181 | <div class="content-box">
182 |     <p class="ip-time-p">IP:36.110.68.19<br>访问时间：2016.10.20 10:46:22</p>
183 |     <p class="p2">用户您好，您的访问过于频繁，为确认本次访问为正常用户行为，需要您协助验证。</p>
184 |     <p class="p3"><label for="seccodeInput">验证码：</label></p>
185 |     <form name="authform" method="POST" id="seccodeForm" action="/">
186 |         <p class="p4">
187 |             <input type=text name="c" value="" placeholder="请输入验证码" id="seccodeInput">
188 |             <input type="hidden" name="tc" id="tc" value="">
189 |             <input type="hidden" name="r" id="from" value="%2Fweixin%3Fquery%3Dinfoqchina%26_sug_type_%3D%26_sug_%3Dn%26type%3D1%26page%3D1%26ie%3Dutf8" >
190 |             <input type="hidden" name="m" value="0" >            <span class="s1">
191 |                 <script>imgRequestTime=new Date();</script>
192 |                 <a onclick="changeImg2();" href="javascript:void(0)">
193 |                     <img id="seccodeImage" onload="setImgCode(1)" onerror="setImgCode(0)" src="util/seccode.php?tc=1476931582" width="100" height="40" alt="请输入图中的验证码" title="请输入图中的验证码">
194 |                 </a>
195 |             </span>
196 |             <a href="javascript:void(0);" id="change-img" onclick="changeImg2();" style="padding-left:50px;">换一张</a>
197 |             <span class="s2" id="error-tips" style="display: none;"></span>
198 |         </p>
199 |     </form>
200 |     <p class="p5">
201 |         <a href="javascript:void(0);" id="submit">提交</a>
202 |         <span>提交后没解决问题？欢迎<a href="http://fankui.help.sogou.com/index.php/web/web/index?type=10&anti_time=1476931582&domain=weixin.sogou.com" target="_blank">反馈</a>。</span>
203 |     </p>
204 | </div>
205 | <div id="ft"><a href="http://fuwu.sogou.com/" target="_blank">企业推广</a><a href="http://corp.sogou.com/" target="_blank">关于搜狗</a><a href="/docs/terms.htm?v=1" target="_blank">免责声明</a><a href="http://fankui.help.sogou.com/index.php/web/web/index?type=10&anti_time=1476931582&domain=weixin.sogou.com" target="_blank">意见反馈</a><br>&nbsp;&copy;&nbsp;2016<span id="footer-year"></span>&nbsp;SOGOU&nbsp;-&nbsp;<a href="http://www.miibeian.gov.cn" target="_blank" class="g">京ICP证050897号</a>&nbsp;-&nbsp;京公网安备1100<span class="ba">00000025号</span></div>
206 | <script src="static/js/index.min.js?v=0.1.3"></script>
207 | </body>
208 | </html><!--zly-->
209 | 
210 | 2016-10-20 10:46:03,927 - [basic.py:219] - ERROR - verify code ocr: 解封成功，正在为您跳转来源地址...
211 | 2016-10-21 09:15:58,844 - [basic.py:158] - ERROR - 出现验证码。。。
212 | 2016-10-21 09:16:09,729 - [basic.py:219] - ERROR - verify code ocr: 解封成功，正在为您跳转来源地址...
213 | 2016-10-21 09:16:09,767 - [basic.py:158] - ERROR - 出现验证码。。。
214 | 2016-10-21 17:30:58,819 - [basic.py:158] - ERROR - 出现验证码。。。
215 | 2016-10-21 17:31:11,736 - [basic.py:210] - ERROR - verify code erro: 验证码输入错误, 请重新输入！
216 | 


--------------------------------------------------------------------------------
/wechatsogou/basic.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | import logging
  4 | import requests
  5 | import random
  6 | import time
  7 | import re
  8 | import sys
  9 | from lxml import etree
 10 | from PIL import Image
 11 | is_python3 = sys.version_info[0] > 2
 12 | if is_python3 == False:
 13 |     import cookielib
 14 | else:
 15 |     import http.cookiejar as cookielib
 16 | import json
 17 | 
 18 | try:
 19 |     from urllib.request import quote as quote
 20 | except ImportError:
 21 |     from urllib import quote as quote
 22 |     import sys
 23 | 
 24 |     reload(sys)
 25 |     sys.setdefaultencoding('utf-8')
 26 | 
# Choose an image loader based on which modules can be imported: the first
# branch is used when a top-level ``StringIO`` module exists, the second
# (ImportError) branch otherwise.
try:
    import StringIO


    def readimg(content):
        """Open raw image data as a PIL image through an in-memory buffer.

        Args:
            content: raw image data, wrapped in ``StringIO.StringIO``

        Returns:
            the object returned by ``Image.open``
        """
        return Image.open(StringIO.StringIO(content))
except ImportError:
    import tempfile


    def readimg(content):
        """Open raw image data as a PIL image through a temporary file.

        Args:
            content: raw image data, written to a ``tempfile.TemporaryFile``

        Returns:
            the object returned by ``Image.open``
        """
        f = tempfile.TemporaryFile()
        f.write(content)
        # NOTE(review): the file is neither rewound nor closed here; presumably
        # Image.open repositions the stream itself and needs it kept open -- confirm.
        return Image.open(f)
 41 | 
 42 | try:
 43 |     import urlparse as url_parse
 44 | except ImportError:
 45 |     import urllib.parse as url_parse
 46 | 
 47 | from lxml import etree
 48 | from PIL import Image
 49 | 
 50 | from . import config
 51 | from .base import WechatSogouBase
 52 | from .exceptions import *
 53 | from .ruokuaicode import RClient
 54 | from .filecache import WechatCache
 55 | 
 56 | import logging
 57 | 
 58 | logger = logging.getLogger()
 59 | 
 60 | 
 61 | class WechatSogouBasic(WechatSogouBase):
 62 |     """基于搜狗搜索的的微信公众号爬虫接口 基本功能类
 63 |     """
 64 | 
 65 |     def __init__(self, **kwargs):
 66 |         self._cache = WechatCache(config.cache_dir, 60 * 60)
 67 |         self._session = self._cache.get(config.cache_session_name) if self._cache.get(
 68 |             config.cache_session_name) else requests.session()
 69 |         
 70 |         self.cookies = ""
 71 |         cookies_file = kwargs.get('cookies_file')
 72 |         if cookies_file:
 73 |             #使用外部cookies
 74 |             print(u"使用外部cookies文件加载")
 75 |             cookie_jar = cookielib.MozillaCookieJar()  
 76 |             cookies = open(cookies_file.get('file_name')).read()
 77 |             for cookie in json.loads(cookies):  
 78 |                 print(cookie['name'])
 79 |                 cookie_jar.set_cookie(cookielib.Cookie(version=0, name=cookie['name'], value=cookie['value'], port=None, port_specified=False, domain=cookie['domain'], domain_specified=False, domain_initial_dot=False, path=cookie['path'], path_specified=True, secure=cookie['secure'], expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False))  
 80 |             self._session.cookies.update(cookie_jar)
 81 |         
 82 |         self.dama_name = config.dama_name
 83 |         self.dama_pswd = config.dama_pswd
 84 |         if self.dama_name != '' and self.dama_pswd != '':
 85 |            self._ocr = RClient(self.dama_name, self.dama_pswd, '70021', 'dcefe229cb9b4e1785b48fbc3525d011')
 86 | 
 87 |         self._agent = [
 88 |             "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0",
 89 |             "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
 90 |             "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586",
 91 |             "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
 92 |             "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
 93 |         ]
 94 | 
 95 |     def _get_elem_text(self, elem):
 96 |         """抽取lxml.etree库中elem对象中文字
 97 | 
 98 |         Args:
 99 |             elem: lxml.etree库中elem对象
100 | 
101 |         Returns:
102 |             elem中文字
103 |         """
104 |         rc = []
105 |         for node in elem.itertext():
106 |             rc.append(node.strip())
107 |         return ''.join(rc)
108 | 
109 |     def _get_encoding_from_reponse(self, r):
110 |         """获取requests库get或post返回的对象编码
111 | 
112 |         Args:
113 |             r: requests库get或post返回的对象
114 | 
115 |         Returns:
116 |             对象编码
117 |         """
118 |         encoding = requests.utils.get_encodings_from_content(r.text)
119 |         return encoding[0] if encoding else requests.utils.get_encoding_from_headers(r.headers)
120 | 
121 |     def _get(self, url, rtype='get', **kwargs):
122 |         """封装request库get,post方法
123 | 
124 |         Args:
125 |             url: 请求url
126 |             host: 请求host
127 |             referer: 请求referer
128 |             proxy: 是否启用代理请求
129 | 
130 |         Returns:
131 |             text: 请求url的网页内容
132 | 
133 |         Raises:
134 |             WechatSogouException: 操作频繁以致出现验证码或requests请求返回码错误
135 |         """
136 |         referer = kwargs.get('referer', None)
137 |         host = kwargs.get('host', None)
138 |         if host:
139 |             del kwargs['host']
140 |         if referer:
141 |             del kwargs['referer']
142 |         headers = {
143 |             "Host": host if host else 'weixin.sogou.com',
144 |             "Upgrade-Insecure-Requests":'1',
145 |             "User-Agent": self._agent[random.randint(0, len(self._agent) - 1)],
146 |             "Accept":'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
147 |             "Referer": referer if referer else 'https://weixin.sogou.com/',
148 |             "Accept-Encoding":'gzip, deflate, br',
149 |             "Accept-Language":'zh,zh-TW;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6'
150 |             
151 |         }
152 |         if rtype == 'get':
153 |             #self._session.cookies.set
154 |             r = self._session.get(url, headers=headers,verify=False, **kwargs)
155 |         else:
156 |             data = kwargs.get('data', None)
157 |             json = kwargs.get('json', None)
158 |             r = self._session.post(url, data=data, json=json, headers=headers,verify=False, **kwargs)
159 |         
160 |         #logger.error(r.text)
161 |         if u'链接已过期' in r.text:
162 |             return '链接已过期'
163 |         if r.status_code == requests.codes.ok:
164 |             r.encoding = self._get_encoding_from_reponse(r)
165 |             if u'用户您好，您的访问过于频繁，为确认本次访问为正常用户行为，需要您协助验证' in r.text or u'用户您好，我们的系统检测到您网络中存在异常访问请求' in r.text:
166 |                 self._vcode_url = url
167 |                 logger.error(u'出现验证码。。。')
168 |                 print(u'用户您好，您的访问过于频繁，为确认本次访问为正常用户行为，需要您协助验证')
169 |                 raise WechatSogouVcodeException('weixin.sogou.com verification code')
170 |         else:
171 |             logger.error('requests status_code error %d' % (r.status_code))
172 |             raise WechatSogouRequestsException('requests status_code error', r.status_code)
173 |         return r.text
174 | 
175 |     def _jiefeng(self):
176 |         """对于出现验证码，识别验证码，解封
177 | 
178 |         Args:
179 |             ruokuai: 是否采用若快打码平台
180 | 
181 |         Raises:
182 |             WechatSogouVcodeException: 解封失败，可能验证码识别失败
183 |         """
184 |         max_count = 0
185 |         while(max_count < 10) :
186 |             print(u"出现验证码，准备自动识别")
187 |             max_count += 1
188 |             logger.debug('vcode appear, using _jiefeng')
189 |             codeurl = 'https://weixin.sogou.com/antispider/util/seccode.php?tc=' + str(time.time())[0:10]
190 | 
191 |             user_agent = self._agent[random.randint(0, len(self._agent) - 1)]
192 |             headers = {
193 |             "Host": 'weixin.sogou.com',
194 |             "Upgrade-Insecure-Requests":'1',
195 |             "User-Agent": user_agent,
196 |             "Accept":'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
197 |             "Referer":'https://weixin.sogou.com/',
198 |             "Accept-Encoding":'gzip, deflate, sdch',
199 |             "Accept-Language":'zh-CN,zh;q=0.8'
200 |             }
201 | 
202 |             coder = self._session.get(codeurl,headers=headers,timeout=10,verify=False)
203 |                 
204 |             codeID = "0"
205 |             
206 |             if hasattr(self, '_ocr'):
207 |                 result = self._ocr.create(coder.content, 3060)
208 |                 print(result)
209 |                 if 'Result' not in result :
210 |                     print(u"若快识别失败，1秒后更换验证码再次尝试，尝试次数：%d" %(max_count))
211 |                     time.sleep(1)
212 |                     continue #验证码识别错误，再次执行
213 |                 else:
214 |                     print(u"验证码识别成功 验证码：%s" %(result['Result']))
215 | 
216 |                     img_code = result['Result']
217 |                     codeID = result['Id']
218 | 
219 |                     post_url = 'https://weixin.sogou.com/antispider/thank.php'
220 |                     post_data = {
221 |                         'c': img_code,
222 |                         'r': quote(self._vcode_url),
223 |                         'v': 5
224 |                     }
225 |                     
226 |                     headers = {
227 |                         "User-Agent": user_agent,
228 |                         'Host': 'weixin.sogou.com',
229 |                         'Referer': 'https://weixin.sogou.com/antispider/?from=%2f' + quote(
230 |                             self._vcode_url.replace('http://', ''))
231 |                     }
232 |                     #time.sleep(3)
233 |                     rr = self._session.post(post_url, post_data, headers=headers,verify=False)
234 |                     remsg = eval(rr.content)
235 |                     if remsg['code'] != 0:
236 |                         print(u"搜狗返回验证码错误，1秒后更换验证码再次启动尝试，尝试次数：%d" %(max_count))
237 |                         time.sleep(1)
238 |                         continue
239 | 
240 |                     #搜狗又增加验证码机制
241 |                     time.sleep(0.05)
242 |                     cookie_jar = cookielib.MozillaCookieJar()  
243 |                     cookie_jar.set_cookie(cookielib.Cookie(version=0, name='SNUID', value=remsg['id'], port=None, port_specified=False, domain='sogou.com', domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=None, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False))  
244 |                     self._session.cookies.update(cookie_jar)
245 | 
246 |                     pbsnuid = remsg['id'] #pb_cookie['SNUID'].value
247 |                     pbsuv = ''#pb_cookie['SUV'].value
248 |                     print(pbsnuid)
249 |                     print(pbsuv)
250 |                     pburl = 'http://pb.sogou.com/pv.gif?uigs_productid=webapp&type=antispider&subtype=0_seccodeInputSuccess&domain=weixin&suv=%s&snuid=%s&t=%s' %(pbsuv,pbsnuid,str(time.time())[0:10])
251 |                     
252 |                     headers = {
253 |                         "User-Agent": user_agent,
254 |                         'Host': 'pb.sogou.com',
255 |                         'Referer': 'https://weixin.sogou.com/antispider/?from=%2f' + quote(
256 |                             self._vcode_url.replace('http://', ''))
257 |                     }
258 |                     
259 |                     try:
260 |                         self._session.get(pburl, headers=headers,timeout=10,verify=False)
261 |                     except:
262 |                         print('')
263 | 
264 | 						
265 |                     time.sleep(0.5)
266 |                     
267 |                     print(u"搜狗返回验证码识别成功，继续执行")
268 |                     self._cache.set(config.cache_session_name, self._session)
269 |                     logger.error('verify code ocr: ' + remsg['msg'])
270 |                     break
271 |                 
272 |             else:
273 |                 print(u"没有设置自动识别模块用户名、密码，无法执行")
274 |                 break
275 | 
276 | 
277 |             
278 | 
279 |     def _ocr_for_get_gzh_article_by_url_text(self, url):
280 |         print(u"出现验证码，准备自动识别2")
281 |         logger.debug('vcode appear, using _ocr_for_get_gzh_article_by_url_text')
282 |         
283 |         if hasattr(self, '_ocr'):
284 |             max_count = 0
285 |             while(max_count < 10):
286 |                 max_count += 1
287 |                 timestr = str(time.time()).replace('.', '')
288 |                 timever = timestr[0:13] + '.' + timestr[13:17]
289 |                 codeurl = 'http://mp.weixin.qq.com/mp/verifycode?cert=' + timever
290 |                 coder = self._session.get(codeurl,verify=False)
291 |                 logger.debug('vcode appear, using _ocr_for_get_gzh_article_by_url_text')
292 |                 result = self._ocr.create(coder.content, 2040)
293 |                 print(result)
294 |                 if 'Result' not in result :
295 |                     print(u"若快识别失败，1秒后更换验证码再次尝试，尝试次数：%d" %(max_count))
296 |                     time.sleep(1)
297 |                     continue #验证码识别错误，再次执行
298 |                 else:
299 |                     print(u"若快识别成功 验证码：%s" %(result['Result']))
300 | 
301 |                     img_code = result['Result']
302 |                     codeID = result['Id']
303 | 
304 |                     post_url = 'http://mp.weixin.qq.com/mp/verifycode'
305 |                     post_data = {
306 |                         'cert': timever,
307 |                         'input': img_code
308 |                     }
309 |                     headers = {
310 |                         "User-Agent": self._agent[random.randint(0, len(self._agent) - 1)],
311 |                         'Host': 'mp.weixin.qq.com',
312 |                         'Referer': url
313 |                     }
314 |                     rr = self._session.post(post_url, post_data, headers=headers,verify=False)
315 |                     remsg = eval(rr.text)
316 |                     if remsg['ret'] != 0:
317 |                         print(u"搜狗返回验证码错误，1秒后更换验证码再次启动尝试，尝试次数：%d" %(max_count))
318 |                         time.sleep(1)
319 |                         continue
320 |                     
321 |                     print(u"搜狗返回验证码识别成功，继续执行")
322 |                     self._cache.set(config.cache_session_name, self._session)
323 |                     logger.debug('ocr ', remsg['errmsg'])
324 |                     break
325 | 
326 |                 break
327 |         else:
328 |             print(u"没有设置自动识别模块用户名、密码，无法执行")
329 | 
330 | 
331 |     def _replace_html(self, s):
332 |         """替换html‘&quot;’等转义内容为正常内容
333 | 
334 |         Args:
335 |             s: 文字内容
336 | 
337 |         Returns:
338 |             s: 处理反转义后的文字
339 |         """
340 |         s = s.replace('&#39;', '\'')
341 |         s = s.replace('&quot;', '"')
342 |         s = s.replace('&amp;', '&')
343 |         s = s.replace('&gt;', '>')
344 |         s = s.replace('&lt;', '<')
345 |         s = s.replace('&yen;', '¥')
346 |         s = s.replace('amp;', '')
347 |         s = s.replace('&lt;', '<')
348 |         s = s.replace('&gt;', '>')
349 |         s = s.replace('&nbsp;', ' ')
350 |         s = s.replace('\\', '')
351 |         return s
352 | 
353 |     def _replace_dict(self, dicts):
354 |         retu_dict = dict()
355 |         for k, v in dicts.items():
356 |             retu_dict[self._replace_all(k)] = self._replace_all(v)
357 |         return retu_dict
358 | 
359 |     def _replace_list(self, lists):
360 |         retu_list = list()
361 |         for l in lists:
362 |             retu_list.append(self._replace_all(l))
363 |         return retu_list
364 | 
365 |     def _replace_all(self, data):
366 |         if isinstance(data, dict):
367 |             return self._replace_dict(data)
368 |         elif isinstance(data, list):
369 |             return self._replace_list(data)
370 |         elif isinstance(data, str):
371 |             return self._replace_html(data)
372 |         else:
373 |             return data
374 | 
375 |     def _str_to_dict(self, json_str):
376 |         json_dict = eval(json_str)
377 |         return self._replace_all(json_dict)
378 | 
379 |     def _replace_space(self, s):
380 |         s = s.replace(' ', '')
381 |         s = s.replace('\r\n', '')
382 |         return s
383 | 
384 |     def _get_url_param(self, url):
385 |         result = url_parse.urlparse(url)
386 |         return url_parse.parse_qs(result.query, True)
387 | 
388 |     def _search_gzh_text(self, name, page=1):
389 |         """通过搜狗搜索获取关键字返回的文本
390 | 
391 |         Args:
392 |             name: 搜索关键字
393 |             page: 搜索的页数
394 | 
395 |         Returns:
396 |             text: 返回的文本
397 |         """
398 |         request_url = 'https://weixin.sogou.com/weixin?type=1&s_from=input&query='  + quote(
399 |             name) + '&ie=utf8&_sug_=n&_sug_type_=&page=' + str(page)
400 | 
401 |         try:
402 |             text = self._get(request_url)
403 |         except WechatSogouVcodeException:
404 |             
405 |             try:
406 |                 self._jiefeng()
407 |                 text = self._get(request_url, 'get', 
408 |                                 referer='https://weixin.sogou.com/antispider/?from=%2f' + quote(
409 |                                     self._vcode_url.replace('http://', '')))
410 |             except WechatSogouVcodeException:
411 |                 text = ""
412 | 
413 |         try:
414 |             new_url = "https://weixin.sogou.com" + re.findall('var account_anti_url = "(.+?)";', text, re.S)[0]
415 |             self._get(new_url, 'get', referer=request_url)
416 |         except:
417 |             print("error")
418 | 
419 |         return text,request_url
420 | 
421 |     def _search_article_text(self, name, page=1):
422 |         """通过搜狗搜索微信文章关键字返回的文本
423 |         Args:
424 |             name: 搜索文章关键字
425 |             page: 搜索的页数
426 | 
427 |         Returns:
428 |             text: 返回的文本
429 |         """
430 |         request_url = 'https://weixin.sogou.com/weixin?query=' + quote(
431 |             name) + '&_sug_type_=&_sug_=n&type=2&page=' + str(page) + '&ie=utf8'
432 | 
433 |         try:
434 |             text = self._get(request_url)
435 |         except WechatSogouVcodeException:
436 |             
437 |             try:
438 |                 self._jiefeng()
439 |                 text = self._get(request_url, 'get', 
440 |                                 referer='https://weixin.sogou.com/antispider/?from=%2f' + quote(
441 |                                   self._vcode_url.replace('http://', '')))
442 |             except WechatSogouVcodeException:
443 |                 text = ""
444 |         return text
445 | 
446 |     def _get_gzh_article_by_url_text(self, url):
447 |         """最近文章页的文本
448 | 
449 |         Args:
450 |             url: 最近文章页地址
451 | 
452 |         Returns:
453 |             text: 返回的文本
454 |         """
455 |         if "https://weixin.sogou.com" in url:
456 |             return "链接已过期"
457 | 
458 |         text = self._get(url, 'get', host='mp.weixin.qq.com')
459 |         
460 |         if u'为了保护你的网络安全，请输入验证码' in text:
461 |             print(u'为了保护你的网络安全，请输入验证码')
462 |             try:
463 |                 self._ocr_for_get_gzh_article_by_url_text(url)
464 | 
465 |                 text = self._get(url, 'get', host='mp.weixin.qq.com')
466 |             except:
467 |                 text = ""
468 |         return text
469 | 
470 |     def _get_gzh_article_gzh_by_url_dict(self, text, url):
471 |         """最近文章页  公众号信息
472 | 
473 |         Args:
474 |             text: 最近文章文本
475 | 
476 |         Returns:
477 |             字典{'name':name,'wechatid':wechatid,'jieshao':jieshao,'renzhen':renzhen,'qrcode':qrcodes,'img':img,'url':url}
478 |             name: 公众号名称
479 |             wechatid: 公众号id
480 |             jieshao: 介绍
481 |             renzhen: 认证，为空表示未认证
482 |             qrcode: 二维码
483 |             img: 头像图片
484 |             url: 最近文章地址
485 |         """
486 |         page = etree.HTML(text)
487 |         profile_info_area = page.xpath("//div[@class='profile_info_area']")[0]
488 |         img = profile_info_area.xpath('div[1]/span/img/@src')[0]
489 |         name = profile_info_area.xpath('div[1]/div/strong/text()')[0]
490 |         name = self._replace_space(name)
491 |         wechatid = profile_info_area.xpath('div[1]/div/p/text()')
492 |         if wechatid:
493 |             wechatid = wechatid[0].replace(u'微信号: ', '')
494 |         else:
495 |             wechatid = ''
496 |         jieshao = profile_info_area.xpath('ul/li[1]/div/text()')[0]
497 |         renzhen = profile_info_area.xpath('ul/li[2]/div/text()')
498 |         renzhen = renzhen[0] if renzhen else ''
499 |         qrcode = page.xpath('//*[@id="js_pc_qr_code_img"]/@src')[0]
500 |         qrcode = 'http://mp.weixin.qq.com/' + qrcode if qrcode else ''
501 |         return {
502 |             'name': name,
503 |             'wechatid': wechatid,
504 |             'jieshao': jieshao,
505 |             'renzhen': renzhen,
506 |             'qrcode': qrcode,
507 |             'img': img,
508 |             'url': url
509 |         }
510 | 
    def _get_gzh_article_by_url_dict(self, text):
        """Extract the ``msgList`` value embedded in a recent-articles page.

        Args:
            text: text of the recent-articles page

        Returns:
            msgdict: the evaluated ``msgList`` value, or '' when extracting or
                evaluating it fails for any reason
        """
        try:
            # Non-greedy capture up to the first '};'. The pattern consumes the
            # closing brace, so it is appended again on the next line.
            msglist = re.findall("var msgList = (.+?)};", text, re.S)[0]
            msglist = msglist + '}'

            # Undo HTML escaping and drop every backslash before evaluating.
            # The replacements run in this exact order.
            html = msglist
            html = html.replace('&#39;', '\'')
            html = html.replace('&amp;', '&')
            html = html.replace('&gt;', '>')
            html = html.replace('&lt;', '<')
            html = html.replace('&yen;', '¥')
            html = html.replace('amp;', '')
            html = html.replace('&lt;', '<')
            html = html.replace('&gt;', '>')
            html = html.replace('&nbsp;', ' ')
            html = html.replace('\\', '')

            # NOTE(review): eval() executes text taken from a downloaded page
            # as Python code; consider a real parser for this literal.
            msgdict = eval(html)
            return msgdict
        # NOTE(review): the bare except also hides KeyboardInterrupt/SystemExit
        # and any programming error raised above.
        except:
            return ''
540 | 
541 |     def _deal_gzh_article_dict(self, msgdict, **kwargs):
542 |         """解析 公众号 群发消息
543 | 
544 |         Args:
545 |             msgdict: 信息字典
546 | 
547 |         Returns:
548 |             列表，均是字典，一定含有一下字段qunfa_id,datetime,type
549 | 
550 |             当type不同时，含有不同的字段，具体见文档
551 |         """
552 |         biz = kwargs.get('biz', '')
553 |         uin = kwargs.get('uin', '')
554 |         key = kwargs.get('key', '')
555 |         items = list()
556 |         for listdic in msgdict['list']:
557 |             item = dict()
558 |             comm_msg_info = listdic['comm_msg_info']
559 |             item['qunfa_id'] = comm_msg_info.get('id', '')  # 不可判重，一次群发的消息的id是一样的
560 |             item['datetime'] = comm_msg_info.get('datetime', '')
561 |             item['type'] = str(comm_msg_info.get('type', ''))
562 |             if item['type'] == '1':
563 |                 # 文字
564 |                 item['content'] = comm_msg_info.get('content', '')
565 |             elif item['type'] == '3':
566 |                 # 图片
567 |                 item[
568 |                     'img_url'] = 'https://mp.weixin.qq.com/mp/getmediadata?__biz=' + biz + '&type=img&mode=small&msgid=' + \
569 |                                  str(item['qunfa_id']) + '&uin=' + uin + '&key=' + key
570 |             elif item['type'] == '34':
571 |                 # 音频
572 |                 item['play_length'] = listdic['voice_msg_ext_info'].get('play_length', '')
573 |                 item['fileid'] = listdic['voice_msg_ext_info'].get('fileid', '')
574 |                 item['audio_src'] = 'https://mp.weixin.qq.com/mp/getmediadata?__biz=' + biz + '&type=voice&msgid=' + \
575 |                                     str(item['qunfa_id']) + '&uin=' + uin + '&key=' + key
576 |             elif item['type'] == '49':
577 |                 # 图文
578 |                 app_msg_ext_info = listdic['app_msg_ext_info']
579 |                 url = app_msg_ext_info.get('content_url')
580 |                 if url:
581 |                     url = 'http://mp.weixin.qq.com' + url if 'http://mp.weixin.qq.com' not in url else url
582 |                 else:
583 |                     url = ''
584 |                 msg_index = 1
585 |                 item['main'] = msg_index
586 |                 item['title'] = app_msg_ext_info.get('title', '')
587 |                 item['digest'] = app_msg_ext_info.get('digest', '')
588 |                 item['fileid'] = app_msg_ext_info.get('fileid', '')
589 |                 item['content_url'] = url
590 |                 item['source_url'] = app_msg_ext_info.get('source_url', '')
591 |                 item['cover'] = app_msg_ext_info.get('cover', '')
592 |                 item['author'] = app_msg_ext_info.get('author', '')
593 |                 item['copyright_stat'] = app_msg_ext_info.get('copyright_stat', '')
594 |                 items.append(item)
595 |                 if app_msg_ext_info.get('is_multi', 0) == 1:
596 |                     for multidic in app_msg_ext_info['multi_app_msg_item_list']:
597 |                         url = multidic.get('content_url')
598 |                         if url:
599 |                             url = 'http://mp.weixin.qq.com' + url if 'http://mp.weixin.qq.com' not in url else url
600 |                         else:
601 |                             url = ''
602 |                         itemnew = dict()
603 |                         itemnew['qunfa_id'] = item['qunfa_id']
604 |                         itemnew['datetime'] = item['datetime']
605 |                         itemnew['type'] = item['type']
606 |                         msg_index += 1
607 |                         itemnew['main'] = msg_index
608 |                         itemnew['title'] = multidic.get('title', '')
609 |                         itemnew['digest'] = multidic.get('digest', '')
610 |                         itemnew['fileid'] = multidic.get('fileid', '')
611 |                         itemnew['content_url'] = url
612 |                         itemnew['source_url'] = multidic.get('source_url', '')
613 |                         itemnew['cover'] = multidic.get('cover', '')
614 |                         itemnew['author'] = multidic.get('author', '')
615 |                         itemnew['copyright_stat'] = multidic.get('copyright_stat', '')
616 |                         items.append(itemnew)
617 |                 continue
618 |             elif item['type'] == '62':
619 |                 item['cdn_videoid'] = listdic['video_msg_ext_info'].get('cdn_videoid', '')
620 |                 item['thumb'] = listdic['video_msg_ext_info'].get('thumb', '')
621 |                 item['video_src'] = 'https://mp.weixin.qq.com/mp/getcdnvideourl?__biz=' + biz + '&cdn_videoid=' + item[
622 |                     'cdn_videoid'] + '&thumb=' + item['thumb'] + '&uin=' + uin + '&key=' + key
623 |             items.append(item)
624 |         return items
625 | 
626 |     def _get_gzh_article_text(self, url):
627 |         """获取文章文本
628 | 
629 |         Args:
630 |             url: 文章链接
631 | 
632 |         Returns:
633 |             text: 文章文本
634 |         """
635 |         return self._get(url, 'get', host='mp.weixin.qq.com')
636 | 
    def _deal_related(self, url, title):
        """Fetch the articles related to a given article.

        Args:
            url: article link
            title: article title

        Returns:
            related_dict: the evaluated response of the getrelatedmsg request

        Raises:
            WechatSogouException: when ``base_resp.ret`` is non-zero; carries
                ``base_resp.errmsg``, or 'ret:<ret>' when errmsg is empty
        """
        # NOTE(review): url is percent-encoded but title is appended as-is;
        # presumably callers pass a title that is safe in a query string -- confirm.
        related_req_url = 'http://mp.weixin.qq.com/mp/getrelatedmsg?' \
                          'url=' + quote(url) \
                          + '&title=' + title \
                          + '&uin=&key=&pass_ticket=&wxtoken=&devicetype=&clientversion=0&x5=0'
        related_text = self._get(related_req_url, 'get', host='mp.weixin.qq.com', referer=url)
        # NOTE(review): eval() executes the response body as Python code;
        # consider parsing it as data instead.
        related_dict = eval(related_text)
        ret = related_dict['base_resp']['ret']
        # Fall back to the numeric code when the server sent no message
        errmsg = related_dict['base_resp']['errmsg'] if related_dict['base_resp']['errmsg'] else 'ret:' + str(ret)
        if ret != 0:
            #logger.error(errmsg)
            raise WechatSogouException(errmsg)
        return related_dict
662 | 
663 |     def _uinkeybiz(self, keyword, uin=None, key=None, biz=None, pass_ticket=None, msgid=None):
664 |         if uin:
665 |             self._cache.set(keyword + 'uin', uin, 36000)
666 |             self._cache.set(keyword + 'key', key, 36000)
667 |             self._cache.set(keyword + 'biz', biz, 36000)
668 |             self._cache.set(keyword + 'pass_ticket', pass_ticket, 36000)
669 |             self._cache.set(keyword + 'msgid', msgid, 36000)
670 |         else:
671 |             uin = self._cache.get(keyword + 'uin')
672 |             key = self._cache.get(keyword + 'key')
673 |             biz = self._cache.get(keyword + 'biz')
674 |             pass_ticket = self._cache.get(keyword + 'pass_ticket')
675 |             msgid = self._cache.get(keyword + 'msgid')
676 |             return uin, key, biz, pass_ticket, msgid
677 | 
678 |     def _cache_history_session(self, keyword, session=None):
679 |         if session:
680 |             self._cache.set(keyword + 'session', session, 36000)
681 |         else:
682 |             return self._cache.get(keyword + 'session')
683 | 


--------------------------------------------------------------------------------
/wechatsogou/api.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | import re
  4 | import requests
  5 | import time
  6 | from lxml import etree
  7 | from wechatsogou.tools import *
  8 | from .basic import WechatSogouBasic
  9 | from .exceptions import *
 10 | import json
 11 | import logging
 12 | import codecs,os
 13 | import random
 14 | from bs4 import BeautifulSoup
 15 | logger = logging.getLogger()
 16 | 
 17 | 
 18 | class WechatSogouApi(WechatSogouBasic):
 19 |     """基于搜狗搜索的的微信公众号爬虫接口  接口类
 20 |     """
 21 | 
    def __init__(self, **kwargs):
        """Pass every keyword argument unchanged to the parent class initializer."""
        super(WechatSogouApi, self).__init__(**kwargs)
 24 | 
 25 |     def get_k_h(self,url,text):
 26 |         """计算k和h"""
 27 |         try:
 28 |             k = random.randrange(1,100)
 29 |             normal = re.findall('a\+4\+parseInt\("(.*?)"', text, re.S)[0]
 30 |             h = url[34+int(normal)+k]
 31 |         except Exception as e:
 32 |             traceback.print_exc()
 33 |         return str(k),h
 34 | 
    def search_gzh_info(self, name, page=1):
        """Search official accounts by keyword and scrape the result page.

        Args:
            name: search keyword.
            page: result page number to request.

        Returns:
            A list of dicts, one per account found, with the keys:
            name: account display name
            wechatid: account id
            jieshao: introduction text
            renzhen: verification text, '' when not verified
            qrcode: always '' (not scraped)
            img: avatar image URL
            url: article-list URL, '' when the captcha exception was raised
            last_url: always '' (not scraped)

            The empty string "" is returned instead when ``etree.HTML`` raises.
        """
        htmlText,request_url = self._search_gzh_text(name, page)

        try:
            # NOTE: from here on ``page`` is the parsed tree, not the page number.
            page = etree.HTML(htmlText)
        except:
            return ""

        img = list()
        # Avatars
        info_imgs = page.xpath(u"//div[@class='img-box']//img")
        for info_img in info_imgs:
            img.append(info_img.attrib['src'])
        # Article-list links
        url = list()
        info_urls = page.xpath(u"//div[@class='img-box']//a");
        for info_url in info_urls:
            urlTemp = info_url.attrib['href']
            realurl = ""
            if "https" not in urlTemp:
                # Relative link: prefix the Sogou host.
                urlTemp = "https://weixin.sogou.com" + urlTemp
                # Resulting shape: https://weixin.sogou.com/link?url=...&type=1&query=...&k=56&h=V
            try:

                # Append the computed k / h parameters
                k,h = self.get_k_h(urlTemp,htmlText)
                urlTemp = "%s&k=%s&h=%s" %(urlTemp,k,h)
                # Resolve the link into the real article-list URL
                print(u"先获取正式的文章列表url")
                text = self._get(urlTemp,referer=request_url)
                # The real URL is rebuilt by concatenating the first quoted
                # fragment of every piece separated by "url +=".
                # NOTE(review): a piece without a quoted fragment raises
                # IndexError, which is not handled here.
                arr = text.split("url +=");
                for iterating_var in arr:
                    realurl+=iterating_var.split("'")[1];
            except WechatSogouVcodeException:
                realurl = ""


            url.append(realurl)

        # Account ids
        wechatid = page.xpath(u"//label[@name='em_weixinhao']/text()");

        # Account names
        name = list()
        name_list = page.xpath(u"//div[@class='txt-box']/p/a")
        for name_item in name_list:
            name.append(name_item.xpath('string(.)'))

        # NOTE(review): last_url is filled below but never read afterwards.
        last_url = list()
        jieshao = list()
        renzhen = list()
        list_index = 0
        # Introduction, verification and latest-article text of each result
        info_instructions = page.xpath(u"//ul[@class='news-list2']/li")
        for info_instruction in info_instructions:
            cache = self._get_elem_text(info_instruction)
            cache = cache.replace('red_beg', '').replace('red_end', '')
            cache_list = cache.split('\n')
            # Split the first line on the three section labels.
            cache_re = re.split(u'功能介绍：|认证：|最近文章：', cache_list[0])
            if(cache.find("最近文章") == -1) :
                last_url.insert(list_index,"")
            list_index += 1

            if(len(cache_re) > 1):
                jieshao.append(re.sub("document.write\(authname\('[0-9]'\)\)", "", cache_re[1]))
                if "authname" in cache_re[1]:
                    renzhen.append(cache_re[2])
                else:
                    renzhen.append('')
            else:
                # Nothing could be extracted: leave both fields empty
                jieshao.append('')
                renzhen.append('')

        # Zip the parallel lists by position.
        # NOTE(review): the lists come from separate xpath queries; if their
        # lengths differ, the indexing below raises IndexError.
        returns = list()
        for i in range(len(name)):
            returns.append(
                {
                    'name': name[i],
                    'wechatid': wechatid[i],
                    'jieshao': jieshao[i],
                    'renzhen': renzhen[i],
                    'qrcode': '',
                    'img': img[i],
                    'url': url[i],
                    'last_url': ''
                }
            )
        return returns
141 | 
142 |     def get_gzh_info(self, wechatid):
143 |         """获取公众号微信号wechatid的信息
144 | 
145 |         因为wechatid唯一确定，所以第一个就是要搜索的公众号
146 | 
147 |         Args:
148 |             wechatid: 公众号id
149 | 
150 |         Returns:
151 |             字典{'name':name,'wechatid':wechatid,'jieshao':jieshao,'renzhen':renzhen,'qrcode':qrcodes,'img':img,'url':url}
152 |             name: 公众号名称
153 |             wechatid: 公众号id
154 |             jieshao: 介绍
155 |             renzhen: 认证，为空表示未认证
156 |             qrcode: 二维码
157 |             img: 头像图片
158 |             url: 最近文章地址
159 |         """
160 |         try:
161 |             info = self.search_gzh_info(wechatid, 1)
162 |             return info[0] if info else ""
163 |         except:
164 |             return ""
165 | 
166 | 
167 |     def search_article_info(self, name, page=1):
168 |         """搜索文章
169 | 
170 |         Args:
171 |             name: 搜索文章关键字
172 |             page: 搜索的页数
173 | 
174 |         Returns:
175 |             列表，每一项均是{'name','url','img','zhaiyao','gzhname','gzhqrcodes','gzhurl','time'}
176 |             name: 文章标题
177 |             url: 文章链接
178 |             img: 文章封面图片缩略图，可转为高清大图
179 |             zhaiyao: 文章摘要
180 |             time: 文章推送时间，10位时间戳
181 |             gzhname: 公众号名称
182 |             gzhqrcodes: 公众号二维码
183 |             gzhurl: 公众号最近文章地址
184 |             page_count:共有多少页
185 | 
186 |         """
187 |         text = self._search_article_text(name, page)
188 |         text = text.replace("amp;","")
189 |         page = etree.HTML(text)
190 |         #搜索到的总条数
191 |         page_count = page.xpath(u"//div[@class='mun']/text()")
192 |         page_count = page_count[0].replace(',','').replace('找到约','').replace('条结果','')
193 | 
194 |         #文章信息
195 |         zhaiyao = list()
196 |         #摘要
197 |         zhaiyao_list = page.xpath(u"//ul[@class='news-list']/li//p[@class='txt-info']")
198 |         for zhaiyao_item in zhaiyao_list:
199 |             zhaiyao.append(zhaiyao_item.xpath('string(.)'))
200 |         
201 |         #标题
202 |         name = list()
203 |         info_names = page.xpath(u"//div[@class='txt-box']/h3/a")
204 |         for info_name in info_names:
205 |             name.append(info_name.xpath('string(.)'))
206 |         
207 |         #公众号名称
208 |         gzhname = list()
209 |         gzhwxhao = list()
210 |         gzhqrcodes = list()
211 |         gzhurl = list()
212 |         info_gzhs = page.xpath(u"//div[@class='txt-box']/div[@class='s-p']/a")
213 |         for info_gzh in info_gzhs:
214 |             #gzhname.append(info_gzh.attrib['data-sourcename'])
215 |             #gzhwxhao.append(info_gzh.attrib['data-username'])
216 |             #gzhqrcodes.append(info_gzh.attrib['data-encqrcodeurl'])
217 |             gzhurl.append(info_gzh.attrib['href'])
218 | 
219 |         #文章URL
220 |         url = list()
221 |         info_urls = page.xpath(u"//div[@class='txt-box']/h3/a")
222 |         for info_url in info_urls:
223 |             url.append(info_url.attrib['href'])
224 |         
225 |         #文章时间
226 |         time = list()
227 |         info_times = page.xpath(u"//div[@class='txt-box']/div[@class='s-p']")
228 |         for info_time in info_times:
229 |             time.append(info_time.attrib['t'])
230 | 
231 |         #封面
232 |         img = list()
233 |         info_imgs = page.xpath(u"//ul[@class='news-list']/li")
234 |         for info_img in info_imgs:
235 |             img_box = info_img.xpath(u"div[@class='img-box']/a/img")
236 |             if len(img_box) > 0 :
237 |                 #普通封面的
238 |                 img.append(img_box[0].attrib['src'])
239 |             else:
240 |                 #3张封面的
241 |                 img_box = info_img.xpath(u"div[@class='txt-box']/div[@class='img-d']/a/span/img")
242 |                 if len(img_box) > 0 :
243 |                     #拿第一个
244 |                     img.append(img_box[0].attrib['src'])
245 |                 else:
246 |                     #没拿到
247 |                     img.append("")
248 | 
249 |         returns = list()
250 |         for i in range(len(url)):
251 |             returns.append(
252 |                 {
253 |                     'name': name[i],
254 |                     'url': url[i],
255 |                     'img': img[i],
256 |                     'zhaiyao': zhaiyao[i],
257 |                     'gzhname': list_or_empty(gzhname),
258 |                     'gzhqrcodes': list_or_empty(gzhqrcodes),
259 |                     'gzhurl': gzhurl[i],
260 |                     'time': time[i],
261 |                     'page_count':int(page_count)
262 |                 }
263 |             )
264 |         return returns
265 | 
266 |     def get_gzh_message(self, **kwargs):
267 |         """解析最近文章页  或  解析历史消息记录
268 | 
269 |         Args:
270 |             ::param url 最近文章地址
271 |             ::param wechatid 微信号
272 |             ::param wechat_name 微信昵称(不推荐，因为不唯一)
273 | 
274 |             最保险的做法是提供url或者wechatid
275 | 
276 |         Returns:
277 |             gzh_messages 是 列表，每一项均是字典，一定含有字段qunfa_id,datetime,type
278 |             当type不同时，含有不同的字段，具体见文档
279 |         """
280 |         url = kwargs.get('url', None)
281 |         wechatid = kwargs.get('wechatid', None)
282 |         wechat_name = kwargs.get('wechat_name', None)
283 |         if url:
284 |             text = self._get_gzh_article_by_url_text(url)
285 |         elif wechatid:
286 |             gzh_info = self.get_gzh_info(wechatid)
287 |             url = gzh_info['url']
288 |             text = self._get_gzh_article_by_url_text(url)
289 |         elif wechat_name:
290 |             gzh_info = self.get_gzh_info(wechat_name)
291 |             url = gzh_info['url']
292 |             text = self._get_gzh_article_by_url_text(url)
293 |         else:
294 |             raise WechatSogouException('get_gzh_recent_info need param text and url')
295 |         
296 |         if u'链接已过期' in text:
297 |             return '链接已过期'
298 |         return self._deal_gzh_article_dict(self._get_gzh_article_by_url_dict(text))
299 | 
300 |     def get_gzh_message_and_info(self, **kwargs):
301 |         """最近文章页  公众号信息 和 群发信息
302 | 
303 |         Args:
304 |             ::param url 最近文章地址
305 |             ::param wechatid 微信号
306 |             ::param wechat_name 微信昵称(不推荐，因为不唯一)
307 | 
308 |             最保险的做法是提供url或者wechatid
309 | 
310 |         Returns:
311 |             字典{'gzh_info':gzh_info, 'gzh_messages':gzh_messages}
312 | 
313 |             gzh_info 也是字典{'name':name,'wechatid':wechatid,'jieshao':jieshao,'renzhen':renzhen,'qrcode':qrcodes,'img':img,'url':url}
314 |             name: 公众号名称
315 |             wechatid: 公众号id
316 |             jieshao: 介绍
317 |             renzhen: 认证，为空表示未认证
318 |             qrcode: 二维码
319 |             img: 头像图片
320 |             url: 最近文章地址
321 | 
322 |             gzh_messages 是 列表，每一项均是字典，一定含有字段qunfa_id,datetime,type
323 |             当type不同时，含有不同的字段，具体见文档
324 |         """
325 |         url = kwargs.get('url', None)
326 |         wechatid = kwargs.get('wechatid', None)
327 |         wechat_name = kwargs.get('wechat_name', None)
328 |         if url:
329 |             text = self._get_gzh_article_by_url_text(url)
330 |         elif wechatid:
331 |             gzh_info = self.get_gzh_info(wechatid)
332 |             url = gzh_info['url']
333 |             text = self._get_gzh_article_by_url_text(url)
334 |         elif wechat_name:
335 |             gzh_info = self.get_gzh_info(wechat_name)
336 |             url = gzh_info['url']
337 |             text = self._get_gzh_article_by_url_text(url)
338 |         else:
339 |             raise WechatSogouException('get_gzh_recent_info need param text and url')
340 | 
341 |         return {
342 |             'gzh_info': self._get_gzh_article_gzh_by_url_dict(text, url),
343 |             'gzh_messages': self._deal_gzh_article_dict(self._get_gzh_article_by_url_dict(text))
344 |         }
345 | 
346 |     def deal_article_content(self, **kwargs):
347 |         """获取文章内容
348 | 
349 |         Args:
350 |             ::param url 文章页 url
351 |             ::param text 文章页 文本
352 | 
353 |         Returns:
354 |             content_html, content_rich, content_text
355 |             content_html: 原始文章内容，包括html标签及样式
356 |             content_rich: 包含图片（包括图片应展示的样式）的文章内容
357 |             content_text: 包含图片（`<img src="..." />`格式）的文章内容
358 |         """
359 |         url = kwargs.get('url', None)
360 |         text = kwargs.get('text', None)
361 | 
362 |         if text:
363 |             pass
364 |         elif url:
365 |             text = self._get_gzh_article_text(url)
366 |         else:
367 |             raise WechatSogouException('deal_content need param url or text')
368 | 
369 |         #纯文字
370 |         bsObj = BeautifulSoup(text)
371 |         content_text = bsObj.find("div", {"class":"rich_media_content", "id":"js_content"})
372 |         if not content_text: #分享的文章
373 |             content_text = bsObj.find("div", {"class":"share_media", "id":"js_share_content"})
374 | 
375 |         content_html = ""
376 |         if content_text:
377 |             content_html = content_text.get_text()
378 | 
379 |         return content_html
380 | 
    def deal_article_related(self, url, title):
        """Return the articles related to the given article.

        Thin wrapper that delegates to ``self._deal_related(url, title)``.

        Args:
            url: article link.
            title: article title.

        Returns:
            related_dict: whatever ``_deal_related`` returns (documented as a
            dict describing the related articles).

        Raises:
            WechatSogouException: carrying the error message ``errmsg``
            (documented behaviour of the delegate; not visible here).
        """
        return self._deal_related(url, title)
395 | 
396 |     def deal_article_comment(self, **kwargs):
397 |         """获取文章评论
398 | 
399 |         Args:
400 |             text: 文章文本
401 | 
402 |         Returns:
403 |             comment_dict: 评论字典
404 | 
405 |         Raises:
406 |             WechatSogouException: 错误信息errmsg
407 |         """
408 |         url = kwargs.get('url', None)
409 |         text = kwargs.get('text', None)
410 | 
411 |         if text:
412 |             pass
413 |         elif url:
414 |             text = self._get_gzh_article_text(url)
415 |         else:
416 |             raise WechatSogouException('deal_content need param url or text')
417 | 
418 |         sg_data = re.findall(u'window.sg_data={(.*?)}', text, re.S)
419 |         if not sg_data :
420 |             return ""
421 |         sg_data = '{' + sg_data[0].replace(u'\r\n', '').replace(' ', '') + '}'
422 |         sg_data = re.findall(u'{src:"(.*?)",ver:"(.*?)",timestamp:"(.*?)",signature:"(.*?)"}', sg_data)[0]
423 |         comment_req_url = 'http://mp.weixin.qq.com/mp/getcomment?src=' + sg_data[0] + '&ver=' + sg_data[
424 |             1] + '&timestamp=' + sg_data[2] + '&signature=' + sg_data[
425 |                               3] + '&uin=&key=&pass_ticket=&wxtoken=&devicetype=&clientversion=0&x5=0'
426 |         comment_text = self._get(comment_req_url, 'get', host='mp.weixin.qq.com', referer='http://mp.weixin.qq.com')
427 |         comment_dict = eval(comment_text)
428 |         ret = comment_dict['base_resp']['ret']
429 |         errmsg = comment_dict['base_resp']['errmsg'] if comment_dict['base_resp']['errmsg'] else 'ret:' + str(ret)
430 |         if ret != 0:
431 |             logger.error(errmsg)
432 |             raise WechatSogouException(errmsg)
433 |         return comment_dict
434 | 
435 |     def deal_article_yuan(self, **kwargs):
436 |         url = kwargs.get('url', None)
437 |         text = kwargs.get('text', None)
438 | 
439 |         if text:
440 |             pass
441 |         elif url:
442 |             text = self._get_gzh_article_text(url)
443 |         else:
444 |             raise WechatSogouException('deal_article_yuan need param url or text')
445 |         try:
446 |             yuan = re.findall('var msg_link = "(.*?)";', text)[0].replace('amp;', '')
447 |         except IndexError as e:
448 |             if '系统出错' not in text:
449 |                 logger.error(e)
450 |                 print(e)
451 |                 print(text)
452 | 
453 |             raise WechatSogouBreakException()
454 |         return yuan
455 | 
456 |     def deal_article(self, url, title=None):
457 |         """获取文章详情
458 | 
459 |         Args:
460 |             url: 文章链接
461 |             title: 文章标题
462 |             注意，title可以为空，则表示不根据title获取相似文章
463 | 
464 |         Returns:
465 |             {'yuan':'','related':'','comment':'','content': {'content_html':'','content_rich':'','content_text':''}
466 |             yuan: 文章固定地址
467 |             related: 相似文章信息字典
468 |             comment: 评论信息字典
469 |             content: 文章内容
470 |         """
471 |         text = self._get_gzh_article_text(url)
472 |         
473 |         yuan_url = url #self.deal_get_real_url(url) 2017-5-3搜狗升级获取永久链接方法
474 | 
475 |         comment = '' #2017-04-27搜狗微信取消评论数据self.deal_article_comment(text=text)
476 |         content_html = self.deal_article_content(text=text)
477 |         retu = {
478 |             'yuan': yuan_url,
479 |             'comment': comment,
480 |             'content_html': content_html
481 |         }
482 | 
483 |         if title is not None:
484 |             related = self.deal_article_related(url, title)
485 |             retu['related'] = related
486 |             return retu
487 |         else:
488 |             return retu
489 | 
490 |     def get_recent_article_url_by_index_single(self, kind=0, page=0):
491 |         """获取首页推荐文章公众号最近文章地址
492 | 
493 |         Args:
494 |             kind: 类别，从0开始，经检测，至少应检查0-19，不保证之间每个都有
495 |             page: 页数，从0开始
496 | 
497 |         Returns:
498 |             recent_article_urls或者False
499 |             recent_article_urls: 最近文章地址列表
500 |             False: 该kind和page对应的页数没有文章
501 |         """
502 |         if page == 0:
503 |             page_str = 'pc_0'
504 |         else:
505 |             page_str = str(page)
506 |         url = 'https://weixin.sogou.com/pcindex/pc/pc_' + str(kind) + '/' + page_str + '.html'
507 |         try:
508 |             text = self._get(url)
509 |             page = etree.HTML(text)
510 |             recent_article_urls = page.xpath('//li/div[@class="pos-wxrw"]/a/@href')
511 |             reurls = []
512 |             for reurl in recent_article_urls:
513 |                 if 'mp.weixin.qq.com' in reurl:
514 |                     reurls.append(reurl)
515 |             return reurls
516 |         except WechatSogouRequestsException as e:
517 |             if e.status_code == 404:
518 |                 return False
519 | 
520 |     def get_recent_article_url_by_index_all(self):
521 |         """获取首页推荐文章公众号最近文章地址，所有分类，所有页数
522 | 
523 |         Returns:
524 |             return_urls: 最近文章地址列表
525 |         """
526 |         return_urls = []
527 |         for i in range(20):
528 |             j = 0
529 |             urls = self.get_recent_article_url_by_index_single(i, j)
530 |             while urls:
531 |                 return_urls.extend(urls)
532 |                 j += 1
533 |                 urls = self.get_recent_article_url_by_index_single(i, j)
534 |         return return_urls
535 | 
536 |     def get_sugg(self, keyword):
537 |         """获取微信搜狗搜索关键词联想
538 | 
539 |         Args:
540 |             keyword: 关键词
541 | 
542 |         Returns:
543 |             sugg: 联想关键词列表
544 | 
545 |         Raises:
546 |             WechatSogouException: get_sugg keyword error 关键词不是str或者不是可以str()的类型
547 |             WechatSogouException: sugg refind error 返回分析错误
548 |         """
549 |         try:
550 |             keyword = str(keyword) if type(keyword) != str else keyword
551 |         except Exception as e:
552 |             logger.error('get_sugg keyword error', e)
553 |             raise WechatSogouException('get_sugg keyword error')
554 |         url = 'http://w.sugg.sogou.com/sugg/ajaj_json.jsp?key=' + keyword + '&type=wxpub&pr=web'
555 |         text = self._get(url, 'get', host='w.sugg.sogou.com')
556 |         try:
557 |             sugg = re.findall(u'\["' + keyword + '",(.*?),\["', text)[0]
558 |             sugg = eval(sugg)
559 |             return sugg
560 |         except Exception as e:
561 |             logger.error('sugg refind error', e)
562 |             raise WechatSogouException('sugg refind error')
563 | 
564 |     def deal_mass_send_msg(self, url, wechatid):
565 |         """解析 历史消息
566 | 
567 |         ::param url是抓包获取的历史消息页
568 |         """
569 |         session = requests.session()
570 |         r = session.get(url, verify=False)
571 |         #print(r)
572 |         if r.status_code == requests.codes.ok:
573 |             try:
574 |                 biz = re.findall('biz = \'(.*?)\',', r.text)[0]
575 |                 key = re.findall('key = \'(.*?)\',', r.text)[0]
576 |                 uin = re.findall('uin = \'(.*?)\',', r.text)[0]
577 |                 pass_ticket = self._get_url_param(url).get('pass_ticket', [''])[0]
578 | 
579 |                 self._uinkeybiz(wechatid, uin, key, biz, pass_ticket, 0)
580 |                 self._cache_history_session(wechatid, session)
581 | 
582 |             except IndexError:
583 |                 logger.error('deal_mass_send_msg error. maybe you should get the mp url again')
584 |                 #raise WechatSogouHistoryMsgException('deal_mass_send_msg error. maybe you should get the mp url again')
585 |                 return 404
586 |         else:
587 |             logger.error('requests status_code error', r.status_code)
588 |             raise WechatSogouRequestsException('requests status_code error', r.status_code)
589 | 
590 |     #获取历史消息
    def deal_mass_send_msg_page(self, wechatid, updatecache=True):
        """Fetch one page of history messages using the cached credentials.

        Args:
            wechatid: account id whose cached uin/key/biz/pass_ticket/msgid
                and session are used.
            updatecache: when true, the credentials returned by the server and
                the id of the last type-49 message on this page are written
                back to the cache.

        Returns:
            The result of ``_deal_gzh_article_dict`` for the page.

        Raises:
            WechatSogouEndException: the server reports ``is_continue == 0``
                and ``count == 0``.
            WechatSogouHistoryMsgException: the server reports a non-zero
                ``ret``, or an AttributeError occurs inside the request block.
        """
        url = 'http://mp.weixin.qq.com/mp/getmasssendmsg?'
        uin, key, biz, pass_ticket, frommsgid = self._uinkeybiz(wechatid)
        #print([uin, key, biz, pass_ticket, frommsgid])
        # NOTE(review): a cache miss yields None values, so the string
        # concatenation below raises TypeError, which is not handled here.
        url = url + 'uin=' + uin + '&'
        url = url + 'key=' + key + '&'
        url = url + '__biz=' + biz + '&'
        url = url + 'pass_ticket=' + pass_ticket + '&'
        url = url + 'frommsgid=' + str(frommsgid) + '&'
        data = {
            'f': 'json',
            'count': '10',
            'wxtoken': '',
            'x5': '0'
        }
        for k, v in data.items():
            url = url + k + '=' + v + '&'
        # Drop the trailing '&'.
        url = url[:-1]
        # print(url)

        try:
            # A missing cached session presumably shows up as the
            # AttributeError handled below (None has no .get) -- TODO confirm.
            session = self._cache_history_session(wechatid)
            r = session.get(url, headers={'Host': 'mp.weixin.qq.com'}, verify=False)
            #print(r.text)
            # SECURITY NOTE(review): eval() executes the remote response body.
            # It is left in place because _str_to_dict below may depend on how
            # eval decodes escapes; replace it only after checking that helper.
            rdic = eval(r.text)
            if rdic['ret'] == 0:

                data_dict_from_str = self._str_to_dict(rdic['general_msg_list'])

                if rdic['is_continue'] == 0 and rdic['count'] == 0:
                    raise WechatSogouEndException()

                msg_dict = self._deal_gzh_article_dict(data_dict_from_str)
                # Walk the messages from the end; the first type-49 entry
                # found that way provides the next frommsgid.
                msg_dict_new = reversed(msg_dict)
                msgid = 0
                for m in msg_dict_new:
                    if int(m['type']) == 49:
                        msgid = m['qunfa_id']
                        break

                if updatecache:
                    self._uinkeybiz(wechatid, rdic['uin_code'], rdic['key'], rdic['bizuin_code'], pass_ticket, msgid)

                return msg_dict
            else:
                logger.error('deal_mass_send_msg_page ret ' + str(rdic['ret']) + ' errmsg ' + rdic['errmsg'])
                raise WechatSogouHistoryMsgException(
                    'deal_mass_send_msg_page ret ' + str(rdic['ret']) + ' errmsg ' + rdic['errmsg'])
        except AttributeError:
            logger.error('deal_mass_send_msg_page error, please delete cache file')
            raise WechatSogouHistoryMsgException('deal_mass_send_msg_page error, please delete cache file')
642 | 
643 | 
644 |     #获取阅读数据
645 |     def deal_get_fwh_read(self, wechatid, updatecache,**kwargs):
646 |         url = 'http://mp.weixin.qq.com/mp/getappmsgext?'
647 |         uin, key, biz, pass_ticket, frommsgid = self._uinkeybiz(wechatid)
648 |         #print([uin, key, biz, pass_ticket, frommsgid])
649 |         url = url + 'uin=' + uin + '&'
650 |         url = url + 'key=' + key + '&'
651 |         url = url + '__biz=' + biz + '&'
652 |         url = url + 'pass_ticket=' + pass_ticket + '&'
653 |         url = url + 'frommsgid=' + str(frommsgid) + '&'
654 |         url = url + 'mid=' + kwargs.get('mid', None) + '&'
655 |         url = url + 'sn=' + kwargs.get('sn', None) + '&'
656 |         url = url + 'idx=' + kwargs.get('idx', None) + '&'
657 | 
658 |         data = {
659 |             'f': 'json',
660 |             'count': '10',
661 |             'wxtoken': '',
662 |             'x5': '0'
663 |         }
664 |         for k, v in data.items():
665 |             url = url + k + '=' + v + '&'
666 |         url = url[:-1]
667 |         # print(url)
668 | 
669 |         try:
670 |             session = self._cache_history_session(wechatid)
671 |             print(url)
672 |             r = session.post(url,headers={'Host': 'mp.weixin.qq.com',
673 |                                           'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat'},
674 |                                           data={'is_only_read':1}, verify=False)
675 |             
676 | 
677 |             if r.status_code == requests.codes.ok:
678 |                 try:
679 |                     rdic = json.loads(r.text)
680 |                     return rdic['appmsgstat']
681 |                     
682 |                 except IndexError:
683 |                     logger.error('deal_mass_send_msg error. maybe you should get the mp url again')
684 |                     #raise WechatSogouHistoryMsgException('deal_mass_send_msg error. maybe you should get the mp url again')
685 |                     return 404
686 |             else :
687 |                 logger.error('requests status_code error', r.status_code)
688 |                 raise WechatSogouRequestsException('requests status_code error', r.status_code)
689 | 
690 |         except AttributeError:
691 |             logger.error('deal_mass_send_msg_page error, please delete cache file')
692 |             raise WechatSogouHistoryMsgException('deal_mass_send_msg_page error, please delete cache file')
693 | 
694 |     #获取搜狗微信文章上的真实链接
695 |     def deal_get_real_url(self, url):
696 |         try:
697 |             url = url + '&uin=MjExMTY2MjUzNg=='
698 |             text = requests.get(url,allow_redirects=False)
699 |             return text.headers['Location']
700 |         except:
701 |             return ""
702 | 
703 |     #下载文章到本地
    def down_html(self, url,dir_name):
        """Download an article page and its images into a local directory.

        The page is saved as ``index.html`` under
        ``WeiXinGZH/<nickname>/<timestamp>/<dir_name>/`` together with the
        cover (``cover.jpg``) and the images (``Picture<N>.<ext>``); image
        references in the saved HTML are rewritten to the local file names.

        NOTE(review): this body is Python 2 only (it uses ``unicode`` and
        ``str.decode``). It also relies on ``httplib2``, ``chardet`` and
        ``binascii``, none of which are among this module's explicit imports
        -- presumably they arrive through ``from wechatsogou.tools import *``;
        confirm.

        Args:
            url: article URL; ``\\x26`` / ``x26`` sequences are turned into ``&``.
            dir_name: name of the innermost directory to create.

        Returns:
            The path of the saved ``index.html`` (built from the current
            working directory), or "" when WechatSogouHistoryMsgException is
            raised.
        """
        try:
            url = url.replace('\\x26','&')
            url = url.replace('x26','&')

            print(url)
            h = httplib2.Http(timeout=30)
            html = self._get_gzh_article_text(url)
            content = html

            # Pull the needed JavaScript variables out of the page with regexes
            ct = re.findall('var ct = "(.*?)";', content)[0]
            msg_cdn_url = re.findall('var msg_cdn_url = "(.*?)";', content)[0]
            nickname = re.findall('var nickname = "(.*?)";', content)[0]
            if(nickname == ""):
                nickname = "not has name"
            if(ct == ""):
                ct = time.time()

            # Convert the epoch seconds into a YYYYmmddHHMMSS string
            ctime = time.strftime("%Y%m%d%H%M%S", time.localtime(int(ct)))
            # Build the directory name
            # Re-encode the name parts to GB18030
            if isinstance(dir_name, unicode): 
                dir_name = dir_name.encode('GB18030','ignore')
            else: 
                dir_name = dir_name.decode('utf-8','ignore').encode('GB18030','ignore')

            if isinstance(nickname, unicode): 
                nickname = nickname.encode('GB18030','ignore')
            else: 
                if chardet.detect(nickname)['encoding'] == 'KOI8-R' :
                    print("KOI8")
                    nickname = nickname.decode('KOI8-R','ignore').encode('GB18030','ignore')
                else:
                    print("GB18030")
                    nickname = nickname.decode('utf-8','ignore').encode('GB18030','ignore')

            dir = 'WeiXinGZH/' + nickname + '/' + ctime + '/' + dir_name + '/'
            #dir = 'WeiXinGZH/' + dir_name + '/'
            dir = dir.decode('gb2312','ignore')
            # Strip characters that are removed from the path one by one
            dir = dir.replace("?", "")
            dir = dir.replace("\\", "")
            dir = dir.replace("*", "")
            dir = dir.replace(":", "")
            dir = dir.replace('\"', "")
            dir = dir.replace("<", "")
            dir = dir.replace(">", "")
            dir = dir.replace("|", "")


            try :
                os.makedirs(dir)  # create the target directory

            except :
                # Deliberately ignored (e.g. the directory already exists)
                errormsg = 'none'

            # Download the cover image
            url = msg_cdn_url
            print(u'正在下载文章：' + url)
            resp, contentface = h.request(url)

            file_name = dir + 'cover.jpg'
            codecs.open(file_name,mode='wb').write(contentface)

            # Download the remaining images
            soup = BeautifulSoup(content, 'html.parser')
            count = 0
            #logger.error(html)
            err_count = 0
            for link in soup.find_all('img') :
                try:
                    err_count += 1
                    if(err_count > 200) :
                        break # safety cap on the number of <img> tags processed

                    if None != link.get('data-src') :
                        count = count + 1
                        orurl = link.get('data-src')
                        url = orurl.split('?')[0]  # drop the query string; the full URL sometimes fails to download
                        resp, content = h.request(url)

                        matchurlvalue = re.search(r'wx_fmt=(?P<wx_fmt>[^&]*)', orurl) # without the parameter the image may be gif or jpg
                        if None != matchurlvalue:
                            wx_fmt = matchurlvalue.group('wx_fmt') # prefer the wx_fmt parameter to pick the file type
                        else:
                            wx_fmt = binascii.b2a_hex(content[0:4]) # otherwise use the first 4 bytes as a hex string

                        # Maps a wx_fmt value or a 4-byte hex signature to a file extension
                        phototype = { 'jpeg': '.jpg', 'gif' : '.gif', 'png' : '.png', 'jpg' : '.jpg', '47494638' : '.gif', 'ffd8ffe0' : '.jpg', 'ffd8ffe1' : '.jpg', 'ffd8ffdb' : '.jpg', 'ffd8fffe' : '.jpg', 'other' : '.jpg', '89504e47' : '.png' }
                        file_name = 'Picture' + str(count) + phototype[wx_fmt]
                        file_path = dir + file_name
                        open(file_path, 'wb').write(content)

                        # Point the image reference in the HTML at the local file
                        re_url = 'data-src="%s(.+?)"' % (url[:-5])
                        re_pic = 'src="%s"' % (file_name)
                        html = re.sub(re_url, re_pic, html)
                except:
                    # Any failure on one image skips just that image
                    continue

            with open("%sindex.html" % (dir), "wb") as code :
                code.write(html)

            print(u'文章下载完成')
            ret_path = os.path.abspath('.')
            ret_path = ret_path.replace('\\', "/")
            ret_path = "%s/%sindex.html" %(ret_path.decode('GB18030').encode('utf-8'),dir)
            #print(ret_path)
        # NOTE(review): only this exception type is handled; every other
        # failure above propagates to the caller.
        except WechatSogouHistoryMsgException:
            print(u'文章内容有异常编码，无法下载')
            return ""
        return ret_path
820 | 
821 | 
822 | 


--------------------------------------------------------------------------------