├── .gitattributes
├── .gitignore
├── .travis.yml
├── CHANGELOG.md
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── coffee.md
├── docs
├── HISTORY.rst
└── README.rst
├── mkdocs.yml
├── requirements.txt
├── screenshot
├── alipay_hongbao.png
├── get_gzh_article_by_history.png
├── get_gzh_article_by_hot.png
├── get_gzh_info.png
├── get_sugg.png
├── pay_ali.jpg
├── pay_wechat.jpg
├── search_article.png
└── search_gzh.png
├── setup.cfg
├── setup.py
├── test
├── __init__.py
├── fateadm.py
├── file
│ ├── article_detail_backgroud-image.html
│ ├── article_detail_expired.html
│ ├── article_detail_iframe.html
│ ├── article_detail_mpvoice.html
│ ├── article_detail_qqmusic.html
│ ├── bitsea-history.html
│ ├── search-gaokao-article.html
│ ├── search-gaokao-article.json
│ ├── search-gaokao-gzh-error.html
│ ├── search-gaokao-gzh.html
│ └── wapindex-wap-0612-wap_8-0.html
├── rk.py
├── test_api.py
├── test_const.py
├── test_request_gen_hot_url.py
├── test_request_gen_search_article_url.py
├── test_request_gen_search_gzh_url.py
├── test_structuring.py
└── test_tools.py
├── tox.ini
└── wechatsogou
├── __init__.py
├── api.py
├── const.py
├── exceptions.py
├── filecache.py
├── five.py
├── identify_image.py
├── request.py
├── structuring.py
└── tools.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | test/file/* linguist-vendored
2 | docs/bootstrap/* linguist-vendored
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | .git
3 | __pycache__
4 | cache
5 | wechatsogou/cache
6 | wechatsogou/ocr
7 | sprider
8 | wechatsogou.egg-info
9 | dist
10 | demo
11 | log.txt
12 | web
13 | wechatid.txt
14 | .DS_Store
15 | test/config.py
16 | .python-version
17 | *.pyc
18 | .tox/
19 | build/
20 | .hypothesis/
21 | test/.hypothesis/
22 | t.py
23 | y.py
24 | tencent_captcha/
25 | docs/src/node_modules/
26 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 |
3 | python:
4 | - "2.7.12"
5 | - "3.5.3"
6 | - "3.6.1"
7 |
8 | cache:
9 | directories:
10 | - $HOME/.cache/pip
11 |
12 | env:
13 | global:
14 | - PIP_WHEEL_DIR=$HOME/.cache/pip/wheels
15 | - PIP_FIND_LINKS=file://$HOME/.cache/pip/wheels
16 |
17 | install:
18 | - pip install tox tox-travis flake8
19 | - pip install -r requirements.txt
20 |
21 | before_script:
22 | - export PYTHONPATH=$PYTHONPATH:$(pwd)
23 |
24 | script:
25 | - make flake8
26 | - make dry_publish
27 | - tox
28 |
29 | notifications:
30 | email:
31 | on_success: never
32 | on_failure: never
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License 2.0
2 | Copyright [2018] [Chyroc https://blog.chyroc.cn]
3 |
4 | Apache License
5 | Version 2.0, January 2004
6 | http://www.apache.org/licenses/
7 |
8 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
9 |
10 | 1. Definitions.
11 |
12 | "License" shall mean the terms and conditions for use, reproduction,
13 | and distribution as defined by Sections 1 through 9 of this document.
14 |
15 | "Licensor" shall mean the copyright owner or entity authorized by
16 | the copyright owner that is granting the License.
17 |
18 | "Legal Entity" shall mean the union of the acting entity and all
19 | other entities that control, are controlled by, or are under common
20 | control with that entity. For the purposes of this definition,
21 | "control" means (i) the power, direct or indirect, to cause the
22 | direction or management of such entity, whether by contract or
23 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
24 | outstanding shares, or (iii) beneficial ownership of such entity.
25 |
26 | "You" (or "Your") shall mean an individual or Legal Entity
27 | exercising permissions granted by this License.
28 |
29 | "Source" form shall mean the preferred form for making modifications,
30 | including but not limited to software source code, documentation
31 | source, and configuration files.
32 |
33 | "Object" form shall mean any form resulting from mechanical
34 | transformation or translation of a Source form, including but
35 | not limited to compiled object code, generated documentation,
36 | and conversions to other media types.
37 |
38 | "Work" shall mean the work of authorship, whether in Source or
39 | Object form, made available under the License, as indicated by a
40 | copyright notice that is included in or attached to the work
41 | (an example is provided in the Appendix below).
42 |
43 | "Derivative Works" shall mean any work, whether in Source or Object
44 | form, that is based on (or derived from) the Work and for which the
45 | editorial revisions, annotations, elaborations, or other modifications
46 | represent, as a whole, an original work of authorship. For the purposes
47 | of this License, Derivative Works shall not include works that remain
48 | separable from, or merely link (or bind by name) to the interfaces of,
49 | the Work and Derivative Works thereof.
50 |
51 | "Contribution" shall mean any work of authorship, including
52 | the original version of the Work and any modifications or additions
53 | to that Work or Derivative Works thereof, that is intentionally
54 | submitted to Licensor for inclusion in the Work by the copyright owner
55 | or by an individual or Legal Entity authorized to submit on behalf of
56 | the copyright owner. For the purposes of this definition, "submitted"
57 | means any form of electronic, verbal, or written communication sent
58 | to the Licensor or its representatives, including but not limited to
59 | communication on electronic mailing lists, source code control systems,
60 | and issue tracking systems that are managed by, or on behalf of, the
61 | Licensor for the purpose of discussing and improving the Work, but
62 | excluding communication that is conspicuously marked or otherwise
63 | designated in writing by the copyright owner as "Not a Contribution."
64 |
65 | "Contributor" shall mean Licensor and any individual or Legal Entity
66 | on behalf of whom a Contribution has been received by Licensor and
67 | subsequently incorporated within the Work.
68 |
69 | 2. Grant of Copyright License. Subject to the terms and conditions of
70 | this License, each Contributor hereby grants to You a perpetual,
71 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
72 | copyright license to reproduce, prepare Derivative Works of,
73 | publicly display, publicly perform, sublicense, and distribute the
74 | Work and such Derivative Works in Source or Object form.
75 |
76 | 3. Grant of Patent License. Subject to the terms and conditions of
77 | this License, each Contributor hereby grants to You a perpetual,
78 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
79 | (except as stated in this section) patent license to make, have made,
80 | use, offer to sell, sell, import, and otherwise transfer the Work,
81 | where such license applies only to those patent claims licensable
82 | by such Contributor that are necessarily infringed by their
83 | Contribution(s) alone or by combination of their Contribution(s)
84 | with the Work to which such Contribution(s) was submitted. If You
85 | institute patent litigation against any entity (including a
86 | cross-claim or counterclaim in a lawsuit) alleging that the Work
87 | or a Contribution incorporated within the Work constitutes direct
88 | or contributory patent infringement, then any patent licenses
89 | granted to You under this License for that Work shall terminate
90 | as of the date such litigation is filed.
91 |
92 | 4. Redistribution. You may reproduce and distribute copies of the
93 | Work or Derivative Works thereof in any medium, with or without
94 | modifications, and in Source or Object form, provided that You
95 | meet the following conditions:
96 |
97 | (a) You must give any other recipients of the Work or
98 | Derivative Works a copy of this License; and
99 |
100 | (b) You must cause any modified files to carry prominent notices
101 | stating that You changed the files; and
102 |
103 | (c) You must retain, in the Source form of any Derivative Works
104 | that You distribute, all copyright, patent, trademark, and
105 | attribution notices from the Source form of the Work,
106 | excluding those notices that do not pertain to any part of
107 | the Derivative Works; and
108 |
109 | (d) If the Work includes a "NOTICE" text file as part of its
110 | distribution, then any Derivative Works that You distribute must
111 | include a readable copy of the attribution notices contained
112 | within such NOTICE file, excluding those notices that do not
113 | pertain to any part of the Derivative Works, in at least one
114 | of the following places: within a NOTICE text file distributed
115 | as part of the Derivative Works; within the Source form or
116 | documentation, if provided along with the Derivative Works; or,
117 | within a display generated by the Derivative Works, if and
118 | wherever such third-party notices normally appear. The contents
119 | of the NOTICE file are for informational purposes only and
120 | do not modify the License. You may add Your own attribution
121 | notices within Derivative Works that You distribute, alongside
122 | or as an addendum to the NOTICE text from the Work, provided
123 | that such additional attribution notices cannot be construed
124 | as modifying the License.
125 |
126 | You may add Your own copyright statement to Your modifications and
127 | may provide additional or different license terms and conditions
128 | for use, reproduction, or distribution of Your modifications, or
129 | for any such Derivative Works as a whole, provided Your use,
130 | reproduction, and distribution of the Work otherwise complies with
131 | the conditions stated in this License.
132 |
133 | 5. Submission of Contributions. Unless You explicitly state otherwise,
134 | any Contribution intentionally submitted for inclusion in the Work
135 | by You to the Licensor shall be under the terms and conditions of
136 | this License, without any additional terms or conditions.
137 | Notwithstanding the above, nothing herein shall supersede or modify
138 | the terms of any separate license agreement you may have executed
139 | with Licensor regarding such Contributions.
140 |
141 | 6. Trademarks. This License does not grant permission to use the trade
142 | names, trademarks, service marks, or product names of the Licensor,
143 | except as required for reasonable and customary use in describing the
144 | origin of the Work and reproducing the content of the NOTICE file.
145 |
146 | 7. Disclaimer of Warranty. Unless required by applicable law or
147 | agreed to in writing, Licensor provides the Work (and each
148 | Contributor provides its Contributions) on an "AS IS" BASIS,
149 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
150 | implied, including, without limitation, any warranties or conditions
151 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
152 | PARTICULAR PURPOSE. You are solely responsible for determining the
153 | appropriateness of using or redistributing the Work and assume any
154 | risks associated with Your exercise of permissions under this License.
155 |
156 | 8. Limitation of Liability. In no event and under no legal theory,
157 | whether in tort (including negligence), contract, or otherwise,
158 | unless required by applicable law (such as deliberate and grossly
159 | negligent acts) or agreed to in writing, shall any Contributor be
160 | liable to You for damages, including any direct, indirect, special,
161 | incidental, or consequential damages of any character arising as a
162 | result of this License or out of the use or inability to use the
163 | Work (including but not limited to damages for loss of goodwill,
164 | work stoppage, computer failure or malfunction, or any and all
165 | other commercial damages or losses), even if such Contributor
166 | has been advised of the possibility of such damages.
167 |
168 | 9. Accepting Warranty or Additional Liability. While redistributing
169 | the Work or Derivative Works thereof, You may choose to offer,
170 | and charge a fee for, acceptance of support, warranty, indemnity,
171 | or other liability obligations and/or rights consistent with this
172 | License. However, in accepting such obligations, You may act only
173 | on Your own behalf and on Your sole responsibility, not on behalf
174 | of any other Contributor, and only if You agree to indemnify,
175 | defend, and hold each Contributor harmless for any liability
176 | incurred by, or claims asserted against, such Contributor by reason
177 | of your accepting any such warranty or additional liability.
178 |
179 | END OF TERMS AND CONDITIONS
180 |
181 | APPENDIX: How to apply the Apache License to your work.
182 |
183 | To apply the Apache License to your work, attach the following
184 | boilerplate notice, with the fields enclosed by brackets "[]"
185 | replaced with your own identifying information. (Don't include
186 | the brackets!) The text should be enclosed in the appropriate
187 | comment syntax for the file format. We also recommend that a
188 | file or class name and description of purpose be included on the
189 | same "printed page" as the copyright notice for easier
190 | identification within third-party archives.
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
203 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | graft docs
3 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: doc dry_publish
2 |
3 | docdir = docs
4 | doc:
5 | if [ -a $(docdir)/README.rst ]; then rm $(docdir)/README.rst; fi;
6 | pandoc --from=markdown --to=rst --output=$(docdir)/README.rst README.md
7 | if [ -a $(docdir)/HISTORY.rst ]; then rm $(docdir)/HISTORY.rst; fi;
8 | pandoc --from=markdown --to=rst --output=$(docdir)/HISTORY.rst CHANGELOG.md
9 | python setup.py check --restructuredtext
10 |
11 | dry_publish:
12 | rm -rf dist/ build/
13 | python setup.py sdist bdist_wheel
14 |
15 | publish: dry_publish
16 | twine upload -s dist/*
17 |
18 | flake8:
19 | flake8 --ignore=E501,F401,E128,E402,E731,F821 wechatsogou
20 |
21 | tox:
22 | pyenv local 2.7.12 3.5.3 3.6.1
23 | tox
24 |
25 | gendoc:
26 | echo '---\nname: Change Log\n---\n' > docs/src/CHANGELOG.mdx
27 | cat CHANGELOG.md >> docs/src/CHANGELOG.mdx
28 | cd docs/src/ && yarn build && rm -rf ../static && mv .docz/dist/* ../
29 |
30 | clean:
31 | @rm -rf build/ wechatsogou.egg-info/ dist/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 基于搜狗微信搜索的微信公众号爬虫接口
2 | ===
3 |
4 | [Build Status](https://github.com/Chyroc/WechatSogou)
5 | [PyPI version](https://github.com/Chyroc/WechatSogou)
6 | [PyPI](https://github.com/Chyroc/WechatSogou)
7 | [py27,py35,py36](https://github.com/Chyroc/WechatSogou)
8 | [PyPI](https://github.com/Chyroc/WechatSogou)
9 |
10 | 我的另外一个作品: https://github.com/chyroc/lark ,基于代码生成的 Lark/飞书 Go SDK,欢迎 star 。
11 |
12 | 
13 |
14 | ```
15 | __ __ _ _ ____
16 | \ \ / /__ ___| |__ __ _| |_/ ___| ___ __ _ ___ _ _
17 | \ \ /\ / / _ \/ __| '_ \ / _` | __\___ \ / _ \ / _` |/ _ \| | | |
18 | \ V V / __/ (__| | | | (_| | |_ ___) | (_) | (_| | (_) | |_| |
19 | \_/\_/ \___|\___|_| |_|\__,_|\__|____/ \___/ \__, |\___/ \__,_|
20 | |___/
21 | ```
22 |
23 | # 项目简介
24 | 基于搜狗微信搜索的微信公众号爬虫接口,可以扩展成基于搜狗搜索的爬虫
25 |
26 | 如果有问题,请提issue
27 |
28 | [CHANGELOG](./CHANGELOG.md)
29 |
30 | # 交流分享
31 |
32 | - QQ群(只需加一个)
33 | - 一群 132955136(已满)
34 | - 二群 819084985
35 |
36 | - 微信群
37 |
38 |
39 | # 赞助作者
40 | 甲鱼说,咖啡是灵魂的饮料,买点咖啡
41 |
42 | [谢谢这些人的☕️](./coffee.md)
43 |
44 | 支付宝扫码大家一起领红包:
45 |
46 |
47 |
48 | 或者直接转账:
49 |
50 | 
51 |
52 |
53 | # 问题集锦
54 | Q:没有得到原始文章url / 提示链接已经过期?
55 | A:微信屏蔽此接口,请在临时链接有效期内保存文章内容。
56 |
57 | Q:获取文章只能10篇?
58 | A:是的,仅显示最近10条群发。
59 |
60 | Q:使用的是python 2 还是 3?
61 | A:都支持,若出错,请报BUG。
62 |
63 | # 安装
64 | ```
65 | pip install wechatsogou --upgrade
66 | ```
67 |
68 | # 使用
69 |
70 | ### 初始化 API
71 |
72 | ```python
73 | import wechatsogou
74 |
75 | # 可配置参数
76 |
77 | # 直连
78 | ws_api = wechatsogou.WechatSogouAPI()
79 |
80 | # 验证码输入错误的重试次数,默认为1
81 | ws_api = wechatsogou.WechatSogouAPI(captcha_break_time=3)
82 |
83 | # 所有requests库的参数都能在这用
84 | # 如 配置代理,代理列表中至少需包含1个 HTTPS 协议的代理, 并确保代理可用
85 | ws_api = wechatsogou.WechatSogouAPI(proxies={
86 | "http": "127.0.0.1:8888",
87 | "https": "127.0.0.1:8888",
88 | })
89 |
90 | # 如 设置超时
91 | ws_api = wechatsogou.WechatSogouAPI(timeout=0.1)
92 | ```
93 |
94 |
95 | ### 获取特定公众号信息 - get_gzh_info
96 |
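97 | ![ws_api.get_gzh_info('南航青年志愿者')](https://raw.githubusercontent.com/chyroc/wechatsogou/master/screenshot/get_gzh_info.png)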
98 |
99 | - 使用
100 | ```
101 | In [5]: import wechatsogou
102 | ...:
103 | ...: ws_api =wechatsogou.WechatSogouAPI()
104 | ...: ws_api.get_gzh_info('南航青年志愿者')
105 | ...:
106 | Out[5]:
107 | {
108 | 'authentication': '南京航空航天大学',
109 | 'headimage': 'http://img01.sogoucdn.com/app/a/100520090/oIWsFt1tmWoG6vO6BcsS7St61bRE',
110 | 'introduction': '南航大志愿活动的领跑者,为你提供校内外的志愿资源和精彩消息.',
111 | 'post_perm': 26,
112 | 'view_perm': 1000,
113 | 'profile_url': 'http://mp.weixin.qq.com/profile?src=3&timestamp=1501140102&ver=1&signature=OpcTZp20TUdKHjSqWh7m73RWBIzwYwINpib2ZktBkLG8NyHamTvK2jtzl7mf-VdpE246zXAq18GNm*S*bq4klw==',
114 | 'qrcode': 'http://mp.weixin.qq.com/rr?src=3&timestamp=1501140102&ver=1&signature=-DnFampQflbiOadckRJaTaDRzGSNfisIfECELSo-lN-GeEOH8-XTtM*ASdavl0xuavw-bmAEQXOa1T39*EIsjzxz30LjyBNkjmgbT6bGnZM=',
115 | 'wechat_id': 'nanhangqinggong',
116 | 'wechat_name': '南航青年志愿者'
117 | }
118 | ```
119 |
120 | - 返回数据结构
121 | ```python
122 | {
123 | 'profile_url': '', # 最近10条群发页链接
124 | 'headimage': '', # 头像
125 | 'wechat_name': '', # 名称
126 | 'wechat_id': '', # 微信id
127 | 'post_perm': int, # 最近一月群发数
128 | 'view_perm': int, # 最近一月阅读量
129 | 'qrcode': '', # 二维码
130 | 'introduction': '', # 简介
131 | 'authentication': '' # 认证
132 | }
133 | ```
134 |
135 | ### 搜索公众号
136 |
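137 | ![ws_api.search_gzh('南京航空航天大学')](https://raw.githubusercontent.com/chyroc/wechatsogou/master/screenshot/search_gzh.png)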
138 |
139 | - 使用
140 | ```
141 | In [6]: import wechatsogou
142 | ...:
143 | ...: ws_api =wechatsogou.WechatSogouAPI()
144 | ...: ws_api.search_gzh('南京航空航天大学')
145 | ...:
146 | Out[6]:
147 | [
148 | {
149 | 'authentication': '南京航空航天大学',
150 | 'headimage': 'http://img01.sogoucdn.com/app/a/100520090/oIWsFt1MvjqspMDVvZjpmxyo36sU',
151 | 'introduction': '南京航空航天大学官方微信',
152 | 'post_perm': 0,
153 | 'view_perm': 0,
154 | 'profile_url': 'http://mp.weixin.qq.com/profile?src=3&timestamp=1501141990&ver=1&signature=S-7U131D3eQERC8yJGVAg2edySXn*qGVi5uE8QyQU034di*2mS6vGJVnQBRB0It9t9M-Qn7ynvjRKZNQrjBMEg==',
155 | 'qrcode': 'http://mp.weixin.qq.com/rr?src=3&timestamp=1501141990&ver=1&signature=Tlp-r0AaBRxtx3TuuyjdxmjiR4aEJY-hjh0kmtV6byVu3QIQYiMlJttJgGu0hwtZMZCCntdfaP5jD4JXipTwoGecAze8ycEF5KYZqtLSsNE=',
156 | 'wechat_id': 'NUAA_1952',
157 | 'wechat_name': '南京航空航天大学'
158 | },
159 | {
160 | 'authentication': '南京航空航天大学',
161 | 'headimage': 'http://img01.sogoucdn.com/app/a/100520090/oIWsFtwVmjdK_57vIKeMceGXF5BQ',
162 | 'introduction': '南京航空航天大学团委官方微信平台',
163 | 'post_perm': 0,
164 | 'view_perm': 0,
165 | 'profile_url': 'http://mp.weixin.qq.com/profile?src=3&timestamp=1501141990&ver=1&signature=aXFQrSDOiZJHedlL7vtAkvFMckxBmubE9VGrVczTwS601bOIT5Nrr8Pcgs6bQ-oEd6jdQ0aK5WCQjNwMAhJnyQ==',
166 | 'qrcode': 'http://mp.weixin.qq.com/rr?src=3&timestamp=1501141990&ver=1&signature=7Cpbd9CVQsXJkExRcU5VM6NuyoxDQQfVfF7*CGI-PTR0y6stHPtdSDqzAzvPMWz67Xz9IMF2TDfu4Cndj5bKxlsFh6wGhiLH0b9ZKqgCW5k=',
167 | 'wechat_id': 'nuaa_tw',
168 | 'wechat_name': '南京航空航天大学团委'
169 | },
170 | ...
171 | ]
172 | ```
173 |
174 | - 数据结构
175 |
176 | list of dict, dict:
177 |
178 | ```python
179 | {
180 | 'profile_url': '', # 最近10条群发页链接
181 | 'headimage': '', # 头像
182 | 'wechat_name': '', # 名称
183 | 'wechat_id': '', # 微信id
184 | 'post_perm': int, # 最近一月群发数
185 | 'view_perm': int, # 最近一月阅读量
186 | 'qrcode': '', # 二维码
187 | 'introduction': '', # 介绍
188 | 'authentication': '' # 认证
189 | }
190 | ```
191 |
192 | ### 搜索微信文章
193 |
194 | 
195 |
196 | - 使用
197 | ```
198 | In [7]: import wechatsogou
199 | ...:
200 | ...: ws_api =wechatsogou.WechatSogouAPI()
201 | ...: ws_api.search_article('南京航空航天大学')
202 | ...:
203 | Out[7]:
204 | [
205 | {
206 | 'article': {
207 | 'abstract': '【院校省份】江苏【报名时间】4月5日截止【考试时间】6月10日-11日南京航空航天大学2017年自主招生简章南京航空航天大学2017...',
208 | 'imgs': ['http://img01.sogoucdn.com/net/a/04/link?appid=100520033&url=http://mmbiz.qpic.cn/mmbiz_png/P07yicBRJfC71QB3lREx4J4x34QOibGaia5BkiaaiaiaibicWkTBULou9R08K6FaxlUA1RFBFWCmpO1Lepk7ZcXK45vguQ/0?wx_fmt=png'],
209 | 'time': 1490270644,
210 | 'title': '南京航空航天大学2017年自主招生简章',
211 | 'url': 'http://mp.weixin.qq.com/s?src=3&timestamp=1501142580&ver=1&signature=hRMlQOLQpu4BNhBACavusZdmk**D65qHyz5LWDq1lPjVcm7*iiBS0l7Pq40h0fiCX*bZ8vSMLzAMDNzELYFKIQ7mND0-7cQi-N0BtfTBql*CQdsHun-GtaYEqRva6Ukwce3gZh46SXJzo90kyZ3dwVYl6*589bGDIzG6JTGfpxI='
212 | },
213 | 'gzh': {
214 | 'headimage': 'http://wx.qlogo.cn/mmhead/Q3auHgzwzM5kiawibor6ABhnibMYnOADvqdcrl5XWiaFfM5mGYZ8cUica6A/0',
215 | 'isv': 0,
216 | 'profile_url': 'http://mp.weixin.qq.com/profile?src=3&timestamp=1501142580&ver=1&signature=dVkDdcFr1suL1WHdCOJj7pwZhG9W*APi-j5kRtS09ccv-WID-zNs0ecDiiz1wwE7qbNSk5HBL*ffpyVXcF0fFQ==',
217 | 'wechat_name': '自主招生在线'
218 | }
219 | },
220 | ...
221 | ]
222 | ```
223 |
224 | - 数据结构
225 |
226 | list of dict, dict:
227 | ```python
228 | {
229 | 'article': {
230 | 'title': '', # 文章标题
231 | 'url': '', # 文章链接
232 | 'imgs': '', # 文章图片list
233 | 'abstract': '', # 文章摘要
234 | 'time': int # 文章推送时间 10位时间戳
235 | },
236 | 'gzh': {
237 | 'profile_url': '', # 公众号最近10条群发页链接
238 | 'headimage': '', # 头像
239 | 'wechat_name': '', # 名称
240 | 'isv': int, # 是否加v 1 or 0
241 | }
242 | }
243 | ```
244 |
245 | ### 解析最近文章页 - get_gzh_article_by_history
246 |
247 | 
248 |
249 | - 使用
250 | ```
251 | In [1]: import wechatsogou
252 | ...:
253 | ...: ws_api =wechatsogou.WechatSogouAPI()
254 | ...: ws_api.get_gzh_article_by_history('南航青年志愿者')
255 | ...:
256 | Out[1]:
257 | {
258 | 'article': [
259 | {
260 | 'abstract': '我们所做的,并不能立马去改变什么——\n但千里之行,绿勤行永不止步。\n我们不会就此止步,之后我们又将再出发。\n 民勤,再见。\n绿勤行,不再见。',
261 | 'author': '',
262 | 'content_url': 'http://mp.weixin.qq.com/s?timestamp=1501143158&src=3&ver=1&signature=B-*tqUrFyO7OqpFeJZwTA7JJtsHpz6BgC8ugyfgpOnyWLtPb85R5Zmu0JuZRbZKG72x4bQjMCcsfA5mC3GSSOPbYd-9tzvTgmroGRmc4Tzk8090KCiEu6EjA0YMHeytWJWpxr51M2FUYQhTWJ01pTmNnXLVAG6Ex6AG52uvvmQA=',
263 | 'copyright_stat': 100,
264 | 'cover': 'http://mmbiz.qpic.cn/mmbiz_jpg/icFYWMxnmxHDYgXNjAle7szYLgQmicbaQlb1eVFuwp2vxEu5eNVwYacaHah2N5W8dKAm725vxv5aM6DFlM59Wftg/0?wx_fmt=jpeg',
265 | 'datetime': 1501072594,
266 | 'fileid': 502326199,
267 | 'main': 1,
268 | 'send_id': 1000000306,
269 | 'source_url': '',
270 | 'title': '绿勤行——不说再见',
271 | 'type': '49'
272 | },
273 | {
274 | 'abstract': '当时不杂,过往不恋,志愿不老,我们不散!',
275 | 'author': '',
276 | 'content_url': 'http://mp.weixin.qq.com/s?timestamp=1501143158&src=3&ver=1&signature=B-*tqUrFyO7OqpFeJZwTA7JJtsHpz6BgC8ugyfgpOnyWLtPb85R5Zmu0JuZRbZKG72x4bQjMCcsfA5mC3GSSOGUrM*jg*EP1jU-Dyf2CVqmPnOgBiET2wlitek4FcRbXorAswWHm*1rqODcN52NtfKD-OcRTazQS*t5SnJtu3ZA=',
277 | 'copyright_stat': 100,
278 | 'cover': 'http://mmbiz.qpic.cn/mmbiz_jpg/icFYWMxnmxHCoY44nPUXvkSgpZI1LaEsZfkZvtGaiaNW2icjibCp6qs93xLlr9kXMJEP3z1pmQ6TbRZNicHibGzRwh1w/0?wx_fmt=jpeg',
279 | 'datetime': 1500979158,
280 | 'fileid': 502326196,
281 | 'main': 1,
282 | 'send_id': 1000000305,
283 | 'source_url': '',
284 | 'title': '有始有终 | 2016-2017年度环境保护服务部工作总结',
285 | 'type': '49'
286 | },
287 | ...
288 | ],
289 | 'gzh': {
290 | 'authentication': '南京航空航天大学',
291 | 'headimage': 'http://wx.qlogo.cn/mmhead/Q3auHgzwzM4xV5PgPjK5XoPaaQoxnWJAFicibMvPAnsoybawMBFxua1g/0',
292 | 'introduction': '南航大志愿活动的领跑者,为你提供校内外的志愿资源和精彩消息。',
293 | 'wechat_id': 'nanhangqinggong',
294 | 'wechat_name': '南航青年志愿者'
295 | }
296 | }
297 | ```
298 | - 数据结构
299 | ```python
300 | {
301 | 'gzh': {
302 | 'wechat_name': '', # 名称
303 | 'wechat_id': '', # 微信id
304 | 'introduction': '', # 简介
305 | 'authentication': '', # 认证
306 | 'headimage': '' # 头像
307 | },
308 | 'article': [
309 | {
310 | 'send_id': int, # 群发id,注意不唯一,因为同一次群发多个消息,而群发id一致
311 | 'datetime': int, # 群发datetime 10位时间戳
312 | 'type': '', # 消息类型,均是49(在手机端历史消息页有其他类型,网页端最近10条消息页只有49),表示图文
313 | 'main': int, # 是否是一次群发的第一条消息 1 or 0
314 | 'title': '', # 文章标题
315 | 'abstract': '', # 摘要
316 | 'fileid': int, #
317 | 'content_url': '', # 文章链接
318 | 'source_url': '', # 阅读原文的链接
319 | 'cover': '', # 封面图
320 | 'author': '', # 作者
321 | 'copyright_stat': int, # 文章类型,例如:原创啊
322 | },
323 | ...
324 | ]
325 | }
326 |
327 | ```
328 |
329 | ### 解析 首页热门 页 - get_gzh_article_by_hot
330 |
331 | 
332 |
333 | - 使用
334 | ```
335 | In [1]: from pprint import pprint
336 | ...: from wechatsogou import WechatSogouAPI, WechatSogouConst
337 | ...:
338 | ...: ws_api = WechatSogouAPI()
339 | ...: gzh_articles = ws_api.get_gzh_article_by_hot(WechatSogouConst.hot_index.food)
340 | ...: for i in gzh_articles:
341 | ...: pprint(i)
342 | ...:
343 | {
344 | 'article': {
345 | 'abstract': '闷热的夏天有什么事情能比吃上凉凉的甜品更惬意的呢?快一起动手做起来吧,简单方便,放冰箱冻一冻,那感觉~橙汁蒸木瓜木瓜1个(300-400克左右),橙子4个,枫糖浆20克(如果家里没有,也可以用蜂蜜、炼乳等代替),椰果适量。做法1.用削皮',
346 | 'main_img': 'http://img01.sogoucdn.com/net/a/04/link?appid=100520033&url=http%3A%2F%2Fmmbiz.qpic.cn%2Fmmbiz_jpg%2Fw9UGwFPia7QTUIadPibgW8OFkqf1ibR40xicKfzofRS0sDpaFp3CG0jkPyQKeXl44TXswztW1SJnic7tmCibjB8rIIGw%2F0%3Fwx_fmt%3Djpeg',
347 | 'open_id': 'oIWsFty9hHVI9F10amtzx5TOWIq8',
348 | 'time': 1501325220,
349 | 'title': '夏日甜品制作方法,不收藏后悔哦!',
350 | 'url': 'http://mp.weixin.qq.com/s?src=3&timestamp=1501328525&ver=1&signature=n9*oX0k4YbNFhNMsOjIekYrsha44lfBSCbG9jicAbGYrWNN8*48NzpcaHdxwUnC12syY5-ZxwcBfiJlMzdbAwWKlo26EW14w2Ax*gjLVlOX-AGXB4443obZ-GK0pw*AFZAGZD8sI4AFBZSZpyeaxN4sS7cpynxdIuw6S2h*--LI='
351 | },
352 | 'gzh': {
353 | 'headimage': 'http://img03.sogoucdn.com/app/a/100520090/oIWsFty9hHVI9F10amtzx5TOWIq8',
354 | 'wechat_name': '甜品烘焙制作坊'
355 | }
356 | }
357 | ...
358 | ...
359 | ```
360 |
361 | - 数据结构
362 | ```python
363 | {
364 | 'gzh': {
365 | 'headimage': str, # 公众号头像
366 | 'wechat_name': str, # 公众号名称
367 | },
368 | 'article': {
369 | 'url': str, # 文章临时链接
370 | 'title': str, # 文章标题
371 | 'abstract': str, # 文章摘要
372 | 'time': int, # 推送时间,10位时间戳
373 | 'open_id': str, # open id
374 | 'main_img': str # 封面图片
375 | }
376 | }
377 | ```
378 |
379 | ### 获取关键字联想词
380 | - 使用
381 | ```
382 | In [1]: import wechatsogou
383 | ...:
384 | ...: ws_api =wechatsogou.WechatSogouAPI()
385 | ...: ws_api.get_sugg('高考')
386 | ...:
387 | Out[1]:
388 | ['高考e通',
389 | '高考专业培训',
390 | '高考地理俱乐部',
391 | '高考志愿填报咨讯',
392 | '高考报考资讯',
393 | '高考教育',
394 | '高考早知道',
395 | '高考服务志愿者',
396 | '高考机构',
397 | '高考福音']
398 | ```
399 |
400 | - 数据结构
401 |
402 | 关键词列表
403 | ```python
404 | ['a', 'b', ...]
405 | ```
406 | ---
407 |
408 | # TODO
409 | - [x] ~~相似文章的公众号获取~~
410 | - [ ] 主页热门公众号获取
411 | - [ ] 文章详情页信息
412 | - [x] ~~所有类型的解析~~
413 | - [ ] 验证码识别
414 | - [ ] 接入爬虫框架
415 | - [x] 兼容py2
416 |
417 | ---
418 |
--------------------------------------------------------------------------------
/coffee.md:
--------------------------------------------------------------------------------
1 | 谢谢这些人的☕️
2 |
3 | name | age
4 | ---- | ---
5 | ax4 | 50
6 | 风雨坛·君 | 50
7 | 陆小凤 | 28.88
8 | 朋鑫 | 18.88
9 | JenkinsY94 | 9.99
10 | yudun1989 | 50
11 | 妥妥的 | 50
12 |
--------------------------------------------------------------------------------
/docs/README.rst:
--------------------------------------------------------------------------------
1 | 基于搜狗微信搜索的微信公众号爬虫接口
2 | ====================================
3 |
4 | |Build Status| |PyPI version| |PyPI| |py27,py35,py36| |PyPI|
5 |
6 | .. figure:: https://raw.githubusercontent.com/chyroc/wechatsogou/master/screenshot/get_gzh_info.png
7 | :alt: ws_api.get_gzh_info(‘南航青年志愿者’)
8 |
9 | ws_api.get_gzh_info(‘南航青年志愿者’)
10 |
11 | ::
12 |
13 | __ __ _ _ ____
14 | \ \ / /__ ___| |__ __ _| |_/ ___| ___ __ _ ___ _ _
15 | \ \ /\ / / _ \/ __| '_ \ / _` | __\___ \ / _ \ / _` |/ _ \| | | |
16 | \ V V / __/ (__| | | | (_| | |_ ___) | (_) | (_| | (_) | |_| |
17 | \_/\_/ \___|\___|_| |_|\__,_|\__|____/ \___/ \__, |\___/ \__,_|
18 | |___/
19 |
20 | 项目简介
21 | ========
22 |
23 | 基于搜狗微信搜索的微信公众号爬虫接口,可以扩展成基于搜狗搜索的爬虫
24 |
25 | 如果有问题,请提issue
26 |
27 | `CHANGELOG <./CHANGELOG.md>`__
28 |
29 | 交流分享
30 | ========
31 |
32 | - QQ群(只需加一个)
33 |
34 | - 一群 132955136(已满)
35 | - 二群 819084985
36 |
37 | - 微信群
38 |
39 | 赞助作者
40 | ========
41 |
42 | 甲鱼说,咖啡是灵魂的饮料,买点咖啡
43 |
44 | `谢谢这些人的☕️ <./coffee.md>`__
45 |
46 | 支付宝扫码大家一起领红包:
47 |
48 | 或者直接转账:
49 |
50 | 问题集锦
51 | ========
52 |
53 | ::
54 |
55 | Q:没有得到原始文章url / 提示链接已经过期?
56 | A:微信屏蔽此接口,请在临时链接有效期内保存文章内容。
57 |
58 | Q:获取文章只能10篇?
59 | A:是的,仅显示最近10条群发。
60 |
61 | Q:使用的是python 2 还是 3?
62 | A:都支持,若出错,请报BUG。
63 |
64 | 安装
65 | ====
66 |
67 | ::
68 |
69 | pip install wechatsogou --upgrade
70 |
71 | 使用
72 | ====
73 |
74 | 初始化 API
75 | ~~~~~~~~~~
76 |
77 | .. code:: python
78 |
79 | import wechatsogou
80 |
81 | # 可配置参数
82 |
83 | # 直连
84 | ws_api = wechatsogou.WechatSogouAPI()
85 |
86 | # 验证码输入错误的重试次数,默认为1
87 | ws_api = wechatsogou.WechatSogouAPI(captcha_break_time=3)
88 |
89 | # 所有requests库的参数都能在这用
90 | # 如 配置代理,代理列表中至少需包含1个 HTTPS 协议的代理, 并确保代理可用
91 | ws_api = wechatsogou.WechatSogouAPI(proxies={
92 | "http": "127.0.0.1:8888",
93 | "https": "127.0.0.1:8888",
94 | })
95 |
96 | # 如 设置超时
97 | ws_api = wechatsogou.WechatSogouAPI(timeout=0.1)
98 |
99 | 获取特定公众号信息 - get_gzh_info
100 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
101 |
102 | .. figure:: https://raw.githubusercontent.com/chyroc/wechatsogou/master/screenshot/get_gzh_info.png
103 | :alt: ws_api.get_gzh_info(‘南航青年志愿者’)
104 |
105 | ws_api.get_gzh_info(‘南航青年志愿者’)
106 |
107 | - 使用
108 |
109 | ::
110 |
111 | In [5]: import wechatsogou
112 | ...:
113 | ...: ws_api =wechatsogou.WechatSogouAPI()
114 | ...: ws_api.get_gzh_info('南航青年志愿者')
115 | ...:
116 | Out[5]:
117 | {
118 | 'authentication': '南京航空航天大学',
119 | 'headimage': 'http://img01.sogoucdn.com/app/a/100520090/oIWsFt1tmWoG6vO6BcsS7St61bRE',
120 | 'introduction': '南航大志愿活动的领跑者,为你提供校内外的志愿资源和精彩消息.',
121 | 'post_perm': 26,
122 | 'view_perm': 1000,
123 | 'profile_url': 'http://mp.weixin.qq.com/profile?src=3&timestamp=1501140102&ver=1&signature=OpcTZp20TUdKHjSqWh7m73RWBIzwYwINpib2ZktBkLG8NyHamTvK2jtzl7mf-VdpE246zXAq18GNm*S*bq4klw==',
124 | 'qrcode': 'http://mp.weixin.qq.com/rr?src=3&timestamp=1501140102&ver=1&signature=-DnFampQflbiOadckRJaTaDRzGSNfisIfECELSo-lN-GeEOH8-XTtM*ASdavl0xuavw-bmAEQXOa1T39*EIsjzxz30LjyBNkjmgbT6bGnZM=',
125 | 'wechat_id': 'nanhangqinggong',
126 | 'wechat_name': '南航青年志愿者'
127 | }
128 |
129 | - 返回数据结构
130 |
131 | .. code:: python
132 |
133 | {
134 | 'profile_url': '', # 最近10条群发页链接
135 | 'headimage': '', # 头像
136 | 'wechat_name': '', # 名称
137 | 'wechat_id': '', # 微信id
138 | 'post_perm': int, # 最近一月群发数
139 | 'view_perm': int, # 最近一月阅读量
140 | 'qrcode': '', # 二维码
141 | 'introduction': '', # 简介
142 | 'authentication': '' # 认证
143 | }
144 |
145 | 搜索公众号
146 | ~~~~~~~~~~
147 |
148 | .. figure:: https://raw.githubusercontent.com/chyroc/wechatsogou/master/screenshot/search_gzh.png
149 | :alt: ws_api.search_gzh(‘南京航空航天大学’)
150 |
151 | ws_api.search_gzh(‘南京航空航天大学’)
152 |
153 | - 使用
154 |
155 | ::
156 |
157 | In [6]: import wechatsogou
158 | ...:
159 | ...: ws_api =wechatsogou.WechatSogouAPI()
160 | ...: ws_api.search_gzh('南京航空航天大学')
161 | ...:
162 | Out[6]:
163 | [
164 | {
165 | 'authentication': '南京航空航天大学',
166 | 'headimage': 'http://img01.sogoucdn.com/app/a/100520090/oIWsFt1MvjqspMDVvZjpmxyo36sU',
167 | 'introduction': '南京航空航天大学官方微信',
168 | 'post_perm': 0,
169 | 'view_perm': 0,
170 | 'profile_url': 'http://mp.weixin.qq.com/profile?src=3&timestamp=1501141990&ver=1&signature=S-7U131D3eQERC8yJGVAg2edySXn*qGVi5uE8QyQU034di*2mS6vGJVnQBRB0It9t9M-Qn7ynvjRKZNQrjBMEg==',
171 | 'qrcode': 'http://mp.weixin.qq.com/rr?src=3&timestamp=1501141990&ver=1&signature=Tlp-r0AaBRxtx3TuuyjdxmjiR4aEJY-hjh0kmtV6byVu3QIQYiMlJttJgGu0hwtZMZCCntdfaP5jD4JXipTwoGecAze8ycEF5KYZqtLSsNE=',
172 | 'wechat_id': 'NUAA_1952',
173 | 'wechat_name': '南京航空航天大学'
174 | },
175 | {
176 | 'authentication': '南京航空航天大学',
177 | 'headimage': 'http://img01.sogoucdn.com/app/a/100520090/oIWsFtwVmjdK_57vIKeMceGXF5BQ',
178 | 'introduction': '南京航空航天大学团委官方微信平台',
179 | 'post_perm': 0,
180 | 'view_perm': 0,
181 | 'profile_url': 'http://mp.weixin.qq.com/profile?src=3&timestamp=1501141990&ver=1&signature=aXFQrSDOiZJHedlL7vtAkvFMckxBmubE9VGrVczTwS601bOIT5Nrr8Pcgs6bQ-oEd6jdQ0aK5WCQjNwMAhJnyQ==',
182 | 'qrcode': 'http://mp.weixin.qq.com/rr?src=3&timestamp=1501141990&ver=1&signature=7Cpbd9CVQsXJkExRcU5VM6NuyoxDQQfVfF7*CGI-PTR0y6stHPtdSDqzAzvPMWz67Xz9IMF2TDfu4Cndj5bKxlsFh6wGhiLH0b9ZKqgCW5k=',
183 | 'wechat_id': 'nuaa_tw',
184 | 'wechat_name': '南京航空航天大学团委'
185 | },
186 | ...
187 | ]
188 |
189 | - 数据结构
190 |
191 | list of dict, dict:
192 |
193 | .. code:: python
194 |
195 | {
196 | 'profile_url': '', # 最近10条群发页链接
197 | 'headimage': '', # 头像
198 | 'wechat_name': '', # 名称
199 | 'wechat_id': '', # 微信id
200 | 'post_perm': int, # 最近一月群发数
201 | 'view_perm': int, # 最近一月阅读量
202 | 'qrcode': '', # 二维码
203 | 'introduction': '', # 介绍
204 | 'authentication': '' # 认证
205 | }
206 |
207 | 搜索微信文章
208 | ~~~~~~~~~~~~
209 |
210 | .. figure:: https://raw.githubusercontent.com/chyroc/wechatsogou/master/screenshot/search_article.png
211 | :alt: ws_api.search_article(‘南京航空航天大学’)
212 |
213 | ws_api.search_article(‘南京航空航天大学’)
214 |
215 | - 使用
216 |
217 | ::
218 |
219 | In [7]: import wechatsogou
220 | ...:
221 | ...: ws_api =wechatsogou.WechatSogouAPI()
222 | ...: ws_api.search_article('南京航空航天大学')
223 | ...:
224 | Out[7]:
225 | [
226 | {
227 | 'article': {
228 | 'abstract': '【院校省份】江苏【报名时间】4月5日截止【考试时间】6月10日-11日南京航空航天大学2017年自主招生简章南京航空航天大学2017...',
229 | 'imgs': ['http://img01.sogoucdn.com/net/a/04/link?appid=100520033&url=http://mmbiz.qpic.cn/mmbiz_png/P07yicBRJfC71QB3lREx4J4x34QOibGaia5BkiaaiaiaibicWkTBULou9R08K6FaxlUA1RFBFWCmpO1Lepk7ZcXK45vguQ/0?wx_fmt=png'],
230 | 'time': 1490270644,
231 | 'title': '南京航空航天大学2017年自主招生简章',
232 | 'url': 'http://mp.weixin.qq.com/s?src=3&timestamp=1501142580&ver=1&signature=hRMlQOLQpu4BNhBACavusZdmk**D65qHyz5LWDq1lPjVcm7*iiBS0l7Pq40h0fiCX*bZ8vSMLzAMDNzELYFKIQ7mND0-7cQi-N0BtfTBql*CQdsHun-GtaYEqRva6Ukwce3gZh46SXJzo90kyZ3dwVYl6*589bGDIzG6JTGfpxI='
233 | },
234 | 'gzh': {
235 | 'headimage': 'http://wx.qlogo.cn/mmhead/Q3auHgzwzM5kiawibor6ABhnibMYnOADvqdcrl5XWiaFfM5mGYZ8cUica6A/0',
236 | 'isv': 0,
237 | 'profile_url': 'http://mp.weixin.qq.com/profile?src=3&timestamp=1501142580&ver=1&signature=dVkDdcFr1suL1WHdCOJj7pwZhG9W*APi-j5kRtS09ccv-WID-zNs0ecDiiz1wwE7qbNSk5HBL*ffpyVXcF0fFQ==',
238 | 'wechat_name': '自主招生在线'
239 | }
240 | },
241 | ...
242 | ]
243 |
244 | - 数据结构
245 |
246 | list of dict, dict:
247 |
248 | .. code:: python
249 |
250 | {
251 | 'article': {
252 | 'title': '', # 文章标题
253 | 'url': '', # 文章链接
254 | 'imgs': '', # 文章图片list
255 | 'abstract': '', # 文章摘要
256 | 'time': int # 文章推送时间 10位时间戳
257 | },
258 | 'gzh': {
259 | 'profile_url': '', # 公众号最近10条群发页链接
260 | 'headimage': '', # 头像
261 | 'wechat_name': '', # 名称
262 | 'isv': int, # 是否加v 1 or 0
263 | }
264 | }
265 |
266 | 解析最近文章页 - get_gzh_article_by_history
267 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
268 |
269 | .. figure:: https://raw.githubusercontent.com/chyroc/wechatsogou/master/screenshot/get_gzh_article_by_history.png
270 | :alt: ws_api.get_gzh_article_by_history(‘南航青年志愿者’)
271 |
272 | ws_api.get_gzh_article_by_history(‘南航青年志愿者’)
273 |
274 | - 使用
275 |
276 | ::
277 |
278 | In [1]: import wechatsogou
279 | ...:
280 | ...: ws_api =wechatsogou.WechatSogouAPI()
281 | ...: ws_api.get_gzh_article_by_history('南航青年志愿者')
282 | ...:
283 | Out[1]:
284 | {
285 | 'article': [
286 | {
287 | 'abstract': '我们所做的,并不能立马去改变什么——\n但千里之行,绿勤行永不止步。\n我们不会就此止步,之后我们又将再出发。\n 民勤,再见。\n绿勤行,不再见。',
288 | 'author': '',
289 | 'content_url': 'http://mp.weixin.qq.com/s?timestamp=1501143158&src=3&ver=1&signature=B-*tqUrFyO7OqpFeJZwTA7JJtsHpz6BgC8ugyfgpOnyWLtPb85R5Zmu0JuZRbZKG72x4bQjMCcsfA5mC3GSSOPbYd-9tzvTgmroGRmc4Tzk8090KCiEu6EjA0YMHeytWJWpxr51M2FUYQhTWJ01pTmNnXLVAG6Ex6AG52uvvmQA=',
290 | 'copyright_stat': 100,
291 | 'cover': 'http://mmbiz.qpic.cn/mmbiz_jpg/icFYWMxnmxHDYgXNjAle7szYLgQmicbaQlb1eVFuwp2vxEu5eNVwYacaHah2N5W8dKAm725vxv5aM6DFlM59Wftg/0?wx_fmt=jpeg',
292 | 'datetime': 1501072594,
293 | 'fileid': 502326199,
294 | 'main': 1,
295 | 'send_id': 1000000306,
296 | 'source_url': '',
297 | 'title': '绿勤行——不说再见',
298 | 'type': '49'
299 | },
300 | {
301 | 'abstract': '当时不杂,过往不恋,志愿不老,我们不散!',
302 | 'author': '',
303 | 'content_url': 'http://mp.weixin.qq.com/s?timestamp=1501143158&src=3&ver=1&signature=B-*tqUrFyO7OqpFeJZwTA7JJtsHpz6BgC8ugyfgpOnyWLtPb85R5Zmu0JuZRbZKG72x4bQjMCcsfA5mC3GSSOGUrM*jg*EP1jU-Dyf2CVqmPnOgBiET2wlitek4FcRbXorAswWHm*1rqODcN52NtfKD-OcRTazQS*t5SnJtu3ZA=',
304 | 'copyright_stat': 100,
305 | 'cover': 'http://mmbiz.qpic.cn/mmbiz_jpg/icFYWMxnmxHCoY44nPUXvkSgpZI1LaEsZfkZvtGaiaNW2icjibCp6qs93xLlr9kXMJEP3z1pmQ6TbRZNicHibGzRwh1w/0?wx_fmt=jpeg',
306 | 'datetime': 1500979158,
307 | 'fileid': 502326196,
308 | 'main': 1,
309 | 'send_id': 1000000305,
310 | 'source_url': '',
311 | 'title': '有始有终 | 2016-2017年度环境保护服务部工作总结',
312 | 'type': '49'
313 | },
314 | ...
315 | ],
316 | 'gzh': {
317 | 'authentication': '南京航空航天大学',
318 | 'headimage': 'http://wx.qlogo.cn/mmhead/Q3auHgzwzM4xV5PgPjK5XoPaaQoxnWJAFicibMvPAnsoybawMBFxua1g/0',
319 | 'introduction': '南航大志愿活动的领跑者,为你提供校内外的志愿资源和精彩消息。',
320 | 'wechat_id': 'nanhangqinggong',
321 | 'wechat_name': '南航青年志愿者'
322 | }
323 | }
324 |
325 | - 数据结构
326 |
327 | .. code:: python
328 |
329 | {
330 | 'gzh': {
331 | 'wechat_name': '', # 名称
332 | 'wechat_id': '', # 微信id
333 | 'introduction': '', # 简介
334 | 'authentication': '', # 认证
335 | 'headimage': '' # 头像
336 | },
337 | 'article': [
338 | {
339 | 'send_id': int, # 群发id,注意不唯一,因为同一次群发多个消息,而群发id一致
340 | 'datetime': int, # 群发datetime 10位时间戳
341 | 'type': '', # 消息类型,均是49(在手机端历史消息页有其他类型,网页端最近10条消息页只有49),表示图文
342 | 'main': int, # 是否是一次群发的第一次消息 1 or 0
343 | 'title': '', # 文章标题
344 | 'abstract': '', # 摘要
345 | 'fileid': int, #
346 | 'content_url': '', # 文章链接
347 | 'source_url': '', # 阅读原文的链接
348 | 'cover': '', # 封面图
349 | 'author': '', # 作者
350 | 'copyright_stat': int, # 文章类型,例如:原创啊
351 | },
352 | ...
353 | ]
354 | }
355 |
356 | 解析 首页热门 页 - get_gzh_article_by_hot
357 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
358 |
359 | .. figure:: https://raw.githubusercontent.com/chyroc/wechatsogou/master/screenshot/get_gzh_article_by_hot.png
360 | :alt: ws_api.get_gzh_article_by_hot(WechatSogouConst.hot_index.food)
361 |
362 | ws_api.get_gzh_article_by_hot(WechatSogouConst.hot_index.food)
363 |
364 | - 使用
365 |
366 | ::
367 |
368 | In [1]: from pprint import pprint
369 | ...: from wechatsogou import WechatSogouAPI, WechatSogouConst
370 | ...:
371 | ...: ws_api = WechatSogouAPI()
372 | ...: gzh_articles = ws_api.get_gzh_article_by_hot(WechatSogouConst.hot_index.food)
373 | ...: for i in gzh_articles:
374 | ...: pprint(i)
375 | ...:
376 | {
377 | 'article': {
378 | 'abstract': '闷热的夏天有什么事情能比吃上凉凉的甜品更惬意的呢?快一起动手做起来吧,简单方便,放冰箱冻一冻,那感觉~橙汁蒸木瓜木瓜1个(300-400克左右),橙子4个,枫糖浆20克(如果家里没有,也可以用蜂蜜、炼乳等代替),椰果适量。做法1.用削皮',
379 | 'main_img': 'http://img01.sogoucdn.com/net/a/04/link?appid=100520033&url=http%3A%2F%2Fmmbiz.qpic.cn%2Fmmbiz_jpg%2Fw9UGwFPia7QTUIadPibgW8OFkqf1ibR40xicKfzofRS0sDpaFp3CG0jkPyQKeXl44TXswztW1SJnic7tmCibjB8rIIGw%2F0%3Fwx_fmt%3Djpeg',
380 | 'open_id': 'oIWsFty9hHVI9F10amtzx5TOWIq8',
381 | 'time': 1501325220,
382 | 'title': '夏日甜品制作方法,不收藏后悔哦!',
383 | 'url': 'http://mp.weixin.qq.com/s?src=3&timestamp=1501328525&ver=1&signature=n9*oX0k4YbNFhNMsOjIekYrsha44lfBSCbG9jicAbGYrWNN8*48NzpcaHdxwUnC12syY5-ZxwcBfiJlMzdbAwWKlo26EW14w2Ax*gjLVlOX-AGXB4443obZ-GK0pw*AFZAGZD8sI4AFBZSZpyeaxN4sS7cpynxdIuw6S2h*--LI='
384 | },
385 | 'gzh': {
386 | 'headimage': 'http://img03.sogoucdn.com/app/a/100520090/oIWsFty9hHVI9F10amtzx5TOWIq8',
387 | 'wechat_name': '甜品烘焙制作坊'
388 | }
389 | }
390 | ...
391 | ...
392 |
393 | - 数据结构
394 |
395 | .. code:: python
396 |
397 | {
398 | 'gzh': {
399 | 'headimage': str, # 公众号头像
400 | 'wechat_name': str, # 公众号名称
401 | },
402 | 'article': {
403 | 'url': str, # 文章临时链接
404 | 'title': str, # 文章标题
405 | 'abstract': str, # 文章摘要
406 | 'time': int, # 推送时间,10位时间戳
407 | 'open_id': str, # open id
408 | 'main_img': str # 封面图片
409 | }
410 | }
411 |
412 | 获取关键字联想词
413 | ~~~~~~~~~~~~~~~~
414 |
415 | - 使用
416 |
417 | ::
418 |
419 | In [1]: import wechatsogou
420 | ...:
421 | ...: ws_api =wechatsogou.WechatSogouAPI()
422 | ...: ws_api.get_sugg('高考')
423 | ...:
424 | Out[1]:
425 | ['高考e通',
426 | '高考专业培训',
427 | '高考地理俱乐部',
428 | '高考志愿填报咨讯',
429 | '高考报考资讯',
430 | '高考教育',
431 | '高考早知道',
432 | '高考服务志愿者',
433 | '高考机构',
434 | '高考福音']
435 |
436 | - 数据结构
437 |
438 | 关键词列表
439 |
440 | .. code:: python
441 |
442 | ['a', 'b', ...]
443 |
444 | --------------
445 |
446 | TODO
447 | ====
448 |
449 | - ☒ [STRIKEOUT:相似文章的公众号获取]
450 | - ☐ 主页热门公众号获取
451 | - ☐ 文章详情页信息
452 | - ☒ [STRIKEOUT:所有类型的解析]
453 | - ☐ 验证码识别
454 | - ☐ 接入爬虫框架
455 | - ☒ 兼容py2
456 |
457 | --------------
458 |
459 | .. |Build Status| image:: https://travis-ci.org/Chyroc/WechatSogou.svg?branch=master
460 | :target: https://github.com/Chyroc/WechatSogou
461 | .. |PyPI version| image:: https://badge.fury.io/py/wechatsogou.svg
462 | :target: https://github.com/Chyroc/WechatSogou
463 | .. |PyPI| image:: https://img.shields.io/pypi/wheel/wechatsogou.svg
464 | :target: https://github.com/Chyroc/WechatSogou
465 | .. |py27,py35,py36| image:: https://img.shields.io/pypi/pyversions/wechatsogou.svg
466 | :target: https://github.com/Chyroc/WechatSogou
467 | .. |PyPI| image:: https://img.shields.io/pypi/l/wechatsogou.svg
468 | :target: https://github.com/Chyroc/WechatSogou
469 |
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: 'wechatsogou'
2 | pages:
3 | - 'API列表': 'README.md'
4 | - '更新日志': 'CHANGELOG.md'
5 | - 'FAQ': 'FAQ.md'
6 | extra_css: ['docs/bootstrap/css']
7 | extra_javascript: ['docs/bootstrap/js']
8 | theme_dir: 'docs/bootstrap'
9 | repo_url: 'https://github.com/Chyroc/WechatSogou'
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | future==0.16.0
2 | lxml==4.6.2
3 | Pillow==8.3.2
4 | requests>=2.20.0
5 | six==1.10.0
6 | Werkzeug==0.15.3
7 | xlrd==1.0.0
8 | bs4==0.0.1
--------------------------------------------------------------------------------
/screenshot/alipay_hongbao.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chyroc/WechatSogou/45731524c1e43925c61a340694774a0d022b895f/screenshot/alipay_hongbao.png
--------------------------------------------------------------------------------
/screenshot/get_gzh_article_by_history.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chyroc/WechatSogou/45731524c1e43925c61a340694774a0d022b895f/screenshot/get_gzh_article_by_history.png
--------------------------------------------------------------------------------
/screenshot/get_gzh_article_by_hot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chyroc/WechatSogou/45731524c1e43925c61a340694774a0d022b895f/screenshot/get_gzh_article_by_hot.png
--------------------------------------------------------------------------------
/screenshot/get_gzh_info.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chyroc/WechatSogou/45731524c1e43925c61a340694774a0d022b895f/screenshot/get_gzh_info.png
--------------------------------------------------------------------------------
/screenshot/get_sugg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chyroc/WechatSogou/45731524c1e43925c61a340694774a0d022b895f/screenshot/get_sugg.png
--------------------------------------------------------------------------------
/screenshot/pay_ali.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chyroc/WechatSogou/45731524c1e43925c61a340694774a0d022b895f/screenshot/pay_ali.jpg
--------------------------------------------------------------------------------
/screenshot/pay_wechat.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chyroc/WechatSogou/45731524c1e43925c61a340694774a0d022b895f/screenshot/pay_wechat.jpg
--------------------------------------------------------------------------------
/screenshot/search_article.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chyroc/WechatSogou/45731524c1e43925c61a340694774a0d022b895f/screenshot/search_article.png
--------------------------------------------------------------------------------
/screenshot/search_gzh.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chyroc/WechatSogou/45731524c1e43925c61a340694774a0d022b895f/screenshot/search_gzh.png
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | universal = 1
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import codecs
2 | import re
3 |
4 | from setuptools import setup
5 |
def _read(path):
    """Return the UTF-8 decoded text of *path*, closing the file afterwards."""
    with codecs.open(path, encoding='utf-8') as f:
        return f.read()


# Long description shown on the package index: README followed by the history.
readme = _read('docs/README.rst')
history = _read('docs/HISTORY.rst')

# The version string is read out of the package's __init__.py rather than
# being repeated here.
version_match = re.search(r'__version__ = "(.*?)"', _read('wechatsogou/__init__.py'))
if version_match is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise RuntimeError('Unable to find __version__ in wechatsogou/__init__.py')
version = version_match.group(1)

setup(
    name='wechatsogou',
    version=version,
    description='Api for wechat mp with sogou',
    long_description=u'\n\n'.join([readme, history]),
    author='Chyroc',
    author_email='chen_yunpeng@foxmail.com',
    url='https://github.com/Chyroc/WechatSogou',
    packages=[
        'wechatsogou',
    ],
    setup_requires=[
        # minimum version to use environment markers
        'setuptools>=20.6.8',
    ],
    install_requires=[
        'future', 'lxml', 'Pillow', 'requests', 'six', 'Werkzeug', 'xlrd', 'bs4'
    ],
    include_package_data=True,
    license='MIT License',
    classifiers=[
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Operating System :: MacOS :: MacOS X',
        'Operating System :: Microsoft :: Windows',
        'Operating System :: POSIX',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Topic :: Software Development :: Libraries :: Python Modules',
        'Programming Language :: Python :: Implementation :: PyPy',
        'Programming Language :: Python :: Implementation :: CPython',
    ],
)
46 |
--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from __future__ import absolute_import, unicode_literals, print_function
4 |
5 | import os
6 |
7 | from wechatsogou.request import WechatSogouRequest
8 | from wechatsogou.structuring import WechatSogouStructuring
9 |
# Module-level request / structuring instances, importable from the ``test``
# package.
ws = WechatSogouRequest()
ws_structuring = WechatSogouStructuring()

# Keyword for which the history lookup is expected to return nothing.
empty_search_result_keyword = 'gggggggggggggggggg'
gaokao_keyword = '高考'

# Resolve the fixture directory relative to this file instead of the current
# working directory: the previous substring check (``'test' in os.getcwd()``)
# picked the wrong directory whenever any parent folder name contained "test"
# or the tests were launched from somewhere other than the repo root.
fake_data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'file')
16 |
--------------------------------------------------------------------------------
/test/fateadm.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import base64
4 | import hashlib
5 | import json
6 | import time
7 |
8 | import requests
9 |
10 |
class FateadmAPI():
    """Small client for the captcha prediction endpoint at pred.fateadm.com."""

    def __init__(self, app_id, app_key, usr_id, usr_key):
        """Store the application and user credentials used to sign requests."""
        self.host = 'http://pred.fateadm.com'
        self.app_id, self.app_key = app_id, app_key
        self.usr_id, self.usr_key = usr_id, usr_key

    def calc_sign(self, usr_id, passwd, timestamp):
        """Return the request signature.

        The signature is ``md5(usr_id + timestamp + md5(timestamp + passwd))``
        with both digests rendered as lowercase hex strings.
        """
        inner_digest = hashlib.md5((timestamp + passwd).encode()).hexdigest()
        return hashlib.md5((usr_id + timestamp + inner_digest).encode()).hexdigest()

    # Recognise a captcha image.
    def predict(self, pred_type, img_data):
        """Post *img_data* for recognition and return the predicted result.

        :param pred_type: value sent as ``predict_type``.
        :param img_data: image bytes; base64-encoded before being sent.
        :return: the ``result`` field of the JSON document stored under
            ``RspData`` in the response.
        :raises Exception: carrying the raw response text when the response
            cannot be decoded into that shape.
        """
        now = str(int(time.time()))

        payload = {
            'user_id': self.usr_id,
            'timestamp': now,
            'sign': self.calc_sign(self.usr_id, self.usr_key, now),
            'predict_type': pred_type,
            'img_data': base64.b64encode(img_data),
        }

        # The application signature is only attached when an app id is set.
        if self.app_id != '':
            app_sign = self.calc_sign(self.app_id, self.app_key, now)
            payload['appid'] = self.app_id
            payload['asign'] = app_sign

        response = requests.post('{}/api/capreg'.format(self.host), payload)
        try:
            return json.loads(response.json()['RspData'])['result']
        except Exception:
            raise Exception(response.text)
52 |
--------------------------------------------------------------------------------
/test/file/article_detail_expired.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
18 |
19 |
20 |
21 |
22 |
69 |
70 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 | 链接已过期
86 |
87 |
88 |
89 |
90 |
91 |
92 |
99 |
100 |
137 |
138 |
139 |
142 |
143 |
144 |
145 |
146 |
147 |
--------------------------------------------------------------------------------
/test/file/bitsea-history.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
19 |
20 |
21 |
22 |
23 |
69 |
70 | 槽边往事
71 |
72 |
73 |
74 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 | 槽边往事
91 |
92 |
微信号: bitsea
93 |
94 |
95 |
96 | -
97 |
98 |
99 |
100 | -
101 |
102 |

和菜头的微信Blog,用于分享各种新鲜资讯
103 |
104 |
105 |
106 |
109 |
110 |
111 |
最近10条群发
112 |
113 |
114 |
115 |
仅显示最近10条群发
116 |
117 |
120 |
123 |
124 |
125 |
126 |
127 |

128 |
微信扫一扫
关注该公众号
129 |
130 |
131 |
132 |
133 |
134 |
135 |
142 |
143 |
161 |
162 |
163 |
174 |
175 |
176 |
177 |
178 |
179 |
--------------------------------------------------------------------------------
/test/file/search-gaokao-gzh-error.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | 搜狗搜索
8 |
9 |
10 |
11 |
71 |
72 |
73 |
77 |
78 |
IP:123.116.247.15
访问时间:2017.07.25 22:36:19
79 |
用户您好,您的访问过于频繁,为确认本次访问为正常用户行为,需要您协助验证。
80 |
81 |
96 |
97 | 提交
98 | 提交后没解决问题?欢迎反馈。
99 |
100 |
101 |
102 |
103 |
104 |
105 |
--------------------------------------------------------------------------------
/test/file/wapindex-wap-0612-wap_8-0.html:
--------------------------------------------------------------------------------
1 | 同事小李最近有点烦躁,原因在于他的爱车新换的轮胎,才一万公里都没跑到就全被磨光了!前两天刚把两条防爆胎换下来,小3k就这么没了!说实话,吃土的小编很想来一句:壕做朋!换个轮胎就给豁出去这样大的一笔支出,这让身为老司机的修车师傅也有点儿郁闷。
全球汽车精选
日前,J.D.Power公布了一份表单,这份表单的可谓是非常重磅的东西的,不过当国内消费者看到的时候,脸色一青,大家都慌了神……因为这个表单列举了在美国市场上新车的质量表现,并一一做了排名,然后国人认为不能接受……其他的不说那么多,先来看看
车早茶
一提起古德伍德,很多人都觉得这个位于英国西萨赛克斯郡(West Sussex)的小镇特别耳熟能详,因为这里每一年都会举办全球规模最大、最负盛名古德伍德速度节,这是在每一个车迷都梦寐以求参加的盛会,因为它不仅仅是一个老爷赛车重回青春的宝地,也
吴佩频道
学车是一件慢工出细活的事,每一个细节都需要重视。尤其是方向盘的掌控问题,对于很多新手来说,一上车基本就蒙圈了,手都不知道放哪,也不知道该怎么控制方向盘。接来下典典就给大家说说关于方向盘的那些事:学员在科目二考试训练时控制方向盘容易出现以下问
驾考宝典
欢 迎 来 到 第 三 十 二 期 《 B B 大 讲 堂 》今 天 B B 哥 带 大 家 了 解奇葩的汽车设计 我们开车时接触最多的东西之一就是挡把。可以说,挡把的造型在很大程度上影响着驾驶的舒适度和操控性。挡把的人体工程学设计的好,那
腾讯汽车
你一定是在很久以前就知道点击上方蓝字◮就可以加关注了 原谅我们的不正经,这次竟然拿途昂来跟途锐对比。尽管途昂要加价,途锐有十多万的优惠,此消彼长后,但两者依然差距悬殊。在途昂推出时,很多人说什么途昂比途锐还大,途锐已经没有存在意义。这种以大
新车评
特斯拉又摊上事了?汽车召回,顾名思义,就是将有问题缺陷的汽车产品由厂家及时召唤回去,进行改造升级。就在6月30日,国家质检总局发布了2017年上半年国内汽车召回的所有信息。中国乘用车市场共38个汽车品牌发布了118次召回公告,累计召回486
非常好车
咱们经常会在新闻上看到货车侧翻压死人的消息,马路上看到货车都躲得远远的!其实不仅仅是货车,下面这7类车,你也得小心。1、渣土车渣土车一般在晚上行驶,经常还在市区闯红灯,加速过路口。很多渣土车不会按照要求加盖,就特别容易导致渣土车的砖头在路上
汽车情报所
今晚,上汽通用别克全新一代君威于上海东方体育中心上市。全新君威包含20T、28T和30H车型,分别搭载1.5T、2.0T与1.8L+电动机三种动力规格。另外,与普通版车型一起上市的,还有全新一代君威GS,其内外设计比君威更加强调运动,同样搭
一猫汽车资讯
历史上曾出现一位富可敌国的朝廷大人,对于该位大人的奢享生活,据史料记载,其每天都会将珍珠磨成的粉用来做早餐,并且对珍珠的品相也颇有要求。后世的慈禧太后,作为同样喜吃珍珠之人,也只是半月才吃上一回,某大人却天天吃,足以见得他何其懂得享受。就连
资深科技控
作者|阿何 微信|阿何有话说最近,微博上有个帖子引起了巨大的反响:外卖小哥送餐到一栋写字楼,可是因为写字楼不让无关人等入内,小哥只能在接近40度的地面等待顾客下来。广州这两天如果不下雨的话,地面温度在35度左右。前两天出外办事,我在露天呆了
郎club
“传统汽车仅仅是驾驶员手、脚和力量的延伸,控制车辆行为的是人。到了L3阶段,让汽车成为驾驶员自己、或者说让机器成为自己,应该是人工智能时代最有意义的事情之一。”在7月22日召开的CCAI2017中国人工智能大会上,中国工程院院士、中国人工智
科技日报
之前歆歆写过很多在高速公路上,车主不遵守交通规则导致的事故,其实除此之外,一些突发情况,也容易让车主反应不及,引发事故。假如你正在高速行车,突然发现前方有一块石头或者窜出小动物,你会怎么去做呢?急打方向、猛踩刹车还是直接冲过去?有这样一位车
汽车使用宝典
广汽讴歌TLX-L量产版要来了!实车也终于亮相,装作神秘,其实它的样子早就不陌生了。年初的上海车展,讴歌已经给我们看到这款TLX-L的Prototype,也就是原型车。外观基本上和这次亮相的实车相似,造型差别就在保险杠下方的格栅部分。不过,
名车报
问世间什么最难不外乎三伏天出门上班大暑,中伏老天爷给今天贴上这两个标签就注定其承载不一样的意义幸好今天是周末终于可以空调WiFi西瓜,葛优同款沙发……但有人就是想不通啊为了早拿到驾照顶着大太阳冒着酷暑去学车结果呢就因为她和男教练在车里做了一
科普中国网
2 |
--------------------------------------------------------------------------------
/test/rk.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 | from hashlib import md5
5 |
6 | import requests
7 |
8 |
class RClient(object):
    """Client for the ruokuai.com ``create.json`` / ``reporterror.json`` endpoints."""

    def __init__(self, username, password, soft_id, soft_key):
        """Prepare the credential parameters and headers sent with every request.

        The password is stored only as the hex MD5 digest of its UTF-8 bytes.
        """
        password_digest = md5(password.encode('utf-8')).hexdigest()
        self.base_params = {
            'username': username,
            'password': password_digest,
            'softid': soft_id,
            'softkey': soft_key,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'Expect': '100-continue',
            'User-Agent': 'ben',
        }

    def _post(self, url, fields, **extra):
        """POST *fields* merged with the credentials to *url*; return the decoded JSON."""
        payload = dict(fields)
        # Credentials are applied last, so they win on any key collision.
        payload.update(self.base_params)
        response = requests.post(url, data=payload, headers=self.headers, **extra)
        return response.json()

    def rk_create(self, im, im_type, timeout=60):
        """Upload image *im* with type id *im_type* and return the JSON response.

        :param im: image content, uploaded as the file ``a.jpg``.
        :param im_type: value sent as ``typeid``.
        :param timeout: value sent as the ``timeout`` form field.
        """
        return self._post(
            'http://api.ruokuai.com/create.json',
            {'typeid': im_type, 'timeout': timeout},
            files={'image': ('a.jpg', im)},
        )

    def rk_report_error(self, im_id):
        """Post *im_id* to the error-report endpoint and return the JSON response."""
        return self._post('http://api.ruokuai.com/reporterror.json', {'id': im_id})
40 |
41 |
def __identify_image_callback(img, code):
    """Recognise captcha *img* through RClient using type id *code*.

    Credentials are read from the ``rk_username``, ``rk_password``, ``rk_id``
    and ``rk_key`` environment variables.

    :return: the ``Result`` field of the recognition response.
    :raises Exception: a generic error for any failure, including missing
        environment variables.
    """
    try:
        credentials = [
            os.environ[name]
            for name in ('rk_username', 'rk_password', 'rk_id', 'rk_key')
        ]
        response = RClient(*credentials).rk_create(img, code)
        print('验证码:', response['Result'])
        return response['Result']
    except Exception:
        raise Exception('识别验证码错误')
54 |
55 |
def identify_image_callback_ruokuai_sogou(img):
    """Recognise *img* with the type id used for sogou captchas (3060)."""
    sogou_type_id = 3060
    return __identify_image_callback(img, sogou_type_id)
58 |
59 |
def identify_image_callback_ruokuai_weixin(img):
    """Recognise *img* with the type id used for weixin captchas (3040)."""
    weixin_type_id = 3040
    return __identify_image_callback(img, weixin_type_id)
62 |
--------------------------------------------------------------------------------
/test/test_api.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from __future__ import absolute_import, unicode_literals, print_function
4 |
5 | import os
6 | import time
7 | import unittest
8 |
9 | from nose.tools import assert_equal, assert_true, assert_in, assert_greater_equal
10 |
11 | from wechatsogou.const import WechatSogouConst
12 | from wechatsogou.api import WechatSogouAPI
13 | from wechatsogou.identify_image import identify_image_callback_by_hand
14 | from test import gaokao_keyword, empty_search_result_keyword
15 | from test.rk import identify_image_callback_ruokuai_sogou, identify_image_callback_ruokuai_weixin
16 |
# API client shared by every test case in this module.
ws_api = WechatSogouAPI(captcha_break_time=3)


class TestAPIReal(unittest.TestCase):
    """Tests that drive the module-level ``ws_api`` client."""

    # todo use chinese
    def setUp(self):
        """Pick the captcha callbacks: ruokuai-backed when ``WechatSogouCI`` is set, manual otherwise."""
        if os.environ.get('WechatSogouCI'):
            sogou_callback = identify_image_callback_ruokuai_sogou
            weixin_callback = identify_image_callback_ruokuai_weixin
        else:
            sogou_callback = weixin_callback = identify_image_callback_by_hand
        self.identify_image_callback_sogou = sogou_callback
        self.identify_image_callback_ruokuai_weixin = weixin_callback

    def _fetch_history(self, keyword):
        """Call ``get_gzh_article_by_history`` for *keyword* with both captcha callbacks."""
        return ws_api.get_gzh_article_by_history(
            keyword,
            identify_image_callback_sogou=self.identify_image_callback_sogou,
            identify_image_callback_weixin=self.identify_image_callback_ruokuai_weixin)

    def _check_history_result(self, result):
        """Assert the structure every successful history lookup must have."""
        assert_in('gzh', result)
        assert_in('article', result)
        assert_in('wx.qlogo.cn', result['gzh']['headimage'])
        assert_greater_equal(len(result['article']), 1)

    def test_search_gzh_real(self):
        found = ws_api.search_gzh(gaokao_keyword, identify_image_callback=self.identify_image_callback_sogou)
        assert_greater_equal(len(found), 8)
        assert_true(any(gaokao_keyword in gzh['wechat_name'] for gzh in found))
        assert_true(any(gzh['open_id'] != '' for gzh in found))

    def test_get_gzh_article_by_history_real(self):
        self._check_history_result(self._fetch_history(gaokao_keyword))

    def test_get_gzh_article_by_hot_real(self):
        hot_items = ws_api.get_gzh_article_by_hot(
            WechatSogouConst.hot_index.gaoxiao,
            identify_image_callback=self.identify_image_callback_sogou)
        for item in hot_items:
            assert_in('gzh', item)
            assert_in('article', item)
            assert_in('http://mp.weixin.qq.com/s?src=', item['article']['url'])
        assert_greater_equal(len(hot_items), 10)

    def test_get_sugg(self):
        suggestions = ws_api.get_sugg(gaokao_keyword)
        assert_equal(10, len(suggestions))

    def test_get_article_content(self):
        history = self._fetch_history(gaokao_keyword)
        self._check_history_result(history)
        # Pause so the test run does not get its IP blocked.
        time.sleep(11)
        first_article_url = history['article'][0]['content_url']

        content = ws_api.get_article_content(
            first_article_url,
            identify_image_callback=self.identify_image_callback_sogou)

        assert_in('content_html', content)
        assert_in('content_img_list', content)

    def test_gzh_by_history_profile_none(self):
        # A keyword with no matching account yields an empty dict.
        assert_equal({}, self._fetch_history(empty_search_result_keyword))
79 |
80 |
81 | if __name__ == '__main__':
82 | unittest.main()
83 |
--------------------------------------------------------------------------------
/test/test_const.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import unittest
4 |
5 | from nose.tools import assert_true, assert_equal
6 |
7 | from wechatsogou.const import WechatSogouConst
8 |
9 |
class TestConst(unittest.TestCase):
    """Pin the values of the constant namespaces on ``WechatSogouConst``."""

    def test_const_hot_index(self):
        """Every ``hot_index`` constant equals its own attribute name."""
        assert_true(hasattr(WechatSogouConst, 'hot_index'))

        hot_index_names = (
            'hot', 'gaoxiao', 'duanzi', 'health', 'sifanghua',
            'gossip', 'life', 'finance', 'car', 'technology',
            'fashion', 'mummy', 'dianzan', 'travel', 'job',
            'food', 'history', 'study', 'constellation', 'sport',
        )
        for attr in hot_index_names:
            assert_equal(getattr(WechatSogouConst.hot_index, attr), attr)

    def test_const_search_article_type(self):
        """Every ``search_article_type`` constant equals its own attribute name."""
        assert_true(hasattr(WechatSogouConst, 'search_article_type'))

        for attr in ('all', 'rich', 'video', 'image'):
            assert_equal(getattr(WechatSogouConst.search_article_type, attr), attr)

    def test_const_search_article_time(self):
        """``search_article_time`` constants are the integers 0 through 5."""
        assert_true(hasattr(WechatSogouConst, 'search_article_time'))

        expected_values = (
            ('anytime', 0),
            ('day', 1),
            ('week', 2),
            ('month', 3),
            ('year', 4),
            ('specific', 5),
        )
        for attr, value in expected_values:
            assert_equal(getattr(WechatSogouConst.search_article_time, attr), value)
52 |
53 |
54 | if __name__ == '__main__':
55 | unittest.main()
56 |
--------------------------------------------------------------------------------
/test/test_request_gen_hot_url.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from __future__ import absolute_import, unicode_literals, print_function
4 |
5 | import unittest
6 | from nose.tools import assert_in, assert_raises
7 |
8 | from wechatsogou.const import WechatSogouConst
9 | from wechatsogou.request import WechatSogouRequest
10 |
11 |
class TestBasicGenSearchArticleURL(unittest.TestCase):
    """Checks ``WechatSogouRequest.gen_hot_url`` for every hot-index name."""

    def test_gen_hot_url(self):
        """Page defaults to the ``0.html`` URL, page 0 is rejected, page N maps to ``N-1.html``."""
        url_prefix = 'http://weixin.sogou.com/wapindex/wap/0612/wap_'
        # Skip dunder attributes; everything else on hot_index is a category name.
        category_names = [name for name in dir(WechatSogouConst.hot_index) if not name.startswith('__')]

        for hot_index in category_names:
            default_url = WechatSogouRequest.gen_hot_url(hot_index)
            assert_in(url_prefix, default_url)
            assert_in('0.html', default_url)

            with assert_raises(AssertionError):
                WechatSogouRequest.gen_hot_url(hot_index, 0)

            for page in range(1, 5):
                paged_url = WechatSogouRequest.gen_hot_url(hot_index, page)
                assert_in(url_prefix, paged_url)
                assert_in('{}.html'.format(page - 1), paged_url)
26 |
27 |
28 | if __name__ == '__main__':
29 | unittest.main()
30 |
--------------------------------------------------------------------------------
/test/test_request_gen_search_article_url.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from __future__ import absolute_import, unicode_literals, print_function
4 |
5 | import unittest
6 | from nose.tools import assert_raises, assert_equal, assert_in, assert_not_in
7 |
8 | from hypothesis import given, strategies as st
9 |
10 | from wechatsogou.const import WechatSogouConst
11 | from wechatsogou.request import WechatSogouRequest
12 | from test import gaokao_keyword
13 |
14 |
class TestBasicGenSearchArticleURL(unittest.TestCase):
    """Checks for ``WechatSogouRequest.gen_search_article_url``."""

    def test_gen_search_article_url_keyword(self):
        """A bare keyword produces the default page-1 article search URL."""
        url = WechatSogouRequest.gen_search_article_url(gaokao_keyword)
        assert_equal('http://weixin.sogou.com/weixin?type=2&page=1&ie=utf8&query=%E9%AB%98%E8%80%83&interation=', url)

    @given(st.integers(min_value=-20000, max_value=20000))
    def test_gen_search_article_url_page(self, page):
        """Positive pages appear in the URL; zero and negative pages raise AssertionError."""
        if page > 0:
            url = WechatSogouRequest.gen_search_article_url(gaokao_keyword, page)
            assert_in('page={}'.format(page), url)
        else:
            with assert_raises(AssertionError):
                WechatSogouRequest.gen_search_article_url(gaokao_keyword, page)

    @given(st.integers(min_value=-50, max_value=50), st.dates(), st.dates())
    def test_gen_search_article_url_timesn(self, timesn, ft, et):
        """Exercise every ``timesn`` value together with the ``ft``/``et`` date bounds."""
        if timesn == 0:
            # No time filter: the empty ft/et pair must not show up, with or without ft.
            url = WechatSogouRequest.gen_search_article_url(gaokao_keyword, timesn=timesn)
            assert_in('type=2&page=1&ie=utf8&query=', url)
            assert_not_in('ft=&et=', url)

            url = WechatSogouRequest.gen_search_article_url(gaokao_keyword, timesn=timesn, ft=ft)
            assert_in('type=2&page=1&ie=utf8&query=', url)
            assert_not_in('ft=&et=', url)
        elif timesn in [1, 2, 3, 4]:
            # Relative ranges: ft/et stay empty even when ft is supplied.
            url = WechatSogouRequest.gen_search_article_url(gaokao_keyword, timesn=timesn)
            assert_in('tsn={}&ft=&et='.format(timesn), url)

            url = WechatSogouRequest.gen_search_article_url(gaokao_keyword, timesn=timesn, ft=ft)
            assert_in('tsn={}&ft=&et='.format(timesn), url)
        elif timesn == 5:
            if ft <= et:
                url = WechatSogouRequest.gen_search_article_url(gaokao_keyword, timesn=timesn, ft=ft, et=et)
                assert_in('tsn=5&ft={}&et={}'.format(ft, et), url)
            else:
                # One context manager per call: inside a shared block the first
                # AssertionError ends the block and the second call never runs.
                with assert_raises(AssertionError):
                    WechatSogouRequest.gen_search_article_url(gaokao_keyword, timesn=timesn)
                with assert_raises(AssertionError):
                    WechatSogouRequest.gen_search_article_url(gaokao_keyword, timesn=timesn, ft=ft, et=et)
        else:
            with assert_raises(AssertionError):
                WechatSogouRequest.gen_search_article_url(gaokao_keyword, timesn=timesn)

    def test_gen_search_article_url_article_type(self):
        """Each article type maps to its own ``interation`` query value."""
        url = WechatSogouRequest.gen_search_article_url(gaokao_keyword,
                                                        article_type=WechatSogouConst.search_article_type.all)
        assert_equal('interation=', url[-11:])

        url = WechatSogouRequest.gen_search_article_url(gaokao_keyword,
                                                        article_type=WechatSogouConst.search_article_type.image)
        assert_in('interation=458754', url)

        url = WechatSogouRequest.gen_search_article_url(gaokao_keyword,
                                                        article_type=WechatSogouConst.search_article_type.video)
        assert_in('interation=458756', url)

        url = WechatSogouRequest.gen_search_article_url(gaokao_keyword,
                                                        article_type=WechatSogouConst.search_article_type.rich)
        assert_in('interation=458754%2C458756', url)
73 |
--------------------------------------------------------------------------------
/test/test_request_gen_search_gzh_url.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from __future__ import absolute_import, unicode_literals, print_function
4 |
5 | import unittest
6 | from nose.tools import assert_raises, assert_equal, assert_in
7 |
8 | from hypothesis import given, strategies as st
9 |
10 | from wechatsogou.request import WechatSogouRequest
11 | from test import gaokao_keyword
12 |
13 |
class TestBasicGenSearchGzhURL(unittest.TestCase):
    """Checks for ``WechatSogouRequest.gen_search_gzh_url``."""

    def test_gen_search_article_url_keyword(self):
        """A bare keyword produces the default page-1 gzh search URL."""
        generated = WechatSogouRequest.gen_search_gzh_url(gaokao_keyword)
        assert_equal('http://weixin.sogou.com/weixin?type=1&page=1&ie=utf8&query=%E9%AB%98%E8%80%83', generated)

    @given(st.integers(min_value=-20000, max_value=20000))
    def test_gen_search_gzh_url_page(self, page):
        """Positive pages appear in the URL; zero and negative pages raise AssertionError."""
        if page <= 0:
            with assert_raises(AssertionError):
                WechatSogouRequest.gen_search_gzh_url(gaokao_keyword, page)
            return

        generated = WechatSogouRequest.gen_search_gzh_url(gaokao_keyword, page)
        assert_in('page={}'.format(page), generated)
27 |
--------------------------------------------------------------------------------
/test/test_structuring.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from __future__ import absolute_import, unicode_literals, print_function
4 |
5 | import datetime
6 | import io
7 | import json
8 | import re
9 | import os
10 | import unittest
11 | from bs4 import BeautifulSoup
12 | from nose.tools import assert_equal, assert_in, assert_true, assert_greater_equal, assert_is_none, assert_not_in
13 |
14 | from test import fake_data_path, gaokao_keyword
15 | from wechatsogou.structuring import WechatSogouStructuring
16 |
# Lift the diff-length limit on the object that assert_equal is bound to, so
# mismatches between the long expected lists below are reported in full.
assert_equal.__self__.maxDiff = None
18 |
19 |
20 | class TestStructuringGzh(unittest.TestCase):
21 | def test_get_gzh_by_search(self):
22 | file_name = os.path.join(fake_data_path, 'search-gaokao-gzh.html')
23 | with io.open(file_name, encoding='utf-8') as f:
24 | search_gaokao_gzh = f.read()
25 |
26 | gzh_list = WechatSogouStructuring.get_gzh_by_search(search_gaokao_gzh)
27 |
28 | names = []
29 | wechat_ids = []
30 | post_perms = []
31 | introductions = []
32 | authentications = []
33 | open_ids = []
34 | assert_equal(10, len(gzh_list))
35 | for gzh in gzh_list:
36 | names.append(gzh['wechat_name'])
37 | wechat_ids.append(gzh['wechat_id'])
38 | post_perms.append(gzh['post_perm'])
39 | introductions.append(gzh['introduction'])
40 | authentications.append(gzh['authentication'])
41 | open_ids.append(gzh['open_id'])
42 |
43 | assert_in('mp.weixin.qq.com/profile?src=3×tamp=', gzh['profile_url'])
44 | assert_in('mp.weixin.qq.com/rr?src=', gzh['qrcode'])
45 | assert_in('img01.sogoucdn.com/', gzh['headimage'])
46 |
47 | assert_equal(['oIWsFt6fv4FH0OBNCyoonNoAp2OM',
48 | 'oIWsFtzwnqHRVPsRY-eEzPo344jQ',
49 | 'oIWsFt_PvlvuqFxQFPbOO26_GQh4',
50 | 'oIWsFtzpOSqygkGiyzj1vVGi2zM4',
51 | 'oIWsFt-lCZYAtfVXRykjgsWZMoJA',
52 | 'oIWsFtzJBFA82fTPb7xU-gkPiyqA',
53 | 'oIWsFt_wgF0dHou131y47qIMcuM0',
54 | 'oIWsFt67sO47_fHfOFQC0rBHhxcY',
55 | 'oIWsFt5Kltl1uXsy8fhj96eIVen8',
56 | 'oIWsFt-2JeqhMEEVQuFw_geRzmbY'],
57 | open_ids)
58 | assert_equal(['山东高考指南',
59 | '高考家长圈',
60 | '河南高考指南',
61 | '高考360',
62 | '云天高考',
63 | '腾讯高考',
64 | '高考快讯',
65 | '专业中高考教育',
66 | '晟嘉高考',
67 | '新东方在线高考辅导'],
68 | names)
69 | assert_equal([u'sdgkzn',
70 | u'sinagkjzq',
71 | u'hngaokao',
72 | u'sctvgaokao360',
73 | u'yuntiangaokao',
74 | u'qq_gaokao',
75 | u'gkkx678',
76 | u'gh_591a43050b5f',
77 | u'tjsjgk',
78 | u'koogaokao'],
79 | wechat_ids)
80 | assert_equal([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1], post_perms)
81 | assert_equal(
82 | ['这里是山东最权威最专业的高考交流平台,由山东商报徐玉芹教育工作室独家运作.本平台与山东商报高考交流群互为依托,为山东考生和家长提供最及时、最准确的高考政策及信息解读,以及一流的填报志愿咨询服务.合作...',
83 | '定期推送高三家长关注的优秀家长经验交流、志愿填报技巧、考生心理辅导方法、考前营养搭配等诸多优质内容;为家长搭建交流互动平台.',
84 | '发布最新高考政策,分享高效学习方法,制定高考应试策略.考点总结政策分析名校介绍高考大纲试卷解析艺考文化课,权威专业的高考资讯一手掌握.',
85 | '360天,360度,用心伴您升学路.四川电视台科教频道每晚7:45播出.',
86 | '高端教育品牌,高分考生的加油站,重点中学的合作伙伴.开阔考生视野、提升认知,在以“研究”为主线的基础上,将考生培养成一个全面型的人才.课程特色,全科协调、单科精讲.云天高考一直深受高分考生和家长的追...',
87 | '腾讯高考频道是中国最具互动性高考门户网站.主要为中国高考生及家长提供有价值的资讯和辅导.内容包括:新闻、评论、视频、各科辅导、志愿填报、家长指南等多方面.',
88 | '高考快讯平台专为考生家长提供最新高考资讯、志愿填报指南、名校排行榜、状元经验、学习方法、高分秘籍等等,我们的努力将伴随着您圆大学梦,欢迎关注阅读!',
89 | '旨在做最专业的中高考教育交流平台,第一时间传递权威的中高考资讯,为孩子的未来保驾护航!',
90 | '关于天津高考,你关注我们一个就够啦!',
91 | '提供高考资讯、高考院校库、在线答疑、政策解读及试题发布.'],
92 | introductions)
93 | assert_equal(['《山东商报》社',
94 | '新浪网技术(中国)有限公司',
95 | '郑州新东方培训学校',
96 | '四川省电化教育馆(四川教育电视台)',
97 | '北京云天共业教育科技有限公司',
98 | '深圳市腾讯计算机系统有限公司',
99 | '广州卓越教育培训中心',
100 | '大连沙河口科苑文化培训学校',
101 | '天津市南开区晟嘉培训中心',
102 | '北京新东方迅程网络科技股份有限公司'],
103 | authentications)
104 |
105 | def test_get_article_by_search(self):
106 | file_name = os.path.join(fake_data_path, 'search-gaokao-article.html')
107 | with io.open(file_name, encoding='utf-8') as f:
108 | search_gaokao_article = f.read()
109 |
110 | article_list = WechatSogouStructuring.get_article_by_search(search_gaokao_article)
111 |
112 | titles = []
113 | abstracts = []
114 | gzh_names = []
115 | isvs = []
116 | assert_equal(10, len(article_list))
117 | for i in article_list:
118 | article = i['article']
119 | titles.append(article['title'])
120 | abstracts.append(article['abstract'])
121 |
122 | assert_in('mp.weixin.qq.com/s?src=3×tamp=', article['url'])
123 | assert_true(isinstance(article['imgs'], list))
124 | assert_greater_equal(len(article['imgs']), 1)
125 |
126 | gzh = i['gzh']
127 |
128 | assert_in('mp.weixin.qq.com/profile?src=3×tamp', gzh['profile_url'])
129 | assert_in('wx.qlogo.cn/mmhead', gzh['headimage'])
130 | gzh_names.append(gzh['wechat_name'])
131 | isvs.append(gzh['isv'])
132 |
133 | # article
134 | assert_equal(['高考有多重要,为什么要重视高考?丨微观点',
135 | '高考:穷人考不好,中产考状元,精英不高考',
136 | '关于高考志愿的一点建议,仅供参考!',
137 | '刚刚,高考“满分”诞生了!(附各省高考分数线)',
138 | '高考学霸榜出炉!义乌最高分是她!排名...',
139 | '【高考】权威发布!2017年我省高考各项日程',
140 | '【高考】黑龙江省2017年普通高考成绩即将发布',
141 | '高考2017 | 全国各省区市高考录取时间大汇总,最新最全!',
142 | '高考志愿这么填,等于多考20分!这位特级教师的志愿填报方法很管用!',
143 | '高考填志愿,如何选专业?学长学姐有话说'],
144 | titles)
145 | assert_equal(['针对这个问题,其实占豪已经谈过,但还是想借高考之后、借这位小战友的留言,结合自己的人生经验,谈谈个人对这件事的看法....',
146 | '#条条大路通罗马,有人就出生在罗马#前几天北京文科高考状元熊轩昂接受澎湃新闻的采访的时候,说了下面这段话. “农村地区的...',
147 | '最近一直有哥迷留言问,填报高考志愿该选什么专业? 讲真,这个问题很难回答.专业选择没有绝对的好坏对错,跟考试成绩、个人兴...',
148 | '高考会有满分的情况吗?还真有!6月22日开始,全国各省的高考成绩陆续发布.22日晚上,成都市青白江区一个小区内人声鼎沸,因...',
149 | '浙江新高考各类别各段分数线及考生成绩于昨日揭晓.考生可凭考生号、密码查询自己的考试成绩!今年的高考成绩,经浙江省教育考...',
150 | '根据我省招生录取工作安排,现将近期有关高考工作日程公布如下:一、高考成绩公布时间6月24日左右省招考院通过黑龙江省招生考...',
151 | '黑龙江省2017年普通高考成绩即将发布 我省今年高考网上评卷工作现已结束,经过成绩核查、成绩校验等多个环节后,我省高考成绩...',
152 | '2017年高考录取工作开始了,各省区市高考录取工作何时进行?为了方便考生和家长及时了解,小编为大家作了最新最全的梳理.(图...',
153 | '各地高考成绩已陆续公布,在本公众号回复“高考查分”即可查询!~长按二维码即可关注本车~自昨天开始,全国各省份陆续公布...',
154 | '导语高考成绩和批次线已经出来了,想必同学们已经开始进入另一重要环节——志愿填报.你是不是在为选专业而纠结痛苦?不怕!...'],
155 | abstracts)
156 |
157 | # gzh
158 | assert_equal(['占豪',
159 | '才华有限青年',
160 | '新闻哥',
161 | '光明网',
162 | '义乌十八腔',
163 | '龙招港',
164 | '龙招港',
165 | '微言教育',
166 | '高考直通车',
167 | '阳光高考信息平台', ],
168 | gzh_names)
169 | assert_in(1, isvs)
170 | assert_in(0, isvs)
171 |
172 | def test_get_gzh_info_by_history(self):
173 | file_name = os.path.join(fake_data_path, 'bitsea-history.html')
174 | with io.open(file_name, encoding='utf-8') as f:
175 | gzh_history = f.read()
176 |
177 | gzh_info = WechatSogouStructuring.get_gzh_info_by_history(gzh_history)
178 |
179 | assert_equal('槽边往事', gzh_info['wechat_name'])
180 | assert_equal('bitsea', gzh_info['wechat_id'])
181 | assert_equal('和菜头的微信Blog,用于分享各种新鲜资讯', gzh_info['authentication'])
182 | assert_equal('http://wx.qlogo.cn/mmhead/Q3auHgzwzM6zmSwQkvHdgXDtnpAyLYjuib8QdW6ibKKGo8zcZVbYxiaUw/0',
183 | gzh_info['headimage'])
184 | assert_equal(' ', gzh_info['introduction'])
185 |
186 | def test_get_article_by_history_json(self):
187 | file_name = os.path.join(fake_data_path, 'bitsea-history.html')
188 | with io.open(file_name, encoding='utf-8') as f:
189 | gzh_history = f.read()
190 |
191 | article_list = WechatSogouStructuring.get_article_by_history_json(gzh_history)
192 | titles = []
193 | urls = []
194 | digests = []
195 | for i in article_list:
196 | assert_equal('和菜头', i['author'])
197 | assert_equal('49', i['type'])
198 | assert_in('mp.weixin.qq.com/s?timestamp=', i['content_url'])
199 | assert_in(i['copyright_stat'], [11, 100])
200 | assert_in('mmbiz.qpic.cn/mmbiz_jpg/', i['cover'])
201 | assert_greater_equal(datetime.datetime.fromtimestamp(i['datetime']), datetime.datetime(2000, 1, 1))
202 |
203 | urls.append(i['content_url'])
204 | titles.append(i['title'])
205 | digests.append(i['abstract'])
206 |
207 | assert_equal(
208 | ['帝都深处好修行',
209 | '如果我有个好一点的初中英文老师',
210 | '【广告】让手机清凉一哈',
211 | '写给各位陛下',
212 | '可能是年度电影的《大护法》',
213 | '怎样决定要不要去相信一个人',
214 | '照亮世界的那个人',
215 | '《冈仁波齐》观后',
216 | '没有什么火候不火候的',
217 | '完美受害人', ],
218 | titles)
219 |
220 | assert_equal([
221 | 'http://mp.weixin.qq.com/s?timestamp=1500903767&src=3&ver=1&signature=X4l0IQ091w0DY2ERU7fD*h0VUwBxeHPOJH-Uk-vAfaPamMl6ij7fqAIHomnXQ2X2*2J94H0pixVjsjEkL0TbILtKInZ4hqPp3-lC1nQZcN9Fd*BGbTQp7WlZyzLvCXy0Z8yFVF*lIDlo75pemv7kW8wov4Hz5-uiVzBT5q*Nwaw=',
222 | 'http://mp.weixin.qq.com/s?timestamp=1500903767&src=3&ver=1&signature=X4l0IQ091w0DY2ERU7fD*h0VUwBxeHPOJH-Uk-vAfaPamMl6ij7fqAIHomnXQ2X2*2J94H0pixVjsjEkL0TbIPsfeXemAw1IR5Pt5J*6JqjpgotoKPL*6eVHbdcbi4JCEfsnhbnsQUTLQWpBZe5UILx8062e6A2L00LyjQArkxU=',
223 | 'http://mp.weixin.qq.com/s?timestamp=1500903767&src=3&ver=1&signature=X4l0IQ091w0DY2ERU7fD*h0VUwBxeHPOJH-Uk-vAfaPamMl6ij7fqAIHomnXQ2X2*2J94H0pixVjsjEkL0TbIOVd*HwElAYiJum8Q6su3tILWksr-4u9WZPSrfT7A6nErJ3f0kW8V1Jv9evurTe5X4pQrjjCZcE6WeYGwDJIH0Q=',
224 | 'http://mp.weixin.qq.com/s?timestamp=1500903767&src=3&ver=1&signature=X4l0IQ091w0DY2ERU7fD*h0VUwBxeHPOJH-Uk-vAfaPamMl6ij7fqAIHomnXQ2X2*2J94H0pixVjsjEkL0TbIBtaRJpx-JbQsm-5X*GWfaS-jBtKyhOmAxio5OIROqwV71OrvtaxYq1oZG-WM9apKbLGDPIBc0sCFUB4WBOagwk=',
225 | 'http://mp.weixin.qq.com/s?timestamp=1500903767&src=3&ver=1&signature=X4l0IQ091w0DY2ERU7fD*h0VUwBxeHPOJH-Uk-vAfaPamMl6ij7fqAIHomnXQ2X2*2J94H0pixVjsjEkL0TbID-eM8BIKq1ef1ajiKO1jz1k0E6xa1ROpt2Eo3Af6OHQGfYIq-WrfEsn3jLwps1V*TXmP6443wUYgrrStzJwKPc=',
226 | 'http://mp.weixin.qq.com/s?timestamp=1500903767&src=3&ver=1&signature=X4l0IQ091w0DY2ERU7fD*h0VUwBxeHPOJH-Uk-vAfaPamMl6ij7fqAIHomnXQ2X2*2J94H0pixVjsjEkL0TbIJenG0s3GyCaMQIK18U3CHsWrrGwuL5Z0X*DSoztV49L-ZPrf39mbml1GBkZnX*gueDdUJBIHgvyFsaVCTePLrI=',
227 | 'http://mp.weixin.qq.com/s?timestamp=1500903767&src=3&ver=1&signature=X4l0IQ091w0DY2ERU7fD*h0VUwBxeHPOJH-Uk-vAfaPamMl6ij7fqAIHomnXQ2X2*2J94H0pixVjsjEkL0TbIE2LQ5dJqrG018DC4M7E5RQ3D4V1p*eBszVaqr2saxG864LssINc8RKcASbkdSDEMiguB9xwuMcJXgGANUpBjtg=',
228 | 'http://mp.weixin.qq.com/s?timestamp=1500903767&src=3&ver=1&signature=X4l0IQ091w0DY2ERU7fD*h0VUwBxeHPOJH-Uk-vAfaPamMl6ij7fqAIHomnXQ2X2*2J94H0pixVjsjEkL0TbINN4P-L*qGaX0SopEwmBNGbOUc*Ad5D8TKEUZOPNduI4uupwRQFL*I4r151vpRYSA92EYzb34uf82WZJMa5-kTU=',
229 | 'http://mp.weixin.qq.com/s?timestamp=1500903767&src=3&ver=1&signature=X4l0IQ091w0DY2ERU7fD*h0VUwBxeHPOJH-Uk-vAfaPamMl6ij7fqAIHomnXQ2X2*2J94H0pixVjsjEkL0TbIEhfSajMgMm4uzkdEhe*6MP8H9YKg1q38xqFlBV3*sJxgwupUV8b1Q2c6OhhBEZgCTyKQvHWnGLDLBH0gvC10zQ=',
230 | 'http://mp.weixin.qq.com/s?timestamp=1500903767&src=3&ver=1&signature=X4l0IQ091w0DY2ERU7fD*h0VUwBxeHPOJH-Uk-vAfaPamMl6ij7fqAIHomnXQ2X2*2J94H0pixVjsjEkL0TbIBK5p9HtcN9dTEMbIU5Vspa3IaeGox55FYOfhNbWBL2Td4hxYt3GKGzRe-TlOPVlDWXuy8CvdD1ap1fmhNt9Cy0=']
231 | , urls)
232 |
233 | assert_equal(['善哉,善哉!',
234 | '说出来今天的人根本不会信,我的初中英文老师李女士在上课的时候打毛衣。',
235 | '奔走相告:过气网红接到新广告!请点击,请阅读,请留言!',
236 | '陛下们!微臣有话要说!',
237 | '对,我就那么说了,不服来咬我啊?',
238 | '在一个现代商业社会里,如何决定要不要去相信一个人?如何把人际关系判定的时间精力节省下来?网络慈父和菜头是这么说的:',
239 | '在一名凡夫身上,我看到了菩萨那样的行止。',
240 | '昨晚看了电影《冈仁波齐》,我不喜欢。',
241 | '如果你是厨艺初学者,忘掉火候,那不是你应该关心的事情。',
242 | '野鸡给自己加戏,观众不说话,并不等于看不明白。', ], digests)
243 |
244 | def test_get_gzh_info_and_article_by_history(self):
245 | file_name = os.path.join(fake_data_path, 'bitsea-history.html')
246 | with io.open(file_name, encoding='utf-8') as f:
247 | gzh_info_and_article_by_history = f.read()
248 |
249 | gzh_article_list = WechatSogouStructuring.get_gzh_info_and_article_by_history(gzh_info_and_article_by_history)
250 | assert_in('gzh', gzh_article_list)
251 | assert_in('article', gzh_article_list)
252 |
253 | def test_get_gzh_article_by_hot(self):
254 | file_name = os.path.join(fake_data_path, 'wapindex-wap-0612-wap_8-0.html')
255 | with io.open(file_name, encoding='utf-8') as f:
256 | gzh_article_by_hot = f.read()
257 |
258 | gzh_articles = WechatSogouStructuring.get_gzh_article_by_hot(gzh_article_by_hot)
259 |
260 | for gzh_article in gzh_articles:
261 | assert_in('gzh', gzh_article)
262 | assert_in('article', gzh_article)
263 | assert_in('http://mp.weixin.qq.com/s?src=', gzh_article['article']['url'])
264 | assert_greater_equal(len(gzh_articles), 10)
265 |
266 | wechat_names = []
267 | headimages = []
268 | titles = []
269 | times = []
270 | for i in gzh_articles:
271 | wechat_names.append(i['gzh']['wechat_name'])
272 | headimages.append(i['gzh']['headimage'])
273 | titles.append(i['article']['title'])
274 | times.append(i['article']['time'])
275 |
276 | assert_equal(
277 | ['全球汽车精选', '车早茶', '吴佩频道', '驾考宝典', '腾讯汽车', '新车评', '非常好车', '汽车情报所',
278 | '一猫汽车资讯', '资深科技控', '郎club', '科技日报', '汽车使用宝典', '名车报', '科普中国网'],
279 | wechat_names)
280 | assert_equal(['http://img03.sogoucdn.com/app/a/100520090/oIWsFt1dGMefD1f8dOg2UCwQUjKs',
281 | 'http://img04.sogoucdn.com/app/a/100520090/oIWsFtwoQX8wX7w6loDevPqLEC_I',
282 | 'http://img03.sogoucdn.com/app/a/100520090/oIWsFt9Hbbtr9VLnfR9i_K5Z8D48',
283 | 'http://img04.sogoucdn.com/app/a/100520090/oIWsFt3txmWu-usvUa6gU0qlyEVo',
284 | 'http://img01.sogoucdn.com/app/a/100520090/oIWsFt8VDujUqNSCfruXtMNfekaw',
285 | 'http://img01.sogoucdn.com/app/a/100520090/oIWsFt9YD5HWLDe5QAkuvh0JWrgw',
286 | 'http://img01.sogoucdn.com/app/a/100520090/oIWsFt_WUnpQ7lZajAstgL8o1lWo',
287 | 'http://img02.sogoucdn.com/app/a/100520090/oIWsFtzUnzWUMz1PMek5zjVlS42U',
288 | 'http://img03.sogoucdn.com/app/a/100520090/oIWsFt2yk491dhhSP940JzLEameY',
289 | 'http://img03.sogoucdn.com/app/a/100520090/oIWsFtzm9UtmgY-SkOTFwQFpGsU8',
290 | 'http://img02.sogoucdn.com/app/a/100520090/oIWsFt7VwiM8GqYcv8DBNb-k5NBQ',
291 | 'http://img03.sogoucdn.com/app/a/100520090/oIWsFt2tjckivF8b0MP_nNTdESkE',
292 | 'http://img01.sogoucdn.com/app/a/100520090/oIWsFtzC2r61_riTCWp5iHX04fmo',
293 | 'http://img02.sogoucdn.com/app/a/100520090/oIWsFt8JIY_-o7DBMxorP19hcF0Q',
294 | 'http://img04.sogoucdn.com/app/a/100520090/oIWsFtyV5sdIXU2uy4m6oVBq77nA'],
295 | headimages)
296 | assert_equal(['不做这个动作,你的轮胎3个月就要换!',
297 | '新车质量最差的十个品牌?国人表示难以接受……',
298 | '带着米其林的指引去看古德伍德|品牌',
299 | '方向盘打法巧记口诀,科目二提分就靠它了!',
300 | '宝马“鸡腿”、奥迪“游艇”,这些奇葩的挡杆你见过几个?',
301 | '你没看错,我们做了期途昂和途锐的对比',
302 | '7成特斯拉被召回,难道是质量不过关?',
303 | '在中国惹不起的7种车,遇到请回避!',
304 | '迈腾摊上大事儿了 全新一代君威17.58万起', '面对这份驾享,朝廷大人都忍不住亲自上阵!',
305 | '外卖小哥被暴晒:底层人士的悲哀,有钱人不会懂',
306 | '自动驾驶还处于“新手”阶段,何时成为“老司机”?院士这样说……',
307 | '高速上碰到石头,是躲还是撞?', '装什么神秘,不就是加长版的讴歌TLX吗!',
308 | '一个动作,车里的人集体中毒!很多人都忽略了'],
309 | titles)
310 | assert_equal(
311 | [1501328135, 1501327941, 1501326826, 1501326716, 1501326675, 1501326455, 1501326222, 1501325595,
312 | 1501325529, 1501325521, 1501325223, 1501324531, 1501324443, 1501324310, 1501323274],
313 | times)
314 |
315 | def test_get_article_by_search_wap(self):
316 | file_name = os.path.join(fake_data_path, 'search-gaokao-article.json')
317 | with io.open(file_name, encoding='utf-8') as f:
318 | wap_json = json.load(f)
319 |
320 | gzh_articles = WechatSogouStructuring.get_article_by_search_wap(gaokao_keyword, wap_json)
321 | assert_equal(10, len(gzh_articles))
322 |
323 | titles = []
324 | abstracts = []
325 | gzh_names = []
326 | isvs = []
327 | open_ids = []
328 | for i in gzh_articles:
329 | assert_in('gzh', i)
330 | assert_in('article', i)
331 |
332 | article = i['article']
333 |
334 | titles.append(article['title'])
335 | abstracts.append(article['abstract'])
336 | assert_in('mp.weixin.qq.com/', article['url'])
337 |
338 | gzh = i['gzh']
339 |
340 | assert_in('mp.weixin.qq.com/profile?src=3×tamp', gzh['profile_url'])
341 | assert_in('wx.qlogo.cn/mmhead', gzh['headimage'])
342 | gzh_names.append(gzh['wechat_name'])
343 | isvs.append(gzh['isv'])
344 | open_ids.append(gzh['open_id'])
345 |
346 | assert_equal(['高考有多重要,为什么要重视高考?丨微观点',
347 | '高考:穷人考不好,中产考状元,精英不高考',
348 | '17个高考落榜者的“逆袭”故事:高考失败,天不会塌',
349 | '刚刚,高考“满分”诞生了!(附各省高考分数线)',
350 | '高考2017 | 全国各省区市高考录取时间大汇总,最新最全!',
351 | '28省公布高考分数线!各省高考状元出炉!',
352 | '高考2017 | 教育部发布高招录取工作通知!六大事项看过来',
353 | '高考录取过程详解',
354 | '高考前互有好感,高考后开始拍拖,还一同被清华录取!学霸早恋...',
355 | '高考复读,你怕了吗?'],
356 | titles)
357 | assert_equal(['针对这个问题,其实占豪已经谈过,但还是想借高考之后、借这位小战友的留言,结合自己的人生经验,谈谈个人对这件事的看法.在占豪看来,现实的社会是分层的,一个一个阶...',
358 | '#条条大路通罗马,有人就出生在罗马#前几天北京文科高考状元熊轩昂接受澎湃新闻的采访的时候,说了下面这段话. “农村地区的孩子越来越难考上好学校,而像我这种父母都...',
359 | '从高考分数出来的那一刻,今年的考生们大概都会大胆猜想自己未来的命运:高分者,一脚踏进名牌高校工作不愁,似乎人生已经平步青云;落榜者,面对落魄的分数整日哀叹,或...',
360 | '高考会有满分的情况吗?还真有!6月22日开始,全国各省的高考成绩陆续发布.22日晚上,成都市青白江区一个小区内人声鼎沸,因为小区里有一位今年参加高考的学生,总分...',
361 | '2017年高考录取工作开始了,各省区市高考录取工作何时进行?为了方便考生和家长及时了解,小编为大家作了最新最全的梳理.(图片可点击放大查看) 北京7月6日,飞行专业...',
362 | '随着阅卷工作的结束,各地开始陆续公布2017年高考录取分数线.目前,已有28个省份公布了高考分数线.青海、新疆、西藏尚未公布.据媒体报道,青海将于6月30日前发布成绩...',
363 | '有关省级教育行政部门、招生考试机构要精心实施减少录取批次改革,完善平行志愿投档录取办法,努力提高考生志愿满足率.上海、浙江要精心组织新高考录取工作,细化完善工...',
364 | '在高考录取过程中,我省和全国各地一样都实行计算机远程网上录取的方式.录取中坚持“学校负责、招办监督”的原则,整个录取过程严格按照录取日程安排,分批次进行录取....',
365 | '但学霸们在这个问题上有自己的选择,今年佛山有一对高分学霸,两人虽早有好感,但均理性选择高考后才开始拍拖,两人一同考上清华,在班上传为佳话.然而,有家长担心孩子...',
366 | '我家孩子高考失利了,只考了326分,刚到本科线,本科没希望了,哎!我家闺女也是文科370分,真愁人,该怎么办呢?让孩子走专科,孩子不甘心,做家长的也不甘心,复习,...']
367 | , abstracts)
368 | assert_equal(['占豪', '才华有限青年', '新闻哥', '光明网', '微言教育', '中国经济网', '阳光高考信息平台', '甘肃教育', '广州日报', '河北高考'], gzh_names)
369 | assert_equal(['0', '1', '1', '1', '1', '1', '1', '1', '1', '0'], isvs)
370 | assert_equal(['oIWsFt8nKJlpLQbQ5H9NMPBjxup8', 'oIWsFt24BFRU0oh5C8cGFo7vAwYk', 'oIWsFt7B8jj2BkEA1WsGkPU40uhU',
371 | 'oIWsFtwaY2ERrY_oAgz5pHTn4aGc', 'oIWsFt5d7GugmQYi0cNC60qYV9c4', 'oIWsFt0B7LsVbUCMpgksNY8tqIno',
372 | 'oIWsFtzrEz_Tydpahalp9daXMg0Y', 'oIWsFt5kk9RnueF3AiUOao2XrP9o', 'oIWsFt7aLTQfT_wmrF4GpT27_xjg',
373 | 'oIWsFt3nYBUhqb4beN3rTBxdUHD8'],
374 | open_ids)
375 |
376 | def test_get_article_detail(self):
377 | file_name = os.path.join(fake_data_path, 'article_detail_backgroud-image.html')
378 | with io.open(file_name, encoding='utf-8') as f:
379 | text = f.read()
380 |
381 | article_detail = WechatSogouStructuring.get_article_detail(text)
382 | assert_equal(len(article_detail['content_img_list']), 29, article_detail)
383 | assert_true('data-wxurl' not in article_detail['content_html'], article_detail['content_html'])
384 | assert_true('qqmusic' not in article_detail['content_html'], article_detail['content_html'])
385 | # 图片有src属性,无data-src属性
386 | content_html = BeautifulSoup(article_detail['content_html'], 'lxml')
387 | imgs = content_html.find_all("img", src=re.compile(r'http'))
388 | assert_equal(len(imgs), 29, imgs)
389 | for img in imgs:
390 | assert_is_none(img.attrs.get('data-src'))
391 |
392 | file_name = os.path.join(fake_data_path, 'article_detail_mpvoice.html')
393 | with io.open(file_name, encoding='utf-8') as f:
394 | text = f.read()
395 |
396 | article_detail = WechatSogouStructuring.get_article_detail(text)
397 | assert_equal(len(article_detail['content_img_list']), 9, article_detail)
398 | assert_true('data-wxurl' not in article_detail['content_html'], article_detail['content_html'])
399 | assert_true('qqmusic' not in article_detail['content_html'], article_detail['content_html'])
400 | assert_true('mpvoice' not in article_detail['content_html'], article_detail['content_html'])
401 |
402 | file_name = os.path.join(fake_data_path, 'article_detail_qqmusic.html')
403 | with io.open(file_name, encoding='utf-8') as f:
404 | text = f.read()
405 |
406 | article_detail = WechatSogouStructuring.get_article_detail(text)
407 | assert_equal(len(article_detail['content_img_list']), 2, article_detail)
408 | assert_true('data-wxurl' not in article_detail['content_html'], article_detail['content_html'])
409 | assert_true('qqmusic' not in article_detail['content_html'], article_detail['content_html'])
410 | assert_true('mpvoice' not in article_detail['content_html'], article_detail['content_html'])
411 |
412 | file_name = os.path.join(fake_data_path, 'article_detail_iframe.html')
413 | with io.open(file_name, encoding='utf-8') as f:
414 | text = f.read()
415 |
416 | article_detail = WechatSogouStructuring.get_article_detail(text)
417 | assert_equal(len(article_detail['content_img_list']), 6, article_detail)
418 | assert_not_in('data-wxurl', article_detail['content_html'], article_detail['content_html'])
419 | assert_not_in('qqmusic', article_detail['content_html'], article_detail['content_html'])
420 | assert_not_in('mpvoice', article_detail['content_html'], article_detail['content_html'])
421 |
422 | # 图片有src属性,无data-src属性
423 | content_html = BeautifulSoup(article_detail['content_html'], 'lxml')
424 | iframes = content_html.find_all("iframe", src=re.compile(r'http'))
425 | assert_equal(len(iframes), 1, iframes)
426 | for iframe in iframes:
427 | assert_is_none(iframe.attrs.get('data-src'))
428 |
429 |
# Run this module's tests when it is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
432 |
--------------------------------------------------------------------------------
/test/test_tools.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import unittest
4 |
5 | from nose.tools import assert_raises, assert_equal
6 | from lxml import etree
7 |
8 | from wechatsogou.tools import list_or_empty, get_elem_text, replace_html, str_to_dict, replace_space, get_url_param
9 |
10 |
class TestTools(unittest.TestCase):
    """Unit tests for the helper functions imported from ``wechatsogou.tools``."""

    def test_list_or_empty(self):
        """Non-list input is rejected; lists give their first item or an empty default."""
        with assert_raises(AssertionError):
            list_or_empty('test for fun')

        assert_equal(list_or_empty(['1', '2'], int), 1)
        assert_equal(list_or_empty(['1', '2']), '1')
        assert_equal(list_or_empty([], int), 0)
        assert_equal(list_or_empty([], str), '')
        assert_equal(list_or_empty([], list), [])

    def test_get_elem_text(self):
        """Text of nested elements is concatenated without surrounding whitespace."""
        # NOTE(review): the markup was lost from the source text; this nested-div
        # form is reconstructed from the expected '111222' - confirm.
        html = '''
        <div>
            <div>111</div>
            <div>222</div>
        </div>
        '''
        elem = etree.HTML(html)
        assert_equal(get_elem_text(elem), '111222')

    def test_replace_html(self):
        """Entities are decoded in plain strings, list items, and dict keys/values."""
        # NOTE(review): the entity literals were decoded in the source text and
        # are reconstructed from the expected outputs - confirm against replace_html.
        html = '''&#39;&quot;&amp;&yen;amp;&lt;&gt;&nbsp;\\'''
        assert_equal(replace_html(html), '\'"&¥<> ')

        html = ['&#39;', '&quot;', '&amp;', '&yen;', 'amp;', '&lt;', '&gt;', '&nbsp;', '\\']
        assert_equal(replace_html(html), ['\'', '"', '&', '¥', '', '<', '>', ' ', ''])

        html = {'&#39;': '&quot;'}
        assert_equal(replace_html(html), {'\'': '"'})

    def test_str_to_dict(self):
        """A dict literal written as a string is turned into a dict."""
        string = "{'a':'a'}"
        assert_equal(str_to_dict(string), {'a': 'a'})

    def test_replace_space(self):
        """Spaces inside the string are removed."""
        string = 'ss ss'
        assert_equal(replace_space(string), 'ssss')

    def test_get_url_param(self):
        """Repeated query parameters are collected into lists, in order."""
        url = 'http://example.com?a=1&b=2&a=3'
        assert_equal(get_url_param(url), {'a': ['1', '3'], 'b': ['2']})
53 |
54 |
# Run this module's tests when it is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
57 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py27,py35,py36
3 |
4 | [testenv]
5 | passenv = *
6 | deps=
7 | setuptools==34.3.1
8 | requests
9 | lxml
10 | future
11 | Pillow
12 | Werkzeug
13 | nose
14 | httpretty
15 | hypothesis
16 | bs4
17 | commands =
18 | #python setup.py test
19 | python -m nose -vs
20 |
--------------------------------------------------------------------------------
/wechatsogou/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # __ __ _ _ ____
4 | # \ \ / /__ ___| |__ __ _| |_/ ___| ___ __ _ ___ _ _
5 | # \ \ /\ / / _ \/ __| '_ \ / _` | __\___ \ / _ \ / _` |/ _ \| | | |
6 | # \ V V / __/ (__| | | | (_| | |_ ___) | (_) | (_| | (_) | |_| |
7 | # \_/\_/ \___|\___|_| |_|\__,_|\__|____/ \___/ \__, |\___/ \__,_|
8 | # |___/
9 |
10 | """
11 | WechatSogou Crawler Library
12 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
13 |
14 | """
15 |
16 | from wechatsogou.api import WechatSogouAPI
17 | from wechatsogou.const import WechatSogouConst
18 | from wechatsogou.request import WechatSogouRequest
19 | from wechatsogou.structuring import WechatSogouStructuring
20 | from wechatsogou.exceptions import WechatSogouException, WechatSogouVcodeOcrException, WechatSogouRequestsException
21 |
# Names exported by ``from wechatsogou import *``.
__all__ = [
    'WechatSogouConst',

    'WechatSogouAPI',
    'WechatSogouRequest',
    'WechatSogouStructuring',

    'WechatSogouException',
    'WechatSogouVcodeOcrException',
    'WechatSogouRequestsException']

# Package metadata.
__title__ = 'wechatsogou'
__version__ = "4.5.4"
__author__ = 'Chyroc'

# The bare string below is an expression statement with no runtime effect; it
# only records reference links on type hinting and docstring style.
"""doc string

https://www.jetbrains.com/help/pycharm/type-hinting-in-pycharm.html
https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt
"""
42 |
--------------------------------------------------------------------------------
/wechatsogou/api.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from __future__ import absolute_import, print_function, unicode_literals
4 |
5 | import json
6 | import math
7 | import random
8 | import re
9 | import time
10 |
11 | import requests
12 |
13 | from wechatsogou.const import agents, WechatSogouConst
14 | from wechatsogou.exceptions import WechatSogouException, WechatSogouRequestsException, WechatSogouVcodeOcrException
15 | from wechatsogou.five import must_str, quote
16 | from wechatsogou.identify_image import (identify_image_callback_by_hand, unlock_sogou_callback_example, unlock_weixin_callback_example, ws_cache)
17 | from wechatsogou.request import WechatSogouRequest
18 | from wechatsogou.structuring import WechatSogouStructuring
19 | from wechatsogou.tools import may_int
20 |
21 |
22 | class WechatSogouAPI(object):
23 | def __init__(self, captcha_break_time=1, headers=None, **kwargs):
24 | """初始化参数
25 |
26 | Parameters
27 | ----------
28 | captcha_break_time : int
29 | 验证码输入错误重试次数
30 | proxies : dict
31 | 代理
32 | timeout : float
33 | 超时时间
34 | """
35 | assert isinstance(captcha_break_time, int) and 0 < captcha_break_time < 20
36 |
37 | self.captcha_break_times = captcha_break_time
38 | self.requests_kwargs = kwargs
39 | self.headers = headers
40 | if self.headers:
41 | self.headers['User-Agent'] = random.choice(agents)
42 | else:
43 | self.headers = {'User-Agent': random.choice(agents)}
44 |
45 | def __set_cookie(self, suv=None, snuid=None, referer=None):
46 | suv = ws_cache.get('suv') if suv is None else suv
47 | snuid = ws_cache.get('snuid') if snuid is None else snuid
48 | _headers = {'Cookie': 'SUV={};SNUID={};'.format(suv, snuid)}
49 | if referer is not None:
50 | _headers['Referer'] = referer
51 | return _headers
52 |
53 | def __set_cache(self, suv, snuid):
54 | ws_cache.set('suv', suv)
55 | ws_cache.set('snuid', snuid)
56 |
57 | def __get(self, url, session, headers):
58 | h = {}
59 | if headers:
60 | for k, v in headers.items():
61 | h[k] = v
62 | if self.headers:
63 | for k, v in self.headers.items():
64 | h[k] = v
65 | resp = session.get(url, headers=h, **self.requests_kwargs)
66 |
67 | if not resp.ok:
68 | raise WechatSogouRequestsException('WechatSogouAPI get error', resp)
69 |
70 | return resp
71 |
72 | def __unlock_sogou(self, url, resp, session, unlock_callback=None, identify_image_callback=None):
73 | if unlock_callback is None:
74 | unlock_callback = unlock_sogou_callback_example
75 | millis = int(round(time.time() * 1000))
76 | r_captcha = session.get('http://weixin.sogou.com/antispider/util/seccode.php?tc={}'.format(millis), headers={
77 | 'Referer': url,
78 | })
79 | if not r_captcha.ok:
80 | raise WechatSogouRequestsException('WechatSogouAPI get img', r_captcha)
81 |
82 | r_unlock = unlock_callback(url, session, resp, r_captcha.content, identify_image_callback)
83 |
84 | if r_unlock['code'] != 0:
85 | raise WechatSogouVcodeOcrException(
86 | '[WechatSogouAPI identify image] code: {code}, msg: {msg}'.format(code=r_unlock.get('code'),
87 | msg=r_unlock.get('msg')))
88 | else:
89 | self.__set_cache(session.cookies.get('SUID'), r_unlock['id'])
90 |
91 | def __unlock_wechat(self, url, resp, session, unlock_callback=None, identify_image_callback=None):
92 | if unlock_callback is None:
93 | unlock_callback = unlock_weixin_callback_example
94 |
95 | r_captcha = session.get('https://mp.weixin.qq.com/mp/verifycode?cert={}'.format(time.time() * 1000))
96 | if not r_captcha.ok:
97 | raise WechatSogouRequestsException('WechatSogouAPI unlock_history get img', resp)
98 |
99 | r_unlock = unlock_callback(url, session, resp, r_captcha.content, identify_image_callback)
100 |
101 | if r_unlock['ret'] != 0:
102 | raise WechatSogouVcodeOcrException(
103 | '[WechatSogouAPI identify image] code: {ret}, msg: {errmsg}, cookie_count: {cookie_count}'.format(
104 | ret=r_unlock.get('ret'), errmsg=r_unlock.get('errmsg'), cookie_count=r_unlock.get('cookie_count')))
105 |
def __get_by_unlock(self, url, referer=None, unlock_platform=None, unlock_callback=None, identify_image_callback=None, session=None):
    """GET *url*, running the captcha-unlock flow when the answer looks blocked.

    A response counts as blocked when its final URL contains ``antispider``
    or its body contains the captcha prompt text.  In that case
    ``unlock_platform`` is called up to ``self.captcha_break_times`` times
    and the URL is fetched again.

    Parameters
    ----------
    url : str or unicode
        Address to fetch.
    referer : str or unicode, optional
        Passed to ``self.__set_cookie`` when building the request headers.
    unlock_platform : callable, optional
        Platform-specific unlock routine, called with the keyword arguments
        ``url``, ``resp``, ``session``, ``unlock_callback`` and
        ``identify_image_callback``.
    unlock_callback : callable, optional
        Forwarded to ``unlock_platform``.
    identify_image_callback : callable, optional
        Forwarded to ``unlock_platform``; defaults to
        ``identify_image_callback_by_hand``.
    session : requests.sessions.Session, optional
        Session to reuse; a new one is created when omitted.

    Returns
    -------
    requests.models.Response
        The final response, with ``encoding`` set to ``utf-8``.

    Raises
    ------
    WechatSogouVcodeOcrException
        The last unlock attempt failed.
    """
    assert unlock_platform is None or callable(unlock_platform)

    if identify_image_callback is None:
        identify_image_callback = identify_image_callback_by_hand
    assert unlock_callback is None or callable(unlock_callback)
    assert callable(identify_image_callback)

    session = session or requests.session()
    resp = self.__get(url, session, headers=self.__set_cookie(referer=referer))
    resp.encoding = 'utf-8'

    blocked = 'antispider' in resp.url or '请输入验证码' in resp.text
    if not blocked:
        return resp

    max_attempts = self.captcha_break_times
    for attempt in range(max_attempts):
        try:
            unlock_platform(url=url, resp=resp, session=session, unlock_callback=unlock_callback,
                            identify_image_callback=identify_image_callback)
        except WechatSogouVcodeOcrException as err:
            # Failed attempts are ignored until the final one, which is re-raised.
            if attempt == max_attempts - 1:
                raise WechatSogouVcodeOcrException(err)
        else:
            break

    if '请输入验证码' in resp.text:
        # The body itself carried the captcha prompt: refetch with the bare session.
        resp = session.get(url)
    else:
        headers = self.__set_cookie(referer=referer)
        headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; WOW64)'
        resp = self.__get(url, session, headers)
    resp.encoding = 'utf-8'

    return resp
137 |
138 | def __hosting_wechat_img(self, content_info, hosting_callback):
139 | """将微信明细中图片托管到云端,同时将html页面中的对应图片替换
140 |
141 | Parameters
142 | ----------
143 | content_info : dict 微信文章明细字典
144 | {
145 | 'content_img_list': [], # 从微信文章解析出的原始图片列表
146 | 'content_html': '', # 从微信文章解析出文章的内容
147 | }
148 | hosting_callback : callable
149 | 托管回调函数,传入单个图片链接,返回托管后的图片链接
150 |
151 | Returns
152 | -------
153 | dict
154 | {
155 | 'content_img_list': '', # 托管后的图片列表
156 | 'content_html': '', # 图片链接为托管后的图片链接内容
157 | }
158 | """
159 | assert callable(hosting_callback)
160 |
161 | content_img_list = content_info.pop("content_img_list")
162 | content_html = content_info.pop("content_html")
163 | for idx, img_url in enumerate(content_img_list):
164 | hosting_img_url = hosting_callback(img_url)
165 | if not hosting_img_url:
166 | # todo 定义标准异常
167 | raise Exception()
168 | content_img_list[idx] = hosting_img_url
169 | content_html = content_html.replace(img_url, hosting_img_url)
170 |
171 | return dict(content_img_list=content_img_list, content_html=content_html)
172 |
def __format_url(self, url, referer, text, unlock_callback=None, identify_image_callback=None, session=None):
    """Resolve a Sogou ``/link?url=`` redirect into the target URL.

    The link gets ``k`` and ``h`` query parameters appended, the resulting
    page is fetched, and the target URL is rebuilt from the
    ``var url = '...'`` / ``url += '...'`` JavaScript fragments in that page.

    Parameters
    ----------
    url : str or unicode
        Link to resolve; a leading ``/link?url=`` is made absolute.
    referer : str or unicode
        Referer used when fetching the redirect page.
    text : str or unicode
        HTML of the page the link came from; searched for the
        ``href.substr(a+N+parseInt("M")+b,1)`` offsets.
    unlock_callback, identify_image_callback : callable, optional
        Forwarded to ``self.__get_by_unlock``.
    session : requests.sessions.Session, optional
        Forwarded to ``self.__get_by_unlock``.

    Returns
    -------
    str or unicode
        Concatenated URL fragments with every ``@`` removed; empty when the
        redirect page contains no fragment.
    """
    def _parse_url(url, pads):
        # b is a random number in 1..100; h is the character of the link found
        # at offset (pads + position of "url=" + b).
        b = math.floor(random.random() * 100) + 1
        a = url.find("url=")
        if a != -1 and url.find("&k=") == -1:
            position = sum(int(must_str(part)) for part in list(pads) + [a, b])
            a = url[position]
        return '{}&k={}&h={}'.format(url, may_int(b), may_int(a))

    if url.startswith('/link?url='):
        url = 'https://weixin.sogou.com{}'.format(url)

    pad_matches = re.findall(r'href\.substr\(a\+(\d+)\+parseInt\("(\d+)"\)\+b,1\)', text)
    signed_url = _parse_url(url, pad_matches[0] if pad_matches else [])
    resp = self.__get_by_unlock(signed_url,
                                referer=referer,
                                unlock_platform=self.__unlock_sogou,
                                unlock_callback=unlock_callback,
                                identify_image_callback=identify_image_callback,
                                session=session)
    page = resp.text

    # First ``var url = '...'`` fragment (if any) followed by every ``url += '...'`` fragment.
    head = re.findall(r'var url = \'(.*?)\';', page)
    tail = re.findall(r'url \+= \'(.*?)\';', page)
    return ''.join(head[:1] + tail).replace('@', '')
207 |
def get_gzh_info(self, wecgat_id_or_name, unlock_callback=None, identify_image_callback=None, decode_url=True):
    """Return the first official-account search hit for a WeChat id or name.

    Parameters
    ----------
    wecgat_id_or_name : str or unicode
        wechat_id or wechat_name to search for.
    unlock_callback : callable
        Handles a captcha page; forwarded to ``search_gzh``.
    identify_image_callback : callable
        Turns captcha image bytes into text; forwarded to ``search_gzh``.
    decode_url : bool
        Forwarded to ``search_gzh``.

    Returns
    -------
    dict or None
        The first item yielded by ``search_gzh`` for page 1, or None when the
        search yields nothing.  ``search_gzh`` documents the item as:
        {
            'open_id': '',         # unique account id
            'profile_url': '',     # link to the page of the 10 latest posts
            'headimage': '',       # avatar
            'wechat_name': '',     # display name
            'wechat_id': '',       # WeChat id
            'post_perm': '',       # posts sent in the last month
            'qrcode': '',          # QR code
            'introduction': '',    # introduction
            'authentication': ''   # verification
        }
    """
    hits = self.search_gzh(wecgat_id_or_name, 1, unlock_callback, identify_image_callback, decode_url)
    return next(hits, None)
242 |
def search_gzh(self, keyword, page=1, unlock_callback=None, identify_image_callback=None, decode_url=True):
    """Search official accounts and yield one dict per hit.

    When a captcha shows up the caller can either supply ``unlock_callback``
    to handle the whole unlock flow, or supply only
    ``identify_image_callback`` (captcha bytes in, text out) and let the
    package do the rest.  Both are forwarded to the unlock routine; a custom
    ``unlock_callback`` receives ``identify_image_callback`` as an argument
    and decides itself whether to use it.

    Parameters
    ----------
    keyword : str or unicode
        Search text.
    page : int, optional
        Page number, the default is 1.
    unlock_callback : callable
        Handles a captcha page, see unlock_callback_example.
    identify_image_callback : callable
        Captcha image bytes in, text out, see identify_image_callback_example.
    decode_url : bool
        Whether ``profile_url`` is resolved through ``self.__format_url``.

    Yields
    ------
    dict
        {
            'open_id': '',         # unique account id
            'profile_url': '',     # link to the page of the 10 latest posts
            'headimage': '',       # avatar
            'wechat_name': '',     # display name
            'wechat_id': '',       # WeChat id
            'post_perm': '',       # posts sent in the last month
            'qrcode': '',          # QR code
            'introduction': '',    # introduction
            'authentication': ''   # verification
        }

    Raises
    ------
    WechatSogouRequestsException
        requests error
    """
    url = WechatSogouRequest.gen_search_gzh_url(keyword, page)
    session = requests.session()
    # The same captcha handlers and session serve the search and every link resolution.
    shared = dict(unlock_callback=unlock_callback,
                  identify_image_callback=identify_image_callback,
                  session=session)
    resp = self.__get_by_unlock(url, unlock_platform=self.__unlock_sogou, **shared)

    for account in WechatSogouStructuring.get_gzh_by_search(resp.text):
        if decode_url:
            account['profile_url'] = self.__format_url(account['profile_url'], url, resp.text, **shared)
        yield account
297 |
def search_article(self, keyword, page=1, timesn=WechatSogouConst.search_article_time.anytime,
                   article_type=WechatSogouConst.search_article_type.all, ft=None, et=None,
                   unlock_callback=None,
                   identify_image_callback=None,
                   decode_url=True):
    """Search articles and yield one dict per hit.

    When a captcha shows up the caller can either supply ``unlock_callback``
    to handle the whole unlock flow, or supply only
    ``identify_image_callback`` (captcha bytes in, text out) and let the
    package do the rest.  Both are forwarded to the unlock routine; a custom
    ``unlock_callback`` receives ``identify_image_callback`` as an argument
    and decides itself whether to use it.

    Parameters
    ----------
    keyword : str or unicode
        Search text.
    page : int, optional
        Page number, the default is 1.
    timesn : WechatSogouConst.search_article_time
        Time filter: anytime / day / week / month / year / specific,
        the default is anytime.
    article_type : WechatSogouConst.search_article_type
        Content filter: image / video / rich (image and video) / all.
    ft, et : datetime.date or None
        Start and end date, used when timesn is specific,
        e.g. 2017-07-01 and 2017-07-15.
    unlock_callback : callable
        Handles a captcha page, see unlock_callback_example.
    identify_image_callback : callable
        Captcha image bytes in, text out, see identify_image_callback_example.
    decode_url : bool
        Whether the article url and the account profile_url are resolved
        through ``self.__format_url``.

    Yields
    ------
    dict
        {
            'article': {
                'title': '',        # article title
                'url': '',          # article link
                'imgs': '',         # list of article images
                'abstract': '',     # article summary
                'time': ''          # publish time
            },
            'gzh': {
                'profile_url': '',  # link to the account's 10 latest posts
                'headimage': '',    # avatar
                'wechat_name': '',  # display name
                'isv': '',          # whether the account is verified
            }
        }

    Raises
    ------
    WechatSogouRequestsException
        requests error
    """
    url = WechatSogouRequest.gen_search_article_url(keyword, page, timesn, article_type, ft, et)
    session = requests.session()
    # The same captcha handlers and session serve the search and every link resolution.
    shared = dict(unlock_callback=unlock_callback,
                  identify_image_callback=identify_image_callback,
                  session=session)
    resp = self.__get_by_unlock(url, WechatSogouRequest.gen_search_article_url(keyword),
                                unlock_platform=self.__unlock_sogou, **shared)

    for hit in WechatSogouStructuring.get_article_by_search(resp.text):
        if decode_url:
            # Resolve the article link first, then the account profile link.
            for section, field in (('article', 'url'), ('gzh', 'profile_url')):
                hit[section][field] = self.__format_url(hit[section][field], url, resp.text, **shared)
        yield hit
370 |
def get_gzh_article_by_history(self, keyword=None, url=None,
                               unlock_callback_sogou=None,
                               identify_image_callback_sogou=None,
                               unlock_callback_weixin=None,
                               identify_image_callback_weixin=None):
    """Extract account info and the article list from an account's recent-posts page.

    When a captcha shows up the caller can either supply an ``unlock_callback``
    to handle the whole unlock flow, or supply only an
    ``identify_image_callback`` (captcha bytes in, text out) and let the
    package do the rest.

    Parameters
    ----------
    keyword : str or unicode
        Account id or name.
    url : str or unicode
        URL of the recent-posts page; when omitted the account is searched
        first (via ``get_gzh_info``) to obtain it.
    unlock_callback_sogou : callable
        Handles a captcha during the search step, see unlock_callback_example.
    identify_image_callback_sogou : callable
        Captcha bytes in, text out, for the search step,
        see identify_image_callback_example.
    unlock_callback_weixin : callable
        Handles a captcha on the history page, see unlock_callback_example.
    identify_image_callback_weixin : callable
        Captcha bytes in, text out, for the history page,
        see identify_image_callback_example.

    Returns
    -------
    dict
        Empty when the search finds no account, otherwise:
        {
            'gzh': {
                'wechat_name': '',     # display name
                'wechat_id': '',       # WeChat id
                'introduction': '',    # description
                'authentication': '',  # verification
                'headimage': ''        # avatar
            },
            'article': [
                {
                    'send_id': '',         # mass-send id; not unique, all messages of one send share it
                    'datetime': '',        # send datetime
                    'type': '',            # message type; always 49 (rich media)
                    'main': 0,             # whether this is the first message of its send
                    'title': '',           # article title
                    'abstract': '',        # summary
                    'fileid': '',          #
                    'content_url': '',     # article link
                    'source_url': '',      # "read the original" link
                    'cover': '',           # cover image
                    'author': '',          # author
                    'copyright_stat': '',  # article kind, e.g. original
                },
                ...
            ]
        }

    Raises
    ------
    WechatSogouRequestsException
        requests error
    WechatSogouException
        The account found by the search has no ``profile_url``.
    """
    if url is None:
        gzh_info = self.get_gzh_info(keyword, unlock_callback_sogou, identify_image_callback_sogou)
        if gzh_info is None:
            return {}
        if 'profile_url' not in gzh_info:
            # A package exception with a message instead of a bare Exception();
            # it is still an Exception subclass, so existing handlers keep working.
            raise WechatSogouException(
                'get_gzh_article_by_history: no profile_url found for [{}]'.format(keyword))
        url = gzh_info['profile_url']

    resp = self.__get_by_unlock(url, WechatSogouRequest.gen_search_article_url(keyword),
                                unlock_platform=self.__unlock_wechat,
                                unlock_callback=unlock_callback_weixin,
                                identify_image_callback=identify_image_callback_weixin)

    return WechatSogouStructuring.get_gzh_info_and_article_by_history(resp.text)
449 |
def get_gzh_article_by_hot(self, hot_index, page=1, unlock_callback=None, identify_image_callback=None):
    """Fetch one page of hot articles from the home page for a category.

    Parameters
    ----------
    hot_index : WechatSogouConst.hot_index
        Category constant: WechatSogouConst.hot_index.xxx
    page : int
        Page number (positive).
    unlock_callback : callable
        Forwarded to the captcha-unlock flow.
    identify_image_callback : callable
        Forwarded to the captcha-unlock flow.

    Returns
    -------
    list[dict]
        As documented for WechatSogouStructuring.get_gzh_article_by_hot:
        {
            'gzh': {
                'headimage': str,    # account avatar
                'wechat_name': str,  # account name
            },
            'article': {
                'url': str,          # temporary article link
                'title': str,        # article title
                'abstract': str,     # article summary
                'time': int,         # publish time, 10-digit timestamp
                'open_id': str,      # open id
                'main_img': str      # cover image
            }
        }
    """

    assert hasattr(WechatSogouConst.hot_index, hot_index)
    assert isinstance(page, int) and page > 0

    hot_page = self.__get_by_unlock(WechatSogouRequest.gen_hot_url(hot_index, page),
                                    unlock_platform=self.__unlock_sogou,
                                    unlock_callback=unlock_callback,
                                    identify_image_callback=identify_image_callback)
    hot_page.encoding = 'utf-8'

    return WechatSogouStructuring.get_gzh_article_by_hot(hot_page.text)
490 |
def get_article_content(self, url, del_qqmusic=True, del_mpvoice=True, unlock_callback=None,
                        identify_image_callback=None, hosting_callback=None, raw=False):
    """Fetch the body of an article, so it survives the temporary link expiring.

    Parameters
    ----------
    url : str or unicode
        Article link (original or temporary).
    raw : bool
        True: return the fetched HTML untouched.
        False: return the structured result.
    del_qqmusic: bool
        Passed to WechatSogouStructuring.get_article_detail; True removes
        embedded QQ Music items, False keeps them.
    del_mpvoice: bool
        Passed to WechatSogouStructuring.get_article_detail as ``del_voice``;
        True removes embedded voice messages, False keeps them.
    unlock_callback : callable
        Handles a captcha on the article page, see unlock_callback_example.
    identify_image_callback : callable
        Captcha bytes in, text out, see identify_image_callback_example.
    hosting_callback: callable
        Image re-hosting hook: takes an original image URL and returns the
        hosted URL.

    Returns
    -------
    str or dict
        The raw HTML when ``raw`` is true; otherwise a dict with
        ``content_html`` (article body) and ``content_img_list`` (images in
        the article).

    Raises
    ------
    WechatSogouRequestsException
    WechatSogouException
        The page contains the "link expired" marker text.
    """

    resp = self.__get_by_unlock(url,
                                unlock_platform=self.__unlock_wechat,
                                unlock_callback=unlock_callback,
                                identify_image_callback=identify_image_callback)
    resp.encoding = 'utf-8'
    html = resp.text

    if '链接已过期' in html:
        raise WechatSogouException('get_article_content 链接 [{}] 已过期'.format(url))
    if raw:
        return html

    detail = WechatSogouStructuring.get_article_detail(html, del_qqmusic=del_qqmusic,
                                                       del_voice=del_mpvoice)
    if not hosting_callback:
        return detail
    return self.__hosting_wechat_img(detail, hosting_callback)
542 |
def get_sugg(self, keyword):
    """Fetch Sogou's search-keyword suggestions for *keyword*.

    Parameters
    ----------
    keyword : str or unicode
        Keyword to get suggestions for.

    Returns
    -------
    list[str]
        Suggested keywords; empty when the response does not contain the
        expected ``["<keyword>",[...],["`` structure.

    Raises
    ------
    WechatSogouRequestsException
        The suggestion request was not successful.
    """
    url = 'http://w.sugg.sogou.com/sugg/ajaj_json.jsp?key={}&type=wxpub&pr=web'.format(
        quote(keyword.encode('utf-8')))
    r = requests.get(url)
    if not r.ok:
        raise WechatSogouRequestsException('get_sugg', r)

    # The keyword is caller input: escape it so regex metacharacters such as
    # "(", "+" or "?" are matched literally instead of breaking the pattern.
    pattern = r'\["' + re.escape(keyword) + r'",(.*?),\["'
    matches = re.findall(pattern, r.text)
    if not matches:
        return []
    return json.loads(matches[0])
568 |
--------------------------------------------------------------------------------
/wechatsogou/const.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from functools import wraps
4 |
5 | from wechatsogou.exceptions import WechatSogouException
6 |
# Pool of browser User-Agent header strings.
agents = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
]
44 |
45 |
def Const(cls):
    """Class decorator that rejects attribute assignment on instances of *cls*.

    The class's ``__setattr__`` is replaced with a function that always
    raises, so ``instance.name = value`` fails.  The class is modified in
    place and returned.

    Parameters
    ----------
    cls : type
        Class to make read-only.

    Returns
    -------
    type
        The same class object.
    """
    # functools.wraps(cls) is deliberately not applied here: it would copy the
    # class's name, docstring and whole __dict__ onto this function and point
    # __wrapped__ at the class, which is not what a __setattr__ hook should carry.
    def new_setattr(self, name, value):
        raise WechatSogouException('const : {} can not be changed'.format(name))

    cls.__setattr__ = new_setattr
    return cls
53 |
54 |
@Const
class _WechatSogouSearchArticleTypeConst(object):
    """Content-type filter values for the article search."""
    all = 'all'  # no content filter
    rich = 'rich'  # articles with both images and video
    video = 'video'  # articles with video
    image = 'image'  # articles with images
61 |
62 |
@Const
class _WechatSogouSearchArticleTimeConst(object):
    """Time filter values for the article search.

    0 no limit / 1 one day / 2 one week / 3 one month / 4 one year / 5 custom range
    """
    anytime = 0
    day = 1
    week = 2
    month = 3
    year = 4
    specific = 5
75 |
76 |
@Const
class _WechatSogouHotIndexConst(object):
    """Category names for the home-page hot-article listing."""
    hot = 'hot'  # trending
    gaoxiao = 'gaoxiao'  # funny
    health = 'health'  # health and wellness
    sifanghua = 'sifanghua'  # private talk
    gossip = 'gossip'  # gossip
    technology = 'technology'  # technology
    finance = 'finance'  # finance
    car = 'car'  # cars
    life = 'life'  # lifestyle
    fashion = 'fashion'  # fashion
    mummy = 'mummy'  # moms / parenting
    travel = 'travel'  # travel
    job = 'job'  # workplace
    food = 'food'  # food
    history = 'history'  # history
    study = 'study'  # top students / education
    constellation = 'constellation'  # horoscopes
    sport = 'sport'  # sports
    military = 'military'  # military
    game = 'game'  # games
    pet = 'pet'  # cute pets
100 |
101 |
@Const
class _Const(object):
    """Namespace grouping the constant sets of the package."""
    hot_index = _WechatSogouHotIndexConst()
    search_article_type = _WechatSogouSearchArticleTypeConst()
    search_article_time = _WechatSogouSearchArticleTimeConst()


# Shared instance through which the constants are accessed,
# e.g. WechatSogouConst.hot_index.hot.
WechatSogouConst = _Const()
110 |
--------------------------------------------------------------------------------
/wechatsogou/exceptions.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
class WechatSogouException(Exception):
    """Base class for the errors of the Sogou-search-based WeChat official-account crawler API."""
8 |
9 |
class WechatSogouVcodeOcrException(WechatSogouException):
    """Captcha recognition error of the Sogou-search-based WeChat official-account crawler API."""
14 |
15 |
class WechatSogouRequestsException(WechatSogouException):
    """Fetch error of the Sogou-search-based WeChat official-account crawler API.

    Parameters
    ----------
    errmsg : str or unicode
        msg
    r : requests.models.Response
        return of requests

    Attributes
    ----------
    status_code : int
        HTTP status code copied from ``r``.
    """

    def __init__(self, errmsg, r):
        # Initialise the base class with the formatted message so str(exc)
        # carries it; building a separate WechatSogouException and dropping it
        # left the message out of the raised exception.
        super(WechatSogouRequestsException, self).__init__(
            '{} [url {}] [content {}]'.format(errmsg, r.url, r.content))
        self.status_code = r.status_code
30 |
--------------------------------------------------------------------------------
/wechatsogou/filecache.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from werkzeug.contrib.cache import FileSystemCache
4 |
5 |
class WechatCache(FileSystemCache):
    """File-based cache.

    """

    def __init__(self, cache_dir='/tmp/wechatsogou-cache', default_timeout=300):
        """Initialise the cache.

        Parameters
        ----------
        cache_dir : str
            Directory the cache files are stored in.
        default_timeout : int
            Default timeout handed to ``FileSystemCache``.
        """
        # Pass the timeout by keyword: the second positional parameter of
        # werkzeug's FileSystemCache is ``threshold``, not ``default_timeout``.
        super(WechatCache, self).__init__(cache_dir, default_timeout=default_timeout)

    def get(self, key):
        """Return the cached value for *key*, or None when the lookup raises ValueError."""
        try:
            return super(WechatCache, self).get(key)
        except ValueError:
            return None
23 |
--------------------------------------------------------------------------------
/wechatsogou/five.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from PIL import Image
4 | import six
5 |
# Python 2 / Python 3 compatibility shims.  Both branches define the same
# names: url_parse, urlencode, unquote, quote, readimg, input, str_to_bytes
# and must_str.
if six.PY2:
    import sys
    import urlparse as url_parse
    from urllib import urlencode
    from urllib import unquote
    from urllib import quote as quote
    import StringIO

    def readimg(content):
        """Open raw image bytes as a PIL image."""
        return Image.open(StringIO.StringIO(content))

    # HACK: force the process-wide default encoding to UTF-8.
    reload(sys)
    sys.setdefaultencoding('utf-8')
    input = raw_input
    str_to_bytes = bytes

    def must_str(s):
        """Return *s* as a byte string, UTF-8 encoding it when it is unicode."""
        if isinstance(s, unicode):
            s = s.encode('utf-8')
        return s
else:
    import io
    import urllib.parse as url_parse
    import urllib.parse
    from urllib.parse import unquote
    from urllib.request import quote as quote
    import tempfile  # no longer used by readimg; kept so the module namespace is unchanged

    def readimg(content):
        """Open raw image bytes as a PIL image."""
        # An in-memory stream positioned at offset 0: no temporary file is left
        # open, and decoding does not rely on the image library rewinding a
        # stream whose position is at the end after being written.
        return Image.open(io.BytesIO(content))

    urlencode = urllib.parse.urlencode
    input = input
    str_to_bytes = lambda x: bytes(x, encoding='utf-8')

    def must_str(s):
        """Return *s* unchanged."""
        return s
42 |
43 |
--------------------------------------------------------------------------------
/wechatsogou/identify_image.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from __future__ import absolute_import, unicode_literals, print_function
4 |
5 | import time
6 |
7 | import requests
8 |
9 | from wechatsogou.five import readimg, input
10 | from wechatsogou.filecache import WechatCache
11 | from wechatsogou.exceptions import WechatSogouVcodeOcrException
12 |
# Module-level cache instance, created at import time with WechatCache's default arguments.
ws_cache = WechatCache()
14 |
15 |
def identify_image_callback_by_hand(img):
    """Show the captcha image and ask the user to type what it says.

    Parameters
    ----------
    img : bytes
        Binary data of the captcha image.

    Returns
    -------
    str
        The text entered at the prompt.
    """
    readimg(img).show()
    return input("please input code: ")
32 |
33 |
def unlock_sogou_callback_example(url, req, resp, img, identify_image_callback):
    """Unlock a Sogou captcha by posting the recognised code.

    Parameters
    ----------
    url : str or unicode
        URL that was requested before the captcha page appeared.
    req : requests.sessions.Session
        Session used to post the unlock request.
    resp : requests.models.Response
        Response of the blocked request (already redirected); not used here.
    img : bytes
        Binary data of the captcha image.
    identify_image_callback : callable
        Captcha image bytes in, text out, see identify_image_callback_example.

    Returns
    -------
    dict
        Parsed JSON body of the unlock response, e.g.
        {
            'code': '',
            'msg': '',
        }

    Raises
    ------
    WechatSogouVcodeOcrException
        The unlock request was not successful.
    """
    # resp is not used
    url_quote = url.split('weixin.sogou.com/')[-1]
    unlock_url = 'http://weixin.sogou.com/antispider/thank.php'
    data = {
        'c': identify_image_callback(img),
        'r': '%2F' + url_quote,
        'v': 5
    }
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Referer': 'http://weixin.sogou.com/antispider/?from=%2f' + url_quote
    }
    r_unlock = req.post(unlock_url, data, headers=headers)
    r_unlock.encoding = 'utf-8'
    if not r_unlock.ok:
        # Three placeholders for three arguments: the status code used to be
        # dropped because the format string only had two.
        raise WechatSogouVcodeOcrException(
            'unlock[{}] failed: {}[{}]'.format(unlock_url, r_unlock.text, r_unlock.status_code))

    return r_unlock.json()
77 |
78 |
def unlock_weixin_callback_example(url, req, resp, img, identify_image_callback):
    """Unlock a WeChat captcha by posting the recognised code.

    Parameters
    ----------
    url : str or unicode
        URL that was requested before the captcha page appeared; sent as the
        Referer of the unlock request.
    req : requests.sessions.Session
        Session used to post the unlock request.
    resp : requests.models.Response
        Response of the blocked request (already redirected); not used here.
    img : bytes
        Binary data of the captcha image.
    identify_image_callback : callable
        Captcha image bytes in, text out, see identify_image_callback_example.

    Returns
    -------
    dict
        Parsed JSON body of the unlock response, e.g.
        {
            'ret': '',
            'errmsg': '',
            'cookie_count': '',
        }

    Raises
    ------
    WechatSogouVcodeOcrException
        The unlock request was not successful.
    """
    # resp is not used

    unlock_url = 'https://mp.weixin.qq.com/mp/verifycode'
    # ``cert`` is the current time in milliseconds; ``input`` is the recognised text.
    payload = dict(cert=time.time() * 1000, input=identify_image_callback(img))
    request_headers = {
        'Host': 'mp.weixin.qq.com',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Referer': url
    }
    unlock_resp = req.post(unlock_url, payload, headers=request_headers)
    if unlock_resp.ok:
        return unlock_resp.json()

    raise WechatSogouVcodeOcrException(
        'unlock[{}] failed: {}[{}]'.format(unlock_url, unlock_resp.text, unlock_resp.status_code))
122 |
--------------------------------------------------------------------------------
/wechatsogou/request.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from __future__ import absolute_import, unicode_literals, print_function
4 |
5 | import datetime
6 | from collections import OrderedDict
7 |
8 | from wechatsogou.const import WechatSogouConst
9 | from wechatsogou.five import urlencode
10 |
_search_type_gzh = 1  # search target: official accounts
_search_type_article = 2  # search target: articles
13 |
14 |
class WechatSogouRequest(object):
    """Builders for the weixin.sogou.com search and hot-article URLs."""

    @staticmethod
    def gen_search_article_url(keyword, page=1, timesn=WechatSogouConst.search_article_time.anytime,
                               article_type=WechatSogouConst.search_article_type.all, ft=None, et=None):
        """Build the URL of an article search.

        Parameters
        ----------
        keyword : str or unicode
            Search text.
        page : int, optional
            Page number, the default is 1.
        timesn : WechatSogouConst.search_article_time
            Time filter: anytime / day / week / month / year / specific,
            the default is anytime.
        article_type : WechatSogouConst.search_article_type
            Content filter: image / video / rich (image and video) / all,
            the default is all.
        ft, et : datetime.date
            Start and end date, required when timesn is specific,
            e.g. 2017-07-01 and 2017-07-15; ignored otherwise.

        Returns
        -------
        str
            search_article_url
        """
        assert isinstance(page, int) and page > 0
        time_filter = WechatSogouConst.search_article_time
        assert timesn in (time_filter.anytime, time_filter.day, time_filter.week,
                          time_filter.month, time_filter.year, time_filter.specific)

        if timesn == time_filter.specific:
            assert isinstance(ft, datetime.date)
            assert isinstance(et, datetime.date)
            assert ft <= et
        else:
            # Dates only apply to the custom range; blank them otherwise.
            ft = et = ''

        content_filter = WechatSogouConst.search_article_type
        image_code, video_code = 458754, 458756
        if article_type == content_filter.rich:
            interation = '{},{}'.format(image_code, video_code)
        elif article_type == content_filter.image:
            interation = image_code
        elif article_type == content_filter.video:
            interation = video_code
        else:
            interation = ''

        params = OrderedDict([
            ('type', _search_type_article),
            ('page', page),
            ('ie', 'utf8'),
            ('query', keyword),
            ('interation', interation),
        ])
        if timesn != 0:
            params['tsn'] = timesn
            params['ft'] = str(ft)
            params['et'] = str(et)

        # TODO search inside one account, e.g.
        # http://weixin.sogou.com/weixin?type=2&ie=utf8&query=%E9%AB%98%E8%80%83&tsn=3&ft=&et=&interation=458754
        # &wxid=oIWsFt1tmWoG6vO6BcsS7St61bRE&usip=nanhangqinggong
        # (would add the ``wxid`` and ``usip`` parameters)

        return 'http://weixin.sogou.com/weixin?{}'.format(urlencode(params))

    @staticmethod
    def gen_search_gzh_url(keyword, page=1):
        """Build the URL of an official-account search.

        Parameters
        ----------
        keyword : str or unicode
            Search text.
        page : int, optional
            Page number, the default is 1.

        Returns
        -------
        str
            search_gzh_url
        """
        assert isinstance(page, int) and page > 0

        params = OrderedDict([
            ('type', _search_type_gzh),
            ('page', page),
            ('ie', 'utf8'),
            ('query', keyword),
        ])

        return 'http://weixin.sogou.com/weixin?{}'.format(urlencode(params))

    @staticmethod
    def gen_hot_url(hot_index, page=1):
        """Build the URL of a home-page hot-article category page.

        Parameters
        ----------
        hot_index : WechatSogouConst.hot_index
            Category constant: WechatSogouConst.hot_index.xxx
        page : int
            Page number (1-based; the URL uses ``page - 1``).

        Returns
        -------
        str
            URL of the hot-article category page.
        """

        assert hasattr(WechatSogouConst.hot_index, hot_index)
        assert isinstance(page, int) and page > 0

        categories = WechatSogouConst.hot_index
        # A category's position in this tuple is its number in the URL (wap_<n>).
        ordered_categories = (
            categories.hot,            # trending
            categories.gaoxiao,        # funny
            categories.health,         # health and wellness
            categories.sifanghua,      # private talk
            categories.gossip,         # gossip
            categories.technology,     # technology
            categories.finance,        # finance
            categories.car,            # cars
            categories.life,           # lifestyle
            categories.fashion,        # fashion
            categories.mummy,          # moms / parenting
            categories.travel,         # travel
            categories.job,            # workplace
            categories.food,           # food
            categories.history,        # history
            categories.study,          # top students / education
            categories.constellation,  # horoscopes
            categories.sport,          # sports
            categories.military,       # military
            categories.game,           # games
            categories.pet,            # cute pets
        )
        index_urls = {name: position for position, name in enumerate(ordered_categories)}
        return 'http://weixin.sogou.com/wapindex/wap/0612/wap_{}/{}.html'.format(index_urls[hot_index], page - 1)
159 |
--------------------------------------------------------------------------------
/wechatsogou/structuring.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from __future__ import absolute_import, unicode_literals, print_function
4 |
5 | import json
6 | import re
7 |
8 | import requests
9 | from bs4 import BeautifulSoup
10 | from lxml import etree
11 | from lxml.etree import XML
12 |
13 | from wechatsogou.exceptions import WechatSogouException
14 | from wechatsogou.five import str_to_bytes
15 | from wechatsogou.tools import get_elem_text, list_or_empty, replace_html, get_first_of_element, format_image_url
16 |
# Raw strings keep the regex escapes (\(, \w, \s ...) out of Python's own
# string-escape processing; the compiled patterns are unchanged.

# Captures the quoted URL of an inline ``background-image: url("...")`` style.
backgroud_image_p = re.compile(r'background-image:[ ]+url\("([\w\W]+?)"\)')
# Captures everything after the first ``>`` that follows ``js_content``.
js_content = re.compile(r'js_content.*?>((\s|\S)+)')
# Captures the ``msgList`` literal up to, but not including, its closing ``}}]};``.
find_article_json_re = re.compile(r'var msgList = (.*?)}}]};')
# NOTE(review): an empty pattern only ever yields empty matches, so the helper
# that consumes it always gives up before issuing its request. Presumably the
# real expression was lost at some point -- confirm against the page markup.
get_post_view_perm = re.compile('')
21 |
22 |
23 | class WechatSogouStructuring(object):
@staticmethod
def __handle_content_url(content_url):
    """Turn an article link into an absolute URL.

    Parameters
    ----------
    content_url : str or unicode or None
        Link as found in the article data; may be relative or empty.

    Returns
    -------
    str
        '' when there is no link, the link itself when it is already
        absolute, otherwise the link prefixed with the mp.weixin.qq.com host.
    """
    content_url = replace_html(content_url)
    if not content_url:
        return ''
    # Absolute links of either scheme are complete already. The former
    # substring test for the plain-http host glued the prefix in front of
    # https links.
    if content_url.startswith(('http://', 'https://')):
        return content_url
    return 'http://mp.weixin.qq.com{}'.format(content_url)
29 |
@staticmethod
def __get_post_view_perm(text):
    """Fetch the post/view statistics referenced by a search result page.

    Parameters
    ----------
    text : str or unicode
        Text of the account search page.

    Returns
    -------
    object or None
        The ``msg`` field of the JSON answer, or None when no path is found
        in ``text``, the request fails, the body is not JSON, or the answer's
        ``code`` is not 'success'.
    """
    matches = get_post_view_perm.findall(text)
    if not matches or not matches[0]:
        return None

    r = requests.get('http://weixin.sogou.com{}'.format(matches[0]))
    if not r.ok:
        return None

    # Decode the body once; a non-JSON body is treated like every other
    # failure here instead of escaping as an exception.
    try:
        payload = r.json()
    except ValueError:
        return None

    if payload.get('code') != 'success':
        return None

    return payload.get('msg')
44 |
@staticmethod
def get_gzh_by_search(text):
    """Extract official-account entries from an account search page.

    Parameters
    ----------
    text : str or unicode
        Text of the account search page.

    Returns
    -------
    list[dict]
        {
            'open_id': '',  # unique id of the account
            'profile_url': '',  # link to the page of recent mass posts
            'headimage': '',  # avatar
            'wechat_name': '',  # name
            'wechat_id': '',  # WeChat id
            'post_perm': '',  # posts in the last month, -1 when unknown
            'view_perm': '',  # reads in the last month, -1 when unknown
            'qrcode': '',  # QR code
            'introduction': '',  # introduction
            'authentication': ''  # authentication
        }
    """
    perms_by_open_id = WechatSogouStructuring.__get_post_view_perm(text)

    def drop_marks(value):
        # Strip the red_beg / red_end markers from the text.
        return value.replace('red_beg', '').replace('red_end', '')

    accounts = []
    for node in etree.HTML(text).xpath('//ul[@class="news-list2"]/li'):
        profile_url = get_first_of_element(node, 'div/div[1]/a/@href')
        avatar = format_image_url(get_first_of_element(node, 'div/div[1]/a/img/@src'))
        name = get_elem_text(get_first_of_element(node, 'div/div[2]/p[1]'))
        id_line = get_elem_text(get_first_of_element(node, 'div/div[2]/p[2]'))
        qrcode = get_first_of_element(node, 'div/div[3]/span/img[1]/@src')
        intro = get_elem_text(get_first_of_element(node, 'dl[1]/dd'))
        authentication = get_first_of_element(node, 'dl[2]/dd/text()')

        accounts.append({
            # The open id is the last path segment of the avatar URL.
            'open_id': avatar.split('/')[-1],
            'profile_url': profile_url,
            'headimage': avatar,
            'wechat_name': drop_marks(name),
            'wechat_id': id_line.replace('微信号:', ''),
            'qrcode': qrcode,
            'introduction': drop_marks(intro),
            'authentication': authentication,
            'post_perm': -1,
            'view_perm': -1,
        })

    if not perms_by_open_id:
        return accounts

    # Fill in the statistics for the accounts that have a "posts,views" entry.
    for account in accounts:
        open_id = account['open_id']
        if open_id not in perms_by_open_id:
            continue
        counts = perms_by_open_id[open_id].split(',')
        if len(counts) == 2:
            account['post_perm'] = int(counts[0])
            account['view_perm'] = int(counts[1])
    return accounts
105 |
@staticmethod
def get_article_by_search_wap(keyword, wap_dict):
    """Extract account and article data from the XML items of a wap search.

    Parameters
    ----------
    keyword : str or unicode
        Search text; the marker bytes wrapped around it are removed.
    wap_dict : dict
        Mapping whose 'items' entry is an iterable of XML documents.

    Returns
    -------
    list[dict]
        One {'gzh': {...}, 'article': {...}} dict per item.
    """
    results = []
    for raw in wap_dict['items']:
        marked = b'\xee\x90\x8a' + str_to_bytes(keyword) + b'\xee\x90\x8b'
        cleaned = str_to_bytes(raw).replace(marked, str_to_bytes(keyword))
        display = XML(cleaned).find('.//display')

        def field(tag):
            # Text of the child element named ``tag``.
            return display.find(tag).text

        gzh = {
            'profile_url': field('encGzhUrl'),
            'open_id': field('openid'),
            'isv': field('isV'),
            'wechat_name': field('sourcename'),
            'wechat_id': field('username'),
            'headimage': field('headimage'),
            'qrcode': field('encQrcodeUrl'),
        }
        article = {
            'title': field('title'),
            'url': field('url'),  # encArticleUrl
            'main_img': field('imglink'),
            'abstract': field('content168'),
            'time': field('lastModified'),
        }
        results.append({'gzh': gzh, 'article': article})

    return results
134 |
@staticmethod
def get_article_by_search(text):
    """Extract the article list from an article search page.

    Parameters
    ----------
    text : str or unicode
        Text of the article search page.

    Returns
    -------
    list[dict]
        {
            'article': {
                'title': '',  # article title
                'url': '',  # article link
                'imgs': '',  # list of article images
                'abstract': '',  # article abstract
                'time': ''  # push time of the article, 0 when not found
            },
            'gzh': {
                'profile_url': '',  # link to the account's recent posts page
                'headimage': '',  # avatar
                'wechat_name': '',  # name
                'isv': '',  # whether the account is verified
            }
        }
    """
    page = etree.HTML(text)
    lis = page.xpath('//ul[@class="news-list"]/li')

    articles = []
    for li in lis:
        url = get_first_of_element(li, 'div[1]/a/@href')
        if url:
            # Entry whose first div carries the link and the image(s).
            title = get_first_of_element(li, 'div[2]/h3/a')
            imgs = li.xpath('div[1]/a/img/@src')
            abstract = get_first_of_element(li, 'div[2]/p')
            pushed_at = get_first_of_element(li, 'div[2]/div/span/script/text()')
            gzh_info = li.xpath('div[2]/div/a')[0]
        else:
            # Entry without a leading link: everything lives in one div.
            url = get_first_of_element(li, 'div/h3/a/@href')
            title = get_first_of_element(li, 'div/h3/a')
            imgs = []
            for span in li.xpath('div/div[1]/a'):
                img = span.xpath('span/img/@src')
                if img:
                    # NOTE(review): ``img`` is itself a list, so this branch
                    # yields a list of lists while the other yields a flat
                    # list -- confirm what callers expect before flattening.
                    imgs.append(img)
            abstract = get_first_of_element(li, 'div/p')
            pushed_at = get_first_of_element(li, 'div/div[2]/span/script/text()')
            gzh_info = li.xpath('div/div[2]/a')[0]

        if title is not None:
            title = get_elem_text(title).replace("red_beg", "").replace("red_end", "")
        if abstract is not None:
            abstract = get_elem_text(abstract).replace("red_beg", "").replace("red_end", "")

        # Raw string: the backslashes belong to the regex, not to Python.
        pushed_at = list_or_empty(re.findall(r"timeConvert\('(.*?)'\)", pushed_at), int)
        profile_url = get_first_of_element(gzh_info, '@href')
        headimage = get_first_of_element(gzh_info, '@data-headimage')
        wechat_name = get_first_of_element(gzh_info, 'text()')
        gzh_isv = get_first_of_element(gzh_info, '@data-isv', int)

        articles.append({
            'article': {
                'title': title,
                'url': url,
                'imgs': format_image_url(imgs),
                'abstract': abstract,
                'time': pushed_at
            },
            'gzh': {
                'profile_url': profile_url,
                'headimage': headimage,
                'wechat_name': wechat_name,
                'isv': gzh_isv,
            }
        })
    return articles
216 |
@staticmethod
def get_gzh_info_by_history(text):
    """Extract the account profile from a history-message page.

    Parameters
    ----------
    text : str or unicode
        Text of the history-message page.

    Returns
    -------
    dict
        {
            'wechat_name': '',  # name
            'wechat_id': '',  # WeChat id
            'introduction': '',  # description
            'authentication': '',  # authentication
            'headimage': ''  # avatar
        }
    """

    area = get_first_of_element(etree.HTML(text), '//div[@class="profile_info_area"]')

    # All lookups are relative to the profile area, in this order.
    paths = (
        'div[1]/span/img/@src',
        'div[1]/div/strong/text()',
        'div[1]/div/p/text()',
        'ul/li[1]/div/text()',
        'ul/li[2]/div/text()',
    )
    avatar, name, account_id, description, principal = [
        get_first_of_element(area, path) for path in paths
    ]

    return {
        'wechat_name': name.strip(),
        'wechat_id': account_id.replace('微信号: ', '').strip('\n'),
        'introduction': description,
        'authentication': principal,
        'headimage': avatar
    }
254 |
@staticmethod
def get_article_by_history_json(text, article_json=None):
    """Extract the article list from a history-message page.

    Parameters
    ----------
    text : str or unicode
        Text of the history-message page.
    article_json : dict
        Article data already parsed from the page; when None it is
        extracted from ``text``.

    Returns
    -------
    list[dict]
        {
            'send_id': '',  # mass-send id; not unique, one send shares it
            'datetime': '',  # datetime of the mass send
            'type': '',  # message type, always '49'
            'main': 0,  # 1 for the first message of a send, else 0
            'title': '',  # article title
            'abstract': '',  # abstract
            'fileid': '',  #
            'content_url': '',  # article link
            'source_url': '',  # "read the original" link
            'cover': '',  # cover image
            'author': '',  # author
            'copyright_stat': '',  # article kind, e.g. original
        }

    """
    if article_json is None:
        found = find_article_json_re.findall(text)
        if not found:
            return []
        # The pattern stops right before the closing brackets; put them back.
        article_json = json.loads(found[0] + '}}]}')

    collected = []
    for entry in article_json['list']:
        if str(entry['comm_msg_info'].get('type', '')) != '49':
            continue

        common = entry['comm_msg_info']
        lead = entry['app_msg_ext_info']
        send_id = common.get('id', '')
        sent_at = common.get('datetime', '')
        kind = str(common.get('type', ''))

        # The lead article first, then the further articles of the same send.
        payloads = [(1, lead)]
        if lead.get('is_multi', 0) == 1:
            payloads.extend((0, extra) for extra in lead['multi_app_msg_item_list'])

        for is_main, payload in payloads:
            collected.append({
                'send_id': send_id,
                'datetime': sent_at,
                'type': kind,
                'main': is_main,
                'title': payload.get('title', ''),
                'abstract': payload.get('digest', ''),
                'fileid': payload.get('fileid', ''),
                'content_url': WechatSogouStructuring.__handle_content_url(payload.get('content_url')),
                'source_url': payload.get('source_url', ''),
                'cover': payload.get('cover', ''),
                'author': payload.get('author', ''),
                'copyright_stat': payload.get('copyright_stat', '')
            })

    # Drop the entries that come without an article link.
    return [item for item in collected if item['content_url']]
335 |
@staticmethod
def get_gzh_info_and_article_by_history(text):
    """Extract both the account profile and the article list from a history page.

    Parameters
    ----------
    text : str or unicode
        Text of the history-message page.

    Returns
    -------
    dict
        {
            'gzh': {
                'wechat_name': '',  # name
                'wechat_id': '',  # WeChat id
                'introduction': '',  # description
                'authentication': '',  # authentication
                'headimage': ''  # avatar
            },
            'article': [
                {
                    'send_id': '',  # mass-send id; not unique, one send shares it
                    'datetime': '',  # datetime of the mass send
                    'type': '',  # message type, always '49'
                    'main': 0,  # 1 for the first message of a send, else 0
                    'title': '',  # article title
                    'abstract': '',  # abstract
                    'fileid': '',  #
                    'content_url': '',  # article link
                    'source_url': '',  # "read the original" link
                    'cover': '',  # cover image
                    'author': '',  # author
                    'copyright_stat': '',  # article kind, e.g. original
                },
                ...
            ]
        }
    """
    parser = WechatSogouStructuring
    gzh = parser.get_gzh_info_by_history(text)
    articles = parser.get_article_by_history_json(text)
    return {'gzh': gzh, 'article': articles}
379 |
@staticmethod
def get_gzh_article_by_hot(text):
    """Extract account and article data from one page of home-page hot articles.

    Parameters
    ----------
    text : str or unicode
        Text of one page of the home-page hot articles.

    Returns
    -------
    list[dict]
        {
            'gzh': {
                'headimage': str,  # account avatar
                'wechat_name': str,  # account name
            },
            'article': {
                'url': str,  # temporary link of the article
                'title': str,  # article title
                'abstract': str,  # article abstract
                'time': int,  # push time; the raw text when it is not an
                              # integer, '' when it is missing
                'open_id': str,  # open id
                'main_img': str  # cover image
            }
        }
    """
    page = etree.HTML(text)
    lis = page.xpath('/html/body/li')
    gzh_article_list = []
    for li in lis:
        url = get_first_of_element(li, 'div[1]/h4/a/@href')
        title = get_first_of_element(li, 'div[1]/h4/a/div/text()')
        abstract = get_first_of_element(li, 'div[1]/p[1]/text()')
        xpath_time = get_first_of_element(li, 'div[1]/p[2]')
        open_id = get_first_of_element(xpath_time, 'span/@data-openid')
        headimage = get_first_of_element(xpath_time, 'span/@data-headimage')
        gzh_name = get_first_of_element(xpath_time, 'span/text()')
        # Read through the shared helper so that a missing attribute gives ''
        # instead of an IndexError on an empty result list.
        send_time = get_first_of_element(xpath_time, 'a/span/@data-lastmodified')
        main_img = get_first_of_element(li, 'div[2]/a/img/@src')

        try:
            send_time = int(send_time)
        except ValueError:
            # Keep the raw text when it is not an integer.
            pass

        gzh_article_list.append({
            'gzh': {
                'headimage': headimage,
                'wechat_name': gzh_name,
            },
            'article': {
                'url': url,
                'title': title,
                'abstract': abstract,
                'time': send_time,
                'open_id': open_id,
                'main_img': main_img
            }
        })

    return gzh_article_list
442 |
@staticmethod
def get_article_detail(text, del_qqmusic=True, del_voice=True):
    """Extract the content HTML and the image links of one article.

    1. Collect every image link found in the content.
    2. Return the HTML of the article content (without title etc.).

    Parameters
    ----------
    text : str or unicode
        Text of one WeChat article page.
    del_qqmusic: bool
        Remove the qq music elements from the article.
    del_voice: bool
        Remove the voice elements from the article.

    Returns
    -------
    dict
        {
            'content_html': str  # article content
            'content_img_list': list[img_url1, img_url2, ...]  # images in the content
        }

    Raises
    ------
    WechatSogouException
        When an image link does not start with 'http'.
    """
    # 1. Locate the article content.
    html_obj = BeautifulSoup(text, "lxml")
    content_text = html_obj.find('div', {'class': 'rich_media_content', 'id': 'js_content'})

    def remove_with_parent(tag_name):
        # Drop every ``tag_name`` element together with its parent element.
        for node in content_text.find_all(tag_name):
            parent = node.parent
            # A sibling handled earlier may have removed the shared parent.
            if parent is not None:
                parent.decompose()

    # 2. Remove unwanted tags.
    if del_qqmusic:
        remove_with_parent('qqmusic')

    if del_voice:
        # The mpvoice tag and its audio span share one parent tag, so the
        # parent is removed.
        remove_with_parent('mpvoice')

    # 3. Collect all images (img tags and background-image styles).
    all_img_set = set()
    for ele in content_text.find_all('img'):
        # Images without ``data-src`` are left untouched instead of failing.
        if 'data-src' not in ele.attrs:
            continue
        img_url = format_image_url(ele.attrs.pop('data-src'))
        ele.attrs['src'] = img_url

        if not img_url.startswith('http'):
            raise WechatSogouException('img_url [{}] 不合法'.format(img_url))
        all_img_set.add(img_url)

    for ele in content_text.find_all(style=re.compile("background-image")):
        # Remove some attributes.
        if ele.attrs.get('data-src'):
            del ele.attrs['data-src']

        if ele.attrs.get('data-wxurl'):
            del ele.attrs['data-wxurl']
        img_url = re.findall(backgroud_image_p, str(ele))
        if not img_url:
            continue
        all_img_set.add(img_url[0])

    # 4. Point each iframe at its ``data-src`` target.
    for ele in content_text.find_all('iframe'):
        if 'data-src' not in ele.attrs:
            continue
        ele.attrs['src'] = ele.attrs.pop('data-src')

    # 5. Build the result.
    all_img_list = list(all_img_set)
    content_html = content_text.prettify()
    # Strip the opening div[id=js_content] tag.
    content_html = re.findall(js_content, content_html)[0][0]
    return {
        'content_html': content_html,
        'content_img_list': all_img_list
    }
528 |
--------------------------------------------------------------------------------
/wechatsogou/tools.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from __future__ import absolute_import, unicode_literals, print_function
4 |
5 | import ast
6 |
7 | import requests
8 |
9 | from wechatsogou.five import url_parse
10 |
11 |
def list_or_empty(content, contype=None):
    """Return the first item of a list, or an empty value for an empty list.

    Args:
        content: list to read from
        contype: optional callable applied to the first item; for an empty
            list it selects the empty value (int -> 0, str -> '', list -> [])

    Returns:
        the (converted) first item, or the empty value; '' when the list is
        empty and no contype is given
    """
    assert isinstance(content, list), 'content is not list: {}'.format(content)

    if content:
        head = content[0]
        return contype(head) if contype else head

    if not contype:
        return ''
    if contype == int:
        return 0
    if contype == str:
        return ''
    if contype == list:
        return []
    raise Exception('only can deal int str list')
29 |
30 |
def get_elem_text(elem):
    """Join all the text found inside an element.

    Args:
        elem: element exposing itertext(), or '' for "no element"

    Returns:
        the stripped text pieces concatenated, '' when elem is ''
    """
    return ''.join(piece.strip() for piece in elem.itertext()) if elem != '' else ''
44 |
45 |
def get_first_of_element(element, sub, contype=None):
    """Evaluate an xpath on an element and return its first result.

    Args:
        element: lxml.etree.Element
        sub: str, xpath expression evaluated on element
        contype: optional type handed on to list_or_empty

    Returns:
        whatever list_or_empty yields for the xpath result list
    """
    return list_or_empty(element.xpath(sub), contype)
58 |
59 |
def get_encoding_from_reponse(r):
    """Work out the encoding of a requests response.

    Args:
        r: object returned by requests get or post

    Returns:
        the first encoding declared in the body, else the one from the headers
    """
    declared = requests.utils.get_encodings_from_content(r.text)
    if declared:
        return declared[0]
    return requests.utils.get_encoding_from_headers(r.headers)
71 |
72 |
73 | def _replace_str_html(s):
74 | """替换html‘"’等转义内容为正常内容
75 |
76 | Args:
77 | s: 文字内容
78 |
79 | Returns:
80 | s: 处理反转义后的文字
81 | """
82 | html_str_list = [
83 | (''', '\''),
84 | ('"', '"'),
85 | ('&', '&'),
86 | ('¥', '¥'),
87 | ('amp;', ''),
88 | ('<', '<'),
89 | ('>', '>'),
90 | (' ', ' '),
91 | ('\\', '')
92 | ]
93 | for i in html_str_list:
94 | s = s.replace(i[0], i[1])
95 | return s
96 |
97 |
def replace_html(data):
    """Recursively unescape HTML in every string found inside data.

    Args:
        data: a dict, list, string or any other value; dict keys and values
            and list items are processed recursively

    Returns:
        a new dict/list with the strings unescaped, the unescaped string,
        or data itself for any other type
    """
    if isinstance(data, dict):
        return {replace_html(k): replace_html(v) for k, v in data.items()}
    if isinstance(data, list):
        return [replace_html(item) for item in data]

    # ``unicode`` exists on Python 2 only; referencing it unguarded raised
    # NameError on Python 3 for every value that is not a ``str``.
    try:
        text_types = (str, unicode)  # noqa: F821
    except NameError:
        text_types = (str,)

    if isinstance(data, text_types):
        return _replace_str_html(data)
    return data
107 |
108 |
def str_to_dict(json_str):
    """Evaluate a Python-literal string and unescape the HTML in its strings.

    Args:
        json_str: text holding a Python literal

    Returns:
        the evaluated value after replace_html has been applied to it
    """
    return replace_html(ast.literal_eval(json_str))
112 |
113 |
def replace_space(s):
    """Remove every space and every CRLF pair from s."""
    for unwanted in (' ', '\r\n'):
        s = s.replace(unwanted, '')
    return s
116 |
117 |
def get_url_param(url):
    """Parse the query string of url; blank values are kept."""
    query = url_parse.urlparse(url).query
    return url_parse.parse_qs(query, True)
121 |
122 |
def format_image_url(url):
    """Give scheme-relative image URLs an https scheme.

    Args:
        url: one URL, or a list of URLs (handled item by item)

    Returns:
        the URL with 'https:' put in front when it starts with '//',
        otherwise unchanged; a list when a list was given
    """
    if isinstance(url, list):
        return list(map(format_image_url, url))
    return 'https:{}'.format(url) if url.startswith('//') else url
130 |
131 |
def may_int(i):
    """Return int(i) when the conversion works, otherwise i unchanged."""
    try:
        converted = int(i)
    except Exception:
        return i
    return converted
137 |
--------------------------------------------------------------------------------