├── .gitattributes ├── .gitignore ├── .travis.yml ├── CHANGELOG.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── coffee.md ├── docs ├── HISTORY.rst └── README.rst ├── mkdocs.yml ├── requirements.txt ├── screenshot ├── alipay_hongbao.png ├── get_gzh_article_by_history.png ├── get_gzh_article_by_hot.png ├── get_gzh_info.png ├── get_sugg.png ├── pay_ali.jpg ├── pay_wechat.jpg ├── search_article.png └── search_gzh.png ├── setup.cfg ├── setup.py ├── test ├── __init__.py ├── fateadm.py ├── file │ ├── article_detail_backgroud-image.html │ ├── article_detail_expired.html │ ├── article_detail_iframe.html │ ├── article_detail_mpvoice.html │ ├── article_detail_qqmusic.html │ ├── bitsea-history.html │ ├── search-gaokao-article.html │ ├── search-gaokao-article.json │ ├── search-gaokao-gzh-error.html │ ├── search-gaokao-gzh.html │ └── wapindex-wap-0612-wap_8-0.html ├── rk.py ├── test_api.py ├── test_const.py ├── test_request_gen_hot_url.py ├── test_request_gen_search_article_url.py ├── test_request_gen_search_gzh_url.py ├── test_structuring.py └── test_tools.py ├── tox.ini └── wechatsogou ├── __init__.py ├── api.py ├── const.py ├── exceptions.py ├── filecache.py ├── five.py ├── identify_image.py ├── request.py ├── structuring.py └── tools.py /.gitattributes: -------------------------------------------------------------------------------- 1 | test/file/* linguist-vendored 2 | docs/bootstrap/* linguist-vendored 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .git 3 | __pycache__ 4 | cache 5 | wechatsogou/cache 6 | wechatsogou/ocr 7 | sprider 8 | wechatsogou.egg-info 9 | dist 10 | demo 11 | log.txt 12 | web 13 | wechatid.txt 14 | .DS_Store 15 | test/config.py 16 | .python-version 17 | *.pyc 18 | .tox/ 19 | build/ 20 | .hypothesis/ 21 | test/.hypothesis/ 22 | t.py 23 | y.py 24 | tencent_captcha/ 25 | docs/src/node_modules/ 26 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - "2.7.12" 5 | - "3.5.3" 6 | - "3.6.1" 7 | 8 | cache: 9 | directories: 10 | - $HOME/.cache/pip 11 | 12 | env: 13 | global: 14 | - PIP_WHEEL_DIR=$HOME/.cache/pip/wheels 15 | - PIP_FIND_LINKS=file://$HOME/.cache/pip/wheels 16 | 17 | install: 18 | - pip install tox tox-travis flake8 19 | - pip install -r requirements.txt 20 | 21 | before_script: 22 | - export PYTHONPATH=$PYTHONPATH:$(pwd) 23 | 24 | script: 25 | - make flake8 26 | - make dry_publish 27 | - tox 28 | 29 | notifications: 30 | email: 31 | on_success: never 32 | on_failure: never -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2.0 2 | Copyright [2018] [Chyroc https://blog.chyroc.cn] 3 | 4 | Apache License 5 | Version 2.0, January 2004 6 | http://www.apache.org/licenses/ 7 | 8 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 9 | 10 | 1. Definitions. 11 | 12 | "License" shall mean the terms and conditions for use, reproduction, 13 | and distribution as defined by Sections 1 through 9 of this document. 14 | 15 | "Licensor" shall mean the copyright owner or entity authorized by 16 | the copyright owner that is granting the License. 17 | 18 | "Legal Entity" shall mean the union of the acting entity and all 19 | other entities that control, are controlled by, or are under common 20 | control with that entity. For the purposes of this definition, 21 | "control" means (i) the power, direct or indirect, to cause the 22 | direction or management of such entity, whether by contract or 23 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 24 | outstanding shares, or (iii) beneficial ownership of such entity. 25 | 26 | "You" (or "Your") shall mean an individual or Legal Entity 27 | exercising permissions granted by this License. 28 | 29 | "Source" form shall mean the preferred form for making modifications, 30 | including but not limited to software source code, documentation 31 | source, and configuration files. 32 | 33 | "Object" form shall mean any form resulting from mechanical 34 | transformation or translation of a Source form, including but 35 | not limited to compiled object code, generated documentation, 36 | and conversions to other media types. 37 | 38 | "Work" shall mean the work of authorship, whether in Source or 39 | Object form, made available under the License, as indicated by a 40 | copyright notice that is included in or attached to the work 41 | (an example is provided in the Appendix below). 42 | 43 | "Derivative Works" shall mean any work, whether in Source or Object 44 | form, that is based on (or derived from) the Work and for which the 45 | editorial revisions, annotations, elaborations, or other modifications 46 | represent, as a whole, an original work of authorship. For the purposes 47 | of this License, Derivative Works shall not include works that remain 48 | separable from, or merely link (or bind by name) to the interfaces of, 49 | the Work and Derivative Works thereof. 50 | 51 | "Contribution" shall mean any work of authorship, including 52 | the original version of the Work and any modifications or additions 53 | to that Work or Derivative Works thereof, that is intentionally 54 | submitted to Licensor for inclusion in the Work by the copyright owner 55 | or by an individual or Legal Entity authorized to submit on behalf of 56 | the copyright owner. For the purposes of this definition, "submitted" 57 | means any form of electronic, verbal, or written communication sent 58 | to the Licensor or its representatives, including but not limited to 59 | communication on electronic mailing lists, source code control systems, 60 | and issue tracking systems that are managed by, or on behalf of, the 61 | Licensor for the purpose of discussing and improving the Work, but 62 | excluding communication that is conspicuously marked or otherwise 63 | designated in writing by the copyright owner as "Not a Contribution." 64 | 65 | "Contributor" shall mean Licensor and any individual or Legal Entity 66 | on behalf of whom a Contribution has been received by Licensor and 67 | subsequently incorporated within the Work. 68 | 69 | 2. Grant of Copyright License. Subject to the terms and conditions of 70 | this License, each Contributor hereby grants to You a perpetual, 71 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 72 | copyright license to reproduce, prepare Derivative Works of, 73 | publicly display, publicly perform, sublicense, and distribute the 74 | Work and such Derivative Works in Source or Object form. 75 | 76 | 3. Grant of Patent License. Subject to the terms and conditions of 77 | this License, each Contributor hereby grants to You a perpetual, 78 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 79 | (except as stated in this section) patent license to make, have made, 80 | use, offer to sell, sell, import, and otherwise transfer the Work, 81 | where such license applies only to those patent claims licensable 82 | by such Contributor that are necessarily infringed by their 83 | Contribution(s) alone or by combination of their Contribution(s) 84 | with the Work to which such Contribution(s) was submitted. If You 85 | institute patent litigation against any entity (including a 86 | cross-claim or counterclaim in a lawsuit) alleging that the Work 87 | or a Contribution incorporated within the Work constitutes direct 88 | or contributory patent infringement, then any patent licenses 89 | granted to You under this License for that Work shall terminate 90 | as of the date such litigation is filed. 91 | 92 | 4. Redistribution. You may reproduce and distribute copies of the 93 | Work or Derivative Works thereof in any medium, with or without 94 | modifications, and in Source or Object form, provided that You 95 | meet the following conditions: 96 | 97 | (a) You must give any other recipients of the Work or 98 | Derivative Works a copy of this License; and 99 | 100 | (b) You must cause any modified files to carry prominent notices 101 | stating that You changed the files; and 102 | 103 | (c) You must retain, in the Source form of any Derivative Works 104 | that You distribute, all copyright, patent, trademark, and 105 | attribution notices from the Source form of the Work, 106 | excluding those notices that do not pertain to any part of 107 | the Derivative Works; and 108 | 109 | (d) If the Work includes a "NOTICE" text file as part of its 110 | distribution, then any Derivative Works that You distribute must 111 | include a readable copy of the attribution notices contained 112 | within such NOTICE file, excluding those notices that do not 113 | pertain to any part of the Derivative Works, in at least one 114 | of the following places: within a NOTICE text file distributed 115 | as part of the Derivative Works; within the Source form or 116 | documentation, if provided along with the Derivative Works; or, 117 | within a display generated by the Derivative Works, if and 118 | wherever such third-party notices normally appear. The contents 119 | of the NOTICE file are for informational purposes only and 120 | do not modify the License. You may add Your own attribution 121 | notices within Derivative Works that You distribute, alongside 122 | or as an addendum to the NOTICE text from the Work, provided 123 | that such additional attribution notices cannot be construed 124 | as modifying the License. 125 | 126 | You may add Your own copyright statement to Your modifications and 127 | may provide additional or different license terms and conditions 128 | for use, reproduction, or distribution of Your modifications, or 129 | for any such Derivative Works as a whole, provided Your use, 130 | reproduction, and distribution of the Work otherwise complies with 131 | the conditions stated in this License. 132 | 133 | 5. Submission of Contributions. Unless You explicitly state otherwise, 134 | any Contribution intentionally submitted for inclusion in the Work 135 | by You to the Licensor shall be under the terms and conditions of 136 | this License, without any additional terms or conditions. 137 | Notwithstanding the above, nothing herein shall supersede or modify 138 | the terms of any separate license agreement you may have executed 139 | with Licensor regarding such Contributions. 140 | 141 | 6. Trademarks. This License does not grant permission to use the trade 142 | names, trademarks, service marks, or product names of the Licensor, 143 | except as required for reasonable and customary use in describing the 144 | origin of the Work and reproducing the content of the NOTICE file. 145 | 146 | 7. Disclaimer of Warranty. Unless required by applicable law or 147 | agreed to in writing, Licensor provides the Work (and each 148 | Contributor provides its Contributions) on an "AS IS" BASIS, 149 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 150 | implied, including, without limitation, any warranties or conditions 151 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 152 | PARTICULAR PURPOSE. You are solely responsible for determining the 153 | appropriateness of using or redistributing the Work and assume any 154 | risks associated with Your exercise of permissions under this License. 155 | 156 | 8. Limitation of Liability. In no event and under no legal theory, 157 | whether in tort (including negligence), contract, or otherwise, 158 | unless required by applicable law (such as deliberate and grossly 159 | negligent acts) or agreed to in writing, shall any Contributor be 160 | liable to You for damages, including any direct, indirect, special, 161 | incidental, or consequential damages of any character arising as a 162 | result of this License or out of the use or inability to use the 163 | Work (including but not limited to damages for loss of goodwill, 164 | work stoppage, computer failure or malfunction, or any and all 165 | other commercial damages or losses), even if such Contributor 166 | has been advised of the possibility of such damages. 167 | 168 | 9. Accepting Warranty or Additional Liability. While redistributing 169 | the Work or Derivative Works thereof, You may choose to offer, 170 | and charge a fee for, acceptance of support, warranty, indemnity, 171 | or other liability obligations and/or rights consistent with this 172 | License. However, in accepting such obligations, You may act only 173 | on Your own behalf and on Your sole responsibility, not on behalf 174 | of any other Contributor, and only if You agree to indemnify, 175 | defend, and hold each Contributor harmless for any liability 176 | incurred by, or claims asserted against, such Contributor by reason 177 | of your accepting any such warranty or additional liability. 178 | 179 | END OF TERMS AND CONDITIONS 180 | 181 | APPENDIX: How to apply the Apache License to your work. 182 | 183 | To apply the Apache License to your work, attach the following 184 | boilerplate notice, with the fields enclosed by brackets "[]" 185 | replaced with your own identifying information. (Don't include 186 | the brackets!) The text should be enclosed in the appropriate 187 | comment syntax for the file format. We also recommend that a 188 | file or class name and description of purpose be included on the 189 | same "printed page" as the copyright notice for easier 190 | identification within third-party archives. 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | graft docs 3 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: doc dry_publish 2 | 3 | docdir = docs 4 | doc: 5 | if [ -a $(docdir)/README.rst ]; then rm $(docdir)/README.rst; fi; 6 | pandoc --from=markdown --to=rst --output=$(docdir)/README.rst README.md 7 | if [ -a $(docdir)/HISTORY.rst ]; then rm $(docdir)/HISTORY.rst; fi; 8 | pandoc --from=markdown --to=rst --output=$(docdir)/HISTORY.rst CHANGELOG.md 9 | python setup.py check --restructuredtext 10 | 11 | dry_publish: 12 | rm -rf dist/ build/ 13 | python setup.py sdist bdist_wheel 14 | 15 | publish: dry_publish 16 | twine upload -s dist/* 17 | 18 | flake8: 19 | flake8 --ignore=E501,F401,E128,E402,E731,F821 wechatsogou 20 | 21 | tox: 22 | pyenv local 2.7.12 3.5.3 3.6.1 23 | tox 24 | 25 | gendoc: 26 | echo '---\nname: Change Log\n---\n' > docs/src/CHANGELOG.mdx 27 | cat CHANGELOG.md >> docs/src/CHANGELOG.mdx 28 | cd docs/src/ && yarn build && rm -rf ../static && mv .docz/dist/* ../ 29 | 30 | clean: 31 | @rm -rf build/ wechatsogou.egg-info/ dist/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 基于搜狗微信搜索的微信公众号爬虫接口 2 | === 3 | 4 | [![Build Status](https://travis-ci.org/Chyroc/WechatSogou.svg?branch=master)](https://github.com/Chyroc/WechatSogou) 5 | [![PyPI version](https://badge.fury.io/py/wechatsogou.svg)](https://github.com/Chyroc/WechatSogou) 6 | [![PyPI](https://img.shields.io/pypi/wheel/wechatsogou.svg)](https://github.com/Chyroc/WechatSogou) 7 | [![py27,py35,py36](https://img.shields.io/pypi/pyversions/wechatsogou.svg)](https://github.com/Chyroc/WechatSogou) 8 | [![PyPI](https://img.shields.io/pypi/l/wechatsogou.svg)](https://github.com/Chyroc/WechatSogou) 9 | 10 | 我的另外一个作品: https://github.com/chyroc/lark ,基于代码生成的 Lark/飞书 Go SDK,欢迎 star 。 11 | 12 | ![ws_api.get_gzh_info('南航青年志愿者')](https://raw.githubusercontent.com/chyroc/wechatsogou/master/screenshot/get_gzh_info.png) 13 | 14 | ``` 15 | __ __ _ _ ____ 16 | \ \ / /__ ___| |__ __ _| |_/ ___| ___ __ _ ___ _ _ 17 | \ \ /\ / / _ \/ __| '_ \ / _` | __\___ \ / _ \ / _` |/ _ \| | | | 18 | \ V V / __/ (__| | | | (_| | |_ ___) | (_) | (_| | (_) | |_| | 19 | \_/\_/ \___|\___|_| |_|\__,_|\__|____/ \___/ \__, |\___/ \__,_| 20 | |___/ 21 | ``` 22 | 23 | # 项目简介 24 | 基于搜狗微信搜索的微信公众号爬虫接口,可以扩展成基于搜狗搜索的爬虫 25 | 26 | 如果有问题,请提issue 27 | 28 | [CHANGELOG](./CHANGELOG.md) 29 | 30 | # 交流分享 31 | 32 | - QQ群(只需加一个) 33 | - 一群 132955136(已满) 34 | - 二群 819084985 35 | 36 | - 微信群 37 | 38 | 39 | # 赞助作者 40 | 甲鱼说,咖啡是灵魂的饮料,买点咖啡 41 | 42 | [谢谢这些人的☕️](./coffee.md) 43 | 44 | 支付宝扫码大家一起领红包: 45 | 46 | 47 | 48 | 或者直接转账: 49 | 50 | 51 | 52 | 53 | # 问题集锦 54 | Q:没有得到原始文章url / 提示链接已经过期? 55 | A:微信屏蔽此接口,请在临时链接有效期内保存文章内容。 56 | 57 | Q:获取文章只能10篇? 58 | A:是的,仅显示最近10条群发。 59 | 60 | Q:使用的是python 2 还是 3? 61 | A:都支持,若出错,请报BUG。 62 | 63 | # 安装 64 | ``` 65 | pip install wechatsogou --upgrade 66 | ``` 67 | 68 | # 使用 69 | 70 | ### 初始化 API 71 | 72 | ```python 73 | import wechatsogou 74 | 75 | # 可配置参数 76 | 77 | # 直连 78 | ws_api = wechatsogou.WechatSogouAPI() 79 | 80 | # 验证码输入错误的重试次数,默认为1 81 | ws_api = wechatsogou.WechatSogouAPI(captcha_break_time=3) 82 | 83 | # 所有requests库的参数都能在这用 84 | # 如 配置代理,代理列表中至少需包含1个 HTTPS 协议的代理, 并确保代理可用 85 | ws_api = wechatsogou.WechatSogouAPI(proxies={ 86 | "http": "127.0.0.1:8888", 87 | "https": "127.0.0.1:8888", 88 | }) 89 | 90 | # 如 设置超时 91 | ws_api = wechatsogou.WechatSogouAPI(timeout=0.1) 92 | ``` 93 | 94 | 95 | ### 获取特定公众号信息 - get_gzh_info 96 | 97 | ![ws_api.get_gzh_info('南航青年志愿者')](https://raw.githubusercontent.com/chyroc/wechatsogou/master/screenshot/get_gzh_info.png) 98 | 99 | - 使用 100 | ``` 101 | In [5]: import wechatsogou 102 | ...: 103 | ...: ws_api =wechatsogou.WechatSogouAPI() 104 | ...: ws_api.get_gzh_info('南航青年志愿者') 105 | ...: 106 | Out[5]: 107 | { 108 | 'authentication': '南京航空航天大学', 109 | 'headimage': 'http://img01.sogoucdn.com/app/a/100520090/oIWsFt1tmWoG6vO6BcsS7St61bRE', 110 | 'introduction': '南航大志愿活动的领跑者,为你提供校内外的志愿资源和精彩消息.', 111 | 'post_perm': 26, 112 | 'view_perm': 1000, 113 | 'profile_url': 'http://mp.weixin.qq.com/profile?src=3×tamp=1501140102&ver=1&signature=OpcTZp20TUdKHjSqWh7m73RWBIzwYwINpib2ZktBkLG8NyHamTvK2jtzl7mf-VdpE246zXAq18GNm*S*bq4klw==', 114 | 'qrcode': 'http://mp.weixin.qq.com/rr?src=3×tamp=1501140102&ver=1&signature=-DnFampQflbiOadckRJaTaDRzGSNfisIfECELSo-lN-GeEOH8-XTtM*ASdavl0xuavw-bmAEQXOa1T39*EIsjzxz30LjyBNkjmgbT6bGnZM=', 115 | 'wechat_id': 'nanhangqinggong', 116 | 'wechat_name': '南航青年志愿者' 117 | } 118 | ``` 119 | 120 | - 返回数据结构 121 | ```python 122 | { 123 | 'profile_url': '', # 最近10条群发页链接 124 | 'headimage': '', # 头像 125 | 'wechat_name': '', # 名称 126 | 'wechat_id': '', # 微信id 127 | 'post_perm': int, # 最近一月群发数 128 | 'view_perm': int, # 最近一月阅读量 129 | 'qrcode': '', # 二维码 130 | 'introduction': '', # 简介 131 | 'authentication': '' # 认证 132 | } 133 | ``` 134 | 135 | ### 搜索公众号 136 | 137 | ![ws_api.search_gzh('南京航空航天大学')](https://raw.githubusercontent.com/chyroc/wechatsogou/master/screenshot/search_gzh.png) 138 | 139 | - 使用 140 | ``` 141 | In [6]: import wechatsogou 142 | ...: 143 | ...: ws_api =wechatsogou.WechatSogouAPI() 144 | ...: ws_api.search_gzh('南京航空航天大学') 145 | ...: 146 | Out[6]: 147 | [ 148 | { 149 | 'authentication': '南京航空航天大学', 150 | 'headimage': 'http://img01.sogoucdn.com/app/a/100520090/oIWsFt1MvjqspMDVvZjpmxyo36sU', 151 | 'introduction': '南京航空航天大学官方微信', 152 | 'post_perm': 0, 153 | 'view_perm': 0, 154 | 'profile_url': 'http://mp.weixin.qq.com/profile?src=3×tamp=1501141990&ver=1&signature=S-7U131D3eQERC8yJGVAg2edySXn*qGVi5uE8QyQU034di*2mS6vGJVnQBRB0It9t9M-Qn7ynvjRKZNQrjBMEg==', 155 | 'qrcode': 'http://mp.weixin.qq.com/rr?src=3×tamp=1501141990&ver=1&signature=Tlp-r0AaBRxtx3TuuyjdxmjiR4aEJY-hjh0kmtV6byVu3QIQYiMlJttJgGu0hwtZMZCCntdfaP5jD4JXipTwoGecAze8ycEF5KYZqtLSsNE=', 156 | 'wechat_id': 'NUAA_1952', 157 | 'wechat_name': '南京航空航天大学' 158 | }, 159 | { 160 | 'authentication': '南京航空航天大学', 161 | 'headimage': 'http://img01.sogoucdn.com/app/a/100520090/oIWsFtwVmjdK_57vIKeMceGXF5BQ', 162 | 'introduction': '南京航空航天大学团委官方微信平台', 163 | 'post_perm': 0, 164 | 'view_perm': 0, 165 | 'profile_url': 'http://mp.weixin.qq.com/profile?src=3×tamp=1501141990&ver=1&signature=aXFQrSDOiZJHedlL7vtAkvFMckxBmubE9VGrVczTwS601bOIT5Nrr8Pcgs6bQ-oEd6jdQ0aK5WCQjNwMAhJnyQ==', 166 | 'qrcode': 'http://mp.weixin.qq.com/rr?src=3×tamp=1501141990&ver=1&signature=7Cpbd9CVQsXJkExRcU5VM6NuyoxDQQfVfF7*CGI-PTR0y6stHPtdSDqzAzvPMWz67Xz9IMF2TDfu4Cndj5bKxlsFh6wGhiLH0b9ZKqgCW5k=', 167 | 'wechat_id': 'nuaa_tw', 168 | 'wechat_name': '南京航空航天大学团委' 169 | }, 170 | ... 171 | ] 172 | ``` 173 | 174 | - 数据结构 175 | 176 | list of dict, dict: 177 | 178 | ```python 179 | { 180 | 'profile_url': '', # 最近10条群发页链接 181 | 'headimage': '', # 头像 182 | 'wechat_name': '', # 名称 183 | 'wechat_id': '', # 微信id 184 | 'post_perm': int, # 最近一月群发数 185 | 'view_perm': int, # 最近一月阅读量 186 | 'qrcode': '', # 二维码 187 | 'introduction': '', # 介绍 188 | 'authentication': '' # 认证 189 | } 190 | ``` 191 | 192 | ### 搜索微信文章 193 | 194 | ![ws_api.search_article('南京航空航天大学')](https://raw.githubusercontent.com/chyroc/wechatsogou/master/screenshot/search_article.png) 195 | 196 | - 使用 197 | ``` 198 | In [7]: import wechatsogou 199 | ...: 200 | ...: ws_api =wechatsogou.WechatSogouAPI() 201 | ...: ws_api.search_article('南京航空航天大学') 202 | ...: 203 | Out[7]: 204 | [ 205 | { 206 | 'article': { 207 | 'abstract': '【院校省份】江苏【报名时间】4月5日截止【考试时间】6月10日-11日南京航空航天大学2017年自主招生简章南京航空航天大学2017...', 208 | 'imgs': ['http://img01.sogoucdn.com/net/a/04/link?appid=100520033&url=http://mmbiz.qpic.cn/mmbiz_png/P07yicBRJfC71QB3lREx4J4x34QOibGaia5BkiaaiaiaibicWkTBULou9R08K6FaxlUA1RFBFWCmpO1Lepk7ZcXK45vguQ/0?wx_fmt=png'], 209 | 'time': 1490270644, 210 | 'title': '南京航空航天大学2017年自主招生简章', 211 | 'url': 'http://mp.weixin.qq.com/s?src=3×tamp=1501142580&ver=1&signature=hRMlQOLQpu4BNhBACavusZdmk**D65qHyz5LWDq1lPjVcm7*iiBS0l7Pq40h0fiCX*bZ8vSMLzAMDNzELYFKIQ7mND0-7cQi-N0BtfTBql*CQdsHun-GtaYEqRva6Ukwce3gZh46SXJzo90kyZ3dwVYl6*589bGDIzG6JTGfpxI=' 212 | }, 213 | 'gzh': { 214 | 'headimage': 'http://wx.qlogo.cn/mmhead/Q3auHgzwzM5kiawibor6ABhnibMYnOADvqdcrl5XWiaFfM5mGYZ8cUica6A/0', 215 | 'isv': 0, 216 | 'profile_url': 'http://mp.weixin.qq.com/profile?src=3×tamp=1501142580&ver=1&signature=dVkDdcFr1suL1WHdCOJj7pwZhG9W*APi-j5kRtS09ccv-WID-zNs0ecDiiz1wwE7qbNSk5HBL*ffpyVXcF0fFQ==', 217 | 'wechat_name': '自主招生在线' 218 | } 219 | }, 220 | ... 221 | ] 222 | ``` 223 | 224 | - 数据结构 225 | 226 | list of dict, dict: 227 | ```python 228 | { 229 | 'article': { 230 | 'title': '', # 文章标题 231 | 'url': '', # 文章链接 232 | 'imgs': '', # 文章图片list 233 | 'abstract': '', # 文章摘要 234 | 'time': int # 文章推送时间 10位时间戳 235 | }, 236 | 'gzh': { 237 | 'profile_url': '', # 公众号最近10条群发页链接 238 | 'headimage': '', # 头像 239 | 'wechat_name': '', # 名称 240 | 'isv': int, # 是否加v 1 or 0 241 | } 242 | } 243 | ``` 244 | 245 | ### 解析最近文章页 - get_gzh_article_by_history 246 | 247 | ![ws_api.search_article('南京航空航天大学')](https://raw.githubusercontent.com/chyroc/wechatsogou/master/screenshot/get_gzh_article_by_history.png) 248 | 249 | - 使用 250 | ``` 251 | In [1]: import wechatsogou 252 | ...: 253 | ...: ws_api =wechatsogou.WechatSogouAPI() 254 | ...: ws_api.get_gzh_article_by_history('南航青年志愿者') 255 | ...: 256 | Out[1]: 257 | { 258 | 'article': [ 259 | { 260 | 'abstract': '我们所做的,并不能立马去改变什么——\n但千里之行,绿勤行永不止步。\n我们不会就此止步,之后我们又将再出发。\n 民勤,再见。\n绿勤行,不再见。', 261 | 'author': '', 262 | 'content_url': 'http://mp.weixin.qq.com/s?timestamp=1501143158&src=3&ver=1&signature=B-*tqUrFyO7OqpFeJZwTA7JJtsHpz6BgC8ugyfgpOnyWLtPb85R5Zmu0JuZRbZKG72x4bQjMCcsfA5mC3GSSOPbYd-9tzvTgmroGRmc4Tzk8090KCiEu6EjA0YMHeytWJWpxr51M2FUYQhTWJ01pTmNnXLVAG6Ex6AG52uvvmQA=', 263 | 'copyright_stat': 100, 264 | 'cover': 'http://mmbiz.qpic.cn/mmbiz_jpg/icFYWMxnmxHDYgXNjAle7szYLgQmicbaQlb1eVFuwp2vxEu5eNVwYacaHah2N5W8dKAm725vxv5aM6DFlM59Wftg/0?wx_fmt=jpeg', 265 | 'datetime': 1501072594, 266 | 'fileid': 502326199, 267 | 'main': 1, 268 | 'send_id': 1000000306, 269 | 'source_url': '', 270 | 'title': '绿勤行——不说再见', 271 | 'type': '49' 272 | }, 273 | { 274 | 'abstract': '当时不杂,过往不恋,志愿不老,我们不散!', 275 | 'author': '', 276 | 'content_url': 'http://mp.weixin.qq.com/s?timestamp=1501143158&src=3&ver=1&signature=B-*tqUrFyO7OqpFeJZwTA7JJtsHpz6BgC8ugyfgpOnyWLtPb85R5Zmu0JuZRbZKG72x4bQjMCcsfA5mC3GSSOGUrM*jg*EP1jU-Dyf2CVqmPnOgBiET2wlitek4FcRbXorAswWHm*1rqODcN52NtfKD-OcRTazQS*t5SnJtu3ZA=', 277 | 'copyright_stat': 100, 278 | 'cover': 'http://mmbiz.qpic.cn/mmbiz_jpg/icFYWMxnmxHCoY44nPUXvkSgpZI1LaEsZfkZvtGaiaNW2icjibCp6qs93xLlr9kXMJEP3z1pmQ6TbRZNicHibGzRwh1w/0?wx_fmt=jpeg', 279 | 'datetime': 1500979158, 280 | 'fileid': 502326196, 281 | 'main': 1, 282 | 'send_id': 1000000305, 283 | 'source_url': '', 284 | 'title': '有始有终 | 2016-2017年度环境保护服务部工作总结', 285 | 'type': '49' 286 | }, 287 | ... 288 | ], 289 | 'gzh': { 290 | 'authentication': '南京航空航天大学', 291 | 'headimage': 'http://wx.qlogo.cn/mmhead/Q3auHgzwzM4xV5PgPjK5XoPaaQoxnWJAFicibMvPAnsoybawMBFxua1g/0', 292 | 'introduction': '南航大志愿活动的领跑者,为你提供校内外的志愿资源和精彩消息。', 293 | 'wechat_id': 'nanhangqinggong', 294 | 'wechat_name': '南航青年志愿者' 295 | } 296 | } 297 | ``` 298 | - 数据结构 299 | ```python 300 | { 301 | 'gzh': { 302 | 'wechat_name': '', # 名称 303 | 'wechat_id': '', # 微信id 304 | 'introduction': '', # 简介 305 | 'authentication': '', # 认证 306 | 'headimage': '' # 头像 307 | }, 308 | 'article': [ 309 | { 310 | 'send_id': int, # 群发id,注意不唯一,因为同一次群发多个消息,而群发id一致 311 | 'datetime': int, # 群发datatime 10位时间戳 312 | 'type': '', # 消息类型,均是49(在手机端历史消息页有其他类型,网页端最近10条消息页只有49),表示图文 313 | 'main': int, # 是否是一次群发的第一次消息 1 or 0 314 | 'title': '', # 文章标题 315 | 'abstract': '', # 摘要 316 | 'fileid': int, # 317 | 'content_url': '', # 文章链接 318 | 'source_url': '', # 阅读原文的链接 319 | 'cover': '', # 封面图 320 | 'author': '', # 作者 321 | 'copyright_stat': int, # 文章类型,例如:原创啊 322 | }, 323 | ... 324 | ] 325 | } 326 | 327 | ``` 328 | 329 | ### 解析 首页热门 页 - get_gzh_article_by_hot 330 | 331 | ![ws_api.get_gzh_article_by_hot(WechatSogouConst.hot_index.food)](https://raw.githubusercontent.com/chyroc/wechatsogou/master/screenshot/get_gzh_article_by_hot.png) 332 | 333 | - 使用 334 | ``` 335 | In [1]: from pprint import pprint 336 | ...: from wechatsogou import WechatSogouAPI, WechatSogouConst 337 | ...: 338 | ...: ws_api = WechatSogouAPI() 339 | ...: gzh_articles = ws_api.get_gzh_article_by_hot(WechatSogouConst.hot_index.food) 340 | ...: for i in gzh_articles: 341 | ...: pprint(i) 342 | ...: 343 | { 344 | 'article': { 345 | 'abstract': '闷热的夏天有什么事情能比吃上凉凉的甜品更惬意的呢?快一起动手做起来吧,简单方便,放冰箱冻一冻,那感觉~橙汁蒸木瓜木瓜1个(300-400克左右),橙子4个,枫糖浆20克(如果家里没有,也可以用蜂蜜、炼乳等代替),椰果适量。做法1.用削皮', 346 | 'main_img': 'http://img01.sogoucdn.com/net/a/04/link?appid=100520033&url=http%3A%2F%2Fmmbiz.qpic.cn%2Fmmbiz_jpg%2Fw9UGwFPia7QTUIadPibgW8OFkqf1ibR40xicKfzofRS0sDpaFp3CG0jkPyQKeXl44TXswztW1SJnic7tmCibjB8rIIGw%2F0%3Fwx_fmt%3Djpeg', 347 | 'open_id': 'oIWsFty9hHVI9F10amtzx5TOWIq8', 348 | 'time': 1501325220, 349 | 'title': '夏日甜品制作方法,不收藏后悔哦!', 350 | 'url': 'http://mp.weixin.qq.com/s?src=3×tamp=1501328525&ver=1&signature=n9*oX0k4YbNFhNMsOjIekYrsha44lfBSCbG9jicAbGYrWNN8*48NzpcaHdxwUnC12syY5-ZxwcBfiJlMzdbAwWKlo26EW14w2Ax*gjLVlOX-AGXB4443obZ-GK0pw*AFZAGZD8sI4AFBZSZpyeaxN4sS7cpynxdIuw6S2h*--LI=' 351 | }, 352 | 'gzh': { 353 | 'headimage': 'http://img03.sogoucdn.com/app/a/100520090/oIWsFty9hHVI9F10amtzx5TOWIq8', 354 | 'wechat_name': '甜品烘焙制作坊' 355 | } 356 | } 357 | ... 358 | ... 359 | ``` 360 | 361 | - 数据结构 362 | ```python 363 | { 364 | 'gzh': { 365 | 'headimage': str, # 公众号头像 366 | 'wechat_name': str, # 公众号名称 367 | }, 368 | 'article': { 369 | 'url': str, # 文章临时链接 370 | 'title': str, # 文章标题 371 | 'abstract': str, # 文章摘要 372 | 'time': int, # 推送时间,10位时间戳 373 | 'open_id': str, # open id 374 | 'main_img': str # 封面图片 375 | } 376 | } 377 | ``` 378 | 379 | ### 获取关键字联想词 380 | - 使用 381 | ``` 382 | In [1]: import wechatsogou 383 | ...: 384 | ...: ws_api =wechatsogou.WechatSogouAPI() 385 | ...: ws_api.get_sugg('高考') 386 | ...: 387 | Out[1]: 388 | ['高考e通', 389 | '高考专业培训', 390 | '高考地理俱乐部', 391 | '高考志愿填报咨讯', 392 | '高考报考资讯', 393 | '高考教育', 394 | '高考早知道', 395 | '高考服务志愿者', 396 | '高考机构', 397 | '高考福音'] 398 | ``` 399 | 400 | - 数据结构 401 | 402 | 关键词列表 403 | ```python 404 | ['a', 'b', ...] 405 | ``` 406 | --- 407 | 408 | # TODO 409 | - [x] ~~相似文章的公众号获取~~ 410 | - [ ] 主页热门公众号获取 411 | - [ ] 文章详情页信息 412 | - [x] ~~所有类型的解析~~ 413 | - [ ] 验证码识别 414 | - [ ] 接入爬虫框架 415 | - [x] 兼容py2 416 | 417 | --- 418 | -------------------------------------------------------------------------------- /coffee.md: -------------------------------------------------------------------------------- 1 | 谢谢这些人的☕️ 2 | 3 | name | age 4 | ---- | --- 5 | ax4 | 50 6 | 风雨坛·君 | 50 7 | 陆小凤 | 28.88 8 | 朋鑫 | 18.88 9 | JenkinsY94 | 9.99 10 | yudun1989 | 50 11 | 妥妥的 | 50 12 | -------------------------------------------------------------------------------- /docs/README.rst: -------------------------------------------------------------------------------- 1 | 基于搜狗微信搜索的微信公众号爬虫接口 2 | ==================================== 3 | 4 | |Build Status| |PyPI version| |PyPI| |py27,py35,py36| |PyPI| 5 | 6 | .. figure:: https://raw.githubusercontent.com/chyroc/wechatsogou/master/screenshot/get_gzh_info.png 7 | :alt: ws_api.get_gzh_info(‘南航青年志愿者’) 8 | 9 | ws_api.get_gzh_info(‘南航青年志愿者’) 10 | 11 | :: 12 | 13 | __ __ _ _ ____ 14 | \ \ / /__ ___| |__ __ _| |_/ ___| ___ __ _ ___ _ _ 15 | \ \ /\ / / _ \/ __| '_ \ / _` | __\___ \ / _ \ / _` |/ _ \| | | | 16 | \ V V / __/ (__| | | | (_| | |_ ___) | (_) | (_| | (_) | |_| | 17 | \_/\_/ \___|\___|_| |_|\__,_|\__|____/ \___/ \__, |\___/ \__,_| 18 | |___/ 19 | 20 | 项目简介 21 | ======== 22 | 23 | 基于搜狗微信搜索的微信公众号爬虫接口,可以扩展成基于搜狗搜索的爬虫 24 | 25 | 如果有问题,请提issue 26 | 27 | `CHANGELOG <./CHANGELOG.md>`__ 28 | 29 | 交流分享 30 | ======== 31 | 32 | - QQ群(只需加一个) 33 | 34 | - 一群 132955136(已满) 35 | - 二群 819084985 36 | 37 | - 微信群 38 | 39 | 赞助作者 40 | ======== 41 | 42 | 甲鱼说,咖啡是灵魂的饮料,买点咖啡 43 | 44 | `谢谢这些人的☕️ <./coffee.md>`__ 45 | 46 | 支付宝扫码大家一起领红包: 47 | 48 | 或者直接转账: 49 | 50 | 问题集锦 51 | ======== 52 | 53 | :: 54 | 55 | Q:没有得到原始文章url / 提示链接已经过期? 56 | A:微信屏蔽此接口,请在临时链接有效期内保存文章内容。 57 | 58 | Q:获取文章只能10篇? 59 | A:是的,仅显示最近10条群发。 60 | 61 | Q:使用的是python 2 还是 3? 62 | A:都支持,若出错,请报BUG。 63 | 64 | 安装 65 | ==== 66 | 67 | :: 68 | 69 | pip install wechatsogou --upgrade 70 | 71 | 使用 72 | ==== 73 | 74 | 初始化 API 75 | ~~~~~~~~~~ 76 | 77 | .. code:: python 78 | 79 | import wechatsogou 80 | 81 | # 可配置参数 82 | 83 | # 直连 84 | ws_api = wechatsogou.WechatSogouAPI() 85 | 86 | # 验证码输入错误的重试次数,默认为1 87 | ws_api = wechatsogou.WechatSogouAPI(captcha_break_time=3) 88 | 89 | # 所有requests库的参数都能在这用 90 | # 如 配置代理,代理列表中至少需包含1个 HTTPS 协议的代理, 并确保代理可用 91 | ws_api = wechatsogou.WechatSogouAPI(proxies={ 92 | "http": "127.0.0.1:8888", 93 | "https": "127.0.0.1:8888", 94 | }) 95 | 96 | # 如 设置超时 97 | ws_api = wechatsogou.WechatSogouAPI(timeout=0.1) 98 | 99 | 获取特定公众号信息 - get_gzh_info 100 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 101 | 102 | .. figure:: https://raw.githubusercontent.com/chyroc/wechatsogou/master/screenshot/get_gzh_info.png 103 | :alt: ws_api.get_gzh_info(‘南航青年志愿者’) 104 | 105 | ws_api.get_gzh_info(‘南航青年志愿者’) 106 | 107 | - 使用 108 | 109 | :: 110 | 111 | In [5]: import wechatsogou 112 | ...: 113 | ...: ws_api =wechatsogou.WechatSogouAPI() 114 | ...: ws_api.get_gzh_info('南航青年志愿者') 115 | ...: 116 | Out[5]: 117 | { 118 | 'authentication': '南京航空航天大学', 119 | 'headimage': 'http://img01.sogoucdn.com/app/a/100520090/oIWsFt1tmWoG6vO6BcsS7St61bRE', 120 | 'introduction': '南航大志愿活动的领跑者,为你提供校内外的志愿资源和精彩消息.', 121 | 'post_perm': 26, 122 | 'view_perm': 1000, 123 | 'profile_url': 'http://mp.weixin.qq.com/profile?src=3×tamp=1501140102&ver=1&signature=OpcTZp20TUdKHjSqWh7m73RWBIzwYwINpib2ZktBkLG8NyHamTvK2jtzl7mf-VdpE246zXAq18GNm*S*bq4klw==', 124 | 'qrcode': 'http://mp.weixin.qq.com/rr?src=3×tamp=1501140102&ver=1&signature=-DnFampQflbiOadckRJaTaDRzGSNfisIfECELSo-lN-GeEOH8-XTtM*ASdavl0xuavw-bmAEQXOa1T39*EIsjzxz30LjyBNkjmgbT6bGnZM=', 125 | 'wechat_id': 'nanhangqinggong', 126 | 'wechat_name': '南航青年志愿者' 127 | } 128 | 129 | - 返回数据结构 130 | 131 | .. code:: python 132 | 133 | { 134 | 'profile_url': '', # 最近10条群发页链接 135 | 'headimage': '', # 头像 136 | 'wechat_name': '', # 名称 137 | 'wechat_id': '', # 微信id 138 | 'post_perm': int, # 最近一月群发数 139 | 'view_perm': int, # 最近一月阅读量 140 | 'qrcode': '', # 二维码 141 | 'introduction': '', # 简介 142 | 'authentication': '' # 认证 143 | } 144 | 145 | 搜索公众号 146 | ~~~~~~~~~~ 147 | 148 | .. figure:: https://raw.githubusercontent.com/chyroc/wechatsogou/master/screenshot/search_gzh.png 149 | :alt: ws_api.search_gzh(‘南京航空航天大学’) 150 | 151 | ws_api.search_gzh(‘南京航空航天大学’) 152 | 153 | - 使用 154 | 155 | :: 156 | 157 | In [6]: import wechatsogou 158 | ...: 159 | ...: ws_api =wechatsogou.WechatSogouAPI() 160 | ...: ws_api.search_gzh('南京航空航天大学') 161 | ...: 162 | Out[6]: 163 | [ 164 | { 165 | 'authentication': '南京航空航天大学', 166 | 'headimage': 'http://img01.sogoucdn.com/app/a/100520090/oIWsFt1MvjqspMDVvZjpmxyo36sU', 167 | 'introduction': '南京航空航天大学官方微信', 168 | 'post_perm': 0, 169 | 'view_perm': 0, 170 | 'profile_url': 'http://mp.weixin.qq.com/profile?src=3×tamp=1501141990&ver=1&signature=S-7U131D3eQERC8yJGVAg2edySXn*qGVi5uE8QyQU034di*2mS6vGJVnQBRB0It9t9M-Qn7ynvjRKZNQrjBMEg==', 171 | 'qrcode': 'http://mp.weixin.qq.com/rr?src=3×tamp=1501141990&ver=1&signature=Tlp-r0AaBRxtx3TuuyjdxmjiR4aEJY-hjh0kmtV6byVu3QIQYiMlJttJgGu0hwtZMZCCntdfaP5jD4JXipTwoGecAze8ycEF5KYZqtLSsNE=', 172 | 'wechat_id': 'NUAA_1952', 173 | 'wechat_name': '南京航空航天大学' 174 | }, 175 | { 176 | 'authentication': '南京航空航天大学', 177 | 'headimage': 'http://img01.sogoucdn.com/app/a/100520090/oIWsFtwVmjdK_57vIKeMceGXF5BQ', 178 | 'introduction': '南京航空航天大学团委官方微信平台', 179 | 'post_perm': 0, 180 | 'view_perm': 0, 181 | 'profile_url': 'http://mp.weixin.qq.com/profile?src=3×tamp=1501141990&ver=1&signature=aXFQrSDOiZJHedlL7vtAkvFMckxBmubE9VGrVczTwS601bOIT5Nrr8Pcgs6bQ-oEd6jdQ0aK5WCQjNwMAhJnyQ==', 182 | 'qrcode': 'http://mp.weixin.qq.com/rr?src=3×tamp=1501141990&ver=1&signature=7Cpbd9CVQsXJkExRcU5VM6NuyoxDQQfVfF7*CGI-PTR0y6stHPtdSDqzAzvPMWz67Xz9IMF2TDfu4Cndj5bKxlsFh6wGhiLH0b9ZKqgCW5k=', 183 | 'wechat_id': 'nuaa_tw', 184 | 'wechat_name': '南京航空航天大学团委' 185 | }, 186 | ... 187 | ] 188 | 189 | - 数据结构 190 | 191 | list of dict, dict: 192 | 193 | .. code:: python 194 | 195 | { 196 | 'profile_url': '', # 最近10条群发页链接 197 | 'headimage': '', # 头像 198 | 'wechat_name': '', # 名称 199 | 'wechat_id': '', # 微信id 200 | 'post_perm': int, # 最近一月群发数 201 | 'view_perm': int, # 最近一月阅读量 202 | 'qrcode': '', # 二维码 203 | 'introduction': '', # 介绍 204 | 'authentication': '' # 认证 205 | } 206 | 207 | 搜索微信文章 208 | ~~~~~~~~~~~~ 209 | 210 | .. figure:: https://raw.githubusercontent.com/chyroc/wechatsogou/master/screenshot/search_article.png 211 | :alt: ws_api.search_article(‘南京航空航天大学’) 212 | 213 | ws_api.search_article(‘南京航空航天大学’) 214 | 215 | - 使用 216 | 217 | :: 218 | 219 | In [7]: import wechatsogou 220 | ...: 221 | ...: ws_api =wechatsogou.WechatSogouAPI() 222 | ...: ws_api.search_article('南京航空航天大学') 223 | ...: 224 | Out[7]: 225 | [ 226 | { 227 | 'article': { 228 | 'abstract': '【院校省份】江苏【报名时间】4月5日截止【考试时间】6月10日-11日南京航空航天大学2017年自主招生简章南京航空航天大学2017...', 229 | 'imgs': ['http://img01.sogoucdn.com/net/a/04/link?appid=100520033&url=http://mmbiz.qpic.cn/mmbiz_png/P07yicBRJfC71QB3lREx4J4x34QOibGaia5BkiaaiaiaibicWkTBULou9R08K6FaxlUA1RFBFWCmpO1Lepk7ZcXK45vguQ/0?wx_fmt=png'], 230 | 'time': 1490270644, 231 | 'title': '南京航空航天大学2017年自主招生简章', 232 | 'url': 'http://mp.weixin.qq.com/s?src=3×tamp=1501142580&ver=1&signature=hRMlQOLQpu4BNhBACavusZdmk**D65qHyz5LWDq1lPjVcm7*iiBS0l7Pq40h0fiCX*bZ8vSMLzAMDNzELYFKIQ7mND0-7cQi-N0BtfTBql*CQdsHun-GtaYEqRva6Ukwce3gZh46SXJzo90kyZ3dwVYl6*589bGDIzG6JTGfpxI=' 233 | }, 234 | 'gzh': { 235 | 'headimage': 'http://wx.qlogo.cn/mmhead/Q3auHgzwzM5kiawibor6ABhnibMYnOADvqdcrl5XWiaFfM5mGYZ8cUica6A/0', 236 | 'isv': 0, 237 | 'profile_url': 'http://mp.weixin.qq.com/profile?src=3×tamp=1501142580&ver=1&signature=dVkDdcFr1suL1WHdCOJj7pwZhG9W*APi-j5kRtS09ccv-WID-zNs0ecDiiz1wwE7qbNSk5HBL*ffpyVXcF0fFQ==', 238 | 'wechat_name': '自主招生在线' 239 | } 240 | }, 241 | ... 242 | ] 243 | 244 | - 数据结构 245 | 246 | list of dict, dict: 247 | 248 | .. code:: python 249 | 250 | { 251 | 'article': { 252 | 'title': '', # 文章标题 253 | 'url': '', # 文章链接 254 | 'imgs': '', # 文章图片list 255 | 'abstract': '', # 文章摘要 256 | 'time': int # 文章推送时间 10位时间戳 257 | }, 258 | 'gzh': { 259 | 'profile_url': '', # 公众号最近10条群发页链接 260 | 'headimage': '', # 头像 261 | 'wechat_name': '', # 名称 262 | 'isv': int, # 是否加v 1 or 0 263 | } 264 | } 265 | 266 | 解析最近文章页 - get_gzh_article_by_history 267 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 268 | 269 | .. figure:: https://raw.githubusercontent.com/chyroc/wechatsogou/master/screenshot/get_gzh_article_by_history.png 270 | :alt: ws_api.search_article(‘南京航空航天大学’) 271 | 272 | ws_api.search_article(‘南京航空航天大学’) 273 | 274 | - 使用 275 | 276 | :: 277 | 278 | In [1]: import wechatsogou 279 | ...: 280 | ...: ws_api =wechatsogou.WechatSogouAPI() 281 | ...: ws_api.get_gzh_article_by_history('南航青年志愿者') 282 | ...: 283 | Out[1]: 284 | { 285 | 'article': [ 286 | { 287 | 'abstract': '我们所做的,并不能立马去改变什么——\n但千里之行,绿勤行永不止步。\n我们不会就此止步,之后我们又将再出发。\n 民勤,再见。\n绿勤行,不再见。', 288 | 'author': '', 289 | 'content_url': 'http://mp.weixin.qq.com/s?timestamp=1501143158&src=3&ver=1&signature=B-*tqUrFyO7OqpFeJZwTA7JJtsHpz6BgC8ugyfgpOnyWLtPb85R5Zmu0JuZRbZKG72x4bQjMCcsfA5mC3GSSOPbYd-9tzvTgmroGRmc4Tzk8090KCiEu6EjA0YMHeytWJWpxr51M2FUYQhTWJ01pTmNnXLVAG6Ex6AG52uvvmQA=', 290 | 'copyright_stat': 100, 291 | 'cover': 'http://mmbiz.qpic.cn/mmbiz_jpg/icFYWMxnmxHDYgXNjAle7szYLgQmicbaQlb1eVFuwp2vxEu5eNVwYacaHah2N5W8dKAm725vxv5aM6DFlM59Wftg/0?wx_fmt=jpeg', 292 | 'datetime': 1501072594, 293 | 'fileid': 502326199, 294 | 'main': 1, 295 | 'send_id': 1000000306, 296 | 'source_url': '', 297 | 'title': '绿勤行——不说再见', 298 | 'type': '49' 299 | }, 300 | { 301 | 'abstract': '当时不杂,过往不恋,志愿不老,我们不散!', 302 | 'author': '', 303 | 'content_url': 'http://mp.weixin.qq.com/s?timestamp=1501143158&src=3&ver=1&signature=B-*tqUrFyO7OqpFeJZwTA7JJtsHpz6BgC8ugyfgpOnyWLtPb85R5Zmu0JuZRbZKG72x4bQjMCcsfA5mC3GSSOGUrM*jg*EP1jU-Dyf2CVqmPnOgBiET2wlitek4FcRbXorAswWHm*1rqODcN52NtfKD-OcRTazQS*t5SnJtu3ZA=', 304 | 'copyright_stat': 100, 305 | 'cover': 'http://mmbiz.qpic.cn/mmbiz_jpg/icFYWMxnmxHCoY44nPUXvkSgpZI1LaEsZfkZvtGaiaNW2icjibCp6qs93xLlr9kXMJEP3z1pmQ6TbRZNicHibGzRwh1w/0?wx_fmt=jpeg', 306 | 'datetime': 1500979158, 307 | 'fileid': 502326196, 308 | 'main': 1, 309 | 'send_id': 1000000305, 310 | 'source_url': '', 311 | 'title': '有始有终 | 2016-2017年度环境保护服务部工作总结', 312 | 'type': '49' 313 | }, 314 | ... 315 | ], 316 | 'gzh': { 317 | 'authentication': '南京航空航天大学', 318 | 'headimage': 'http://wx.qlogo.cn/mmhead/Q3auHgzwzM4xV5PgPjK5XoPaaQoxnWJAFicibMvPAnsoybawMBFxua1g/0', 319 | 'introduction': '南航大志愿活动的领跑者,为你提供校内外的志愿资源和精彩消息。', 320 | 'wechat_id': 'nanhangqinggong', 321 | 'wechat_name': '南航青年志愿者' 322 | } 323 | } 324 | 325 | - 数据结构 326 | 327 | .. code:: python 328 | 329 | { 330 | 'gzh': { 331 | 'wechat_name': '', # 名称 332 | 'wechat_id': '', # 微信id 333 | 'introduction': '', # 简介 334 | 'authentication': '', # 认证 335 | 'headimage': '' # 头像 336 | }, 337 | 'article': [ 338 | { 339 | 'send_id': int, # 群发id,注意不唯一,因为同一次群发多个消息,而群发id一致 340 | 'datetime': int, # 群发datatime 10位时间戳 341 | 'type': '', # 消息类型,均是49(在手机端历史消息页有其他类型,网页端最近10条消息页只有49),表示图文 342 | 'main': int, # 是否是一次群发的第一次消息 1 or 0 343 | 'title': '', # 文章标题 344 | 'abstract': '', # 摘要 345 | 'fileid': int, # 346 | 'content_url': '', # 文章链接 347 | 'source_url': '', # 阅读原文的链接 348 | 'cover': '', # 封面图 349 | 'author': '', # 作者 350 | 'copyright_stat': int, # 文章类型,例如:原创啊 351 | }, 352 | ... 353 | ] 354 | } 355 | 356 | 解析 首页热门 页 - get_gzh_article_by_hot 357 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 358 | 359 | .. figure:: https://raw.githubusercontent.com/chyroc/wechatsogou/master/screenshot/get_gzh_article_by_hot.png 360 | :alt: ws_api.get_gzh_article_by_hot(WechatSogouConst.hot_index.food) 361 | 362 | ws_api.get_gzh_article_by_hot(WechatSogouConst.hot_index.food) 363 | 364 | - 使用 365 | 366 | :: 367 | 368 | In [1]: from pprint import pprint 369 | ...: from wechatsogou import WechatSogouAPI, WechatSogouConst 370 | ...: 371 | ...: ws_api = WechatSogouAPI() 372 | ...: gzh_articles = ws_api.get_gzh_article_by_hot(WechatSogouConst.hot_index.food) 373 | ...: for i in gzh_articles: 374 | ...: pprint(i) 375 | ...: 376 | { 377 | 'article': { 378 | 'abstract': '闷热的夏天有什么事情能比吃上凉凉的甜品更惬意的呢?快一起动手做起来吧,简单方便,放冰箱冻一冻,那感觉~橙汁蒸木瓜木瓜1个(300-400克左右),橙子4个,枫糖浆20克(如果家里没有,也可以用蜂蜜、炼乳等代替),椰果适量。做法1.用削皮', 379 | 'main_img': 'http://img01.sogoucdn.com/net/a/04/link?appid=100520033&url=http%3A%2F%2Fmmbiz.qpic.cn%2Fmmbiz_jpg%2Fw9UGwFPia7QTUIadPibgW8OFkqf1ibR40xicKfzofRS0sDpaFp3CG0jkPyQKeXl44TXswztW1SJnic7tmCibjB8rIIGw%2F0%3Fwx_fmt%3Djpeg', 380 | 'open_id': 'oIWsFty9hHVI9F10amtzx5TOWIq8', 381 | 'time': 1501325220, 382 | 'title': '夏日甜品制作方法,不收藏后悔哦!', 383 | 'url': 'http://mp.weixin.qq.com/s?src=3×tamp=1501328525&ver=1&signature=n9*oX0k4YbNFhNMsOjIekYrsha44lfBSCbG9jicAbGYrWNN8*48NzpcaHdxwUnC12syY5-ZxwcBfiJlMzdbAwWKlo26EW14w2Ax*gjLVlOX-AGXB4443obZ-GK0pw*AFZAGZD8sI4AFBZSZpyeaxN4sS7cpynxdIuw6S2h*--LI=' 384 | }, 385 | 'gzh': { 386 | 'headimage': 'http://img03.sogoucdn.com/app/a/100520090/oIWsFty9hHVI9F10amtzx5TOWIq8', 387 | 'wechat_name': '甜品烘焙制作坊' 388 | } 389 | } 390 | ... 391 | ... 392 | 393 | - 数据结构 394 | 395 | .. code:: python 396 | 397 | { 398 | 'gzh': { 399 | 'headimage': str, # 公众号头像 400 | 'wechat_name': str, # 公众号名称 401 | }, 402 | 'article': { 403 | 'url': str, # 文章临时链接 404 | 'title': str, # 文章标题 405 | 'abstract': str, # 文章摘要 406 | 'time': int, # 推送时间,10位时间戳 407 | 'open_id': str, # open id 408 | 'main_img': str # 封面图片 409 | } 410 | } 411 | 412 | 获取关键字联想词 413 | ~~~~~~~~~~~~~~~~ 414 | 415 | - 使用 416 | 417 | :: 418 | 419 | In [1]: import wechatsogou 420 | ...: 421 | ...: ws_api =wechatsogou.WechatSogouAPI() 422 | ...: ws_api.get_sugg('高考') 423 | ...: 424 | Out[1]: 425 | ['高考e通', 426 | '高考专业培训', 427 | '高考地理俱乐部', 428 | '高考志愿填报咨讯', 429 | '高考报考资讯', 430 | '高考教育', 431 | '高考早知道', 432 | '高考服务志愿者', 433 | '高考机构', 434 | '高考福音'] 435 | 436 | - 数据结构 437 | 438 | 关键词列表 439 | 440 | .. code:: python 441 | 442 | ['a', 'b', ...] 443 | 444 | -------------- 445 | 446 | TODO 447 | ==== 448 | 449 | - ☒ [STRIKEOUT:相似文章的公众号获取] 450 | - ☐ 主页热门公众号获取 451 | - ☐ 文章详情页信息 452 | - ☒ [STRIKEOUT:所有类型的解析] 453 | - ☐ 验证码识别 454 | - ☐ 接入爬虫框架 455 | - ☒ 兼容py2 456 | 457 | -------------- 458 | 459 | .. |Build Status| image:: https://travis-ci.org/Chyroc/WechatSogou.svg?branch=master 460 | :target: https://github.com/Chyroc/WechatSogou 461 | .. |PyPI version| image:: https://badge.fury.io/py/wechatsogou.svg 462 | :target: https://github.com/Chyroc/WechatSogou 463 | .. |PyPI| image:: https://img.shields.io/pypi/wheel/wechatsogou.svg 464 | :target: https://github.com/Chyroc/WechatSogou 465 | .. |py27,py35,py36| image:: https://img.shields.io/pypi/pyversions/wechatsogou.svg 466 | :target: https://github.com/Chyroc/WechatSogou 467 | .. |PyPI| image:: https://img.shields.io/pypi/l/wechatsogou.svg 468 | :target: https://github.com/Chyroc/WechatSogou 469 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: 'wechatsogou' 2 | pages: 3 | - 'API列表': 'README.md' 4 | - '更新日志': 'CHANGELOG.md' 5 | - 'FAQ': 'FAQ.md' 6 | extra_css: ['docs/bootstrap/css'] 7 | extra_javascript: ['docs/bootstrap/js'] 8 | theme_dir: 'docs/bootstrap' 9 | repo_url: 'https://github.com/Chyroc/WechatSogou' -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | future==0.16.0 2 | lxml==4.6.2 3 | Pillow==8.3.2 4 | requests>=2.20.0 5 | six==1.10.0 6 | Werkzeug==0.15.3 7 | xlrd==1.0.0 8 | bs4==0.0.1 -------------------------------------------------------------------------------- /screenshot/alipay_hongbao.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chyroc/WechatSogou/45731524c1e43925c61a340694774a0d022b895f/screenshot/alipay_hongbao.png -------------------------------------------------------------------------------- /screenshot/get_gzh_article_by_history.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chyroc/WechatSogou/45731524c1e43925c61a340694774a0d022b895f/screenshot/get_gzh_article_by_history.png -------------------------------------------------------------------------------- /screenshot/get_gzh_article_by_hot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chyroc/WechatSogou/45731524c1e43925c61a340694774a0d022b895f/screenshot/get_gzh_article_by_hot.png -------------------------------------------------------------------------------- /screenshot/get_gzh_info.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chyroc/WechatSogou/45731524c1e43925c61a340694774a0d022b895f/screenshot/get_gzh_info.png -------------------------------------------------------------------------------- /screenshot/get_sugg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chyroc/WechatSogou/45731524c1e43925c61a340694774a0d022b895f/screenshot/get_sugg.png -------------------------------------------------------------------------------- /screenshot/pay_ali.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chyroc/WechatSogou/45731524c1e43925c61a340694774a0d022b895f/screenshot/pay_ali.jpg -------------------------------------------------------------------------------- /screenshot/pay_wechat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chyroc/WechatSogou/45731524c1e43925c61a340694774a0d022b895f/screenshot/pay_wechat.jpg -------------------------------------------------------------------------------- /screenshot/search_article.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chyroc/WechatSogou/45731524c1e43925c61a340694774a0d022b895f/screenshot/search_article.png -------------------------------------------------------------------------------- /screenshot/search_gzh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chyroc/WechatSogou/45731524c1e43925c61a340694774a0d022b895f/screenshot/search_gzh.png -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal = 1 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import re 3 | 4 | from setuptools import setup 5 | 6 | readme = codecs.open('docs/README.rst', encoding='utf-8').read() 7 | history = codecs.open('docs/HISTORY.rst', encoding='utf-8').read() 8 | with codecs.open("wechatsogou/__init__.py", encoding="utf8") as f: 9 | version = re.search(r'__version__ = "(.*?)"', f.read()).group(1) 10 | 11 | setup( 12 | name='wechatsogou', 13 | version=version, 14 | description='Api for wechat mp with sogou', 15 | long_description=u'\n\n'.join([readme, history]), 16 | author='Chyroc', 17 | author_email='chen_yunpeng@foxmail.com', 18 | url='https://github.com/Chyroc/WechatSogou', 19 | packages=[ 20 | 'wechatsogou', 21 | ], 22 | setup_requires=[ 23 | # minimum version to use environment markers 24 | 'setuptools>=20.6.8', 25 | ], 26 | install_requires=[ 27 | 'future', 'lxml', 'Pillow', 'requests', 'six', 'Werkzeug', 'xlrd', 'bs4' 28 | ], 29 | include_package_data=True, 30 | license='MIT License', 31 | classifiers=[ 32 | 'Intended Audience :: Developers', 33 | 'License :: OSI Approved :: MIT License', 34 | 'Operating System :: MacOS :: MacOS X', 35 | 'Operating System :: Microsoft :: Windows', 36 | 'Operating System :: POSIX', 37 | 'Programming Language :: Python', 38 | 'Programming Language :: Python :: 2.7', 39 | 'Programming Language :: Python :: 3.5', 40 | 'Programming Language :: Python :: 3.6', 41 | 'Topic :: Software Development :: Libraries :: Python Modules', 42 | 'Programming Language :: Python :: Implementation :: PyPy', 43 | 'Programming Language :: Python :: Implementation :: CPython', 44 | ], 45 | ) 46 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import, unicode_literals, print_function 4 | 5 | import os 6 | 7 | from wechatsogou.request import WechatSogouRequest 8 | from wechatsogou.structuring import WechatSogouStructuring 9 | 10 | ws = WechatSogouRequest() 11 | ws_structuring = WechatSogouStructuring() 12 | 13 | empty_search_result_keyword = 'gggggggggggggggggg' 14 | gaokao_keyword = '高考' 15 | fake_data_path = '{}/file'.format(os.getcwd() if 'test' in os.getcwd() else '{}/test'.format(os.getcwd())) 16 | -------------------------------------------------------------------------------- /test/fateadm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import base64 4 | import hashlib 5 | import json 6 | import time 7 | 8 | import requests 9 | 10 | 11 | class FateadmAPI(): 12 | def __init__(self, app_id, app_key, usr_id, usr_key): 13 | self.app_id = app_id 14 | self.app_key = app_key 15 | self.usr_id = usr_id 16 | self.usr_key = usr_key 17 | self.host = 'http://pred.fateadm.com' 18 | 19 | def calc_sign(self, usr_id, passwd, timestamp): 20 | md5 = hashlib.md5() 21 | md5.update((timestamp + passwd).encode()) 22 | csign = md5.hexdigest() 23 | 24 | md5 = hashlib.md5() 25 | md5.update((usr_id + timestamp + csign).encode()) 26 | csign = md5.hexdigest() 27 | return csign 28 | 29 | # 识别验证码 30 | def predict(self, pred_type, img_data): 31 | tm = str(int(time.time())) 32 | 33 | param = { 34 | 'user_id': self.usr_id, 35 | 'timestamp': tm, 36 | 'sign': self.calc_sign(self.usr_id, self.usr_key, tm), 37 | 'predict_type': pred_type, 38 | 'img_data': base64.b64encode(img_data), 39 | } 40 | 41 | if self.app_id != '': 42 | asign = self.calc_sign(self.app_id, self.app_key, tm) 43 | param['appid'] = self.app_id 44 | param['asign'] = asign 45 | 46 | r = requests.post('{}/api/capreg'.format(self.host), param) 47 | try: 48 | data = r.json() 49 | return json.loads(data['RspData'])['result'] 50 | except Exception: 51 | raise Exception(r.text) 52 | -------------------------------------------------------------------------------- /test/file/article_detail_expired.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 18 | 19 | 20 | 21 | 22 | 69 | 70 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 |
82 |
83 |
84 |

85 | 链接已过期 86 |

87 |
88 |
89 | 90 | 91 | 92 | 99 | 100 | 137 | 138 | 139 | 142 | 143 | 144 | 145 | 146 | 147 | -------------------------------------------------------------------------------- /test/file/bitsea-history.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 19 | 20 | 21 | 22 | 23 | 69 | 70 | 槽边往事 71 | 72 | 73 | 74 | 77 | 78 | 79 | 80 | 81 |
82 |
83 |
84 |
85 | 86 | 87 | 88 |
89 | 90 | 槽边往事 91 | 92 | 93 |
94 |
95 |
    96 |
  • 97 | 98 |
    99 |
  • 100 |
  • 101 | 102 |
    和菜头的微信Blog,用于分享各种新鲜资讯
    103 |
  • 104 |
105 | 106 | 109 | 110 |
111 |
最近10条群发
112 |
113 | 114 |
115 |
仅显示最近10条群发
116 | 117 | 120 | 123 |
124 |
125 |
126 |
127 | 128 |

微信扫一扫
关注该公众号

129 |
130 |
131 |
132 |
133 | 134 | 135 | 142 | 143 | 161 | 162 | 163 | 174 | 175 | 176 | 177 | 178 | 179 | -------------------------------------------------------------------------------- /test/file/search-gaokao-gzh-error.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 搜狗搜索 8 | 9 | 10 | 11 | 71 | 72 | 73 |
74 | 75 |
您的访问出错了返回首页>>
76 |
77 |
78 |

IP:123.116.247.15
访问时间:2017.07.25 22:36:19

79 |

用户您好,您的访问过于频繁,为确认本次访问为正常用户行为,需要您协助验证。

80 |

81 |
82 |

83 | 84 | 85 | 86 | 87 | 88 | 89 | 请输入图中的验证码 90 | 91 | 92 | 换一张 93 | 94 |

95 |
96 |

97 | 提交 98 | 提交后没解决问题?欢迎反馈 99 |

100 |
101 |
企业推广关于搜狗免责声明意见反馈
 © 2017 SOGOU - 京ICP证050897号 - 京公网安备110000000025号
102 | 103 | 104 | 105 | -------------------------------------------------------------------------------- /test/file/wapindex-wap-0612-wap_8-0.html: -------------------------------------------------------------------------------- 1 |
  • 不做这个动作,你的轮胎3个月就要换!

    同事小李最近有点烦躁,原因在于他的爱车新换的轮胎,才一万公里都没跑到就全被磨光了!前两天刚把两条防爆胎换下来,小3k就这么没了!说实话,吃土的小编很想来一句:壕做朋!换个轮胎就给豁出去这样大的一笔支出,这让身为老司机的修车师傅也有点儿郁闷。

    全球汽车精选

  • 新车质量最差的十个品牌?国人表示难以接受……

    日前,J.D.Power公布了一份表单,这份表单的可谓是非常重磅的东西的,不过当国内消费者看到的时候,脸色一青,大家都慌了神……因为这个表单列举了在美国市场上新车的质量表现,并一一做了排名,然后国人认为不能接受……其他的不说那么多,先来看看

    车早茶

  • 带着米其林的指引去看古德伍德|品牌

    一提起古德伍德,很多人都觉得这个位于英国西萨赛克斯郡(West Sussex)的小镇特别耳熟能详,因为这里每一年都会举办全球规模最大、最负盛名古德伍德速度节,这是在每一个车迷都梦寐以求参加的盛会,因为它不仅仅是一个老爷赛车重回青春的宝地,也

    吴佩频道

  • 方向盘打法巧记口诀,科目二提分就靠它了!

    学车是一件慢工出细活的事,每一个细节都需要重视。尤其是方向盘的掌控问题,对于很多新手来说,一上车基本就蒙圈了,手都不知道放哪,也不知道该怎么控制方向盘。接来下典典就给大家说说关于方向盘的那些事:学员在科目二考试训练时控制方向盘容易出现以下问

    驾考宝典

  • 宝马“鸡腿”、奥迪“游艇”,这些奇葩的挡杆你见过几个?

    欢 迎 来 到 第 三 十 二 期 《 B B 大 讲 堂 》今 天 B B 哥 带 大 家 了 解奇葩的汽车设计 我们开车时接触最多的东西之一就是挡把。可以说,挡把的造型在很大程度上影响着驾驶的舒适度和操控性。挡把的人体工程学设计的好,那

    腾讯汽车

  • 你没看错,我们做了期途昂和途锐的对比

    你一定是在很久以前就知道点击上方蓝字◮就可以加关注了 原谅我们的不正经,这次竟然拿途昂来跟途锐对比。尽管途昂要加价,途锐有十多万的优惠,此消彼长后,但两者依然差距悬殊。在途昂推出时,很多人说什么途昂比途锐还大,途锐已经没有存在意义。这种以大

    新车评

  • 7成特斯拉被召回,难道是质量不过关?

    特斯拉又摊上事了?汽车召回,顾名思义,就是将有问题缺陷的汽车产品由厂家及时召唤回去,进行改造升级。就在6月30日,国家质检总局发布了2017年上半年国内汽车召回的所有信息。中国乘用车市场共38个汽车品牌发布了118次召回公告,累计召回486

    非常好车

  • 在中国惹不起的7种车,遇到请回避!

    咱们经常会在新闻上看到货车侧翻压死人的消息,马路上看到货车都躲得远远的!其实不仅仅是货车,下面这7类车,你也得小心。1、渣土车渣土车一般在晚上行驶,经常还在市区闯红灯,加速过路口。很多渣土车不会按照要求加盖,就特别容易导致渣土车的砖头在路上

    汽车情报所

  • 迈腾摊上大事儿了 全新一代君威17.58万起

    今晚,上汽通用别克全新一代君威于上海东方体育中心上市。全新君威包含20T、28T和30H车型,分别搭载1.5T、2.0T与1.8L+电动机三种动力规格。另外,与普通版车型一起上市的,还有全新一代君威GS,其内外设计比君威更加强调运动,同样搭

    一猫汽车资讯

  • 面对这份驾享,朝廷大人都忍不住亲自上阵!

    历史上曾出现一位富可敌国的朝廷大人,对于该位大人的奢享生活,据史料记载,其每天都会将珍珠磨成的粉用来做早餐,并且对珍珠的品相也颇有要求。后世的慈禧太后,作为同样喜吃珍珠之人,也只是半月才吃上一回,某大人却天天吃,足以见得他何其懂得享受。就连

    资深科技控

  • 外卖小哥被暴晒:底层人士的悲哀,有钱人不会懂

    作者|阿何 微信|阿何有话说最近,微博上有个帖子引起了巨大的反响:外卖小哥送餐到一栋写字楼,可是因为写字楼不让无关人等入内,小哥只能在接近40度的地面等待顾客下来。广州这两天如果不下雨的话,地面温度在35度左右。前两天出外办事,我在露天呆了

    郎club

  • 自动驾驶还处于“新手”阶段,何时成为“老司机”?院士这样说……

    “传统汽车仅仅是驾驶员手、脚和力量的延伸,控制车辆行为的是人。到了L3阶段,让汽车成为驾驶员自己、或者说让机器成为自己,应该是人工智能时代最有意义的事情之一。”在7月22日召开的CCAI2017中国人工智能大会上,中国工程院院士、中国人工智

    科技日报

  • 高速上碰到石头,是躲还是撞?

    之前歆歆写过很多在高速公路上,车主不遵守交通规则导致的事故,其实除此之外,一些突发情况,也容易让车主反应不及,引发事故。假如你正在高速行车,突然发现前方有一块石头或者窜出小动物,你会怎么去做呢?急打方向、猛踩刹车还是直接冲过去?有这样一位车

    汽车使用宝典

  • 装什么神秘,不就是加长版的讴歌TLX吗!

    广汽讴歌TLX-L量产版要来了!实车也终于亮相,装作神秘,其实它的样子早就不陌生了。年初的上海车展,讴歌已经给我们看到这款TLX-L的Prototype,也就是原型车。外观基本上和这次亮相的实车相似,造型差别就在保险杠下方的格栅部分。不过,

    名车报

  • 一个动作,车里的人集体中毒!很多人都忽略了

    问世间什么最难不外乎三伏天出门上班大暑,中伏老天爷给今天贴上这两个标签就注定其承载不一样的意义幸好今天是周末终于可以空调WiFi西瓜,葛优同款沙发……但有人就是想不通啊为了早拿到驾照顶着大太阳冒着酷暑去学车结果呢就因为她和男教练在车里做了一

    科普中国网

  • 2 | -------------------------------------------------------------------------------- /test/rk.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | from hashlib import md5 5 | 6 | import requests 7 | 8 | 9 | class RClient(object): 10 | def __init__(self, username, password, soft_id, soft_key): 11 | self.base_params = { 12 | 'username': username, 13 | 'password': md5(password.encode('utf-8')).hexdigest(), 14 | 'softid': soft_id, 15 | 'softkey': soft_key, 16 | } 17 | self.headers = { 18 | 'Connection': 'Keep-Alive', 19 | 'Expect': '100-continue', 20 | 'User-Agent': 'ben', 21 | } 22 | 23 | def rk_create(self, im, im_type, timeout=60): 24 | params = { 25 | 'typeid': im_type, 26 | 'timeout': timeout, 27 | } 28 | params.update(self.base_params) 29 | files = {'image': ('a.jpg', im)} 30 | r = requests.post('http://api.ruokuai.com/create.json', data=params, files=files, headers=self.headers) 31 | return r.json() 32 | 33 | def rk_report_error(self, im_id): 34 | params = { 35 | 'id': im_id, 36 | } 37 | params.update(self.base_params) 38 | r = requests.post('http://api.ruokuai.com/reporterror.json', data=params, headers=self.headers) 39 | return r.json() 40 | 41 | 42 | def __identify_image_callback(img, code): 43 | try: 44 | username = os.environ['rk_username'] 45 | password = os.environ['rk_password'] 46 | id_ = os.environ['rk_id'] 47 | key = os.environ['rk_key'] 48 | rc = RClient(username, password, id_, key) 49 | result = rc.rk_create(img, code) 50 | print('验证码:', result['Result']) 51 | return result['Result'] 52 | except Exception: 53 | raise Exception('识别验证码错误') 54 | 55 | 56 | def identify_image_callback_ruokuai_sogou(img): 57 | return __identify_image_callback(img, 3060) 58 | 59 | 60 | def identify_image_callback_ruokuai_weixin(img): 61 | return __identify_image_callback(img, 3040) 62 | -------------------------------------------------------------------------------- /test/test_api.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import, unicode_literals, print_function 4 | 5 | import os 6 | import time 7 | import unittest 8 | 9 | from nose.tools import assert_equal, assert_true, assert_in, assert_greater_equal 10 | 11 | from wechatsogou.const import WechatSogouConst 12 | from wechatsogou.api import WechatSogouAPI 13 | from wechatsogou.identify_image import identify_image_callback_by_hand 14 | from test import gaokao_keyword, empty_search_result_keyword 15 | from test.rk import identify_image_callback_ruokuai_sogou, identify_image_callback_ruokuai_weixin 16 | 17 | ws_api = WechatSogouAPI(captcha_break_time=3) 18 | 19 | 20 | class TestAPIReal(unittest.TestCase): 21 | # todo use chinese 22 | def setUp(self): 23 | self.identify_image_callback_sogou = identify_image_callback_ruokuai_sogou if os.environ.get( 24 | 'WechatSogouCI') else identify_image_callback_by_hand 25 | self.identify_image_callback_ruokuai_weixin = identify_image_callback_ruokuai_weixin if os.environ.get( 26 | 'WechatSogouCI') else identify_image_callback_by_hand 27 | 28 | def test_search_gzh_real(self): 29 | gzh_list = ws_api.search_gzh(gaokao_keyword, identify_image_callback=self.identify_image_callback_sogou) 30 | assert_greater_equal(len(gzh_list), 8) 31 | assert_true(any(gaokao_keyword in i['wechat_name'] for i in gzh_list)) 32 | assert_true(any(i['open_id'] != '' for i in gzh_list)) 33 | 34 | def test_get_gzh_article_by_history_real(self): 35 | gzh_article = ws_api.get_gzh_article_by_history(gaokao_keyword, 36 | identify_image_callback_sogou=self.identify_image_callback_sogou, 37 | identify_image_callback_weixin=self.identify_image_callback_ruokuai_weixin) 38 | assert_in('gzh', gzh_article) 39 | assert_in('article', gzh_article) 40 | assert_in('wx.qlogo.cn', gzh_article['gzh']['headimage']) 41 | assert_greater_equal(len(gzh_article['article']), 1) 42 | 43 | def test_get_gzh_article_by_hot_real(self): 44 | gzh_articles = ws_api.get_gzh_article_by_hot(WechatSogouConst.hot_index.gaoxiao, 45 | identify_image_callback=self.identify_image_callback_sogou) 46 | for gzh_article in gzh_articles: 47 | assert_in('gzh', gzh_article) 48 | assert_in('article', gzh_article) 49 | assert_in('http://mp.weixin.qq.com/s?src=', gzh_article['article']['url']) 50 | assert_greater_equal(len(gzh_articles), 10) 51 | 52 | def test_get_sugg(self): 53 | sugg_gaokao = ws_api.get_sugg(gaokao_keyword) 54 | assert_equal(10, len(sugg_gaokao)) 55 | 56 | def test_get_article_content(self): 57 | gzh_article = ws_api.get_gzh_article_by_history(gaokao_keyword, 58 | identify_image_callback_sogou=self.identify_image_callback_sogou, 59 | identify_image_callback_weixin=self.identify_image_callback_ruokuai_weixin) 60 | assert_in('gzh', gzh_article) 61 | assert_in('article', gzh_article) 62 | assert_in('wx.qlogo.cn', gzh_article['gzh']['headimage']) 63 | assert_greater_equal(len(gzh_article['article']), 1) 64 | # 防止测试时被封IP 65 | time.sleep(11) 66 | article_url = gzh_article['article'][0]['content_url'] 67 | 68 | article_info = ws_api.get_article_content(article_url, 69 | identify_image_callback=self.identify_image_callback_sogou) 70 | 71 | assert_in('content_html', article_info) 72 | assert_in('content_img_list', article_info) 73 | 74 | def test_gzh_by_history_profile_none(self): 75 | gzh_article = ws_api.get_gzh_article_by_history(empty_search_result_keyword, 76 | identify_image_callback_sogou=self.identify_image_callback_sogou, 77 | identify_image_callback_weixin=self.identify_image_callback_ruokuai_weixin) 78 | assert_equal({}, gzh_article) 79 | 80 | 81 | if __name__ == '__main__': 82 | unittest.main() 83 | -------------------------------------------------------------------------------- /test/test_const.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | 5 | from nose.tools import assert_true, assert_equal 6 | 7 | from wechatsogou.const import WechatSogouConst 8 | 9 | 10 | class TestConst(unittest.TestCase): 11 | def test_const_hot_index(self): 12 | assert_true(hasattr(WechatSogouConst, 'hot_index')) 13 | 14 | assert_equal(WechatSogouConst.hot_index.hot, 'hot') 15 | assert_equal(WechatSogouConst.hot_index.gaoxiao, 'gaoxiao') 16 | assert_equal(WechatSogouConst.hot_index.duanzi, 'duanzi') 17 | assert_equal(WechatSogouConst.hot_index.health, 'health') 18 | assert_equal(WechatSogouConst.hot_index.sifanghua, 'sifanghua') 19 | assert_equal(WechatSogouConst.hot_index.gossip, 'gossip') 20 | assert_equal(WechatSogouConst.hot_index.life, 'life') 21 | assert_equal(WechatSogouConst.hot_index.finance, 'finance') 22 | assert_equal(WechatSogouConst.hot_index.car, 'car') 23 | assert_equal(WechatSogouConst.hot_index.technology, 'technology') 24 | assert_equal(WechatSogouConst.hot_index.fashion, 'fashion') 25 | assert_equal(WechatSogouConst.hot_index.mummy, 'mummy') 26 | assert_equal(WechatSogouConst.hot_index.dianzan, 'dianzan') 27 | assert_equal(WechatSogouConst.hot_index.travel, 'travel') 28 | assert_equal(WechatSogouConst.hot_index.job, 'job') 29 | assert_equal(WechatSogouConst.hot_index.food, 'food') 30 | assert_equal(WechatSogouConst.hot_index.history, 'history') 31 | assert_equal(WechatSogouConst.hot_index.study, 'study') 32 | assert_equal(WechatSogouConst.hot_index.constellation, 'constellation') 33 | assert_equal(WechatSogouConst.hot_index.sport, 'sport') 34 | 35 | def test_const_search_article_type(self): 36 | assert_true(hasattr(WechatSogouConst, 'search_article_type')) 37 | 38 | assert_equal(WechatSogouConst.search_article_type.all, 'all') 39 | assert_equal(WechatSogouConst.search_article_type.rich, 'rich') 40 | assert_equal(WechatSogouConst.search_article_type.video, 'video') 41 | assert_equal(WechatSogouConst.search_article_type.image, 'image') 42 | 43 | def test_const_search_article_time(self): 44 | assert_true(hasattr(WechatSogouConst, 'search_article_time')) 45 | 46 | assert_equal(WechatSogouConst.search_article_time.anytime, 0) 47 | assert_equal(WechatSogouConst.search_article_time.day, 1) 48 | assert_equal(WechatSogouConst.search_article_time.week, 2) 49 | assert_equal(WechatSogouConst.search_article_time.month, 3) 50 | assert_equal(WechatSogouConst.search_article_time.year, 4) 51 | assert_equal(WechatSogouConst.search_article_time.specific, 5) 52 | 53 | 54 | if __name__ == '__main__': 55 | unittest.main() 56 | -------------------------------------------------------------------------------- /test/test_request_gen_hot_url.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import, unicode_literals, print_function 4 | 5 | import unittest 6 | from nose.tools import assert_in, assert_raises 7 | 8 | from wechatsogou.const import WechatSogouConst 9 | from wechatsogou.request import WechatSogouRequest 10 | 11 | 12 | class TestBasicGenSearchArticleURL(unittest.TestCase): 13 | def test_gen_hot_url(self): 14 | for hot_index in filter(lambda x: not x.startswith('__'), dir(WechatSogouConst.hot_index)): 15 | url = WechatSogouRequest.gen_hot_url(hot_index) 16 | assert_in('http://weixin.sogou.com/wapindex/wap/0612/wap_', url) 17 | assert_in('0.html', url) 18 | 19 | with assert_raises(AssertionError): 20 | WechatSogouRequest.gen_hot_url(hot_index, 0) 21 | 22 | for page in range(1, 5): 23 | url = WechatSogouRequest.gen_hot_url(hot_index, page) 24 | assert_in('http://weixin.sogou.com/wapindex/wap/0612/wap_', url) 25 | assert_in('{}.html'.format(page - 1), url) 26 | 27 | 28 | if __name__ == '__main__': 29 | unittest.main() 30 | -------------------------------------------------------------------------------- /test/test_request_gen_search_article_url.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import, unicode_literals, print_function 4 | 5 | import unittest 6 | from nose.tools import assert_raises, assert_equal, assert_in, assert_not_in 7 | 8 | from hypothesis import given, strategies as st 9 | 10 | from wechatsogou.const import WechatSogouConst 11 | from wechatsogou.request import WechatSogouRequest 12 | from test import gaokao_keyword 13 | 14 | 15 | class TestBasicGenSearchArticleURL(unittest.TestCase): 16 | def test_gen_search_article_url_keyword(self): 17 | url = WechatSogouRequest.gen_search_article_url(gaokao_keyword) 18 | assert_equal('http://weixin.sogou.com/weixin?type=2&page=1&ie=utf8&query=%E9%AB%98%E8%80%83&interation=', url) 19 | 20 | @given(st.integers(min_value=-20000, max_value=20000)) 21 | def test_gen_search_article_url_page(self, page): 22 | if page > 0: 23 | url = WechatSogouRequest.gen_search_article_url(gaokao_keyword, page) 24 | assert_in('page={}'.format(page), url) 25 | else: 26 | with assert_raises(AssertionError): 27 | WechatSogouRequest.gen_search_article_url(gaokao_keyword, page) 28 | 29 | @given(st.integers(min_value=-50, max_value=50), st.dates(), st.dates()) 30 | def test_gen_search_article_url_timesn(self, timesn, ft, et): 31 | if timesn == 0: 32 | url = WechatSogouRequest.gen_search_article_url(gaokao_keyword, timesn=timesn) 33 | assert_in('type=2&page=1&ie=utf8&query=', url) 34 | assert_not_in('ft=&et=', url) 35 | 36 | url = WechatSogouRequest.gen_search_article_url(gaokao_keyword, timesn=timesn, ft=ft) 37 | assert_in('type=2&page=1&ie=utf8&query=', url) 38 | assert_not_in('ft=&et=', url) 39 | elif timesn in [1, 2, 3, 4]: 40 | url = WechatSogouRequest.gen_search_article_url(gaokao_keyword, timesn=timesn) 41 | assert_in('tsn={}&ft=&et='.format(timesn), url) 42 | 43 | url = WechatSogouRequest.gen_search_article_url(gaokao_keyword, timesn=timesn, ft=ft) 44 | assert_in('tsn={}&ft=&et='.format(timesn), url) 45 | elif timesn == 5: 46 | if ft <= et: 47 | url = WechatSogouRequest.gen_search_article_url(gaokao_keyword, timesn=timesn, ft=ft, et=et) 48 | assert_in('tsn=5&ft={}&et={}'.format(ft, et), url) 49 | else: 50 | with assert_raises(AssertionError): 51 | WechatSogouRequest.gen_search_article_url(gaokao_keyword, timesn=timesn) 52 | WechatSogouRequest.gen_search_article_url(gaokao_keyword, timesn=timesn, ft=ft, et=et) 53 | else: 54 | with assert_raises(AssertionError): 55 | WechatSogouRequest.gen_search_article_url(gaokao_keyword, timesn=timesn) 56 | 57 | def test_gen_search_article_url_article_type(self): 58 | url = WechatSogouRequest.gen_search_article_url(gaokao_keyword, 59 | article_type=WechatSogouConst.search_article_type.all) 60 | assert_equal('interation=', url[-11:]) 61 | 62 | url = WechatSogouRequest.gen_search_article_url(gaokao_keyword, 63 | article_type=WechatSogouConst.search_article_type.image) 64 | assert_in('interation=458754', url) 65 | 66 | url = WechatSogouRequest.gen_search_article_url(gaokao_keyword, 67 | article_type=WechatSogouConst.search_article_type.video) 68 | assert_in('interation=458756', url) 69 | 70 | url = WechatSogouRequest.gen_search_article_url(gaokao_keyword, 71 | article_type=WechatSogouConst.search_article_type.rich) 72 | assert_in('interation=458754%2C458756', url) 73 | -------------------------------------------------------------------------------- /test/test_request_gen_search_gzh_url.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import, unicode_literals, print_function 4 | 5 | import unittest 6 | from nose.tools import assert_raises, assert_equal, assert_in 7 | 8 | from hypothesis import given, strategies as st 9 | 10 | from wechatsogou.request import WechatSogouRequest 11 | from test import gaokao_keyword 12 | 13 | 14 | class TestBasicGenSearchGzhURL(unittest.TestCase): 15 | def test_gen_search_article_url_keyword(self): 16 | url = WechatSogouRequest.gen_search_gzh_url(gaokao_keyword) 17 | assert_equal('http://weixin.sogou.com/weixin?type=1&page=1&ie=utf8&query=%E9%AB%98%E8%80%83', url) 18 | 19 | @given(st.integers(min_value=-20000, max_value=20000)) 20 | def test_gen_search_gzh_url_page(self, page): 21 | if page > 0: 22 | url = WechatSogouRequest.gen_search_gzh_url(gaokao_keyword, page) 23 | assert_in('page={}'.format(page), url) 24 | else: 25 | with assert_raises(AssertionError): 26 | WechatSogouRequest.gen_search_gzh_url(gaokao_keyword, page) 27 | -------------------------------------------------------------------------------- /test/test_structuring.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import, unicode_literals, print_function 4 | 5 | import datetime 6 | import io 7 | import json 8 | import re 9 | import os 10 | import unittest 11 | from bs4 import BeautifulSoup 12 | from nose.tools import assert_equal, assert_in, assert_true, assert_greater_equal, assert_is_none, assert_not_in 13 | 14 | from test import fake_data_path, gaokao_keyword 15 | from wechatsogou.structuring import WechatSogouStructuring 16 | 17 | assert_equal.__self__.maxDiff = None 18 | 19 | 20 | class TestStructuringGzh(unittest.TestCase): 21 | def test_get_gzh_by_search(self): 22 | file_name = os.path.join(fake_data_path, 'search-gaokao-gzh.html') 23 | with io.open(file_name, encoding='utf-8') as f: 24 | search_gaokao_gzh = f.read() 25 | 26 | gzh_list = WechatSogouStructuring.get_gzh_by_search(search_gaokao_gzh) 27 | 28 | names = [] 29 | wechat_ids = [] 30 | post_perms = [] 31 | introductions = [] 32 | authentications = [] 33 | open_ids = [] 34 | assert_equal(10, len(gzh_list)) 35 | for gzh in gzh_list: 36 | names.append(gzh['wechat_name']) 37 | wechat_ids.append(gzh['wechat_id']) 38 | post_perms.append(gzh['post_perm']) 39 | introductions.append(gzh['introduction']) 40 | authentications.append(gzh['authentication']) 41 | open_ids.append(gzh['open_id']) 42 | 43 | assert_in('mp.weixin.qq.com/profile?src=3×tamp=', gzh['profile_url']) 44 | assert_in('mp.weixin.qq.com/rr?src=', gzh['qrcode']) 45 | assert_in('img01.sogoucdn.com/', gzh['headimage']) 46 | 47 | assert_equal(['oIWsFt6fv4FH0OBNCyoonNoAp2OM', 48 | 'oIWsFtzwnqHRVPsRY-eEzPo344jQ', 49 | 'oIWsFt_PvlvuqFxQFPbOO26_GQh4', 50 | 'oIWsFtzpOSqygkGiyzj1vVGi2zM4', 51 | 'oIWsFt-lCZYAtfVXRykjgsWZMoJA', 52 | 'oIWsFtzJBFA82fTPb7xU-gkPiyqA', 53 | 'oIWsFt_wgF0dHou131y47qIMcuM0', 54 | 'oIWsFt67sO47_fHfOFQC0rBHhxcY', 55 | 'oIWsFt5Kltl1uXsy8fhj96eIVen8', 56 | 'oIWsFt-2JeqhMEEVQuFw_geRzmbY'], 57 | open_ids) 58 | assert_equal(['山东高考指南', 59 | '高考家长圈', 60 | '河南高考指南', 61 | '高考360', 62 | '云天高考', 63 | '腾讯高考', 64 | '高考快讯', 65 | '专业中高考教育', 66 | '晟嘉高考', 67 | '新东方在线高考辅导'], 68 | names) 69 | assert_equal([u'sdgkzn', 70 | u'sinagkjzq', 71 | u'hngaokao', 72 | u'sctvgaokao360', 73 | u'yuntiangaokao', 74 | u'qq_gaokao', 75 | u'gkkx678', 76 | u'gh_591a43050b5f', 77 | u'tjsjgk', 78 | u'koogaokao'], 79 | wechat_ids) 80 | assert_equal([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1], post_perms) 81 | assert_equal( 82 | ['这里是山东最权威最专业的高考交流平台,由山东商报徐玉芹教育工作室独家运作.本平台与山东商报高考交流群互为依托,为山东考生和家长提供最及时、最准确的高考政策及信息解读,以及一流的填报志愿咨询服务.合作...', 83 | '定期推送高三家长关注的优秀家长经验交流、志愿填报技巧、考生心理辅导方法、考前营养搭配等诸多优质内容;为家长搭建交流互动平台.', 84 | '发布最新高考政策,分享高效学习方法,制定高考应试策略.考点总结政策分析名校介绍高考大纲试卷解析艺考文化课,权威专业的高考资讯一手掌握.', 85 | '360天,360度,用心伴您升学路.四川电视台科教频道每晚7:45播出.', 86 | '高端教育品牌,高分考生的加油站,重点中学的合作伙伴.开阔考生视野、提升认知,在以“研究”为主线的基础上,将考生培养成一个全面型的人才.课程特色,全科协调、单科精讲.云天高考一直深受高分考生和家长的追...', 87 | '腾讯高考频道是中国最具互动性高考门户网站.主要为中国高考生及家长提供有价值的资讯和辅导.内容包括:新闻、评论、视频、各科辅导、志愿填报、家长指南等多方面.', 88 | '高考快讯平台专为考生家长提供最新高考资讯、志愿填报指南、名校排行榜、状元经验、学习方法、高分秘籍等等,我们的努力将伴随着您圆大学梦,欢迎关注阅读!', 89 | '旨在做最专业的中高考教育交流平台,第一时间传递权威的中高考资讯,为孩子的未来保驾护航!', 90 | '关于天津高考,你关注我们一个就够啦!', 91 | '提供高考资讯、高考院校库、在线答疑、政策解读及试题发布.'], 92 | introductions) 93 | assert_equal(['《山东商报》社', 94 | '新浪网技术(中国)有限公司', 95 | '郑州新东方培训学校', 96 | '四川省电化教育馆(四川教育电视台)', 97 | '北京云天共业教育科技有限公司', 98 | '深圳市腾讯计算机系统有限公司', 99 | '广州卓越教育培训中心', 100 | '大连沙河口科苑文化培训学校', 101 | '天津市南开区晟嘉培训中心', 102 | '北京新东方迅程网络科技股份有限公司'], 103 | authentications) 104 | 105 | def test_get_article_by_search(self): 106 | file_name = os.path.join(fake_data_path, 'search-gaokao-article.html') 107 | with io.open(file_name, encoding='utf-8') as f: 108 | search_gaokao_article = f.read() 109 | 110 | article_list = WechatSogouStructuring.get_article_by_search(search_gaokao_article) 111 | 112 | titles = [] 113 | abstracts = [] 114 | gzh_names = [] 115 | isvs = [] 116 | assert_equal(10, len(article_list)) 117 | for i in article_list: 118 | article = i['article'] 119 | titles.append(article['title']) 120 | abstracts.append(article['abstract']) 121 | 122 | assert_in('mp.weixin.qq.com/s?src=3×tamp=', article['url']) 123 | assert_true(isinstance(article['imgs'], list)) 124 | assert_greater_equal(len(article['imgs']), 1) 125 | 126 | gzh = i['gzh'] 127 | 128 | assert_in('mp.weixin.qq.com/profile?src=3×tamp', gzh['profile_url']) 129 | assert_in('wx.qlogo.cn/mmhead', gzh['headimage']) 130 | gzh_names.append(gzh['wechat_name']) 131 | isvs.append(gzh['isv']) 132 | 133 | # article 134 | assert_equal(['高考有多重要,为什么要重视高考?丨微观点', 135 | '高考:穷人考不好,中产考状元,精英不高考', 136 | '关于高考志愿的一点建议,仅供参考!', 137 | '刚刚,高考“满分”诞生了!(附各省高考分数线)', 138 | '高考学霸榜出炉!义乌最高分是她!排名...', 139 | '【高考】权威发布!2017年我省高考各项日程', 140 | '【高考】黑龙江省2017年普通高考成绩即将发布', 141 | '高考2017 | 全国各省区市高考录取时间大汇总,最新最全!', 142 | '高考志愿这么填,等于多考20分!这位特级教师的志愿填报方法很管用!', 143 | '高考填志愿,如何选专业?学长学姐有话说'], 144 | titles) 145 | assert_equal(['针对这个问题,其实占豪已经谈过,但还是想借高考之后、借这位小战友的留言,结合自己的人生经验,谈谈个人对这件事的看法....', 146 | '#条条大路通罗马,有人就出生在罗马#前几天北京文科高考状元熊轩昂接受澎湃新闻的采访的时候,说了下面这段话. “农村地区的...', 147 | '最近一直有哥迷留言问,填报高考志愿该选什么专业? 讲真,这个问题很难回答.专业选择没有绝对的好坏对错,跟考试成绩、个人兴...', 148 | '高考会有满分的情况吗?还真有!6月22日开始,全国各省的高考成绩陆续发布.22日晚上,成都市青白江区一个小区内人声鼎沸,因...', 149 | '浙江新高考各类别各段分数线及考生成绩于昨日揭晓.考生可凭考生号、密码查询自己的考试成绩!今年的高考成绩,经浙江省教育考...', 150 | '根据我省招生录取工作安排,现将近期有关高考工作日程公布如下:一、高考成绩公布时间6月24日左右省招考院通过黑龙江省招生考...', 151 | '黑龙江省2017年普通高考成绩即将发布 我省今年高考网上评卷工作现已结束,经过成绩核查、成绩校验等多个环节后,我省高考成绩...', 152 | '2017年高考录取工作开始了,各省区市高考录取工作何时进行?为了方便考生和家长及时了解,小编为大家作了最新最全的梳理.(图...', 153 | '各地高考成绩已陆续公布,在本公众号回复“高考查分”即可查询!~长按二维码即可关注本车~自昨天开始,全国各省份陆续公布...', 154 | '导语高考成绩和批次线已经出来了,想必同学们已经开始进入另一重要环节——志愿填报.你是不是在为选专业而纠结痛苦?不怕!...'], 155 | abstracts) 156 | 157 | # gzh 158 | assert_equal(['占豪', 159 | '才华有限青年', 160 | '新闻哥', 161 | '光明网', 162 | '义乌十八腔', 163 | '龙招港', 164 | '龙招港', 165 | '微言教育', 166 | '高考直通车', 167 | '阳光高考信息平台', ], 168 | gzh_names) 169 | assert_in(1, isvs) 170 | assert_in(0, isvs) 171 | 172 | def test_get_gzh_info_by_history(self): 173 | file_name = os.path.join(fake_data_path, 'bitsea-history.html') 174 | with io.open(file_name, encoding='utf-8') as f: 175 | gzh_history = f.read() 176 | 177 | gzh_info = WechatSogouStructuring.get_gzh_info_by_history(gzh_history) 178 | 179 | assert_equal('槽边往事', gzh_info['wechat_name']) 180 | assert_equal('bitsea', gzh_info['wechat_id']) 181 | assert_equal('和菜头的微信Blog,用于分享各种新鲜资讯', gzh_info['authentication']) 182 | assert_equal('http://wx.qlogo.cn/mmhead/Q3auHgzwzM6zmSwQkvHdgXDtnpAyLYjuib8QdW6ibKKGo8zcZVbYxiaUw/0', 183 | gzh_info['headimage']) 184 | assert_equal(' ', gzh_info['introduction']) 185 | 186 | def test_get_article_by_history_json(self): 187 | file_name = os.path.join(fake_data_path, 'bitsea-history.html') 188 | with io.open(file_name, encoding='utf-8') as f: 189 | gzh_history = f.read() 190 | 191 | article_list = WechatSogouStructuring.get_article_by_history_json(gzh_history) 192 | titles = [] 193 | urls = [] 194 | digests = [] 195 | for i in article_list: 196 | assert_equal('和菜头', i['author']) 197 | assert_equal('49', i['type']) 198 | assert_in('mp.weixin.qq.com/s?timestamp=', i['content_url']) 199 | assert_in(i['copyright_stat'], [11, 100]) 200 | assert_in('mmbiz.qpic.cn/mmbiz_jpg/', i['cover']) 201 | assert_greater_equal(datetime.datetime.fromtimestamp(i['datetime']), datetime.datetime(2000, 1, 1)) 202 | 203 | urls.append(i['content_url']) 204 | titles.append(i['title']) 205 | digests.append(i['abstract']) 206 | 207 | assert_equal( 208 | ['帝都深处好修行', 209 | '如果我有个好一点的初中英文老师', 210 | '【广告】让手机清凉一哈', 211 | '写给各位陛下', 212 | '可能是年度电影的《大护法》', 213 | '怎样决定要不要去相信一个人', 214 | '照亮世界的那个人', 215 | '《冈仁波齐》观后', 216 | '没有什么火候不火候的', 217 | '完美受害人', ], 218 | titles) 219 | 220 | assert_equal([ 221 | 'http://mp.weixin.qq.com/s?timestamp=1500903767&src=3&ver=1&signature=X4l0IQ091w0DY2ERU7fD*h0VUwBxeHPOJH-Uk-vAfaPamMl6ij7fqAIHomnXQ2X2*2J94H0pixVjsjEkL0TbILtKInZ4hqPp3-lC1nQZcN9Fd*BGbTQp7WlZyzLvCXy0Z8yFVF*lIDlo75pemv7kW8wov4Hz5-uiVzBT5q*Nwaw=', 222 | 'http://mp.weixin.qq.com/s?timestamp=1500903767&src=3&ver=1&signature=X4l0IQ091w0DY2ERU7fD*h0VUwBxeHPOJH-Uk-vAfaPamMl6ij7fqAIHomnXQ2X2*2J94H0pixVjsjEkL0TbIPsfeXemAw1IR5Pt5J*6JqjpgotoKPL*6eVHbdcbi4JCEfsnhbnsQUTLQWpBZe5UILx8062e6A2L00LyjQArkxU=', 223 | 'http://mp.weixin.qq.com/s?timestamp=1500903767&src=3&ver=1&signature=X4l0IQ091w0DY2ERU7fD*h0VUwBxeHPOJH-Uk-vAfaPamMl6ij7fqAIHomnXQ2X2*2J94H0pixVjsjEkL0TbIOVd*HwElAYiJum8Q6su3tILWksr-4u9WZPSrfT7A6nErJ3f0kW8V1Jv9evurTe5X4pQrjjCZcE6WeYGwDJIH0Q=', 224 | 'http://mp.weixin.qq.com/s?timestamp=1500903767&src=3&ver=1&signature=X4l0IQ091w0DY2ERU7fD*h0VUwBxeHPOJH-Uk-vAfaPamMl6ij7fqAIHomnXQ2X2*2J94H0pixVjsjEkL0TbIBtaRJpx-JbQsm-5X*GWfaS-jBtKyhOmAxio5OIROqwV71OrvtaxYq1oZG-WM9apKbLGDPIBc0sCFUB4WBOagwk=', 225 | 'http://mp.weixin.qq.com/s?timestamp=1500903767&src=3&ver=1&signature=X4l0IQ091w0DY2ERU7fD*h0VUwBxeHPOJH-Uk-vAfaPamMl6ij7fqAIHomnXQ2X2*2J94H0pixVjsjEkL0TbID-eM8BIKq1ef1ajiKO1jz1k0E6xa1ROpt2Eo3Af6OHQGfYIq-WrfEsn3jLwps1V*TXmP6443wUYgrrStzJwKPc=', 226 | 'http://mp.weixin.qq.com/s?timestamp=1500903767&src=3&ver=1&signature=X4l0IQ091w0DY2ERU7fD*h0VUwBxeHPOJH-Uk-vAfaPamMl6ij7fqAIHomnXQ2X2*2J94H0pixVjsjEkL0TbIJenG0s3GyCaMQIK18U3CHsWrrGwuL5Z0X*DSoztV49L-ZPrf39mbml1GBkZnX*gueDdUJBIHgvyFsaVCTePLrI=', 227 | 'http://mp.weixin.qq.com/s?timestamp=1500903767&src=3&ver=1&signature=X4l0IQ091w0DY2ERU7fD*h0VUwBxeHPOJH-Uk-vAfaPamMl6ij7fqAIHomnXQ2X2*2J94H0pixVjsjEkL0TbIE2LQ5dJqrG018DC4M7E5RQ3D4V1p*eBszVaqr2saxG864LssINc8RKcASbkdSDEMiguB9xwuMcJXgGANUpBjtg=', 228 | 'http://mp.weixin.qq.com/s?timestamp=1500903767&src=3&ver=1&signature=X4l0IQ091w0DY2ERU7fD*h0VUwBxeHPOJH-Uk-vAfaPamMl6ij7fqAIHomnXQ2X2*2J94H0pixVjsjEkL0TbINN4P-L*qGaX0SopEwmBNGbOUc*Ad5D8TKEUZOPNduI4uupwRQFL*I4r151vpRYSA92EYzb34uf82WZJMa5-kTU=', 229 | 'http://mp.weixin.qq.com/s?timestamp=1500903767&src=3&ver=1&signature=X4l0IQ091w0DY2ERU7fD*h0VUwBxeHPOJH-Uk-vAfaPamMl6ij7fqAIHomnXQ2X2*2J94H0pixVjsjEkL0TbIEhfSajMgMm4uzkdEhe*6MP8H9YKg1q38xqFlBV3*sJxgwupUV8b1Q2c6OhhBEZgCTyKQvHWnGLDLBH0gvC10zQ=', 230 | 'http://mp.weixin.qq.com/s?timestamp=1500903767&src=3&ver=1&signature=X4l0IQ091w0DY2ERU7fD*h0VUwBxeHPOJH-Uk-vAfaPamMl6ij7fqAIHomnXQ2X2*2J94H0pixVjsjEkL0TbIBK5p9HtcN9dTEMbIU5Vspa3IaeGox55FYOfhNbWBL2Td4hxYt3GKGzRe-TlOPVlDWXuy8CvdD1ap1fmhNt9Cy0='] 231 | , urls) 232 | 233 | assert_equal(['善哉,善哉!', 234 | '说出来今天的人根本不会信,我的初中英文老师李女士在上课的时候打毛衣。', 235 | '奔走相告:过气网红接到新广告!请点击,请阅读,请留言!', 236 | '陛下们!微臣有话要说!', 237 | '对,我就那么说了,不服来咬我啊?', 238 | '在一个现代商业社会里,如何决定要不要去相信一个人?如何把人际关系判定的时间精力节省下来?网络慈父和菜头是这么说的:', 239 | '在一名凡夫身上,我看到了菩萨那样的行止。', 240 | '昨晚看了电影《冈仁波齐》,我不喜欢。', 241 | '如果你是厨艺初学者,忘掉火候,那不是你应该关心的事情。', 242 | '野鸡给自己加戏,观众不说话,并不等于看不明白。', ], digests) 243 | 244 | def test_get_gzh_info_and_article_by_history(self): 245 | file_name = os.path.join(fake_data_path, 'bitsea-history.html') 246 | with io.open(file_name, encoding='utf-8') as f: 247 | gzh_info_and_article_by_history = f.read() 248 | 249 | gzh_article_list = WechatSogouStructuring.get_gzh_info_and_article_by_history(gzh_info_and_article_by_history) 250 | assert_in('gzh', gzh_article_list) 251 | assert_in('article', gzh_article_list) 252 | 253 | def test_get_gzh_article_by_hot(self): 254 | file_name = os.path.join(fake_data_path, 'wapindex-wap-0612-wap_8-0.html') 255 | with io.open(file_name, encoding='utf-8') as f: 256 | gzh_article_by_hot = f.read() 257 | 258 | gzh_articles = WechatSogouStructuring.get_gzh_article_by_hot(gzh_article_by_hot) 259 | 260 | for gzh_article in gzh_articles: 261 | assert_in('gzh', gzh_article) 262 | assert_in('article', gzh_article) 263 | assert_in('http://mp.weixin.qq.com/s?src=', gzh_article['article']['url']) 264 | assert_greater_equal(len(gzh_articles), 10) 265 | 266 | wechat_names = [] 267 | headimages = [] 268 | titles = [] 269 | times = [] 270 | for i in gzh_articles: 271 | wechat_names.append(i['gzh']['wechat_name']) 272 | headimages.append(i['gzh']['headimage']) 273 | titles.append(i['article']['title']) 274 | times.append(i['article']['time']) 275 | 276 | assert_equal( 277 | ['全球汽车精选', '车早茶', '吴佩频道', '驾考宝典', '腾讯汽车', '新车评', '非常好车', '汽车情报所', 278 | '一猫汽车资讯', '资深科技控', '郎club', '科技日报', '汽车使用宝典', '名车报', '科普中国网'], 279 | wechat_names) 280 | assert_equal(['http://img03.sogoucdn.com/app/a/100520090/oIWsFt1dGMefD1f8dOg2UCwQUjKs', 281 | 'http://img04.sogoucdn.com/app/a/100520090/oIWsFtwoQX8wX7w6loDevPqLEC_I', 282 | 'http://img03.sogoucdn.com/app/a/100520090/oIWsFt9Hbbtr9VLnfR9i_K5Z8D48', 283 | 'http://img04.sogoucdn.com/app/a/100520090/oIWsFt3txmWu-usvUa6gU0qlyEVo', 284 | 'http://img01.sogoucdn.com/app/a/100520090/oIWsFt8VDujUqNSCfruXtMNfekaw', 285 | 'http://img01.sogoucdn.com/app/a/100520090/oIWsFt9YD5HWLDe5QAkuvh0JWrgw', 286 | 'http://img01.sogoucdn.com/app/a/100520090/oIWsFt_WUnpQ7lZajAstgL8o1lWo', 287 | 'http://img02.sogoucdn.com/app/a/100520090/oIWsFtzUnzWUMz1PMek5zjVlS42U', 288 | 'http://img03.sogoucdn.com/app/a/100520090/oIWsFt2yk491dhhSP940JzLEameY', 289 | 'http://img03.sogoucdn.com/app/a/100520090/oIWsFtzm9UtmgY-SkOTFwQFpGsU8', 290 | 'http://img02.sogoucdn.com/app/a/100520090/oIWsFt7VwiM8GqYcv8DBNb-k5NBQ', 291 | 'http://img03.sogoucdn.com/app/a/100520090/oIWsFt2tjckivF8b0MP_nNTdESkE', 292 | 'http://img01.sogoucdn.com/app/a/100520090/oIWsFtzC2r61_riTCWp5iHX04fmo', 293 | 'http://img02.sogoucdn.com/app/a/100520090/oIWsFt8JIY_-o7DBMxorP19hcF0Q', 294 | 'http://img04.sogoucdn.com/app/a/100520090/oIWsFtyV5sdIXU2uy4m6oVBq77nA'], 295 | headimages) 296 | assert_equal(['不做这个动作,你的轮胎3个月就要换!', 297 | '新车质量最差的十个品牌?国人表示难以接受……', 298 | '带着米其林的指引去看古德伍德|品牌', 299 | '方向盘打法巧记口诀,科目二提分就靠它了!', 300 | '宝马“鸡腿”、奥迪“游艇”,这些奇葩的挡杆你见过几个?', 301 | '你没看错,我们做了期途昂和途锐的对比', 302 | '7成特斯拉被召回,难道是质量不过关?', 303 | '在中国惹不起的7种车,遇到请回避!', 304 | '迈腾摊上大事儿了 全新一代君威17.58万起', '面对这份驾享,朝廷大人都忍不住亲自上阵!', 305 | '外卖小哥被暴晒:底层人士的悲哀,有钱人不会懂', 306 | '自动驾驶还处于“新手”阶段,何时成为“老司机”?院士这样说……', 307 | '高速上碰到石头,是躲还是撞?', '装什么神秘,不就是加长版的讴歌TLX吗!', 308 | '一个动作,车里的人集体中毒!很多人都忽略了'], 309 | titles) 310 | assert_equal( 311 | [1501328135, 1501327941, 1501326826, 1501326716, 1501326675, 1501326455, 1501326222, 1501325595, 312 | 1501325529, 1501325521, 1501325223, 1501324531, 1501324443, 1501324310, 1501323274], 313 | times) 314 | 315 | def test_get_article_by_search_wap(self): 316 | file_name = os.path.join(fake_data_path, 'search-gaokao-article.json') 317 | with io.open(file_name, encoding='utf-8') as f: 318 | wap_json = json.load(f) 319 | 320 | gzh_articles = WechatSogouStructuring.get_article_by_search_wap(gaokao_keyword, wap_json) 321 | assert_equal(10, len(gzh_articles)) 322 | 323 | titles = [] 324 | abstracts = [] 325 | gzh_names = [] 326 | isvs = [] 327 | open_ids = [] 328 | for i in gzh_articles: 329 | assert_in('gzh', i) 330 | assert_in('article', i) 331 | 332 | article = i['article'] 333 | 334 | titles.append(article['title']) 335 | abstracts.append(article['abstract']) 336 | assert_in('mp.weixin.qq.com/', article['url']) 337 | 338 | gzh = i['gzh'] 339 | 340 | assert_in('mp.weixin.qq.com/profile?src=3×tamp', gzh['profile_url']) 341 | assert_in('wx.qlogo.cn/mmhead', gzh['headimage']) 342 | gzh_names.append(gzh['wechat_name']) 343 | isvs.append(gzh['isv']) 344 | open_ids.append(gzh['open_id']) 345 | 346 | assert_equal(['高考有多重要,为什么要重视高考?丨微观点', 347 | '高考:穷人考不好,中产考状元,精英不高考', 348 | '17个高考落榜者的“逆袭”故事:高考失败,天不会塌', 349 | '刚刚,高考“满分”诞生了!(附各省高考分数线)', 350 | '高考2017 | 全国各省区市高考录取时间大汇总,最新最全!', 351 | '28省公布高考分数线!各省高考状元出炉!', 352 | '高考2017 | 教育部发布高招录取工作通知!六大事项看过来', 353 | '高考录取过程详解', 354 | '高考前互有好感,高考后开始拍拖,还一同被清华录取!学霸早恋...', 355 | '高考复读,你怕了吗?'], 356 | titles) 357 | assert_equal(['针对这个问题,其实占豪已经谈过,但还是想借高考之后、借这位小战友的留言,结合自己的人生经验,谈谈个人对这件事的看法.在占豪看来,现实的社会是分层的,一个一个阶...', 358 | '#条条大路通罗马,有人就出生在罗马#前几天北京文科高考状元熊轩昂接受澎湃新闻的采访的时候,说了下面这段话. “农村地区的孩子越来越难考上好学校,而像我这种父母都...', 359 | '从高考分数出来的那一刻,今年的考生们大概都会大胆猜想自己未来的命运:高分者,一脚踏进名牌高校工作不愁,似乎人生已经平步青云;落榜者,面对落魄的分数整日哀叹,或...', 360 | '高考会有满分的情况吗?还真有!6月22日开始,全国各省的高考成绩陆续发布.22日晚上,成都市青白江区一个小区内人声鼎沸,因为小区里有一位今年参加高考的学生,总分...', 361 | '2017年高考录取工作开始了,各省区市高考录取工作何时进行?为了方便考生和家长及时了解,小编为大家作了最新最全的梳理.(图片可点击放大查看) 北京7月6日,飞行专业...', 362 | '随着阅卷工作的结束,各地开始陆续公布2017年高考录取分数线.目前,已有28个省份公布了高考分数线.青海、新疆、西藏尚未公布.据媒体报道,青海将于6月30日前发布成绩...', 363 | '有关省级教育行政部门、招生考试机构要精心实施减少录取批次改革,完善平行志愿投档录取办法,努力提高考生志愿满足率.上海、浙江要精心组织新高考录取工作,细化完善工...', 364 | '在高考录取过程中,我省和全国各地一样都实行计算机远程网上录取的方式.录取中坚持“学校负责、招办监督”的原则,整个录取过程严格按照录取日程安排,分批次进行录取....', 365 | '但学霸们在这个问题上有自己的选择,今年佛山有一对高分学霸,两人虽早有好感,但均理性选择高考后才开始拍拖,两人一同考上清华,在班上传为佳话.然而,有家长担心孩子...', 366 | '我家孩子高考失利了,只考了326分,刚到本科线,本科没希望了,哎!我家闺女也是文科370分,真愁人,该怎么办呢?让孩子走专科,孩子不甘心,做家长的也不甘心,复习,...'] 367 | , abstracts) 368 | assert_equal(['占豪', '才华有限青年', '新闻哥', '光明网', '微言教育', '中国经济网', '阳光高考信息平台', '甘肃教育', '广州日报', '河北高考'], gzh_names) 369 | assert_equal(['0', '1', '1', '1', '1', '1', '1', '1', '1', '0'], isvs) 370 | assert_equal(['oIWsFt8nKJlpLQbQ5H9NMPBjxup8', 'oIWsFt24BFRU0oh5C8cGFo7vAwYk', 'oIWsFt7B8jj2BkEA1WsGkPU40uhU', 371 | 'oIWsFtwaY2ERrY_oAgz5pHTn4aGc', 'oIWsFt5d7GugmQYi0cNC60qYV9c4', 'oIWsFt0B7LsVbUCMpgksNY8tqIno', 372 | 'oIWsFtzrEz_Tydpahalp9daXMg0Y', 'oIWsFt5kk9RnueF3AiUOao2XrP9o', 'oIWsFt7aLTQfT_wmrF4GpT27_xjg', 373 | 'oIWsFt3nYBUhqb4beN3rTBxdUHD8'], 374 | open_ids) 375 | 376 | def test_get_article_detail(self): 377 | file_name = os.path.join(fake_data_path, 'article_detail_backgroud-image.html') 378 | with io.open(file_name, encoding='utf-8') as f: 379 | text = f.read() 380 | 381 | article_detail = WechatSogouStructuring.get_article_detail(text) 382 | assert_equal(len(article_detail['content_img_list']), 29, article_detail) 383 | assert_true('data-wxurl' not in article_detail['content_html'], article_detail['content_html']) 384 | assert_true('qqmusic' not in article_detail['content_html'], article_detail['content_html']) 385 | # 图片有src属性,无data-src属性 386 | content_html = BeautifulSoup(article_detail['content_html'], 'lxml') 387 | imgs = content_html.find_all("img", src=re.compile(r'http')) 388 | assert_equal(len(imgs), 29, imgs) 389 | for img in imgs: 390 | assert_is_none(img.attrs.get('data-src')) 391 | 392 | file_name = os.path.join(fake_data_path, 'article_detail_mpvoice.html') 393 | with io.open(file_name, encoding='utf-8') as f: 394 | text = f.read() 395 | 396 | article_detail = WechatSogouStructuring.get_article_detail(text) 397 | assert_equal(len(article_detail['content_img_list']), 9, article_detail) 398 | assert_true('data-wxurl' not in article_detail['content_html'], article_detail['content_html']) 399 | assert_true('qqmusic' not in article_detail['content_html'], article_detail['content_html']) 400 | assert_true('mpvoice' not in article_detail['content_html'], article_detail['content_html']) 401 | 402 | file_name = os.path.join(fake_data_path, 'article_detail_qqmusic.html') 403 | with io.open(file_name, encoding='utf-8') as f: 404 | text = f.read() 405 | 406 | article_detail = WechatSogouStructuring.get_article_detail(text) 407 | assert_equal(len(article_detail['content_img_list']), 2, article_detail) 408 | assert_true('data-wxurl' not in article_detail['content_html'], article_detail['content_html']) 409 | assert_true('qqmusic' not in article_detail['content_html'], article_detail['content_html']) 410 | assert_true('mpvoice' not in article_detail['content_html'], article_detail['content_html']) 411 | 412 | file_name = os.path.join(fake_data_path, 'article_detail_iframe.html') 413 | with io.open(file_name, encoding='utf-8') as f: 414 | text = f.read() 415 | 416 | article_detail = WechatSogouStructuring.get_article_detail(text) 417 | assert_equal(len(article_detail['content_img_list']), 6, article_detail) 418 | assert_not_in('data-wxurl', article_detail['content_html'], article_detail['content_html']) 419 | assert_not_in('qqmusic', article_detail['content_html'], article_detail['content_html']) 420 | assert_not_in('mpvoice', article_detail['content_html'], article_detail['content_html']) 421 | 422 | # 图片有src属性,无data-src属性 423 | content_html = BeautifulSoup(article_detail['content_html'], 'lxml') 424 | iframes = content_html.find_all("iframe", src=re.compile(r'http')) 425 | assert_equal(len(iframes), 1, iframes) 426 | for iframe in iframes: 427 | assert_is_none(iframe.attrs.get('data-src')) 428 | 429 | 430 | if __name__ == '__main__': 431 | unittest.main() 432 | -------------------------------------------------------------------------------- /test/test_tools.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | 5 | from nose.tools import assert_raises, assert_equal 6 | from lxml import etree 7 | 8 | from wechatsogou.tools import list_or_empty, get_elem_text, replace_html, str_to_dict, replace_space, get_url_param 9 | 10 | 11 | class TestTools(unittest.TestCase): 12 | def test_list_or_empty(self): 13 | with assert_raises(AssertionError): 14 | list_or_empty('test for fun') 15 | 16 | assert_equal(list_or_empty(['1', '2'], int), 1) 17 | assert_equal(list_or_empty(['1', '2']), '1') 18 | assert_equal(list_or_empty([], int), 0) 19 | assert_equal(list_or_empty([], str), '') 20 | assert_equal(list_or_empty([], list), []) 21 | 22 | def test_get_elem_text(self): 23 | html = ''' 24 |
    25 |
    111
    26 |
    222
    27 |
    28 | ''' 29 | elem = etree.HTML(html) 30 | assert_equal(get_elem_text(elem), '111222') 31 | 32 | def test_replace_html(self): 33 | html = ''''"&¥amp;<> \\''' 34 | assert_equal(replace_html(html), '\'"&¥<> ') 35 | 36 | html = [''', '"', '&', '¥', 'amp;', '<', '>', ' ', '\\'] 37 | assert_equal(replace_html(html), ['\'', '"', '&', '¥', '', '<', '>', ' ', '']) 38 | 39 | html = {''': '"'} 40 | assert_equal(replace_html(html), {'\'': '"'}) 41 | 42 | def test_str_to_dict(self): 43 | string = "{'a':'a'}" 44 | assert_equal(str_to_dict(string), {'a': 'a'}) 45 | 46 | def test_replace_space(self): 47 | string = 'ss ss' 48 | assert_equal(replace_space(string), 'ssss') 49 | 50 | def test_get_url_param(self): 51 | url = 'http://example.com?a=1&b=2&a=3' 52 | assert_equal(get_url_param(url), {'a': ['1', '3'], 'b': ['2']}) 53 | 54 | 55 | if __name__ == '__main__': 56 | unittest.main() 57 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27,py35,py36 3 | 4 | [testenv] 5 | passenv = * 6 | deps= 7 | setuptools==34.3.1 8 | requests 9 | lxml 10 | future 11 | Pillow 12 | Werkzeug 13 | nose 14 | httpretty 15 | hypothesis 16 | bs4 17 | commands = 18 | #python setup.py test 19 | python -m nose -vs 20 | -------------------------------------------------------------------------------- /wechatsogou/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # __ __ _ _ ____ 4 | # \ \ / /__ ___| |__ __ _| |_/ ___| ___ __ _ ___ _ _ 5 | # \ \ /\ / / _ \/ __| '_ \ / _` | __\___ \ / _ \ / _` |/ _ \| | | | 6 | # \ V V / __/ (__| | | | (_| | |_ ___) | (_) | (_| | (_) | |_| | 7 | # \_/\_/ \___|\___|_| |_|\__,_|\__|____/ \___/ \__, |\___/ \__,_| 8 | # |___/ 9 | 10 | """ 11 | WechatSogou Crawler Library 12 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 13 | 14 | """ 15 | 16 | from wechatsogou.api import WechatSogouAPI 17 | from wechatsogou.const import WechatSogouConst 18 | from wechatsogou.request import WechatSogouRequest 19 | from wechatsogou.structuring import WechatSogouStructuring 20 | from wechatsogou.exceptions import WechatSogouException, WechatSogouVcodeOcrException, WechatSogouRequestsException 21 | 22 | __all__ = [ 23 | 'WechatSogouConst', 24 | 25 | 'WechatSogouAPI', 26 | 'WechatSogouRequest', 27 | 'WechatSogouStructuring', 28 | 29 | 'WechatSogouException', 30 | 'WechatSogouVcodeOcrException', 31 | 'WechatSogouRequestsException'] 32 | 33 | __title__ = 'wechatsogou' 34 | __version__ = "4.5.4" 35 | __author__ = 'Chyroc' 36 | 37 | """doc string 38 | 39 | https://www.jetbrains.com/help/pycharm/type-hinting-in-pycharm.html 40 | https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt 41 | """ 42 | -------------------------------------------------------------------------------- /wechatsogou/api.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import, print_function, unicode_literals 4 | 5 | import json 6 | import math 7 | import random 8 | import re 9 | import time 10 | 11 | import requests 12 | 13 | from wechatsogou.const import agents, WechatSogouConst 14 | from wechatsogou.exceptions import WechatSogouException, WechatSogouRequestsException, WechatSogouVcodeOcrException 15 | from wechatsogou.five import must_str, quote 16 | from wechatsogou.identify_image import (identify_image_callback_by_hand, unlock_sogou_callback_example, unlock_weixin_callback_example, ws_cache) 17 | from wechatsogou.request import WechatSogouRequest 18 | from wechatsogou.structuring import WechatSogouStructuring 19 | from wechatsogou.tools import may_int 20 | 21 | 22 | class WechatSogouAPI(object): 23 | def __init__(self, captcha_break_time=1, headers=None, **kwargs): 24 | """初始化参数 25 | 26 | Parameters 27 | ---------- 28 | captcha_break_time : int 29 | 验证码输入错误重试次数 30 | proxies : dict 31 | 代理 32 | timeout : float 33 | 超时时间 34 | """ 35 | assert isinstance(captcha_break_time, int) and 0 < captcha_break_time < 20 36 | 37 | self.captcha_break_times = captcha_break_time 38 | self.requests_kwargs = kwargs 39 | self.headers = headers 40 | if self.headers: 41 | self.headers['User-Agent'] = random.choice(agents) 42 | else: 43 | self.headers = {'User-Agent': random.choice(agents)} 44 | 45 | def __set_cookie(self, suv=None, snuid=None, referer=None): 46 | suv = ws_cache.get('suv') if suv is None else suv 47 | snuid = ws_cache.get('snuid') if snuid is None else snuid 48 | _headers = {'Cookie': 'SUV={};SNUID={};'.format(suv, snuid)} 49 | if referer is not None: 50 | _headers['Referer'] = referer 51 | return _headers 52 | 53 | def __set_cache(self, suv, snuid): 54 | ws_cache.set('suv', suv) 55 | ws_cache.set('snuid', snuid) 56 | 57 | def __get(self, url, session, headers): 58 | h = {} 59 | if headers: 60 | for k, v in headers.items(): 61 | h[k] = v 62 | if self.headers: 63 | for k, v in self.headers.items(): 64 | h[k] = v 65 | resp = session.get(url, headers=h, **self.requests_kwargs) 66 | 67 | if not resp.ok: 68 | raise WechatSogouRequestsException('WechatSogouAPI get error', resp) 69 | 70 | return resp 71 | 72 | def __unlock_sogou(self, url, resp, session, unlock_callback=None, identify_image_callback=None): 73 | if unlock_callback is None: 74 | unlock_callback = unlock_sogou_callback_example 75 | millis = int(round(time.time() * 1000)) 76 | r_captcha = session.get('http://weixin.sogou.com/antispider/util/seccode.php?tc={}'.format(millis), headers={ 77 | 'Referer': url, 78 | }) 79 | if not r_captcha.ok: 80 | raise WechatSogouRequestsException('WechatSogouAPI get img', r_captcha) 81 | 82 | r_unlock = unlock_callback(url, session, resp, r_captcha.content, identify_image_callback) 83 | 84 | if r_unlock['code'] != 0: 85 | raise WechatSogouVcodeOcrException( 86 | '[WechatSogouAPI identify image] code: {code}, msg: {msg}'.format(code=r_unlock.get('code'), 87 | msg=r_unlock.get('msg'))) 88 | else: 89 | self.__set_cache(session.cookies.get('SUID'), r_unlock['id']) 90 | 91 | def __unlock_wechat(self, url, resp, session, unlock_callback=None, identify_image_callback=None): 92 | if unlock_callback is None: 93 | unlock_callback = unlock_weixin_callback_example 94 | 95 | r_captcha = session.get('https://mp.weixin.qq.com/mp/verifycode?cert={}'.format(time.time() * 1000)) 96 | if not r_captcha.ok: 97 | raise WechatSogouRequestsException('WechatSogouAPI unlock_history get img', resp) 98 | 99 | r_unlock = unlock_callback(url, session, resp, r_captcha.content, identify_image_callback) 100 | 101 | if r_unlock['ret'] != 0: 102 | raise WechatSogouVcodeOcrException( 103 | '[WechatSogouAPI identify image] code: {ret}, msg: {errmsg}, cookie_count: {cookie_count}'.format( 104 | ret=r_unlock.get('ret'), errmsg=r_unlock.get('errmsg'), cookie_count=r_unlock.get('cookie_count'))) 105 | 106 | def __get_by_unlock(self, url, referer=None, unlock_platform=None, unlock_callback=None, identify_image_callback=None, session=None): 107 | assert unlock_platform is None or callable(unlock_platform) 108 | 109 | if identify_image_callback is None: 110 | identify_image_callback = identify_image_callback_by_hand 111 | assert unlock_callback is None or callable(unlock_callback) 112 | assert callable(identify_image_callback) 113 | 114 | if not session: 115 | session = requests.session() 116 | resp = self.__get(url, session, headers=self.__set_cookie(referer=referer)) 117 | resp.encoding = 'utf-8' 118 | if 'antispider' in resp.url or '请输入验证码' in resp.text: 119 | for i in range(self.captcha_break_times): 120 | try: 121 | unlock_platform(url=url, resp=resp, session=session, unlock_callback=unlock_callback, identify_image_callback=identify_image_callback) 122 | break 123 | except WechatSogouVcodeOcrException as e: 124 | if i == self.captcha_break_times - 1: 125 | raise WechatSogouVcodeOcrException(e) 126 | 127 | if '请输入验证码' in resp.text: 128 | resp = session.get(url) 129 | resp.encoding = 'utf-8' 130 | else: 131 | headers = self.__set_cookie(referer=referer) 132 | headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; WOW64)' 133 | resp = self.__get(url, session, headers) 134 | resp.encoding = 'utf-8' 135 | 136 | return resp 137 | 138 | def __hosting_wechat_img(self, content_info, hosting_callback): 139 | """将微信明细中图片托管到云端,同时将html页面中的对应图片替换 140 | 141 | Parameters 142 | ---------- 143 | content_info : dict 微信文章明细字典 144 | { 145 | 'content_img_list': [], # 从微信文章解析出的原始图片列表 146 | 'content_html': '', # 从微信文章解析出文章的内容 147 | } 148 | hosting_callback : callable 149 | 托管回调函数,传入单个图片链接,返回托管后的图片链接 150 | 151 | Returns 152 | ------- 153 | dict 154 | { 155 | 'content_img_list': '', # 托管后的图片列表 156 | 'content_html': '', # 图片链接为托管后的图片链接内容 157 | } 158 | """ 159 | assert callable(hosting_callback) 160 | 161 | content_img_list = content_info.pop("content_img_list") 162 | content_html = content_info.pop("content_html") 163 | for idx, img_url in enumerate(content_img_list): 164 | hosting_img_url = hosting_callback(img_url) 165 | if not hosting_img_url: 166 | # todo 定义标准异常 167 | raise Exception() 168 | content_img_list[idx] = hosting_img_url 169 | content_html = content_html.replace(img_url, hosting_img_url) 170 | 171 | return dict(content_img_list=content_img_list, content_html=content_html) 172 | 173 | def __format_url(self, url, referer, text, unlock_callback=None, identify_image_callback=None, session=None): 174 | def _parse_url(url, pads): 175 | b = math.floor(random.random() * 100) + 1 176 | a = url.find("url=") 177 | c = url.find("&k=") 178 | if a != -1 and c == -1: 179 | sum = 0 180 | for i in list(pads) + [a, b]: 181 | sum += int(must_str(i)) 182 | a = url[sum] 183 | 184 | return '{}&k={}&h={}'.format(url, may_int(b), may_int(a)) 185 | 186 | if url.startswith('/link?url='): 187 | url = 'https://weixin.sogou.com{}'.format(url) 188 | 189 | pads = re.findall(r'href\.substr\(a\+(\d+)\+parseInt\("(\d+)"\)\+b,1\)', text) 190 | url = _parse_url(url, pads[0] if pads else []) 191 | resp = self.__get_by_unlock(url, 192 | referer=referer, 193 | unlock_platform=self.__unlock_sogou, 194 | unlock_callback=unlock_callback, 195 | identify_image_callback=identify_image_callback, 196 | session=session) 197 | uri = '' 198 | base_url = re.findall(r'var url = \'(.*?)\';', resp.text) 199 | if base_url and len(base_url) > 0: 200 | uri = base_url[0] 201 | 202 | mp_url = re.findall(r'url \+= \'(.*?)\';', resp.text) 203 | if mp_url: 204 | uri = uri + ''.join(mp_url) 205 | url = uri.replace('@', '') 206 | return url 207 | 208 | def get_gzh_info(self, wecgat_id_or_name, unlock_callback=None, identify_image_callback=None, decode_url=True): 209 | """获取公众号微信号 wechatid 的信息 210 | 211 | 因为wechatid唯一确定,所以第一个就是要搜索的公众号 212 | 213 | Parameters 214 | ---------- 215 | wecgat_id_or_name : str or unicode 216 | wechat_id or wechat_name 217 | unlock_callback : callable 218 | 处理出现验证码页面的函数,参见 unlock_callback_example 219 | identify_image_callback : callable 220 | 处理验证码函数,输入验证码二进制数据,输出文字,参见 identify_image_callback_example 221 | 222 | Returns 223 | ------- 224 | dict or None 225 | { 226 | 'open_id': '', # 微信号唯一ID 227 | 'profile_url': '', # 最近10条群发页链接 228 | 'headimage': '', # 头像 229 | 'wechat_name': '', # 名称 230 | 'wechat_id': '', # 微信id 231 | 'post_perm': '', # 最近一月群发数 232 | 'qrcode': '', # 二维码 233 | 'introduction': '', # 介绍 234 | 'authentication': '' # 认证 235 | } 236 | """ 237 | info = self.search_gzh(wecgat_id_or_name, 1, unlock_callback, identify_image_callback, decode_url) 238 | try: 239 | return next(info) 240 | except StopIteration: 241 | return None 242 | 243 | def search_gzh(self, keyword, page=1, unlock_callback=None, identify_image_callback=None, decode_url=True): 244 | """搜索 公众号 245 | 246 | 对于出现验证码的情况,可以由使用者自己提供: 247 | 1、函数 unlock_callback ,这个函数 handle 出现验证码到解决的整个流程 248 | 2、也可以 只提供函数 identify_image_callback,这个函数输入验证码二进制数据,输出验证码文字,剩下的由 wechatsogou 包来解决 249 | 注意: 250 | 函数 unlock_callback 和 identify_image_callback 只需要提供一个,如果都提供了,那么 identify_image_callback 不起作用 251 | 252 | Parameters 253 | ---------- 254 | keyword : str or unicode 255 | 搜索文字 256 | page : int, optional 257 | 页数 the default is 1 258 | unlock_callback : callable 259 | 处理出现验证码页面的函数,参见 unlock_callback_example 260 | identify_image_callback : callable 261 | 处理验证码函数,输入验证码二进制数据,输出文字,参见 identify_image_callback_example 262 | decode_url : bool 263 | 是否解析 url 264 | 265 | Returns 266 | ------- 267 | list[dict] 268 | { 269 | 'open_id': '', # 微信号唯一ID 270 | 'profile_url': '', # 最近10条群发页链接 271 | 'headimage': '', # 头像 272 | 'wechat_name': '', # 名称 273 | 'wechat_id': '', # 微信id 274 | 'post_perm': '', # 最近一月群发数 275 | 'qrcode': '', # 二维码 276 | 'introduction': '', # 介绍 277 | 'authentication': '' # 认证 278 | } 279 | 280 | Raises 281 | ------ 282 | WechatSogouRequestsException 283 | requests error 284 | """ 285 | url = WechatSogouRequest.gen_search_gzh_url(keyword, page) 286 | session = requests.session() 287 | resp = self.__get_by_unlock(url, 288 | unlock_platform=self.__unlock_sogou, 289 | unlock_callback=unlock_callback, 290 | identify_image_callback=identify_image_callback, 291 | session=session) 292 | gzh_list = WechatSogouStructuring.get_gzh_by_search(resp.text) 293 | for i in gzh_list: 294 | if decode_url: 295 | i['profile_url'] = self.__format_url(i['profile_url'], url, resp.text, unlock_callback=unlock_callback, identify_image_callback=identify_image_callback, session=session) 296 | yield i 297 | 298 | def search_article(self, keyword, page=1, timesn=WechatSogouConst.search_article_time.anytime, 299 | article_type=WechatSogouConst.search_article_type.all, ft=None, et=None, 300 | unlock_callback=None, 301 | identify_image_callback=None, 302 | decode_url=True): 303 | """搜索 文章 304 | 305 | 对于出现验证码的情况,可以由使用者自己提供: 306 | 1、函数 unlock_callback ,这个函数 handle 出现验证码到解决的整个流程 307 | 2、也可以 只提供函数 identify_image_callback,这个函数输入验证码二进制数据,输出验证码文字,剩下的由 wechatsogou 包来解决 308 | 注意: 309 | 函数 unlock_callback 和 identify_image_callback 只需要提供一个,如果都提供了,那么 identify_image_callback 不起作用 310 | 311 | Parameters 312 | ---------- 313 | keyword : str or unicode 314 | 搜索文字 315 | page : int, optional 316 | 页数 the default is 1 317 | timesn : WechatSogouConst.search_article_time 318 | 时间 anytime 没有限制 / day 一天 / week 一周 / month 一月 / year 一年 / specific 自定 319 | the default is anytime 320 | article_type : WechatSogouConst.search_article_type 321 | 含有内容的类型 image 有图 / video 有视频 / rich 有图和视频 / all 啥都有 322 | ft, et : datetime.date or None 323 | 当 tsn 是 specific 时,ft 代表开始时间,如: 2017-07-01 324 | 当 tsn 是 specific 时,et 代表结束时间,如: 2017-07-15 325 | unlock_callback : callable 326 | 处理出现验证码页面的函数,参见 unlock_callback_example 327 | identify_image_callback : callable 328 | 处理验证码函数,输入验证码二进制数据,输出文字,参见 identify_image_callback_example 329 | decode_url : bool 330 | 是否解析 url 331 | 332 | Returns 333 | ------- 334 | list[dict] 335 | { 336 | 'article': { 337 | 'title': '', # 文章标题 338 | 'url': '', # 文章链接 339 | 'imgs': '', # 文章图片list 340 | 'abstract': '', # 文章摘要 341 | 'time': '' # 文章推送时间 342 | }, 343 | 'gzh': { 344 | 'profile_url': '', # 公众号最近10条群发页链接 345 | 'headimage': '', # 头像 346 | 'wechat_name': '', # 名称 347 | 'isv': '', # 是否加v 348 | } 349 | } 350 | 351 | Raises 352 | ------ 353 | WechatSogouRequestsException 354 | requests error 355 | """ 356 | url = WechatSogouRequest.gen_search_article_url(keyword, page, timesn, article_type, ft, et) 357 | session = requests.session() 358 | resp = self.__get_by_unlock(url, WechatSogouRequest.gen_search_article_url(keyword), 359 | unlock_platform=self.__unlock_sogou, 360 | unlock_callback=unlock_callback, 361 | identify_image_callback=identify_image_callback, 362 | session=session) 363 | 364 | article_list = WechatSogouStructuring.get_article_by_search(resp.text) 365 | for i in article_list: 366 | if decode_url: 367 | i['article']['url'] = self.__format_url(i['article']['url'], url, resp.text, unlock_callback=unlock_callback, identify_image_callback=identify_image_callback, session=session) 368 | i['gzh']['profile_url'] = self.__format_url(i['gzh']['profile_url'], url, resp.text, unlock_callback=unlock_callback, identify_image_callback=identify_image_callback, session=session) 369 | yield i 370 | 371 | def get_gzh_article_by_history(self, keyword=None, url=None, 372 | unlock_callback_sogou=None, 373 | identify_image_callback_sogou=None, 374 | unlock_callback_weixin=None, 375 | identify_image_callback_weixin=None): 376 | """从 公众号的最近10条群发页面 提取公众号信息 和 文章列表信息 377 | 378 | 对于出现验证码的情况,可以由使用者自己提供: 379 | 1、函数 unlock_callback ,这个函数 handle 出现验证码到解决的整个流程 380 | 2、也可以 只提供函数 identify_image_callback,这个函数输入验证码二进制数据,输出验证码文字,剩下的由 wechatsogou 包来解决 381 | 注意: 382 | 函数 unlock_callback 和 identify_image_callback 只需要提供一个,如果都提供了,那么 identify_image_callback 不起作用 383 | 384 | Parameters 385 | ---------- 386 | keyword : str or unicode 387 | 公众号的id 或者name 388 | url : str or unicode 389 | 群发页url,如果不提供url,就先去搜索一遍拿到url 390 | unlock_callback_sogou : callable 391 | 处理出现 搜索 的时候出现验证码的函数,参见 unlock_callback_example 392 | identify_image_callback_sogou : callable 393 | 处理 搜索 的时候处理验证码函数,输入验证码二进制数据,输出文字,参见 identify_image_callback_example 394 | unlock_callback_weixin : callable 395 | 处理出现 历史页 的时候出现验证码的函数,参见 unlock_callback_example 396 | identify_image_callback_weixin : callable 397 | 处理 历史页 的时候处理验证码函数,输入验证码二进制数据,输出文字,参见 identify_image_callback_example 398 | 399 | Returns 400 | ------- 401 | dict 402 | { 403 | 'gzh': { 404 | 'wechat_name': '', # 名称 405 | 'wechat_id': '', # 微信id 406 | 'introduction': '', # 描述 407 | 'authentication': '', # 认证 408 | 'headimage': '' # 头像 409 | }, 410 | 'article': [ 411 | { 412 | 'send_id': '', # 群发id,注意不唯一,因为同一次群发多个消息,而群发id一致 413 | 'datetime': '', # 群发datatime 414 | 'type': '', # 消息类型,均是49,表示图文 415 | 'main': 0, # 是否是一次群发的第一次消息 416 | 'title': '', # 文章标题 417 | 'abstract': '', # 摘要 418 | 'fileid': '', # 419 | 'content_url': '', # 文章链接 420 | 'source_url': '', # 阅读原文的链接 421 | 'cover': '', # 封面图 422 | 'author': '', # 作者 423 | 'copyright_stat': '', # 文章类型,例如:原创啊 424 | }, 425 | ... 426 | ] 427 | } 428 | 429 | 430 | Raises 431 | ------ 432 | WechatSogouRequestsException 433 | requests error 434 | """ 435 | if url is None: 436 | gzh_list = self.get_gzh_info(keyword, unlock_callback_sogou, identify_image_callback_sogou) 437 | if gzh_list is None: 438 | return {} 439 | if 'profile_url' not in gzh_list: 440 | raise Exception() # todo use ws exception 441 | url = gzh_list['profile_url'] 442 | 443 | resp = self.__get_by_unlock(url, WechatSogouRequest.gen_search_article_url(keyword), 444 | unlock_platform=self.__unlock_wechat, 445 | unlock_callback=unlock_callback_weixin, 446 | identify_image_callback=identify_image_callback_weixin) 447 | 448 | return WechatSogouStructuring.get_gzh_info_and_article_by_history(resp.text) 449 | 450 | def get_gzh_article_by_hot(self, hot_index, page=1, unlock_callback=None, identify_image_callback=None): 451 | """获取 首页热门文章 452 | 453 | Parameters 454 | ---------- 455 | hot_index : WechatSogouConst.hot_index 456 | 首页热门文章的分类(常量):WechatSogouConst.hot_index.xxx 457 | page : int 458 | 页数 459 | 460 | Returns 461 | ------- 462 | list[dict] 463 | { 464 | 'gzh': { 465 | 'headimage': str, # 公众号头像 466 | 'wechat_name': str, # 公众号名称 467 | }, 468 | 'article': { 469 | 'url': str, # 文章临时链接 470 | 'title': str, # 文章标题 471 | 'abstract': str, # 文章摘要 472 | 'time': int, # 推送时间,10位时间戳 473 | 'open_id': str, # open id 474 | 'main_img': str # 封面图片 475 | } 476 | } 477 | """ 478 | 479 | assert hasattr(WechatSogouConst.hot_index, hot_index) 480 | assert isinstance(page, int) and page > 0 481 | 482 | url = WechatSogouRequest.gen_hot_url(hot_index, page) 483 | resp = self.__get_by_unlock(url, 484 | unlock_platform=self.__unlock_sogou, 485 | unlock_callback=unlock_callback, 486 | identify_image_callback=identify_image_callback) 487 | 488 | resp.encoding = 'utf-8' 489 | return WechatSogouStructuring.get_gzh_article_by_hot(resp.text) 490 | 491 | def get_article_content(self, url, del_qqmusic=True, del_mpvoice=True, unlock_callback=None, 492 | identify_image_callback=None, hosting_callback=None, raw=False): 493 | """获取文章原文,避免临时链接失效 494 | 495 | Parameters 496 | ---------- 497 | url : str or unicode 498 | 原文链接,临时链接 499 | raw : bool 500 | True: 返回原始html 501 | False: 返回处理后的html 502 | del_qqmusic: bool 503 | True:微信原文中有插入的qq音乐,则删除 504 | False:微信源文中有插入的qq音乐,则保留 505 | del_mpvoice: bool 506 | True:微信原文中有插入的语音消息,则删除 507 | False:微信源文中有插入的语音消息,则保留 508 | unlock_callback : callable 509 | 处理 文章明细 的时候出现验证码的函数,参见 unlock_callback_example 510 | identify_image_callback : callable 511 | 处理 文章明细 的时候处理验证码函数,输入验证码二进制数据,输出文字,参见 identify_image_callback_example 512 | hosting_callback: callable 513 | 将微信采集的文章托管到7牛或者阿里云回调函数,输入微信图片源地址,返回托管后地址 514 | 515 | Returns 516 | ------- 517 | content_html 518 | 原文内容 519 | content_img_list 520 | 文章中图片列表 521 | 522 | Raises 523 | ------ 524 | WechatSogouRequestsException 525 | """ 526 | 527 | resp = self.__get_by_unlock(url, 528 | unlock_platform=self.__unlock_wechat, 529 | unlock_callback=unlock_callback, 530 | identify_image_callback=identify_image_callback) 531 | 532 | resp.encoding = 'utf-8' 533 | if '链接已过期' in resp.text: 534 | raise WechatSogouException('get_article_content 链接 [{}] 已过期'.format(url)) 535 | if raw: 536 | return resp.text 537 | content_info = WechatSogouStructuring.get_article_detail(resp.text, del_qqmusic=del_qqmusic, 538 | del_voice=del_mpvoice) 539 | if hosting_callback: 540 | content_info = self.__hosting_wechat_img(content_info, hosting_callback) 541 | return content_info 542 | 543 | def get_sugg(self, keyword): 544 | """获取微信搜狗搜索关键词联想 545 | 546 | Parameters 547 | ---------- 548 | keyword : str or unicode 549 | 关键词 550 | 551 | Returns 552 | ------- 553 | list[str] 554 | 联想关键词列表 555 | 556 | Raises 557 | ------ 558 | WechatSogouRequestsException 559 | """ 560 | url = 'http://w.sugg.sogou.com/sugg/ajaj_json.jsp?key={}&type=wxpub&pr=web'.format( 561 | quote(keyword.encode('utf-8'))) 562 | r = requests.get(url) 563 | if not r.ok: 564 | raise WechatSogouRequestsException('get_sugg', r) 565 | 566 | sugg = re.findall(u'\["' + keyword + '",(.*?),\["', r.text)[0] 567 | return json.loads(sugg) 568 | -------------------------------------------------------------------------------- /wechatsogou/const.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from functools import wraps 4 | 5 | from wechatsogou.exceptions import WechatSogouException 6 | 7 | agents = [ 8 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 9 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", 10 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 11 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", 12 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 13 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 14 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 15 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 16 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 17 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 18 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 19 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", 20 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", 21 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 22 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", 23 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", 24 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11", 25 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER", 26 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)", 27 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)", 28 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER", 29 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)", 30 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)", 31 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)", 32 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)", 33 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)", 34 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)", 35 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1", 36 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1", 37 | "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5", 38 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre", 39 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0", 40 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11", 41 | "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10", 42 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36", 43 | ] 44 | 45 | 46 | def Const(cls): 47 | @wraps(cls) 48 | def new_setattr(self, name, value): 49 | raise WechatSogouException('const : {} can not be changed'.format(name)) 50 | 51 | cls.__setattr__ = new_setattr 52 | return cls 53 | 54 | 55 | @Const 56 | class _WechatSogouSearchArticleTypeConst(object): 57 | all = 'all' 58 | rich = 'rich' 59 | video = 'video' 60 | image = 'image' 61 | 62 | 63 | @Const 64 | class _WechatSogouSearchArticleTimeConst(object): 65 | """搜索条件 时间 66 | 67 | 0 没有限制 / 1一天 / 2一周 / 3一月 / 4一年 / 5自定 68 | """ 69 | anytime = 0 70 | day = 1 71 | week = 2 72 | month = 3 73 | year = 4 74 | specific = 5 75 | 76 | 77 | @Const 78 | class _WechatSogouHotIndexConst(object): 79 | hot = 'hot' # 热门 80 | gaoxiao = 'gaoxiao' # 搞笑 81 | health = 'health' # 养生 82 | sifanghua = 'sifanghua' # 私房话 83 | gossip = 'gossip' # 八卦 84 | technology = 'technology' # 科技 85 | finance = 'finance' # 财经 86 | car = 'car' # 汽车 87 | life = 'life' # 生活 88 | fashion = 'fashion' # 时尚 89 | mummy = 'mummy' # 辣妈 / 育儿 90 | travel = 'travel' # 旅行 91 | job = 'job' # 职场 92 | food = 'food' # 美食 93 | history = 'history' # 历史 94 | study = 'study' # 学霸 / 教育 95 | constellation = 'constellation' # 星座 96 | sport = 'sport' # 体育 97 | military = 'military' # 军事 98 | game = 'game' # 游戏 99 | pet = 'pet' # 萌宠 100 | 101 | 102 | @Const 103 | class _Const(object): 104 | hot_index = _WechatSogouHotIndexConst() 105 | search_article_type = _WechatSogouSearchArticleTypeConst() 106 | search_article_time = _WechatSogouSearchArticleTimeConst() 107 | 108 | 109 | WechatSogouConst = _Const() 110 | -------------------------------------------------------------------------------- /wechatsogou/exceptions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | class WechatSogouException(Exception): 5 | """基于搜狗搜索的的微信公众号爬虫接口 异常基类 6 | """ 7 | pass 8 | 9 | 10 | class WechatSogouVcodeOcrException(WechatSogouException): 11 | """基于搜狗搜索的的微信公众号爬虫接口 验证码 识别错误 异常类 12 | """ 13 | pass 14 | 15 | 16 | class WechatSogouRequestsException(WechatSogouException): 17 | """基于搜狗搜索的的微信公众号爬虫接口 抓取 异常类 18 | 19 | Parameters 20 | ---------- 21 | errmsg : str or unicode 22 | msg 23 | r : requests.models.Response 24 | return of requests 25 | """ 26 | 27 | def __init__(self, errmsg, r): 28 | WechatSogouException('{} [url {}] [content {}]'.format(errmsg, r.url, r.content)) 29 | self.status_code = r.status_code 30 | -------------------------------------------------------------------------------- /wechatsogou/filecache.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from werkzeug.contrib.cache import FileSystemCache 4 | 5 | 6 | class WechatCache(FileSystemCache): 7 | """基于文件的缓存 8 | 9 | """ 10 | 11 | def __init__(self, cache_dir='/tmp/wechatsogou-cache', default_timeout=300): 12 | """初始化 13 | 14 | cache_dir是缓存目录 15 | """ 16 | super(WechatCache, self).__init__(cache_dir, default_timeout) 17 | 18 | def get(self, key): 19 | try: 20 | return super(WechatCache, self).get(key) 21 | except ValueError: 22 | return None 23 | -------------------------------------------------------------------------------- /wechatsogou/five.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from PIL import Image 4 | import six 5 | 6 | if six.PY2: 7 | import sys 8 | import urlparse as url_parse 9 | from urllib import urlencode 10 | from urllib import unquote 11 | from urllib import quote as quote 12 | import StringIO 13 | 14 | def readimg(content): 15 | return Image.open(StringIO.StringIO(content)) 16 | 17 | reload(sys) 18 | sys.setdefaultencoding('utf-8') 19 | input = raw_input 20 | str_to_bytes = bytes 21 | def must_str(s): 22 | if isinstance(s,unicode): 23 | s = s.encode('utf-8') 24 | return s 25 | else: 26 | import urllib.parse as url_parse 27 | import urllib.parse 28 | from urllib.parse import unquote 29 | from urllib.request import quote as quote 30 | import tempfile 31 | 32 | def readimg(content): 33 | f = tempfile.TemporaryFile() 34 | f.write(content) 35 | return Image.open(f) 36 | 37 | urlencode = urllib.parse.urlencode 38 | input = input 39 | str_to_bytes = lambda x: bytes(x, encoding='utf-8') 40 | def must_str(s): 41 | return s 42 | 43 | -------------------------------------------------------------------------------- /wechatsogou/identify_image.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import, unicode_literals, print_function 4 | 5 | import time 6 | 7 | import requests 8 | 9 | from wechatsogou.five import readimg, input 10 | from wechatsogou.filecache import WechatCache 11 | from wechatsogou.exceptions import WechatSogouVcodeOcrException 12 | 13 | ws_cache = WechatCache() 14 | 15 | 16 | def identify_image_callback_by_hand(img): 17 | """识别二维码 18 | 19 | Parameters 20 | ---------- 21 | img : bytes 22 | 验证码图片二进制数据 23 | 24 | Returns 25 | ------- 26 | str 27 | 验证码文字 28 | """ 29 | im = readimg(img) 30 | im.show() 31 | return input("please input code: ") 32 | 33 | 34 | def unlock_sogou_callback_example(url, req, resp, img, identify_image_callback): 35 | """手动打码解锁 36 | 37 | Parameters 38 | ---------- 39 | url : str or unicode 40 | 验证码页面 之前的 url 41 | req : requests.sessions.Session 42 | requests.Session() 供调用解锁 43 | resp : requests.models.Response 44 | requests 访问页面返回的,已经跳转了 45 | img : bytes 46 | 验证码图片二进制数据 47 | identify_image_callback : callable 48 | 处理验证码函数,输入验证码二进制数据,输出文字,参见 identify_image_callback_example 49 | 50 | Returns 51 | ------- 52 | dict 53 | { 54 | 'code': '', 55 | 'msg': '', 56 | } 57 | """ 58 | # no use resp 59 | url_quote = url.split('weixin.sogou.com/')[-1] 60 | unlock_url = 'http://weixin.sogou.com/antispider/thank.php' 61 | data = { 62 | 'c': identify_image_callback(img), 63 | 'r': '%2F' + url_quote, 64 | 'v': 5 65 | } 66 | headers = { 67 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 68 | 'Referer': 'http://weixin.sogou.com/antispider/?from=%2f' + url_quote 69 | } 70 | r_unlock = req.post(unlock_url, data, headers=headers) 71 | r_unlock.encoding = 'utf-8' 72 | if not r_unlock.ok: 73 | raise WechatSogouVcodeOcrException( 74 | 'unlock[{}] failed: {}'.format(unlock_url, r_unlock.text, r_unlock.status_code)) 75 | 76 | return r_unlock.json() 77 | 78 | 79 | def unlock_weixin_callback_example(url, req, resp, img, identify_image_callback): 80 | """手动打码解锁 81 | 82 | Parameters 83 | ---------- 84 | url : str or unicode 85 | 验证码页面 之前的 url 86 | req : requests.sessions.Session 87 | requests.Session() 供调用解锁 88 | resp : requests.models.Response 89 | requests 访问页面返回的,已经跳转了 90 | img : bytes 91 | 验证码图片二进制数据 92 | identify_image_callback : callable 93 | 处理验证码函数,输入验证码二进制数据,输出文字,参见 identify_image_callback_example 94 | 95 | Returns 96 | ------- 97 | dict 98 | { 99 | 'ret': '', 100 | 'errmsg': '', 101 | 'cookie_count': '', 102 | } 103 | """ 104 | # no use resp 105 | 106 | unlock_url = 'https://mp.weixin.qq.com/mp/verifycode' 107 | data = { 108 | 'cert': time.time() * 1000, 109 | 'input': identify_image_callback(img) 110 | } 111 | headers = { 112 | 'Host': 'mp.weixin.qq.com', 113 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 114 | 'Referer': url 115 | } 116 | r_unlock = req.post(unlock_url, data, headers=headers) 117 | if not r_unlock.ok: 118 | raise WechatSogouVcodeOcrException( 119 | 'unlock[{}] failed: {}[{}]'.format(unlock_url, r_unlock.text, r_unlock.status_code)) 120 | 121 | return r_unlock.json() 122 | -------------------------------------------------------------------------------- /wechatsogou/request.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import, unicode_literals, print_function 4 | 5 | import datetime 6 | from collections import OrderedDict 7 | 8 | from wechatsogou.const import WechatSogouConst 9 | from wechatsogou.five import urlencode 10 | 11 | _search_type_gzh = 1 # 公众号 12 | _search_type_article = 2 # 文章 13 | 14 | 15 | class WechatSogouRequest(object): 16 | @staticmethod 17 | def gen_search_article_url(keyword, page=1, timesn=WechatSogouConst.search_article_time.anytime, 18 | article_type=WechatSogouConst.search_article_type.all, ft=None, et=None): 19 | """拼接搜索 文章 URL 20 | 21 | Parameters 22 | ---------- 23 | keyword : str or unicode 24 | 搜索文字 25 | page : int, optional 26 | 页数 the default is 1 27 | timesn : WechatSogouConst.search_article_time 28 | 时间 anytime 没有限制 / day 一天 / week 一周 / month 一月 / year 一年 / specific 自定 29 | 默认是 anytime 30 | article_type : WechatSogouConst.search_article_type 31 | 含有内容的类型 image 有图 / video 有视频 / rich 有图和视频 / all 啥都有 32 | 默认是 all 33 | ft, et : datetime.date 34 | 当 tsn 是 specific 时,ft 代表开始时间,如: 2017-07-01 35 | 当 tsn 是 specific 时,et 代表结束时间,如: 2017-07-15 36 | 37 | Returns 38 | ------- 39 | str 40 | search_article_url 41 | """ 42 | assert isinstance(page, int) and page > 0 43 | assert timesn in [WechatSogouConst.search_article_time.anytime, 44 | WechatSogouConst.search_article_time.day, 45 | WechatSogouConst.search_article_time.week, 46 | WechatSogouConst.search_article_time.month, 47 | WechatSogouConst.search_article_time.year, 48 | WechatSogouConst.search_article_time.specific] 49 | 50 | if timesn == WechatSogouConst.search_article_time.specific: 51 | assert isinstance(ft, datetime.date) 52 | assert isinstance(et, datetime.date) 53 | assert ft <= et 54 | else: 55 | ft = '' 56 | et = '' 57 | 58 | interation_image = 458754 59 | interation_video = 458756 60 | if article_type == WechatSogouConst.search_article_type.rich: 61 | interation = '{},{}'.format(interation_image, interation_video) 62 | elif article_type == WechatSogouConst.search_article_type.image: 63 | interation = interation_image 64 | elif article_type == WechatSogouConst.search_article_type.video: 65 | interation = interation_video 66 | else: 67 | interation = '' 68 | 69 | qs_dict = OrderedDict() 70 | qs_dict['type'] = _search_type_article 71 | qs_dict['page'] = page 72 | qs_dict['ie'] = 'utf8' 73 | qs_dict['query'] = keyword 74 | qs_dict['interation'] = interation 75 | if timesn != 0: 76 | qs_dict['tsn'] = timesn 77 | qs_dict['ft'] = str(ft) 78 | qs_dict['et'] = str(et) 79 | 80 | # TODO 账号内搜索 81 | # '账号内 http://weixin.sogou.com/weixin?type=2&ie=utf8&query=%E9%AB%98%E8%80%83&tsn=3&ft=&et=&interation=458754 82 | # &wxid=oIWsFt1tmWoG6vO6BcsS7St61bRE&usip=nanhangqinggong' 83 | # qs['wxid'] = wxid 84 | # qs['usip'] = usip 85 | 86 | return 'http://weixin.sogou.com/weixin?{}'.format(urlencode(qs_dict)) 87 | 88 | @staticmethod 89 | def gen_search_gzh_url(keyword, page=1): 90 | """拼接搜索 公众号 URL 91 | 92 | Parameters 93 | ---------- 94 | keyword : str or unicode 95 | 搜索文字 96 | page : int, optional 97 | 页数 the default is 1 98 | 99 | Returns 100 | ------- 101 | str 102 | search_gzh_url 103 | """ 104 | assert isinstance(page, int) and page > 0 105 | 106 | qs_dict = OrderedDict() 107 | qs_dict['type'] = _search_type_gzh 108 | qs_dict['page'] = page 109 | qs_dict['ie'] = 'utf8' 110 | qs_dict['query'] = keyword 111 | 112 | return 'http://weixin.sogou.com/weixin?{}'.format(urlencode(qs_dict)) 113 | 114 | @staticmethod 115 | def gen_hot_url(hot_index, page=1): 116 | """拼接 首页热门文章 URL 117 | 118 | Parameters 119 | ---------- 120 | hot_index : WechatSogouConst.hot_index 121 | 首页热门文章的分类(常量):WechatSogouConst.hot_index.xxx 122 | page : int 123 | 页数 124 | 125 | Returns 126 | ------- 127 | str 128 | 热门文章分类的url 129 | """ 130 | 131 | assert hasattr(WechatSogouConst.hot_index, hot_index) 132 | assert isinstance(page, int) and page > 0 133 | 134 | index_urls = { 135 | WechatSogouConst.hot_index.hot: 0, # 热门 136 | WechatSogouConst.hot_index.gaoxiao: 1, # 搞笑 137 | WechatSogouConst.hot_index.health: 2, # 养生 138 | WechatSogouConst.hot_index.sifanghua: 3, # 私房话 139 | WechatSogouConst.hot_index.gossip: 4, # 八卦 140 | WechatSogouConst.hot_index.technology: 5, # 科技 141 | WechatSogouConst.hot_index.finance: 6, # 财经 142 | WechatSogouConst.hot_index.car: 7, # 汽车 143 | WechatSogouConst.hot_index.life: 8, # 生活 144 | WechatSogouConst.hot_index.fashion: 9, # 时尚 145 | WechatSogouConst.hot_index.mummy: 10, # 辣妈 / 育儿 146 | WechatSogouConst.hot_index.travel: 11, # 旅行 147 | WechatSogouConst.hot_index.job: 12, # 职场 148 | WechatSogouConst.hot_index.food: 13, # 美食 149 | WechatSogouConst.hot_index.history: 14, # 历史 150 | WechatSogouConst.hot_index.study: 15, # 学霸 / 教育 151 | WechatSogouConst.hot_index.constellation: 16, # 星座 152 | WechatSogouConst.hot_index.sport: 17, # 体育 153 | WechatSogouConst.hot_index.military: 18, # 军事 154 | WechatSogouConst.hot_index.game: 19, # 游戏 155 | WechatSogouConst.hot_index.pet: 20, # 萌宠 156 | 157 | } 158 | return 'http://weixin.sogou.com/wapindex/wap/0612/wap_{}/{}.html'.format(index_urls[hot_index], page - 1) 159 | -------------------------------------------------------------------------------- /wechatsogou/structuring.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import, unicode_literals, print_function 4 | 5 | import json 6 | import re 7 | 8 | import requests 9 | from bs4 import BeautifulSoup 10 | from lxml import etree 11 | from lxml.etree import XML 12 | 13 | from wechatsogou.exceptions import WechatSogouException 14 | from wechatsogou.five import str_to_bytes 15 | from wechatsogou.tools import get_elem_text, list_or_empty, replace_html, get_first_of_element, format_image_url 16 | 17 | backgroud_image_p = re.compile('background-image:[ ]+url\(\"([\w\W]+?)\"\)') 18 | js_content = re.compile('js_content.*?>((\s|\S)+)') 19 | find_article_json_re = re.compile('var msgList = (.*?)}}]};') 20 | get_post_view_perm = re.compile('') 21 | 22 | 23 | class WechatSogouStructuring(object): 24 | @staticmethod 25 | def __handle_content_url(content_url): 26 | content_url = replace_html(content_url) 27 | return ('http://mp.weixin.qq.com{}'.format( 28 | content_url) if 'http://mp.weixin.qq.com' not in content_url else content_url) if content_url else '' 29 | 30 | @staticmethod 31 | def __get_post_view_perm(text): 32 | result = get_post_view_perm.findall(text) 33 | if not result or len(result) < 1 or not result[0]: 34 | return None 35 | 36 | r = requests.get('http://weixin.sogou.com{}'.format(result[0])) 37 | if not r.ok: 38 | return None 39 | 40 | if r.json().get('code') != 'success': 41 | return None 42 | 43 | return r.json().get('msg') 44 | 45 | @staticmethod 46 | def get_gzh_by_search(text): 47 | """从搜索公众号获得的文本 提取公众号信息 48 | 49 | Parameters 50 | ---------- 51 | text : str or unicode 52 | 搜索公众号获得的文本 53 | 54 | Returns 55 | ------- 56 | list[dict] 57 | { 58 | 'open_id': '', # 微信号唯一ID 59 | 'profile_url': '', # 最近10条群发页链接 60 | 'headimage': '', # 头像 61 | 'wechat_name': '', # 名称 62 | 'wechat_id': '', # 微信id 63 | 'post_perm': '', # 最近一月群发数 64 | 'view_perm': '', # 最近一月阅读量 65 | 'qrcode': '', # 二维码 66 | 'introduction': '', # 介绍 67 | 'authentication': '' # 认证 68 | } 69 | """ 70 | post_view_perms = WechatSogouStructuring.__get_post_view_perm(text) 71 | 72 | page = etree.HTML(text) 73 | lis = page.xpath('//ul[@class="news-list2"]/li') 74 | relist = [] 75 | for li in lis: 76 | url = get_first_of_element(li, 'div/div[1]/a/@href') 77 | headimage = format_image_url(get_first_of_element(li, 'div/div[1]/a/img/@src')) 78 | wechat_name = get_elem_text(get_first_of_element(li, 'div/div[2]/p[1]')) 79 | info = get_elem_text(get_first_of_element(li, 'div/div[2]/p[2]')) 80 | qrcode = get_first_of_element(li, 'div/div[3]/span/img[1]/@src') 81 | introduction = get_elem_text(get_first_of_element(li, 'dl[1]/dd')) 82 | authentication = get_first_of_element(li, 'dl[2]/dd/text()') 83 | 84 | relist.append({ 85 | 'open_id': headimage.split('/')[-1], 86 | 'profile_url': url, 87 | 'headimage': headimage, 88 | 'wechat_name': wechat_name.replace('red_beg', '').replace('red_end', ''), 89 | 'wechat_id': info.replace('微信号:', ''), 90 | 'qrcode': qrcode, 91 | 'introduction': introduction.replace('red_beg', '').replace('red_end', ''), 92 | 'authentication': authentication, 93 | 'post_perm': -1, 94 | 'view_perm': -1, 95 | }) 96 | 97 | if post_view_perms: 98 | for i in relist: 99 | if i['open_id'] in post_view_perms: 100 | post_view_perm = post_view_perms[i['open_id']].split(',') 101 | if len(post_view_perm) == 2: 102 | i['post_perm'] = int(post_view_perm[0]) 103 | i['view_perm'] = int(post_view_perm[1]) 104 | return relist 105 | 106 | @staticmethod 107 | def get_article_by_search_wap(keyword, wap_dict): 108 | datas = [] 109 | for i in wap_dict['items']: 110 | item = str_to_bytes(i).replace(b'\xee\x90\x8a' + str_to_bytes(keyword) + b'\xee\x90\x8b', 111 | str_to_bytes(keyword)) 112 | root = XML(item) 113 | display = root.find('.//display') 114 | datas.append({ 115 | 'gzh': { 116 | 'profile_url': display.find('encGzhUrl').text, 117 | 'open_id': display.find('openid').text, 118 | 'isv': display.find('isV').text, 119 | 'wechat_name': display.find('sourcename').text, 120 | 'wechat_id': display.find('username').text, 121 | 'headimage': display.find('headimage').text, 122 | 'qrcode': display.find('encQrcodeUrl').text, 123 | }, 124 | 'article': { 125 | 'title': display.find('title').text, 126 | 'url': display.find('url').text, # encArticleUrl 127 | 'main_img': display.find('imglink').text, 128 | 'abstract': display.find('content168').text, 129 | 'time': display.find('lastModified').text, 130 | }, 131 | }) 132 | 133 | return datas 134 | 135 | @staticmethod 136 | def get_article_by_search(text): 137 | """从搜索文章获得的文本 提取章列表信息 138 | 139 | Parameters 140 | ---------- 141 | text : str or unicode 142 | 搜索文章获得的文本 143 | 144 | Returns 145 | ------- 146 | list[dict] 147 | { 148 | 'article': { 149 | 'title': '', # 文章标题 150 | 'url': '', # 文章链接 151 | 'imgs': '', # 文章图片list 152 | 'abstract': '', # 文章摘要 153 | 'time': '' # 文章推送时间 154 | }, 155 | 'gzh': { 156 | 'profile_url': '', # 公众号最近10条群发页链接 157 | 'headimage': '', # 头像 158 | 'wechat_name': '', # 名称 159 | 'isv': '', # 是否加v 160 | } 161 | } 162 | """ 163 | page = etree.HTML(text) 164 | lis = page.xpath('//ul[@class="news-list"]/li') 165 | 166 | articles = [] 167 | for li in lis: 168 | url = get_first_of_element(li, 'div[1]/a/@href') 169 | if url: 170 | title = get_first_of_element(li, 'div[2]/h3/a') 171 | imgs = li.xpath('div[1]/a/img/@src') 172 | abstract = get_first_of_element(li, 'div[2]/p') 173 | time = get_first_of_element(li, 'div[2]/div/span/script/text()') 174 | gzh_info = li.xpath('div[2]/div/a')[0] 175 | else: 176 | url = get_first_of_element(li, 'div/h3/a/@href') 177 | title = get_first_of_element(li, 'div/h3/a') 178 | imgs = [] 179 | spans = li.xpath('div/div[1]/a') 180 | for span in spans: 181 | img = span.xpath('span/img/@src') 182 | if img: 183 | imgs.append(img) 184 | abstract = get_first_of_element(li, 'div/p') 185 | time = get_first_of_element(li, 'div/div[2]/span/script/text()') 186 | gzh_info = li.xpath('div/div[2]/a')[0] 187 | 188 | if title is not None: 189 | title = get_elem_text(title).replace("red_beg", "").replace("red_end", "") 190 | if abstract is not None: 191 | abstract = get_elem_text(abstract).replace("red_beg", "").replace("red_end", "") 192 | 193 | time = re.findall('timeConvert\(\'(.*?)\'\)', time) 194 | time = list_or_empty(time, int) 195 | profile_url = get_first_of_element(gzh_info, '@href') 196 | headimage = get_first_of_element(gzh_info, '@data-headimage') 197 | wechat_name = get_first_of_element(gzh_info, 'text()') 198 | gzh_isv = get_first_of_element(gzh_info, '@data-isv', int) 199 | 200 | articles.append({ 201 | 'article': { 202 | 'title': title, 203 | 'url': url, 204 | 'imgs': format_image_url(imgs), 205 | 'abstract': abstract, 206 | 'time': time 207 | }, 208 | 'gzh': { 209 | 'profile_url': profile_url, 210 | 'headimage': headimage, 211 | 'wechat_name': wechat_name, 212 | 'isv': gzh_isv, 213 | } 214 | }) 215 | return articles 216 | 217 | @staticmethod 218 | def get_gzh_info_by_history(text): 219 | """从 历史消息页的文本 提取公众号信息 220 | 221 | Parameters 222 | ---------- 223 | text : str or unicode 224 | 历史消息页的文本 225 | 226 | Returns 227 | ------- 228 | dict 229 | { 230 | 'wechat_name': '', # 名称 231 | 'wechat_id': '', # 微信id 232 | 'introduction': '', # 描述 233 | 'authentication': '', # 认证 234 | 'headimage': '' # 头像 235 | } 236 | """ 237 | 238 | page = etree.HTML(text) 239 | profile_area = get_first_of_element(page, '//div[@class="profile_info_area"]') 240 | 241 | profile_img = get_first_of_element(profile_area, 'div[1]/span/img/@src') 242 | profile_name = get_first_of_element(profile_area, 'div[1]/div/strong/text()') 243 | profile_wechat_id = get_first_of_element(profile_area, 'div[1]/div/p/text()') 244 | profile_desc = get_first_of_element(profile_area, 'ul/li[1]/div/text()') 245 | profile_principal = get_first_of_element(profile_area, 'ul/li[2]/div/text()') 246 | 247 | return { 248 | 'wechat_name': profile_name.strip(), 249 | 'wechat_id': profile_wechat_id.replace('微信号: ', '').strip('\n'), 250 | 'introduction': profile_desc, 251 | 'authentication': profile_principal, 252 | 'headimage': profile_img 253 | } 254 | 255 | @staticmethod 256 | def get_article_by_history_json(text, article_json=None): 257 | """从 历史消息页的文本 提取文章列表信息 258 | 259 | Parameters 260 | ---------- 261 | text : str or unicode 262 | 历史消息页的文本 263 | article_json : dict 264 | 历史消息页的文本 提取出来的文章json dict 265 | 266 | Returns 267 | ------- 268 | list[dict] 269 | { 270 | 'send_id': '', # 群发id,注意不唯一,因为同一次群发多个消息,而群发id一致 271 | 'datetime': '', # 群发datatime 272 | 'type': '', # 消息类型,均是49,表示图文 273 | 'main': 0, # 是否是一次群发的第一次消息 274 | 'title': '', # 文章标题 275 | 'abstract': '', # 摘要 276 | 'fileid': '', # 277 | 'content_url': '', # 文章链接 278 | 'source_url': '', # 阅读原文的链接 279 | 'cover': '', # 封面图 280 | 'author': '', # 作者 281 | 'copyright_stat': '', # 文章类型,例如:原创啊 282 | } 283 | 284 | """ 285 | if article_json is None: 286 | article_json = find_article_json_re.findall(text) 287 | if not article_json: 288 | return [] 289 | article_json = article_json[0] + '}}]}' 290 | article_json = json.loads(article_json) 291 | 292 | items = list() 293 | 294 | for listdic in article_json['list']: 295 | if str(listdic['comm_msg_info'].get('type', '')) != '49': 296 | continue 297 | 298 | comm_msg_info = listdic['comm_msg_info'] 299 | app_msg_ext_info = listdic['app_msg_ext_info'] 300 | send_id = comm_msg_info.get('id', '') 301 | msg_datetime = comm_msg_info.get('datetime', '') 302 | msg_type = str(comm_msg_info.get('type', '')) 303 | 304 | items.append({ 305 | 'send_id': send_id, 306 | 'datetime': msg_datetime, 307 | 'type': msg_type, 308 | 'main': 1, 'title': app_msg_ext_info.get('title', ''), 309 | 'abstract': app_msg_ext_info.get('digest', ''), 310 | 'fileid': app_msg_ext_info.get('fileid', ''), 311 | 'content_url': WechatSogouStructuring.__handle_content_url(app_msg_ext_info.get('content_url')), 312 | 'source_url': app_msg_ext_info.get('source_url', ''), 313 | 'cover': app_msg_ext_info.get('cover', ''), 314 | 'author': app_msg_ext_info.get('author', ''), 315 | 'copyright_stat': app_msg_ext_info.get('copyright_stat', '') 316 | }) 317 | 318 | if app_msg_ext_info.get('is_multi', 0) == 1: 319 | for multi_dict in app_msg_ext_info['multi_app_msg_item_list']: 320 | items.append({ 321 | 'send_id': send_id, 322 | 'datetime': msg_datetime, 323 | 'type': msg_type, 324 | 'main': 0, 'title': multi_dict.get('title', ''), 325 | 'abstract': multi_dict.get('digest', ''), 326 | 'fileid': multi_dict.get('fileid', ''), 327 | 'content_url': WechatSogouStructuring.__handle_content_url(multi_dict.get('content_url')), 328 | 'source_url': multi_dict.get('source_url', ''), 329 | 'cover': multi_dict.get('cover', ''), 330 | 'author': multi_dict.get('author', ''), 331 | 'copyright_stat': multi_dict.get('copyright_stat', '') 332 | }) 333 | 334 | return list(filter(lambda x: x['content_url'], items)) # 删除搜狗本身携带的空数据 335 | 336 | @staticmethod 337 | def get_gzh_info_and_article_by_history(text): 338 | """从 历史消息页的文本 提取公众号信息 和 文章列表信息 339 | 340 | Parameters 341 | ---------- 342 | text : str or unicode 343 | 历史消息页的文本 344 | 345 | Returns 346 | ------- 347 | dict 348 | { 349 | 'gzh': { 350 | 'wechat_name': '', # 名称 351 | 'wechat_id': '', # 微信id 352 | 'introduction': '', # 描述 353 | 'authentication': '', # 认证 354 | 'headimage': '' # 头像 355 | }, 356 | 'article': [ 357 | { 358 | 'send_id': '', # 群发id,注意不唯一,因为同一次群发多个消息,而群发id一致 359 | 'datetime': '', # 群发datatime 360 | 'type': '', # 消息类型,均是49,表示图文 361 | 'main': 0, # 是否是一次群发的第一次消息 362 | 'title': '', # 文章标题 363 | 'abstract': '', # 摘要 364 | 'fileid': '', # 365 | 'content_url': '', # 文章链接 366 | 'source_url': '', # 阅读原文的链接 367 | 'cover': '', # 封面图 368 | 'author': '', # 作者 369 | 'copyright_stat': '', # 文章类型,例如:原创啊 370 | }, 371 | ... 372 | ] 373 | } 374 | """ 375 | return { 376 | 'gzh': WechatSogouStructuring.get_gzh_info_by_history(text), 377 | 'article': WechatSogouStructuring.get_article_by_history_json(text) 378 | } 379 | 380 | @staticmethod 381 | def get_gzh_article_by_hot(text): 382 | """从 首页热门搜索 提取公众号信息 和 文章列表信息 383 | 384 | Parameters 385 | ---------- 386 | text : str or unicode 387 | 首页热门搜索 页 中 某一页 的文本 388 | 389 | Returns 390 | ------- 391 | list[dict] 392 | { 393 | 'gzh': { 394 | 'headimage': str, # 公众号头像 395 | 'wechat_name': str, # 公众号名称 396 | }, 397 | 'article': { 398 | 'url': str, # 文章临时链接 399 | 'title': str, # 文章标题 400 | 'abstract': str, # 文章摘要 401 | 'time': int, # 推送时间,10位时间戳 402 | 'open_id': str, # open id 403 | 'main_img': str # 封面图片 404 | } 405 | } 406 | """ 407 | page = etree.HTML(text) 408 | lis = page.xpath('/html/body/li') 409 | gzh_article_list = [] 410 | for li in lis: 411 | url = get_first_of_element(li, 'div[1]/h4/a/@href') 412 | title = get_first_of_element(li, 'div[1]/h4/a/div/text()') 413 | abstract = get_first_of_element(li, 'div[1]/p[1]/text()') 414 | xpath_time = get_first_of_element(li, 'div[1]/p[2]') 415 | open_id = get_first_of_element(xpath_time, 'span/@data-openid') 416 | headimage = get_first_of_element(xpath_time, 'span/@data-headimage') 417 | gzh_name = get_first_of_element(xpath_time, 'span/text()') 418 | send_time = xpath_time.xpath('a/span/@data-lastmodified') 419 | main_img = get_first_of_element(li, 'div[2]/a/img/@src') 420 | 421 | try: 422 | send_time = int(send_time[0]) 423 | except ValueError: 424 | send_time = send_time[0] 425 | 426 | gzh_article_list.append({ 427 | 'gzh': { 428 | 'headimage': headimage, 429 | 'wechat_name': gzh_name, 430 | }, 431 | 'article': { 432 | 'url': url, 433 | 'title': title, 434 | 'abstract': abstract, 435 | 'time': send_time, 436 | 'open_id': open_id, 437 | 'main_img': main_img 438 | } 439 | }) 440 | 441 | return gzh_article_list 442 | 443 | @staticmethod 444 | def get_article_detail(text, del_qqmusic=True, del_voice=True): 445 | """根据微信文章的临时链接获取明细 446 | 447 | 1. 获取文本中所有的图片链接列表 448 | 2. 获取微信文章的html内容页面(去除标题等信息) 449 | 450 | Parameters 451 | ---------- 452 | text : str or unicode 453 | 一篇微信文章的文本 454 | del_qqmusic: bool 455 | 删除文章中的qq音乐 456 | del_voice: bool 457 | 删除文章中的语音内容 458 | 459 | Returns 460 | ------- 461 | dict 462 | { 463 | 'content_html': str # 微信文本内容 464 | 'content_img_list': list[img_url1, img_url2, ...] # 微信文本中图片列表 465 | 466 | } 467 | """ 468 | # 1. 获取微信文本content 469 | html_obj = BeautifulSoup(text, "lxml") 470 | content_text = html_obj.find('div', {'class': 'rich_media_content', 'id': 'js_content'}) 471 | 472 | # 2. 删除部分标签 473 | if del_qqmusic: 474 | qqmusic = content_text.find_all('qqmusic') or [] 475 | for music in qqmusic: 476 | music.parent.decompose() 477 | 478 | if del_voice: 479 | # voice是一个p标签下的mpvoice标签以及class为'js_audio_frame db'的span构成,所以将父标签删除 480 | voices = content_text.find_all('mpvoice') or [] 481 | for voice in voices: 482 | voice.parent.decompose() 483 | 484 | # 3. 获取所有的图片 [img标签,和style中的background-image] 485 | all_img_set = set() 486 | all_img_element = content_text.find_all('img') or [] 487 | for ele in all_img_element: 488 | # 删除部分属性 489 | img_url = format_image_url(ele.attrs['data-src']) 490 | del ele.attrs['data-src'] 491 | 492 | ele.attrs['src'] = img_url 493 | 494 | if not img_url.startswith('http'): 495 | raise WechatSogouException('img_url [{}] 不合法'.format(img_url)) 496 | all_img_set.add(img_url) 497 | 498 | backgroud_image = content_text.find_all(style=re.compile("background-image")) or [] 499 | for ele in backgroud_image: 500 | # 删除部分属性 501 | if ele.attrs.get('data-src'): 502 | del ele.attrs['data-src'] 503 | 504 | if ele.attrs.get('data-wxurl'): 505 | del ele.attrs['data-wxurl'] 506 | img_url = re.findall(backgroud_image_p, str(ele)) 507 | if not img_url: 508 | continue 509 | all_img_set.add(img_url[0]) 510 | 511 | # 4. 处理iframe 512 | all_img_element = content_text.find_all('iframe') or [] 513 | for ele in all_img_element: 514 | # 删除部分属性 515 | img_url = ele.attrs['data-src'] 516 | del ele.attrs['data-src'] 517 | ele.attrs['src'] = img_url 518 | 519 | # 5. 返回数据 520 | all_img_list = list(all_img_set) 521 | content_html = content_text.prettify() 522 | # 去除div[id=js_content] 523 | content_html = re.findall(js_content, content_html)[0][0] 524 | return { 525 | 'content_html': content_html, 526 | 'content_img_list': all_img_list 527 | } 528 | -------------------------------------------------------------------------------- /wechatsogou/tools.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import, unicode_literals, print_function 4 | 5 | import ast 6 | 7 | import requests 8 | 9 | from wechatsogou.five import url_parse 10 | 11 | 12 | def list_or_empty(content, contype=None): 13 | assert isinstance(content, list), 'content is not list: {}'.format(content) 14 | 15 | if content: 16 | return contype(content[0]) if contype else content[0] 17 | else: 18 | if contype: 19 | if contype == int: 20 | return 0 21 | elif contype == str: 22 | return '' 23 | elif contype == list: 24 | return [] 25 | else: 26 | raise Exception('only can deal int str list') 27 | else: 28 | return '' 29 | 30 | 31 | def get_elem_text(elem): 32 | """抽取lxml.etree库中elem对象中文字 33 | 34 | Args: 35 | elem: lxml.etree库中elem对象 36 | 37 | Returns: 38 | elem中文字 39 | """ 40 | if elem != '': 41 | return ''.join([node.strip() for node in elem.itertext()]) 42 | else: 43 | return '' 44 | 45 | 46 | def get_first_of_element(element, sub, contype=None): 47 | """抽取lxml.etree库中elem对象中文字 48 | 49 | Args: 50 | element: lxml.etree.Element 51 | sub: str 52 | 53 | Returns: 54 | elem中文字 55 | """ 56 | content = element.xpath(sub) 57 | return list_or_empty(content, contype) 58 | 59 | 60 | def get_encoding_from_reponse(r): 61 | """获取requests库get或post返回的对象编码 62 | 63 | Args: 64 | r: requests库get或post返回的对象 65 | 66 | Returns: 67 | 对象编码 68 | """ 69 | encoding = requests.utils.get_encodings_from_content(r.text) 70 | return encoding[0] if encoding else requests.utils.get_encoding_from_headers(r.headers) 71 | 72 | 73 | def _replace_str_html(s): 74 | """替换html‘"’等转义内容为正常内容 75 | 76 | Args: 77 | s: 文字内容 78 | 79 | Returns: 80 | s: 处理反转义后的文字 81 | """ 82 | html_str_list = [ 83 | (''', '\''), 84 | ('"', '"'), 85 | ('&', '&'), 86 | ('¥', '¥'), 87 | ('amp;', ''), 88 | ('<', '<'), 89 | ('>', '>'), 90 | (' ', ' '), 91 | ('\\', '') 92 | ] 93 | for i in html_str_list: 94 | s = s.replace(i[0], i[1]) 95 | return s 96 | 97 | 98 | def replace_html(data): 99 | if isinstance(data, dict): 100 | return dict([(replace_html(k), replace_html(v)) for k, v in data.items()]) 101 | elif isinstance(data, list): 102 | return [replace_html(l) for l in data] 103 | elif isinstance(data, str) or isinstance(data, unicode): 104 | return _replace_str_html(data) 105 | else: 106 | return data 107 | 108 | 109 | def str_to_dict(json_str): 110 | json_dict = ast.literal_eval(json_str) 111 | return replace_html(json_dict) 112 | 113 | 114 | def replace_space(s): 115 | return s.replace(' ', '').replace('\r\n', '') 116 | 117 | 118 | def get_url_param(url): 119 | result = url_parse.urlparse(url) 120 | return url_parse.parse_qs(result.query, True) 121 | 122 | 123 | def format_image_url(url): 124 | if isinstance(url, list): 125 | return [format_image_url(i) for i in url] 126 | 127 | if url.startswith('//'): 128 | url = 'https:{}'.format(url) 129 | return url 130 | 131 | 132 | def may_int(i): 133 | try: 134 | return int(i) 135 | except Exception: 136 | return i 137 | --------------------------------------------------------------------------------