├── .gitignore
├── LICENSE
├── PageSpeedInsights
│   ├── .gitignore
│   ├── README.md
│   ├── image
│   │   ├── 1_score.png
│   │   ├── 2_real_data.png
│   │   ├── 3_lab_data.png
│   │   ├── 4_opportunities.png
│   │   ├── 5_diagnostics.png
│   │   ├── 6_passed_audits.png
│   │   ├── gcp_cs_create.jpg
│   │   ├── gcp_cs_list.jpg
│   │   ├── gcp_pubsub_topic_list.jpg
│   │   ├── psi.mermaid.svg
│   │   └── psi.png
│   ├── job.py
│   ├── main.py
│   ├── psi.mermaid
│   ├── psi.py
│   ├── requirements.txt
│   └── zip.sh
├── README.md
├── auc_pr_roc
│   ├── README.md
│   └── auc_pr_roc.py
├── excel_combine
│   ├── README.md
│   └── excel_combine.py
├── geetest_offline
│   ├── README.md
│   ├── README_gd.md
│   ├── constants.py
│   ├── gd_list.json
│   ├── geetest_offline.py
│   ├── geetest_offline_gd.py
│   ├── geetest_offline_nm.py
│   └── util.py
├── geetest_online
│   ├── README.md
│   ├── constants.py
│   ├── geetest_online.py
│   ├── image
│   │   ├── bg.jpg
│   │   ├── bg.webp
│   │   ├── fullbg.jpg
│   │   ├── fullbg.webp
│   │   └── slice.webp
│   ├── test
│   │   ├── TraceSample01.txt
│   │   ├── TraceSample01Parse.txt
│   │   ├── TraceSample02.txt
│   │   ├── TraceSample02Parse.txt
│   │   ├── TraceSample03.txt
│   │   ├── TraceSample03Parse.txt
│   │   ├── TraceSample04.txt
│   │   ├── TraceSample04Parse.txt
│   │   ├── test_pyexecjs.py
│   │   ├── test_token.py
│   │   └── testgeetestjs.py
│   └── util.py
├── gitstats
│   ├── README.md
│   └── gitstats.py
├── gsxt_mobile
│   ├── README.md
│   ├── gsxt_mobile.py
│   └── 腾讯科技50.txt
├── lagou
│   ├── README.md
│   └── lagou.py
├── level
│   ├── README.md
│   └── levelhelper.py
├── monkeyrunner
│   ├── README.md
│   └── uiparser.py
├── nacao_v1
│   ├── README.md
│   ├── constants.py
│   ├── nacao_v1.py
│   └── sql_injection.txt
└── nacao_v2
    ├── README.md
    ├── constants.py
    └── nacao_v2.py

/.gitignore: --------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # VSCode config 104 | .vscode/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 |
-------------------------------------------------------------------------------- /PageSpeedInsights/.gitignore: --------------------------------------------------------------------------------
*.zip

-------------------------------------------------------------------------------- /PageSpeedInsights/README.md: --------------------------------------------------------------------------------
![PageSpeed Insights](image/psi.png)

Build a scheduled audit system for the quality and performance benchmarks of front-end web pages with PageSpeed Insights, using Google Cloud Scheduler, Pub/Sub, Functions, Storage, and related cloud services. Combined with a CI/CD pipeline, it audits a site's technical performance metrics in large batches on a schedule.

### 1. PageSpeed Insights

#### 1.1 Overview

PageSpeed Insights is a web-page performance analysis and optimization tool from Google. It generates real-world performance reports for a page on both mobile and desktop devices, together with suggestions on how to improve the page. It takes the best practices of Google Lighthouse as its test baseline and uses the Blink renderer (the rendering engine of Google Chrome) to emulate mobile and desktop devices, fetch the target pages, and analyze them for optimization.
Abbreviated below as PSI.

#### 1.2 Version history

Version | Released | Changes
--|--|--
V5 | 2018 Q4 | Current version. Updated on 2019-05-08 to use Lighthouse 5.0 as its analysis engine.
V4 | January 2018 | Shut down before Q3 2019
V2 | January 2015 | Shut down
V1 | Earlier | Shut down

#### 1.3 What the report contains

#### 1.3.1 Overall speed score

Scores and grades:

+ Fast: 90 and above
+ Average: 50-90
+ Slow: below 50

V5 uses Lighthouse to compute a weighted composite score over several performance metrics.
V4 and earlier computed the score and grade from real-user measurement data in the Chrome User Experience Report database, based mainly on the following two metrics:

+ FCP (First Contentful Paint): measures when the user first sees a visible response from the page. The shorter it is, the more likely the user is to stay.
+ DCL (DOM Content Loaded): measures when the HTML document has finished loading and parsing. The shorter it is, the lower the bounce rate.

#### 1.3.2 Field data

A score comparing the page against the last 30 days of measured data for other pages in the Chrome User Experience Report.

#### 1.3.3 Lab data

Absolute timings for the following metrics:

+ First Contentful Paint
+ First Meaningful Paint
+ Speed Index
+ First CPU Idle
+ Time to Interactive
+ Max Potential First Input Delay

#### 1.3.4 Opportunities: suggestions for making the page load faster

#### 1.3.5 Diagnostics: detailed advice on web development best practices

#### 1.3.6 Passed audits: checks that already follow best practices

#### 1.4 A real-world case

Take a production release of Ctrip's H5 flight-status home page and walk through its report:
https://m.ctrip.com/webapp/flight/schedule/detail.html

![Overall speed score](image/1_score.png)

![Field data](image/2_real_data.png)

![Lab data](image/3_lab_data.png)

![Opportunities](image/4_opportunities.png)

![Diagnostics](image/5_diagnostics.png)

![Passed audits](image/6_passed_audits.png)

#### 1.5 Usage

The PSI API is one of Google's RESTful APIs: a single HTTP request returns a JSON object. It could hardly be simpler to use.

#### HTTP Request

> GET https://www.googleapis.com/pagespeedonline/v5/runPagespeed

1 required parameter:

+ `url`: the link of the page to analyze

6 optional parameters:

+ category: `accessibility`, `best-practices`, `performance`, `pwa`, `seo`. Defaults to `performance`.
+ locale: the localized language of the returned text. 40 languages are currently supported; defaults to English, `en`.
+ strategy: `desktop` analyzes for desktop browsers, `mobile` for mobile browsers.
+ utm_campaign: campaign name
+ utm_source: campaign source
+ fields: customizes which fields the response contains.

#### HTTP Response

Returns a JSON object. It has many fields, omitted here; see the official documentation.

#### Minimal command-line call

> curl https://www.googleapis.com/pagespeedonline/v5/runPagespeed?url=https://m.ctrip.com
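To give a feel for the response shape, a minimal sketch that calls the API and pulls out the Lighthouse performance score; `fetch_performance_score` is an illustrative name, the `lighthouseResult.categories.performance.score` path is the documented v5 report layout, and an API key (omitted here) is recommended for scheduled use:

```Python
import requests

PSI_ENDPOINT = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed"

def fetch_performance_score(url):
    '''Sketch: fetch a PSI v5 report and extract the performance score.'''
    response = requests.get(PSI_ENDPOINT,
                            params={"url": url, "strategy": "mobile"},
                            timeout=60)
    response.raise_for_status()
    report = response.json()
    # Lighthouse reports the category score as 0..1; scale it to the familiar 0..100.
    score = report["lighthouseResult"]["categories"]["performance"]["score"]
    return round(score * 100)

if __name__ == "__main__":
    print(fetch_performance_score("https://m.ctrip.com"))
```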
### 2. Google Cloud Platform (GCP)

#### 2.1 System workflow

![workflow](image/psi.mermaid.svg)

#### 2.2 Cloud Scheduler

Cloud Scheduler is GCP's fully managed, enterprise-grade cron job scheduling service. It targets App Engine, Cloud Pub/Sub, and arbitrary HTTP endpoints, and lets jobs trigger Compute Engine, Google Kubernetes Engine, and on-premises resources.
Create the job in the Google Cloud Console. There are three target types — HTTP, Pub/Sub, and App Engine HTTP; Pub/Sub is chosen here, and the job is set to fire automatically at 22:00 every day.

![Cloud Scheduler Create](image/gcp_cs_create.jpg)

After creating it, check the deployment status; once deployed, use "Run now" and inspect the logs to confirm it runs correctly.

![Cloud Scheduler List](image/gcp_cs_list.jpg)

#### 2.3 Cloud Pub/Sub

Cloud Pub/Sub is GCP's simple, reliable, and scalable messaging service, suitable as the foundation for streaming analytics and event-driven computing systems.
Two topics are created here: `psi-job` relays the event data of the Cloud Scheduler job, and `psi-single` relays the event data for the concurrent HTTP requests issued by Cloud Functions.
![Cloud Pub/Sub](image/gcp_pubsub_topic_list.jpg)

#### 2.4 Cloud Functions

There are several ways to run PageSpeed Insights checks over a large number of pages concurrently — Google App Engine or Google Compute Engine would both work. But since the PSI API is a context-free, simple HTTP RESTful API, serverless Cloud Functions is the best and simplest implementation.
Cloud Functions is GCP's event-driven serverless compute platform. You build many small, independent units that each do one thing well, then compose them into a system, which makes for fast development and deployment. Services are built and deployed at the level of a single function rather than a whole application, container, or VM.

#### 2.4.1 Writing a Function

The following stacks are currently supported (a minimal Python sketch of the two function shapes follows at the end of this section):

Language | JavaScript
--|--
Runtime | Node.js 6 (deprecated), 8, 10 (beta)
HTTP framework | Express
HTTP function | Express Request & Response context
Background function | (data, context, callback)
Dependency management | npm/yarn + package.json

Language | Python
--|--
Runtime | 3.7.1
HTTP framework | Flask
HTTP function | Input: a Flask Request object. Return: any object accepted by Flask.make_response().
Background function | (data, context)
Dependency management | pip + requirements.txt

Language | Go
--|--
Runtime | Go 1.11
HTTP framework | standard http.HandlerFunc interface
HTTP function | request: *http.Request. response: http.ResponseWriter.
Background function | (ctx, Event)
Dependency management | go.mod/vendor

#### 2.4.2 Deploying a Function

The following methods are currently supported:

+ Deploy from a local machine. `Using the gcloud command-line tool.`
+ Deploy from a source control system. `Using Google Cloud Source Repositories, linked via OAuth to a source repository (such as GitHub or Bitbucket).`
+ Deploy from the GCP Console:
  + Inline web editor. `Write the function code directly online.`
  + Upload a local ZIP file. `Its directory layout must match the source project structure of the dependency management schemes above.`
  + Import a ZIP file from Cloud Storage. `Same as above.`
  + Reference a source project in Google Cloud Source Repositories.
+ Deploy via CI/CD. `Using Cloud Build to set up continuous integration and deployment.`

#### 2.4.3 Monitoring a Function

Google Stackdriver provides the service monitoring tools, including `Debugger, Monitoring, Trace, Logging, Error Reporting, Profiler`.
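As referenced in 2.4.1, a minimal sketch of what the two Python function shapes look like; `hello_http` and `hello_pubsub` are illustrative names, not functions from this repo:

```Python
import base64

def hello_http(request):
    '''HTTP function: `request` is a flask.Request; any Flask-compatible return value works.'''
    name = request.args.get('name', 'World')
    return f'Hello {name}!'

def hello_pubsub(data, context):
    '''Background function: `data` carries the Pub/Sub message, `context` the event metadata.'''
    message = base64.b64decode(data['data']).decode('utf-8') if 'data' in data else ''
    print(f'Received: {message}')
```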
### 3. Implementing the PSI Functions

With one Scheduler job and two Pub/Sub topics created, the next step is to implement the two corresponding Functions.

#### 3.1 psi-single function

psi-single() is responsible for calling the PSI API for one concrete URL and obtaining the JSON result.
Google APIs can be called in several ways.

**3.1.1 Using the `google api client`.**
Obtain the prepackaged `Service` via the `Discovery API`, then call the concrete method.

```Python
from googleapiclient.discovery import build

def run(url):
    pagespeedonline = build(
        serviceName = 'pagespeedonline',
        version = 'v5',
        developerKey = API_KEY
    )
    response = pagespeedonline.pagespeedapi().runpagespeed(url = url).execute()
    print(response)
    return 'OK'
```

**3.1.2 For such a simple interface, calling the `HTTP RESTful API` directly.**

```Python
import requests
GAPI_PSI = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed"

def run(url):
    try:
        payload = {"url": url,
                   "key": API_KEY
                  }
        with requests.Session() as session:
            response = session.get(url=GAPI_PSI, params=payload)
            print(response.status_code)
            print(response.json())
    except requests.RequestException as _e:
        print(_e)
    return 'OK'
```

**3.1.3 Subscribing to the Pub/Sub topic**
The format of the subscription message `event` is described in the official docs; its data attribute is a `base64`-encoded `ByteArray` that carries the actual payload.

```Python
import base64

def run_pubsub(event, context):
    pubsub_message = base64.urlsafe_b64decode(event['data']).decode('utf-8')
    return run(pubsub_message)
```

#### 3.2 psi-job function

psi-job() is triggered by the Scheduler job and fans all URLs to be audited out to psi-single() in parallel, as Pub/Sub events.

```Python
from google.cloud import pubsub_v1

def run(event, context):
    publisher = pubsub_v1.PublisherClient()
    topic = publisher.topic_path(PROJECT_ID, TOPIC_NAME)
    for url in URL_DICT:
        data = url.encode('utf-8')
        publisher.publish(topic, data)
    return 'OK'
```

#### 3.3 Environment variables and dependencies

To avoid leaking security-sensitive information, the key values can be written into the Functions environment variables, plus local environment variables for local development and debugging.
Values such as `API_KEY` and `PROJECT_ID` in the code above are read via `os.getenv()`.
Cloud Functions ships with common dependencies preinstalled (see the official docs). To add a dependency, edit the project file for the language in question. The code above pulls in two libraries.

```Python
# requirements.txt
# Function dependencies
requests==2.21.0
google-cloud-pubsub==0.40.0
```

### 4. Storage

The `print()` calls above write into the Stackdriver log store for later filtering and analysis. Since each URL's audit result is one JSON object string, it could go further: write it into BigTable, query and analyze it with BigQuery, then import it into Google Data Studio for report visualization.
Here, Cloud Storage is used to store each JSON string as a single file.

```Python
from urllib import parse
from google.cloud import storage
from google.cloud.storage import Blob

def save(url, report):
    '''Save to https://console.cloud.google.com/storage/browser/[bucket-id]/'''
    client = storage.Client()
    bucket = client.get_bucket("psi-report")
    blob = Blob(f"{parse.quote_plus(url)}.json", bucket)
    blob.upload_from_string(report, "application/json")
```

Add the dependency.

```Python
# requirements.txt
# Function dependencies
google-cloud-storage==1.15.0
```
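To read the reports back out of the bucket — e.g. for offline analysis — a sketch using the same google-cloud-storage client; `load_reports` is an illustrative name, the `psi-report` bucket matches save() above, and `list_blobs()`/`download_as_string()` are the standard client surface:

```Python
from google.cloud import storage

def load_reports(bucket_name="psi-report"):
    '''Sketch: iterate the stored PSI reports and yield (filename, JSON text).'''
    client = storage.Client()
    bucket = client.get_bucket(bucket_name)
    for blob in bucket.list_blobs():
        yield blob.name, blob.download_as_string().decode("utf-8")

if __name__ == "__main__":
    for name, report_json in load_reports():
        print(name, len(report_json))
```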
### 5. Source code

https://github.com/9468305/python-script/tree/master/PageSpeedInsights

### 6. Documentation links

1. PageSpeed Insights
https://developers.google.com/speed/pagespeed/insights
2. Google Lighthouse
https://developers.google.com/web/tools/lighthouse/
3. Google Cloud Scheduler
https://cloud.google.com/scheduler/
4. Google Cloud Pub/Sub
https://cloud.google.com/pubsub/
5. Google Cloud Functions
https://cloud.google.com/functions/
6. Google Cloud Storage
https://cloud.google.com/storage/
7. Google Cloud Build
https://cloud.google.com/cloud-build/
8. Google Stackdriver
https://cloud.google.com/stackdriver/

-------------------------------------------------------------------------------- /PageSpeedInsights/image/1_score.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/PageSpeedInsights/image/1_score.png
-------------------------------------------------------------------------------- /PageSpeedInsights/image/2_real_data.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/PageSpeedInsights/image/2_real_data.png
-------------------------------------------------------------------------------- /PageSpeedInsights/image/3_lab_data.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/PageSpeedInsights/image/3_lab_data.png
-------------------------------------------------------------------------------- /PageSpeedInsights/image/4_opportunities.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/PageSpeedInsights/image/4_opportunities.png
-------------------------------------------------------------------------------- /PageSpeedInsights/image/5_diagnostics.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/PageSpeedInsights/image/5_diagnostics.png
-------------------------------------------------------------------------------- /PageSpeedInsights/image/6_passed_audits.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/PageSpeedInsights/image/6_passed_audits.png
-------------------------------------------------------------------------------- /PageSpeedInsights/image/gcp_cs_create.jpg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/PageSpeedInsights/image/gcp_cs_create.jpg
-------------------------------------------------------------------------------- /PageSpeedInsights/image/gcp_cs_list.jpg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/PageSpeedInsights/image/gcp_cs_list.jpg
-------------------------------------------------------------------------------- /PageSpeedInsights/image/gcp_pubsub_topic_list.jpg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/PageSpeedInsights/image/gcp_pubsub_topic_list.jpg
-------------------------------------------------------------------------------- /PageSpeedInsights/image/psi.mermaid.svg: -------------------------------------------------------------------------------- 1 |
(The SVG markup did not survive extraction; only the diagram's text labels remained — Cloud Scheduler, the Job/PSI Pub/Sub publishers and subscribers, the Job/PSI Function Services, and the PageSpeed Insights API. The full diagram definition is in the psi.mermaid source below.)
-------------------------------------------------------------------------------- /PageSpeedInsights/image/psi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/PageSpeedInsights/image/psi.png -------------------------------------------------------------------------------- /PageSpeedInsights/job.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | # -*- coding: utf-8 -*- 3 | '''PageSpeed Insights Job + Google Cloud Functions''' 4 | import os 5 | from google.cloud import pubsub_v1 6 | 7 | PROJECT_ID = os.getenv("GCP_PROJECT_ID") 8 | TOPIC_NAME = "psi-single" 9 | 10 | URL_DICT = ["https://m.ctrip.com/webapp/flight/schedule/detail.html"] 11 | 12 | def run(event, context): 13 | publisher = pubsub_v1.PublisherClient() 14 | topic = publisher.topic_path(PROJECT_ID, TOPIC_NAME) 15 | for url in URL_DICT: 16 | data = url.encode('utf-8') 17 | publisher.publish(topic, data) 18 | return 'OK' 19 | 20 | def test_job(): 21 | print('TODO') 22 | 23 | if __name__ == "__main__": 24 | test_job() 25 | -------------------------------------------------------------------------------- /PageSpeedInsights/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import psi 5 | import job 6 | 7 | def psi_pubsub(event, context): 8 | psi.run_pubsub(event, context) 9 | 10 | def job_pubsub(event, context): 11 | job.run(event, context) 12 | -------------------------------------------------------------------------------- /PageSpeedInsights/psi.mermaid: -------------------------------------------------------------------------------- 1 | graph TB 2 | Job(定时作业任务) 3 | CPS_Job_P(Job Publisher) 4 | CPS_Job_S(Job Subscriber) 5 | CPS_PSI_P(PSI Publisher) 6 | CPS_PSI_S(PSI Subscriber) 7 | CF_Job(Job Function Service) 8 | CF_PSI(PSI Function Service) 9 | GAPI_PSI(PageSpeed Insights API) 10 | 11 | Job -->|Push| CPS_Job_P 12 | CPS_Job_S -->|Push| CF_Job 13 | CF_Job -->| 并发 HTTP | CPS_PSI_P 14 | CPS_PSI_S -->| 并发 Push | CF_PSI 15 | CF_PSI -->| 并发 HTTP Request | GAPI_PSI 16 | 17 | subgraph Cloud Scheduler 18 | Job 19 | end 20 | 21 | subgraph Cloud Pub/Sub 22 | subgraph Job Pub/Sub 23 | CPS_Job_P --> CPS_Job_S 24 | end 25 | 26 | subgraph PSI Pub/Sub 27 | CPS_PSI_P --> CPS_PSI_S 28 | end 29 | end 30 | 31 | subgraph Cloud Functions 32 | CF_Job 33 | CF_PSI 34 | end 35 | 36 | subgraph Google APIs 37 | GAPI_PSI 38 | end -------------------------------------------------------------------------------- /PageSpeedInsights/psi.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | # -*- coding: utf-8 -*- 3 | '''PageSpeed Insights Single + Google Cloud Functions''' 4 | import os 5 | import base64 6 | from urllib import parse 7 | import requests 8 | from google.cloud import storage 9 | from google.cloud.storage import Blob 10 | 11 | # Access Token, generated from GCP Console Credentials page. 
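# NOTE: GCP_API_KEY is read from the environment — set it in the Functions runtime environment variables, or in the local shell for testing (see README section 3.3).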
API_KEY = os.getenv('GCP_API_KEY')

GAPI_PSI = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed"

SESSION = requests.Session()

PROXIES = None


def save(url, report):
    '''Save to https://console.cloud.google.com/storage/browser/[bucket-id]/'''
    client = storage.Client()
    bucket = client.get_bucket("psi-report")
    blob = Blob(f"{parse.quote_plus(url)}.json", bucket)
    blob.upload_from_string(report, "application/json")


def run(url):
    try:
        payload = {"url": url,
                   "category": "performance",
                   "locale": "zh-CN",
                   "strategy": "mobile",
                   "key": API_KEY
                  }
        response = SESSION.get(url=GAPI_PSI, params=payload, proxies=PROXIES)
        print(response.status_code)
        if response.status_code == 200:
            save(url, response.text)
    except requests.RequestException as _e:
        print(_e)
    return 'OK'


def run_pubsub(event, context):
    pubsub_message = base64.urlsafe_b64decode(event['data']).decode('utf-8')
    return run(pubsub_message)


def test_run_http(test_url):
    run(test_url)


def test_run_pubsub(test_url):
    event = {"data": base64.urlsafe_b64encode(test_url.encode('utf-8'))}
    context = None
    run_pubsub(event, context)


if __name__ == "__main__":
    _proxy = os.getenv("HTTP_PROXY")
    PROXIES = {
        "http": _proxy,
        "https": _proxy,
    }
    _test_url = "https://m.ctrip.com/webapp/flight/schedule/detail.html"
    test_run_http(_test_url)
    test_run_pubsub(_test_url)

-------------------------------------------------------------------------------- /PageSpeedInsights/requirements.txt: --------------------------------------------------------------------------------
# Function dependencies
requests==2.31.0
google-cloud-pubsub==0.40.0
google-cloud-storage==1.15.0
#google-cloud-bigtable==0.32.1
#google-cloud-core==0.29.1

-------------------------------------------------------------------------------- /PageSpeedInsights/zip.sh: --------------------------------------------------------------------------------
#!/usr/bin/env bash
# -*- coding: utf-8 -*-

# chmod +x ./zip.sh
zip -r functions.zip ./ -x *.DS_Store*

-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
My Python Script
----

1. [auc_pr_roc](/auc_pr_roc)
Computing PR and ROC curve AUC values with Python scikit-learn.
2. [excel_combine](/excel_combine)
A Python implementation of one-click automated merging of multiple Excel files.
3. [geetest_offline](/geetest_offline)
Cracking the GeeTest slider captcha, offline mode V5.10.10, in Python, using the [National Enterprise Credit Information Publicity System](http://www.gsxt.gov.cn) site as the example.
4. [geetest_offline_gd](/geetest_offline/README_gd.md)
A Python crawler for company details on the [National Enterprise Credit Information Publicity System (Guangdong)](http://gd.gsxt.gov.cn).
5. [geetest_online](/geetest_online)
Cracking the GeeTest slider captcha, online mode, in Python, using the [National Enterprise Credit Information Publicity System](http://www.gsxt.gov.cn) site as the example.
6. [gitstats](/gitstats)
A Python implementation of git commit log statistics and analysis.
7. [gsxt_mobile](/gsxt_mobile)
A Python crawler for the National Enterprise Credit Information Publicity System app, querying company information through the app's HTTP API.
8. [lagou](/lagou)
A Python crawler - Selenium data collection for [Lagou](https://www.lagou.com).
9. [level](/level)
Wrappers around commonly used Python leveldb utility methods.
10. [nacao_v1](/nacao_v1)
A Python crawler for the [National Organization Code Administration Center](http://www.nacao.org.cn), V1.0.
11. [nacao_v2](/nacao_v2)
A Python crawler for the [National Organization Code Administration Center](http://www.nacao.org.cn), V2.0.
12. [MonkeyRunner](/monkeyrunner)
MonkeyRunner is DEAD!
13. [PageSpeed Insights](/PageSpeedInsights)
Front-end DevOps with PageSpeed Insights — a scheduled audit system for the quality and performance benchmarks of front-end web pages, built on Google Cloud Scheduler, Pub/Sub, Functions, Storage, and related cloud services. Combined with a CI/CD pipeline, it audits a site's technical performance metrics in large batches on a schedule.

License
----

```txt
Copyright 2017 ChenQi

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
```

-------------------------------------------------------------------------------- /auc_pr_roc/README.md: --------------------------------------------------------------------------------
### Background

In 2017, an algorithm competition platform posed a problem — a flight-delay prediction algorithm — and the submitted answers had to be checked for accuracy.

Source data sample CSV format: no longer available, omitted.
Submission sample CSV format description: no longer available, omitted.
Example:

Flightno | FlightDepcode | FlightArrcode | PlannedDeptime | PlannedArrtime | prob
-- | -- | -- | -- | -- | --
CA1351 | PEK | CAN | 1496273700 | 1496285700 | 0.041386555
8L9647 | KMG | HIA | 1496272200 | 1496282400 | 0.022590361
CZ6299 | DLC | SZX | 1496274000 | 1496286900 | 0.025210084
HU7377 | URC | CKG | 1496273700 | 1496287500 | 0.106757728

The competition used the AUC of the PR curve (baseline: auc=0.45). For the evaluation metric, see [The Relationship Between Precision-Recall and ROC Curves](http://mark.goadrich.com/articles/davisgoadrichcamera2.pdf).

### Implementation

The CSV files are read with the pandas library.

```Python
def load_label_prob(real_csv, result_csv):
    '''Read the label array from real.csv and the prob array from result.csv'''
    real_df, result_df = pandas.read_csv(real_csv), pandas.read_csv(result_csv)
    # Check that the real.csv and result.csv data are well-formed
    check_format(real_df, result_df)
    label, prob = real_df['label'].values, result_df['prob'].values
    # Round to 4 decimal places
    for _i, _e in enumerate(prob):
        prob[_i] = round(_e, 4)
    return label, prob
```

The PR-curve AUC is computed with the sklearn library.

```Python
'''Compute the PR-curve AUC from the real.csv and result.csv columns'''
precision, recall, _thresholds = metrics.precision_recall_curve(label, prob)
area = metrics.auc(recall, precision)
return area
```

Appendix: computing the ROC-curve AUC.

```Python
'''Compute the ROC-curve AUC from the real.csv and result.csv columns'''
area = metrics.roc_auc_score(label, prob)
return area
```

### Environment setup

Setting scikit-learn up on Windows is slightly fiddly, with version requirements on NumPy and SciPy, so the [third-party prebuilt wheels](http://www.lfd.uci.edu/~gohlke/pythonlibs/) are used directly.

```bash
pip install http://www.lfd.uci.edu/~gohlke/pythonlibs/ru4fxw3r/numpy-1.13.1+mkl-cp36-cp36m-win32.whl
pip install http://www.lfd.uci.edu/~gohlke/pythonlibs/ru4fxw3r/scipy-0.19.1-cp36-cp36m-win32.whl
pip install pandas
pip install scikit-learn
```

### [Source on GitHub](https://github.com/9468305/python-script/blob/master/auc_pr_roc/)
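For intuition, a tiny self-contained example (made-up labels and probabilities, not competition data) showing the same sklearn calls on in-memory arrays:

```Python
from sklearn import metrics

label = [0, 1, 1, 0, 1, 0, 1, 0]
prob = [0.1, 0.8, 0.65, 0.3, 0.9, 0.45, 0.7, 0.2]

# PR-curve AUC: integrate precision over recall.
precision, recall, _thresholds = metrics.precision_recall_curve(label, prob)
print('PR AUC :', metrics.auc(recall, precision))
# ROC-curve AUC has a dedicated helper.
print('ROC AUC:', metrics.roc_auc_score(label, prob))
```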
-------------------------------------------------------------------------------- /auc_pr_roc/auc_pr_roc.py: --------------------------------------------------------------------------------
#!/usr/local/bin/python3
# -*- coding: utf-8 -*-
'''Compute the AUC values of the PR and ROC curves from the real.csv and result.csv tables.'''

import sys
import pandas
#from pandas import DataFrame
from sklearn import metrics

REAL_HEADER = ['Flightno',
               'FlightDepcode',
               'FlightArrcode',
               'PlannedDeptime',
               'PlannedArrtime',
               'label']

RESULT_HEADER = ['Flightno',
                 'FlightDepcode',
                 'FlightArrcode',
                 'PlannedDeptime',
                 'PlannedArrtime',
                 'prob']


def check_column(column1, column2):
    '''Check that two columns hold identical data'''
    if not column1.equals(column2):
        print('Error: csv column has different data!')
        sys.exit(1)


def check_format(real_df, result_df):
    '''Check that the real.csv and result.csv data are well-formed'''
    real_header, result_header = real_df.columns.values.tolist(), result_df.columns.values.tolist()
    if REAL_HEADER != real_header or RESULT_HEADER != result_header:
        print('Error: csv has different headers!')
        print(real_header)
        print(result_header)
        sys.exit(1)
    check_column(real_df['Flightno'], result_df['Flightno'])
    check_column(real_df['FlightDepcode'], result_df['FlightDepcode'])
    check_column(real_df['FlightArrcode'], result_df['FlightArrcode'])
    check_column(real_df['PlannedDeptime'], result_df['PlannedDeptime'])
    check_column(real_df['PlannedArrtime'], result_df['PlannedArrtime'])


def load_label_prob(real_csv, result_csv):
    '''Read the label column array from real.csv and the prob column array from result.csv'''
    real_df, result_df = pandas.read_csv(real_csv), pandas.read_csv(result_csv)
    check_format(real_df, result_df)
    label, prob = real_df['label'].values, result_df['prob'].values
    # Round to 4 decimal places
    for _i, _e in enumerate(prob):
        prob[_i] = round(_e, 4)
    return label, prob


def auc_roc(real_csv, result_csv):
    '''Compute the ROC-curve AUC from the real.csv and result.csv columns'''
    label, prob = load_label_prob(real_csv, result_csv)
    area = metrics.roc_auc_score(label, prob)
    #print(area)
    return area


def auc_pr(real_csv, result_csv):
    '''Compute the PR-curve AUC from the real.csv and result.csv columns'''
    label, prob = load_label_prob(real_csv, result_csv)
    precision, recall, _thresholds = metrics.precision_recall_curve(label, prob)
    area = metrics.auc(recall, precision)
    #print(area)
    return area


if __name__ == "__main__":
    auc_pr(sys.argv[1], sys.argv[2])
    #auc_roc(sys.argv[1], sys.argv[2])
    #hard code for test
    #print(auc_pr('real.csv', 'result.csv'))
    #print(auc_roc('real.csv', 'result.csv'))

-------------------------------------------------------------------------------- /excel_combine/README.md: --------------------------------------------------------------------------------
Work keeps throwing up tedious chores. For performance reviews, promotions and salary adjustments, fixed-asset inventories, or team outings, the department secretary sends out one Excel file for the whole department; each team lead splits out their own team's Excel file and distributes it to every member; members fill it in and send it back to the lead; the lead merges and forwards it to the secretary; and the secretary merges all the team files back into one complete Excel file.
I never found a convenient tool for merging many Excel files, so I rolled my own Python script: one click, automatic merge.
**Gripe: for various reasons, our intranet web OA system is underdeveloped, so a great many things are distributed and collected as Excel.**

### One-click execution

To make it easy for non-technical users, there are no launch arguments: drop the script into the root of the Excel folder and double-click it.
No merged-output filename needs to be specified either; the name `combine.xlsx` is rarely taken, users just need to know about it.

```Python
if __name__ == "__main__":
    FROM_DIR = os.getcwd()
    TO_FILE = os.path.join(FROM_DIR, 'combine.xlsx')
    combine(FROM_DIR, TO_FILE)
```

### openpyxl

Mainstream Office versions support the Excel 2010 format, i.e. the xlsx extension. If a source file has the xls extension, simply re-save it as xlsx. The script therefore uses the openpyxl library to read and write Excel files, and ignores files with the xls extension.
[openpyxl - A Python library to read/write Excel 2010 xlsx/xlsm files](https://openpyxl.readthedocs.io/en/default/)
### Walking the folder for Excel files

Use `os.walk()`:

```Python
_results = []
for _root, _dirs, _files in os.walk(from_dir):
    for _file in _files:
        if _file.endswith('.xlsx'):
            _results.append(os.path.join(_root, _file))
return _results
```

### Note: remove the merged output file, i.e. combine.xlsx

Delete the result file before merging, to guard against corrupted data.

```Python
_result = search_file(from_dir)
try:
    _result.remove(to_file)
except ValueError:
    print('Result file not exist.')
return _result
```

### De-duplicating data across files

+ How do we ensure each Excel row is unique?
+ How do we detect duplicate Excel files (e.g. different filenames, same content)?
+ How do we merge Excel files with different layouts (rows, columns)?

Some ground rules are imposed here:

+ The first row of each Excel file must be the title row.
+ The first column must be a unique key — e.g. an employee ID or email address, globally unique.

So the OrderedDict from Python's built-in collections module is used, keyed on the first column.
Reading an Excel file and building the in-memory dict, in full:

```Python
_wb = load_workbook(excel_file, read_only=True)
_ws = _wb.active
_title = []
_items = collections.OrderedDict()
for _r in _ws.rows:
    if not _title:
        for _i in _r:
            _title.append(_i.value)
    else:
        _item = []
        for _i in _r:
            _item.append(_i.value)
        _items[_item[0]] = _item
_wb.close()
return _title, _items
```

### Checking that two dicts hold the same elements

Python ships with the powerful built-in [operator](https://docs.python.org/3/library/operator.html) module.

```Python
if not operator.eq(dict_src, dict_dst):
    print('Warning: dict elements are different!')
```

### Finally, write the data out to an Excel file

```Python
_wb = Workbook()
_ws = _wb.active
_ws.append(excel_title)
for _k, _v in excel_items.items():
    _ws.append(_v)
_wb.save(excel_file)
```

### Remaining gaps

+ The multi-sheet case is not handled (a sketch of one possible extension follows below).
+ Excel formula evaluation is not handled.

### [Source on GitHub](https://github.com/9468305/python-script/tree/master/excel_combine)
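On the first gap: a hypothetical sketch of what multi-sheet support might look like — iterating every worksheet instead of only the active one. This is not part of the script, just one way to extend it:

```Python
from openpyxl import load_workbook

def load_all_sheets(excel_file):
    '''Sketch: collect rows from every worksheet, not just the active one.'''
    _wb = load_workbook(excel_file, read_only=True)
    _sheets = {}
    for _name in _wb.sheetnames:
        _ws = _wb[_name]
        _sheets[_name] = [[_cell.value for _cell in _row] for _row in _ws.rows]
    _wb.close()
    return _sheets
```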
-------------------------------------------------------------------------------- /excel_combine/excel_combine.py: --------------------------------------------------------------------------------
#!/usr/local/bin/python3
# -*- coding: utf-8 -*-
'''Merge all Excel files under the given folder into a single file'''
import os
import collections
import operator
from openpyxl import load_workbook
from openpyxl import Workbook

def search_excel(from_dir, to_file):
    '''Walk from_dir looking for Excel files; return the list of results'''
    _results = []
    for _root, _dirs, _files in os.walk(from_dir):
        for _file in _files:
            if _file.endswith('.xlsx'):
                _results.append(os.path.join(_root, _file))

    try:
        print('Remove combine.xlsx.')
        _results.remove(to_file)
    except ValueError:
        print('combine.xlsx not exist.')
    return _results


def load_excel(excel_file):
    '''Read an Excel file; return its title array and an ordered dict of its data'''
    _wb = load_workbook(excel_file, read_only=True)
    _ws = _wb.active
    _title = []
    _items = collections.OrderedDict()
    for _r in _ws.rows:
        if not _title:
            for _i in _r:
                _title.append(_i.value)
        else:
            _item = []
            for _i in _r:
                _item.append(_i.value)
            _items[_item[0]] = _item

    _wb.close()
    return _title, _items


def save_excel(excel_file, excel_title, excel_items):
    '''Save an Excel file'''
    _wb = Workbook()
    _ws = _wb.active
    _ws.append(excel_title)
    for _k, _v in excel_items.items():
        _ws.append(_v)
    _wb.save(excel_file)


def combine(from_dir, to_file):
    '''Merge all Excel files under the given folder into a single file'''
    _excel_files = search_excel(from_dir, to_file)
    if not _excel_files:
        return
    _excel_title = []
    _excel_content = collections.OrderedDict()
    for _file in _excel_files:
        print('Parsing ' + _file)
        _title, _items = load_excel(_file)
        if not _title or not _items:
            print('Skip since it is empty.')
            continue

        if not _excel_title:
            _excel_title = _title
        elif not operator.eq(_title, _excel_title):
            print('Warning: Excel title formats are different!')

        for _k, _v in _items.items():
            _excel_content[_k] = _v
        print('Parsing done.')

    if not _excel_title or not _excel_content:
        print('All files are empty.')
        return
    save_excel(to_file, _excel_title, _excel_content)


if __name__ == "__main__":
    print('begin')
    FROM_DIR = os.getcwd()
    TO_FILE = os.path.join(FROM_DIR, 'combine.xlsx')
    combine(FROM_DIR, TO_FILE)
    print('end')

-------------------------------------------------------------------------------- /geetest_offline/README.md: --------------------------------------------------------------------------------
# Analysis of the GeeTest slider captcha in offline mode

The GeeTest slider captcha uses machine learning on mouse-movement traces to distinguish human behavior from automation.
For the online verification flow, the most complete analysis to date is at [https://zhuanlan.zhihu.com/windev](https://zhuanlan.zhihu.com/windev).
In the online verification flow, the website backend communicates with the GeeTest backend [http://api.geetest.com](http://api.geetest.com) for verification; the browser front end only collects data and transmits it with light encryption.
In offline verification, the website backend verifies on its own; the GeeTest backend [http://static.geetest.com](http://static.geetest.com) only serves the slider captcha images, and the browser front end collects data and verifies locally.
Security-wise, online mode is comparatively robust; offline mode is merely smoke and mirrors.

### 1. Test sites

Take the [National Enterprise Credit Information Publicity System](http://www.gsxt.gov.cn) as the example: the main site uses geetest 5.10.10 in online verification mode. The provincial sites differ slightly in version and module.

**geetest offline 5.9.0**

+ Shanghai
+ Hebei
+ Inner Mongolia
+ Liaoning
+ Fujian
+ Shandong
+ Guangdong
+ Hainan
+ Hubei
+ Hunan
+ Sichuan
+ Yunnan
+ Tibet
+ Qinghai
+ Ningxia

**geetest offline 5.10.10**

+ Guizhou
+ Shaanxi

### 2. Offline-mode verification flow

Using the [Shanghai site](http://sh.gsxt.gov.cn) as the example.

#### 2.1 GET the home page http://sh.gsxt.gov.cn/notice/

Returns an HTML page; parse it to obtain session.token.

#### 2.2 GET register http://sh.gsxt.gov.cn/notice/pc-geetest/register

Returns JSON data (fetched as sketched below):

```json
{
    "success":0,
    "gt":"39134c54afef1e0b19228627406614e9",
    "challenge":"d2ddea18f00665ce8623e36bd4e3c7c543"
}
```

success = 0 indicates that offline verification mode is in effect.
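A minimal sketch of this register request, mirroring `get_register()` in geetest_offline.py; the millisecond `v` timestamp parameter is what that code sends:

```Python
import time
import requests

# Ask the site which captcha mode is active; success == 0 means offline.
response = requests.get('http://sh.gsxt.gov.cn/notice/pc-geetest/register',
                        params={'v': str(int(time.time() * 1000))},
                        timeout=10)
captcha = response.json()
print(captcha['gt'], captcha['challenge'])
```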
#### 2.3 POST http://sh.gsxt.gov.cn/notice/security/verify_ip

Returns 200 True on success.

#### 2.4 POST http://sh.gsxt.gov.cn/notice/security/verify_keyword

Returns 200 True on success.

#### 2.5 POST http://sh.gsxt.gov.cn/notice/pc-geetest/validate

Uploads the locally computed verification result for the slider captcha, validate for short. Returns JSON data:

```json
{
    "status":"success",
    "version":"3.3.0"
}
```

In offline mode the backend has no idea which slider captcha image the browser used; it only knows the browser uploaded a verification result. So downloading the captcha image, running image recognition, computing the slider offset, and simulating a mouse trace can all be skipped.

**Verification data format**

For example `1517aab3f_51aa460f_75555a6a38`: the three `_`-separated segments are produced by encrypting and obfuscating distance, rand0, and rand1 in `geetest.5.x.x.js`.
For a detailed analysis of the encryption, see [寻找阿登高地——爬虫工程师如何绕过验证码](http://www.jianshu.com/p/5b6fb04ea686).
There is no need to care about the implementation details of the encryption algorithm; just find the JavaScript entry point and call it with the right arguments:

```javascript
function userresponse(a, b) {
    for (var c = b.slice(32), d = [], e = 0; e < c.length; e++) {
        var f = c.charCodeAt(e);
        d[e] = f > 57 ? f - 87 : f - 48
    }
    c = 36 * d[0] + d[1];
    var g = Math.round(a) + c; b = b.slice(0, 32);
    var h, i = [ [], [], [], [], [] ], j = {}, k = 0; e = 0;
    for (var l = b.length; e < l; e++)
        h = b.charAt(e), j[h] || (j[h] = 1, i[k].push(h), k++, k = 5 == k ? 0 : k);
    for (var m, n = g, o = 4, p = "", q = [1, 2, 5, 10, 50]; n > 0;)
        n - q[o] >= 0 ? (m = parseInt(Math.random() * i[o].length, 10), p += i[o][m], n -= q[o]) : (i.splice(o, 1), q.splice(o, 1), o -= 1);
    return p
}
```

**Generating the 3 parameters correctly**

distance, rand0, and rand1 are all randomly generated — but generate them naively in code and the verification pass rate turns out to be low. So what hidden relationship ties the three together? How does the backend validate the correctness of these three random numbers?
The relationship doesn't actually matter; what matters is passing verification.
Just sample by hand N times, build a large enough sample array, pick one entry at random each time, and call the JavaScript encryption routine to produce the verification data (see the sketch at the end of this README).

#### 2.6 POST http://sh.gsxt.gov.cn/notice/search/ent_info_list

Upload session.token (obtained in step 1), challenge (obtained in step 2), validate (computed in step 5), and keyword (the search term). The response is an HTML page; parse the DOM to get the search results and the refreshed session.token (used for the next query).

### 3. Source

Python 3.6

Install:

```bash
pip install requests        # HTTP request library
pip install PyExecJS        # Call JavaScript from Python; works best with node.js
pip install beautifulsoup4  # Parse HTML pages
```

Demo:

```bash
python ./geetest_offline.py
python ./geetest_offline_nm.py
```

#### Entry Code

[geetest_offline.py](/geetest_offline/geetest_offline.py) for Shanghai and Hebei.

[geetest_offline_nm.py](/geetest_offline/geetest_offline_nm.py) for Inner Mongolia; its HTTP requests and responses differ slightly.
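The sketch referenced in section 2.5 — driving `userresponse()` through PyExecJS, a condensed mirror of `calc_validate()` in geetest_offline.py. It assumes this repo's util.py, which holds `USERRESPONSE_JS` (the JavaScript quoted above) and `OFFLINE_SAMPLE` (the hand-collected triples):

```Python
import random
import execjs
import util  # this repo's util.py: USERRESPONSE_JS source and OFFLINE_SAMPLE triples

# Compile the userresponse() JavaScript once; node.js is the preferred runtime.
JSCONTEXT = execjs.compile(util.USERRESPONSE_JS)

def calc_validate(challenge):
    '''Pick one sampled (distance, rand0, rand1) triple, encrypt each part, join with "_".'''
    distance, rand0, rand1 = random.choice(util.OFFLINE_SAMPLE)
    return '_'.join(JSCONTEXT.call('userresponse', value, challenge)
                    for value in (distance, rand0, rand1))
```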
-------------------------------------------------------------------------------- /geetest_offline/README_gd.md: --------------------------------------------------------------------------------
This time the crawler gets a new target: grab not just the unified social credit code (tax ID) but also basic company information. The target site is http://gd.gsxt.gov.cn, the National Enterprise Credit Information Publicity System (Guangdong).

The home page again uses the GeeTest slider captcha in offline mode; the verification steps are omitted here. Searching for the keyword "腾讯科技" yields the following data in step one.

```python
[
    ('广州腾讯科技有限公司',
     'http://gsxt.gzaic.gov.cn/aiccips/GSpublicity/GSpublicityList.html?service=entInfo_nPNw57QPCnL961TNeXO4Gqc/FgBy7ESTwWPrP4zJe5g=-FBrJ/suNwXMupXtmIUvNKg=='),
    ('深圳兴腾讯科技有限公司',
     'https://www.szcredit.org.cn/GJQYCredit/GSZJGSPTS/QYGS.aspx?rid=6B553DC2860F51DD8179F9821CA72F8094E73CE96BD2D49EC7C4690757FA61D9'),
    ('腾讯科技(深圳)有限公司',
     'https://www.szcredit.org.cn/GJQYCredit/GSZJGSPTS/QYGS.aspx?rid=B0819DEB6219A8B1'),
    ('深圳市联腾讯科技有限公司',
     'https://www.szcredit.org.cn/GJQYCredit/GSZJGSPTS/QYGS.aspx?rid=DB80B6DEA7F44F35C9A10E5985D4FAA2D4F342323238AB811179ADA6138BD8D4'),
    ('惠州云达腾讯科技有限公司',
     'http://gd.gsxt.gov.cn/aiccips/CheckEntContext/../GSpublicity/GSpublicityList.html?service=entInfo_SesJBXGCYofnRPu6PUIM/1lSj0vJHOw5gTgVbtsLB1BTAOYLpc4gxgb5a3wjX8k3-dA+Hj5oOjXjQTgAhKSP1lA=='),
    ('深圳市华腾讯科技有限公司',
     'https://www.szcredit.org.cn/GJQYCredit/GSZJGSPTS/QYGS.aspx?rid=6B553DC2860F51DD8179F9821CA72F80820C9FD043746B01E89676307B6B60EF'),
    ('中山腾讯科技电子有限公司',
     'http://gd.gsxt.gov.cn/aiccips/CheckEntContext/../GSpublicity/GSpublicityList.html?service=entInfo_ZECp7scr3rINuX8+ial6uIv57yGPPUCA1RAvDHoM0tBrXZJ9+1otoDp51Oi7UabK-7kW54gFL28iQmsO8Qn3cTA=='),
    ('深圳市安腾讯科技电子有限公司',
     'https://www.szcredit.org.cn/GJQYCredit/GSZJGSPTS/QYGS.aspx?rid=6B553DC2860F51DD8179F9821CA72F808CC6A55FD01EE165A1560ECF17B3E73C'),
    ('中山市纸箱总厂腾讯科技亚太电子厂',
     'http://gd.gsxt.gov.cn/aiccips/CheckEntContext/../GSpublicity/GSpublicityList.html?service=entInfo_M+Q/CD12sdYKPqPXAzRChoB2xhauTJBsWbk/xaaA92MJ4dcDV+KRZ71QUWHSpwQ+-7kW54gFL28iQmsO8Qn3cTA=='),
    ('深圳龙腾讯威科技有限公司',
     'https://www.szcredit.org.cn/GJQYCredit/GSZJGSPTS/QYGS.aspx?rid=6B553DC2860F51DD7501B40D8BFA3C22E27771C25B8DF96FD1F35DF7C350F5A9')
]
```

**Gripe: the company detail pages come in 3 kinds, and each site uses its own page template, so the HTML element parsing has to be handled per template. Shenzhen uses one template; Guangzhou and the others use another. So two different DOM trees need to be parsed.**

+ Shenzhen https://www.szcredit.org.cn
+ Guangzhou http://gsxt.gzaic.gov.cn/aiccips/GSpublicity/GSpublicityList.html
+ Others http://gd.gsxt.gov.cn/aiccips/CheckEntContext/../GSpublicity/GSpublicityList.html

After parsing the search-result page with BeautifulSoup, the URL needs checking:

```Python
_url = _company['href']
if _url.startswith('../'):
    _url = INDEX + '/aiccips/CheckEntContext/' + _url
```

The final assembled data:

```python
{
    '注册号/统一社会信用代码': '91440101327598294H',
    '注册资本': '0',
    '企业名称': '广州腾讯科技有限公司',
    '类型': '有限责任公司(外商投资企业法人独资)',
    '成立日期': '2014年12月31日',
    '营业期限自': '2014年12月31日',
    '营业期限至': '2018年07月15日',
    '登记机关': '广州市海珠区工商行政管理局',
    '核准日期': '2016年12月23日',
    '登记状态': '存续',
    '经营范围': '电子、通信与自动控制技术研究、开发;网络技术的研究、开发;计算机技术开发、技术服务;软件服务;软件测试服务;软件批发;软件零售;软件开发;游戏软件设计制作;信息技术咨询服务;数据处理和存储服务;(依法须经批准的项目,经相关部门批准后方可开展经营活动)〓'
}
```

**Gripe +1: what on earth is 〓?**

These sites perform terribly — the default 15-second timeout fails constantly — so every network request is wrapped with protection, and requests that can be retried are looped with a wait.

```Python
def safe_query_detail(url):
    '''Safe query url, handle network timeout and retry multi times.'''
    for _ in range(5):
        try:
            with requests.Session() as session:
                return query_detail(session, url)
        except requests.RequestException as _e:
            logging.error(_e)
            time.sleep(5)
    return None
```

**Gripe +2: degrading your site's performance is also a remarkably effective anti-crawling technique.**

Update 2017-12-05

Querying "深圳兴腾讯科技有限公司" from http://gd.gsxt.gov.cn, the redirect link fails with a server 500 error.
> https://www.szcredit.org.cn/GJQYCredit/GSZJGSPTS/QYGS.aspx?rid=6B553DC2860F51DD8179F9821CA72F8094E73CE96BD2D49EC7C4690757FA61D9

Querying and redirecting from https://www.szcredit.org.cn works fine.
> https://www.szcredit.org.cn/web/gspt/newGSPTDetail3.aspx?ID=2e82a6a7aaec419884738d2421e7a838

**Gripe +3: what kind of ops is this?**

-------------------------------------------------------------------------------- /geetest_offline/constants.py: --------------------------------------------------------------------------------
#!/usr/local/bin/python3
# -*- coding: utf-8 -*-
'''
Common constants for HTTP requests
'''

ACCEPT_ANY = '*/*'

ACCEPT_TEXT = 'text/plain, */*; q=0.01'

ACCEPT_HTML = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'

ACCEPT_JSON = 'application/json, text/javascript, */*; q=0.01'

ACCEPT_IMAGE = 'image/webp,image/*,*/*;q=0.8'

ACCEPT_LANGUAGE = 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2'

UA_CHROME_WIN = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'

UA_CHROME_MAC = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'

USER_AGENT = UA_CHROME_MAC

-------------------------------------------------------------------------------- /geetest_offline/gd_list.json: --------------------------------------------------------------------------------
[
    "腾讯科技",
    "百度",
    "阿里巴巴"
]

-------------------------------------------------------------------------------- /geetest_offline/geetest_offline.py: --------------------------------------------------------------------------------
#!/usr/local/bin/python3
# -*- coding: utf-8 -*-
'''
geetest offline 5.9.0 - 6.0.0 for sh.gsxt.gov.cn, he.gsxt.gov.cn 5 | ''' 6 | 7 | import os 8 | import time 9 | import random 10 | import logging 11 | from logging import NullHandler 12 | import requests 13 | import execjs 14 | from bs4 import BeautifulSoup 15 | import constants 16 | import util 17 | 18 | logging.getLogger(__name__).addHandler(NullHandler()) 19 | logging.basicConfig(level=logging.DEBUG) 20 | 21 | HOST = '' 22 | INDEX = '' 23 | 24 | JSRUNTIME = execjs.get(execjs.runtime_names.Node) 25 | 26 | CAPTCHA_JSON = [] 27 | 28 | USERRESPONSE_JSCONTEXT = JSRUNTIME.compile(util.USERRESPONSE_JS) 29 | 30 | TIMEOUT = 10 31 | 32 | GSXT_HOST_SH = 'http://sh.gsxt.gov.cn' 33 | GSXT_INDEX_SH = GSXT_HOST_SH + '/notice/' 34 | GSXT_HOST_HE = 'http://he.gsxt.gov.cn' 35 | GSXT_INDEX_HE = GSXT_HOST_HE + '/notice/' 36 | 37 | def config(host, index): 38 | '''设置 host and index URL''' 39 | global HOST, INDEX 40 | HOST, INDEX = host, index 41 | 42 | 43 | def calc_userresponse(distance, challenge): 44 | '''根据滑动距离distance和challenge,计算userresponse值''' 45 | return USERRESPONSE_JSCONTEXT.call('userresponse', distance, challenge) 46 | 47 | 48 | def calc_validate(challenge): 49 | '''计算validate值''' 50 | _r = random.randint(0, len(util.OFFLINE_SAMPLE)-1) 51 | distance, rand0, rand1 = util.OFFLINE_SAMPLE[_r] 52 | distance_r = calc_userresponse(distance, challenge) 53 | rand0_r = calc_userresponse(rand0, challenge) 54 | rand1_r = calc_userresponse(rand1, challenge) 55 | validate = distance_r + '_' + rand0_r + '_' + rand1_r 56 | logging.debug(validate) 57 | return validate 58 | 59 | 60 | def parse_token(html_doc): 61 | '''使用BeautifulSoup解析HTML页面, 查找session.token''' 62 | soup = BeautifulSoup(html_doc, 'html.parser') 63 | _find = soup.find('input', attrs={'name': 'session.token'}) 64 | return _find['value'] if _find else None 65 | 66 | 67 | def parse_code(html_doc): 68 | '''使用BeautifulSoup解析HTML页面,查找统一社会信用代码''' 69 | _soup = BeautifulSoup(html_doc, 'html.parser') 70 | _findall = _soup.find_all('div', class_='tableContent page-item') 71 | _result = [] 72 | if _findall: 73 | for _a in _findall: 74 | _td = _a.find('td') 75 | _td_str = ''.join(_td.get_text().split()) 76 | _i = _a.find('i') 77 | _i_str = ''.join(_i.get_text().split()) 78 | _td_str = _td_str[0: -len(_i_str)] 79 | _th = _a.find('th', class_='icon1') 80 | _em = _th.find('em') 81 | _result.append((_td_str.encode('utf-8'), _em.get_text().encode('utf-8'))) 82 | else: 83 | logging.info('Code Not Found') 84 | return _result 85 | 86 | 87 | def get_main(session): 88 | '''Get gsxt 首页''' 89 | _url = INDEX 90 | logging.debug('GET ' + _url) 91 | _headers = {'Accept': constants.ACCEPT_HTML, 92 | 'Accept-Language': constants.ACCEPT_LANGUAGE, 93 | 'User-Agent': constants.USER_AGENT} 94 | _response = session.get(_url, headers=_headers, timeout=TIMEOUT) 95 | logging.debug('response code:' + str(_response.status_code)) 96 | return parse_token(_response.text) if _response.status_code == 200 else None 97 | 98 | 99 | def get_register(session): 100 | ''' 101 | {"success": 0, 102 | "gt": "39134c54afef1e0b19228627406614e9", 103 | "challenge": "fc490ca45c00b1249bbe3554a4fdf6fb35"} 104 | ''' 105 | _url = INDEX + 'pc-geetest/register' 106 | logging.debug('GET ' + _url) 107 | _headers = {'Accept': constants.ACCEPT_JSON, 108 | 'Accept-Language': constants.ACCEPT_LANGUAGE, 109 | 'User-Agent': constants.USER_AGENT, 110 | 'Referer': INDEX, 111 | 'X-Requested-With': 'XMLHttpRequest'} 112 | _params = {'v': str(int(time.time() * 1000))} 113 | _response = session.get(_url, headers=_headers, 
params=_params, timeout=TIMEOUT) 114 | logging.debug('response code: ' + str(_response.status_code)) 115 | logging.debug('response text: ' + _response.text) 116 | if _response.status_code != 200: 117 | return False 118 | global CAPTCHA_JSON 119 | CAPTCHA_JSON = _response.json() 120 | return True 121 | 122 | 123 | def post_verify_ip(session): 124 | ''' POST /notice/security/verify_ip''' 125 | _url = INDEX + 'security/verify_ip' 126 | logging.debug('POST ' + _url) 127 | _headers = {'Accept': constants.ACCEPT_TEXT, 128 | 'Accept-Language': constants.ACCEPT_LANGUAGE, 129 | 'User-Agent': constants.USER_AGENT, 130 | 'Referer': INDEX, 131 | 'X-Requested-With': 'XMLHttpRequest', 132 | 'Origin': HOST} 133 | _response = session.post(_url, headers=_headers, timeout=TIMEOUT) 134 | logging.debug('response code: ' + str(_response.status_code)) 135 | logging.debug('response text: ' + _response.text) 136 | return _response.status_code == 200 137 | 138 | 139 | def post_verify_keyword(session, keyword): 140 | ''' POST /notice/security/verify_keyword HTTP/1.1''' 141 | _url = INDEX + 'security/verify_keyword' 142 | logging.debug('POST ' + _url) 143 | _headers = {'Accept': constants.ACCEPT_TEXT, 144 | 'Accept-Language': constants.ACCEPT_LANGUAGE, 145 | 'User-Agent': constants.USER_AGENT, 146 | 'Referer': INDEX, 147 | 'X-Requested-With': 'XMLHttpRequest', 148 | 'Origin': HOST} 149 | _params = {'keyword': keyword} 150 | _response = session.post(_url, headers=_headers, data=_params, timeout=TIMEOUT) 151 | logging.debug('response code: ' + str(_response.status_code)) 152 | logging.debug('response text: ' + _response.text) 153 | return _response.status_code == 200 154 | 155 | 156 | def post_validate(session, validate): 157 | ''' POST /notice/pc-geetest/validate''' 158 | _url = INDEX + 'pc-geetest/validate' 159 | logging.debug('POST ' + _url) 160 | _headers = {'Accept': constants.ACCEPT_JSON, 161 | 'Accept-Language': constants.ACCEPT_LANGUAGE, 162 | 'User-Agent': constants.USER_AGENT, 163 | 'Referer': INDEX, 164 | 'X-Requested-With': 'XMLHttpRequest', 165 | 'Origin': HOST} 166 | _params = [('geetest_challenge', CAPTCHA_JSON['challenge']), 167 | ('geetest_validate', validate), 168 | ('geetest_seccode', validate + '|jordan')] 169 | _response = session.post(_url, headers=_headers, data=_params, timeout=TIMEOUT) 170 | logging.debug('response code: ' + str(_response.status_code)) 171 | logging.debug('response text: ' + _response.text) 172 | if _response.status_code != 200: 173 | return False 174 | _json_obj = _response.json() # {"status":"success","version":"3.3.0"} 175 | logging.debug(_json_obj) 176 | return _json_obj['status'] == 'success' 177 | 178 | 179 | def post_search(session, validate, keyword, token): 180 | ''' POST /notice/search/ent_info_list HTTP/1.1''' 181 | _url = INDEX + 'search/ent_info_list' 182 | logging.debug('POST ' + _url) 183 | _headers = {'Accept': constants.ACCEPT_HTML, 184 | 'Accept-Language': constants.ACCEPT_LANGUAGE, 185 | 'User-Agent': constants.USER_AGENT, 186 | 'Referer': INDEX, 187 | 'X-Requested-With': 'XMLHttpRequest', 188 | 'Origin': HOST} 189 | _params = [('condition.searchType', 1), 190 | ('captcha', ''), 191 | ('geetest_challenge', CAPTCHA_JSON['challenge']), 192 | ('geetest_validate', validate), 193 | ('geetest_seccode', validate + '|jordan'), 194 | ('session.token', token), 195 | ('condition.keyword', keyword)] 196 | _response = session.post(_url, headers=_headers, data=_params, timeout=TIMEOUT) 197 | logging.debug('response code: ' + str(_response.status_code)) 198 | 
#logger.debug('response text: ' + _response.text) 199 | if _response.status_code != 200: 200 | return None, None 201 | return parse_code(_response.text), parse_token(_response.text) 202 | 203 | 204 | def get_validate(session, keyword): 205 | '''Loop the register/verify/validate steps until validate succeeds.''' 206 | for _ in range(10): 207 | if not get_register(session): 208 | return None 209 | 210 | if not post_verify_ip(session): 211 | return None 212 | 213 | if not post_verify_keyword(session, keyword): 214 | return None 215 | 216 | validate = calc_validate(CAPTCHA_JSON['challenge']) 217 | if post_validate(session, validate): 218 | return validate 219 | return None 220 | 221 | 222 | def query_keyword(session, keyword, token): 223 | '''Query keyword with the session, refreshing session.token as needed.''' 224 | if not token: 225 | token = get_main(session) 226 | if not token: 227 | return None 228 | 229 | validate = get_validate(session, keyword) 230 | if not validate: 231 | return None 232 | 233 | return post_search(session, validate, keyword, token) 234 | 235 | 236 | def query_leveldb(query_db, save_db, queryed_db): 237 | '''query by leveldb''' 238 | try: 239 | with requests.Session() as session: 240 | _token = '' 241 | for _name, _code in query_db.RangeIter(): 242 | if not util.has_key(save_db, _name) and not util.has_key(queryed_db, _name): 243 | # fuzzy search on a name prefix 244 | _subname = _name[0: 18] if len(_name) > 18 else _name 245 | logging.info(_name + ' -> ' + _subname) 246 | _query_code, _token = query_keyword(session, _subname, _token) 247 | if _query_code: 248 | for _r in _query_code: 249 | logging.info(_r[0].decode() + ' : ' + _r[1].decode()) 250 | save_db.Put(_r[0], _r[1], sync=True) 251 | queryed_db.Put(_name, '', sync=True) 252 | return True 253 | except requests.RequestException as _e: 254 | logging.error(_e) 255 | return False 256 | 257 | 258 | def query_keyword_helper(keyword): 259 | '''Run a single keyword query against one gsxt regional site.''' 260 | try: 261 | with requests.Session() as session: 262 | _token = '' 263 | logging.info(keyword) 264 | _query_code, _token = query_keyword(session, keyword, _token) 265 | if _query_code: 266 | for _r in _query_code: 267 | logging.info(_r[0].decode() + ' : ' + _r[1].decode()) 268 | return True 269 | except requests.RequestException as _e: 270 | logging.error(_e) 271 | return False 272 | 273 | 274 | def query_leveldb_helper(): 275 | '''Batch-query every record in the leveldb database.''' 276 | try: 277 | import leveldb 278 | except ImportError: 279 | raise ImportError('The leveldb package is not installed.') 280 | 281 | config(GSXT_HOST_HE, GSXT_INDEX_HE) 282 | 283 | query_db_file = os.path.join(os.getcwd(), 'data', 'shanghai.db') 284 | query_db = leveldb.LevelDB(query_db_file) 285 | 286 | save_db_file = os.path.join(os.getcwd(), 'data', 'shanghai_code.db') 287 | save_db = leveldb.LevelDB(save_db_file) 288 | 289 | queryed_db_file = os.path.join(os.getcwd(), 'data', 'shanghai_queryed.db') 290 | queryed_db = leveldb.LevelDB(queryed_db_file) 291 | 292 | _loop = True 293 | while _loop: 294 | _loop = not query_leveldb(query_db, save_db, queryed_db) 295 | 296 | 297 | if __name__ == "__main__": 298 | config(GSXT_HOST_SH, GSXT_INDEX_SH) 299 | query_keyword_helper('百度') 300 | config(GSXT_HOST_HE, GSXT_INDEX_HE) 301 | query_keyword_helper('百度') 302 |
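303 | # ------------------------------------------------------------------ 304 | # Illustrative sketch, not part of the original script: calc_userresponse 305 | # above shells out to Node via PyExecJS, but util.USERRESPONSE_JS is small 306 | # enough to port. The pure-Python equivalent below is an unverified 307 | # convenience sketch that assumes the JS semantics shown in util.py. 308 | def calc_userresponse_py(distance, challenge): 309 | '''Pure-Python port of util.USERRESPONSE_JS (sketch).''' 310 | # the two characters after position 32 encode a base-36 offset 311 | _d = [ord(_ch) - 87 if ord(_ch) > 57 else ord(_ch) - 48 for _ch in challenge[32:]] 312 | _n = round(distance) + 36 * _d[0] + _d[1] 313 | # distribute the first 32 chars round-robin into 5 buckets, first occurrence only 314 | _buckets, _seen, _k = [[], [], [], [], []], set(), 0 315 | for _ch in challenge[:32]: 316 | if _ch not in _seen: 317 | _seen.add(_ch) 318 | _buckets[_k].append(_ch) 319 | _k = (_k + 1) % 5 320 | # greedily spend the total as 1/2/5/10/50 'coins', one random bucket char per coin 321 | _weights = [1, 2, 5, 10, 50] 322 | _p, _o = '', 4 323 | while _n > 0: 324 | if _n - _weights[_o] >= 0: 325 | _p += random.choice(_buckets[_o]) 326 | _n -= _weights[_o] 327 | else: 328 | del _buckets[_o] 329 | del _weights[_o] 330 | _o -= 1 331 | return _p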
-------------------------------------------------------------------------------- /geetest_offline/geetest_offline_gd.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | # -*- coding: utf-8 -*- 3 | ''' 4 | geetest offline 6.0.0 spider for gd.gsxt.gov.cn 5 | ''' 6 | 7 | import os 8 | import time 9 | import random 10 | import logging 11 | from logging import NullHandler 12 | import json 13 | import requests 14 | import execjs 15 | from bs4 import BeautifulSoup 16 | 17 | import constants 18 | import util 19 | 20 | 21 | logging.getLogger(__name__).addHandler(NullHandler()) 22 | logging.basicConfig(level=logging.DEBUG) 23 | 24 | HOST = 'http://gd.gsxt.gov.cn' 25 | INDEX = HOST 26 | 27 | JSRUNTIME = execjs.get(execjs.runtime_names.Node) 28 | 29 | USERRESPONSE_JSCONTEXT = JSRUNTIME.compile(util.USERRESPONSE_JS) 30 | 31 | TIMEOUT = 15 32 | 33 | GD_LIST_FILE = 'gd_list.json' 34 | GD_RESULT_FILE = 'gd_result.json' 35 | GD_NOTFOUND_FILE = 'gd_notfound.json' 36 | 37 | def load_json(json_file): 38 | '''load json file''' 39 | if not os.path.isfile(json_file): 40 | logging.info('JSON file does not exist') 41 | return None 42 | with open(json_file, 'r', encoding='utf8') as _f: 43 | json_data = json.load(_f) 44 | logging.info(len(json_data)) 45 | return json_data 46 | 47 | 48 | def save_json(json_file, json_data): 49 | '''save json file''' 50 | with open(json_file, 'w', encoding='utf8') as _f: 51 | json.dump(json_data, _f, indent=2, sort_keys=True, ensure_ascii=False) 52 | logging.info(len(json_data)) 53 | 54 | 55 | def calc_userresponse(distance, challenge): 56 | '''Compute the userresponse value from the slide distance and the challenge.''' 57 | return USERRESPONSE_JSCONTEXT.call('userresponse', distance, challenge) 58 | 59 | 60 | def calc_validate(challenge): 61 | '''calculate validate''' 62 | _r = random.randint(0, len(util.OFFLINE_SAMPLE)-1) 63 | distance, rand0, rand1 = util.OFFLINE_SAMPLE[_r] 64 | distance_r = calc_userresponse(distance, challenge) 65 | rand0_r = calc_userresponse(rand0, challenge) 66 | rand1_r = calc_userresponse(rand1, challenge) 67 | validate = distance_r + '_' + rand0_r + '_' + rand1_r 68 | logging.debug(validate) 69 | return validate 70 | 71 | 72 | def parse_name_url(html_doc): 73 | '''Parse the HTML page with BeautifulSoup and collect the detail-page links.''' 74 | _soup = BeautifulSoup(html_doc, 'html.parser') 75 | _findall = _soup.find_all('div', 76 | class_="clickStyle", 77 | style='margin-left: 160px;padding-left: 10px;') 78 | name_url_array = [] 79 | if _findall: 80 | for _a in _findall: 81 | _company = _a.find('a') 82 | _name = ''.join(_company.get_text().split()) 83 | _url = _company['href'] 84 | if _url.startswith('../'): 85 | _url = INDEX + '/aiccips/CheckEntContext/' + _url 86 | name_url_array.append((_name, _url)) 87 | logging.info(name_url_array) 88 | else: 89 | logging.error('Company Link Not Found') 90 | return name_url_array 91 | 92 | 93 | def get_mainpage(session): 94 | ''' 95 | Get http://gd.gsxt.gov.cn 96 | Response Code 200 97 | ''' 98 | logging.debug('GET ' + INDEX) 99 | _headers = {'Accept': constants.ACCEPT_HTML, 100 | 'Accept-Language': constants.ACCEPT_LANGUAGE, 101 | 'User-Agent': constants.USER_AGENT} 102 | _response = session.get(INDEX, headers=_headers, timeout=TIMEOUT) 103 | logging.debug('response code:' + str(_response.status_code)) 104 | return _response.status_code == 200 105 | 106 | 107 | def get_captcha(session): 108 | ''' 109 | GET /aiccips//verify/start.html 110 | Response JSON 111 | { 112 | "success": 0, 113 | "gt": "c02ee51ee0afe88899efe6dc729627fc", 114 | "challenge": "ed3d2c21991e3bef5e069713af9fa6caed" 115 | } 116 | ''' 117 | _url = INDEX + '/aiccips//verify/start.html' 118 | logging.debug('GET ' + _url) 119 | _headers = {'Accept': constants.ACCEPT_JSON, 120 | 'Accept-Language': constants.ACCEPT_LANGUAGE, 121 | 'User-Agent': constants.USER_AGENT, 122 | 'Referer': INDEX, 123 |
'X-Requested-With': 'XMLHttpRequest'} 124 | _params = {'t': str(int(time.time() * 1000))} 125 | _response = session.get(_url, headers=_headers, params=_params, timeout=TIMEOUT) 126 | logging.debug('response code: ' + str(_response.status_code)) 127 | logging.debug('response text: ' + _response.text) 128 | if _response.status_code != 200: 129 | return False 130 | return _response.json() 131 | 132 | 133 | def post_validate(session, challenge, validate, keyword): 134 | ''' 135 | POST /aiccips/verify/sec.html 136 | Response JSON 137 | { 138 | "status": "success", 139 | "textfield": "waY5F5lZyxvKw9bMM4nBs7HUgWS1SRpagFutRKqs/+DkRqCIS9N4PUCqM9fmrbg1", 140 | "version": "3.3.0" 141 | } 142 | ''' 143 | _url = INDEX + '/aiccips/verify/sec.html' 144 | logging.debug('POST ' + _url) 145 | _headers = {'Accept': constants.ACCEPT_JSON, 146 | 'Accept-Language': constants.ACCEPT_LANGUAGE, 147 | 'User-Agent': constants.USER_AGENT, 148 | 'Referer': INDEX, 149 | 'X-Requested-With': 'XMLHttpRequest', 150 | 'Origin': HOST} 151 | _params = [('textfield', keyword), 152 | ('geetest_challenge', challenge), 153 | ('geetest_validate', validate), 154 | ('geetest_seccode', validate + '|jordan')] 155 | _response = session.post(_url, headers=_headers, data=_params, timeout=TIMEOUT) 156 | logging.debug('response code: ' + str(_response.status_code)) 157 | logging.debug('response text: ' + _response.text) 158 | if _response.status_code != 200: 159 | return False 160 | _json_obj = _response.json() 161 | logging.debug(_json_obj) 162 | return _json_obj['textfield'] if _json_obj['status'] == 'success' else None 163 | 164 | 165 | def post_search(session, textfield): 166 | ''' 167 | POST /aiccips/CheckEntContext/showCheck.html 168 | Response HTML WebPage 169 | ''' 170 | _url = INDEX + '/aiccips/CheckEntContext/showCheck.html' 171 | logging.debug('POST ' + _url) 172 | _headers = {'Accept': constants.ACCEPT_HTML, 173 | 'Accept-Language': constants.ACCEPT_LANGUAGE, 174 | 'User-Agent': constants.USER_AGENT, 175 | 'Referer': INDEX, 176 | 'X-Requested-With': 'XMLHttpRequest', 177 | 'Origin': HOST} 178 | _params = [('textfield', textfield), 179 | ('type', 'nomal')] 180 | _response = session.post(_url, headers=_headers, data=_params, timeout=TIMEOUT) 181 | logging.debug('response code: ' + str(_response.status_code)) 182 | logging.debug('response text: ' + _response.text) 183 | if _response.status_code != 200: 184 | return None 185 | return parse_name_url(_response.text) 186 | 187 | 188 | def get_validate(session, keyword): 189 | '''Retry the captcha + validate handshake until it succeeds.''' 190 | for _ in range(10): 191 | captcha = get_captcha(session) 192 | if not captcha: 193 | return None 194 | 195 | validate = calc_validate(captcha['challenge']) 196 | textfield = post_validate(session, captcha['challenge'], validate, keyword) 197 | if textfield: 198 | return textfield 199 | return None 200 | 201 | 202 | def parse_detail_sz(html_doc): 203 | '''parse company detail for shenzhen''' 204 | _soup = BeautifulSoup(html_doc, 'html.parser') 205 | _yyzz = _soup.find('div', class_='item_box', id='yyzz') 206 | if not _yyzz: 207 | logging.error('Detail yyzz Not Found') 208 | return None 209 | 210 | _li_all = _yyzz.find_all('li') 211 | if not _li_all: 212 | logging.error("Detail li Not Found") 213 | return None 214 | 215 | _info = {} 216 | for _li in _li_all: 217 | _text = ''.join(_li.get_text().split()) 218 | _k, _v = _text.split(sep=':', maxsplit=1) 219 | _info[_k] = _v 220 | logging.info(_info) 221 | if not _info.get('企业名称'): 222 | _info = None # guard against a malformed page 223 | return _info 224 |
225 | 226 | def parse_detail(html_doc): 227 | '''parse company detail for guangzhou and other''' 228 | _soup = BeautifulSoup(html_doc, 'html.parser') 229 | _table = _soup.find('table', cellspacing='6') 230 | if not _table: 231 | logging.error('Detail table Not Found') 232 | return None 233 | 234 | _tr_all = _table.find_all('td') 235 | if not _tr_all: 236 | logging.error("Detail td Not Found") 237 | return None 238 | 239 | _info = {} 240 | for _td in _tr_all: 241 | _text = ''.join(_td.get_text().split()) 242 | if _text == '营业执照信息': 243 | continue 244 | _k, _v = _text.split(sep=':', maxsplit=1) 245 | _temp = {} 246 | _temp[_k] = _v 247 | for _k2, _v2 in _temp.items(): 248 | if _k2 == '.企业名称' or _k2 == '.名称': 249 | _info['企业名称'] = _v2 250 | elif _k2 == '.统一社会信用代码/注册号' or _k2 == '.注册号': 251 | _info['注册号/统一社会信用代码'] = _v2 252 | elif _k2 == '.类型': 253 | _info['类型'] = _v2 254 | elif _k2 == '.负责人' or _k2 == '.经营者': 255 | _info['法定代表人'] = _v2 256 | elif _k2 == '.成立日期' or _k2 == '.注册日期': 257 | _info['成立日期'] = _v2 258 | elif _k2 == '.营业期限自': 259 | _info['营业期限自'] = _v2 260 | elif _k2 == '.营业期限至': 261 | _info['营业期限至'] = _v2 262 | elif _k2 == '.登记机关': 263 | _info['登记机关'] = _v2 264 | elif _k2 == '.核准日期': 265 | _info['核准日期'] = _v2 266 | elif _k2 == '.登记状态': 267 | _info['登记状态'] = _v2 268 | elif _k2 == '.营业场所' or _k2 == '.经营场所': 269 | _info['住所'] = _v2 270 | elif _k2 == '.经营范围': 271 | _info['经营范围'] = _v2 272 | _info['注册资本'] = '0' 273 | logging.info(_info) 274 | if not _info.get('企业名称'): 275 | _info = None # guard against a malformed page 276 | return _info 277 | 278 | 279 | def query_keyword(session, keyword): 280 | '''query keyword''' 281 | #if not get_mainpage(session): 282 | # return None 283 | logging.info(keyword) 284 | textfield = get_validate(session, keyword) 285 | if textfield: 286 | return post_search(session, textfield) 287 | return None 288 | 289 | 290 | def safe_query_keyword(keyword): 291 | '''Safe query keyword, handle network timeout and retry''' 292 | for _ in range(5): 293 | try: 294 | with requests.Session() as session: 295 | return query_keyword(session, keyword) 296 | except requests.RequestException as _e: 297 | logging.error(_e) 298 | time.sleep(5) 299 | return None 300 | 301 | 302 | def query_detail(session, url): 303 | '''query company detail url''' 304 | logging.debug('GET ' + url) 305 | _headers = {'Accept': constants.ACCEPT_HTML, 306 | 'Accept-Language': constants.ACCEPT_LANGUAGE, 307 | 'User-Agent': constants.USER_AGENT} 308 | _response = session.get(url, headers=_headers, timeout=TIMEOUT) 309 | logging.debug('response code:' + str(_response.status_code)) 310 | if _response.status_code == 200: 311 | if url.find('www.szcredit.org.cn') != -1: 312 | return parse_detail_sz(_response.text) 313 | elif url.find('GSpublicityList.html') != -1: 314 | return parse_detail(_response.text) 315 | else: 316 | logging.error('URL type not supported') 317 | return None 318 | 319 | 320 | def safe_query_detail(url): 321 | '''Safe query url, handle network timeout and retry multi times.''' 322 | for _ in range(5): 323 | try: 324 | with requests.Session() as session: 325 | return query_detail(session, url) 326 | except requests.RequestException as _e: 327 | logging.error(_e) 328 | time.sleep(5) 329 | return None 330 | 331 | 332 | def query_entry(): 333 | '''main entry''' 334 | lists = load_json(GD_LIST_FILE) 335 | if not lists: 336 | lists = [] 337 | results = load_json(GD_RESULT_FILE) 338 | if not results: 339 | results = {} 340 | notfound = load_json(GD_NOTFOUND_FILE) 341 | if not notfound: 342 | notfound = [] 343 | 344 | for keyword in lists: 345 | if keyword in results: 346 | continue 347 | if keyword in notfound: 348 | continue 349 | name_url_array = safe_query_keyword(keyword) 350 | if not name_url_array: 351 | notfound.append(keyword) 352 | continue 353 | for name, url in name_url_array: 354 | if name in results: 355 | continue 356 | detail_dict = safe_query_detail(url) 357 | if detail_dict: 358 | results.update({name : detail_dict}) 359 | save_json(GD_RESULT_FILE, results) 360 | save_json(GD_NOTFOUND_FILE, notfound) 361 | logging.info('done') 362 | 363 | 364 | if __name__ == "__main__": 365 | query_entry() 366 |
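367 | # ------------------------------------------------------------------ 368 | # Illustrative sketch, not part of the original script: safe_query_keyword 369 | # retries with a fixed 5-second wait. Given the README's gripe that these 370 | # sites are deliberately slow, an exponential-backoff variant (an assumed 371 | # pattern, not the project's actual code) may be gentler on the server: 372 | def safe_query_keyword_backoff(keyword, retries=5, base_delay=5): 373 |     '''Like safe_query_keyword, but sleeps 5, 10, 20, ... seconds between tries.''' 374 |     for _attempt in range(retries): 375 |         try: 376 |             with requests.Session() as session: 377 |                 return query_keyword(session, keyword) 378 |         except requests.RequestException as _e: 379 |             logging.error(_e) 380 |             time.sleep(base_delay * (2 ** _attempt)) 381 |     return None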
-------------------------------------------------------------------------------- /geetest_offline/geetest_offline_nm.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | # -*- coding: utf-8 -*- 3 | ''' 4 | geetest offline 5.9.0 - 6.0.0 spider for nm.gsxt.gov.cn 5 | The HTTP protocol differs slightly from the other gsxt sites. 6 | ''' 7 | 8 | import time 9 | import random 10 | import logging 11 | from logging import NullHandler 12 | import requests 13 | import execjs 14 | from bs4 import BeautifulSoup 15 | import constants 16 | import util 17 | 18 | logging.getLogger(__name__).addHandler(NullHandler()) 19 | logging.basicConfig(level=logging.DEBUG) 20 | 21 | HOST = '' 22 | INDEX = '' 23 | 24 | JSRUNTIME = execjs.get(execjs.runtime_names.Node) 25 | 26 | CAPTCHA_JSON = {} 27 | 28 | USERRESPONSE_JSCONTEXT = JSRUNTIME.compile(util.USERRESPONSE_JS) 29 | 30 | TIMEOUT = 10 31 | 32 | GSXT_HOST_NM = 'http://nm.gsxt.gov.cn:58888' 33 | GSXT_INDEX_NM = GSXT_HOST_NM + '/' 34 | 35 | def config(host, index): 36 | '''Set the host and index URLs.''' 37 | global HOST, INDEX 38 | HOST, INDEX = host, index 39 | 40 | 41 | def calc_userresponse(distance, challenge): 42 | '''Compute the userresponse value from the slide distance and the challenge.''' 43 | return USERRESPONSE_JSCONTEXT.call('userresponse', distance, challenge) 44 | 45 | 46 | def calc_validate(challenge): 47 | '''Compute the validate value.''' 48 | _r = random.randint(0, len(util.OFFLINE_SAMPLE)-1) 49 | distance, rand0, rand1 = util.OFFLINE_SAMPLE[_r] 50 | distance_r = calc_userresponse(distance, challenge) 51 | rand0_r = calc_userresponse(rand0, challenge) 52 | rand1_r = calc_userresponse(rand1, challenge) 53 | validate = distance_r + '_' + rand0_r + '_' + rand1_r 54 | logging.debug(validate) 55 | return validate 56 | 57 | 58 | def parse_code(html_doc): 59 | '''Parse the HTML page with BeautifulSoup, extracting the credit codes and the total result count (for paging).''' 60 | _soup = BeautifulSoup(html_doc, 'html.parser') 61 | # find result number 62 | _span = _soup.find('span', attrs={'style': 'color: red'}) 63 | _number = int(''.join(_span.get_text().split())) if _span else 0 64 | logging.debug('page number = ' + str(_number)) 65 | if not _number: 66 | logging.error('Number Not Found') 67 | return None, 0 68 | 69 | _div_all = _soup.find_all('div', class_='clickStyle', attrs={'onclick': 'details(this)'}) 70 | _result = [] 71 | if _div_all: 72 | for _div in _div_all: 73 | _a = _div.find('a', class_='font16', attrs={'target': '_blank'}) 74 | _a_str = _a.get_text() 75 | _td = _div.find('td', attrs={'style': 'width: 35%'}) 76 | _span = _td.find('span', class_='dataTextStyle') 77 | _span_str = ''.join(_span.get_text().split()) 78 | _result.append((_a_str.encode('utf-8'), _span_str.encode('utf-8'))) 79 | else: 80 | logging.info('Code Not Found') 81 | logging.info(html_doc) 82 | return _result, _number 83 | 84 | 85 | def get_main(session): 86 | '''GET the gsxt index page.''' 87 | _url = INDEX 88 | logging.debug('GET ' + _url) 89 | _headers = {'Accept':
更新session.token''' 184 | if not get_main(session): 185 | return None 186 | 187 | textfield = get_validate(session, keyword) 188 | if not textfield: 189 | return None 190 | 191 | _code_all = [] 192 | _number = 50 # max result number 193 | _page = 0 # start page number 194 | while _page * 10 < _number: 195 | _page += 1 196 | _code, _number = post_search(session, textfield, _page) 197 | if _code: 198 | _code_all.extend(_code) 199 | else: 200 | break 201 | 202 | return _code_all 203 | 204 | 205 | def query_leveldb(query_db, save_db, queryed_db): 206 | '''query by leveldb''' 207 | try: 208 | with requests.Session() as session: 209 | for _name, _code in query_db.RangeIter(): 210 | if not util.has_key(save_db, _name) and not util.has_key(queryed_db, _name): 211 | # 模糊查询 212 | _subname = _name[0: 18] if len(_name) > 18 else _name 213 | logging.info(_name + ' -> ' + _subname) 214 | _code_all = query_keyword(session, _subname) 215 | if _code_all: 216 | for _c in _code_all: 217 | logging.info(_c[0] + ' : ' + _c[1]) 218 | save_db.Put(_c[0], _c[1], sync=True) 219 | queryed_db.Put(_name, '', sync=True) 220 | return True 221 | except requests.RequestException as _e: 222 | logging.error(_e) 223 | return False 224 | 225 | 226 | def query_keyword_helper(keyword): 227 | '''根据keyword查询一次''' 228 | try: 229 | with requests.Session() as session: 230 | _code_all = query_keyword(session, keyword) 231 | if _code_all: 232 | logging.info(len(_code_all)) 233 | for _r in _code_all: 234 | logging.info(_r[0].decode() + ' : ' + _r[1].decode()) 235 | except requests.RequestException as _e: 236 | logging.error(_e) 237 | 238 | 239 | if __name__ == "__main__": 240 | config(GSXT_HOST_NM, GSXT_INDEX_NM) 241 | query_keyword_helper('百度') 242 | -------------------------------------------------------------------------------- /geetest_offline/util.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | # -*- coding: utf-8 -*- 3 | ''' 4 | geetest常用公共方法 5 | ''' 6 | 7 | SPLIT_ARRAY_JS = ''' 8 | function getSplitArray() { 9 | for (var a, b = "6_11_7_10_4_12_3_1_0_5_2_9_8".split("_"), c = [], d = 0, e = 52; d < e; d++) 10 | a = 2 * parseInt(b[parseInt(d % 26 / 2)]) + d % 2, 11 | parseInt(d / 2) % 2 || (a += d % 2 ? -1 : 1), 12 | a += d < 26 ? 26 : 0, 13 | c.push(a); 14 | return c 15 | } 16 | ''' 17 | 18 | USERRESPONSE_JS = ''' 19 | function userresponse(a, b) { 20 | for (var c = b.slice(32), d = [], e = 0; e < c.length; e++) { 21 | var f = c.charCodeAt(e); 22 | d[e] = f > 57 ? f - 87 : f - 48 23 | } 24 | c = 36 * d[0] + d[1]; 25 | var g = Math.round(a) + c; b = b.slice(0, 32); 26 | var h, i = [ [], [], [], [], [] ], j = {}, k = 0; e = 0; 27 | for (var l = b.length; e < l; e++) 28 | h = b.charAt(e), j[h] || (j[h] = 1, i[k].push(h), k++, k = 5 == k ? 0 : k); 29 | for (var m, n = g, o = 4, p = "", q = [1, 2, 5, 10, 50]; n > 0;) 30 | n - q[o] >= 0 ? 
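243 | # ------------------------------------------------------------------ 244 | # Illustrative sketch, not part of the original script: query_keyword pages 245 | # through results with `while _page * 10 < _number`. The helper below makes 246 | # the arithmetic explicit; the page size of 10 is what that loop assumes 247 | # about the site. 248 | def iter_page_numbers(total, page_size=10): 249 |     '''Yield the 1-based page numbers needed to cover total results, e.g. total=35 -> 1, 2, 3, 4.''' 250 |     _page = 0 251 |     while _page * page_size < total: 252 |         _page += 1 253 |         yield _page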
-------------------------------------------------------------------------------- /geetest_offline/util.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | # -*- coding: utf-8 -*- 3 | ''' 4 | Common helper functions shared by the geetest spiders 5 | ''' 6 | 7 | SPLIT_ARRAY_JS = ''' 8 | function getSplitArray() { 9 | for (var a, b = "6_11_7_10_4_12_3_1_0_5_2_9_8".split("_"), c = [], d = 0, e = 52; d < e; d++) 10 | a = 2 * parseInt(b[parseInt(d % 26 / 2)]) + d % 2, 11 | parseInt(d / 2) % 2 || (a += d % 2 ? -1 : 1), 12 | a += d < 26 ? 26 : 0, 13 | c.push(a); 14 | return c 15 | } 16 | ''' 17 | 18 | USERRESPONSE_JS = ''' 19 | function userresponse(a, b) { 20 | for (var c = b.slice(32), d = [], e = 0; e < c.length; e++) { 21 | var f = c.charCodeAt(e); 22 | d[e] = f > 57 ? f - 87 : f - 48 23 | } 24 | c = 36 * d[0] + d[1]; 25 | var g = Math.round(a) + c; b = b.slice(0, 32); 26 | var h, i = [ [], [], [], [], [] ], j = {}, k = 0; e = 0; 27 | for (var l = b.length; e < l; e++) 28 | h = b.charAt(e), j[h] || (j[h] = 1, i[k].push(h), k++, k = 5 == k ? 0 : k); 29 | for (var m, n = g, o = 4, p = "", q = [1, 2, 5, 10, 50]; n > 0;) 30 | n - q[o] >= 0 ? (m = parseInt(Math.random() * i[o].length, 10), p += i[o][m], n -= q[o]) : (i.splice(o, 1), q.splice(o, 1), o -= 1); 31 | return p 32 | } 33 | ''' 34 | 35 | OFFLINE_SAMPLE = ((186, 1, 98), 36 | (82, 0, 136), 37 | (61, 5, 108), 38 | (128, 2, 7), 39 | (130, 4, 99), 40 | (189, 3, 65), 41 | (108, 5, 285), 42 | (136, 0, 36), 43 | (41, 0, 263), 44 | (124, 3, 185)) 45 | 46 | 47 | TRACE_JS = ''' 48 | var tracer = function () { 49 | c = function (traceArray) { 50 | for (var b, c, d, e = [], f = 0, g = [], h = 0, i = traceArray.length - 1; h < i; h++) { 51 | b = Math.round(traceArray[h + 1][0] - traceArray[h][0]), 52 | c = Math.round(traceArray[h + 1][1] - traceArray[h][1]), 53 | d = Math.round(traceArray[h + 1][2] - traceArray[h][2]), 54 | g.push([b, c, d]), 0 == b && 0 == c && 0 == d || (0 == b && 0 == c ? f += d : (e.push([b, c, d + f]), f = 0)); 55 | } 56 | return 0 !== f && e.push([b, c, f]), e 57 | }, 58 | d = function (a) { 59 | var b = "()*,-./0123456789:?@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqr", 60 | c = b.length, 61 | d = "", 62 | e = Math.abs(a), 63 | f = parseInt(e / c); 64 | f >= c && (f = c - 1), f && (d = b.charAt(f)), e %= c; 65 | var g = ""; 66 | return a < 0 && (g += "!"), d && (g += "$"), g + d + b.charAt(e) 67 | }, 68 | e = function (a) { 69 | for (var b = [ 70 | [1, 0], 71 | [2, 0], 72 | [1, -1], 73 | [1, 1], 74 | [0, 1], 75 | [0, -1], 76 | [3, 0], 77 | [2, -1], 78 | [2, 1] 79 | ], c = "stuvwxyz~", d = 0, e = b.length; d < e; d++) 80 | if (a[0] == b[d][0] && a[1] == b[d][1]) return c[d]; 81 | return 0 82 | }, 83 | f = function (traceArray) { 84 | for (var b, f = c(traceArray), g = [], h = [], i = [], j = 0, k = f.length; j < k; j++) { 85 | b = e(f[j]), b ? h.push(b) : (g.push(d(f[j][0])), h.push(d(f[j][1]))), i.push(d(f[j][2])); 86 | } 87 | return g.join("") + "!!" + h.join("") + "!!" + i.join("") 88 | }, 89 | g = function (traceArray) { 90 | var a = f(traceArray); 91 | return encodeURIComponent(a) 92 | }; 93 | return { 94 | trace: g 95 | } 96 | }(); 97 | exports.tracer = tracer; 98 | ''' 99 | 100 | def has_key(database, key): 101 | '''Safely check whether a key exists in the leveldb database.''' 102 | try: 103 | database.Get(key) 104 | return True 105 | except KeyError: 106 | return False 107 |
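108 | # ------------------------------------------------------------------ 109 | # Illustrative sketch, not part of the original module: SPLIT_ARRAY_JS above 110 | # can be ported to Python statement-for-statement, dropping the Node 111 | # round-trip. This is an unverified sketch mirroring the JS expressions. 112 | def get_split_array_py(): 113 |     '''Pure-Python port of SPLIT_ARRAY_JS (sketch).''' 114 |     _b = [int(_x) for _x in '6_11_7_10_4_12_3_1_0_5_2_9_8'.split('_')] 115 |     _c = [] 116 |     for _d in range(52): 117 |         _a = 2 * _b[(_d % 26) // 2] + _d % 2 118 |         if (_d // 2) % 2 == 0: 119 |             _a += -1 if _d % 2 else 1 120 |         if _d < 26: 121 |             _a += 26 122 |         _c.append(_a) 123 |     return _c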
-------------------------------------------------------------------------------- /geetest_online/README.md: -------------------------------------------------------------------------------- 1 | ### Cracking the GeeTest slider captcha in online mode 2 | 3 | Again using the [National Enterprise Credit Information Publicity System](http://www.gsxt.gov.cn) as the example, this adds a project and documentation that are roughly 80% complete. The implementation mainly follows the analysis articles at [https://zhuanlan.zhihu.com/windev](https://zhuanlan.zhihu.com/windev). 4 | 5 | ### Implemented features 6 | 7 | #### 1. The full set of HTTP request & response protocols 8 | 9 | Implemented with the `requests` library. 10 | 11 | #### 2. Captcha image reassembly and recognition 12 | 13 | The `Pillow` library is used to locate the slider puzzle gap precisely. 14 | The global `IMAGE_DEBUG` flag stores intermediate images of varying precision as local temp files, so the localization quality can be inspected and improved. 15 | 16 | #### 3. Cracking the GeeTest JavaScript encryption/decryption 17 | 18 | The `PyExecJS` library executes the GeeTest JavaScript functions to obtain the correct plaintext and ciphertext. 19 | Works best together with `NodeJS`. 20 | 21 | #### 4. Web page parsing with the `BeautifulSoup4` library 22 | 23 | ### The unfinished 20% 24 | 25 | + Polish the data-simulation algorithm for user mouse-drag traces. 26 | + Complete the handling of the site's anti-crawler HTTP 521 responses and the cookie-validation logic. 27 | 28 | ### Python Dependencies 29 | 30 | ```bash 31 | pip install requests 32 | pip install Pillow 33 | pip install PyExecJS 34 | pip install beautifulsoup4 35 | ``` 36 |
-------------------------------------------------------------------------------- /geetest_online/constants.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | # -*- coding: utf-8 -*- 3 | ''' 4 | Common HTTP request constants 5 | ''' 6 | 7 | ACCEPT_ANY = '*/*' 8 | 9 | ACCEPT_TEXT = 'text/plain, */*; q=0.01' 10 | 11 | ACCEPT_HTML = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' 12 | 13 | ACCEPT_JSON = 'application/json, text/javascript, */*; q=0.01' 14 | 15 | ACCEPT_IMAGE = 'image/webp,image/*,*/*;q=0.8' 16 | 17 | ACCEPT_LANGUAGE = 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2' 18 | 19 | UA_CHROME_WIN = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36' 20 | 21 | UA_CHROME_MAC = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36' 22 | 23 | USER_AGENT = UA_CHROME_MAC 24 |
-------------------------------------------------------------------------------- /geetest_online/image/bg.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/geetest_online/image/bg.jpg -------------------------------------------------------------------------------- /geetest_online/image/bg.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/geetest_online/image/bg.webp -------------------------------------------------------------------------------- /geetest_online/image/fullbg.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/geetest_online/image/fullbg.jpg -------------------------------------------------------------------------------- /geetest_online/image/fullbg.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/geetest_online/image/fullbg.webp -------------------------------------------------------------------------------- /geetest_online/image/slice.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/geetest_online/image/slice.webp -------------------------------------------------------------------------------- /geetest_online/test/TraceSample01.txt: -------------------------------------------------------------------------------- 1 | [ 2 | [-20, -27, 0], 3 | [0, 0, 0], 4 | [0, 0, 1], 5 | [3, 0, 376], 6 | [6, 0, 385], 7 | [11, 0, 392], 8 | [18, 0, 400], 9 | [25, 0, 408], 10 | [30, 0, 416], 11 | [35, 0, 425], 12 | [41, 0, 432], 13 | [44, 0, 440], 14 | [48, 0, 448], 15 | [51, 0, 457], 16 | [53, 0, 464], 17 | [54, 0, 472], 18 | [56, 0, 480], 19 | [58, 0, 488], 20 | [60, 0, 496], 21 | [61, 0, 512], 22 | [62, 0, 520], 23 | [64, 0, 528], 24 | [65, 0, 536], 25 | [67,
0, 544], 26 | [68, 0, 552], 27 | [70, 0, 560], 28 | [71, 0, 568], 29 | [73, 0, 584], 30 | [74, 0, 600], 31 | [76, 0, 624], 32 | [77, 0, 632], 33 | [78, 0, 648], 34 | [79, 0, 656], 35 | [80, 0, 672], 36 | [81, 0, 680], 37 | [83, 0, 688], 38 | [84, 0, 704], 39 | [85, 0, 712], 40 | [86, 0, 720], 41 | [87, 0, 728], 42 | [88, 0, 737], 43 | [89, 0, 744], 44 | [90, 0, 760], 45 | [91, 0, 769], 46 | [92, 0, 776], 47 | [93, 0, 800], 48 | [94, 0, 824], 49 | [95, 0, 832], 50 | [96, 0, 856], 51 | [97, 0, 864], 52 | [98, 0, 880], 53 | [99, 0, 920], 54 | [100, 0, 928], 55 | [101, 0, 936], 56 | [102, 0, 944], 57 | [103, 0, 952], 58 | [104, 0, 992], 59 | [105, 0, 1040], 60 | [106, 0, 1056], 61 | [107, 0, 1128], 62 | [108, 0, 1192], 63 | [109, 0, 1457], 64 | [110, 0, 1473], 65 | [110, -1, 1529], 66 | [110, -1, 3521] 67 | ] -------------------------------------------------------------------------------- /geetest_online/test/TraceSample01Parse.txt: -------------------------------------------------------------------------------- 1 | distance = 110 2 | (0, 1) 3 | (3, 375) 4 | (3, 9) 5 | (5, 7) 6 | (7, 8) 7 | (7, 8) 8 | (5, 8) 9 | (5, 9) 10 | (6, 7) 11 | (3, 8) 12 | (4, 8) 13 | (3, 9) 14 | (2, 7) 15 | (1, 8) 16 | (2, 8) 17 | (2, 8) 18 | (2, 8) 19 | (1, 16) 20 | (1, 8) 21 | (2, 8) 22 | (1, 8) 23 | (2, 8) 24 | (1, 8) 25 | (2, 8) 26 | (1, 8) 27 | (2, 16) 28 | (1, 16) 29 | (2, 24) 30 | (1, 8) 31 | (1, 16) 32 | (1, 8) 33 | (1, 16) 34 | (1, 8) 35 | (2, 8) 36 | (1, 16) 37 | (1, 8) 38 | (1, 8) 39 | (1, 8) 40 | (1, 9) 41 | (1, 7) 42 | (1, 16) 43 | (1, 9) 44 | (1, 7) 45 | (1, 24) 46 | (1, 24) 47 | (1, 8) 48 | (1, 24) 49 | (1, 8) 50 | (1, 16) 51 | (1, 40) 52 | (1, 8) 53 | (1, 8) 54 | (1, 8) 55 | (1, 8) 56 | (1, 40) 57 | (1, 48) 58 | (1, 16) 59 | (1, 72) 60 | (1, 64) 61 | (1, 265) 62 | (1, 16) 63 | (0, 56) 64 | (0, 1992) 65 | -------------------------------------------------------------------------------- /geetest_online/test/TraceSample02.txt: -------------------------------------------------------------------------------- 1 | [ 2 | [-21, -24, 0], 3 | [0, 0, 0], 4 | [0, 0, 4], 5 | [1, 0, 504], 6 | [4, 0, 520], 7 | [5, 0, 529], 8 | [6, 0, 537], 9 | [8, 0, 545], 10 | [11, 0, 552], 11 | [13, 0, 560], 12 | [15, 0, 568], 13 | [19, 0, 576], 14 | [22, 0, 584], 15 | [26, 0, 592], 16 | [31, 1, 600], 17 | [37, 3, 608], 18 | [40, 3, 616], 19 | [44, 4, 624], 20 | [46, 4, 632], 21 | [48, 4, 640], 22 | [52, 4, 648], 23 | [54, 4, 656], 24 | [55, 4, 664], 25 | [58, 4, 673], 26 | [61, 4, 680], 27 | [64, 4, 688], 28 | [66, 4, 696], 29 | [69, 4, 706], 30 | [74, 4, 712], 31 | [76, 4, 720], 32 | [79, 4, 729], 33 | [81, 4, 736], 34 | [82, 4, 745], 35 | [84, 4, 752], 36 | [86, 4, 760], 37 | [87, 4, 768], 38 | [88, 4, 776], 39 | [89, 4, 784], 40 | [91, 4, 792], 41 | [92, 4, 799], 42 | [95, 4, 808], 43 | [96, 4, 815], 44 | [99, 4, 824], 45 | [101, 4, 832], 46 | [103, 4, 840], 47 | [106, 4, 847], 48 | [107, 4, 856], 49 | [109, 4, 863], 50 | [110, 4, 872], 51 | [112, 4, 879], 52 | [113, 4, 896], 53 | [115, 4, 904], 54 | [116, 4, 911], 55 | [117, 4, 920], 56 | [119, 4, 927], 57 | [120, 4, 936], 58 | [121, 4, 944], 59 | [122, 4, 952], 60 | [123, 4, 968], 61 | [124, 4, 976], 62 | [125, 4, 983], 63 | [126, 4, 992], 64 | [127, 4, 999], 65 | [129, 4, 1008], 66 | [131, 4, 1024], 67 | [132, 4, 1032], 68 | [134, 4, 1040], 69 | [135, 4, 1055], 70 | [136, 4, 1063], 71 | [137, 4, 1072], 72 | [138, 4, 1095], 73 | [139, 4, 1104], 74 | [140, 4, 1111], 75 | [141, 4, 1120], 76 | [142, 4, 1127], 77 | [143, 4, 1136], 78 | [144, 4, 1168], 79 | [145, 4, 1184], 80 | 
[146, 4, 1200], 81 | [147, 4, 1215], 82 | [148, 4, 1232], 83 | [149, 4, 1239], 84 | [150, 4, 1256], 85 | [151, 4, 1296], 86 | [152, 4, 1312], 87 | [153, 4, 1320], 88 | [154, 4, 1336], 89 | [155, 4, 1368], 90 | [156, 4, 1376], 91 | [157, 4, 1424], 92 | [158, 4, 1440], 93 | [159, 4, 1464], 94 | [160, 4, 1488], 95 | [161, 4, 1520], 96 | [162, 4, 1552], 97 | [163, 3, 1600], 98 | [164, 3, 1616], 99 | [165, 3, 1640], 100 | [166, 3, 1712], 101 | [167, 3, 1736], 102 | [168, 3, 1760], 103 | [169, 3, 1872], 104 | [170, 3, 1905], 105 | [171, 3, 1952], 106 | [172, 1, 2072], 107 | [173, 1, 2280], 108 | [173, 1, 3072] 109 | ] -------------------------------------------------------------------------------- /geetest_online/test/TraceSample02Parse.txt: -------------------------------------------------------------------------------- 1 | distance = 173 2 | (0, 4) 3 | (1, 500) 4 | (3, 16) 5 | (1, 9) 6 | (1, 8) 7 | (2, 8) 8 | (3, 7) 9 | (2, 8) 10 | (2, 8) 11 | (4, 8) 12 | (3, 8) 13 | (4, 8) 14 | (5, 8) 15 | (6, 8) 16 | (3, 8) 17 | (4, 8) 18 | (2, 8) 19 | (2, 8) 20 | (4, 8) 21 | (2, 8) 22 | (1, 8) 23 | (3, 9) 24 | (3, 7) 25 | (3, 8) 26 | (2, 8) 27 | (3, 10) 28 | (5, 6) 29 | (2, 8) 30 | (3, 9) 31 | (2, 7) 32 | (1, 9) 33 | (2, 7) 34 | (2, 8) 35 | (1, 8) 36 | (1, 8) 37 | (1, 8) 38 | (2, 8) 39 | (1, 7) 40 | (3, 9) 41 | (1, 7) 42 | (3, 9) 43 | (2, 8) 44 | (2, 8) 45 | (3, 7) 46 | (1, 9) 47 | (2, 7) 48 | (1, 9) 49 | (2, 7) 50 | (1, 17) 51 | (2, 8) 52 | (1, 7) 53 | (1, 9) 54 | (2, 7) 55 | (1, 9) 56 | (1, 8) 57 | (1, 8) 58 | (1, 16) 59 | (1, 8) 60 | (1, 7) 61 | (1, 9) 62 | (1, 7) 63 | (2, 9) 64 | (2, 16) 65 | (1, 8) 66 | (2, 8) 67 | (1, 15) 68 | (1, 8) 69 | (1, 9) 70 | (1, 23) 71 | (1, 9) 72 | (1, 7) 73 | (1, 9) 74 | (1, 7) 75 | (1, 9) 76 | (1, 32) 77 | (1, 16) 78 | (1, 16) 79 | (1, 15) 80 | (1, 17) 81 | (1, 7) 82 | (1, 17) 83 | (1, 40) 84 | (1, 16) 85 | (1, 8) 86 | (1, 16) 87 | (1, 32) 88 | (1, 8) 89 | (1, 48) 90 | (1, 16) 91 | (1, 24) 92 | (1, 24) 93 | (1, 32) 94 | (1, 32) 95 | (1, 48) 96 | (1, 16) 97 | (1, 24) 98 | (1, 72) 99 | (1, 24) 100 | (1, 24) 101 | (1, 112) 102 | (1, 33) 103 | (1, 47) 104 | (1, 120) 105 | (1, 208) 106 | (0, 792) 107 | -------------------------------------------------------------------------------- /geetest_online/test/TraceSample03.txt: -------------------------------------------------------------------------------- 1 | [ 2 | [-18, -24, 0], 3 | [0, 0, 0], 4 | [1, 0, 216], 5 | [2, 0, 224], 6 | [3, 0, 312], 7 | [4, 0, 328], 8 | [5, 0, 336], 9 | [7, 0, 352], 10 | [8, 0, 360], 11 | [10, 0, 368], 12 | [12, 0, 376], 13 | [14, 0, 385], 14 | [17, 0, 392], 15 | [19, 0, 400], 16 | [20, 0, 408], 17 | [23, 0, 416], 18 | [25, 0, 424], 19 | [27, 0, 433], 20 | [28, 0, 441], 21 | [30, 0, 449], 22 | [32, 0, 457], 23 | [33, 0, 466], 24 | [34, 0, 473], 25 | [35, 0, 480], 26 | [37, 0, 489], 27 | [38, 0, 496], 28 | [41, 0, 504], 29 | [43, 0, 513], 30 | [44, 0, 519], 31 | [45, 0, 528], 32 | [47, 0, 535], 33 | [48, 0, 544], 34 | [49, 0, 560], 35 | [50, 0, 576], 36 | [51, 0, 583], 37 | [52, 0, 592], 38 | [54, 0, 599], 39 | [56, 0, 616], 40 | [57, 0, 624], 41 | [58, 0, 632], 42 | [59, 0, 649], 43 | [60, 0, 672], 44 | [62, 0, 688], 45 | [63, 0, 712], 46 | [65, 0, 752], 47 | [66, 0, 784], 48 | [68, 0, 808], 49 | [69, 0, 888], 50 | [71, 0, 920], 51 | [72, 0, 984], 52 | [72, -1, 1024], 53 | [74, -1, 1080], 54 | [75, -1, 1232], 55 | [75, -2, 1247], 56 | [77, -2, 1352], 57 | [78, -2, 1432], 58 | [78, -2, 2136] 59 | ] -------------------------------------------------------------------------------- 
/geetest_online/test/TraceSample03Parse.txt: -------------------------------------------------------------------------------- 1 | distance = 78 2 | (1, 216) 3 | (1, 8) 4 | (1, 88) 5 | (1, 16) 6 | (1, 8) 7 | (2, 16) 8 | (1, 8) 9 | (2, 8) 10 | (2, 8) 11 | (2, 9) 12 | (3, 7) 13 | (2, 8) 14 | (1, 8) 15 | (3, 8) 16 | (2, 8) 17 | (2, 9) 18 | (1, 8) 19 | (2, 8) 20 | (2, 8) 21 | (1, 9) 22 | (1, 7) 23 | (1, 7) 24 | (2, 9) 25 | (1, 7) 26 | (3, 8) 27 | (2, 9) 28 | (1, 6) 29 | (1, 9) 30 | (2, 7) 31 | (1, 9) 32 | (1, 16) 33 | (1, 16) 34 | (1, 7) 35 | (1, 9) 36 | (2, 7) 37 | (2, 17) 38 | (1, 8) 39 | (1, 8) 40 | (1, 17) 41 | (1, 23) 42 | (2, 16) 43 | (1, 24) 44 | (2, 40) 45 | (1, 32) 46 | (2, 24) 47 | (1, 80) 48 | (2, 32) 49 | (1, 64) 50 | (0, 40) 51 | (2, 56) 52 | (1, 152) 53 | (0, 15) 54 | (2, 105) 55 | (1, 80) 56 | (0, 704) 57 | -------------------------------------------------------------------------------- /geetest_online/test/TraceSample04.txt: -------------------------------------------------------------------------------- 1 | [ 2 | [-23, -18, 0], 3 | [0, 0, 0], 4 | [1, 0, 223], 5 | [2, 0, 239], 6 | [3, 0, 247], 7 | [4, 0, 255], 8 | [5, 0, 263], 9 | [6, 0, 271], 10 | [8, 0, 279], 11 | [9, 0, 287], 12 | [11, 0, 295], 13 | [12, 0, 303], 14 | [13, 0, 311], 15 | [15, 0, 319], 16 | [16, 0, 327], 17 | [18, 0, 335], 18 | [20, 0, 351], 19 | [22, 0, 359], 20 | [25, 0, 367], 21 | [27, 0, 375], 22 | [29, 0, 383], 23 | [32, 0, 391], 24 | [34, 0, 399], 25 | [37, 0, 407], 26 | [40, 0, 415], 27 | [42, 0, 423], 28 | [44, 0, 431], 29 | [47, 0, 439], 30 | [48, 0, 447], 31 | [49, 0, 455], 32 | [51, 0, 463], 33 | [52, 0, 471], 34 | [54, 0, 479], 35 | [56, 0, 487], 36 | [59, 0, 495], 37 | [61, 0, 503], 38 | [63, 0, 511], 39 | [66, 0, 519], 40 | [69, 0, 527], 41 | [72, 0, 536], 42 | [74, 0, 543], 43 | [75, 0, 553], 44 | [76, 0, 559], 45 | [79, -1, 567], 46 | [81, -3, 575], 47 | [83, -3, 584], 48 | [85, -3, 591], 49 | [88, -3, 600], 50 | [91, -3, 607], 51 | [93, -3, 615], 52 | [96, -3, 623], 53 | [99, -3, 631], 54 | [100, -3, 638], 55 | [102, -3, 647], 56 | [105, -3, 655], 57 | [107, -3, 663], 58 | [110, -4, 671], 59 | [111, -4, 681], 60 | [114, -4, 687], 61 | [117, -4, 695], 62 | [120, -4, 703], 63 | [122, -4, 711], 64 | [124, -4, 719], 65 | [125, -4, 727], 66 | [127, -4, 735], 67 | [128, -4, 743], 68 | [129, -4, 752], 69 | [130, -4, 759], 70 | [132, -4, 767], 71 | [134, -4, 775], 72 | [136, -4, 783], 73 | [138, -4, 791], 74 | [141, -4, 799], 75 | [143, -4, 807], 76 | [145, -4, 816], 77 | [146, -4, 823], 78 | [148, -4, 832], 79 | [149, -4, 839], 80 | [150, -4, 847], 81 | [152, -5, 855], 82 | [153, -5, 871], 83 | [155, -5, 881], 84 | [156, -5, 887], 85 | [157, -5, 895], 86 | [159, -5, 903], 87 | [160, -5, 912], 88 | [161, -5, 919], 89 | [162, -5, 927], 90 | [163, -5, 935], 91 | [165, -5, 943], 92 | [166, -5, 952], 93 | [168, -5, 959], 94 | [169, -5, 967], 95 | [171, -5, 975], 96 | [173, -5, 983], 97 | [175, -5, 999], 98 | [176, -5, 1006], 99 | [178, -5, 1023], 100 | [179, -5, 1047], 101 | [179, -6, 1055], 102 | [180, -6, 1063], 103 | [182, -6, 1087], 104 | [184, -6, 1111], 105 | [185, -7, 1128], 106 | [187, -7, 1143], 107 | [188, -7, 1151], 108 | [190, -7, 1167], 109 | [191, -7, 1175], 110 | [193, -7, 1183], 111 | [194, -7, 1191], 112 | [195, -7, 1199], 113 | [196, -7, 1207], 114 | [197, -7, 1224], 115 | [198, -7, 1239], 116 | [199, -7, 1255], 117 | [200, -7, 1263], 118 | [201, -7, 1271], 119 | [202, -7, 1287], 120 | [204, -7, 1295], 121 | [206, -7, 1319], 122 | [207, -7, 1343], 123 | [209, -7, 1375], 124 | [210, -7, 
1399], 125 | [209, -7, 1687], 126 | [208, -7, 1695], 127 | [206, -8, 1703], 128 | [204, -8, 1711], 129 | [203, -8, 1719], 130 | [202, -8, 1727], 131 | [200, -8, 1735], 132 | [199, -8, 1743], 133 | [197, -8, 1751], 134 | [196, -8, 1767], 135 | [194, -8, 1785], 136 | [193, -8, 1791], 137 | [191, -8, 1802], 138 | [189, -8, 1808], 139 | [187, -8, 1823], 140 | [186, -8, 1832], 141 | [183, -8, 1839], 142 | [181, -8, 1855], 143 | [180, -8, 1863], 144 | [179, -8, 1871], 145 | [178, -8, 1879], 146 | [177, -8, 1887], 147 | [176, -8, 1903], 148 | [174, -8, 1912], 149 | [173, -8, 1919], 150 | [172, -8, 1935], 151 | [171, -8, 1952], 152 | [170, -8, 1975], 153 | [168, -8, 1991], 154 | [167, -8, 1999], 155 | [165, -8, 2007], 156 | [164, -8, 2023], 157 | [163, -8, 2032], 158 | [162, -8, 2039], 159 | [160, -8, 2047], 160 | [159, -8, 2055], 161 | [158, -8, 2063], 162 | [157, -8, 2071], 163 | [156, -8, 2081], 164 | [155, -8, 2087], 165 | [154, -8, 2111], 166 | [153, -8, 2119], 167 | [151, -8, 2128], 168 | [151, -7, 2135], 169 | [149, -7, 2143], 170 | [148, -7, 2152], 171 | [146, -6, 2159], 172 | [144, -6, 2175], 173 | [143, -6, 2183], 174 | [142, -6, 2191], 175 | [140, -6, 2199], 176 | [137, -6, 2207], 177 | [134, -6, 2216], 178 | [132, -6, 2223], 179 | [131, -6, 2231], 180 | [129, -6, 2239], 181 | [127, -6, 2247], 182 | [124, -5, 2255], 183 | [121, -5, 2263], 184 | [119, -3, 2271], 185 | [115, -3, 2280], 186 | [111, -3, 2287], 187 | [106, -3, 2295], 188 | [103, -3, 2303], 189 | [101, -3, 2311], 190 | [100, -3, 2319], 191 | [98, -3, 2328], 192 | [96, -3, 2335], 193 | [94, -3, 2343], 194 | [92, -3, 2352], 195 | [89, -3, 2359], 196 | [88, -3, 2367], 197 | [87, -3, 2375], 198 | [86, -3, 2383], 199 | [86, -2, 2623], 200 | [88, -2, 2631], 201 | [90, -2, 2639], 202 | [93, -1, 2647], 203 | [94, -1, 2655], 204 | [97, -1, 2663], 205 | [99, -1, 2671], 206 | [101, -1, 2680], 207 | [104, -1, 2687], 208 | [107, -1, 2695], 209 | [109, -1, 2703], 210 | [111, -1, 2711], 211 | [112, -1, 2720], 212 | [116, -1, 2727], 213 | [118, -1, 2743], 214 | [119, -1, 2752], 215 | [120, -1, 2768], 216 | [121, -1, 2775], 217 | [122, -1, 2784], 218 | [123, -1, 2799], 219 | [125, -1, 2807], 220 | [127, -1, 2815], 221 | [128, -1, 2823], 222 | [129, -1, 2839], 223 | [130, -1, 2847], 224 | [131, -1, 2855], 225 | [132, -1, 2879], 226 | [133, -1, 2895], 227 | [134, -1, 2927], 228 | [135, -1, 3023], 229 | [136, -1, 3047], 230 | [135, -1, 3471], 231 | [134, -1, 3503], 232 | [133, -1, 3511], 233 | [132, -1, 3519], 234 | [131, -1, 3583], 235 | [130, -1, 3607], 236 | [129, -1, 3639], 237 | [129, -2, 3919], 238 | [130, -2, 3943], 239 | [130, -3, 3959], 240 | [131, -3, 3983], 241 | [132, -3, 4031], 242 | [134, -3, 4135], 243 | [134, -3, 5064] 244 | ] -------------------------------------------------------------------------------- /geetest_online/test/TraceSample04Parse.txt: -------------------------------------------------------------------------------- 1 | (1, 223) 2 | (1, 16) 3 | (1, 8) 4 | (1, 8) 5 | (1, 8) 6 | (1, 8) 7 | (2, 8) 8 | (1, 8) 9 | (2, 8) 10 | (1, 8) 11 | (1, 8) 12 | (2, 8) 13 | (1, 8) 14 | (2, 8) 15 | (2, 16) 16 | (2, 8) 17 | (3, 8) 18 | (2, 8) 19 | (2, 8) 20 | (3, 8) 21 | (2, 8) 22 | (3, 8) 23 | (3, 8) 24 | (2, 8) 25 | (2, 8) 26 | (3, 8) 27 | (1, 8) 28 | (1, 8) 29 | (2, 8) 30 | (1, 8) 31 | (2, 8) 32 | (2, 8) 33 | (3, 8) 34 | (2, 8) 35 | (2, 8) 36 | (3, 8) 37 | (3, 8) 38 | (3, 9) 39 | (2, 7) 40 | (1, 10) 41 | (1, 6) 42 | (3, 8) 43 | (2, 8) 44 | (2, 9) 45 | (2, 7) 46 | (3, 6) 47 | (3, 8) 48 | (3, 8) 49 | (2, 8) 50 | (2, 8) 51 | (1, 8) 52 
| (2, 8) 53 | (1, 8) 54 | (1, 9) 55 | (1, 7) 56 | (2, 8) 57 | (2, 8) 58 | (2, 8) 59 | (2, 8) 60 | (3, 8) 61 | (2, 8) 62 | (2, 9) 63 | (1, 7) 64 | (2, 9) 65 | (1, 7) 66 | (1, 8) 67 | (2, 8) 68 | (1, 16) 69 | (2, 10) 70 | (1, 6) 71 | (1, 8) 72 | (2, 8) 73 | (1, 9) 74 | (1, 7) 75 | (1, 8) 76 | (1, 8) 77 | (2, 8) 78 | (1, 9) 79 | (2, 7) 80 | (1, 8) 81 | (2, 8) 82 | (2, 8) 83 | (2, 16) 84 | (1, 7) 85 | (2, 17) 86 | (1, 24) 87 | (0, 8) 88 | (1, 8) 89 | (2, 24) 90 | (2, 24) 91 | (1, 17) 92 | (2, 15) 93 | (1, 8) 94 | (2, 16) 95 | (1, 8) 96 | (2, 8) 97 | (1, 8) 98 | (1, 8) 99 | (1, 8) 100 | (1, 17) 101 | (1, 15) 102 | (1, 16) 103 | (1, 8) 104 | (1, 8) 105 | (1, 16) 106 | (2, 8) 107 | (2, 24) 108 | (1, 24) 109 | (2, 32) 110 | (1, 24) 111 | (-1, 288) 112 | (-1, 8) 113 | (-2, 8) 114 | (-2, 8) 115 | (-1, 8) 116 | (-1, 8) 117 | (-2, 8) 118 | (-1, 8) 119 | (-2, 8) 120 | (-1, 16) 121 | (-2, 18) 122 | (-1, 6) 123 | (-2, 11) 124 | (-2, 6) 125 | (-2, 15) 126 | (-1, 9) 127 | (-3, 7) 128 | (-2, 16) 129 | (-1, 8) 130 | (-1, 8) 131 | (-1, 8) 132 | (-1, 8) 133 | (-1, 16) 134 | (-2, 9) 135 | (-1, 7) 136 | (-1, 16) 137 | (-1, 17) 138 | (-1, 23) 139 | (-2, 16) 140 | (-1, 8) 141 | (-2, 8) 142 | (-1, 16) 143 | (-1, 9) 144 | (-1, 7) 145 | (-2, 8) 146 | (-1, 8) 147 | (-1, 8) 148 | (-1, 8) 149 | (-1, 10) 150 | (-1, 6) 151 | (-1, 24) 152 | (-1, 8) 153 | (-2, 9) 154 | (0, 7) 155 | (-2, 8) 156 | (-1, 9) 157 | (-2, 7) 158 | (-2, 16) 159 | (-1, 8) 160 | (-1, 8) 161 | (-2, 8) 162 | (-3, 8) 163 | (-3, 9) 164 | (-2, 7) 165 | (-1, 8) 166 | (-2, 8) 167 | (-2, 8) 168 | (-3, 8) 169 | (-3, 8) 170 | (-2, 8) 171 | (-4, 9) 172 | (-4, 7) 173 | (-5, 8) 174 | (-3, 8) 175 | (-2, 8) 176 | (-1, 8) 177 | (-2, 9) 178 | (-2, 7) 179 | (-2, 8) 180 | (-2, 9) 181 | (-3, 7) 182 | (-1, 8) 183 | (-1, 8) 184 | (-1, 8) 185 | (0, 240) 186 | (2, 8) 187 | (2, 8) 188 | (3, 8) 189 | (1, 8) 190 | (3, 8) 191 | (2, 8) 192 | (2, 9) 193 | (3, 7) 194 | (3, 8) 195 | (2, 8) 196 | (2, 8) 197 | (1, 9) 198 | (4, 7) 199 | (2, 16) 200 | (1, 9) 201 | (1, 16) 202 | (1, 7) 203 | (1, 9) 204 | (1, 15) 205 | (2, 8) 206 | (2, 8) 207 | (1, 8) 208 | (1, 16) 209 | (1, 8) 210 | (1, 8) 211 | (1, 24) 212 | (1, 16) 213 | (1, 32) 214 | (1, 96) 215 | (1, 24) 216 | (-1, 424) 217 | (-1, 32) 218 | (-1, 8) 219 | (-1, 8) 220 | (-1, 64) 221 | (-1, 24) 222 | (-1, 32) 223 | (0, 280) 224 | (1, 24) 225 | (0, 16) 226 | (1, 24) 227 | (1, 48) 228 | (2, 104) 229 | (0, 929) 230 | -------------------------------------------------------------------------------- /geetest_online/test/test_pyexecjs.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | # -*- coding: utf-8 -*- 3 | '''Unit Test for PyExecJS.''' 4 | import execjs 5 | 6 | JSRUNTIME = execjs.get(execjs.runtime_names.Node) 7 | 8 | TOKEN_JS = ''' 9 | function check_browser(data){ 10 | location_info = data.value ^ 536870911 11 | } 12 | location_info = 4995595067; 13 | ''' 14 | 15 | 16 | def test_context(): 17 | '''Test JSRuntime Context functions.''' 18 | _context = JSRUNTIME.compile(TOKEN_JS) 19 | print(_context.eval('location_info')) 20 | print(_context.call('check_browser', '{ value: 499382950}')) 21 | print(_context.eval('location_info')) 22 | 23 | 24 | if __name__ == "__main__": 25 | test_context() 26 | -------------------------------------------------------------------------------- /geetest_online/test/test_token.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | # -*- coding: utf-8 -*- 3 | '''Unit Test for token 
decode''' 4 | 5 | def test_token(): 6 | '''Test token bytes to string.''' 7 | _a = [102, 117, 110, 99, 116, 105, 111, 110, 32, 99, 104, 101, 99, 107, 95, 98, 8 | 114, 111, 119, 115, 101, 114, 40, 100, 97, 116, 97, 41, 123, 32, 10, 32, 9 | 32, 32, 32, 32, 108, 111, 99, 97, 116, 105, 111, 110, 95, 105, 110, 102, 10 | 111, 32, 61, 32, 100, 97, 116, 97, 46, 118, 97, 108, 117, 101, 32, 94, 11 | 32, 53, 51, 54, 56, 55, 48, 57, 49, 49, 10, 125, 32, 10, 108, 111, 12 | 99, 97, 116, 105, 111, 110, 95, 105, 110, 102, 111, 32, 61, 32, 52, 57, 13 | 57, 53, 53, 57, 53, 48, 54, 55, 59] 14 | print(''.join(chr(i) for i in _a)) 15 | 16 | _b = [105, 102, 40, 33, 104, 97, 115, 86, 97, 108, 105, 100, 41, 123, 98, 114, 17 | 111, 119, 115, 101, 114, 95, 118, 101, 114, 115, 105, 111, 110, 40, 123, 32, 18 | 118, 97, 108, 117, 101, 58, 32, 52, 57, 57, 53, 53, 57, 53, 52, 57, 19 | 125, 41, 59, 104, 97, 115, 86, 97, 108, 105, 100, 61, 116, 114, 117, 101, 20 | 59, 125] 21 | print(''.join(chr(i) for i in _b)) 22 | 23 | _c = [102, 117, 110, 99, 116, 105, 111, 110, 32, 99, 104, 101, 99, 107, 95, 98, 24 | 114, 111, 119, 115, 101, 114, 40, 100, 97, 116, 97, 41, 123, 32, 10, 32, 25 | 32, 32, 32, 32, 108, 111, 99, 97, 116, 105, 111, 110, 95, 105, 110, 102, 26 | 111, 32, 61, 32, 100, 97, 116, 97, 46, 118, 97, 108, 117, 101, 32, 94, 27 | 32, 53, 51, 54, 56, 55, 48, 57, 49, 49, 10, 125, 32, 10, 108, 111, 28 | 99, 97, 116, 105, 111, 110, 95, 105, 110, 102, 111, 32, 61, 32, 53, 48, 29 | 48, 48, 54, 51, 53, 48, 48, 51, 59] 30 | print(''.join(chr(i) for i in _c)) 31 | 32 | 33 | if __name__ == "__main__": 34 | test_token() 35 | -------------------------------------------------------------------------------- /geetest_online/test/testgeetestjs.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | # -*- coding: utf-8 -*- 3 | '''Unit Test for geetest.js''' 4 | import os 5 | import random 6 | import codecs 7 | import json 8 | import execjs 9 | from PIL import Image 10 | from bs4 import BeautifulSoup 11 | 12 | JSRUNTIME = execjs.get(execjs.runtime_names.Node) 13 | 14 | G_SPLIT_ARRAY_JS = ''' 15 | function getSplitArray() { 16 | for (var a, b = "6_11_7_10_4_12_3_1_0_5_2_9_8".split("_"), c = [], d = 0, e = 52; d < e; d++) 17 | a = 2 * parseInt(b[parseInt(d % 26 / 2)]) + d % 2, 18 | parseInt(d / 2) % 2 || (a += d % 2 ? -1 : 1), 19 | a += d < 26 ? 26 : 0, 20 | c.push(a); 21 | return c 22 | } 23 | ''' 24 | 25 | USERRESPONSE_JS = ''' 26 | function userresponse(a, b) { 27 | for (var c = b.slice(32), d = [], e = 0; e < c.length; e++) { 28 | var f = c.charCodeAt(e); 29 | d[e] = f > 57 ? f - 87 : f - 48 30 | } 31 | c = 36 * d[0] + d[1]; 32 | var g = Math.round(a) + c; 33 | b = b.slice(0, 32); 34 | var h, i = [ 35 | [], 36 | [], 37 | [], 38 | [], 39 | [] 40 | ], 41 | j = {}, 42 | k = 0; 43 | e = 0; 44 | for (var l = b.length; e < l; e++) h = b.charAt(e), j[h] || (j[h] = 1, i[k].push(h), k++, k = 5 == k ? 0 : k); 45 | for (var m, n = g, o = 4, p = "", q = [1, 2, 5, 10, 50]; n > 0;) n - q[o] >= 0 ? 
(m = parseInt(Math.random() * i[o].length, 10), p += i[o][m], n -= q[o]) : (i.splice(o, 1), q.splice(o, 1), o -= 1); 46 | return p 47 | } 48 | ''' 49 | 50 | USERRESPONSE_JSCONTEXT = JSRUNTIME.compile(USERRESPONSE_JS) 51 | 52 | TRACE_JS = ''' 53 | var tracer = function () { 54 | c = function (traceArray) { 55 | for (var b, c, d, e = [], f = 0, g = [], h = 0, i = traceArray.length - 1; h < i; h++) { 56 | b = Math.round(traceArray[h + 1][0] - traceArray[h][0]), c = Math.round(traceArray[h + 1][1] - traceArray[h][1]), d = Math.round(traceArray[h + 1][2] - traceArray[h][2]), g.push([b, c, d]), 0 == b && 0 == c && 0 == d || (0 == b && 0 == c ? f += d : (e.push([b, c, d + f]), f = 0)); 57 | } 58 | return 0 !== f && e.push([b, c, f]), e 59 | }, 60 | d = function (a) { 61 | var b = "()*,-./0123456789:?@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqr", 62 | c = b.length, 63 | d = "", 64 | e = Math.abs(a), 65 | f = parseInt(e / c); 66 | f >= c && (f = c - 1), f && (d = b.charAt(f)), e %= c; 67 | var g = ""; 68 | return a < 0 && (g += "!"), d && (g += "$"), g + d + b.charAt(e) 69 | }, 70 | e = function (a) { 71 | for (var b = [ 72 | [1, 0], 73 | [2, 0], 74 | [1, -1], 75 | [1, 1], 76 | [0, 1], 77 | [0, -1], 78 | [3, 0], 79 | [2, -1], 80 | [2, 1] 81 | ], c = "stuvwxyz~", d = 0, e = b.length; d < e; d++) 82 | if (a[0] == b[d][0] && a[1] == b[d][1]) return c[d]; 83 | return 0 84 | }, 85 | f = function (traceArray) { 86 | for (var b, f = c(traceArray), g = [], h = [], i = [], j = 0, k = f.length; j < k; j++) { 87 | b = e(f[j]), b ? h.push(b) : (g.push(d(f[j][0])), h.push(d(f[j][1]))), i.push(d(f[j][2])); 88 | } 89 | return g.join("") + "!!" + h.join("") + "!!" + i.join("") 90 | }, 91 | g = function (traceArray) { 92 | var a = f(traceArray); 93 | return encodeURIComponent(a) 94 | }; 95 | return { 96 | trace: g 97 | } 98 | }(); 99 | exports.tracer = tracer; 100 | ''' 101 | 102 | TRACE_JS_CONTEXT = JSRUNTIME.compile(TRACE_JS) 103 | 104 | 105 | def load_filetext(filename): 106 | '''load text from file as utf-8 codecs''' 107 | text = '' 108 | with codecs.open(filename, 'r', 'utf-8') as _f: 109 | text = _f.read() 110 | return text 111 | 112 | 113 | def test_load_geetest_js(): 114 | '''load javascript text from file, compile, return context object''' 115 | jsfile = os.path.join(os.getcwd(), 'gsxt', 'geetest.5.10.10.js') 116 | print(jsfile) 117 | js_context = JSRUNTIME.compile(load_filetext(jsfile)) 118 | print(js_context) 119 | 120 | 121 | def test_get_splite_array(): 122 | '''load split array data from call javascript''' 123 | context = JSRUNTIME.compile(G_SPLIT_ARRAY_JS) 124 | splite_array = context.call('getSplitArray') 125 | print('split array = ' + str(splite_array)) 126 | return splite_array 127 | 128 | 129 | def test_offset_position(height, split_array): 130 | '''parse offset position array from split array''' 131 | offset_array = [] 132 | for i in split_array: 133 | _x = i % 26 * 12 + 1 134 | _y = height / 2 if i > 25 else 0 135 | offset_array.append([_x, _y]) 136 | print('offset array = ' + str(offset_array)) 137 | return offset_array 138 | 139 | 140 | def test_rewrite_image(image_name, offset_array): 141 | '''load image from file, recombined to new image by offset array''' 142 | img = Image.open(os.path.join(os.getcwd(), 'temp', image_name)) 143 | print(img.format, img.size, img.mode) 144 | 145 | rows, columns, offsetwidth, offsetheight = 2, 26, 10, 58 146 | img_new = Image.new('RGB', (columns*offsetwidth, rows*offsetheight)) 147 | for row in range(rows): 148 | for column in range(columns): 149 | from_x, 
from_y = offset_array[row * columns + column]
150 |             box = (from_x, from_y, from_x + offsetwidth, from_y + offsetheight)
151 |             to_x, to_y = column*offsetwidth, row*offsetheight
152 |             box_new = (to_x, to_y, to_x + offsetwidth, to_y + offsetheight)
153 |             img_new.paste(img.crop(box), box_new)
154 | 
155 |     img_new.save(os.path.join(os.getcwd(), 'temp', image_name + '.jpg'), format='JPEG')
156 |     print(img_new.format, img_new.size, img_new.mode)
157 |     img.close()
158 |     img_new.close()
159 | 
160 | 
161 | def comparepixel(src, dst, threshold):
162 |     '''compare two pixel values by threshold.'''
163 |     return abs(src[0] - dst[0]) < threshold \
164 |            and abs(src[1] - dst[1]) < threshold \
165 |            and abs(src[2] - dst[2]) < threshold
166 | 
167 | 
168 | def get_diff_xy(img1, img2, start_x, start_y, threshold):
169 |     '''Calculate the difference between image1 and image2.'''
170 |     width, height = img1.size
171 |     img_diff = img2.copy()
172 |     pixel_diff = []
173 |     for _x in range(start_x, width):
174 |         for _y in range(start_y, height):
175 |             pixel1, pixel2 = img1.getpixel((_x, _y)), img2.getpixel((_x, _y))
176 |             if not comparepixel(pixel1, pixel2, threshold):
177 |                 pixel_diff.append((_x, _y))
178 | 
179 |     min_xy, max_xy = min(pixel_diff), max(pixel_diff)
180 |     for _y in range(height):
181 |         img_diff.putpixel((min_xy[0], _y), (0, 0, 0))
182 |         img_diff.putpixel((max_xy[0], _y), (0, 0, 0))
183 | 
184 |     name = 'diff_' + str(threshold) + '_' + str(min_xy[0]) + '_' + str(max_xy[0]) + '.jpg'
185 |     img_diff.save(os.path.join(os.getcwd(), 'temp', name), format='JPEG')
186 |     img_diff.close()
187 |     print(threshold, min_xy[0], max_xy[0])
188 |     return min_xy[0], max_xy[0]
189 | 
190 | 
191 | def get_best_diff(img1, img2, start_x, start_y):
192 |     '''Calculate the best difference position.'''
193 |     _x, _y = 0, 0
194 |     for threshold in range(5, 71, 5):
195 |         _x, _y = get_diff_xy(img1, img2, start_x, start_y, threshold)
196 |     return _x, _y
197 | 
198 | 
199 | def test_diff_image(image1, image2, start_x, start_y):
200 |     '''find the difference between two images'''
201 |     image_path_src = os.path.join(os.getcwd(), 'temp', image1)
202 |     image_path_dst = os.path.join(os.getcwd(), 'temp', image2)
203 |     img1, img2 = Image.open(image_path_src), Image.open(image_path_dst)
204 |     if img1.size != img2.size:
205 |         print('the two images differ in size')
206 |         img1.close()
207 |         img2.close()
208 |         return None, None  # keep the (x, y) tuple shape so callers can unpack safely
209 |     _x, _y = get_best_diff(img1, img2, start_x, start_y)
210 |     img1.close()
211 |     img2.close()
212 |     return _x, _y
213 | 
214 | 
215 | def userresponse(distance, challenge):
216 |     '''根据滑动距离distance和challenge,计算userresponse值'''
217 |     return USERRESPONSE_JSCONTEXT.call('userresponse', distance, challenge)
218 | 
219 | 
220 | def imgload():
221 |     '''图片加载时间(毫秒),用于统计,无验证功能。'''
222 |     return random.randint(100, 200)
223 | 
224 | 
225 | def adjust_distance(distance):
226 |     '''滑块slice图片的尺寸:59*50,上下左右四周可能间隔6像素,因此实际尺寸:47*38。'''
227 |     return distance - 6
228 | 
229 | 
230 | def parsetrace(trace_file):
231 |     '''parse trace distance'''
232 |     with open(os.path.join(os.getcwd(), 'test', trace_file)) as tracedata:
233 |         trace = json.load(tracedata)
234 |     print('trace analysis:')
235 |     for index in range(2, len(trace)):
236 |         print(trace[index][0] - trace[index-1][0], trace[index][2] - trace[index-1][2])
237 | 
238 | 
239 | def usertrace(distance):
240 |     '''
241 |     采集用户鼠标拖动轨迹,构造数组(x坐标, y坐标, 时间间隔毫秒),加密。
242 |     geetest.5.10.10.js的变量: Q.t("arr", a)
243 |     输出加密前的明文数组: console.log(JSON.stringify(Q.t("arr", a)))
244 |     轨迹样本见TraceSample.txt
245 |     轨迹样本分析见TraceSampleParse.txt
246 |     '''
247 |     # 轨迹间隔数组
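    # 每项为 (x位移, y位移, 时间间隔毫秒),下面分三个阶段模拟:慢速起步 -> 中速拖动 -> 减速到达
    # 例如生成的轨迹可能形如 [(3, 0, 327), (2, 0, 15), (1, 0, 11), ...](具体数值随机)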
248 |     trace = []
249 |     # 根据距离distance,计算总共步数,范围采样50%-75%
250 |     total_steps = int(distance * random.uniform(0.5, 0.75))
251 |     # 滑动阶段1:慢速起步,按下鼠标时间间隔较长
252 |     move_instance = random.randint(1, 4)
253 |     trace.append((move_instance, 0, random.randint(200, 500)))
254 |     # 滑动阶段2:中速运行,鼠标拖动速度中等
255 |     for _i in range(total_steps):
256 |         if move_instance < distance:
257 |             step = random.randint(1, 3)
258 |             move_instance = move_instance + step
259 |             trace.append((step, 0, random.randint(8, 24)))
260 |     # 滑动阶段3:慢速到达,鼠标接近目标位置时减速
261 |     trace.append(((distance - move_instance), 0, random.randint(100, 800)))
262 |     trace.append((0, 0, random.randint(100, 500)))
263 |     print(trace)
264 | 
265 |     # 轨迹间隔数组转成轨迹坐标数组
266 |     position = []
267 |     # 鼠标点击坐标相对于滑块图片边缘的值
268 |     position.append((-random.randint(14, 30), -random.randint(14, 30), 0))
269 |     # 起始值
270 |     current_position = (0, 0, 0)
271 |     position.append(current_position)
272 |     for _i in trace:
273 |         next_position = (current_position[0] + _i[0], _i[1], current_position[2] + _i[2])
274 |         position.append(next_position)
275 |         current_position = next_position
276 | 
277 |     passtime = position[-1][2]
278 |     print(position)
279 |     print(passtime)
280 |     return position, passtime
281 | 
282 | 
283 | def encrypttrace(trace):
284 |     '''encrypt trace data by JSCall'''
285 |     return TRACE_JS_CONTEXT.call('tracer.trace', trace)
286 | 
287 | 
288 | def fun_c(param):
289 |     '''reversed from geetest.js'''
290 |     _b = 0
291 |     _c = 0
292 |     _d = 0
293 |     _e = []
294 |     _f = 0
295 |     _g = []
296 |     _h = 0
297 |     for _h in range(0, len(param)-1):
298 |         _b = round(param[_h + 1][0] - param[_h][0])
299 |         _c = round(param[_h + 1][1] - param[_h][1])
300 |         _d = round(param[_h + 1][2] - param[_h][2])
301 |         _g.append([_b, _c, _d])
302 |         if _b == 0 and _c == 0 and _d == 0:
303 |             continue
304 |         else:
305 |             if _b == 0 and _c == 0:
306 |                 _f = _f + _d
307 |             else:
308 |                 _e.append([_b, _c, _d + _f])
309 |                 _f = 0
310 |     if _f != 0:
311 |         _e.append([_b, _c, _f])
312 |     return _e
313 | 
314 | 
315 | def fun_d(param):
316 |     '''reversed from geetest.js'''
317 |     _b = "()*,-./0123456789:?@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqr"
318 |     _c = len(_b)
319 |     _d = ""
320 |     _e = abs(param)
321 |     _f = int(_e / _c)
322 |     if _f >= _c:
323 |         _f = _c - 1
324 |     if _f:
325 |         _d = _b[_f]  # b.charAt(f) in geetest.js, not chr(f)
326 |     _e = _e % _c
327 |     _g = ''
328 |     if param < 0:
329 |         _g = _g + '!'
330 |     if _d:
331 |         _g = _g + '$'
332 |     return _g + _d + _b[int(_e)]
333 | 
334 | 
335 | def fun_e(param):
336 |     '''reversed from geetest.js'''
337 |     _b = [[1, 0], [2, 0], [1, -1], [1, 1], [0, 1], [0, -1], [3, 0], [2, -1], [2, 1]]
338 |     _c = "stuvwxyz~"
339 |     _d = 0
340 |     _e = len(_b)
341 |     for _d in range(0, len(_b)):
342 |         if param[0] == _b[_d][0] and param[1] == _b[_d][1]:
343 |             return _c[_d]
344 |     return 0
345 | 
346 | 
347 | def fun_f(param):
348 |     '''reversed from geetest.js'''
349 |     _b = None
350 |     _f = fun_c(param)
351 |     _g = []
352 |     _h = []
353 |     _i = []
354 |     for j in _f:
355 |         _b = fun_e(j)
356 |         if _b:
357 |             _h.append(_b)
358 |         else:
359 |             _g.append(fun_d(j[0]))
360 |             _h.append(fun_d(j[1]))
361 |         _i.append(fun_d(j[2]))
362 | 
363 |     return ''.join(j for j in _g) + '!!' + ''.join(j for j in _h) + '!!' 
+ ''.join(j for j in _i) 364 | 365 | 366 | def parse_html(html_doc, page): 367 | '''parse html webpage elements.''' 368 | soup = BeautifulSoup(html_doc, 'html.parser') 369 | _result = [] 370 | _findall = soup.find_all('a', class_='search_list_item db') 371 | for _a in _findall: 372 | _name = _a.find('h1', class_='f20') 373 | _name_str = ''.join(_name.get_text().split()) 374 | _code = _a.find('div', class_='div-map2') 375 | _number = _code.find('span', class_='g3') 376 | _number_str = ''.join(_number.get_text().split()) 377 | _result.append([_name_str, _number_str]) 378 | 379 | print(json.dumps(_result, indent=2, sort_keys=True, ensure_ascii=False)) 380 | _findall = soup.find_all('a', href='javascript:turnOverPage({})'.format(page + 1)) 381 | return _result, True if _findall else False 382 | 383 | 384 | def print_json_type(): 385 | '''Dump json object type.''' 386 | json_text = '''{ 387 | "bg": "pictures/gt/fc064fc73/bg/e1777734e.jpg", 388 | "link": "", 389 | "challenge": "0a80f1e4b0ff6381e26425b7fa3e71f4c2", 390 | "ypos": 24, 391 | "fullbg": "pictures/gt/fc064fc73/fc064fc73.jpg", 392 | "id": "", 393 | "xpos": 0, 394 | "feedback": "", 395 | "height": 116, 396 | "slice": "pictures/gt/fc064fc73/slice/e1777734e.png", 397 | "type": "slide" 398 | }''' 399 | json_object = json.loads(json_text) 400 | for _k, _v in json_object.items(): 401 | print(type(_k)) 402 | print(type(_v)) 403 | 404 | 405 | def test_geetest(): 406 | '''test geetest related functions.''' 407 | test_load_geetest_js() 408 | print(userresponse(100, '1196277ad0c2a2142efce133857c5c8bja')) 409 | print(userresponse(100, "1196277ad0c2a2142efce133857c5c8bja")) 410 | splites = test_get_splite_array() 411 | offsets = test_offset_position(116, splites) 412 | test_rewrite_image('fullbg', offsets) 413 | test_rewrite_image('bg', offsets) 414 | _x, _y = test_diff_image('fullbg.jpg', 'bg.jpg', 0, 12) 415 | print(imgload()) 416 | parsetrace('TraceSample01.txt') 417 | parsetrace('TraceSample02.txt') 418 | parsetrace('TraceSample03.txt') 419 | parsetrace('TraceSample04.txt') 420 | trace, _passtime = usertrace(100) 421 | print(encrypttrace(trace)) 422 | print(fun_f(trace)) 423 | trace, _passtime = usertrace(200) 424 | print(encrypttrace(trace)) 425 | print(fun_f(trace)) 426 | print(parse_html(load_filetext(os.path.join(os.getcwd(), 'test', 'result.html')), 1)) 427 | print(parse_html(load_filetext(os.path.join(os.getcwd(), 'test', 'result2.html')), 2)) 428 | print_json_type() 429 | 430 | 431 | if __name__ == "__main__": 432 | test_geetest() 433 | -------------------------------------------------------------------------------- /geetest_online/util.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | # -*- coding: utf-8 -*- 3 | ''' 4 | geetest常用公共方法 5 | ''' 6 | 7 | SPLIT_ARRAY_JS = ''' 8 | function getSplitArray() { 9 | for (var a, b = "6_11_7_10_4_12_3_1_0_5_2_9_8".split("_"), c = [], d = 0, e = 52; d < e; d++) 10 | a = 2 * parseInt(b[parseInt(d % 26 / 2)]) + d % 2, 11 | parseInt(d / 2) % 2 || (a += d % 2 ? -1 : 1), 12 | a += d < 26 ? 26 : 0, 13 | c.push(a); 14 | return c 15 | } 16 | ''' 17 | 18 | USERRESPONSE_JS = ''' 19 | function userresponse(a, b) { 20 | for (var c = b.slice(32), d = [], e = 0; e < c.length; e++) { 21 | var f = c.charCodeAt(e); 22 | d[e] = f > 57 ? 
f - 87 : f - 48 23 | } 24 | c = 36 * d[0] + d[1]; 25 | var g = Math.round(a) + c; b = b.slice(0, 32); 26 | var h, i = [ [], [], [], [], [] ], j = {}, k = 0; e = 0; 27 | for (var l = b.length; e < l; e++) 28 | h = b.charAt(e), j[h] || (j[h] = 1, i[k].push(h), k++, k = 5 == k ? 0 : k); 29 | for (var m, n = g, o = 4, p = "", q = [1, 2, 5, 10, 50]; n > 0;) 30 | n - q[o] >= 0 ? (m = parseInt(Math.random() * i[o].length, 10), p += i[o][m], n -= q[o]) : (i.splice(o, 1), q.splice(o, 1), o -= 1); 31 | return p 32 | } 33 | ''' 34 | 35 | OFFLINE_SAMPLE = ((186, 1, 98), 36 | (82, 0, 136), 37 | (61, 5, 108), 38 | (128, 2, 7), 39 | (130, 4, 99), 40 | (189, 3, 65), 41 | (108, 5, 285), 42 | (136, 0, 36), 43 | (41, 0, 263), 44 | (124, 3, 185)) 45 | 46 | 47 | TRACE_JS = ''' 48 | var tracer = function () { 49 | c = function (traceArray) { 50 | for (var b, c, d, e = [], f = 0, g = [], h = 0, i = traceArray.length - 1; h < i; h++) { 51 | b = Math.round(traceArray[h + 1][0] - traceArray[h][0]), 52 | c = Math.round(traceArray[h + 1][1] - traceArray[h][1]), 53 | d = Math.round(traceArray[h + 1][2] - traceArray[h][2]), 54 | g.push([b, c, d]), 0 == b && 0 == c && 0 == d || (0 == b && 0 == c ? f += d : (e.push([b, c, d + f]), f = 0)); 55 | } 56 | return 0 !== f && e.push([b, c, f]), e 57 | }, 58 | d = function (a) { 59 | var b = "()*,-./0123456789:?@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqr", 60 | c = b.length, 61 | d = "", 62 | e = Math.abs(a), 63 | f = parseInt(e / c); 64 | f >= c && (f = c - 1), f && (d = b.charAt(f)), e %= c; 65 | var g = ""; 66 | return a < 0 && (g += "!"), d && (g += "$"), g + d + b.charAt(e) 67 | }, 68 | e = function (a) { 69 | for (var b = [ 70 | [1, 0], 71 | [2, 0], 72 | [1, -1], 73 | [1, 1], 74 | [0, 1], 75 | [0, -1], 76 | [3, 0], 77 | [2, -1], 78 | [2, 1] 79 | ], c = "stuvwxyz~", d = 0, e = b.length; d < e; d++) 80 | if (a[0] == b[d][0] && a[1] == b[d][1]) return c[d]; 81 | return 0 82 | }, 83 | f = function (traceArray) { 84 | for (var b, f = c(traceArray), g = [], h = [], i = [], j = 0, k = f.length; j < k; j++) { 85 | b = e(f[j]), b ? h.push(b) : (g.push(d(f[j][0])), h.push(d(f[j][1]))), i.push(d(f[j][2])); 86 | } 87 | return g.join("") + "!!" + h.join("") + "!!" 
+ i.join("") 88 | }, 89 | g = function (traceArray) { 90 | var a = f(traceArray); 91 | return encodeURIComponent(a) 92 | }; 93 | return { 94 | trace: g 95 | } 96 | }(); 97 | exports.tracer = tracer; 98 | ''' 99 | -------------------------------------------------------------------------------- /gitstats/README.md: -------------------------------------------------------------------------------- 1 | Python实现的一个小工具,用于分析Git commit log,获得Git Project每个成员的简单行为数据。 2 | 3 | **Warning:代码量不能代表程序员能力水平!** 4 | 5 | ### 启动参数 6 | 7 | 共5个。 8 | 9 | + Repo地址 10 | + Commit 起始日期 11 | + Commit 结束日期 12 | + Git仓库子目录 13 | + 统计分析结果CSV文件目标路径 14 | 15 | ### exec_git 16 | 17 | Git Log命令: 18 | 19 | > git -C {} log --since={} --until={} --pretty=tformat:%ae --shortstat --no-merges -- {} > {} 20 | 21 | 填入参数,调用系统命令 `os.system()`,输出结果至本地临时文件。读取至内存,简单的String Array。 22 | 23 | ### parse 24 | 25 | Git Log输出有3种格式,对应3种正则表达式。 26 | 27 | ```Python 28 | REPATTERN_FULL = r"\s(\d+)\D+(\d+)\D+(\d+)\D+\n" 29 | REPATTERN_INSERT_ONLY = r"\s(\d+)\D+(\d+)\sinsertion\D+\n" 30 | REPATTERN_DELETE_ONLY = r"\s(\d+)\D+(\d+)\sdeletion\D+\n" 31 | ``` 32 | 33 | 遍历得到的数据,首先构造一个以Author为Key,分析结果为Value的字典。分析结果构造一个元祖,包括: 34 | 35 | + Commit 次数 36 | + 增加代码行数 37 | + 删除代码行数 38 | + 变更代码行数 39 | 40 | ### save_csv 41 | 42 | 简单省略。 43 | -------------------------------------------------------------------------------- /gitstats/gitstats.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | # -*- coding: utf-8 -*- 3 | '''Analyse git branch commit log, for every version, every person.''' 4 | import os 5 | import sys 6 | import re 7 | import csv 8 | 9 | GIT_LOG = r'git -C {} log --since={} --until={} --pretty=tformat:%ae --shortstat --no-merges -- {} > {}' 10 | 11 | REPATTERN_FULL = r"\s(\d+)\D+(\d+)\D+(\d+)\D+\n" 12 | REPATTERN_INSERT_ONLY = r"\s(\d+)\D+(\d+)\sinsertion\D+\n" 13 | REPATTERN_DELETE_ONLY = r"\s(\d+)\D+(\d+)\sdeletion\D+\n" 14 | 15 | CSV_FILE_HEADER = ["Author", "Commit", "Insert", "Delete", "Loc"] 16 | 17 | 18 | def exec_git(repo, since, until, subdir): 19 | '''Execute git log commant, return string array.''' 20 | logfile = os.path.join(os.getcwd(), 'gitstats.txt') 21 | git_log_command = GIT_LOG.format(repo, since, until, subdir, logfile) 22 | os.system(git_log_command) 23 | lines = None 24 | with open(logfile, 'r', encoding='utf-8') as logfilehandler: 25 | lines = logfilehandler.readlines() 26 | return lines 27 | 28 | 29 | def save_csv(stats, csvfile): 30 | '''save stats data to csv file.''' 31 | with open(csvfile, 'w', encoding='utf-8') as csvfilehandler: 32 | writer = csv.writer(csvfilehandler) 33 | writer.writerow(CSV_FILE_HEADER) 34 | for author, stat in stats.items(): 35 | writer.writerow([author, stat[0], stat[1], stat[2], stat[3]]) 36 | 37 | 38 | def parse(lines): 39 | '''Analyse git log and sort to csv file.''' 40 | prog_full = re.compile(REPATTERN_FULL) 41 | prog_insert_only = re.compile(REPATTERN_INSERT_ONLY) 42 | prog_delete_only = re.compile(REPATTERN_DELETE_ONLY) 43 | 44 | stats = {} 45 | for i in range(0, len(lines), 3): 46 | author = lines[i] 47 | #empty = lines[i+1] 48 | info = lines[i+2] 49 | #change = 0 50 | insert, delete = int(0), int(0) 51 | result = prog_full.search(info) 52 | if result: 53 | #change = result[0] 54 | insert = int(result.group(2)) 55 | delete = int(result.group(3)) 56 | else: 57 | result = prog_insert_only.search(info) 58 | if result: 59 | #change = result[0] 60 | insert = int(result.group(2)) 61 | delete = int(0) 62 | else: 63 | result = 
prog_delete_only.search(info) 64 | if result: 65 | #change = result[0] 66 | insert = int(0) 67 | delete = int(result.group(2)) 68 | else: 69 | print('Regular expression fail!') 70 | return 71 | 72 | loc = insert - delete 73 | stat = stats.get(author) 74 | if stat is None: 75 | stats[author] = [1, insert, delete, loc] 76 | else: 77 | stat[0] += 1 78 | stat[1] += insert 79 | stat[2] += delete 80 | stat[3] += loc 81 | 82 | return stats 83 | 84 | 85 | if __name__ == "__main__": 86 | print('gitstats begin') 87 | if len(sys.argv) != 6: 88 | print('Invalid argv parameters.') 89 | exit(0) 90 | 91 | REPO = os.path.join(os.getcwd(), sys.argv[1]) 92 | SINCE = sys.argv[2] 93 | UNTIL = sys.argv[3] 94 | SUB_DIR = sys.argv[4] 95 | CSV_FILE = os.path.join(os.getcwd(), sys.argv[5]) 96 | LINES = exec_git(REPO, SINCE, UNTIL, SUB_DIR) 97 | assert LINES is not None 98 | STATS = parse(LINES) 99 | save_csv(STATS, CSV_FILE) 100 | print('gitstats done') 101 | -------------------------------------------------------------------------------- /gsxt_mobile/README.md: -------------------------------------------------------------------------------- 1 | [国家企业信用信息公示系统](http://www.gsxt.gov.cn)使用GeeTest滑块验证码。主站使用online验证模式,难破解。部分分站使用offline验证模式,易破解但多次HTTP请求应答往复,查询效率低。 2 | [国家工商总局](http://www.saic.gov.cn/)提供了Android,iOS App,这次就来尝试分析一下App的情况。 3 | 4 | 总局网站有2套: 5 | 6 | + 新版 http://www.saic.gov.cn/ 7 | + 旧版 http://old.saic.gov.cn/ 8 | 9 | 于是App下载说明页面也有2套: 10 | 11 | + 新版 http://gzhd.saic.gov.cn/gszj/index/telephone/android2.html 12 | + 旧版 http://gzhd.saic.gov.cn/gszj/index/telephone/android.html 13 | 14 | 还好App只有1套。 15 | 16 | 国家工商行政管理总局移动版客户端: 17 | 18 | + Android版 http://gzhd.saic.gov.cn/gszj/saicwap.apk 19 | + iOS版 https://itunes.apple.com/cn/app/gong-shang-zong-ju/id725956822?mt=8 20 | 21 | 国家企业信用信息公示系统: 22 | 23 | + Android版 http://gzhd.saic.gov.cn/gszj/gongshi.apk 24 | + iOS版 https://itunes.apple.com/cn/app/%E5%9B%BD%E5%AE%B6%E4%BC%81%E4%B8%9A%E4%BF%A1%E7%94%A8%E4%BF%A1%E6%81%AF%E5%85%AC%E7%A4%BA%E7%B3%BB%E7%BB%9F/id1048375712?mt=8 25 | 26 | ### 分析 27 | 28 | **saicwap.apk,看这个名称,好像已经明白了什么。** 29 | 安装&运行&解包查看`国家企业信用信息公示系统Android APK`文件。 30 | UI交互体验基本上就是一个网页。 31 | dex很小,assets文件很多。 32 | 根据名称搜索加猜测,直接得出结论,WebView外壳,JQuery+AJAX实现网页。 33 | 使用Fiddler抓包,仅有一条简单的HTTP Request & Response。 34 | Response是标准JSON文本。 35 | 36 | 随手写个实现 37 | 38 | ### 填写Android Mobile HTTP Header参数 39 | 40 | ```Python 41 | URL = 'http://yd.gsxt.gov.cn/QuerySummary' 42 | MOBILE_ACTION = 'entSearch' 43 | TOPIC = 1 44 | PAGE_NUM = 1 45 | PAGE_SIZE = 10 46 | USER_ID = 'id001' 47 | USER_IP = '192.168.0.1' 48 | USER_AGENT = 'Mozilla/5.0 (Linux; Android 4.4.2; vivo Y28L Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/30.0.0.0 Mobile Safari/537.36 Html5Plus/1.0' 49 | ACCEPT_LANGUAGE = 'zh-CN,en-US;q=0.8' 50 | XRW = 'com.zongjucredit' 51 | ORIGIN = 'file://' 52 | CHARSET = 'application/x-www-form-urlencoded; charset=UTF-8' 53 | ``` 54 | 55 | ### 使用requests库 56 | 57 | ```Python 58 | def query(keyword): 59 | _data = [('mobileAction', MOBILE_ACTION), 60 | ('keywords', keyword), 61 | ('topic', TOPIC), 62 | ('pageNum', PAGE_NUM), 63 | ('pageSize', PAGE_SIZE), 64 | ('userID', USER_ID), 65 | ('userIP', USER_IP)] 66 | _headers = {'User-Agent': USER_AGENT, 67 | 'Accept-Language': ACCEPT_LANGUAGE, 68 | 'X-Requested-With': XRW, 69 | 'Origin': ORIGIN, 70 | 'Content-Type': CHARSET} 71 | 72 | _response = requests.post(URL, data=_data, headers=_headers) 73 | print(_response.status_code) 74 | if _response.status_code == 200: 75 | _content = _response.json() 76 | 
print(json.dumps(_content, indent=2, sort_keys=True, ensure_ascii=False)) 77 | ``` 78 | 79 | ### 测试运行 80 | 81 | 搜索关键字`腾讯科技`,得到[50条数据](https://github.com/9468305/python-script/blob/master/gsxt_mobile/%E8%85%BE%E8%AE%AF%E7%A7%91%E6%8A%8050.txt)。格式示例: 82 | 83 | ```JSON 84 | { 85 | "BUSEXCEPTCOUNT": "0", 86 | "CAT18": "10", 87 | "CAT2NAME": "法定代表人", 88 | "ENTNAME": "腾讯科技(成都)有限公司", 89 | "ENTTYPE": "6150", 90 | "ESTDATE": "2008年07月10日", 91 | "NAME": "奚丹", 92 | "PRIPID": "BFA63C5493A3045829033A5B114CE66AFD1B796865F63020C39130E7149AE9152BAC6972D71F0C3A65B342A32972C4439717E803CD7E66773D486FDD9FCBAEC8", 93 | "REGNO": "510100400024413", 94 | "REGSTATE_CN": "存续(在营、开业、在册)", 95 | "S_EXT_NODENUM": "510000", 96 | "UNISCID": "915101006771521538" 97 | } 98 | ``` 99 | 100 | **实测,有时会封IP,24小时解禁,一旦封禁,爬虫和官方App一概屏蔽。** 101 | -------------------------------------------------------------------------------- /gsxt_mobile/gsxt_mobile.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | # -*- coding: utf-8 -*- 3 | '''通过国家企业信用信息公示系统(www.gsxt.gov.cn) Mobile App HTTP API 查询企业信息''' 4 | 5 | import json 6 | import requests 7 | 8 | URL = 'http://yd.gsxt.gov.cn/QuerySummary' 9 | MOBILE_ACTION = 'entSearch' 10 | TOPIC = 1 11 | PAGE_NUM = 1 12 | PAGE_SIZE = 10 13 | USER_ID = 'id001' 14 | USER_IP = '192.168.0.1' 15 | USER_AGENT = 'Mozilla/5.0 (Linux; Android 4.4.2; vivo Y28L Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/30.0.0.0 Mobile Safari/537.36 Html5Plus/1.0' 16 | ACCEPT_LANGUAGE = 'zh-CN,en-US;q=0.8' 17 | XRW = 'com.zongjucredit' 18 | ORIGIN = 'file://' 19 | CHARSET = 'application/x-www-form-urlencoded; charset=UTF-8' 20 | 21 | 22 | def query(keyword): 23 | '''main entry''' 24 | _data = [('mobileAction', MOBILE_ACTION), 25 | ('keywords', keyword), 26 | ('topic', TOPIC), 27 | ('pageNum', PAGE_NUM), 28 | ('pageSize', PAGE_SIZE), 29 | ('userID', USER_ID), 30 | ('userIP', USER_IP)] 31 | 32 | _headers = {'User-Agent': USER_AGENT, 33 | 'Accept-Language': ACCEPT_LANGUAGE, 34 | 'X-Requested-With': XRW, 35 | 'Origin': ORIGIN, 36 | 'Content-Type': CHARSET} 37 | 38 | _response = requests.post(URL, data=_data, headers=_headers) 39 | print(_response.status_code) 40 | if _response.status_code == 200: 41 | _content = _response.json() 42 | print(len(_content)) 43 | print(json.dumps(_content, indent=2, sort_keys=True, ensure_ascii=False)) 44 | with open(keyword + str(len(_content)) + '.txt', 'w', encoding='utf-8') as _f: 45 | json.dump(_content, _f, indent=2, sort_keys=True, ensure_ascii=False) 46 | else: 47 | print('request fail') 48 | 49 | if __name__ == "__main__": 50 | query('腾讯科技') 51 | -------------------------------------------------------------------------------- /lagou/README.md: -------------------------------------------------------------------------------- 1 | 关于拉勾网数据采集爬虫的文章,网上已经写烂了。这里简单记录一个之前帮助同事妹子写的小爬虫工具。 2 | 某天,HR同事妹子接到一个任务,收集并分析拉勾网BAT三家公司所有招聘岗位的分类,要求,薪酬范围,人数等信息。 3 | 人肉采集辛苦枯燥,随手写段代码搭救妹子。 4 | 5 | ### 开始 6 | 7 | 拉勾网页面可能改版,以下代码实现可能已失效,不考虑持续维护更新。 8 | 拉勾网给每家注册公司分配一个数字,URL形式是: 9 | 10 | ```Python 11 | LAGOU_URL = r'https://www.lagou.com/gongsi/j%d.html' 12 | ``` 13 | 14 | 人肉筛选目标公司如下: 15 | 16 | ```Python 17 | COMPANY = { 18 | '腾讯': 451, 19 | '阿里优酷': 1914, 20 | '阿里高德': 91, 21 | '阿里天猫': 52840, 22 | '阿里UC': 2202, 23 | '阿里神马搜索': 90948, 24 | '百度': 1575, 25 | '百度外卖': 104601 26 | } 27 | ``` 28 | 29 | 每家公司子页面的实现,使用了较多复杂Javascript代码和框架,因此不采用抓包分析HTTP协议的方案。 30 | 简单粗暴直接的组合: Selenium + WebDriver + Chrome。 31 | 32 | ### Selenium 33 | 34 | 官网 
http://www.seleniumhq.org/ 35 | GitHub https://github.com/SeleniumHQ/selenium 36 | 文档 http://selenium-python.readthedocs.io/ 37 | 38 | ### ChromeDriver 39 | 40 | [ChromeDriver - WebDriver for Chrome](https://sites.google.com/a/chromium.org/chromedriver/) 41 | 为什么不使用运行效率更高的 [PhantomJS](http://phantomjs.org/) ? 42 | 因为需要频繁调试代码和观察运行情况。稳定运行后可以随时修改一行代码参数,替换成 PhantomJS 。 43 | **Chrome 59 beta 开始支持 Headless。** 详见 [Getting Started with Headless Chrome](https://developers.google.com/web/updates/2017/04/headless-chrome)。所以以后应该也不再需要 PhantomJS 了。 44 | 45 | ### 数据定义 46 | 47 | 继续简单粗暴直接:(参数有点多,PyLint 报 Warning 了,无视吧。) 48 | 49 | ```Python 50 | class JobInfo(object): 51 | '''Job Info Object''' 52 | def __init__(self, company, job_filter, title, salary_min, salary_max, exp, edu): 53 | self.company = company 54 | self.filter = job_filter 55 | self.title = title 56 | self.salary_min = salary_min 57 | self.salary_max = salary_max 58 | self.exp = exp 59 | self.edu = edu 60 | 61 | @staticmethod 62 | def header(): 63 | '''csv file header''' 64 | return ['公司', '类别', '职位', '薪酬区间低', '薪酬区间高', '经验要求', '学历要求'] 65 | 66 | def array(self): 67 | '''object to array''' 68 | return [self.company, 69 | self.filter, 70 | self.title, 71 | self.salary_min, 72 | self.salary_max, 73 | self.exp, 74 | self.edu] 75 | ``` 76 | 77 | ### 页面加载解析 78 | 79 | WebDriver API 方便好用强大。 80 | 81 | ```Python 82 | con_list_item = WebDriverWait(browser, SLEEPTIME).until(lambda x: x.find_elements_by_class_name('con_list_item')) 83 | ``` 84 | 85 | 执行点击翻页跳转 86 | 87 | ```Python 88 | try: 89 | pages = browser.find_element_by_class_name('pages') 90 | spans = pages.find_elements_by_tag_name('span') 91 | span = get_next_span(spans) 92 | if span: 93 | span.click() 94 | time.sleep(SLEEPTIME) 95 | except NoSuchElementException as _e: 96 | print(_e) 97 | ``` 98 | 99 | **数据采集完成后写入csv文件,略。** 100 | 101 | ### 坑 102 | 103 | WebDriver API 简单易用,但超时处理机制仍不完善。 104 | 105 | ```Python 106 | browser = webdriver.Chrome() 107 | browser.get(url) 108 | browser.refresh() 109 | browser.quit() 110 | ``` 111 | 112 | `implicitly_wait()` 无法判断页面内部各种Ajax操作执行完成的时机。只好注释掉这一行代码。 113 | 114 | ```Python 115 | browser.implicitly_wait(TIMEOUT) 116 | ``` 117 | -------------------------------------------------------------------------------- /lagou/lagou.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | # -*- coding: utf-8 -*- 3 | '''spider for https://www.lagou.com''' 4 | 5 | import time 6 | import csv 7 | import os 8 | import sys 9 | from selenium import webdriver 10 | from selenium.webdriver.support.ui import WebDriverWait 11 | from selenium.common.exceptions import NoSuchElementException 12 | 13 | LAGOU_URL = r'https://www.lagou.com/gongsi/j%d.html' 14 | COMPANY = { 15 | '腾讯': 451, 16 | '阿里优酷': 1914, 17 | '阿里高德': 91, 18 | '阿里天猫': 52840, 19 | '阿里UC': 2202, 20 | '阿里神马搜索': 90948, 21 | '百度': 1575, 22 | '百度外卖': 104601 23 | } 24 | 25 | SLEEPTIME = 3 #seconds 26 | 27 | class JobInfo(object): 28 | '''Job Info Object''' 29 | def __init__(self, company, job_filter, title, salary_min, salary_max, exp, edu): 30 | self.company = company 31 | self.filter = job_filter 32 | self.title = title 33 | self.salary_min = salary_min 34 | self.salary_max = salary_max 35 | self.exp = exp 36 | self.edu = edu 37 | 38 | @staticmethod 39 | def header(): 40 | '''csv file header''' 41 | return ['公司', '类别', '职位', '薪酬区间低', '薪酬区间高', '经验要求', '学历要求'] 42 | 43 | def array(self): 44 | '''object to array''' 45 | return [self.company, 46 | self.filter, 47 | self.title, 
48 | self.salary_min, 49 | self.salary_max, 50 | self.exp, 51 | self.edu] 52 | 53 | 54 | def lagou_page(browser, job_list, company_name, job_filter): 55 | '''filter for every page''' 56 | con_list_item = WebDriverWait(browser, SLEEPTIME)\ 57 | .until(lambda x: x.find_elements_by_class_name('con_list_item')) 58 | for item in con_list_item: 59 | job = item.text.split('\n') 60 | job_title = job[0] 61 | #job_date = job[1] 62 | job_salary = job[2].split('-') 63 | job_salary_min = job_salary[0] 64 | if len(job_salary) > 1: 65 | job_salary_max = job_salary[1] 66 | else: 67 | job_salary_max = job_salary[0] 68 | job_desc = job[3].split('/') 69 | job_exp = job_desc[0] 70 | if ' ' in job_exp: 71 | job_exp = job_exp.strip(' ') 72 | if '经验' in job_exp: 73 | job_exp = job_exp.lstrip('经验') 74 | job_edu = job_desc[1] 75 | if ' ' in job_edu: 76 | job_edu = job_edu.strip(' ') 77 | 78 | job = JobInfo(company_name, 79 | job_filter, 80 | job_title, 81 | job_salary_min, 82 | job_salary_max, 83 | job_exp, 84 | job_edu) 85 | job_list.append(job) 86 | print(job_title) 87 | print(job_salary_min) 88 | print(job_salary_max) 89 | print(job_exp) 90 | print(job_edu) 91 | 92 | 93 | def get_next_span(spans): 94 | '''find next page button''' 95 | for span in spans: 96 | print(span.text) 97 | if span.text == '下一页': 98 | if span.get_attribute('class') == 'next': 99 | return span 100 | return None 101 | 102 | 103 | def lagou_filter(browser, job_list, company_name, job_filter): 104 | '''filter by job types''' 105 | while True: 106 | lagou_page(browser, job_list, company_name, job_filter) 107 | #check next page 108 | try: 109 | pages = browser.find_element_by_class_name('pages') 110 | spans = pages.find_elements_by_tag_name('span') 111 | span = get_next_span(spans) 112 | if span: 113 | span.click() 114 | time.sleep(SLEEPTIME) 115 | else: 116 | return 117 | except NoSuchElementException as _e: 118 | print(_e) 119 | return 120 | 121 | 122 | def lagou_company(browser, company_name, company_number): 123 | '''filter for certain company''' 124 | company_url = LAGOU_URL % int(company_number) 125 | company_job_list = [] 126 | browser.get(company_url) 127 | time.sleep(SLEEPTIME*3) 128 | while True: 129 | try: 130 | print(browser.title) 131 | con_filter_li = WebDriverWait(browser, SLEEPTIME)\ 132 | .until(lambda x: x.find_elements_by_class_name('con_filter_li')) 133 | for line in con_filter_li: 134 | print(line.text) 135 | if line.text == '全部': 136 | print('skip') 137 | continue 138 | line.click() 139 | time.sleep(SLEEPTIME) 140 | lagou_filter(browser, company_job_list, company_name, line.text) 141 | except NoSuchElementException as _e: 142 | print(_e) 143 | del company_job_list[:] 144 | # company_job_list.clear() only work for python3 145 | browser.refresh() 146 | time.sleep(SLEEPTIME*3) 147 | else: 148 | #save result to company file 149 | save_file = os.path.join(os.getcwd(), company_name + '.csv') 150 | with open(save_file, 'w', newline='') as save_file_handler: 151 | writer = csv.writer(save_file_handler) 152 | writer.writerow(JobInfo.header()) 153 | for job in company_job_list: 154 | writer.writerow(job.array()) 155 | return 156 | 157 | 158 | def lagou(browser, company_number): 159 | '''lagou entity: target one company or all.''' 160 | print('lagou start') 161 | for name, code in COMPANY.items(): 162 | if company_number is not None: 163 | if int(code) == int(company_number): 164 | lagou_company(browser, name, code) 165 | break 166 | else: 167 | lagou_company(browser, name, code) 168 | print('lagou end') 169 | 170 | 171 | if __name__ 
== '__main__': 172 | BROWSER = webdriver.Chrome() 173 | #implicitly_wait seems can not waiting for Ajax loading complete 174 | #_browser.implicitly_wait(TIMEOUT) 175 | SINGLE_COMPANY = None 176 | if len(sys.argv) > 1: 177 | SINGLE_COMPANY = sys.argv[1] 178 | lagou(BROWSER, SINGLE_COMPANY) 179 | BROWSER.quit() 180 | -------------------------------------------------------------------------------- /level/README.md: -------------------------------------------------------------------------------- 1 | ### Python leveldb Utils 常用方法封装。 2 | 3 | [leveldb](http://leveldb.org/)是Google开源的一个轻量级,高性能,KeyValue 存储数据库。作者是Google战神Jeff Dean,基于他自己的BigTable论文,使用C++ POSIX实现。 4 | 5 | > LevelDB is a light-weight, single-purpose library for persistence with bindings to many platforms. 6 | 7 | 官网 http://leveldb.org/ 8 | GitHub https://github.com/google/leveldb 9 | 官方 Javascript Binding https://github.com/Level/levelup 10 | 11 | ### Python Binding 12 | 13 | 早期官方仅提供C++和Javascript。Python实现均是第三方开发。其中使用较广泛和稳定的是 https://github.com/rjpower/py-leveldb 。目前处于稳定运行,维护停滞状态。 14 | 在Python爬虫实现过程中,常常需要快速简单处理存储大量数据至本地文件,构建SQL数据库表过于复杂,变更不灵活。使用JSON文本格式,缺乏索引,过滤,随机增删数据。因此leveldb是一种轻便快捷的最佳解决方案。 15 | 16 | 这里封装了一些常用方法,均是日常爬虫数据采集存储的常用方法。 17 | 18 | ### exist() 19 | 20 | 判断key是否存在于database中。返回Boolean值。 21 | 22 | ```Python 23 | def exist(db_src, key): 24 | try: 25 | db_src.Get(_key_obj) 26 | return True 27 | except KeyError: 28 | return False 29 | ``` 30 | 31 | ### count() 32 | 33 | 统计database中数据总量计数,支持key过滤子字符串,value过滤子字符串,返回总数和过滤后有效总数。 34 | 35 | ```Python 36 | def count(db_src, k_filter, v_filter): 37 | total, valid = 0, 0 38 | for _k, _v in db_src.RangeIter(): 39 | total += 1 40 | if k_filter: 41 | if _k.find(k_filter) == -1: 42 | continue 43 | if v_filter: 44 | if _v.find(v_filter) == -1: 45 | continue 46 | valid += 1 47 | return total, valid 48 | ``` 49 | 50 | ### copy() 51 | 52 | 从源库到目标库,拷贝database,支持key过滤子字符串。返回源库总数和过滤后有效拷贝总数。 53 | 54 | ```Python 55 | def copy(db_src, db_dst, k_filter): 56 | total, valid = 0, 0 57 | for _k, _v in db_src.RangeIter(): 58 | total += 1 59 | if k_filter: 60 | if _k.find(k_filter) != -1: 61 | valid += 1 62 | db_dst.Put(_k, _v, sync=True) 63 | else: 64 | valid += 1 65 | db_dst.Put(_k, _v, sync=True) 66 | return total, valid 67 | ``` 68 | 69 | ### delete() 70 | 71 | 删除目标库中与源库相同key的数据项。 72 | 73 | ```Python 74 | def delete(db_src, db_dst): 75 | for _k, _v in db_src.RangeIter(): 76 | db_dst.Delete(_k) 77 | ``` 78 | 79 | ### diff() 80 | 81 | 查找源库与目标库的key值差异数据项,存储至差异库。返回差异项总数。 82 | 83 | ```Python 84 | def diff(db_src, db_dst, db_diff): 85 | diff_count = 0 86 | for _k, _v in db_src.RangeIter(): 87 | if not exist(db_dst, _k): 88 | diff_count += 1 89 | db_diff.Put(_k, _v) 90 | return diff_count 91 | ``` 92 | 93 | ### clean_copy() 94 | 95 | 拷贝源库至目标库,并删除value值为空的数据项。返回拷贝总数。 96 | 97 | ```Python 98 | def clean_copy(db_src, db_dst): 99 | total = 0 100 | for _k, _v in db_src.RangeIter(): 101 | if _v: 102 | db_dst.Put(_k, _v) 103 | total += 1 104 | return total 105 | 106 | ``` 107 | 108 | ### dump() 109 | 110 | 打印输出当前数据库中所有key value数据。 111 | 安全兼容:当参数是字符串时,当作本地路径文件名处理,临时打开数据库。 112 | 113 | ```Python 114 | def dump(db_src): 115 | _db = leveldb.LevelDB(db_src, create_if_missing=False) if isinstance(db_src, str) else db_src 116 | for _k, _v in _db.RangeIter(): 117 | print(_k.decode(), _v.decode()) 118 | ``` 119 | 120 | ### db_to_text() 121 | 122 | 导出leveldb数据库至文本文件,以','分隔。 123 | 安全兼容:当参数是字符串时,当作本地路径文件名处理,临时打开数据库。 124 | 125 | ```Python 126 | def db_to_text(from_db, to_text): 127 | _db = leveldb.LevelDB(from_db, 
create_if_missing=False) if isinstance(from_db, str) else from_db 128 | with open(to_text, 'w', encoding='utf-8') as _f: 129 | for _k, _v in _db.RangeIter(): 130 | _f.write(_k.decode() + ',' + _v.decode() + '\n') 131 | ``` 132 | 133 | ### text_to_db() 134 | 135 | 从文本文件导入至leveldb数据库。参数支持自定义分隔符。 136 | 安全兼容:当参数是字符串时,当作本地路径文件名处理,临时打开数据库。 137 | 138 | ```Python 139 | def text_to_db(from_text, to_db, split_char): 140 | total, invalid = 0, 0 141 | _split = split_char if split_char else ',' 142 | _db = leveldb.LevelDB(to_db, create_if_missing=True) if isinstance(to_db, str) else to_db 143 | with open(from_text, 'r', encoding='utf-8') as _f: 144 | lines = _f.readlines() 145 | total = len(lines) 146 | for line in lines: 147 | if not line: 148 | invalid += 1 149 | continue 150 | # line = line.strip() 151 | if _split in line: 152 | _sub = line.split(_split, 1) 153 | _db.Put(_sub[0].encode('utf-8'), _sub[1].encode('utf-8')) 154 | else: 155 | _db.Put(line, '') 156 | return total, invalid 157 | ``` 158 | 159 | ### db_to_excel() 160 | 161 | 导出leveldb数据库至Excel文件,返回总数。 162 | Excel文件共2列,分别对应leveldb的Key,Value。 163 | 安全兼容:当参数是字符串时,当作本地路径文件名处理,临时打开数据库。 164 | 165 | ```Python 166 | def db_to_excel(from_db, to_excel): 167 | _db = leveldb.LevelDB(from_db, create_if_missing=False) if isinstance(from_db, str) else from_db 168 | _wb = Workbook() 169 | _ws = _wb.active 170 | total = 0 171 | for _k, _v in _db.RangeIter(): 172 | _ws.append([_k.decode(), _v.decode()]) 173 | total += 1 174 | _wb.save(to_excel) 175 | return total 176 | ``` 177 | 178 | ### excel_to_db() 179 | 180 | 从Excel文件导入至leveldb数据库。 181 | 仅读取Excel文件中的前2列数据,对应leveldb的Key,Value。 182 | 安全兼容:当参数是字符串时,当作本地路径文件名处理,临时打开数据库。 183 | 184 | ```Python 185 | def excel_to_db(from_excel, to_db): 186 | _wb = load_workbook(from_excel, read_only=True) 187 | _ws = _wb.active 188 | _db = leveldb.LevelDB(to_db, create_if_missing=True) if isinstance(to_db, str) else to_db 189 | total = 0 190 | for _row in _ws.iter_rows(min_row=2, min_col=1, max_col=1): 191 | if _row and _row[0] and _row[1]: 192 | _key, _value = '', '' 193 | if _row[0].data_type == cell.Cell.TYPE_STRING: 194 | _key = _row[0].value.encode('utf-8') 195 | _key = ''.join(_key.split()) 196 | if _row[1].data_type == cell.Cell.TYPE_STRING: 197 | _value = _row[0].value.encode('utf-8') 198 | _value = ''.join(_value.split()) 199 | _db.Put(_key, _value) 200 | total += 1 201 | 202 | _wb.close() 203 | return total 204 | ``` 205 | 206 | ### 后记 207 | 208 | 这篇文档整理完成时,leveldb官网已经推出官方Python版本。 209 | 详见 https://plyvel.readthedocs.io/en/latest/ 210 | -------------------------------------------------------------------------------- /level/levelhelper.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | # -*- coding: utf-8 -*- 3 | '''leveldb helper''' 4 | 5 | import leveldb 6 | from openpyxl import Workbook 7 | from openpyxl import load_workbook 8 | from openpyxl import cell 9 | 10 | 11 | def exist(db_src, key): 12 | '''Safely check whether key exist or not''' 13 | _key_obj = bytes(key.encode('utf-8')) if isinstance(key, str) else key 14 | try: 15 | db_src.Get(_key_obj) 16 | return True 17 | except KeyError: 18 | return False 19 | 20 | 21 | def count(db_src, k_filter, v_filter): 22 | '''Count database items, support key filter and/or value filter, return total and valid.''' 23 | total, valid = 0, 0 24 | for _k, _v in db_src.RangeIter(): 25 | total += 1 26 | if k_filter: 27 | if _k.find(k_filter) == -1: 28 | continue 29 | if v_filter: 30 | if _v.find(v_filter) == -1: 31 
|                 continue
32 |         valid += 1
33 |     return total, valid
34 | 
35 | 
36 | def copy(db_src, db_dst, k_filter):
37 |     '''copy db_src to db_dst, support key filter, return total and valid.'''
38 |     total, valid = 0, 0
39 |     for _k, _v in db_src.RangeIter():
40 |         total += 1
41 |         if k_filter:
42 |             if _k.find(k_filter) != -1:
43 |                 valid += 1
44 |                 db_dst.Put(_k, _v, sync=True)
45 |         else:
46 |             valid += 1
47 |             db_dst.Put(_k, _v, sync=True)
48 |     return total, valid
49 | 
50 | 
51 | def delete(db_src, db_dst):
52 |     '''Delete db_src items in db_dst.'''
53 |     for _k, _v in db_src.RangeIter():
54 |         db_dst.Delete(_k)
55 | 
56 | 
57 | def diff(db_src, db_dst, db_diff):
58 |     '''Find differences between db_src and db_dst, save to db_diff, return diff count.'''
59 |     diff_count = 0
60 |     for _k, _v in db_src.RangeIter():
61 |         if not exist(db_dst, _k):
62 |             diff_count += 1
63 |             db_diff.Put(_k, _v)
64 |     return diff_count
65 | 
66 | 
67 | def clean_copy(db_src, db_dst):
68 |     '''copy db_src to db_dst, clean empty value, return total count.'''
69 |     total = 0
70 |     for _k, _v in db_src.RangeIter():
71 |         if _v:
72 |             db_dst.Put(_k, _v)
73 |             total += 1
74 |     return total
75 | 
76 | 
77 | def dump(db_src):
78 |     '''Dump database key and value items.'''
79 |     _db = leveldb.LevelDB(db_src, create_if_missing=False) if isinstance(db_src, str) else db_src
80 |     for _k, _v in _db.RangeIter():
81 |         print(_k.decode(), _v.decode())
82 | 
83 | 
84 | def db_to_text(from_db, to_text):
85 |     '''Transfer leveldb to text file.'''
86 |     _db = leveldb.LevelDB(from_db, create_if_missing=False) if isinstance(from_db, str) else from_db
87 |     with open(to_text, 'w', encoding='utf-8') as _f:
88 |         for _k, _v in _db.RangeIter():
89 |             _f.write(_k.decode() + ',' + _v.decode() + '\n')
90 | 
91 | 
92 | def text_to_db(from_text, to_db, split_char):
93 |     '''Transfer text file to leveldb, return total and invalid count.'''
94 |     total, invalid = 0, 0
95 |     _split = split_char if split_char else ','
96 |     _db = leveldb.LevelDB(to_db, create_if_missing=True) if isinstance(to_db, str) else to_db
97 |     with open(from_text, 'r', encoding='utf-8') as _f:
98 |         lines = _f.readlines()
99 |         total = len(lines)
100 |         for line in lines:
101 |             if not line:
102 |                 invalid += 1
103 |                 continue
104 |             # line = line.strip()
105 |             if _split in line:
106 |                 _sub = line.split(_split, 1)
107 |                 _db.Put(_sub[0].encode('utf-8'), _sub[1].encode('utf-8'))
108 |             else:
109 |                 _db.Put(line.encode('utf-8'), b'')  # leveldb needs bytes, not str
110 |     return total, invalid
111 | 
112 | 
113 | def db_to_excel(from_db, to_excel):
114 |     '''Transfer leveldb to Excel file, return total count.'''
115 |     _db = leveldb.LevelDB(from_db, create_if_missing=False) if isinstance(from_db, str) else from_db
116 |     _wb = Workbook()
117 |     _ws = _wb.active
118 |     total = 0
119 |     for _k, _v in _db.RangeIter():
120 |         _ws.append([_k.decode(), _v.decode()])
121 |         total += 1
122 |     _wb.save(to_excel)
123 |     return total
124 | 
125 | 
126 | def excel_to_db(from_excel, to_db):
127 |     '''Transfer Excel file to leveldb, return total count.'''
128 |     _wb = load_workbook(from_excel, read_only=True)
129 |     _ws = _wb.active
130 |     _db = leveldb.LevelDB(to_db, create_if_missing=True) if isinstance(to_db, str) else to_db
131 |     total = 0
132 |     for _row in _ws.iter_rows(min_row=2, min_col=1, max_col=2):  # was max_col=1, which made _row[1] raise IndexError
133 |         if _row and _row[0] and _row[1]:
134 |             _key, _value = b'', b''
135 |             if _row[0].data_type == cell.Cell.TYPE_STRING:
136 |                 _key = ''.join(_row[0].value.split())
137 |                 _key = _key.encode('utf-8')
138 |             if _row[1].data_type == cell.Cell.TYPE_STRING:
139 |                 _value = ''.join(_row[1].value.split())  # was _row[0], which copied the key into the value
140 |                 _value = _value.encode('utf-8')
141 |             _db.Put(_key, _value)
142 |             total += 1
143 | 
144 |     _wb.close()
145 |     return total
146 | 
-------------------------------------------------------------------------------- /monkeyrunner/README.md: --------------------------------------------------------------------------------
1 | # MonkeyRunner is DEAD
2 | 
3 | ## UI Automator
4 | 
5 | https://developer.android.com/training/testing/ui-automator
6 | 
7 | Android 平台所有自动化测试框架的底层实现都依赖官方提供的 UI Automator 测试框架,适用于跨系统和已安装应用程序的跨应用程序功能UI测试。主要功能包括三部分:
8 | 
9 | + UI Automator Viewer 检查布局层次结构的查看器。
10 | + UiDevice 设备状态信息并在目标设备上执行操作的API。
11 | + UI Automator API 支持跨应用程序UI测试的API。
12 | 
13 | ## UI Automator Viewer
14 | 
15 | PC 端 GUI 工具,扫描和分析 Android 设备上当前显示的 UI 组件。展示 UI 布局层次结构,查看设备上当前对用户可见的 UI 组件的属性。从名称可以看出,它是 UI Automator 的只读功能部分,即只能查看 UI 组件的树形结构和属性,不能操作控制 UI 组件。
16 | 
17 | `uiautomatorviewer` 位于 `/tools/bin` 目录。
18 | 启动入口是一个bash文件,实际调用 `/tools/lib` 目录的 `uiautomatorviewer-26.0.0-dev.jar` 。
19 | GUI 基于 Eclipse + SWT 实现,使用 Gradle 构建。
20 | 系列工具源码在 `https://android.googlesource.com/platform/tools/swt/` 。
21 | 依赖 `https://android.googlesource.com/platform/tools/base/` 。
22 | 活跃分支: `mirror-goog-studio-master-dev` 。
23 | 该仓库还包含以下工具。
24 | 
25 | + chimpchat
26 | + ddms
27 | + hierarchyviewer2
28 | + monkeyrunner
29 | + swtmenubar
30 | + traceview
31 | 
32 | 其内部实现基于 `adb shell uiautomator dump` 。从源码仓库提交记录看,主要功能开发的活跃时间是 2014-2015,2016之后已经很少更新维护。那个年代的 Android 开发主要使用 Eclipse , 所以基于 SWT 实现多平台 PC GUI ,在当时合理。
33 | 
34 | 该工具实际使用运行不稳定,极易报错。
35 | > `Error while obtaining UI hierarchy XML file: com.android.ddmlib.SyncException: Remote object doesn't exist!`
36 | 
37 | 错误原因通常是:
38 | 
39 | + adb 连接通道不稳定。
40 | + 机型兼容性问题,权限问题。
41 | + 当前手机应用程序界面处于动态,例如播放视频,动画。并且10秒超时时间仍未进入静态。
42 | 
43 | 分析源码可知,错误都源于 `Android Framework uiautomator` 。
44 | 
45 | ## MonkeyRunner
46 | 
47 | https://developer.android.com/studio/test/monkeyrunner
48 | 
49 | 官方提供的另外一个工具,封装 uiautomator API,供 Python 脚本调用,也可注入 java 扩展插件。
50 | 相比 `uiautomatorviewer` 和 `uiautomator` 命令行工具,可编程扩展性更佳。
51 | MonkeyRunner 使用了比较冷门的 Jython 实现。
52 | 
53 | ### 1. 启动运行入口
54 | 
55 | > monkeyrunner -plugin 
56 | 
57 | monkeyrunner 是一个bash文件,位于 `/tools/bin` ,启动调用 `/tools/lib/monkeyrunner-26.0.0-dev.jar` 。
58 | 
59 | ```bash
60 | export ANDROID_HOME="~/Library/Android/sdk"
61 | $ANDROID_HOME/tools/bin/monkeyrunner uiparser.py
62 | ```
63 | 
64 | ### 2. 主要方法
65 | 
66 | #### MonkeyDevice.getProperty()
67 | 
68 | 等同于调用 `adb shell getprop ` 。获取设备系统环境变量。
69 | 不同厂商的设备,key可能不同。针对具体测试机型,可使用 `adb shell getprop` ,显示所有系统环境变量的key。
70 | 
71 | #### MonkeyDevice.shell()
72 | 
73 | 等同于调用`adb shell`命令。
74 | 
75 | ### 3. 
缺陷 76 | 77 | MonkeyRunner 基于 Jython 2.5.3 。看上去结合了Java和Python的优势,实际对于Java和Python编程都不友好。 78 | 79 | + Jython 2.5.3 过时,主流的Python 3.x和2.7的很多语法和库无法使用。 80 | + 使用vscode等编辑器编码时,缺少智能提示和自动补全。编辑器和pylint无法识别导入的库, 例如 `from com.android.monkeyrunner import MonkeyRunner, MonkeyDevice, MonkeyImage` 。 81 | + Jython 似乎不能像常规的python程序一样引用外部库。实测只能使用 MonkeyRunner 内置的 `os, sys, subprocess` 等库。 82 | + Java extend plugin 能做的事情较少。 83 | 84 | MonkeyRunner 实际仍然是使用 `adb shell` 和其中的 `uiautomator` 命令获取UI组件状态和属性。所以它跟 `UI Automator Viewer` 一样受限于 `uiautomator` 本身的缺陷,导致运行不稳定。 85 | 86 | ## adb shell uiautomator 87 | 88 | **adb** 89 | https://developer.android.google.cn/studio/command-line/adb 90 | 91 | **adb shell am** 92 | https://developer.android.google.cn/studio/command-line/adb#am 93 | 使用 Activity Manager (am) 工具发出命令以执行各种系统操作,如启动 Activity、强行停止进程、广播 intent、修改设备屏幕属性及其他操作。 94 | 95 | **adb shell pm** 96 | https://developer.android.google.cn/studio/command-line/adb#pm 97 | 使用软件包管理器 Package Manager (pm) 工具发出命令,安装,卸载,查询安装包。 98 | 99 | **adb shell uiatomator** 100 | 官网相关页面已被删除,仅能从搜索引擎历史快照中找到。猜测可能近期会有变更,或者官方建议不再使用。 101 | 通过执行命令可以查看使用方法和参数。 102 | 103 | ```bash 104 | Usage: uiautomator [options] 105 | 106 | Available subcommands: 107 | 108 | help: displays help message 109 | 110 | runtest: executes UI automation tests 111 | runtest [options] 112 | : < -c | -e class > 113 | : a list of jar files containing test classes and dependencies. If 114 | the path is relative, it's assumed to be under /data/local/tmp. Use 115 | absolute path if the file is elsewhere. Multiple files can be 116 | specified, separated by space. 117 | : a list of test class names to run, separated by comma. To 118 | a single method, use TestClass#testMethod format. The -e or -c option 119 | may be repeated. This option is not required and if not provided then 120 | all the tests in provided jars will be run automatically. 121 | options: 122 | --nohup: trap SIG_HUP, so test won't terminate even if parent process 123 | is terminated, e.g. USB is disconnected. 124 | -e debug [true|false]: wait for debugger to connect before starting. 125 | -e runner [CLASS]: use specified test runner class instead. If 126 | unspecified, framework default runner will be used. 127 | -e : other name-value pairs to be passed to test classes. 128 | May be repeated. 129 | -e outputFormat simple | -s: enabled less verbose JUnit style output. 130 | 131 | dump: creates an XML dump of current UI hierarchy 132 | dump [--verbose][file] 133 | [--compressed]: dumps compressed layout information. 
134 | [file]: the location where the dumped XML should be stored, default is 135 | /sdcard/window_dump.xml 136 | 137 | events: prints out accessibility events until terminated 138 | ``` 139 | 140 | ### uiautomator 缺陷 141 | 142 | 运行耗时长,失败率高,频繁报错。 143 | `ERROR: could not get idle state.` 通常表示当前UI处于动态渲染刷新期间,例如正在播放视频,动画。在10秒超时时间内仍未进入静态。因为此时 UI 树的节点对象快速变化中,不能稳定获取。 144 | 145 | ### uiautomator 源码 146 | 147 | PC端工具源码位于仓库 https://android.googlesource.com/platform/frameworks/testing/ `master` 分支。 148 | 最新更新于 2014.11.14。之后活跃分支变更为 `android-support-test` 分支。`uiautomator` 源码被移除,改成 `android.support.test library, expresso` 等工具的源码工程。 149 | 手机端框架源码位于仓库 https://android.googlesource.com/platform/frameworks/base/ `master` 分支。 150 | `uiAutomation.waitForIdle(1000, 1000 * 10);` 是报错的关键代码,即单次超时等待1秒,最长超时等待10秒。超时抛出异常。 151 | 152 | `DumpCommand.java` 153 | > https://android.googlesource.com/platform/frameworks/testing/+/master/uiautomator/cmds/uiautomator/src/com/android/commands/uiautomator/DumpCommand.java 154 | 155 | ```Java 156 | // It appears that the bridge needs time to be ready. Making calls to the 157 | // bridge immediately after connecting seems to cause exceptions. So let's also 158 | // do a wait for idle in case the app is busy. 159 | try { 160 | UiAutomation uiAutomation = automationWrapper.getUiAutomation(); 161 | uiAutomation.waitForIdle(1000, 1000 * 10); 162 | AccessibilityNodeInfo info = uiAutomation.getRootInActiveWindow(); 163 | if (info == null) { 164 | System.err.println("ERROR: null root node returned by UiTestAutomationBridge."); 165 | return; 166 | } 167 | Display display = 168 | DisplayManagerGlobal.getInstance().getRealDisplay(Display.DEFAULT_DISPLAY); 169 | int rotation = display.getRotation(); 170 | Point size = new Point(); 171 | display.getSize(size); 172 | AccessibilityNodeInfoDumper.dumpWindowToFile(info, dumpFile, rotation, size.x, size.y); 173 | } catch (TimeoutException re) { 174 | System.err.println("ERROR: could not get idle state."); 175 | return; 176 | } finally { 177 | automationWrapper.disconnect(); 178 | } 179 | System.out.println( 180 | String.format("UI hierchary dumped to: %s", dumpFile.getAbsolutePath())); 181 | ``` 182 | 183 | `UiAutomation.java` 184 | > https://android.googlesource.com/platform/frameworks/base/+/master/core/java/android/app/UiAutomation.java 185 | 186 | ```Java 187 | /** 188 | * Waits for the accessibility event stream to become idle, which is not to 189 | * have received an accessibility event within idleTimeoutMillis. 190 | * The total time spent to wait for an idle accessibility event stream is bounded 191 | * by the globalTimeoutMillis. 192 | * 193 | * @param idleTimeoutMillis The timeout in milliseconds between two events 194 | * to consider the device idle. 195 | * @param globalTimeoutMillis The maximal global timeout in milliseconds in 196 | * which to wait for an idle state. 197 | * 198 | * @throws TimeoutException If no idle state was detected within 199 | * globalTimeoutMillis. 200 | */ 201 | public void waitForIdle(long idleTimeoutMillis, long globalTimeoutMillis) 202 | throws TimeoutException { 203 | synchronized (mLock) { 204 | throwIfNotConnectedLocked(); 205 | final long startTimeMillis = SystemClock.uptimeMillis(); 206 | if (mLastEventTimeMillis <= 0) { 207 | mLastEventTimeMillis = startTimeMillis; 208 | } 209 | while (true) { 210 | final long currentTimeMillis = SystemClock.uptimeMillis(); 211 | // Did we get idle state within the global timeout? 
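                // (DumpCommand above passes idleTimeoutMillis = 1000, globalTimeoutMillis = 1000 * 10:
                // at most 10 s of waiting for a 1 s quiet window)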
212 | final long elapsedGlobalTimeMillis = currentTimeMillis - startTimeMillis; 213 | final long remainingGlobalTimeMillis = 214 | globalTimeoutMillis - elapsedGlobalTimeMillis; 215 | if (remainingGlobalTimeMillis <= 0) { 216 | throw new TimeoutException("No idle state with idle timeout: " 217 | + idleTimeoutMillis + " within global timeout: " 218 | + globalTimeoutMillis); 219 | } 220 | // Did we get an idle state within the idle timeout? 221 | final long elapsedIdleTimeMillis = currentTimeMillis - mLastEventTimeMillis; 222 | final long remainingIdleTimeMillis = idleTimeoutMillis - elapsedIdleTimeMillis; 223 | if (remainingIdleTimeMillis <= 0) { 224 | return; 225 | } 226 | try { 227 | mLock.wait(remainingIdleTimeMillis); 228 | } catch (InterruptedException ie) { 229 | /* ignore */ 230 | } 231 | } 232 | } 233 | } 234 | ``` 235 | 236 | ## Android Device Monitor 237 | 238 | https://developer.android.com/studio/profile/monitor 239 | 240 | Android SDK 工具集的 `Android Device Monitor` 已废弃。 241 | 242 | >Android Device Monitor was deprecated in Android Studio 3.1 and removed from Android Studio 3.2. The features that you could use through the Android Device Monitor have been replaced by new features. The table below helps you decide which features you should use instead of these deprecated and removed features. 243 | 244 | 官方给出的替代品 `Layout Inspector` 功能更强大,界面也更美观,但目前还不成熟,相比 iOS 神器 [Reveal](https://revealapp.com/) , 仍需努力。 245 | https://developer.android.com/studio/debug/layout-inspector 246 | 247 | ## uiparser 248 | 249 | 参照 MonkeyRunner 官方文档实现的 Python Demo。 250 | 251 | https://github.com/9468305/python-script/tree/master/monkeyrunner 252 | -------------------------------------------------------------------------------- /monkeyrunner/uiparser.py: -------------------------------------------------------------------------------- 1 | #! $ANDROID_HOME/tools/bin monkeyrunner 2 | # -*- coding: utf-8 -*- 3 | '''uiparser''' 4 | 5 | import os 6 | import sys 7 | import subprocess 8 | import datetime 9 | import logging 10 | from com.android.monkeyrunner import MonkeyRunner, MonkeyDevice, MonkeyImage #pylint: disable=import-error 11 | 12 | class NullHandler(logging.Handler): 13 | def emit(self, record): 14 | pass 15 | 16 | logging.getLogger(__name__).addHandler(NullHandler()) 17 | logging.basicConfig(level=logging.DEBUG) 18 | 19 | SHORT = 1 20 | MIDDLE = 5 21 | LONG = 15 22 | 23 | ADB = os.path.join(os.environ['ANDROID_HOME'], 'platform-tools', 'adb') 24 | 25 | # Example of Ctrip Android Apk 26 | TARGET_PACKAGE = 'ctrip.android.view' 27 | LAUNCH_ACTIVITY = 'ctrip.business.splash.CtripSplashActivity' 28 | HOME_ACTIVITY = 'ctrip.android.publicproduct.home.view.CtripHomeActivity' 29 | FLIGHT_ACTIVITY = 'ctrip.android.flight.view.inland.FlightInquireActivity' 30 | START_COMPONENT = TARGET_PACKAGE + '/' + LAUNCH_ACTIVITY 31 | 32 | DEVICE_DIR = '/sdcard/uiparser/' 33 | HOST_DIR = './' 34 | 35 | 36 | def capture(device, index): 37 | '''''' 38 | _dumpXML = DEVICE_DIR + index + '.xml' 39 | _localXML = HOST_DIR + index + '.xml' 40 | _localImage = HOST_DIR + index + '.png' 41 | 42 | _shell = [ADB, 'shell', 'uiautomator', 'dump', _dumpXML] 43 | logging.debug(datetime.datetime.now()) 44 | subprocess.call(_shell) # Stupid uiautomator, always failed here! 
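    # Likely cause, per the uiautomator analysis above: 'dump' waits via waitForIdle(1000, 1000 * 10),
    # so a UI that keeps animating never yields a 1 s idle window within the 10 s budget
    # and fails with 'ERROR: could not get idle state.'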
45 | logging.debug(datetime.datetime.now()) 46 | #MonkeyRunner.sleep(MIDDLE) 47 | 48 | _shell = [ADB, 'pull', _dumpXML, _localXML] 49 | subprocess.call(_shell) 50 | 51 | _image = device.takeSnapshot() 52 | _image.writeToFile(_localImage, 'png') 53 | 54 | 55 | def uiparser(): 56 | '''Main Entry''' 57 | device = MonkeyRunner.waitForConnection(MIDDLE) 58 | 59 | _shell = [ADB, 'shell', 'rm', '-rf', DEVICE_DIR] 60 | subprocess.call(_shell) 61 | 62 | _shell = [ADB, 'shell', 'mkdir', '-p', DEVICE_DIR] 63 | subprocess.call(_shell) 64 | 65 | device.startActivity(component=START_COMPONENT) 66 | MonkeyRunner.sleep(MIDDLE) 67 | 68 | capture(device, str(0)) 69 | 70 | 71 | if __name__ == "__main__": 72 | # MonkeyRunner Jython version is 2.5.3 (Outdated!) 73 | logging.info(sys.version) 74 | uiparser() 75 | -------------------------------------------------------------------------------- /nacao_v1/README.md: -------------------------------------------------------------------------------- 1 | 这次的爬虫目标网站是全国组织机构代码管理中心 http://www.nacao.org.cn 。 2 | 搜索关键字是公司中文名称,搜索结果是公司对应的统一社会信用代码(税号)。 3 | 代码实现非常简单,仅需2个HTTP请求,1个搜索公司列表,1个分页查询结果。 4 | 动手编码之前,先来看看该网站的前端设计实现,有哪些值得吐槽的技术点。 5 | 6 | ### 槽点1:网站开放服务时间 每天12小时 7 | 8 | >重要公告:网站核查平台服务时间为7*12小时(即每天8:00-20:00) 9 | 10 | [知乎:为什么全国组织机构代码管理中心网站(www.nacao.org.cn)只在上班时间开放查询呢?](https://www.zhihu.com/question/33204926) 11 | 网站页面Javascript代码直接写死时间判断逻辑,超出规定时间就直接报错。而服务器始终正常运行。 12 | 第一期,技术人员直接调用浏览器系统时间进行拦截判断。然后用户学会了修改电脑时间。 13 | 第二期,技术人员使用服务器时间进行拦截判断。然后用户学会了绕过Javascript验证。 14 | 第三期,技术人员直接注释了前端Javascript拦截代码。(服务器是否拦截判断,未核实。) 15 | 16 | ### 槽点2:信息核查系统 vs 信息校核结果公示系统 17 | 18 | 首页从上至下,分为2个查询系统。 19 | 第一个是`全国统一社会信用代码信息核查系统`。输入关键字,弹出英文字母图片验证码。 20 | 第二个是`全国统一社会信用代码信息校核结果公示系统`。输入关键字,直接跳转结果页面!!! 21 | 查询数据结果对比,基本一致。那么谁还去用第一个图片验证码系统??? 22 | 多次测试后发现,第一个系统还有IP反爬机制,一旦封禁,3工作日解封。第二个系统任意使用无限制。 23 | 24 | ```txt 25 | IP被封怎么办? 
26 | 本网站核查平台是为公众基于统一社会信用代码信息的一般性核查设立的,不支持大量且频繁的查询请求,如查询过程中出现下图 所示,说明您的查询过于频繁,系统已经对您进行了查询限制,限制期为3个工作日。请于限制期后再访问系统进行查询。 27 | ``` 28 | 29 | ### 槽点3:存在疑似SQL注入漏洞 30 | 31 | 在分析SQL注入漏洞之前,先来看看系统2的爬虫实现,即HTTP参数的含义解释。 32 | 33 | #### 关键字查询 34 | 35 | GET http://125.35.63.141:8080/PublicNotificationWeb/search.do 36 | 参数:searchText = 关键字,searchType = 3。 37 | 响应:返回一个字符串Referer。用于query.do的HTTP header参数。 38 | 39 | #### 分页获取数据 40 | 41 | POST http://125.35.63.141:8080/PublicNotificationWeb/query.do 42 | 参数: 43 | 44 | ```python 45 | _params = [ ('pageSize', 20), 46 | ('searchText', keyword), 47 | ('searchType', 3), 48 | ('DJBMBM', ''), 49 | ('sortField', ''), 50 | ('currentPage', 1)] 51 | ``` 52 | 53 | 含义: 54 | 55 | + pageSize 分页数据量,默认20,实测可以改成100。即一页返回100项数据。 56 | + searchText 搜索关键字 57 | + searchType 搜索类型,固定=3 58 | + DJBMBM 未知,固定为空 59 | + sortField 未知,固定为空 60 | + currentPage 当前分页索引,实测最多5页,值范围0-4。 61 | 62 | 响应:返回一个JSON字符串。 63 | 64 | + totalPage 总页数,最大5页。 65 | + foundCount 总共查询结果数量 66 | + dataList JSON数组 67 | + JGMC 公司名称 68 | + TYSHXYDM 统一社会信用代码(税号) 69 | 70 | 至此,爬虫已经实现完毕。接下来探索每个参数的可能性。 71 | 72 | + pageSize 从20改成100,最多可一次获得500条数据。 73 | + searchText 搜索关键字改成*,可获得服务器数据库默认排序数据。此时foundCount等于官网数据库数据总量。 74 | 75 | 实测数据日志见[sql_injection.txt](https://github.com/9468305/python-script/tree/master/nacao_v1/sql_injection.txt)。 76 | 所以这里能否使用SQLInjection,获得数据库访问权限,直接拖库呢?留待有心人探索。 77 | 78 | ### 后记 79 | 80 | 官网进行了改版,原方案接口失效。新方案V2.0: 81 | https://github.com/9468305/python-script/tree/master/nacao_v2 82 | -------------------------------------------------------------------------------- /nacao_v1/constants.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | # -*- coding: utf-8 -*- 3 | ''' 4 | HTTP Request 常用常量 5 | ''' 6 | 7 | ACCEPT_ANY = '*/*' 8 | 9 | ACCEPT_TEXT = 'text/plain, */*; q=0.01' 10 | 11 | ACCEPT_HTML = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' 12 | 13 | ACCEPT_JSON = 'application/json, text/javascript, */*; q=0.01' 14 | 15 | ACCEPT_IMAGE = 'image/webp,image/*,*/*;q=0.8' 16 | 17 | ACCEPT_LANGUAGE = 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2' 18 | 19 | UA_CHROME_WIN = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36' 20 | 21 | UA_CHROME_MAC = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36' 22 | 23 | USER_AGENT = UA_CHROME_MAC 24 | -------------------------------------------------------------------------------- /nacao_v1/nacao_v1.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | # -*- coding: utf-8 -*- 3 | ''' 4 | 全国组织机构代码管理中心 5 | http://www.nacao.org.cn/ 6 | 全国统一社会信用代码信息校核结果公示系统 7 | http://125.35.63.141:8080/PublicNotificationWeb/search.do 8 | ''' 9 | 10 | import logging 11 | from logging import NullHandler 12 | import requests 13 | import constants 14 | 15 | logging.getLogger(__name__).addHandler(NullHandler()) 16 | logging.basicConfig(level=logging.DEBUG) 17 | 18 | HOST = 'http://125.35.63.141:8080' 19 | 20 | # 有时响应很慢,需要随时调整超时时间阈值 21 | TIMEOUT = 20 22 | 23 | 24 | def get_search(session, keyword): 25 | '''查询keyword''' 26 | _url = HOST + '/PublicNotificationWeb/search.do' 27 | logging.debug('GET ' + _url) 28 | _headers = {'Accept': constants.ACCEPT_HTML, 29 | 'Accept-Language': constants.ACCEPT_LANGUAGE, 30 | 'User-Agent': constants.USER_AGENT} 31 | _params = [('searchText', keyword), 32 | ('searchType', 3)] 33 | 
### Postscript

The official site has been redesigned and the endpoints used here no longer work. The new V2.0 approach:
https://github.com/9468305/python-script/tree/master/nacao_v2

--------------------------------------------------------------------------------
/nacao_v1/constants.py:
--------------------------------------------------------------------------------
#!/usr/local/bin/python3
# -*- coding: utf-8 -*-
'''
Commonly used constants for HTTP request headers
'''

ACCEPT_ANY = '*/*'

ACCEPT_TEXT = 'text/plain, */*; q=0.01'

ACCEPT_HTML = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'

ACCEPT_JSON = 'application/json, text/javascript, */*; q=0.01'

ACCEPT_IMAGE = 'image/webp,image/*,*/*;q=0.8'

ACCEPT_LANGUAGE = 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2'

UA_CHROME_WIN = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'

UA_CHROME_MAC = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'

USER_AGENT = UA_CHROME_MAC

--------------------------------------------------------------------------------
/nacao_v1/nacao_v1.py:
--------------------------------------------------------------------------------
#!/usr/local/bin/python3
# -*- coding: utf-8 -*-
'''
全国组织机构代码管理中心 (National Organization Code Administration Center)
http://www.nacao.org.cn/
全国统一社会信用代码信息校核结果公示系统 (public notification system for Unified Social Credit Code verification results)
http://125.35.63.141:8080/PublicNotificationWeb/search.do
'''

import logging
from logging import NullHandler
import requests
import constants

logging.getLogger(__name__).addHandler(NullHandler())
logging.basicConfig(level=logging.DEBUG)

HOST = 'http://125.35.63.141:8080'

# Responses are sometimes very slow; adjust this timeout threshold as needed.
TIMEOUT = 20


def get_search(session, keyword):
    '''Search for keyword; the final URL serves as the Referer for query.do.'''
    _url = HOST + '/PublicNotificationWeb/search.do'
    logging.debug('GET ' + _url)
    _headers = {'Accept': constants.ACCEPT_HTML,
                'Accept-Language': constants.ACCEPT_LANGUAGE,
                'User-Agent': constants.USER_AGENT}
    _params = [('searchText', keyword),
               ('searchType', 3)]
    _response = session.get(_url, headers=_headers, params=_params, timeout=TIMEOUT)
    logging.debug('response code: ' + str(_response.status_code))
    return _response.url if _response.status_code == 200 else None


def post_query(session, keyword, referer, current_page):
    '''Fetch one page of JSON data for keyword.'''
    _url = HOST + '/PublicNotificationWeb/query.do'
    logging.debug('POST ' + _url)
    _headers = {'Accept': constants.ACCEPT_ANY,
                'Accept-Language': constants.ACCEPT_LANGUAGE,
                'User-Agent': constants.USER_AGENT,
                'Referer': referer,
                'X-Requested-With': 'XMLHttpRequest',
                'Origin': HOST}
    _params = [('pageSize', 100),
               ('searchText', keyword),
               ('searchType', 3),
               ('DJBMBM', ''),
               ('sortField', ''),
               # The first page is requested with an empty value.
               ('currentPage', current_page if current_page > 1 else '')]
    _response = session.post(_url, headers=_headers, data=_params, timeout=TIMEOUT)
    logging.debug('response code: ' + str(_response.status_code))
    logging.debug('response text: ' + _response.text)
    return _response.json() if _response.status_code == 200 else None


def query_keyword(session, keyword):
    '''Query keyword with session, collecting all pages.'''
    _referer = get_search(session, keyword)
    if not _referer:
        return None

    _code_all = []
    _current_page = 0
    _total_page = 5  # observed maximum
    while _current_page < _total_page:
        _current_page += 1
        _json_obj = post_query(session, keyword, _referer, _current_page)
        if _json_obj:
            _total_page = _json_obj['totalPage']
            _found_count = _json_obj['foundCount']
            _data_list = _json_obj['dataList']
            if _found_count and _data_list:
                for _i in _data_list:
                    # Python 3 strings are already Unicode; encoding to bytes here
                    # would break the str concatenation in query() below.
                    _code_all.append((_i['JGMC'], _i['TYSHXYDM']))
        else:
            break

    return _code_all


def query():
    '''query entry'''
    try:
        with requests.Session() as session:
            # '*' is a wildcard and can be replaced with any company name.
            # The endpoint may be vulnerable to SQL injection; not verified in depth.
            _code_all = query_keyword(session, '*')
            if _code_all:
                logging.info(len(_code_all))
                for _r in _code_all:
                    logging.info(_r[0] + ' : ' + _r[1])
    except requests.RequestException as _e:
        logging.error(_e)


if __name__ == "__main__":
    query()

--------------------------------------------------------------------------------
/nacao_v2/README.md:
--------------------------------------------------------------------------------
Picking up where the previous post left off: the [National Organization Code Administration Center](http://www.nacao.org.cn) site was redesigned, and the V1.0 approach no longer works. Let's analyze what changed in this redesign and how the V2.0 approach handles it.

### IP replaced by a domain

125.35.63.141:8080 became dmedu.org.cn (the domain still resolves to 125.35.63.141), so the two API endpoints changed.

Old:

http://125.35.63.141:8080/PublicNotificationWeb/search.do
http://125.35.63.141:8080/PublicNotificationWeb/query.do

New:

http://www.dmedu.org.cn/search.do
http://www.dmedu.org.cn/query.do

### search.do can be bypassed

search.do returns a Referer string used as an HTTP header parameter for query.do. In practice it is no longer needed: query.do can be called directly with an empty Referer.

### Parameters unchanged

+ The V1.0 and V2.0 parameters are identical.
+ pageSize can still be raised from 20 to 100.
+ The * wildcard in searchText still works.

**So the V2.0 code is even simpler**, as the sketch below shows.
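Under the same caveat (the endpoint has since been retired), a minimal sketch of the simplified V2.0 call, condensed from nacao_v2.py below:

```python
import requests

# No search.do round-trip and no Referer header: query.do answers directly.
resp = requests.post('http://www.dmedu.org.cn/query.do',
                     headers={'X-Requested-With': 'XMLHttpRequest'},
                     data={'pageSize': 100, 'searchText': '*', 'searchType': 3,
                           'DJBMBM': '', 'sortField': '', 'currentPage': ''},
                     timeout=20)
print(resp.json()['totalPage'])
```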
### Postscript

The official site has since added an image CAPTCHA to this endpoint, so this approach no longer works either. For recognizing image CAPTCHAs, search for "远程打码" (remote human-powered CAPTCHA solving).

--------------------------------------------------------------------------------
/nacao_v2/constants.py:
--------------------------------------------------------------------------------
#!/usr/local/bin/python3
# -*- coding: utf-8 -*-
'''
Commonly used constants for HTTP request headers
'''

ACCEPT_ANY = '*/*'

ACCEPT_TEXT = 'text/plain, */*; q=0.01'

ACCEPT_HTML = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'

ACCEPT_JSON = 'application/json, text/javascript, */*; q=0.01'

ACCEPT_IMAGE = 'image/webp,image/*,*/*;q=0.8'

ACCEPT_LANGUAGE = 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2'

UA_CHROME_WIN = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'

UA_CHROME_MAC = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'

USER_AGENT = UA_CHROME_MAC

--------------------------------------------------------------------------------
/nacao_v2/nacao_v2.py:
--------------------------------------------------------------------------------
#!/usr/local/bin/python3
# -*- coding: utf-8 -*-
'''
全国组织机构代码管理中心 (National Organization Code Administration Center)
http://www.nacao.org.cn/
全国统一社会信用代码信息校核结果公示系统 (public notification system for Unified Social Credit Code verification results)
http://www.dmedu.org.cn/query.do
'''

import json
import logging
from logging import NullHandler
import requests
import constants

logging.getLogger(__name__).addHandler(NullHandler())
logging.basicConfig(level=logging.INFO)

HOST = 'http://www.dmedu.org.cn'

# Responses are sometimes very slow; adjust this timeout threshold as needed.
TIMEOUT = 20

def post_query(session, keyword, current_page):
    '''Fetch one page of JSON data for keyword.'''
    _url = HOST + '/query.do'
    logging.debug('POST ' + _url)
    _headers = {'Accept': constants.ACCEPT_ANY,
                'Accept-Language': constants.ACCEPT_LANGUAGE,
                'User-Agent': constants.USER_AGENT,
                # The Referer from search.do is no longer required (see README).
                #'Referer': referer,
                'X-Requested-With': 'XMLHttpRequest',
                'Origin': HOST}
    _params = [('pageSize', 100),
               ('searchText', keyword),
               ('searchType', 3),
               ('DJBMBM', ''),
               ('sortField', ''),
               # The first page is requested with an empty value.
               ('currentPage', current_page if current_page > 1 else '')]
    _response = session.post(_url, headers=_headers, data=_params, timeout=TIMEOUT)
    logging.debug('response code: ' + str(_response.status_code))
    logging.debug('response text: ' + _response.text)
    return _response.json() if _response.status_code == 200 else None


def query_keyword(session, keyword):
    '''Query keyword, printing each page of results as JSON.'''
    _current_page = 0
    _total_page = 5  # observed maximum
    while _current_page < _total_page:
        _current_page += 1
        _json_obj = post_query(session, keyword, _current_page)
        if _json_obj:
            _total_page = _json_obj['totalPage']
            print(json.dumps(_json_obj, indent=2, sort_keys=True, ensure_ascii=False))
        else:
            break


def query():
    '''query entry'''
    try:
        with requests.Session() as session:
            # '*' is a wildcard and can be replaced with any company name.
            # The endpoint may be vulnerable to SQL injection; not verified in depth.
            query_keyword(session, '*')
    except requests.RequestException as _e:
        logging.error(_e)


if __name__ == "__main__":
    query()

--------------------------------------------------------------------------------