├── .gitignore
├── LICENSE
├── PageSpeedInsights
│   ├── .gitignore
│   ├── README.md
│   ├── image
│   │   ├── 1_score.png
│   │   ├── 2_real_data.png
│   │   ├── 3_lab_data.png
│   │   ├── 4_opportunities.png
│   │   ├── 5_diagnostics.png
│   │   ├── 6_passed_audits.png
│   │   ├── gcp_cs_create.jpg
│   │   ├── gcp_cs_list.jpg
│   │   ├── gcp_pubsub_topic_list.jpg
│   │   ├── psi.mermaid.svg
│   │   └── psi.png
│   ├── job.py
│   ├── main.py
│   ├── psi.mermaid
│   ├── psi.py
│   ├── requirements.txt
│   └── zip.sh
├── README.md
├── auc_pr_roc
│   ├── README.md
│   └── auc_pr_roc.py
├── excel_combine
│   ├── README.md
│   └── excel_combine.py
├── geetest_offline
│   ├── README.md
│   ├── README_gd.md
│   ├── constants.py
│   ├── gd_list.json
│   ├── geetest_offline.py
│   ├── geetest_offline_gd.py
│   ├── geetest_offline_nm.py
│   └── util.py
├── geetest_online
│   ├── README.md
│   ├── constants.py
│   ├── geetest_online.py
│   ├── image
│   │   ├── bg.jpg
│   │   ├── bg.webp
│   │   ├── fullbg.jpg
│   │   ├── fullbg.webp
│   │   └── slice.webp
│   ├── test
│   │   ├── TraceSample01.txt
│   │   ├── TraceSample01Parse.txt
│   │   ├── TraceSample02.txt
│   │   ├── TraceSample02Parse.txt
│   │   ├── TraceSample03.txt
│   │   ├── TraceSample03Parse.txt
│   │   ├── TraceSample04.txt
│   │   ├── TraceSample04Parse.txt
│   │   ├── test_pyexecjs.py
│   │   ├── test_token.py
│   │   └── testgeetestjs.py
│   └── util.py
├── gitstats
│   ├── README.md
│   └── gitstats.py
├── gsxt_mobile
│   ├── README.md
│   ├── gsxt_mobile.py
│   └── 腾讯科技50.txt
├── lagou
│   ├── README.md
│   └── lagou.py
├── level
│   ├── README.md
│   └── levelhelper.py
├── monkeyrunner
│   ├── README.md
│   └── uiparser.py
├── nacao_v1
│   ├── README.md
│   ├── constants.py
│   ├── nacao_v1.py
│   └── sql_injection.txt
└── nacao_v2
    ├── README.md
    ├── constants.py
    └── nacao_v2.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
103 | # VSCode config
104 | .vscode/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/PageSpeedInsights/.gitignore:
--------------------------------------------------------------------------------
1 | *.zip
2 |
--------------------------------------------------------------------------------
/PageSpeedInsights/README.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | Using Google Cloud Scheduler, Pub/Sub, Functions, Storage, and related cloud services, build a scheduled auditing system that benchmarks the quality and performance of front-end pages with PageSpeed Insights. Combined with a CI/CD pipeline, it audits sites' technical performance metrics in large batches on a schedule.
4 |
5 | ### 1. PageSpeed Insights
6 |
7 | #### 1.1 Overview
8 |
9 | PageSpeed Insights is a web page performance analysis and optimization tool from Google. It generates reports on a page's real-world performance on mobile and desktop devices, along with suggestions for improving the page. It uses the best practices embodied in Google Lighthouse as its test baseline, and drives the Blink renderer (Google Chrome's rendering engine) to emulate mobile and desktop devices, fetch the target pages, and analyze them for optimization.
10 | PSI for short below.
11 |
12 | #### 1.2 Version history
13 |
14 | Version | Released | Notes
15 | --|--|--
16 | V5 | Q4 2018 | Current version. Updated on 2019-05-08 to use Lighthouse 5.0 as its analysis engine.
17 | V4 | January 2018 | Shut down before Q3 2019
18 | V2 | January 2015 | Retired
19 | V1 | Earlier | Retired
20 |
21 | #### 1.3 Report contents
22 |
23 | #### 1.3.1 Overall speed score
24 |
25 | Scores and tiers:
26 |
27 | + Fast: 90 and above
28 | + Average: 50-90
29 | + Slow: below 50
30 |
31 | V5 uses Lighthouse to compute a weighted composite score over several performance metrics.
32 | V4 and earlier computed the score and tier from real-user speed measurements in the Chrome User Experience Report database, based mainly on two metrics:
33 |
34 | + FCP (First Contentful Paint): measures when the user first sees a visible response from the page. The shorter it is, the more likely the user is to stay.
35 | + DCL (DOM Content Loaded): measures when the HTML document has finished loading and parsing. The shorter it is, the lower the bounce rate.
36 |
37 | #### 1.3.2 Field data
38 |
39 | Scores relative to the real-user measurements of other pages over the past 30 days, drawn from the Chrome User Experience Report.
40 |
41 | #### 1.3.3 Lab data
42 |
43 | Absolute timing values for the following metrics:
44 |
45 | + First Contentful Paint
46 | + First Meaningful Paint
47 | + Speed Index
48 | + First CPU Idle
49 | + Time to Interactive
50 | + Estimated Input Latency
51 |
52 | #### 1.3.4 Opportunities: suggestions for speeding up page load
53 |
54 | #### 1.3.5 Diagnostics: detailed advice on web development best practices
55 |
56 | #### 1.3.6 Passed audits: best-practice checks the page already meets
57 |
58 | #### 1.4 A real-world example
59 |
60 | Take a production release of the Ctrip Flights H5 flight-status home page and walk through its report:
61 | https://m.ctrip.com/webapp/flight/schedule/detail.html
62 |
63 | 
64 |
65 | 
66 |
67 | 
68 |
69 | 
70 |
71 | 
72 |
73 | 
74 |
75 | #### 1.5 Usage
76 |
77 | The PSI API is one of Google's RESTful APIs: a single HTTP request returns a JSON object. It is extremely simple to use.
78 |
79 | #### HTTP Request
80 |
81 | > GET https://www.googleapis.com/pagespeedonline/v5/runPagespeed
82 |
83 | One required parameter:
84 |
85 | + `url`: the URL of the page to analyze
86 |
87 | Six optional parameters:
88 |
89 | + category: `accessibility`, `best-practices`, `performance`, `pwa`, `seo`. Default: `performance`.
90 | + locale: language for the localized result text. Currently 40 locales are supported. Default: English, `en`.
91 | + strategy: `desktop` analyzes for desktop browsers, `mobile` analyzes for mobile browsers.
92 | + utm_campaign: campaign name
93 | + utm_source: campaign source
94 | + fields: customizes which response fields are returned.
95 |
96 | #### HTTP Response
97 |
98 | Returns a JSON object. It has many fields, omitted here; see the official documentation.
99 |
100 | #### Minimal command-line invocation
101 |
102 | > curl "https://www.googleapis.com/pagespeedonline/v5/runPagespeed?url=https://m.ctrip.com"
103 |
104 | ### 2. Google Cloud Platform (GCP)
105 |
106 | #### 2.1 System flow diagram
107 |
108 | 
109 |
110 | #### 2.2 Cloud Scheduler
111 |
112 | Cloud Scheduler is GCP's fully managed, enterprise-grade cron job scheduling service. It supports App Engine, Cloud Pub/Sub, and arbitrary HTTP endpoints, and jobs can trigger Compute Engine, Google Kubernetes Engine, and on-premises resources.
113 | Create the job in the Google Cloud Console. Three target types are available: HTTP, Pub/Sub, and App Engine HTTP. Pub/Sub is chosen here, and the job is scheduled to fire automatically at 22:00 every day.
114 |
115 | 
116 |
117 | After creation, check the deployment status; once deployed, you can hit "Run now" and check the logs to confirm it runs correctly.
118 |
119 | 
120 |
121 | #### 2.3 Cloud Pub/Sub
122 |
123 | Cloud Pub/Sub is GCP's simple, reliable, scalable foundation for stream analytics and event-driven computing systems.
124 | Two topics are created here: `psi-job` relays the event data of the Cloud Scheduler job, and `psi-single` relays the event data for the concurrent HTTP requests issued by Cloud Functions.
125 | 
126 |
127 | #### 2.4 Cloud Functions
128 |
129 | There are several ways to run PageSpeed Insights checks against many pages concurrently, including Google App Engine and Google Compute Engine. But since the PSI API is a context-free, simple HTTP RESTful API, serverless Cloud Functions is the best and simplest fit.
130 | Cloud Functions is GCP's event-driven serverless compute platform. You build many small, independent units that each do one thing well, then compose them into a system, which makes for fast development and deployment. Services are built and deployed at the level of a single function rather than a whole application, container, or VM.
131 |
132 | #### 2.4.1 Writing a Function
133 |
134 | The following stacks are currently supported:
135 |
136 | Language | JavaScript
137 | --|--
138 | Runtime | Node.js 6 (deprecated), 8, 10 (beta)
139 | HTTP framework | Express
140 | HTTP function | Express Request & Response context
141 | Background function | (data, context, callback)
142 | Dependency management | npm/yarn + package.json
143 |
144 | Language | Python
145 | --|--
146 | Runtime | 3.7.1
147 | HTTP framework | Flask
148 | HTTP function | Input: a Flask Request object. Return: any value accepted by Flask.make_response().
149 | Background function | (data, context)
150 | Dependency management | pip + requirements.txt
151 |
152 | Language | Go
153 | --|--
154 | Runtime | Go 1.11
155 | HTTP framework | the standard http.HandlerFunc interface
156 | HTTP function | request: *http.Request. response: http.ResponseWriter.
157 | Background function | (ctx, Event)
158 | Dependency management | go.mod/vendor
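
As a concrete illustration of the Python row above, a minimal HTTP function looks like this (a sketch, not part of this repo):

```Python
def hello_http(request):
    '''`request` is a flask.Request; the return value may be anything
    flask.make_response() accepts, e.g. a string or a (body, status) tuple.'''
    name = request.args.get('name', 'world')
    return f'Hello {name}!', 200
```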
159 |
160 | #### 2.4.2 Deploying a Function
161 |
162 | The following deployment methods are currently supported:
163 |
164 | + From a local machine. `Using the gcloud command-line tool.`
165 | + From a source control system. `Using Google Cloud Source Repositories, linked via OAuth to a repository host such as GitHub or Bitbucket.`
166 | + From the GCP Console:
167 |   + The in-browser code editor. `Write the function code online.`
168 |   + Uploading a local ZIP file. `The folder layout must match the per-language project structure described above.`
169 |   + Importing a ZIP file from Cloud Storage. `Same as above.`
170 |   + Referencing a source project in Google Cloud Source Repositories.
171 | + Via CI/CD. `Using Cloud Build for continuous integration and deployment.`
172 |
173 | #### 2.4.3 Monitoring a Function
174 |
175 | Google Stackdriver provides the service monitoring toolchain: `Debugger, Monitoring, Trace, Logging, Error Reporting, Profiler`.
176 |
177 | ### 3. Implementing the PSI Functions
178 |
179 | With one Scheduler job and two Pub/Sub topics in place, the next step is to implement the two corresponding Functions.
180 |
181 | #### 3.1 psi-single function
182 |
183 | psi-single() calls the PSI API for one concrete URL and fetches the JSON result.
184 | Google APIs can be invoked in several ways.
185 |
186 | **3.1.1 Using the `google api client`.**
187 | Obtain the prebuilt `Service` via the `Discovery API`, then call the concrete endpoint.
188 |
189 | ```Python
190 | from googleapiclient.discovery import build
191 |
192 | def run(url):
193 | pagespeedonline = build(
194 | serviceName = 'pagespeedonline',
195 | version = 'v5',
196 | developerKey = API_KEY
197 | )
198 | response = pagespeedonline.pagespeedapi().runpagespeed(url = url).execute()
199 | print(response)
200 | return 'OK'
201 | ```
202 |
203 | **3.1.2 For a simple endpoint, call the `HTTP RESTful API` directly.**
204 |
205 | ```Python
206 | import requests
207 | GAPI_PSI = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed"
208 |
209 | def run(url):
210 | try:
211 | payload = {"url": url,
212 | "key": API_KEY
213 | }
214 | with requests.Session() as session:
215 | response = session.get(url=GAPI_PSI, params=payload)
216 | print(response.status_code)
217 | print(response.json())
218 | except requests.RequestException as _e:
219 | print(_e)
220 | return 'OK'
221 | ```
222 |
223 | **3.1.3 Subscribing to the Pub/Sub topic**
224 | The format of the subscription message `event` is documented on the official site; its data attribute is a `base64`-encoded `ByteArray` carrying the actual payload.
225 |
226 | ```Python
227 | import base64
228 |
229 | def run_pubsub(event, context):
230 | pubsub_message = base64.urlsafe_b64decode(event['data']).decode('utf-8')
231 | return run(pubsub_message)
232 | ```
233 |
234 | #### 3.2 psi-job function
235 |
236 | psi-job() is triggered by the Scheduler job and fans every URL to be audited out to psi-single() as parallel Pub/Sub events.
237 |
238 | ```Python
239 | from google.cloud import pubsub_v1
240 |
241 | def run(event, context):
242 | publisher = pubsub_v1.PublisherClient()
243 | topic = publisher.topic_path(PROJECT_ID, TOPIC_NAME)
244 | for url in URL_DICT:
245 | data = url.encode('utf-8')
246 | publisher.publish(topic, data)
247 | return 'OK'
248 | ```
249 |
250 | #### 3.3 Environment variables and dependencies
251 |
252 | To avoid leaking security-sensitive values, keep them in Functions environment variables (and in local environment variables for development and debugging).
253 | Values such as `API_KEY` and `PROJECT_ID` in the code above are read via `os.getenv()`, as sketched below.
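
A minimal sketch, using the same variable names as the sources in this repo:

```Python
import os

# Set as Cloud Functions environment variables in production,
# and exported in the local shell for development and debugging.
API_KEY = os.getenv('GCP_API_KEY')
PROJECT_ID = os.getenv('GCP_PROJECT_ID')
```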
254 | Cloud Functions ships with many common libraries preinstalled (see the official docs). Additional dependencies are declared in each language's project file; the code above pulls in two of them.
255 |
256 | ```Python
257 | # requirements.txt
258 | # Function dependencies
259 | requests==2.31.0
260 | google-cloud-pubsub==0.40.0
261 | ```
262 |
263 | ### 4. Storage
264 |
265 | The `print()` calls in the code above write to the Stackdriver log store for later filtering and analysis. Since each URL's audit result is a single JSON object string, it could be pushed further into BigTable, queried and analyzed with BigQuery, and then fed into Google Data Studio for visual dashboards.
266 | Here, Cloud Storage is used to store each JSON string as a single file.
267 |
268 | ```Python
269 | from urllib import parse
270 | from google.cloud import storage
271 | from google.cloud.storage import Blob
272 |
273 | def save(url, report):
274 | '''Save to https://console.cloud.google.com/storage/browser/[bucket-id]/'''
275 | client = storage.Client()
276 | bucket = client.get_bucket("psi-report")
277 |     blob = Blob(f"{parse.quote_plus(url)}.json", bucket)
278 | blob.upload_from_string(report, "application/json")
279 | ```
280 |
281 | Add the dependency.
282 |
283 | ```Python
284 | # requirements.txt
285 | # Function dependencies
286 | google-cloud-storage==1.15.0
287 | ```
288 |
289 | ### 5. Source code
290 |
291 | https://github.com/9468305/python-script/tree/master/PageSpeedInsights
292 |
293 | ### 6. Documentation links
294 |
295 | 1. PageSpeed Insights
296 | https://developers.google.com/speed/pagespeed/insights
297 | 2. Google Lighthouse
298 | https://developers.google.com/web/tools/lighthouse/
299 | 3. Google Cloud Scheduler
300 | https://cloud.google.com/scheduler/
301 | 4. Google Cloud Pub/Sub
302 | https://cloud.google.com/pubsub/
303 | 5. Google Cloud Functions
304 | https://cloud.google.com/functions/
305 | 6. Google Cloud Storage
306 | https://cloud.google.com/storage/
307 | 7. Google Cloud Build
308 | https://cloud.google.com/cloud-build/
309 | 8. Google Stackdriver
310 | https://cloud.google.com/stackdriver/
311 |
--------------------------------------------------------------------------------
/PageSpeedInsights/image/1_score.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/PageSpeedInsights/image/1_score.png
--------------------------------------------------------------------------------
/PageSpeedInsights/image/2_real_data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/PageSpeedInsights/image/2_real_data.png
--------------------------------------------------------------------------------
/PageSpeedInsights/image/3_lab_data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/PageSpeedInsights/image/3_lab_data.png
--------------------------------------------------------------------------------
/PageSpeedInsights/image/4_opportunities.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/PageSpeedInsights/image/4_opportunities.png
--------------------------------------------------------------------------------
/PageSpeedInsights/image/5_diagnostics.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/PageSpeedInsights/image/5_diagnostics.png
--------------------------------------------------------------------------------
/PageSpeedInsights/image/6_passed_audits.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/PageSpeedInsights/image/6_passed_audits.png
--------------------------------------------------------------------------------
/PageSpeedInsights/image/gcp_cs_create.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/PageSpeedInsights/image/gcp_cs_create.jpg
--------------------------------------------------------------------------------
/PageSpeedInsights/image/gcp_cs_list.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/PageSpeedInsights/image/gcp_cs_list.jpg
--------------------------------------------------------------------------------
/PageSpeedInsights/image/gcp_pubsub_topic_list.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/PageSpeedInsights/image/gcp_pubsub_topic_list.jpg
--------------------------------------------------------------------------------
/PageSpeedInsights/image/psi.mermaid.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/PageSpeedInsights/image/psi.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/PageSpeedInsights/image/psi.png
--------------------------------------------------------------------------------
/PageSpeedInsights/job.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # -*- coding: utf-8 -*-
3 | '''PageSpeed Insights Job + Google Cloud Functions'''
4 | import os
5 | from google.cloud import pubsub_v1
6 |
7 | PROJECT_ID = os.getenv("GCP_PROJECT_ID")
8 | TOPIC_NAME = "psi-single"
9 |
10 | URL_DICT = ["https://m.ctrip.com/webapp/flight/schedule/detail.html"]
11 |
12 | def run(event, context):
13 | publisher = pubsub_v1.PublisherClient()
14 | topic = publisher.topic_path(PROJECT_ID, TOPIC_NAME)
15 | for url in URL_DICT:
16 | data = url.encode('utf-8')
17 | publisher.publish(topic, data)
18 | return 'OK'
19 |
20 | def test_job():
21 | print('TODO')
22 |
23 | if __name__ == "__main__":
24 | test_job()
25 |
--------------------------------------------------------------------------------
/PageSpeedInsights/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import psi
5 | import job
6 |
7 | def psi_pubsub(event, context):
8 | psi.run_pubsub(event, context)
9 |
10 | def job_pubsub(event, context):
11 | job.run(event, context)
12 |
--------------------------------------------------------------------------------
/PageSpeedInsights/psi.mermaid:
--------------------------------------------------------------------------------
1 | graph TB
2 | Job(Scheduled cron job)
3 | CPS_Job_P(Job Publisher)
4 | CPS_Job_S(Job Subscriber)
5 | CPS_PSI_P(PSI Publisher)
6 | CPS_PSI_S(PSI Subscriber)
7 | CF_Job(Job Function Service)
8 | CF_PSI(PSI Function Service)
9 | GAPI_PSI(PageSpeed Insights API)
10 |
11 | Job -->|Push| CPS_Job_P
12 | CPS_Job_S -->|Push| CF_Job
13 | CF_Job -->| concurrent HTTP | CPS_PSI_P
14 | CPS_PSI_S -->| concurrent Push | CF_PSI
15 | CF_PSI -->| concurrent HTTP Request | GAPI_PSI
16 |
17 | subgraph Cloud Scheduler
18 | Job
19 | end
20 |
21 | subgraph Cloud Pub/Sub
22 | subgraph Job Pub/Sub
23 | CPS_Job_P --> CPS_Job_S
24 | end
25 |
26 | subgraph PSI Pub/Sub
27 | CPS_PSI_P --> CPS_PSI_S
28 | end
29 | end
30 |
31 | subgraph Cloud Functions
32 | CF_Job
33 | CF_PSI
34 | end
35 |
36 | subgraph Google APIs
37 | GAPI_PSI
38 | end
--------------------------------------------------------------------------------
/PageSpeedInsights/psi.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # -*- coding: utf-8 -*-
3 | '''PageSpeed Insights Single + Google Cloud Functions'''
4 | import os
5 | import base64
6 | from urllib import parse
7 | import requests
8 | from google.cloud import storage
9 | from google.cloud.storage import Blob
10 |
11 | # Access Token, generated from GCP Console Credentials page.
12 | API_KEY = os.getenv('GCP_API_KEY')
13 |
14 | GAPI_PSI = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed"
15 |
16 | SESSION = requests.Session()
17 |
18 | PROXIES = None
19 |
20 |
21 | def save(url, report):
22 | '''Save to https://console.cloud.google.com/storage/browser/[bucket-id]/'''
23 | client = storage.Client()
24 | bucket = client.get_bucket("psi-report")
25 |     blob = Blob(f"{parse.quote_plus(url)}.json", bucket)
26 | blob.upload_from_string(report, "application/json")
27 |
28 |
29 | def run(url):
30 | try:
31 | payload = {"url": url,
32 | "category": "performance",
33 | "locale": "zh-CN",
34 | "strategy": "mobile",
35 | "key": API_KEY
36 | }
37 | response = SESSION.get(url=GAPI_PSI, params=payload, proxies=PROXIES)
38 | print(response.status_code)
39 |         if response.status_code == 200:
40 | save(url, response.text)
41 | except requests.RequestException as _e:
42 | print(_e)
43 | return 'OK'
44 |
45 |
46 | def run_pubsub(event, context):
47 | pubsub_message = base64.urlsafe_b64decode(event['data']).decode('utf-8')
48 | return run(pubsub_message)
49 |
50 |
51 | def test_run_http(test_url):
52 | run(test_url)
53 |
54 |
55 | def test_run_pubsub(test_url):
56 | event = {"data": base64.urlsafe_b64encode(test_url.encode('utf-8'))}
57 | context = None
58 | run_pubsub(event, context)
59 |
60 |
61 | if __name__ == "__main__":
62 | _proxy = os.getenv("HTTP_PROXY")
63 | PROXIES = {
64 | "http": _proxy,
65 | "https": _proxy,
66 | }
67 | _test_url = "https://m.ctrip.com/webapp/flight/schedule/detail.html"
68 | test_run_http(_test_url)
69 | test_run_pubsub(_test_url)
70 |
--------------------------------------------------------------------------------
/PageSpeedInsights/requirements.txt:
--------------------------------------------------------------------------------
1 | # Function dependencies
2 | requests==2.31.0
3 | google-cloud-pubsub==0.40.0
4 | google-cloud-storage==1.15.0
5 | #google-cloud-bigtable==0.32.1
6 | #google-cloud-core==0.29.1
--------------------------------------------------------------------------------
/PageSpeedInsights/zip.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # -*- coding: utf-8 -*-
3 |
4 | # chmod +x ./zip.sh
5 | zip -r functions.zip ./ -x *.DS_Store*
6 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | My Python Script
2 | ----
3 |
4 | 1. [auc_pr_roc](/auc_pr_roc)
5 | Compute the AUC of PR and ROC curves with Python scikit-learn.
6 | 2. [excel_combine](/excel_combine)
7 | One-click merging of multiple Excel files, in Python.
8 | 3. [geetest_offline](/geetest_offline)
9 | Cracking the GeeTest slider captcha, offline mode V5.10.10, in Python, using the [National Enterprise Credit Information Publicity System](http://www.gsxt.gov.cn) as the example.
10 | 4. [geetest_offline_gd](/geetest_offline/README_gd.md)
11 | Python crawler for company details on the [National Enterprise Credit Information Publicity System (Guangdong)](http://gd.gsxt.gov.cn).
12 | 5. [geetest_online](/geetest_online)
13 | Cracking the GeeTest slider captcha, online mode, in Python, using the [National Enterprise Credit Information Publicity System](http://www.gsxt.gov.cn) as the example.
14 | 6. [gitstats](/gitstats)
15 | Statistical analysis of git commit logs, in Python.
16 | 7. [gsxt_mobile](/gsxt_mobile)
17 | Python crawler for the National Enterprise Credit Information Publicity System app: query company information through the app's HTTP API.
18 | 8. [lagou](/lagou)
19 | Python crawler: Selenium-based data collection from [Lagou](https://www.lagou.com).
20 | 9. [level](/level)
21 | Wrappers for common Python leveldb operations.
22 | 10. [nacao_v1](/nacao_v1)
23 | Python crawler for the [National Administration for Code Allocation to Organizations](http://www.nacao.org.cn), V1.0.
24 | 11. [nacao_v2](/nacao_v2)
25 | Python crawler for the [National Administration for Code Allocation to Organizations](http://www.nacao.org.cn), V2.0.
26 | 12. [MonkeyRunner](/monkeyrunner)
27 | MonkeyRunner is DEAD!
28 | 13. [PageSpeed Insights](/PageSpeedInsights)
29 | Front-end DevOps with PageSpeed Insights: using Google Cloud Scheduler, Pub/Sub, Functions, Storage, and related cloud services, build a scheduled auditing system that benchmarks the quality and performance of front-end pages; combined with CI/CD, it audits sites' technical performance metrics in large batches on a schedule.
30 |
31 | License
32 | ----
33 |
34 | ```txt
35 | Copyright 2017 ChenQi
36 |
37 | Licensed under the Apache License, Version 2.0 (the "License");
38 | you may not use this file except in compliance with the License.
39 | You may obtain a copy of the License at
40 |
41 | http://www.apache.org/licenses/LICENSE-2.0
42 |
43 | Unless required by applicable law or agreed to in writing, software
44 | distributed under the License is distributed on an "AS IS" BASIS,
45 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
46 | See the License for the specific language governing permissions and
47 | limitations under the License.
48 | ```
49 |
--------------------------------------------------------------------------------
/auc_pr_roc/README.md:
--------------------------------------------------------------------------------
1 | ### Background
2 |
3 | In 2017, an algorithm-competition platform set a problem on predicting flight delays and needed to score the accuracy of submitted answers.
4 |
5 | Source data sample CSV format: link dead, omitted.
6 | Submission CSV format description: link dead, omitted.
7 | Example:
8 |
9 | Flightno | FlightDepcode | FlightArrcode | PlannedDeptime | PlannedArrtime | prob
10 | -- | -- | -- | -- | -- | --
11 | CA1351 | PEK | CAN | 1496273700 | 1496285700 | 0.041386555
12 | 8L9647 | KMG | HIA | 1496272200 | 1496282400 | 0.022590361
13 | CZ6299 | DLC | SZX | 1496274000 | 1496286900 | 0.025210084
14 | HU7377 | URC | CKG | 1496273700 | 1496287500 | 0.106757728
15 |
16 | The competition scored submissions by the AUC of the PR curve (baseline: auc=0.45). On the evaluation metric, see: [The Relationship Between Precision-Recall and ROC Curves](http://mark.goadrich.com/articles/davisgoadrichcamera2.pdf)
17 |
18 | ### Implementation
19 |
20 | CSV files are read with the pandas library.
21 |
22 | ```Python
23 | def load_label_prob(real_csv, result_csv):
24 |     '''Load the label and prob arrays from the real.csv and result.csv tables'''
25 | real_df, result_df = pandas.read_csv(real_csv), pandas.read_csv(result_csv)
26 |     # Check that real.csv and result.csv are well-formed
27 | check_format(real_df, result_df)
28 | label, prob = real_df['label'].values, result_df['prob'].values
29 |     # Round to 4 decimal places
30 | for _i, _e in enumerate(prob):
31 | prob[_i] = round(_e, 4)
32 | return label, prob
33 | ```
34 |
35 | The PR-curve AUC is computed with the sklearn library.
36 |
37 | ```Python
38 | '''Compute the AUC of the PR curve from the real.csv and result.csv columns'''
39 | precision, recall, _thresholds = metrics.precision_recall_curve(label, prob)
40 | area = metrics.auc(recall, precision)
41 | return area
42 | ```
43 |
44 | Appendix: computing the AUC of the ROC curve.
45 |
46 | ```Python
47 | '''Compute the AUC of the ROC curve from the real.csv and result.csv columns'''
48 | area = metrics.roc_auc_score(label, prob)
49 | return area
50 | ```
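
Both helpers can also be called directly, mirroring the commented-out test lines in the script's `__main__`:

```Python
from auc_pr_roc import auc_pr, auc_roc

print(auc_pr('real.csv', 'result.csv'))   # AUC of the PR curve
print(auc_roc('real.csv', 'result.csv'))  # AUC of the ROC curve
```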
51 |
52 | ### Environment setup
53 |
54 | Setting up scikit-learn on Windows is somewhat fiddly, with specific NumPy and SciPy version requirements.
55 | So use the [third-party prebuilt wheels](http://www.lfd.uci.edu/~gohlke/pythonlibs/) directly.
56 |
57 | ```bash
58 | pip install http://www.lfd.uci.edu/~gohlke/pythonlibs/ru4fxw3r/numpy-1.13.1+mkl-cp36-cp36m-win32.whl
59 | pip install http://www.lfd.uci.edu/~gohlke/pythonlibs/ru4fxw3r/scipy-0.19.1-cp36-cp36m-win32.whl
60 | pip install pandas
61 | pip install scikit-learn
62 | ```
63 |
64 | ### [Source on GitHub](https://github.com/9468305/python-script/blob/master/auc_pr_roc/)
65 |
--------------------------------------------------------------------------------
/auc_pr_roc/auc_pr_roc.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # -*- coding: utf-8 -*-
3 | '''Compute the AUC of the PR and ROC curves from the real.csv and result.csv tables.'''
4 |
5 | import sys
6 | import pandas
7 | #from pandas import DataFrame
8 | from sklearn import metrics
9 |
10 | REAL_HEADER = ['Flightno',
11 | 'FlightDepcode',
12 | 'FlightArrcode',
13 | 'PlannedDeptime',
14 | 'PlannedArrtime',
15 | 'label']
16 |
17 | RESULT_HEADER = ['Flightno',
18 | 'FlightDepcode',
19 | 'FlightArrcode',
20 | 'PlannedDeptime',
21 | 'PlannedArrtime',
22 | 'prob']
23 |
24 |
25 | def check_column(column1, column2):
26 | '''检查列数据是否一致'''
27 | if not column1.equals(column2):
28 | print('Error: csv column has different data!')
29 |         sys.exit(1)
30 |
31 |
32 | def check_format(real_df, result_df):
33 |     '''Check that real.csv and result.csv are well-formed'''
34 | real_header, result_header = real_df.columns.values.tolist(), result_df.columns.values.tolist()
35 | if REAL_HEADER != real_header or RESULT_HEADER != result_header:
36 | print('Error: csv has different headers!')
37 | print(real_header)
38 | print(result_header)
39 |         sys.exit(1)
40 | check_column(real_df['Flightno'], result_df['Flightno'])
41 | check_column(real_df['FlightDepcode'], result_df['FlightDepcode'])
42 | check_column(real_df['FlightArrcode'], result_df['FlightArrcode'])
43 | check_column(real_df['PlannedDeptime'], result_df['PlannedDeptime'])
44 | check_column(real_df['PlannedArrtime'], result_df['PlannedArrtime'])
45 |
46 |
47 | def load_label_prob(real_csv, result_csv):
48 |     '''Load the label column array and prob column array from the real.csv and result.csv tables'''
49 | real_df, result_df = pandas.read_csv(real_csv), pandas.read_csv(result_csv)
50 | check_format(real_df, result_df)
51 | label, prob = real_df['label'].values, result_df['prob'].values
52 |     # Round to 4 decimal places
53 | for _i, _e in enumerate(prob):
54 | prob[_i] = round(_e, 4)
55 | return label, prob
56 |
57 |
58 | def auc_roc(real_csv, result_csv):
59 |     '''Compute the AUC of the ROC curve from the real.csv and result.csv columns'''
60 | label, prob = load_label_prob(real_csv, result_csv)
61 | area = metrics.roc_auc_score(label, prob)
62 | #print(area)
63 | return area
64 |
65 |
66 | def auc_pr(real_csv, result_csv):
67 |     '''Compute the AUC of the PR curve from the real.csv and result.csv columns'''
68 | label, prob = load_label_prob(real_csv, result_csv)
69 | precision, recall, _thresholds = metrics.precision_recall_curve(label, prob)
70 | area = metrics.auc(recall, precision)
71 | #print(area)
72 | return area
73 |
74 |
75 | if __name__ == "__main__":
76 | auc_pr(sys.argv[1], sys.argv[2])
77 | #auc_roc(sys.argv[1], sys.argv[2])
78 | #hard code for test
79 | #print(auc_pr('real.csv', 'result.csv'))
80 | #print(auc_roc('real.csv', 'result.csv'))
81 |
--------------------------------------------------------------------------------
/excel_combine/README.md:
--------------------------------------------------------------------------------
1 | Work keeps throwing the same tedious chores at us: performance reviews, promotions and salary adjustments, fixed-asset inventories, team outings. The department secretary sends out one department-wide Excel file; each team lead splits out their own team's file and distributes it to every member; members fill it in and send it back to the lead; the lead consolidates and returns it to the secretary; the secretary then merges all the team files back into one complete Excel file.
2 | I never found a convenient, usable tool for merging multiple Excel files, so I wrote a Python script myself that does it in one click.
3 | **Gripe: for assorted reasons, our intranet web OA tooling is underbuilt, so a lot of work is distributed and consolidated via Excel.**
4 |
5 | ### One-click execution
6 |
7 | To keep it friendly for non-technical users, there are no launch arguments: drop the script into the root of the folder of Excel files and double-click it.
8 | No output filename needs to be specified either; the name `combine.xlsx` is rarely taken, users just need to watch out for it.
9 |
10 | ```Python
11 | if __name__ == "__main__":
12 | FROM_DIR = os.getcwd()
13 | TO_FILE = os.path.join(FROM_DIR, 'combine.xlsx')
14 | combine(FROM_DIR, TO_FILE)
15 | ```
16 |
17 | ### openpyxl
18 |
19 | Mainstream Office versions support the Excel 2010 format, i.e. the xlsx extension. If a source file has the xls extension, just re-save it as xlsx. The openpyxl library is therefore used to read and write Excel files, and files with the xls extension are ignored.
20 | [openpyxl - A Python library to read/write Excel 2010 xlsx/xlsm files](https://openpyxl.readthedocs.io/en/default/)
21 |
22 | ### Walk the folder and find the Excel files
23 |
24 | Use os.walk():
25 |
26 | ```Python
27 | _results = []
28 | for _root, _dirs, _files in os.walk(from_dir):
29 | for _file in _files:
30 | if _file.endswith('.xlsx'):
31 | _results.append(os.path.join(_root, _file))
32 | return _results
33 | ```
34 |
35 | ### Note: delete the merged output file, combine.xlsx
36 |
37 | Delete the result file before merging, to prevent corrupted results.
38 |
39 | ```Python
40 | _result = search_file(from_dir)
41 | try:
42 | _result.remove(to_file)
43 | except ValueError:
44 |     print('Result file does not exist.')
45 | return _result
46 | ```
47 |
48 | ### Cleaning duplicate data across files
49 |
50 | + How do we ensure each Excel row is unique?
51 | + How do we detect duplicate Excel files (e.g. different filenames, identical contents)?
52 | + How do we merge Excel files with different layouts (rows, columns)?
53 |
54 | A few ground rules are imposed:
55 |
56 | + The first row of every file must be the title row.
57 | + The first column must be a unique key: an employee ID, email address, or other globally unique value.
58 |
59 | So the OrderedDict from Python's built-in collections module is used, keyed by the first column.
60 | Reading an Excel file and building the in-memory dict in full:
61 |
62 | ```Python
63 | _wb = load_workbook(excel_file, read_only=True)
64 | _ws = _wb.active
65 | _title = []
66 | _items = collections.OrderedDict()
67 | for _r in _ws.rows:
68 | if not _title:
69 | for _i in _r:
70 | _title.append(_i.value)
71 | else:
72 | _item = []
73 | for _i in _r:
74 | _item.append(_i.value)
75 | _items[_item[0]] = _item
76 | _wb.close()
77 | return _title, _items
78 | ```
79 |
80 | ### Checking that 2 dicts hold the same elements
81 |
82 | Python ships with the powerful built-in [operator](https://docs.python.org/3/library/operator.html) module (for dicts, `operator.eq(a, b)` is equivalent to `a == b`).
83 |
84 | ```Python
85 | if not operator.eq(dict_src, dict_dst):
86 | print('Warning: dict elements are different!')
87 | ```
88 |
89 | ### Finally, write the data to an Excel file
90 |
91 | ```Python
92 | _wb = Workbook()
93 | _ws = _wb.active
94 | _ws.append(excel_title)
95 | for _k, _v in excel_items.items():
96 | _ws.append(_v)
97 | _wb.save(excel_file)
98 | ```
99 |
100 | ### Known gaps
101 |
102 | + Workbooks with multiple sheets are not handled.
103 | + Excel formula evaluation is not handled.
104 |
105 | ### [Source on GitHub](https://github.com/9468305/python-script/tree/master/excel_combine)
106 |
--------------------------------------------------------------------------------
/excel_combine/excel_combine.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # -*- coding: utf-8 -*-
3 | '''Merge all Excel files under a folder into a single file'''
4 | import os
5 | import collections
6 | import operator
7 | from openpyxl import load_workbook
8 | from openpyxl import Workbook
9 |
10 | def search_excel(from_dir, to_file):
11 |     '''Walk from_dir, find the Excel files, and return the list of paths'''
12 | _results = []
13 | for _root, _dirs, _files in os.walk(from_dir):
14 | for _file in _files:
15 | if _file.endswith('.xlsx'):
16 | _results.append(os.path.join(_root, _file))
17 |
18 |     try:
19 |         _results.remove(to_file)
20 |         print('Removed existing combine.xlsx.')
21 |     except ValueError:
22 |         print('combine.xlsx does not exist.')
23 | return _results
24 |
25 |
26 | def load_excel(excel_file):
27 |     '''Read an Excel file; return its title row and an ordered dict of data rows'''
28 | _wb = load_workbook(excel_file, read_only=True)
29 | _ws = _wb.active
30 | _title = []
31 | _items = collections.OrderedDict()
32 | for _r in _ws.rows:
33 | if not _title:
34 | for _i in _r:
35 | _title.append(_i.value)
36 | else:
37 | _item = []
38 | for _i in _r:
39 | _item.append(_i.value)
40 | _items[_item[0]] = _item
41 |
42 | _wb.close()
43 | return _title, _items
44 |
45 |
46 | def save_excel(excel_file, excel_title, excel_items):
47 |     '''Save the Excel file'''
48 | _wb = Workbook()
49 | _ws = _wb.active
50 | _ws.append(excel_title)
51 | for _k, _v in excel_items.items():
52 | _ws.append(_v)
53 | _wb.save(excel_file)
54 |
55 |
56 | def combine(from_dir, to_file):
57 |     '''Merge all Excel files under a folder into a single file'''
58 | _excel_files = search_excel(from_dir, to_file)
59 | if not _excel_files:
60 | return
61 | _excel_title = []
62 | _excel_content = collections.OrderedDict()
63 | for _file in _excel_files:
64 | print('Parsing ' + _file)
65 | _title, _items = load_excel(_file)
66 | if not _title or not _items:
67 | print('Skip since it is empty.')
68 | continue
69 |
70 | if not _excel_title:
71 | _excel_title = _title
72 | elif not operator.eq(_title, _excel_title):
73 | print('Warning: Excel title format are different!')
74 |
75 | for _k, _v in _items.items():
76 | _excel_content[_k] = _v
77 | print('Parsing done.')
78 |
79 | if not _excel_title or not _excel_content:
80 |         print('All files are empty.')
81 | return
82 | save_excel(to_file, _excel_title, _excel_content)
83 |
84 |
85 | if __name__ == "__main__":
86 | print('begin')
87 | FROM_DIR = os.getcwd()
88 | TO_FILE = os.path.join(FROM_DIR, 'combine.xlsx')
89 | combine(FROM_DIR, TO_FILE)
90 | print('end')
91 |
--------------------------------------------------------------------------------
/geetest_offline/README.md:
--------------------------------------------------------------------------------
1 | # An analysis of the GeeTest slider captcha's offline mode
2 |
3 | The GeeTest slider captcha runs the mouse trajectory through machine learning to tell human from machine behavior.
4 | For the online verification flow, the most thorough analysis to date is [https://zhuanlan.zhihu.com/windev](https://zhuanlan.zhihu.com/windev).
5 | In online mode, the website's backend verifies against the GeeTest backend at [http://api.geetest.com](http://api.geetest.com); the browser front end only collects data and lightly encrypts it for transport.
6 | In offline mode, the website's backend verifies on its own; the GeeTest backend at [http://static.geetest.com](http://static.geetest.com) merely serves the slider captcha images, and the browser front end collects data and verifies locally.
7 | Security-wise, online mode is quite robust; offline mode is mere smoke and mirrors.
8 |
9 | ### 1. Test sites
10 |
11 | Take the [National Enterprise Credit Information Publicity System](http://www.gsxt.gov.cn) as the example: the main site uses geetest 5.10.10 in online mode. The provincial and municipal sites differ slightly in version and mode.
12 |
13 | **geetest offline 5.9.0**
14 |
15 | + Shanghai
16 | + Hebei
17 | + Inner Mongolia
18 | + Liaoning
19 | + Fujian
20 | + Shandong
21 | + Guangdong
22 | + Hainan
23 | + Hubei
24 | + Hunan
25 | + Sichuan
26 | + Yunnan
27 | + Tibet
28 | + Qinghai
29 | + Ningxia
30 |
31 | **geetest offline 5.10.10**
32 |
33 | + Guizhou
34 | + Shaanxi
35 |
36 | ### 2. The offline verification flow
37 |
38 | Take the [Shanghai site](http://sh.gsxt.gov.cn) as the example.
39 |
40 | #### 2.1 GET the home page http://sh.gsxt.gov.cn/notice/
41 |
42 | Returns an HTML page; parse it to extract session.token.
43 |
44 | #### 2.2 GET register http://sh.gsxt.gov.cn/notice/pc-geetest/register
45 |
46 | Returns JSON data:
47 |
48 | ```json
49 | {
50 | "success":0,
51 | "gt":"39134c54afef1e0b19228627406614e9",
52 | "challenge":"d2ddea18f00665ce8623e36bd4e3c7c543"
53 | }
54 | ```
55 |
56 | success = 0 means offline verification mode is in effect.
57 |
58 | #### 2.3 POST http://sh.gsxt.gov.cn/notice/security/verify_ip
59 |
60 | Returns 200 True on success.
61 |
62 | #### 2.4 POST http://sh.gsxt.gov.cn/notice/security/verify_keyword
63 |
64 | Returns 200 True on success.
65 |
66 | #### 2.5 POST http://sh.gsxt.gov.cn/notice/pc-geetest/validate
67 |
68 | Uploads the locally computed verification result for the slider captcha, "validate" for short. Returns JSON data:
69 |
70 | ```json
71 | {
72 | "status":"success",
73 | "version":"3.3.0"
74 | }
75 | ```
76 |
77 | In offline mode, the backend has no idea which captcha image the browser displayed; it only sees the uploaded verification result. So downloading the captcha image, running image recognition, computing the slider offset, and simulating a mouse trajectory can all be skipped.
78 |
79 | **The validate data format**
80 |
81 | For example, `1517aab3f_51aa460f_75555a6a38`: the 3 `_`-separated segments are produced by encrypting and obfuscating distance, rand0, and rand1 inside `geetest.5.x.x.js`.
82 | For a detailed analysis of the encryption, see [寻找阿登高地——爬虫工程师如何绕过验证码](http://www.jianshu.com/p/5b6fb04ea686) (in Chinese).
83 | There is no need to care how the encryption algorithm is implemented; just find the JavaScript entry point and call it with the parameters:
84 |
85 | ```javascript
86 | function userresponse(a, b) {
87 | for (var c = b.slice(32), d = [], e = 0; e < c.length; e++) {
88 | var f = c.charCodeAt(e);
89 | d[e] = f > 57 ? f - 87 : f - 48
90 | }
91 | c = 36 * d[0] + d[1];
92 | var g = Math.round(a) + c; b = b.slice(0, 32);
93 | var h, i = [ [], [], [], [], [] ], j = {}, k = 0; e = 0;
94 | for (var l = b.length; e < l; e++)
95 | h = b.charAt(e), j[h] || (j[h] = 1, i[k].push(h), k++, k = 5 == k ? 0 : k);
96 | for (var m, n = g, o = 4, p = "", q = [1, 2, 5, 10, 50]; n > 0;)
97 | n - q[o] >= 0 ? (m = parseInt(Math.random() * i[o].length, 10), p += i[o][m], n -= q[o]) : (i.splice(o, 1), q.splice(o, 1), o -= 1);
98 | return p
99 | }
100 | ```
101 |
102 | **Generating the 3 parameters correctly**
103 |
104 | distance, rand0, and rand1 are all randomly generated, yet generating them naively in code gives a low verification success rate. So what hidden relationship ties the 3 parameters together, and how does the backend check that 3 random numbers are "correct"?
105 | Whatever that relationship is, it does not actually matter; what matters is passing verification.
106 | Just sample N values by hand to build a large enough pool, pick one at random each time, and run it through the JavaScript encryption to get the validate data, as sketched below.
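
This is exactly what geetest_offline.py in this repo does. Condensed from its calc_validate() (a sketch, not verbatim):

```Python
import random
import execjs
import util  # repo module providing OFFLINE_SAMPLE (hand-captured triples) and USERRESPONSE_JS

CTX = execjs.get(execjs.runtime_names.Node).compile(util.USERRESPONSE_JS)

def calc_validate(challenge):
    '''Pick one captured (distance, rand0, rand1) sample and encode each value
    with the site's own userresponse() JavaScript.'''
    distance, rand0, rand1 = random.choice(util.OFFLINE_SAMPLE)
    return '_'.join(CTX.call('userresponse', v, challenge)
                    for v in (distance, rand0, rand1))
```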
107 |
108 | #### 2.6 POST http://sh.gsxt.gov.cn/notice/search/ent_info_list
109 |
110 | Upload session.token (from step 1), challenge (from step 2), validate (computed in step 5), and keyword (the search term). The response is an HTML page; parse its DOM for the search results and the refreshed session.token (used for the next query).
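
A sketch of this step with requests; the form-field names below are illustrative assumptions, so read the real keys off the page's form:

```Python
import requests

SEARCH_URL = 'http://sh.gsxt.gov.cn/notice/search/ent_info_list'

def search(session, token, challenge, validate, keyword):
    data = {
        'session.token': token,          # assumed field name
        'geetest_challenge': challenge,  # assumed field name
        'geetest_validate': validate,    # assumed field name
        'keyword': keyword,              # assumed field name
    }
    response = session.post(SEARCH_URL, data=data, timeout=10)
    return response.text  # HTML: parse out the results and the refreshed session.token
```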
111 |
112 | ### 3. Source
113 |
114 | Python 3.6
115 |
116 | Install:
117 |
118 | ```bash
119 | pip install requests # HTTP request library
120 | pip install PyExecJS # Run JavaScript from Python; best paired with node.js
121 | pip install beautifulsoup4 # Parse HTML pages
122 | ```
123 |
124 | Demo:
125 |
126 | ```bash
127 | python ./geetest_offline.py
128 | python ./geetest_offline_nm.py
129 | ```
130 |
131 | #### Entry Code
132 |
133 | [geetest_offline.py](/geetest_offline/geetest_offline.py) for Shanghai and Hebei.
134 |
135 | [geetest_offline_nm.py](/geetest_offline/geetest_offline_nm.py) for Inner Mongolia; its HTTP requests and responses differ slightly.
136 |
--------------------------------------------------------------------------------
/geetest_offline/README_gd.md:
--------------------------------------------------------------------------------
1 | This crawler gets a new target: grab not only the unified social credit code (tax ID) but also the company's basic information. The target site is http://gd.gsxt.gov.cn, the National Enterprise Credit Information Publicity System (Guangdong).
2 |
3 | The home page again uses the GeeTest slider captcha in offline mode; the verification steps are omitted here. Searching for the keyword "腾讯科技" (Tencent Technology), step one yields the following data:
4 |
5 | ```python
6 | [
7 | ('广州腾讯科技有限公司',
8 | 'http://gsxt.gzaic.gov.cn/aiccips/GSpublicity/GSpublicityList.html?service=entInfo_nPNw57QPCnL961TNeXO4Gqc/FgBy7ESTwWPrP4zJe5g=-FBrJ/suNwXMupXtmIUvNKg=='),
9 | ('深圳兴腾讯科技有限公司',
10 | 'https://www.szcredit.org.cn/GJQYCredit/GSZJGSPTS/QYGS.aspx?rid=6B553DC2860F51DD8179F9821CA72F8094E73CE96BD2D49EC7C4690757FA61D9'),
11 | ('腾讯科技(深圳)有限公司',
12 | 'https://www.szcredit.org.cn/GJQYCredit/GSZJGSPTS/QYGS.aspx?rid=B0819DEB6219A8B1'),
13 | ('深圳市联腾讯科技有限公司',
14 | 'https://www.szcredit.org.cn/GJQYCredit/GSZJGSPTS/QYGS.aspx?rid=DB80B6DEA7F44F35C9A10E5985D4FAA2D4F342323238AB811179ADA6138BD8D4'),
15 | ('惠州云达腾讯科技有限公司',
16 | 'http://gd.gsxt.gov.cn/aiccips/CheckEntContext/../GSpublicity/GSpublicityList.html?service=entInfo_SesJBXGCYofnRPu6PUIM/1lSj0vJHOw5gTgVbtsLB1BTAOYLpc4gxgb5a3wjX8k3-dA+Hj5oOjXjQTgAhKSP1lA=='),
17 | ('深圳市华腾讯科技有限公司',
18 | 'https://www.szcredit.org.cn/GJQYCredit/GSZJGSPTS/QYGS.aspx?rid=6B553DC2860F51DD8179F9821CA72F80820C9FD043746B01E89676307B6B60EF'),
19 | ('中山腾讯科技电子有限公司',
20 | 'http://gd.gsxt.gov.cn/aiccips/CheckEntContext/../GSpublicity/GSpublicityList.html?service=entInfo_ZECp7scr3rINuX8+ial6uIv57yGPPUCA1RAvDHoM0tBrXZJ9+1otoDp51Oi7UabK-7kW54gFL28iQmsO8Qn3cTA=='),
21 | ('深圳市安腾讯科技电子有限公司',
22 | 'https://www.szcredit.org.cn/GJQYCredit/GSZJGSPTS/QYGS.aspx?rid=6B553DC2860F51DD8179F9821CA72F808CC6A55FD01EE165A1560ECF17B3E73C'),
23 | ('中山市纸箱总厂腾讯科技亚太电子厂',
24 | 'http://gd.gsxt.gov.cn/aiccips/CheckEntContext/../GSpublicity/GSpublicityList.html?service=entInfo_M+Q/CD12sdYKPqPXAzRChoB2xhauTJBsWbk/xaaA92MJ4dcDV+KRZ71QUWHSpwQ+-7kW54gFL28iQmsO8Qn3cTA=='),
25 | ('深圳龙腾讯威科技有限公司',
26 | 'https://www.szcredit.org.cn/GJQYCredit/GSZJGSPTS/QYGS.aspx?rid=6B553DC2860F51DD7501B40D8BFA3C22E27771C25B8DF96FD1F35DF7C350F5A9')
27 | ]
28 | ```
29 |
30 | **Gripe: company detail pages come in 3 flavors, and each site has its own page template, so the HTML elements must be parsed per flavor. Shenzhen uses one template; Guangzhou and the others use another. So 2 kinds of DOM tree have to be parsed separately.**
31 |
32 | + Shenzhen https://www.szcredit.org.cn
33 | + Guangzhou http://gsxt.gzaic.gov.cn/aiccips/GSpublicity/GSpublicityList.html
34 | + Others http://gd.gsxt.gov.cn/aiccips/CheckEntContext/../GSpublicity/GSpublicityList.html
35 |
36 | After parsing the search results page with BeautifulSoup, the URL has to be checked:
37 |
38 | ```Python
39 | _url = _company['href']
40 | if _url.startswith('../'):
41 | _url = INDEX + '/aiccips/CheckEntContext/' + _url
42 | ```
43 |
44 | The final assembled record looks like this:
45 |
46 | ```python
47 | {
48 | '注册号/统一社会信用代码': '91440101327598294H',
49 | '注册资本': '0',
50 | '企业名称': '广州腾讯科技有限公司',
51 | '类型': '有限责任公司(外商投资企业法人独资)',
52 | '成立日期': '2014年12月31日',
53 | '营业期限自': '2014年12月31日',
54 | '营业期限至': '2018年07月15日',
55 | '登记机关': '广州市海珠区工商行政管理局',
56 | '核准日期': '2016年12月23日',
57 | '登记状态': '存续',
58 | '经营范围': '电子、通信与自动控制技术研究、开发;网络技术的研究、开发;计算机技术开发、技术服务;软件服务;软件测试服务;软件批发;软件零售;软件开发;游戏软件设计制作;信息技术咨询服务;数据处理和存储服务;(依法须经批准的项目,经相关部门批准后方可开展经营活动)〓'
59 | }
60 | ```
61 |
62 | **Gripe +1: what on earth is 〓 doing there?**
63 |
64 | Because these sites perform terribly, the default 15-second timeout fails constantly. So every network call is wrapped in protection, and requests that can safely be retried get a retry loop with waits.
65 |
66 | ```Python
67 | def safe_query_detail(url):
68 | '''Safe query url, handle network timeout and retry multi times.'''
69 | for _ in range(5):
70 | try:
71 | with requests.Session() as session:
72 | return query_detail(session, url)
73 | except requests.RequestException as _e:
74 | logging.error(_e)
75 | time.sleep(5)
76 | return None
77 | ```
78 |
79 | **Gripe +2: degrading your site's performance is also a highly effective anti-crawling technique.**
80 |
81 | Update, 2017-12-05
82 |
83 | Searching http://gd.gsxt.gov.cn for 深圳兴腾讯科技有限公司, the redirect link fails with a server 500 error:
84 | > https://www.szcredit.org.cn/GJQYCredit/GSZJGSPTS/QYGS.aspx?rid=6B553DC2860F51DD8179F9821CA72F8094E73CE96BD2D49EC7C4690757FA61D9
85 |
86 | Searching and navigating from https://www.szcredit.org.cn works fine:
87 | > https://www.szcredit.org.cn/web/gspt/newGSPTDetail3.aspx?ID=2e82a6a7aaec419884738d2421e7a838
88 |
89 | **Gripe +3: what kind of ops is this?**
90 |
--------------------------------------------------------------------------------
/geetest_offline/constants.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # -*- coding: utf-8 -*-
3 | '''
4 | Common constants for HTTP requests
5 | '''
6 |
7 | ACCEPT_ANY = '*/*'
8 |
9 | ACCEPT_TEXT = 'text/plain, */*; q=0.01'
10 |
11 | ACCEPT_HTML = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
12 |
13 | ACCEPT_JSON = 'application/json, text/javascript, */*; q=0.01'
14 |
15 | ACCEPT_IMAGE = 'image/webp,image/*,*/*;q=0.8'
16 |
17 | ACCEPT_LANGUAGE = 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2'
18 |
19 | UA_CHROME_WIN = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
20 |
21 | UA_CHROME_MAC = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
22 |
23 | USER_AGENT = UA_CHROME_MAC
24 |
--------------------------------------------------------------------------------
/geetest_offline/gd_list.json:
--------------------------------------------------------------------------------
1 | [
2 | "腾讯科技",
3 | "百度",
4 | "阿里巴巴"
5 | ]
--------------------------------------------------------------------------------
/geetest_offline/geetest_offline.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # -*- coding: utf-8 -*-
3 | '''
4 | geetest offline 5.9.0 - 6.0.0 for sh.gsxt.gov.cn, he.gsxt.gov.cn
5 | '''
6 |
7 | import os
8 | import time
9 | import random
10 | import logging
11 | from logging import NullHandler
12 | import requests
13 | import execjs
14 | from bs4 import BeautifulSoup
15 | import constants
16 | import util
17 |
18 | logging.getLogger(__name__).addHandler(NullHandler())
19 | logging.basicConfig(level=logging.DEBUG)
20 |
21 | HOST = ''
22 | INDEX = ''
23 |
24 | JSRUNTIME = execjs.get(execjs.runtime_names.Node)
25 |
26 | CAPTCHA_JSON = []
27 |
28 | USERRESPONSE_JSCONTEXT = JSRUNTIME.compile(util.USERRESPONSE_JS)
29 |
30 | TIMEOUT = 10
31 |
32 | GSXT_HOST_SH = 'http://sh.gsxt.gov.cn'
33 | GSXT_INDEX_SH = GSXT_HOST_SH + '/notice/'
34 | GSXT_HOST_HE = 'http://he.gsxt.gov.cn'
35 | GSXT_INDEX_HE = GSXT_HOST_HE + '/notice/'
36 |
37 | def config(host, index):
38 |     '''Set the host and index URLs'''
39 | global HOST, INDEX
40 | HOST, INDEX = host, index
41 |
42 |
43 | def calc_userresponse(distance, challenge):
44 |     '''Compute the userresponse value from the slide distance and the challenge'''
45 | return USERRESPONSE_JSCONTEXT.call('userresponse', distance, challenge)
46 |
47 |
48 | def calc_validate(challenge):
49 |     '''Compute the validate value'''
50 | _r = random.randint(0, len(util.OFFLINE_SAMPLE)-1)
51 | distance, rand0, rand1 = util.OFFLINE_SAMPLE[_r]
52 | distance_r = calc_userresponse(distance, challenge)
53 | rand0_r = calc_userresponse(rand0, challenge)
54 | rand1_r = calc_userresponse(rand1, challenge)
55 | validate = distance_r + '_' + rand0_r + '_' + rand1_r
56 | logging.debug(validate)
57 | return validate
58 |
59 |
60 | def parse_token(html_doc):
61 |     '''Parse the HTML page with BeautifulSoup and find session.token'''
62 | soup = BeautifulSoup(html_doc, 'html.parser')
63 | _find = soup.find('input', attrs={'name': 'session.token'})
64 | return _find['value'] if _find else None
65 |
66 |
67 | def parse_code(html_doc):
68 |     '''Parse the HTML page with BeautifulSoup and find the unified social credit codes'''
69 | _soup = BeautifulSoup(html_doc, 'html.parser')
70 | _findall = _soup.find_all('div', class_='tableContent page-item')
71 | _result = []
72 | if _findall:
73 | for _a in _findall:
74 | _td = _a.find('td')
75 | _td_str = ''.join(_td.get_text().split())
76 | _i = _a.find('i')
77 | _i_str = ''.join(_i.get_text().split())
78 | _td_str = _td_str[0: -len(_i_str)]
79 | _th = _a.find('th', class_='icon1')
80 | _em = _th.find('em')
81 | _result.append((_td_str.encode('utf-8'), _em.get_text().encode('utf-8')))
82 | else:
83 | logging.info('Code Not Found')
84 | return _result
85 |
86 |
87 | def get_main(session):
88 |     '''GET the gsxt home page.'''
89 | _url = INDEX
90 | logging.debug('GET ' + _url)
91 | _headers = {'Accept': constants.ACCEPT_HTML,
92 | 'Accept-Language': constants.ACCEPT_LANGUAGE,
93 | 'User-Agent': constants.USER_AGENT}
94 | _response = session.get(_url, headers=_headers, timeout=TIMEOUT)
95 | logging.debug('response code:' + str(_response.status_code))
96 | return parse_token(_response.text) if _response.status_code == 200 else None
97 |
98 |
99 | def get_register(session):
100 | '''
101 | {"success": 0,
102 | "gt": "39134c54afef1e0b19228627406614e9",
103 | "challenge": "fc490ca45c00b1249bbe3554a4fdf6fb35"}
104 | '''
105 | _url = INDEX + 'pc-geetest/register'
106 | logging.debug('GET ' + _url)
107 | _headers = {'Accept': constants.ACCEPT_JSON,
108 | 'Accept-Language': constants.ACCEPT_LANGUAGE,
109 | 'User-Agent': constants.USER_AGENT,
110 | 'Referer': INDEX,
111 | 'X-Requested-With': 'XMLHttpRequest'}
112 | _params = {'v': str(int(time.time() * 1000))}
113 | _response = session.get(_url, headers=_headers, params=_params, timeout=TIMEOUT)
114 | logging.debug('response code: ' + str(_response.status_code))
115 | logging.debug('response text: ' + _response.text)
116 | if _response.status_code != 200:
117 | return False
118 | global CAPTCHA_JSON
119 | CAPTCHA_JSON = _response.json()
120 | return True
121 |
122 |
123 | def post_verify_ip(session):
124 | ''' POST /notice/security/verify_ip'''
125 | _url = INDEX + 'security/verify_ip'
126 | logging.debug('POST ' + _url)
127 | _headers = {'Accept': constants.ACCEPT_TEXT,
128 | 'Accept-Language': constants.ACCEPT_LANGUAGE,
129 | 'User-Agent': constants.USER_AGENT,
130 | 'Referer': INDEX,
131 | 'X-Requested-With': 'XMLHttpRequest',
132 | 'Origin': HOST}
133 | _response = session.post(_url, headers=_headers, timeout=TIMEOUT)
134 | logging.debug('response code: ' + str(_response.status_code))
135 | logging.debug('response text: ' + _response.text)
136 | return _response.status_code == 200
137 |
138 |
139 | def post_verify_keyword(session, keyword):
140 | ''' POST /notice/security/verify_keyword HTTP/1.1'''
141 | _url = INDEX + 'security/verify_keyword'
142 | logging.debug('POST ' + _url)
143 | _headers = {'Accept': constants.ACCEPT_TEXT,
144 | 'Accept-Language': constants.ACCEPT_LANGUAGE,
145 | 'User-Agent': constants.USER_AGENT,
146 | 'Referer': INDEX,
147 | 'X-Requested-With': 'XMLHttpRequest',
148 | 'Origin': HOST}
149 | _params = {'keyword': keyword}
150 | _response = session.post(_url, headers=_headers, data=_params, timeout=TIMEOUT)
151 | logging.debug('response code: ' + str(_response.status_code))
152 | logging.debug('response text: ' + _response.text)
153 | return _response.status_code == 200
154 |
155 |
156 | def post_validate(session, validate):
157 | ''' POST /notice/pc-geetest/validate'''
158 | _url = INDEX + 'pc-geetest/validate'
159 | logging.debug('POST ' + _url)
160 | _headers = {'Accept': constants.ACCEPT_JSON,
161 | 'Accept-Language': constants.ACCEPT_LANGUAGE,
162 | 'User-Agent': constants.USER_AGENT,
163 | 'Referer': INDEX,
164 | 'X-Requested-With': 'XMLHttpRequest',
165 | 'Origin': HOST}
166 | _params = [('geetest_challenge', CAPTCHA_JSON['challenge']),
167 | ('geetest_validate', validate),
168 | ('geetest_seccode', validate + '|jordan')]
169 | _response = session.post(_url, headers=_headers, data=_params, timeout=TIMEOUT)
170 | logging.debug('response code: ' + str(_response.status_code))
171 | logging.debug('response text: ' + _response.text)
172 | if _response.status_code != 200:
173 | return False
174 | _json_obj = _response.json() # {"status":"success","version":"3.3.0"}
175 | logging.debug(_json_obj)
176 | return _json_obj['status'] == 'success'
177 |
178 |
179 | def post_search(session, validate, keyword, token):
180 | ''' POST /notice/search/ent_info_list HTTP/1.1'''
181 | _url = INDEX + 'search/ent_info_list'
182 | logging.debug('POST ' + _url)
183 | _headers = {'Accept': constants.ACCEPT_HTML,
184 | 'Accept-Language': constants.ACCEPT_LANGUAGE,
185 | 'User-Agent': constants.USER_AGENT,
186 | 'Referer': INDEX,
187 | 'X-Requested-With': 'XMLHttpRequest',
188 | 'Origin': HOST}
189 | _params = [('condition.searchType', 1),
190 | ('captcha', ''),
191 | ('geetest_challenge', CAPTCHA_JSON['challenge']),
192 | ('geetest_validate', validate),
193 | ('geetest_seccode', validate + '|jordan'),
194 | ('session.token', token),
195 | ('condition.keyword', keyword)]
196 | _response = session.post(_url, headers=_headers, data=_params, timeout=TIMEOUT)
197 | logging.debug('response code: ' + str(_response.status_code))
198 | #logger.debug('response text: ' + _response.text)
199 | if _response.status_code != 200:
200 | return None, None
201 | return parse_code(_response.text), parse_token(_response.text)
202 |
203 |
204 | def get_validate(session, keyword):
205 |     '''Retry the validate verification in a loop.'''
206 | for _ in range(10):
207 | if not get_register(session):
208 | return None
209 |
210 | if not post_verify_ip(session):
211 | return None
212 |
213 | if not post_verify_keyword(session, keyword):
214 | return None
215 |
216 | validate = calc_validate(CAPTCHA_JSON['challenge'])
217 | if post_validate(session, validate):
218 | return validate
219 | return None
220 |
221 |
222 | def query_keyword(session, keyword, token):
223 |     '''Query keyword using session, updating session.token.'''
224 | if not token:
225 | token = get_main(session)
226 | if not token:
227 | return None
228 |
229 | validate = get_validate(session, keyword)
230 | if not validate:
231 | return None
232 |
233 | return post_search(session, validate, keyword, token)
234 |
235 |
236 | def query_leveldb(query_db, save_db, queryed_db):
237 | '''query by leveldb'''
238 | try:
239 | with requests.Session() as session:
240 | _token = ''
241 | for _name, _code in query_db.RangeIter():
242 | if not util.has_key(save_db, _name) and not util.has_key(queryed_db, _name):
243 |                     # fuzzy search
244 | _subname = _name[0: 18] if len(_name) > 18 else _name
245 | logging.info(_name + ' -> ' + _subname)
246 | _query_code, _token = query_keyword(session, _subname, _token)
247 | if _query_code:
248 | for _r in _query_code:
249 | logging.info(_r[0].decode() + ' : ' + _r[1].decode())
250 | save_db.Put(_r[0], _r[1], sync=True)
251 | queryed_db.Put(_name, '', sync=True)
252 | return True
253 | except requests.RequestException as _e:
254 | logging.error(_e)
255 | return False
256 |
257 |
258 | def query_keyword_helper(keyword):
259 |     '''Run a single keyword query against a gsxt regional site.'''
260 | try:
261 | with requests.Session() as session:
262 | _token = ''
263 | logging.info(keyword)
264 | _query_code, _token = query_keyword(session, keyword, _token)
265 | if _query_code:
266 | for _r in _query_code:
267 | logging.info(_r[0].decode() + ' : ' + _r[1].decode())
268 | return True
269 | except requests.RequestException as _e:
270 | logging.error(_e)
271 | return False
272 |
273 |
274 | def query_leveldb_helper():
275 |     '''Batch-query every entry in the leveldb database.'''
276 | try:
277 | import leveldb
278 | except ImportError:
279 |         raise ImportError('The leveldb package is not installed.')
280 |
281 | config(GSXT_HOST_HE, GSXT_INDEX_HE)
282 |
283 | query_db_file = os.path.join(os.getcwd(), 'data', 'shanghai.db')
284 | query_db = leveldb.LevelDB(query_db_file)
285 |
286 | save_db_file = os.path.join(os.getcwd(), 'data', 'shanghai_code.db')
287 | save_db = leveldb.LevelDB(save_db_file)
288 |
289 | queryed_db_file = os.path.join(os.getcwd(), 'data', 'shanghai_queryed.db')
290 | queryed_db = leveldb.LevelDB(queryed_db_file)
291 |
292 | _loop = True
293 | while _loop:
294 | _loop = not query_leveldb(query_db, save_db, queryed_db)
295 |
296 |
297 | if __name__ == "__main__":
298 | config(GSXT_HOST_SH, GSXT_INDEX_SH)
299 | query_keyword_helper('百度')
300 | config(GSXT_HOST_HE, GSXT_INDEX_HE)
301 | query_keyword_helper('百度')
302 |
--------------------------------------------------------------------------------
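calc_validate above delegates the core arithmetic to USERRESPONSE_JS through Node. For readers who do not want to trace the minified JavaScript, here is a rough Python transliteration of that userresponse routine. It is illustrative only; the scripts keep executing the original JS via PyExecJS, and the random bucket choice means outputs differ run to run:

```python
import random

def userresponse_py(distance, challenge):
    '''Rough Python equivalent of USERRESPONSE_JS (see util.py).'''
    # The last two challenge characters encode a base-36 offset.
    digits = [ord(c) - 87 if ord(c) > 57 else ord(c) - 48 for c in challenge[32:]]
    value = round(distance) + 36 * digits[0] + digits[1]
    # Deal the unique characters of the first 32 chars into 5 buckets.
    buckets, seen, k = [[] for _ in range(5)], set(), 0
    for ch in challenge[:32]:
        if ch not in seen:
            seen.add(ch)
            buckets[k].append(ch)
            k = (k + 1) % 5
    # Greedily spend `value` using denominations 1/2/5/10/50, emitting a
    # random character from the bucket matching each denomination.
    denoms, out, o = [1, 2, 5, 10, 50], [], 4
    while value > 0:
        if value - denoms[o] >= 0:
            out.append(buckets[o][random.randrange(len(buckets[o]))])
            value -= denoms[o]
        else:
            buckets.pop(o)
            denoms.pop(o)
            o -= 1
    return ''.join(out)
```

The server apparently only checks that the string decodes back to the expected distance, which is why replaying one of the pre-recorded OFFLINE_SAMPLE triples is enough in offline mode.
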
/geetest_offline/geetest_offline_gd.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # -*- coding: utf-8 -*-
3 | '''
4 | geetest offline 6.0.0 spider for gd.gsxt.gov.cn
5 | '''
6 |
7 | import os
8 | import time
9 | import random
10 | import logging
11 | from logging import NullHandler
12 | import json
13 | import requests
14 | import execjs
15 | from bs4 import BeautifulSoup
16 |
17 | import constants
18 | import util
19 |
20 |
21 | logging.getLogger(__name__).addHandler(NullHandler())
22 | logging.basicConfig(level=logging.DEBUG)
23 |
24 | HOST = 'http://gd.gsxt.gov.cn'
25 | INDEX = HOST
26 |
27 | JSRUNTIME = execjs.get(execjs.runtime_names.Node)
28 |
29 | USERRESPONSE_JSCONTEXT = JSRUNTIME.compile(util.USERRESPONSE_JS)
30 |
31 | TIMEOUT = 15
32 |
33 | GD_LIST_FILE = 'gd_list.json'
34 | GD_RESULT_FILE = 'gd_result.json'
35 | GD_NOTFOUND_FILE = 'gd_notfound.json'
36 |
37 | def load_json(json_file):
38 | '''load json file'''
39 | if not os.path.isfile(json_file):
40 | logging.info("Json File Not Exist")
41 | return None
42 | with open(json_file, 'r', encoding='utf8') as _f:
43 | json_data = json.load(_f)
44 | logging.info(len(json_data))
45 | return json_data
46 |
47 |
48 | def save_json(json_file, json_data):
49 | '''save json file'''
50 | with open(json_file, 'w', encoding='utf8') as _f:
51 | json.dump(json_data, _f, indent=2, sort_keys=True, ensure_ascii=False)
52 | logging.info(len(json_data))
53 |
54 |
55 | def calc_userresponse(distance, challenge):
56 |     '''Compute the userresponse value from the slide distance and the challenge.'''
57 | return USERRESPONSE_JSCONTEXT.call('userresponse', distance, challenge)
58 |
59 |
60 | def calc_validate(challenge):
61 | '''calculate validate'''
62 | _r = random.randint(0, len(util.OFFLINE_SAMPLE)-1)
63 | distance, rand0, rand1 = util.OFFLINE_SAMPLE[_r]
64 | distance_r = calc_userresponse(distance, challenge)
65 | rand0_r = calc_userresponse(rand0, challenge)
66 | rand1_r = calc_userresponse(rand1, challenge)
67 | validate = distance_r + '_' + rand0_r + '_' + rand1_r
68 | logging.debug(validate)
69 | return validate
70 |
71 |
72 | def parse_name_url(html_doc):
73 |     '''Parse the HTML page with BeautifulSoup and find the detail links.'''
74 | _soup = BeautifulSoup(html_doc, 'html.parser')
75 | _findall = _soup.find_all('div',
76 | class_="clickStyle",
77 | style='margin-left: 160px;padding-left: 10px;')
78 | name_url_array = []
79 | if _findall:
80 | for _a in _findall:
81 | _company = _a.find('a')
82 | _name = ''.join(_company.get_text().split())
83 | _url = _company['href']
84 | if _url.startswith('../'):
85 | _url = INDEX + '/aiccips/CheckEntContext/' + _url
86 | name_url_array.append((_name, _url))
87 | logging.info(name_url_array)
88 | else:
89 | logging.error('Company Link Not Found')
90 | return name_url_array
91 |
92 |
93 | def get_mainpage(session):
94 | '''
95 | Get http://gd.gsxt.gov.cn
96 | Response Code 200
97 | '''
98 | logging.debug('GET ' + INDEX)
99 | _headers = {'Accept': constants.ACCEPT_HTML,
100 | 'Accept-Language': constants.ACCEPT_LANGUAGE,
101 | 'User-Agent': constants.USER_AGENT}
102 | _response = session.get(INDEX, headers=_headers, timeout=TIMEOUT)
103 | logging.debug('response code:' + str(_response.status_code))
104 | return _response.status_code == 200
105 |
106 |
107 | def get_captcha(session):
108 | '''
109 | GET /aiccips//verify/start.html
110 | Response JSON
111 | {
112 | "success": 0,
113 | "gt": "c02ee51ee0afe88899efe6dc729627fc",
114 | "challenge": "ed3d2c21991e3bef5e069713af9fa6caed"
115 | }
116 | '''
117 | _url = INDEX + '/aiccips//verify/start.html'
118 | logging.debug('GET ' + _url)
119 | _headers = {'Accept': constants.ACCEPT_JSON,
120 | 'Accept-Language': constants.ACCEPT_LANGUAGE,
121 | 'User-Agent': constants.USER_AGENT,
122 | 'Referer': INDEX,
123 | 'X-Requested-With': 'XMLHttpRequest'}
124 | _params = {'t': str(int(time.time() * 1000))}
125 | _response = session.get(_url, headers=_headers, params=_params, timeout=TIMEOUT)
126 | logging.debug('response code: ' + str(_response.status_code))
127 | logging.debug('response text: ' + _response.text)
128 | if _response.status_code != 200:
129 | return False
130 | return _response.json()
131 |
132 |
133 | def post_validate(session, challenge, validate, keyword):
134 | '''
135 | POST /aiccips/verify/sec.html
136 | Response JSON
137 | {
138 | "status": "success",
139 | "textfield": "waY5F5lZyxvKw9bMM4nBs7HUgWS1SRpagFutRKqs/+DkRqCIS9N4PUCqM9fmrbg1",
140 | "version": "3.3.0"
141 | }
142 | '''
143 | _url = INDEX + '/aiccips/verify/sec.html'
144 | logging.debug('POST ' + _url)
145 | _headers = {'Accept': constants.ACCEPT_JSON,
146 | 'Accept-Language': constants.ACCEPT_LANGUAGE,
147 | 'User-Agent': constants.USER_AGENT,
148 | 'Referer': INDEX,
149 | 'X-Requested-With': 'XMLHttpRequest',
150 | 'Origin': HOST}
151 | _params = [('textfield', keyword),
152 | ('geetest_challenge', challenge),
153 | ('geetest_validate', validate),
154 | ('geetest_seccode', validate + '|jordan')]
155 | _response = session.post(_url, headers=_headers, data=_params, timeout=TIMEOUT)
156 | logging.debug('response code: ' + str(_response.status_code))
157 | logging.debug('response text: ' + _response.text)
158 | if _response.status_code != 200:
159 | return False
160 | _json_obj = _response.json()
161 | logging.debug(_json_obj)
162 | return _json_obj['textfield'] if _json_obj['status'] == 'success' else None
163 |
164 |
165 | def post_search(session, textfield):
166 | '''
167 | POST /aiccips/CheckEntContext/showCheck.html
168 | Response HTML WebPage
169 | '''
170 | _url = INDEX + '/aiccips/CheckEntContext/showCheck.html'
171 | logging.debug('POST ' + _url)
172 | _headers = {'Accept': constants.ACCEPT_HTML,
173 | 'Accept-Language': constants.ACCEPT_LANGUAGE,
174 | 'User-Agent': constants.USER_AGENT,
175 | 'Referer': INDEX,
176 | 'X-Requested-With': 'XMLHttpRequest',
177 | 'Origin': HOST}
178 | _params = [('textfield', textfield),
179 | ('type', 'nomal')]
180 | _response = session.post(_url, headers=_headers, data=_params, timeout=TIMEOUT)
181 | logging.debug('response code: ' + str(_response.status_code))
182 | logging.debug('response text: ' + _response.text)
183 | if _response.status_code != 200:
184 | return None
185 | return parse_name_url(_response.text)
186 |
187 |
188 | def get_validate(session, keyword):
189 | '''safe loop post validate'''
190 | for _ in range(10):
191 | captcha = get_captcha(session)
192 | if not captcha:
193 | return None
194 |
195 | validate = calc_validate(captcha['challenge'])
196 | textfield = post_validate(session, captcha['challenge'], validate, keyword)
197 | if textfield:
198 | return textfield
199 | return None
200 |
201 |
202 | def parse_detail_sz(html_doc):
203 | '''parse company detail for shenzhen'''
204 | _soup = BeautifulSoup(html_doc, 'html.parser')
205 | _yyzz = _soup.find('div', class_='item_box', id='yyzz')
206 | if not _yyzz:
207 | logging.error('Detail yyzz Not Found')
208 | return None
209 |
210 | _li_all = _yyzz.find_all('li')
211 | if not _li_all:
212 | logging.error("Detail li Not Found")
213 | return None
214 |
215 | _info = {}
216 | for _li in _li_all:
217 | _text = ''.join(_li.get_text().split())
218 | _k, _v = _text.split(sep=':', maxsplit=1)
219 | _info[_k] = _v
220 | logging.info(_info)
221 |     if not _info.get('企业名称'):
222 | _info = None # for safe
223 | return _info
224 |
225 |
226 | def parse_detail(html_doc):
227 | '''parse company detail for guangzhou and other'''
228 | _soup = BeautifulSoup(html_doc, 'html.parser')
229 | _table = _soup.find('table', cellspacing='6')
230 | if not _table:
231 | logging.error('Detail table Not Found')
232 | return None
233 |
234 | _tr_all = _table.find_all('td')
235 | if not _tr_all:
236 | logging.error("Detail td Not Found")
237 | return None
238 |
239 | _info = {}
240 | for _td in _tr_all:
241 | _text = ''.join(_td.get_text().split())
242 | if _text == '营业执照信息':
243 | continue
244 | _k, _v = _text.split(sep=':', maxsplit=1)
245 | _temp = {}
246 | _temp[_k] = _v
247 | for _k2, _v2 in _temp.items():
248 | if _k2 == '.企业名称' or _k2 == '.名称':
249 | _info['企业名称'] = _v2
250 | elif _k2 == '.统一社会信用代码/注册号' or _k2 == '.注册号':
251 | _info['注册号/统一社会信用代码'] = _v2
252 | elif _k2 == '.类型':
253 | _info['类型'] = _v2
254 | elif _k2 == '.负责人' or _k2 == '.经营者':
255 | _info['法定代表人'] = _v2
256 | elif _k2 == '.成立日期' or _k2 == '.注册日期':
257 | _info['成立日期'] = _v2
258 | elif _k2 == '.营业期限自':
259 | _info['营业期限自'] = _v2
260 | elif _k2 == '.营业期限至':
261 | _info['营业期限至'] = _v2
262 | elif _k2 == '.登记机关':
263 | _info['登记机关'] = _v2
264 | elif _k2 == '.核准日期':
265 | _info['核准日期'] = _v2
266 | elif _k2 == '.登记状态':
267 | _info['登记状态'] = _v2
268 | elif _k2 == '.营业场所' or _k2 == '.经营场所':
269 | _info['住所'] = _v2
270 | elif _k2 == '.经营范围':
271 | _info['经营范围'] = _v2
272 | _info['注册资本'] = '0'
273 | logging.info(_info)
274 |     if not _info.get('企业名称'):
275 | _info = None # for safe
276 | return _info
277 |
278 |
279 | def query_keyword(session, keyword):
280 | '''query keyword'''
281 | #if not get_mainpage(session):
282 | # return None
283 | logging.info(keyword)
284 | textfield = get_validate(session, keyword)
285 | if textfield:
286 | return post_search(session, textfield)
287 | return None
288 |
289 |
290 | def safe_query_keyword(keyword):
291 | '''Safe query keyword, handle network timeout and retry'''
292 | for _ in range(5):
293 | try:
294 | with requests.Session() as session:
295 | return query_keyword(session, keyword)
296 | except requests.RequestException as _e:
297 | logging.error(_e)
298 | time.sleep(5)
299 | return None
300 |
301 |
302 | def query_detail(session, url):
303 | '''query company detail url'''
304 | logging.debug('GET ' + url)
305 | _headers = {'Accept': constants.ACCEPT_HTML,
306 | 'Accept-Language': constants.ACCEPT_LANGUAGE,
307 | 'User-Agent': constants.USER_AGENT}
308 | _response = session.get(url, headers=_headers, timeout=TIMEOUT)
309 | logging.debug('response code:' + str(_response.status_code))
310 | if _response.status_code == 200:
311 |         if url.find('www.szcredit.org.cn') != -1:
312 |             return parse_detail_sz(_response.text)
313 |         elif url.find('GSpublicityList.html') != -1:
314 | return parse_detail(_response.text)
315 | else:
316 | logging.error('URL Type Not Support')
317 | return None
318 |
319 |
320 | def safe_query_detail(url):
321 | '''Safe query url, handle network timeout and retry multi times.'''
322 | for _ in range(5):
323 | try:
324 | with requests.Session() as session:
325 | return query_detail(session, url)
326 | except requests.RequestException as _e:
327 | logging.error(_e)
328 | time.sleep(5)
329 | return None
330 |
331 |
332 | def query_entry():
333 | '''main entry'''
334 | lists = load_json(GD_LIST_FILE)
335 | if not lists:
336 | lists = []
337 | results = load_json(GD_RESULT_FILE)
338 | if not results:
339 | results = {}
340 | notfound = load_json(GD_NOTFOUND_FILE)
341 | if not notfound:
342 | notfound = []
343 |
344 | for keyword in lists:
345 | if keyword in results:
346 | continue
347 | if keyword in notfound:
348 | continue
349 | name_url_array = safe_query_keyword(keyword)
350 | if not name_url_array:
351 | notfound.append(keyword)
352 | continue
353 | for name, url in name_url_array:
354 | if name in results:
355 | continue
356 | detail_dict = safe_query_detail(url)
357 | if detail_dict:
358 | results.update({name : detail_dict})
359 |     save_json(GD_RESULT_FILE, results)
360 |     save_json(GD_NOTFOUND_FILE, notfound)
361 | logging.info('done')
362 |
363 |
364 | if __name__ == "__main__":
365 | query_entry()
366 |
--------------------------------------------------------------------------------
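safe_query_keyword and safe_query_detail above share one shape: open a fresh Session, call through, and retry on requests.RequestException with a fixed back-off. As a sketch, the pattern could be factored into a single helper (with_retries is a hypothetical refactoring, not part of the repo):

```python
import logging
import time

import requests

def with_retries(func, *args, attempts=5, delay=5):
    '''Run func(session, *args) in a fresh Session, retrying on network errors.'''
    for _ in range(attempts):
        try:
            with requests.Session() as session:
                return func(session, *args)
        except requests.RequestException as err:
            logging.error(err)
            time.sleep(delay)
    return None

# e.g. with_retries(query_keyword, '腾讯科技') or with_retries(query_detail, url)
```
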
/geetest_offline/geetest_offline_nm.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # -*- coding: utf-8 -*-
3 | '''
4 | geetest offline 5.9.0 - 6.0.0 spider for nm.gsxt.gov.cn
5 | The HTTP protocol differs slightly from the other sites.
6 | '''
7 |
8 | import time
9 | import random
10 | import logging
11 | from logging import NullHandler
12 | import requests
13 | import execjs
14 | from bs4 import BeautifulSoup
15 | import constants
16 | import util
17 |
18 | logging.getLogger(__name__).addHandler(NullHandler())
19 | logging.basicConfig(level=logging.DEBUG)
20 |
21 | HOST = ''
22 | INDEX = ''
23 |
24 | JSRUNTIME = execjs.get(execjs.runtime_names.Node)
25 |
26 | CAPTCHA_JSON = {}
27 |
28 | USERRESPONSE_JSCONTEXT = JSRUNTIME.compile(util.USERRESPONSE_JS)
29 |
30 | TIMEOUT = 10
31 |
32 | GSXT_HOST_NM = 'http://nm.gsxt.gov.cn:58888'
33 | GSXT_INDEX_NM = GSXT_HOST_NM + '/'
34 |
35 | def config(host, index):
36 |     '''Set the host and index URL.'''
37 | global HOST, INDEX
38 | HOST, INDEX = host, index
39 |
40 |
41 | def calc_userresponse(distance, challenge):
42 |     '''Compute the userresponse value from the slide distance and the challenge.'''
43 | return USERRESPONSE_JSCONTEXT.call('userresponse', distance, challenge)
44 |
45 |
46 | def calc_validate(challenge):
47 |     '''Compute the validate value.'''
48 | _r = random.randint(0, len(util.OFFLINE_SAMPLE)-1)
49 | distance, rand0, rand1 = util.OFFLINE_SAMPLE[_r]
50 | distance_r = calc_userresponse(distance, challenge)
51 | rand0_r = calc_userresponse(rand0, challenge)
52 | rand1_r = calc_userresponse(rand1, challenge)
53 | validate = distance_r + '_' + rand0_r + '_' + rand1_r
54 | logging.debug(validate)
55 | return validate
56 |
57 |
58 | def parse_code(html_doc):
59 |     '''Parse the HTML page with BeautifulSoup, find the unified social credit codes and the total result count (for paging).'''
60 | _soup = BeautifulSoup(html_doc, 'html.parser')
61 | # find result number
62 | _span = _soup.find('span', attrs={'style': 'color: red'})
63 | _number = int(''.join(_span.get_text().split())) if _span else 0
64 |     logging.debug('result count = ' + str(_number))
65 | if not _number:
66 | logging.error('Number Not Found')
67 | return None, 0
68 |
69 | _div_all = _soup.find_all('div', class_='clickStyle', attrs={'onclick': 'details(this)'})
70 | _result = []
71 | if _div_all:
72 | for _div in _div_all:
73 | _a = _div.find('a', class_='font16', attrs={'target': '_blank'})
74 | _a_str = _a.get_text()
75 | _td = _div.find('td', attrs={'style': 'width: 35%'})
76 | _span = _td.find('span', class_='dataTextStyle')
77 | _span_str = ''.join(_span.get_text().split())
78 | _result.append((_a_str.encode('utf-8'), _span_str.encode('utf-8')))
79 | else:
80 | logging.info('Code Not Found')
81 | logging.info(html_doc)
82 | return _result, _number
83 |
84 |
85 | def get_main(session):
86 |     '''GET the gsxt home page.'''
87 | _url = INDEX
88 | logging.debug('GET ' + _url)
89 | _headers = {'Accept': constants.ACCEPT_HTML,
90 | 'Accept-Language': constants.ACCEPT_LANGUAGE,
91 | 'User-Agent': constants.USER_AGENT}
92 | _response = session.get(_url, headers=_headers, timeout=TIMEOUT)
93 | logging.debug('response code:' + str(_response.status_code))
94 | return _response.status_code == 200
95 |
96 |
97 | def get_verify_start(session):
98 | '''
99 | {"success": 0,
100 | "gt": "39134c54afef1e0b19228627406614e9",
101 | "challenge": "fc490ca45c00b1249bbe3554a4fdf6fb35"}
102 | '''
103 | _url = INDEX + '/verify/start.html'
104 | logging.debug('GET ' + _url)
105 | _headers = {'Accept': constants.ACCEPT_JSON,
106 | 'Accept-Language': constants.ACCEPT_LANGUAGE,
107 | 'User-Agent': constants.USER_AGENT,
108 | 'Referer': INDEX,
109 | 'X-Requested-With': 'XMLHttpRequest'}
110 | _params = {'v': str(int(time.time() * 1000))}
111 | _response = session.get(_url, headers=_headers, params=_params, timeout=TIMEOUT)
112 | logging.debug('response code: ' + str(_response.status_code))
113 | logging.debug('response text: ' + _response.text)
114 | if _response.status_code != 200:
115 | return False
116 | global CAPTCHA_JSON
117 | CAPTCHA_JSON = _response.json()
118 | return True
119 |
120 |
121 | def post_verify_sec(session, validate, keyword):
122 | ''' POST /verify/sec.html'''
123 | _url = INDEX + 'verify/sec.html'
124 | logging.debug('POST ' + _url)
125 | _headers = {'Accept': constants.ACCEPT_JSON,
126 | 'Accept-Language': constants.ACCEPT_LANGUAGE,
127 | 'User-Agent': constants.USER_AGENT,
128 | 'Referer': INDEX,
129 | 'X-Requested-With': 'XMLHttpRequest',
130 | 'Origin': HOST}
131 | _params = [('textfield', keyword),
132 | ('geetest_challenge', CAPTCHA_JSON['challenge']),
133 | ('geetest_validate', validate),
134 | ('geetest_seccode', validate + '|jordan')]
135 | _response = session.post(_url, headers=_headers, data=_params, timeout=TIMEOUT)
136 | logging.debug('response code: ' + str(_response.status_code))
137 | logging.debug('response text: ' + _response.text)
138 | if _response.status_code != 200:
139 | return None
140 | _json_obj = _response.json()
141 | logging.debug(_json_obj)
142 | return _json_obj['textfield'] if _json_obj['status'] == 'success' else None
143 |
144 |
145 | def post_search(session, textfield, page):
146 | ''' POST /CheckEntContext/showCheck.html'''
147 | _url = INDEX + 'CheckEntContext/showCheck.html'
148 | logging.debug('POST ' + _url)
149 | _headers = {'Accept': constants.ACCEPT_HTML,
150 | 'Accept-Language': constants.ACCEPT_LANGUAGE,
151 | 'User-Agent': constants.USER_AGENT,
152 | 'Referer': INDEX,
153 | 'X-Requested-With': 'XMLHttpRequest',
154 | 'Origin': HOST}
155 | _params = [('textfield', textfield),
156 | ('type', 'nomal')]
157 | if page > 1:
158 | _params.append(('total', ''))
159 | _params.append(('pageNo', page))
160 |
161 | _response = session.post(_url, headers=_headers, data=_params, timeout=TIMEOUT)
162 | logging.debug('response code: ' + str(_response.status_code))
163 | #logger.debug('response text: ' + _response.text)
164 | if _response.status_code != 200:
165 | return None, None
166 | return parse_code(_response.text)
167 |
168 |
169 | def get_validate(session, keyword):
170 |     '''Retry the validate verification in a loop.'''
171 | for _ in range(10):
172 | if not get_verify_start(session):
173 | return None
174 |
175 | validate = calc_validate(CAPTCHA_JSON['challenge'])
176 | textfield = post_verify_sec(session, validate, keyword)
177 | if textfield:
178 | return textfield
179 | return None
180 |
181 |
182 | def query_keyword(session, keyword):
183 |     '''Query keyword using session and page through all results.'''
184 | if not get_main(session):
185 | return None
186 |
187 | textfield = get_validate(session, keyword)
188 | if not textfield:
189 | return None
190 |
191 | _code_all = []
192 | _number = 50 # max result number
193 | _page = 0 # start page number
194 | while _page * 10 < _number:
195 | _page += 1
196 | _code, _number = post_search(session, textfield, _page)
197 | if _code:
198 | _code_all.extend(_code)
199 | else:
200 | break
201 |
202 | return _code_all
203 |
204 |
205 | def query_leveldb(query_db, save_db, queryed_db):
206 | '''query by leveldb'''
207 | try:
208 | with requests.Session() as session:
209 | for _name, _code in query_db.RangeIter():
210 | if not util.has_key(save_db, _name) and not util.has_key(queryed_db, _name):
211 |                     # fuzzy search
212 | _subname = _name[0: 18] if len(_name) > 18 else _name
213 | logging.info(_name + ' -> ' + _subname)
214 | _code_all = query_keyword(session, _subname)
215 | if _code_all:
216 | for _c in _code_all:
217 |                         logging.info(_c[0].decode() + ' : ' + _c[1].decode())
218 | save_db.Put(_c[0], _c[1], sync=True)
219 | queryed_db.Put(_name, '', sync=True)
220 | return True
221 | except requests.RequestException as _e:
222 | logging.error(_e)
223 | return False
224 |
225 |
226 | def query_keyword_helper(keyword):
227 |     '''Run a single query for keyword.'''
228 | try:
229 | with requests.Session() as session:
230 | _code_all = query_keyword(session, keyword)
231 | if _code_all:
232 | logging.info(len(_code_all))
233 | for _r in _code_all:
234 | logging.info(_r[0].decode() + ' : ' + _r[1].decode())
235 | except requests.RequestException as _e:
236 | logging.error(_e)
237 |
238 |
239 | if __name__ == "__main__":
240 | config(GSXT_HOST_NM, GSXT_INDEX_NM)
241 | query_keyword_helper('百度')
242 |
--------------------------------------------------------------------------------
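Unlike the sh/he variant, the nm spider pages through results: parse_code returns the total result count, and query_keyword keeps requesting pages while `page * 10 < number` (ten results per page), stopping early on an empty page. A tiny sketch of the loop's arithmetic, assuming the 10-per-page layout (pages_to_fetch is illustrative, not part of the repo):

```python
def pages_to_fetch(total_results, per_page=10):
    # query_keyword stops once page * per_page >= total_results,
    # i.e. it fetches ceil(total_results / per_page) pages.
    return -(-total_results // per_page)  # ceiling division

assert pages_to_fetch(41) == 5
assert pages_to_fetch(10) == 1
```

The initial `_number = 50` in query_keyword is just a bootstrap bound; it is replaced by the real count as soon as the first page is parsed.
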
/geetest_offline/util.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # -*- coding: utf-8 -*-
3 | '''
4 | Common geetest helper functions
5 | '''
6 |
7 | SPLIT_ARRAY_JS = '''
8 | function getSplitArray() {
9 | for (var a, b = "6_11_7_10_4_12_3_1_0_5_2_9_8".split("_"), c = [], d = 0, e = 52; d < e; d++)
10 | a = 2 * parseInt(b[parseInt(d % 26 / 2)]) + d % 2,
11 | parseInt(d / 2) % 2 || (a += d % 2 ? -1 : 1),
12 | a += d < 26 ? 26 : 0,
13 | c.push(a);
14 | return c
15 | }
16 | '''
17 |
18 | USERRESPONSE_JS = '''
19 | function userresponse(a, b) {
20 | for (var c = b.slice(32), d = [], e = 0; e < c.length; e++) {
21 | var f = c.charCodeAt(e);
22 | d[e] = f > 57 ? f - 87 : f - 48
23 | }
24 | c = 36 * d[0] + d[1];
25 | var g = Math.round(a) + c; b = b.slice(0, 32);
26 | var h, i = [ [], [], [], [], [] ], j = {}, k = 0; e = 0;
27 | for (var l = b.length; e < l; e++)
28 | h = b.charAt(e), j[h] || (j[h] = 1, i[k].push(h), k++, k = 5 == k ? 0 : k);
29 | for (var m, n = g, o = 4, p = "", q = [1, 2, 5, 10, 50]; n > 0;)
30 | n - q[o] >= 0 ? (m = parseInt(Math.random() * i[o].length, 10), p += i[o][m], n -= q[o]) : (i.splice(o, 1), q.splice(o, 1), o -= 1);
31 | return p
32 | }
33 | '''
34 |
35 | OFFLINE_SAMPLE = ((186, 1, 98),
36 | (82, 0, 136),
37 | (61, 5, 108),
38 | (128, 2, 7),
39 | (130, 4, 99),
40 | (189, 3, 65),
41 | (108, 5, 285),
42 | (136, 0, 36),
43 | (41, 0, 263),
44 | (124, 3, 185))
45 |
46 |
47 | TRACE_JS = '''
48 | var tracer = function () {
49 | c = function (traceArray) {
50 | for (var b, c, d, e = [], f = 0, g = [], h = 0, i = traceArray.length - 1; h < i; h++) {
51 | b = Math.round(traceArray[h + 1][0] - traceArray[h][0]),
52 | c = Math.round(traceArray[h + 1][1] - traceArray[h][1]),
53 | d = Math.round(traceArray[h + 1][2] - traceArray[h][2]),
54 | g.push([b, c, d]), 0 == b && 0 == c && 0 == d || (0 == b && 0 == c ? f += d : (e.push([b, c, d + f]), f = 0));
55 | }
56 | return 0 !== f && e.push([b, c, f]), e
57 | },
58 | d = function (a) {
59 | var b = "()*,-./0123456789:?@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqr",
60 | c = b.length,
61 | d = "",
62 | e = Math.abs(a),
63 | f = parseInt(e / c);
64 | f >= c && (f = c - 1), f && (d = b.charAt(f)), e %= c;
65 | var g = "";
66 | return a < 0 && (g += "!"), d && (g += "$"), g + d + b.charAt(e)
67 | },
68 | e = function (a) {
69 | for (var b = [
70 | [1, 0],
71 | [2, 0],
72 | [1, -1],
73 | [1, 1],
74 | [0, 1],
75 | [0, -1],
76 | [3, 0],
77 | [2, -1],
78 | [2, 1]
79 | ], c = "stuvwxyz~", d = 0, e = b.length; d < e; d++)
80 | if (a[0] == b[d][0] && a[1] == b[d][1]) return c[d];
81 | return 0
82 | },
83 | f = function (traceArray) {
84 | for (var b, f = c(traceArray), g = [], h = [], i = [], j = 0, k = f.length; j < k; j++) {
85 | b = e(f[j]), b ? h.push(b) : (g.push(d(f[j][0])), h.push(d(f[j][1]))), i.push(d(f[j][2]));
86 | }
87 | return g.join("") + "!!" + h.join("") + "!!" + i.join("")
88 | },
89 | g = function (traceArray) {
90 | var a = f(traceArray);
91 | return encodeURIComponent(a)
92 | };
93 | return {
94 | trace: g
95 | }
96 | }();
97 | exports.tracer = tracer;
98 | '''
99 |
100 | def has_key(database, key):
101 |     '''Safely check whether key exists in the leveldb database.'''
102 | try:
103 | database.Get(key)
104 | return True
105 | except KeyError:
106 | return False
107 |
--------------------------------------------------------------------------------
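A minimal usage sketch for util.py, assuming Node.js is available for PyExecJS and reusing the sample challenge value from the get_register docstring above; this reproduces exactly what calc_validate does in the spiders:

```python
import random

import execjs

import util

# Compile the extracted geetest routine in a Node runtime.
ctx = execjs.get(execjs.runtime_names.Node).compile(util.USERRESPONSE_JS)

challenge = 'fc490ca45c00b1249bbe3554a4fdf6fb35'  # sample value from the docstrings
distance, rand0, rand1 = random.choice(util.OFFLINE_SAMPLE)
validate = '_'.join(ctx.call('userresponse', v, challenge)
                    for v in (distance, rand0, rand1))
print(validate)  # output varies: the JS picks bucket characters at random
```
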
/geetest_online/README.md:
--------------------------------------------------------------------------------
1 | ### Cracking the GeeTest slider captcha in online mode
2 |
3 | Continuing with the [国家企业信用信息公示系统](http://www.gsxt.gov.cn) (National Enterprise Credit Information Publicity System) as the example. This adds a project and write-up at roughly 80% completion. The implementation mainly follows the analysis articles at [https://zhuanlan.zhihu.com/windev](https://zhuanlan.zhihu.com/windev).
4 |
5 | ### Implemented features
6 |
7 | #### 1. All HTTP request & response protocols
8 |
9 | Uses the `requests` library.
10 |
11 | #### 2. Captcha image reassembly and recognition
12 |
13 | Uses the `Pillow` library to locate the slider notch position precisely.
14 | A global `IMAGE_DEBUG` flag saves intermediate images to local temp files at each precision level, to inspect and improve the positioning.
15 |
16 | #### 3. Cracking the GeeTest JavaScript encryption and decryption
17 |
18 | Uses the `PyExecJS` library to run the GeeTest JavaScript routines and obtain the correct plaintext and ciphertext.
19 | Works best together with `NodeJS`.
20 |
21 | #### 4. Page parsing with the `BeautifulSoup4` library
22 |
23 | ### The unfinished 20%
24 |
25 | + Improve the simulation algorithm for user mouse-trace data (see the sketch after this README).
26 | + Handle the site's HTTP 521 anti-crawler response and complete the cookie validation logic.
27 |
28 | ### Python Dependencies
29 |
30 | ```bash
31 | pip install requests
32 | pip install Pillow
33 | pip install PyExecJS
34 | pip install beautifulsoup4
35 | ```
36 |
--------------------------------------------------------------------------------
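On the first unfinished item above: the TraceSample*.txt files under test/ hold recorded (x, y, t) mouse traces, and the matching *Parse.txt files list their (dx, dt) deltas. As a sketch of the first stage of the TRACE_JS encoder (its function `c`, see util.py and testgeetestjs.py), absolute samples become deltas and runs of pure time advance are folded into the next movement:

```python
def compress_trace(trace):
    '''Sketch of TRACE_JS stage 1: (x, y, t) samples -> movement deltas.'''
    out, pending = [], 0
    dx = dy = 0
    for (x0, y0, t0), (x1, y1, t1) in zip(trace, trace[1:]):
        dx, dy, dt = round(x1 - x0), round(y1 - y0), round(t1 - t0)
        if dx == 0 and dy == 0 and dt == 0:
            continue                      # drop fully idle samples
        if dx == 0 and dy == 0:
            pending += dt                 # pure time advance: defer it
        else:
            out.append((dx, dy, dt + pending))
            pending = 0
    if pending:
        out.append((dx, dy, pending))     # JS reuses the last delta here
    return out
```

For TraceSample01.txt this yields (3, 0, 376), (3, 0, 9), ... matching the (dx, dt) pairs in TraceSample01Parse.txt; the later stages of TRACE_JS then pack each delta into the custom character alphabet before URL-encoding.
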
/geetest_online/constants.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # -*- coding: utf-8 -*-
3 | '''
4 | Common constants for HTTP requests
5 | '''
6 |
7 | ACCEPT_ANY = '*/*'
8 |
9 | ACCEPT_TEXT = 'text/plain, */*; q=0.01'
10 |
11 | ACCEPT_HTML = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
12 |
13 | ACCEPT_JSON = 'application/json, text/javascript, */*; q=0.01'
14 |
15 | ACCEPT_IMAGE = 'image/webp,image/*,*/*;q=0.8'
16 |
17 | ACCEPT_LANGUAGE = 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2'
18 |
19 | UA_CHROME_WIN = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
20 |
21 | UA_CHROME_MAC = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
22 |
23 | USER_AGENT = UA_CHROME_MAC
24 |
--------------------------------------------------------------------------------
/geetest_online/image/bg.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/geetest_online/image/bg.jpg
--------------------------------------------------------------------------------
/geetest_online/image/bg.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/geetest_online/image/bg.webp
--------------------------------------------------------------------------------
/geetest_online/image/fullbg.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/geetest_online/image/fullbg.jpg
--------------------------------------------------------------------------------
/geetest_online/image/fullbg.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/geetest_online/image/fullbg.webp
--------------------------------------------------------------------------------
/geetest_online/image/slice.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/9468305/python-script/d49588574986fae8c86d701d59b3535e58537610/geetest_online/image/slice.webp
--------------------------------------------------------------------------------
/geetest_online/test/TraceSample01.txt:
--------------------------------------------------------------------------------
1 | [
2 | [-20, -27, 0],
3 | [0, 0, 0],
4 | [0, 0, 1],
5 | [3, 0, 376],
6 | [6, 0, 385],
7 | [11, 0, 392],
8 | [18, 0, 400],
9 | [25, 0, 408],
10 | [30, 0, 416],
11 | [35, 0, 425],
12 | [41, 0, 432],
13 | [44, 0, 440],
14 | [48, 0, 448],
15 | [51, 0, 457],
16 | [53, 0, 464],
17 | [54, 0, 472],
18 | [56, 0, 480],
19 | [58, 0, 488],
20 | [60, 0, 496],
21 | [61, 0, 512],
22 | [62, 0, 520],
23 | [64, 0, 528],
24 | [65, 0, 536],
25 | [67, 0, 544],
26 | [68, 0, 552],
27 | [70, 0, 560],
28 | [71, 0, 568],
29 | [73, 0, 584],
30 | [74, 0, 600],
31 | [76, 0, 624],
32 | [77, 0, 632],
33 | [78, 0, 648],
34 | [79, 0, 656],
35 | [80, 0, 672],
36 | [81, 0, 680],
37 | [83, 0, 688],
38 | [84, 0, 704],
39 | [85, 0, 712],
40 | [86, 0, 720],
41 | [87, 0, 728],
42 | [88, 0, 737],
43 | [89, 0, 744],
44 | [90, 0, 760],
45 | [91, 0, 769],
46 | [92, 0, 776],
47 | [93, 0, 800],
48 | [94, 0, 824],
49 | [95, 0, 832],
50 | [96, 0, 856],
51 | [97, 0, 864],
52 | [98, 0, 880],
53 | [99, 0, 920],
54 | [100, 0, 928],
55 | [101, 0, 936],
56 | [102, 0, 944],
57 | [103, 0, 952],
58 | [104, 0, 992],
59 | [105, 0, 1040],
60 | [106, 0, 1056],
61 | [107, 0, 1128],
62 | [108, 0, 1192],
63 | [109, 0, 1457],
64 | [110, 0, 1473],
65 | [110, -1, 1529],
66 | [110, -1, 3521]
67 | ]
--------------------------------------------------------------------------------
/geetest_online/test/TraceSample01Parse.txt:
--------------------------------------------------------------------------------
1 | distance = 110
2 | (0, 1)
3 | (3, 375)
4 | (3, 9)
5 | (5, 7)
6 | (7, 8)
7 | (7, 8)
8 | (5, 8)
9 | (5, 9)
10 | (6, 7)
11 | (3, 8)
12 | (4, 8)
13 | (3, 9)
14 | (2, 7)
15 | (1, 8)
16 | (2, 8)
17 | (2, 8)
18 | (2, 8)
19 | (1, 16)
20 | (1, 8)
21 | (2, 8)
22 | (1, 8)
23 | (2, 8)
24 | (1, 8)
25 | (2, 8)
26 | (1, 8)
27 | (2, 16)
28 | (1, 16)
29 | (2, 24)
30 | (1, 8)
31 | (1, 16)
32 | (1, 8)
33 | (1, 16)
34 | (1, 8)
35 | (2, 8)
36 | (1, 16)
37 | (1, 8)
38 | (1, 8)
39 | (1, 8)
40 | (1, 9)
41 | (1, 7)
42 | (1, 16)
43 | (1, 9)
44 | (1, 7)
45 | (1, 24)
46 | (1, 24)
47 | (1, 8)
48 | (1, 24)
49 | (1, 8)
50 | (1, 16)
51 | (1, 40)
52 | (1, 8)
53 | (1, 8)
54 | (1, 8)
55 | (1, 8)
56 | (1, 40)
57 | (1, 48)
58 | (1, 16)
59 | (1, 72)
60 | (1, 64)
61 | (1, 265)
62 | (1, 16)
63 | (0, 56)
64 | (0, 1992)
65 |
--------------------------------------------------------------------------------
/geetest_online/test/TraceSample02.txt:
--------------------------------------------------------------------------------
1 | [
2 | [-21, -24, 0],
3 | [0, 0, 0],
4 | [0, 0, 4],
5 | [1, 0, 504],
6 | [4, 0, 520],
7 | [5, 0, 529],
8 | [6, 0, 537],
9 | [8, 0, 545],
10 | [11, 0, 552],
11 | [13, 0, 560],
12 | [15, 0, 568],
13 | [19, 0, 576],
14 | [22, 0, 584],
15 | [26, 0, 592],
16 | [31, 1, 600],
17 | [37, 3, 608],
18 | [40, 3, 616],
19 | [44, 4, 624],
20 | [46, 4, 632],
21 | [48, 4, 640],
22 | [52, 4, 648],
23 | [54, 4, 656],
24 | [55, 4, 664],
25 | [58, 4, 673],
26 | [61, 4, 680],
27 | [64, 4, 688],
28 | [66, 4, 696],
29 | [69, 4, 706],
30 | [74, 4, 712],
31 | [76, 4, 720],
32 | [79, 4, 729],
33 | [81, 4, 736],
34 | [82, 4, 745],
35 | [84, 4, 752],
36 | [86, 4, 760],
37 | [87, 4, 768],
38 | [88, 4, 776],
39 | [89, 4, 784],
40 | [91, 4, 792],
41 | [92, 4, 799],
42 | [95, 4, 808],
43 | [96, 4, 815],
44 | [99, 4, 824],
45 | [101, 4, 832],
46 | [103, 4, 840],
47 | [106, 4, 847],
48 | [107, 4, 856],
49 | [109, 4, 863],
50 | [110, 4, 872],
51 | [112, 4, 879],
52 | [113, 4, 896],
53 | [115, 4, 904],
54 | [116, 4, 911],
55 | [117, 4, 920],
56 | [119, 4, 927],
57 | [120, 4, 936],
58 | [121, 4, 944],
59 | [122, 4, 952],
60 | [123, 4, 968],
61 | [124, 4, 976],
62 | [125, 4, 983],
63 | [126, 4, 992],
64 | [127, 4, 999],
65 | [129, 4, 1008],
66 | [131, 4, 1024],
67 | [132, 4, 1032],
68 | [134, 4, 1040],
69 | [135, 4, 1055],
70 | [136, 4, 1063],
71 | [137, 4, 1072],
72 | [138, 4, 1095],
73 | [139, 4, 1104],
74 | [140, 4, 1111],
75 | [141, 4, 1120],
76 | [142, 4, 1127],
77 | [143, 4, 1136],
78 | [144, 4, 1168],
79 | [145, 4, 1184],
80 | [146, 4, 1200],
81 | [147, 4, 1215],
82 | [148, 4, 1232],
83 | [149, 4, 1239],
84 | [150, 4, 1256],
85 | [151, 4, 1296],
86 | [152, 4, 1312],
87 | [153, 4, 1320],
88 | [154, 4, 1336],
89 | [155, 4, 1368],
90 | [156, 4, 1376],
91 | [157, 4, 1424],
92 | [158, 4, 1440],
93 | [159, 4, 1464],
94 | [160, 4, 1488],
95 | [161, 4, 1520],
96 | [162, 4, 1552],
97 | [163, 3, 1600],
98 | [164, 3, 1616],
99 | [165, 3, 1640],
100 | [166, 3, 1712],
101 | [167, 3, 1736],
102 | [168, 3, 1760],
103 | [169, 3, 1872],
104 | [170, 3, 1905],
105 | [171, 3, 1952],
106 | [172, 1, 2072],
107 | [173, 1, 2280],
108 | [173, 1, 3072]
109 | ]
--------------------------------------------------------------------------------
/geetest_online/test/TraceSample02Parse.txt:
--------------------------------------------------------------------------------
1 | distance = 173
2 | (0, 4)
3 | (1, 500)
4 | (3, 16)
5 | (1, 9)
6 | (1, 8)
7 | (2, 8)
8 | (3, 7)
9 | (2, 8)
10 | (2, 8)
11 | (4, 8)
12 | (3, 8)
13 | (4, 8)
14 | (5, 8)
15 | (6, 8)
16 | (3, 8)
17 | (4, 8)
18 | (2, 8)
19 | (2, 8)
20 | (4, 8)
21 | (2, 8)
22 | (1, 8)
23 | (3, 9)
24 | (3, 7)
25 | (3, 8)
26 | (2, 8)
27 | (3, 10)
28 | (5, 6)
29 | (2, 8)
30 | (3, 9)
31 | (2, 7)
32 | (1, 9)
33 | (2, 7)
34 | (2, 8)
35 | (1, 8)
36 | (1, 8)
37 | (1, 8)
38 | (2, 8)
39 | (1, 7)
40 | (3, 9)
41 | (1, 7)
42 | (3, 9)
43 | (2, 8)
44 | (2, 8)
45 | (3, 7)
46 | (1, 9)
47 | (2, 7)
48 | (1, 9)
49 | (2, 7)
50 | (1, 17)
51 | (2, 8)
52 | (1, 7)
53 | (1, 9)
54 | (2, 7)
55 | (1, 9)
56 | (1, 8)
57 | (1, 8)
58 | (1, 16)
59 | (1, 8)
60 | (1, 7)
61 | (1, 9)
62 | (1, 7)
63 | (2, 9)
64 | (2, 16)
65 | (1, 8)
66 | (2, 8)
67 | (1, 15)
68 | (1, 8)
69 | (1, 9)
70 | (1, 23)
71 | (1, 9)
72 | (1, 7)
73 | (1, 9)
74 | (1, 7)
75 | (1, 9)
76 | (1, 32)
77 | (1, 16)
78 | (1, 16)
79 | (1, 15)
80 | (1, 17)
81 | (1, 7)
82 | (1, 17)
83 | (1, 40)
84 | (1, 16)
85 | (1, 8)
86 | (1, 16)
87 | (1, 32)
88 | (1, 8)
89 | (1, 48)
90 | (1, 16)
91 | (1, 24)
92 | (1, 24)
93 | (1, 32)
94 | (1, 32)
95 | (1, 48)
96 | (1, 16)
97 | (1, 24)
98 | (1, 72)
99 | (1, 24)
100 | (1, 24)
101 | (1, 112)
102 | (1, 33)
103 | (1, 47)
104 | (1, 120)
105 | (1, 208)
106 | (0, 792)
107 |
--------------------------------------------------------------------------------
/geetest_online/test/TraceSample03.txt:
--------------------------------------------------------------------------------
1 | [
2 | [-18, -24, 0],
3 | [0, 0, 0],
4 | [1, 0, 216],
5 | [2, 0, 224],
6 | [3, 0, 312],
7 | [4, 0, 328],
8 | [5, 0, 336],
9 | [7, 0, 352],
10 | [8, 0, 360],
11 | [10, 0, 368],
12 | [12, 0, 376],
13 | [14, 0, 385],
14 | [17, 0, 392],
15 | [19, 0, 400],
16 | [20, 0, 408],
17 | [23, 0, 416],
18 | [25, 0, 424],
19 | [27, 0, 433],
20 | [28, 0, 441],
21 | [30, 0, 449],
22 | [32, 0, 457],
23 | [33, 0, 466],
24 | [34, 0, 473],
25 | [35, 0, 480],
26 | [37, 0, 489],
27 | [38, 0, 496],
28 | [41, 0, 504],
29 | [43, 0, 513],
30 | [44, 0, 519],
31 | [45, 0, 528],
32 | [47, 0, 535],
33 | [48, 0, 544],
34 | [49, 0, 560],
35 | [50, 0, 576],
36 | [51, 0, 583],
37 | [52, 0, 592],
38 | [54, 0, 599],
39 | [56, 0, 616],
40 | [57, 0, 624],
41 | [58, 0, 632],
42 | [59, 0, 649],
43 | [60, 0, 672],
44 | [62, 0, 688],
45 | [63, 0, 712],
46 | [65, 0, 752],
47 | [66, 0, 784],
48 | [68, 0, 808],
49 | [69, 0, 888],
50 | [71, 0, 920],
51 | [72, 0, 984],
52 | [72, -1, 1024],
53 | [74, -1, 1080],
54 | [75, -1, 1232],
55 | [75, -2, 1247],
56 | [77, -2, 1352],
57 | [78, -2, 1432],
58 | [78, -2, 2136]
59 | ]
--------------------------------------------------------------------------------
/geetest_online/test/TraceSample03Parse.txt:
--------------------------------------------------------------------------------
1 | distance = 78
2 | (1, 216)
3 | (1, 8)
4 | (1, 88)
5 | (1, 16)
6 | (1, 8)
7 | (2, 16)
8 | (1, 8)
9 | (2, 8)
10 | (2, 8)
11 | (2, 9)
12 | (3, 7)
13 | (2, 8)
14 | (1, 8)
15 | (3, 8)
16 | (2, 8)
17 | (2, 9)
18 | (1, 8)
19 | (2, 8)
20 | (2, 8)
21 | (1, 9)
22 | (1, 7)
23 | (1, 7)
24 | (2, 9)
25 | (1, 7)
26 | (3, 8)
27 | (2, 9)
28 | (1, 6)
29 | (1, 9)
30 | (2, 7)
31 | (1, 9)
32 | (1, 16)
33 | (1, 16)
34 | (1, 7)
35 | (1, 9)
36 | (2, 7)
37 | (2, 17)
38 | (1, 8)
39 | (1, 8)
40 | (1, 17)
41 | (1, 23)
42 | (2, 16)
43 | (1, 24)
44 | (2, 40)
45 | (1, 32)
46 | (2, 24)
47 | (1, 80)
48 | (2, 32)
49 | (1, 64)
50 | (0, 40)
51 | (2, 56)
52 | (1, 152)
53 | (0, 15)
54 | (2, 105)
55 | (1, 80)
56 | (0, 704)
57 |
--------------------------------------------------------------------------------
/geetest_online/test/TraceSample04.txt:
--------------------------------------------------------------------------------
1 | [
2 | [-23, -18, 0],
3 | [0, 0, 0],
4 | [1, 0, 223],
5 | [2, 0, 239],
6 | [3, 0, 247],
7 | [4, 0, 255],
8 | [5, 0, 263],
9 | [6, 0, 271],
10 | [8, 0, 279],
11 | [9, 0, 287],
12 | [11, 0, 295],
13 | [12, 0, 303],
14 | [13, 0, 311],
15 | [15, 0, 319],
16 | [16, 0, 327],
17 | [18, 0, 335],
18 | [20, 0, 351],
19 | [22, 0, 359],
20 | [25, 0, 367],
21 | [27, 0, 375],
22 | [29, 0, 383],
23 | [32, 0, 391],
24 | [34, 0, 399],
25 | [37, 0, 407],
26 | [40, 0, 415],
27 | [42, 0, 423],
28 | [44, 0, 431],
29 | [47, 0, 439],
30 | [48, 0, 447],
31 | [49, 0, 455],
32 | [51, 0, 463],
33 | [52, 0, 471],
34 | [54, 0, 479],
35 | [56, 0, 487],
36 | [59, 0, 495],
37 | [61, 0, 503],
38 | [63, 0, 511],
39 | [66, 0, 519],
40 | [69, 0, 527],
41 | [72, 0, 536],
42 | [74, 0, 543],
43 | [75, 0, 553],
44 | [76, 0, 559],
45 | [79, -1, 567],
46 | [81, -3, 575],
47 | [83, -3, 584],
48 | [85, -3, 591],
49 | [88, -3, 600],
50 | [91, -3, 607],
51 | [93, -3, 615],
52 | [96, -3, 623],
53 | [99, -3, 631],
54 | [100, -3, 638],
55 | [102, -3, 647],
56 | [105, -3, 655],
57 | [107, -3, 663],
58 | [110, -4, 671],
59 | [111, -4, 681],
60 | [114, -4, 687],
61 | [117, -4, 695],
62 | [120, -4, 703],
63 | [122, -4, 711],
64 | [124, -4, 719],
65 | [125, -4, 727],
66 | [127, -4, 735],
67 | [128, -4, 743],
68 | [129, -4, 752],
69 | [130, -4, 759],
70 | [132, -4, 767],
71 | [134, -4, 775],
72 | [136, -4, 783],
73 | [138, -4, 791],
74 | [141, -4, 799],
75 | [143, -4, 807],
76 | [145, -4, 816],
77 | [146, -4, 823],
78 | [148, -4, 832],
79 | [149, -4, 839],
80 | [150, -4, 847],
81 | [152, -5, 855],
82 | [153, -5, 871],
83 | [155, -5, 881],
84 | [156, -5, 887],
85 | [157, -5, 895],
86 | [159, -5, 903],
87 | [160, -5, 912],
88 | [161, -5, 919],
89 | [162, -5, 927],
90 | [163, -5, 935],
91 | [165, -5, 943],
92 | [166, -5, 952],
93 | [168, -5, 959],
94 | [169, -5, 967],
95 | [171, -5, 975],
96 | [173, -5, 983],
97 | [175, -5, 999],
98 | [176, -5, 1006],
99 | [178, -5, 1023],
100 | [179, -5, 1047],
101 | [179, -6, 1055],
102 | [180, -6, 1063],
103 | [182, -6, 1087],
104 | [184, -6, 1111],
105 | [185, -7, 1128],
106 | [187, -7, 1143],
107 | [188, -7, 1151],
108 | [190, -7, 1167],
109 | [191, -7, 1175],
110 | [193, -7, 1183],
111 | [194, -7, 1191],
112 | [195, -7, 1199],
113 | [196, -7, 1207],
114 | [197, -7, 1224],
115 | [198, -7, 1239],
116 | [199, -7, 1255],
117 | [200, -7, 1263],
118 | [201, -7, 1271],
119 | [202, -7, 1287],
120 | [204, -7, 1295],
121 | [206, -7, 1319],
122 | [207, -7, 1343],
123 | [209, -7, 1375],
124 | [210, -7, 1399],
125 | [209, -7, 1687],
126 | [208, -7, 1695],
127 | [206, -8, 1703],
128 | [204, -8, 1711],
129 | [203, -8, 1719],
130 | [202, -8, 1727],
131 | [200, -8, 1735],
132 | [199, -8, 1743],
133 | [197, -8, 1751],
134 | [196, -8, 1767],
135 | [194, -8, 1785],
136 | [193, -8, 1791],
137 | [191, -8, 1802],
138 | [189, -8, 1808],
139 | [187, -8, 1823],
140 | [186, -8, 1832],
141 | [183, -8, 1839],
142 | [181, -8, 1855],
143 | [180, -8, 1863],
144 | [179, -8, 1871],
145 | [178, -8, 1879],
146 | [177, -8, 1887],
147 | [176, -8, 1903],
148 | [174, -8, 1912],
149 | [173, -8, 1919],
150 | [172, -8, 1935],
151 | [171, -8, 1952],
152 | [170, -8, 1975],
153 | [168, -8, 1991],
154 | [167, -8, 1999],
155 | [165, -8, 2007],
156 | [164, -8, 2023],
157 | [163, -8, 2032],
158 | [162, -8, 2039],
159 | [160, -8, 2047],
160 | [159, -8, 2055],
161 | [158, -8, 2063],
162 | [157, -8, 2071],
163 | [156, -8, 2081],
164 | [155, -8, 2087],
165 | [154, -8, 2111],
166 | [153, -8, 2119],
167 | [151, -8, 2128],
168 | [151, -7, 2135],
169 | [149, -7, 2143],
170 | [148, -7, 2152],
171 | [146, -6, 2159],
172 | [144, -6, 2175],
173 | [143, -6, 2183],
174 | [142, -6, 2191],
175 | [140, -6, 2199],
176 | [137, -6, 2207],
177 | [134, -6, 2216],
178 | [132, -6, 2223],
179 | [131, -6, 2231],
180 | [129, -6, 2239],
181 | [127, -6, 2247],
182 | [124, -5, 2255],
183 | [121, -5, 2263],
184 | [119, -3, 2271],
185 | [115, -3, 2280],
186 | [111, -3, 2287],
187 | [106, -3, 2295],
188 | [103, -3, 2303],
189 | [101, -3, 2311],
190 | [100, -3, 2319],
191 | [98, -3, 2328],
192 | [96, -3, 2335],
193 | [94, -3, 2343],
194 | [92, -3, 2352],
195 | [89, -3, 2359],
196 | [88, -3, 2367],
197 | [87, -3, 2375],
198 | [86, -3, 2383],
199 | [86, -2, 2623],
200 | [88, -2, 2631],
201 | [90, -2, 2639],
202 | [93, -1, 2647],
203 | [94, -1, 2655],
204 | [97, -1, 2663],
205 | [99, -1, 2671],
206 | [101, -1, 2680],
207 | [104, -1, 2687],
208 | [107, -1, 2695],
209 | [109, -1, 2703],
210 | [111, -1, 2711],
211 | [112, -1, 2720],
212 | [116, -1, 2727],
213 | [118, -1, 2743],
214 | [119, -1, 2752],
215 | [120, -1, 2768],
216 | [121, -1, 2775],
217 | [122, -1, 2784],
218 | [123, -1, 2799],
219 | [125, -1, 2807],
220 | [127, -1, 2815],
221 | [128, -1, 2823],
222 | [129, -1, 2839],
223 | [130, -1, 2847],
224 | [131, -1, 2855],
225 | [132, -1, 2879],
226 | [133, -1, 2895],
227 | [134, -1, 2927],
228 | [135, -1, 3023],
229 | [136, -1, 3047],
230 | [135, -1, 3471],
231 | [134, -1, 3503],
232 | [133, -1, 3511],
233 | [132, -1, 3519],
234 | [131, -1, 3583],
235 | [130, -1, 3607],
236 | [129, -1, 3639],
237 | [129, -2, 3919],
238 | [130, -2, 3943],
239 | [130, -3, 3959],
240 | [131, -3, 3983],
241 | [132, -3, 4031],
242 | [134, -3, 4135],
243 | [134, -3, 5064]
244 | ]
--------------------------------------------------------------------------------
/geetest_online/test/TraceSample04Parse.txt:
--------------------------------------------------------------------------------
1 | (1, 223)
2 | (1, 16)
3 | (1, 8)
4 | (1, 8)
5 | (1, 8)
6 | (1, 8)
7 | (2, 8)
8 | (1, 8)
9 | (2, 8)
10 | (1, 8)
11 | (1, 8)
12 | (2, 8)
13 | (1, 8)
14 | (2, 8)
15 | (2, 16)
16 | (2, 8)
17 | (3, 8)
18 | (2, 8)
19 | (2, 8)
20 | (3, 8)
21 | (2, 8)
22 | (3, 8)
23 | (3, 8)
24 | (2, 8)
25 | (2, 8)
26 | (3, 8)
27 | (1, 8)
28 | (1, 8)
29 | (2, 8)
30 | (1, 8)
31 | (2, 8)
32 | (2, 8)
33 | (3, 8)
34 | (2, 8)
35 | (2, 8)
36 | (3, 8)
37 | (3, 8)
38 | (3, 9)
39 | (2, 7)
40 | (1, 10)
41 | (1, 6)
42 | (3, 8)
43 | (2, 8)
44 | (2, 9)
45 | (2, 7)
46 | (3, 6)
47 | (3, 8)
48 | (3, 8)
49 | (2, 8)
50 | (2, 8)
51 | (1, 8)
52 | (2, 8)
53 | (1, 8)
54 | (1, 9)
55 | (1, 7)
56 | (2, 8)
57 | (2, 8)
58 | (2, 8)
59 | (2, 8)
60 | (3, 8)
61 | (2, 8)
62 | (2, 9)
63 | (1, 7)
64 | (2, 9)
65 | (1, 7)
66 | (1, 8)
67 | (2, 8)
68 | (1, 16)
69 | (2, 10)
70 | (1, 6)
71 | (1, 8)
72 | (2, 8)
73 | (1, 9)
74 | (1, 7)
75 | (1, 8)
76 | (1, 8)
77 | (2, 8)
78 | (1, 9)
79 | (2, 7)
80 | (1, 8)
81 | (2, 8)
82 | (2, 8)
83 | (2, 16)
84 | (1, 7)
85 | (2, 17)
86 | (1, 24)
87 | (0, 8)
88 | (1, 8)
89 | (2, 24)
90 | (2, 24)
91 | (1, 17)
92 | (2, 15)
93 | (1, 8)
94 | (2, 16)
95 | (1, 8)
96 | (2, 8)
97 | (1, 8)
98 | (1, 8)
99 | (1, 8)
100 | (1, 17)
101 | (1, 15)
102 | (1, 16)
103 | (1, 8)
104 | (1, 8)
105 | (1, 16)
106 | (2, 8)
107 | (2, 24)
108 | (1, 24)
109 | (2, 32)
110 | (1, 24)
111 | (-1, 288)
112 | (-1, 8)
113 | (-2, 8)
114 | (-2, 8)
115 | (-1, 8)
116 | (-1, 8)
117 | (-2, 8)
118 | (-1, 8)
119 | (-2, 8)
120 | (-1, 16)
121 | (-2, 18)
122 | (-1, 6)
123 | (-2, 11)
124 | (-2, 6)
125 | (-2, 15)
126 | (-1, 9)
127 | (-3, 7)
128 | (-2, 16)
129 | (-1, 8)
130 | (-1, 8)
131 | (-1, 8)
132 | (-1, 8)
133 | (-1, 16)
134 | (-2, 9)
135 | (-1, 7)
136 | (-1, 16)
137 | (-1, 17)
138 | (-1, 23)
139 | (-2, 16)
140 | (-1, 8)
141 | (-2, 8)
142 | (-1, 16)
143 | (-1, 9)
144 | (-1, 7)
145 | (-2, 8)
146 | (-1, 8)
147 | (-1, 8)
148 | (-1, 8)
149 | (-1, 10)
150 | (-1, 6)
151 | (-1, 24)
152 | (-1, 8)
153 | (-2, 9)
154 | (0, 7)
155 | (-2, 8)
156 | (-1, 9)
157 | (-2, 7)
158 | (-2, 16)
159 | (-1, 8)
160 | (-1, 8)
161 | (-2, 8)
162 | (-3, 8)
163 | (-3, 9)
164 | (-2, 7)
165 | (-1, 8)
166 | (-2, 8)
167 | (-2, 8)
168 | (-3, 8)
169 | (-3, 8)
170 | (-2, 8)
171 | (-4, 9)
172 | (-4, 7)
173 | (-5, 8)
174 | (-3, 8)
175 | (-2, 8)
176 | (-1, 8)
177 | (-2, 9)
178 | (-2, 7)
179 | (-2, 8)
180 | (-2, 9)
181 | (-3, 7)
182 | (-1, 8)
183 | (-1, 8)
184 | (-1, 8)
185 | (0, 240)
186 | (2, 8)
187 | (2, 8)
188 | (3, 8)
189 | (1, 8)
190 | (3, 8)
191 | (2, 8)
192 | (2, 9)
193 | (3, 7)
194 | (3, 8)
195 | (2, 8)
196 | (2, 8)
197 | (1, 9)
198 | (4, 7)
199 | (2, 16)
200 | (1, 9)
201 | (1, 16)
202 | (1, 7)
203 | (1, 9)
204 | (1, 15)
205 | (2, 8)
206 | (2, 8)
207 | (1, 8)
208 | (1, 16)
209 | (1, 8)
210 | (1, 8)
211 | (1, 24)
212 | (1, 16)
213 | (1, 32)
214 | (1, 96)
215 | (1, 24)
216 | (-1, 424)
217 | (-1, 32)
218 | (-1, 8)
219 | (-1, 8)
220 | (-1, 64)
221 | (-1, 24)
222 | (-1, 32)
223 | (0, 280)
224 | (1, 24)
225 | (0, 16)
226 | (1, 24)
227 | (1, 48)
228 | (2, 104)
229 | (0, 929)
230 |
--------------------------------------------------------------------------------
/geetest_online/test/test_pyexecjs.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # -*- coding: utf-8 -*-
3 | '''Unit Test for PyExecJS.'''
4 | import execjs
5 |
6 | JSRUNTIME = execjs.get(execjs.runtime_names.Node)
7 |
8 | TOKEN_JS = '''
9 | function check_browser(data){
10 | location_info = data.value ^ 536870911
11 | }
12 | location_info = 4995595067;
13 | '''
14 |
15 |
16 | def test_context():
17 | '''Test JSRuntime Context functions.'''
18 | _context = JSRUNTIME.compile(TOKEN_JS)
19 | print(_context.eval('location_info'))
20 | print(_context.call('check_browser', '{ value: 499382950}'))
21 | print(_context.eval('location_info'))
22 |
23 |
24 | if __name__ == "__main__":
25 | test_context()
26 |
--------------------------------------------------------------------------------
/geetest_online/test/test_token.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # -*- coding: utf-8 -*-
3 | '''Unit Test for token decode'''
4 |
5 | def test_token():
6 | '''Test token bytes to string.'''
7 | _a = [102, 117, 110, 99, 116, 105, 111, 110, 32, 99, 104, 101, 99, 107, 95, 98,
8 | 114, 111, 119, 115, 101, 114, 40, 100, 97, 116, 97, 41, 123, 32, 10, 32,
9 | 32, 32, 32, 32, 108, 111, 99, 97, 116, 105, 111, 110, 95, 105, 110, 102,
10 | 111, 32, 61, 32, 100, 97, 116, 97, 46, 118, 97, 108, 117, 101, 32, 94,
11 | 32, 53, 51, 54, 56, 55, 48, 57, 49, 49, 10, 125, 32, 10, 108, 111,
12 | 99, 97, 116, 105, 111, 110, 95, 105, 110, 102, 111, 32, 61, 32, 52, 57,
13 | 57, 53, 53, 57, 53, 48, 54, 55, 59]
14 | print(''.join(chr(i) for i in _a))
15 |
16 | _b = [105, 102, 40, 33, 104, 97, 115, 86, 97, 108, 105, 100, 41, 123, 98, 114,
17 | 111, 119, 115, 101, 114, 95, 118, 101, 114, 115, 105, 111, 110, 40, 123, 32,
18 | 118, 97, 108, 117, 101, 58, 32, 52, 57, 57, 53, 53, 57, 53, 52, 57,
19 | 125, 41, 59, 104, 97, 115, 86, 97, 108, 105, 100, 61, 116, 114, 117, 101,
20 | 59, 125]
21 | print(''.join(chr(i) for i in _b))
22 |
23 | _c = [102, 117, 110, 99, 116, 105, 111, 110, 32, 99, 104, 101, 99, 107, 95, 98,
24 | 114, 111, 119, 115, 101, 114, 40, 100, 97, 116, 97, 41, 123, 32, 10, 32,
25 | 32, 32, 32, 32, 108, 111, 99, 97, 116, 105, 111, 110, 95, 105, 110, 102,
26 | 111, 32, 61, 32, 100, 97, 116, 97, 46, 118, 97, 108, 117, 101, 32, 94,
27 | 32, 53, 51, 54, 56, 55, 48, 57, 49, 49, 10, 125, 32, 10, 108, 111,
28 | 99, 97, 116, 105, 111, 110, 95, 105, 110, 102, 111, 32, 61, 32, 53, 48,
29 | 48, 48, 54, 51, 53, 48, 48, 51, 59]
30 | print(''.join(chr(i) for i in _c))
31 |
32 |
33 | if __name__ == "__main__":
34 | test_token()
35 |
--------------------------------------------------------------------------------
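Decoded (by running test_token()), the byte arrays above turn out to be small JavaScript fragments from the gsxt anti-bot token check; `_a` matches the TOKEN_JS snippet embedded in test_pyexecjs.py, and `_c` differs only in the final location_info value:

```text
// _a (and _c, whose last line is "location_info = 5000635003;"):
function check_browser(data){
     location_info = data.value ^ 536870911
}
location_info = 4995595067;

// _b:
if(!hasValid){browser_version({ value: 499559549});hasValid=true;}
```
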
/geetest_online/test/testgeetestjs.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # -*- coding: utf-8 -*-
3 | '''Unit Test for geetest.js'''
4 | import os
5 | import random
6 | import codecs
7 | import json
8 | import execjs
9 | from PIL import Image
10 | from bs4 import BeautifulSoup
11 |
12 | JSRUNTIME = execjs.get(execjs.runtime_names.Node)
13 |
14 | G_SPLIT_ARRAY_JS = '''
15 | function getSplitArray() {
16 | for (var a, b = "6_11_7_10_4_12_3_1_0_5_2_9_8".split("_"), c = [], d = 0, e = 52; d < e; d++)
17 | a = 2 * parseInt(b[parseInt(d % 26 / 2)]) + d % 2,
18 | parseInt(d / 2) % 2 || (a += d % 2 ? -1 : 1),
19 | a += d < 26 ? 26 : 0,
20 | c.push(a);
21 | return c
22 | }
23 | '''
24 |
25 | USERRESPONSE_JS = '''
26 | function userresponse(a, b) {
27 | for (var c = b.slice(32), d = [], e = 0; e < c.length; e++) {
28 | var f = c.charCodeAt(e);
29 | d[e] = f > 57 ? f - 87 : f - 48
30 | }
31 | c = 36 * d[0] + d[1];
32 | var g = Math.round(a) + c;
33 | b = b.slice(0, 32);
34 | var h, i = [
35 | [],
36 | [],
37 | [],
38 | [],
39 | []
40 | ],
41 | j = {},
42 | k = 0;
43 | e = 0;
44 | for (var l = b.length; e < l; e++) h = b.charAt(e), j[h] || (j[h] = 1, i[k].push(h), k++, k = 5 == k ? 0 : k);
45 | for (var m, n = g, o = 4, p = "", q = [1, 2, 5, 10, 50]; n > 0;) n - q[o] >= 0 ? (m = parseInt(Math.random() * i[o].length, 10), p += i[o][m], n -= q[o]) : (i.splice(o, 1), q.splice(o, 1), o -= 1);
46 | return p
47 | }
48 | '''
49 |
50 | USERRESPONSE_JSCONTEXT = JSRUNTIME.compile(USERRESPONSE_JS)
51 |
52 | TRACE_JS = '''
53 | var tracer = function () {
54 | c = function (traceArray) {
55 | for (var b, c, d, e = [], f = 0, g = [], h = 0, i = traceArray.length - 1; h < i; h++) {
56 | b = Math.round(traceArray[h + 1][0] - traceArray[h][0]), c = Math.round(traceArray[h + 1][1] - traceArray[h][1]), d = Math.round(traceArray[h + 1][2] - traceArray[h][2]), g.push([b, c, d]), 0 == b && 0 == c && 0 == d || (0 == b && 0 == c ? f += d : (e.push([b, c, d + f]), f = 0));
57 | }
58 | return 0 !== f && e.push([b, c, f]), e
59 | },
60 | d = function (a) {
61 | var b = "()*,-./0123456789:?@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqr",
62 | c = b.length,
63 | d = "",
64 | e = Math.abs(a),
65 | f = parseInt(e / c);
66 | f >= c && (f = c - 1), f && (d = b.charAt(f)), e %= c;
67 | var g = "";
68 | return a < 0 && (g += "!"), d && (g += "$"), g + d + b.charAt(e)
69 | },
70 | e = function (a) {
71 | for (var b = [
72 | [1, 0],
73 | [2, 0],
74 | [1, -1],
75 | [1, 1],
76 | [0, 1],
77 | [0, -1],
78 | [3, 0],
79 | [2, -1],
80 | [2, 1]
81 | ], c = "stuvwxyz~", d = 0, e = b.length; d < e; d++)
82 | if (a[0] == b[d][0] && a[1] == b[d][1]) return c[d];
83 | return 0
84 | },
85 | f = function (traceArray) {
86 | for (var b, f = c(traceArray), g = [], h = [], i = [], j = 0, k = f.length; j < k; j++) {
87 | b = e(f[j]), b ? h.push(b) : (g.push(d(f[j][0])), h.push(d(f[j][1]))), i.push(d(f[j][2]));
88 | }
89 | return g.join("") + "!!" + h.join("") + "!!" + i.join("")
90 | },
91 | g = function (traceArray) {
92 | var a = f(traceArray);
93 | return encodeURIComponent(a)
94 | };
95 | return {
96 | trace: g
97 | }
98 | }();
99 | exports.tracer = tracer;
100 | '''
101 |
102 | TRACE_JS_CONTEXT = JSRUNTIME.compile(TRACE_JS)
103 |
104 |
105 | def load_filetext(filename):
106 |     '''Load text from a file using the utf-8 codec.'''
107 | text = ''
108 | with codecs.open(filename, 'r', 'utf-8') as _f:
109 | text = _f.read()
110 | return text
111 |
112 |
113 | def test_load_geetest_js():
114 | '''load javascript text from file, compile, return context object'''
115 | jsfile = os.path.join(os.getcwd(), 'gsxt', 'geetest.5.10.10.js')
116 | print(jsfile)
117 | js_context = JSRUNTIME.compile(load_filetext(jsfile))
118 | print(js_context)
119 |
120 |
121 | def test_get_splite_array():
122 |     '''Load the split array by calling into JavaScript.'''
123 | context = JSRUNTIME.compile(G_SPLIT_ARRAY_JS)
124 | splite_array = context.call('getSplitArray')
125 | print('split array = ' + str(splite_array))
126 | return splite_array
127 |
128 |
129 | def test_offset_position(height, split_array):
130 | '''parse offset position array from split array'''
131 | offset_array = []
132 | for i in split_array:
133 | _x = i % 26 * 12 + 1
134 |         _y = height // 2 if i > 25 else 0
135 | offset_array.append([_x, _y])
136 | print('offset array = ' + str(offset_array))
137 | return offset_array
138 |
139 |
140 | def test_rewrite_image(image_name, offset_array):
141 |     '''Load an image from file and recombine it into a new image using the offset array.'''
142 | img = Image.open(os.path.join(os.getcwd(), 'temp', image_name))
143 | print(img.format, img.size, img.mode)
144 |
145 | rows, columns, offsetwidth, offsetheight = 2, 26, 10, 58
146 | img_new = Image.new('RGB', (columns*offsetwidth, rows*offsetheight))
147 | for row in range(rows):
148 | for column in range(columns):
149 | from_x, from_y = offset_array[row * columns + column]
150 | box = (from_x, from_y, from_x + offsetwidth, from_y + offsetheight)
151 | to_x, to_y = column*offsetwidth, row*offsetheight
152 | box_new = (to_x, to_y, to_x + offsetwidth, to_y + offsetheight)
153 | img_new.paste(img.crop(box), box_new)
154 |
155 | img_new.save(os.path.join(os.getcwd(), 'temp', image_name + '.jpg'), format='JPEG')
156 | print(img_new.format, img_new.size, img_new.mode)
157 | img.close()
158 | img_new.close()
159 |
160 |
161 | def comparepixel(src, dst, threshold):
162 |     '''Compare two pixel values channel-wise against a threshold.'''
163 | return abs(src[0] - dst[0]) < threshold \
164 | and abs(src[1] - dst[1]) < threshold \
165 | and abs(src[2] - dst[2]) < threshold
166 |
167 |
168 | def get_diff_xy(img1, img2, start_x, start_y, threshold):
169 | '''Calculate the difference between image1 and image2.'''
170 | width, height = img1.size
171 | img_diff = img2.copy()
172 | pixel_diff = []
173 | for _x in range(start_x, width):
174 | for _y in range(start_y, height):
175 | pixel1, pixel2 = img1.getpixel((_x, _y)), img2.getpixel((_x, _y))
176 | if not comparepixel(pixel1, pixel2, threshold):
177 | pixel_diff.append((_x, _y))
178 |
179 | min_xy, max_xy = min(pixel_diff), max(pixel_diff)
180 | for _y in range(height):
181 | img_diff.putpixel((min_xy[0], _y), (0, 0, 0))
182 | img_diff.putpixel((max_xy[0], _y), (0, 0, 0))
183 |
184 | name = 'diff_' + str(threshold) + '_' + str(min_xy[0]) + '_' + str(max_xy[0]) + '.jpg'
185 | img_diff.save(os.path.join(os.getcwd(), 'temp', name), format='JPEG')
186 | img_diff.close()
187 | print(threshold, min_xy[0], max_xy[0])
188 | return min_xy[0], max_xy[0]
189 |
190 |
191 | def get_best_diff(img1, img2, start_x, start_y):
192 |     '''Sweep thresholds and return the difference positions from the final pass.'''
193 | _x, _y = 0, 0
194 | for threshold in range(5, 71, 5):
195 | _x, _y = get_diff_xy(img1, img2, start_x, start_y, threshold)
196 | return _x, _y
197 |
198 |
199 | def test_diff_image(image1, image2, start_x, start_y):
200 |     '''Find the difference between two images.'''
201 |     image_path_src = os.path.join(os.getcwd(), 'temp', image1)
202 |     image_path_dst = os.path.join(os.getcwd(), 'temp', image2)
203 |     img1, img2 = Image.open(image_path_src), Image.open(image_path_dst)
204 |     if img1.size != img2.size:
205 |         print('the two images differ in size')
206 |         img1.close()
207 |         img2.close()
208 |         return None, None  # keep the return shape consistent for unpacking callers
209 | _x, _y = get_best_diff(img1, img2, start_x, start_y)
210 | img1.close()
211 | img2.close()
212 | return _x, _y
213 |
214 |
215 | def userresponse(distance, challenge):
216 |     '''Compute the userresponse value from the slide distance and the challenge string.'''
217 | return USERRESPONSE_JSCONTEXT.call('userresponse', distance, challenge)
218 |
219 |
220 | def imgload():
221 |     '''Image load time in milliseconds; used for statistics only, not verified by the server.'''
222 | return random.randint(100, 200)
223 |
224 |
225 | def adjust_distance(distance):
226 |     '''The slice image is 59*50 with a possible 6-pixel margin on each side, so the effective size is 47*38.'''
227 | return distance - 6
228 |
229 |
230 | def parsetrace(trace_file):
231 | '''parse trace distance'''
232 | with open(os.path.join(os.getcwd(), 'test', trace_file)) as tracedata:
233 | trace = json.load(tracedata)
234 | print('trace analyse:')
235 | for index in range(2, len(trace)):
236 | print(trace[index][0] - trace[index-1][0], trace[index][2] - trace[index-1][2])
237 |
238 |
239 | def usertrace(distance):
240 |     '''
241 |     Simulate the user's mouse-drag trace: build an array of (x, y, interval in ms), then encrypt it.
242 |     Variable in geetest.5.10.10.js: Q.t("arr", a)
243 |     Dump the plaintext array before encryption: console.log(JSON.stringify(Q.t("arr", a)))
244 |     See TraceSample.txt for trace samples,
245 |     and TraceSampleParse.txt for their analysis.
246 |     '''
247 |     # array of trace deltas
248 |     trace = []
249 |     # derive the total number of steps from the distance, sampling 50%-75% of it
250 |     total_steps = int(distance * random.uniform(0.5, 0.75))
251 |     # drag phase 1: slow start, with a longer interval while the mouse button goes down
252 |     move_instance = random.randint(1, 4)
253 |     trace.append((move_instance, 0, random.randint(200, 500)))
254 |     # drag phase 2: medium speed, steady dragging
255 |     for _i in range(total_steps):
256 |         if move_instance < distance:
257 |             step = random.randint(1, 3)
258 |             move_instance = move_instance + step
259 |             trace.append((step, 0, random.randint(8, 24)))
260 |     # drag phase 3: slow arrival, decelerating near the target position
261 | trace.append(((distance - move_instance), 0, random.randint(100, 800)))
262 | trace.append((0, 0, random.randint(100, 500)))
263 | print(trace)
264 |
265 |     # convert the delta array into an array of absolute trace coordinates
266 |     position = []
267 |     # the click position relative to the edge of the slider image
268 |     position.append((-random.randint(14, 30), -random.randint(14, 30), 0))
269 |     # starting point
270 |     current_position = (0, 0, 0)
271 |     position.append(current_position)
272 |     for _i in trace:
273 |         next_position = (current_position[0] + _i[0], _i[1], current_position[2] + _i[2])
274 |         position.append(next_position)
275 |         current_position = next_position
276 |
277 | passtime = position[-1][2]
278 | print(position)
279 | print(passtime)
280 | return position, passtime
281 |
282 |
283 | def encrypttrace(trace):
284 | '''encrypt trace data by JSCall'''
285 | return TRACE_JS_CONTEXT.call('tracer.trace', trace)
286 |
287 |
288 | def fun_c(param):
289 | '''reversed from geetest.js'''
290 | _b = 0
291 | _c = 0
292 | _d = 0
293 | _e = []
294 | _f = 0
295 | _g = []
296 | _h = 0
297 | for _h in range(0, len(param)-1):
298 | _b = round(param[_h + 1][0] - param[_h][0])
299 | _c = round(param[_h + 1][1] - param[_h][1])
300 | _d = round(param[_h + 1][2] - param[_h][2])
301 | _g.append([_b, _c, _d])
302 | if _b == 0 and _c == 0 and _d == 0:
303 | continue
304 | else:
305 | if _b == 0 and _c == 0:
306 | _f = _f + _d
307 | else:
308 | _e.append([_b, _c, _d + _f])
309 | _f = 0
310 | if _f != 0:
311 | _e.append([_b, _c, _f])
312 | return _e
313 |
314 |
315 | def fun_d(param):
316 | '''reversed from geetest.js'''
317 | _b = "()*,-./0123456789:?@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqr"
318 | _c = len(_b)
319 | _d = ""
320 | _e = abs(param)
321 | _f = int(_e / _c)
322 | if _f >= _c:
323 | _f = _c - 1
324 | if _f:
325 |             _d = _b[_f]  # JS charAt(f) indexes the alphabet; chr(_f) was a porting bug
326 | _e = _e % _c
327 | _g = ''
328 | if param < 0:
329 | _g = _g + '!'
330 | if _d:
331 | _g = _g + '$'
332 | return _g + _d + _b[int(_e)]
333 |
334 |
335 | def fun_e(param):
336 | '''reversed from geetest.js'''
337 | _b = [[1, 0], [2, 0], [1, -1], [1, 1], [0, 1], [0, -1], [3, 0], [2, -1], [2, 1]]
338 | _c = "stuvwxyz~"
339 | _d = 0
340 | _e = len(_b)
341 | for _d in range(0, len(_b)):
342 | if param[0] == _b[_d][0] and param[1] == _b[_d][1]:
343 | return _c[_d]
344 | return 0
345 |
346 |
347 | def fun_f(param):
348 | '''reversed from geetest.js'''
349 | _b = None
350 | _f = fun_c(param)
351 | _g = []
352 | _h = []
353 | _i = []
354 | for j in _f:
355 | _b = fun_e(j)
356 | if _b:
357 | _h.append(_b)
358 | else:
359 | _g.append(fun_d(j[0]))
360 | _h.append(fun_d(j[1]))
361 | _i.append(fun_d(j[2]))
362 |
363 |     return ''.join(_g) + '!!' + ''.join(_h) + '!!' + ''.join(_i)
364 |
365 |
366 | def parse_html(html_doc, page):
367 | '''parse html webpage elements.'''
368 | soup = BeautifulSoup(html_doc, 'html.parser')
369 | _result = []
370 | _findall = soup.find_all('a', class_='search_list_item db')
371 | for _a in _findall:
372 | _name = _a.find('h1', class_='f20')
373 | _name_str = ''.join(_name.get_text().split())
374 | _code = _a.find('div', class_='div-map2')
375 | _number = _code.find('span', class_='g3')
376 | _number_str = ''.join(_number.get_text().split())
377 | _result.append([_name_str, _number_str])
378 |
379 | print(json.dumps(_result, indent=2, sort_keys=True, ensure_ascii=False))
380 | _findall = soup.find_all('a', href='javascript:turnOverPage({})'.format(page + 1))
381 | return _result, True if _findall else False
382 |
383 |
384 | def print_json_type():
385 | '''Dump json object type.'''
386 | json_text = '''{
387 | "bg": "pictures/gt/fc064fc73/bg/e1777734e.jpg",
388 | "link": "",
389 | "challenge": "0a80f1e4b0ff6381e26425b7fa3e71f4c2",
390 | "ypos": 24,
391 | "fullbg": "pictures/gt/fc064fc73/fc064fc73.jpg",
392 | "id": "",
393 | "xpos": 0,
394 | "feedback": "",
395 | "height": 116,
396 | "slice": "pictures/gt/fc064fc73/slice/e1777734e.png",
397 | "type": "slide"
398 | }'''
399 | json_object = json.loads(json_text)
400 | for _k, _v in json_object.items():
401 | print(type(_k))
402 | print(type(_v))
403 |
404 |
405 | def test_geetest():
406 | '''test geetest related functions.'''
407 | test_load_geetest_js()
408 | print(userresponse(100, '1196277ad0c2a2142efce133857c5c8bja'))
409 | print(userresponse(100, "1196277ad0c2a2142efce133857c5c8bja"))
410 | splites = test_get_splite_array()
411 | offsets = test_offset_position(116, splites)
412 | test_rewrite_image('fullbg', offsets)
413 | test_rewrite_image('bg', offsets)
414 | _x, _y = test_diff_image('fullbg.jpg', 'bg.jpg', 0, 12)
415 | print(imgload())
416 | parsetrace('TraceSample01.txt')
417 | parsetrace('TraceSample02.txt')
418 | parsetrace('TraceSample03.txt')
419 | parsetrace('TraceSample04.txt')
420 | trace, _passtime = usertrace(100)
421 | print(encrypttrace(trace))
422 | print(fun_f(trace))
423 | trace, _passtime = usertrace(200)
424 | print(encrypttrace(trace))
425 | print(fun_f(trace))
426 | print(parse_html(load_filetext(os.path.join(os.getcwd(), 'test', 'result.html')), 1))
427 | print(parse_html(load_filetext(os.path.join(os.getcwd(), 'test', 'result2.html')), 2))
428 | print_json_type()
429 |
430 |
431 | if __name__ == "__main__":
432 | test_geetest()
433 |
--------------------------------------------------------------------------------
/geetest_online/util.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # -*- coding: utf-8 -*-
3 | '''
4 | Common shared helpers for the geetest scripts
5 | '''
6 |
7 | SPLIT_ARRAY_JS = '''
8 | function getSplitArray() {
9 | for (var a, b = "6_11_7_10_4_12_3_1_0_5_2_9_8".split("_"), c = [], d = 0, e = 52; d < e; d++)
10 | a = 2 * parseInt(b[parseInt(d % 26 / 2)]) + d % 2,
11 | parseInt(d / 2) % 2 || (a += d % 2 ? -1 : 1),
12 | a += d < 26 ? 26 : 0,
13 | c.push(a);
14 | return c
15 | }
16 | '''
17 |
18 | USERRESPONSE_JS = '''
19 | function userresponse(a, b) {
20 | for (var c = b.slice(32), d = [], e = 0; e < c.length; e++) {
21 | var f = c.charCodeAt(e);
22 | d[e] = f > 57 ? f - 87 : f - 48
23 | }
24 | c = 36 * d[0] + d[1];
25 | var g = Math.round(a) + c; b = b.slice(0, 32);
26 | var h, i = [ [], [], [], [], [] ], j = {}, k = 0; e = 0;
27 | for (var l = b.length; e < l; e++)
28 | h = b.charAt(e), j[h] || (j[h] = 1, i[k].push(h), k++, k = 5 == k ? 0 : k);
29 | for (var m, n = g, o = 4, p = "", q = [1, 2, 5, 10, 50]; n > 0;)
30 | n - q[o] >= 0 ? (m = parseInt(Math.random() * i[o].length, 10), p += i[o][m], n -= q[o]) : (i.splice(o, 1), q.splice(o, 1), o -= 1);
31 | return p
32 | }
33 | '''
34 |
35 | OFFLINE_SAMPLE = ((186, 1, 98),
36 | (82, 0, 136),
37 | (61, 5, 108),
38 | (128, 2, 7),
39 | (130, 4, 99),
40 | (189, 3, 65),
41 | (108, 5, 285),
42 | (136, 0, 36),
43 | (41, 0, 263),
44 | (124, 3, 185))
45 |
46 |
47 | TRACE_JS = '''
48 | var tracer = function () {
49 | c = function (traceArray) {
50 | for (var b, c, d, e = [], f = 0, g = [], h = 0, i = traceArray.length - 1; h < i; h++) {
51 | b = Math.round(traceArray[h + 1][0] - traceArray[h][0]),
52 | c = Math.round(traceArray[h + 1][1] - traceArray[h][1]),
53 | d = Math.round(traceArray[h + 1][2] - traceArray[h][2]),
54 | g.push([b, c, d]), 0 == b && 0 == c && 0 == d || (0 == b && 0 == c ? f += d : (e.push([b, c, d + f]), f = 0));
55 | }
56 | return 0 !== f && e.push([b, c, f]), e
57 | },
58 | d = function (a) {
59 | var b = "()*,-./0123456789:?@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqr",
60 | c = b.length,
61 | d = "",
62 | e = Math.abs(a),
63 | f = parseInt(e / c);
64 | f >= c && (f = c - 1), f && (d = b.charAt(f)), e %= c;
65 | var g = "";
66 | return a < 0 && (g += "!"), d && (g += "$"), g + d + b.charAt(e)
67 | },
68 | e = function (a) {
69 | for (var b = [
70 | [1, 0],
71 | [2, 0],
72 | [1, -1],
73 | [1, 1],
74 | [0, 1],
75 | [0, -1],
76 | [3, 0],
77 | [2, -1],
78 | [2, 1]
79 | ], c = "stuvwxyz~", d = 0, e = b.length; d < e; d++)
80 | if (a[0] == b[d][0] && a[1] == b[d][1]) return c[d];
81 | return 0
82 | },
83 | f = function (traceArray) {
84 | for (var b, f = c(traceArray), g = [], h = [], i = [], j = 0, k = f.length; j < k; j++) {
85 | b = e(f[j]), b ? h.push(b) : (g.push(d(f[j][0])), h.push(d(f[j][1]))), i.push(d(f[j][2]));
86 | }
87 | return g.join("") + "!!" + h.join("") + "!!" + i.join("")
88 | },
89 | g = function (traceArray) {
90 | var a = f(traceArray);
91 | return encodeURIComponent(a)
92 | };
93 | return {
94 | trace: g
95 | }
96 | }();
97 | exports.tracer = tracer;
98 | '''
99 |
--------------------------------------------------------------------------------
/gitstats/README.md:
--------------------------------------------------------------------------------
1 | A small Python tool that analyses a Git commit log and derives simple activity statistics for every member of a Git project.
2 |
3 | **Warning: lines of code are no measure of a programmer's skill!**
4 |
5 | ### Launch arguments
6 |
7 | There are five, in order:
8 |
9 | + Repository path
10 | + Commit start date
11 | + Commit end date
12 | + Subdirectory within the repository
13 | + Target path of the resulting CSV file
14 |
15 | ### exec_git
16 |
17 | The git log command:
18 |
19 | > git -C {} log --since={} --until={} --pretty=tformat:%ae --shortstat --no-merges -- {} > {}
20 |
21 | The arguments are filled in, the command is run via `os.system()`, and the output lands in a local temporary file, which is then read back into memory as a plain string array.
22 |
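   | For instance, with placeholder arguments (repo path and dates illustrative) the formatted command looks like this:
   | 
   | ```Python
   | GIT_LOG = r'git -C {} log --since={} --until={} --pretty=tformat:%ae --shortstat --no-merges -- {} > {}'
   | # Fill in repo, since, until, subdir and the output file, then hand the string to os.system().
   | print(GIT_LOG.format('./myrepo', '2018-01-01', '2018-12-31', 'src', './gitstats.txt'))
   | ```
   | 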
23 | ### parse
24 |
25 | The git log output comes in three formats, matched by three regular expressions.
26 |
27 | ```Python
28 | REPATTERN_FULL = r"\s(\d+)\D+(\d+)\D+(\d+)\D+\n"
29 | REPATTERN_INSERT_ONLY = r"\s(\d+)\D+(\d+)\sinsertion\D+\n"
30 | REPATTERN_DELETE_ONLY = r"\s(\d+)\D+(\d+)\sdeletion\D+\n"
31 | ```
32 |
33 | The parsed lines are folded into a dictionary keyed by author, whose value is a tuple of:
34 |
35 | + number of commits
36 | + lines inserted
37 | + lines deleted
38 | + net lines changed
39 |
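   | As a sanity check, here is how the full pattern matches a typical `--shortstat` line (the sample line is illustrative):
   | 
   | ```Python
   | import re
   | 
   | REPATTERN_FULL = r"\s(\d+)\D+(\d+)\D+(\d+)\D+\n"
   | LINE = ' 3 files changed, 10 insertions(+), 2 deletions(-)\n'
   | MATCH = re.search(REPATTERN_FULL, LINE)
   | # group(1) = files changed, group(2) = insertions, group(3) = deletions
   | print(MATCH.group(1), MATCH.group(2), MATCH.group(3))  # 3 10 2
   | ```
   | 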
40 | ### save_csv
41 |
42 | Trivial; omitted.
43 |
--------------------------------------------------------------------------------
/gitstats/gitstats.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # -*- coding: utf-8 -*-
3 | '''Analyse git branch commit log, for every version, every person.'''
4 | import os
5 | import sys
6 | import re
7 | import csv
8 |
9 | GIT_LOG = r'git -C {} log --since={} --until={} --pretty=tformat:%ae --shortstat --no-merges -- {} > {}'
10 |
11 | REPATTERN_FULL = r"\s(\d+)\D+(\d+)\D+(\d+)\D+\n"
12 | REPATTERN_INSERT_ONLY = r"\s(\d+)\D+(\d+)\sinsertion\D+\n"
13 | REPATTERN_DELETE_ONLY = r"\s(\d+)\D+(\d+)\sdeletion\D+\n"
14 |
15 | CSV_FILE_HEADER = ["Author", "Commit", "Insert", "Delete", "Loc"]
16 |
17 |
18 | def exec_git(repo, since, until, subdir):
19 |     '''Execute the git log command, return a string array.'''
20 | logfile = os.path.join(os.getcwd(), 'gitstats.txt')
21 | git_log_command = GIT_LOG.format(repo, since, until, subdir, logfile)
22 | os.system(git_log_command)
23 | lines = None
24 | with open(logfile, 'r', encoding='utf-8') as logfilehandler:
25 | lines = logfilehandler.readlines()
26 | return lines
27 |
28 |
29 | def save_csv(stats, csvfile):
30 | '''save stats data to csv file.'''
31 | with open(csvfile, 'w', encoding='utf-8') as csvfilehandler:
32 | writer = csv.writer(csvfilehandler)
33 | writer.writerow(CSV_FILE_HEADER)
34 | for author, stat in stats.items():
35 | writer.writerow([author, stat[0], stat[1], stat[2], stat[3]])
36 |
37 |
38 | def parse(lines):
39 | '''Analyse git log and sort to csv file.'''
40 | prog_full = re.compile(REPATTERN_FULL)
41 | prog_insert_only = re.compile(REPATTERN_INSERT_ONLY)
42 | prog_delete_only = re.compile(REPATTERN_DELETE_ONLY)
43 |
44 | stats = {}
45 | for i in range(0, len(lines), 3):
46 |         author = lines[i].strip()  # drop the trailing newline so dict keys and CSV rows stay clean
47 | #empty = lines[i+1]
48 | info = lines[i+2]
49 | #change = 0
50 |         insert, delete = 0, 0
51 | result = prog_full.search(info)
52 | if result:
53 | #change = result[0]
54 | insert = int(result.group(2))
55 | delete = int(result.group(3))
56 | else:
57 | result = prog_insert_only.search(info)
58 | if result:
59 | #change = result[0]
60 | insert = int(result.group(2))
61 |                 delete = 0
62 | else:
63 | result = prog_delete_only.search(info)
64 | if result:
65 | #change = result[0]
66 |                     insert = 0
67 | delete = int(result.group(2))
68 | else:
69 |                     print('Regular expression failed to match!')
70 | return
71 |
72 | loc = insert - delete
73 | stat = stats.get(author)
74 | if stat is None:
75 | stats[author] = [1, insert, delete, loc]
76 | else:
77 | stat[0] += 1
78 | stat[1] += insert
79 | stat[2] += delete
80 | stat[3] += loc
81 |
82 | return stats
83 |
84 |
85 | if __name__ == "__main__":
86 | print('gitstats begin')
87 | if len(sys.argv) != 6:
88 | print('Invalid argv parameters.')
89 |         sys.exit(1)
90 |
91 | REPO = os.path.join(os.getcwd(), sys.argv[1])
92 | SINCE = sys.argv[2]
93 | UNTIL = sys.argv[3]
94 | SUB_DIR = sys.argv[4]
95 | CSV_FILE = os.path.join(os.getcwd(), sys.argv[5])
96 | LINES = exec_git(REPO, SINCE, UNTIL, SUB_DIR)
97 | assert LINES is not None
98 | STATS = parse(LINES)
99 | save_csv(STATS, CSV_FILE)
100 | print('gitstats done')
101 |
--------------------------------------------------------------------------------
/gsxt_mobile/README.md:
--------------------------------------------------------------------------------
1 | The [National Enterprise Credit Information Publicity System](http://www.gsxt.gov.cn) uses the GeeTest slider captcha. The main site runs the online verification mode, which is hard to crack. Some regional sites run the offline mode, which is easy to crack but needs several HTTP request/response round trips, so querying is slow.
2 | The [State Administration for Industry and Commerce](http://www.saic.gov.cn/) also ships Android and iOS apps, so this time let's analyse the app instead.
3 |
4 | The administration runs two websites:
5 |
6 | + New: http://www.saic.gov.cn/
7 | + Old: http://old.saic.gov.cn/
8 |
9 | Accordingly, there are also two app download pages:
10 |
11 | + New: http://gzhd.saic.gov.cn/gszj/index/telephone/android2.html
12 | + Old: http://gzhd.saic.gov.cn/gszj/index/telephone/android.html
13 |
14 | Fortunately there is only one set of apps.
15 |
16 | SAIC mobile client:
17 |
18 | + Android: http://gzhd.saic.gov.cn/gszj/saicwap.apk
19 | + iOS: https://itunes.apple.com/cn/app/gong-shang-zong-ju/id725956822?mt=8
20 |
21 | National Enterprise Credit Information Publicity System:
22 |
23 | + Android: http://gzhd.saic.gov.cn/gszj/gongshi.apk
24 | + iOS: https://itunes.apple.com/cn/app/%E5%9B%BD%E5%AE%B6%E4%BC%81%E4%B8%9A%E4%BF%A1%E7%94%A8%E4%BF%A1%E6%81%AF%E5%85%AC%E7%A4%BA%E7%B3%BB%E7%BB%9F/id1048375712?mt=8
25 |
26 | ### Analysis
27 |
28 | **saicwap.apk: the name alone already gives the game away.**
29 | Install, run, and unpack the publicity system's Android APK.
30 | The UI experience is essentially that of a web page.
31 | The dex is tiny; the assets are plentiful.
32 | A little name-searching plus guesswork leads straight to the conclusion: a WebView shell around pages built with jQuery + AJAX.
33 | Capturing traffic with Fiddler shows just one simple HTTP request and response.
34 | The response is standard JSON text.
35 |
36 | So let's dash off an implementation.
37 |
38 | ### Fill in the Android mobile HTTP header parameters
39 |
40 | ```Python
41 | URL = 'http://yd.gsxt.gov.cn/QuerySummary'
42 | MOBILE_ACTION = 'entSearch'
43 | TOPIC = 1
44 | PAGE_NUM = 1
45 | PAGE_SIZE = 10
46 | USER_ID = 'id001'
47 | USER_IP = '192.168.0.1'
48 | USER_AGENT = 'Mozilla/5.0 (Linux; Android 4.4.2; vivo Y28L Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/30.0.0.0 Mobile Safari/537.36 Html5Plus/1.0'
49 | ACCEPT_LANGUAGE = 'zh-CN,en-US;q=0.8'
50 | XRW = 'com.zongjucredit'
51 | ORIGIN = 'file://'
52 | CHARSET = 'application/x-www-form-urlencoded; charset=UTF-8'
53 | ```
54 |
55 | ### Query with the requests library
56 |
57 | ```Python
58 | def query(keyword):
59 | _data = [('mobileAction', MOBILE_ACTION),
60 | ('keywords', keyword),
61 | ('topic', TOPIC),
62 | ('pageNum', PAGE_NUM),
63 | ('pageSize', PAGE_SIZE),
64 | ('userID', USER_ID),
65 | ('userIP', USER_IP)]
66 | _headers = {'User-Agent': USER_AGENT,
67 | 'Accept-Language': ACCEPT_LANGUAGE,
68 | 'X-Requested-With': XRW,
69 | 'Origin': ORIGIN,
70 | 'Content-Type': CHARSET}
71 |
72 | _response = requests.post(URL, data=_data, headers=_headers)
73 | print(_response.status_code)
74 | if _response.status_code == 200:
75 | _content = _response.json()
76 | print(json.dumps(_content, indent=2, sort_keys=True, ensure_ascii=False))
77 | ```
78 |
79 | ### Test run
80 |
81 | Searching for the keyword `腾讯科技` returns [50 records](https://github.com/9468305/python-script/blob/master/gsxt_mobile/%E8%85%BE%E8%AE%AF%E7%A7%91%E6%8A%8050.txt). Sample record:
82 |
83 | ```JSON
84 | {
85 | "BUSEXCEPTCOUNT": "0",
86 | "CAT18": "10",
87 | "CAT2NAME": "法定代表人",
88 | "ENTNAME": "腾讯科技(成都)有限公司",
89 | "ENTTYPE": "6150",
90 | "ESTDATE": "2008年07月10日",
91 | "NAME": "奚丹",
92 | "PRIPID": "BFA63C5493A3045829033A5B114CE66AFD1B796865F63020C39130E7149AE9152BAC6972D71F0C3A65B342A32972C4439717E803CD7E66773D486FDD9FCBAEC8",
93 | "REGNO": "510100400024413",
94 | "REGSTATE_CN": "存续(在营、开业、在册)",
95 | "S_EXT_NODENUM": "510000",
96 | "UNISCID": "915101006771521538"
97 | }
98 | ```
99 |
100 | **In practice the server sometimes bans an IP for 24 hours; once banned, both this crawler and the official app are blocked.**
101 |
--------------------------------------------------------------------------------
/gsxt_mobile/gsxt_mobile.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # -*- coding: utf-8 -*-
3 | '''Query company information via the National Enterprise Credit Information Publicity System (www.gsxt.gov.cn) mobile app HTTP API'''
4 |
5 | import json
6 | import requests
7 |
8 | URL = 'http://yd.gsxt.gov.cn/QuerySummary'
9 | MOBILE_ACTION = 'entSearch'
10 | TOPIC = 1
11 | PAGE_NUM = 1
12 | PAGE_SIZE = 10
13 | USER_ID = 'id001'
14 | USER_IP = '192.168.0.1'
15 | USER_AGENT = 'Mozilla/5.0 (Linux; Android 4.4.2; vivo Y28L Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/30.0.0.0 Mobile Safari/537.36 Html5Plus/1.0'
16 | ACCEPT_LANGUAGE = 'zh-CN,en-US;q=0.8'
17 | XRW = 'com.zongjucredit'
18 | ORIGIN = 'file://'
19 | CHARSET = 'application/x-www-form-urlencoded; charset=UTF-8'
20 |
21 |
22 | def query(keyword):
23 | '''main entry'''
24 | _data = [('mobileAction', MOBILE_ACTION),
25 | ('keywords', keyword),
26 | ('topic', TOPIC),
27 | ('pageNum', PAGE_NUM),
28 | ('pageSize', PAGE_SIZE),
29 | ('userID', USER_ID),
30 | ('userIP', USER_IP)]
31 |
32 | _headers = {'User-Agent': USER_AGENT,
33 | 'Accept-Language': ACCEPT_LANGUAGE,
34 | 'X-Requested-With': XRW,
35 | 'Origin': ORIGIN,
36 | 'Content-Type': CHARSET}
37 |
38 | _response = requests.post(URL, data=_data, headers=_headers)
39 | print(_response.status_code)
40 | if _response.status_code == 200:
41 | _content = _response.json()
42 | print(len(_content))
43 | print(json.dumps(_content, indent=2, sort_keys=True, ensure_ascii=False))
44 | with open(keyword + str(len(_content)) + '.txt', 'w', encoding='utf-8') as _f:
45 | json.dump(_content, _f, indent=2, sort_keys=True, ensure_ascii=False)
46 | else:
47 |         print('request failed')
48 |
49 | if __name__ == "__main__":
50 | query('腾讯科技')
51 |
--------------------------------------------------------------------------------
/lagou/README.md:
--------------------------------------------------------------------------------
1 | Articles about scraping lagou.com have been done to death online. This just records a small crawler tool I once wrote to help a colleague in HR.
2 | One day she was handed a task: collect and analyse every job posting of the three BAT companies on lagou.com, including category, requirements, salary range, headcount, and so on.
3 | Collecting all that by hand is tedious and dull, so a bit of code came to the rescue.
4 |
5 | ### Getting started
6 |
7 | lagou.com's pages may have been redesigned since; the code below may no longer work and will not be maintained.
8 | lagou.com assigns each registered company a number, and the URL takes the form:
9 |
10 | ```Python
11 | LAGOU_URL = r'https://www.lagou.com/gongsi/j%d.html'
12 | ```
13 |
14 | The target companies, picked by hand:
15 |
16 | ```Python
17 | COMPANY = {
18 | '腾讯': 451,
19 | '阿里优酷': 1914,
20 | '阿里高德': 91,
21 | '阿里天猫': 52840,
22 | '阿里UC': 2202,
23 | '阿里神马搜索': 90948,
24 | '百度': 1575,
25 | '百度外卖': 104601
26 | }
27 | ```
28 |
29 | Each company's sub-pages use a fair amount of complex JavaScript and frameworks, so reverse-engineering the HTTP protocol from captured traffic was ruled out.
30 | Instead, the blunt, direct combination: Selenium + WebDriver + Chrome.
31 |
32 | ### Selenium
33 |
34 | Website http://www.seleniumhq.org/
35 | GitHub https://github.com/SeleniumHQ/selenium
36 | Docs http://selenium-python.readthedocs.io/
37 |
38 | ### ChromeDriver
39 |
40 | [ChromeDriver - WebDriver for Chrome](https://sites.google.com/a/chromium.org/chromedriver/)
41 | Why not use the more efficient [PhantomJS](http://phantomjs.org/)?
42 | Because the code needs frequent debugging and live observation. Once it runs stably, a one-line parameter change can swap in PhantomJS at any time.
43 | **Chrome 59 beta introduced headless support.** See [Getting Started with Headless Chrome](https://developers.google.com/web/updates/2017/04/headless-chrome). So PhantomJS should not be needed in the future either.
44 |
45 | ### Data definition
46 |
47 | Still blunt and direct (a few too many parameters, so PyLint warns; just ignore it):
48 |
49 | ```Python
50 | class JobInfo(object):
51 | '''Job Info Object'''
52 | def __init__(self, company, job_filter, title, salary_min, salary_max, exp, edu):
53 | self.company = company
54 | self.filter = job_filter
55 | self.title = title
56 | self.salary_min = salary_min
57 | self.salary_max = salary_max
58 | self.exp = exp
59 | self.edu = edu
60 |
61 | @staticmethod
62 | def header():
63 | '''csv file header'''
64 | return ['公司', '类别', '职位', '薪酬区间低', '薪酬区间高', '经验要求', '学历要求']
65 |
66 | def array(self):
67 | '''object to array'''
68 | return [self.company,
69 | self.filter,
70 | self.title,
71 | self.salary_min,
72 | self.salary_max,
73 | self.exp,
74 | self.edu]
75 | ```
76 |
77 | ### Page loading and parsing
78 |
79 | The WebDriver API is convenient and powerful.
80 |
81 | ```Python
82 | con_list_item = WebDriverWait(browser, SLEEPTIME).until(lambda x: x.find_elements_by_class_name('con_list_item'))
83 | ```
84 |
85 | Clicking through to the next page:
86 |
87 | ```Python
88 | try:
89 | pages = browser.find_element_by_class_name('pages')
90 | spans = pages.find_elements_by_tag_name('span')
91 | span = get_next_span(spans)
92 | if span:
93 | span.click()
94 | time.sleep(SLEEPTIME)
95 | except NoSuchElementException as _e:
96 | print(_e)
97 | ```
98 |
99 | **Once collection finishes, the data is written to a CSV file; omitted here.**
100 |
101 | ### Pitfalls
102 |
103 | The WebDriver API is easy to use, but its timeout handling is still imperfect.
104 |
105 | ```Python
106 | browser = webdriver.Chrome()
107 | browser.get(url)
108 | browser.refresh()
109 | browser.quit()
110 | ```
111 |
112 | `implicitly_wait()` cannot tell when the page's various Ajax operations have finished, so this line had to be commented out:
113 |
114 | ```Python
115 | browser.implicitly_wait(TIMEOUT)
116 | ```
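   | 
   | A common workaround (a sketch, not part of the original script) is to wait explicitly for a concrete condition with `WebDriverWait` plus `expected_conditions`:
   | 
   | ```Python
   | from selenium.webdriver.support.ui import WebDriverWait
   | from selenium.webdriver.support import expected_conditions as EC
   | from selenium.webdriver.common.by import By
   | 
   | # Block for up to TIMEOUT seconds until the job list items actually exist,
   | # instead of hoping implicitly_wait() catches the end of the Ajax calls.
   | items = WebDriverWait(browser, TIMEOUT).until(
   |     EC.presence_of_all_elements_located((By.CLASS_NAME, 'con_list_item')))
   | ```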
117 |
--------------------------------------------------------------------------------
/lagou/lagou.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # -*- coding: utf-8 -*-
3 | '''spider for https://www.lagou.com'''
4 |
5 | import time
6 | import csv
7 | import os
8 | import sys
9 | from selenium import webdriver
10 | from selenium.webdriver.support.ui import WebDriverWait
11 | from selenium.common.exceptions import NoSuchElementException
12 |
13 | LAGOU_URL = r'https://www.lagou.com/gongsi/j%d.html'
14 | COMPANY = {
15 | '腾讯': 451,
16 | '阿里优酷': 1914,
17 | '阿里高德': 91,
18 | '阿里天猫': 52840,
19 | '阿里UC': 2202,
20 | '阿里神马搜索': 90948,
21 | '百度': 1575,
22 | '百度外卖': 104601
23 | }
24 |
25 | SLEEPTIME = 3 #seconds
26 |
27 | class JobInfo(object):
28 | '''Job Info Object'''
29 | def __init__(self, company, job_filter, title, salary_min, salary_max, exp, edu):
30 | self.company = company
31 | self.filter = job_filter
32 | self.title = title
33 | self.salary_min = salary_min
34 | self.salary_max = salary_max
35 | self.exp = exp
36 | self.edu = edu
37 |
38 | @staticmethod
39 | def header():
40 | '''csv file header'''
41 | return ['公司', '类别', '职位', '薪酬区间低', '薪酬区间高', '经验要求', '学历要求']
42 |
43 | def array(self):
44 | '''object to array'''
45 | return [self.company,
46 | self.filter,
47 | self.title,
48 | self.salary_min,
49 | self.salary_max,
50 | self.exp,
51 | self.edu]
52 |
53 |
54 | def lagou_page(browser, job_list, company_name, job_filter):
55 | '''filter for every page'''
56 | con_list_item = WebDriverWait(browser, SLEEPTIME)\
57 | .until(lambda x: x.find_elements_by_class_name('con_list_item'))
58 | for item in con_list_item:
59 | job = item.text.split('\n')
60 | job_title = job[0]
61 | #job_date = job[1]
62 | job_salary = job[2].split('-')
63 | job_salary_min = job_salary[0]
64 | if len(job_salary) > 1:
65 | job_salary_max = job_salary[1]
66 | else:
67 | job_salary_max = job_salary[0]
68 | job_desc = job[3].split('/')
69 | job_exp = job_desc[0]
70 | if ' ' in job_exp:
71 | job_exp = job_exp.strip(' ')
72 | if '经验' in job_exp:
73 | job_exp = job_exp.lstrip('经验')
74 | job_edu = job_desc[1]
75 | if ' ' in job_edu:
76 | job_edu = job_edu.strip(' ')
77 |
78 | job = JobInfo(company_name,
79 | job_filter,
80 | job_title,
81 | job_salary_min,
82 | job_salary_max,
83 | job_exp,
84 | job_edu)
85 | job_list.append(job)
86 | print(job_title)
87 | print(job_salary_min)
88 | print(job_salary_max)
89 | print(job_exp)
90 | print(job_edu)
91 |
92 |
93 | def get_next_span(spans):
94 | '''find next page button'''
95 | for span in spans:
96 | print(span.text)
97 | if span.text == '下一页':
98 | if span.get_attribute('class') == 'next':
99 | return span
100 | return None
101 |
102 |
103 | def lagou_filter(browser, job_list, company_name, job_filter):
104 | '''filter by job types'''
105 | while True:
106 | lagou_page(browser, job_list, company_name, job_filter)
107 | #check next page
108 | try:
109 | pages = browser.find_element_by_class_name('pages')
110 | spans = pages.find_elements_by_tag_name('span')
111 | span = get_next_span(spans)
112 | if span:
113 | span.click()
114 | time.sleep(SLEEPTIME)
115 | else:
116 | return
117 | except NoSuchElementException as _e:
118 | print(_e)
119 | return
120 |
121 |
122 | def lagou_company(browser, company_name, company_number):
123 | '''filter for certain company'''
124 | company_url = LAGOU_URL % int(company_number)
125 | company_job_list = []
126 | browser.get(company_url)
127 | time.sleep(SLEEPTIME*3)
128 | while True:
129 | try:
130 | print(browser.title)
131 | con_filter_li = WebDriverWait(browser, SLEEPTIME)\
132 | .until(lambda x: x.find_elements_by_class_name('con_filter_li'))
133 | for line in con_filter_li:
134 | print(line.text)
135 | if line.text == '全部':
136 | print('skip')
137 | continue
138 | line.click()
139 | time.sleep(SLEEPTIME)
140 | lagou_filter(browser, company_job_list, company_name, line.text)
141 | except NoSuchElementException as _e:
142 | print(_e)
143 | del company_job_list[:]
144 |             # company_job_list.clear() only works in Python 3
145 | browser.refresh()
146 | time.sleep(SLEEPTIME*3)
147 | else:
148 | #save result to company file
149 | save_file = os.path.join(os.getcwd(), company_name + '.csv')
150 | with open(save_file, 'w', newline='') as save_file_handler:
151 | writer = csv.writer(save_file_handler)
152 | writer.writerow(JobInfo.header())
153 | for job in company_job_list:
154 | writer.writerow(job.array())
155 | return
156 |
157 |
158 | def lagou(browser, company_number):
159 | '''lagou entity: target one company or all.'''
160 | print('lagou start')
161 | for name, code in COMPANY.items():
162 | if company_number is not None:
163 | if int(code) == int(company_number):
164 | lagou_company(browser, name, code)
165 | break
166 | else:
167 | lagou_company(browser, name, code)
168 | print('lagou end')
169 |
170 |
171 | if __name__ == '__main__':
172 | BROWSER = webdriver.Chrome()
173 | #implicitly_wait seems can not waiting for Ajax loading complete
174 | #_browser.implicitly_wait(TIMEOUT)
175 | SINGLE_COMPANY = None
176 | if len(sys.argv) > 1:
177 | SINGLE_COMPANY = sys.argv[1]
178 | lagou(BROWSER, SINGLE_COMPANY)
179 | BROWSER.quit()
180 |
--------------------------------------------------------------------------------
/level/README.md:
--------------------------------------------------------------------------------
1 | ### Python leveldb utils: wrappers for common operations
2 |
3 | [leveldb](http://leveldb.org/) is a lightweight, high-performance key-value store open-sourced by Google, written by Google's famed Jeff Dean in C++ against POSIX and based on his own BigTable paper.
4 |
5 | > LevelDB is a light-weight, single-purpose library for persistence with bindings to many platforms.
6 |
7 | Website http://leveldb.org/
8 | GitHub https://github.com/google/leveldb
9 | Official JavaScript binding https://github.com/Level/levelup
10 |
11 | ### Python Binding
12 |
13 | Early on, only C++ and JavaScript were provided officially; every Python implementation is third-party. The most widely used and stable one is https://github.com/rjpower/py-leveldb , which runs reliably but is no longer actively maintained.
14 | Python crawlers often need a quick, simple way to persist large amounts of data locally. SQL tables are too heavyweight and inflexible to change, while plain JSON text lacks indexing, filtering, and random insertion/deletion. leveldb is therefore a lightweight, fast sweet spot.
15 |
16 | The helpers below wrap the operations most often needed when storing crawled data.
17 |
18 | ### exist()
19 |
20 | Check whether a key exists in the database; returns a Boolean.
21 |
22 | ```Python
23 | def exist(db_src, key):
24 |     try:
25 |         db_src.Get(key.encode('utf-8') if isinstance(key, str) else key)
26 |         return True
27 |     except KeyError:
28 |         return False
29 | ```
30 |
31 | ### count()
32 |
33 | Count the items in the database, with optional key-substring and/or value-substring filters; returns the total and the filtered (valid) count.
34 |
35 | ```Python
36 | def count(db_src, k_filter, v_filter):
37 | total, valid = 0, 0
38 | for _k, _v in db_src.RangeIter():
39 | total += 1
40 | if k_filter:
41 | if _k.find(k_filter) == -1:
42 | continue
43 | if v_filter:
44 | if _v.find(v_filter) == -1:
45 | continue
46 | valid += 1
47 | return total, valid
48 | ```
49 |
50 | ### copy()
51 |
52 | Copy the database from source to destination, with an optional key-substring filter. Returns the source total and the number actually copied.
53 |
54 | ```Python
55 | def copy(db_src, db_dst, k_filter):
56 | total, valid = 0, 0
57 | for _k, _v in db_src.RangeIter():
58 | total += 1
59 | if k_filter:
60 | if _k.find(k_filter) != -1:
61 | valid += 1
62 | db_dst.Put(_k, _v, sync=True)
63 | else:
64 | valid += 1
65 | db_dst.Put(_k, _v, sync=True)
66 | return total, valid
67 | ```
68 |
69 | ### delete()
70 |
71 | Delete from the destination database every item whose key also appears in the source database.
72 |
73 | ```Python
74 | def delete(db_src, db_dst):
75 | for _k, _v in db_src.RangeIter():
76 | db_dst.Delete(_k)
77 | ```
78 |
79 | ### diff()
80 |
81 | Find the items whose keys exist in the source but not in the destination, store them in the diff database, and return the diff count.
82 |
83 | ```Python
84 | def diff(db_src, db_dst, db_diff):
85 | diff_count = 0
86 | for _k, _v in db_src.RangeIter():
87 | if not exist(db_dst, _k):
88 | diff_count += 1
89 | db_diff.Put(_k, _v)
90 | return diff_count
91 | ```
92 |
93 | ### clean_copy()
94 |
95 | Copy the source database to the destination, dropping items with empty values; returns the number copied.
96 |
97 | ```Python
98 | def clean_copy(db_src, db_dst):
99 | total = 0
100 | for _k, _v in db_src.RangeIter():
101 | if _v:
102 | db_dst.Put(_k, _v)
103 | total += 1
104 | return total
105 |
106 | ```
107 |
108 | ### dump()
109 |
110 | Print every key/value item in the database.
111 | Convenience: if the argument is a string, it is treated as a local path and the database is opened temporarily.
112 |
113 | ```Python
114 | def dump(db_src):
115 | _db = leveldb.LevelDB(db_src, create_if_missing=False) if isinstance(db_src, str) else db_src
116 | for _k, _v in _db.RangeIter():
117 | print(_k.decode(), _v.decode())
118 | ```
119 |
120 | ### db_to_text()
121 |
122 | Export a leveldb database to a text file, with ',' as the separator.
123 | Convenience: if the argument is a string, it is treated as a local path and the database is opened temporarily.
124 |
125 | ```Python
126 | def db_to_text(from_db, to_text):
127 | _db = leveldb.LevelDB(from_db, create_if_missing=False) if isinstance(from_db, str) else from_db
128 | with open(to_text, 'w', encoding='utf-8') as _f:
129 | for _k, _v in _db.RangeIter():
130 | _f.write(_k.decode() + ',' + _v.decode() + '\n')
131 | ```
132 |
133 | ### text_to_db()
134 |
135 | Import a text file into a leveldb database; a custom separator is supported.
136 | Convenience: if the argument is a string, it is treated as a local path and the database is opened temporarily.
137 |
138 | ```Python
139 | def text_to_db(from_text, to_db, split_char):
140 | total, invalid = 0, 0
141 | _split = split_char if split_char else ','
142 | _db = leveldb.LevelDB(to_db, create_if_missing=True) if isinstance(to_db, str) else to_db
143 | with open(from_text, 'r', encoding='utf-8') as _f:
144 | lines = _f.readlines()
145 | total = len(lines)
146 |         for line in lines:
147 |             line = line.rstrip('\n')
148 |             if not line:
149 |                 invalid += 1
150 |                 continue
151 |             if _split in line:
152 |                 _sub = line.split(_split, 1)
153 |                 _db.Put(_sub[0].encode('utf-8'), _sub[1].encode('utf-8'))
154 |             else:
155 |                 _db.Put(line.encode('utf-8'), b'')
156 | return total, invalid
157 | ```
158 |
159 | ### db_to_excel()
160 |
161 | Export a leveldb database to an Excel file; returns the total count.
162 | The Excel file has two columns, mapping to leveldb's key and value.
163 | Convenience: if the argument is a string, it is treated as a local path and the database is opened temporarily.
164 |
165 | ```Python
166 | def db_to_excel(from_db, to_excel):
167 | _db = leveldb.LevelDB(from_db, create_if_missing=False) if isinstance(from_db, str) else from_db
168 | _wb = Workbook()
169 | _ws = _wb.active
170 | total = 0
171 | for _k, _v in _db.RangeIter():
172 | _ws.append([_k.decode(), _v.decode()])
173 | total += 1
174 | _wb.save(to_excel)
175 | return total
176 | ```
177 |
178 | ### excel_to_db()
179 |
180 | Import an Excel file into a leveldb database.
181 | Only the first two columns are read, mapping to leveldb's key and value.
182 | Convenience: if the argument is a string, it is treated as a local path and the database is opened temporarily.
183 |
184 | ```Python
185 | def excel_to_db(from_excel, to_db):
186 | _wb = load_workbook(from_excel, read_only=True)
187 | _ws = _wb.active
188 | _db = leveldb.LevelDB(to_db, create_if_missing=True) if isinstance(to_db, str) else to_db
189 | total = 0
190 |     for _row in _ws.iter_rows(min_row=2, min_col=1, max_col=2):
191 |         if _row and _row[0] and _row[1]:
192 |             _key, _value = b'', b''
193 |             if _row[0].data_type == cell.Cell.TYPE_STRING:
194 |                 _key = _row[0].value.encode('utf-8')
195 |                 _key = b''.join(_key.split())
196 |             if _row[1].data_type == cell.Cell.TYPE_STRING:
197 |                 _value = _row[1].value.encode('utf-8')
198 |                 _value = b''.join(_value.split())
199 | _db.Put(_key, _value)
200 | total += 1
201 |
202 | _wb.close()
203 | return total
204 | ```
205 |
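   | A quick end-to-end sketch of the helpers above (paths and keys are hypothetical, assuming the py-leveldb binding):
   | 
   | ```Python
   | import leveldb
   | from levelhelper import copy, count, db_to_text
   | 
   | # Open (or create) two databases by local path.
   | SRC = leveldb.LevelDB('./db_src', create_if_missing=True)
   | DST = leveldb.LevelDB('./db_dst', create_if_missing=True)
   | 
   | SRC.Put(b'company_a', b'915101006771521538')
   | TOTAL, VALID = count(SRC, b'company', None)  # filters are bytes, like the stored keys
   | copy(SRC, DST, None)
   | db_to_text(DST, './dump.txt')
   | ```
   | 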
206 | ### Postscript
207 |
208 | By the time this write-up was finished, an official Python binding had been released.
209 | See https://plyvel.readthedocs.io/en/latest/
210 |
--------------------------------------------------------------------------------
/level/levelhelper.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # -*- coding: utf-8 -*-
3 | '''leveldb helper'''
4 |
5 | import leveldb
6 | from openpyxl import Workbook
7 | from openpyxl import load_workbook
8 | from openpyxl import cell
9 |
10 |
11 | def exist(db_src, key):
12 | '''Safely check whether key exist or not'''
13 | _key_obj = bytes(key.encode('utf-8')) if isinstance(key, str) else key
14 | try:
15 | db_src.Get(_key_obj)
16 | return True
17 | except KeyError:
18 | return False
19 |
20 |
21 | def count(db_src, k_filter, v_filter):
22 | '''Count database items, support key filter and/or value filter, return total and valid.'''
23 | total, valid = 0, 0
24 | for _k, _v in db_src.RangeIter():
25 | total += 1
26 | if k_filter:
27 | if _k.find(k_filter) == -1:
28 | continue
29 | if v_filter:
30 | if _v.find(v_filter) == -1:
31 | continue
32 | valid += 1
33 | return total, valid
34 |
35 |
36 | def copy(db_src, db_dst, k_filter):
37 | '''copy db_src to db_dst, support key filter, return total and valid.'''
38 | total, valid = 0, 0
39 | for _k, _v in db_src.RangeIter():
40 | total += 1
41 | if k_filter:
42 | if _k.find(k_filter) != -1:
43 | valid += 1
44 | db_dst.Put(_k, _v, sync=True)
45 | else:
46 | valid += 1
47 | db_dst.Put(_k, _v, sync=True)
48 | return total, valid
49 |
50 |
51 | def delete(db_src, db_dst):
52 | '''Delete db_src items in db_dst.'''
53 | for _k, _v in db_src.RangeIter():
54 | db_dst.Delete(_k)
55 |
56 |
57 | def diff(db_src, db_dst, db_diff):
58 |     '''Find differences between db_src and db_dst, save them to db_diff, return the diff count.'''
59 | diff_count = 0
60 | for _k, _v in db_src.RangeIter():
61 | if not exist(db_dst, _k):
62 | diff_count += 1
63 | db_diff.Put(_k, _v)
64 | return diff_count
65 |
66 |
67 | def clean_copy(db_src, db_dst):
68 | '''copy db_src to db_dst, clean empty value, return total count.'''
69 | total = 0
70 | for _k, _v in db_src.RangeIter():
71 | if _v:
72 | db_dst.Put(_k, _v)
73 | total += 1
74 | return total
75 |
76 |
77 | def dump(db_src):
78 | '''Dump database key and value items.'''
79 | _db = leveldb.LevelDB(db_src, create_if_missing=False) if isinstance(db_src, str) else db_src
80 | for _k, _v in _db.RangeIter():
81 | print(_k.decode(), _v.decode())
82 |
83 |
84 | def db_to_text(from_db, to_text):
85 | '''Transfer leveldb to text file.'''
86 | _db = leveldb.LevelDB(from_db, create_if_missing=False) if isinstance(from_db, str) else from_db
87 | with open(to_text, 'w', encoding='utf-8') as _f:
88 | for _k, _v in _db.RangeIter():
89 | _f.write(_k.decode() + ',' + _v.decode() + '\n')
90 |
91 |
92 | def text_to_db(from_text, to_db, split_char):
93 | '''Transfer text file to leveldb, return total and invalid count.'''
94 | total, invalid = 0, 0
95 | _split = split_char if split_char else ','
96 | _db = leveldb.LevelDB(to_db, create_if_missing=True) if isinstance(to_db, str) else to_db
97 | with open(from_text, 'r', encoding='utf-8') as _f:
98 | lines = _f.readlines()
99 | total = len(lines)
100 |         for line in lines:
101 |             line = line.rstrip('\n')
102 |             if not line:
103 |                 invalid += 1
104 |                 continue
105 |             if _split in line:
106 |                 _sub = line.split(_split, 1)
107 |                 _db.Put(_sub[0].encode('utf-8'), _sub[1].encode('utf-8'))
108 |             else:
109 |                 _db.Put(line.encode('utf-8'), b'')
110 | return total, invalid
111 |
112 |
113 | def db_to_excel(from_db, to_excel):
114 | '''Transfer leveldb to Excel file, return total count.'''
115 | _db = leveldb.LevelDB(from_db, create_if_missing=False) if isinstance(from_db, str) else from_db
116 | _wb = Workbook()
117 | _ws = _wb.active
118 | total = 0
119 | for _k, _v in _db.RangeIter():
120 | _ws.append([_k.decode(), _v.decode()])
121 | total += 1
122 | _wb.save(to_excel)
123 | return total
124 |
125 |
126 | def excel_to_db(from_excel, to_db):
127 | '''Transfer Excel file to leveldb, return total count.'''
128 | _wb = load_workbook(from_excel, read_only=True)
129 | _ws = _wb.active
130 | _db = leveldb.LevelDB(to_db, create_if_missing=True) if isinstance(to_db, str) else to_db
131 | total = 0
132 |     for _row in _ws.iter_rows(min_row=2, min_col=1, max_col=2):
133 |         if _row and _row[0] and _row[1]:
134 |             _key, _value = b'', b''
135 |             if _row[0].data_type == cell.Cell.TYPE_STRING:
136 |                 _key = _row[0].value.encode('utf-8')
137 |                 _key = b''.join(_key.split())
138 |             if _row[1].data_type == cell.Cell.TYPE_STRING:
139 |                 _value = _row[1].value.encode('utf-8')
140 |                 _value = b''.join(_value.split())
141 | _db.Put(_key, _value)
142 | total += 1
143 |
144 | _wb.close()
145 | return total
146 |
--------------------------------------------------------------------------------
/monkeyrunner/README.md:
--------------------------------------------------------------------------------
1 | # MonkeyRunner is DEAD
2 |
3 | ## UI Automator
4 |
5 | https://developer.android.com/training/testing/ui-automator
6 |
7 | Every automated-testing framework on the Android platform ultimately builds on the official UI Automator test framework, which targets cross-app functional UI testing across the system and installed applications. It has three main parts:
8 |
9 | + UI Automator Viewer, a viewer for inspecting the layout hierarchy.
10 | + UiDevice, an API that exposes device state and performs actions on the target device.
11 | + The UI Automator APIs, which support cross-application UI tests.
12 |
13 | ## UI Automator Viewer
14 |
15 | A desktop GUI tool that scans and analyses the UI components currently displayed on an Android device. It shows the UI layout hierarchy and the properties of the components currently visible to the user. As the name implies, it is the read-only part of UI Automator: it can inspect the component tree and properties but cannot drive or control the UI.
16 |
17 | `uiautomatorviewer` lives in the `/tools/bin` directory.
18 | Its entry point is a bash script that actually invokes `uiautomatorviewer-26.0.0-dev.jar` in `/tools/lib` .
19 | The GUI is implemented on Eclipse + SWT and built with Gradle.
20 | The source for this family of tools is at `https://android.googlesource.com/platform/tools/swt/` .
21 | It depends on `https://android.googlesource.com/platform/tools/base/` .
22 | Active branch: `mirror-goog-studio-master-dev` .
23 | The repository also contains the following tools:
24 |
25 | + chimpchat
26 | + ddms
27 | + hierarchyviewer2
28 | + monkeyrunner
29 | + swtmenubar
30 | + traceview
31 |
32 | Internally it is based on `adb shell uiautomator dump` . Judging from the commit history, active development ran through 2014-2015, with little maintenance after 2016. Android development in that era centred on Eclipse, so building a cross-platform desktop GUI on SWT was reasonable at the time.
33 |
34 | In real use the tool is unstable and errors out easily:
35 | > `Error while obtaining UI hierarchy XML file: com.android.ddmlib.SyncException: Remote object doesn't exist!`
36 |
37 | The usual causes:
38 |
39 | + An unstable adb connection.
40 | + Device compatibility or permission problems.
41 | + The current app UI is animating, e.g. playing video or an animation, and never settles within the 10-second timeout.
42 |
43 | Reading the source shows that these errors all originate in the Android framework's `uiautomator` .
44 |
45 | ## MonkeyRunner
46 |
47 | https://developer.android.com/studio/test/monkeyrunner
48 |
49 | Another official tool. It wraps the uiautomator API for Python scripts to call, and Java extension plugins can be injected as well.
50 | Compared with `uiautomatorviewer` and the `uiautomator` command-line tool, it is far more programmable and extensible.
51 | MonkeyRunner is implemented on the rather niche Jython.
52 |
53 | ### 1. Launch entry point
54 |
55 | > monkeyrunner -plugin
56 |
57 | monkeyrunner is a bash script in `/tools/bin` that launches `/tools/lib/monkeyrunner-26.0.0-dev.jar` .
58 |
59 | ```bash
60 | export ANDROID_HOME="~/Library/Android/sdk"
61 | $ANDROID_HOME/tools/bin/monkeyrunner uiparser.py
62 | ```
63 |
64 | ### 2. Main methods
65 |
66 | #### MonkeyDevice.getProperty()
67 |
68 | Equivalent to calling `adb shell getprop` with a key; retrieves a device system property.
69 | Keys may differ between vendors. For a given test device, run `adb shell getprop` to list every available key.
70 |
71 | #### MonkeyDevice.shell()
72 |
73 | Equivalent to running an `adb shell` command.
74 |
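   | A minimal sketch following the official docs (the property key and shell command are illustrative):
   | 
   | ```Python
   | from com.android.monkeyrunner import MonkeyRunner
   | 
   | device = MonkeyRunner.waitForConnection(5)
   | # Documented property keys include build.model, build.manufacturer, ...
   | print(device.getProperty('build.model'))
   | # Run an arbitrary shell command on the device, just like adb shell.
   | print(device.shell('dumpsys window windows'))
   | ```
   | 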
75 | ### 3. Shortcomings
76 |
77 | MonkeyRunner is built on Jython 2.5.3. It looks like the best of Java and Python combined; in practice it is unfriendly to both.
78 |
79 | + Jython 2.5.3 is outdated; much of the syntax and many of the libraries of mainstream Python 3.x and 2.7 are unavailable.
80 | + Editors such as VS Code offer no IntelliSense or auto-completion, and neither the editor nor pylint can resolve imports such as `from com.android.monkeyrunner import MonkeyRunner, MonkeyDevice, MonkeyImage` .
81 | + Jython does not seem able to import external libraries the way a regular Python program can. In testing, only the modules bundled with MonkeyRunner, such as `os, sys, subprocess` , were usable.
82 | + Java extension plugins can accomplish relatively little.
83 |
84 | Under the hood MonkeyRunner still uses `adb shell` and its `uiautomator` command to read UI component state and properties, so like `UI Automator Viewer` it is limited by `uiautomator` 's own flaws and runs unreliably.
85 |
86 | ## adb shell uiautomator
87 |
88 | **adb**
89 | https://developer.android.google.cn/studio/command-line/adb
90 |
91 | **adb shell am**
92 | https://developer.android.google.cn/studio/command-line/adb#am
93 | Issue commands through the Activity Manager (am) tool to perform various system actions, such as starting an Activity, force-stopping a process, broadcasting an intent, or changing device screen properties.
94 |
95 | **adb shell pm**
96 | https://developer.android.google.cn/studio/command-line/adb#pm
97 | Issue commands through the Package Manager (pm) tool to install, uninstall, and query packages.
98 |
99 | **adb shell uiautomator**
100 | The relevant page has been removed from the official site and survives only in search-engine caches. Presumably a change is coming, or its use is officially discouraged.
101 | Running the command prints its usage and options:
102 |
103 | ```bash
104 | Usage: uiautomator [options]
105 |
106 | Available subcommands:
107 |
108 | help: displays help message
109 |
110 | runtest: executes UI automation tests
111 | runtest [options]
112 | : < -c | -e class >
113 | : a list of jar files containing test classes and dependencies. If
114 | the path is relative, it's assumed to be under /data/local/tmp. Use
115 | absolute path if the file is elsewhere. Multiple files can be
116 | specified, separated by space.
117 | : a list of test class names to run, separated by comma. To
118 | a single method, use TestClass#testMethod format. The -e or -c option
119 | may be repeated. This option is not required and if not provided then
120 | all the tests in provided jars will be run automatically.
121 | options:
122 | --nohup: trap SIG_HUP, so test won't terminate even if parent process
123 | is terminated, e.g. USB is disconnected.
124 | -e debug [true|false]: wait for debugger to connect before starting.
125 | -e runner [CLASS]: use specified test runner class instead. If
126 | unspecified, framework default runner will be used.
127 | -e : other name-value pairs to be passed to test classes.
128 | May be repeated.
129 | -e outputFormat simple | -s: enabled less verbose JUnit style output.
130 |
131 | dump: creates an XML dump of current UI hierarchy
132 | dump [--verbose][file]
133 | [--compressed]: dumps compressed layout information.
134 | [file]: the location where the dumped XML should be stored, default is
135 | /sdcard/window_dump.xml
136 |
137 | events: prints out accessibility events until terminated
138 | ```
139 |
140 | ### uiautomator shortcomings
141 |
142 | It is slow to run, fails often, and errors out frequently.
143 | `ERROR: could not get idle state.` usually means the UI is rendering dynamically, e.g. a video or animation is playing, and it has not settled within the 10-second timeout; the nodes of the UI tree are changing too fast to be captured reliably.
144 |
145 | ### uiautomator source
146 |
147 | The desktop tools' source lives in the `master` branch of https://android.googlesource.com/platform/frameworks/testing/ .
148 | It was last updated on 2014-11-14; the active branch then moved to `android-support-test`, where the `uiautomator` source was removed in favour of projects such as the `android.support.test` library and Espresso.
149 | The on-device framework source lives in the `master` branch of https://android.googlesource.com/platform/frameworks/base/ .
150 | `uiAutomation.waitForIdle(1000, 1000 * 10);` is the key line behind the error: wait up to 1 second per event and 10 seconds overall, throwing an exception on timeout.
151 |
152 | `DumpCommand.java`
153 | > https://android.googlesource.com/platform/frameworks/testing/+/master/uiautomator/cmds/uiautomator/src/com/android/commands/uiautomator/DumpCommand.java
154 |
155 | ```Java
156 | // It appears that the bridge needs time to be ready. Making calls to the
157 | // bridge immediately after connecting seems to cause exceptions. So let's also
158 | // do a wait for idle in case the app is busy.
159 | try {
160 | UiAutomation uiAutomation = automationWrapper.getUiAutomation();
161 | uiAutomation.waitForIdle(1000, 1000 * 10);
162 | AccessibilityNodeInfo info = uiAutomation.getRootInActiveWindow();
163 | if (info == null) {
164 | System.err.println("ERROR: null root node returned by UiTestAutomationBridge.");
165 | return;
166 | }
167 | Display display =
168 | DisplayManagerGlobal.getInstance().getRealDisplay(Display.DEFAULT_DISPLAY);
169 | int rotation = display.getRotation();
170 | Point size = new Point();
171 | display.getSize(size);
172 | AccessibilityNodeInfoDumper.dumpWindowToFile(info, dumpFile, rotation, size.x, size.y);
173 | } catch (TimeoutException re) {
174 | System.err.println("ERROR: could not get idle state.");
175 | return;
176 | } finally {
177 | automationWrapper.disconnect();
178 | }
179 | System.out.println(
180 | String.format("UI hierchary dumped to: %s", dumpFile.getAbsolutePath()));
181 | ```
182 |
183 | `UiAutomation.java`
184 | > https://android.googlesource.com/platform/frameworks/base/+/master/core/java/android/app/UiAutomation.java
185 |
186 | ```Java
187 | /**
188 | * Waits for the accessibility event stream to become idle, which is not to
189 | * have received an accessibility event within idleTimeoutMillis
.
190 | * The total time spent to wait for an idle accessibility event stream is bounded
191 | * by the globalTimeoutMillis
.
192 | *
193 | * @param idleTimeoutMillis The timeout in milliseconds between two events
194 | * to consider the device idle.
195 | * @param globalTimeoutMillis The maximal global timeout in milliseconds in
196 | * which to wait for an idle state.
197 | *
198 | * @throws TimeoutException If no idle state was detected within
199 | * globalTimeoutMillis.
200 | */
201 | public void waitForIdle(long idleTimeoutMillis, long globalTimeoutMillis)
202 | throws TimeoutException {
203 | synchronized (mLock) {
204 | throwIfNotConnectedLocked();
205 | final long startTimeMillis = SystemClock.uptimeMillis();
206 | if (mLastEventTimeMillis <= 0) {
207 | mLastEventTimeMillis = startTimeMillis;
208 | }
209 | while (true) {
210 | final long currentTimeMillis = SystemClock.uptimeMillis();
211 | // Did we get idle state within the global timeout?
212 | final long elapsedGlobalTimeMillis = currentTimeMillis - startTimeMillis;
213 | final long remainingGlobalTimeMillis =
214 | globalTimeoutMillis - elapsedGlobalTimeMillis;
215 | if (remainingGlobalTimeMillis <= 0) {
216 | throw new TimeoutException("No idle state with idle timeout: "
217 | + idleTimeoutMillis + " within global timeout: "
218 | + globalTimeoutMillis);
219 | }
220 | // Did we get an idle state within the idle timeout?
221 | final long elapsedIdleTimeMillis = currentTimeMillis - mLastEventTimeMillis;
222 | final long remainingIdleTimeMillis = idleTimeoutMillis - elapsedIdleTimeMillis;
223 | if (remainingIdleTimeMillis <= 0) {
224 | return;
225 | }
226 | try {
227 | mLock.wait(remainingIdleTimeMillis);
228 | } catch (InterruptedException ie) {
229 | /* ignore */
230 | }
231 | }
232 | }
233 | }
234 | ```
235 |
236 | ## Android Device Monitor
237 |
238 | https://developer.android.com/studio/profile/monitor
239 |
240 | The `Android Device Monitor` in the Android SDK toolset has been deprecated:
241 |
242 | >Android Device Monitor was deprecated in Android Studio 3.1 and removed from Android Studio 3.2. The features that you could use through the Android Device Monitor have been replaced by new features. The table below helps you decide which features you should use instead of these deprecated and removed features.
243 |
244 | The official replacement, `Layout Inspector`, is more capable and better looking, but still immature; compared with the iOS gem [Reveal](https://revealapp.com/), it has some way to go.
245 | https://developer.android.com/studio/debug/layout-inspector
246 |
247 | ## uiparser
248 |
249 | A Python demo implemented by following the official MonkeyRunner documentation.
250 |
251 | https://github.com/9468305/python-script/tree/master/monkeyrunner
252 |
--------------------------------------------------------------------------------
/monkeyrunner/uiparser.py:
--------------------------------------------------------------------------------
1 | #! $ANDROID_HOME/tools/bin monkeyrunner
2 | # -*- coding: utf-8 -*-
3 | '''uiparser'''
4 |
5 | import os
6 | import sys
7 | import subprocess
8 | import datetime
9 | import logging
10 | from com.android.monkeyrunner import MonkeyRunner, MonkeyDevice, MonkeyImage #pylint: disable=import-error
11 |
12 | class NullHandler(logging.Handler):
13 | def emit(self, record):
14 | pass
15 |
16 | logging.getLogger(__name__).addHandler(NullHandler())
17 | logging.basicConfig(level=logging.DEBUG)
18 |
19 | SHORT = 1
20 | MIDDLE = 5
21 | LONG = 15
22 |
23 | ADB = os.path.join(os.environ['ANDROID_HOME'], 'platform-tools', 'adb')
24 |
25 | # Example of Ctrip Android Apk
26 | TARGET_PACKAGE = 'ctrip.android.view'
27 | LAUNCH_ACTIVITY = 'ctrip.business.splash.CtripSplashActivity'
28 | HOME_ACTIVITY = 'ctrip.android.publicproduct.home.view.CtripHomeActivity'
29 | FLIGHT_ACTIVITY = 'ctrip.android.flight.view.inland.FlightInquireActivity'
30 | START_COMPONENT = TARGET_PACKAGE + '/' + LAUNCH_ACTIVITY
31 |
32 | DEVICE_DIR = '/sdcard/uiparser/'
33 | HOST_DIR = './'
34 |
35 |
36 | def capture(device, index):
37 |     '''Dump the UI hierarchy XML for this index and save a screenshot.'''
38 | _dumpXML = DEVICE_DIR + index + '.xml'
39 | _localXML = HOST_DIR + index + '.xml'
40 | _localImage = HOST_DIR + index + '.png'
41 |
42 | _shell = [ADB, 'shell', 'uiautomator', 'dump', _dumpXML]
43 | logging.debug(datetime.datetime.now())
44 | subprocess.call(_shell) # Stupid uiautomator, always failed here!
45 | logging.debug(datetime.datetime.now())
46 | #MonkeyRunner.sleep(MIDDLE)
47 |
48 | _shell = [ADB, 'pull', _dumpXML, _localXML]
49 | subprocess.call(_shell)
50 |
51 | _image = device.takeSnapshot()
52 | _image.writeToFile(_localImage, 'png')
53 |
54 |
55 | def uiparser():
56 | '''Main Entry'''
57 | device = MonkeyRunner.waitForConnection(MIDDLE)
58 |
59 | _shell = [ADB, 'shell', 'rm', '-rf', DEVICE_DIR]
60 | subprocess.call(_shell)
61 |
62 | _shell = [ADB, 'shell', 'mkdir', '-p', DEVICE_DIR]
63 | subprocess.call(_shell)
64 |
65 | device.startActivity(component=START_COMPONENT)
66 | MonkeyRunner.sleep(MIDDLE)
67 |
68 | capture(device, str(0))
69 |
70 |
71 | if __name__ == "__main__":
72 | # MonkeyRunner Jython version is 2.5.3 (Outdated!)
73 | logging.info(sys.version)
74 | uiparser()
75 |
--------------------------------------------------------------------------------
/nacao_v1/README.md:
--------------------------------------------------------------------------------
1 | This time the crawl target is the National Organization Code Administration Center, http://www.nacao.org.cn .
2 | The search keyword is a company's Chinese name; the result is the company's Unified Social Credit Code (tax ID).
3 | The implementation is trivial: just two HTTP requests, one to search the company list and one to page through the results.
4 | Before writing any code, let's look at the site's front-end design and the technical choices that deserve a roasting.
5 |
6 | ### Gripe 1: the site is only in service 12 hours a day
7 |
8 | >Important notice: the verification platform's service hours are 7*12 (i.e. 8:00-20:00 every day)
9 |
10 | [Zhihu: Why does the National Organization Code Administration Center site (www.nacao.org.cn) only accept queries during business hours?](https://www.zhihu.com/question/33204926)
11 | The page's JavaScript hard-codes the time check and throws an error outside the allowed window, while the server keeps running around the clock.
12 | Round one: the developers checked the browser's system clock. Users learned to change their computer's time.
13 | Round two: the developers switched to checking the server's clock. Users learned to bypass the JavaScript check entirely.
14 | Round three: the developers simply commented out the front-end JavaScript check. (Whether the server also enforces the window has not been verified.)
15 |
16 | ### Gripe 2: information verification system vs. verification-result publication system
17 |
18 | Top to bottom, the home page offers two query systems.
19 | The first is the `全国统一社会信用代码信息核查系统` (information verification system): enter a keyword and an alphabetic image CAPTCHA pops up.
20 | The second is the `全国统一社会信用代码信息校核结果公示系统` (verification-result publication system): enter a keyword and it jumps straight to the results page!
21 | The two return essentially the same data, so who would still bother with the first, CAPTCHA-gated system?
22 | Repeated testing also shows the first system has an IP-based anti-crawling mechanism: once banned, an IP stays blocked for 3 business days. The second system can be used freely, with no limits.
23 |
24 | ```txt
25 | What if my IP gets banned?
26 | This verification platform was set up for the public to perform ordinary lookups of Unified Social Credit Code information and does not support high-volume, high-frequency queries. If the figure below appears while you are querying, your queries were too frequent and the system has restricted you for 3 business days. Please query again after the restriction period.
27 | ```
28 |
29 | ### Gripe 3: a suspected SQL injection vulnerability
30 |
31 | Before analyzing the suspected SQL injection, let's walk through the crawler implementation for system 2, i.e. what each HTTP parameter means.
32 |
33 | #### Keyword search
34 |
35 | GET http://125.35.63.141:8080/PublicNotificationWeb/search.do
36 | Parameters: searchText = the keyword, searchType = 3.
37 | Response: a Referer string, used later as an HTTP header for query.do.
38 |
39 | #### Paged data retrieval
40 |
41 | POST http://125.35.63.141:8080/PublicNotificationWeb/query.do
42 | Parameters:
43 |
44 | ```python
45 | _params = [ ('pageSize', 20),
46 | ('searchText', keyword),
47 | ('searchType', 3),
48 | ('DJBMBM', ''),
49 | ('sortField', ''),
50 | ('currentPage', 1)]
51 | ```
52 |
53 | Meaning:
54 |
55 | + pageSize: items per page, 20 by default; testing shows it can be raised to 100, i.e. 100 records per page.
56 | + searchText: the search keyword
57 | + searchType: search type, fixed at 3
58 | + DJBMBM: unknown, always empty
59 | + sortField: unknown, always empty
60 | + currentPage: the current page index; testing shows at most 5 pages, with values 0-4.
61 |
62 | Response: a JSON string with the following fields (a minimal request sketch follows the list):
63 |
64 | + totalPage: total number of pages, at most 5.
65 | + foundCount: total number of matching results
66 | + dataList: a JSON array
67 |   + JGMC: company name
68 |   + TYSHXYDM: Unified Social Credit Code (tax ID)
69 |
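Putting the two requests together, a minimal single-page sketch of the flow described above, using the requests library (the full paginated implementation is nacao_v1.py in this directory):

```python
import requests

HOST = 'http://125.35.63.141:8080'

def fetch_codes(keyword):
    '''One search.do call for the Referer, then one query.do call for the first page.'''
    with requests.Session() as session:
        _search = session.get(HOST + '/PublicNotificationWeb/search.do',
                              params={'searchText': keyword, 'searchType': 3},
                              timeout=20)
        _query = session.post(HOST + '/PublicNotificationWeb/query.do',
                              headers={'Referer': _search.url,
                                       'X-Requested-With': 'XMLHttpRequest'},
                              data={'pageSize': 100,
                                    'searchText': keyword,
                                    'searchType': 3,
                                    'DJBMBM': '',
                                    'sortField': '',
                                    'currentPage': ''},
                              timeout=20)
        return [(_i['JGMC'], _i['TYSHXYDM']) for _i in _query.json()['dataList']]
```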
70 | That's the entire crawler. Next, let's probe what each parameter allows.
71 |
72 | + pageSize: raised from 20 to 100, it yields up to 500 records per run (100 per page, 5 pages max).
73 | + searchText: set to *, it returns the server database in its default sort order; foundCount then equals the total number of rows in the site's database.
74 |
75 | See [sql_injection.txt](https://github.com/9468305/python-script/tree/master/nacao_v1/sql_injection.txt) for the captured test log.
76 | So could SQL injection be used here to gain database access and dump it outright? That is left for the curious to explore.
77 |
78 | ### Postscript
79 |
80 | The official site has been redesigned and the original endpoints no longer work. The new V2.0 approach:
81 | https://github.com/9468305/python-script/tree/master/nacao_v2
82 |
--------------------------------------------------------------------------------
/nacao_v1/constants.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # -*- coding: utf-8 -*-
3 | '''
4 | Common constants for HTTP requests
5 | '''
6 |
7 | ACCEPT_ANY = '*/*'
8 |
9 | ACCEPT_TEXT = 'text/plain, */*; q=0.01'
10 |
11 | ACCEPT_HTML = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
12 |
13 | ACCEPT_JSON = 'application/json, text/javascript, */*; q=0.01'
14 |
15 | ACCEPT_IMAGE = 'image/webp,image/*,*/*;q=0.8'
16 |
17 | ACCEPT_LANGUAGE = 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2'
18 |
19 | UA_CHROME_WIN = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
20 |
21 | UA_CHROME_MAC = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
22 |
23 | USER_AGENT = UA_CHROME_MAC
24 |
--------------------------------------------------------------------------------
/nacao_v1/nacao_v1.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # -*- coding: utf-8 -*-
3 | '''
4 | 全国组织机构代码管理中心 (National Organization Code Administration Center)
5 | http://www.nacao.org.cn/
6 | 全国统一社会信用代码信息校核结果公示系统 (verification-result publication system)
7 | http://125.35.63.141:8080/PublicNotificationWeb/search.do
8 | '''
9 |
10 | import logging
11 | from logging import NullHandler
12 | import requests
13 | import constants
14 |
15 | logging.getLogger(__name__).addHandler(NullHandler())
16 | logging.basicConfig(level=logging.DEBUG)
17 |
18 | HOST = 'http://125.35.63.141:8080'
19 |
20 | # Responses are sometimes very slow; adjust the timeout threshold as needed
21 | TIMEOUT = 20
22 |
23 |
24 | def get_search(session, keyword):
25 |     '''Search for keyword; returns the result URL, used later as the Referer.'''
26 | _url = HOST + '/PublicNotificationWeb/search.do'
27 | logging.debug('GET ' + _url)
28 | _headers = {'Accept': constants.ACCEPT_HTML,
29 | 'Accept-Language': constants.ACCEPT_LANGUAGE,
30 | 'User-Agent': constants.USER_AGENT}
31 | _params = [('searchText', keyword),
32 | ('searchType', 3)]
33 | _response = session.get(_url, headers=_headers, params=_params, timeout=TIMEOUT)
34 | logging.debug('response code:' + str(_response.status_code))
35 | return _response.url if _response.status_code == 200 else None
36 |
37 |
38 | def post_query(session, keyword, referer, current_page):
39 |     '''Fetch one page of JSON data for keyword.'''
40 | _url = HOST + '/PublicNotificationWeb/query.do'
41 | logging.debug('POST ' + _url)
42 | _headers = {'Accept': constants.ACCEPT_ANY,
43 | 'Accept-Language': constants.ACCEPT_LANGUAGE,
44 | 'User-Agent': constants.USER_AGENT,
45 | 'Referer': referer,
46 | 'X-Requested-With': 'XMLHttpRequest',
47 | 'Origin': HOST}
48 |     _params = [('pageSize', 100),
49 |                ('searchText', keyword),
50 |                ('searchType', 3),
51 |                ('DJBMBM', ''),
52 |                ('sortField', ''),
53 |                ('currentPage', current_page if current_page > 1 else '')] # page 1 is requested with an empty value
54 | _response = session.post(_url, headers=_headers, data=_params, timeout=TIMEOUT)
55 | logging.debug('response code: ' + str(_response.status_code))
56 | logging.debug('response text: ' + _response.text)
57 | return _response.json() if _response.status_code == 200 else None
58 |
59 |
60 | def query_keyword(session, keyword):
61 |     '''Query keyword over the given session, collecting company/code pairs from every page.'''
62 | _referer = get_search(session, keyword)
63 | if not _referer:
64 | return None
65 |
66 | _code_all = []
67 | _current_page = 0
68 | _total_page = 5
69 | while _current_page < _total_page:
70 | _current_page += 1
71 | _json_obj = post_query(session, keyword, _referer, _current_page)
72 | if _json_obj:
73 | _total_page = _json_obj['totalPage']
74 | _found_count = _json_obj['foundCount']
75 | _data_list = _json_obj['dataList']
76 | if _found_count and _data_list:
77 | for _i in _data_list:
78 |                     _code_all.append((_i['JGMC'], _i['TYSHXYDM'])) # .json() already yields str under Python 3; encoding to bytes would break the str concatenation in query()
79 | else:
80 | break
81 |
82 | return _code_all
83 |
84 |
85 | def query():
86 | '''query entry'''
87 | try:
88 | with requests.Session() as session:
89 |             # '*' is a wildcard and can be replaced with any company name; the endpoint may be SQL-injectable (not verified in depth).
90 | _code_all = query_keyword(session, '*')
91 | if _code_all:
92 | logging.info(len(_code_all))
93 | for _r in _code_all:
94 | logging.info(_r[0] + ' : ' + _r[1])
95 | except requests.RequestException as _e:
96 | logging.error(_e)
97 |
98 |
99 | if __name__ == "__main__":
100 | query()
101 |
--------------------------------------------------------------------------------
/nacao_v2/README.md:
--------------------------------------------------------------------------------
1 | Picking up where the previous write-up left off: the [全国组织机构代码管理中心](http://www.nacao.org.cn) (National Organization Code Administration Center) site has been redesigned and the V1.0 approach no longer works. Let's analyze what changed in this redesign and how the V2.0 approach is implemented.
2 |
3 | ### IP address replaced by a domain
4 |
5 | 125.35.63.141:8080 became dmedu.org.cn (the domain still resolves to 125.35.63.141), so both API endpoints changed.
6 |
7 | Old:
8 |
9 | http://125.35.63.141:8080/PublicNotificationWeb/search.do
10 | http://125.35.63.141:8080/PublicNotificationWeb/query.do
11 |
12 | New:
13 |
14 | http://www.dmedu.org.cn/search.do
15 | http://www.dmedu.org.cn/query.do
16 |
17 | ### search.do can be skipped
18 |
19 | search.do returns a Referer string that is used as an HTTP header for query.do. Testing shows this is no longer needed: call query.do directly and leave Referer empty, as in the sketch below.
20 |
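A minimal sketch of the Referer-free call described above (the full paginated script is nacao_v2.py in this directory); the parameters are the same ones documented in the V1.0 write-up:

```python
import requests

# query.do called directly: no prior search.do call, no Referer header.
_response = requests.post('http://www.dmedu.org.cn/query.do',
                          headers={'X-Requested-With': 'XMLHttpRequest'},
                          data={'pageSize': 100, 'searchText': '*', 'searchType': 3,
                                'DJBMBM': '', 'sortField': '', 'currentPage': ''},
                          timeout=20)
print(_response.json()['foundCount'])
```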
21 | ### 参数一致
22 |
23 | + The parameters are identical between V1.0 and V2.0.
24 | + pageSize can still be raised from 20 to 100
25 | + searchText still accepts the * wildcard
26 |
27 | **So the V2.0 code is simpler.**
28 |
29 | ### Postscript
30 |
31 | The official site has since added an image-character CAPTCHA to this endpoint, so this approach no longer works either. For recognizing such CAPTCHAs, search for "远程打码" (remote CAPTCHA-solving services).
32 |
--------------------------------------------------------------------------------
/nacao_v2/constants.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # -*- coding: utf-8 -*-
3 | '''
4 | Common constants for HTTP requests
5 | '''
6 |
7 | ACCEPT_ANY = '*/*'
8 |
9 | ACCEPT_TEXT = 'text/plain, */*; q=0.01'
10 |
11 | ACCEPT_HTML = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
12 |
13 | ACCEPT_JSON = 'application/json, text/javascript, */*; q=0.01'
14 |
15 | ACCEPT_IMAGE = 'image/webp,image/*,*/*;q=0.8'
16 |
17 | ACCEPT_LANGUAGE = 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2'
18 |
19 | UA_CHROME_WIN = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
20 |
21 | UA_CHROME_MAC = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
22 |
23 | USER_AGENT = UA_CHROME_MAC
24 |
--------------------------------------------------------------------------------
/nacao_v2/nacao_v2.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # -*- coding: utf-8 -*-
3 | '''
4 | 全国组织机构代码管理中心 (National Organization Code Administration Center)
5 | http://www.nacao.org.cn/
6 | 全国统一社会信用代码信息校核结果公示系统 (verification-result publication system)
7 | http://www.dmedu.org.cn/query.do
8 | '''
9 |
10 | import json
11 | import logging
12 | from logging import NullHandler
13 | import requests
14 | import constants
15 |
16 | logging.getLogger(__name__).addHandler(NullHandler())
17 | logging.basicConfig(level=logging.INFO)
18 |
19 | HOST = 'http://www.dmedu.org.cn'
20 |
21 | # Responses are sometimes very slow; adjust the timeout threshold as needed
22 | TIMEOUT = 20
23 |
24 | def post_query(session, keyword, current_page):
25 |     '''Fetch one page of JSON data for keyword.'''
26 | _url = HOST + '/query.do'
27 | logging.debug('POST ' + _url)
28 | _headers = {'Accept': constants.ACCEPT_ANY,
29 | 'Accept-Language': constants.ACCEPT_LANGUAGE,
30 | 'User-Agent': constants.USER_AGENT,
31 |                 #'Referer': referer, # no longer required; query.do works without it
32 | 'X-Requested-With': 'XMLHttpRequest',
33 | 'Origin': HOST}
34 | _params = [('pageSize', 100),
35 | ('searchText', keyword),
36 | ('searchType', 3),
37 | ('DJBMBM', ''),
38 | ('sortField', ''),
39 |                ('currentPage', current_page if current_page > 1 else '')] # page 1 is requested with an empty value
40 | _response = session.post(_url, headers=_headers, data=_params, timeout=TIMEOUT)
41 | logging.debug('response code: ' + str(_response.status_code))
42 | logging.debug('response text: ' + _response.text)
43 | return _response.json() if _response.status_code == 200 else None
44 |
45 |
46 | def query_keyword(session, keyword):
47 |     '''Query keyword, printing each page of JSON.'''
48 | _current_page = 0
49 | _total_page = 5 # Max
50 | while _current_page < _total_page:
51 | _current_page += 1
52 | _json_obj = post_query(session, keyword, _current_page)
53 | if _json_obj:
54 | _total_page = _json_obj['totalPage']
55 | print(json.dumps(_json_obj, indent=2, sort_keys=True, ensure_ascii=False))
56 | else:
57 | break
58 |
59 |
60 | def query():
61 | '''query entry'''
62 | try:
63 | with requests.Session() as session:
64 |             # '*' is a wildcard and can be replaced with any company name; the endpoint may be SQL-injectable (not verified in depth).
65 | query_keyword(session, '*')
66 | except requests.RequestException as _e:
67 | logging.error(_e)
68 |
69 |
70 | if __name__ == "__main__":
71 | query()
72 |
--------------------------------------------------------------------------------