├── .gitignore
├── README.md
├── decrypt.py
├── demo.html
├── demo
├── __init__.py
└── get_cookie.py
└── demo2
├── __init__.py
├── get_cookie.py
└── requirements.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | ### Python template
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | pip-wheel-metadata/
26 | share/python-wheels/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | MANIFEST
31 |
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 |
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 |
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .nox/
46 | .coverage
47 | .coverage.*
48 | .cache
49 | nosetests.xml
50 | coverage.xml
51 | *.cover
52 | .hypothesis/
53 | .pytest_cache/
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # celery beat schedule file
95 | celerybeat-schedule
96 |
97 | # SageMath parsed files
98 | *.sage.py
99 |
100 | # Environments
101 | .env
102 | .venv
103 | env/
104 | venv/
105 | ENV/
106 | env.bak/
107 | venv.bak/
108 |
109 | # Spyder project settings
110 | .spyderproject
111 | .spyproject
112 |
113 | # Rope project settings
114 | .ropeproject
115 |
116 | # mkdocs documentation
117 | /site
118 |
119 | # mypy
120 | .mypy_cache/
121 | .dmypy.json
122 | dmypy.json
123 |
124 | # Pyre type checker
125 | .pyre/
126 |
# (stray "git clean" console output removed; the patterns below cover these)
129 | .idea
130 | venv
131 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 文书网cookie获取两种方式
2 |
3 | > 本项目只做技术探讨,请勿用于违法用途。
4 |
5 | ## 第一种获取方式
6 |
7 | > 文书网cookie获取 2020-04-16(通过splash获取cookie)
8 |
9 | 获取cookie的demo请见[demo](./demo/get_cookie.py)
10 |
11 |
12 | ⚠️如果获取cookie的地址是 `https://wenshu.court.gov.cn/` 这是https,那么是不会返回 `80S` 和 `80T` 这两个cookie的,返回的是 `443S` 和 `443T`
13 |
14 | ⚠️获取cookie的地址是 `http://wenshu.court.gov.cn/` 这是http的,那么才会返回 `80S` 和 `80T` 这两个cookie的
15 |
16 |
如果 cookie 是通过 http 链接获取的,后面的爬取也用 http
18 |
19 | 获取 cookie 是通过 https 获取的,后面的爬取也用https 即可
20 |
21 |
22 | ⚠️注意:请替换为 `https` , `http` 已阵亡!!!!!!!!!!!!!!!!!!
23 |
24 |
25 | ### 获取教程
26 |
27 | 一. 安装splash
28 |
29 | 推荐docker启动一个splash容器
30 | ```
31 | docker run -it -p 8050:8050 scrapinghub/splash
32 | ```
33 |
34 | [splash安装教程📖](https://splash.readthedocs.io/en/stable/install.html#linux-docker)
35 |
36 | 二. 通过splash获取这三个cookie,代码如下
37 |
38 | [demo](./demo/get_cookie.py)
39 |
40 |
41 | ## 第二种获取方式
42 |
43 | > 通过 PyQtWebEngine 获取cookie
44 |
45 | 示例代码如 [demo2](./demo2/get_cookie.py)
46 |
47 |
48 |
49 | # 反爬应对措施,2019-10-24 再次更新
50 |
51 | > 返回响应代码 202 的解决方式(这个更新了,需要重新逆向这个js,最近没太多时间)
52 |
53 | 1. 这次如果按照以前的请求方式,会返回一个html页面
54 |
55 |
56 | 这个页面如 [demo.html](https://github.com/nciefeiniu/wenshu/blob/master/demo.html)
57 |
58 |
59 | ```html
60 |
61 |
62 |
63 |
64 |
67 |
72 |
73 |
74 |
75 |
76 | ```
77 |
78 |
79 | 2. 用浏览器打开这个文件,会发现会重定向到一个新的URL
80 |
81 | 如:http://localhost:63342/WZWSREL3dlYnNpdGUvcGFyc2UvcmVzdC5xNHc=?wzwschallenge=V1pXU19DT05GSVJNX1BSRUZJWF9MQUJFTDQxNjUyNzE=
82 |
83 | 因为是本地打开的,所以域名是`localhost:63342`
84 |
85 | 3. 把这个本地地址换成 `http://wenshu.court.gov.cn`这个后
86 |
87 | 神奇的事情发生了,可以获取到数据了。而且后面的请求也没返回这个 `html` 文件了。
88 |
89 |
90 | #### 所以这次反爬解决方案
91 |
92 | 1. 在请求返回的地方增加一个判断,如果是 `html` 文件,那么就解析这个文件,获取新的URL,并重试,发送 `post` 请求即可。
93 |
94 | 2. 这个html怎么解析??
95 |
96 | ~这个可以看看 @songguoxiong 的项目下的 [decrypt.py文件](https://github.com/songguoxiong/wenshu_utils/blob/master/wenshu_utils/old/wzws/decrypt.py)~
97 |
98 | 请看 `decrypt.py` 文件
99 |
100 | ⚠️ 注意 splash返回的cookie中,需要去除 `wzws_cid` 这个cookie
--------------------------------------------------------------------------------
/decrypt.py:
--------------------------------------------------------------------------------
1 | import re
2 | import base64
3 |
4 | from urllib.parse import urljoin
5 |
6 |
# Matches the pipe-separated token list embedded in the wzws anti-bot page,
# e.g.  dynamicurl|/WZWS...q4w|wzwsquestion|XXXX|wzwsfactor|1234
# The original pattern was missing the group names ("(?P.+?)" is not valid
# regex syntax), which broke re.compile and the groupdict() lookups below.
_pattern = re.compile(
    r"dynamicurl\|(?P<path>.+?)\|wzwsquestion\|(?P<question>.+?)\|wzwsfactor\|(?P<factor>\d+)"
)
8 |
9 |
def decrypt_wzws(text: str) -> str:
    """Parse the wzws anti-bot HTML page and return the follow-up URL.

    Returns None when the page cannot be parsed.  NOTE(review): the
    annotation says ``str`` but the failure path yields None — callers
    must handle both.
    """
    # noinspection PyBroadException
    try:
        return _decrypt_by_python(text)
    except Exception:
        # Deliberate best-effort: the page layout changes from time to time,
        # so a parse failure is reported rather than raised to the caller.
        print("解析html错误")
        return None  # explicit, instead of the original implicit fall-through
16 |
17 |
def _decrypt_by_python(text: str) -> str:
    """Compute the wzws challenge answer from *text* and build the
    redirect URL the server expects."""
    base_url = "http://wenshu.court.gov.cn"

    groups = _pattern.search(text).groupdict()
    question = groups["question"]
    factor = int(groups["factor"])
    path = groups["path"]

    # Challenge = sum of the question's character codes, scaled by the
    # factor, plus a fixed offset — then base64-encoded.
    checksum = sum(map(ord, question)) * factor + 111111
    label = "WZWS_CONFIRM_PREFIX_LABEL{}".format(checksum)
    challenge = base64.b64encode(label.encode()).decode()

    return "{url}?{query}".format(
        url=urljoin(base_url, path),
        query="wzwschallenge={}".format(challenge),
    )
32 |
33 |
if __name__ == "__main__":
    # Quick manual check: feed the saved anti-bot page through the decoder.
    with open("demo.html") as fh:
        print(decrypt_wzws(fh.read()))
--------------------------------------------------------------------------------
/demo.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
8 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/demo/__init__.py:
--------------------------------------------------------------------------------
1 | # !/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
--------------------------------------------------------------------------------
/demo/get_cookie.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import os
5 | import json
6 |
7 | from dataclasses import dataclass
8 | from urllib import request
9 | from urllib.parse import urljoin
10 | from typing import List, Dict
11 |
# Shorthand for os.environ.get, used for configurable defaults below.
envget = os.environ.get
13 |
14 |
@dataclass
class WSCookie:
    """Fetch wenshu.court.gov.cn cookies by rendering the site in Splash.

    Splash executes the site's anti-bot JavaScript; the resulting cookies
    are then read back through Splash's /execute Lua endpoint.
    """

    # Splash HTTP API endpoint; configurable through the SPLASH_URL env var.
    splash_url = os.environ.get("SPLASH_URL", "http://192.168.3.83:8050")

    def send_request(self, retry_num=0) -> List[Dict]:
        """POST a render job to Splash and return its raw cookie list.

        Makes up to ``4 - retry_num`` attempts; returns [] when no attempt
        yields a ``cookie`` key in the response.
        """
        post_body = {
            "har": "1",
            "html5_media": "false",
            "http_method": "GET",
            "png": 1,
            "render_all": False,
            "request_body": False,
            "resource_timeout": 0,
            "response_body": False,
            "viewport": "1920x1080",
            "wait": 3,
            "images": 1,
            "html": 1,
            "expand": 1,
            "timeout": 90,
            "url": "http://wenshu.court.gov.cn/",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36",
            "lua_source": """function main(splash, args)
  assert(splash:go(args.url))
  assert(splash:wait(0.5))
  splash.images_enabled = false
  return {
    cookie = splash:get_cookies()
  }
end
""",
        }
        req = request.Request(
            url=urljoin(self.splash_url, "/execute"),
            data=json.dumps(post_body).encode("utf-8"),
            headers={"content-type": "application/json"},
        )
        # Iterative retry instead of the original recursion; the HTTP
        # response is closed via the context manager (the original leaked it).
        attempt = retry_num
        while attempt <= 3:
            with request.urlopen(req) as resp:
                if resp.status != 200:
                    attempt += 1
                    continue
                resp_json = json.loads(resp.read())
            if "cookie" in resp_json:
                return resp_json["cookie"]
            attempt += 1
        print("尝试重新获取数据3次,还是未获取到cookie,请考虑增加代理")
        return []

    @staticmethod
    def parse_cookie(cookies: List[Dict]) -> Dict[str, str]:
        """Map the Splash cookie list to ``{name: value}``, dropping the
        ``wzws_cid`` and ``SESSION`` cookies (the README warns wzws_cid
        must be removed before crawling)."""
        return {
            cookie["name"]: cookie["value"]
            for cookie in cookies
            if cookie["name"] not in ("wzws_cid", "SESSION")
        }

    def get_cookie(self) -> Dict[str, str]:
        """Render the site once and return the usable cookies."""
        return self.parse_cookie(self.send_request())
69 |
70 |
if __name__ == '__main__':
    # Manual smoke test: fetch and print the parsed cookies.
    print(WSCookie().get_cookie())
--------------------------------------------------------------------------------
/demo2/__init__.py:
--------------------------------------------------------------------------------
1 | # !/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
--------------------------------------------------------------------------------
/demo2/get_cookie.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import sys
5 |
6 | from typing import Dict
7 |
8 | from PyQt5.QtCore import QEventLoop, QUrl
9 | from PyQt5.QtWidgets import QApplication
10 | from PyQt5.QtWebEngineWidgets import QWebEngineView, QWebEngineProfile
11 |
12 |
def callback(html):
    """Debug helper: dump the HTML that QWebEngine hands back."""
    print(html)
15 |
16 |
def get_cookie(url: str) -> Dict[str, str]:
    """Open *url* in a QtWebEngine view and harvest the cookies set for
    wenshu.court.gov.cn while the page (and its anti-bot JS) loads.

    Blocks until the page's HTML has been delivered, then returns a
    name -> value dict of the captured cookies.
    """

    class Render(QWebEngineView):
        # Class attributes shared by instances; acceptable here because a
        # fresh QApplication/Render pair is built per get_cookie() call.
        cookies = {}
        html = None

        def __init__(self, url):
            # A QApplication must exist before any QWidget is constructed.
            self.app = QApplication(sys.argv)
            super(Render, self). __init__()
            # Spoof a desktop Chrome UA, presumably so the anti-bot page
            # serves the normal flow — TODO confirm it is required.
            self.page().profile().setHttpUserAgent(
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"
            )
            self.resize(1920, 1080)
            self.loadFinished.connect(self._loadFinished)
            self.load(QUrl(url))

            # Record every cookie the engine stores while the page loads.
            QWebEngineProfile.defaultProfile().cookieStore().cookieAdded.connect(self._onCookieAdd)

            # Pump the Qt event loop until _callable() has set self.html;
            # WaitForMoreEvents keeps this loop from busy-spinning.
            while self.html is None:
                self.app.processEvents(QEventLoop.ExcludeUserInputEvents | QEventLoop.ExcludeSocketNotifiers | QEventLoop.WaitForMoreEvents)

        def _onCookieAdd(self, cookie):
            print(cookie.domain())
            # Keep only cookies belonging to the target domain.
            if cookie.domain() != 'wenshu.court.gov.cn':
                return
            name = cookie.name().data().decode('utf-8')
            value = cookie.value().data().decode('utf-8')
            self.cookies[name] = value

        def _callable(self, data):
            # toHtml() callback; assigning html ends the wait loop above.
            self.html = data

        def _loadFinished(self):
            self.page().toHtml(self._callable)

        def __del__(self):
            # Shut the QApplication down when the view is garbage-collected.
            self.app.quit()

    return Render(url).cookies
56 |
57 |
if __name__ == '__main__':
    # Manual smoke test against the live site.
    target_url = "http://wenshu.court.gov.cn/"
    print(get_cookie(target_url))
--------------------------------------------------------------------------------
/demo2/requirements.txt:
--------------------------------------------------------------------------------
1 | PyQt5
2 | PyQtWebEngine
--------------------------------------------------------------------------------