├── .gitignore ├── README.md ├── decrypt.py ├── demo.html ├── demo ├── __init__.py └── get_cookie.py └── demo2 ├── __init__.py ├── get_cookie.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # celery beat schedule file 95 | celerybeat-schedule 96 | 97 | # SageMath parsed files 98 | *.sage.py 99 | 100 | # Environments 101 | .env 102 | .venv 103 | env/ 104 | venv/ 105 | ENV/ 106 | env.bak/ 107 | venv.bak/ 108 | 109 | # Spyder project settings 110 | .spyderproject 111 | .spyproject 112 | 113 | # Rope project settings 114 | .ropeproject 115 | 116 | # mkdocs documentation 117 | /site 118 | 119 | # mypy 120 | .mypy_cache/ 121 | .dmypy.json 122 | dmypy.json 123 | 124 | # Pyre type checker 125 | .pyre/ 126 | 127 | 将删除 .idea/ 128 | 将删除 demo/__init__.py 129 | .idea 130 | venv 131 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 文书网cookie获取两种方式 2 | 3 | > 本项目只做技术探讨,请勿用于违法用途。 4 | 5 | ## 第一种获取方式 6 | 7 | > 文书网cookie获取 2020-04-16(通过splash获取cookie) 8 | 9 | 获取cookie的demo请见[demo](./demo/get_cookie.py) 10 | 11 | 12 | ⚠️如果获取cookie的地址是 `https://wenshu.court.gov.cn/` 这是https,那么是不会返回 `80S` 和 `80T` 这两个cookie的,返回的是 `443S` 和 `443T` 13 | 14 | ⚠️获取cookie的地址是 `http://wenshu.court.gov.cn/` 这是http的,那么才会返回 `80S` 和 `80T` 这两个cookie的 15 | 16 | 17 | 获取的 cookie 是通过 http的链接获取,后面的爬取也用http 18 | 19 | 获取 cookie 是通过 https 获取的,后面的爬取也用https 即可 20 | 21 | 22 | ⚠️注意:请替换为 `https` , `http` 已阵亡!!!!!!!!!!!!!!!!!! 23 | 24 | 25 | ### 获取教程 26 | 27 | 一. 安装splash 28 | 29 | 推荐docker启动一个splash容器 30 | ``` 31 | docker run -it -p 8050:8050 scrapinghub/splash 32 | ``` 33 | 34 | [splash安装教程📖](https://splash.readthedocs.io/en/stable/install.html#linux-docker) 35 | 36 | 二. 通过splash获取这三个cookie,代码如下 37 | 38 | [demo](./demo/get_cookie.py) 39 | 40 | 41 | ## 第二种获取方式 42 | 43 | > 通过 PyQtWebEngine 获取cookie 44 | 45 | 示例代码如 [demo2](./demo2/get_cookie.py) 46 | 47 | 48 | 49 | # 反爬应对措施,2019-10-24 再次更新 50 | 51 | > 返回响应代码 202 的解决方式(这个更新了,需要重新逆向这个js,最近没太多时间) 52 | 53 | 1. 这次如果按照以前的请求方式,会返回一个html页面 54 | 55 | 56 | 这个页面如 [demo.html](https://github.com/nciefeiniu/wenshu/blob/master/demo.html) 57 | 58 | 59 | ```html 60 | 61 | 62 | 63 | 64 | 67 | 72 | 73 | 74 | 75 | 76 | ``` 77 | 78 | 79 | 2. 用浏览器打开这个文件,会发现会重定向到一个新的URL 80 | 81 | 如:http://localhost:63342/WZWSREL3dlYnNpdGUvcGFyc2UvcmVzdC5xNHc=?wzwschallenge=V1pXU19DT05GSVJNX1BSRUZJWF9MQUJFTDQxNjUyNzE= 82 | 83 | 因为是本地打开的,所以域名是`localhost:63342` 84 | 85 | 3. 把这个本地地址换成 `http://wenshu.court.gov.cn`这个后 86 | 87 | 神奇的事情发生了,可以获取到数据了。而且后面的请求也没返回这个 `html` 文件了。 88 | 89 | 90 | #### 所以这次反爬解决方案 91 | 92 | 1. 在请求返回的地方增加一个判断,如果是 `html` 文件,那么就解析这个文件,获取新的URL,并重试,发送 `post` 请求即可。 93 | 94 | 2. 这个html怎么解析?? 95 | 96 | ~这个可以看看 @songguoxiong 的项目下的 [decrypt.py文件](https://github.com/songguoxiong/wenshu_utils/blob/master/wenshu_utils/old/wzws/decrypt.py)~ 97 | 98 | 请看 `decrypt.py` 文件 99 | 100 | ⚠️ 注意 splash返回的cookie中,需要去除 `wzws_cid` 这个cookie -------------------------------------------------------------------------------- /decrypt.py: -------------------------------------------------------------------------------- 1 | import re 2 | import base64 3 | 4 | from urllib.parse import urljoin 5 | 6 | 7 | _pattern = re.compile(r"dynamicurl\|(?P.+?)\|wzwsquestion\|(?P.+?)\|wzwsfactor\|(?P\d+)") 8 | 9 | 10 | def decrypt_wzws(text: str) -> str: 11 | # noinspection PyBroadException 12 | try: 13 | return _decrypt_by_python(text) 14 | except Exception: 15 | print("解析html错误") 16 | 17 | 18 | def _decrypt_by_python(text: str) -> str: 19 | base_url = "http://wenshu.court.gov.cn" 20 | 21 | group_dict = _pattern.search(text).groupdict() 22 | question = group_dict["question"] 23 | factor = int(group_dict["factor"]) 24 | path = group_dict["path"] 25 | 26 | label = "WZWS_CONFIRM_PREFIX_LABEL{}".format(sum(ord(i) for i in question) * factor + 111111) 27 | challenge = base64.b64encode(label.encode()).decode() 28 | 29 | dynamic_url = urljoin(base_url, path) 30 | dynamic_url = "{url}?{query}".format(url=dynamic_url, query="wzwschallenge={}".format(challenge)) 31 | return dynamic_url 32 | 33 | 34 | if __name__ == "__main__": 35 | with open("demo.html") as f: 36 | _content = f.read() 37 | _resp = decrypt_wzws(_content) 38 | print(_resp) 39 | -------------------------------------------------------------------------------- /demo.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 8 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /demo/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | -------------------------------------------------------------------------------- /demo/get_cookie.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import json 6 | 7 | from dataclasses import dataclass 8 | from urllib import request 9 | from urllib.parse import urljoin 10 | from typing import List, Dict 11 | 12 | envget = os.environ.get 13 | 14 | 15 | @dataclass 16 | class WSCookie: 17 | splash_url = envget("SPLASH_URL", "http://192.168.3.83:8050") 18 | 19 | def send_request(self, retry_num=0) -> List[Dict]: 20 | if retry_num > 3: 21 | print("尝试重新获取数据3次,还是未获取到cookie,请考虑增加代理") 22 | return [] 23 | post_body = { 24 | "har": "1", 25 | "html5_media": "false", 26 | "http_method": "GET", 27 | "png": 1, 28 | "render_all": False, 29 | "request_body": False, 30 | "resource_timeout": 0, 31 | "response_body": False, 32 | "viewport": "1920x1080", 33 | "wait": 3, 34 | "images": 1, 35 | "html": 1, 36 | "expand": 1, 37 | "timeout": 90, 38 | "url": "http://wenshu.court.gov.cn/", 39 | "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36", 40 | "lua_source": """function main(splash, args) 41 | assert(splash:go(args.url)) 42 | assert(splash:wait(0.5)) 43 | splash.images_enabled = false 44 | return { 45 | cookie = splash:get_cookies() 46 | } 47 | end 48 | """ 49 | } 50 | req = request.Request(url=urljoin(self.splash_url, "/execute"), data=json.dumps(post_body).encode('utf-8'), 51 | headers={"content-type": "application/json"}) 52 | resp = request.urlopen(req) 53 | 54 | if resp.status != 200: 55 | return self.send_request(retry_num+1) 56 | 57 | resp_json = json.loads(resp.read()) 58 | 59 | if 'cookie' not in resp_json: 60 | return self.send_request(retry_num+1) 61 | return resp_json['cookie'] 62 | 63 | @staticmethod 64 | def parse_cookie(cookies: List[Dict]): 65 | return {cookie['name']: cookie['value'] for cookie in cookies if cookie['name'] not in ['wzws_cid', 'SESSION']} 66 | 67 | def get_cookie(self) -> Dict[str, str]: 68 | return self.parse_cookie(self.send_request()) 69 | 70 | 71 | if __name__ == '__main__': 72 | ws_cookie = WSCookie() 73 | _cookie = ws_cookie.get_cookie() 74 | print(_cookie) -------------------------------------------------------------------------------- /demo2/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | -------------------------------------------------------------------------------- /demo2/get_cookie.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | 6 | from typing import Dict 7 | 8 | from PyQt5.QtCore import QEventLoop, QUrl 9 | from PyQt5.QtWidgets import QApplication 10 | from PyQt5.QtWebEngineWidgets import QWebEngineView, QWebEngineProfile 11 | 12 | 13 | def callback(html): 14 | print(html) 15 | 16 | 17 | def get_cookie(url: str) -> Dict[str, str]: 18 | 19 | class Render(QWebEngineView): 20 | cookies = {} 21 | html = None 22 | 23 | def __init__(self, url): 24 | self.app = QApplication(sys.argv) 25 | super(Render, self). __init__() 26 | self.page().profile().setHttpUserAgent( 27 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36" 28 | ) 29 | self.resize(1920, 1080) 30 | self.loadFinished.connect(self._loadFinished) 31 | self.load(QUrl(url)) 32 | 33 | QWebEngineProfile.defaultProfile().cookieStore().cookieAdded.connect(self._onCookieAdd) 34 | 35 | while self.html is None: 36 | self.app.processEvents(QEventLoop.ExcludeUserInputEvents | QEventLoop.ExcludeSocketNotifiers | QEventLoop.WaitForMoreEvents) 37 | 38 | def _onCookieAdd(self, cookie): 39 | print(cookie.domain()) 40 | if cookie.domain() != 'wenshu.court.gov.cn': 41 | return 42 | name = cookie.name().data().decode('utf-8') 43 | value = cookie.value().data().decode('utf-8') 44 | self.cookies[name] = value 45 | 46 | def _callable(self, data): 47 | self.html = data 48 | 49 | def _loadFinished(self): 50 | self.page().toHtml(self._callable) 51 | 52 | def __del__(self): 53 | self.app.quit() 54 | 55 | return Render(url).cookies 56 | 57 | 58 | if __name__ == '__main__': 59 | urll = "http://wenshu.court.gov.cn/" 60 | print(get_cookie(urll)) -------------------------------------------------------------------------------- /demo2/requirements.txt: -------------------------------------------------------------------------------- 1 | PyQt5 2 | PyQtWebEngine --------------------------------------------------------------------------------