├── tests ├── flow.pcapng └── demo.py ├── FlowAnalyzer ├── __init__.py ├── Path.py ├── logging_config.py └── FlowAnalyzer.py ├── LICENSE ├── README.md ├── setup.py └── requirements.txt /tests/flow.pcapng: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Byxs20/FlowAnalyzer/HEAD/tests/flow.pcapng -------------------------------------------------------------------------------- /FlowAnalyzer/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["FlowAnalyzer"] 2 | 3 | from .FlowAnalyzer import FlowAnalyzer 4 | -------------------------------------------------------------------------------- /FlowAnalyzer/Path.py: -------------------------------------------------------------------------------- 1 | # windows 2 | tshark_path = r"C:\Program Files\Wireshark\tshark.exe" 3 | 4 | def get_default_tshark_path() -> str: 5 | return tshark_path -------------------------------------------------------------------------------- /FlowAnalyzer/logging_config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def configure_logger(logger_name, level=logging.DEBUG) -> logging.Logger: 5 | # 创建一个 logger 对象 6 | logger = logging.getLogger(logger_name) 7 | logger.setLevel(level) 8 | 9 | # 创建一个处理器,将日志输出到控制台 10 | console_handler = logging.StreamHandler() 11 | logger.addHandler(console_handler) 12 | 13 | # 创建一个格式化器,定义日志的输出格式 14 | formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") 15 | console_handler.setFormatter(formatter) 16 | return logger 17 | 18 | logger = configure_logger("FlowAnalyzer", logging.INFO) 19 | 20 | if __name__ == '__main__': 21 | logger = configure_logger("FlowAnalyzer") 22 | logger.info("This is a test!") 23 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Pang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FlowAnalyzer 2 | 3 | # 安装 4 | 5 | 使用 `pip` 安装: 6 | 7 | ``` 8 | pip3 install FlowAnalyzer 9 | ``` 10 | 11 | ``` 12 | pip3 install FlowAnalyzer -i https://pypi.org/simple 13 | ``` 14 | 15 | # 快速上手 16 | 17 | ## 配置 18 | 19 | 如果您安装 `Wireshark` 没有修改安装目录,默认 `tshark` 路径会如下: 20 | 21 | ```python 22 | # windows 23 | tshark_path = r"C:\Program Files\Wireshark\tshark.exe" 24 | ``` 25 | 26 | `Linux`, `MacOS` 默认路径不清楚,需要看下面的**纠正路径**,**确定路径没有问题,那也无需任何配置即可使用!** 27 | 28 | ## 纠正路径 29 | 30 | 修改 `python安装目录\Lib\site-packages\FlowAnalyzer\Path.py` 中的变量 `tshark_path` 改为**tshark正确路径** 31 | 32 | ## 测试 33 | 34 | ``` 35 | $ git clone https://github.com/Byxs20/FlowAnalyzer.git 36 | $ cd ./FlowAnalyzer/ 37 | $ python tests\demo.py 38 | ``` 39 | 40 | 运行结果: 41 | 42 | ``` 43 | [+] 正在处理第1个HTTP流! 44 | 序号: 2请求包, 请求头: b'POST /upload/php_eval_xor_base64.php HTTP/1.1\r\nUser-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0\r\n 45 | ... 46 | ``` 47 | 48 | # Contributing 49 | Feel free to submit issues or pull requests if you have any suggestions, improvements, or bug reports. 
50 | 51 | # License 52 | 53 | This project is licensed under the [MIT License.](LICENSE) 54 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from setuptools import find_packages, setup 4 | 5 | with open(os.path.join(os.path.dirname(__file__), "README.md"), encoding="utf-8") as f: 6 | long_description = f.read() 7 | 8 | setup( 9 | name="FlowAnalyzer", 10 | version="0.4.0", 11 | description="FlowAnalyzer是一个流量分析器,用于解析和处理tshark导出的JSON数据文件", 12 | author="Byxs20", 13 | author_email="97766819@qq.com", 14 | packages=find_packages(exclude=["tests", "*.egg-info"]), 15 | package_data={ 16 | '': ['LICENSE', 'README.md', 'setup.py'], 17 | }, 18 | install_requires=[ 19 | ], 20 | classifiers=[ 21 | "Development Status :: 3 - Alpha", 22 | "Intended Audience :: Developers", 23 | "License :: OSI Approved :: MIT License", 24 | "Programming Language :: Python :: 3", 25 | "Programming Language :: Python :: 3.6", 26 | "Programming Language :: Python :: 3.7", 27 | "Programming Language :: Python :: 3.8", 28 | "Programming Language :: Python :: 3.9", 29 | ], 30 | 31 | long_description=long_description, 32 | long_description_content_type="text/markdown", 33 | url="https://github.com/Byxs20/FlowAnalyzer", 34 | ) 35 | -------------------------------------------------------------------------------- /tests/demo.py: -------------------------------------------------------------------------------- 1 | # sourcery skip: use-fstring-for-formatting 2 | import os 3 | 4 | from FlowAnalyzer import FlowAnalyzer 5 | 6 | baseDir = os.path.dirname(os.path.abspath(__file__)) 7 | flowPath = os.path.join(baseDir, "flow.pcapng") 8 | display_filter = "(http.request and urlencoded-form) or (http.request and data-text-lines) or (http.request and mime_multipart) or (http.response.code == 200 and data-text-lines)" 9 | 10 | json_path = FlowAnalyzer.get_json_data(flowPath, 
display_filter=display_filter) 11 | for http_seq_num, http in enumerate(FlowAnalyzer(json_path).generate_http_dict_pairs(), start=1): 12 | print(f"[+] 正在处理第{http_seq_num}个HTTP流!") 13 | 14 | request, response = http.request, http.response 15 | if request: 16 | request_num, header, file_data, time_epoch = request.frame_num, request.header, request.file_data, request.time_epoch 17 | print("序号: {}请求包, 请求头: {}, 文件: {}, 时间: {}".format(request_num, header, file_data, time_epoch)) 18 | 19 | if response: 20 | response_num, header, file_data, time_epoch = response.frame_num, response.header, response.file_data, response.time_epoch 21 | print("序号: {}响应包, 响应头: {}, 文件: {}, 时间: {}".format(response_num, header, file_data, time_epoch)) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file was autogenerated by uv via the following command: 2 | # uv pip compile - -o requirements.txt 3 | backports-tarfile==1.2.0 4 | # via jaraco-context 5 | certifi==2024.7.4 6 | # via requests 7 | charset-normalizer==3.3.2 8 | # via requests 9 | docutils==0.20.1 10 | # via readme-renderer 11 | idna==3.7 12 | # via requests 13 | importlib-metadata==8.2.0 14 | # via 15 | # keyring 16 | # twine 17 | importlib-resources==6.4.0 18 | # via keyring 19 | jaraco-classes==3.4.0 20 | # via keyring 21 | jaraco-context==5.3.0 22 | # via keyring 23 | jaraco-functools==4.0.1 24 | # via keyring 25 | keyring==25.2.1 26 | # via twine 27 | markdown-it-py==3.0.0 28 | # via rich 29 | mdurl==0.1.2 30 | # via markdown-it-py 31 | more-itertools==10.3.0 32 | # via 33 | # jaraco-classes 34 | # jaraco-functools 35 | nh3==0.2.18 36 | # via readme-renderer 37 | pkginfo==1.10.0 38 | # via twine 39 | pygments==2.18.0 40 | # via 41 | # readme-renderer 42 | # rich 43 | pysocks==1.7.1 44 | pywin32-ctypes==0.2.2 45 | # via keyring 46 | readme-renderer==43.0 47 | # via twine 48 | 
requests==2.32.3 49 | # via 50 | # requests-toolbelt 51 | # twine 52 | requests-toolbelt==1.0.0 53 | # via twine 54 | rfc3986==2.0.0 55 | # via twine 56 | rich==13.7.1 57 | # via twine 58 | setuptools==72.1.0 59 | twine==5.1.1 60 | typing-extensions==4.12.2 61 | # via rich 62 | urllib3==2.2.2 63 | # via 64 | # requests 65 | # twine 66 | wheel==0.43.0 67 | zipp==3.19.2 68 | # via 69 | # importlib-metadata 70 | # importlib-resources 71 | -------------------------------------------------------------------------------- /FlowAnalyzer/FlowAnalyzer.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import gzip 3 | import hashlib 4 | import json 5 | import os 6 | import subprocess 7 | from dataclasses import dataclass 8 | from typing import Dict, Iterable, NamedTuple, Optional, Tuple 9 | from urllib import parse 10 | 11 | from .logging_config import logger 12 | from .Path import get_default_tshark_path 13 | 14 | 15 | @dataclass 16 | class Request: 17 | frame_num: int 18 | header: bytes 19 | file_data: bytes 20 | full_uri: str 21 | time_epoch: float 22 | 23 | 24 | @dataclass 25 | class Response: 26 | frame_num: int 27 | header: bytes 28 | file_data: bytes 29 | time_epoch: float 30 | _request_in: Optional[int] 31 | 32 | 33 | class HttpPair(NamedTuple): 34 | request: Optional[Request] 35 | response: Optional[Response] 36 | 37 | 38 | class FlowAnalyzer: 39 | """FlowAnalyzer是一个流量分析器,用于解析和处理tshark导出的JSON数据文件""" 40 | 41 | def __init__(self, json_path: str): 42 | """初始化FlowAnalyzer对象 43 | 44 | Parameters 45 | ---------- 46 | json_path : str 47 | tshark导出的JSON文件路径 48 | """ 49 | self.json_path = json_path 50 | self.check_json_file() 51 | 52 | def check_json_file(self): 53 | # sourcery skip: replace-interpolation-with-fstring 54 | """检查JSON文件是否存在并非空 55 | 56 | Raises 57 | ------ 58 | FileNotFoundError 59 | 当JSON文件不存在时抛出异常 60 | ValueError 61 | 当JSON文件内容为空时抛出异常 62 | """ 63 | if not os.path.exists(self.json_path): 64 | raise 
FileNotFoundError("您的tshark导出的JSON文件没有找到!JSON路径:%s" % self.json_path) 65 | 66 | if os.path.getsize(self.json_path) == 0: 67 | raise ValueError("您的tshark导出的JSON文件内容为空!JSON路径:%s" % self.json_path) 68 | 69 | def parse_packet(self, packet: dict) -> Tuple[int, int, float, str, str]: 70 | """解析Json中的关键信息字段 71 | 72 | Parameters 73 | ---------- 74 | packet : dict 75 | 传入Json字典 76 | 77 | Returns 78 | ------- 79 | Tuple[int, int, float, str, str] 80 | frame_num, request_in, time_epoch, full_uri, full_request 81 | """ 82 | frame_num = int(packet["frame.number"][0]) 83 | request_in = int(packet["http.request_in"][0]) if packet.get("http.request_in") else frame_num 84 | full_uri = parse.unquote(packet["http.request.full_uri"][0]) if packet.get("http.request.full_uri") else "" 85 | time_epoch = float(packet["frame.time_epoch"][0]) 86 | 87 | if packet.get("tcp.reassembled.data"): 88 | full_request = packet["tcp.reassembled.data"][0] 89 | elif packet.get("tcp.payload"): 90 | full_request = packet["tcp.payload"][0] 91 | else: 92 | # exported_pdu.exported_pdu 93 | full_request = packet["exported_pdu.exported_pdu"][0] 94 | return frame_num, request_in, time_epoch, full_uri, full_request 95 | 96 | def parse_http_json(self) -> Tuple[Dict[int, Request], Dict[int, Response]]: 97 | """解析JSON数据文件中的HTTP请求和响应信息 98 | 99 | Returns 100 | ------- 101 | tuple 102 | 包含请求字典和响应列表的元组 103 | """ 104 | with open(self.json_path, "r", encoding="utf-8") as f: 105 | data = json.load(f) 106 | 107 | requests, responses = {}, {} 108 | for packet in data: 109 | packet = packet["_source"]["layers"] 110 | frame_num, request_in, time_epoch, full_uri, full_request = self.parse_packet(packet) 111 | header, file_data = self.extract_http_file_data(full_request) 112 | 113 | # 请求包使用 full_uri 来记录请求 url 返回包使用 request_in 来记录请求包的序号 114 | if packet.get("http.response.code"): 115 | responses[frame_num] = Response( 116 | frame_num=frame_num, 117 | _request_in=request_in, 118 | header=header, 119 | file_data=file_data, 120 | 
time_epoch=time_epoch, 121 | ) 122 | else: 123 | requests[frame_num] = Request( 124 | frame_num=frame_num, header=header, file_data=file_data, time_epoch=time_epoch, full_uri=full_uri 125 | ) 126 | return requests, responses 127 | 128 | def generate_http_dict_pairs(self) -> Iterable[HttpPair]: # sourcery skip: use-named-expression 129 | """生成HTTP请求和响应信息的字典对 130 | Yields 131 | ------ 132 | Iterable[HttpPair] 133 | 包含请求和响应信息的字典迭代器 134 | """ 135 | requests, responses = self.parse_http_json() 136 | response_map = {r._request_in: r for r in responses.values()} 137 | yielded_resps = [] 138 | for req_id, req in requests.items(): 139 | resp = response_map.get(req_id) 140 | if resp: 141 | yielded_resps.append(resp) 142 | resp._request_in = None 143 | yield HttpPair(request=req, response=resp) 144 | else: 145 | yield HttpPair(request=req, response=None) 146 | 147 | for resp in response_map.values(): 148 | if resp not in yielded_resps: 149 | resp._request_in = None 150 | yield HttpPair(request=None, response=resp) 151 | 152 | @staticmethod 153 | def get_hash(file_path: str, display_filter: str) -> str: 154 | with open(file_path, "rb") as f: 155 | return hashlib.md5(f.read() + display_filter.encode()).hexdigest() 156 | 157 | @staticmethod 158 | def extract_json_file(file_name: str, display_filter: str, tshark_path: str, tshark_work_dir: str, json_work_path: str) -> None: 159 | command = [ 160 | tshark_path, 161 | "-r", 162 | file_name, 163 | "-Y", 164 | f"({display_filter})", 165 | "-T", 166 | "json", 167 | "-e", 168 | "http.response.code", 169 | "-e", 170 | "http.request_in", 171 | "-e", 172 | "tcp.reassembled.data", 173 | "-e", 174 | "frame.number", 175 | "-e", 176 | "tcp.payload", 177 | "-e", 178 | "frame.time_epoch", 179 | "-e", 180 | "exported_pdu.exported_pdu", 181 | "-e", 182 | "http.request.full_uri", 183 | ] 184 | logger.debug(f"导出Json命令: {command}") 185 | 186 | with open(json_work_path, "wb") as output_file: 187 | process = subprocess.Popen(command, 
stdout=output_file, stderr=subprocess.PIPE, cwd=tshark_work_dir) 188 | _, stderr = process.communicate() 189 | logger.debug(f"导出Json文件路径: {json_work_path}") 190 | 191 | if stderr and b"WARNING" not in stderr: 192 | try: 193 | print(f"[Warning/Error]: {stderr.decode('utf-8')}") 194 | except Exception: 195 | print(f"[Warning/Error]: {stderr.decode('gbk')}") 196 | 197 | @staticmethod 198 | def add_md5sum(json_work_path: str, md5_sum: str) -> None: 199 | with open(json_work_path, "r", encoding="utf-8") as f: 200 | data = json.load(f) 201 | data[0]["MD5Sum"] = md5_sum 202 | 203 | with open(json_work_path, "w", encoding="utf-8") as f: 204 | json.dump(data, f, indent=2) 205 | 206 | @staticmethod 207 | def get_json_data(file_path: str, display_filter: str, tshark_path: Optional[str] = None) -> str: 208 | # sourcery skip: replace-interpolation-with-fstring 209 | """获取JSON数据并保存至文件,保存目录是当前工作目录,也就是您运行脚本所在目录 210 | 211 | Parameters 212 | ---------- 213 | file_path : str 214 | 待处理的数据文件路径 215 | display_filter : str 216 | WireShark的显示过滤器 217 | 218 | Returns 219 | ------- 220 | str 221 | 保存JSON数据的文件路径 222 | """ 223 | if not os.path.exists(file_path): 224 | raise FileNotFoundError("您的填写的流量包没有找到!流量包路径:%s" % file_path) 225 | 226 | md5_sum = FlowAnalyzer.get_hash(file_path, display_filter) 227 | logger.debug(f"md5校验值: {md5_sum}") 228 | 229 | work_dir = os.getcwd() 230 | tshark_command_work_dir = os.path.dirname(os.path.abspath(file_path)) 231 | json_work_path = os.path.join(work_dir, "output.json") 232 | file_name = os.path.basename(file_path) 233 | 234 | if os.path.exists(json_work_path): 235 | try: 236 | with open(json_work_path, "r", encoding="utf-8") as f: 237 | data = json.load(f) 238 | if data[0].get("MD5Sum") == md5_sum: 239 | logger.debug("匹配md5校验无误,自动返回Json文件路径!") 240 | return json_work_path 241 | except Exception: 242 | logger.debug("默认的Json文件无法被正常解析, 正在重新生成Json文件中") 243 | 244 | tshark_path = FlowAnalyzer.get_tshark_path(tshark_path) 245 | 
FlowAnalyzer.extract_json_file(file_name, display_filter, tshark_path, tshark_command_work_dir, json_work_path) 246 | FlowAnalyzer.add_md5sum(json_work_path, md5_sum) 247 | return json_work_path 248 | 249 | @staticmethod 250 | def get_tshark_path(tshark_path: Optional[str]) -> str: 251 | default_tshark_path = get_default_tshark_path() 252 | if not os.path.exists(default_tshark_path): 253 | logger.debug("没有检测到tshark存在, 请查看并检查tshark_path") 254 | else: 255 | logger.debug("检测到默认tshark存在!") 256 | 257 | if tshark_path is None: 258 | logger.debug("您没有传入tshark_path, 请传入tshark_path") 259 | elif not os.path.exists(tshark_path): 260 | logger.debug("传入的tshark_path不存在, 请查看并检查tshark_path") 261 | 262 | use_tshark_path = None 263 | if os.path.exists(default_tshark_path): 264 | use_tshark_path = default_tshark_path 265 | 266 | if tshark_path is not None and os.path.exists(tshark_path): 267 | use_tshark_path = tshark_path 268 | 269 | if use_tshark_path is None: 270 | logger.critical("您没有配置 tshark_path 并且没有在参数中传入 tshark_path") 271 | exit(-1) 272 | return use_tshark_path 273 | 274 | def split_http_headers(self, file_data: bytes) -> Tuple[bytes, bytes]: 275 | headerEnd = file_data.find(b"\r\n\r\n") 276 | if headerEnd != -1: 277 | headerEnd += 4 278 | return file_data[:headerEnd], file_data[headerEnd:] 279 | elif file_data.find(b"\n\n") != -1: 280 | headerEnd = file_data.index(b"\n\n") + 2 281 | return file_data[:headerEnd], file_data[headerEnd:] 282 | else: 283 | print("[Warning] 没有找到headers和response的划分位置!") 284 | return b"", file_data 285 | 286 | def dechunck_http_response(self, file_data: bytes) -> bytes: 287 | """解码分块TCP数据 288 | 289 | Parameters 290 | ---------- 291 | file_data : bytes 292 | 已经切割掉headers的TCP数据 293 | 294 | Returns 295 | ------- 296 | bytes 297 | 解码分块后的TCP数据 298 | """ 299 | chunks = [] 300 | chunkSizeEnd = file_data.find(b"\n") + 1 301 | lineEndings = b"\r\n" if bytes([file_data[chunkSizeEnd - 2]]) == b"\r" else b"\n" 302 | lineEndingsLength = len(lineEndings) 303 | 
while True: 304 | chunkSize = int(file_data[:chunkSizeEnd], 16) 305 | if not chunkSize: 306 | break 307 | 308 | chunks.append(file_data[chunkSizeEnd : chunkSize + chunkSizeEnd]) 309 | file_data = file_data[chunkSizeEnd + chunkSize + lineEndingsLength :] 310 | chunkSizeEnd = file_data.find(lineEndings) + lineEndingsLength 311 | return b"".join(chunks) 312 | 313 | def extract_http_file_data(self, full_request: str) -> Tuple[bytes, bytes]: 314 | """提取HTTP请求或响应中的文件数据 315 | 316 | Parameters 317 | ---------- 318 | full_request : bytes 319 | HTTP请求或响应的原始字节流 320 | 321 | Returns 322 | ------- 323 | tuple 324 | 包含header和file_data的元组 325 | """ 326 | header, file_data = self.split_http_headers(bytes.fromhex(full_request)) 327 | 328 | with contextlib.suppress(Exception): 329 | file_data = self.dechunck_http_response(file_data) 330 | 331 | with contextlib.suppress(Exception): 332 | if file_data.startswith(b"\x1f\x8b"): 333 | file_data = gzip.decompress(file_data) 334 | return header, file_data 335 | --------------------------------------------------------------------------------