├── tests ├── flow.pcapng └── demo.py ├── FlowAnalyzer ├── __init__.py ├── Path.py ├── logging_config.py └── FlowAnalyzer.py ├── LICENSE ├── README.md ├── setup.py └── requirements.txt /tests/flow.pcapng: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Byxs20/FlowAnalyzer/HEAD/tests/flow.pcapng -------------------------------------------------------------------------------- /FlowAnalyzer/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["FlowAnalyzer"] 2 | 3 | from .FlowAnalyzer import FlowAnalyzer 4 | -------------------------------------------------------------------------------- /FlowAnalyzer/Path.py: -------------------------------------------------------------------------------- 1 | # windows 2 | tshark_path = r"C:\Program Files\Wireshark\tshark.exe" 3 | 4 | def get_default_tshark_path() -> str: 5 | return tshark_path -------------------------------------------------------------------------------- /FlowAnalyzer/logging_config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def configure_logger(logger_name, level=logging.DEBUG) -> logging.Logger: 5 | # 创建一个 logger 对象 6 | logger = logging.getLogger(logger_name) 7 | logger.setLevel(level) 8 | 9 | # 创建一个处理器,将日志输出到控制台 10 | console_handler = logging.StreamHandler() 11 | logger.addHandler(console_handler) 12 | 13 | # 创建一个格式化器,定义日志的输出格式 14 | formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") 15 | console_handler.setFormatter(formatter) 16 | return logger 17 | 18 | logger = configure_logger("FlowAnalyzer", logging.INFO) 19 | 20 | if __name__ == '__main__': 21 | logger = configure_logger("FlowAnalyzer") 22 | logger.info("This is a test!") 23 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Pang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FlowAnalyzer 2 | 3 | # 安装 4 | 5 | 使用 `pip` 安装: 6 | 7 | ``` 8 | pip3 install FlowAnalyzer 9 | ``` 10 | 11 | ``` 12 | pip3 install FlowAnalyzer -i https://pypi.org/simple 13 | ``` 14 | 15 | # 快速上手 16 | 17 | ## 配置 18 | 19 | 如果您安装 `Wireshark` 没有修改安装目录,默认 `tshark` 路径会如下: 20 | 21 | ```python 22 | # windows 23 | tshark_path = r"C:\Program Files\Wireshark\tshark.exe" 24 | ``` 25 | 26 | `Linux`, `MacOS` 默认路径不清楚,需要看下面的**纠正路径**,**确定路径没有问题,那也无需任何配置即可使用!** 27 | 28 | ## 纠正路径 29 | 30 | 修改 `python安装目录\Lib\site-packages\FlowAnalyzer\Path.py` 中的变量 `tshark_path` 改为**tshark正确路径** 31 | 32 | ## 测试 33 | 34 | ``` 35 | $ git clone https://github.com/Byxs20/FlowAnalyzer.git 36 | $ cd ./FlowAnalyzer/ 37 | $ python tests\demo.py 38 | ``` 39 | 40 | 运行结果: 41 | 42 | ``` 43 | [+] 正在处理第1个HTTP流! 44 | 序号: 2请求包, 请求头: b'POST /upload/php_eval_xor_base64.php HTTP/1.1\r\nUser-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0\r\n 45 | ... 46 | ``` 47 | 48 | # Contributing 49 | Feel free to submit issues or pull requests if you have any suggestions, improvements, or bug reports. 
50 | 51 | # License 52 | 53 | This project is licensed under the [MIT License.](LICENSE) 54 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from setuptools import find_packages, setup 4 | 5 | with open(os.path.join(os.path.dirname(__file__), "README.md"), encoding="utf-8") as f: 6 | long_description = f.read() 7 | 8 | setup( 9 | name="FlowAnalyzer", 10 | version="0.4.0", 11 | description="FlowAnalyzer是一个流量分析器,用于解析和处理tshark导出的JSON数据文件", 12 | author="Byxs20", 13 | author_email="97766819@qq.com", 14 | packages=find_packages(exclude=["tests", "*.egg-info"]), 15 | package_data={ 16 | '': ['LICENSE', 'README.md', 'setup.py'], 17 | }, 18 | install_requires=[ 19 | ], 20 | classifiers=[ 21 | "Development Status :: 3 - Alpha", 22 | "Intended Audience :: Developers", 23 | "License :: OSI Approved :: MIT License", 24 | "Programming Language :: Python :: 3", 25 | "Programming Language :: Python :: 3.6", 26 | "Programming Language :: Python :: 3.7", 27 | "Programming Language :: Python :: 3.8", 28 | "Programming Language :: Python :: 3.9", 29 | ], 30 | 31 | long_description=long_description, 32 | long_description_content_type="text/markdown", 33 | url="https://github.com/Byxs20/FlowAnalyzer", 34 | ) 35 | -------------------------------------------------------------------------------- /tests/demo.py: -------------------------------------------------------------------------------- 1 | # sourcery skip: use-fstring-for-formatting 2 | import os 3 | 4 | from FlowAnalyzer import FlowAnalyzer 5 | 6 | baseDir = os.path.dirname(os.path.abspath(__file__)) 7 | flowPath = os.path.join(baseDir, "flow.pcapng") 8 | display_filter = "(http.request and urlencoded-form) or (http.request and data-text-lines) or (http.request and mime_multipart) or (http.response.code == 200 and data-text-lines)" 9 | 10 | json_path = FlowAnalyzer.get_json_data(flowPath, 
display_filter=display_filter) 11 | for http_seq_num, http in enumerate(FlowAnalyzer(json_path).generate_http_dict_pairs(), start=1): 12 | print(f"[+] 正在处理第{http_seq_num}个HTTP流!") 13 | 14 | request, response = http.request, http.response 15 | if request: 16 | request_num, header, file_data, time_epoch = request.frame_num, request.header, request.file_data, request.time_epoch 17 | print("序号: {}请求包, 请求头: {}, 文件: {}, 时间: {}".format(request_num, header, file_data, time_epoch)) 18 | 19 | if response: 20 | response_num, header, file_data, time_epoch = response.frame_num, response.header, response.file_data, response.time_epoch 21 | print("序号: {}响应包, 响应头: {}, 文件: {}, 时间: {}".format(response_num, header, file_data, time_epoch)) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file was autogenerated by uv via the following command: 2 | # uv pip compile - -o requirements.txt 3 | backports-tarfile==1.2.0 4 | # via jaraco-context 5 | certifi==2024.7.4 6 | # via requests 7 | charset-normalizer==3.3.2 8 | # via requests 9 | docutils==0.20.1 10 | # via readme-renderer 11 | idna==3.7 12 | # via requests 13 | importlib-metadata==8.2.0 14 | # via 15 | # keyring 16 | # twine 17 | importlib-resources==6.4.0 18 | # via keyring 19 | jaraco-classes==3.4.0 20 | # via keyring 21 | jaraco-context==5.3.0 22 | # via keyring 23 | jaraco-functools==4.0.1 24 | # via keyring 25 | keyring==25.2.1 26 | # via twine 27 | markdown-it-py==3.0.0 28 | # via rich 29 | mdurl==0.1.2 30 | # via markdown-it-py 31 | more-itertools==10.3.0 32 | # via 33 | # jaraco-classes 34 | # jaraco-functools 35 | nh3==0.2.18 36 | # via readme-renderer 37 | pkginfo==1.10.0 38 | # via twine 39 | pygments==2.18.0 40 | # via 41 | # readme-renderer 42 | # rich 43 | pysocks==1.7.1 44 | pywin32-ctypes==0.2.2 45 | # via keyring 46 | readme-renderer==43.0 47 | # via twine 48 | 
requests==2.32.3 49 | # via 50 | # requests-toolbelt 51 | # twine 52 | requests-toolbelt==1.0.0 53 | # via twine 54 | rfc3986==2.0.0 55 | # via twine 56 | rich==13.7.1 57 | # via twine 58 | setuptools==72.1.0 59 | twine==5.1.1 60 | typing-extensions==4.12.2 61 | # via rich 62 | urllib3==2.2.2 63 | # via 64 | # requests 65 | # twine 66 | wheel==0.43.0 67 | zipp==3.19.2 68 | # via 69 | # importlib-metadata 70 | # importlib-resources 71 | -------------------------------------------------------------------------------- /FlowAnalyzer/FlowAnalyzer.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import gzip 3 | import hashlib 4 | import json 5 | import os 6 | import subprocess 7 | from dataclasses import dataclass 8 | from typing import Dict, Iterable, NamedTuple, Optional, Tuple 9 | from urllib import parse 10 | 11 | from .logging_config import logger 12 | from .Path import get_default_tshark_path 13 | 14 | 15 | @dataclass 16 | class Request: 17 | frame_num: int 18 | header: bytes 19 | file_data: bytes 20 | full_uri: str 21 | time_epoch: float 22 | 23 | 24 | @dataclass 25 | class Response: 26 | frame_num: int 27 | header: bytes 28 | file_data: bytes 29 | time_epoch: float 30 | _request_in: Optional[int] 31 | 32 | 33 | class HttpPair(NamedTuple): 34 | request: Optional[Request] 35 | response: Optional[Response] 36 | 37 | 38 | class FlowAnalyzer: 39 | """FlowAnalyzer是一个流量分析器,用于解析和处理tshark导出的JSON数据文件""" 40 | 41 | def __init__(self, json_path: str): 42 | """初始化FlowAnalyzer对象 43 | 44 | Parameters 45 | ---------- 46 | json_path : str 47 | tshark导出的JSON文件路径 48 | """ 49 | self.json_path = json_path 50 | self.check_json_file() 51 | 52 | def check_json_file(self): 53 | # sourcery skip: replace-interpolation-with-fstring 54 | """检查JSON文件是否存在并非空 55 | 56 | Raises 57 | ------ 58 | FileNotFoundError 59 | 当JSON文件不存在时抛出异常 60 | ValueError 61 | 当JSON文件内容为空时抛出异常 62 | """ 63 | if not os.path.exists(self.json_path): 64 | raise 
FileNotFoundError("您的tshark导出的JSON文件没有找到!JSON路径:%s" % self.json_path) 65 | 66 | if os.path.getsize(self.json_path) == 0: 67 | raise ValueError("您的tshark导出的JSON文件内容为空!JSON路径:%s" % self.json_path) 68 | 69 | def parse_packet(self, packet: dict) -> Tuple[int, int, float, str, str]: 70 | """解析Json中的关键信息字段 71 | 72 | Parameters 73 | ---------- 74 | packet : dict 75 | 传入Json字典 76 | 77 | Returns 78 | ------- 79 | Tuple[int, int, float, str, str] 80 | frame_num, request_in, time_epoch, full_uri, full_request 81 | """ 82 | frame_num = int(packet["frame.number"][0]) 83 | request_in = int(packet["http.request_in"][0]) if packet.get("http.request_in") else frame_num 84 | full_uri = parse.unquote(packet["http.request.full_uri"][0]) if packet.get("http.request.full_uri") else "" 85 | time_epoch = float(packet["frame.time_epoch"][0]) 86 | 87 | if packet.get("tcp.reassembled.data"): 88 | full_request = packet["tcp.reassembled.data"][0] 89 | elif packet.get("tcp.payload"): 90 | full_request = packet["tcp.payload"][0] 91 | else: 92 | # exported_pdu.exported_pdu 93 | full_request = packet["exported_pdu.exported_pdu"][0] 94 | return frame_num, request_in, time_epoch, full_uri, full_request 95 | 96 | def parse_http_json(self) -> Tuple[Dict[int, Request], Dict[int, Response]]: 97 | """解析JSON数据文件中的HTTP请求和响应信息 98 | 99 | Returns 100 | ------- 101 | tuple 102 | 包含请求字典和响应列表的元组 103 | """ 104 | with open(self.json_path, "r", encoding="utf-8") as f: 105 | data = json.load(f) 106 | 107 | requests, responses = {}, {} 108 | for packet in data: 109 | packet = packet["_source"]["layers"] 110 | frame_num, request_in, time_epoch, full_uri, full_request = self.parse_packet(packet) 111 | header, file_data = self.extract_http_file_data(full_request) 112 | 113 | # 请求包使用 full_uri 来记录请求 url 返回包使用 request_in 来记录请求包的序号 114 | if packet.get("http.response.code"): 115 | responses[frame_num] = Response( 116 | frame_num=frame_num, 117 | _request_in=request_in, 118 | header=header, 119 | file_data=file_data, 120 | 
time_epoch=time_epoch, 121 | ) 122 | else: 123 | requests[frame_num] = Request( 124 | frame_num=frame_num, header=header, file_data=file_data, time_epoch=time_epoch, full_uri=full_uri 125 | ) 126 | return requests, responses 127 | 128 | def generate_http_dict_pairs(self) -> Iterable[HttpPair]: # sourcery skip: use-named-expression 129 | """生成HTTP请求和响应信息的字典对 130 | Yields 131 | ------ 132 | Iterable[HttpPair] 133 | 包含请求和响应信息的字典迭代器 134 | """ 135 | requests, responses = self.parse_http_json() 136 | response_map = {r._request_in: r for r in responses.values()} 137 | yielded_resps = [] 138 | for req_id, req in requests.items(): 139 | resp = response_map.get(req_id) 140 | if resp: 141 | yielded_resps.append(resp) 142 | resp._request_in = None 143 | yield HttpPair(request=req, response=resp) 144 | else: 145 | yield HttpPair(request=req, response=None) 146 | 147 | for resp in response_map.values(): 148 | if resp not in yielded_resps: 149 | resp._request_in = None 150 | yield HttpPair(request=None, response=resp) 151 | 152 | @staticmethod 153 | def get_hash(file_path: str, display_filter: str) -> str: 154 | with open(file_path, "rb") as f: 155 | return hashlib.md5(f.read() + display_filter.encode()).hexdigest() 156 | 157 | @staticmethod 158 | def extract_json_file(file_name: str, display_filter: str, tshark_path: str, tshark_work_dir: str, json_work_path: str) -> None: 159 | command = [ 160 | tshark_path, 161 | "-r", 162 | file_name, 163 | "-Y", 164 | f"({display_filter})", 165 | "-T", 166 | "json", 167 | "-e", 168 | "http.response.code", 169 | "-e", 170 | "http.request_in", 171 | "-e", 172 | "tcp.reassembled.data", 173 | "-e", 174 | "frame.number", 175 | "-e", 176 | "tcp.payload", 177 | "-e", 178 | "frame.time_epoch", 179 | "-e", 180 | "exported_pdu.exported_pdu", 181 | "-e", 182 | "http.request.full_uri", 183 | ] 184 | logger.debug(f"导出Json命令: {command}") 185 | 186 | with open(json_work_path, "wb") as output_file: 187 | process = subprocess.Popen(command, 
stdout=output_file, stderr=subprocess.PIPE, cwd=tshark_work_dir) 188 | _, stderr = process.communicate() 189 | logger.debug(f"导出Json文件路径: {json_work_path}") 190 | 191 | if stderr and b"WARNING" not in stderr: 192 | try: 193 | print(f"[Warning/Error]: {stderr.decode('utf-8')}") 194 | except Exception: 195 | print(f"[Warning/Error]: {stderr.decode('gbk')}") 196 | 197 | @staticmethod 198 | def add_md5sum(json_work_path: str, md5_sum: str) -> None: 199 | with open(json_work_path, "r", encoding="utf-8") as f: 200 | data = json.load(f) 201 | data[0]["MD5Sum"] = md5_sum 202 | 203 | with open(json_work_path, "w", encoding="utf-8") as f: 204 | json.dump(data, f, indent=2) 205 | 206 | @staticmethod 207 | def get_json_data(file_path: str, display_filter: str, tshark_path: Optional[str] = None) -> str: 208 | # sourcery skip: replace-interpolation-with-fstring 209 | """获取JSON数据并保存至文件,保存目录是当前工作目录,也就是您运行脚本所在目录 210 | 211 | Parameters 212 | ---------- 213 | file_path : str 214 | 待处理的数据文件路径 215 | display_filter : str 216 | WireShark的显示过滤器 217 | 218 | Returns 219 | ------- 220 | str 221 | 保存JSON数据的文件路径 222 | """ 223 | if not os.path.exists(file_path): 224 | raise FileNotFoundError("您的填写的流量包没有找到!流量包路径:%s" % file_path) 225 | 226 | md5_sum = FlowAnalyzer.get_hash(file_path, display_filter) 227 | logger.debug(f"md5校验值: {md5_sum}") 228 | 229 | work_dir = os.getcwd() 230 | tshark_command_work_dir = os.path.dirname(os.path.abspath(file_path)) 231 | json_work_path = os.path.join(work_dir, "output.json") 232 | file_name = os.path.basename(file_path) 233 | 234 | if os.path.exists(json_work_path): 235 | try: 236 | with open(json_work_path, "r", encoding="utf-8") as f: 237 | data = json.load(f) 238 | if data[0].get("MD5Sum") == md5_sum: 239 | logger.debug("匹配md5校验无误,自动返回Json文件路径!") 240 | return json_work_path 241 | except Exception: 242 | logger.debug("默认的Json文件无法被正常解析, 正在重新生成Json文件中") 243 | 244 | tshark_path = FlowAnalyzer.get_tshark_path(tshark_path) 245 | 
FlowAnalyzer.extract_json_file(file_name, display_filter, tshark_path, tshark_command_work_dir, json_work_path) 246 | FlowAnalyzer.add_md5sum(json_work_path, md5_sum) 247 | return json_work_path 248 | 249 | @staticmethod 250 | def get_tshark_path(tshark_path: Optional[str]) -> str: 251 | default_tshark_path = get_default_tshark_path() 252 | if not os.path.exists(default_tshark_path): 253 | logger.debug("没有检测到tshark存在, 请查看并检查tshark_path") 254 | else: 255 | logger.debug("检测到默认tshark存在!") 256 | 257 | if tshark_path is None: 258 | logger.debug("您没有传入tshark_path, 请传入tshark_path") 259 | elif not os.path.exists(tshark_path): 260 | logger.debug("传入的tshark_path不存在, 请查看并检查tshark_path") 261 | 262 | use_tshark_path = None 263 | if os.path.exists(default_tshark_path): 264 | use_tshark_path = default_tshark_path 265 | 266 | if tshark_path is not None and os.path.exists(tshark_path): 267 | use_tshark_path = tshark_path 268 | 269 | if use_tshark_path is None: 270 | logger.critical("您没有配置 tshark_path 并且没有在参数中传入 tshark_path") 271 | exit(-1) 272 | return use_tshark_path 273 | 274 | def split_http_headers(self, file_data: bytes) -> Tuple[bytes, bytes]: 275 | headerEnd = file_data.find(b"\r\n\r\n") 276 | if headerEnd != -1: 277 | headerEnd += 4 278 | return file_data[:headerEnd], file_data[headerEnd:] 279 | elif file_data.find(b"\n\n") != -1: 280 | headerEnd = file_data.index(b"\n\n") + 2 281 | return file_data[:headerEnd], file_data[headerEnd:] 282 | else: 283 | print("[Warning] 没有找到headers和response的划分位置!") 284 | return b"", file_data 285 | 286 | def dechunck_http_response(self, file_data: bytes) -> bytes: 287 | """解码分块TCP数据 288 | 289 | Parameters 290 | ---------- 291 | file_data : bytes 292 | 已经切割掉headers的TCP数据 293 | 294 | Returns 295 | ------- 296 | bytes 297 | 解码分块后的TCP数据 298 | """ 299 | chunks = [] 300 | chunkSizeEnd = file_data.find(b"\n") + 1 301 | lineEndings = b"\r\n" if bytes([file_data[chunkSizeEnd - 2]]) == b"\r" else b"\n" 302 | lineEndingsLength = len(lineEndings) 303 | 
while True: 304 | chunkSize = int(file_data[:chunkSizeEnd], 16) 305 | if not chunkSize: 306 | break 307 | 308 | chunks.append(file_data[chunkSizeEnd : chunkSize + chunkSizeEnd]) 309 | file_data = file_data[chunkSizeEnd + chunkSize + lineEndingsLength :] 310 | chunkSizeEnd = file_data.find(lineEndings) + lineEndingsLength 311 | return b"".join(chunks) 312 | 313 | def extract_http_file_data(self, full_request: str) -> Tuple[bytes, bytes]: 314 | """提取HTTP请求或响应中的文件数据 315 | 316 | Parameters 317 | ---------- 318 | full_request : bytes 319 | HTTP请求或响应的原始字节流 320 | 321 | Returns 322 | ------- 323 | tuple 324 | 包含header和file_data的元组 325 | """ 326 | header, file_data = self.split_http_headers(bytes.fromhex(full_request)) 327 | 328 | with contextlib.suppress(Exception): 329 | file_data = self.dechunck_http_response(file_data) 330 | 331 | with contextlib.suppress(Exception): 332 | if file_data.startswith(b"\x1f\x8b"): 333 | file_data = gzip.decompress(file_data) 334 | return header, file_data 335 | --------------------------------------------------------------------------------