├── requirements.txt
├── utils
│   ├── office.py
│   ├── process.py
│   ├── compress.py
│   └── configurator.py
├── configs.yaml
├── .gitignore
├── README.md
├── cut_text_files.py
├── emergency.yaml
├── README_EN.md
├── LICENSE
└── sensitive-helper.py

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0xn0ne/sensitive-helper/HEAD/requirements.txt
--------------------------------------------------------------------------------
/utils/office.py:
--------------------------------------------------------------------------------
1 | """
2 | office.py
3 | 
4 | Office 文件解析:
5 | - docx_handler: 将 .docx 主文档抽取为纯文本,保存为 *_resolved.txt;
6 | - xlsx_handler: 将 .xlsx 所有工作表转为 dict 并写入 *_resolved.txt;
7 | - pptx_handler: 预留(暂未实现)。
8 | """
9 | import json
10 | import pathlib
11 | import re
12 | from typing import Union
13 | 
14 | import pandas
15 | 
16 | try:
17 |     from utils import compress
18 | except ImportError:
19 |     import pathlib
20 |     import sys
21 | 
22 |     sys.path.append(pathlib.Path(__file__).parent.parent.__str__())
23 |     from utils import compress
24 | 
25 | 
26 | def docx_handler(file_path: Union[pathlib.Path, str]) -> pathlib.Path:
27 |     """解析 .docx 文本内容并输出到同名 *_resolved.txt 文件。"""
28 |     docx_path = compress.uncompress(file_path)
29 |     content = docx_path.joinpath('word/document.xml').read_text(encoding='utf-8')
30 |     content = re.sub(r'[\r\n]', '', content)
31 |     content = re.sub(r'<w:p( [^<>]*?)?>', '\n', content)  # 段落起始标签 <w:p> 替换为换行
32 |     content = re.sub(r'<[^<>]+>', '', content)  # 剥离其余所有 XML 标签
33 |     resolved_path = pathlib.Path(docx_path.__str__() + '_resolved.txt')
34 |     with open(resolved_path, 'w', encoding='utf-8') as _f:
35 |         _f.write(content)
36 |     return resolved_path
37 | 
38 | 
39 | def xlsx_handler(file_path: Union[pathlib.Path, str]):
40 |     """将 .xlsx 所有工作表转换为 dict 并写入同名 *_resolved.txt 文件。"""
41 |     xlsx_file = pandas.read_excel(file_path, sheet_name=None)
42 | 
43 |     with open(pathlib.Path(file_path.__str__() + '_resolved.txt'), 'w', encoding='utf-8') as _f:
44 |         for sheet in xlsx_file:
45 |             _f.write(xlsx_file[sheet].to_dict(orient='index').__str__() + '\n')
46 | 
47 | 
48 | def pptx_handler():
49 |     """预留:PPTX 解析未实现。"""
50 |     pass
51 | 
52 | 
53 | if __name__ == '__main__':
54 |     # docx_handler('cache/utils.docx')
55 |     # xlsx_handler('cache/tttt/email.xlsx')
56 |     pass
57 | 
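一个独立的小示例(示意,样例 XML 为虚构),演示 docx_handler 中"段落标签转换行、再剥离其余标签"的抽取思路:

```python
import re

# 虚构的 word/document.xml 片段
sample = (
    '<w:document><w:p w:rsidR="0"><w:r><w:t>第一段</w:t></w:r></w:p>'
    '<w:p><w:r><w:t>第二段</w:t></w:r></w:p></w:document>'
)

text = re.sub(r'<w:p( [^<>]*?)?>', '\n', sample)  # 段落起始标签 -> 换行
text = re.sub(r'<[^<>]+>', '', text)              # 剥离其余标签
print(text)  # 输出两行文本:第一段 / 第二段
```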
--------------------------------------------------------------------------------
/configs.yaml:
--------------------------------------------------------------------------------
1 | config_path: configs.yaml
2 | exclude_files:
3 |   - \.DS_Store
4 | is_re_all: true
5 | is_silent: false
6 | output_format: csv
7 | process_number: 12
8 | re_filter_content: ""
9 | row_split: '[\0-\x1F\x7F]+'
10 | rules:
11 |   AKSK:
12 |     - '[\s\n\''\"`=:#]LTAI\w{12,20}[\s\n\''\"`=:#]'
13 |     - '[\s\n\''\"`=:#](A3T[A-Z0-9]|ABIA|ACCA|AGPA|AIDA|AIPA|AKIA|ANPA|ANVA|APKA|AROA|ASCA|ASIA)[0-9A-Z]{16}[\s\n\''\"`=:#]'
14 |     - '[\s\n\''\"`=:#]GOOG\w{10,30}[\s\n\''\"`=:#]'
15 |     - '[\s\n\''\"`=:#]AZ[A-Za-z0-9]{34,40}[\s\n\''\"`=:#]'
16 |     - '[\s\n\''\"`=:#]IBM[A-Za-z0-9]{10,40}[\s\n\''\"`=:#]'
17 |     - '[\s\n\''\"`=:#][a-zA-Z0-9]{8}(-[a-zA-Z0-9]{4}){3}-[a-zA-Z0-9]{12}[\s\n\''\"`=:#]'
18 |     - '[\s\n\''\"`=:#]OCID[A-Za-z0-9]{10,40}[\s\n\''\"`=:#]'
19 |     - '[\s\n\''\"`=:#]LTAI[A-Za-z0-9]{12,20}[\s\n\''\"`=:#]'
20 |     - '[\s\n\''\"`=:#]AKID[A-Za-z0-9]{13,20}[\s\n\''\"`=:#]'
21 |     - '[\s\n\''\"`=:#]AK[A-Za-z0-9]{10,40}[\s\n\''\"`=:#]'
22 |     - '[\s\n\''\"`=:#]JDC_[A-Z0-9]{28,32}[\s\n\''\"`=:#]'
23 |     - '[\s\n\''\"`=:#]AKLT[a-zA-Z0-9-_]{0,252}[\s\n\''\"`=:#]'
24 |     - '[\s\n\''\"`=:#]UC[A-Za-z0-9]{10,40}[\s\n\''\"`=:#]'
25 |     - '[\s\n\''\"`=:#]QY[A-Za-z0-9]{10,40}[\s\n\''\"`=:#]'
26 |     - '[\s\n\''\"`=:#]AKLT[a-zA-Z0-9-_]{16,28}[\s\n\''\"`=:#]'
27 |     - '[\s\n\''\"`=:#]LTC[A-Za-z0-9]{10,60}[\s\n\''\"`=:#]'
28 |     - '[\s\n\''\"`=:#]YD[A-Za-z0-9]{10,60}[\s\n\''\"`=:#]'
29 |     - '[\s\n\''\"`=:#]CTC[A-Za-z0-9]{10,60}[\s\n\''\"`=:#]'
30 |     - '[\s\n\''\"`=:#]YYT[A-Za-z0-9]{10,60}[\s\n\''\"`=:#]'
31 |     - '[\s\n\''\"`=:#]YY[A-Za-z0-9]{10,40}[\s\n\''\"`=:#]'
32 |     - '[\s\n\''\"`=:#]CI[A-Za-z0-9]{10,40}[\s\n\''\"`=:#]'
33 |     - '[\s\n\''\"`=:#]gcore[A-Za-z0-9]{10,30}[\s\n\''\"`=:#]'
34 |   BASE64:
35 |     - "[0-9a-zA-Z/+]{8,}={,2}"
36 |   EMAIL:
37 |     - '[a-zA-Z0-9][-+.\w]{1,127}@([a-zA-Z0-9][-a-zA-Z0-9]{0,63}\.){,3}(org|com|cn|net|edu|mail)'
38 |   FILE PATH:
39 |     flags: I|X
40 |     re_filters: []
41 |     regexp:
42 |       - ([a-z]:\\)?([\\/])(users?|windows?|program files(\(x\d{2,3}\))?|s?bin|etc|usr|boot|dev|home|proc|opt|sys|srv|var)(\2[.\w!#\(~\[\{][.\w!#&\(\)+=~\[\]\{\}\s]{2,63}){1,16}
43 |   FUZZY MATCH:
44 |     flags: I
45 |     regexp:
46 |       - (APP|ACCESS|USER|PASS|OSS|ECS|CVM|AWS)[\w]{,8}(NAME|ID|KEY|NUM|ENC|CODE|SEC|WORD)[\w]{,16}["\'`]?\s*[=:(\{\[]\s*["\'`][\x20-\x7F]{,128}?[\'"`]
47 |       - (USR|PWD|COOKIE)[_\-A-Z][\w]{,16}["\'`]?\s*[=:(\{\[]\s*["\'`][\x20-\x7F]{,128}?[\'"`]
48 |       - (SECRET|SIGN|TOKEN)[\w]{,16}["\'`]?\s*[=:(\{\[]\s*["\'`][\x20-\x7F]{,128}?[\'"`]
49 |   JSON WEB TOKEN(JWT):
50 |     - ey[0-9a-zA-Z/+]{4,}={,2}\.[0-9a-zA-Z/+]{6,}={,2}\.[A-Za-z0-9-_]+
51 |   PHONE:
52 |     - (13[0-9]|14[5-9]|15[0-35-9]|166|17[0-8]|18[0-9]|19[89])\d{8}
53 |   URL:
54 |     re_filters:
55 |       - (adobe|amap|android|apache|bing|digicert|eclipse|freecodecamp|github|githubusercontent|gnu|godaddy|google|googlesource|youtube|youtu|jd|npmjs|microsoft|openxmlformats|outlook|mozilla|openssl|oracle|qq|spring|sun|umang|w3|wikipedia|xml)\.(org|com|cn|net|edu|io|be)
56 |       - (ali|baidu|cdn|example|ssh|ssl)[\w-]*\.(org|com|cn|net|edu|io)
57 |     regexp:
58 |       - (ftp|https?):\/\/[%\.\w\-]+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?
59 | target_path: ""
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 | 
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | .idea/
161 | 
162 | # other
163 | *.csv
164 | *.json
165 | .DS_Store
166 | .cache/
167 | cache/
168 | .output/
169 | output/
170 | *.rar
171 | *.zip
172 | *.7z
173 | *.tgz
174 | *.tar
--------------------------------------------------------------------------------
/utils/process.py:
--------------------------------------------------------------------------------
1 | #!/bin/python3
2 | # _*_ coding:utf-8 _*_
3 | #
4 | """
5 | process.py
6 | 
7 | 进程池辅助工具:对 concurrent.futures.ProcessPoolExecutor 做轻量封装。
8 | 
9 | 功能:
10 | - submit_super: 提交任务并记录 future;
11 | - result_yield: 以生成器形式依次获取结果,获取后即从队列移除。
12 | 
13 | 适用场景:
14 | - CPU 密集或网络请求密集任务;
15 | - 可在多机分布式环境扩展。
16 | 
17 | 参考:`https://segmentfault.com/a/1190000007495352`
18 | """
19 | 
20 | import concurrent.futures
21 | import os
22 | import random
23 | import time
24 | from typing import Any, Generator, List, Optional
25 | 
26 | 
27 | class ProcessPoolHelper(concurrent.futures.ProcessPoolExecutor):
28 |     """对 ProcessPoolExecutor 的简易封装,便于批量提交与顺序取回结果。"""
29 |     def __init__(self, max_workers=None, mp_context=None, initializer=None, initargs=()):
30 |         super().__init__(max_workers, mp_context, initializer, initargs)
31 |         self.__job_list: List[concurrent.futures.Future] = []
32 | 
33 |     def submit_super(self, fn, /, *args, **kwargs) -> concurrent.futures.Future:
34 |         """提交任务并加入内部队列,返回 future。"""
35 |         job = self.submit(fn, *args, **kwargs)
36 |         self.__job_list.append(job)
37 |         return job
38 | 
39 |     def result_yield(self, timeout: Optional[float] = None) -> Generator[Any, None, None]:
40 |         """按提交顺序产出任务返回值;取出后从队列移除。
41 | 
42 |         注意:调用方会阻塞直到有任务完成或超时。
43 |         """
44 |         self.__job_list.reverse()
45 |         while self.__job_list:
46 |             yield self.__job_list.pop().result(timeout)
47 | 
48 |         self.__job_list = []
49 | 
50 | 
51 | def __test_performance_func(min: int = 500, max: int = 600):
52 |     # print(os.getpid(), '__test_performance_func running...')
53 |     result = 0
54 |     for i in range(random.randint(min, max)):
55 |         for j in range(random.randint(min, max)):
56 |             for k in range(random.randint(min, max)):
57 |                 result += i + j + k
58 |     print(os.getpid(), '__test_performance_func result:', str(result))
59 |     # print(os.getpid(), '__test_performance_func ending...')
60 | 
61 | 
62 | def __test_return_func(min: int = 500, max: int = 600):
63 |     result = 0
64 |     for i in range(random.randint(min, max)):
65 |         for j in range(random.randint(min, max)):
66 |             for k in range(random.randint(min, max)):
67 |                 result += i + j + k
68 |     print(os.getpid(), 'test_return_func result:', str(result))
69 |     # 返回数据
70 |     return result
71 | 
72 | 
73 | def __test_return_dict_func(min: int = 500, max: int = 600):
74 |     result = {}
75 |     for i in range(random.randint(min, max)):
76 |         for j in range(random.randint(min, max)):
77 |             for k in range(random.randint(min, max)):
78 |                 key = str(i // 500)
79 |                 if key not in result:
80 |                     result[key] = 0
81 |                 result[key] += i + j + k
82 |     print(os.getpid(), '__test_return_dict_func result:', result)
83 |     # 返回数据
84 |     return result
85 | 
86 | 
87 | if __name__ == '__main__':
88 |     print('__test_return_func 1 times')
89 |     start_time = time.time()
90 |     print('return:', __test_return_func(500, 550))
91 |     print('run one times, total time(s):', time.time() - start_time)
92 | 
93 |     thr = ProcessPoolHelper(3)
94 | 
95 |     print('__test_performance_func')
96 |     start_time = time.time()
97 |     [thr.submit_super(__test_performance_func, 500, 550) for i in range(10)]
98 |     print('return:', [i for i in thr.result_yield()])
99 |     print('total time(s):', time.time() - start_time)
100 | 
101 |     print('__test_return_func')
102 |     start_time = time.time()
103 |     [thr.submit_super(__test_return_func, 500, 550) for i in range(10)]
104 |     print('return:', [i for i in thr.result_yield()])
105 |     print('total time(s):', time.time() - start_time)
106 | 
107 |     print('__test_return_dict_func')
108 |     start_time = time.time()
109 |     [thr.submit_super(__test_return_dict_func, 500, 550) for i in range(10)]
110 |     print('return:', [i for i in thr.result_yield()])
111 |     print('total time(s):', time.time() - start_time)
112 | 
113 |     # 处理器: 安装了 1 个处理器。
114 |     # [01]: AMD64 Family 25 Model 33 Stepping 0 AuthenticAMD ~3701 Mhz
115 |     # return: 2861758025421224
116 |     # run one times, total time(s): 10.262209177017212
117 |     # return: [2582051420060162, 2385134693133712, 2734006770193755, 2658863161379877, 2367450617576565, 2535566799548760, 2647666940791099, 2445589945625423, 2405781958502416, 2812995873098620]
118 |     # total time(s): 36.40744924545288
119 |     # return: [None, None, None, None, None, None, None, None, None, None]
120 |     # total time(s): 35.91919994354248
121 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 参考链接:
2 | 
3 | # Sensitive Helper
4 | 
5 | 简体中文 | [English](./README_EN.md)
6 | 
7 | 最近项目要搜索本地敏感数据的工作太多了,网上用了一些工具效果一般,被老板 DISS 了很多次。当然也可能是我不会用,如:SO 文件无法读取、多进程报错、配置看不懂、识别原理云里雾里等。想提 issues 的,但大家都要养家,想想算了,自己改一个。
8 | 
9 | 基于正则表达式的本地文件敏感信息数据挖掘助手。如果要搜索网页上的敏感数据,可以把敏感数据导出到本地再进行搜索。优化了一下多进程的使用,也优化了配置的使用方式。
10 | 
11 | **注意**:
12 | 
13 | + 如果默认规则不满足匹配需求,请自行调整 configs.yaml 文件中的 `rules` 部分内容进行匹配;
14 | + 进度条是按文件数量来统计进度的,如果卡住并不是程序卡死了,可能是文件太大了,比如 1G 的文件;
15 | + 非静默模式是在匹配完成单个文件后才会输出命中数据,如果在扫描大文件时没输出命中信息,请耐心等待;
16 | + 添加的规则越多,扫描的速度越慢,尽可能使用 1 条正则表达式匹配所需要的特征;
17 | 
18 | # 快速开始
19 | 
20 | ### 依赖
21 | 
22 | + python >= 3.6
23 | 
24 | 进入项目目录,使用以下命令安装依赖库
25 | 
26 | ```bash
27 | pip3 install toml PyYAML tqdm pandas rarfile py7zr openpyxl
28 | ```
29 | 
30 | 或者使用 pip 的 `-r` 参数安装 requirements.txt 中的依赖库
31 | 
32 | ```bash
33 | pip3 install -r requirements.txt
34 | ```
35 | 
36 | ### 基础用法
37 | 
38 | 使用 `-t` 参数直接对目标路径进行搜索
39 | 
40 | ```$ python3 sensitive-helper.py -t <你的搜索文件路径>```
41 | 
42 | 当想要排除部分类型的文件时,可以使用 `-e` 参数排除指定的文件。注意这里是使用正则表达式进行文件名匹配的:比如程序可能搜索到文件 /tmp/aaa.so,如果不想搜索 `.so` 文件类型,可以使用正则表达式 `.*so`,程序会将字符串 `aaa.so` 与 `.*so` 进行匹配,即可过滤 `so` 格式的文件
43 | 
44 | ```$ python3 sensitive-helper.py -t <你的搜索文件路径> -e ".*so" ".*gz"```
45 | 
46 | 如果觉得搜索速度太慢,可以使用 `-p` 参数调整搜索的进程数(默认为:12)以提高搜索速度,虽然 Python 的多进程很差劲,但有总比没有好。注意一个进程只会处理一个文件,如果要扫描文本类的大文件,请先使用 `cut_text_files.py` 将文件切割后再搜索,可以提升搜索速度。这个参数的值推荐设置为与 CPU 核心数相同,激进一点可以设为 CPU 核心数 * 2。
47 | **注意**:计算机性能不好时进程数不要超过 20,程序涉及大量的 IO、内存操作,计算机可能会崩溃,比如我的电脑。
48 | 
49 | ```$ python3 sensitive-helper.py -t <你的搜索文件路径> -p 10```
50 | 
51 | 有保存数据需求的话,可以使用 `-o` 参数输出 json 格式的结果文件
52 | 
53 | ```$ python3 sensitive-helper.py -t <你的搜索文件路径> -o results.json```
54 | 
55 | 默认情况下,程序使用正则表达式进行匹配的时候,匹配到 1 条表达式就会退出当前文件的搜索。可以使用 `-a` 参数,强制程序将每条正则表达式都匹配完毕,挖掘更多可能有用的数据
56 | 
57 | ```$ python3 sensitive-helper.py -t <你的搜索文件路径> -a```
58 | 
59 | **注意**:程序内置默认匹配规则,规则优先级为:默认配置 < configs.yaml 配置 < 用户输入配置
60 | 
61 | ### 使用说明
62 | 
63 | ```
64 | $ python3 sensitive-helper.py -h
65 | usage: sensitive-helper.py [-h] [-t TARGET_PATH] [-p PROCESS_NUMBER] [-c CONFIG_PATH] [-o OUTPUT_FORMAT] [-e EXCLUDE_FILES [EXCLUDE_FILES ...]] [-a] [-s] [-r RE_FILTER_CONTENT]
66 | 
67 | ███████╗███████╗███╗ ██╗███████╗██╗████████╗██╗██╗ ██╗███████╗
68 | ██╔════╝██╔════╝████╗ ██║██╔════╝██║╚══██╔══╝██║██║ ██║██╔════╝
69 | ███████╗█████╗ ██╔██╗ ██║███████╗██║ ██║ ██║██║ ██║█████╗
70 | ╚════██║██╔══╝ ██║╚██╗██║╚════██║██║ ██║ ██║╚██╗ ██╔╝██╔══╝
71 | ███████║███████╗██║ ╚████║███████║██║ ██║ ██║ ╚████╔╝ ███████╗
72 | ╚══════╝╚══════╝╚═╝ ╚═══╝╚══════╝╚═╝ ╚═╝ ╚═╝ ╚═══╝ ╚══════╝
73 | v0.1.6
74 | by 0xn0ne, https://github.com/0xn0ne/sensitive-helper
75 | 
76 | options:
77 |   -h, --help            显示帮助信息并退出程序
78 |   -t, --target-path TARGET_PATH
79 |                         搜索敏感信息的文件路径或文件夹路径(例如:~/download/folder)
80 |   -p, --process-number PROCESS_NUMBER
81 |                         程序进程数(默认值:12)
82 |   -c, --config-path CONFIG_PATH
83 |                         yaml 配置文件的路径(默认值:configs.yaml)
84 |   -o, --output-format OUTPUT_FORMAT
85 |                         输出文件格式,可用格式为 json、csv(默认值:csv)
86 |   -e, --exclude-files EXCLUDE_FILES [EXCLUDE_FILES ...]
87 |                         排除的文件,使用正则匹配(例如:\.DS_Store .*bin .*doc)
88 |   -a, --is-re-all       每个文件被单条正则表达式规则命中后就退出匹配循环,或匹配完所有正则表达式规则才退出匹配循环
89 |   -s, --is-silent       静默模式:开启后,命令行不会输出命中的信息,会使用进度条来显示进度
90 |   -r, --re-filter-content RE_FILTER_CONTENT
91 |                         过滤正则,每行字符串匹配过程中命中该正则直接跳过该行
92 | ```
93 | 
94 | ### 应急响应用法与示例
95 | 
96 | 感谢网络安全的朋友们提出的建议,该工具也可用于常见网络攻击特征的快速匹配;复杂型网络攻击不适用,如 POST 请求体内的攻击、0DAY 漏洞攻击、特殊网络路径攻击等,请酌情使用。
97 | 
98 | 当用于网络安全应急响应时,可直接对中间件与应用日志目录进行扫描,以快速提取疑似攻击痕迹。因为应急特征匹配和敏感数据匹配的思路还是有部分区别,这里使用单独的配置文件 `emergency.yaml`,避免匹配混乱。
99 | 
100 | + 支持的常见攻击指纹分组(可在 `emergency.yaml` 的 `rules` 中调整):
101 |   + SQL INJECTION
102 |   + COMMAND EXECUTION
103 |   + PATH TRAVERSAL / LFI-RFI
104 |   + SSRF
105 |   + XSS
106 |   + LOG4J / JNDI
107 |   + WEBSHELL
108 |   + JAVA / PHP DESERIALIZATION
109 |   + NOSQL INJECTION
110 |   + SENSITIVE ACCESS
111 | 
112 | 示例:扫描 Nginx/Apache/Tomcat 应用日志目录,并输出所有可疑的攻击请求
113 | 
114 | ```bash
115 | python3 sensitive-helper.py -t /var/log/nginx -a -s -c emergency.yaml
116 | python3 sensitive-helper.py -t /var/log/httpd -a -s -c emergency.yaml
117 | python3 sensitive-helper.py -t /opt/tomcat/logs -a -s -c emergency.yaml
118 | ```
119 | 
120 | 建议:
121 | 
122 | + 如需展开归档日志,可不排除压缩包(程序会尝试递归解压)。
123 | + 如日志量特别大,建议结合 `-s` 开启进度条,并合理调高 `-p` 进程数。
124 | 
125 | ### 默认模式输出样例
126 | 
127 | ```bash
128 | $ python3 sensitive-helper.py -t "cache/" -a
129 | [*] file loading...
130 | [*] analyzing...
131 | 
132 | [+] group: FUZZY MATCH, match: AppId":"123456", file: cache/heapdump
133 | [+] group: BASE64, match: ZjY2MTQyNDEtYTIyYS00YjNlLTg1NTgtOTQ4NmUwZDFkZjM1, file: cache/heapdump
134 | [+] group: FUZZY MATCH, match: password":"123456", file: cache/heapdump
135 | [+] group: FILE PATH, match: C:\Windows\system32\drivers, file: cache/heapdump-BAK
136 | [+] group: URL, match: http://hello.world/123456.jpg, file: cache/heapdump-BAK
137 | total file number: 5
138 | ```
139 | 
140 | ### 静默模式输出样例
141 | 
142 | ```bash
143 | $ python3 sensitive-helper.py -t "cache/" -a -s
144 | [*] file loading...
145 | [*] analyzing...
146 | 
147 | 53792/53792 [██████████████████████████████████████████] 00:28<00:00,1856.73it/s
148 | total file number: 53792
149 | ```
150 | 
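### 自定义规则示例

以下是一个自定义规则分组的示意(写在 configs.yaml 的 `rules` 下;分组名与正则请按需调整,`flags`、`re_filters` 为可选键,简单场景可以直接写正则列表):

```yaml
rules:
  INTERNAL IP:
    flags: I
    re_filters: []
    regexp:
      - (10|127|172|192)\.\d{1,3}\.\d{1,3}\.\d{1,3}
  MY TOKEN:
    - mytoken_[a-zA-Z0-9]{16,32}
```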
151 | # Q&A
152 | 
153 | + Q:为什么不做网页的敏感数据搜索?
154 | + A:因为网页千变万化,改动一个 API 接口、一个 css 样式或 id 都可能要更新代码,不如把数据导出到本地,统一使用文本识别的方式处理。
155 | 
--------------------------------------------------------------------------------
/cut_text_files.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | 文本文件切割工具
5 | 支持按行数切割文本文件,可自定义输入文件/文件夹和输出目录
6 | """
7 | 
8 | import argparse
9 | import glob
10 | import math
11 | import os
12 | import re
13 | import sys
14 | from pathlib import Path
15 | from typing import List, Optional, Union
16 | 
17 | 
18 | class TextFileSplitter:
19 |     """文本文件切割器"""
20 | 
21 |     def __init__(self, output_lines_number: int = 10000, output_dir: str = "output"):
22 |         """
23 |         初始化切割器
24 | 
25 |         Args:
26 |             output_lines_number: 每个输出文件的行数
27 |             output_dir: 输出目录
28 |         """
29 |         self.output_lines_number = output_lines_number
30 |         self.output_dir = Path(output_dir)
31 |         self.output_dir.mkdir(exist_ok=True)
32 | 
33 |     def split_files(self, input_file: Union[str, Path]):
34 |         """
35 |         切割单个文件或文件夹下的所有文件
36 | 
37 |         Args:
38 |             input_file: 输入文件或文件夹路径
39 | 
40 |         Returns:
41 |             无,分片文件直接写入输出目录
42 |         """
43 |         input_path = Path(input_file)
44 |         if not input_path.exists():
45 |             raise FileNotFoundError(f"文件或文件夹不存在: {input_path}")
46 | 
47 |         if input_path.is_file():
48 |             input_path = [input_path]
49 |         else:
50 |             input_path = input_path.iterdir()
51 | 
52 |         for path in input_path:
53 |             if path.is_dir():
54 |                 continue
55 | 
56 |             print(f"正在处理文件: {path}")
57 | 
58 |             # 创建输出文件名模板
59 |             base_name = path.stem
60 |             extension = path.suffix
61 | 
62 |             content = path.read_bytes()
63 |             content_list = re.split(rb'\n', content)
64 |             content_list_len = len(content_list)
65 |             times = math.ceil(content_list_len / self.output_lines_number)
66 |             for index in range(times):
67 |                 self._write_chunk(
68 |                     content_list[index * self.output_lines_number : (index + 1) * self.output_lines_number],
69 |                     base_name,
70 |                     extension,
71 |                     index,
72 |                 )
73 | 
74 |             print(f"文件 {path} 已切割为 {times} 个文件")
75 | 
76 |     def _write_chunk(self, lines: List[bytes], base_name: str, extension: str, file_id: int) -> str:
77 |         """
78 |         写入文件块
79 | 
80 |         Args:
81 |             lines: 要写入的行列表
82 |             base_name: 基础文件名
83 |             extension: 文件扩展名
84 |             file_id: 文件编号
85 | 
86 |         Returns:
87 |             输出文件路径
88 |         """
89 |         output_filename = f"{base_name}_part_{file_id:06d}{extension}"
90 |         output_path = self.output_dir / output_filename
91 |         output_path.write_bytes(b'\n'.join(lines))
92 | 
93 |         return str(output_path)
94 | 
95 |     def get_file_info(self, file_path: Union[str, Path]) -> dict:
96 |         """
97 |         获取文件信息
98 | 
99 |         Args:
100 |             file_path: 文件路径
101 | 
102 |         Returns:
103 |             文件信息字典
104 |         """
105 |         path = Path(file_path)
106 |         if not path.exists():
107 |             return {"error": "文件不存在"}
108 | 
109 |         try:
110 |             with open(path, "r", encoding="utf-8", errors="ignore") as f:
111 |                 line_count = sum(1 for _ in f)
112 | 
113 |             return {
114 |                 "file_name": path.name,
115 |                 "file_size": path.stat().st_size,
116 |                 "line_count": line_count,
117 |                 "estimated_parts": (line_count + self.output_lines_number - 1) // self.output_lines_number,
118 |             }
119 |         except Exception as e:
120 |             return {"error": str(e)}
121 | 
122 | 
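# 用法示意(假设当前目录存在文本文件 big.log):
#   splitter = TextFileSplitter(output_lines_number=1000, output_dir="parts")
#   splitter.split_files("big.log")  # 生成 parts/big_part_000000.log、parts/big_part_000001.log ...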
123 | def main():
124 |     """主函数"""
125 |     parser = argparse.ArgumentParser(
126 |         description="文本文件切割工具 - 按行数切割文本文件",
127 |         formatter_class=argparse.RawDescriptionHelpFormatter,
128 |         epilog="""
129 | 使用示例:
130 |     python cut_text_files.py -f input_or_folder.txt -l 1000
131 |     python cut_text_files.py -f input.txt --info
132 | """,
133 |     )
134 | 
135 |     # 输入参数组
136 |     input_group = parser.add_mutually_exclusive_group(required=True)
137 |     input_group.add_argument("-f", "--files", type=str, help="要切割的文件或文件夹路径")
138 | 
139 |     # 其他参数
140 |     parser.add_argument("-l", "--lines", type=int, default=10000, help="每个输出文件的行数 (默认: 10000)")
141 |     parser.add_argument("-o", "--output", type=str, default="output", help="输出目录 (默认: output)")
142 |     parser.add_argument(
143 |         "-p",
144 |         "--pattern",
145 |         type=str,
146 |         default="*.txt",
147 |         help="文件匹配模式,预留参数,当前版本未使用 (默认: *.txt)",
148 |     )
149 |     parser.add_argument("--info", action="store_true", help="显示文件信息而不进行切割")
150 | 
151 |     args = parser.parse_args()
152 | 
153 |     # 创建切割器
154 |     splitter = TextFileSplitter(output_lines_number=args.lines, output_dir=args.output)
155 | 
156 |     if args.files:
157 |         # 处理输入的文件或文件夹
158 |         if args.info:
159 |             # 显示文件信息
160 |             info = splitter.get_file_info(args.files)
161 |             if "error" in info:
162 |                 print(f"错误: {info['error']}")
163 |                 return 1
164 | 
165 |             print(f"文件信息:")
166 |             print(f"  文件名: {info['file_name']}")
167 |             print(f"  文件大小: {info['file_size']:,} 字节")
168 |             print(f"  行数: {info['line_count']:,}")
169 |             print(f"  预计切割为: {info['estimated_parts']} 个文件")
170 |             print(f"  每个文件行数: {args.lines}")
171 |         else:
172 |             # 切割文件
173 |             splitter.split_files(args.files)
174 |             print(f"输出目录: {args.output}")
175 | 
176 | 
177 | if __name__ == "__main__":
178 |     sys.exit(main())
--------------------------------------------------------------------------------
/emergency.yaml:
--------------------------------------------------------------------------------
1 | # 应急排查专用配置
2 | config_path: emergency.yaml
3 | exclude_files:
4 |   - \.DS_Store
5 | is_re_all: true
6 | is_silent: false
7 | output_format: csv
8 | process_number: 18
9 | re_filter_content: '\s+(3\d\d|4\d\d|5\d\d)\s+'
10 | row_split: '[\n\r]+'
11 | rules:
12 |   AKSK:
13 |     - '[\s\n\''\"`=:#]LTAI\w{12,20}[\s\n\''\"`=:#]'
14 |     - '[\s\n\''\"`=:#](A3T[A-Z0-9]|ABIA|ACCA|AGPA|AIDA|AIPA|AKIA|ANPA|ANVA|APKA|AROA|ASCA|ASIA)[0-9A-Z]{16}[\s\n\''\"`=:#]'
15 |     - '[\s\n\''\"`=:#]GOOG\w{10,30}[\s\n\''\"`=:#]'
16 |     - '[\s\n\''\"`=:#]AZ[A-Za-z0-9]{34,40}[\s\n\''\"`=:#]'
17 |     - '[\s\n\''\"`=:#]IBM[A-Za-z0-9]{10,40}[\s\n\''\"`=:#]'
18 |     - '[\s\n\''\"`=:#][a-zA-Z0-9]{8}(-[a-zA-Z0-9]{4}){3}-[a-zA-Z0-9]{12}[\s\n\''\"`=:#]'
19 |     - '[\s\n\''\"`=:#]OCID[A-Za-z0-9]{10,40}[\s\n\''\"`=:#]'
20 |     - '[\s\n\''\"`=:#]LTAI[A-Za-z0-9]{12,20}[\s\n\''\"`=:#]'
21 |     - '[\s\n\''\"`=:#]AKID[A-Za-z0-9]{13,20}[\s\n\''\"`=:#]'
22 |     - '[\s\n\''\"`=:#]AK[A-Za-z0-9]{10,40}[\s\n\''\"`=:#]'
23 |     - '[\s\n\''\"`=:#]JDC_[A-Z0-9]{28,32}[\s\n\''\"`=:#]'
24 |     - '[\s\n\''\"`=:#]AKLT[a-zA-Z0-9-_]{0,252}[\s\n\''\"`=:#]'
25 |     - '[\s\n\''\"`=:#]UC[A-Za-z0-9]{10,40}[\s\n\''\"`=:#]'
26 |     - '[\s\n\''\"`=:#]QY[A-Za-z0-9]{10,40}[\s\n\''\"`=:#]'
27 |     - '[\s\n\''\"`=:#]AKLT[a-zA-Z0-9-_]{16,28}[\s\n\''\"`=:#]'
28 |     - '[\s\n\''\"`=:#]LTC[A-Za-z0-9]{10,60}[\s\n\''\"`=:#]'
29 |     - '[\s\n\''\"`=:#]YD[A-Za-z0-9]{10,60}[\s\n\''\"`=:#]'
30 |     - '[\s\n\''\"`=:#]CTC[A-Za-z0-9]{10,60}[\s\n\''\"`=:#]'
31 |     - '[\s\n\''\"`=:#]YYT[A-Za-z0-9]{10,60}[\s\n\''\"`=:#]'
32 |     - '[\s\n\''\"`=:#]YY[A-Za-z0-9]{10,40}[\s\n\''\"`=:#]'
33 |     - '[\s\n\''\"`=:#]CI[A-Za-z0-9]{10,40}[\s\n\''\"`=:#]'
34 |     - '[\s\n\''\"`=:#]gcore[A-Za-z0-9]{10,30}[\s\n\''\"`=:#]'
35 |   JSON WEB TOKEN(JWT):
36 |     - ey[0-9a-zA-Z/+]{4,}={,2}\.[0-9a-zA-Z/+]{6,}={,2}\.[A-Za-z0-9-_]+
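# 说明:以下分组面向访问日志中的攻击特征;多数规则以 %XX(URL 编码字节)作为
# 边界锚点,用于匹配日志中经 URL 编码的请求参数。新增分组示意(键结构与现有分组一致):
# MY RULE GROUP:
#   flags: I
#   regexp:
#     - .+/my_app/debug\.php\?cmd=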
37 |   SQL INJECTION:
38 |     flags: I
39 |     regexp:
40 |       - '.+[?/\;&]%[\da-f]{2}\S*(substring|concat|union|select|sleep|char\(|delay|information_schema|updatexml|extractvalue|load_file|outfile|into|xp_cmdshell|waitfor)\S+%[\da-f]{2}'
41 |       - '.+[?/\;&]%[\da-f]{2}\S*(or|and)\S{,30}=\S{,6}%[\da-f]{2}'
42 |   COMMAND EXECUTION:
43 |     flags: I
44 |     regexp:
45 |       - '.+[?/\\;&]%[\da-f]{2}\S*(bash|zsh|cmd|powershell|pwsh|python|perl|wget|curl|netcat|telnet|whoami|uname|net.{1,6}user|ipconfig|ifconfig|shutdown)\S*%[\da-f]{2}'
46 |   PATH TRAVERSAL / LFI-RFI:
47 |     flags: I
48 |     regexp:
49 |       - .+/\S+?(\.\./|\.\.\\)+/\S+
50 |       - .+/\S+?(etc/passwd|proc/self|windows/system32)
51 |       - .+(php|file|zip|data|expect)://
52 |       - .+(include|require)(_once)?.+
53 |   SSRF:
54 |     flags: I
55 |     regexp:
56 |       - \S{3,}(http|https|gopher|dict|file)://([1-2]?\d{1,2}\.[1-2]?\d{1,2}\.[1-2]?\d{,2}\.[1-2]?\d{,2}|localhost|::1)([:/])
57 |       - \S{3,}dnslog\.\S+
58 |   XSS:
59 |     flags: I
60 |     regexp:
61 |       - \S+<\S*(script|iframe|svg)\S*>\S+
62 |       - on(error|load|mouseover|focus|click)\s*=\S+
63 |       - \S+(javascript:.+|document\.cookie|localStorage|sessionStorage)\S+
64 |   LOG4J / JNDI:
65 |     flags: I
66 |     regexp:
67 |       - \S+\$\{\S{,6}(jndi\s*:\s*|ldaps?|rmi|dns)\S+
68 |   WEBSHELL:
69 |     flags: I
70 |     regexp:
71 |       - .+(assert|eval|system|shell_exec|passthru|base64_decode).+$_(POST|REQUEST)
72 |       - .+preg_replace\s*\(.*?/e
73 |   JAVA / PHP DESERIALIZATION:
74 |     flags: I
75 |     regexp:
76 |       - .+java\.io\.Object(Input|Output)Stream
77 |       - .+(org\.apache\.commons\.collections\.|InvalidClassException|StreamCorruptedException)
78 |   NOSQL INJECTION:
79 |     flags: I
80 |     regexp:
81 |       - .+(\$ne|\$gt|\$gte|\$lt|\$lte|\$where)\s*[:=]
82 |       - .+db\.[a-zA-Z0-9_]+\.find\(.*\$where
83 |   SENSITIVE ACCESS:
84 |     flags: I
85 |     regexp:
86 |       - '.+/\S+(/\.git|/\.svn|/swagger)/\S+'
87 |       - '.+/\S+\.(zip|rar|tar|gz)\S*'
88 |   # Common framework
89 |   STRUTS2 / OGNL:
90 |     flags: I
91 |     regexp:
92 |       - '.+%\{[#$].+?\}'
93 |       - .+(redirect|action|method)\s*:.+
94 |   IIS:
95 |     flags: I
96 |     regexp:
97 |       - '.+\.asp.{,3};.{,3}\.[a-z]{2,3}'
98 |   COMPONENT PATHS:
99 |     flags: I
100 |     regexp:
101 |       # WebLogic 常见
102 |       - .+(/wls-wsat/\w+|/uddiexplorer/\w+\.jsp|/ws_utc/\w+\.do|/console/jsp/common/\w+\.jsp|/bea_wls_deployment_internal|/wls-wsat/CoordinatorPortType)
103 |       - ".+/console/(images|css|help|portal)/"
104 |       # Spring Boot / Actuator
105 |       - .+(/actuator|/heapdump)
106 |       # ActiveMQ
107 |       # GitLab
108 |       - .+(/users/password|/-/graphql|/import/gitlab_project)
109 |       # Microsoft SharePoint
110 |       - ".+/_layouts/15/ToolPane.aspx"
111 |       # Jenkins / JBoss / Struts / Nexus / Solr 等
112 |       - .+(/jmx-console|/invoker/JMXInvokerServlet|/struts2-showcase|/service/extdirect|/service/rest|/solr/admin|/solr/select)
113 |       # ===== 国产常见系统(指纹示例,需按环境取舍) =====
114 |       # JeecgBoot
115 |       - .+(/sys/common/upload|/getDictItemsByTable|/onlDragDatasetHead/getTotalData|/novat-boot/sys/user/passwordChange)
116 |       # 若依 RuoYi
117 |       - .+(/ruoyi-admin|/common/upload|/system/user/profile/resetPwd|/sendMessageWithAttachment)
118 |       # 帆软 FineReport
119 |       - .+(/webroot/decision/|/decision/login)
120 |       - ".+/WebReport/ReportServer.*cmd=design_install_reufile"
121 |       # 泛微 E-cology / Weaver
122 |       - .+(/interface/outter/outter_encryptclassOperation.jsp|/api/workflow/reqform/remarkOperate|/plugins/jqueryFileTree/connectors/jqueryFileTree.jsp|/weaver/weaver.email.FileDownloadLocation/login|/mobile/browser/WorkflowCenterTreeData.jsp|/weaver/ln.FileDownload|/weaver/ln.FileUpload|/weaver/weaver.file.FileDownload|/api/integration/datasource/update)
123 |       - ".+/mobilemode/mobile/server.jsp.+invoker=com.api.mobilemode.web.mobile.service.MobileEntranceAction"
124 |       # 致远 Seeyon
125 |       - .+(/seeyon/management/index.jsp|/seeyon/ajax.do|/seeyon/htmlofficeservlet|/seeyon/fileUpload.do)
126 |       # 蓝凌 Landray
127 |       - .+(/ekp/data/sys-common/dataxml.tmpl|/app/login.jsp|/data/sys-common|/sys/ui/extend/varkind/custom.jsp)
128 |       # 用友 Yonyou / NC / BIP / EF
129 |       - .+(/nc.itf.bap.service.IBapIOService|/ebvp/register/qrySubPurchaseOrgByParentPk|/portal/pt/servlet/getFormltem/doPost|/service/FileManageServlet|/bi/api/SemanticModel/GetOlapConnectionList|/uap.pubitf.ae.meta.IMetaWebService4BqCloud|/portal/pt/oacoSchedulerEvents/changeEvent|/ServiceDispatcherServlet)
130 |       - .+/com.ufida.web.action.ActionServlet.+repID=.+
131 |       - .+/Portal/Print/DynamaticExport.aspx.+filePath=.+
132 |       - .+/worksheet/workslist.jsp.+id=.+
133 |       # 金蝶 Kingdee / K3Cloud
134 |       - (/Kingdee.BOS.WebApi.ServicesStub|/CommonFileServer/Upload)
135 |       # 通达 Tongda OA
136 |       - .+(/general/login_code.php|/ispirit/login_code.php|/get_contactlist.php)
137 |       # 群晖 Synology
138 |       - .+(/webapi/auth.cgi|/webman/index.cgi)
139 |       # 深信服 Sangfor(VPN/NGAF 端口差异较大,按需取舍)
140 |       - .+(/fort/portal_login|/tool/log/c.php|/netConfig/set_port)
141 |       # 联软
142 |       - .+(/emm-core/oauth/token)
143 |       # 明源
144 |       - "/PubPlatform/nav/login/sso/login.aspx"
145 |       # 汉王E脸通
146 |       - .+(/manage/mobiVist.+/systemBlackList/uploadBlackListFile.do|/doorInfo/queryDoorInfoList.do.+order=|/manage/antisubmarine/queryAntisubmarineList.do.+order=|/manage/resourceUpload/imgDownload.do.+filePath=|/manage/intercom/.+/firstPeopleOpen/getDoors.do.+order=|/manage/authMultiplePeople/getGroupEmployee.do.+order=|/manage/visitorMapConfig/updateVisitorMapConfig.do|/manage/intercom/.+/resourceUpload/upload.do|/manage/leaveList/monadFileUpload.do)
147 |       # 扁鹊医疗
148 |       - .+(/WebServiceForFirstaidApp.asmx/GetMonitorList|/WebServiceForFirstaidApp.asmx/GetLyfsByParams)
149 |       # 时空智友ERP
150 |       - '.+/formservice.*service=updater.uploadStudioFile'
151 |       - '.+/formservice.*(service=attachment.write|filename=)'
152 |       # 联想/绿盟/金和/明源/大华/浪潮云/华测/Unibox/Richmail/PWS/Letta 待收集
153 | target_path: ""
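A quick sketch of how the `re_filter_content` value above is intended to behave (illustrative log lines; the real filtering loop lives in sensitive-helper.py): lines whose HTTP status is 3xx/4xx/5xx are skipped, so only requests the server answered successfully reach the rule matching.

```python
import re

# emergency.yaml: re_filter_content: '\s+(3\d\d|4\d\d|5\d\d)\s+'
re_filter = re.compile(r'\s+(3\d\d|4\d\d|5\d\d)\s+')
log_lines = [
    '1.2.3.4 - - "GET /index.php?id=1%20union%20select%201 HTTP/1.1" 200 512',
    '1.2.3.4 - - "GET /index.php?id=1%20union%20select%201 HTTP/1.1" 403 128',
]
for line in log_lines:
    if re_filter.search(line):
        continue  # filter hit: the whole line is skipped
    print('would scan:', line)  # only the "200" line is scanned further
```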
--------------------------------------------------------------------------------
/README_EN.md:
--------------------------------------------------------------------------------
1 | Reference:
2 | 
3 | # Sensitive Helper
4 | 
5 | [简体中文](./README.md) | English
6 | 
7 | A regular-expression-based assistant for mining sensitive information from local files. If you want to search for sensitive data on a web page, export the data locally first and then search it there. The use of multiprocessing and the configuration format have both been streamlined.
8 | 
9 | **Note**: If the default rules do not meet your matching needs, please adjust the `rules` section of the configs.yaml file.
10 | 
11 | # Quick start
12 | 
13 | ### Required
14 | 
15 | + python >= 3.6
16 | 
17 | In the project directory, use the following command to install the dependencies
18 | 
19 | ```bash
20 | pip3 install toml PyYAML tqdm pandas rarfile py7zr openpyxl
21 | ```
22 | 
23 | Or install them via pip's `-r` requirements option
24 | 
25 | ```bash
26 | pip3 install -r requirements.txt
27 | ```
28 | 
29 | ### Basic usage
30 | 
31 | Use the `-t` parameter to search directly on the target path.
32 | 
33 | ```python3 sensitive-helper.py -t <your search file path>```
34 | 
35 | To exclude certain types of files, use the `-e` parameter. Note that regular expressions are matched against file names: for example, if the program finds the file /tmp/aaa.so and you do not want to scan the `.so` file type, you can pass the regular expression `.*so`; the program matches the string `aaa.so` against `.*so` and filters such files out.
36 | 
37 | ```python3 sensitive-helper.py -t <your search file path> -e ".*so" ".*gz"```
38 | 
39 | If search speed seems too slow, you can use the `-p` parameter to adjust the number of search processes (default: 12) and improve performance. While Python's multiprocessing is subpar, it's better than nothing. Note that each process handles only one file at a time. When scanning large text files, split them first with `cut_text_files.py` to speed things up. A good value for this parameter is the number of CPU cores; for a more aggressive setting, use CPU cores * 2.
40 | 
41 | **Note**: On a low-end machine, do not go above 20 processes; the program performs heavy IO and memory operations, and the machine may crash (mine did)...
42 | 
43 | ```python3 sensitive-helper.py -t <your search file path> -p 20```
44 | 
45 | If you want to save the results, use the `-o` parameter to write them out in json format.
46 | 
47 | ```python3 sensitive-helper.py -t <your search file path> -o results.json```
48 | 
49 | By default, the program stops searching the current file as soon as one regular expression matches. You can use the `-a` parameter to force the program to run every regular expression, uncovering more potentially useful data.
50 | 
51 | ```python3 sensitive-helper.py -t <your search file path> -a```
52 | 
53 | **Note**: The program has built-in default matching rules, which are prioritized as follows: default configuration < configs.yaml configuration < user input configuration
54 | 
55 | ### Usage
56 | 
57 | ```bash
58 | % python3 sensitive-helper.py -h
59 | usage: sensitive-helper.py [-h] [-t TARGET_PATH] [-p PROCESS_NUMBER] [-c CONFIG_PATH] [-o OUTPUT_FORMAT] [-e EXCLUDE_FILES [EXCLUDE_FILES ...]] [-a] [-s] [-r RE_FILTER_CONTENT]
60 | 
61 | ███████╗███████╗███╗ ██╗███████╗██╗████████╗██╗██╗ ██╗███████╗
62 | ██╔════╝██╔════╝████╗ ██║██╔════╝██║╚══██╔══╝██║██║ ██║██╔════╝
63 | ███████╗█████╗ ██╔██╗ ██║███████╗██║ ██║ ██║██║ ██║█████╗
64 | ╚════██║██╔══╝ ██║╚██╗██║╚════██║██║ ██║ ██║╚██╗ ██╔╝██╔══╝
65 | ███████║███████╗██║ ╚████║███████║██║ ██║ ██║ ╚████╔╝ ███████╗
66 | ╚══════╝╚══════╝╚═╝ ╚═══╝╚══════╝╚═╝ ╚═╝ ╚═╝ ╚═══╝ ╚══════╝
67 | v0.1.6
68 | by 0xn0ne, https://github.com/0xn0ne/sensitive-helper
69 | 
70 | options:
71 |   -h, --help            show this help message and exit
72 |   -t, --target-path TARGET_PATH
73 |                         search for file paths or folder paths for sensitive cache (eg. ~/download/folder).
74 |   -p, --process-number PROCESS_NUMBER
75 |                         number of program processes (default: 12).
76 |   -c, --config-path CONFIG_PATH
77 |                         path to the yaml configuration file (default: configs.yaml).
78 |   -o, --output-format OUTPUT_FORMAT
79 |                         output file format, available formats json, csv (default: csv).
80 |   -e, --exclude-files EXCLUDE_FILES [EXCLUDE_FILES ...]
81 |                         excluded files, using regular matching (eg. \.DS_Store .*bin .*doc).
82 |   -a, --is-re-all       hit a single regular expression per file or match all regular expressions to exit the match loop.
83 |   -s, --is-silent       silent mode: when turned on, no hit data will be output on the console. use a progress bar instead.
84 |   -r, --re-filter-content RE_FILTER_CONTENT
85 |                         filter regular expression. if a regular expression is hit during the string matching process of each line, skip the matching of that line directly
86 | ```
87 | 
88 | ### Cybersecurity Emergency Examples
89 | 
90 | Thank you to our cybersecurity colleagues for their suggestions. This tool can also be used for rapid matching of common cyberattack signatures. It is not suited to complex attacks, such as those inside POST request bodies, 0-day exploits, or attacks on unusual network paths, so use it judiciously.
91 | 
92 | When used for cybersecurity emergency response, scan middleware and application log directories directly to quickly extract suspected attack traces. Since attack-signature matching differs from sensitive-data matching in several respects, a separate configuration file, `emergency.yaml`, is used to keep the rule sets apart.
93 | 
94 | + Supported common attack fingerprint groups (adjustable in the `rules` section of `emergency.yaml`):
95 |   + SQL INJECTION
96 |   + COMMAND EXECUTION
97 |   + PATH TRAVERSAL / LFI-RFI
98 |   + SSRF
99 |   + XSS
100 |   + LOG4J / JNDI
101 |   + WEBSHELL
102 |   + JAVA / PHP DESERIALIZATION
103 |   + NOSQL INJECTION
104 |   + SENSITIVE ACCESS
105 | 
106 | Example: Scan the Nginx/Apache/Tomcat application log directories and output all suspicious attack requests.
107 | 
108 | ```bash
109 | python3 sensitive-helper.py -t /var/log/nginx -a -s -c emergency.yaml
110 | python3 sensitive-helper.py -t /var/log/httpd -a -s -c emergency.yaml
111 | python3 sensitive-helper.py -t /opt/tomcat/logs -a -s -c emergency.yaml
112 | ```
113 | 
114 | Recommendations:
115 | + If you need archived logs expanded, do not exclude compressed archives (the program will attempt to decompress them recursively).
116 | + For exceptionally large log volumes, we recommend enabling the progress bar with `-s` and appropriately increasing the number of processes with `-p`.
117 | 
118 | ### Sample: Default Mode
119 | 
120 | ```bash
121 | $ python3 sensitive-helper.py -t "cache/" -a
122 | [*] file loading...
123 | [*] analyzing...
124 | 
125 | [+] group: FUZZY MATCH, match: AppId":"123456", file: cache/heapdump
126 | [+] group: BASE64, match: ZjY2MTQyNDEtYTIyYS00YjNlLTg1NTgtOTQ4NmUwZDFkZjM1, file: cache/heapdump
127 | [+] group: FUZZY MATCH, match: password":"123456", file: cache/heapdump
128 | [+] group: FILE PATH, match: C:\Windows\system32\drivers, file: cache/heapdump-BAK
129 | [+] group: URL, match: http://hello.world/123456.jpg, file: cache/heapdump-BAK
130 | total file number: 5
131 | ```
132 | 
133 | ### Sample: Silent Mode
134 | 
135 | ```bash
136 | $ python3 sensitive-helper.py -t "cache/" -a -s
137 | [*] file loading...
138 | [*] analyzing...
139 | 
140 | 53792/53792 [██████████████████████████████████████████] 00:28<00:00,1856.73it/s
141 | total file number: 53792
142 | ```
143 | 
144 | # Q&A
145 | 
146 | + Q: Why is there no sensitive-data search for web pages?
147 | + A: Because web pages change constantly; a modified API endpoint, css class, or id could all require code updates. It is better to export the data locally and process it uniformly with text-based recognition.
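For reference, the optional `flags` key on a rule group (e.g. `I|X` on `FILE PATH` in configs.yaml) is turned into `re` module flags by `string_to_reg_flags` in sensitive-helper.py; a self-contained sketch of that same logic:

```python
import re

def string_to_reg_flags(flags: str) -> int:
    # 'I|X' -> re.I | re.X
    flags_int = 0
    for flag in flags.split('|'):
        flags_int |= getattr(re, flag)
    return flags_int

pattern = re.compile(r'users? | windows?', string_to_reg_flags('I|X'))
print(bool(pattern.search(r'C:\Users')))  # True: I ignores case, X ignores pattern whitespace
```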
148 | -------------------------------------------------------------------------------- /utils/compress.py: -------------------------------------------------------------------------------- 1 | #!/bin/python3 2 | # _*_ coding:utf-8 _*_ 3 | # 4 | """ 5 | compress.py 6 | 7 | 压缩文件处理工具:识别并解压常见压缩/打包格式(zip/tar/gz/7z/rar)。 8 | 9 | 功能要点: 10 | - zip_info: 读取 zip 本地文件头判断是否 zip 以及压缩方式; 11 | - uncompress_*: 各格式的解压实现,输出到指定目录; 12 | - uncompress: 统一入口,自动识别并(可选)递归解压嵌套文件。 13 | 14 | 依赖: 15 | - 7z: 需要 `py7zr` 16 | - rar: 需要安装系统工具(Windows: WinRAR 并在 PATH 中;Linux: unrar) 17 | 18 | 参考:`https://segmentfault.com/a/1190000007495352` 19 | """ 20 | 21 | import gzip 22 | import pathlib 23 | import tarfile 24 | import zipfile 25 | from typing import Any, Dict, Union 26 | 27 | import py7zr 28 | import rarfile 29 | 30 | 31 | def get_zip_info(file_path: pathlib.Path) -> Dict[str, Any]: 32 | """读取 zip 文件头信息,返回是否为 zip 及压缩方式。""" 33 | ret = {'is_zip': False, 'compression': -1} 34 | with open(file_path, 'rb') as _f: 35 | byte_info = _f.read(30) 36 | ret['is_zip'] = byte_info[:4] == b'PK\x03\x04' 37 | ret['compression'] = int.from_bytes(byte_info[8:10], 'little') 38 | return ret 39 | 40 | 41 | def uncompress_zip( 42 | file_path: Union[pathlib.Path, str], extract_dir: Union[pathlib.Path, str] = '', compression: int = 0 43 | ) -> Union[pathlib.Path, Any]: 44 | """解压 zip 文件到 `extract_dir`。自动使用本地头中的压缩方式。""" 45 | if isinstance(file_path, str): 46 | file_path = pathlib.Path(file_path) 47 | if not extract_dir: 48 | extract_dir = file_path.parent.joinpath('un_' + file_path.name) 49 | if isinstance(extract_dir, str): 50 | extract_dir = pathlib.Path(extract_dir) 51 | 52 | # extract_dir = extract_dir.joinpath(file_path.name) 53 | extract_dir.mkdir(parents=True, exist_ok=True) 54 | 55 | with zipfile.ZipFile(file_path, 'r', compression=compression) as _f: 56 | for extr_name in _f.namelist(): 57 | _f.extract(extr_name, extract_dir.__str__()) 58 | extract_dir.joinpath(extr_name).rename(extract_dir.joinpath(extr_name.encode('cp437').decode('gbk'))) 59 | return extract_dir 60 | 61 | 62 | def is_tar(file_path: pathlib.Path): 63 | """通过魔数判断是否为 tar 文件。""" 64 | with open(file_path, 'rb') as _f: 65 | if _f.read(262)[-5:] == b'ustar': 66 | return True 67 | return False 68 | 69 | 70 | def uncompress_tar( 71 | file_path: Union[pathlib.Path, str], extract_dir: Union[pathlib.Path, str] = '' 72 | ) -> Union[pathlib.Path, Any]: 73 | """解包 tar/tar.* 文件到 `extract_dir`。""" 74 | if isinstance(file_path, str): 75 | file_path = pathlib.Path(file_path) 76 | if not extract_dir: 77 | extract_dir = file_path.parent.joinpath('un_' + file_path.name) 78 | if isinstance(extract_dir, str): 79 | extract_dir = pathlib.Path(extract_dir) 80 | 81 | # extract_dir = extract_dir.joinpath(file_path.name) 82 | extract_dir.mkdir(parents=True, exist_ok=True) 83 | 84 | # tarfile.ReadError: file could not be opened successfully 85 | with tarfile.open(file_path) as _f: 86 | for extr_name in _f.getnames(): 87 | _f.extract(extr_name, extract_dir) 88 | return extract_dir 89 | 90 | 91 | def is_gz(file_path: pathlib.Path): 92 | """通过魔数判断是否为 gzip 文件。""" 93 | with open(file_path, 'rb') as _f: 94 | if _f.read(2) == b'\x1f\x8b': 95 | return True 96 | return False 97 | 98 | 99 | def uncompress_gz( 100 | file_path: Union[pathlib.Path, str], extract_dir: Union[pathlib.Path, str] = '' 101 | ) -> Union[pathlib.Path, Any]: 102 | """解压 gzip 文件;若内部为 tar 则继续调用 tar 解包。""" 103 | if isinstance(file_path, str): 104 | file_path = pathlib.Path(file_path) 105 | if not extract_dir: 106 | extract_dir = 
file_path.parent.joinpath('un_' + file_path.name)
107 |     if isinstance(extract_dir, str):
108 |         extract_dir = pathlib.Path(extract_dir)
109 | 
110 |     # extract_dir = extract_dir.joinpath(file_path.name)
111 |     extract_dir.mkdir(parents=True, exist_ok=True)
112 |     extract_file = extract_dir.joinpath(file_path.name)
113 | 
114 |     with gzip.open(file_path, 'rb') as gz_f:
115 |         with open(extract_file, 'wb+') as _f:
116 |             _f.write(gz_f.read())
117 |     if is_tar(extract_file):
118 |         return uncompress_tar(extract_file, extract_dir)
119 |     return extract_dir
120 | 
121 | 
122 | def is_7z(file_path: pathlib.Path):
123 |     """通过魔数判断是否为 7z 文件。"""
124 |     with open(file_path, 'rb') as _f:
125 |         if _f.read(6) == b'7z\xbc\xaf\x27\x1c':
126 |             return True
127 |     return False
128 | 
129 | 
130 | def uncompress_7z(
131 |     file_path: Union[pathlib.Path, str], extract_dir: Union[pathlib.Path, str] = ''
132 | ) -> Union[pathlib.Path, Any]:
133 |     """解压 7z 文件到 `extract_dir`。"""
134 |     if isinstance(file_path, str):
135 |         file_path = pathlib.Path(file_path)
136 |     if not extract_dir:
137 |         extract_dir = file_path.parent.joinpath('un_' + file_path.name)
138 |     if isinstance(extract_dir, str):
139 |         extract_dir = pathlib.Path(extract_dir)
140 | 
141 |     extract_dir.mkdir(parents=True, exist_ok=True)
142 | 
143 |     with py7zr.SevenZipFile(file_path, mode='r') as _f:
144 |         _f.extractall(extract_dir)
145 |     return extract_dir
146 | 
147 | 
148 | def is_rar(file_path: pathlib.Path):
149 |     """通过魔数判断是否为 RAR 文件。"""
150 |     with open(file_path, 'rb') as _f:
151 |         if _f.read(4) == b'\x52\x61\x72\x21':
152 |             return True
153 |     return False
154 | 
155 | 
156 | def uncompress_rar(
157 |     file_path: Union[pathlib.Path, str], extract_dir: Union[pathlib.Path, str] = ''
158 | ) -> Union[pathlib.Path, Any]:
159 |     """
160 |     解压 rar 文件在 windows 上需要安装 winrar,并配置好环境变量;linux 上需要安装 unrar,并配置好环境变量
161 |     否则会报出 rarfile.RarCannotExec: Cannot find working tool 错误
162 |     """
163 |     if isinstance(file_path, str):
164 |         file_path = pathlib.Path(file_path)
165 |     if not extract_dir:
166 |         extract_dir = file_path.parent.joinpath('un_' + file_path.name)
167 |     if isinstance(extract_dir, str):
168 |         extract_dir = pathlib.Path(extract_dir)
169 | 
170 |     extract_dir.mkdir(parents=True, exist_ok=True)
171 | 
172 |     with rarfile.RarFile(file_path) as _f:
173 |         # _f.extractall(extract_dir)
174 |         for extr_name in _f.namelist():
175 |             _f.extract(extr_name, extract_dir)
176 |     return extract_dir
177 | 
178 | 
179 | def is_bz(file_path: pathlib.Path):
180 |     """通过魔数判断是否为 bzip2 文件。"""
181 |     with open(file_path, 'rb') as _f:
182 |         if _f.read(3) == b'\x42\x5a\x68':  # bzip2 魔数 b'BZh' 共 3 字节
183 |             return True
184 |     return False
185 | 
186 | 
187 | def uncompress(
188 |     file_path: Union[pathlib.Path, str],
189 |     extract_dir: Union[pathlib.Path, str] = '',
190 |     is_error: bool = True,
191 |     is_recursive: bool = False,
192 |     max_level=64,
193 | ) -> Union[pathlib.Path, Any]:
194 |     """统一解压入口,自动识别并可递归解压。
195 | 
196 |     支持格式:gz/tar/7z/zip/rar。
197 |     当 `is_recursive=True` 时,将在 `max_level` 限制内递归处理嵌套压缩。
198 |     """
199 |     if not isinstance(file_path, pathlib.Path):
200 |         file_path = pathlib.Path(file_path)
201 |     if not extract_dir:
202 |         extract_dir = file_path.parent.joinpath('un_' + file_path.name)
203 |     if not isinstance(extract_dir, pathlib.Path):
204 |         extract_dir = pathlib.Path(extract_dir)
205 | 
206 |     if not file_path.is_file():
207 |         if is_error:
208 |             raise ValueError('{} is not a file.'.format(file_path))
209 |         return
210 | 
211 |     ret = None
212 |     file_info = get_zip_info(file_path)
213 |     if file_info['is_zip']:
214 |         ret = 
uncompress_zip(file_path, extract_dir, file_info['compression']) 215 | elif is_gz(file_path): 216 | ret = uncompress_gz(file_path, extract_dir) 217 | elif is_tar(file_path): 218 | ret = uncompress_tar(file_path, extract_dir) 219 | elif is_7z(file_path): 220 | ret = uncompress_7z(file_path, extract_dir) 221 | elif is_rar(file_path): 222 | ret = uncompress_rar(file_path, extract_dir) 223 | elif is_error: 224 | raise ValueError('{} is not a compressed file.'.format(file_path)) 225 | 226 | if is_recursive and ret and max_level > 0: 227 | for it in ret.glob('**/*'): 228 | uncompress(it, ret.joinpath('un_' + it.name), is_error, is_recursive, max_level - 1) 229 | return ret 230 | 231 | 232 | if __name__ == '__main__': 233 | # print(zip_info(pathlib.Path('cache/utils.zip'))) 234 | # print(uncompress_zip('cache/utils.zip')) 235 | # print(is_tar(pathlib.Path('cache/utils.tar'))) 236 | # print(uncompress_tar('cache/utils.tar')) 237 | # print(is_gz(pathlib.Path('cache/utils.tgz'))) 238 | # print(uncompress_gz('cache/utils.tgz')) 239 | # print(is_7z(pathlib.Path('cache/utils.7z'))) 240 | # print(uncompress_7z('cache/utils.7z')) 241 | # print(is_rar(pathlib.Path('cache/utils.rar'))) 242 | # print(uncompress_rar('cache/utils.rar')) 243 | print(uncompress('cache/utils.xlsx', is_error=False, is_recursive=True)) 244 | pass 245 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

--------------------------------------------------------------------------------
/sensitive-helper.py:
--------------------------------------------------------------------------------
#!/bin/python3
# _*_ coding:utf-8 _*_
#
"""
sensitive-helper.py

Local sensitive-information search tool:
- walks a target directory or file, parsing or unpacking common office and archive formats where needed;
- scans concurrently with a process pool, matching potentially sensitive data against rules;
- writes the hits to CSV or JSON.

Main modules:
- utils.compress: detect and extract various archive formats
- utils.office: parse .docx/.xlsx content into scannable text
- utils.process: process-pool helpers
- utils.configurator: simple configuration loading and merging
"""

import base64
import binascii
import csv
import json
import pathlib
import re
import time
from typing import Any, AnyStr, Dict, List, Union

import pandas
import tqdm

from utils import compress, configurator, office, process


def log_run_times(func):
    """Decorator: log calls that take longer than 1 second. Debug-only helper for
    spotting which steps burn compute time; safe to ignore in normal use.

    Entries go to `run_times.log` with the elapsed time and the first 127
    characters of the first argument.
    """

    def wrapper(*args, **kwargs):
        s_time = time.time()
        ret = func(*args, **kwargs)
        total_time = time.time() - s_time
        if total_time <= 1:
            return ret
        with open("run_times.log", "a") as _f:
            _f.write("total time(s): {}, args: {}\n".format(total_time, args[0][:127]))
        return ret

    return wrapper


def string_to_reg_flags(flags: str):
    """Convert a regex flag string such as "I|M|S" into the combined `re` flag integer."""
    flags_int = 0
    for flag in flags.split("|"):
        flags_int |= getattr(re, flag)
    return flags_int


def is_filter_base64(result: AnyStr):
    """Validate and try to decode a Base64 fragment.

    Returns (is_filter, extend_text):
    - is_filter is True when the hit should be dropped (invalid/unreadable/non-conforming);
    - extend_text is the decoded readable text when the hit is kept.
    """
    if len(result) % 4 != 0:
        return True, ""
    try:
        # Drop anything that fails to decode; it would be unreadable anyway.
        ret_extend = base64.b64decode(result).decode("utf-8")
        if not re.search(
            r"^[\u0020-\u007F\u2010-\u202f\u3000-\u301f\u4e00-\u9fa5\uff00-\uffef]+$",
            ret_extend,
        ):
            return True, ""
        # \u0020-\u007F: visible ASCII characters
        # \u2010-\u202f: some Chinese punctuation
        # \u3000-\u301f: some Chinese punctuation
        # \u4e00-\u9fa5: common Chinese characters
        # \u2e80-\u9fff: Chinese characters including variant forms
        # \uff00-\uffef: some Chinese punctuation (full-width forms)
    except UnicodeDecodeError:
        return True, ""
    except binascii.Error:
        return True, ""
    return False, ret_extend


def is_filter_jwt(result: AnyStr):
    """Coarse JWT structure check: the first two Base64 segments must have valid lengths."""
    times = 0
    res_split = result.split(b".")  # type: ignore
    while times < 2:
        if len(res_split[times]) % 4 != 0:
            return True, ""
        times += 1
    return False, ""


def is_filter_result(result: AnyStr, filters: List[AnyStr], flags: int):
    """Second-pass filtering against `re_filters`; any hit means the result is dropped."""
    if not filters:
        return False, ""
    for fil in filters:
        if re.search(fil, result, flags):  # type: ignore
            return True, ""
    return False, ""


# @log_run_times
def search_content(
    file_object: Union[pathlib.Path, bytes],
    rules: Dict[str, List[str]],
    split: bytes = b"[\x00-\x1f\x7f]+",
    re_filter_content: bytes = rb"",
    is_re_all: bool = False,
    is_silent: bool = False,
) -> List[Dict[str, str]]:
    """Scan the content of a single file object and return the list of hits.

    Args:
    - file_object: a `Path` or bytes; for a Path, the raw bytes are split into rows on control characters;
    - rules: rule dict whose values are either regex lists or dicts carrying flags/re_filters/regexp;
    - split: row-splitting regex (bytes);
    - re_filter_content: skip filter; any row matching this regex is skipped entirely;
    - is_re_all: whether to keep matching the file against the remaining rules after the first hit.

    Returns:
    - list items carrying the file/group/regexp/match/extend fields.
    """
    ret = []
    row_contents = [file_object]
    if isinstance(file_object, pathlib.Path):
        row_contents = re.split(split, file_object.read_bytes())

    # Build the per-file row-scanning progress bar.
    file_name = str(file_object) if isinstance(file_object, pathlib.Path) else "bytes"
    result_gen = enumerate(row_contents, start=1)
    if is_silent:
        result_gen = tqdm.tqdm(
            enumerate(row_contents, start=1),
            total=len(row_contents),
            desc=f"file: {file_name.split('/')[-1][:16]}..",
            leave=False,  # do not keep the bar, to avoid clashing with the main progress bar
            ncols=100,
            bar_format="{desc}:{percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
        )

    for index, row_one in result_gen:
        # Rows were split on control characters above.
        if len(row_one) < 12:
            # Drop rows shorter than 12 characters.
            continue
        if re_filter_content and re.search(re_filter_content, row_one):
            continue
        for rule_name in rules:
            rule = rules[rule_name]
            flags = 0
            filters = None
            if isinstance(rule, Dict):
                if "flags" in rule:
                    flags = string_to_reg_flags(rule["flags"])  # type: ignore
                if "re_filters" in rule:
                    filters = rule["re_filters"]  # type: ignore
                rule = rule["regexp"]  # type: ignore
            for regexp in rule:
                r_result = re.search(regexp, row_one, flags)  # type: ignore
                if not r_result:
                    continue
                try:
                    result_byte = r_result.group()
                    result_text = result_byte.decode("utf-8")
                except UnicodeDecodeError:
                    continue
                is_filter, extend = is_filter_result(result_byte, filters, flags)  # type: ignore
                if rule_name == "BASE64":
                    is_filter, extend = is_filter_base64(result_byte)
                if rule_name == "JSON WEB TOKEN(JWT)":
                    is_filter, extend = is_filter_jwt(result_byte)
                if is_filter:
                    continue

                ret.append(
                    {
                        "file": f"{file_object.__str__()}:{index}",
                        "group": rule_name,
                        "regexp": regexp.decode("utf-8"),  # type: ignore
                        "match": result_text,
                        "extend": extend,
                    }
                )
                if not is_re_all:
                    # Matching every rule group is disabled and a hit was found, so stop scanning this file.
                    if hasattr(result_gen, 'close'):
                        result_gen.close()  # type: ignore  # close the progress bar
                    return ret

    if hasattr(result_gen, 'close'):
        result_gen.close()  # type: ignore  # close the progress bar
    return ret


def gen_file_list(src_path: str, exclude_files: List[str]) -> List[pathlib.Path]:
    """Build the list of files to scan, pre-processing specific types.

    - `.docx`/`.xlsx`: parse the content into a sibling `_resolved.txt` so it can be scanned later;
    - everything else: try recursive extraction to surface nested content.
    """
    tar_path = pathlib.Path(src_path)
    ret = []
    if tar_path.is_file():
        ret.append(tar_path)
    else:
        for filepath in tar_path.glob("**/*"):
            is_skip = False
            if filepath.is_dir():
                continue
            filename = filepath.name
            for r_exclude in exclude_files:
                # Regex match on the file name; skip anything on the exclude list.
                if re.match(r_exclude, filename):
                    is_skip = True
                    break
            if is_skip:
                continue
            if filename.endswith(".docx") and not filename.startswith("~$"):
                office.docx_handler(filepath)
            elif filename.endswith(".xlsx") and not filename.startswith("~$"):
                office.xlsx_handler(filepath)
            else:
                compress.uncompress(filepath, is_error=False, is_recursive=True)
            ret.append(filepath)
    return ret


def run():
    """Main flow: build the task queue, scan concurrently, de-duplicate, and write the results."""
    pool = process.ProcessPoolHelper(max_workers=CFG.get("process_number"))
    print("[*] file loading...")
    filelist = gen_file_list(CFG.get("target_path"), CFG.get("exclude_files"))  # type: ignore
    if not filelist:
        print("[!] the file path is empty. please check whether the path is correct.\n")
        return
    filelist = sorted(filelist, key=lambda x: x.stat().st_size, reverse=True)
    ret = []
    result_filter_list = []
    print(f"[*] found {len(filelist)} files.")
    groups = CFG.get("rules")
    for filepath in filelist:
        pool.submit_super(
            search_content,
            filepath,
            groups,
            CFG.get("row_split"),
            CFG.get("re_filter_content"),
            CFG.get("is_re_all"),
            CFG.get("is_silent"),
        )

    print("[*] analyzing...\n")
    result_gen = pool.result_yield()
    if CFG.get("is_silent"):
        result_gen = tqdm.tqdm(
            result_gen,
            total=len(filelist),
            mininterval=1,
            ncols=100,
            bar_format="{n_fmt}/{total_fmt} [{bar}] {elapsed}<{remaining},{rate_fmt}{postfix}",
        )
    for results in result_gen:
        if not results:
            continue
        for result in results:
            union_data = [result["file"], result["match"]]
            # De-duplicate identical match strings from the same file.
            if union_data in result_filter_list:
                continue
            result_filter_list.append(union_data)
            ret.append(result)
            if not CFG.get("is_silent"):
                print("[+] group: {}, match: {}, file: {}".format(result["group"], result["match"], result["file"]))
    output_format = CFG.get("output_format")
    filename = "results_{}.csv".format(time.strftime("%H%M%S", time.localtime()))
    if output_format == "json":
        filename = "results.json"
        with open(filename, "w", encoding="utf-8") as _f:
            _f.write(json.dumps(ret))
    else:
        to_csv(ret, filename)

    print("[*] total file number:", len(filelist))
    print("[+] output to:", pathlib.Path(filename).absolute())
    return ret


def to_csv(data: Union[Dict[str, Any], List[Dict[str, Any]]], filename: str = "output.csv"):
    """Write the result list to a CSV file."""
    dataframe = pandas.DataFrame(data)
    dataframe.to_csv(filename, quoting=csv.QUOTE_MINIMAL)


# Matches dict-style, list-style, and function-call-style assignments.
FUZZY_UNIVERSAL_STRING = r'["\'`]?\s*[=:(\{\[]\s*["\'`][\x20-\x7F]{,128}?[\'"`]'

PATH_COMMON_STRING = r"users?|windows?|program files(\(x\d{2,3}\))?|s?bin|etc|usr|boot|dev|home|proc|opt|sys|srv|var"

__DEFAULT_CONFIG = {
    "target_path": {
        "__help": "search for file paths or folder paths for sensitive cache (eg. ~/download/folder).",
    },
    "process_number": {
        "__type": int,
        "__default": 12,
        "__help": "number of program processes (default: 12).",
    },
    "config_path": {
        "__default": "config.yaml",
        "__help": "path to the yaml configuration file (default: config.yaml).",
    },
    "output_format": {
        "__default": "csv",
        "__help": "output file format, available formats json, csv (default: csv).",
    },
    "exclude_files": {
        "__default": [r"\.DS_Store"],
        "__nargs": "+",
        "__help": "excluded files, using regular matching (eg. \\.DS_Store .*bin .*doc).",
    },
    "is_re_all": {
        "__flags": ['-a', '--is-re-all'],
        "__action": "store_true",
        "__help": "when set, keep matching every regular expression group instead of exiting the match loop after the first hit per file.",
    },
    "is_silent": {
        "__flags": ['-s', '--is-silent'],
        "__action": "store_true",
        "__help": "silent mode: when enabled, hits are not printed to the console; a progress bar is shown instead.",
    },
    "re_filter_content": {
        "__help": "filter regular expression: any row that matches it is skipped entirely during scanning.",
    },
    "row_split": "[\x00-\x1f\x7f]+",
    "rules": {
        "AKSK": [
            r"[\s\n\'\"`=:#]LTAI\w{12,20}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#](A3T[A-Z0-9]|ABIA|ACCA|AGPA|AIDA|AIPA|AKIA|ANPA|ANVA|APKA|AROA|ASCA|ASIA)[0-9A-Z]{16}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]GOOG\w{10,30}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]AZ[A-Za-z0-9]{34,40}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]IBM[A-Za-z0-9]{10,40}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#][a-zA-Z0-9]{8}(-[a-zA-Z0-9]{4}){3}-[a-zA-Z0-9]{12}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]OCID[A-Za-z0-9]{10,40}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]LTAI[A-Za-z0-9]{12,20}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]AKID[A-Za-z0-9]{13,20}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]AK[A-Za-z0-9]{10,40}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]JDC_[A-Z0-9]{28,32}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]AKLT[a-zA-Z0-9-_]{0,252}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]UC[A-Za-z0-9]{10,40}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]QY[A-Za-z0-9]{10,40}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]AKLT[a-zA-Z0-9-_]{16,28}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]LTC[A-Za-z0-9]{10,60}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]YD[A-Za-z0-9]{10,60}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]CTC[A-Za-z0-9]{10,60}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]YYT[A-Za-z0-9]{10,60}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]YY[A-Za-z0-9]{10,40}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]CI[A-Za-z0-9]{10,40}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]gcore[A-Za-z0-9]{10,30}[\s\n\'\"`=:#]",
        ],
        "JSON WEB TOKEN(JWT)": [r"ey[0-9a-zA-Z/+]{4,}={,2}\.[0-9a-zA-Z/+]{6,}={,2}\.[A-Za-z0-9-_]+"],
        "FUZZY MATCH": {
            "flags": "I",
            "regexp": [
                r"(APP|ACCESS|USER|PASS|OSS|ECS|CVM|AWS)[\w]{,8}(NAME|ID|KEY|NUM|ENC|CODE|SEC|WORD)[\w]{,16}%s"
                % FUZZY_UNIVERSAL_STRING,
                # Handles camelCase and snake_case: for map-style keys the word must be followed by an
                # uppercase letter, underscore, or hyphen, otherwise unwanted matches can slip in.
                r"(USR|PWD|COOKIE)[_\-A-Z][\w]{,16}%s" % FUZZY_UNIVERSAL_STRING,
                r"(SECRET|SIGN|TOKEN)[\w]{,16}%s" % FUZZY_UNIVERSAL_STRING,
            ],
        },
        "BASE64": [r"[0-9a-zA-Z/+]{8,}={,2}"],
        "URL": {
            "regexp": [r"(ftp|https?):\/\/[%.\w\-]+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?"],
            "re_filters": [
                r"(adobe|amap|android|apache|bing|digicert|eclipse|freecodecamp|github|githubusercontent|gnu|godaddy|google|googlesource|youtube|youtu|jd"
                r"|npmjs|microsoft|openxmlformats|outlook|mozilla|openssl|oracle|qq|spring|sun|umang|w3|wikipedia|xml)\.("
                r"org|com|cn|net|edu|io|be)",
                r"(ali|baidu|cdn|example|ssh|ssl)[\w-]*\.(org|com|cn|net|edu|io)",
            ],
        },
        "EMAIL": [r"[a-zA-Z0-9][-+.\w]{1,127}@([a-zA-Z0-9][-a-zA-Z0-9]{0,63}.){,3}(org|com|cn|net|edu|mail)"],
        "PHONE": [r"(13[0-9]|14[5-9]|15[0-3,5-9]|16[6]|17[0-8]|18[0-9]|19[8,9])\d{8}"],
        "FILE PATH": {
            "flags": "I|X",
            "regexp": [
                # Note: the space in "program files" is escaped because re.X ignores unescaped whitespace.
                r"([a-z]:\\)?([\\/])(users?|windows?|program\ files(\(x\d{2,3}\))?|s?bin|etc|usr|boot|dev|home|proc|opt"
                r"|sys|srv|var)(\2[.\w!#\(~\[\{][.\w!#&\(\)+=~\[\]\{\}\s]{2,63}){1,16}"
            ],
            "re_filters": [
                # r'[\\/].*sdk.*',
                # r'[\\/](alibaba|aliyun|annotation|apache|chromium|collections|eclipse|facebook|functions|github|google'
                # r'|internal|jetbrains|oppo|reactnative|reflect|sdklib|sequences|taobao|tencent|unionpay|view|vivo'
                # r'|webkit|xiaomi)',
            ],
        },
    },
}

# Module-level placeholder configuration; __main__ replaces it with the fully parsed CLI configuration.
CFG = configurator.CliConfigurator({})

if __name__ == "__main__":
    import argparse

    CFG = configurator.new(
        configurator.CliConfigurator,
        template=__DEFAULT_CONFIG,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="""
███████╗███████╗███╗   ██╗███████╗██╗████████╗██╗██╗   ██╗███████╗
██╔════╝██╔════╝████╗  ██║██╔════╝██║╚══██╔══╝██║██║   ██║██╔════╝
███████╗█████╗  ██╔██╗ ██║███████╗██║   ██║   ██║██║   ██║█████╗
╚════██║██╔══╝  ██║╚██╗██║╚════██║██║   ██║   ██║╚██╗ ██╔╝██╔══╝
███████║███████╗██║ ╚████║███████║██║   ██║   ██║ ╚████╔╝ ███████╗
╚══════╝╚══════╝╚═╝  ╚═══╝╚══════╝╚═╝   ╚═╝   ╚═╝  ╚═══╝  ╚══════╝
                                                            v0.1.6
            by 0xn0ne, https://github.com/0xn0ne/sensitive-helper
""",
    )
    # e.g. python3 sensitive-helper.py -t ~/download/folder -o json
    CFG.parse_args('config_path')  # type: ignore
    print("[*] config:", CFG.gen_detail(depth=2, filters=["rules"]))
    # Pre-encode all rule patterns and filters to bytes: files are scanned as raw bytes.
    rules = CFG.get("rules")
    for rule in rules.values():
        if isinstance(rule, Dict):
            if "re_filters" in rule:
                for index, value in enumerate(rule["re_filters"]):
                    rule["re_filters"][index] = value.encode()
            rule = rule["regexp"]
        for index, value in enumerate(rule):
            rule[index] = value.encode()
    CFG.raw["row_split"] = CFG.raw["row_split"].encode()
    CFG.raw["re_filter_content"] = CFG.raw["re_filter_content"].encode()
    run()
--------------------------------------------------------------------------------
/utils/configurator.py:
--------------------------------------------------------------------------------
#!/bin/python3
# _*_ coding:utf-8 _*_
#
# configurator.py
# Dependencies: pip install toml pyyaml
# toml docs: https://github.com/uiri/toml
# yaml docs: https://pyyaml.org/wiki/PyYAMLDocumentation

import argparse
import json
import pathlib
import re
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union

try:
    import toml
except ImportError:
    toml = None

try:
    import yaml
except ImportError:
    yaml = None

# Global registry that stores every named configurator instance.
_G_CFG = {}


class Mode:
    merge = 10
    update = 20
    replace = 30


class BaseConfigurator:
    def __init__(self, template: Dict[str, Any] = {}):
        """
        Initialize the configurator base class.

        Args:
            template (Dict, optional): default configuration template. Defaults to an empty dict.
        """
        self.raw = template or {}
        # Keep the original template around.
        self.template = template

    def loads(self, content: str, fmt: str = "json", mode: int = Mode.update) -> "BaseConfigurator":
        """
        Load configuration from a string.

        Args:
            content (str): configuration content string
            fmt (str, optional): format, one of 'json', 'yaml', 'toml'. Defaults to 'json'.
            mode (int, optional): how to combine with the existing configuration
                (Mode.merge, Mode.update, or Mode.replace). Defaults to Mode.update.

        Returns:
            BaseConfigurator: self
        """
        if not content:
            return self
        if fmt == "toml" and toml:
            config_dict = toml.loads(content)
        elif fmt == "yaml" and yaml:
            config_dict = yaml.safe_load(content)
        else:
            config_dict = json.loads(content)
        if mode == Mode.merge:
            self.raw = self._merge_dicts(self.raw, config_dict)
        elif mode == Mode.replace:
            self.raw = config_dict
        else:
            self.raw.update(config_dict)
        return self

    def dumps(self, fmt: str = "json") -> str:
        """
        Serialize the configuration to a string.

        Args:
            fmt (str, optional): format, one of 'json', 'yaml', 'toml'. Defaults to 'json'.

        Returns:
            str: serialized configuration string
        """
        if fmt == "toml" and toml:
            return toml.dumps(self.raw)
        elif fmt == "yaml" and yaml:
            return yaml.safe_dump(self.raw)
        else:
            return json.dumps(self.raw)

    def get(self, keys: str, default: Any = None, sep: str = ".") -> Any:
        """
        Read a configuration value.

        Args:
            keys (str): multi-level key string, e.g. "a.b.c"
            default (Any, optional): fallback value; when it is None, a missing key raises
                KeyError instead. Defaults to None.
            sep (str, optional): key separator. Defaults to '.'.

        Returns:
            Any: the configuration value
        """
        keys_list = keys.split(sep)
        value = self.raw
        for key in keys_list:
            if isinstance(value, dict) and key in value:
                value = value[key]
            elif default is None:
                raise KeyError(f'key "{keys}" not found in configuration.')
            else:
                return default
        return value

    def set(self, keys: str, value: Any, sep: str = "."):
        """
        Set a configuration value.

        Args:
            keys (str): multi-level key string, e.g. "a.b.c"
            value (Any): the value to store
            sep (str, optional): key separator. Defaults to '.'.
        """
        keys_list = keys.split(sep)
        d = self.raw
        for key in keys_list[:-1]:
            if key not in d or not isinstance(d[key], dict):
                d[key] = {}
            d = d[key]
        d[keys_list[-1]] = value

    def exists(self, key: str) -> bool:
        """
        Check whether a configuration key exists.

        Args:
            key (str): configuration key

        Returns:
            bool: whether the key exists
        """
        try:
            self.get(key)
        except KeyError:
            return False
        return True

    def clear(self):
        """
        Clear the configuration.
        """
        self.raw = {}

    def gen_detail(self, depth: int = 3, sep: str = "; ", filters: List[str] = []) -> str:
        """
        Generate a short preview of the configuration.

        Args:
            depth (int, optional): maximum traversal depth; a negative value prints everything.
                Defaults to 3.
            sep (str, optional): separator between top-level keys. Defaults to '; '.
            filters (List[str], optional): dotted key paths to omit from the preview. Defaults to [].

        Returns:
            str: short configuration preview string
        """

        def _gen_detail(parent_dict, current_depth: int, parent_key: str = ""):
            if not current_depth:
                return "..."
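            # Recurse into nested dicts, dropping filtered key paths and replacing
            # values below the remaining depth budget with "..." placeholders.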
            if isinstance(parent_dict, dict):
                d_detail = {}
                for k, v in parent_dict.items():
                    next_key = "{}.{}".format(parent_key, k) if parent_key else k
                    if filters and next_key in filters:
                        continue
                    d_detail[k] = _gen_detail(v, current_depth - 1, next_key)

                return d_detail
            return parent_dict

        detail = _gen_detail(self.raw, depth)
        l_detail = []
        for k, v in detail.items():  # type: ignore
            l_detail.append(f"{k}: {v}")

        return sep.join(l_detail)

    def _merge_dicts(self, d1, d2):
        """
        Recursively merge two dicts.

        Args:
            d1 (dict): destination dict, modified in place
            d2 (dict): source dict

        Returns:
            dict: the merged dict
        """
        for k, v in d2.items():
            if k in d1 and isinstance(d1[k], dict) and isinstance(v, dict):
                self._merge_dicts(d1[k], v)
            else:
                d1[k] = v
        return d1


class FileConfigurator(BaseConfigurator):
    def __init__(
        self,
        filepath: str = "configs.json",
        template: Dict = {},
        is_auto_make: bool = False,
    ):
        """
        Initialize the file-backed configurator subclass.

        Args:
            filepath (str, optional): configuration file path. Defaults to 'configs.json'.
            template (Dict, optional): default configuration template. Defaults to an empty dict.
            is_auto_make (bool, optional): create the configuration file when it does not exist.
                Defaults to False.
        """
        super().__init__(template)
        self.filepath = pathlib.Path(filepath)
        if is_auto_make:
            self.filepath.touch()
        self.load(Mode.merge)

    def load(self, mode: int = Mode.update):
        """
        Load the configuration from the file.
        """
        if self.filepath.is_file():
            self.loads(self.filepath.read_text(encoding='utf-8'), self.filepath.suffix[1:], mode)

    def save(self, fmt: str = "json"):
        """
        Save the configuration to the file.

        Args:
            fmt (str, optional): format, one of 'json', 'yaml', 'toml'. Defaults to 'json'.
        """
        if fmt not in ["json", "yaml", "toml"]:
            raise NotImplementedError(f'the file format "{fmt}" is not supported.')
        self.filepath = self.filepath.with_suffix(f".{fmt}")
        self.filepath.write_text(self.dumps(fmt))

    def delete(self, fmts: Union[str, List[str]] = []):
        """
        Delete the configuration file(s).

        Args:
            fmts (Union[str, List[str]], optional): formats to delete, each one of 'json',
                'yaml', 'toml'; an empty list deletes all three. Defaults to [].
        """
        if isinstance(fmts, str):
            fmts = [fmts]
        for ext in fmts:
            if ext not in ["json", "yaml", "toml"]:
                raise NotImplementedError(f'the file format "{ext}" is not supported.')
        if not fmts:
            fmts = ["json", "yaml", "toml"]
        for ext in fmts:
            self.filepath.with_suffix(f".{ext}").unlink(missing_ok=True)


class CliConfigurator(FileConfigurator):
    def __init__(
        self,
        template: Dict[str, Union[Dict[str, str], Any]],
        filepath: str = "configs.json",
        prog: Optional[str] = None,
        usage: Optional[str] = None,
        description: Optional[str] = None,
        epilog: Optional[str] = None,
        **kwargs,
    ):
        """
        Initialize the command-line configurator subclass.

        Args:
            template (Dict): default command-line parameter template; it must be provided and
                cannot be loaded from a configuration file.
            filepath (str, optional): configuration file path. Defaults to 'configs.json'.
        """
        # Build the command-line parser from the template.
        self.exists_short_key: Set[str] = set()
        self.parser = argparse.ArgumentParser(prog, usage, description, epilog, **kwargs)
        self._add_args_from_template(template)

        # Walk the template and strip the default values and descriptions that were only
        # needed to build the command-line arguments.
        del_keys = []
        for key, value in template.items():
            if not isinstance(value, dict):
                continue
            # Iterate over a snapshot so __description can be deleted without
            # mutating the dict mid-iteration.
            for sub_key in list(value):
                if sub_key == "__default":
                    template[key] = value["__default"]
                    continue
                if sub_key == '__description':
                    del value["__description"]
                    continue
                if sub_key.startswith('__'):
                    del_keys.append(key)
                    break
        # Deleting keys while iterating would mutate the dict and raise, so delete after the scan.
        for del_key in del_keys:
            del template[del_key]
        super().__init__(filepath, template)

    def _parse_add_argument_kwargs(
        self,
        value: Dict[str, Any],
        able_keys: List[str] = [
            'default',
            'type',
            'help',
            'required',
            'action',
            'nargs',
            'const',
            'choices',
            'metavar',
            'dest',
            'deprecated',
        ],
    ) -> Dict[str, Any]:
        """
        Collect add_argument() keyword arguments from a template entry.
        able_keys follows the parameters documented at
        https://docs.python.org/3/library/argparse.html#argparse.ArgumentParser.add_argument

        Args:
            value (Dict[str, Any]): template entry whose '__'-prefixed keys carry argparse options.
            able_keys (List[str], optional): argparse parameter names that may be forwarded.

        Returns:
            Dict[str, Any]: keyword arguments ready to pass to parser.add_argument()
        """
        kwargs = {}
        for key, val in value.items():
            if key.startswith('__') and key[2:] in able_keys:
                kwargs[key[2:]] = val
        return kwargs

    def _gen_flags(self, flag: str, is_initial_letter_mode: bool = False) -> List[str]:
        """Generate command-line flags, including a short flag when one is available.

        Args:
            flag (str): the regular long flag string
            is_initial_letter_mode (bool, optional): build the short flag from the initial of
                every word in the flag instead of probing prefixes. Defaults to False.

        Returns:
            List[str]: the generated flag strings, short flag first when one was created
        """
        r_flag = re.findall(r'[\da-zA-Z]+', flag)
        if not r_flag:
            raise ValueError('the flag does not conform to the specification and should be a character in "0-9a-zA-Z".')
        # Build the regular long flag.
        flags = ['--' + '-'.join(r_flag)]
        new_short_flag = '-'
        if is_initial_letter_mode:
            # Raise if the initial-letter short flag already exists.
            new_short_flag += ''.join([i[0] for i in r_flag])
            if new_short_flag in self.exists_short_key:
                raise ValueError(f'this "{new_short_flag}" short flag command already exists.')
            self.exists_short_key.add(new_short_flag)
            flags.append(new_short_flag)
            flags.reverse()
            return flags
        for word in r_flag:
            # Grow the short flag one initial at a time until an unused one is found.
            new_short_flag += word[0]
            if new_short_flag not in self.exists_short_key:
                self.exists_short_key.add(new_short_flag)
                flags.append(new_short_flag)
                break
        flags.reverse()
        return flags

    def _add_args_from_template(self, template: Dict, group: Optional[argparse.ArgumentParser] = None):
        """
        Add command-line arguments according to the template.

        Args:
            template (Dict): configuration template; command-line parsing only supports dict
                structures nested at most two levels deep.
            group (Optional[argparse.ArgumentParser], optional): argument group to add to.
                Defaults to the root parser.
        """
        if not group:
            group = self.parser

        for key, value in template.items():
            if not isinstance(value, dict):
                # Only dict sub-values are turned into command-line arguments.
                continue
            kwargs = self._parse_add_argument_kwargs(value)
            if kwargs:
                if '__flags' in value:
                    flags = value['__flags']
                else:
                    flags = self._gen_flags(key)

                group.add_argument(  # type: ignore
                    *flags,
                    **kwargs,
                )
                continue
            # No add_argument() parameters were found, so register the entry as an argument group.
            # __description is optional; when present it becomes the group description.
            group = self.parser.add_argument_group(key, description=value.get("__description", None))  # type: ignore
            self._add_args_from_template(value, group)

    def parse_args(self, config_file_flag: Optional[str] = None):
        """
        Parse the command-line arguments and merge them into the configuration.

        Args:
            config_file_flag (Optional[str]): name of the argument that carries the configuration
                file path. Defaults to None.
        """
        args = vars(self.parser.parse_args())
        if config_file_flag and args[config_file_flag]:
            self.filepath = pathlib.Path(args[config_file_flag])
            # Reload the file named on the command line so its values can then be
            # overridden by the remaining command-line arguments.
            self.load(Mode.update)
        for key, value in args.items():
            if value is None:
                continue
            self.set(key, value)


def new(
    base_class: Callable = FileConfigurator,
    name: str = "__DEFAULT__",
    *args,
    **kwargs,
) -> Union[BaseConfigurator, FileConfigurator, CliConfigurator]:
    """
    Create (or return) a named configuration object of the given base class.

    Args:
        base_class (Callable, optional): class of the configuration object. Defaults to FileConfigurator.
        name (str, optional): name of the configuration object. Defaults to '__DEFAULT__'.
        *args: variable-length argument list forwarded to the class.
        **kwargs: arbitrary keyword arguments forwarded to the class.

    Returns:
        FileConfigurator: the newly created (or previously cached) configuration object.

    Examples:
        >>> new(FileConfigurator, 'my_config_name', filepath='configs.json')

    """
    if name not in _G_CFG:
        _G_CFG[name] = base_class(*args, **kwargs)
    return _G_CFG[name]


# Example usage
if __name__ == "__main__":

    cfg = new(is_auto_make=True)
    cfg.set("keyint", 16)
    cfg.set("keystr", "hello")
    cfg.set("keydic", {})
    cfg.set("keyarr", ["a", "b"])
    cfg.set("keys.a", {"id": 10, "role": "admin"})
    cfg.set(
        "keys.a.info",
        {"name": "lilei", "age": 20, "female": True, "like": ["ball", "swim"]},
    )
    cfg.set("keys.b", {"user": "lee", "pass": "lei"})
    assert cfg.get("keyint", 996) == 16
    cfg.set("keyint", 32)
    assert cfg.get("keyint", 996) == 32

    assert cfg.get("keys.b.user") == "lee"
    cfg.set("keys.b.user", "li")
    assert cfg.get("keys.b.user") == "li"

    assert cfg.get("keyc.a", 1024) == 1024
    try:
        assert cfg.get("keys.b.c.d") == "this is error test."
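        # get() should have raised KeyError above (the default is None and "keys.b.c.d"
        # does not exist), so reaching this line means the lookup misbehaved.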
        raise SystemError("assert failure.")
    except KeyError:
        pass

    assert cfg.dumps() == (
        '{"keyint": 32, "keystr": "hello", "keydic": {}, "keyarr": ["a", "b"], "keys": {"a": {"id": 10, "role": "admin", "info": {"name": "lilei", "age": 20, "female": true, "like": ["ball", "swim"]}}, "b": {"user": "li", "pass": "lei"}}}'
    )
    assert cfg.gen_detail() == (
        "keyint: 32; keystr: hello; keydic: {}; keyarr: ['a', 'b']; keys: {'a': {'id': '...', 'role': '...', 'info': '...'}, 'b': {'user': '...', 'pass': '...'}}"
    )
    assert cfg.gen_detail(depth=2) == (
        "keyint: 32; keystr: hello; keydic: {}; keyarr: ['a', 'b']; keys: {'a': '...', 'b': '...'}"
    )
    assert cfg.gen_detail(filters=["notkey", "keyarr", "keys.a.info.name", "keys.b"]) == (
        "keyint: 32; keystr: hello; keydic: {}; keys: {'a': {'id': '...', 'role': '...', 'info': '...'}}"
    )
    # cfg.load_from_url('https://httpbin.org/get')
    # assert cfg.get('url') == 'https://httpbin.org/get'

    cfg.clear()
    cfg.loads(
        """za:
  user: lee
zb: 1024
zc: {}
zd:
  - a
  - b""",
        fmt="yaml",
    )

    assert cfg.gen_detail() == "za: {'user': 'lee'}; zb: 1024; zc: {}; zd: ['a', 'b']"

    cfg.save()  # type: ignore
    cfg.save("json")  # type: ignore
    cfg.save("yaml")  # type: ignore
    cfg.save("toml")  # type: ignore

    template = {
        "url": {
            "__type": str,
            "__default": "https://localhost/api/v2/user",
            "__help": "backend address",
        },  # top-level command-line argument
        # 'database.host': {'__type': str, '__default': 'www.eg.com', '__help': 'sample address'},  # duplicate/conflicting command-line key
        "database": {
            "host": {"__type": str, "__default": "localhost", "__help": "database address"},
            "port": {
                "__type": int,
                "__default": 3306,
            },  # second-level argument without a __help parameter
            "__description": "database group configuration parameters",  # group created with a __description parameter
        },
        "logging": {
            "level": {"__type": str, "__default": "INFO", "__help": "log level"}
        },  # group created without a __description parameter
        "rule": {
            "china": {"beijing": "Beijing region rules"},  # second level without __type/__default parameters
        },
    }

    cfg = CliConfigurator(template=template)
    cfg.parse_args()
    print(cfg.raw)
    cfg.save("yaml")
    cfg.parser.print_help()
    print(cfg.gen_detail())
    cfg.delete()
--------------------------------------------------------------------------------