├── requirements.txt
├── utils
│   ├── office.py
│   ├── process.py
│   ├── compress.py
│   └── configurator.py
├── configs.yaml
├── .gitignore
├── README.md
├── cut_text_files.py
├── emergency.yaml
├── README_EN.md
├── LICENSE
└── sensitive-helper.py

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0xn0ne/sensitive-helper/HEAD/requirements.txt
--------------------------------------------------------------------------------
/utils/office.py:
--------------------------------------------------------------------------------
1 | """
2 | office.py
3 | 
4 | Office 文件解析:
5 | - docx_handler: 将 .docx 主文档抽取为纯文本,保存为 *_resolved.txt;
6 | - xlsx_handler: 将 .xlsx 所有工作表转为 dict 并写入 *_resolved.txt;
7 | - pptx_handler: 预留(暂未实现)。
8 | """
9 | import json
10 | import pathlib
11 | import re
12 | from typing import Union
13 | 
14 | import pandas
15 | 
16 | try:
17 |     from utils import compress
18 | except ImportError:
19 |     import pathlib
20 |     import sys
21 | 
22 |     sys.path.append(pathlib.Path(__file__).parent.parent.__str__())
23 |     from utils import compress
24 | 
25 | 
26 | def docx_handler(file_path: Union[pathlib.Path, str]) -> pathlib.Path:
27 |     """解析 .docx 文本内容并输出到同名 *_resolved.txt 文件。"""
28 |     docx_path = compress.uncompress(file_path)
29 |     content = docx_path.joinpath('word/document.xml').read_text(encoding='utf-8')
30 |     content = re.sub(r'[\r\n]', '', content)
31 |     content = re.sub(r'<w:p( [^<>]*?)?>', '\n', content)  # 段落起始标签 <w:p> 替换为换行
32 |     content = re.sub(r'<[^<>]+>', '', content)  # 剥离其余所有 XML 标签
33 |     resolved_path = pathlib.Path(docx_path.__str__() + '_resolved.txt')
34 |     with open(resolved_path, 'w', encoding='utf-8') as _f:
35 |         _f.write(content)
36 |     return resolved_path
37 | 
38 | 
39 | def xlsx_handler(file_path: Union[pathlib.Path, str]):
40 |     """将 .xlsx 所有工作表转换为 dict 并写入同名 *_resolved.txt 文件。"""
41 |     xlsx_file = pandas.read_excel(file_path, sheet_name=None)
42 | 
43 |     with open(pathlib.Path(file_path.__str__() + '_resolved.txt'), 'w', encoding='utf-8') as _f:
44 |         for sheet in xlsx_file:
45 |             _f.write(xlsx_file[sheet].to_dict(orient='index').__str__() + '\n')
46 | 
47 | 
48 | def pptx_handler():
49 |     """预留:PPTX 解析未实现。"""
50 |     pass
51 | 
52 | 
53 | if __name__ == '__main__':
54 |     # docx_handler('cache/utils.docx')
55 |     # xlsx_handler('cache/tttt/email.xlsx')
56 |     pass
57 | 
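一个独立的小示例(示意,样例 XML 为虚构),演示 docx_handler 中"段落标签转换行、再剥离其余标签"的抽取思路:

```python
import re

# 虚构的 word/document.xml 片段
sample = (
    '<w:document><w:p w:rsidR="0"><w:r><w:t>第一段</w:t></w:r></w:p>'
    '<w:p><w:r><w:t>第二段</w:t></w:r></w:p></w:document>'
)

text = re.sub(r'<w:p( [^<>]*?)?>', '\n', sample)  # 段落起始标签 -> 换行
text = re.sub(r'<[^<>]+>', '', text)              # 剥离其余标签
print(text)  # 输出两行文本:第一段 / 第二段
```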
--------------------------------------------------------------------------------
/configs.yaml:
--------------------------------------------------------------------------------
1 | config_path: configs.yaml
2 | exclude_files:
3 |   - \.DS_Store
4 | is_re_all: true
5 | is_silent: false
6 | output_format: csv
7 | process_number: 12
8 | re_filter_content: ""
9 | row_split: '[\0-\x1F\x7F]+'
10 | rules:
11 |   AKSK:
12 |     - '[\s\n\''\"`=:#]LTAI\w{12,20}[\s\n\''\"`=:#]'
13 |     - '[\s\n\''\"`=:#](A3T[A-Z0-9]|ABIA|ACCA|AGPA|AIDA|AIPA|AKIA|ANPA|ANVA|APKA|AROA|ASCA|ASIA)[0-9A-Z]{16}[\s\n\''\"`=:#]'
14 |     - '[\s\n\''\"`=:#]GOOG\w{10,30}[\s\n\''\"`=:#]'
15 |     - '[\s\n\''\"`=:#]AZ[A-Za-z0-9]{34,40}[\s\n\''\"`=:#]'
16 |     - '[\s\n\''\"`=:#]IBM[A-Za-z0-9]{10,40}[\s\n\''\"`=:#]'
17 |     - '[\s\n\''\"`=:#][a-zA-Z0-9]{8}(-[a-zA-Z0-9]{4}){3}-[a-zA-Z0-9]{12}[\s\n\''\"`=:#]'
18 |     - '[\s\n\''\"`=:#]OCID[A-Za-z0-9]{10,40}[\s\n\''\"`=:#]'
19 |     - '[\s\n\''\"`=:#]LTAI[A-Za-z0-9]{12,20}[\s\n\''\"`=:#]'
20 |     - '[\s\n\''\"`=:#]AKID[A-Za-z0-9]{13,20}[\s\n\''\"`=:#]'
21 |     - '[\s\n\''\"`=:#]AK[A-Za-z0-9]{10,40}[\s\n\''\"`=:#]'
22 |     - '[\s\n\''\"`=:#]JDC_[A-Z0-9]{28,32}[\s\n\''\"`=:#]'
23 |     - '[\s\n\''\"`=:#]AKLT[a-zA-Z0-9-_]{0,252}[\s\n\''\"`=:#]'
24 |     - '[\s\n\''\"`=:#]UC[A-Za-z0-9]{10,40}[\s\n\''\"`=:#]'
25 |     - '[\s\n\''\"`=:#]QY[A-Za-z0-9]{10,40}[\s\n\''\"`=:#]'
26 |     - '[\s\n\''\"`=:#]AKLT[a-zA-Z0-9-_]{16,28}[\s\n\''\"`=:#]'
27 |     - '[\s\n\''\"`=:#]LTC[A-Za-z0-9]{10,60}[\s\n\''\"`=:#]'
28 |     - '[\s\n\''\"`=:#]YD[A-Za-z0-9]{10,60}[\s\n\''\"`=:#]'
29 |     - '[\s\n\''\"`=:#]CTC[A-Za-z0-9]{10,60}[\s\n\''\"`=:#]'
30 |     - '[\s\n\''\"`=:#]YYT[A-Za-z0-9]{10,60}[\s\n\''\"`=:#]'
31 |     - '[\s\n\''\"`=:#]YY[A-Za-z0-9]{10,40}[\s\n\''\"`=:#]'
32 |     - '[\s\n\''\"`=:#]CI[A-Za-z0-9]{10,40}[\s\n\''\"`=:#]'
33 |     - '[\s\n\''\"`=:#]gcore[A-Za-z0-9]{10,30}[\s\n\''\"`=:#]'
34 |   BASE64:
35 |     - "[0-9a-zA-Z/+]{8,}={,2}"
36 |   EMAIL:
37 |     - '[a-zA-Z0-9][-+.\w]{1,127}@([a-zA-Z0-9][-a-zA-Z0-9]{0,63}\.){,3}(org|com|cn|net|edu|mail)'
38 |   FILE PATH:
39 |     flags: I|X
40 |     re_filters: []
41 |     regexp:
42 |       - ([a-z]:\\)?([\\/])(users?|windows?|program files(\(x\d{2,3}\))?|s?bin|etc|usr|boot|dev|home|proc|opt|sys|srv|var)(\2[.\w!#\(~\[\{][.\w!#&\(\)+=~\[\]\{\}\s]{2,63}){1,16}
43 |   FUZZY MATCH:
44 |     flags: I
45 |     regexp:
46 |       - (APP|ACCESS|USER|PASS|OSS|ECS|CVM|AWS)[\w]{,8}(NAME|ID|KEY|NUM|ENC|CODE|SEC|WORD)[\w]{,16}["\'`]?\s*[=:(\{\[]\s*["\'`][\x20-\x7F]{,128}?[\'"`]
47 |       - (USR|PWD|COOKIE)[_\-A-Z][\w]{,16}["\'`]?\s*[=:(\{\[]\s*["\'`][\x20-\x7F]{,128}?[\'"`]
48 |       - (SECRET|SIGN|TOKEN)[\w]{,16}["\'`]?\s*[=:(\{\[]\s*["\'`][\x20-\x7F]{,128}?[\'"`]
49 |   JSON WEB TOKEN(JWT):
50 |     - ey[0-9a-zA-Z/+]{4,}={,2}\.[0-9a-zA-Z/+]{6,}={,2}\.[A-Za-z0-9-_]+
51 |   PHONE:
52 |     - (13[0-9]|14[5-9]|15[0-35-9]|166|17[0-8]|18[0-9]|19[89])\d{8}
53 |   URL:
54 |     re_filters:
55 |       - (adobe|amap|android|apache|bing|digicert|eclipse|freecodecamp|github|githubusercontent|gnu|godaddy|google|googlesource|youtube|youtu|jd|npmjs|microsoft|openxmlformats|outlook|mozilla|openssl|oracle|qq|spring|sun|umang|w3|wikipedia|xml)\.(org|com|cn|net|edu|io|be)
56 |       - (ali|baidu|cdn|example|ssh|ssl)[\w-]*\.(org|com|cn|net|edu|io)
57 |     regexp:
58 |       - (ftp|https?):\/\/[%\.\w\-]+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?
59 | target_path: ""
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 | 
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | .idea/
161 | 
162 | # other
163 | *.csv
164 | *.json
165 | .DS_Store
166 | .cache/
167 | cache/
168 | .output/
169 | output/
170 | *.rar
171 | *.zip
172 | *.7z
173 | *.tgz
174 | *.tar
--------------------------------------------------------------------------------
/utils/process.py:
--------------------------------------------------------------------------------
1 | #!/bin/python3
2 | # _*_ coding:utf-8 _*_
3 | #
4 | """
5 | process.py
6 | 
7 | 进程池辅助工具:对 concurrent.futures.ProcessPoolExecutor 做轻量封装。
8 | 
9 | 功能:
10 | - submit_super: 提交任务并记录 future;
11 | - result_yield: 以生成器形式依次获取结果,获取后即从队列移除。
12 | 
13 | 适用场景:
14 | - CPU 密集或网络请求密集任务;
15 | - 可在多机分布式环境扩展。
16 | 
17 | 参考:`https://segmentfault.com/a/1190000007495352`
18 | """
19 | 
20 | import concurrent.futures
21 | import os
22 | import random
23 | import time
24 | from typing import Any, Generator, List, Optional
25 | 
26 | 
27 | class ProcessPoolHelper(concurrent.futures.ProcessPoolExecutor):
28 |     """对 ProcessPoolExecutor 的简易封装,便于批量提交与顺序取回结果。"""
29 |     def __init__(self, max_workers=None, mp_context=None, initializer=None, initargs=()):
30 |         super().__init__(max_workers, mp_context, initializer, initargs)
31 |         self.__job_list: List[concurrent.futures.Future] = []
32 | 
33 |     def submit_super(self, fn, /, *args, **kwargs) -> concurrent.futures.Future:
34 |         """提交任务并加入内部队列,返回 future。"""
35 |         job = self.submit(fn, *args, **kwargs)
36 |         self.__job_list.append(job)
37 |         return job
38 | 
39 |     def result_yield(self, timeout: Optional[float] = None) -> Generator[Any, None, None]:
40 |         """按提交顺序产出任务返回值;取出后从队列移除。
41 | 
42 |         注意:调用方会阻塞直到有任务完成或超时。
43 |         """
44 |         self.__job_list.reverse()
45 |         while self.__job_list:
46 |             yield self.__job_list.pop().result(timeout)
47 | 
48 |         self.__job_list = []
49 | 
50 | 
51 | def __test_performance_func(min: int = 500, max: int = 600):
52 |     # print(os.getpid(), '__test_performance_func running...')
53 |     result = 0
54 |     for i in range(random.randint(min, max)):
55 |         for j in range(random.randint(min, max)):
56 |             for k in range(random.randint(min, max)):
57 |                 result += i + j + k
58 |     print(os.getpid(), '__test_performance_func result:', str(result))
59 |     # print(os.getpid(), '__test_performance_func ending...')
60 | 
61 | 
62 | def __test_return_func(min: int = 500, max: int = 600):
63 |     result = 0
64 |     for i in range(random.randint(min, max)):
65 |         for j in range(random.randint(min, max)):
66 |             for k in range(random.randint(min, max)):
67 |                 result += i + j + k
68 |     print(os.getpid(), 'test_return_func result:', str(result))
69 |     # 返回数据
70 |     return result
71 | 
72 | 
73 | def __test_return_dict_func(min: int = 500, max: int = 600):
74 |     result = {}
75 |     for i in range(random.randint(min, max)):
76 |         for j in range(random.randint(min, max)):
77 |             for k in range(random.randint(min, max)):
78 |                 key = str(i // 500)
79 |                 if key not in result:
80 |                     result[key] = 0
81 |                 result[key] += i + j + k
82 |     print(os.getpid(), '__test_return_dict_func result:', result)
83 |     # 返回数据
84 |     return result
85 | 
86 | 
87 | if __name__ == '__main__':
88 |     print('__test_return_func 1 times')
89 |     start_time = time.time()
90 |     print('return:', __test_return_func(500, 550))
91 |     print('run one times, total time(s):', time.time() - start_time)
92 | 
93 |     thr = ProcessPoolHelper(3)
94 | 
95 |     print('__test_performance_func')
96 |     start_time = time.time()
97 |     [thr.submit_super(__test_performance_func, 500, 550) for i in range(10)]
98 |     print('return:', [i for i in thr.result_yield()])
99 |     print('total time(s):', time.time() - start_time)
100 | 
101 |     print('__test_return_func')
102 |     start_time = time.time()
103 |     [thr.submit_super(__test_return_func, 500, 550) for i in range(10)]
104 |     print('return:', [i for i in thr.result_yield()])
105 |     print('total time(s):', time.time() - start_time)
106 | 
107 |     print('__test_return_dict_func')
108 |     start_time = time.time()
109 |     [thr.submit_super(__test_return_dict_func, 500, 550) for i in range(10)]
110 |     print('return:', [i for i in thr.result_yield()])
111 |     print('total time(s):', time.time() - start_time)
112 | 
113 |     # 处理器: 安装了 1 个处理器。
114 |     # [01]: AMD64 Family 25 Model 33 Stepping 0 AuthenticAMD ~3701 Mhz
115 |     # return: 2861758025421224
116 |     # run one times, total time(s): 10.262209177017212
117 |     # return: [2582051420060162, 2385134693133712, 2734006770193755, 2658863161379877, 2367450617576565, 2535566799548760, 2647666940791099, 2445589945625423, 2405781958502416, 2812995873098620]
118 |     # total time(s): 36.40744924545288
119 |     # return: [None, None, None, None, None, None, None, None, None, None]
120 |     # total time(s): 35.91919994354248
121 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 参考链接:
2 | 
3 | # Sensitive Helper
4 | 
5 | 简体中文 | [English](./README_EN.md)
6 | 
7 | 最近项目要搜索本地敏感数据的工作太多了,网上用了一些工具效果一般,被老板 DISS 了很多次。当然也可能是我不会用,如:SO 文件无法读取、多进程报错、配置看不懂、识别原理云里雾里等。想提 issues 的,但大家都要养家,想想算了,自己改一个。
8 | 
9 | 基于正则表达式的本地文件敏感信息数据挖掘助手。如果要搜索网页上的敏感数据,可以把敏感数据导出到本地再进行搜索。优化了一下多进程的使用,也优化了配置的使用方式。
10 | 
11 | **注意**:
12 | 
13 | + 如果默认规则不满足匹配需求,请自行调整 configs.yaml 文件中的 `rules` 部分内容进行匹配;
14 | + 进度条是按文件数量来统计进度的,如果卡住并不是程序卡死了,可能是文件太大了,比如 1G 的文件;
15 | + 非静默模式是在匹配完成单个文件后才会输出命中数据,如果在扫描大文件时没输出命中信息,请耐心等待;
16 | + 添加的规则越多,扫描的速度越慢,尽可能使用 1 条正则表达式匹配所需要的特征;
17 | 
18 | # 快速开始
19 | 
20 | ### 依赖
21 | 
22 | + python >= 3.6
23 | 
24 | 进入项目目录,使用以下命令安装依赖库
25 | 
26 | ```bash
27 | pip3 install toml PyYAML tqdm pandas rarfile py7zr openpyxl
28 | ```
29 | 
30 | 或者使用 pip 的 `-r` 参数安装 requirements.txt 中的依赖库
31 | 
32 | ```bash
33 | pip3 install -r requirements.txt
34 | ```
35 | 
36 | ### 基础用法
37 | 
38 | 使用 `-t` 参数直接对目标路径进行搜索
39 | 
40 | ```$ python3 sensitive-helper.py -t <你的搜索文件路径>```
41 | 
42 | 当想要排除部分类型的文件时,可以使用 `-e` 参数排除指定的文件。注意这里是使用正则表达式进行文件名匹配的:比如程序可能搜索到文件 /tmp/aaa.so,如果不想搜索 `.so` 文件类型,可以使用正则表达式 `.*so`,程序会将字符串 `aaa.so` 与 `.*so` 进行匹配,即可过滤 `so` 格式的文件
43 | 
44 | ```$ python3 sensitive-helper.py -t <你的搜索文件路径> -e ".*so" ".*gz"```
45 | 
46 | 如果觉得搜索速度太慢,可以使用 `-p` 参数调整搜索的进程数(默认为:12)以提高搜索速度,虽然 Python 的多进程很差劲,但有总比没有好。注意一个进程只会处理一个文件,如果要扫描文本类的大文件,请先使用 `cut_text_files.py` 将文件切割后再搜索,可以提升搜索速度。这个参数的值推荐设置为与 CPU 核心数相同,激进一点可以设为 CPU 核心数 * 2。
47 | **注意**:计算机性能不好时进程数不要超过 20,程序涉及大量的 IO、内存操作,计算机可能会崩溃,比如我的电脑。
48 | 
49 | ```$ python3 sensitive-helper.py -t <你的搜索文件路径> -p 10```
50 | 
51 | 有保存数据需求的话,可以使用 `-o` 参数输出 json 格式的结果文件
52 | 
53 | ```$ python3 sensitive-helper.py -t <你的搜索文件路径> -o results.json```
54 | 
55 | 默认情况下,程序使用正则表达式进行匹配的时候,匹配到 1 条表达式就会退出当前文件的搜索。可以使用 `-a` 参数,强制程序将每条正则表达式都匹配完毕,挖掘更多可能有用的数据
56 | 
57 | ```$ python3 sensitive-helper.py -t <你的搜索文件路径> -a```
58 | 
59 | **注意**:程序内置默认匹配规则,规则优先级为:默认配置 < configs.yaml 配置 < 用户输入配置
60 | 
61 | ### 使用说明
62 | 
63 | ```
64 | $ python3 sensitive-helper.py -h
65 | usage: sensitive-helper.py [-h] [-t TARGET_PATH] [-p PROCESS_NUMBER] [-c CONFIG_PATH] [-o OUTPUT_FORMAT] [-e EXCLUDE_FILES [EXCLUDE_FILES ...]] [-a] [-s] [-r RE_FILTER_CONTENT]
66 | 
67 | ███████╗███████╗███╗ ██╗███████╗██╗████████╗██╗██╗ ██╗███████╗
68 | ██╔════╝██╔════╝████╗ ██║██╔════╝██║╚══██╔══╝██║██║ ██║██╔════╝
69 | ███████╗█████╗ ██╔██╗ ██║███████╗██║ ██║ ██║██║ ██║█████╗
70 | ╚════██║██╔══╝ ██║╚██╗██║╚════██║██║ ██║ ██║╚██╗ ██╔╝██╔══╝
71 | ███████║███████╗██║ ╚████║███████║██║ ██║ ██║ ╚████╔╝ ███████╗
72 | ╚══════╝╚══════╝╚═╝ ╚═══╝╚══════╝╚═╝ ╚═╝ ╚═╝ ╚═══╝ ╚══════╝
73 | v0.1.6
74 | by 0xn0ne, https://github.com/0xn0ne/sensitive-helper
75 | 
76 | options:
77 |   -h, --help            显示帮助信息并退出程序
78 |   -t, --target-path TARGET_PATH
79 |                         搜索敏感信息的文件路径或文件夹路径(例如:~/download/folder)
80 |   -p, --process-number PROCESS_NUMBER
81 |                         程序进程数(默认值:12)
82 |   -c, --config-path CONFIG_PATH
83 |                         yaml 配置文件的路径(默认值:configs.yaml)
84 |   -o, --output-format OUTPUT_FORMAT
85 |                         输出文件格式,可用格式为 json、csv(默认值:csv)
86 |   -e, --exclude-files EXCLUDE_FILES [EXCLUDE_FILES ...]
87 |                         排除的文件,使用正则匹配(例如:\.DS_Store .*bin .*doc)
88 |   -a, --is-re-all       每个文件被单条正则表达式规则命中后就退出匹配循环,或匹配完所有正则表达式规则才退出匹配循环
89 |   -s, --is-silent       静默模式:开启后,命令行不会输出命中的信息,会使用进度条来显示进度
90 |   -r, --re-filter-content RE_FILTER_CONTENT
91 |                         过滤正则,每行字符串匹配过程中命中该正则直接跳过该行
92 | ```
93 | 
94 | ### 应急响应用法与示例
95 | 
96 | 感谢网络安全的朋友们提出的建议,该工具也可用于常见网络攻击特征的快速匹配;复杂型网络攻击不适用,如 POST 请求体内的攻击、0DAY 漏洞攻击、特殊网络路径攻击等,请酌情使用。
97 | 
98 | 当用于网络安全应急响应时,可直接对中间件与应用日志目录进行扫描,以快速提取疑似攻击痕迹。因为应急特征匹配和敏感数据匹配的思路还是有部分区别,这里使用单独的配置文件 `emergency.yaml`,避免匹配混乱。
99 | 
100 | + 支持的常见攻击指纹分组(可在 `emergency.yaml` 的 `rules` 中调整):
101 |   + SQL INJECTION
102 |   + COMMAND EXECUTION
103 |   + PATH TRAVERSAL / LFI-RFI
104 |   + SSRF
105 |   + XSS
106 |   + LOG4J / JNDI
107 |   + WEBSHELL
108 |   + JAVA / PHP DESERIALIZATION
109 |   + NOSQL INJECTION
110 |   + SENSITIVE ACCESS
111 | 
112 | 示例:扫描 Nginx/Apache/Tomcat 应用日志目录,并输出所有可疑的攻击请求
113 | 
114 | ```bash
115 | python3 sensitive-helper.py -t /var/log/nginx -a -s -c emergency.yaml
116 | python3 sensitive-helper.py -t /var/log/httpd -a -s -c emergency.yaml
117 | python3 sensitive-helper.py -t /opt/tomcat/logs -a -s -c emergency.yaml
118 | ```
119 | 
120 | 建议:
121 | 
122 | + 如需展开归档日志,可不排除压缩包(程序会尝试递归解压)。
123 | + 如日志量特别大,建议结合 `-s` 开启进度条,并合理调高 `-p` 进程数。
124 | 
125 | ### 默认模式输出样例
126 | 
127 | ```bash
128 | $ python3 sensitive-helper.py -t "cache/" -a
129 | [*] file loading...
130 | [*] analyzing...
131 | 
132 | [+] group: FUZZY MATCH, match: AppId":"123456", file: cache/heapdump
133 | [+] group: BASE64, match: ZjY2MTQyNDEtYTIyYS00YjNlLTg1NTgtOTQ4NmUwZDFkZjM1, file: cache/heapdump
134 | [+] group: FUZZY MATCH, match: password":"123456", file: cache/heapdump
135 | [+] group: FILE PATH, match: C:\Windows\system32\drivers, file: cache/heapdump-BAK
136 | [+] group: URL, match: http://hello.world/123456.jpg, file: cache/heapdump-BAK
137 | total file number: 5
138 | ```
139 | 
140 | ### 静默模式输出样例
141 | 
142 | ```bash
143 | $ python3 sensitive-helper.py -t "cache/" -a -s
144 | [*] file loading...
145 | [*] analyzing...
146 | 
147 | 53792/53792 [██████████████████████████████████████████] 00:28<00:00,1856.73it/s
148 | total file number: 53792
149 | ```
150 | 
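### 自定义规则示例

以下是一个自定义规则分组的示意(写在 configs.yaml 的 `rules` 下;分组名与正则请按需调整,`flags`、`re_filters` 为可选键,简单场景可以直接写正则列表):

```yaml
rules:
  INTERNAL IP:
    flags: I
    re_filters: []
    regexp:
      - (10|127|172|192)\.\d{1,3}\.\d{1,3}\.\d{1,3}
  MY TOKEN:
    - mytoken_[a-zA-Z0-9]{16,32}
```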
151 | # Q&A
152 | 
153 | + Q:为什么不做网页的敏感数据搜索?
154 | + A:因为网页千变万化,改动一个 API 接口、一个 css 样式或 id 都可能要更新代码,不如把数据导出到本地,统一使用文本识别的方式处理。
155 | 
--------------------------------------------------------------------------------
/cut_text_files.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | 文本文件切割工具
5 | 支持按行数切割文本文件,可自定义输入文件/文件夹和输出目录
6 | """
7 | 
8 | import argparse
9 | import glob
10 | import math
11 | import os
12 | import re
13 | import sys
14 | from pathlib import Path
15 | from typing import List, Optional, Union
16 | 
17 | 
18 | class TextFileSplitter:
19 |     """文本文件切割器"""
20 | 
21 |     def __init__(self, output_lines_number: int = 10000, output_dir: str = "output"):
22 |         """
23 |         初始化切割器
24 | 
25 |         Args:
26 |             output_lines_number: 每个输出文件的行数
27 |             output_dir: 输出目录
28 |         """
29 |         self.output_lines_number = output_lines_number
30 |         self.output_dir = Path(output_dir)
31 |         self.output_dir.mkdir(exist_ok=True)
32 | 
33 |     def split_files(self, input_file: Union[str, Path]):
34 |         """
35 |         切割单个文件或文件夹下的所有文件
36 | 
37 |         Args:
38 |             input_file: 输入文件或文件夹路径
39 | 
40 |         Returns:
41 |             无,分片文件直接写入输出目录
42 |         """
43 |         input_path = Path(input_file)
44 |         if not input_path.exists():
45 |             raise FileNotFoundError(f"文件或文件夹不存在: {input_path}")
46 | 
47 |         if input_path.is_file():
48 |             input_path = [input_path]
49 |         else:
50 |             input_path = input_path.iterdir()
51 | 
52 |         for path in input_path:
53 |             if path.is_dir():
54 |                 continue
55 | 
56 |             print(f"正在处理文件: {path}")
57 | 
58 |             # 创建输出文件名模板
59 |             base_name = path.stem
60 |             extension = path.suffix
61 | 
62 |             content = path.read_bytes()
63 |             content_list = re.split(rb'\n', content)
64 |             content_list_len = len(content_list)
65 |             times = math.ceil(content_list_len / self.output_lines_number)
66 |             for index in range(times):
67 |                 self._write_chunk(
68 |                     content_list[index * self.output_lines_number : (index + 1) * self.output_lines_number],
69 |                     base_name,
70 |                     extension,
71 |                     index,
72 |                 )
73 | 
74 |             print(f"文件 {path} 已切割为 {times} 个文件")
75 | 
76 |     def _write_chunk(self, lines: List[bytes], base_name: str, extension: str, file_id: int) -> str:
77 |         """
78 |         写入文件块
79 | 
80 |         Args:
81 |             lines: 要写入的行列表
82 |             base_name: 基础文件名
83 |             extension: 文件扩展名
84 |             file_id: 文件编号
85 | 
86 |         Returns:
87 |             输出文件路径
88 |         """
89 |         output_filename = f"{base_name}_part_{file_id:06d}{extension}"
90 |         output_path = self.output_dir / output_filename
91 |         output_path.write_bytes(b'\n'.join(lines))
92 | 
93 |         return str(output_path)
94 | 
95 |     def get_file_info(self, file_path: Union[str, Path]) -> dict:
96 |         """
97 |         获取文件信息
98 | 
99 |         Args:
100 |             file_path: 文件路径
101 | 
102 |         Returns:
103 |             文件信息字典
104 |         """
105 |         path = Path(file_path)
106 |         if not path.exists():
107 |             return {"error": "文件不存在"}
108 | 
109 |         try:
110 |             with open(path, "r", encoding="utf-8", errors="ignore") as f:
111 |                 line_count = sum(1 for _ in f)
112 | 
113 |             return {
114 |                 "file_name": path.name,
115 |                 "file_size": path.stat().st_size,
116 |                 "line_count": line_count,
117 |                 "estimated_parts": (line_count + self.output_lines_number - 1) // self.output_lines_number,
118 |             }
119 |         except Exception as e:
120 |             return {"error": str(e)}
121 | 
122 | 
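# 用法示意(假设当前目录存在文本文件 big.log):
#   splitter = TextFileSplitter(output_lines_number=1000, output_dir="parts")
#   splitter.split_files("big.log")  # 生成 parts/big_part_000000.log、parts/big_part_000001.log ...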
123 | def main():
124 |     """主函数"""
125 |     parser = argparse.ArgumentParser(
126 |         description="文本文件切割工具 - 按行数切割文本文件",
127 |         formatter_class=argparse.RawDescriptionHelpFormatter,
128 |         epilog="""
129 | 使用示例:
130 |     python cut_text_files.py -f input_or_folder.txt -l 1000
131 |     python cut_text_files.py -f input.txt --info
132 | """,
133 |     )
134 | 
135 |     # 输入参数组
136 |     input_group = parser.add_mutually_exclusive_group(required=True)
137 |     input_group.add_argument("-f", "--files", type=str, help="要切割的文件或文件夹路径")
138 | 
139 |     # 其他参数
140 |     parser.add_argument("-l", "--lines", type=int, default=10000, help="每个输出文件的行数 (默认: 10000)")
141 |     parser.add_argument("-o", "--output", type=str, default="output", help="输出目录 (默认: output)")
142 |     parser.add_argument(
143 |         "-p",
144 |         "--pattern",
145 |         type=str,
146 |         default="*.txt",
147 |         help="文件匹配模式,预留参数,当前版本未使用 (默认: *.txt)",
148 |     )
149 |     parser.add_argument("--info", action="store_true", help="显示文件信息而不进行切割")
150 | 
151 |     args = parser.parse_args()
152 | 
153 |     # 创建切割器
154 |     splitter = TextFileSplitter(output_lines_number=args.lines, output_dir=args.output)
155 | 
156 |     if args.files:
157 |         # 处理输入的文件或文件夹
158 |         if args.info:
159 |             # 显示文件信息
160 |             info = splitter.get_file_info(args.files)
161 |             if "error" in info:
162 |                 print(f"错误: {info['error']}")
163 |                 return 1
164 | 
165 |             print(f"文件信息:")
166 |             print(f"  文件名: {info['file_name']}")
167 |             print(f"  文件大小: {info['file_size']:,} 字节")
168 |             print(f"  行数: {info['line_count']:,}")
169 |             print(f"  预计切割为: {info['estimated_parts']} 个文件")
170 |             print(f"  每个文件行数: {args.lines}")
171 |         else:
172 |             # 切割文件
173 |             splitter.split_files(args.files)
174 |             print(f"输出目录: {args.output}")
175 | 
176 | 
177 | if __name__ == "__main__":
178 |     sys.exit(main())
--------------------------------------------------------------------------------
/emergency.yaml:
--------------------------------------------------------------------------------
1 | # 应急排查专用配置
2 | config_path: emergency.yaml
3 | exclude_files:
4 |   - \.DS_Store
5 | is_re_all: true
6 | is_silent: false
7 | output_format: csv
8 | process_number: 18
9 | re_filter_content: '\s+(3\d\d|4\d\d|5\d\d)\s+'
10 | row_split: '[\n\r]+'
11 | rules:
12 |   AKSK:
13 |     - '[\s\n\''\"`=:#]LTAI\w{12,20}[\s\n\''\"`=:#]'
14 |     - '[\s\n\''\"`=:#](A3T[A-Z0-9]|ABIA|ACCA|AGPA|AIDA|AIPA|AKIA|ANPA|ANVA|APKA|AROA|ASCA|ASIA)[0-9A-Z]{16}[\s\n\''\"`=:#]'
15 |     - '[\s\n\''\"`=:#]GOOG\w{10,30}[\s\n\''\"`=:#]'
16 |     - '[\s\n\''\"`=:#]AZ[A-Za-z0-9]{34,40}[\s\n\''\"`=:#]'
17 |     - '[\s\n\''\"`=:#]IBM[A-Za-z0-9]{10,40}[\s\n\''\"`=:#]'
18 |     - '[\s\n\''\"`=:#][a-zA-Z0-9]{8}(-[a-zA-Z0-9]{4}){3}-[a-zA-Z0-9]{12}[\s\n\''\"`=:#]'
19 |     - '[\s\n\''\"`=:#]OCID[A-Za-z0-9]{10,40}[\s\n\''\"`=:#]'
20 |     - '[\s\n\''\"`=:#]LTAI[A-Za-z0-9]{12,20}[\s\n\''\"`=:#]'
21 |     - '[\s\n\''\"`=:#]AKID[A-Za-z0-9]{13,20}[\s\n\''\"`=:#]'
22 |     - '[\s\n\''\"`=:#]AK[A-Za-z0-9]{10,40}[\s\n\''\"`=:#]'
23 |     - '[\s\n\''\"`=:#]JDC_[A-Z0-9]{28,32}[\s\n\''\"`=:#]'
24 |     - '[\s\n\''\"`=:#]AKLT[a-zA-Z0-9-_]{0,252}[\s\n\''\"`=:#]'
25 |     - '[\s\n\''\"`=:#]UC[A-Za-z0-9]{10,40}[\s\n\''\"`=:#]'
26 |     - '[\s\n\''\"`=:#]QY[A-Za-z0-9]{10,40}[\s\n\''\"`=:#]'
27 |     - '[\s\n\''\"`=:#]AKLT[a-zA-Z0-9-_]{16,28}[\s\n\''\"`=:#]'
28 |     - '[\s\n\''\"`=:#]LTC[A-Za-z0-9]{10,60}[\s\n\''\"`=:#]'
29 |     - '[\s\n\''\"`=:#]YD[A-Za-z0-9]{10,60}[\s\n\''\"`=:#]'
30 |     - '[\s\n\''\"`=:#]CTC[A-Za-z0-9]{10,60}[\s\n\''\"`=:#]'
31 |     - '[\s\n\''\"`=:#]YYT[A-Za-z0-9]{10,60}[\s\n\''\"`=:#]'
32 |     - '[\s\n\''\"`=:#]YY[A-Za-z0-9]{10,40}[\s\n\''\"`=:#]'
33 |     - '[\s\n\''\"`=:#]CI[A-Za-z0-9]{10,40}[\s\n\''\"`=:#]'
34 |     - '[\s\n\''\"`=:#]gcore[A-Za-z0-9]{10,30}[\s\n\''\"`=:#]'
35 |   JSON WEB TOKEN(JWT):
36 |     - ey[0-9a-zA-Z/+]{4,}={,2}\.[0-9a-zA-Z/+]{6,}={,2}\.[A-Za-z0-9-_]+
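# 说明:以下分组面向访问日志中的攻击特征;多数规则以 %XX(URL 编码字节)作为
# 边界锚点,用于匹配日志中经 URL 编码的请求参数。新增分组示意(键结构与现有分组一致):
# MY RULE GROUP:
#   flags: I
#   regexp:
#     - .+/my_app/debug\.php\?cmd=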
37 |   SQL INJECTION:
38 |     flags: I
39 |     regexp:
40 |       - '.+[?/\;&]%[\da-f]{2}\S*(substring|concat|union|select|sleep|char\(|delay|information_schema|updatexml|extractvalue|load_file|outfile|into|xp_cmdshell|waitfor)\S+%[\da-f]{2}'
41 |       - '.+[?/\;&]%[\da-f]{2}\S*(or|and)\S{,30}=\S{,6}%[\da-f]{2}'
42 |   COMMAND EXECUTION:
43 |     flags: I
44 |     regexp:
45 |       - '.+[?/\\;&]%[\da-f]{2}\S*(bash|zsh|cmd|powershell|pwsh|python|perl|wget|curl|netcat|telnet|whoami|uname|net.{1,6}user|ipconfig|ifconfig|shutdown)\S*%[\da-f]{2}'
46 |   PATH TRAVERSAL / LFI-RFI:
47 |     flags: I
48 |     regexp:
49 |       - .+/\S+?(\.\./|\.\.\\)+/\S+
50 |       - .+/\S+?(etc/passwd|proc/self|windows/system32)
51 |       - .+(php|file|zip|data|expect)://
52 |       - .+(include|require)(_once)?.+
53 |   SSRF:
54 |     flags: I
55 |     regexp:
56 |       - \S{3,}(http|https|gopher|dict|file)://([1-2]?\d{1,2}\.[1-2]?\d{1,2}\.[1-2]?\d{,2}\.[1-2]?\d{,2}|localhost|::1)([:/])
57 |       - \S{3,}dnslog\.\S+
58 |   XSS:
59 |     flags: I
60 |     regexp:
61 |       - \S+<\S*(script|iframe|svg)\S*>\S+
62 |       - on(error|load|mouseover|focus|click)\s*=\S+
63 |       - \S+(javascript:.+|document\.cookie|localStorage|sessionStorage)\S+
64 |   LOG4J / JNDI:
65 |     flags: I
66 |     regexp:
67 |       - \S+\$\{\S{,6}(jndi\s*:\s*|ldaps?|rmi|dns)\S+
68 |   WEBSHELL:
69 |     flags: I
70 |     regexp:
71 |       - .+(assert|eval|system|shell_exec|passthru|base64_decode).+$_(POST|REQUEST)
72 |       - .+preg_replace\s*\(.*?/e
73 |   JAVA / PHP DESERIALIZATION:
74 |     flags: I
75 |     regexp:
76 |       - .+java\.io\.Object(Input|Output)Stream
77 |       - .+(org\.apache\.commons\.collections\.|InvalidClassException|StreamCorruptedException)
78 |   NOSQL INJECTION:
79 |     flags: I
80 |     regexp:
81 |       - .+(\$ne|\$gt|\$gte|\$lt|\$lte|\$where)\s*[:=]
82 |       - .+db\.[a-zA-Z0-9_]+\.find\(.*\$where
83 |   SENSITIVE ACCESS:
84 |     flags: I
85 |     regexp:
86 |       - '.+/\S+(/\.git|/\.svn|/swagger)/\S+'
87 |       - '.+/\S+\.(zip|rar|tar|gz)\S*'
88 |   # Common framework
89 |   STRUTS2 / OGNL:
90 |     flags: I
91 |     regexp:
92 |       - '.+%\{[#$].+?\}'
93 |       - .+(redirect|action|method)\s*:.+
94 |   IIS:
95 |     flags: I
96 |     regexp:
97 |       - '.+\.asp.{,3};.{,3}\.[a-z]{2,3}'
98 |   COMPONENT PATHS:
99 |     flags: I
100 |     regexp:
101 |       # WebLogic 常见
102 |       - .+(/wls-wsat/\w+|/uddiexplorer/\w+\.jsp|/ws_utc/\w+\.do|/console/jsp/common/\w+\.jsp|/bea_wls_deployment_internal|/wls-wsat/CoordinatorPortType)
103 |       - ".+/console/(images|css|help|portal)/"
104 |       # Spring Boot / Actuator
105 |       - .+(/actuator|/heapdump)
106 |       # ActiveMQ
107 |       # GitLab
108 |       - .+(/users/password|/-/graphql|/import/gitlab_project)
109 |       # Microsoft SharePoint
110 |       - ".+/_layouts/15/ToolPane.aspx"
111 |       # Jenkins / JBoss / Struts / Nexus / Solr 等
112 |       - .+(/jmx-console|/invoker/JMXInvokerServlet|/struts2-showcase|/service/extdirect|/service/rest|/solr/admin|/solr/select)
113 |       # ===== 国产常见系统(指纹示例,需按环境取舍) =====
114 |       # JeecgBoot
115 |       - .+(/sys/common/upload|/getDictItemsByTable|/onlDragDatasetHead/getTotalData|/novat-boot/sys/user/passwordChange)
116 |       # 若依 RuoYi
117 |       - .+(/ruoyi-admin|/common/upload|/system/user/profile/resetPwd|/sendMessageWithAttachment)
118 |       # 帆软 FineReport
119 |       - .+(/webroot/decision/|/decision/login)
120 |       - ".+/WebReport/ReportServer.*cmd=design_install_reufile"
121 |       # 泛微 E-cology / Weaver
122 |       - .+(/interface/outter/outter_encryptclassOperation.jsp|/api/workflow/reqform/remarkOperate|/plugins/jqueryFileTree/connectors/jqueryFileTree.jsp|/weaver/weaver.email.FileDownloadLocation/login|/mobile/browser/WorkflowCenterTreeData.jsp|/weaver/ln.FileDownload|/weaver/ln.FileUpload|/weaver/weaver.file.FileDownload|/api/integration/datasource/update)
123 |       - ".+/mobilemode/mobile/server.jsp.+invoker=com.api.mobilemode.web.mobile.service.MobileEntranceAction"
124 |       # 致远 Seeyon
125 |       - .+(/seeyon/management/index.jsp|/seeyon/ajax.do|/seeyon/htmlofficeservlet|/seeyon/fileUpload.do)
126 |       # 蓝凌 Landray
127 |       - .+(/ekp/data/sys-common/dataxml.tmpl|/app/login.jsp|/data/sys-common|/sys/ui/extend/varkind/custom.jsp)
128 |       # 用友 Yonyou / NC / BIP / EF
129 |       - .+(/nc.itf.bap.service.IBapIOService|/ebvp/register/qrySubPurchaseOrgByParentPk|/portal/pt/servlet/getFormltem/doPost|/service/FileManageServlet|/bi/api/SemanticModel/GetOlapConnectionList|/uap.pubitf.ae.meta.IMetaWebService4BqCloud|/portal/pt/oacoSchedulerEvents/changeEvent|/ServiceDispatcherServlet)
130 |       - .+/com.ufida.web.action.ActionServlet.+repID=.+
131 |       - .+/Portal/Print/DynamaticExport.aspx.+filePath=.+
132 |       - .+/worksheet/workslist.jsp.+id=.+
133 |       # 金蝶 Kingdee / K3Cloud
134 |       - (/Kingdee.BOS.WebApi.ServicesStub|/CommonFileServer/Upload)
135 |       # 通达 Tongda OA
136 |       - .+(/general/login_code.php|/ispirit/login_code.php|/get_contactlist.php)
137 |       # 群晖 Synology
138 |       - .+(/webapi/auth.cgi|/webman/index.cgi)
139 |       # 深信服 Sangfor(VPN/NGAF 端口差异较大,按需取舍)
140 |       - .+(/fort/portal_login|/tool/log/c.php|/netConfig/set_port)
141 |       # 联软
142 |       - .+(/emm-core/oauth/token)
143 |       # 明源
144 |       - "/PubPlatform/nav/login/sso/login.aspx"
145 |       # 汉王E脸通
146 |       - .+(/manage/mobiVist.+/systemBlackList/uploadBlackListFile.do|/doorInfo/queryDoorInfoList.do.+order=|/manage/antisubmarine/queryAntisubmarineList.do.+order=|/manage/resourceUpload/imgDownload.do.+filePath=|/manage/intercom/.+/firstPeopleOpen/getDoors.do.+order=|/manage/authMultiplePeople/getGroupEmployee.do.+order=|/manage/visitorMapConfig/updateVisitorMapConfig.do|/manage/intercom/.+/resourceUpload/upload.do|/manage/leaveList/monadFileUpload.do)
147 |       # 扁鹊医疗
148 |       - .+(/WebServiceForFirstaidApp.asmx/GetMonitorList|/WebServiceForFirstaidApp.asmx/GetLyfsByParams)
149 |       # 时空智友ERP
150 |       - '.+/formservice.*service=updater.uploadStudioFile'
151 |       - '.+/formservice.*(service=attachment.write|filename=)'
152 |       # 联想/绿盟/金和/明源/大华/浪潮云/华测/Unibox/Richmail/PWS/Letta 待收集
153 | target_path: ""
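A quick sketch of how the `re_filter_content` value above is intended to behave (illustrative log lines; the real filtering loop lives in sensitive-helper.py): lines whose HTTP status is 3xx/4xx/5xx are skipped, so only requests the server answered successfully reach the rule matching.

```python
import re

# emergency.yaml: re_filter_content: '\s+(3\d\d|4\d\d|5\d\d)\s+'
re_filter = re.compile(r'\s+(3\d\d|4\d\d|5\d\d)\s+')
log_lines = [
    '1.2.3.4 - - "GET /index.php?id=1%20union%20select%201 HTTP/1.1" 200 512',
    '1.2.3.4 - - "GET /index.php?id=1%20union%20select%201 HTTP/1.1" 403 128',
]
for line in log_lines:
    if re_filter.search(line):
        continue  # filter hit: the whole line is skipped
    print('would scan:', line)  # only the "200" line is scanned further
```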
--------------------------------------------------------------------------------
/README_EN.md:
--------------------------------------------------------------------------------
1 | Reference:
2 | 
3 | # Sensitive Helper
4 | 
5 | [简体中文](./README.md) | English
6 | 
7 | A regular-expression-based assistant for mining sensitive information from local files. If you want to search for sensitive data on a web page, export the data locally first and then search it there. The use of multiprocessing and the configuration format have both been streamlined.
8 | 
9 | **Note**: If the default rules do not meet your matching needs, please adjust the `rules` section of the configs.yaml file.
10 | 
11 | # Quick start
12 | 
13 | ### Required
14 | 
15 | + python >= 3.6
16 | 
17 | In the project directory, use the following command to install the dependencies
18 | 
19 | ```bash
20 | pip3 install toml PyYAML tqdm pandas rarfile py7zr openpyxl
21 | ```
22 | 
23 | Or install them via pip's `-r` requirements option
24 | 
25 | ```bash
26 | pip3 install -r requirements.txt
27 | ```
28 | 
29 | ### Basic usage
30 | 
31 | Use the `-t` parameter to search directly on the target path.
32 | 
33 | ```python3 sensitive-helper.py -t <your search file path>```
34 | 
35 | To exclude certain types of files, use the `-e` parameter. Note that regular expressions are matched against file names: for example, if the program finds the file /tmp/aaa.so and you do not want to scan the `.so` file type, you can pass the regular expression `.*so`; the program matches the string `aaa.so` against `.*so` and filters such files out.
36 | 
37 | ```python3 sensitive-helper.py -t <your search file path> -e ".*so" ".*gz"```
38 | 
39 | If search speed seems too slow, you can use the `-p` parameter to adjust the number of search processes (default: 12) and improve performance. While Python's multiprocessing is subpar, it's better than nothing. Note that each process handles only one file at a time. When scanning large text files, split them first with `cut_text_files.py` to speed things up. A good value for this parameter is the number of CPU cores; for a more aggressive setting, use CPU cores * 2.
40 | 
41 | **Note**: On a low-end machine, do not go above 20 processes; the program performs heavy IO and memory operations, and the machine may crash (mine did)...
42 | 
43 | ```python3 sensitive-helper.py -t <your search file path> -p 20```
44 | 
45 | If you want to save the results, use the `-o` parameter to write them out in json format.
46 | 
47 | ```python3 sensitive-helper.py -t <your search file path> -o results.json```
48 | 
49 | By default, the program stops searching the current file as soon as one regular expression matches. You can use the `-a` parameter to force the program to run every regular expression, uncovering more potentially useful data.
50 | 
51 | ```python3 sensitive-helper.py -t <your search file path> -a```
52 | 
53 | **Note**: The program has built-in default matching rules, which are prioritized as follows: default configuration < configs.yaml configuration < user input configuration
54 | 
55 | ### Usage
56 | 
57 | ```bash
58 | % python3 sensitive-helper.py -h
59 | usage: sensitive-helper.py [-h] [-t TARGET_PATH] [-p PROCESS_NUMBER] [-c CONFIG_PATH] [-o OUTPUT_FORMAT] [-e EXCLUDE_FILES [EXCLUDE_FILES ...]] [-a] [-s] [-r RE_FILTER_CONTENT]
60 | 
61 | ███████╗███████╗███╗ ██╗███████╗██╗████████╗██╗██╗ ██╗███████╗
62 | ██╔════╝██╔════╝████╗ ██║██╔════╝██║╚══██╔══╝██║██║ ██║██╔════╝
63 | ███████╗█████╗ ██╔██╗ ██║███████╗██║ ██║ ██║██║ ██║█████╗
64 | ╚════██║██╔══╝ ██║╚██╗██║╚════██║██║ ██║ ██║╚██╗ ██╔╝██╔══╝
65 | ███████║███████╗██║ ╚████║███████║██║ ██║ ██║ ╚████╔╝ ███████╗
66 | ╚══════╝╚══════╝╚═╝ ╚═══╝╚══════╝╚═╝ ╚═╝ ╚═╝ ╚═══╝ ╚══════╝
67 | v0.1.6
68 | by 0xn0ne, https://github.com/0xn0ne/sensitive-helper
69 | 
70 | options:
71 |   -h, --help            show this help message and exit
72 |   -t, --target-path TARGET_PATH
73 |                         search for file paths or folder paths for sensitive cache (eg. ~/download/folder).
74 |   -p, --process-number PROCESS_NUMBER
75 |                         number of program processes (default: 12).
76 |   -c, --config-path CONFIG_PATH
77 |                         path to the yaml configuration file (default: configs.yaml).
78 |   -o, --output-format OUTPUT_FORMAT
79 |                         output file format, available formats json, csv (default: csv).
80 |   -e, --exclude-files EXCLUDE_FILES [EXCLUDE_FILES ...]
81 |                         excluded files, using regular matching (eg. \.DS_Store .*bin .*doc).
82 |   -a, --is-re-all       hit a single regular expression per file or match all regular expressions to exit the match loop.
83 |   -s, --is-silent       silent mode: when turned on, no hit data will be output on the console. use a progress bar instead.
84 |   -r, --re-filter-content RE_FILTER_CONTENT
85 |                         filter regular expression. if a regular expression is hit during the string matching process of each line, skip the matching of that line directly
86 | ```
87 | 
88 | ### Cybersecurity Emergency Examples
89 | 
90 | Thank you to our cybersecurity colleagues for their suggestions. This tool can also be used for rapid matching of common cyberattack signatures. It is not suited to complex attacks, such as those inside POST request bodies, 0-day exploits, or attacks on unusual network paths, so use it judiciously.
91 | 
92 | When used for cybersecurity emergency response, scan middleware and application log directories directly to quickly extract suspected attack traces. Since attack-signature matching differs from sensitive-data matching in several respects, a separate configuration file, `emergency.yaml`, is used to keep the rule sets apart.
93 | 
94 | + Supported common attack fingerprint groups (adjustable in the `rules` section of `emergency.yaml`):
95 |   + SQL INJECTION
96 |   + COMMAND EXECUTION
97 |   + PATH TRAVERSAL / LFI-RFI
98 |   + SSRF
99 |   + XSS
100 |   + LOG4J / JNDI
101 |   + WEBSHELL
102 |   + JAVA / PHP DESERIALIZATION
103 |   + NOSQL INJECTION
104 |   + SENSITIVE ACCESS
105 | 
106 | Example: Scan the Nginx/Apache/Tomcat application log directories and output all suspicious attack requests.
107 | 
108 | ```bash
109 | python3 sensitive-helper.py -t /var/log/nginx -a -s -c emergency.yaml
110 | python3 sensitive-helper.py -t /var/log/httpd -a -s -c emergency.yaml
111 | python3 sensitive-helper.py -t /opt/tomcat/logs -a -s -c emergency.yaml
112 | ```
113 | 
114 | Recommendations:
115 | + If you need archived logs expanded, do not exclude compressed archives (the program will attempt to decompress them recursively).
116 | + For exceptionally large log volumes, we recommend enabling the progress bar with `-s` and appropriately increasing the number of processes with `-p`.
117 | 
118 | ### Sample: Default Mode
119 | 
120 | ```bash
121 | $ python3 sensitive-helper.py -t "cache/" -a
122 | [*] file loading...
123 | [*] analyzing...
124 | 
125 | [+] group: FUZZY MATCH, match: AppId":"123456", file: cache/heapdump
126 | [+] group: BASE64, match: ZjY2MTQyNDEtYTIyYS00YjNlLTg1NTgtOTQ4NmUwZDFkZjM1, file: cache/heapdump
127 | [+] group: FUZZY MATCH, match: password":"123456", file: cache/heapdump
128 | [+] group: FILE PATH, match: C:\Windows\system32\drivers, file: cache/heapdump-BAK
129 | [+] group: URL, match: http://hello.world/123456.jpg, file: cache/heapdump-BAK
130 | total file number: 5
131 | ```
132 | 
133 | ### Sample: Silent Mode
134 | 
135 | ```bash
136 | $ python3 sensitive-helper.py -t "cache/" -a -s
137 | [*] file loading...
138 | [*] analyzing...
139 | 
140 | 53792/53792 [██████████████████████████████████████████] 00:28<00:00,1856.73it/s
141 | total file number: 53792
142 | ```
143 | 
144 | # Q&A
145 | 
146 | + Q: Why is there no sensitive-data search for web pages?
147 | + A: Because web pages change constantly; a modified API endpoint, css class, or id could all require code updates. It is better to export the data locally and process it uniformly with text-based recognition.
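For reference, the optional `flags` key on a rule group (e.g. `I|X` on `FILE PATH` in configs.yaml) is turned into `re` module flags by `string_to_reg_flags` in sensitive-helper.py; a self-contained sketch of that same logic:

```python
import re

def string_to_reg_flags(flags: str) -> int:
    # 'I|X' -> re.I | re.X
    flags_int = 0
    for flag in flags.split('|'):
        flags_int |= getattr(re, flag)
    return flags_int

pattern = re.compile(r'users? | windows?', string_to_reg_flags('I|X'))
print(bool(pattern.search(r'C:\Users')))  # True: I ignores case, X ignores pattern whitespace
```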
148 | -------------------------------------------------------------------------------- /utils/compress.py: -------------------------------------------------------------------------------- 1 | #!/bin/python3 2 | # _*_ coding:utf-8 _*_ 3 | # 4 | """ 5 | compress.py 6 | 7 | 压缩文件处理工具:识别并解压常见压缩/打包格式(zip/tar/gz/7z/rar)。 8 | 9 | 功能要点: 10 | - zip_info: 读取 zip 本地文件头判断是否 zip 以及压缩方式; 11 | - uncompress_*: 各格式的解压实现,输出到指定目录; 12 | - uncompress: 统一入口,自动识别并(可选)递归解压嵌套文件。 13 | 14 | 依赖: 15 | - 7z: 需要 `py7zr` 16 | - rar: 需要安装系统工具(Windows: WinRAR 并在 PATH 中;Linux: unrar) 17 | 18 | 参考:`https://segmentfault.com/a/1190000007495352` 19 | """ 20 | 21 | import gzip 22 | import pathlib 23 | import tarfile 24 | import zipfile 25 | from typing import Any, Dict, Union 26 | 27 | import py7zr 28 | import rarfile 29 | 30 | 31 | def get_zip_info(file_path: pathlib.Path) -> Dict[str, Any]: 32 | """读取 zip 文件头信息,返回是否为 zip 及压缩方式。""" 33 | ret = {'is_zip': False, 'compression': -1} 34 | with open(file_path, 'rb') as _f: 35 | byte_info = _f.read(30) 36 | ret['is_zip'] = byte_info[:4] == b'PK\x03\x04' 37 | ret['compression'] = int.from_bytes(byte_info[8:10], 'little') 38 | return ret 39 | 40 | 41 | def uncompress_zip( 42 | file_path: Union[pathlib.Path, str], extract_dir: Union[pathlib.Path, str] = '', compression: int = 0 43 | ) -> Union[pathlib.Path, Any]: 44 | """解压 zip 文件到 `extract_dir`。自动使用本地头中的压缩方式。""" 45 | if isinstance(file_path, str): 46 | file_path = pathlib.Path(file_path) 47 | if not extract_dir: 48 | extract_dir = file_path.parent.joinpath('un_' + file_path.name) 49 | if isinstance(extract_dir, str): 50 | extract_dir = pathlib.Path(extract_dir) 51 | 52 | # extract_dir = extract_dir.joinpath(file_path.name) 53 | extract_dir.mkdir(parents=True, exist_ok=True) 54 | 55 | with zipfile.ZipFile(file_path, 'r', compression=compression) as _f: 56 | for extr_name in _f.namelist(): 57 | _f.extract(extr_name, extract_dir.__str__()) 58 | extract_dir.joinpath(extr_name).rename(extract_dir.joinpath(extr_name.encode('cp437').decode('gbk'))) 59 | return extract_dir 60 | 61 | 62 | def is_tar(file_path: pathlib.Path): 63 | """通过魔数判断是否为 tar 文件。""" 64 | with open(file_path, 'rb') as _f: 65 | if _f.read(262)[-5:] == b'ustar': 66 | return True 67 | return False 68 | 69 | 70 | def uncompress_tar( 71 | file_path: Union[pathlib.Path, str], extract_dir: Union[pathlib.Path, str] = '' 72 | ) -> Union[pathlib.Path, Any]: 73 | """解包 tar/tar.* 文件到 `extract_dir`。""" 74 | if isinstance(file_path, str): 75 | file_path = pathlib.Path(file_path) 76 | if not extract_dir: 77 | extract_dir = file_path.parent.joinpath('un_' + file_path.name) 78 | if isinstance(extract_dir, str): 79 | extract_dir = pathlib.Path(extract_dir) 80 | 81 | # extract_dir = extract_dir.joinpath(file_path.name) 82 | extract_dir.mkdir(parents=True, exist_ok=True) 83 | 84 | # tarfile.ReadError: file could not be opened successfully 85 | with tarfile.open(file_path) as _f: 86 | for extr_name in _f.getnames(): 87 | _f.extract(extr_name, extract_dir) 88 | return extract_dir 89 | 90 | 91 | def is_gz(file_path: pathlib.Path): 92 | """通过魔数判断是否为 gzip 文件。""" 93 | with open(file_path, 'rb') as _f: 94 | if _f.read(2) == b'\x1f\x8b': 95 | return True 96 | return False 97 | 98 | 99 | def uncompress_gz( 100 | file_path: Union[pathlib.Path, str], extract_dir: Union[pathlib.Path, str] = '' 101 | ) -> Union[pathlib.Path, Any]: 102 | """解压 gzip 文件;若内部为 tar 则继续调用 tar 解包。""" 103 | if isinstance(file_path, str): 104 | file_path = pathlib.Path(file_path) 105 | if not extract_dir: 106 | extract_dir = 
file_path.parent.joinpath('un_' + file_path.name)
107 |     if isinstance(extract_dir, str):
108 |         extract_dir = pathlib.Path(extract_dir)
109 | 
110 |     # extract_dir = extract_dir.joinpath(file_path.name)
111 |     extract_dir.mkdir(parents=True, exist_ok=True)
112 |     extract_file = extract_dir.joinpath(file_path.name)
113 | 
114 |     with gzip.open(file_path, 'rb') as gz_f:
115 |         with open(extract_file, 'wb+') as _f:
116 |             _f.write(gz_f.read())
117 |     if is_tar(extract_file):
118 |         return uncompress_tar(extract_file, extract_dir)
119 |     return extract_dir
120 | 
121 | 
122 | def is_7z(file_path: pathlib.Path):
123 |     """通过魔数判断是否为 7z 文件。"""
124 |     with open(file_path, 'rb') as _f:
125 |         if _f.read(6) == b'7z\xbc\xaf\x27\x1c':
126 |             return True
127 |     return False
128 | 
129 | 
130 | def uncompress_7z(
131 |     file_path: Union[pathlib.Path, str], extract_dir: Union[pathlib.Path, str] = ''
132 | ) -> Union[pathlib.Path, Any]:
133 |     """解压 7z 文件到 `extract_dir`。"""
134 |     if isinstance(file_path, str):
135 |         file_path = pathlib.Path(file_path)
136 |     if not extract_dir:
137 |         extract_dir = file_path.parent.joinpath('un_' + file_path.name)
138 |     if isinstance(extract_dir, str):
139 |         extract_dir = pathlib.Path(extract_dir)
140 | 
141 |     extract_dir.mkdir(parents=True, exist_ok=True)
142 | 
143 |     with py7zr.SevenZipFile(file_path, mode='r') as _f:
144 |         _f.extractall(extract_dir)
145 |     return extract_dir
146 | 
147 | 
148 | def is_rar(file_path: pathlib.Path):
149 |     """通过魔数判断是否为 RAR 文件。"""
150 |     with open(file_path, 'rb') as _f:
151 |         if _f.read(4) == b'\x52\x61\x72\x21':
152 |             return True
153 |     return False
154 | 
155 | 
156 | def uncompress_rar(
157 |     file_path: Union[pathlib.Path, str], extract_dir: Union[pathlib.Path, str] = ''
158 | ) -> Union[pathlib.Path, Any]:
159 |     """
160 |     解压 rar 文件在 windows 上需要安装 winrar,并配置好环境变量;linux 上需要安装 unrar,并配置好环境变量
161 |     否则会报出 rarfile.RarCannotExec: Cannot find working tool 错误
162 |     """
163 |     if isinstance(file_path, str):
164 |         file_path = pathlib.Path(file_path)
165 |     if not extract_dir:
166 |         extract_dir = file_path.parent.joinpath('un_' + file_path.name)
167 |     if isinstance(extract_dir, str):
168 |         extract_dir = pathlib.Path(extract_dir)
169 | 
170 |     extract_dir.mkdir(parents=True, exist_ok=True)
171 | 
172 |     with rarfile.RarFile(file_path) as _f:
173 |         # _f.extractall(extract_dir)
174 |         for extr_name in _f.namelist():
175 |             _f.extract(extr_name, extract_dir)
176 |     return extract_dir
177 | 
178 | 
179 | def is_bz(file_path: pathlib.Path):
180 |     """通过魔数判断是否为 bzip2 文件。"""
181 |     with open(file_path, 'rb') as _f:
182 |         if _f.read(3) == b'\x42\x5a\x68':  # bzip2 魔数 b'BZh' 共 3 字节
183 |             return True
184 |     return False
185 | 
186 | 
187 | def uncompress(
188 |     file_path: Union[pathlib.Path, str],
189 |     extract_dir: Union[pathlib.Path, str] = '',
190 |     is_error: bool = True,
191 |     is_recursive: bool = False,
192 |     max_level=64,
193 | ) -> Union[pathlib.Path, Any]:
194 |     """统一解压入口,自动识别并可递归解压。
195 | 
196 |     支持格式:gz/tar/7z/zip/rar。
197 |     当 `is_recursive=True` 时,将在 `max_level` 限制内递归处理嵌套压缩。
198 |     """
199 |     if not isinstance(file_path, pathlib.Path):
200 |         file_path = pathlib.Path(file_path)
201 |     if not extract_dir:
202 |         extract_dir = file_path.parent.joinpath('un_' + file_path.name)
203 |     if not isinstance(extract_dir, pathlib.Path):
204 |         extract_dir = pathlib.Path(extract_dir)
205 | 
206 |     if not file_path.is_file():
207 |         if is_error:
208 |             raise ValueError('{} is not a file.'.format(file_path))
209 |         return
210 | 
211 |     ret = None
212 |     file_info = get_zip_info(file_path)
213 |     if file_info['is_zip']:
214 |         ret = 
uncompress_zip(file_path, extract_dir, file_info['compression']) 215 | elif is_gz(file_path): 216 | ret = uncompress_gz(file_path, extract_dir) 217 | elif is_tar(file_path): 218 | ret = uncompress_tar(file_path, extract_dir) 219 | elif is_7z(file_path): 220 | ret = uncompress_7z(file_path, extract_dir) 221 | elif is_rar(file_path): 222 | ret = uncompress_rar(file_path, extract_dir) 223 | elif is_error: 224 | raise ValueError('{} is not a compressed file.'.format(file_path)) 225 | 226 | if is_recursive and ret and max_level > 0: 227 | for it in ret.glob('**/*'): 228 | uncompress(it, ret.joinpath('un_' + it.name), is_error, is_recursive, max_level - 1) 229 | return ret 230 | 231 | 232 | if __name__ == '__main__': 233 | # print(zip_info(pathlib.Path('cache/utils.zip'))) 234 | # print(uncompress_zip('cache/utils.zip')) 235 | # print(is_tar(pathlib.Path('cache/utils.tar'))) 236 | # print(uncompress_tar('cache/utils.tar')) 237 | # print(is_gz(pathlib.Path('cache/utils.tgz'))) 238 | # print(uncompress_gz('cache/utils.tgz')) 239 | # print(is_7z(pathlib.Path('cache/utils.7z'))) 240 | # print(uncompress_7z('cache/utils.7z')) 241 | # print(is_rar(pathlib.Path('cache/utils.rar'))) 242 | # print(uncompress_rar('cache/utils.rar')) 243 | print(uncompress('cache/utils.xlsx', is_error=False, is_recursive=True)) 244 | pass 245 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

--------------------------------------------------------------------------------
/sensitive-helper.py:
--------------------------------------------------------------------------------
#!/bin/python3
# _*_ coding:utf-8 _*_
#
"""
sensitive-helper.py

Local sensitive-information search tool:
- walks a target directory or file, parsing or unpacking common office and archive formats where needed;
- scans concurrently with a process pool, matching potentially sensitive data against rules;
- writes the hits to CSV or JSON.

Main modules:
- utils.compress: detect and extract various archive formats
- utils.office: parse .docx/.xlsx content into scannable text
- utils.process: process-pool helpers
- utils.configurator: simple configuration loading and merging
"""

import base64
import binascii
import csv
import json
import pathlib
import re
import time
from typing import Any, AnyStr, Dict, List, Union

import pandas
import tqdm

from utils import compress, configurator, office, process


def log_run_times(func):
    """Decorator: log calls that take longer than 1 second. Debug-only helper for
    spotting which steps burn compute time; safe to ignore in normal use.

    Entries go to `run_times.log` with the elapsed time and the first 127
    characters of the first argument.
    """

    def wrapper(*args, **kwargs):
        s_time = time.time()
        ret = func(*args, **kwargs)
        total_time = time.time() - s_time
        if total_time <= 1:
            return ret
        with open("run_times.log", "a") as _f:
            _f.write("total time(s): {}, args: {}\n".format(total_time, args[0][:127]))
        return ret

    return wrapper


def string_to_reg_flags(flags: str):
    """Convert a regex flag string such as "I|M|S" into the combined `re` flag integer."""
    flags_int = 0
    for flag in flags.split("|"):
        flags_int |= getattr(re, flag)
    return flags_int


def is_filter_base64(result: AnyStr):
    """Validate and try to decode a Base64 fragment.

    Returns (is_filter, extend_text):
    - is_filter is True when the hit should be dropped (invalid/unreadable/non-conforming);
    - extend_text is the decoded readable text when the hit is kept.
    """
    if len(result) % 4 != 0:
        return True, ""
    try:
        # Drop anything that fails to decode; it would be unreadable anyway.
        ret_extend = base64.b64decode(result).decode("utf-8")
        if not re.search(
            r"^[\u0020-\u007F\u2010-\u202f\u3000-\u301f\u4e00-\u9fa5\uff00-\uffef]+$",
            ret_extend,
        ):
            return True, ""
        # \u0020-\u007F: visible ASCII characters
        # \u2010-\u202f: some Chinese punctuation
        # \u3000-\u301f: some Chinese punctuation
        # \u4e00-\u9fa5: common Chinese characters
        # \u2e80-\u9fff: Chinese characters including variant forms
        # \uff00-\uffef: some Chinese punctuation (full-width forms)
    except UnicodeDecodeError:
        return True, ""
    except binascii.Error:
        return True, ""
    return False, ret_extend


def is_filter_jwt(result: AnyStr):
    """Coarse JWT structure check: the first two Base64 segments must have valid lengths."""
    times = 0
    res_split = result.split(b".")  # type: ignore
    while times < 2:
        if len(res_split[times]) % 4 != 0:
            return True, ""
        times += 1
    return False, ""


def is_filter_result(result: AnyStr, filters: List[AnyStr], flags: int):
    """Second-pass filtering against `re_filters`; any hit means the result is dropped."""
    if not filters:
        return False, ""
    for fil in filters:
        if re.search(fil, result, flags):  # type: ignore
            return True, ""
    return False, ""


# @log_run_times
def search_content(
    file_object: Union[pathlib.Path, bytes],
    rules: Dict[str, List[str]],
    split: bytes = b"[\x00-\x1f\x7f]+",
    re_filter_content: bytes = rb"",
    is_re_all: bool = False,
    is_silent: bool = False,
) -> List[Dict[str, str]]:
    """Scan the content of a single file object and return the list of hits.

    Args:
    - file_object: a `Path` or bytes; for a Path, the raw bytes are split into rows on control characters;
    - rules: rule dict whose values are either regex lists or dicts carrying flags/re_filters/regexp;
    - split: row-splitting regex (bytes);
    - re_filter_content: skip filter; any row matching this regex is skipped entirely;
    - is_re_all: whether to keep matching the file against the remaining rules after the first hit.

    Returns:
    - list items carrying the file/group/regexp/match/extend fields.
    """
    ret = []
    row_contents = [file_object]
    if isinstance(file_object, pathlib.Path):
        row_contents = re.split(split, file_object.read_bytes())

    # Build the per-file row-scanning progress bar.
    file_name = str(file_object) if isinstance(file_object, pathlib.Path) else "bytes"
    result_gen = enumerate(row_contents, start=1)
    if is_silent:
        result_gen = tqdm.tqdm(
            enumerate(row_contents, start=1),
            total=len(row_contents),
            desc=f"file: {file_name.split('/')[-1][:16]}..",
            leave=False,  # do not keep the bar, to avoid clashing with the main progress bar
            ncols=100,
            bar_format="{desc}:{percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
        )

    for index, row_one in result_gen:
        # Rows were split on control characters above.
        if len(row_one) < 12:
            # Drop rows shorter than 12 characters.
            continue
        if re_filter_content and re.search(re_filter_content, row_one):
            continue
        for rule_name in rules:
            rule = rules[rule_name]
            flags = 0
            filters = None
            if isinstance(rule, Dict):
                if "flags" in rule:
                    flags = string_to_reg_flags(rule["flags"])  # type: ignore
                if "re_filters" in rule:
                    filters = rule["re_filters"]  # type: ignore
                rule = rule["regexp"]  # type: ignore
            for regexp in rule:
                r_result = re.search(regexp, row_one, flags)  # type: ignore
                if not r_result:
                    continue
                try:
                    result_byte = r_result.group()
                    result_text = result_byte.decode("utf-8")
                except UnicodeDecodeError:
                    continue
                is_filter, extend = is_filter_result(result_byte, filters, flags)  # type: ignore
                if rule_name == "BASE64":
                    is_filter, extend = is_filter_base64(result_byte)
                if rule_name == "JSON WEB TOKEN(JWT)":
                    is_filter, extend = is_filter_jwt(result_byte)
                if is_filter:
                    continue

                ret.append(
                    {
                        "file": f"{file_object.__str__()}:{index}",
                        "group": rule_name,
                        "regexp": regexp.decode("utf-8"),  # type: ignore
                        "match": result_text,
                        "extend": extend,
                    }
                )
                if not is_re_all:
                    # Matching every rule group is disabled and a hit was found, so stop scanning this file.
                    if hasattr(result_gen, 'close'):
                        result_gen.close()  # type: ignore  # close the progress bar
                    return ret

    if hasattr(result_gen, 'close'):
        result_gen.close()  # type: ignore  # close the progress bar
    return ret


def gen_file_list(src_path: str, exclude_files: List[str]) -> List[pathlib.Path]:
    """Build the list of files to scan, pre-processing specific types.

    - `.docx`/`.xlsx`: parse the content into a sibling `_resolved.txt` so it can be scanned later;
    - everything else: try recursive extraction to surface nested content.
    """
    tar_path = pathlib.Path(src_path)
    ret = []
    if tar_path.is_file():
        ret.append(tar_path)
    else:
        for filepath in tar_path.glob("**/*"):
            is_skip = False
            if filepath.is_dir():
                continue
            filename = filepath.name
            for r_exclude in exclude_files:
                # Regex match on the file name; skip anything on the exclude list.
                if re.match(r_exclude, filename):
                    is_skip = True
                    break
            if is_skip:
                continue
            if filename.endswith(".docx") and not filename.startswith("~$"):
                office.docx_handler(filepath)
            elif filename.endswith(".xlsx") and not filename.startswith("~$"):
                office.xlsx_handler(filepath)
            else:
                compress.uncompress(filepath, is_error=False, is_recursive=True)
            ret.append(filepath)
    return ret


def run():
    """Main flow: build the task queue, scan concurrently, de-duplicate, and write the results."""
    pool = process.ProcessPoolHelper(max_workers=CFG.get("process_number"))
    print("[*] file loading...")
    filelist = gen_file_list(CFG.get("target_path"), CFG.get("exclude_files"))  # type: ignore
    if not filelist:
        print("[!] the file path is empty. please check whether the path is correct.\n")
        return
    filelist = sorted(filelist, key=lambda x: x.stat().st_size, reverse=True)
    ret = []
    result_filter_list = []
    print(f"[*] found {len(filelist)} files.")
    groups = CFG.get("rules")
    for filepath in filelist:
        pool.submit_super(
            search_content,
            filepath,
            groups,
            CFG.get("row_split"),
            CFG.get("re_filter_content"),
            CFG.get("is_re_all"),
            CFG.get("is_silent"),
        )

    print("[*] analyzing...\n")
    result_gen = pool.result_yield()
    if CFG.get("is_silent"):
        result_gen = tqdm.tqdm(
            result_gen,
            total=len(filelist),
            mininterval=1,
            ncols=100,
            bar_format="{n_fmt}/{total_fmt} [{bar}] {elapsed}<{remaining},{rate_fmt}{postfix}",
        )
    for results in result_gen:
        if not results:
            continue
        for result in results:
            union_data = [result["file"], result["match"]]
            # De-duplicate identical match strings from the same file.
            if union_data in result_filter_list:
                continue
            result_filter_list.append(union_data)
            ret.append(result)
            if not CFG.get("is_silent"):
                print("[+] group: {}, match: {}, file: {}".format(result["group"], result["match"], result["file"]))
    output_format = CFG.get("output_format")
    filename = "results_{}.csv".format(time.strftime("%H%M%S", time.localtime()))
    if output_format == "json":
        filename = "results.json"
        with open(filename, "w", encoding="utf-8") as _f:
            _f.write(json.dumps(ret))
    else:
        to_csv(ret, filename)

    print("[*] total file number:", len(filelist))
    print("[+] output to:", pathlib.Path(filename).absolute())
    return ret


def to_csv(data: Union[Dict[str, Any], List[Dict[str, Any]]], filename: str = "output.csv"):
    """Write the result list to a CSV file."""
    dataframe = pandas.DataFrame(data)
    dataframe.to_csv(filename, quoting=csv.QUOTE_MINIMAL)


# Matches dict-style, list-style, and function-call-style assignments.
FUZZY_UNIVERSAL_STRING = r'["\'`]?\s*[=:(\{\[]\s*["\'`][\x20-\x7F]{,128}?[\'"`]'

PATH_COMMON_STRING = r"users?|windows?|program files(\(x\d{2,3}\))?|s?bin|etc|usr|boot|dev|home|proc|opt|sys|srv|var"

__DEFAULT_CONFIG = {
    "target_path": {
        "__help": "search for file paths or folder paths for sensitive cache (eg. ~/download/folder).",
    },
    "process_number": {
        "__type": int,
        "__default": 12,
        "__help": "number of program processes (default: 12).",
    },
    "config_path": {
        "__default": "config.yaml",
        "__help": "path to the yaml configuration file (default: config.yaml).",
    },
    "output_format": {
        "__default": "csv",
        "__help": "output file format, available formats json, csv (default: csv).",
    },
    "exclude_files": {
        "__default": [r"\.DS_Store"],
        "__nargs": "+",
        "__help": "excluded files, using regular matching (eg. \\.DS_Store .*bin .*doc).",
    },
    "is_re_all": {
        "__flags": ['-a', '--is-re-all'],
        "__action": "store_true",
        "__help": "when set, keep matching every regular expression group instead of exiting the match loop after the first hit per file.",
    },
    "is_silent": {
        "__flags": ['-s', '--is-silent'],
        "__action": "store_true",
        "__help": "silent mode: when enabled, hits are not printed to the console; a progress bar is shown instead.",
    },
    "re_filter_content": {
        "__help": "filter regular expression: any row that matches it is skipped entirely during scanning.",
    },
    "row_split": "[\x00-\x1f\x7f]+",
    "rules": {
        "AKSK": [
            r"[\s\n\'\"`=:#]LTAI\w{12,20}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#](A3T[A-Z0-9]|ABIA|ACCA|AGPA|AIDA|AIPA|AKIA|ANPA|ANVA|APKA|AROA|ASCA|ASIA)[0-9A-Z]{16}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]GOOG\w{10,30}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]AZ[A-Za-z0-9]{34,40}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]IBM[A-Za-z0-9]{10,40}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#][a-zA-Z0-9]{8}(-[a-zA-Z0-9]{4}){3}-[a-zA-Z0-9]{12}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]OCID[A-Za-z0-9]{10,40}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]LTAI[A-Za-z0-9]{12,20}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]AKID[A-Za-z0-9]{13,20}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]AK[A-Za-z0-9]{10,40}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]JDC_[A-Z0-9]{28,32}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]AKLT[a-zA-Z0-9-_]{0,252}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]UC[A-Za-z0-9]{10,40}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]QY[A-Za-z0-9]{10,40}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]AKLT[a-zA-Z0-9-_]{16,28}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]LTC[A-Za-z0-9]{10,60}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]YD[A-Za-z0-9]{10,60}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]CTC[A-Za-z0-9]{10,60}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]YYT[A-Za-z0-9]{10,60}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]YY[A-Za-z0-9]{10,40}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]CI[A-Za-z0-9]{10,40}[\s\n\'\"`=:#]",
            r"[\s\n\'\"`=:#]gcore[A-Za-z0-9]{10,30}[\s\n\'\"`=:#]",
        ],
        "JSON WEB TOKEN(JWT)": [r"ey[0-9a-zA-Z/+]{4,}={,2}\.[0-9a-zA-Z/+]{6,}={,2}\.[A-Za-z0-9-_]+"],
        "FUZZY MATCH": {
            "flags": "I",
            "regexp": [
                r"(APP|ACCESS|USER|PASS|OSS|ECS|CVM|AWS)[\w]{,8}(NAME|ID|KEY|NUM|ENC|CODE|SEC|WORD)[\w]{,16}%s"
                % FUZZY_UNIVERSAL_STRING,
                # Handles camelCase and snake_case: for map-style keys the word must be followed by an
                # uppercase letter, underscore, or hyphen, otherwise unwanted matches can slip in.
                r"(USR|PWD|COOKIE)[_\-A-Z][\w]{,16}%s" % FUZZY_UNIVERSAL_STRING,
                r"(SECRET|SIGN|TOKEN)[\w]{,16}%s" % FUZZY_UNIVERSAL_STRING,
            ],
        },
        "BASE64": [r"[0-9a-zA-Z/+]{8,}={,2}"],
        "URL": {
            "regexp": [r"(ftp|https?):\/\/[%.\w\-]+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?"],
            "re_filters": [
                r"(adobe|amap|android|apache|bing|digicert|eclipse|freecodecamp|github|githubusercontent|gnu|godaddy|google|googlesource|youtube|youtu|jd"
                r"|npmjs|microsoft|openxmlformats|outlook|mozilla|openssl|oracle|qq|spring|sun|umang|w3|wikipedia|xml)\.("
                r"org|com|cn|net|edu|io|be)",
                r"(ali|baidu|cdn|example|ssh|ssl)[\w-]*\.(org|com|cn|net|edu|io)",
            ],
        },
        "EMAIL": [r"[a-zA-Z0-9][-+.\w]{1,127}@([a-zA-Z0-9][-a-zA-Z0-9]{0,63}.){,3}(org|com|cn|net|edu|mail)"],
        "PHONE": [r"(13[0-9]|14[5-9]|15[0-3,5-9]|16[6]|17[0-8]|18[0-9]|19[8,9])\d{8}"],
        "FILE PATH": {
            "flags": "I|X",
            "regexp": [
                # Note: the space in "program files" is escaped because re.X ignores unescaped whitespace.
                r"([a-z]:\\)?([\\/])(users?|windows?|program\ files(\(x\d{2,3}\))?|s?bin|etc|usr|boot|dev|home|proc|opt"
                r"|sys|srv|var)(\2[.\w!#\(~\[\{][.\w!#&\(\)+=~\[\]\{\}\s]{2,63}){1,16}"
            ],
            "re_filters": [
                # r'[\\/].*sdk.*',
                # r'[\\/](alibaba|aliyun|annotation|apache|chromium|collections|eclipse|facebook|functions|github|google'
                # r'|internal|jetbrains|oppo|reactnative|reflect|sdklib|sequences|taobao|tencent|unionpay|view|vivo'
                # r'|webkit|xiaomi)',
            ],
        },
    },
}

# Module-level placeholder configuration; __main__ replaces it with the fully parsed CLI configuration.
CFG = configurator.CliConfigurator({})

if __name__ == "__main__":
    import argparse

    CFG = configurator.new(
        configurator.CliConfigurator,
        template=__DEFAULT_CONFIG,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="""
███████╗███████╗███╗   ██╗███████╗██╗████████╗██╗██╗   ██╗███████╗
██╔════╝██╔════╝████╗  ██║██╔════╝██║╚══██╔══╝██║██║   ██║██╔════╝
███████╗█████╗  ██╔██╗ ██║███████╗██║   ██║   ██║██║   ██║█████╗
╚════██║██╔══╝  ██║╚██╗██║╚════██║██║   ██║   ██║╚██╗ ██╔╝██╔══╝
███████║███████╗██║ ╚████║███████║██║   ██║   ██║ ╚████╔╝ ███████╗
╚══════╝╚══════╝╚═╝  ╚═══╝╚══════╝╚═╝   ╚═╝   ╚═╝  ╚═══╝  ╚══════╝
                                                            v0.1.6
            by 0xn0ne, https://github.com/0xn0ne/sensitive-helper
""",
    )
    # e.g. python3 sensitive-helper.py -t ~/download/folder -o json
    CFG.parse_args('config_path')  # type: ignore
    print("[*] config:", CFG.gen_detail(depth=2, filters=["rules"]))
    # Pre-encode all rule patterns and filters to bytes: files are scanned as raw bytes.
    rules = CFG.get("rules")
    for rule in rules.values():
        if isinstance(rule, Dict):
            if "re_filters" in rule:
                for index, value in enumerate(rule["re_filters"]):
                    rule["re_filters"][index] = value.encode()
            rule = rule["regexp"]
        for index, value in enumerate(rule):
            rule[index] = value.encode()
    CFG.raw["row_split"] = CFG.raw["row_split"].encode()
    CFG.raw["re_filter_content"] = CFG.raw["re_filter_content"].encode()
    run()
--------------------------------------------------------------------------------
/utils/configurator.py:
--------------------------------------------------------------------------------
#!/bin/python3
# _*_ coding:utf-8 _*_
#
# configurator.py
# Dependencies: pip install toml pyyaml
# toml docs: https://github.com/uiri/toml
# yaml docs: https://pyyaml.org/wiki/PyYAMLDocumentation

import argparse
import json
import pathlib
import re
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union

try:
    import toml
except ImportError:
    toml = None

try:
    import yaml
except ImportError:
    yaml = None

# Global registry that stores every named configurator instance.
_G_CFG = {}


class Mode:
    merge = 10
    update = 20
    replace = 30


class BaseConfigurator:
    def __init__(self, template: Dict[str, Any] = {}):
        """
        Initialize the configurator base class.

        Args:
            template (Dict, optional): default configuration template. Defaults to an empty dict.
        """
        self.raw = template or {}
        # Keep the original template around.
        self.template = template

    def loads(self, content: str, fmt: str = "json", mode: int = Mode.update) -> "BaseConfigurator":
        """
        Load configuration from a string.

        Args:
            content (str): configuration content string
            fmt (str, optional): format, one of 'json', 'yaml', 'toml'. Defaults to 'json'.
            mode (int, optional): how to combine with the existing configuration
                (Mode.merge, Mode.update, or Mode.replace). Defaults to Mode.update.

        Returns:
            BaseConfigurator: self
        """
        if not content:
            return self
        if fmt == "toml" and toml:
            config_dict = toml.loads(content)
        elif fmt == "yaml" and yaml:
            config_dict = yaml.safe_load(content)
        else:
            config_dict = json.loads(content)
        if mode == Mode.merge:
            self.raw = self._merge_dicts(self.raw, config_dict)
        elif mode == Mode.replace:
            self.raw = config_dict
        else:
            self.raw.update(config_dict)
        return self

    def dumps(self, fmt: str = "json") -> str:
        """
        Serialize the configuration to a string.

        Args:
            fmt (str, optional): format, one of 'json', 'yaml', 'toml'. Defaults to 'json'.

        Returns:
            str: serialized configuration string
        """
        if fmt == "toml" and toml:
            return toml.dumps(self.raw)
        elif fmt == "yaml" and yaml:
            return yaml.safe_dump(self.raw)
        else:
            return json.dumps(self.raw)

    def get(self, keys: str, default: Any = None, sep: str = ".") -> Any:
        """
        Read a configuration value.

        Args:
            keys (str): multi-level key string, e.g. "a.b.c"
            default (Any, optional): fallback value; when it is None, a missing key raises
                KeyError instead. Defaults to None.
            sep (str, optional): key separator. Defaults to '.'.

        Returns:
            Any: the configuration value
        """
        keys_list = keys.split(sep)
        value = self.raw
        for key in keys_list:
            if isinstance(value, dict) and key in value:
                value = value[key]
            elif default is None:
                raise KeyError(f'key "{keys}" not found in configuration.')
            else:
                return default
        return value

    def set(self, keys: str, value: Any, sep: str = "."):
        """
        Set a configuration value.

        Args:
            keys (str): multi-level key string, e.g. "a.b.c"
            value (Any): the value to store
            sep (str, optional): key separator. Defaults to '.'.
        """
        keys_list = keys.split(sep)
        d = self.raw
        for key in keys_list[:-1]:
            if key not in d or not isinstance(d[key], dict):
                d[key] = {}
            d = d[key]
        d[keys_list[-1]] = value

    def exists(self, key: str) -> bool:
        """
        Check whether a configuration key exists.

        Args:
            key (str): configuration key

        Returns:
            bool: whether the key exists
        """
        try:
            self.get(key)
        except KeyError:
            return False
        return True

    def clear(self):
        """
        Clear the configuration.
        """
        self.raw = {}

    def gen_detail(self, depth: int = 3, sep: str = "; ", filters: List[str] = []) -> str:
        """
        Generate a short preview of the configuration.

        Args:
            depth (int, optional): maximum traversal depth; a negative value prints everything.
                Defaults to 3.
            sep (str, optional): separator between top-level keys. Defaults to '; '.
            filters (List[str], optional): dotted key paths to omit from the preview. Defaults to [].

        Returns:
            str: short configuration preview string
        """

        def _gen_detail(parent_dict, current_depth: int, parent_key: str = ""):
            if not current_depth:
                return "..."
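            # Recurse into nested dicts, dropping filtered key paths and replacing
            # values below the remaining depth budget with "..." placeholders.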
            if isinstance(parent_dict, dict):
                d_detail = {}
                for k, v in parent_dict.items():
                    next_key = "{}.{}".format(parent_key, k) if parent_key else k
                    if filters and next_key in filters:
                        continue
                    d_detail[k] = _gen_detail(v, current_depth - 1, next_key)

                return d_detail
            return parent_dict

        detail = _gen_detail(self.raw, depth)
        l_detail = []
        for k, v in detail.items():  # type: ignore
            l_detail.append(f"{k}: {v}")

        return sep.join(l_detail)

    def _merge_dicts(self, d1, d2):
        """
        Recursively merge two dicts.

        Args:
            d1 (dict): destination dict, modified in place
            d2 (dict): source dict

        Returns:
            dict: the merged dict
        """
        for k, v in d2.items():
            if k in d1 and isinstance(d1[k], dict) and isinstance(v, dict):
                self._merge_dicts(d1[k], v)
            else:
                d1[k] = v
        return d1


class FileConfigurator(BaseConfigurator):
    def __init__(
        self,
        filepath: str = "configs.json",
        template: Dict = {},
        is_auto_make: bool = False,
    ):
        """
        Initialize the file-backed configurator subclass.

        Args:
            filepath (str, optional): configuration file path. Defaults to 'configs.json'.
            template (Dict, optional): default configuration template. Defaults to an empty dict.
            is_auto_make (bool, optional): create the configuration file when it does not exist.
                Defaults to False.
        """
        super().__init__(template)
        self.filepath = pathlib.Path(filepath)
        if is_auto_make:
            self.filepath.touch()
        self.load(Mode.merge)

    def load(self, mode: int = Mode.update):
        """
        Load the configuration from the file.
        """
        if self.filepath.is_file():
            self.loads(self.filepath.read_text(encoding='utf-8'), self.filepath.suffix[1:], mode)

    def save(self, fmt: str = "json"):
        """
        Save the configuration to the file.

        Args:
            fmt (str, optional): format, one of 'json', 'yaml', 'toml'. Defaults to 'json'.
        """
        if fmt not in ["json", "yaml", "toml"]:
            raise NotImplementedError(f'the file format "{fmt}" is not supported.')
        self.filepath = self.filepath.with_suffix(f".{fmt}")
        self.filepath.write_text(self.dumps(fmt))

    def delete(self, fmts: Union[str, List[str]] = []):
        """
        Delete the configuration file(s).

        Args:
            fmts (Union[str, List[str]], optional): formats to delete, each one of 'json',
                'yaml', 'toml'; an empty list deletes all three. Defaults to [].
        """
        if isinstance(fmts, str):
            fmts = [fmts]
        for ext in fmts:
            if ext not in ["json", "yaml", "toml"]:
                raise NotImplementedError(f'the file format "{ext}" is not supported.')
        if not fmts:
            fmts = ["json", "yaml", "toml"]
        for ext in fmts:
            self.filepath.with_suffix(f".{ext}").unlink(missing_ok=True)


class CliConfigurator(FileConfigurator):
    def __init__(
        self,
        template: Dict[str, Union[Dict[str, str], Any]],
        filepath: str = "configs.json",
        prog: Optional[str] = None,
        usage: Optional[str] = None,
        description: Optional[str] = None,
        epilog: Optional[str] = None,
        **kwargs,
    ):
        """
        Initialize the command-line configurator subclass.

        Args:
            template (Dict): default command-line parameter template; it must be provided and
                cannot be loaded from a configuration file.
            filepath (str, optional): configuration file path. Defaults to 'configs.json'.
        """
        # Build the command-line parser from the template.
        self.exists_short_key: Set[str] = set()
        self.parser = argparse.ArgumentParser(prog, usage, description, epilog, **kwargs)
        self._add_args_from_template(template)

        # Walk the template and strip the default values and descriptions that were only
        # needed to build the command-line arguments.
        del_keys = []
        for key, value in template.items():
            if not isinstance(value, dict):
                continue
            # Iterate over a snapshot so __description can be deleted without
            # mutating the dict mid-iteration.
            for sub_key in list(value):
                if sub_key == "__default":
                    template[key] = value["__default"]
                    continue
                if sub_key == '__description':
                    del value["__description"]
                    continue
                if sub_key.startswith('__'):
                    del_keys.append(key)
                    break
        # Deleting keys while iterating would mutate the dict and raise, so delete after the scan.
        for del_key in del_keys:
            del template[del_key]
        super().__init__(filepath, template)

    def _parse_add_argument_kwargs(
        self,
        value: Dict[str, Any],
        able_keys: List[str] = [
            'default',
            'type',
            'help',
            'required',
            'action',
            'nargs',
            'const',
            'choices',
            'metavar',
            'dest',
            'deprecated',
        ],
    ) -> Dict[str, Any]:
        """
        Collect add_argument() keyword arguments from a template entry.
        able_keys follows the parameters documented at
        https://docs.python.org/3/library/argparse.html#argparse.ArgumentParser.add_argument

        Args:
            value (Dict[str, Any]): template entry whose '__'-prefixed keys carry argparse options.
            able_keys (List[str], optional): argparse parameter names that may be forwarded.

        Returns:
            Dict[str, Any]: keyword arguments ready to pass to parser.add_argument()
        """
        kwargs = {}
        for key, val in value.items():
            if key.startswith('__') and key[2:] in able_keys:
                kwargs[key[2:]] = val
        return kwargs

    def _gen_flags(self, flag: str, is_initial_letter_mode: bool = False) -> List[str]:
        """Generate command-line flags, including a short flag when one is available.

        Args:
            flag (str): the regular long flag string
            is_initial_letter_mode (bool, optional): build the short flag from the initial of
                every word in the flag instead of probing prefixes. Defaults to False.

        Returns:
            List[str]: the generated flag strings, short flag first when one was created
        """
        r_flag = re.findall(r'[\da-zA-Z]+', flag)
        if not r_flag:
            raise ValueError('the flag does not conform to the specification and should be a character in "0-9a-zA-Z".')
        # Build the regular long flag.
        flags = ['--' + '-'.join(r_flag)]
        new_short_flag = '-'
        if is_initial_letter_mode:
            # Raise if the initial-letter short flag already exists.
            new_short_flag += ''.join([i[0] for i in r_flag])
            if new_short_flag in self.exists_short_key:
                raise ValueError(f'this "{new_short_flag}" short flag command already exists.')
            self.exists_short_key.add(new_short_flag)
            flags.append(new_short_flag)
            flags.reverse()
            return flags
        for word in r_flag:
            # Grow the short flag one initial at a time until an unused one is found.
            new_short_flag += word[0]
            if new_short_flag not in self.exists_short_key:
                self.exists_short_key.add(new_short_flag)
                flags.append(new_short_flag)
                break
        flags.reverse()
        return flags

    def _add_args_from_template(self, template: Dict, group: Optional[argparse.ArgumentParser] = None):
        """
        Add command-line arguments according to the template.

        Args:
            template (Dict): configuration template; command-line parsing only supports dict
                structures nested at most two levels deep.
            group (Optional[argparse.ArgumentParser], optional): argument group to add to.
                Defaults to the root parser.
        """
        if not group:
            group = self.parser

        for key, value in template.items():
            if not isinstance(value, dict):
                # Only dict sub-values are turned into command-line arguments.
                continue
            kwargs = self._parse_add_argument_kwargs(value)
            if kwargs:
                if '__flags' in value:
                    flags = value['__flags']
                else:
                    flags = self._gen_flags(key)

                group.add_argument(  # type: ignore
                    *flags,
                    **kwargs,
                )
                continue
            # No add_argument() parameters were found, so register the entry as an argument group.
            # __description is optional; when present it becomes the group description.
            group = self.parser.add_argument_group(key, description=value.get("__description", None))  # type: ignore
            self._add_args_from_template(value, group)

    def parse_args(self, config_file_flag: Optional[str] = None):
        """
        Parse the command-line arguments and merge them into the configuration.

        Args:
            config_file_flag (Optional[str]): name of the argument that carries the configuration
                file path. Defaults to None.
        """
        args = vars(self.parser.parse_args())
        if config_file_flag and args[config_file_flag]:
            self.filepath = pathlib.Path(args[config_file_flag])
            # Reload the file named on the command line so its values can then be
            # overridden by the remaining command-line arguments.
            self.load(Mode.update)
        for key, value in args.items():
            if value is None:
                continue
            self.set(key, value)


def new(
    base_class: Callable = FileConfigurator,
    name: str = "__DEFAULT__",
    *args,
    **kwargs,
) -> Union[BaseConfigurator, FileConfigurator, CliConfigurator]:
    """
    Create (or return) a named configuration object of the given base class.

    Args:
        base_class (Callable, optional): class of the configuration object. Defaults to FileConfigurator.
        name (str, optional): name of the configuration object. Defaults to '__DEFAULT__'.
        *args: variable-length argument list forwarded to the class.
        **kwargs: arbitrary keyword arguments forwarded to the class.

    Returns:
        FileConfigurator: the newly created (or previously cached) configuration object.

    Examples:
        >>> new(FileConfigurator, 'my_config_name', filepath='configs.json')

    """
    if name not in _G_CFG:
        _G_CFG[name] = base_class(*args, **kwargs)
    return _G_CFG[name]


# Example usage
if __name__ == "__main__":

    cfg = new(is_auto_make=True)
    cfg.set("keyint", 16)
    cfg.set("keystr", "hello")
    cfg.set("keydic", {})
    cfg.set("keyarr", ["a", "b"])
    cfg.set("keys.a", {"id": 10, "role": "admin"})
    cfg.set(
        "keys.a.info",
        {"name": "lilei", "age": 20, "female": True, "like": ["ball", "swim"]},
    )
    cfg.set("keys.b", {"user": "lee", "pass": "lei"})
    assert cfg.get("keyint", 996) == 16
    cfg.set("keyint", 32)
    assert cfg.get("keyint", 996) == 32

    assert cfg.get("keys.b.user") == "lee"
    cfg.set("keys.b.user", "li")
    assert cfg.get("keys.b.user") == "li"

    assert cfg.get("keyc.a", 1024) == 1024
    try:
        assert cfg.get("keys.b.c.d") == "this is error test."
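        # get() should have raised KeyError above (the default is None and "keys.b.c.d"
        # does not exist), so reaching this line means the lookup misbehaved.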
        raise SystemError("assert failure.")
    except KeyError:
        pass

    assert cfg.dumps() == (
        '{"keyint": 32, "keystr": "hello", "keydic": {}, "keyarr": ["a", "b"], "keys": {"a": {"id": 10, "role": "admin", "info": {"name": "lilei", "age": 20, "female": true, "like": ["ball", "swim"]}}, "b": {"user": "li", "pass": "lei"}}}'
    )
    assert cfg.gen_detail() == (
        "keyint: 32; keystr: hello; keydic: {}; keyarr: ['a', 'b']; keys: {'a': {'id': '...', 'role': '...', 'info': '...'}, 'b': {'user': '...', 'pass': '...'}}"
    )
    assert cfg.gen_detail(depth=2) == (
        "keyint: 32; keystr: hello; keydic: {}; keyarr: ['a', 'b']; keys: {'a': '...', 'b': '...'}"
    )
    assert cfg.gen_detail(filters=["notkey", "keyarr", "keys.a.info.name", "keys.b"]) == (
        "keyint: 32; keystr: hello; keydic: {}; keys: {'a': {'id': '...', 'role': '...', 'info': '...'}}"
    )
    # cfg.load_from_url('https://httpbin.org/get')
    # assert cfg.get('url') == 'https://httpbin.org/get'

    cfg.clear()
    cfg.loads(
        """za:
  user: lee
zb: 1024
zc: {}
zd:
  - a
  - b""",
        fmt="yaml",
    )

    assert cfg.gen_detail() == "za: {'user': 'lee'}; zb: 1024; zc: {}; zd: ['a', 'b']"

    cfg.save()  # type: ignore
    cfg.save("json")  # type: ignore
    cfg.save("yaml")  # type: ignore
    cfg.save("toml")  # type: ignore

    template = {
        "url": {
            "__type": str,
            "__default": "https://localhost/api/v2/user",
            "__help": "backend address",
        },  # top-level command-line argument
        # 'database.host': {'__type': str, '__default': 'www.eg.com', '__help': 'sample address'},  # duplicate/conflicting command-line key
        "database": {
            "host": {"__type": str, "__default": "localhost", "__help": "database address"},
            "port": {
                "__type": int,
                "__default": 3306,
            },  # second-level argument without a __help parameter
            "__description": "database group configuration parameters",  # group created with a __description parameter
        },
        "logging": {
            "level": {"__type": str, "__default": "INFO", "__help": "log level"}
        },  # group created without a __description parameter
        "rule": {
            "china": {"beijing": "Beijing region rules"},  # second level without __type/__default parameters
        },
    }

    cfg = CliConfigurator(template=template)
    cfg.parse_args()
    print(cfg.raw)
    cfg.save("yaml")
    cfg.parser.print_help()
    print(cfg.gen_detail())
    cfg.delete()
--------------------------------------------------------------------------------