├── .gitattributes ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── build_win.bat ├── config └── example_config.toml ├── locales ├── en │ └── LC_MESSAGES │ │ └── py_extract.po └── zh_Hans_CN │ └── LC_MESSAGES │ └── py_extract.po ├── py_extract ├── __init__.py ├── config.py ├── exceptions.py ├── extractor.py ├── file_renaming.py ├── logging_utils.py ├── utils.py └── zip_decrypter.pyx ├── requirements-dev.txt ├── requirements.txt ├── run.py ├── setup.py └── tests ├── __init__.py ├── test_extractor.py └── test_load_passwords.py /.gitattributes: -------------------------------------------------------------------------------- 1 | * text eol=lf 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | *.exe 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # dotenv 85 | .env 86 | 87 | # virtualenv 88 | .venv 89 | venv/ 90 | venv-*/ 91 | ENV/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # IDE settings 107 | .vscode/ 108 | .idea/ 109 | .ruff_cache/ 110 | 111 | # temp file 112 | temp/ 113 | temp.ipynb 114 | 115 | # config file 116 | py_extract_config.toml 117 | 118 | # cython file 119 | *.c 120 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v3.2.0 6 | hooks: 7 | - id: trailing-whitespace 8 | - id: end-of-file-fixer 9 | - id: check-case-conflict 10 | - id: check-added-large-files 11 | - repo: https://github.com/Lucas-C/pre-commit-hooks 12 | rev: "v1.5.1" 13 | hooks: 14 | - id: remove-crlf 15 | - id: remove-tabs 
16 | - repo: https://github.com/charliermarsh/ruff-pre-commit 17 | rev: "v0.0.271" 18 | hooks: 19 | - id: ruff 20 | args: ["--select=I", "RUF"] 21 | - repo: https://github.com/psf/black 22 | rev: "23.3.0" 23 | hooks: 24 | - id: black 25 | args: ["--line-length", "80"] 26 | - repo: https://github.com/pre-commit/mirrors-mypy 27 | rev: "v1.5.1" 28 | hooks: 29 | - id: mypy 30 | additional_dependencies: [types-toml] 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Phil 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyExtract 2 | 3 | ![Python](https://img.shields.io/badge/python-3.11-blue.svg) 4 | [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit)](https://github.com/pre-commit/pre-commit) 5 | ![GitHub](https://img.shields.io/github/license/davuses/PyExtract?style=flat-square) 6 | 7 | PyExtract is a utility that recursively finds and extracts archives in the target folder. 8 | 9 | It can decrypt and decompress zip archives with Non-UTF-8 encoded password. For more details, refer to the related [superuser question](https://superuser.com/questions/1676282). 10 | 11 | PyExtract uses Cython to speed up the `zipfile` library 12 | 13 | ## Screenshots 14 | 15 | image 16 | 17 | ## Installation and Usage 18 | 19 | ### Prerequisites 20 | 21 | - Python >= 3.11 22 | - 7-Zip program (`7z` binary) added to your machine's PATH environment variable. 
23 | 24 | ### Installation 25 | 26 | Install the required packages using pip: 27 | 28 | ```sh 29 | pip install -r requirements.txt 30 | ``` 31 | 32 | Make sure you have Cython installed, and then compile the Cython extension: 33 | 34 | ```sh 35 | python setup.py build_ext --inplace 36 | ``` 37 | 38 | Compile translation files: 39 | 40 | ```sh 41 | python setup.py compile_catalog -D py_extract -d locales/ 42 | ``` 43 | 44 | ### Configuration and Running 45 | 46 | Create a configuration file `py_extract_config.toml` by copying and modifying `./config/example_config.toml`. 47 | 48 | Then run: 49 | 50 | ```sh 51 | $ python run.py --help 52 | 53 | usage: run.py [-h] [-c CONFIG] [-t TARGET_DIR] [-a] [-d] 54 | 55 | PyExtract 56 | 57 | options: 58 | -h, --help show this help message and exit 59 | -c CONFIG, --config CONFIG 60 | config file path 61 | -t TARGET_DIR, --target-dir TARGET_DIR 62 | target directory 63 | -a, --auto-rename auto rename archives with bad names 64 | -d, --debug debug mode 65 | ``` 66 | 67 | ### Example Configuration 68 | 69 | Here's an example of a configuration file: 70 | 71 | ```toml 72 | # additional encodings of zip archives, the default is utf-8 73 | # see https://en.wikipedia.org/wiki/Windows_code_page 74 | # cp936 is used for Chinese encoding 75 | zip_metadata_encoding = ["cp936"] 76 | 77 | # language: en, cn 78 | language = "en" 79 | 80 | # automatically rename archives with bad filenames 81 | auto_rename = false 82 | 83 | # logging level: "warning", "debug" 84 | logging_level = "warning" 85 | 86 | 87 | [path] 88 | target_directory = "D:/download" 89 | password_path = "D:/passwords.txt" 90 | 91 | [exclude] 92 | # exclude filenames, you can leave them empty: suffixes=[] 93 | suffixes = [".apk", ".exe"] 94 | filenames = ["do_not_extract_me.zip"] 95 | substrings = ["not_an_archive"] 96 | 97 | [rename] 98 | # rename files whose filenames contain these substrings: 99 | substrings = ["删除", "删除我", "delete_this"] 100 | 101 | ``` 102 | 103 | ### 
Windows Users 104 | 105 | For Windows users, you can download the compiled binary file from the [releases](https://github.com/davuses/PyExtract/releases) section. Alternatively, you can run `build_win.bat` to build the binary yourself. 106 | 107 | ### Password File 108 | 109 | An example of a password file: 110 | 111 | ```py 112 | # passwords.txt 113 | password_one_in_second_group 114 | password_two_in_second_group 115 | 116 | password_one_in_first_group 117 | password_two_in_first_group 118 | ``` 119 | -------------------------------------------------------------------------------- /build_win.bat: -------------------------------------------------------------------------------- 1 | python -m pip install pyinstaller -r requirements.txt 2 | 3 | python setup.py compile_catalog -D py_extract -d locales/ 4 | 5 | python setup.py build_ext --inplace 6 | 7 | pyinstaller --onefile --add-data "locales;locales" -n "PyExtract" "run.py" 8 | 9 | copy ".\config\example_config.toml" ".\dist\py_extract_config.toml" 10 | -------------------------------------------------------------------------------- /config/example_config.toml: -------------------------------------------------------------------------------- 1 | # additional encodings of zip archives, the default is utf-8 2 | # see https://en.wikipedia.org/wiki/Windows_code_page 3 | # cp936 is used for Chinese encoding 4 | zip_metadata_encoding = ["cp936"] 5 | 6 | # language: en, cn 7 | language = "en" 8 | 9 | # automatically rename archives with bad filenames 10 | auto_rename = false 11 | 12 | # logging level: "warning", "debug" 13 | logging_level = "warning" 14 | 15 | 16 | [path] 17 | target_directory = "D:/download" 18 | password_path = "D:/passwords.txt" 19 | 20 | [exclude] 21 | # exclude filenames, you can leave them empty: suffixes=[] 22 | suffixes = [".apk", ".exe"] 23 | filenames = ["do_not_extract_me.zip"] 24 | substrings = ["not_an_archive"] 25 | 26 | [rename] 27 | # rename files whose filenames contain these substrings: 28 | 
substrings = ["删除", "删除我", "delete_this"] 29 | -------------------------------------------------------------------------------- /locales/en/LC_MESSAGES/py_extract.po: -------------------------------------------------------------------------------- 1 | # English translations for PROJECT. 2 | # Copyright (C) 2023 ORGANIZATION 3 | # This file is distributed under the same license as the PROJECT project. 4 | # FIRST AUTHOR , 2023. 5 | # 6 | msgid "" 7 | msgstr "" 8 | "Project-Id-Version: PROJECT VERSION\n" 9 | "Report-Msgid-Bugs-To: EMAIL@ADDRESS\n" 10 | "POT-Creation-Date: 2023-08-09 02:35+0800\n" 11 | "PO-Revision-Date: 2023-08-12 22:24+0800\n" 12 | "Last-Translator: FULL NAME \n" 13 | "Language: en\n" 14 | "Language-Team: en \n" 15 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 16 | "MIME-Version: 1.0\n" 17 | "Content-Type: text/plain; charset=utf-8\n" 18 | "Content-Transfer-Encoding: 8bit\n" 19 | "Generated-By: Babel 2.12.1\n" 20 | 21 | #: py_extract/py_extract.py:305 py_extract/py_extract.py:310 22 | #: py_extract/py_extract.py:321 23 | msgid "try password" 24 | msgstr "" 25 | 26 | #: py_extract/py_extract.py:306 27 | msgid "Skipping" 28 | msgstr "" 29 | 30 | #: py_extract/py_extract.py:307 py_extract/py_extract.py:309 31 | msgid "type" 32 | msgstr "" 33 | 34 | #: py_extract/py_extract.py:308 35 | msgid "Extracting" 36 | msgstr "" 37 | 38 | #: py_extract/py_extract.py:311 39 | msgid "password" 40 | msgstr "" 41 | 42 | #: py_extract/py_extract.py:312 43 | msgid "matches" 44 | msgstr "" 45 | 46 | #: py_extract/py_extract.py:313 47 | msgid "Done" 48 | msgstr "" 49 | 50 | #: py_extract/py_extract.py:314 51 | msgid "extracted to" 52 | msgstr "" 53 | 54 | #: py_extract/py_extract.py:315 55 | msgid "time cost" 56 | msgstr "" 57 | 58 | #: py_extract/py_extract.py:316 59 | msgid "Failed" 60 | msgstr "" 61 | 62 | #: py_extract/py_extract.py:317 63 | msgid "Wrong password or invalid archive" 64 | msgstr "" 65 | 66 | #: py_extract/py_extract.py:318 67 | msgid "Some files 
probably need to be renamed in these directories" 68 | msgstr "" 69 | 70 | #: py_extract/py_extract.py:319 71 | msgid "Take a look" 72 | msgstr "" 73 | 74 | #: py_extract/py_extract.py:320 75 | msgid "Do you want to retry extracting" 76 | msgstr "" 77 | 78 | #: py_extract/file_renaming.py:48 79 | msgid "rename done" 80 | msgstr "" 81 | 82 | #: py_extract/file_renaming.py:50 83 | msgid "skip rename" 84 | msgstr "" 85 | 86 | msgid "" 87 | "Din't find 7z command, please make sure 7z is installed and available in " 88 | "PATH env" 89 | msgstr "" 90 | 91 | msgid "Do you want to rename" 92 | msgstr "" 93 | 94 | msgid "to" 95 | msgstr "" 96 | 97 | msgid "retry extracting" 98 | msgstr "" 99 | 100 | msgid "target directory" 101 | msgstr "" 102 | 103 | msgid "None of the passwords can decrypt the archive" 104 | msgstr "" 105 | 106 | msgid "Invalid archive" 107 | msgstr "" 108 | 109 | 110 | msgid "None of the encodings can decode the archive" 111 | msgstr "" 112 | 113 | msgid "config path" 114 | msgstr "" 115 | 116 | msgid "password path" 117 | msgstr "" 118 | -------------------------------------------------------------------------------- /locales/zh_Hans_CN/LC_MESSAGES/py_extract.po: -------------------------------------------------------------------------------- 1 | # Chinese (Simplified, China) translations for PROJECT. 2 | # Copyright (C) 2023 ORGANIZATION 3 | # This file is distributed under the same license as the PROJECT project. 4 | # FIRST AUTHOR , 2023. 
5 | # 6 | msgid "" 7 | msgstr "" 8 | "Project-Id-Version: PROJECT VERSION\n" 9 | "Report-Msgid-Bugs-To: EMAIL@ADDRESS\n" 10 | "POT-Creation-Date: 2023-08-09 02:35+0800\n" 11 | "PO-Revision-Date: 2023-08-09 02:51+0800\n" 12 | "Last-Translator: FULL NAME \n" 13 | "Language: zh_Hans_CN\n" 14 | "Language-Team: zh_Hans_CN \n" 15 | "Plural-Forms: nplurals=1; plural=0;\n" 16 | "MIME-Version: 1.0\n" 17 | "Content-Type: text/plain; charset=utf-8\n" 18 | "Content-Transfer-Encoding: 8bit\n" 19 | "Generated-By: Babel 2.12.1\n" 20 | 21 | #: py_extract/py_extract.py:305 py_extract/py_extract.py:310 22 | #: py_extract/py_extract.py:321 23 | msgid "try password" 24 | msgstr "尝试密码" 25 | 26 | #: py_extract/py_extract.py:306 27 | msgid "Skipping" 28 | msgstr "跳过" 29 | 30 | #: py_extract/py_extract.py:307 py_extract/py_extract.py:309 31 | msgid "type" 32 | msgstr "文件类型" 33 | 34 | #: py_extract/py_extract.py:308 35 | msgid "Extracting" 36 | msgstr "正在解压" 37 | 38 | #: py_extract/py_extract.py:311 39 | msgid "password" 40 | msgstr "密码" 41 | 42 | #: py_extract/py_extract.py:312 43 | msgid "matches" 44 | msgstr "匹配" 45 | 46 | #: py_extract/py_extract.py:313 47 | msgid "Done" 48 | msgstr "完成" 49 | 50 | #: py_extract/py_extract.py:314 51 | msgid "extracted to" 52 | msgstr "解压到" 53 | 54 | #: py_extract/py_extract.py:315 55 | msgid "time cost" 56 | msgstr "花费时间" 57 | 58 | #: py_extract/py_extract.py:316 59 | msgid "Failed" 60 | msgstr "失败" 61 | 62 | #: py_extract/py_extract.py:317 63 | msgid "Wrong password or invalid archive" 64 | msgstr "密码错误或无效文件" 65 | 66 | #: py_extract/py_extract.py:318 67 | msgid "Some files probably need to be renamed in these directories" 68 | msgstr "以下目录中有文件可能需要重命名" 69 | 70 | #: py_extract/py_extract.py:319 71 | msgid "Take a look" 72 | msgstr "去处理" 73 | 74 | #: py_extract/py_extract.py:320 75 | msgid "Do you want to retry extracting" 76 | msgstr "再次解压" 77 | 78 | #: py_extract/file_renaming.py:48 79 | msgid "rename done" 80 | msgstr "命名完成" 81 | 82 | #: 
py_extract/file_renaming.py:50 83 | msgid "skip rename" 84 | msgstr "跳过命名" 85 | 86 | msgid "" 87 | "Din't find 7z command, please make sure 7z is installed and available in " 88 | "PATH env" 89 | msgstr "未找到 7z 程序,请确认程序已安装并配置环境变量" 90 | 91 | 92 | msgid "Do you want to rename" 93 | msgstr "将文件" 94 | 95 | msgid "to" 96 | msgstr "重命名为" 97 | 98 | msgid "retry extracting" 99 | msgstr "再次解压" 100 | 101 | msgid "target directory" 102 | msgstr "目标文件夹" 103 | 104 | msgid "None of the passwords can decrypt the archive" 105 | msgstr "所有密码都尝试失败" 106 | 107 | msgid "Invalid archive" 108 | msgstr "无效文件" 109 | 110 | msgid "None of the encodings can decode the archive" 111 | msgstr "所有编码都解码失败" 112 | 113 | msgid "config path" 114 | msgstr "配置文件路径" 115 | 116 | msgid "password path" 117 | msgstr "密码文件路径" 118 | -------------------------------------------------------------------------------- /py_extract/__init__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gettext 3 | import sys 4 | from logging import getLogger 5 | from pathlib import Path 6 | 7 | from .config import load_config 8 | from .extractor import PyExtractor 9 | from .logging_utils import FormattedFileHandler 10 | 11 | 12 | def resource_path(relative_path: str): 13 | """Get absolute path to resource, works for dev and for PyInstaller""" 14 | base_path = getattr(sys, "_MEIPASS", "./") 15 | return Path(base_path).joinpath(relative_path) 16 | 17 | 18 | def init_translation(lang: str): 19 | app_name = "py_extract" 20 | localedir = resource_path("locales") 21 | lang_dict = {"cn": "zh_Hans_CN", "en": "en"} 22 | locale_language = lang_dict.get(lang, "en") 23 | en_i18n = gettext.translation( 24 | app_name, 25 | localedir, 26 | languages=[locale_language], 27 | ) 28 | en_i18n.install(names=["gettext", "ngettext"]) 29 | 30 | 31 | def create_py_extractor(): 32 | parser = argparse.ArgumentParser(description="PyExtract") 33 | parser.add_argument("-c", "--config", help="config file 
def is_list_of_str(list_to_test: list[str]):
    """Tell whether *list_to_test* is a ``list`` containing only strings.

    An empty list is accepted.  Anything that is not a ``list`` instance
    (tuples, strings, ``None`` ...) is rejected.
    """
    return isinstance(list_to_test, list) and all(
        isinstance(item, str) for item in list_to_test
    )
exclude_filename: list[str] 29 | exclude_substrings: list[str] 30 | rename_substrings: list[str] 31 | target_directory: str 32 | passwords: list[str] 33 | password_path: str 34 | language: str 35 | auto_rename: bool 36 | logging_level: str 37 | config_path: str 38 | 39 | def __post_init__(self) -> None: 40 | assert is_list_of_str(self.zip_metadata_encoding) 41 | assert is_list_of_str(self.exclude_suffix) 42 | assert is_list_of_str(self.exclude_filename) 43 | assert is_list_of_str(self.exclude_substrings) 44 | assert is_list_of_str(self.rename_substrings) 45 | assert isinstance(self.target_directory, str) 46 | assert isinstance(self.language, str) 47 | assert isinstance(self.auto_rename, bool) 48 | assert isinstance(self.logging_level, str) 49 | if not Path(self.target_directory).exists(): 50 | raise InvalidPath( 51 | f"target directory {self.target_directory} doesn't exist" 52 | ) 53 | assert is_list_of_str(self.passwords) 54 | 55 | self.rename_substrings = sorted(self.rename_substrings, reverse=True) 56 | 57 | 58 | POSSIBLE_CONFIG_PATHS = [ 59 | "./config/py_extract_config.toml", 60 | "./py_extract_config.toml", 61 | ] 62 | CONFIG_NOT_FOUND_ERROR_MSG = f"""\ 63 | config file should be found in one of these paths: {POSSIBLE_CONFIG_PATHS},\ 64 | or you can specify the path with --config option""" 65 | 66 | 67 | def load_config(config_path: str | None = None) -> PyExtractConfig: 68 | if not config_path: 69 | for p in POSSIBLE_CONFIG_PATHS: 70 | if Path(p).is_file(): 71 | config_path = p 72 | break 73 | else: 74 | raise ConfigNotFound(CONFIG_NOT_FOUND_ERROR_MSG) 75 | else: 76 | if not Path(config_path).is_file(): 77 | raise ConfigNotFound(f"cannot find the config file: {config_path}") 78 | config_path = str(Path(config_path).resolve()) 79 | with open(config_path, mode="rb") as fp: 80 | toml_config = tomllib.load(fp) 81 | match toml_config: 82 | case { 83 | "zip_metadata_encoding": list() as zip_metadata_encoding, 84 | "language": str() as language, 85 | "auto_rename": 
bool() as auto_rename, 86 | "logging_level": str() as logging_level, 87 | "path": { 88 | "target_directory": str() as target_directory, 89 | "password_path": str() as password_path, 90 | }, 91 | "exclude": { 92 | "suffixes": list() as suffixes, 93 | "filenames": list() as filenames, 94 | "substrings": list() as substrings, 95 | }, 96 | "rename": {"substrings": list() as rename_substrings}, 97 | }: 98 | if not Path(password_path).exists(): 99 | raise InvalidPath( 100 | f"password file {password_path} doesn't exist" 101 | ) 102 | with open(password_path, "r", encoding="utf-8") as pwd_file: 103 | passwords = load_passwords(pwd_file) 104 | extract_config = PyExtractConfig( 105 | zip_metadata_encoding=zip_metadata_encoding, 106 | exclude_suffix=suffixes, 107 | exclude_filename=filenames, 108 | exclude_substrings=substrings, 109 | rename_substrings=rename_substrings, 110 | target_directory=target_directory, 111 | passwords=passwords, 112 | password_path=password_path, 113 | language=language, 114 | auto_rename=auto_rename, 115 | logging_level=logging_level, 116 | config_path=config_path, 117 | ) 118 | case _: 119 | raise InvalidConfig( 120 | "invalid configuration, please check" 121 | " ./config/example_config.toml for a valid configuration" 122 | ) 123 | return extract_config 124 | -------------------------------------------------------------------------------- /py_extract/exceptions.py: -------------------------------------------------------------------------------- 1 | class InvalidConfig(Exception): 2 | pass 3 | 4 | 5 | class InvalidPath(Exception): 6 | pass 7 | 8 | 9 | class BadFormat(Exception): 10 | pass 11 | 12 | 13 | class ConfigNotFound(Exception): 14 | pass 15 | 16 | 17 | class UnsafeTarfile(Exception): 18 | ... 19 | 20 | 21 | class SevenZipExtractFail(Exception): 22 | ... 23 | 24 | 25 | class SevenZipCmdNotFound(Exception): 26 | ... 
27 | -------------------------------------------------------------------------------- /py_extract/extractor.py: -------------------------------------------------------------------------------- 1 | import builtins 2 | import os 3 | import re 4 | import shutil 5 | import stat 6 | import subprocess 7 | import sys 8 | import time 9 | import zipfile 10 | from enum import Enum, unique 11 | from logging import getLogger 12 | from pathlib import Path 13 | 14 | import magic 15 | 16 | from .config import PyExtractConfig 17 | from .exceptions import SevenZipCmdNotFound, SevenZipExtractFail 18 | from .file_renaming import ( 19 | RenameFileHandler, 20 | ) 21 | from .utils import ( 22 | done_color, 23 | failed_color, 24 | filename_color, 25 | output_same_line, 26 | ) 27 | 28 | # from .config_parser import py_extract_config 29 | from .zip_decrypter import _ZipDecrypter # pylint: disable=E0611 30 | 31 | setattr(zipfile, "_ZipDecrypter", _ZipDecrypter) 32 | 33 | logger = getLogger(__name__) 34 | 35 | 36 | class ExtractStatusCode(Enum): 37 | SUCCESS = 0 38 | WRONG_PASSWORD = 1 39 | FAIL = 2 40 | WRONG_ENCODING = 3 41 | 42 | 43 | @unique 44 | class ArchiveType(Enum): 45 | TAR = "application/x-tar" 46 | ZIP = "application/zip" 47 | SEVENTH_ZIP = "application/x-7z-compressed" 48 | RAR = "application/x-rar" 49 | 50 | @classmethod 51 | def get_suffix(cls, archive_type: "ArchiveType") -> str: 52 | suffix_mapping = { 53 | cls.TAR: "tar", 54 | cls.ZIP: "zip", 55 | cls.SEVENTH_ZIP: "7z", 56 | cls.RAR: "rar", 57 | } 58 | return suffix_mapping[archive_type] 59 | 60 | 61 | def remove_readonly(func, path, _) -> None: 62 | os.chmod(path, stat.S_IWRITE) 63 | func(path) 64 | 65 | 66 | class PyExtractor: 67 | def __init__(self, config: PyExtractConfig) -> None: 68 | self.config = config 69 | self.file_rename = RenameFileHandler( 70 | unwanted_substrings=config.rename_substrings, 71 | auto_rename=config.auto_rename, 72 | ) 73 | self.handled_archives: set[Path] = set() 74 | 75 | def run(self): 76 | 
target_dir = self.config.target_directory 77 | print( 78 | f'{_("target directory")}: {filename_color(target_dir)} ,' 79 | f' {_("config path")}: {filename_color(self.config.config_path)} ,' 80 | f' {_("password path")}:' 81 | f" {filename_color(self.config.password_path)}\n" 82 | ) 83 | logger.info(self.config) 84 | self.extract_archives_recursively(target_dir) 85 | 86 | def is_excluded_file(self, file: Path) -> bool: 87 | """test if file should be excluded""" 88 | return ( 89 | file.suffix in self.config.exclude_suffix 90 | or file.name in self.config.exclude_filename 91 | or any((sub in file.name for sub in self.config.exclude_substrings)) 92 | or bool( 93 | re.search( 94 | r"part(?:[2-9]|[1-9][0-9]|100|0[2-9])\.(rar|RAR)", str(file) 95 | ) 96 | ) 97 | ) 98 | 99 | def extract_zip( 100 | self, 101 | archive_name: Path, 102 | out_path: Path, 103 | pwd: str | None = None, 104 | default_encoding="utf-8", 105 | aes=False, 106 | ): 107 | # https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile 108 | # Monkey patch the decryption of zipfile with C for better performance, it 109 | # is about 10% slower than the 7z program in testing. 
110 | additional_encodings = self.config.zip_metadata_encoding 111 | if default_encoding not in additional_encodings: 112 | additional_encodings.append(default_encoding) 113 | # logger.info("encodings: %s", additional_encodings) 114 | for encoding in additional_encodings: 115 | # logger.info("extract zip with encoding: %s", encoding) 116 | password: bytes | None = pwd.encode(encoding) if pwd else None 117 | try: 118 | if aes: 119 | import pyzipper 120 | 121 | with pyzipper.AESZipFile( 122 | archive_name, 123 | "r", 124 | compression=pyzipper.ZIP_DEFLATED, 125 | encryption=pyzipper.WZ_AES, 126 | ) as extracted_zip: 127 | extracted_zip.extractall(out_path, pwd=password) 128 | else: 129 | with zipfile.ZipFile( 130 | archive_name, "r", metadata_encoding=encoding 131 | ) as zip_file: 132 | zip_file.extractall(out_path, pwd=password) 133 | return ExtractStatusCode.SUCCESS 134 | except Exception as exc: 135 | if out_path.exists(): 136 | shutil.rmtree(out_path, onerror=remove_readonly) 137 | if isinstance(exc, NotImplementedError): 138 | # some algorithms are not supported by zipfile 139 | logger.exception( 140 | "%s algorithms NotImplemented", archive_name 141 | ) 142 | if not aes: 143 | return self.extract_zip( 144 | archive_name, out_path, pwd=pwd, aes=True 145 | ) 146 | return self.extract_7z(archive_name, out_path, pwd=pwd) 147 | if isinstance(exc, UnicodeDecodeError): 148 | logger.info("%s cannot decode %s", encoding, archive_name) 149 | continue 150 | if re.search( 151 | r"Bad password|is encrypted|Bad CRC-32", repr(exc) 152 | ): 153 | # if "Bad password" in repr(exc) or "is encrypted" in repr(exc): 154 | logger.info("%s, wrong password: %s", archive_name, pwd) 155 | return ExtractStatusCode.WRONG_PASSWORD 156 | logger.exception("%s pwd:%s", archive_name, pwd) 157 | return ExtractStatusCode.FAIL 158 | logger.error( 159 | "None of encodings %s can decode %s", 160 | additional_encodings, 161 | archive_name, 162 | ) 163 | return ExtractStatusCode.WRONG_ENCODING 164 | 
165 | def extract_tar(self, archive_name: Path, out_path: Path): 166 | return self.extract_7z(archive_name, out_path) 167 | 168 | def extract_rar( 169 | self, archive_name: Path, out_path: Path, pwd: str | None = None 170 | ): 171 | # didn't find a usable python library for rar, switch to 7z program 172 | # 7z reduces absolute paths to relative paths by default 173 | return self.extract_7z(archive_name, out_path, pwd) 174 | 175 | def extract_7z( 176 | self, archive_name: Path, out_path: Path, pwd: str | None = None 177 | ): 178 | try: 179 | assert shutil.which("7z") 180 | cmd = [ 181 | "7z", 182 | "x", 183 | f"-p{pwd if pwd else ''}", 184 | archive_name.as_posix(), 185 | f"-o{out_path.as_posix()}", 186 | ] 187 | proc = subprocess.Popen( 188 | cmd, 189 | shell=False, 190 | stdin=subprocess.DEVNULL, 191 | stdout=subprocess.DEVNULL, 192 | stderr=subprocess.PIPE, 193 | universal_newlines=True, 194 | ) 195 | _stdout, errs = proc.communicate() 196 | rc = proc.returncode 197 | if rc != 0: 198 | raise SevenZipExtractFail(f"Extract fails, {errs}") 199 | except Exception as exc: 200 | if out_path.exists(): 201 | shutil.rmtree(out_path, onerror=remove_readonly) 202 | if isinstance(exc, AssertionError): 203 | logger.exception("7z command not found") 204 | raise SevenZipCmdNotFound from exc 205 | if "Wrong password" in str(exc): 206 | logger.info("%s , Wrong password: %s", archive_name, pwd) 207 | return ExtractStatusCode.WRONG_PASSWORD 208 | logger.exception(archive_name) 209 | return ExtractStatusCode.FAIL 210 | return ExtractStatusCode.SUCCESS 211 | 212 | def extract_archive( 213 | self, file: Path, archive_type: ArchiveType, dir_level: int 214 | ) -> Path | None: 215 | """Return output path if status code == SUCCESS, else return None""" 216 | target_out_dir = f"{file}_out" 217 | out_path = Path(target_out_dir) 218 | if out_path.exists(): 219 | print( 220 | f"{' ' * dir_level}▷ {_('Skipping')}" 221 | f" {filename_color(str(file))} , {_('type')}:" 222 | f" 
{ArchiveType.get_suffix(archive_type)}" 223 | ) 224 | return out_path 225 | indent = "".join([" " * dir_level, "└──"]) 226 | print( 227 | f"{' ' * dir_level}▶ {_('Extracting')}" 228 | f" {filename_color(str(file))} , {_('type')}:" 229 | f" {ArchiveType.get_suffix(archive_type)}" 230 | ) 231 | pwd = "" 232 | start = time.time() 233 | passwords_list = self.config.passwords 234 | # prepend empty password into passwords list 235 | passwords_list.insert(0, "") 236 | failed_msg = "" 237 | status_code = ExtractStatusCode.FAIL 238 | for pwd in passwords_list: 239 | output_same_line(f"{indent} {_('try password')} {pwd}") 240 | try: 241 | match archive_type: 242 | case ArchiveType.ZIP: 243 | status_code = self.extract_zip(file, out_path, pwd) 244 | case ArchiveType.TAR: 245 | status_code = self.extract_tar(file, out_path) 246 | case ArchiveType.SEVENTH_ZIP: 247 | status_code = self.extract_7z(file, out_path, pwd) 248 | case ArchiveType.RAR: 249 | status_code = self.extract_rar(file, out_path, pwd) 250 | case _: 251 | raise AssertionError("Not going to happen") 252 | match status_code: 253 | case ExtractStatusCode.WRONG_PASSWORD: 254 | continue 255 | case _: 256 | break 257 | 258 | except SevenZipCmdNotFound: 259 | failed_msg = _( 260 | "Din't find 7z command, please make sure 7z is" 261 | " installed and available in PATH" 262 | ) 263 | break 264 | else: 265 | failed_msg = _("None of the passwords can decrypt the archive") 266 | logger.error("No passwords can decrypt %s", file) 267 | if status_code == ExtractStatusCode.SUCCESS: 268 | end = time.time() 269 | time_cost = round(end - start) 270 | output_same_line(f"{indent} {_('password')} {pwd} {_('matches')}\n") 271 | print( 272 | f"{indent} {done_color(_('Done'))}" 273 | f" {filename_color(str(file))} {_('extracted to')}" 274 | f" {filename_color(str(out_path))}" 275 | f" , {_('time cost')}: {time_cost}s" 276 | ) 277 | logger.info("%s is extracted to %s", file, out_path) 278 | return out_path 279 | if status_code == 
ExtractStatusCode.WRONG_ENCODING: 280 | failed_msg = _("None of the encodings can decode the archive") 281 | if not failed_msg: 282 | failed_msg = _("Invalid archive") 283 | output_same_line( 284 | f"{indent} {failed_color(_('Failed'))}" 285 | f" {filename_color(str(file))} {failed_color(failed_msg)}\n" 286 | ) 287 | return None 288 | 289 | def extract_archives_recursively( 290 | self, target_dir: str | Path, dir_level: int = 0 291 | ) -> None: 292 | # don't match files in subdirs if in root directory 293 | dirs_to_rename_files: set[Path] = set() 294 | files = ( 295 | Path(target_dir).iterdir() 296 | if dir_level == 0 297 | else Path(target_dir).glob("**/*") 298 | ) 299 | 300 | for file in files: 301 | if (not file.is_file()) or self.is_excluded_file(file): 302 | continue 303 | if file not in self.handled_archives: 304 | self.handled_archives.add(file) 305 | else: 306 | continue 307 | try: 308 | file_type = magic.from_buffer( 309 | open(file, "rb").read(2048), mime=True 310 | ) 311 | except Exception: 312 | logger.exception("%s", file) 313 | continue 314 | try: 315 | archive_type = ArchiveType(file_type) 316 | except ValueError: 317 | pass 318 | else: 319 | out_dir = self.extract_archive(file, archive_type, dir_level) 320 | if out_dir: 321 | self.extract_archives_recursively( 322 | out_dir.as_posix(), dir_level=dir_level + 1 323 | ) 324 | else: 325 | # TODO: figure out a way to only rename failed archives, otherwise extracted archives will be renamed too 326 | if self.file_rename.has_unwanted_substrings_in_filenames( 327 | file.parent 328 | ): 329 | dirs_to_rename_files.add(file.parent) 330 | if dirs_to_rename_files: 331 | self.file_rename.rename_files_in_dirs(dirs_to_rename_files) 332 | if self.file_rename.auto_rename: 333 | choice = "y" 334 | print(f"{_('retry extracting')}:") 335 | else: 336 | sys.stdout.write( 337 | f"{_('Do you want to retry extracting')}? 
[y/n]" 338 | ) 339 | choice = input().lower() 340 | if choice in ["y", "Y"]: 341 | for d in dirs_to_rename_files: 342 | self.extract_archives_recursively(d, dir_level=dir_level) 343 | 344 | 345 | if __name__ == "__main__": 346 | _ = builtins.__dict__["_"] 347 | -------------------------------------------------------------------------------- /py_extract/file_renaming.py: -------------------------------------------------------------------------------- 1 | import builtins 2 | import sys 3 | from logging import getLogger 4 | from pathlib import Path 5 | 6 | from .utils import filename_color 7 | 8 | logger = getLogger(__name__) 9 | 10 | 11 | class RenameFileHandler: 12 | def __init__( 13 | self, 14 | unwanted_substrings: list[str], 15 | auto_rename: bool, 16 | ) -> None: 17 | self.unwanted_substrings = unwanted_substrings 18 | self.auto_rename = auto_rename 19 | 20 | def has_unwanted_substrings_in_filenames(self, target_dir: Path) -> bool: 21 | for path in Path(target_dir).iterdir(): 22 | if path.is_dir(): 23 | continue 24 | filename = path.name 25 | if any((substr in filename for substr in self.unwanted_substrings)): 26 | return True 27 | return False 28 | 29 | def display_files_to_rename(self, target_dir: Path): 30 | print(filename_color(str(target_dir))) 31 | for path in Path(target_dir).iterdir(): 32 | if path.is_dir(): 33 | continue 34 | filename = path.name 35 | for substr in self.unwanted_substrings: 36 | if substr in filename: 37 | print(filename_color(" " + filename)) 38 | break 39 | 40 | def rename_files_in_dir(self, target_dir: Path | str) -> None: 41 | _ = builtins.__dict__["_"] 42 | for path in Path(target_dir).iterdir(): 43 | if path.is_dir(): 44 | continue 45 | filename = path.name 46 | newname, oldname = "", filename 47 | for substr in self.unwanted_substrings: 48 | if substr in filename: 49 | newname = filename.replace(substr, "") 50 | break 51 | else: 52 | # substr not in filename, skip 53 | continue 54 | sys.stdout.write( 55 | f"{_('Do you want to 
rename')}" 56 | f" {filename_color(str(path.with_name(oldname)))} {_('to')}" 57 | f" {filename_color(str(path.with_name(newname)))} ? [y/n]" 58 | ) 59 | if self.auto_rename: 60 | choice = "y" 61 | sys.stdout.write("\n") 62 | else: 63 | choice = input().lower() 64 | if choice in ["y", "Y"]: 65 | new_path = path.rename(path.with_name(newname)) 66 | logger.info("rename %s to %s", path, new_path) 67 | print(_("rename done")) 68 | else: 69 | print(_("skip rename")) 70 | 71 | def rename_files_in_dirs(self, dirs: set[Path]) -> None: 72 | if self.auto_rename: 73 | choice = "y" 74 | print() 75 | else: 76 | print( 77 | f"\n{_('Some files probably need to be renamed in these directories')}:" 78 | ) 79 | for d in dirs: 80 | self.display_files_to_rename(d) 81 | sys.stdout.write(f"\n{_('Take a look')}? [y/n]") 82 | choice = input().lower() 83 | if choice in ["y", "Y"]: 84 | for d in dirs: 85 | self.rename_files_in_dir(d) 86 | 87 | 88 | if __name__ == "__main__": 89 | _ = builtins.__dict__["_"] 90 | -------------------------------------------------------------------------------- /py_extract/logging_utils.py: -------------------------------------------------------------------------------- 1 | from logging import ( 2 | FileHandler, 3 | Formatter, 4 | ) 5 | 6 | 7 | class FormattedFileHandler(FileHandler): 8 | def __init__(self, filename: str) -> None: 9 | super().__init__(filename, encoding="utf-8") 10 | formatter = Formatter( 11 | ( 12 | " %(asctime)s [%(levelname)s %(name)s:%(lineno)s" 13 | " %(funcName)s()] %(message)s" 14 | ), 15 | "%Y-%m-%d %H:%M:%S", 16 | ) 17 | self.setFormatter(formatter) 18 | -------------------------------------------------------------------------------- /py_extract/utils.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from typing import TextIO 3 | 4 | 5 | def output_same_line(text: str) -> None: 6 | """Output to the same line""" 7 | print("\x1b[2K\r" + text, end="", flush=True) 8 | 9 | 10 | 
class BColors: 11 | OK_BLUE = "\033[94m" 12 | OK_GREEN = "\033[92m" 13 | FAIL = "\033[91m" 14 | END = "\033[0m" 15 | 16 | 17 | def filename_color(text: str) -> str: 18 | return f"{BColors.OK_BLUE}{text}{BColors.END}" 19 | 20 | 21 | def done_color(text: str) -> str: 22 | return f"{BColors.OK_GREEN}{text}{BColors.END}" 23 | 24 | 25 | def failed_color(text: str) -> str: 26 | return f"{BColors.FAIL}{text}{BColors.END}" 27 | 28 | 29 | def load_passwords(pwd_file: TextIO) -> list[str]: 30 | """passwords file should be like: 31 | ``` 32 | # passwords.txt 33 | password_one_in_second_group 34 | password_two_in_second_group 35 | 36 | password_one_in_first_group 37 | password_two_in_first_group 38 | ``` 39 | """ 40 | lines = [line.strip() for line in pwd_file.readlines()] 41 | 42 | def strip_list(to_strip, rem): 43 | to_strip = list(itertools.dropwhile(lambda x: x == rem, to_strip)) 44 | to_strip = list(itertools.dropwhile(lambda x: x == rem, to_strip[::-1])) 45 | return to_strip[::-1] 46 | 47 | lines = strip_list(lines, "") 48 | delimiter = "" 49 | if delimiter not in lines: 50 | return lines 51 | groups = [ 52 | list(y) 53 | for x, y in itertools.groupby(lines, lambda z: z == delimiter) 54 | if not x 55 | ] 56 | return list(itertools.chain.from_iterable(groups[::-1])) 57 | -------------------------------------------------------------------------------- /py_extract/zip_decrypter.pyx: -------------------------------------------------------------------------------- 1 | # cython: language_level=3 2 | # distutils: language = c 3 | # cython: cdivision = True 4 | # cython: boundscheck = False 5 | # cython: wraparound = False 6 | # cython: nonecheck = False 7 | # cython: profile = False 8 | # https://github.com/ziyuang/czipfile/blob/ba592c44c79d830a063210d598737f91f4333035/czipfile.pyx 9 | 10 | """ 11 | cython implementation of zip decryption 12 | """ 13 | 14 | cimport cpython 15 | 16 | cdef class _ZipDecrypter: 17 | """Class to handle decryption of files stored within a ZIP 
archive. 18 | ZIP supports a password-based form of encryption. Even though known 19 | plaintext attacks have been found against it, it is still useful 20 | to be able to get data out of such a file. 21 | Usage: 22 | zd = _ZipDecrypter(mypwd) 23 | plain_text = zd(cypher_text) 24 | The original usage of: 25 | plain_text = map(zd, cypher_text) 26 | is still supported, but will be slower (by a factor of 10 or so, by 27 | my measurements) than simply calling it with the full cypher_text. 28 | """ 29 | 30 | # I guess to make these C vars, we must declare them out here? 31 | cdef unsigned long crctable[256] 32 | cdef unsigned long key0 33 | cdef unsigned long key1 34 | cdef unsigned long key2 35 | 36 | cdef void _GenerateCRCTable(self): 37 | """Generate a CRC-32 table. 38 | ZIP encryption uses the CRC32 one-byte primitive for scrambling some 39 | internal keys. We noticed that a direct implementation is faster than 40 | relying on binascii.crc32(). 41 | """ 42 | cdef unsigned long poly = 0xedb88320 43 | cdef unsigned long crc, i, j 44 | for 0 <= i < 256: 45 | crc = i 46 | for 0 <= j < 8: 47 | if crc & 1: 48 | crc = ((crc >> 1) & 0x7FFFFFFF) ^ poly 49 | else: 50 | crc = ((crc >> 1) & 0x7FFFFFFF) 51 | self.crctable[i] = crc 52 | 53 | cdef unsigned long _crc32(self, unsigned char ch, unsigned long crc): 54 | """Compute the CRC32 primitive on one byte.""" 55 | return ((crc >> 8) & 0xffffff) ^ self.crctable[(crc ^ ch) & 0xff] 56 | 57 | def __init__(self, pwd): 58 | self.key0 = 305419896 59 | self.key1 = 591751049 60 | self.key2 = 878082192 61 | 62 | # Generate the CRC table; previously done outside of any method 63 | self._GenerateCRCTable() 64 | 65 | # Update our keys, given the password 66 | for p in pwd: 67 | self._UpdateKeys(p) 68 | 69 | cdef void _UpdateKeys(self, unsigned char c): 70 | self.key0 = self._crc32(c, self.key0) 71 | self.key1 = (self.key1 + (self.key0 & 255)) & 4294967295UL 72 | self.key1 = (self.key1 * 134775813 + 1) & 4294967295UL 73 | self.key2 = 
self._crc32((self.key1 >> 24) & 255, self.key2) 74 | 75 | def __call__(self, data): 76 | cdef unsigned long k 77 | cdef Py_ssize_t i, datalen 78 | cdef char *data_s 79 | cdef char *ret_s 80 | 81 | cpython.PyBytes_AsStringAndSize(data, &data_s, &datalen) 82 | ret = cpython.PyBytes_FromStringAndSize(NULL, datalen) 83 | ret_s = cpython.PyBytes_AsString(ret) 84 | for 0 <= i < datalen: 85 | k = self.key2 | 2 86 | ret_s[i] = data_s[i] ^ (((k * (k^1)) >> 8) & 255); 87 | # The proper way to do this is to call _UpdateKeys here, like so: 88 | #self._UpdateKeys(ret_s[i]) 89 | # ... but we can cut runtime by about a third if we unroll the 90 | # function. So, we're doing so. Yes, it's duplication. Ah well... 91 | self.key0 = ((self.key0 >> 8) & 0xFFFFFF) ^ self.crctable[(self.key0 ^ ret_s[i]) & 0xFF] 92 | self.key1 = (self.key1 + (self.key0 & 255)) & 4294967295UL 93 | self.key1 = (self.key1 * 134775813 + 1) & 4294967295UL 94 | self.key2 = ((self.key2 >> 8) & 0xFFFFFF) ^ self.crctable[(self.key2 ^ ((self.key1 >> 24) & 255)) & 0xFF] 95 | 96 | return ret 97 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | toml 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Cython==0.29.33 2 | python-magic-bin==0.4.14; platform_system == "Windows" 3 | python-magic; platform_system == "Linux" 4 | babel 5 | pyzipper 6 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import traceback 2 | 3 | from py_extract import create_py_extractor 4 | 5 | try: 6 | py_extractor = create_py_extractor() 7 | py_extractor.run() 8 | except Exception: 9 | print(traceback.format_exc()) 10 | finally: 11 | 
input("Press Enter to exit...") 12 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from babel.messages import frontend as babel 2 | from Cython.Build import cythonize 3 | from Cython.Distutils import Extension 4 | from setuptools import setup 5 | 6 | extra_compile_args = ["-O3", "-ffast-math", "-fopenmp"] 7 | extra_link_args = ["-fopenmp"] 8 | 9 | lib_modules = [] 10 | 11 | lib_modules.append( 12 | Extension( 13 | "py_extract.zip_decrypter", 14 | ["py_extract/zip_decrypter.pyx"], 15 | language="c", 16 | extra_compile_args=extra_compile_args, 17 | extra_link_args=extra_link_args, 18 | ), 19 | ) 20 | 21 | setup( 22 | name="zip_decrypter", 23 | zip_safe=False, 24 | ext_modules=cythonize( 25 | lib_modules, 26 | language_level=3, 27 | compiler_directives={"always_allow_keywords": True}, 28 | ), 29 | cmdclass={"compile_catalog": babel.compile_catalog}, 30 | ) 31 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davuses/PyExtract/876b83cf4328010f83c3dd71b152b0d0cd14fe15/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_extractor.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import subprocess 3 | import sys 4 | from pathlib import Path 5 | 6 | import toml 7 | 8 | from py_extract import create_py_extractor 9 | 10 | FIRST_PASSWORD = "password1" 11 | 12 | SECOND_PASSWORD = "password2" 13 | 14 | PASSWORDS_FILE_CONTENT = f"""\ 15 | {FIRST_PASSWORD} 16 | 17 | {SECOND_PASSWORD} 18 | """ 19 | 20 | 21 | def test_7z_command(): 22 | assert bool(shutil.which("7z")) 23 | 24 | 25 | def test_py_extract(tmp_path: Path): 26 | tmp_dir = tmp_path / "dir" 27 | 
tmp_dir.mkdir(exist_ok=True) 28 | 29 | test_files = ["file1.txt", "file2.txt", "file3.txt"] 30 | 31 | make_split_archives(tmp_dir, test_files) 32 | 33 | passwords_path = tmp_path / "passwords.txt" 34 | with open(passwords_path, "w", encoding="utf-8") as test_pwd_file: 35 | test_pwd_file.write(PASSWORDS_FILE_CONTENT) 36 | with open( 37 | "./config/example_config.toml", "r", encoding="utf-8" 38 | ) as example_config_file: 39 | test_config = toml.load(example_config_file) 40 | test_config["path"]["target_directory"] = str(tmp_dir) 41 | test_config["path"]["password_path"] = str(passwords_path) 42 | test_config["rename"]["substrings"] = ["删除", "删", "删我"] 43 | test_config["auto_rename"] = True 44 | 45 | test_config_path = tmp_path / "test_config.toml" 46 | with open(test_config_path, "w", encoding="utf-8") as test_config_file: 47 | toml.dump(test_config, test_config_file) 48 | 49 | sys.argv[1:] = ["--config", str(test_config_path)] 50 | 51 | py_extractor = create_py_extractor() 52 | py_extractor.run() 53 | 54 | for filename in test_files: 55 | assert ( 56 | tmp_dir 57 | / f"./nested_archive.7z.001_out/archive.7z.001_out/{filename}" 58 | ).is_file() 59 | 60 | 61 | def make_split_archives(tmp_dir, test_files): 62 | test_filepaths = [tmp_dir / filename for filename in test_files] 63 | 64 | for p in test_filepaths: 65 | with open(p, mode="wb") as f: 66 | f.truncate(1024 * 1024 * 10) 67 | 68 | archive_name = "archive.7z" 69 | 70 | with subprocess.Popen( 71 | [ 72 | "7z", 73 | "a", 74 | f"-p{FIRST_PASSWORD}", 75 | "-v2k", 76 | "-mx9", 77 | "-mhe=on", 78 | archive_name, 79 | *test_files, 80 | ], 81 | shell=False, 82 | encoding="utf-8", 83 | stdout=subprocess.PIPE, 84 | stderr=subprocess.PIPE, 85 | cwd=tmp_dir, 86 | ) as proc: 87 | stdout, _stderr = proc.communicate() 88 | assert "Everything is Ok" in stdout 89 | 90 | for p in test_filepaths: 91 | p.unlink() 92 | 93 | first_volume_path = tmp_dir / "archive.7z.001" 94 | 
first_volume_path.rename(first_volume_path.with_name("archive.7z.删除001")) 95 | 96 | archives_names = [ 97 | "archive.7z.删除001", 98 | "archive.7z.002", 99 | "archive.7z.003", 100 | ] 101 | archive_paths = [tmp_dir / name for name in archives_names] 102 | 103 | with subprocess.Popen( 104 | [ 105 | "7z", 106 | "a", 107 | f"-p{SECOND_PASSWORD}", 108 | "-v2k", 109 | "-mx9", 110 | "-mhe=on", 111 | "nested_archive.7z", 112 | *archives_names, 113 | ], 114 | shell=False, 115 | encoding="utf-8", 116 | stdout=subprocess.PIPE, 117 | stderr=subprocess.PIPE, 118 | cwd=tmp_dir, 119 | ) as proc: 120 | stdout, _stderr = proc.communicate() 121 | assert "Everything is Ok" in stdout 122 | 123 | nested_archive_first_volume = tmp_dir / "nested_archive.7z.001" 124 | nested_archive_first_volume.rename( 125 | nested_archive_first_volume.with_name("nested_archive.7z.删001") 126 | ) 127 | nested_archive_second_volume = tmp_dir / "nested_archive.7z.002" 128 | nested_archive_second_volume.rename( 129 | nested_archive_second_volume.with_name("nested_archive.7z.删我002") 130 | ) 131 | for p in archive_paths: 132 | p.unlink() 133 | -------------------------------------------------------------------------------- /tests/test_load_passwords.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | 3 | from py_extract.utils import load_passwords 4 | 5 | PASSWORDS_TEXT = """\ 6 | 7 | 8 | foo 9 | foo2 10 | 11 | 12 | ok 13 | ok2 14 | 15 | 16 | 17 | bar 18 | bar2 19 | 20 | """ 21 | 22 | 23 | def test_load_passwords(): 24 | pwd_file = StringIO(PASSWORDS_TEXT) 25 | 26 | results = load_passwords(pwd_file=pwd_file) 27 | assert results == ["bar", "bar2", "ok", "ok2", "foo", "foo2"] 28 | --------------------------------------------------------------------------------