├── .gitattributes
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── build_win.bat
├── config
└── example_config.toml
├── locales
├── en
│ └── LC_MESSAGES
│ │ └── py_extract.po
└── zh_Hans_CN
│ └── LC_MESSAGES
│ └── py_extract.po
├── py_extract
├── __init__.py
├── config.py
├── exceptions.py
├── extractor.py
├── file_renaming.py
├── logging_utils.py
├── utils.py
└── zip_decrypter.pyx
├── requirements-dev.txt
├── requirements.txt
├── run.py
├── setup.py
└── tests
├── __init__.py
├── test_extractor.py
└── test_load_passwords.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | * text eol=lf
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 | *.exe
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .coverage
43 | .coverage.*
44 | .cache
45 | nosetests.xml
46 | coverage.xml
47 | *.cover
48 | .hypothesis/
49 | .pytest_cache/
50 |
51 | # Translations
52 | *.mo
53 | *.pot
54 |
55 | # Django stuff:
56 | *.log
57 | local_settings.py
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # dotenv
85 | .env
86 |
87 | # virtualenv
88 | .venv
89 | venv/
90 | venv-*/
91 | ENV/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 | # IDE settings
107 | .vscode/
108 | .idea/
109 | .ruff_cache/
110 |
111 | # temp file
112 | temp/
113 | temp.ipynb
114 |
115 | # config file
116 | py_extract_config.toml
117 |
118 | # cython file
119 | *.c
120 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # See https://pre-commit.com for more information
2 | # See https://pre-commit.com/hooks.html for more hooks
3 | repos:
4 | - repo: https://github.com/pre-commit/pre-commit-hooks
5 | rev: v3.2.0
6 | hooks:
7 | - id: trailing-whitespace
8 | - id: end-of-file-fixer
9 | - id: check-case-conflict
10 | - id: check-added-large-files
11 | - repo: https://github.com/Lucas-C/pre-commit-hooks
12 | rev: "v1.5.1"
13 | hooks:
14 | - id: remove-crlf
15 | - id: remove-tabs
16 | - repo: https://github.com/charliermarsh/ruff-pre-commit
17 | rev: "v0.0.271"
18 | hooks:
19 | - id: ruff
20 | args: ["--select=I", "RUF"]
21 | - repo: https://github.com/psf/black
22 | rev: "23.3.0"
23 | hooks:
24 | - id: black
25 | args: ["--line-length", "80"]
26 | - repo: https://github.com/pre-commit/mirrors-mypy
27 | rev: "v1.5.1"
28 | hooks:
29 | - id: mypy
30 | additional_dependencies: [types-toml]
31 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2016 Phil
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PyExtract
2 |
3 | 
4 | [](https://github.com/pre-commit/pre-commit)
5 | 
6 |
7 | PyExtract is a utility that recursively finds and extracts archives in the target folder.
8 |
9 | It can decrypt and decompress zip archives whose passwords are not UTF-8 encoded. For more details, refer to the related [superuser question](https://superuser.com/questions/1676282).
10 |
11 | PyExtract uses Cython to speed up the `zipfile` library.
12 |
13 | ## Screenshots
14 |
15 |
16 |
17 | ## Installation and Usage
18 |
19 | ### Prerequisites
20 |
21 | - Python >= 3.11
22 | - 7-Zip program (`7z` binary) added to your machine's PATH environment variable.
23 |
24 | ### Installation
25 |
26 | Install the required packages using pip:
27 |
28 | ```sh
29 | pip install -r requirements.txt
30 | ```
31 |
32 | Make sure you have Cython installed, and then compile the Cython extension:
33 |
34 | ```sh
35 | python setup.py build_ext --inplace
36 | ```
37 |
38 | Compile translation files:
39 |
40 | ```sh
41 | python setup.py compile_catalog -D py_extract -d locales/
42 | ```
43 |
44 | ### Configuration and Running
45 |
46 | Create a configuration file `py_extract_config.toml` by copying and modifying `./config/example_config.toml`.
47 |
48 | Then run:
49 |
50 | ```sh
51 | $ python run.py --help
52 |
53 | usage: run.py [-h] [-c CONFIG] [-t TARGET_DIR] [-a] [-d]
54 |
55 | PyExtract
56 |
57 | options:
58 | -h, --help show this help message and exit
59 | -c CONFIG, --config CONFIG
60 | config file path
61 | -t TARGET_DIR, --target-dir TARGET_DIR
62 | target directory
63 | -a, --auto-rename auto rename archives with bad names
64 | -d, --debug debug mode
65 | ```
66 |
67 | ### Example Configuration
68 |
69 | Here's an example of a configuration file:
70 |
71 | ```toml
72 | # additional encodings of zip archives, the default is utf-8
73 | # see https://en.wikipedia.org/wiki/Windows_code_page
74 | # cp936 is used for Chinese encoding
75 | zip_metadata_encoding = ["cp936"]
76 |
77 | # language: en, cn
78 | language = "en"
79 |
80 | # automatically rename archives with bad filenames
81 | auto_rename = false
82 |
83 | # logging level: "warning", "debug"
84 | logging_level = "warning"
85 |
86 |
87 | [path]
88 | target_directory = "D:/download"
89 | password_path = "D:/passwords.txt"
90 |
91 | [exclude]
92 | # exclude filenames, you can leave them empty: suffixes=[]
93 | suffixes = [".apk", ".exe"]
94 | filenames = ["do_not_extract_me.zip"]
95 | substrings = ["not_an_archive"]
96 |
97 | [rename]
98 | # rename files whose filenames contain these substrings:
99 | substrings = ["删除", "删除我", "delete_this"]
100 |
101 | ```
102 |
103 | ### Windows Users
104 |
105 | For Windows users, you can download the compiled binary file from the [releases](https://github.com/davuses/PyExtract/releases) section, or you can run `build_win.bat` to build the binary yourself.
106 |
107 | ### Password File
108 |
109 | An example of password file:
110 |
111 | ```py
112 | # passwords.txt
113 | password_one_in_second_group
114 | password_two_in_second_group
115 |
116 | password_one_in_first_group
117 | password_two_in_first_group
118 | ```
119 |
--------------------------------------------------------------------------------
/build_win.bat:
--------------------------------------------------------------------------------
1 | python -m pip install pyinstaller -r requirements.txt
2 |
3 | python setup.py compile_catalog -D py_extract -d locales/
4 |
5 | python setup.py build_ext --inplace
6 |
7 | pyinstaller --onefile --add-data "locales;locales" -n "PyExtract" "run.py"
8 |
9 | copy ".\config\example_config.toml" ".\dist\py_extract_config.toml"
10 |
--------------------------------------------------------------------------------
/config/example_config.toml:
--------------------------------------------------------------------------------
1 | # additional encodings of zip archives, the default is utf-8
2 | # see https://en.wikipedia.org/wiki/Windows_code_page
3 | # cp936 is used for Chinese encoding
4 | zip_metadata_encoding = ["cp936"]
5 |
6 | # language: en, cn
7 | language = "en"
8 |
9 | # automatically rename archives with bad filenames
10 | auto_rename = false
11 |
12 | # logging level: "warning", "debug"
13 | logging_level = "warning"
14 |
15 |
16 | [path]
17 | target_directory = "D:/download"
18 | password_path = "D:/passwords.txt"
19 |
20 | [exclude]
21 | # exclude filenames, you can leave them empty: suffixes=[]
22 | suffixes = [".apk", ".exe"]
23 | filenames = ["do_not_extract_me.zip"]
24 | substrings = ["not_an_archive"]
25 |
26 | [rename]
27 | # rename files whose filenames contain these substrings:
28 | substrings = ["删除", "删除我", "delete_this"]
29 |
--------------------------------------------------------------------------------
/locales/en/LC_MESSAGES/py_extract.po:
--------------------------------------------------------------------------------
1 | # English translations for PROJECT.
2 | # Copyright (C) 2023 ORGANIZATION
3 | # This file is distributed under the same license as the PROJECT project.
4 | # FIRST AUTHOR , 2023.
5 | #
6 | msgid ""
7 | msgstr ""
8 | "Project-Id-Version: PROJECT VERSION\n"
9 | "Report-Msgid-Bugs-To: EMAIL@ADDRESS\n"
10 | "POT-Creation-Date: 2023-08-09 02:35+0800\n"
11 | "PO-Revision-Date: 2023-08-12 22:24+0800\n"
12 | "Last-Translator: FULL NAME \n"
13 | "Language: en\n"
14 | "Language-Team: en \n"
15 | "Plural-Forms: nplurals=2; plural=(n != 1);\n"
16 | "MIME-Version: 1.0\n"
17 | "Content-Type: text/plain; charset=utf-8\n"
18 | "Content-Transfer-Encoding: 8bit\n"
19 | "Generated-By: Babel 2.12.1\n"
20 |
21 | #: py_extract/py_extract.py:305 py_extract/py_extract.py:310
22 | #: py_extract/py_extract.py:321
23 | msgid "try password"
24 | msgstr ""
25 |
26 | #: py_extract/py_extract.py:306
27 | msgid "Skipping"
28 | msgstr ""
29 |
30 | #: py_extract/py_extract.py:307 py_extract/py_extract.py:309
31 | msgid "type"
32 | msgstr ""
33 |
34 | #: py_extract/py_extract.py:308
35 | msgid "Extracting"
36 | msgstr ""
37 |
38 | #: py_extract/py_extract.py:311
39 | msgid "password"
40 | msgstr ""
41 |
42 | #: py_extract/py_extract.py:312
43 | msgid "matches"
44 | msgstr ""
45 |
46 | #: py_extract/py_extract.py:313
47 | msgid "Done"
48 | msgstr ""
49 |
50 | #: py_extract/py_extract.py:314
51 | msgid "extracted to"
52 | msgstr ""
53 |
54 | #: py_extract/py_extract.py:315
55 | msgid "time cost"
56 | msgstr ""
57 |
58 | #: py_extract/py_extract.py:316
59 | msgid "Failed"
60 | msgstr ""
61 |
62 | #: py_extract/py_extract.py:317
63 | msgid "Wrong password or invalid archive"
64 | msgstr ""
65 |
66 | #: py_extract/py_extract.py:318
67 | msgid "Some files probably need to be renamed in these directories"
68 | msgstr ""
69 |
70 | #: py_extract/py_extract.py:319
71 | msgid "Take a look"
72 | msgstr ""
73 |
74 | #: py_extract/py_extract.py:320
75 | msgid "Do you want to retry extracting"
76 | msgstr ""
77 |
78 | #: py_extract/file_renaming.py:48
79 | msgid "rename done"
80 | msgstr ""
81 |
82 | #: py_extract/file_renaming.py:50
83 | msgid "skip rename"
84 | msgstr ""
85 |
86 | msgid ""
87 | "Din't find 7z command, please make sure 7z is installed and available in "
88 | "PATH env"
89 | msgstr ""
90 |
91 | msgid "Do you want to rename"
92 | msgstr ""
93 |
94 | msgid "to"
95 | msgstr ""
96 |
97 | msgid "retry extracting"
98 | msgstr ""
99 |
100 | msgid "target directory"
101 | msgstr ""
102 |
103 | msgid "None of the passwords can decrypt the archive"
104 | msgstr ""
105 |
106 | msgid "Invalid archive"
107 | msgstr ""
108 |
109 |
110 | msgid "None of the encodings can decode the archive"
111 | msgstr ""
112 |
113 | msgid "config path"
114 | msgstr ""
115 |
116 | msgid "password path"
117 | msgstr ""
118 |
--------------------------------------------------------------------------------
/locales/zh_Hans_CN/LC_MESSAGES/py_extract.po:
--------------------------------------------------------------------------------
1 | # Chinese (Simplified, China) translations for PROJECT.
2 | # Copyright (C) 2023 ORGANIZATION
3 | # This file is distributed under the same license as the PROJECT project.
4 | # FIRST AUTHOR , 2023.
5 | #
6 | msgid ""
7 | msgstr ""
8 | "Project-Id-Version: PROJECT VERSION\n"
9 | "Report-Msgid-Bugs-To: EMAIL@ADDRESS\n"
10 | "POT-Creation-Date: 2023-08-09 02:35+0800\n"
11 | "PO-Revision-Date: 2023-08-09 02:51+0800\n"
12 | "Last-Translator: FULL NAME \n"
13 | "Language: zh_Hans_CN\n"
14 | "Language-Team: zh_Hans_CN \n"
15 | "Plural-Forms: nplurals=1; plural=0;\n"
16 | "MIME-Version: 1.0\n"
17 | "Content-Type: text/plain; charset=utf-8\n"
18 | "Content-Transfer-Encoding: 8bit\n"
19 | "Generated-By: Babel 2.12.1\n"
20 |
21 | #: py_extract/py_extract.py:305 py_extract/py_extract.py:310
22 | #: py_extract/py_extract.py:321
23 | msgid "try password"
24 | msgstr "尝试密码"
25 |
26 | #: py_extract/py_extract.py:306
27 | msgid "Skipping"
28 | msgstr "跳过"
29 |
30 | #: py_extract/py_extract.py:307 py_extract/py_extract.py:309
31 | msgid "type"
32 | msgstr "文件类型"
33 |
34 | #: py_extract/py_extract.py:308
35 | msgid "Extracting"
36 | msgstr "正在解压"
37 |
38 | #: py_extract/py_extract.py:311
39 | msgid "password"
40 | msgstr "密码"
41 |
42 | #: py_extract/py_extract.py:312
43 | msgid "matches"
44 | msgstr "匹配"
45 |
46 | #: py_extract/py_extract.py:313
47 | msgid "Done"
48 | msgstr "完成"
49 |
50 | #: py_extract/py_extract.py:314
51 | msgid "extracted to"
52 | msgstr "解压到"
53 |
54 | #: py_extract/py_extract.py:315
55 | msgid "time cost"
56 | msgstr "花费时间"
57 |
58 | #: py_extract/py_extract.py:316
59 | msgid "Failed"
60 | msgstr "失败"
61 |
62 | #: py_extract/py_extract.py:317
63 | msgid "Wrong password or invalid archive"
64 | msgstr "密码错误或无效文件"
65 |
66 | #: py_extract/py_extract.py:318
67 | msgid "Some files probably need to be renamed in these directories"
68 | msgstr "以下目录中有文件可能需要重命名"
69 |
70 | #: py_extract/py_extract.py:319
71 | msgid "Take a look"
72 | msgstr "去处理"
73 |
74 | #: py_extract/py_extract.py:320
75 | msgid "Do you want to retry extracting"
76 | msgstr "再次解压"
77 |
78 | #: py_extract/file_renaming.py:48
79 | msgid "rename done"
80 | msgstr "命名完成"
81 |
82 | #: py_extract/file_renaming.py:50
83 | msgid "skip rename"
84 | msgstr "跳过命名"
85 |
86 | msgid ""
87 | "Din't find 7z command, please make sure 7z is installed and available in "
88 | "PATH env"
89 | msgstr "未找到 7z 程序,请确认程序已安装并配置环境变量"
90 |
91 |
92 | msgid "Do you want to rename"
93 | msgstr "将文件"
94 |
95 | msgid "to"
96 | msgstr "重命名为"
97 |
98 | msgid "retry extracting"
99 | msgstr "再次解压"
100 |
101 | msgid "target directory"
102 | msgstr "目标文件夹"
103 |
104 | msgid "None of the passwords can decrypt the archive"
105 | msgstr "所有密码都尝试失败"
106 |
107 | msgid "Invalid archive"
108 | msgstr "无效文件"
109 |
110 | msgid "None of the encodings can decode the archive"
111 | msgstr "所有编码都解码失败"
112 |
113 | msgid "config path"
114 | msgstr "配置文件路径"
115 |
116 | msgid "password path"
117 | msgstr "密码文件路径"
118 |
--------------------------------------------------------------------------------
/py_extract/__init__.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import gettext
3 | import sys
4 | from logging import getLogger
5 | from pathlib import Path
6 |
7 | from .config import load_config
8 | from .extractor import PyExtractor
9 | from .logging_utils import FormattedFileHandler
10 |
11 |
def resource_path(relative_path: str):
    """Resolve *relative_path* against the application's resource root.

    The root is ``sys._MEIPASS`` when that attribute exists (the PyInstaller
    case) and ``"./"`` otherwise (running from a source checkout).

    Returns:
        A ``pathlib.Path`` pointing at the resource.
    """
    root = Path(getattr(sys, "_MEIPASS", "./"))
    return root / relative_path
16 |
17 |
def init_translation(lang: str):
    """Load the message catalog for *lang* and install its lookup functions.

    ``lang`` is the short code used in the config file ("en" or "cn"); any
    other value falls back to the English catalog. The catalog is looked up
    in the ``locales`` resource directory under the ``py_extract`` domain.
    """
    locale_by_lang = {"cn": "zh_Hans_CN", "en": "en"}
    translation = gettext.translation(
        "py_extract",
        resource_path("locales"),
        languages=[locale_by_lang.get(lang, "en")],
    )
    # install() publishes ``_`` plus the names listed here.
    translation.install(names=["gettext", "ngettext"])
29 |
30 |
def create_py_extractor():
    """Build a ``PyExtractor`` from the command line and the config file.

    Steps: parse the CLI options, load the TOML configuration, let the CLI
    options that were given override the loaded values, attach a file
    handler writing to ``py_extract.log`` to this package's logger, install
    the translations, and finally create the extractor.

    Returns:
        The configured ``PyExtractor`` instance.
    """
    parser = argparse.ArgumentParser(description="PyExtract")
    parser.add_argument("-c", "--config", help="config file path")
    parser.add_argument("-t", "--target-dir", help="target directory")
    parser.add_argument(
        "-a",
        "--auto-rename",
        action="store_const",
        const=True,
        help="auto rename archives with bad names",
    )
    parser.add_argument(
        "-d",
        "--debug",
        action="store_const",
        const="debug",
        help="debug mode",
    )
    args = parser.parse_args()

    py_extract_config = load_config(args.config)

    # Options left off the command line are None and leave the loaded
    # configuration untouched.
    if args.auto_rename:
        py_extract_config.auto_rename = args.auto_rename
    if args.debug:
        py_extract_config.logging_level = args.debug
    if args.target_dir:
        py_extract_config.target_directory = args.target_dir

    # Only "debug" is recognised; every other value logs at WARNING.
    if py_extract_config.logging_level == "debug":
        level_name = "DEBUG"
    else:
        level_name = "WARNING"
    package_logger = getLogger(__name__)
    package_logger.setLevel(level=level_name)
    package_logger.addHandler(FormattedFileHandler("py_extract.log"))

    init_translation(py_extract_config.language)
    return PyExtractor(config=py_extract_config)
74 |
--------------------------------------------------------------------------------
/py_extract/config.py:
--------------------------------------------------------------------------------
1 | import dataclasses
2 | from pathlib import Path
3 |
4 | import tomllib
5 |
6 | from .exceptions import (
7 | ConfigNotFound,
8 | InvalidConfig,
9 | InvalidPath,
10 | )
11 | from .utils import load_passwords
12 |
13 |
def is_list_of_str(list_to_test: list[str]):
    """Tell whether *list_to_test* is a ``list`` containing only ``str`` items.

    An empty list qualifies; any non-list value (tuple, str, None, ...) does
    not.
    """
    return isinstance(list_to_test, list) and all(
        isinstance(item, str) for item in list_to_test
    )
22 |
23 |
@dataclasses.dataclass
class PyExtractConfig:
    """Validated runtime settings for PyExtract.

    ``__post_init__`` checks the field types with ``assert`` and raises
    ``InvalidPath`` when the target directory does not exist.
    """

    # additional encodings tried for zip archives
    zip_metadata_encoding: list[str]
    # exclusion rules: by file suffix, exact file name, substring of the name
    exclude_suffix: list[str]
    exclude_filename: list[str]
    exclude_substrings: list[str]
    # substrings that mark a file name for renaming; kept in descending order
    rename_substrings: list[str]
    # directory scanned for archives
    target_directory: str
    # candidate passwords and the file they were loaded from
    passwords: list[str]
    password_path: str
    # UI language code ("en" or "cn")
    language: str
    auto_rename: bool
    # "debug" enables debug logging; other values log at WARNING
    logging_level: str
    # path of the config file these settings came from
    config_path: str

    def __post_init__(self) -> None:
        """Validate field types and normalise ``rename_substrings``."""
        for str_list in (
            self.zip_metadata_encoding,
            self.exclude_suffix,
            self.exclude_filename,
            self.exclude_substrings,
            self.rename_substrings,
        ):
            assert is_list_of_str(str_list)
        for text in (self.target_directory, self.language):
            assert isinstance(text, str)
        assert isinstance(self.auto_rename, bool)
        assert isinstance(self.logging_level, str)

        target = Path(self.target_directory)
        if not target.exists():
            raise InvalidPath(
                f"target directory {self.target_directory} doesn't exist"
            )
        assert is_list_of_str(self.passwords)

        ordered = list(self.rename_substrings)
        ordered.sort(reverse=True)
        self.rename_substrings = ordered
56 |
57 |
# Locations checked, in this order, when no config path is passed explicitly.
POSSIBLE_CONFIG_PATHS = [
    "./config/py_extract_config.toml",
    "./py_extract_config.toml",
]
# Message for ConfigNotFound when none of the default locations has a file.
# Built from adjacent literals so the text keeps a space after the comma
# (the previous backslash continuation produced "...],or you can...").
CONFIG_NOT_FOUND_ERROR_MSG = (
    "config file should be found in one of these paths:"
    f" {POSSIBLE_CONFIG_PATHS}, or you can specify the path with --config"
    " option"
)
65 |
66 |
67 | def load_config(config_path: str | None = None) -> PyExtractConfig:
68 | if not config_path:
69 | for p in POSSIBLE_CONFIG_PATHS:
70 | if Path(p).is_file():
71 | config_path = p
72 | break
73 | else:
74 | raise ConfigNotFound(CONFIG_NOT_FOUND_ERROR_MSG)
75 | else:
76 | if not Path(config_path).is_file():
77 | raise ConfigNotFound(f"cannot find the config file: {config_path}")
78 | config_path = str(Path(config_path).resolve())
79 | with open(config_path, mode="rb") as fp:
80 | toml_config = tomllib.load(fp)
81 | match toml_config:
82 | case {
83 | "zip_metadata_encoding": list() as zip_metadata_encoding,
84 | "language": str() as language,
85 | "auto_rename": bool() as auto_rename,
86 | "logging_level": str() as logging_level,
87 | "path": {
88 | "target_directory": str() as target_directory,
89 | "password_path": str() as password_path,
90 | },
91 | "exclude": {
92 | "suffixes": list() as suffixes,
93 | "filenames": list() as filenames,
94 | "substrings": list() as substrings,
95 | },
96 | "rename": {"substrings": list() as rename_substrings},
97 | }:
98 | if not Path(password_path).exists():
99 | raise InvalidPath(
100 | f"password file {password_path} doesn't exist"
101 | )
102 | with open(password_path, "r", encoding="utf-8") as pwd_file:
103 | passwords = load_passwords(pwd_file)
104 | extract_config = PyExtractConfig(
105 | zip_metadata_encoding=zip_metadata_encoding,
106 | exclude_suffix=suffixes,
107 | exclude_filename=filenames,
108 | exclude_substrings=substrings,
109 | rename_substrings=rename_substrings,
110 | target_directory=target_directory,
111 | passwords=passwords,
112 | password_path=password_path,
113 | language=language,
114 | auto_rename=auto_rename,
115 | logging_level=logging_level,
116 | config_path=config_path,
117 | )
118 | case _:
119 | raise InvalidConfig(
120 | "invalid configuration, please check"
121 | " ./config/example_config.toml for a valid configuration"
122 | )
123 | return extract_config
124 |
--------------------------------------------------------------------------------
/py_extract/exceptions.py:
--------------------------------------------------------------------------------
class InvalidConfig(Exception):
    """The configuration file does not have the expected structure."""
3 |
4 |
class InvalidPath(Exception):
    """A path named in the configuration does not exist."""
7 |
8 |
class BadFormat(Exception):
    """Input is badly formatted.

    NOTE(review): no raise site is visible alongside this definition;
    confirm the exact meaning against the callers.
    """
11 |
12 |
class ConfigNotFound(Exception):
    """No configuration file exists at the given or default location."""
15 |
16 |
class UnsafeTarfile(Exception):
    """A tar archive is considered unsafe to extract.

    NOTE(review): no raise site is visible alongside this definition;
    confirm the exact criteria against the callers.
    """
19 |
20 |
class SevenZipExtractFail(Exception):
    """The external ``7z`` program exited with a non-zero return code."""
23 |
24 |
class SevenZipCmdNotFound(Exception):
    """The ``7z`` executable could not be found on PATH."""
27 |
--------------------------------------------------------------------------------
/py_extract/extractor.py:
--------------------------------------------------------------------------------
1 | import builtins
2 | import os
3 | import re
4 | import shutil
5 | import stat
6 | import subprocess
7 | import sys
8 | import time
9 | import zipfile
10 | from enum import Enum, unique
11 | from logging import getLogger
12 | from pathlib import Path
13 |
14 | import magic
15 |
16 | from .config import PyExtractConfig
17 | from .exceptions import SevenZipCmdNotFound, SevenZipExtractFail
18 | from .file_renaming import (
19 | RenameFileHandler,
20 | )
21 | from .utils import (
22 | done_color,
23 | failed_color,
24 | filename_color,
25 | output_same_line,
26 | )
27 |
28 | # from .config_parser import py_extract_config
29 | from .zip_decrypter import _ZipDecrypter # pylint: disable=E0611
30 |
# Replace zipfile's private ``_ZipDecrypter`` with the implementation imported
# from the ``zip_decrypter`` extension module, so that every later use of
# ``zipfile`` in this process decrypts through it.
setattr(zipfile, "_ZipDecrypter", _ZipDecrypter)

# Module-level logger, named after this module.
logger = getLogger(__name__)
34 |
35 |
@unique
class ExtractStatusCode(Enum):
    """Outcome of one extraction attempt.

    Decorated with ``@unique`` like ``ArchiveType`` so that two outcomes can
    never silently share a value.
    """

    # archive was extracted
    SUCCESS = 0
    # the password that was tried is wrong; the caller moves on to the next
    WRONG_PASSWORD = 1
    # any other failure
    FAIL = 2
    # none of the candidate encodings could decode the zip archive
    WRONG_ENCODING = 3
41 |
42 |
@unique
class ArchiveType(Enum):
    """Archive formats handled by the extractor; values are MIME types."""

    TAR = "application/x-tar"
    ZIP = "application/zip"
    SEVENTH_ZIP = "application/x-7z-compressed"
    RAR = "application/x-rar"

    @classmethod
    def get_suffix(cls, archive_type: "ArchiveType") -> str:
        """Return the short suffix ("tar", "zip", "7z", "rar") of a member.

        Raises:
            KeyError: if *archive_type* is not a member of this enum.
        """
        # Listed in member definition order, which is the order in which
        # iterating the enum class yields its members.
        suffixes = ("tar", "zip", "7z", "rar")
        return dict(zip(cls, suffixes))[archive_type]
59 |
60 |
def remove_readonly(func, path, _) -> None:
    """Error hook for ``shutil.rmtree``: make *path* writable and retry.

    Args:
        func: the operation that failed; it is called again with *path*.
        path: the file system entry the operation failed on.
        _: exception information supplied by ``rmtree``; unused.
    """
    writable = stat.S_IWRITE
    os.chmod(path, writable)
    func(path)
64 |
65 |
66 | class PyExtractor:
def __init__(self, config: PyExtractConfig) -> None:
    """Store *config* and set up the helpers used during a run.

    Args:
        config: settings for this extractor; its ``rename_substrings`` and
            ``auto_rename`` values are handed to the rename handler.
    """
    # Files already looked at, so that no file is processed twice.
    self.handled_archives: set[Path] = set()
    self.config = config
    rename_handler = RenameFileHandler(
        unwanted_substrings=config.rename_substrings,
        auto_rename=config.auto_rename,
    )
    self.file_rename = rename_handler
74 |
def run(self):
    """Print the paths in use, log the configuration and start extracting."""
    settings = self.config
    target_dir = settings.target_directory
    # Keep the msgids as literals inside _() so they stay extractable.
    summary_parts = (
        f'{_("target directory")}: {filename_color(target_dir)}',
        f'{_("config path")}: {filename_color(settings.config_path)}',
        f'{_("password path")}: {filename_color(settings.password_path)}',
    )
    print(" , ".join(summary_parts) + "\n")
    logger.info(settings)
    self.extract_archives_recursively(target_dir)
85 |
def is_excluded_file(self, file: Path) -> bool:
    """Tell whether *file* must be skipped.

    A file is skipped when its suffix or its name is listed in the exclude
    settings, when its name contains one of the excluded substrings, or when
    its path looks like a non-first volume of a multi-part rar
    (``part2``..``part100`` / ``part02``..``part09`` followed by ``.rar``).
    """
    settings = self.config
    if file.suffix in settings.exclude_suffix:
        return True
    if file.name in settings.exclude_filename:
        return True
    if any(sub in file.name for sub in settings.exclude_substrings):
        return True
    later_rar_volume = re.search(
        r"part(?:[2-9]|[1-9][0-9]|100|0[2-9])\.(rar|RAR)", str(file)
    )
    return later_rar_volume is not None
98 |
def extract_zip(
    self,
    archive_name: Path,
    out_path: Path,
    pwd: str | None = None,
    default_encoding="utf-8",
    aes=False,
):
    """Extract a zip archive, trying each candidate encoding in turn.

    Every encoding from ``config.zip_metadata_encoding`` (plus
    *default_encoding* if it is not already listed) is used both as the
    metadata encoding of the archive and to encode *pwd*.

    Args:
        archive_name: path of the zip archive.
        out_path: directory to extract into; removed again on failure.
        pwd: password to try, or None/"" for no password.
        default_encoding: encoding appended to the configured ones.
        aes: use ``pyzipper.AESZipFile`` instead of ``zipfile.ZipFile``.

    Returns:
        ExtractStatusCode.SUCCESS, WRONG_PASSWORD, FAIL or WRONG_ENCODING.
        When ``zipfile`` raises NotImplementedError the result of the AES
        retry (or, if that also fails the same way, of ``extract_7z``) is
        returned instead.
    """
    # https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile
    # Monkey patch the decryption of zipfile with C for better performance, it
    # is about 10% slower than the 7z program in testing.

    # Work on a copy: appending to the configured list itself would change
    # the shared configuration as a side effect of extracting an archive.
    encodings = list(self.config.zip_metadata_encoding)
    if default_encoding not in encodings:
        encodings.append(default_encoding)

    password_was_encodable = False
    for encoding in encodings:
        try:
            password: bytes | None = pwd.encode(encoding) if pwd else None
        except UnicodeEncodeError:
            # This password cannot be expressed in this encoding, so it
            # cannot be the right one for it; try the next encoding rather
            # than aborting the whole run.
            logger.info(
                "%s cannot encode the password for %s", encoding, archive_name
            )
            continue
        password_was_encodable = True
        try:
            if aes:
                import pyzipper

                with pyzipper.AESZipFile(
                    archive_name,
                    "r",
                    compression=pyzipper.ZIP_DEFLATED,
                    encryption=pyzipper.WZ_AES,
                ) as extracted_zip:
                    extracted_zip.extractall(out_path, pwd=password)
            else:
                with zipfile.ZipFile(
                    archive_name, "r", metadata_encoding=encoding
                ) as zip_file:
                    zip_file.extractall(out_path, pwd=password)
            return ExtractStatusCode.SUCCESS
        except Exception as exc:
            # Never leave a partially extracted directory behind.
            if out_path.exists():
                shutil.rmtree(out_path, onerror=remove_readonly)
            if isinstance(exc, NotImplementedError):
                # some algorithms are not supported by zipfile
                logger.exception(
                    "%s algorithms NotImplemented", archive_name
                )
                if not aes:
                    return self.extract_zip(
                        archive_name,
                        out_path,
                        pwd=pwd,
                        default_encoding=default_encoding,
                        aes=True,
                    )
                return self.extract_7z(archive_name, out_path, pwd=pwd)
            if isinstance(exc, UnicodeDecodeError):
                logger.info("%s cannot decode %s", encoding, archive_name)
                continue
            if re.search(
                r"Bad password|is encrypted|Bad CRC-32", repr(exc)
            ):
                logger.info("%s, wrong password: %s", archive_name, pwd)
                return ExtractStatusCode.WRONG_PASSWORD
            logger.exception("%s pwd:%s", archive_name, pwd)
            return ExtractStatusCode.FAIL

    if not password_was_encodable:
        # No attempt was possible with this password at all; report it as a
        # wrong password so the caller goes on to the next candidate.
        logger.info("%s, wrong password: %s", archive_name, pwd)
        return ExtractStatusCode.WRONG_PASSWORD
    logger.error(
        "None of encodings %s can decode %s",
        encodings,
        archive_name,
    )
    return ExtractStatusCode.WRONG_ENCODING
164 |
def extract_tar(self, archive_name: Path, out_path: Path):
    """Extract a tar archive by delegating to ``extract_7z`` without a password."""
    status = self.extract_7z(archive_name, out_path)
    return status
167 |
168 | def extract_rar(
169 | self, archive_name: Path, out_path: Path, pwd: str | None = None
170 | ):
171 | # didn't find a usable python library for rar, switch to 7z program
172 | # 7z reduces absolute paths to relative paths by default
173 | return self.extract_7z(archive_name, out_path, pwd)
174 |
def extract_7z(
    self, archive_name: Path, out_path: Path, pwd: str | None = None
):
    """Extract *archive_name* into *out_path* with the external ``7z`` program.

    Args:
        archive_name: path of the archive.
        out_path: directory to extract into; removed again on failure.
        pwd: password to pass to 7z, or None/"" for an empty password.

    Returns:
        ExtractStatusCode.SUCCESS when 7z exits with code 0,
        WRONG_PASSWORD when its error output contains "Wrong password",
        FAIL for every other failure.

    Raises:
        SevenZipCmdNotFound: no ``7z`` executable is available on PATH.
    """
    # Explicit check instead of ``assert``: assertions are stripped under
    # ``python -O``, which would turn a missing 7z into a generic FAIL.
    if shutil.which("7z") is None:
        if out_path.exists():
            shutil.rmtree(out_path, onerror=remove_readonly)
        logger.error("7z command not found")
        raise SevenZipCmdNotFound("7z command not found")

    cmd = [
        "7z",
        "x",
        # -p is always passed, with an empty value when there is no password
        f"-p{pwd or ''}",
        archive_name.as_posix(),
        f"-o{out_path.as_posix()}",
    ]
    try:
        proc = subprocess.run(
            cmd,
            shell=False,
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
            text=True,
            # Undecodable bytes in 7z's error output must not hide the
            # "Wrong password" marker behind a decoding error.
            errors="replace",
            check=False,
        )
        if proc.returncode != 0:
            raise SevenZipExtractFail(f"Extract fails, {proc.stderr}")
    except Exception as exc:
        # Never leave a partially extracted directory behind.
        if out_path.exists():
            shutil.rmtree(out_path, onerror=remove_readonly)
        if "Wrong password" in str(exc):
            logger.info("%s , Wrong password: %s", archive_name, pwd)
            return ExtractStatusCode.WRONG_PASSWORD
        logger.exception(archive_name)
        return ExtractStatusCode.FAIL
    return ExtractStatusCode.SUCCESS
211 |
212 | def extract_archive(
213 | self, file: Path, archive_type: ArchiveType, dir_level: int
214 | ) -> Path | None:
215 | """Return output path if status code == SUCCESS, else return None"""
216 | target_out_dir = f"{file}_out"
217 | out_path = Path(target_out_dir)
218 | if out_path.exists():
219 | print(
220 | f"{' ' * dir_level}▷ {_('Skipping')}"
221 | f" {filename_color(str(file))} , {_('type')}:"
222 | f" {ArchiveType.get_suffix(archive_type)}"
223 | )
224 | return out_path
225 | indent = "".join([" " * dir_level, "└──"])
226 | print(
227 | f"{' ' * dir_level}▶ {_('Extracting')}"
228 | f" {filename_color(str(file))} , {_('type')}:"
229 | f" {ArchiveType.get_suffix(archive_type)}"
230 | )
231 | pwd = ""
232 | start = time.time()
233 | passwords_list = self.config.passwords
234 | # prepend empty password into passwords list
235 | passwords_list.insert(0, "")
236 | failed_msg = ""
237 | status_code = ExtractStatusCode.FAIL
238 | for pwd in passwords_list:
239 | output_same_line(f"{indent} {_('try password')} {pwd}")
240 | try:
241 | match archive_type:
242 | case ArchiveType.ZIP:
243 | status_code = self.extract_zip(file, out_path, pwd)
244 | case ArchiveType.TAR:
245 | status_code = self.extract_tar(file, out_path)
246 | case ArchiveType.SEVENTH_ZIP:
247 | status_code = self.extract_7z(file, out_path, pwd)
248 | case ArchiveType.RAR:
249 | status_code = self.extract_rar(file, out_path, pwd)
250 | case _:
251 | raise AssertionError("Not going to happen")
252 | match status_code:
253 | case ExtractStatusCode.WRONG_PASSWORD:
254 | continue
255 | case _:
256 | break
257 |
258 | except SevenZipCmdNotFound:
259 | failed_msg = _(
260 | "Din't find 7z command, please make sure 7z is"
261 | " installed and available in PATH"
262 | )
263 | break
264 | else:
265 | failed_msg = _("None of the passwords can decrypt the archive")
266 | logger.error("No passwords can decrypt %s", file)
267 | if status_code == ExtractStatusCode.SUCCESS:
268 | end = time.time()
269 | time_cost = round(end - start)
270 | output_same_line(f"{indent} {_('password')} {pwd} {_('matches')}\n")
271 | print(
272 | f"{indent} {done_color(_('Done'))}"
273 | f" {filename_color(str(file))} {_('extracted to')}"
274 | f" {filename_color(str(out_path))}"
275 | f" , {_('time cost')}: {time_cost}s"
276 | )
277 | logger.info("%s is extracted to %s", file, out_path)
278 | return out_path
279 | if status_code == ExtractStatusCode.WRONG_ENCODING:
280 | failed_msg = _("None of the encodings can decode the archive")
281 | if not failed_msg:
282 | failed_msg = _("Invalid archive")
283 | output_same_line(
284 | f"{indent} {failed_color(_('Failed'))}"
285 | f" {filename_color(str(file))} {failed_color(failed_msg)}\n"
286 | )
287 | return None
288 |
def extract_archives_recursively(
    self, target_dir: str | Path, dir_level: int = 0
) -> None:
    """Extract every supported archive under ``target_dir``, then recurse.

    At ``dir_level`` 0 only the direct children of ``target_dir`` are
    examined; at deeper levels the whole subtree is globbed. Each file is
    handled at most once (tracked in ``self.handled_archives``). Its MIME
    type is sniffed from the first 2048 bytes and, when it maps to an
    ``ArchiveType``, the file is extracted and the output is processed
    recursively. When extraction fails and the archive's directory contains
    file names with unwanted substrings, that directory is queued for
    renaming and an optional retry.

    Args:
        target_dir: Directory to scan.
        dir_level: Current nesting depth (0 for the root call).
    """
    dirs_to_rename_files: set[Path] = set()
    # don't match files in subdirs if in root directory
    files = (
        Path(target_dir).iterdir()
        if dir_level == 0
        else Path(target_dir).glob("**/*")
    )

    for file in files:
        if (not file.is_file()) or self.is_excluded_file(file):
            continue
        if file in self.handled_archives:
            continue
        self.handled_archives.add(file)
        try:
            # Context manager so the handle is closed straight away instead
            # of being left for the garbage collector.
            with open(file, "rb") as stream:
                file_type = magic.from_buffer(stream.read(2048), mime=True)
        except Exception:
            logger.exception("%s", file)
            continue
        try:
            archive_type = ArchiveType(file_type)
        except ValueError:
            # MIME type does not correspond to a supported archive type.
            continue
        out_dir = self.extract_archive(file, archive_type, dir_level)
        if out_dir:
            self.extract_archives_recursively(
                out_dir.as_posix(), dir_level=dir_level + 1
            )
        # TODO: figure out a way to only rename failed archives, otherwise extracted archives will be renamed too
        elif self.file_rename.has_unwanted_substrings_in_filenames(
            file.parent
        ):
            dirs_to_rename_files.add(file.parent)

    if not dirs_to_rename_files:
        return
    self.file_rename.rename_files_in_dirs(dirs_to_rename_files)
    if self.file_rename.auto_rename:
        choice = "y"
        print(f"{_('retry extracting')}:")
    else:
        sys.stdout.write(f"{_('Do you want to retry extracting')}? [y/n]")
        choice = input().lower()
    # The answer is already lower-cased, so a single comparison suffices.
    if choice == "y":
        for d in dirs_to_rename_files:
            self.extract_archives_recursively(d, dir_level=dir_level)
344 |
if __name__ == "__main__":
    # Only when this module is executed as a script: copy the `_` entry of
    # the builtins namespace into this module's globals.
    # NOTE(review): `_` is presumably the gettext translation function
    # installed elsewhere; this binding looks like it exists to give static
    # analysers a module-level `_` -- confirm.
    _ = builtins.__dict__["_"]
347 |
--------------------------------------------------------------------------------
/py_extract/file_renaming.py:
--------------------------------------------------------------------------------
1 | import builtins
2 | import sys
3 | from logging import getLogger
4 | from pathlib import Path
5 |
6 | from .utils import filename_color
7 |
8 | logger = getLogger(__name__)
9 |
10 |
11 | class RenameFileHandler:
12 | def __init__(
13 | self,
14 | unwanted_substrings: list[str],
15 | auto_rename: bool,
16 | ) -> None:
17 | self.unwanted_substrings = unwanted_substrings
18 | self.auto_rename = auto_rename
19 |
20 | def has_unwanted_substrings_in_filenames(self, target_dir: Path) -> bool:
21 | for path in Path(target_dir).iterdir():
22 | if path.is_dir():
23 | continue
24 | filename = path.name
25 | if any((substr in filename for substr in self.unwanted_substrings)):
26 | return True
27 | return False
28 |
29 | def display_files_to_rename(self, target_dir: Path):
30 | print(filename_color(str(target_dir)))
31 | for path in Path(target_dir).iterdir():
32 | if path.is_dir():
33 | continue
34 | filename = path.name
35 | for substr in self.unwanted_substrings:
36 | if substr in filename:
37 | print(filename_color(" " + filename))
38 | break
39 |
40 | def rename_files_in_dir(self, target_dir: Path | str) -> None:
41 | _ = builtins.__dict__["_"]
42 | for path in Path(target_dir).iterdir():
43 | if path.is_dir():
44 | continue
45 | filename = path.name
46 | newname, oldname = "", filename
47 | for substr in self.unwanted_substrings:
48 | if substr in filename:
49 | newname = filename.replace(substr, "")
50 | break
51 | else:
52 | # substr not in filename, skip
53 | continue
54 | sys.stdout.write(
55 | f"{_('Do you want to rename')}"
56 | f" {filename_color(str(path.with_name(oldname)))} {_('to')}"
57 | f" {filename_color(str(path.with_name(newname)))} ? [y/n]"
58 | )
59 | if self.auto_rename:
60 | choice = "y"
61 | sys.stdout.write("\n")
62 | else:
63 | choice = input().lower()
64 | if choice in ["y", "Y"]:
65 | new_path = path.rename(path.with_name(newname))
66 | logger.info("rename %s to %s", path, new_path)
67 | print(_("rename done"))
68 | else:
69 | print(_("skip rename"))
70 |
71 | def rename_files_in_dirs(self, dirs: set[Path]) -> None:
72 | if self.auto_rename:
73 | choice = "y"
74 | print()
75 | else:
76 | print(
77 | f"\n{_('Some files probably need to be renamed in these directories')}:"
78 | )
79 | for d in dirs:
80 | self.display_files_to_rename(d)
81 | sys.stdout.write(f"\n{_('Take a look')}? [y/n]")
82 | choice = input().lower()
83 | if choice in ["y", "Y"]:
84 | for d in dirs:
85 | self.rename_files_in_dir(d)
86 |
87 |
if __name__ == "__main__":
    # Only when this module is executed as a script: copy the `_` entry of
    # the builtins namespace into this module's globals.
    # NOTE(review): `_` is presumably the gettext translation function
    # installed elsewhere; this binding looks like it exists to give static
    # analysers a module-level `_` -- confirm.
    _ = builtins.__dict__["_"]
90 |
--------------------------------------------------------------------------------
/py_extract/logging_utils.py:
--------------------------------------------------------------------------------
1 | from logging import (
2 | FileHandler,
3 | Formatter,
4 | )
5 |
6 |
class FormattedFileHandler(FileHandler):
    """UTF-8 ``FileHandler`` with a fixed record layout pre-installed.

    Each record is rendered as timestamp, level, logger name, line number,
    function name and message, using a ``YYYY-MM-DD HH:MM:SS`` timestamp.
    """

    def __init__(self, filename: str) -> None:
        super().__init__(filename, encoding="utf-8")
        record_layout = (
            " %(asctime)s [%(levelname)s %(name)s:%(lineno)s"
            " %(funcName)s()] %(message)s"
        )
        timestamp_layout = "%Y-%m-%d %H:%M:%S"
        self.setFormatter(Formatter(record_layout, timestamp_layout))
18 |
--------------------------------------------------------------------------------
/py_extract/utils.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | from typing import TextIO
3 |
4 |
def output_same_line(text: str) -> None:
    """Write *text* over the current terminal line without a newline.

    The text is prefixed with the ``ESC[2K`` escape sequence and a carriage
    return, and stdout is flushed immediately.
    """
    erase_and_rewind = "\x1b[2K\r"
    print(f"{erase_and_rewind}{text}", end="", flush=True)
8 |
9 |
class BColors:
    """Terminal escape sequences used to colour console output."""

    # Prefix applied by filename_color().
    OK_BLUE = "\033[94m"
    # Prefix applied by done_color().
    OK_GREEN = "\033[92m"
    # Prefix applied by failed_color().
    FAIL = "\033[91m"
    # Suffix appended by all three helpers to end the coloured span.
    END = "\033[0m"
15 |
16 |
def filename_color(text: str) -> str:
    """Return *text* wrapped in ``BColors.OK_BLUE`` ... ``BColors.END``."""
    return "{}{}{}".format(BColors.OK_BLUE, text, BColors.END)
19 |
20 |
def done_color(text: str) -> str:
    """Return *text* wrapped in ``BColors.OK_GREEN`` ... ``BColors.END``."""
    return "{}{}{}".format(BColors.OK_GREEN, text, BColors.END)
23 |
24 |
def failed_color(text: str) -> str:
    """Return *text* wrapped in ``BColors.FAIL`` ... ``BColors.END``."""
    return "{}{}{}".format(BColors.FAIL, text, BColors.END)
27 |
28 |
def load_passwords(pwd_file: TextIO) -> list[str]:
    """Read passwords from *pwd_file*, returning later groups first.

    Every line is stripped of surrounding whitespace. Runs of blank lines
    separate the file into groups; the groups are returned in reverse file
    order while the order inside each group is preserved. A file such as::

        first_a
        first_b

        second_a

    therefore yields ``["second_a", "first_a", "first_b"]``. Leading and
    trailing blank lines are ignored, and a file without any blank line is
    returned in its original order.
    """
    entries = [raw.strip() for raw in pwd_file.readlines()]
    # groupby splits the lines into alternating blank / non-blank runs;
    # keeping only the non-blank runs yields the password groups.
    blocks = [
        list(members)
        for is_blank, members in itertools.groupby(
            entries, key=lambda entry: entry == ""
        )
        if not is_blank
    ]
    ordered: list[str] = []
    for block in reversed(blocks):
        ordered.extend(block)
    return ordered
57 |
--------------------------------------------------------------------------------
/py_extract/zip_decrypter.pyx:
--------------------------------------------------------------------------------
1 | # cython: language_level=3
2 | # distutils: language = c
3 | # cython: cdivision = True
4 | # cython: boundscheck = False
5 | # cython: wraparound = False
6 | # cython: nonecheck = False
7 | # cython: profile = False
8 | # https://github.com/ziyuang/czipfile/blob/ba592c44c79d830a063210d598737f91f4333035/czipfile.pyx
9 |
10 | """
11 | cython implementation of zip decryption
12 | """
13 |
14 | cimport cpython
15 |
cdef class _ZipDecrypter:
    """Class to handle decryption of files stored within a ZIP archive.
    ZIP supports a password-based form of encryption. Even though known
    plaintext attacks have been found against it, it is still useful
    to be able to get data out of such a file.
    Usage:
    zd = _ZipDecrypter(mypwd)
    plain_text = zd(cypher_text)
    The original usage of:
    plain_text = map(zd, cypher_text)
    is still supported, but will be slower (by a factor of 10 or so, by
    my measurements) than simply calling it with the full cypher_text.
    """

    # I guess to make these C vars, we must declare them out here?
    # Per-instance state: the 256-entry CRC lookup table and the three keys
    # that are advanced for every processed byte.
    cdef unsigned long crctable[256]
    cdef unsigned long key0
    cdef unsigned long key1
    cdef unsigned long key2

    cdef void _GenerateCRCTable(self):
        """Generate a CRC-32 table.
        ZIP encryption uses the CRC32 one-byte primitive for scrambling some
        internal keys. We noticed that a direct implementation is faster than
        relying on binascii.crc32().
        """
        cdef unsigned long poly = 0xedb88320
        cdef unsigned long crc, i, j
        for 0 <= i < 256:
            crc = i
            # Eight shift steps per table entry; the polynomial is XOR-ed in
            # whenever the bit about to be shifted out is set.
            for 0 <= j < 8:
                if crc & 1:
                    crc = ((crc >> 1) & 0x7FFFFFFF) ^ poly
                else:
                    crc = ((crc >> 1) & 0x7FFFFFFF)
            self.crctable[i] = crc

    cdef unsigned long _crc32(self, unsigned char ch, unsigned long crc):
        """Compute the CRC32 primitive on one byte."""
        # Table index is the low byte of (crc ^ ch); the remaining bits of
        # crc are shifted down by eight and XOR-ed with the table entry.
        return ((crc >> 8) & 0xffffff) ^ self.crctable[(crc ^ ch) & 0xff]

    def __init__(self, pwd):
        # Initial key values: 0x12345678, 0x23456789, 0x34567890.
        self.key0 = 305419896
        self.key1 = 591751049
        self.key2 = 878082192

        # Generate the CRC table; previously done outside of any method
        self._GenerateCRCTable()

        # Update our keys, given the password
        # Each element is passed to _UpdateKeys as an `unsigned char`;
        # presumably pwd is a bytes object -- confirm against callers.
        for p in pwd:
            self._UpdateKeys(p)

    cdef void _UpdateKeys(self, unsigned char c):
        # Advance all three keys with one byte; 4294967295 (0xFFFFFFFF)
        # masks key1 back to 32 bits after each arithmetic step.
        self.key0 = self._crc32(c, self.key0)
        self.key1 = (self.key1 + (self.key0 & 255)) & 4294967295UL
        self.key1 = (self.key1 * 134775813 + 1) & 4294967295UL
        self.key2 = self._crc32((self.key1 >> 24) & 255, self.key2)

    def __call__(self, data):
        # Returns a new bytes object of the same length as `data`: each input
        # byte is XOR-ed with a byte derived from key2, and the keys are then
        # advanced with the byte just written to the output.
        cdef unsigned long k
        cdef Py_ssize_t i, datalen
        cdef char *data_s
        cdef char *ret_s

        cpython.PyBytes_AsStringAndSize(data, &data_s, &datalen)
        # Allocate the output bytes object and write into its buffer in place.
        ret = cpython.PyBytes_FromStringAndSize(NULL, datalen)
        ret_s = cpython.PyBytes_AsString(ret)
        for 0 <= i < datalen:
            k = self.key2 | 2
            ret_s[i] = data_s[i] ^ (((k * (k^1)) >> 8) & 255);
            # The proper way to do this is to call _UpdateKeys here, like so:
            #self._UpdateKeys(ret_s[i])
            # ... but we can cut runtime by about a third if we unroll the
            # function. So, we're doing so. Yes, it's duplication. Ah well...
            self.key0 = ((self.key0 >> 8) & 0xFFFFFF) ^ self.crctable[(self.key0 ^ ret_s[i]) & 0xFF]
            self.key1 = (self.key1 + (self.key0 & 255)) & 4294967295UL
            self.key1 = (self.key1 * 134775813 + 1) & 4294967295UL
            self.key2 = ((self.key2 >> 8) & 0xFFFFFF) ^ self.crctable[(self.key2 ^ ((self.key1 >> 24) & 255)) & 0xFF]

        return ret
97 |
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | pytest
2 | toml
3 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Cython==0.29.33
2 | python-magic-bin==0.4.14; platform_system == "Windows"
3 | python-magic; platform_system == "Linux"
4 | babel
5 | pyzipper
6 |
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
1 | import traceback
2 |
3 | from py_extract import create_py_extractor
4 |
# Script entry point: build the extractor and run it.
try:
    py_extractor = create_py_extractor()
    py_extractor.run()
except Exception:
    # Top-level boundary: print the traceback to stdout rather than letting
    # the exception propagate out of the script.
    print(traceback.format_exc())
finally:
    # Always wait for Enter before exiting, on success and on failure.
    # NOTE(review): presumably keeps a console window open when the script
    # is launched by double-click -- confirm.
    input("Press Enter to exit...")
12 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from babel.messages import frontend as babel
2 | from Cython.Build import cythonize
3 | from Cython.Distutils import Extension
4 | from setuptools import setup
5 |
# Flags passed to the C compiler and the linker for the extension below.
extra_compile_args = ["-O3", "-ffast-math", "-fopenmp"]
extra_link_args = ["-fopenmp"]

# The single Cython extension module built by this script.
lib_modules = [
    Extension(
        "py_extract.zip_decrypter",
        ["py_extract/zip_decrypter.pyx"],
        language="c",
        extra_compile_args=extra_compile_args,
        extra_link_args=extra_link_args,
    ),
]

cythonized_modules = cythonize(
    lib_modules,
    language_level=3,
    compiler_directives={"always_allow_keywords": True},
)

setup(
    name="zip_decrypter",
    zip_safe=False,
    ext_modules=cythonized_modules,
    # Exposes babel's catalogue compilation as `setup.py compile_catalog`.
    cmdclass={"compile_catalog": babel.compile_catalog},
)
31 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davuses/PyExtract/876b83cf4328010f83c3dd71b152b0d0cd14fe15/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_extractor.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | import subprocess
3 | import sys
4 | from pathlib import Path
5 |
6 | import toml
7 |
8 | from py_extract import create_py_extractor
9 |
# Password protecting the inner archive created by make_split_archives().
FIRST_PASSWORD = "password1"

# Password protecting the outer (nested) archive.
SECOND_PASSWORD = "password2"

# Contents of the passwords file handed to the extractor: the two passwords
# separated by one blank line.
PASSWORDS_FILE_CONTENT = f"""\
{FIRST_PASSWORD}

{SECOND_PASSWORD}
"""
19 |
20 |
def test_7z_command():
    """The ``7z`` executable must be discoverable on PATH."""
    seven_zip_location = shutil.which("7z")
    assert seven_zip_location
23 |
24 |
def test_py_extract(tmp_path: Path):
    """End-to-end run: nested, split, password-protected, misnamed archives.

    Builds the fixture with make_split_archives(), writes a passwords file
    and a config derived from the example config, runs the extractor with
    auto-rename enabled, and checks that all original files appear inside
    the nested output directories.
    """
    tmp_dir = tmp_path / "dir"
    tmp_dir.mkdir(exist_ok=True)

    test_files = ["file1.txt", "file2.txt", "file3.txt"]

    make_split_archives(tmp_dir, test_files)

    passwords_path = tmp_path / "passwords.txt"
    passwords_path.write_text(PASSWORDS_FILE_CONTENT, encoding="utf-8")

    # Locate the example config relative to this file so the test does not
    # depend on the directory pytest is started from.
    example_config_path = (
        Path(__file__).resolve().parent.parent
        / "config"
        / "example_config.toml"
    )
    with open(example_config_path, "r", encoding="utf-8") as example_config_file:
        test_config = toml.load(example_config_file)
    test_config["path"]["target_directory"] = str(tmp_dir)
    test_config["path"]["password_path"] = str(passwords_path)
    test_config["rename"]["substrings"] = ["删除", "删", "删我"]
    test_config["auto_rename"] = True

    test_config_path = tmp_path / "test_config.toml"
    with open(test_config_path, "w", encoding="utf-8") as test_config_file:
        toml.dump(test_config, test_config_file)

    # Restore sys.argv afterwards so the patched arguments cannot leak into
    # tests that run later in the same process.
    original_argv = sys.argv[:]
    sys.argv[1:] = ["--config", str(test_config_path)]
    try:
        py_extractor = create_py_extractor()
        py_extractor.run()
    finally:
        sys.argv[:] = original_argv

    for filename in test_files:
        assert (
            tmp_dir
            / f"./nested_archive.7z.001_out/archive.7z.001_out/{filename}"
        ).is_file()
59 |
60 |
def _create_split_archive(cwd, password, archive_name, members):
    """Run ``7z a`` in *cwd* to pack *members* into 2 KB volumes of *archive_name*."""
    completed = subprocess.run(
        [
            "7z",
            "a",
            f"-p{password}",
            "-v2k",
            "-mx9",
            "-mhe=on",
            archive_name,
            *members,
        ],
        shell=False,
        encoding="utf-8",
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        cwd=cwd,
        check=False,
    )
    # Include the captured output so a failing 7z run can be diagnosed.
    assert "Everything is Ok" in completed.stdout, (
        f"7z failed to create {archive_name}:\n"
        f"{completed.stdout}\n{completed.stderr}"
    )


def make_split_archives(tmp_dir, test_files):
    """Create the nested, split, deliberately misnamed fixture in *tmp_dir*.

    Steps:
      1. Create each name in *test_files* as a 10 MiB file of zeros.
      2. Pack them into ``archive.7z`` (FIRST_PASSWORD), delete the
         originals, and rename the first volume to ``archive.7z.删除001``.
      3. Pack that volume plus ``archive.7z.002`` and ``archive.7z.003``
         into ``nested_archive.7z`` (SECOND_PASSWORD), rename its first two
         volumes to ``.删001`` / ``.删我002``, and delete the inner volumes.
    """
    test_filepaths = [tmp_dir / filename for filename in test_files]

    for p in test_filepaths:
        with open(p, mode="wb") as f:
            f.truncate(1024 * 1024 * 10)

    _create_split_archive(tmp_dir, FIRST_PASSWORD, "archive.7z", test_files)

    for p in test_filepaths:
        p.unlink()

    first_volume_path = tmp_dir / "archive.7z.001"
    first_volume_path.rename(first_volume_path.with_name("archive.7z.删除001"))

    archives_names = [
        "archive.7z.删除001",
        "archive.7z.002",
        "archive.7z.003",
    ]
    archive_paths = [tmp_dir / name for name in archives_names]

    _create_split_archive(
        tmp_dir, SECOND_PASSWORD, "nested_archive.7z", archives_names
    )

    nested_archive_first_volume = tmp_dir / "nested_archive.7z.001"
    nested_archive_first_volume.rename(
        nested_archive_first_volume.with_name("nested_archive.7z.删001")
    )
    nested_archive_second_volume = tmp_dir / "nested_archive.7z.002"
    nested_archive_second_volume.rename(
        nested_archive_second_volume.with_name("nested_archive.7z.删我002")
    )
    for p in archive_paths:
        p.unlink()
133 |
--------------------------------------------------------------------------------
/tests/test_load_passwords.py:
--------------------------------------------------------------------------------
1 | from io import StringIO
2 |
3 | from py_extract.utils import load_passwords
4 |
# Fixture with three password groups (foo*, ok*, bar*) separated by runs of
# blank lines of varying length, plus leading and trailing blank lines.
PASSWORDS_TEXT = """\


foo
foo2


ok
ok2



bar
bar2

"""
21 |
22 |
def test_load_passwords():
    """Groups come back last-first; order inside each group is kept."""
    expected = ["bar", "bar2", "ok", "ok2", "foo", "foo2"]
    loaded = load_passwords(pwd_file=StringIO(PASSWORDS_TEXT))
    assert loaded == expected
28 |
--------------------------------------------------------------------------------