├── src └── debloat │ ├── __init__.py │ ├── tests │ ├── __init__.py │ └── debloat_test.py │ ├── utilities │ ├── __init__.py │ ├── rsrc.py │ ├── readers.py │ ├── pyflate.py │ └── nsisParser.py │ ├── debloat.icns │ ├── debloat.ico │ ├── auxiliary.py │ ├── gui.spec │ ├── hook │ └── hook-tkinterdnd2.py │ ├── processor.pyi │ ├── main.py │ ├── performanceTest.py │ ├── gui.py │ └── processor.py ├── requirements.txt ├── .gitattributes ├── setup.py ├── setup.cfg ├── .gitignore ├── pyproject.toml ├── LICENSE ├── .github └── workflows │ └── python-publish.yml ├── README.md └── changelog.txt /src/debloat/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/debloat/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/debloat/utilities/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tkinterdnd2 2 | pefile -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | if __name__ == "__main__": 4 | setup() -------------------------------------------------------------------------------- /src/debloat/debloat.icns: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Squiblydoo/debloat/HEAD/src/debloat/debloat.icns -------------------------------------------------------------------------------- /src/debloat/debloat.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Squiblydoo/debloat/HEAD/src/debloat/debloat.ico -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = debloat 3 | 4 | [options] 5 | package_dir= 6 | =src 7 | packages=find: 8 | 9 | [options.packages.find] 10 | where=src -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | build/ 3 | dist/ 4 | virt/ 5 | src/debloat.egg-info/ 6 | src/debloat/dist 7 | src/debloat/.vscode 8 | src/debloat/samples 9 | src/debloat/UnSorted_samples 10 | src/debloat/Old_Sample_Set 11 | src/debloat/TODO.md 12 | src/debloat/unsolved 13 | src/debloat/temp 14 | -------------------------------------------------------------------------------- /src/debloat/auxiliary.py: -------------------------------------------------------------------------------- 1 | """This file contains auxillary commands for removing bloat. 
2 | 3 | The commands in this file are not included in the automated processor 4 | and can be used by other scripts.""" 5 | import pefile 6 | 7 | def trim_null_bytes(out_path: str,\ 8 | pe: pefile.PE) -> None: 9 | '''Remove nullbytes from end of file 10 | 11 | Key Arguments: 12 | out_path -- new file to write 13 | pe -- a pe file opject''' 14 | trimmed_pe = pe.trim() 15 | with open(out_path, "wb") as output_file: 16 | output_file.write(trimmed_pe) -------------------------------------------------------------------------------- /src/debloat/tests/debloat_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import debloat.processor as processor 3 | import pefile 4 | 5 | 6 | # Can we print sizes? 7 | def test_readable_size(): 8 | assert processor.readable_size(10) == "10 bytes" 9 | 10 | 11 | 12 | def test_signture_abnormality(): 13 | # Is there information after the signature? 14 | # Signature is at 10 with a size of 5, total file is 15 15 | assert processor.handle_signature_abnormality(10, 5, 15) == False 16 | # Is there information after the signature? 
17 | # Signature is at 10 with a size of 5, total file is 20 18 | assert processor.handle_signature_abnormality(10, 5, 20) == True 19 | -------------------------------------------------------------------------------- /src/debloat/gui.spec: -------------------------------------------------------------------------------- 1 | # -*- mode: python ; coding: utf-8 -*- 2 | 3 | 4 | a = Analysis( 5 | ['gui.py'], 6 | pathex=[], 7 | binaries=[], 8 | datas=[], 9 | hiddenimports=[], 10 | hookspath=['./hook'], 11 | hooksconfig={}, 12 | runtime_hooks=[], 13 | excludes=[], 14 | noarchive=False, 15 | ) 16 | pyz = PYZ(a.pure) 17 | 18 | exe = EXE( 19 | pyz, 20 | a.scripts, 21 | a.binaries, 22 | a.datas, 23 | [], 24 | name='gui', 25 | debug=False, 26 | bootloader_ignore_signals=False, 27 | strip=False, 28 | upx=True, 29 | upx_exclude=[], 30 | runtime_tmpdir=None, 31 | console=False, 32 | disable_windowed_traceback=False, 33 | argv_emulation=False, 34 | target_arch=None, 35 | codesign_identity=None, 36 | entitlements_file=None, 37 | ) 38 | -------------------------------------------------------------------------------- /src/debloat/utilities/rsrc.py: -------------------------------------------------------------------------------- 1 | import enum 2 | class RSRC(enum.IntEnum): 3 | CURSOR = 0x01 # noqa 4 | BITMAP = 0x02 # noqa 5 | ICON = 0x03 # noqa 6 | MENU = 0x04 # noqa 7 | DIALOG = 0x05 # noqa 8 | STRING = 0x06 # noqa 9 | FONTDIR = 0x07 # noqa 10 | FONT = 0x08 # noqa 11 | ACCELERATOR = 0x09 # noqa 12 | RCDATA = 0x0A # noqa 13 | MESSAGETABLE = 0x0B # noqa 14 | ICON_GROUP = 0x0E # noqa 15 | VERSION = 0x10 # noqa 16 | DLGINCLUDE = 0x11 # noqa 17 | PLUGPLAY = 0x13 # noqa 18 | VXD = 0x14 # noqa 19 | ANICURSOR = 0x15 # noqa 20 | ANIICON = 0x16 # noqa 21 | HTML = 0x17 # noqa 22 | MANIFEST = 0x18 # noqa 23 | 24 | def __str__(self): 25 | return self.name -------------------------------------------------------------------------------- /pyproject.toml: 
-------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "debloat" 7 | version = "1.6.5" 8 | authors = [ 9 | { name="Squiblydoo", email="Squiblydoo@pm.me" }, 10 | ] 11 | description = "Debloat is an tool to remove excess garbage from bloated executables." 12 | readme = "README.md" 13 | requires-python = ">=3.6" 14 | dependencies = [ 15 | "tkinterdnd2>=0.3.0", 16 | "pefile>=2023.2.0" 17 | ] 18 | classifiers = [ 19 | "Programming Language :: Python :: 3", 20 | "License :: OSI Approved :: BSD License", 21 | "Operating System :: OS Independent", 22 | ] 23 | 24 | [project.urls] 25 | "Homepage" = "https://github.com/Squiblydoo/debloat" 26 | "Bug Tracker" = "https://github.com/Squiblydoo/debloat/issues" 27 | 28 | 29 | [project.scripts] 30 | debloat = "debloat.main:main" 31 | debloat-gui = "debloat.gui:main" 32 | -------------------------------------------------------------------------------- /src/debloat/hook/hook-tkinterdnd2.py: -------------------------------------------------------------------------------- 1 | """pyinstaller hook file. 2 | 3 | You need to use this hook-file if you are packaging a project using tkinterdnd2. 4 | Just put hook-tkinterdnd2.py in the same directory where you call pyinstaller and type: 5 | 6 | pyinstaller myproject/myproject.py --additional-hooks-dir=. 
7 | """ 8 | 9 | import os 10 | import platform 11 | from PyInstaller.utils.hooks import collect_data_files, collect_dynamic_libs 12 | 13 | 14 | s = platform.system() 15 | p = { 16 | 'Windows': ({'win-arm64', 'win-x86', 'win-x64' },{'tkdnd_unix.tcl', 'tkdnd_macosx.tcl'}), 17 | 'Linux': ({'linux-x64', 'linux-arm64'}, {'tkdnd_windows.tcl', 'tkdnd_macosx.tcl'}), 18 | 'Darwin': ({'osx-x64', 'osx-arm64'}, {'tkdnd_windows.tcl', 'tkdnd_unix.tcl'}), 19 | } 20 | if s in p: 21 | datas = set([ 22 | x for x in ( 23 | *collect_data_files('tkinterdnd2'), 24 | *collect_dynamic_libs('tkinterdnd2'), 25 | ) 26 | if os.path.split(x[1])[1] in p[s][0] and os.path.split(x[0])[1] not in p[s][1] 27 | ]) 28 | else: 29 | raise RuntimeError(f'TkinterDnD2 is not supported on platform "{s}".') -------------------------------------------------------------------------------- /src/debloat/processor.pyi: -------------------------------------------------------------------------------- 1 | import pefile 2 | from _typeshed import Incomplete 3 | from pefile import Structure as Structure 4 | from typing import Callable, Optional, Tuple 5 | 6 | PACKER: Incomplete 7 | 8 | def readable_size(value: int) -> str: ... 9 | def write_multiple_files(out_path: str, files: list, log_message: Callable[[str], None]) -> None: ... 10 | def write_patched_file(out_path: str, pe: pefile.PE) -> Tuple[int, str]: ... 11 | def handle_signature_abnormality(signature_address: int, signature_size: int, beginning_file_size: int) -> bool: ... 12 | def check_and_extract_NSIS(possible_header: bytearray, data: bytearray) -> list: ... 13 | def check_for_packer(possible_header: bytearray) -> int: ... 14 | def find_last_section(pe: pefile.PE) -> Optional[pefile.SectionStructure]: ... 15 | def get_signature_info(pe: pefile.PE) -> Tuple[int, int]: ... 16 | def adjust_offsets(pe: pefile.PE, gap_offset: int, gap_size: int): ... 17 | def refinery_strip(pe: pefile.PE, data: memoryview, block_size=...) -> int: ... 
18 | def refinery_trim_resources(pe: pefile.PE, pe_data: bytearray) -> int: ... 19 | def remove_resources(pe: pefile.PE, pe_data: bytearray) -> Tuple[bytearray, int]: ... 20 | def check_section_compression(pe: pefile.PE, pe_data: bytearray, end_of_real_data, log_message: Callable[[str], None]) -> Tuple[pefile.PE, int, str]: ... 21 | def trim_junk(pe: pefile.PE, bloated_content: bytes, original_size_with_junk: int) -> int: ... 22 | def process_pe(pe: pefile.PE, out_path: str, last_ditch_processing: bool, log_message: Callable[[str], None]) -> None: ... 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2023, Squiblydoo 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /src/debloat/main.py: -------------------------------------------------------------------------------- 1 | """This file handles passing the CLI arguments into the processor""" 2 | import os 3 | import sys 4 | from pathlib import Path 5 | import argparse 6 | import pefile 7 | import debloat.processor 8 | from debloat.processor import DEBLOAT_VERSION 9 | from debloat.processor import RESULT_CODES 10 | 11 | 12 | def main() -> int: 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("executable", 15 | help="Path to the executable to be debloated", 16 | type=Path) 17 | parser.add_argument("--output", 18 | help="Output location", 19 | type=Path, 20 | required=False) 21 | parser.add_argument("-yolo", "--last-ditch", dest="last_ditch_processing", 22 | help=""" 23 | Run last-ditch processing. In this mode Debloat may remove the 24 | whole PE Overlay as a last resort if no smarter method works. 25 | """, 26 | action='store_true', default=False) 27 | parser.add_argument("-c", "--cert", dest="cert_preservation", 28 | help=""" 29 | Preserve the certificate on the end of the file if there is a certificate. 
30 | The certificate will no longer be valid.""", 31 | action='store_true', 32 | required=False, 33 | default=False) 34 | parser.add_argument("-v", "--version", action='version', version='debloat version ' + DEBLOAT_VERSION, help="Prints program version") 35 | args = parser.parse_args() 36 | 37 | file_path = args.executable 38 | out_path = args.output 39 | file_size = os.path.getsize(file_path) 40 | 41 | if not out_path: 42 | out_path = file_path.parent \ 43 | / f"{file_path.stem}_patched{file_path.suffix}" 44 | 45 | try: 46 | with open(file_path, "rb") as bloated_file: 47 | pe_data = bloated_file.read() 48 | pe = pefile.PE(data=pe_data, fast_load=True) 49 | except Exception: 50 | print(''' 51 | Provided file is not an executable! Please try again with an executable. 52 | Maybe it needs unzipped?''' 53 | ) 54 | return 1 55 | 56 | result_code = debloat.processor.process_pe(pe, 57 | out_path=str(out_path), 58 | last_ditch_processing=args.last_ditch_processing, 59 | cert_preservation=args.cert_preservation, 60 | log_message=print, 61 | beginning_file_size=file_size 62 | ) 63 | print("Tactic identifed:", RESULT_CODES.get(result_code)) 64 | return 0 65 | 66 | if __name__ == "__main__": 67 | sys.exit(main()) 68 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | name: Build Executables 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | build: 13 | runs-on: ${{ matrix.os }} 14 | strategy: 15 | matrix: 16 | os: [ubuntu-latest, macos-13, macos-14, windows-latest] 17 | 18 | steps: 19 | - name: Checkout repository 20 | uses: actions/checkout@v2 21 | 22 | - name: Set up Python 23 | uses: actions/setup-python@v4 24 | with: 25 | python-version: '3.x' 26 | 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | pip install 
pyinstaller tkinterdnd2 pefile 31 | working-directory: src/debloat 32 | 33 | - name: Build executable on Linux 34 | if: matrix.os == 'ubuntu-latest' 35 | run: | 36 | pyinstaller --onefile --noconsole --icon=debloat.ico --collect-all tkinterdnd2 --name debloat gui.py 37 | working-directory: src/debloat 38 | 39 | - name: Build executable on macOS (Intel and ARM) 40 | if: startsWith(matrix.os, 'macos') 41 | run: | 42 | pyinstaller --onefile --noconsole --additional-hooks-dir=./hook --icon=debloat.icns --name debloat gui.py 43 | working-directory: src/debloat 44 | 45 | - name: Build executable on Windows 46 | if: matrix.os == 'windows-latest' 47 | run: | 48 | pyinstaller --onefile --noconsole --additional-hooks-dir=./hook --icon=debloat.ico --name debloat gui.py 49 | working-directory: src/debloat 50 | 51 | - name: Set output name 52 | id: set-tar-name 53 | if: startsWith(matrix.os, 'ubuntu') || startsWith(matrix.os, 'macos') 54 | run: echo "TAR_NAME=debloat.tar" >> $GITHUB_ENV 55 | 56 | - name: 'Tar files' 57 | if: startsWith(matrix.os, 'ubuntu') || startsWith(matrix.os, 'macos') 58 | run: | 59 | cd src/debloat/dist/ && 60 | tar -cf ${{ env.TAR_NAME}} * 61 | 62 | - name: Upload Linux build artifacts 63 | uses: actions/upload-artifact@v4 64 | if: startsWith(matrix.os, 'ubuntu') 65 | with: 66 | name: Linux_debloat 67 | path: src/debloat/dist/${{ env.TAR_NAME }} 68 | 69 | - name: Upload Intel Mac build artifacts 70 | uses: actions/upload-artifact@v4 71 | if: matrix.os == 'macos-13' 72 | with: 73 | name: macOS_x86_intel_debloat 74 | path: src/debloat/dist/${{ env.TAR_NAME }} 75 | 76 | - name: Upload ARM Mac build artifacts 77 | uses: actions/upload-artifact@v4 78 | if: matrix.os == 'macos-14' 79 | with: 80 | name: macOS_ARM_debloat 81 | path: src/debloat/dist/${{ env.TAR_NAME }} 82 | 83 | - name: Upload Windows build artifacts 84 | uses: actions/upload-artifact@v4 85 | if: startsWith(matrix.os, 'windows') 86 | with: 87 | name: ${{ matrix.os }}_debloat 88 | path: 
src/debloat/dist/ 89 | -------------------------------------------------------------------------------- /src/debloat/performanceTest.py: -------------------------------------------------------------------------------- 1 | ## This script is for batch processing of samples and can be used for 2 | ## measuring memory usage. 3 | 4 | import os 5 | import hashlib 6 | from memray import commands, FileReader 7 | from memray._memray import size_fmt 8 | import debloat.processor 9 | import timeit 10 | import argparse 11 | import cProfile 12 | import pstats 13 | import tempfile 14 | 15 | argparser = argparse.ArgumentParser( 16 | prog = "Debloat Performance test", 17 | description = "This program takes a test type (--mem or --cpu) and performs tests using one or more samples. If no sample or directory is specified, it defaults to a 'samples' directory in the current working directory." 18 | ) 19 | argparser.add_argument("--cpu", help="Run the CPU profiler", action="store_true") 20 | argparser.add_argument("--mem", help="Run the memory profiler", action="store_true") 21 | argparser.add_argument("--sample", help="Run the debloat processor on a single sample") 22 | argparser.add_argument("--directory", help="Specify sample directory", default="samples") 23 | argparser.add_argument("--keep", help="Keeps patched copies.", action="store_true") 24 | args = argparser.parse_args() 25 | 26 | def process_samples(sample, directory): 27 | file_size=os.path.getsize(args.directory +"/"+ sample) 28 | setup = f"import pefile; import debloat; filename = '{args.directory}/{sample}'; " 29 | code = f"binary = pefile.PE(filename, fast_load=True); result= debloat.processor.process_pe(binary, filename + '.patched', last_ditch_processing=False, cert_preservation=False, log_message=lambda *args, **kwargs: None, beginning_file_size={file_size}); print(result, end=' ')" 30 | 31 | if args.mem: 32 | mem_profiler(setup, code, file_size, sample, directory) 33 | if args.cpu: 34 | cpu_profiler() 35 | if not 
args.keep: 36 | try: 37 | os.remove(args.directory + "/" + sample + ".patched") 38 | except: 39 | pass 40 | 41 | 42 | def mem_profiler(setup, code, file_size, sample, directory): 43 | with tempfile.NamedTemporaryFile() as f: 44 | commands.main(["run", "-f", "-q", "-o", f.name, "-c", setup+code]) 45 | reader = FileReader(os.fspath(f.name), report_progress=False) 46 | # Uncomment to hash outputed samples. 47 | #with open(directory +"/"+ sample + ".patched", "rb") as g: 48 | # out = g.read() 49 | # out_hash = hashlib.sha256(out).hexdigest() 50 | times = timeit.repeat(stmt=code, setup=setup, number=1, repeat=3) 51 | print(sample, size_fmt(file_size), size_fmt(reader.metadata.peak_memory), [round(x,2) for x in times]) 52 | 53 | def cpu_profiler(): 54 | cProfile.run(setup+code, "tmp.prof") 55 | p = pstats.Stats("tmp.prof") 56 | p.sort_stats('tot').print_stats(10) 57 | p.sort_stats('cumulative').print_stats(10) 58 | 59 | if args.sample: 60 | process_samples(args.sample, args.directory) 61 | 62 | else: 63 | print("Debloat Method/ Original Filename / Disk Size / Mem Usage / Time to process x 3") 64 | for sample in os.listdir(args.directory): 65 | process_samples(sample, args.directory) 66 | 67 | -------------------------------------------------------------------------------- /src/debloat/gui.py: -------------------------------------------------------------------------------- 1 | """This file handles all GUI components.""" 2 | import os 3 | import time 4 | from pathlib import Path 5 | from tkinter import * 6 | import tkinter.scrolledtext as st 7 | from typing import Tuple, Optional, Any 8 | from tkinterdnd2 import DND_FILES, TkinterDnD 9 | import pefile 10 | import debloat.processor 11 | from debloat.processor import DEBLOAT_VERSION 12 | from debloat.processor import RESULT_CODES 13 | 14 | class MainWindow(TkinterDnD.Tk): 15 | def __init__(self) -> None: 16 | '''Define main GUI window.''' 17 | TkinterDnD.Tk.__init__(self) 18 | self.title("Debloat " + DEBLOAT_VERSION) 19 | # 
I removed the Tkinter Icon since it didn't work on most 20 | # platforms and just caused more problems than necessary. 21 | self.geometry("600x600") 22 | 23 | # Label and PathBox for the main function of program. 24 | self.pathbox_Label = Label(self, \ 25 | text="Drag and drop file onto text bar.") 26 | self.pathbox_Label.pack() 27 | self.pathbox = Entry(self, width=150) 28 | self.pathbox.pack(padx=20) 29 | self.pathbox.drop_target_register(DND_FILES) 30 | self.pathbox.dnd_bind("<>", self.process_entry) 31 | 32 | # Define button that will be used to the process file. 33 | self.process_button = Button(self, \ 34 | text="Process file", \ 35 | command=self.process) 36 | self.process_button.pack(pady=10) 37 | 38 | # Safe processing value and checkbox: Maybe not even needed? 39 | self.unsafe_processing = BooleanVar(value=False) 40 | self.unsafe_checkbox = Checkbutton(self, 41 | text="Check to run last ditch effort processing", 42 | variable=self.unsafe_processing) 43 | self.unsafe_checkbox.pack() 44 | 45 | self.cert_preservation = BooleanVar(value=False) 46 | self.cert_checkbox = Checkbutton(self, 47 | text="Preserve Cert. Cert will be invalid but informational.", 48 | variable=self.cert_preservation) 49 | self.cert_checkbox.pack() 50 | 51 | 52 | 53 | # Define Scrollbox for output of program. 
54 | self.output_scrollbox = st.ScrolledText(self, 55 | width=100, 56 | height=100, 57 | wrap=WORD) 58 | self.output_scrollbox.pack(padx=20, pady=20) 59 | 60 | def clear_pathbox(self) -> None: 61 | '''Clear any text in the pathbox.''' 62 | self.pathbox.delete(0,"end") 63 | 64 | def output_scrollbox_handler(self, message: str, end = "\n", flush=True) -> None: 65 | '''Insert messages in the textbox.''' 66 | self.output_scrollbox.insert(INSERT, message + end) 67 | self.update() 68 | 69 | def process_entry(self, event: Any) -> None: 70 | '''Check and update user provided file path.''' 71 | self.pathbox.insert("end", event.data) 72 | file_path = self.pathbox.get() 73 | if file_path[0] == '{' and file_path[-1] == '}': 74 | file_path = file_path[1:-1] 75 | self.pathbox.delete(0,"end") 76 | self.pathbox.insert(0, file_path) 77 | 78 | def process(self) -> None: 79 | '''Process the file at the user provided path.''' 80 | start_time = time.time() 81 | file_path = Path(self.pathbox.get()) 82 | self.output_scrollbox_handler("Processing. Please wait.") 83 | try: 84 | with open(file_path, "rb") as bloated_file: 85 | pe_data = bloated_file.read() 86 | pe = pefile.PE(data=pe_data, fast_load=True) 87 | except Exception: 88 | self.output_scrollbox_handler(''' 89 | Provided file is not an executable! Please try again 90 | with an executable. 
Maybe it needs unzipped?''') 91 | self.clear_pathbox() 92 | return 93 | file_size = os.path.getsize(file_path) 94 | out_path = file_path.parent \ 95 | / f"{file_path.stem}_patched{file_path.suffix}" 96 | 97 | result_code = debloat.processor.process_pe(pe, out_path, 98 | self.unsafe_processing.get(), 99 | self.cert_preservation.get(), 100 | log_message=self.output_scrollbox_handler, 101 | beginning_file_size=file_size) 102 | self.output_scrollbox_handler("Tactic identified: " , RESULT_CODES.get(result_code) +"\n") 103 | self.output_scrollbox_handler("-----Processing took %s seconds ---\n" \ 104 | % round((time.time() - start_time), 2)) 105 | self.clear_pathbox() 106 | 107 | def main() -> None: 108 | root = MainWindow() 109 | root.mainloop() 110 | 111 | if __name__== "__main__": 112 | main() 113 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![debloat](https://user-images.githubusercontent.com/77356206/215351855-9f89c298-36b4-4234-89b5-dc3f26d1f8b0.png) 2 | 3 | # Debloat 4 | Debloat is a GUI and CLI tool to remove excess garbage from bloated executables. 5 | 6 | By excess garbage, I mean 100 - 800 MB of junk bytes added to a binary to keep it from going into a sandbox. This method of adding junk is called "inflating" or "pumping" a binary. Debloat currently handles the 10 most common inflation tactics. 7 | 8 | Being built with Python, the application can easily be leveraged in other workflows. Currently, debloat is used by [CCCS's AssemblyLine](https://www.cyber.gc.ca/en/tools-services/assemblyline) and [CERT Polska's MWDB](https://github.com/CERT-Polska/karton-archive-extractor). 9 | 10 | The program can be compiled for Windows, MacOS, Linux. The GUI and CLI have minimal options: it is intended to be as simple as possible and the logic within the program handles the different use cases automatically. 
11 | 12 | Compiled binaries have already been included in the [Releases](https://github.com/Squiblydoo/debloat/releases/). 13 | 14 | The debloat can installed using `pip install debloat`. Use `debloat` to launch the CLI and `debloat-gui` to launch the GUI. 15 | 16 | For advanced users, Debloat can also be imported into other scripts and the processing functions can be called individually. 17 | 18 | ## How to use the GUI? 19 | The GUI of Debloat intends to be as intuitive as possible. 20 | When launched, you can drag and drop bloated file onto the text bar and press the "Process file" button. 21 | Some technical information will be printed to the scrolling textbox and the file without bloat will be written to the directory the file was pulled from. 22 | Sound easy? It is! 23 | 24 | Processing files will take a few seconds.
25 | ![image](https://github.com/Squiblydoo/debloat/assets/77356206/3d2756cd-bc83-44e8-b223-edd8ed464369) 26 | 27 | 28 | ## How to use the CLI? 29 | After installing using `pip install debloat` use the command `debloat`.
30 | `debloat` can take two arguments. The first argument is required: the file to debloat. The second argument is optional: the output location. When no output is provided, it will be written to the same directory as the original file. 31 | 32 | The gui can also be launched from the CLI using the command `debloat-gui`. 33 | 34 | ## Does it always work? 35 | Not yet. 36 | Based on my recent analysis, debloat is able to [remove junk from bloated files 97.8% of the time](https://x.com/SquiblydooBlog/status/1795419380991291424). 37 | 38 | In previous versions, `debloat` could accidentally remove too much of the binary. That is no longer the case unless you use the "--last-ditch" switch. If you ever need this switch, consider sharing the sample for additional analysis. This option has now been added to the GUI. Functionally, what the function does is it will remove the whole overlay, if there is one. In some cases this is necessary as no pattern for the junk was found---this is most commonly the case in samples that do not compress well. 39 | 40 | ## Use Cases (Images from [Malcat](https://malcat.fr/)) 41 | ### Full support 42 | - [x] Bloat appended to the end of a Signed PE.
43 | In the image below, the bloat has been appended to the end of the executable.
44 | ![Screenshot 2023-02-11 at 3 32 36 PM](https://user-images.githubusercontent.com/77356206/218279963-00780b59-8227-47dd-a0af-41096f6ae17b.png) 45 | 46 | - [X] Signed or Unsigned Packed executable.
47 | In the image below, the bloat has been appended to the executable after packing.
48 | ![Screenshot 2023-02-11 at 3 44 10 PM](https://user-images.githubusercontent.com/77356206/218280433-6dbcf51a-68c8-48e1-a89a-ad0b818a0afc.png) 49 | 50 | - [X] Signed executable includes bloat in the .rsrc section of the PE.
51 | In the image below, the bloat is identified as in the .rsrc section and is removed from the PE.
52 | ![Screenshot 2023-02-11 at 3 35 21 PM](https://user-images.githubusercontent.com/77356206/218280086-7cd548f8-e16b-4290-9283-a8a848de1419.png) 53 | 54 | - [X] Cases where bloat is added inside a PE Section.
55 | In the image below, the bloat has been included in a PE section named [0].
56 | ![Screenshot 2023-02-11 at 3 26 52 PM](https://user-images.githubusercontent.com/77356206/218279753-ed2c9102-482a-4639-aeb1-df8efc9c4e2e.png) 57 | 58 | - [X] Cases where the executable is a Nullsoft Scriptable Installer System executable (NSIS aka Nullsoft) 59 | These exe are installers that may contain one or more files. The files contained may or may not be malicious. (Sometimes actors will add files simply for increasing the file size.) All files within the installer are extracted to a new directory. The directory also contains the script for the installer which can be consulted to determine which files may be malicious. 60 | In the image below, Malcat has identified the executable as a NSIS installer. 61 | ![image](https://github.com/Squiblydoo/debloat/assets/77356206/86780abc-da4b-4808-bccb-733d97fa80d8) 62 | 63 | # Partial Support 64 | 65 | - [X] Cases where the junk is too random and the entropy is too high. In these cases, a switch/option called "--last-ditch" 66 | 67 | ### Other use cases 68 | There are use cases where the tool does not work. However, I plan to solve for them before publishing too much about them. 69 | 70 | ## Why? 71 | There appear to be a limited number of tools to easily process bloated executables. The two tools I have seen the most are “foremost” which is intended for recovering binaries from a disk image and “pecheck”. 72 | 73 | [Foremost](https://www.kali.org/tools/foremost/) works best in instances where the junk bytes are null (0x00) and it struggles when the binary has a fake or real signature. Its use in removing bloat from files is not its original purpose. 74 | 75 | [Pecheck](https://github.com/DidierStevens/DidierStevensSuite/blob/master/pecheck.py) has been developed over 14+ years and has some confusing commandline options. The option to remove bloated content is not the primary function of the script. 
Pecheck has to be combined with another tool ([disitool](https://blog.didierstevens.com/programs/disitool/)) in order to handle signed executables. In my experience, there are other times where pecheck can get confused and return an executable twice the size of the original bloated executable. All these factors seem OK if you are handling a small number of binaries, but as the number of binaries and methods increase, a tool specific to removing bloat is needed. 76 | 77 | [Binary Refinery](https://github.com/binref/refinery) is an amazing tool. It was written with the intention of being a [CyberChef](https://github.com/gchq/CyberChef) of the commandline. While both tools are amazing, they both have a shortcoming that requires the user to know what formulas should be applied. 78 | 79 | There are good solid manual methods to remove bloat from binaries, but these methods can be tedious and not all analysts have the skills to do this. This tool removes the burden of needing to know how to manually remove bloat. Additionally, it allows for better scale. The principles used in the script allow allow for better scale if automation is desired. 80 | 81 | 82 | ## How to build? 83 | Follow the build commands appropriate to your platform. The main difference between build commands is the format of the icon. 84 |
85 | MacOS
86 | `pyinstaller --onefile --noconsole --additional-hooks-dir=./hook --icon=debloat.icns gui.py` 87 | 88 | Windows
89 | `pyinstaller --onefile --noconsole --additional-hooks-dir=./hook --icon=debloat.ico gui.py` 90 | 91 | Linux
92 | `pyinstaller --onefile --noconsole --icon=debloat.ico --collect-all tkinterdnd2 gui.py` 93 | 94 | ## Want to discuss? 95 | Consider joining the [debloat Discord](discord.gg/dvGXKaY5qr). 96 | 97 | ## Credits 98 | Big shoutout to Jesko Hüttenhain creator of [Binary Refinery](https://github.com/binref/refinery). The NSIS extraction is based on his reverse engineering of the NSIS file format. Check out Binary Refinery if you have not. 99 | 100 | ## Where is this project going next? 101 | Batch processing: process all files in a directory and produce a report. 102 | 103 | Better support for using processing methods outside of debloat. 104 | 105 | Support for debloating without unzipping. 106 | -------------------------------------------------------------------------------- /changelog.txt: -------------------------------------------------------------------------------- 1 | 1.6.5 2 | - Fixed bug introduced in 1.6.1 which consistently resulted in failure to parse NSIS installers. 3 | - Updated NSIS extraction script to include new functionality from BinaryRefinery 4 | - Removed use of ByteString which was removed in python3.14 5 | 6 | 1.6.4 7 | - Added an additional check to identify the Code-signing signature anomaly. This check previously exited if the anomaly was found but it did not check to determine if enough of the file was removed. Now a size check has been added in order to determine if additional processing is required. 8 | 9 | 1.6.3 10 | - Fixes bug where debloat failed to handle malformed files. 11 | 12 | 1.6.2 13 | - Fixes bug in adjust_offsets method that impacted tactic-7. Bug was introduced in 1.5.6.4 as a bad attempt at error handling. 14 | - When adjusting offsets, it was possible for an error to be thrown because adjusting the offset would set it to an invalid value. However, this would happen because the value was invalid to begin with. The incorrect value was being improperly handled. 
I'm not 100% sure that I have it correct, but the new change works as expected. 15 | 16 | 1.6.1 17 | - Fixes legacy bug that could result in failure to identify NSIS installers. 18 | - In previous builds, we only checked a small window for the NSIS header. That window has been increased. 19 | - Updates the tkinterdnd hook file to only collect binaries associated with the operating system it is being built for. 20 | - Add placeholders for 2 new use cases to solve for. 21 | - Updates buildCLI.txt to specify output filename. 22 | - Add file for GitHub build automation. 23 | 24 | 1.6.0 25 | - Improves NSIS Parser to handle an irregular NSIS format 26 | - Adds solution for Use Case 17 27 | - Attackers can include junk marked as the code signing signature. In previous versions, the certificate preservation would preserve the junk. Without certificate preservation, the junk would be removed but return a Result Code of "0 - No Solution Found" even though the file was deflated. 28 | - Bug Fix 29 | - Adds error handling to escape non-unicode PE section names 30 | 31 | 1.5.6.6 32 | - Bug Fix 33 | - Patches bug in Result-Code 4 where an excess could be removed. 34 | - This was due to a miscalculation. In these instances, the "dynamic trim" and "refinery trim" methods were essentially being applied to the same data, then calculating an excess of junk. 35 | - The check for duplicate items in an NSIS Installer has been improved. 36 | - Previous check looked for item at the same offset; this version checks to see that all features are the same. 37 | 38 | 1.5.6.5 39 | - Bug Fix 40 | - Inadvertently changed "sample_compression" limit, thought it'd be OK, but it actually causes this check's main purpose to fail (that is, failing quickly when needed). Got some new ideas out of it though. 41 | 42 | 1.5.6.4 43 | - Bug Fixes 44 | - Fixed logic that could incorrectly flag .text sections as suspicious. 45 | - Handled rare error that could occur in updating offsets. 
46 | - Certificate preservation now works reliably for all use-cases. 47 | 48 | 1.5.6.3 49 | - Bug Fixes 50 | - Modified NSIS Parser to address issue identified in the implementation. More details here: https://github.com/binref/refinery/issues/49 51 | - TLDR, NSIS Installers with the properly of uncompressed data was not previously accounted for due to lack of examples. They now are accounted for. 52 | - Modified compression check in bloated overlay analysis 53 | - previous compression check was erroneous and worked only based on miracles. 54 | - Improvements 55 | - Modified trimming threshold: 0.05 -> 0.15 56 | - New trimming threshold allows for lower compressed junk. 57 | - New trimming threshold removes more junk without being too aggressive. 58 | - Known issue 59 | - The certificate preservation option does not preserve the certificate in all use-cases, particularly cases where junk is in the overlay. 60 | 61 | 1.5.6.2 62 | - Bug Fix 63 | - Not all possible paths returned a result code. An additional result code was added. 64 | 65 | 1.5.6.1 66 | - Bug Fix 67 | - Added the result code for real this time. 68 | 69 | 1.5.6 70 | - Cert Support 71 | - Added support in both CLI and GUI to preserve the authenticode certificate. 72 | - Authenticode certificate is removed by default because the certificate becomes invalid. When it becomes invalid it becomes unclear whether the certificate was always invalid or not. 73 | - Bug Fix 74 | - A result code was missing which could cause problems in processing that looked for a result code. 75 | 76 | 1.5.5 77 | - General Improvements 78 | - Added functionality to print debloat version/ added to GUI UI 79 | - Deduped results_codes into processor file 80 | - New Use Case 81 | - Identified a use case that wasn't being solved, improved program logic to solve. 82 | - Packed files with a bloated section. 
83 | 84 | 1.5.4 85 | - General Improvements 86 | - This version prints report codes indicating which inflation tactic is identified. 87 | - This version can now handle instances where no pattern exists within the junk data, or the pattern is disrupted by a few characters. This version uses the trimming method from binary refinery in two cases that were found to be more efficient. 88 | - A performance testing script has been included. 89 | 90 | The new updates hand a few edge use-cases that were not solvable before and fixes one bug. 91 | 92 | Bugfix: If debloat was unable to trim a inflated section, it would tell you it could and then exit telling you that it could not. 93 | 94 | New use-case solved: This solves the use-case where there a pattern exists in the overlay, but additional bytes have been added to disrupt the pattern. As much as 1 byte is enough to disrupt the pattern. This is not a problem anymore. 95 | 96 | 97 | 1.5.3.4 98 | - NSIS Parser improvements 99 | - Additional use cases for NSIS were identified and tested. These identified additional bugs which are fixed in this version. These use cases were added and tested: 100 | - bzip2_liquid 101 | - bzip2_solid 102 | - lzma_liquid 103 | - lzma_solid 104 | - zlib_liquid 105 | - zlib_solid 106 | 107 | 1.5.3.3 108 | - Modified NSIS Parser significantly. 109 | - Two use cases were identified where the parser were not working adequately. This resulted in identifying two logic bugs which resulted in fixing one and a large rewrite of some portions of the NSIS Parser. Rewrite was done by Huettenhain (https://github.com/huettenhain) for the original project of the NSIS Parser (https://github.com/binref/refinery) and then was incorporated into Debloat by me (Squiblydoo). 110 | - Removed some code that was unused. 
111 | 112 | 1.5.3.2 113 | - Fixed a bug with the RSRC trimming 114 | - These were some long standing issues: 115 | - The default threshold and default size_limit were brought into conformance with Refinery Trim 116 | - With the previously high threshold, it could result in problems from removing the entire resource. 117 | - I also reverted the compression method in this section. The one used elsewhere was found not to be compatible with this part of the processing. 118 | 119 | 1.5.3.1 120 | - Fixed NSIS extractor bug. 121 | - Bug was caused due to the failure of adding some bytes when iterating through NSIS entries. 122 | - Bug was caused by a missing variable. 123 | - Updated the imports for nsisParser and readers 124 | - (Somehow?) It was working without these needing to be explicitly mentioned, but it has been updated for completeness. 125 | 126 | 1.5.3 127 | - Fixed alignment bug 128 | - There was a bug where I was subtracting instead of adding bytes to fix alignment. It now adds instead of subtracts. 129 | - Polished the trim 130 | - The "find_chunk_start" method had some unclear logic, that has been improved. 131 | - Instead of trying to remove all junk, the method now returns all bytes if the full regex was unable to match. 132 | - So, if the step is 1000 or 2000 bytes and not all of them are junk, it will leave all 1000 133 | - The logic is that they aren't really hurting anything by being here, and it is better to leave them than accidentally remove them. 134 | 135 | 1.5.2 136 | - Merged Optimization changes 137 | - Changes primarily related to the trim_junk function 138 | - Primary changes reduced the active memory cost 139 | - No changes in the functionality were made in this release. 140 | 141 | 1.5.1 142 | - Made modifications recommended by gdesmar for memory improvements. 
143 | - Added the ability to pass the size of the file to the process_pe method 144 | - This reduces memory usage to calculate the length 145 | - Bug fixes suggested by gdesmar such as passing the correct object type 146 | - New compression algorithim implemented 147 | - See https://github.com/Squiblydoo/debloat/pull/18 to learn more about performance enhancements. 148 | - Implemented the optional "beginning_file_size" parameter for "process_pe" in both main.py and gui.py 149 | - Fixed typecasting bug introduced in 1.5.0 in relation to the "write_multiple_files" method 150 | 151 | 1.5.0 152 | - Added capability to handle Nullsoft Scriptable Install System (NSIS, aka Nullsoft) executables. 153 | - Setup instructions and binaries are extracted from the Nullsoft installer to a separate directory. 154 | - At this time, the user needs to resubmit files if they are bloated. Currently, debloat has no way of determining which files are malicious. 155 | 156 | - Fully renamed "Unsafe" Processing to "last_ditch_processing" 157 | - Last ditch better represents its purpose. 158 | - "Unsafe" is a name that is often used in the context of untrusted code. 159 | - Fixed inconsistency in naming of "last ditch processing" 160 | 161 | - Adjusted how debloat determines if junk was removed or not: 162 | - Previously, it could think junk removed if 1 or more bytes were removed or if only the signature was removed. 163 | - Now debloat checks for a 10% removal at the least 164 | 165 | - Updated documentation regarding Linux build command. 166 | - This had been updated elsewhere, but the update had not made it to the README 167 | 168 | 1.4.3 169 | - Fixed a logic bug where debloating a section did not debloat the proper section. 170 | - This worked previously when the bloated section was the last section 171 | - Finished a TODO item: namely, change all the offsets in the sections when the bloated section wasn't the last section of the binary. 
172 | 173 | 1.4.2 174 | - Added checkbox for unsafe processing in GUI 175 | - Moved RSRC class out of processor into utilities 176 | - Fixed bug where chunk_start could fail to be given a value with the result that the program would stop functioning but not inform the user. Better error handling in this case to come. 177 | 178 | 1.4.1 179 | - Fixed loading PE in GUI 180 | 181 | 1.4.0 182 | - Fixed headers in a few use cases where I had missed them before. 183 | - Fixed removing resource method. Works properly now. 184 | - Fixed instance where the dynamic trim regex could pick up illegal characters 185 | - Now last_loads PE for better loading time. 186 | - Now manipuates PE data in the buffer. 187 | 188 | 1.3.2.2 189 | - Fixed a bug where the Delta_last_non_junk value could fail to be set in one use case. 190 | 191 | 1.3.2.1 192 | - Temporary fix for release version. 193 | 194 | 1.3.2 195 | - Added Dynamic Trim for trimming bytes from both the Overlay and bloated sections 196 | - Dynamic trim identifies the junk and creates a targeted regex to remove it. 197 | 198 | - Improved output. 199 | - Output wasn't being updated as the program ran. I now clear the buffer and update the UI after each output message. 
200 | 201 | 1.3.1 202 | - Fixed required versions in pyproject.toml 203 | 204 | 1.3.0 205 | - Merged refactoring changes per nazywam's recommendation 206 | - Updated text length per PEP8 207 | - Started docstrings and other documentation for methods 208 | - Updated variable names for PEP8 consistency 209 | -------------------------------------------------------------------------------- /src/debloat/utilities/readers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Most of this code was repurposed from Binary Refinery (https://github.com/binref/refinery), used under the 3-Clause BSD License 4 | 5 | from __future__ import annotations 6 | import io 7 | import itertools 8 | import enum 9 | import struct 10 | import re 11 | import functools 12 | from types import TracebackType 13 | from typing import List, Union, Tuple, Optional, Iterable, TypeVar, Generic, Any 14 | 15 | T = TypeVar('T', bound=Union[bytearray, bytes, memoryview]) 16 | UnpackType = Union[int, bool, float, bytes] 17 | 18 | def signed(k: int, bits: int): 19 | M = 1 << bits 20 | k = k & (M - 1) 21 | return k - M if k >> (bits - 1) else k 22 | 23 | def exception_to_string(exception: BaseException, default=None) -> str: 24 | """ 25 | Attempts to convert a given exception to a good description that can be exposed to the user. 
26 | """ 27 | if not exception.args: 28 | return exception.__class__.__name__ 29 | it = (a for a in exception.args if isinstance(a, str)) 30 | if default is None: 31 | default = str(exception) 32 | return max(it, key=len, default=default).strip() 33 | 34 | class StreamDetour: 35 | def __init__(self, stream: io.IOBase, 36 | offset=None, whence=io.SEEK_SET) -> None: 37 | self.stream = stream 38 | self.offset = offset 39 | self.whence = whence 40 | 41 | def __enter__(self) -> io.IOBase: 42 | self.cursor = self.stream.tell() 43 | if self.offset is not None: 44 | self.stream.seek(self.offset, self.whence) 45 | return self.stream 46 | 47 | def __exit__(self, *args) -> None: 48 | self.stream.seek(self.cursor, io.SEEK_SET) 49 | 50 | class MemoryFile(Generic[T], io.IOBase): 51 | 52 | closed: bool 53 | read_as_bytes: bool 54 | 55 | _data: T 56 | _cursor: int # Defines where in the file we are currently reading from 57 | _closed: bool 58 | 59 | class SEEK(int, enum.Enum): 60 | CUR = io.SEEK_CUR 61 | END = io.SEEK_END 62 | SET = io.SEEK_SET 63 | 64 | def __init__(self, data: T, read_as_bytes: bool = False, 65 | file_number: Optional[int] = None) -> None: 66 | self._data = data 67 | self._cursor = 0 68 | self._closed = False 69 | self.read_as_bytes = read_as_bytes 70 | self.file_number = file_number 71 | 72 | def close(self) -> None: 73 | self._closed = True 74 | 75 | @property 76 | def closed(self) -> bool: 77 | return self._closed 78 | 79 | # Enter and exit methods for context manager 80 | def __enter__(self) -> 'MemoryFile': 81 | return self 82 | 83 | def __exit__(self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: TracebackType | None) -> None: 84 | return super().__exit__(exc_type, exc_val, exc_tb) 85 | 86 | def flush(self) -> None: 87 | return super().flush() 88 | 89 | def isatty(self) -> bool: 90 | return super().isatty() 91 | 92 | def __iter__(self) -> Iterable[bytes]: 93 | return self 94 | 95 | def __len__(self) -> int: 96 | return 
len(self._data) 97 | 98 | def __next__(self) -> bytes: 99 | if self._cursor >= len(self._data): 100 | raise StopIteration 101 | else: 102 | return self.readline() 103 | 104 | def file_number(self) -> Optional[int]: 105 | if self.file_number is None: 106 | return None 107 | return self.file_number 108 | 109 | def readable(self) -> bool: 110 | return super().readable() 111 | 112 | def seekable(self) -> bool: 113 | return super().seekable() 114 | 115 | @property 116 | def is_eof(self) -> bool: 117 | return self._closed or self._cursor >= len(self._data) 118 | 119 | @property 120 | def remaining_bytes(self) -> int: 121 | return len(self._data) - self.tell() 122 | 123 | def writable(self) -> bool: 124 | if self._closed: 125 | return False 126 | # Readonly for memoryview? 127 | return isinstance(self._data, bytearray) 128 | 129 | def read(self, size: int = -1, peek: bool = False) -> T: 130 | beginning = self._cursor 131 | if size is None or size < 0: 132 | end = len(self._data) 133 | else: 134 | end = min(self._cursor + size, len(self._data)) 135 | result = self._data[beginning:end] 136 | if self.read_as_bytes and not isinstance(result, bytes): 137 | result = bytes(result) 138 | if not peek: 139 | self._cursor = end 140 | return result 141 | 142 | def peek(self, size: int = -1) -> memoryview: 143 | cursor = self._cursor 144 | mv = memoryview(self._data) 145 | if size is None or size < 0: 146 | return mv[cursor:] 147 | return mv[cursor:cursor + size] 148 | 149 | def read1(self, size: int = -1, peek: bool = False) -> T: 150 | return self.read(size, peek) 151 | 152 | def _find_linebreak(self, beginning: int, end: int) -> int: 153 | if not isinstance(self._data, memoryview): 154 | return self._data.find(b'\n', beginning, end) 155 | for k in range(beginning, end): 156 | if self._data[k] == 0xA: return k 157 | return -1 158 | 159 | def readline(self, size: int = -1) -> T: 160 | beginning, end = self._cursor, len(self._data) 161 | if size is not None and size >= 0: 162 | end = 
beginning + size 163 | p = self._find_linebreak(beginning, end) 164 | self._cursor = end if p < 0 else p + 1 165 | result = self._data[beginning:self._cursor] 166 | if self.read_as_bytes and not isinstance(result, bytes): 167 | result = bytes(result) 168 | return result 169 | 170 | def readlines(self, size: int = -1) -> Iterable[T]: 171 | if size is None or size < 0: 172 | yield from self 173 | else: 174 | total = 0 175 | while total < size: 176 | line = next(self) 177 | total += len(line) 178 | yield line 179 | 180 | def readinto1(self, buffer: Any) -> int: 181 | data = self.read(len(buffer)) 182 | size = len(data) 183 | buffer[:size] = data 184 | return size 185 | 186 | def readinto(self, buffer: Any) -> int: 187 | return self.readinto1(buffer) 188 | 189 | def tell(self) -> int: 190 | return self._cursor 191 | 192 | def seek_relative(self, offset: int) -> int: 193 | return self.seek(self._cursor + offset) 194 | 195 | def seek_set(self, offset: int) -> int: 196 | if offset < 0: 197 | return self.seek(offset, self.SEEK.END) 198 | else: 199 | return self.seek(offset, self.SEEK.SET) 200 | 201 | def get_buffer(self) -> T: 202 | return self._data 203 | 204 | def get_value(self) -> T: 205 | return self._data 206 | 207 | def seek(self, offset: int, whence=io.SEEK_SET) -> int: 208 | if whence == io.SEEK_SET: 209 | if offset < 0: 210 | raise ValueError('Negative seek position {}'.format(offset)) 211 | self._cursor = offset 212 | elif whence == io.SEEK_CUR: 213 | self._cursor += offset 214 | elif whence == io.SEEK_END: 215 | self._cursor = len(self._data) + offset 216 | self._cursor = max(0, min(self._cursor, len(self._data))) 217 | return self._cursor 218 | 219 | def write_lines(self, lines: Iterable[Union[bytes, bytearray, memoryview]]) -> None: 220 | for line in lines: 221 | self.append(line) 222 | 223 | def truncate(self, size=None) -> None: 224 | if size is not None: 225 | if not (0 <= size <= len(self._data)): 226 | raise ValueError('Invalid size {}'.format(size)) 227 
| self._cursor = size 228 | del self._data[self._cursor:] 229 | 230 | def append_byte(self, byte: int) -> None: 231 | try: 232 | cursor = self._cursor 233 | if cursor < len(self._data): 234 | self._data[cursor] = byte 235 | else: 236 | self._data.append(byte) 237 | except Exception as T: 238 | raise io.UnsupportedOperation('append_byte') from T 239 | else: 240 | self._cursor += 1 241 | 242 | def append(self, data: Iterable[int]) -> int: 243 | output_data = self._data 244 | end = len(output_data) 245 | beginning = self._cursor 246 | if beginning == end: 247 | output_data.extend(data) 248 | self._cursor = end = len(output_data) 249 | return end - beginning 250 | try: 251 | size = len(data) 252 | except Exception as T: 253 | it = iter(data) 254 | for cursor, byte in enumerate(it, end - beginning): 255 | output_data[cursor] = byte 256 | if cursor >= end - 1: 257 | break 258 | else: 259 | cursor += 1 260 | self._cursor = cursor 261 | return cursor - beginning 262 | output_data.extend(it) 263 | else: 264 | self._cursor += size 265 | try: 266 | self._data[beginning:self._cursor] = data 267 | except Exception as T: 268 | self._cursor = beginning 269 | raise io.UnsupportedOperation('append') from T 270 | return size 271 | self._cursor = end = len(output_data) 272 | return end - beginning 273 | 274 | def __getitem__(self, slice: Any) -> T: 275 | result = self._data[slice] 276 | if isinstance(result, bytes) and not self.read_as_bytes: 277 | result = bytes(result) 278 | return result 279 | 280 | def replay(self, offset: int, length: int) -> None: 281 | if offset not in range(self._cursor + 1): 282 | raise ValueError('Invalid offset {}'.format(offset)) 283 | rep, r = divmod(length, offset) 284 | offset = -offset - len(self) + self._cursor 285 | replay = self._data[offset:offset + r] 286 | if rep > 0: 287 | replay = bytes(self._data[offset:self._cursor]) * rep + replay 288 | self.append(replay) 289 | 290 | 291 | class order(str, enum.Enum): 292 | big = 'big' 293 | little = 
'little' 294 | 295 | class StructReader(MemoryFile[T]): 296 | 297 | class Unaligned(RuntimeError): 298 | pass 299 | 300 | def __init__(self, data: T, bigendian: bool = False): 301 | super().__init__(data) 302 | self._number_of_bits = 0 303 | self._buffer_bits = 0 304 | self._bigendian = bigendian 305 | 306 | def __enter__(self) -> 'StructReader': 307 | return self 308 | 309 | def __exit__(self) -> None: 310 | return super().__exit__() 311 | 312 | @property 313 | def bigendian(self): 314 | self.bigendian = True 315 | try: 316 | yield self 317 | finally: 318 | self.bigendian = False 319 | 320 | @property 321 | def byteorder_format(self) -> str: 322 | return '>' if self.bigendian else '<' 323 | 324 | @property 325 | def byteorder_name(self) -> str: 326 | return 'big' if self._bigendian else 'little' 327 | 328 | def readinto(self, buffer: Any) -> int: 329 | return super().readinto(buffer) 330 | 331 | def seek(self, offset: int, whence: int = io.SEEK_SET) -> int: 332 | return super().seek(offset, whence) 333 | 334 | def read_exactly(self, size: Optional[int] = None, 335 | peek: bool = False) -> T: 336 | if not self.byte_aligned: 337 | raise StructReader.Unaligned('Buffer is not byte aligned') 338 | data = self.read1(size, peek) 339 | if size and len(data) < size: 340 | raise EOF(data) 341 | return data 342 | 343 | @property 344 | def byte_aligned(self) -> bool: 345 | return not self._number_of_bits 346 | 347 | def byte_align(self, blocksize: int = 1) -> Tuple[int, int]: 348 | if self.byte_aligned: 349 | return 0, 0 350 | number_of_bits = self._number_of_bits 351 | buffer_bits = self._buffer_bits 352 | self._number_of_bits = 0 353 | self._buffer_bits = 0 354 | mod = self._cursor % blocksize 355 | self.seek_relative(mod and blocksize - mod) 356 | return number_of_bits, buffer_bits 357 | 358 | # TODO: Review Read Integer if needed 359 | def read_integer(self, length: int, peek: bool = False) -> int: 360 | """ 361 | Read `length` many bits from the underlying stream as an 
integer. 362 | """ 363 | if length < self._number_of_bits: 364 | new_count = self._number_of_bits - length 365 | if self.bigendian: 366 | result = self._buffer_bist >> new_count 367 | if not peek: 368 | self._buffer_bist ^= result << new_count 369 | else: 370 | result = self._buffer_bist & 2 ** length - 1 371 | if not peek: 372 | self._buffer_bist >>= length 373 | if not peek: 374 | self._number_of_bits = new_count 375 | return result 376 | 377 | 378 | number_of_bits, buffer_bits = self.byte_align() 379 | number_of_missing_bits = length - number_of_bits 380 | bytecount, rest = divmod(number_of_missing_bits, 8) 381 | if rest: 382 | bytecount += 1 383 | rest = 8 - rest 384 | if bytecount == 1: 385 | result, = self.read_exactly(1, peek) 386 | else: 387 | result = int.from_bytes(self.read_exactly(bytecount, peek), self.byteorder_name) 388 | if not number_of_bits and not rest: 389 | return result 390 | if self.bigendian: 391 | rbmask = 2 ** rest - 1 # noqa 392 | excess = result & rbmask # noqa 393 | result >>= rest # noqa 394 | result ^= buffer_bits << number_of_missing_bits # noqa 395 | else: 396 | excess = result >> number_of_missing_bits # noqa 397 | result ^= excess << number_of_missing_bits # noqa 398 | result <<= number_of_bits # noqa 399 | result |= buffer_bits # noqa 400 | assert excess.bit_length() <= rest 401 | if not peek: 402 | self._number_of_bits = rest 403 | self._buffer_bist = excess 404 | return result 405 | 406 | def read_bytes(self, size: int, peek: bool = False) -> bytes: 407 | if self.byte_aligned: 408 | data = self.read_exactly(size, peek) 409 | if not isinstance(data, bytes): 410 | data = bytes(data) 411 | return data 412 | else: 413 | return self.read_bits(size * 8, peek).tobytes() 414 | 415 | def read_bit(self) -> int: 416 | return self.read_integer(1) 417 | 418 | def read_bits(self, number_of_bits: int) -> Iterable[int]: 419 | chunk = self.read_integrer(number_of_bits) 420 | for k in range(number_of_bits -1, -1, -1): 421 | yield chunk >> k & 1 
422 | 423 | def read_flags(self, number_of_bits: int, reverse=False) -> Iterable[bool]: 424 | bits = list(self.read_bits(number_of_bits)) 425 | if reverse: 426 | bits.reverse() 427 | for bit in bits: 428 | yield bool(bit) 429 | 430 | def read_struct(self, spec: str, unwrap=False, 431 | peek=False) -> Union[List[UnpackType], UnpackType]: 432 | if not spec: 433 | raise ValueError('No format specified') 434 | byte_order = spec[:1] 435 | if byte_order in '': 436 | spec = spec[1:] 437 | else: 438 | byte_order = self.byteorder_format 439 | data = [] 440 | current_cursor = self.tell() 441 | 442 | for k, part in enumerate(re.split('(\\d*[auwE])', spec)): 443 | if k % 2 == 1: 444 | count = 1 if len(part) == 1 else int(part[:~0]) 445 | part = part[~0] 446 | for _ in range(count): 447 | if part == 'a': 448 | size = self.read_integer(8) 449 | data.append(self.read_bytes(size)) 450 | elif part == 'u': 451 | data.append(self.read_integer(8)) 452 | elif part == 'w': 453 | data.append(self.read_integer(16)) 454 | elif part == 'E': 455 | data.append(self.read_integer(32)) 456 | else: 457 | raise ValueError('Invalid format {}'.format(part)) 458 | continue 459 | else: 460 | part = F'{byte_order}{part}' 461 | data.extend(struct.unpack(part, self.read_exactly(struct.calcsize(part)))) 462 | if unwrap and len(data) == 1: 463 | return data[0] 464 | if peek: 465 | self.seek_set(current_cursor) 466 | return data 467 | 468 | def read_nibble(self, peek: bool = False) -> int: 469 | return self.read_integer(4, peek) 470 | 471 | 472 | def u8(self, peek: bool = False) -> int: 473 | return self.read_integer(8, peek) 474 | def i8(self, peek: bool = False) -> int: 475 | return signed(self.read_integer(8, peek), 8) 476 | def u16(self, peek: bool = False) -> int: 477 | return self.read_integer(16, peek) 478 | def u32(self, peek: bool = False) -> int: 479 | return self.read_integer(32, peek) 480 | def u64(self, peek: bool = False) -> int: 481 | return self.read_integer(64, peek) 482 | def i16(self, 
    # --- fixed-width readers ------------------------------------------------
    # Each helper reads a fixed number of bits via read_integer / read_struct;
    # peek=True leaves the cursor untouched.  `signed` (defined elsewhere in
    # this file) sign-extends the raw unsigned value.

    def i32(self, peek: bool = False) -> int:
        return signed(self.read_integer(32, peek), 32)

    def i64(self, peek: bool = False) -> int:
        return signed(self.read_integer(64, peek), 64)

    def f32(self, peek: bool = False) -> float:
        # IEEE-754 single precision.
        return self.read_struct('f', unwrap=True, peek=peek)

    def f64(self, peek: bool = False) -> float:
        # IEEE-754 double precision.
        return self.read_struct('d', unwrap=True, peek=peek)

    def read_byte(self, peek: bool = False) -> int:
        return self.read_integer(8, peek)

    def read_char(self, peek: bool = False) -> int:
        # Signed 8-bit value.
        return signed(self.read_integer(8, peek), 8)

    def read_terminated_array(self, terminator: bytes,
                              alignment: int = 1) -> bytearray:
        """Read bytes up to (but not including) `terminator`.

        The terminator is only accepted when it lies a multiple of
        `alignment` bytes from the current position (needed for UTF-16
        double-NUL terminators).  Raises EOF, with the cursor restored,
        when no terminator can be found.
        """
        pos = self.tell()
        buffer = self.get_buffer()
        try:
            end = pos - 1
            while True:
                end = buffer.find(terminator, end + 1)
                # Stop on an aligned hit, or on -1 (not found).
                # NOTE(review): when end is -1, read_exactly below receives a
                # negative length - presumably it raises / yields EOF; confirm.
                if end < 0 or not (end - pos) % alignment:
                    break
        except AttributeError:
            # The backing object has no .find (not a flat buffer): fall back
            # to scanning one alignment unit at a time.
            result = bytearray()
            while not self.is_eof:
                result.extend(self.read_bytes(alignment))
                if result.endswith(terminator):
                    return result[:-len(terminator)]
            self.seek(pos)
            raise EOF
        else:
            data = self.read_exactly(end - pos)
            # Skip past the terminator itself.
            self.seek_relative(len(terminator))
            return bytearray(data)

    def read_c_string(self, encoding=None) -> Union[str, bytearray]:
        # NUL-terminated byte string; decoded only when an encoding is given.
        data = self.read_terminated_array(b'\0')
        if encoding is not None:
            data = data.decode(encoding)
        return data

    def read_w_string(self, encoding=None) -> Union[str, bytearray]:
        # Wide string: double-NUL terminator on a 2-byte boundary.
        data = self.read_terminated_array(b'\0\0', 2)
        if encoding is not None:
            data = data.decode(encoding)
        return data

    def read_length_prefixed(self,
                             prefix_size: int = 32,
                             encoding: Optional[str] = None,
                             block_size: int = 1) -> Union[T, str]:
        # The integer prefix counts blocks of `block_size` bytes.
        prefix = self.read_integer(prefix_size) * block_size
        data = self.read(prefix)
        if encoding is not None:
            data = data.decode(encoding)
        return data

    def read_length_prefixed_ascii(self,
                                   prefix_size: int = 32) -> Union[T, str]:
        return self.read_length_prefixed(prefix_size, 'ascii')

    def read_length_prefixed_utf8(self,
                                  prefix_size: int = 32) -> Union[T, str]:
        return self.read_length_prefixed(prefix_size, 'utf-8')

    def read_length_prefixed_utf16(self,
                                   prefix_size: int = 32,
                                   bytecount: bool = False) -> Union[T, str]:
        # bytecount=True: the prefix counts bytes; otherwise it counts
        # UTF-16 code units (2 bytes each).
        block_size = 1 if bytecount else 2
        return self.read_length_prefixed(prefix_size, 'utf-16le', block_size)

    # TODO: Review function if needed
    def read_7bit_encoded_int(self, max_bits: int = 0) -> int:
        """Decode a .NET-style 7-bit variable length integer.

        Groups are little-endian, high bit = continuation.
        NOTE(review): with the default max_bits=0 the very first iteration
        hits k == max_bits and raises ValueError immediately - callers
        presumably always pass a positive limit; confirm.
        """
        result = 0
        for k in itertools.count():
            if k == max_bits:
                raise ValueError('Invalid 7-bit encoded integer')
            byte = self.read_byte()
            result |= (byte & 0x7F) << (7 * k)
            if not byte & 0x80:
                break
        return result


class StructMeta(type):
    """Metaclass that wraps Struct.__init__ so that, after construction,
    the instance remembers the exact slice of the input buffer it consumed.
    """

    def __new__(mcls, name, bases, nmspc, parser=StructReader):
        return type.__new__(mcls, name, bases, nmspc)

    def __init__(cls, name, bases, nmspc, parser=StructReader):
        super(StructMeta, cls).__init__(name, bases, nmspc)
        original__init__ = cls.__init__

        @functools.wraps(original__init__)
        def wrapped__init__(self: Struct, reader, *args, **kwargs):
            # Coerce the argument into the required reader type; refuse to
            # wrap when the caller's reader is already a SUPERTYPE of the
            # required parser (cannot safely downcast).
            if not isinstance(reader, parser):
                if issubclass(parser, reader.__class__):
                    raise ValueError(
                        F'A reader of type {reader.__class__.__name__} was passed to {cls.__name__}, '
                        F'but a {parser.__name__} is required.')
                reader = parser(reader)
            start = reader.tell()
            view = memoryview(reader.get_buffer())
            original__init__(self, reader, *args, **kwargs)
            # Remember the raw bytes spanned by this struct.
            # NOTE(review): name mangling makes this `_StructMeta__data`,
            # while Struct's accessors read `self._data` - the two names do
            # not line up; confirm which attribute is intended.
            self.__data = view[start:reader.tell()]

        cls.__init__ = wrapped__init__


class Struct(metaclass=StructMeta):
    """Base class for structures parsed from a StructReader; the metaclass
    records the byte range consumed during __init__."""

    # View into the reader's buffer covering this struct's bytes.
    _data: Union[memoryview, bytearray]

    def __len__(self):
        return len(self._data)

    def __bytes__(self):
        return bytes(self._data)

    def get_data(self, decouple=False):
        # decouple=True detaches the struct from the underlying buffer by
        # copying the view into an independent bytearray.
        if decouple and isinstance(self._data, memoryview):
            self._data = bytearray(self._data)
        return self._data

    def __init__(self, reader: StructReader):
        # Subclasses parse their fields here; the StructMeta wrapper records
        # how many bytes were consumed.
        pass


class EOF(EOFError):
    """Raised on unexpected end of input; `rest` carries leftover bytes."""

    def __init__(self, rest: Union[bytes, bytearray, memoryview] = b''):
        super().__init__('End of File')
        self.rest = rest

# ---------------------------------------------------------------------------
# (file boundary: src/debloat/utilities/pyflate.py begins here)
# ---------------------------------------------------------------------------
#!/usr/bin/env python
# ============================ MODIFICATION NOTE ============================
# The content of this file has been modified for use in binary refinery; it
# has been ported from Python2 to Python3 and the BZip2 implementation was
# rewritten to support NSIS-specific BZip stream and block headers, which are
# different from the official standard values. The original code was taken
# from the following location:
# https://github.com/pfalcon/pyflate/blob/master/pyflate.py

# ============================ ORIGINAL LICENSING ============================
# Copyright 2006--2007-01-21 Paul Sladen
# http://www.paul.sladen.org/projects/compression/
#
# You may use and distribute this code under any DFSG-compatible license (eg.
# BSD, GNU GPLv2).
#
# Stand-alone pure-Python DEFLATE (gzip) and bzip2 decoder/decompressor. This
# is probably most useful for research purposes/index building; there is
# certainly some room for improvement in the Huffman bit-matcher.
#
# With the as-written implementation, there was a known bug in BWT decoding
# to do with repeated strings. This has been worked around; see bwt_reverse().
# Correct output is produced in all test cases but ideally the problem would
# be found...
# ============================================================================
from __future__ import annotations
from typing import List, Tuple, Iterable, Optional, BinaryIO

import itertools
import abc


class BitfieldBase(abc.ABC):
    """Base class for bit-oriented readers over a binary stream.

    Tracks the number of raw bytes consumed (`count`), the number of bits
    currently buffered (`bits`), and the bit buffer itself (`bitfield`).
    Subclasses define the bit order (LSB-first for DEFLATE, MSB-first for
    BZip2).  Constructing from another BitfieldBase clones its state.
    """

    def __init__(self, x):
        if isinstance(x, BitfieldBase):
            # Copy constructor: clone the other reader's state.
            self.f = x.f
            self.bits = x.bits
            self.bitfield = x.bitfield
            # BUGFIX: this previously copied x.bitfield into count, which
            # corrupted tell()/tellbits() on any copied reader.
            self.count = x.count
        else:
            self.f = x
            self.bits = 0
            self.bitfield = 0x0
            self.count = 0

    def _read(self, n):
        """Read n raw bytes from the stream, updating the byte counter."""
        s = self.f.read(n)
        if not s:
            raise RuntimeError('length error')
        self.count += len(s)
        return s

    def needbits(self, n):
        """Ensure at least n bits are buffered."""
        while self.bits < n:
            self._more()

    def _mask(self, n):
        return (1 << n) - 1

    def toskip(self):
        # Number of bits to skip to reach the next byte boundary.
        return self.bits & 0x7

    def align(self):
        """Advance to the next byte boundary."""
        self.readbits(self.toskip())

    def dropbits(self, n=8):
        """Discard n bits, skipping whole bytes directly on the stream."""
        while n >= self.bits and n > 7:
            n -= self.bits
            self.bits = 0
            # BUGFIX: was `self.f._read(...)` - the underlying stream object
            # has no `_read` method, and going around self._read would also
            # fail to update the byte counter used by tell().
            n -= len(self._read(n >> 3)) << 3
        if n:
            self.readbits(n)

    def dropbytes(self, n=1):
        self.dropbits(n << 3)

    def tell(self):
        """Return (byte offset, bit offset within that byte) of the cursor."""
        return self.count - ((self.bits + 7) >> 3), 7 - ((self.bits - 1) & 0x7)

    def tellbits(self):
        """Return the absolute cursor position in bits."""
        bytes, bits = self.tell()
        return (bytes << 3) + bits

    @abc.abstractmethod
    def _more(self):
        """Pull at least one more byte into the bit buffer."""

    @abc.abstractmethod
    def snoopbits(self, n=8):
        """Peek n bits without consuming them."""

    @abc.abstractmethod
    def readbits(self, n=8):
        """Consume and return n bits."""


class LBitfield(BitfieldBase):
    """Little-endian (LSB-first) bit reader, as used by DEFLATE."""

    def _more(self):
        c = self._read(1)
        # New byte enters above the currently buffered bits.
        self.bitfield += c[0] << self.bits
        self.bits += 8

    def snoopbits(self, n=8):
        if n > self.bits:
            self.needbits(n)
        return self.bitfield & self._mask(n)

    def readbits(self, n=8):
        if n > self.bits:
            self.needbits(n)
        r = self.bitfield & self._mask(n)
        self.bits -= n
        self.bitfield >>= n
        return r


class RBitfield(BitfieldBase):
    """Big-endian (MSB-first) bit reader, as used by BZip2."""

    def _more(self):
        c = self._read(1)
        # New byte enters below the currently buffered bits.
        self.bitfield <<= 8
        self.bitfield += c[0]
        self.bits += 8

    def snoopbits(self, n=8):
        if n > self.bits:
            self.needbits(n)
        return (self.bitfield >> (self.bits - n)) & self._mask(n)

    def readbits(self, n=8):
        if n > self.bits:
            self.needbits(n)
        r = (self.bitfield >> (self.bits - n)) & self._mask(n)
        self.bits -= n
        # Clear the consumed (top) bits from the buffer.
        self.bitfield &= ~(self._mask(n) << self.bits)
        return r


class HuffmanLength:
    """A (code, bit-length) pair, ordered by length then by code value
    (canonical Huffman order)."""

    code: int
    bits: int
    symbol: Optional[int]
    reverse_symbol: Optional[int]

    def __init__(self, code, bits=0):
        self.code = code
        self.bits = bits
        self.symbol = None
        self.reverse_symbol = None

    def __lt__(self, other):
        return self.__cmp(other) < 0

    def __gt__(self, other):
        return self.__cmp(other) > 0

    def __eq__(self, other):
        return self.__cmp(other) == 0

    def __le__(self, other):
        return self.__cmp(other) <= 0

    def __ge__(self, other):
        return self.__cmp(other) >= 0

    def __ne__(self, other):
        return self.__cmp(other) != 0

    def __cmp(self, other):
        # Shorter codes sort first; ties broken by code value.
        a, b = self.bits, other.bits
        if a == b:
            a, b = self.code, other.code
        return (a > b) - (a < b)


def reverse_bits(v: int, n: int):
    """Return the n-bit value v with its bit order reversed."""
    a = 1 << 0
    b = 1 << (n - 1)
    z = 0
    # Swap bit i with bit (n-1-i), working both ends toward the middle.
    for i in range(n - 1, -1, -2):
        z |= (v >> i) & a
        z |= (v << i) & b
        a <<= 1
        b >>= 1
    return z
def reverse_bytes(v, n):
    """Return the n-bit value v with its byte order reversed."""
    lo_mask = 0xff
    hi_mask = 0xff << (n - 8)
    out = 0
    # Swap the byte at shift `s` with its mirror, moving inward.
    for shift in range(n - 8, -8, -16):
        out |= (v >> shift) & lo_mask
        out |= (v << shift) & hi_mask
        lo_mask <<= 8
        hi_mask >>= 8
    return out


class HuffmanTable:
    """A canonical Huffman table built from a (start, bits) bootstrap list."""

    table: List[HuffmanLength]

    def __init__(self, bootstrap):
        entries = []
        start, bits = bootstrap[0]
        for finish, endbits in bootstrap[1:]:
            # Zero-length runs produce no entries; -1 terminates the list.
            if bits:
                entries.extend(
                    HuffmanLength(code, bits) for code in range(start, finish))
            start, bits = finish, endbits
            if endbits == -1:
                break
        entries.sort()
        self.table = entries

    def populate_huffman_symbols(self):
        """Assign canonical code words and their bit-reversed variants."""
        width = -1
        symbol = -1
        for entry in self.table:
            symbol += 1
            if entry.bits != width:
                # Code length grew: shift the running symbol accordingly.
                symbol <<= (entry.bits - width)
                width = entry.bits
            entry.symbol = symbol
            entry.reverse_symbol = reverse_bits(symbol, width)

    def min_max_bits(self):
        """Cache the shortest and longest code lengths present."""
        self.min_bits = min([16, *(entry.bits for entry in self.table)])
        self.max_bits = max([-1, *(entry.bits for entry in self.table)])

    def _find_symbol(self, bits: int, symbol: int, table: Iterable[HuffmanLength]) -> int:
        # Linear scan for an exact (length, reversed code) match.
        for entry in table:
            if entry.bits == bits and entry.reverse_symbol == symbol:
                return entry.code
        return -1

    def find_next_symbol(self, field: LBitfield, reversed=True):
        """Consume one Huffman code word from `field`, returning its symbol.

        The table is sorted by code length, so bits are peeked once per
        distinct length and only consumed on a match.
        """
        snooped_width = -1
        snooped = None
        for entry in self.table:
            if snooped_width != entry.bits:
                snooped = field.snoopbits(entry.bits)
                snooped_width = entry.bits
            candidate = entry.reverse_symbol if reversed else entry.symbol
            if candidate == snooped:
                field.readbits(entry.bits)
                return entry.code
        raise RuntimeError(F'symbol not found even after end of table at {field.tell()}')


class OrderedHuffmanTable(HuffmanTable):
    """Huffman table built from a per-symbol list of code lengths."""

    def __init__(self, lengths):
        # Each symbol gets its own (index, length) pair; the sentinel with
        # bits == -1 terminates the bootstrap list.
        bootstrap = [*enumerate(lengths), (len(lengths), -1)]
        super().__init__(bootstrap)


# Order in which DEFLATE transmits code lengths for the code-length alphabet.
CODE_LENGTH_ORDERS = (
    0x10, 0x11, 0x12, 0x00, 0x08, 0x07, 0x09, 0x06, 0x0A, 0x05,
    0x0B, 0x04, 0x0C, 0x03, 0x0D, 0x02, 0x0E, 0x01, 0x0F)

# Base distance for each DEFLATE distance code (extra bits are added on top).
DISTANCE_BASE = (
    0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0007, 0x0009, 0x000D, 0x0011, 0x0019,
    0x0021, 0x0031, 0x0041, 0x0061, 0x0081, 0x00C1, 0x0101, 0x0181, 0x0201, 0x0301,
    0x0401, 0x0601, 0x0801, 0x0C01, 0x1001, 0x1801, 0x2001, 0x3001, 0x4001, 0x6001)

# Base match length for each DEFLATE length code 257..285.
LENGTH_BASE = (
    0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000A, 0x000B, 0x000D,
    0x000F, 0x0011, 0x0013, 0x0017, 0x001B, 0x001F, 0x0023, 0x002B, 0x0033, 0x003B,
    0x0043, 0x0053, 0x0063, 0x0073, 0x0083, 0x00A3, 0x00C3, 0x00E3, 0x0102)


def extra_distance_bits(n):
    """Number of extra bits following DEFLATE distance code n."""
    if n < 0 or n > 29:
        raise RuntimeError('illegal distance code')
    return 0 if n <= 1 else (n >> 1) - 1


def extra_length_bits(n):
    """Number of extra bits following DEFLATE length code n."""
    if 257 <= n <= 260 or n == 285:
        return 0
    if 261 <= n <= 284:
        return ((n - 257) >> 2) - 1
    raise RuntimeError('illegal length code')


def move_to_front(array: list, index):
    """Rotate array[index] to the front of the list, in place."""
    array.insert(0, array.pop(index))


def bwt_transform(data):
    """Compute the inverse-BWT transform vector for `data`."""
    ordered = bytearray(sorted(data))
    # First slot in the sorted column for each byte value.
    next_slot = [ordered.find(value) for value in range(256)]
    transform = [-1] * len(data)
    for position, byte in enumerate(data):
        transform[next_slot[byte]] = position
        next_slot[byte] += 1
    return transform


def bwt_reverse(data, end):
    """Invert the Burrows-Wheeler transform of `data`, starting at row `end`.

    The loop is bounded by len(data) rather than following the cycle until it
    closes: the original cycle-following construct looped forever when the
    output resolved to exact multiples of a repeated string (see the note in
    the upstream pyflate source).
    """
    out = bytearray(len(data))
    transform = bwt_transform(data)
    for i in range(len(data)):
        end = transform[end]
        out[i] = data[end]
    return out
class _DecompressionFile(abc.ABC):
    """File-like wrapper over a decompression state machine.

    Subclasses implement _readblock() to append one decompressed block to
    `current_block`; read() drains that buffer on demand.  Only reading is
    supported.
    """

    def readable(self) -> bool:
        return True

    def seekable(self) -> bool:
        return False

    def writable(self) -> bool:
        return False

    def write(self, __b):
        raise NotImplementedError

    # Underlying compressed stream.
    data: BinaryIO
    # Bit-level reader over `data` (set by subclasses).
    bits: BitfieldBase
    # True when parsing the NSIS variants of the stream/block headers.
    nsis: bool
    # Set once the final block has been consumed.
    done: bool
    # Decompressed bytes not yet handed out by read().
    current_block: bytearray

    def __init__(self, data: BinaryIO, nsis: bool = True):
        self.data = data
        self.nsis = nsis
        self.done = False
        self.current_block = bytearray()

    def readall(self) -> bytes:
        return self.read()

    def readinto(self, __buffer):
        data = self.read(len(__buffer))
        size = len(data)
        __buffer[:size] = data
        return size

    def read(self, size=-1):
        # Keep decoding blocks until the buffer can satisfy `size`.
        # size=-1 (or any size >= len(buffer)) is never "in range", so this
        # drains the whole stream and returns everything.
        while size not in range(len(self.current_block)):
            if not self._readblock():
                break
        block = self.current_block
        if size < 0 or size >= len(block):
            self.current_block = bytearray()
            return block
        else:
            out = block[:size]
            del block[:size]
            return out

    @abc.abstractmethod
    def _readblock(self) -> bool:
        """Decode one block into current_block; False when the stream ends."""


class BZip2File(_DecompressionFile):
    """BZip2 decompressor supporting both standard streams and the NSIS
    variant (short 8-bit block headers, no stream magic, no CRCs)."""

    blocksize: int
    block_header_size: int
    # (data-block marker, end-of-stream marker).
    block_header_type: Tuple[int, int]
    current_block: bytearray

    def __init__(self, data: BinaryIO, nsis: bool = True):
        super().__init__(data, nsis)
        # BZip2 is MSB-first.
        self.bits = RBitfield(data)

        if nsis:
            # NSIS hard-codes the maximum block size and truncates the
            # block markers to a single byte.
            self.blocksize = 9
            self.block_header_size = 8
            self.block_header_type = (0x31, 0x17)
        else:
            if data.read(2) != b'BZ':
                raise RuntimeError('BZip2 header magic is missing')
            if self.bits.readbits(8) != ord('h'):
                raise RuntimeError('BZip2 header contains unknown compression method')
            blocksize = self.bits.readbits(8)
            # ASCII digit '1'..'9' encodes the block size multiplier.
            if 0x31 <= blocksize <= 0x39:
                blocksize = blocksize - 0x30
            else:
                raise RuntimeError('BZip2 header specifies invalid block size')
            self.blocksize = blocksize
            self.block_header_size = 48
            # Standard 48-bit "pi"/"sqrt(pi)" block markers.
            self.block_header_type = (0x314159265359, 0x177245385090)

        self.blocksize *= 100_000

    def _readblock(self):
        """Decode one BZip2 block: Huffman stage, MTF/RLE2 stage, inverse
        BWT, then the final byte-level RLE.  Returns False at stream end."""
        out = self.current_block
        if self.done:
            return False
        br = self.bits
        blocktype = br.readbits(self.block_header_size)
        if not self.nsis:
            _ = br.readbits(32) # crc
        if blocktype == self.block_header_type[0]:
            if not self.nsis and br.readbits(1):
                raise RuntimeError('BZip2 randomised support not implemented')
            # BWT origin pointer for this block.
            pointer = br.readbits(24)
            # Two-level bitmap of which of the 256 byte values occur.
            huffman_used_map = br.readbits(16)
            map_mask = 1 << 15
            used = []
            while map_mask > 0:
                if huffman_used_map & map_mask:
                    huffman_used_bitmap = br.readbits(16)
                    bit_mask = 1 << 15
                    while bit_mask > 0:
                        if huffman_used_bitmap & bit_mask:
                            pass
                        used += [bool(huffman_used_bitmap & bit_mask)]
                        bit_mask >>= 1
                else:
                    used += [False] * 16
                map_mask >>= 1
            huffman_groups = br.readbits(3)
            if not 2 <= huffman_groups <= 6:
                raise RuntimeError('BZip2 number of Huffman groups not in range 2..6')
            # Selectors: unary-coded, MTF-decoded table choice per 50 symbols.
            selectors_used = br.readbits(15)
            mtf = list(range(huffman_groups))
            selectors_list = []
            for i in range(selectors_used):
                c = 0
                while br.readbits(1):
                    c += 1
                    if c >= huffman_groups:
                        raise RuntimeError('BZip2 chosen selector greater than number of groups (max 6)')
                if c >= 0:
                    move_to_front(mtf, c)
                selectors_list += mtf[0:1]
            groups_lengths = []
            # Symbols: the used byte values, plus RUNA/RUNB, plus EOB.
            symbols_in_use = sum(used) + 2  # remember RUN[AB] RLE symbols
            for _ in range(huffman_groups):
                # Delta-coded code lengths: start value, then +/-1 steps.
                length = br.readbits(5)
                lengths = []
                for i in range(symbols_in_use):
                    if not 0 <= length <= 20:
                        raise RuntimeError('BZip2 Huffman length code outside range 0..20')
                    while br.readbits(1):
                        length -= (br.readbits(1) * 2) - 1
                    lengths += [length]
                groups_lengths += [lengths]

            tables = []
            for g in groups_lengths:
                codes = OrderedHuffmanTable(g)
                codes.populate_huffman_symbols()
                codes.min_max_bits()
                tables.append(codes)

            # MTF list of the byte values actually present in this block.
            favourites = [y for y, x in enumerate(used) if x]
            selector_pointer = 0
            decoded = 0
            repeat = repeat_power = 0
            buffer = bytearray()
            t = None
            while True:
                # Switch Huffman tables every 50 decoded symbols.
                decoded -= 1
                if decoded <= 0:
                    decoded = 50
                    # NOTE(review): `<=` allows selector_pointer to index one
                    # past the end of selectors_list; upstream pyflate has the
                    # same comparison - confirm before changing.
                    if selector_pointer <= len(selectors_list):
                        t = tables[selectors_list[selector_pointer]]
                        selector_pointer += 1
                r = t.find_next_symbol(br, False)
                if 0 <= r <= 1:
                    # RUNA/RUNB: accumulate a bijective base-2 run length.
                    if repeat == 0:
                        repeat_power = 1
                    repeat += repeat_power << r
                    repeat_power <<= 1
                    continue
                elif repeat > 0:
                    # Flush the pending run of the current front symbol.
                    buffer.extend(itertools.repeat(favourites[0], repeat))
                    repeat = 0
                if r == symbols_in_use - 1:
                    # End-of-block symbol.
                    break
                else:
                    # MTF decode: symbol r maps to favourites[r - 1].
                    o = favourites[r - 1]
                    move_to_front(favourites, r - 1)
                    buffer.append(o)
            # RLE step
            nt = bwt_reverse(buffer, pointer)
            done = bytearray()
            n = len(nt)
            i = 0
            while i < n:
                # Four equal bytes are followed by an extra-count byte.
                if i < n - 4 and nt[i] == nt[i + 1] == nt[i + 2] == nt[i + 3]:
                    done.extend(itertools.repeat(nt[i], nt[i + 4] + 4))
                    i += 5
                else:
                    done.append(nt[i])
                    i += 1
            out.extend(done)
            return True
        elif blocktype == self.block_header_type[1]:
            # End-of-stream marker.
            br.align()
            self.done = True
            return False
        else:
            raise RuntimeError(
                F'unknown BZip2 block value 0x{blocktype:0{self.block_header_size // 4}X}')


class GZipFile(_DecompressionFile):
    """DEFLATE decompressor for GZip streams and raw NSIS deflate data
    (nsis=True skips the GZip header, trailer, and length checks)."""

    def __init__(self, data: BinaryIO, nsis: bool = True):
        super().__init__(data, nsis)
        # DEFLATE is LSB-first.
        br = self.bits = LBitfield(data)
        if not nsis and self.data.read(2) != b'\x1F\x8B':
            raise RuntimeError('Unknown (not 1F8B) header')
        if not nsis and br.readbits(8) != 8:
            raise RuntimeError('Unknown (not type 8 DEFLATE) compression method')
        if not nsis:
            # Parse the optional GZip header fields.
            self.flags = br.readbits(8)
            self.mtime = br.readbits(32)
            self.extra_flags = br.readbits(8)
            self.os_type = br.readbits(8)
            self.file_name = ''
            self.comment = ''

            if self.flags & 0x04:
                # structured GZ_FEXTRA miscellaneous data
                xlen = br.readbits(16)
                br.dropbytes(xlen)
            while self.flags & 0x08:
                # original GZ_FNAME filename
                cc = br.readbits(8)
                if not cc:
                    break
                self.file_name += chr(cc)
            while self.flags & 0x10:
                # human readable GZ_FCOMMENT
                cc = br.readbits(8)
                if not cc:
                    break
                self.comment += chr(cc)
            if self.flags & 0x02:
                # header-only GZ_FHCRC checksum
                br.readbits(16)

    def _readblock(self) -> bool:
        """Decode one DEFLATE block (stored, static Huffman, or dynamic
        Huffman) into current_block.  Returns False after the final block."""
        if self.done:
            return False
        br = self.bits
        out = self.current_block
        lastbit = br.readbits(1)
        blocktype = br.readbits(2)

        def _error_unused(msg):
            # Helper returning (not raising) the error for reserved codes.
            return RuntimeError(F'illegal unused {msg} in use at {br.tell()}')

        if blocktype == 0:
            # Stored (uncompressed) block: byte-aligned LEN/NLEN then data.
            br.align()
            length = br.readbits(16)
            # NLEN is the one's complement of LEN, so LEN & NLEN must be 0.
            if not self.nsis and 0 != length & br.readbits(16):
                raise RuntimeError('stored block lengths do not match each other')
            # Fast path: byte-aligned with an empty bit buffer.
            if not br.bits:
                it = self.data.read(length)
            else:
                it = (br.readbits(8) for _ in range(length))
            out.extend(it)

        elif blocktype == 1 or blocktype == 2:
            main_literals, main_distances = None, None

            if blocktype == 1: # Static Huffman
                static_huffman_bootstrap = [(0, 8), (144, 9), (256, 7), (280, 8), (288, -1)]
                static_huffman_lengths_bootstrap = [(0, 5), (32, -1)]
                main_literals = HuffmanTable(static_huffman_bootstrap)
                main_distances = HuffmanTable(static_huffman_lengths_bootstrap)

            elif blocktype == 2: # Dynamic Huffman
                len_codes = br.readbits(5)
                literals = len_codes + 257
                distances = br.readbits(5) + 1
                code_lengths_length = br.readbits(4) + 4
                table = [0] * 19
                for i in range(code_lengths_length):
                    table[CODE_LENGTH_ORDERS[i]] = br.readbits(3)
                dynamic_codes = OrderedHuffmanTable(table)
                dynamic_codes.populate_huffman_symbols()
                dynamic_codes.min_max_bits()

                # Decode the code_lengths for both tables at once,
                # then split the list later

                code_lengths = []
                n = 0
                while n < (literals + distances):
                    r = dynamic_codes.find_next_symbol(br)
                    if 0 <= r <= 15: # literal bitlength for this code
                        count = 1
                        what = r
                    elif r == 16: # repeat last code
                        count = 3 + br.readbits(2)
                        # Is this supposed to default to '0' if in the zeroth position?
                        what = code_lengths[-1]
                    elif r == 17: # repeat zero
                        count = 3 + br.readbits(3)
                        what = 0
                    elif r == 18: # repeat zero lots
                        count = 11 + br.readbits(7)
                        what = 0
                    else:
                        raise RuntimeError('next code length is outside of the range 0 <= r <= 18')
                    code_lengths += [what] * count
                    n += count

                main_literals = OrderedHuffmanTable(code_lengths[:literals])
                main_distances = OrderedHuffmanTable(code_lengths[literals:])

            main_literals.populate_huffman_symbols()
            main_distances.populate_huffman_symbols()
            main_literals.min_max_bits()
            main_distances.min_max_bits()
            literal_count = 0

            while True:
                r = main_literals.find_next_symbol(br)
                if 0 <= r <= 255:
                    # Plain literal byte.
                    literal_count += 1
                    out.append(r)
                elif r == 256:
                    # End-of-block symbol.
                    if literal_count > 0:
                        literal_count = 0
                    break
                elif 257 <= r <= 285: # dictionary lookup
                    if literal_count > 0:
                        literal_count = 0
                    length_extra = br.readbits(extra_length_bits(r))
                    length = LENGTH_BASE[r - 257] + length_extra

                    r1 = main_distances.find_next_symbol(br)
                    if 0 <= r1 <= 29:
                        distance = DISTANCE_BASE[r1] + br.readbits(extra_distance_bits(r1))
                        # Back-reference copy; when length > distance the
                        # copy overlaps its own output, so repeat the tail.
                        while length > distance:
                            out += out[-distance:]
                            length -= distance
                        if length == distance:
                            out += out[-distance:]
                        else:
                            out += out[-distance:length - distance]
                    elif 30 <= r1 <= 31:
                        raise _error_unused('distance symbol')
                elif 286 <= r <= 287:
                    raise _error_unused('literal/length symbol')
        elif blocktype == 3:
            raise _error_unused('blocktype')

        if lastbit:
            self.done = True
            br.align()
            try:
                # GZip trailer; absent in raw NSIS streams, hence best-effort.
                _ = br.readbits(32) # crc
                _ = br.readbits(32) # length
            except Exception:
                if not self.nsis:
                    raise
            return False
        else:
            return True
# ---------------------------------------------------------------------------
# (file boundary: src/debloat/processor.py begins here)
# ---------------------------------------------------------------------------
"""
This file handles the processing of binaries and helper methods.

Three methods rely heavily on parts of Binary Refinery
https://github.com/binref/refinery
Copyright 2019 Jesko Hüttenhain under the 3-Clause BSD License
The methods are:
    refinery_strip()
    adjust_offsets()
    refinery_trim_resources()
The RSRC Class is also from refinery.
"""
from pathlib import Path
import re
from typing import Tuple, Optional, Any, Callable, List
import pefile
import binascii
import zlib
from pefile import Structure, SectionStructure, DIRECTORY_ENTRY
from typing import Generator, Iterable, Optional

import debloat.utilities.nsisParser as nsisParser
import debloat.utilities.rsrc as rsrc

DEBLOAT_VERSION = "1.6.5"

# Human-readable description for each numeric result code produced by the
# various debloating strategies.
RESULT_CODES = {
    0: "No Solution found.",
    1: "Junk after signature.",
    2: "Single repeated byte in overlay.",
    3: "Pattern in overlay.",
    4: "Sets of repeated bytes in overlay.",
    5: "NSIS Installer.",
    6: "Bloat in PE resources",
    7: "Bloat in PE section",
    8: "Bloat in .NET resource",
    9: "Non-essential, high entropy overlay",
    10: "High compression with bytes at end.",
    11: ".NET Single File with junk",
    12: "Packed file with bloated section",
    13: "Random overlay with high compression",
    14: "Junk interspersed with data",
    15: "VMProtected junk",
    16: "InnoSetup Installer",
    17: "Junk in the certificate",
    18: "SFX Archive",
    19: "Electron Application"
}


_KB = 1000
_MB = _KB * _KB
# NOTE(review): readable_size() below uses 1024-based divisors while _KB/_MB
# are 1000-based; they serve different call sites - confirm before unifying.

def readable_size(value: int) -> str:
    '''Return bytes in human readable format (1024-based units).'''
    if value <= 1024:
        return '%s bytes' % value
    elif value < 1024 * 1024:
        return '%.1f KB' % (float(value) / 1024.0)
    elif value < 1024 * 1024 * 1024:
        return '%.1f MB' % (float(value) / 1024.0 / 1024.0)
    else:
        return '%.1f GB' % (float(value) / 1024.0 / 1024.0 / 1024.0)

def write_multiple_files(out_path: str,
                        files: list, log_message: Callable[[str], None]) -> None:
    '''
    Writes multiple files to disk when applicable.

    Keyword Arguments:
    out_path -- directory the unpacked files are written into
    files -- objects with `.path` (Windows-style) and `.data` attributes
    log_message -- callback used to report progress to the UI/CLI
    '''
    log_message("Installer unpacked!\n")
    log_message(f"The files are being written to {out_path}")
    for file in files:
        # Archive paths use backslashes; normalize so Path nests correctly
        # on every platform.
        out_file_path = Path(out_path) / Path(file.path.replace("\\", "/"))
        out_dir_path = out_file_path.parent
        out_dir_path.mkdir(parents=True, exist_ok=True)
        with open(out_file_path, "wb") as f:
            f.write(file.data)
        log_message("File: " + str(Path(file.path.replace("\\", "/"))))
    log_message("")
    log_message("The user will need to determine which file is malicious if any.")
    log_message("If a file is bloated: resubmit it through the tool to debloat it.")


def write_patched_file(out_path: str,
                       pe: pefile.PE) -> Tuple[int, str]:
    '''Writes the patched file to disk.

    Keyword Arguments:
    out_path -- the path and file name to write
    pe -- the pefile that is being processed

    Returns (final file size, path written).
    '''
    # FIX: pe.write() rebuilds the whole image; the previous version called
    # it twice (once to write, once for len), doubling the work for large
    # files.  Serialize once and reuse the buffer.
    patched = pe.write()
    with open(out_path, 'wb') as writer:
        writer.write(patched)
    return len(patched), out_path

def handle_signature_abnormality(signature_address: int,
                                 signature_size: int,
                                 beginning_file_size: int,
                                 data_to_delete: List) -> Tuple[bool, int]:
    '''Detect junk after (or inside) a PE Authenticode signature.

    Appends the offending (start, end) span to data_to_delete and returns
    (abnormality_found, result_code): 17 for junk within the certificate,
    1 for junk after the signature, 0 for nothing suspicious.
    '''
    # If the signature_address is 0, there was no original signature.
    # We are setting the signature address to the filesize in order to
    # skip the next check.
    if signature_address == 0:
        signature_address = beginning_file_size
    # Check to see if there is data after the signature; if so, it is
    # junk data
    signature_abnormality = False
    if signature_size > (beginning_file_size - signature_size):
        # The "signature" occupies more than half the file: bloat hidden
        # inside the certificate itself.
        result_code = 17
        signature_abnormality = True
    elif beginning_file_size > (signature_address + signature_size):
        result_code = 1
        signature_abnormality = True

    if signature_abnormality is True:
        data_to_delete.append((signature_address + signature_size, beginning_file_size))
    else:
        result_code = 0
    return signature_abnormality, result_code

def check_and_extract_NSIS(possible_header: bytearray, pe: pefile.PE) -> Optional[list]:
    '''Check if the PE is an NSIS installer.

    Returns the list of extracted files, or None when the archive marker is
    not present (FIX: the annotation previously claimed `list` while a bare
    `return` produced None implicitly).
    '''
    extractor = nsisParser.extractNSIS()
    confirm_if_nsis = extractor._find_archive_offset(memoryview(possible_header))
    if confirm_if_nsis is None:
        return None
    extracted_files = extractor.unpack(memoryview(pe.__data__))
    return extracted_files


def find_last_section(pe: pefile.PE) -> Optional[pefile.SectionStructure]:
    '''Iterate through PE sections to identify the last one (by raw offset).'''
    last_section = None
    for section in pe.sections:
        if last_section is None \
                or section.PointerToRawData > last_section.PointerToRawData:
            last_section = section
    return last_section

def get_signature_info(pe: pefile.PE, cert_preservation) -> Tuple[int, int]:
    '''Clear the PE security directory and return its (address, size).

    Keyword Arguments:
    pe -- the pefile being processed
    cert_preservation -- when truthy, keep the Size field so the cert can be
    re-attached later; the VirtualAddress is zeroed either way.
    '''
    security = pe.OPTIONAL_HEADER.DATA_DIRECTORY[pefile.DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_SECURITY']]
    signature_address = security.VirtualAddress
    signature_size = security.Size
    security.VirtualAddress = 0
    # If the cert is to be preserved, we do not need to modify the size in
    # the header.
    if not cert_preservation:
        security.Size = 0

    return signature_address, signature_size
def adjust_offsets(pe: pefile.PE, gap_offset: int, gap_size: int):
    """Remove a gap of `gap_size` bytes at `gap_offset` from a PE's metadata.

    Shrinks the containing section and walks every parsed pefile structure,
    shifting file offsets and RVA/VA attributes that point past the gap.
    (Ported from Binary Refinery; see module docstring.)
    """
    base = pe.OPTIONAL_HEADER.ImageBase
    alignment = pe.OPTIONAL_HEADER.FileAlignment
    rva_offset = pe.get_rva_from_offset(gap_offset)
    tva_offset = rva_offset + base

    section = pe.get_section_by_offset(gap_offset)
    new_section_size = section.SizeOfRawData - gap_size
    # The shrunken section must still respect file alignment.
    if new_section_size % alignment != 0:
        raise RuntimeError(
            F'trimming 0x{gap_size:X} bytes from section {(section.Name)} of size 0x{section.SizeOfRawData:X} '
            F'violates required section alignment of 0x{alignment:X} bytes')
    inside_section_offset = gap_offset - section.PointerToRawData
    if inside_section_offset > new_section_size:
        overlap = inside_section_offset - new_section_size
        raise RuntimeError(F'trimming from section {(section.Name)}; data extends {overlap} beyond section')

    # Only RVAs/VAs that land inside this section should be shifted.
    rva_lbound = section.VirtualAddress
    rva_ubound = section.VirtualAddress + section.Misc_VirtualSize - 1
    tva_lbound = rva_lbound + base
    tva_ubound = rva_ubound + base

    def adjust_attributes_of_structure(
        structure: Structure,
        threshold: int,
        valid_values_lower_bound: Optional[int],
        valid_values_upper_bound: Optional[int],
        attributes: Iterable[str]
    ):
        # Shift each named attribute down by gap_size when it points past
        # the gap and (optionally) lies within the given bounds.
        # NOTE(review): `threshold` is unused; the comparison below uses
        # gap_offset directly - confirm whether threshold was intended.
        for attribute in attributes:
            old_value = getattr(structure, attribute, 0)
            if old_value <= gap_offset:
                continue
            if valid_values_lower_bound is not None and old_value < valid_values_lower_bound:
                continue
            if valid_values_upper_bound is not None and old_value > valid_values_upper_bound:
                continue
            new_value = old_value - gap_size
            if new_value < gap_offset:
                raise RuntimeError(F'adjusting attribute {attribute} of {structure.name} would result in negative value: {new_value}')
            setattr(structure, attribute, new_value)

    it: Iterable[Structure] = iter(pe.__structures__)
    remove = []

    for index, structure in enumerate(it):
        old_offset = structure.get_file_offset()
        # NOTE(review): subtracting gap_offset looks suspicious - removing
        # gap_size bytes should shift later structures by gap_size
        # (new_offset = old_offset - gap_size).  Confirm against the
        # upstream Binary Refinery implementation before changing.
        new_offset = old_offset - gap_offset

        if old_offset > gap_offset:
            if old_offset < gap_offset + gap_size:
                # Structure lives inside the removed gap: drop it entirely.
                remove.append(index)
                continue
            if isinstance(structure, SectionStructure) and new_offset % alignment != 0:
                raise RuntimeError(
                    F'section {(structure.Name)} would be moved to offset 0x{new_offset:X}, '
                    F'violating section alignment value 0x{alignment:X}.')
            structure.set_file_offset(new_offset)

        try:
            # File-offset-like RVA attributes, bounded to the section.
            adjust_attributes_of_structure(structure, rva_offset, rva_lbound, rva_ubound, (
                'OffsetToData',
                'AddressOfData',
                'VirtualAddress',
                'AddressOfNames',
                'AddressOfNameOrdinals',
                'AddressOfFunctions',
                'AddressOfEntryPoint',
                'AddressOfRawData',
                'BaseOfCode',
                'BaseOfData',
            ))
            # Absolute (image-base-relative) VA attributes.
            adjust_attributes_of_structure(structure, tva_offset, tva_lbound, tva_ubound, (
                'StartAddressOfRawData',
                'EndAddressOfRawData',
                'AddressOfIndex',
                'AddressOfCallBacks',
            ))
            # Raw file offsets (unbounded).
            adjust_attributes_of_structure(structure, gap_offset, None, None, (
                'OffsetModuleName',
                'PointerToRawData',
            ))
        except Exception as e:
            # NOTE(review): silently skips structures that fail to adjust;
            # consider at least logging `e` - confirm this is intentional.
            continue

        # NOTE(review): this loop only tests hasattr and then does nothing -
        # dead code, presumably a stub for debug-directory offsets; confirm.
        for attribute in (
            'CvHeaderOffset',
            'OffsetIn2Qwords',
            'OffsetInQwords',
            'Offset',
            'OffsetLow',
            'OffsetHigh'
        ):
            if not hasattr(structure, attribute):
                continue

    # Delete from the back so earlier indices stay valid.
    while remove:
        index = remove.pop()
        pe.__structures__[index:index + 1] = []

    section.SizeOfRawData = new_section_size
    return pe
refinery_strip(data: memoryview, alignment=1, block_size=_MB) -> int: 260 | if not data: 261 | return 0 262 | threshold = 0.15 263 | data_overhang = len(data) % alignment 264 | result = data_overhang 265 | 266 | if 0 < threshold < 1: 267 | def compression_ratio(offset: int): 268 | ratio = len(zlib.compress(data[:offset], level=1)) / offset 269 | return ratio 270 | upper = len(data) 271 | lower = result 272 | 273 | if compression_ratio(upper) <= threshold: 274 | while block_size < upper - lower: 275 | pivot = (lower + upper) // 2 276 | ratio = compression_ratio(pivot) 277 | if ratio > threshold: 278 | lower = pivot + 1 279 | continue 280 | upper = pivot 281 | if abs(ratio - threshold) < 1e-10: 282 | break 283 | result = upper 284 | while result > 1 and data[result - 2] == data[result -1]: 285 | result -= 1 286 | 287 | result = max(result, data_overhang) 288 | 289 | result = result + (data_overhang - result) % alignment 290 | 291 | if result > len(data): 292 | excess = result - len(data) 293 | excess = excess + (-excess % alignment) 294 | result = result - excess 295 | 296 | return result 297 | 298 | 299 | def refinery_trim_resources(pe: pefile.PE, data_to_delete: List) -> int: 300 | size_limit = 10000 301 | size_removed = 0 302 | 303 | def find_bloated_resources(pe: pefile.PE, directory, level: int = 0, *path) -> Generator[Structure, None, None]: 304 | for entry in directory.entries: 305 | name = getattr(entry, 'name') 306 | numeric_id = getattr(entry, 'id') 307 | if not name: 308 | if level == 0 and numeric_id in iter(rsrc.RSRC): 309 | name = rsrc.RSRC(entry.id) 310 | elif numeric_id is not None: 311 | name = str(numeric_id) 312 | name = name and str(name) or '?' 
def refinery_trim_resources(pe: pefile.PE, data_to_delete: List) -> int:
    """Shrink bloated PE resources in place.

    Walks the resource directory, and for every resource entry larger than
    ``size_limit`` asks :func:`refinery_strip` how much of its data is junk.
    Trimmed byte ranges are appended to ``data_to_delete`` (file offsets in
    the *original* data) and the resource sizes / directory size are patched.

    Returns the total number of bytes removed.  (Fix: the function was
    annotated ``-> int`` but previously fell off the end and returned None.)
    """
    size_limit = 10000  # resources at or below this size are never trimmed
    size_removed = 0

    def find_bloated_resources(pe: pefile.PE, directory, level: int = 0, *path) -> Generator[Structure, None, None]:
        # Depth-first walk yielding (joined_name, data_struct) for every
        # leaf resource whose Size exceeds size_limit.
        for entry in directory.entries:
            name = getattr(entry, 'name')
            numeric_id = getattr(entry, 'id')
            if not name:
                if level == 0 and numeric_id in iter(rsrc.RSRC):
                    # Top-level numeric IDs map to well-known resource types.
                    name = rsrc.RSRC(entry.id)
                elif numeric_id is not None:
                    name = str(numeric_id)
            name = name and str(name) or '?'
            if entry.struct.DataIsDirectory:
                yield from find_bloated_resources(pe, entry.directory, level + 1, *path, name)
                continue
            struct: Structure = entry.data.struct
            name = '/'.join((*path, name))
            if struct.Size <= size_limit:
                continue
            yield name, struct

    RSRC_INDEX = DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_RESOURCE']
    pe.parse_data_directories(directories=[RSRC_INDEX])

    try:
        resources = pe.DIRECTORY_ENTRY_RESOURCE
    except AttributeError:
        # No resource directory present: nothing to trim.
        return 0
    for name, resource in find_bloated_resources(pe, resources):
        offset = pe.get_offset_from_rva(resource.OffsetToData)
        # Offset may be modified from debloating a previous resource, so map
        # it back to the offset in the original (untrimmed) file data.
        original_offset = offset
        for slice_start, slice_end in data_to_delete:
            if slice_start <= original_offset:
                original_offset += slice_end - slice_start
        old_size = resource.Size
        new_size = refinery_strip(memoryview(pe.__data__)[original_offset:original_offset + old_size], pe.OPTIONAL_HEADER.FileAlignment)
        gap_size = old_size - new_size
        if gap_size <= 0:
            continue
        resource.Size = new_size
        adjust_offsets(pe, offset + new_size, gap_size)
        size_removed += gap_size
        data_to_delete.append((original_offset + new_size, original_offset + old_size))

    pe.OPTIONAL_HEADER.DATA_DIRECTORY[RSRC_INDEX].Size -= size_removed
    return size_removed
def get_compressed_size(data: memoryview, offset: int, level: int = -1):
    """Length in bytes of the zlib stream for ``data[:offset]``.

    Small inputs are compressed in one shot; larger ones are streamed through
    a compressor object in 1 KiB chunks so the compressed output is never
    materialized, only counted.
    """
    if offset <= 1024:
        return len(zlib.compress(data[:offset], level=level))

    compressor = zlib.compressobj(level=level)
    total = 0
    full_chunks = offset // 1024
    for i in range(full_chunks):
        total += len(compressor.compress(data[i * 1024:(i + 1) * 1024]))
    remainder = offset % 1024
    if remainder:
        start = full_chunks * 1024
        total += len(compressor.compress(data[start:start + remainder]))
    # flush() emits whatever the compressor is still buffering.
    total += len(compressor.flush())
    return total
def check_section_compression(pe: pefile.PE, data_to_delete: List,
                              log_message: Callable[[str], None]) -> Tuple[str, int]:
    """Find the largest PE section and, when it looks bloated, trim it.

    Logs a per-section compression summary, then dispatches on the biggest
    section: resource sections go through refinery_trim_resources, .NET
    resources in .text are reported as unsupported, and any other highly
    compressible section is trimmed with trim_junk.

    Returns a (message, result_code) pair.  (Fixes: ``biggest_uncompressed``
    was initialized to the *type* ``int`` rather than a number, which raised
    TypeError for a PE with no qualifying comparison and crashed on an empty
    section table; the return annotation claimed ``Tuple[pefile.PE, int, str]``
    although a (str, int) pair is returned.)
    """
    biggest_section = None
    biggest_uncompressed = 0.0  # fix: was `int` (the type object)
    result = ""
    for section in pe.sections:
        section_name = section.Name.decode("utf8", errors="backslashreplace")
        compressed_section_size = get_compressed_size(
            memoryview(pe.__data__)[section.PointerToRawData : section.PointerToRawData+section.SizeOfRawData],
            section.SizeOfRawData
        )
        # Percentage: >100 means the section shrank under compression.
        section_compression_ratio = section.SizeOfRawData / compressed_section_size * 100
        log_message("Section: " + section_name, end="\t", flush=True)
        log_message(" Compression Ratio: " + str(round(section_compression_ratio, 2)) +"%", end="\t",flush=True)
        log_message("Size of section: " + readable_size(section.SizeOfRawData) +".",flush=True)
        if biggest_section is None or section.SizeOfRawData > biggest_section.SizeOfRawData:
            biggest_section = section
            biggest_uncompressed = section_compression_ratio
    if biggest_section is None:
        # fix: a PE without sections previously crashed on biggest_section.Name
        return result, 0
    # Handle specific bloated sections
    if biggest_section.Name.decode("utf8", errors="backslashreplace") == ".rsrc\x00\x00\x00":
        # Get biggest resource or resources and drop them from the
        # Resource table
        log_message('''
Bloat was located in the resource section. Removing bloat..
''')
        refinery_trim_resources(pe, data_to_delete)
        result_code = 6  # Bloated resource
        return result, result_code

    elif biggest_section.Name.decode("utf8", errors="backslashreplace") == ".text\x00\x00\x00" and biggest_uncompressed > 3000:
        # Data stored in the .text section is often a .NET Resource. The
        # following checks to confirm it is .NET and then drops the resources.
        if pe.OPTIONAL_HEADER.DATA_DIRECTORY[14].Size:
            log_message('''
Bloat was detected in the text section. Bloat is likely in a .NET Resource
This use case cannot be processed at this time. ''')
            result_code = 0  # No solution
            return result, result_code
    if biggest_uncompressed > 3000:
        log_message('''
The compression ratio of ''' + biggest_section.Name.decode("utf8", errors="backslashreplace") + ''' is indicative of a bloated section.
''', end="", flush=True)
        # Get the size of the section.
        biggest_section_end = biggest_section.PointerToRawData + biggest_section.SizeOfRawData
        original_section_size = biggest_section.SizeOfRawData
        biggest_section_data = memoryview(pe.__data__)[biggest_section.PointerToRawData:biggest_section_end]
        delta_last_non_junk, result_code = trim_junk(pe, biggest_section_data, original_section_size)
        # Remove the junk from the section.
        if delta_last_non_junk > original_section_size:
            log_message("Section was not able to be reduced.")
            result_code = 0
            return result, result_code
        data_to_delete.append((biggest_section.PointerToRawData + delta_last_non_junk, biggest_section_end))

        section_bytes_to_remove = original_section_size - delta_last_non_junk
        # Adjust all offsets for the file.
        adjust_offsets(pe, biggest_section.PointerToRawData, section_bytes_to_remove)
        log_message("Bloated section reduced.")
        result_code = 7  # Bloated PE section
        return result, result_code

    # If no bloat was found, return an expected return value
    result_code = 0  # No solution
    return result, result_code
def find_chunk_start(targeted_regex, chunk_start, original_size_with_junk, bloated_content: memoryview, step):
    """Advance ``chunk_start`` over trailing junk that matches a hex pattern.

    Works from the end of ``bloated_content`` backwards in ``step``-sized
    windows: each window is reversed, hex-encoded, and searched with
    ``targeted_regex``; the match length (in hex characters) advances the
    cursor.  On the first window that fails to match, the cursor backs off
    two steps so that real data is not consumed, and the scan stops.
    """
    content_len = len(bloated_content)
    pattern = re.compile(targeted_regex)
    window_end = chunk_start
    while original_size_with_junk > window_end:
        window_end = chunk_start + step
        tail = bytes(bloated_content[max(content_len - window_end, 0):content_len - chunk_start])
        match = pattern.search(binascii.hexlify(tail[::-1]))
        if match is None:
            # The previous window had content which did not match; rewind
            # two steps to make sure we do not remove too much of the file.
            chunk_start -= step * 2
            break
        chunk_start += match.end(0)
    return chunk_start
def trim_junk(pe: "pefile.PE", bloated_content: memoryview,
              original_size_with_junk: int) -> Tuple[int, int]:
    '''Attempt multiple methods to trim junk from the end of a section.

    Returns (delta_last_non_junk, result_code): the number of leading bytes
    to keep, padded to the PE file alignment, and a code describing which
    trimming strategy applied (2 = single repeated byte run, 3 = pattern via
    refinery_strip, 4 = sets of repeated bytes).

    Fixes: the return annotation claimed ``-> int`` for a 2-tuple; the final
    alignment pad added a full ``alignment`` even when the size was already
    aligned (now pads only the shortfall, matching refinery_strip); removed
    the unused ``targeted_regex`` local and the dead ``if not result_code``
    guard (every branch assigns result_code).
    '''
    alignment = pe.OPTIONAL_HEADER.FileAlignment

    delta_last_non_junk = original_size_with_junk
    # First Method: Trims 1 repeating byte.
    # Check against 600 reversed tail bytes for a two-byte pattern repeated
    # more than 20 times; if found, measure the full run below.
    junk_match = re.search(rb'^(..)\1{20,}', bytes(bloated_content[:-601:-1]))
    chunk_start = 0
    if not junk_match:
        # Second method: remove junk using refinery_strip. This method
        # is more efficient than a previous check that was used here.
        delta_last_non_junk = refinery_strip(bloated_content, alignment)
        result_code = 3  # Pattern in overlay.
    else:
        # First method continued: walk backwards in 1000-byte windows while
        # they consist purely of the detected repeating pattern.
        bloated_content_len = len(bloated_content)
        pattern_hex = binascii.hexlify(junk_match.group(1))
        precompiled_chunk = pattern_hex * int(1000 / len(junk_match.group(1)))
        chunk_end = chunk_start
        while original_size_with_junk > chunk_end:
            chunk_end = chunk_start + 1000
            chunk = binascii.hexlify(bytes(bloated_content[max(bloated_content_len - chunk_end, 0):bloated_content_len - chunk_start])[::-1])
            if chunk == precompiled_chunk:
                chunk_start += 1000
                continue
            # If the chunk does not match the precompiled chunk, return to
            # the previous chunk_start so important bytes are not removed.
            if chunk_start > 1000:
                chunk_start -= 1000
            break
        junk_to_remove = chunk_start

        # Third Method: check for a series of one repeated byte.
        # If the trimming did not remove more than half of the bytes then
        # this suggests the attacker may have put a random series of
        # repeated bytes. We use refinery_strip for efficiency.
        if junk_to_remove * 2 < original_size_with_junk / 2:
            delta_last_non_junk = refinery_strip(bloated_content, alignment)
            junk_to_remove = 0  # refinery_strip already accounts for the junk
            result_code = 4  # Sets of repeated bytes in overlay.
        else:
            result_code = 2  # Single repeated byte in overlay
        delta_last_non_junk -= junk_to_remove

    # The returned size must account for the file alignment: pad up to the
    # next multiple only when misaligned (previously an already-aligned size
    # was inflated by a full extra alignment unit).
    delta_last_non_junk += -delta_last_non_junk % alignment
    return delta_last_non_junk, result_code
def process_pe(pe: pefile.PE, out_path: str, last_ditch_processing: bool,
               cert_preservation: bool, log_message: Callable[[str], None],
               beginning_file_size: int = 0) -> int:
    '''Prepare PE, perform checks, remove junk, write patched binary.

    Top-level driver: computes the signature range, dispatches to the
    overlay / NSIS / section trimming strategies, and finally rebuilds the
    file without the byte ranges collected in data_to_delete.

    Returns an integer result code describing which strategy applied
    (0 means no automated method worked).
    '''
    result_code = 0
    if not beginning_file_size:
        # No size supplied by the caller; serialize once to measure it.
        beginning_file_size = len(pe.write())

    # Remove Signature and modify size of Optional Header Security entry.
    signature_address, signature_size = get_signature_info(pe, cert_preservation)

    if cert_preservation == True:
        # Keep a copy of the Authenticode blob so it can be re-appended after
        # the bloat between it and the real data has been removed.
        cert = [(signature_address, signature_address + signature_size)]
        certData = memoryview(pe.__data__)[signature_address:signature_address + signature_size]
        data_to_delete = [(signature_address, signature_address + signature_size)]
    else:
        if signature_size > 0:
            log_message("""A certificate is being removed from this file.\n-To preserve the certificate use the Cert Preservation option.""")
        data_to_delete = [(signature_address, signature_address + signature_size)]

    signature_abnormality, result_code = handle_signature_abnormality(signature_address,
                                                                      signature_size,
                                                                      beginning_file_size,
                                                                      data_to_delete)
    # If the signature itself accounts for a large share of the file, skip
    # straight to the rebuild step below.
    if signature_abnormality is True and sum(slice_end-slice_start for slice_start, slice_end in data_to_delete) >= (beginning_file_size * 0.1):
        pass
    # Handle Overlays: this includes packers and overlays which are completely junk
    elif pe.get_overlay_data_start_offset() and signature_size < len(pe.__data__) - pe.get_overlay_data_start_offset():
        possible_header = pe.__data__[pe.get_overlay_data_start_offset():pe.get_overlay_data_start_offset() + 20_000]
        # Check first to see if the file is NSIS
        nsis_extracted = check_and_extract_NSIS(possible_header, pe)
        if nsis_extracted:
            # NSIS installers are fully extracted instead of trimmed.
            write_multiple_files(out_path, nsis_extracted, log_message)
            result_code = 5  # NSIS Installer
            return result_code

        else:
            log_message("Attempting dynamic trim...")
            last_section = find_last_section(pe)
            if last_section is None:
                log_message("Unable to process. This may indicate the file is malformed.")
                return 0
            overlay = memoryview(pe.__data__)[last_section.PointerToRawData + last_section.SizeOfRawData:signature_address or beginning_file_size]

            # The following checks a sample of the overlay to determine if it will be able to be removed.
            overlay_compression_sample = get_compressed_size(memoryview(overlay)[-2000:], 2000)
            sample_compression = beginning_file_size / overlay_compression_sample
            file_size_wo_overlay = len(memoryview(pe.__data__)[:last_section.PointerToRawData + last_section.SizeOfRawData])
            if sample_compression > 400000:
                # Overlay tail is extremely compressible: treat it as junk.
                required_data_from_overlay, result_code = trim_junk(pe, overlay, beginning_file_size)
                end_of_real_data = file_size_wo_overlay + required_data_from_overlay
                data_to_delete.append(((file_size_wo_overlay + required_data_from_overlay), beginning_file_size))

            else:
                # Overlay looks like real data; fall back to section analysis.
                result, result_code = check_section_compression(pe, data_to_delete, log_message=log_message)
                if len(data_to_delete) == 1:
                    end_of_real_data = beginning_file_size
                else:
                    result_code = 12  # Packed with junk in section
                    end_of_real_data = beginning_file_size - sum(slice_end-slice_start for slice_start, slice_end in data_to_delete)

            if end_of_real_data > beginning_file_size * 0.9:
                # Less than 10% would be removed: only proceed on explicit request.
                if last_ditch_processing is True:
                    log_message("""
"Last ditch" switch detected. Running last ditch debloat technique:\n
This is the last resort that removes the whole overlay: this works in cases where the overlay lacks a pattern.
However, if the file does not run after this, it is in indicator that this method removed critical data.
""")
                    end_of_real_data = last_section.PointerToRawData + last_section.SizeOfRawData
                    data_to_delete.append((end_of_real_data, beginning_file_size))
                else:
                    log_message("""
Overlay was unable to be trimmed. Try unpacking with UniExtract2 or re-running
Debloat with the "--last-ditch" parameter."""
                    )
            elif result_code == 12:
                # The end was already determined and no more data needs to be removed.
                pass
            else:
                data_to_delete.append((end_of_real_data, beginning_file_size))
    # Handle bloated sections
    # TODO: break up into functions
    else:
        # In order to solve some use cases, we will find the biggest section
        # within the binary.
        result, result_code = check_section_compression(pe, data_to_delete, log_message=log_message)
        log_message(result)
    # All processing is done. Report results.
    # There is always the signature in the list
    if len(data_to_delete) == 0 or sum(slice_end-slice_start for slice_start, slice_end in data_to_delete) <= (beginning_file_size * 0.1):
        log_message("""No automated method for reducing the size worked. Please consider sharing the
sample for additional analysis.
Email: Squiblydoo@pm.me
Twitter: @SquiblydooBlog.
""")
        result_code = 0
        return result_code
    else:
        # Rebuild the file, skipping every collected (start, end) slice.
        pe_data = bytearray()
        start = 0
        for slice_start, slice_end in sorted(data_to_delete):
            pe_data += bytearray(pe.__data__[start:slice_start])
            start = slice_end
        pe_data += bytearray(pe.__data__[start:beginning_file_size])
        if cert_preservation == True and signature_size > 0:
            # NOTE(review): result_code 17 appears to be produced by
            # handle_signature_abnormality (not visible here) — confirm.
            if result_code == 17:
                log_message("Certificate is being used for junk and will be removed.")
            else:
                # Re-append the preserved certificate and point the security
                # directory at its new location at the end of the file.
                pe_data += certData
                pe.OPTIONAL_HEADER.DATA_DIRECTORY[pefile.DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_SECURITY']].VirtualAddress = len(pe_data) - signature_size

        pe.__data__ = pe_data
        final_filesize, new_pe_name = write_patched_file(out_path,
                                                         pe)
        reduction_calculation = round(((beginning_file_size \
                                        - final_filesize) \
                                        / beginning_file_size) * 100, 2)
        log_message("Beginning File size: " \
                    + readable_size(beginning_file_size) + ".")
        log_message("File was reduced by " \
                    + str(reduction_calculation) + "%.")
        log_message("Final file size: " \
                    + readable_size(final_filesize) + ".")
        log_message("Processing complete.\nFile written to '" \
                    + str(new_pe_name) + "'.")
        return result_code
class UnpackResult:
    """One extracted archive member: a path, its data (possibly lazy), and
    arbitrary metadata (``None``-valued metadata keys are dropped)."""

    def get_data(self) -> Union[bytes, bytearray, memoryview]:
        # Fix: the original tested `Callable(self.data)`; typing.Callable is
        # not a predicate (subscripting/calling it raises TypeError), so every
        # call to get_data crashed. The builtin callable() detects a lazy
        # loader, which is materialized and cached on first access.
        if callable(self.data):
            self.data = self.data()
        return self.data

    def __init__(self, _br__path: str, _br__data: Union[Union[bytes, bytearray, memoryview], Callable[[], Union[bytes, bytearray, memoryview]]], **_br__meta):
        self.path = _br__path
        self.data = _br__data
        self.meta = _br__meta
        # Strip metadata entries whose value is None (self.meta aliases
        # _br__meta, so the deletions are reflected in self.meta).
        for key in [key for key, value in _br__meta.items() if value is None]:
            del _br__meta[key]


class ArchiveUnit:
    """Base options holder for archive extractors (port of Binary Refinery's
    ArchiveUnit; parameter names — including ``list`` — are kept for
    compatibility with the original interface)."""

    def __init__(self, *paths,
                 list=False, join_path=False,
                 drop_path=False, fuzzy=0, exact=False,
                 regex=False,
                 date=b'date',
                 path=b'path', **kwargs):
        self.paths = paths
        self.list = list
        self.join_path = join_path
        self.drop_path = drop_path
        self.fuzzy = fuzzy
        self.exact = exact
        self.regex = regex
        self.path = path
        # Fix: a trailing comma (`self.date = date,`) turned this into a
        # one-element tuple, breaking every later use of the attribute.
        self.date = date
        self.kwargs = kwargs

    def _pack(
        self,
        path: str,
        date: Optional[Union[datetime, str]],
        data: Union[Union[bytes, bytearray, memoryview], Callable[[], Union[bytes, bytearray, memoryview]]],
        **meta
    ) -> UnpackResult:
        """Wrap one member as an UnpackResult, recording its date under the
        configured metadata key."""
        if isinstance(date, datetime):
            date = date.isoformat(' ', 'seconds')
        if isinstance(date, str):
            # Fix: the original referenced self.args.date / self.codec, which
            # are never defined on this class (left over from the Binary
            # Refinery port) and raised AttributeError; use the key stored in
            # __init__ instead.
            meta[self.date.decode('utf-8')] = date
        return UnpackResult(path, data, **meta)
class DeflateFile(io.RawIOBase):
    """Readable file object that inflates a raw-deflate stream read from a
    MemoryFile on the fly. Constructing it actually yields an
    io.BufferedReader wrapping this raw object (see __new__)."""

    data: MemoryFile
    # NOTE(review): annotation says zlib.decompress, but the attribute holds a
    # zlib.decompressobj instance (created with wbits=-15 for raw deflate).
    dc: zlib.decompress

    def __new__(cls, data: MemoryFile):
        # Return a BufferedReader over the raw stream instead of the raw
        # object itself, so callers get buffering and readline() for free.
        self = super().__new__(cls)
        self.data = data
        self.dc = zlib.decompressobj(-15)
        return io.BufferedReader(self)

    def readall(self) -> bytes:
        return self.read()

    def readinto(self, __buffer):
        # RawIOBase protocol: fill the caller's buffer, return the byte count.
        data = self.read(len(__buffer))
        size = len(data)
        __buffer[:size] = data
        return size

    def read(self, size=-1):
        # Prefer compressed input left over from the previous call; only hit
        # the underlying stream when the tail is exhausted.
        buffer = self.dc.unconsumed_tail or self.data.read(size)
        kwargs = {}
        if size > 0:
            # Cap the decompressed output; surplus input stays in
            # unconsumed_tail for the next read.
            kwargs.update(max_length=size)
        return self.dc.decompress(buffer, **kwargs)

    def readable(self) -> bool:
        return True

    def seekable(self) -> bool:
        return False

    def writable(self) -> bool:
        return False

    def write(self, __b):
        raise NotImplementedError


class LZMAOptions(NamedTuple):
    # Decoder options parsed from an NSIS LZMA stream header.
    filter_flag: bool
    dictionary_size: int


class NSBlockHeaderOffset(Struct):
    """Offset/size pair for one NSIS header block; offsets are 8 bytes in
    64-bit installers, 4 bytes otherwise."""
    def __init__(self, reader: StructReader, is64bit: bool):
        self.offset = reader.u64() if is64bit else reader.u32()
        self.size = reader.u32()


class NSMethod(str, enum.Enum):
    """Compression methods an NSIS installer can use for its payload."""
    Copy = 'COPY'
    LZMA = 'LZMA'
    BZip2 = 'BZIP2'
    Deflate = 'DEFLATE'
    NSGzip = 'NsisGzip'
class Op(enum.IntEnum):
    """NSIS script instruction opcodes.

    Values come from enum.auto(), so the member ORDER defines the numeric
    opcode IDs — do not reorder or insert members. Commented-out members
    document opcodes from NSIS/Park variants that occupy no slot here.
    """
    INVALID_OPCODE = 0        # noqa
    RET = enum.auto()         # noqa; Return
    NOP = enum.auto()         # noqa; Nop, Goto
    ABORT = enum.auto()       # noqa; Abort
    QUIT = enum.auto()        # noqa; Quit
    CALL = enum.auto()        # noqa; Call, InitPluginsDir
    UPDATETEXT = enum.auto()  # noqa; DetailPrint
    SLEEP = enum.auto()       # noqa; Sleep
    BRINGTOFRONT = enum.auto()       # noqa; BringToFront
    CHDETAILSVIEW = enum.auto()      # noqa; SetDetailsView
    SETFILEATTRIBUTES = enum.auto()  # noqa; SetFileAttributes
    CREATEDIR = enum.auto()    # noqa; CreateDirectory, SetOutPath
    IFFILEEXISTS = enum.auto() # noqa; IfFileExists
    SETFLAG = enum.auto()      # noqa; SetRebootFlag, ...
    IFFLAG = enum.auto()       # noqa; IfAbort, IfSilent, IfErrors, IfRebootFlag
    GETFLAG = enum.auto()      # noqa; GetInstDirError, GetErrorLevel
    RENAME = enum.auto()       # noqa; Rename
    GETFULLPATHNAME = enum.auto()  # noqa; GetFullPathName
    SEARCHPATH = enum.auto()       # noqa; SearchPath
    GETTEMPFILENAME = enum.auto()  # noqa; GetTempFileName
    EXTRACTFILE = enum.auto()  # noqa; File
    DELETEFILE = enum.auto()   # noqa; Delete
    MESSAGEBOX = enum.auto()   # noqa; MessageBox
    RMDIR = enum.auto()        # noqa; RMDir
    STRLEN = enum.auto()       # noqa; StrLen
    ASSIGNVAR = enum.auto()    # noqa; StrCpy
    STRCMP = enum.auto()       # noqa; StrCmp
    READENVSTR = enum.auto()   # noqa; ReadEnvStr, ExpandEnvStrings
    INTCMP = enum.auto()       # noqa; IntCmp, IntCmpU
    INTOP = enum.auto()        # noqa; IntOp
    INTFMT = enum.auto()       # noqa; IntFmt/Int64Fmt
    PUSHPOP = enum.auto()      # noqa; Push/Pop/Exchange
    FINDWINDOW = enum.auto()   # noqa; FindWindow
    SENDMESSAGE = enum.auto()  # noqa; SendMessage
    ISWINDOW = enum.auto()     # noqa; IsWindow
    GETDLGITEM = enum.auto()   # noqa; GetDlgItem
    SETCTLCOLORS = enum.auto() # noqa; SetCtlColors
    SETBRANDINGIMAGE = enum.auto()  # noqa; SetBrandingImage / LoadAndSetImage
    CREATEFONT = enum.auto()   # noqa; CreateFont
    SHOWWINDOW = enum.auto()   # noqa; ShowWindow, EnableWindow, HideWindow
    SHELLEXEC = enum.auto()    # noqa; ExecShell
    EXECUTE = enum.auto()      # noqa; Exec, ExecWait
    GETFILETIME = enum.auto()  # noqa; GetFileTime
    GETDLLVERSION = enum.auto()  # noqa; GetDLLVersion
    # GETFONTVERSION = enum.auto()  # noqa; Park : 2.46.2
    # GETFONTNAME = enum.auto()     # noqa; Park : 2.46.3
    REGISTERDLL = enum.auto()    # noqa; RegDLL, UnRegDLL, CallInstDLL
    CREATESHORTCUT = enum.auto() # noqa; CreateShortCut
    COPYFILES = enum.auto()      # noqa; CopyFiles
    REBOOT = enum.auto()         # noqa; Reboot
    WRITEINI = enum.auto()       # noqa; WriteINIStr, DeleteINISec, DeleteINIStr, FlushINI
    READINISTR = enum.auto()     # noqa; ReadINIStr
    DELREG = enum.auto()         # noqa; DeleteRegValue, DeleteRegKey
    WRITEREG = enum.auto()       # noqa; WriteRegStr, WriteRegExpandStr, WriteRegBin, WriteRegDWORD
    READREGSTR = enum.auto()     # noqa; ReadRegStr, ReadRegDWORD
    REGENUM = enum.auto()        # noqa; EnumRegKey, EnumRegValue
    FCLOSE = enum.auto()         # noqa; FileClose
    FOPEN = enum.auto()          # noqa; FileOpen
    FPUTS = enum.auto()          # noqa; FileWrite, FileWriteByte
    FGETS = enum.auto()          # noqa; FileRead, FileReadByte
    # Park:
    # FPUTWS = enum.auto()  # noqa; FileWriteUTF16LE, FileWriteWord
    # FGETWS = enum.auto()  # noqa; FileReadUTF16LE, FileReadWord
    FSEEK = enum.auto()          # noqa; FileSeek
    FINDCLOSE = enum.auto()      # noqa; FindClose
    FINDNEXT = enum.auto()       # noqa; FindNext
    FINDFIRST = enum.auto()      # noqa; FindFirst
    WRITEUNINSTALLER = enum.auto()  # noqa; WriteUninstaller
    # Park : since 2.46.3 the log is enabled in main Park version
    # LOG = enum.auto()  # noqa; LogSet, LogText
    SECTIONSET = enum.auto()     # noqa; Get*, Set*
    INSTTYPESET = enum.auto()    # noqa; InstTypeSetText, InstTypeGetText, SetCurInstType, GetCurInstType
    # Before NSIS v3.06: Instructions not actually implemented in exehead, but used in compiler.
    # GETLABELADDR = enum.auto()  # noqa; both of these get converted to ASSIGNVAR
    # GETFUNCTIONADDR = enum.auto()  # noqa
    # In NSIS v3.06 and later it was changed to:
    GETOSINFO = enum.auto()      # noqa
    RESERVEDOPCODE = enum.auto() # noqa
    LOCKWINDOW = enum.auto()     # noqa; LockWindow
    # Two unicode commands available only in Unicode archive:
    FPUTWS = enum.auto()         # noqa; FileWriteUTF16LE, FileWriteWord
    FGETWS = enum.auto()         # noqa; FileReadUTF16LE, FileReadWord
    # Since NSIS v3.06 the following ID codes were moved here:
    # Opcodes listed here are not actually used in exehead.
    # No exehead opcodes should be present after these!
    # GETLABELADDR = enum.auto()  # noqa; ASSIGNVAR
    # GETFUNCTIONADDR = enum.auto()  # noqa; ASSIGNVAR
    # The following IDs are not IDs in real order.
    # We just need some IDs to translate any extended layout to main layout:
    LOG = enum.auto()            # noqa; LogSet, LogText
    # Park
    FINDPROC = enum.auto()       # noqa; FindProc
    GETFONTVERSION = enum.auto() # noqa; GetFontVersion
    GETFONTNAME = enum.auto()    # noqa; GetFontName

    @classmethod
    def from_int(cls, value: int) -> 'Op':
        # Unknown raw opcode values degrade to INVALID_OPCODE instead of
        # raising, so a corrupt script does not abort parsing.
        try:
            return cls(value)
        except ValueError:
            return cls.INVALID_OPCODE
# Number of argument slots each NSIS opcode consumes (out of the 6 stored
# per instruction); used when decoding/printing script instructions.
_Op_PARAMETER_COUNT = {
    Op.INVALID_OPCODE    : 0,
    Op.RET               : 0,
    Op.NOP               : 1,
    Op.ABORT             : 1,
    Op.QUIT              : 0,
    Op.CALL              : 2,
    Op.UPDATETEXT        : 6,
    Op.SLEEP             : 1,
    Op.BRINGTOFRONT      : 0,
    Op.CHDETAILSVIEW     : 2,
    Op.SETFILEATTRIBUTES : 2,
    Op.CREATEDIR         : 3,
    Op.IFFILEEXISTS      : 3,
    Op.SETFLAG           : 3,
    Op.IFFLAG            : 4,
    Op.GETFLAG           : 2,
    Op.RENAME            : 4,
    Op.GETFULLPATHNAME   : 3,
    Op.SEARCHPATH        : 2,
    Op.GETTEMPFILENAME   : 2,
    Op.EXTRACTFILE       : 6,
    Op.DELETEFILE        : 2,
    Op.MESSAGEBOX        : 6,
    Op.RMDIR             : 2,
    Op.STRLEN            : 2,
    Op.ASSIGNVAR         : 4,
    Op.STRCMP            : 5,
    Op.READENVSTR        : 3,
    Op.INTCMP            : 6,
    Op.INTOP             : 4,
    Op.INTFMT            : 4,
    Op.PUSHPOP           : 6,
    Op.FINDWINDOW        : 5,
    Op.SENDMESSAGE       : 6,
    Op.ISWINDOW          : 3,
    Op.GETDLGITEM        : 3,
    Op.SETCTLCOLORS      : 2,
    Op.SETBRANDINGIMAGE  : 4,
    Op.CREATEFONT        : 5,
    Op.SHOWWINDOW        : 4,
    Op.SHELLEXEC         : 6,
    Op.EXECUTE           : 3,
    Op.GETFILETIME       : 3,
    Op.GETDLLVERSION     : 4,
    Op.REGISTERDLL       : 6,
    Op.CREATESHORTCUT    : 6,
    Op.COPYFILES         : 4,
    Op.REBOOT            : 1,
    Op.WRITEINI          : 5,
    Op.READINISTR        : 4,
    Op.DELREG            : 5,
    Op.WRITEREG          : 6,
    Op.READREGSTR        : 5,
    Op.REGENUM           : 5,
    Op.FCLOSE            : 1,
    Op.FOPEN             : 4,
    Op.FPUTS             : 3,
    Op.FGETS             : 4,
    Op.FSEEK             : 4,
    Op.FINDCLOSE         : 1,
    Op.FINDNEXT          : 2,
    Op.FINDFIRST         : 3,
    Op.WRITEUNINSTALLER  : 4,
    Op.SECTIONSET        : 5,
    Op.INSTTYPESET       : 4,
    Op.GETOSINFO         : 6,
    Op.RESERVEDOPCODE    : 2,
    Op.LOCKWINDOW        : 1,
    Op.FPUTWS            : 4,
    Op.FGETWS            : 4,
    Op.LOG               : 2,
    Op.FINDPROC          : 2,
    Op.GETFONTVERSION    : 2,
    Op.GETFONTNAME       : 2,
}
# Maps NSIS shell-folder constant indices (CSIDL-style values) to the
# symbolic names used when rendering $SHELL string codes.
NS_SHELL_STRINGS = {
    0x00: 'DESKTOP',
    0x01: 'INTERNET',
    0x02: 'SMPROGRAMS',
    0x03: 'CONTROLS',
    0x04: 'PRINTERS',
    0x05: 'DOCUMENTS',
    0x06: 'FAVORITES',
    0x07: 'SMSTARTUP',
    0x08: 'RECENT',
    0x09: 'SENDTO',
    0x0A: 'BITBUCKET',
    0x0B: 'STARTMENU',
    0x0D: 'MUSIC',
    0x0E: 'VIDEOS',
    0x10: 'DESKTOP',
    0x11: 'DRIVES',
    0x12: 'NETWORK',
    0x13: 'NETHOOD',
    0x14: 'FONTS',
    0x15: 'TEMPLATES',
    0x16: 'STARTMENU',
    0x17: 'SMPROGRAMS',
    0x18: 'SMSTARTUP',
    0x19: 'DESKTOP',
    0x1A: 'APPDATA',
    0x1B: 'PRINTHOOD',
    0x1C: 'LOCALAPPDATA',
    0x1D: 'ALTSTARTUP',
    0x1E: 'ALTSTARTUP',
    0x1F: 'FAVORITES',
    0x20: 'INTERNET_CACHE',
    0x21: 'COOKIES',
    0x22: 'HISTORY',
    0x23: 'APPDATA',
    0x24: 'WINDIR',
    0x25: 'SYSDIR',
    0x26: 'PROGRAM_FILES',
    0x27: 'PICTURES',
    0x28: 'PROFILE',
    0x29: 'SYSTEMX86',
    0x2A: 'PROGRAM_FILESX86',
    0x2B: 'PROGRAM_FILES_COMMON',
    0x2C: 'PROGRAM_FILES_COMMONX8',
    0x2D: 'TEMPLATES',
    0x2E: 'DOCUMENTS',
    0x2F: 'ADMINTOOLS',
    0x30: 'ADMINTOOLS',
    0x31: 'CONNECTIONS',
    0x35: 'MUSIC',
    0x36: 'PICTURES',
    0x37: 'VIDEOS',
    0x38: 'RESOURCES',
    0x39: 'RESOURCES_LOCALIZED',
    0x3A: 'COMMON_OEM_LINKS',
    0x3B: 'CDBURN_AREA',
    0x3D: 'COMPUTERSNEARME',
}
"INSTDIR", 378 | "OUTDIR", 379 | "EXEDIR", 380 | "LANGUAGE", 381 | "TEMP", 382 | "PLUGINSDIR", 383 | "EXEPATH", # NSIS 2.26+ 384 | "EXEFILE", # NSIS 2.26+ 385 | "HWNDPARENT", 386 | "CLICK", # set from page->clicknext 387 | "OUTDIR", # NSIS 2.04+ 388 | ) 389 | 390 | class NSHeaderFlags(enum.IntFlag): 391 | Undefined = 0 392 | Uninstall = 1 393 | Silent = 2 394 | NoCrc = 4 395 | ForceCrc = 8 396 | LongOffset = 16 397 | ExternalFileSupport = 32 398 | ExternalFile = 64 399 | IsStubInstaller = 128 400 | 401 | 402 | 403 | class NSType(enum.IntEnum): 404 | Nsis2 = 0 405 | Nsis3 = enum.auto() 406 | Park1 = enum.auto() 407 | Park2 = enum.auto() 408 | Park3 = enum.auto() 409 | 410 | class NSScriptInstruction(Struct): 411 | def __init__(self, reader: StructReader): 412 | self.opcode = reader.u32() 413 | self.arguments = [reader.u32() for _ in range(6)] 414 | 415 | class NSScriptExtendedInstruction(Struct): 416 | def __init__(self, reader: StructReader): 417 | self.opcode = reader.u32() 418 | self.arguments = [reader.u32() for _ in range(8)] 419 | 420 | 421 | class NSCharCode(enum.IntEnum): 422 | NONE = 0 423 | CHAR = enum.auto() 424 | SKIP = enum.auto() 425 | SHELL = enum.auto() 426 | VAR = enum.auto() 427 | LANG = enum.auto() 428 | 429 | @property 430 | def special(self): 431 | return self > NSCharCode.CHAR 432 | 433 | @dataclasses.dataclass 434 | class NSItem: 435 | offset: int 436 | name: Optional[str] = None 437 | mtime: Optional[datetime] = None 438 | is_compressed: bool = True 439 | is_uninstaller: bool = False 440 | attributes: Optional[int] = None 441 | size: Optional[int] = None 442 | compressed_size: Optional[int] = None 443 | estimated_size: Optional[int] = None 444 | dictionary_size: int = 1 445 | patch_size: int = 0 446 | prefix: Optional[str] = None 447 | 448 | @property 449 | def path(self) -> str: 450 | path = self.name 451 | if self.prefix: 452 | path = F'{self.prefix}\\{path}' 453 | return path 454 | 455 | def __str__(self) -> str: 456 | return self.name 457 
@dataclasses.dataclass
class NSItem:
    """Metadata for one file entry inside an NSIS installer payload."""
    offset: int
    name: Optional[str] = None
    mtime: Optional[datetime] = None
    is_compressed: bool = True
    is_uninstaller: bool = False
    attributes: Optional[int] = None
    size: Optional[int] = None
    compressed_size: Optional[int] = None
    estimated_size: Optional[int] = None
    dictionary_size: int = 1
    patch_size: int = 0
    prefix: Optional[str] = None

    @property
    def path(self) -> str:
        # Join the directory prefix (when present) with a backslash, as NSIS
        # paths are Windows-style.
        if not self.prefix:
            return self.name
        return F'{self.prefix}\\{self.name}'

    def __str__(self) -> str:
        return self.name

    def __eq__(self, other) -> bool:
        # Two items are equal when every descriptive attribute (including the
        # computed path) matches; non-NSItem values compare unequal.
        if not other or not isinstance(other, self.__class__):
            return False
        attributes = (
            'offset', 'mtime', 'is_compressed', 'is_uninstaller',
            'attributes', 'size', 'compressed_size', 'estimated_size',
            'dictionary_size', 'patch_size', 'path',
        )
        return all(getattr(self, a) == getattr(other, a) for a in attributes)
elif self.type is NSType.Nsis2: 523 | lookup = { 524 | 0x00FC: NSCharCode.SKIP, 525 | 0x00FD: NSCharCode.VAR, 526 | 0x00FE: NSCharCode.SHELL, 527 | } 528 | else: 529 | raise ValueError(F'Unknown NSIS type {self.type}.') 530 | return lookup.get(char, NSCharCode.NONE) 531 | 532 | def _string_code_shell(self, index1: int, 533 | index2: Optional[int] = None) -> str: 534 | if index2 is None: 535 | index2 = index1 >> 8 536 | index1 &= 0xFF 537 | if index1 & 0x80 != 0: 538 | offset = index1 & 0x3F 539 | with StreamDetour(self.strings, offset): 540 | if self.strings.tell() != offset: 541 | raise ValueError(F'Failed to detour to offset 0x{offset:02X}.') 542 | path = self._read_current_string() 543 | if path.startswith('ProgramFilesDir'): 544 | return '$PROGRAMFILES' 545 | if path.startswith('CommonFilesDir'): 546 | return '$COMMONFILES' 547 | suffix = 32 * (index1 >> 5 & 2) 548 | return F'$REG{suffix}({path})' 549 | for index in (index1, index2): 550 | shell = NS_SHELL_STRINGS.get(index) 551 | if shell is not None: 552 | return F'$SHELL:{shell}' 553 | else: 554 | return F'Error:$SHELL:{index1:02X}{index2:02X}' 555 | 556 | def _string_code_variable(self, index: int) -> str: 557 | varcount = 20 + len(NS_VARIABLE_STRINGS) 558 | if self._is_nsis200: 559 | varcount -= 3 560 | elif self._is_nsis225: 561 | varcount -= 2 562 | if index < 20: 563 | if index >= 10: 564 | return F'$R{index - 10}' 565 | return F'$V{index}' 566 | else: 567 | if index < varcount: 568 | if self._is_nsis225 and index >= self.NS_EXEPATH: 569 | index += 2 570 | try: 571 | variable = NS_VARIABLE_STRINGS[index - 20] 572 | except IndexError: 573 | return F'Error:$V:{index}' 574 | else: 575 | return F'${variable}' 576 | return F'Error:$V:{index}' 577 | 578 | def _string_code_language(self, index: int) -> str: 579 | return F'$LANGUAGE:{index}' 580 | 581 | @property 582 | def _read_char(self) -> str: 583 | return self.strings.u16 if self.unicode else self.strings.u8 584 | 585 | def _seek_to_string(self, position: 
int) -> bool: 586 | pos = position * self.charsize 587 | return self.strings.seek(pos) == pos 588 | 589 | def _read_string(self, position: int) -> Optional[str]: 590 | if position < 0: 591 | return self._string_code_language(-(position + 1)) 592 | if not self._seek_to_string(position): 593 | return None 594 | return self._read_current_string() 595 | 596 | def _read_string_raw(self, position: int) -> Optional[str]: 597 | if not self._seek_to_string(position): 598 | return None 599 | if self.unicode: 600 | return self.strings.read_w_string() 601 | else: 602 | return self.strings.read_c_string() 603 | 604 | def _is_var_absolute_path(self, position: int) -> bool: 605 | var = self._get_var_index(position) 606 | if var is None: 607 | return False 608 | return var in ( 609 | self.NS_INSTDIR, 610 | self.NS_EXEDIR, 611 | self.NS_TEMP, 612 | self.NS_PLUGINSDIR, 613 | ) 614 | 615 | def _is_good_string(self, position: int) -> bool: 616 | if position == 0: 617 | return False 618 | if not self._seek_to_string(position - 1): 619 | return False 620 | prefix = self._read_char() 621 | return prefix == 0 or prefix == self.BACKSLASH 622 | 623 | def _is_var_str(self, position: int, index: int) -> bool: 624 | if index > 0x7FFF: 625 | return False 626 | var_index = self._get_var_index(position) 627 | if var_index is None: 628 | return False 629 | if self._get_resource_finished(position, 0) is None: 630 | return False 631 | return var_index == index 632 | 633 | def _get_var_index(self, position: int) -> Optional[int]: 634 | if not self._seek_to_string(position): 635 | raise LookupError(F'Failed to seek to string at position 0x{position:08X}.') 636 | try: 637 | code = self._read_char() 638 | if self._get_char_code(code) is not NSCharCode.VAR: 639 | return None 640 | arg1 = self._read_char() 641 | if arg1 == 0: 642 | return None 643 | if self.unicode: 644 | args = arg1, 645 | else: 646 | arg2 = self._read_char() 647 | if arg2 == 0: 648 | return None 649 | args = arg1, arg2 650 | return 
self._string_args_to_single_arg(*args) 651 | except EOFError: 652 | return None 653 | 654 | def _get_resource(self, position: int) -> Optional[int]: 655 | if self.unicode: 656 | if len(self.strings) - position >= 4: 657 | return 2 658 | else: 659 | if len(self.strings) - position >= 3: 660 | return 3 661 | return None 662 | 663 | def _get_resource_finished(self, position: int, 664 | terminator: int) -> Optional[int]: 665 | if not self._seek_to_string(position): 666 | return None 667 | self.strings.seek_relative(3) 668 | if self.unicode: 669 | self.strings.seek_relative(1) 670 | if self.strings.remaining_bytes < self.charsize: 671 | return None 672 | if self._read_char() != terminator: 673 | return None 674 | return 3 if self.unicode else 4 675 | 676 | 677 | @property 678 | def charsize(self) -> int: 679 | return 2 if self.unicode else 1 680 | 681 | def _read_current_string(self) -> str: 682 | string = io.StringIO() 683 | chars = iter(self._read_char, 0) 684 | for letter in chars: 685 | code = self._get_char_code(letter) 686 | if code is NSCharCode.CHAR: 687 | string.write(chr(letter)) 688 | continue 689 | if code.special: 690 | try: 691 | var1 = next(chars) 692 | except StopIteration: 693 | break 694 | if var1 == 0: 695 | break 696 | if code is NSCharCode.SKIP: 697 | letter = var1 698 | else: 699 | if not self.unicode: 700 | try: 701 | var2 = next(chars) 702 | except StopIteration: 703 | break 704 | if var2 == 0: 705 | break 706 | vars = var1, var2 707 | else: 708 | vars = var1, 709 | if code is NSCharCode.SHELL: 710 | string.write(self._string_code_shell(*vars)) 711 | continue 712 | else: 713 | var = self._string_args_to_single_arg(*vars) 714 | if code is NSCharCode.VAR: 715 | string.write(self._string_code_variable(var)) 716 | if code is NSCharCode.LANG: 717 | string.write(self._string_code_language(var)) 718 | continue 719 | string.write(chr(letter)) 720 | return string.getvalue() 721 | 722 | def opcode(self, cmd: NSScriptInstruction) -> Op: 723 | code = 
cmd.opcode 724 | if self.type < NSType.Park1: 725 | if self._log_cmd_is_enabled: 726 | return Op.from_int(code) 727 | if code < Op.SECTIONSET: 728 | return Op.from_int(code) 729 | if code is Op.SECTIONSET: 730 | return Op.LOG 731 | return Op.from_int(code - 1) 732 | if code < Op.REGISTERDLL: 733 | return Op.from_int(code) 734 | if self.type >= NSType.Park2: 735 | if code == Op.REGISTERDLL: 736 | return Op.GETFONTVERSION 737 | code -= 1 738 | if self.type >= NSType.Park3: 739 | if code == Op.REGISTERDLL: 740 | return Op.GETFONTNAME 741 | code -= 1 742 | if code >= Op.FSEEK: 743 | if self.unicode: 744 | if code == Op.FSEEK: 745 | return Op.FPUTWS 746 | if code == Op.FSEEK + 1: 747 | return Op.FGETWS 748 | code -= 2 749 | if code >= Op.SECTIONSET and self._log_cmd_is_enabled: 750 | if code == Op.SECTIONSET: 751 | return Op.LOG 752 | return Op.from_int(code - 1) 753 | if code == Op.FPUTWS: 754 | return Op.FINDPROC 755 | return Op.from_int(code) 756 | 757 | def _find_bad_cmd(self) -> None: 758 | self._bad_cmd = -1 759 | for instruction in self.instructions: 760 | cmd = self.opcode(instruction) 761 | arg = instruction.arguments 762 | if cmd is Op.INVALID_OPCODE: 763 | continue 764 | if cmd >= self._bad_cmd >= 0: 765 | continue 766 | if self.type is NSType.Nsis3: 767 | if cmd == Op.RESERVEDOPCODE: 768 | self._bad_cmd = cmd 769 | continue 770 | else: 771 | if cmd == Op.RESERVEDOPCODE or cmd == Op.GETOSINFO: 772 | self._bad_cmd = cmd 773 | continue 774 | last_non_empty_index = max((k for k, a in enumerate(arg, 1) if a), default=0) 775 | if cmd == Op.FINDPROC and last_non_empty_index == 0: 776 | self._bad_cmd = cmd 777 | continue 778 | if _Op_PARAMETER_COUNT[cmd] < last_non_empty_index: 779 | self._bad_cmd = cmd 780 | 781 | def _guess_nsis_version(self): 782 | self.strong_nsis = False 783 | self.strong_park = False 784 | char_mask = 0x8080 if self.unicode else 0x80 785 | self.strings.seek(0) 786 | while not self.strings.is_eof: 787 | string = self._read_current_string() 788 
| if string is None: 789 | continue 790 | if len(string) < 2: 791 | continue 792 | if ord(string[0]) != 3: 793 | continue 794 | if ord(string[1]) & char_mask == char_mask: 795 | self.type = NSType.Nsis3 796 | self.strong_nsis = True 797 | break 798 | if self.unicode: 799 | if not self.strong_nsis: 800 | self.type = NSType.Park1 801 | self.strong_park = True 802 | elif self.type is NSType.Nsis2: 803 | for instruction in self.instructions: 804 | cmd = self.opcode(instruction) 805 | arg = instruction.arguments 806 | if cmd is Op.GETDLGITEM: 807 | if self._is_var_str(arg[1], self.NS_HWNDPARENT_225): 808 | self._is_nsis225 = True 809 | if arg[0] == self.NS_OUTDIR_225: 810 | self._is_nsis200 = True 811 | break 812 | if cmd is Op.ASSIGNVAR: 813 | if arg[0] == self.NS_OUTDIR_225 and arg[2] == 0 and arg[3] == 0: 814 | self._is_nsis225 = self._is_var_str(arg[1], self.NS_OUTDIR) 815 | got_park_version = False 816 | mask = 0 817 | IN = 4 if self.unicode else 2 818 | if not self.strong_nsis and not self._is_nsis225 and not self._is_nsis200: 819 | for instruction in self.instructions: 820 | cmd = instruction.opcode 821 | arg = instruction.arguments 822 | alt = arg[3] 823 | if cmd < Op.WRITEUNINSTALLER or cmd > Op.WRITEUNINSTALLER + IN: 824 | continue 825 | if arg[4] != 0 or arg[5] != 0 or arg[0] <= 1 or alt <= 1: 826 | continue 827 | if not self._is_good_string(arg[0]) or not self._is_good_string(alt): 828 | continue 829 | index = self._get_var_index(alt) 830 | if index is None: 831 | continue 832 | additional = self._get_resource_finished(alt, self.BACKSLASH) 833 | if index != self.NS_INSTDIR: 834 | continue 835 | if self._read_string_raw(alt + additional) == self._read_string_raw(arg[0]): 836 | inserts = cmd - Op.WRITEUNINSTALLER.value 837 | mask |= 1 << inserts 838 | if mask == 1: 839 | got_park_version = True 840 | elif mask: 841 | shift = 0 842 | nt = self.type 843 | if self.unicode: 844 | shift = 2 845 | if mask == 1 << (shift + 1): 846 | nt = NSType.Park2 847 | if mask == 
1 << (shift + 2): 848 | nt = NSType.Park3 849 | if nt != self.type: 850 | got_park_version = True 851 | self.type = nt 852 | self._find_bad_cmd() 853 | if self._bad_cmd < Op.REGISTERDLL: 854 | return 855 | if self.strong_park and not got_park_version: 856 | if self._bad_cmd < Op.SECTIONSET: 857 | self.type = NSType.Park3 858 | self._log_cmd_is_enabled = True 859 | self._find_bad_cmd() 860 | if self._bad_cmd in range(Op.SECTIONSET): 861 | self.type = NSType.Park2 862 | self._log_cmd_is_enabled = False 863 | self._find_bad_cmd() 864 | if self._bad_cmd in range(Op.SECTIONSET): 865 | self.type = NSType.Park1 866 | self._find_bad_cmd() 867 | if self._bad_cmd >= Op.SECTIONSET: 868 | self._log_cmd_is_enabled = not self._log_cmd_is_enabled 869 | self._find_bad_cmd() 870 | if self._bad_cmd >= Op.SECTIONSET and self._log_cmd_is_enabled: 871 | self._log_cmd_is_enabled = False 872 | self._find_bad_cmd() 873 | 874 | def _read_items(self) -> List[NSItem]: 875 | prefixes = ['$INSTDIR'] 876 | out_dir = '' 877 | out_dir_index = ( 878 | self.NS_OUTDIR_225 879 | ) if self._is_nsis225 else ( 880 | self.NS_OUTDIR_226 881 | ) 882 | items: List[NSItem] = [] 883 | 884 | for cmd_index, instruction in enumerate(self.instructions): 885 | def set_path(index:int) -> None: 886 | item.prefix = None 887 | item.name = self._read_string(index) 888 | if not self._is_var_absolute_path(index): 889 | item.prefix = prefixes[-1] 890 | 891 | cmd = self.opcode(instruction) 892 | arg = instruction.arguments 893 | 894 | if cmd is Op.INVALID_OPCODE: 895 | continue 896 | elif cmd is Op.CREATEDIR: 897 | if not arg[1]: 898 | continue 899 | _path = arg[0] 900 | index = self._get_var_index(_path) 901 | if index in (out_dir_index, self.NS_OUTDIR): 902 | _path += self._get_resource(_path) 903 | path = self._read_string(_path) 904 | if index == out_dir_index: 905 | path = out_dir + path 906 | elif index == self.NS_OUTDIR: 907 | path = prefixes[-1] + path 908 | prefixes.append(path) 909 | elif cmd is Op.ASSIGNVAR: 910 
| if arg[0] != out_dir_index: 911 | continue 912 | if self._is_var_str(arg[1], self.NS_OUTDIR) and arg[2] == 0 and arg[3] == 0: 913 | out_dir = prefixes[-1] 914 | elif cmd is Op.EXTRACTFILE: 915 | try: 916 | time = datetime.fromtimestamp(arg[4] << 32 | arg[3]) 917 | except Exception: 918 | time = None 919 | item = NSItem(arg[2], mtime=time) 920 | set_path(arg[1]) 921 | items.append(item) 922 | if not self._is_var_str(arg[1], 10): 923 | continue 924 | cmd_back_offset = 28 925 | if cmd_index > 1: 926 | previous = self.instructions[cmd_index - 1] 927 | if self.opcode(previous) is Op.NOP: 928 | cmd_back_offset -= 2 929 | if cmd_index <= cmd_back_offset: 930 | continue 931 | previous = self.instructions[cmd_index - cmd_back_offset] 932 | if self.opcode(previous) is Op.ASSIGNVAR: 933 | previous_arguments = previous.arguments 934 | if previous_arguments[0] == 14 and previous_arguments[2] == 0 and previous_arguments[3] == 0: 935 | set_path(previous_arguments[1]) 936 | elif cmd is Op.SETFILEATTRIBUTES: 937 | if cmd_index > 0: 938 | previous = self.instructions[cmd_index - 1] 939 | previous_arguments = previous.arguments 940 | if self.opcode(previous) is Op.EXTRACTFILE and arg[0] == previous_arguments[1]: 941 | item = items[-1] 942 | item.attributes = arg[1] 943 | elif cmd is Op.WRITEUNINSTALLER: 944 | if arg[4] or arg[5] or arg[0] <=1 or arg[3] <= 1: 945 | continue 946 | if not self._is_good_string(arg[0]): 947 | continue 948 | if self._bad_cmd in range(Op.WRITEUNINSTALLER): 949 | continue 950 | item = NSItem(arg[1]) 951 | set_path(arg[0]) 952 | item.patch_size = arg[2] 953 | item.is_uninstaller = True 954 | items.append(item) 955 | return items 956 | 957 | @property 958 | def script(self): 959 | script = io.StringIO() 960 | name_width = max(len(op.name) for op in Op) 961 | addr_width = len(F'{len(self.instructions):X}') 962 | for k, instruction in enumerate(self.instructions): 963 | if k > 0: 964 | script.write('\n') 965 | opcode = self.opcode(instruction) 966 | 
script.write(F'{k:0{addr_width}X} {opcode.name:{name_width}}') 967 | for j, arg in enumerate(instruction.arguments[:_Op_PARAMETER_COUNT.get(opcode, 6)]): 968 | if j > 0: 969 | script.write(', ') 970 | if arg > 20 and self._is_good_string(arg): 971 | script.write(repr(self._read_string(arg))) 972 | elif arg < 0x100: 973 | script.write(str(arg)) 974 | elif arg < 0x10000: 975 | script.write(F'${arg:04X}') 976 | else: 977 | script.write(F'${arg:08X}') 978 | return script.getvalue() 979 | 980 | def _string_code_language(self, index: int) -> str: 981 | return F'$LANGUAGE:{index:04X}' 982 | 983 | 984 | def __init__(self, reader: StructReader[bytearray], size: int, extended: bool): 985 | self.is64bit = size >= 4 + 12 * 8 and not any( 986 | struct.unpack('8xI' * 8, reader.peek(12 * 8))) 987 | block_header_offset_size = 12 if self.is64bit else 8 988 | required_size = block_header_offset_size * 8 + 4 989 | if size < required_size: 990 | raise ValueError(F'Header size 0x{size:08X} is too small. Minimum required size is 0x{required_size:08X}.') 991 | # TODO: Confirm role of unknown value. Copilot believes it to be 992 | # a signature indicating the end of the NSIS installer header. 
993 | self.unknown_value = reader.u32() 994 | self.block_header_offsets = [NSBlockHeaderOffset( 995 | reader.read(block_header_offset_size), 996 | is64bit=self.is64bit) for _ in range(8)] 997 | self.block_header_entries = self.block_header_offsets[2] 998 | self.block_header_strings = self.block_header_offsets[3] 999 | self.block_header_langtables = self.block_header_offsets[4] 1000 | 1001 | for key, offset in enumerate(self.block_header_offsets): 1002 | width = 0x10 if self.is64bit else 8 1003 | table = {2: 'entries', 3: 'strings', 4: 'langtables'}.get(key) 1004 | message = F'Block {key}: offset=0x{offset.offset:0{width}X}, size=0x{offset.size:0{width}X}' 1005 | if table is not None: 1006 | message += F'{message} ({table})' 1007 | logging.debug(message) 1008 | 1009 | self.type = NSType.Nsis2 # Default to NSIS 2 1010 | 1011 | reader.seek_set(self.block_header_entries.offset) 1012 | InsnParser = NSScriptExtendedInstruction if extended else NSScriptInstruction 1013 | self.instructions: List[NSScriptInstruction] = [ 1014 | InsnParser(reader) for _ in range(self.block_header_entries.size)] 1015 | 1016 | if self.block_header_entries.offset > size: 1017 | raise ValueError(F'Header indicates {self.block_header_entries.size} entries, but only {size} bytes remain.') 1018 | if self.block_header_strings.offset > size: 1019 | raise ValueError(F'Header indicates {self.block_header_strings.size} strings, but only {size} bytes remain.') 1020 | if self.block_header_langtables.offset > size: 1021 | raise ValueError(F'Header indicates {self.block_header_langtables.size} langtables, but only {size} bytes remain.') 1022 | if self.block_header_langtables.offset < self.block_header_strings.offset: 1023 | raise ValueError(F'Langtables block is before strings block.') 1024 | string_table_size = self.block_header_langtables.offset - self.block_header_strings.offset 1025 | if string_table_size < 2: 1026 | raise ValueError(F'String table size is too small.') 1027 | 
reader.seek_set(self.block_header_strings.offset) 1028 | strings = reader.read(string_table_size) 1029 | self.unicode = strings[:2] == B'\0\0' 1030 | if strings[-1] != 0 or (self.unicode and strings[-2] != 0): 1031 | raise ValueError(F'String table is not null-terminated.') 1032 | if self.unicode and string_table_size % 2 != 0: 1033 | raise ValueError(F'String table is not even-sized.') 1034 | 1035 | self.strings = StructReader(strings) 1036 | if self.block_header_entries.size > (1 << 25): 1037 | raise ValueError(F'Header indicates {self.block_header_entries.size} entries, which is too large.') 1038 | 1039 | self._log_cmd_is_enabled = False 1040 | self._is_nsis225 = False 1041 | self._is_nsis200 = False 1042 | self_bad_cmd = -1 1043 | 1044 | self._guess_nsis_version() 1045 | 1046 | items: Dict[(str, int), NSItem] = {} 1047 | for item in self._read_items(): 1048 | if items.setdefault((item.path, item.offset), item) != item: 1049 | raise ValueError(F'Duplicate item: {item.path} at 0x{item.offset:08X}') 1050 | 1051 | self.items = [items[t] for t in sorted(items.keys())] 1052 | 1053 | @property 1054 | def nsis_deflate(self): 1055 | return self.type is not NSType.Nsis3 1056 | 1057 | @property 1058 | def encoding(self): 1059 | return 'utf-16' if self.unicode else 'latin1' 1060 | 1061 | @property 1062 | def charsize(self): 1063 | return 2 if self.unicode else 1 1064 | 1065 | 1066 | 1067 | class NSArchive(Struct): 1068 | MAGICS = [ 1069 | # https://nsis.sourceforge.io/Can_I_decompile_an_existing_installer 1070 | B'\xEF\xBE\xAD\xDE' B'Null' B'soft' B'Inst', # v1.6 1071 | B'\xEF\xBE\xAD\xDE' B'Null' B'Soft' B'Inst', # v1.3 1072 | B'\xED\xBE\xAD\xDE' B'Null' B'Soft' B'Inst', # v1.1 1073 | B'\xEF\xBE\xAD\xDE' B'nsis' B'inst' B'all\0', # v1.0 1074 | ] 1075 | 1076 | @dataclasses.dataclass 1077 | class Entry: 1078 | offset: int 1079 | data: bytearray 1080 | compressed_size: int 1081 | decompression_failed: bool = False 1082 | 1083 | 1084 | def __init__(self, reader: 
StructReader[bytearray]): 1085 | self.flags = NSHeaderFlags(reader.u32()) 1086 | self.signature = reader.read(0x10) 1087 | header_data = None 1088 | header_size = reader.u32() 1089 | header_data_length = None 1090 | archive_size = reader.u32() 1091 | self.archive_offset = reader.tell() 1092 | body_size = archive_size - self.archive_offset 1093 | if body_size < 0: 1094 | raise ValueError("Invalid archive size") 1095 | if header_size < self.archive_offset: 1096 | raise ValueError("Invalid header size") 1097 | if reader.remaining_bytes < body_size: 1098 | raise ValueError( 1099 | F'Header indicates archive size 0x{archive_size:08X}, ' 1100 | F'but only 0x{reader.remaining_bytes:08X} bytes remain.') 1101 | 1102 | 1103 | 1104 | # Preview_bytes and preview check will check the compression format. This takes 1105 | # a few bytes and checks the header to determine the format 1106 | 1107 | # Header Matching Logic: 1108 | # X is the header size as given by the first header 1109 | # T is a value less than 0xE 1110 | # Y is a value different from 0x80 1111 | # XX XX XX XX __ __ __ __ __ __ __ non-solid, uncompressed 1112 | # 00 00 00 00 00 00 00 00 XX XX XX XX non-solid, uncompressed, extended 1113 | # 5D 00 00 DD DD 00 __ __ __ __ __ solid LZMA 1114 | # 00 5D 00 00 DD DD 00 __ __ __ __ solid LZMA, empty filter 1115 | # 01 5D 00 00 DD DD 00 __ __ __ __ solid LZMA, BCJ filter 1116 | # __ __ __ 80 5D 00 00 DD DD 00 __ non-solid LZMA 1117 | # __ __ __ 80 00 5D 00 00 DD DD 00 non-solid LZMA, empty filter 1118 | # __ __ __ 80 01 5D 00 00 DD DD 00 non-solid LZMA, BCJ filter 1119 | # __ __ __ 80 01 0T __ __ __ __ __ non-solid BZip 1120 | # __ __ __ 80 __ __ __ __ __ __ __ non-solid deflate 1121 | # 01 0T __ YY __ __ __ __ __ __ __ solid BZip 1122 | # __ __ __ YY __ __ __ __ __ __ __ solid Deflate 1123 | 1124 | def lzmacheck(preview): 1125 | if B'\x5D\0\0' not in preview[:4]: 1126 | return False 1127 | filter_flag = preview_bytes[0] <= 1 1128 | reader.seek_relative(3 + 
int(filter_flag)) 1129 | self.lzma_options = LZMAOptions(filter_flag, reader.u32()) 1130 | return True 1131 | 1132 | def bzipcheck(preview): 1133 | return preview[0] == 0x31 and preview[1] < 14 1134 | 1135 | preview_bytes = bytes(reader.peek(16)) 1136 | preview_check = preview_bytes.find(header_size.to_bytes(4, byteorder='little')) 1137 | 1138 | # The default "solid" value is True and default method is deflate. 1139 | # Regarding Solid: 1140 | # "If /SOLID is used, all of the installer data is compressed in one block. This results in greater compression ratios." 1141 | # We determine if the compression is solid or not by checking the headers. 1142 | # https://nsis.sourceforge.io/Docs/Chapter4.html# 1143 | self.solid = True 1144 | self.extended = False 1145 | self.lzma_options: Optional[LZMAOptions] = None 1146 | self.method = NSMethod.Deflate 1147 | self.entries: Dict[int, bytearray] = {} 1148 | self.entry_offset_delta = 4 1149 | self._solid_iter = None 1150 | if preview_check >= 0: 1151 | header_data_length = header_size 1152 | self.method = NSMethod.Copy 1153 | self.solid = False 1154 | if not preview_check: 1155 | header_prefix_size = 0x04 1156 | elif preview_check == 8: 1157 | header_prefix_size = 0x10 1158 | self.extended = True 1159 | else: 1160 | raise ValueError(F'Invalid header size: 0x{header_size:08X}, unknown NSIS format') 1161 | reader.seek_relative(header_prefix_size) 1162 | self.entry_offset_delta = header_prefix_size 1163 | header_data = reader.read_exactly(header_data_length) 1164 | elif lzmacheck(preview_bytes): 1165 | self.method = NSMethod.LZMA 1166 | elif preview_bytes[3] == 0x80: 1167 | self.solid = False 1168 | reader.seek_relative(4) 1169 | preview_bytes = bytes(reader.peek(4)) 1170 | if lzmacheck(preview_bytes): 1171 | self.method = NSMethod.LZMA 1172 | elif bzipcheck(preview_bytes): 1173 | self.method = NSMethod.BZip2 1174 | elif bzipcheck(preview_bytes): 1175 | self.method = NSMethod.BZip2 1176 | 1177 | 
reader.seek_set(self.archive_offset) 1178 | self.entries: Dict[int, bytearray] = {} 1179 | #self.entry_offset_delta = 0 1180 | #self._solid_iter = None 1181 | 1182 | if header_data is None: 1183 | item = self._decompress_items(reader) 1184 | header_entry = next(item) 1185 | if header_entry.decompression_failed: 1186 | raise ValueError( 1187 | 'This archive seems to use an NSIS-specific deflate ' 1188 | 'algorithm which has not been implemented yet.') 1189 | if self.solid: 1190 | self._solid_iter = item 1191 | self.entry_offset_delta += header_entry.compressed_size 1192 | header_data = header_entry.data 1193 | else: 1194 | self.entry_offset_delta += len(header_data) 1195 | 1196 | if not header_data: 1197 | raise ValueError("Empty header") 1198 | logging.debug(F'Header size: 0x{header_size:08X}') 1199 | 1200 | self.header = NSHeader(header_data, size=header_size, extended=self.extended) 1201 | self.reader = reader 1202 | 1203 | if self.method is NSMethod.Deflate and self.header.nsis_deflate: 1204 | self.method = NSMethod.NSGzip 1205 | 1206 | @property 1207 | def script(self): 1208 | return self.header.script 1209 | 1210 | @property 1211 | def offset_items(self): 1212 | return self.archive_offset + self.entry_offset_delta 1213 | 1214 | def _extract_item_data(self, item: NSItem) -> Entry: 1215 | if self.solid: 1216 | while True: 1217 | try: 1218 | entry = self.entries[item.offset] 1219 | except KeyError: 1220 | try: 1221 | entry = next(self._solid_iter) 1222 | except StopIteration: 1223 | raise LookupError(F'Failed to find item at offset 0x{item.offset:08X}.') 1224 | self.entries[entry.offset - self.entry_offset_delta] = entry.data 1225 | else: 1226 | return entry 1227 | else: 1228 | self.reader.seek(self.offset_items + item.offset) 1229 | decompressed = self._decompress_items(self.reader) 1230 | entry = next(decompressed).data 1231 | return entry 1232 | 1233 | class SolidReader(Iterable[Entry]): 1234 | def __init__(self, src: BinaryIO, prefix_length: int): 1235 | 
self.src = src 1236 | self.pos = 0 1237 | self.prefix_length = prefix_length 1238 | 1239 | def __iter__(self): 1240 | return self 1241 | 1242 | def __next__(self): 1243 | offset = self.pos 1244 | mask = (1 << ((self.prefix_length * 8) - 1)) - 1 1245 | size = self.src.read(self.prefix_length) 1246 | if len(size) != self.prefix_length: 1247 | raise StopIteration 1248 | size = int.from_bytes(size, byteorder='little') 1249 | read = size & mask 1250 | data = self.src.read(read) 1251 | if len(data) != read: 1252 | raise EOFError('Unexpected end of stream while decompressing archive entries.') 1253 | self.pos = offset + read + 4 1254 | return NSArchive.Entry(offset, data, size) 1255 | 1256 | class PartsReader(SolidReader): 1257 | def __init__(self, src: BinaryIO, decompressor: Optional[Type[BinaryIO]], prefix_length: int): 1258 | super().__init__(src, prefix_length) 1259 | self._dc = decompressor 1260 | 1261 | def __next__(self): 1262 | item = super().__next__() 1263 | is_compressed = bool(item.compressed_size & 0x80000000) 1264 | item.compressed_size &= 0x7FFFFFFF 1265 | if is_compressed: 1266 | try: 1267 | dc = self._dc(MemoryFile(item.data)) 1268 | item.data = dc.read() 1269 | except Exception: 1270 | item.decompression_failed = True 1271 | return item 1272 | 1273 | class LZMAFix: 1274 | ''' Creates a wrapper to compensate for how NSIS handles LZMA''' 1275 | def __init__(self, src: MemoryFile): 1276 | self._src = src 1277 | self._fix = MemoryFile(bytes(src.read(5)) + B'\xFF' * 8) 1278 | 1279 | def __getattr__(self, key): 1280 | return getattr(self._src, key) 1281 | 1282 | def read(self, size: int = -1): 1283 | src = self._src 1284 | fix = self._fix 1285 | if not fix.remaining_bytes: 1286 | return src.read(size) 1287 | if size < 0: 1288 | size = fix.remaining_bytes + src.remaining_bytes 1289 | data = bytearray(size) 1290 | wrapper = fix.read(size) 1291 | data[:len(wrapper)] = wrapper 1292 | data[len(wrapper):] = src.read(size - len(wrapper)) 1293 | return data 1294 | 
1295 | 1296 | 1297 | def _decompress_items(self, reader: StructReader[bytearray]) -> Iterator[Entry]: 1298 | """ Decompresses the items in the archive. """ 1299 | def NSISLZMAFile(d): 1300 | if use_filter := self.lzma_options.filter_flag: 1301 | use_filter = d.u8() 1302 | if use_filter > 1: 1303 | raise ValueError(F'LZMA/BCJ chunk with invalid filter indicator byte 0x{use_filter:X}') 1304 | if not use_filter: 1305 | _filter = None 1306 | _format = None 1307 | _stream = self.LZMAFix(d) 1308 | else: 1309 | pv = d.u8() 1310 | ds = max(self.lzma_options.dictionary_size, d.u32()) 1311 | if (pv >= 225): 1312 | raise ValueError('Unexpected LZMA properties; value exceeds 225.') 1313 | pv, lc = divmod(pv, 9) 1314 | pb, lp = divmod(pv, 5) 1315 | _filter = [ 1316 | dict(id=lzma.FILTER_X86), 1317 | dict(id=lzma.FILTER_LZMA1, dict_size=ds, lc=lc, lp=lp, pb=pb)] 1318 | _format = lzma.FORMAT_RAW 1319 | _stream = d 1320 | 1321 | return lzma.LZMAFile(_stream, filters=_filter, format=_format) 1322 | 1323 | decompressor: Type[BinaryIO]= { 1324 | NSMethod.Copy : None, 1325 | NSMethod.Deflate : DeflateFile, 1326 | NSMethod.NSGzip : GZipFile, 1327 | NSMethod.LZMA : NSISLZMAFile, 1328 | NSMethod.BZip2 : BZip2File, 1329 | }[self.method] 1330 | prefix_length = 8 if self.extended else 4 1331 | if self.solid: 1332 | return self.SolidReader(decompressor(reader), prefix_length) 1333 | else: 1334 | return self.PartsReader(reader, decompressor, prefix_length) 1335 | 1336 | 1337 | class extractNSIS(ArchiveUnit): 1338 | """ 1339 | A class to extract an NSIS file. 
1340 | """ 1341 | @classmethod 1342 | def _find_archive_offset(cls, data: memoryview, before: int = -1, flaw_max=2) -> int: 1343 | def signatures(*magics): 1344 | for changes in range(flaw_max + 1): 1345 | for magic in magics: 1346 | if not changes: 1347 | yield 0, magic 1348 | continue 1349 | for positions in itertools.permutations(range(len(magic)), r=changes): 1350 | signature = bytearray(magic) 1351 | for position in positions: 1352 | signature[position] = 0x2E 1353 | yield changes, bytes(signature) 1354 | best_guess = None 1355 | search_space = memoryview(data) 1356 | for flaws, sig in signatures(*NSArchive.MAGICS): 1357 | if flaws > 1: 1358 | search_space = search_space[:0x20_000] 1359 | matches = [m.start() - 4 for m in re.finditer(sig, 1360 | search_space, 1361 | flags=re.DOTALL)] 1362 | if before >= 0: 1363 | matches = [match for match in matches if match < before] 1364 | matches.reverse() 1365 | archive = None 1366 | for match in matches: 1367 | if match % 0x200 == 0: 1368 | archive = match 1369 | break 1370 | if not archive: 1371 | if matches and not best_guess: 1372 | best_guess = matches[-1] 1373 | else: 1374 | message = F'Archive signature was found at offset 0x{archive:08X}.' 1375 | if flaws > 0: 1376 | message += F' the signature has {flaws} flaws and was likely modified.' 1377 | logging.debug(message) 1378 | return archive 1379 | if best_guess: 1380 | message = F'Archive signature was found at offset 0x{best_guess:08X}, but it has too many flaws to be reliable.' 
1381 | logging.debug(message) 1382 | return best_guess 1383 | 1384 | 1385 | def unpack(self, data: memoryview): 1386 | memory = memoryview(data) 1387 | before = -1 1388 | _error = None 1389 | while True: 1390 | offset = self._find_archive_offset(data, before) 1391 | if offset is None: 1392 | _error = _error or ValueError("Unable to find NSIS archive marker") 1393 | raise _error 1394 | try: 1395 | archive = NSArchive(memory[offset:]) 1396 | except Exception as e: 1397 | _error = e 1398 | before = offset 1399 | else: 1400 | break 1401 | 1402 | unpacked_items = [] 1403 | for item in archive.header.items: 1404 | unpacked_items.append(self._pack(item.path, item.mtime, archive._extract_item_data(item))) 1405 | unpacked_items.append(self._pack('setup.nsis', None, archive.script.encode('utf-8'))) 1406 | return unpacked_items 1407 | 1408 | 1409 | 1410 | --------------------------------------------------------------------------------