├── .flake8 ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── README.md ├── docs └── expose-text.png ├── expose_text ├── __init__.py ├── core.py ├── exceptions.py └── formats │ ├── __init__.py │ ├── _docx.py │ ├── _html.py │ ├── _pdf.py │ ├── _txt.py │ ├── _utils.py │ ├── base.py │ ├── markup │ ├── __init__.py │ └── utils.py │ └── pdf │ ├── __init__.py │ ├── auto_pdf.py │ ├── pdf2html2pdf.py │ └── pdf_redactor.py ├── pyproject.toml ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── conftest.py ├── files ├── doctest.txt ├── doctest_altered.txt ├── foo.bar ├── pdf │ └── doc.pdf ├── test.docx ├── test.html ├── test.txt ├── test_altered.docx ├── test_altered.html ├── test_altered.txt └── tmp │ └── .gitkeep ├── test_alterations_buffer.py ├── test_apply_buffer_to_text.py ├── test_auto_pdf_format.py ├── test_docx_format.py ├── test_file_wrapper.py ├── test_html_format.py ├── test_pdf2html2pdf_format.py ├── test_pdf_format.py └── test_txt_format.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E203, W503 3 | max-line-length = 127 4 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Continuous Integration 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | pre-commit: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v2 10 | - uses: actions/setup-python@v2 11 | - uses: pre-commit/action@v2.0.0 12 | test: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: [3.7, 3.8] 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Install OS dependecies 24 | run: | 25 | sudo apt-get 
install -y poppler-utils 26 | wget --quiet https://github.com/wkhtmltopdf/wkhtmltopdf/releases/download/0.12.3/wkhtmltox-0.12.3_linux-generic-amd64.tar.xz && \ 27 | tar vxf wkhtmltox-0.12.3_linux-generic-amd64.tar.xz && \ 28 | sudo cp wkhtmltox/bin/wk* /usr/local/bin/ && \ 29 | rm -rf wkhtmltox 30 | - name: Install Python dependencies 31 | run: | 32 | python -m pip install --upgrade pip 33 | pip install -r requirements.txt 34 | - name: Test with pytest 35 | run: | 36 | pip install pytest 37 | pip install pytest-cov 38 | pytest --doctest-modules --cov-report term --cov=expose_text 39 | publish: 40 | runs-on: ubuntu-latest 41 | needs: [pre-commit, test] 42 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') 43 | steps: 44 | - uses: actions/checkout@v2 45 | - name: Set up Python 46 | uses: actions/setup-python@v2 47 | with: 48 | python-version: '3.x' 49 | - name: Publish package for tags 50 | run: | 51 | python -m pip install --upgrade pip setuptools wheel twine 52 | python setup.py sdist bdist_wheel 53 | python -m twine upload -u ${{ secrets.PYPI_USERNAME }} -p ${{ secrets.PYPI_PASSWORD }} dist/* 54 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # PyCharm 2 | .idea/ 3 | 4 | # Mac 5 | .DS_Store 6 | 7 | # Virtualenv 8 | venv/ 9 | 10 | # Python 11 | expose_text.egg-info/ 12 | 13 | # Tests 14 | .coverage 15 | __pycache__ 16 | tests/files/tmp/* 17 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: stable 4 | hooks: 5 | - id: black 6 | - repo: https://gitlab.com/pycqa/flake8 7 | rev: 3.7.9 8 | hooks: 9 | - id: flake8 10 | -------------------------------------------------------------------------------- /Dockerfile: 
-------------------------------------------------------------------------------- 1 | FROM python:3.7 2 | 3 | # Run tests within Docker 4 | # docker build -t expose-text . 5 | # docker run expose-text 6 | 7 | WORKDIR /app 8 | 9 | # Install PDF depdencies (expose-text) 10 | RUN apt-get update 11 | RUN apt-get install -y cmake autoconf 12 | 13 | # wkhtmltopdf 14 | RUN wget --quiet https://github.com/wkhtmltopdf/wkhtmltopdf/releases/download/0.12.3/wkhtmltox-0.12.3_linux-generic-amd64.tar.xz && \ 15 | tar vxf wkhtmltox-0.12.3_linux-generic-amd64.tar.xz && \ 16 | cp wkhtmltox/bin/wk* /usr/local/bin/ && \ 17 | rm -rf wkhtmltox 18 | 19 | # Uninstall old version (latest version is not available over apt) 20 | RUN apt-get purge -y poppler-utils 21 | 22 | # Install new poppler-utils manually 23 | RUN wget poppler.freedesktop.org/poppler-0.90.1.tar.xz 24 | RUN tar -xvf poppler-0.90.1.tar.xz 25 | RUN cd poppler-0.90.1 && mkdir build && cd build && cmake .. && make && ldconfig 26 | RUN ln -s /usr/local/bin/pdftohtml /usr/bin/pdftohtml 27 | 28 | # Install packages 29 | COPY requirements.txt . 
30 | 31 | RUN pip install --no-cache-dir -r requirements.txt 32 | 33 | RUN pip install pytest pytest-cov 34 | 35 | COPY ./ /app/ 36 | 37 | CMD ["pytest", "--doctest-modules", "--cov-report", "term", "--cov", "expose_text", "-s"] 38 | 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jonas Langhabel, Malte Ostendorff 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ExposeText 2 | 3 | **Expose the text in a document for modification.** 4 | 5 | --- 6 | 7 | [![PyPI version](https://badge.fury.io/py/expose-text.svg)](https://badge.fury.io/py/expose-text) 8 | ![Tests](https://github.com/openredact/expose-text/workflows/Tests/badge.svg?branch=master) 9 | ![Black & Flake8](https://github.com/openredact/expose-text/workflows/Black%20&%20Flake8/badge.svg?branch=master) 10 | [![Code style: Black](https://img.shields.io/badge/code%20style-black-000000.svg?style=flat-square)](https://github.com/ambv/black) 11 | [![MIT license](https://img.shields.io/badge/license-MIT-brightgreen.svg)](http://opensource.org/licenses/MIT) 12 | 13 | _**⚠️ Disclaimer ⚠️:**_ This is a prototype. Do not use for anything critical. 14 | 15 | ## What is ExposeText? 16 | 17 | Dealing with document file formats can be quite painful. 18 | Oftentimes code must be written that’s specific to one file format. 19 | We have written ExposeText with the goal to make modifying documents as simple as changing Python strings. 20 | A slice of the original document can be directly assigned a new content by using the character indices of the extracted text, all while keeping the document's original formatting. 21 | 22 | We published a blog post about ExposeText on [Medium](https://medium.com/@openredact/introducing-exposetext-modify-document-files-as-simply-as-strings-cc5caa5f9c66?source=friends_link&sk=825c8f64dfa4e943b66d1faf351340a2). 
23 | 24 | ![](https://raw.githubusercontent.com/openredact/expose-text/master/docs/expose-text.png "Exposing the plain text content, then modifying it") 25 | 26 | ## Supported Formats 27 | 28 | ExposeText has prototypical support for the following file formats: 29 | 30 | - .txt 31 | - Per default, the encoding is assumed to be UTF-8. 32 | - You can install [chardet](https://github.com/chardet/chardet) (`pip install chardet`), to automatically detect the encoding. 33 | - .html 34 | - You can pass either an HTML snippet, an HTML body or a complete HTML document. If you pass a complete HTML document, every text content outside the body is ignored. 35 | - The output file will always be encoded in UTF-8. 36 | - .docx 37 | - Only text within `` tags (the tags for anything that is text) is exposed. E.g. the mailto link of an e-mail address is not exposed. 38 | - .pdf 39 | - Per default, text in PDFs can only be replaced with characters that occur in the file (fonts are stored economically in PDF files). 40 | - If you install the additional dependencies [Poppler (pdftohtml)](https://poppler.freedesktop.org/) and [wkhtmltopdf](https://wkhtmltopdf.org/), the PDF is rerendered and there is no more restriction on the characters that can be used. 41 | 42 | 43 | ## Usage 44 | 45 | ExposeText supports files as well as binary data objects. 46 | Depending on your use case you can use one of the following interfaces for making modifications. 47 | 48 | ### Installation 49 | 50 | `expose-text` can be installed from PyPi and has to be installed in a virtual environment (venv or conda for instance). 51 | 52 | ```bash 53 | pip install expose-text 54 | ``` 55 | 56 | ### Slicing API 57 | 58 | The slicing API applies each alteration immediately. 59 | 60 | Exposing and modifying text inside a file: 61 | ```python 62 | >>> from expose_text import FileWrapper 63 | >>> 64 | >>> wrapper = FileWrapper("myfile.docx") 65 | >>> wrapper.text 66 | 'This is the content as string.' 
67 | 68 | >>> wrapper[12:19] = "new content" 69 | >>> wrapper.text 70 | 'This is the new content as string.' 71 | 72 | >>> wrapper[33] = "!" # note that you have to use the updated index here 73 | >>> wrapper.text 74 | 'This is the new content as string!' 75 | 76 | >>> wrapper.save("newfile.docx") 77 | ``` 78 | 79 | If you want to work directly with binary data you have to pass the file format: 80 | ```python 81 | >>> from expose_text import BinaryWrapper 82 | >>> 83 | >>> wrapper = BinaryWrapper(my_bytes, ".docx") 84 | >>> wrapper.text 85 | 'This is the content as string.' 86 | 87 | >>> wrapper[12:19] = "new content" 88 | >>> wrapper.text 89 | 'This is the new content as string.' 90 | 91 | >>> wrapper.bytes # get the modified file as bytes 92 | b'...' 93 | ``` 94 | 95 | ### Functional API 96 | 97 | With the functional API, you can queue several alterations based on the initial indices and then apply them together. 98 | ```python 99 | >>> wrapper.text 100 | 'This is the content as string.' 101 | 102 | >>> wrapper.add_alter(12, 19, "new content") 103 | >>> wrapper.add_alter(29, 30, "!") 104 | >>> wrapper.apply_alters() 105 | >>> wrapper.text 106 | 'This is the new content as string!' 107 | ``` 108 | 109 | ## Development 110 | 111 | ### Install requirements 112 | 113 | You can install all (production and development) requirements using: 114 | 115 | ``` 116 | pip install -r requirements.txt 117 | ``` 118 | 119 | ### Install the pre-commit hooks 120 | 121 | This repository uses git hooks to validate code quality and formatting. 
122 | 123 | ``` 124 | pre-commit install 125 | git config --bool flake8.strict true # Makes the commit fail if flake8 reports an error 126 | ``` 127 | 128 | To run the hooks: 129 | ``` 130 | pre-commit run --all-files 131 | ``` 132 | 133 | ### Testing 134 | 135 | The tests can be executed with: 136 | ``` 137 | pytest --doctest-modules --cov-report term --cov=expose_text 138 | ``` 139 | 140 | ### Testing in Docker 141 | 142 | You can run the test as well in a Docker container: 143 | 144 | ```bash 145 | docker build -t expose-text 146 | docker run expose-text 147 | ``` 148 | 149 | ## How to contact us 150 | 151 | For usage questions, bugs, or suggestions please file a Github issue. 152 | If you would like to contribute or have other questions please email hello@openredact.org. 153 | 154 | ## License 155 | 156 | [MIT License](https://github.com/openredact/expose-text/blob/master/LICENSE) 157 | -------------------------------------------------------------------------------- /docs/expose-text.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openredact/expose-text/c8e21774d0ca2f0103e91b50ea093c87a3738a3e/docs/expose-text.png -------------------------------------------------------------------------------- /expose_text/__init__.py: -------------------------------------------------------------------------------- 1 | from expose_text.core import FileWrapper, BinaryWrapper # noqa: F401 2 | from expose_text.exceptions import UnsupportedFormat # noqa: F401 3 | -------------------------------------------------------------------------------- /expose_text/core.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from typing import Union 4 | 5 | from expose_text.formats import registry 6 | 7 | registry.register_formats() 8 | 9 | 10 | class BinaryWrapper: 11 | """A wrapper for binary files in various formats that exposes their text 
content for modification. 12 | 13 | >>> from pathlib import Path 14 | >>> root = Path(__file__).parent.parent 15 | >>> f = open(root / 'tests/files/doctest.txt', 'rb') 16 | >>> bytes_ = f.read() 17 | 18 | Open binary data and inspect the text content. 19 | 20 | >>> bw = BinaryWrapper(bytes_, '.txt') 21 | >>> bw.text 22 | 'This is the content as string.' 23 | 24 | Or access the content using slicing. 25 | 26 | >>> bw[12:19] 27 | 'content' 28 | >>> bw[29] 29 | '.' 30 | 31 | This string provides the indices for the modification of the file. Queue new alterations and when you are done 32 | apply them to change the file. 33 | 34 | >>> bw.add_alter(0, 4, 'That') 35 | >>> bw.apply_alters() 36 | >>> bw.text 37 | 'That is the content as string.' 38 | 39 | The slicing interface lets you make and apply an alteration in a single call. 40 | 41 | >>> bw[12:19] = 'new content' 42 | >>> bw[33] = '!' 43 | >>> bw.text 44 | 'That is the new content as string!' 45 | 46 | Return the content in binary format. 47 | >>> bw.bytes 48 | b'That is the new content as string!' 
49 | """ 50 | 51 | def __init__(self, bytes_: bytes, format_cls_or_str: Union[type, str]): 52 | """ 53 | Constructor 54 | 55 | :param bytes_: Input bytes 56 | :param format_cls_or_str: Explicit Format class or file extension string (Format class will be auto-determined) 57 | """ 58 | if isinstance(format_cls_or_str, str): 59 | format_cls_or_str = registry.find_format(format_cls_or_str) 60 | elif not isinstance(format_cls_or_str, type): 61 | raise ValueError("`format_cls_or_str` must be provided as either Format class or file extension string") 62 | 63 | self.file = format_cls_or_str() 64 | self.file.load(bytes_) 65 | 66 | @property 67 | def text(self) -> str: 68 | """The text content of the file.""" 69 | return self.file.text 70 | 71 | @property 72 | def bytes(self) -> bytes: 73 | """The binary content of the file.""" 74 | return self.file.bytes 75 | 76 | def add_alter(self, start: int, end: int, text: str): 77 | """Queue a new change up for alteration. 78 | 79 | The `start` and `end` indices refer to the current value of the `text` property. Apply the queued alterations 80 | by calling `apply_alters()`. 81 | """ 82 | self.file.add_alter(start, end, text) 83 | 84 | def apply_alters(self): 85 | """Apply all queued alterations.""" 86 | self.file.apply_alters() 87 | 88 | def __getitem__(self, key: Union[slice, int]): 89 | """Get a substring of the contained text using slicing or indexing.""" 90 | return self.file.text.__getitem__(key) 91 | 92 | def __setitem__(self, key: Union[slice, int], value: str): 93 | """Add and apply one alter using the slicing syntax.""" 94 | if isinstance(key, slice): 95 | self.add_alter(key.start, key.stop, value) 96 | else: 97 | self.add_alter(key, key + 1, value) 98 | self.apply_alters() 99 | 100 | 101 | class FileWrapper(BinaryWrapper): 102 | """A wrapper for various file formats that exposes their text content for modification. 
103 | 104 | >>> from pathlib import Path 105 | >>> root = Path(__file__).parent.parent 106 | 107 | Open a file and inspect its text content. 108 | 109 | >>> fw = FileWrapper(root / 'tests/files/doctest.txt') 110 | >>> fw.text 111 | 'This is the content as string.' 112 | 113 | Or access the content using slicing. 114 | 115 | >>> fw[12:19] 116 | 'content' 117 | >>> fw[29] 118 | '.' 119 | 120 | This string provides the indices for the modification of the file. Queue new alterations and when you are done 121 | apply them to change the file. 122 | 123 | >>> fw.add_alter(0, 4, 'That') 124 | >>> fw.apply_alters() 125 | >>> fw.text 126 | 'That is the content as string.' 127 | 128 | The slicing interface lets you make and apply an alteration in a single call. 129 | 130 | >>> fw[12:19] = 'new content' 131 | >>> fw[33] = '!' 132 | >>> fw.text 133 | 'That is the new content as string!' 134 | 135 | Now create a new file that looks like the original one but with the altered content. 136 | >>> fw.save(root / 'tests/files/doctest_altered.txt') 137 | """ 138 | 139 | def __init__(self, file_path: Union[Path, str], format_cls: type = None): 140 | """ 141 | Constructor 142 | 143 | :param file_path: Path to input file 144 | :param format_cls: Specific Format class (if not set, class is determined based on file extension) 145 | """ 146 | _, extension = os.path.splitext(file_path) 147 | 148 | with open(file_path, "rb") as f: 149 | bytes_ = f.read() 150 | 151 | super().__init__(bytes_, format_cls if format_cls else extension) 152 | 153 | def save(self, file_path: Union[Path, str]): 154 | """Save the file to disk.""" 155 | with open(file_path, "wb") as f: 156 | f.write(self.file.bytes) 157 | -------------------------------------------------------------------------------- /expose_text/exceptions.py: -------------------------------------------------------------------------------- 1 | class UnsupportedFormat(NotImplementedError): 2 | """This file format is not supported""" 3 | 4 | 5 | class 
FormatError(ValueError): 6 | pass 7 | -------------------------------------------------------------------------------- /expose_text/formats/__init__.py: -------------------------------------------------------------------------------- 1 | from importlib import import_module 2 | 3 | from expose_text.exceptions import UnsupportedFormat 4 | from expose_text.formats.base import Format 5 | 6 | 7 | class Registry: 8 | """This class registers the supported file formats. 9 | 10 | If you implement a new format, make sure to add it to `register_formats()`. 11 | """ 12 | 13 | _formats = {} 14 | 15 | def find_format(self, key) -> Format: 16 | if key not in self._formats: 17 | raise UnsupportedFormat(f"Format {key} is not supported!") 18 | return self._formats[key] 19 | 20 | def register_formats(self): 21 | self._register(".txt", "expose_text.formats._txt.TxtFormat") 22 | self._register(".html", "expose_text.formats._html.HtmlFormat") 23 | # self._register(".pdf", "expose_text.formats._pdf.PdfFormat") 24 | self._register(".pdf", "expose_text.formats.pdf.auto_pdf.AutoPdfFormat") 25 | self._register(".docx", "expose_text.formats._docx.DocxFormat") 26 | 27 | def _register(self, key, class_path): 28 | module_path, class_name = class_path.rsplit(".", 1) 29 | format_cls = getattr(import_module(module_path), class_name) 30 | self._formats[key] = format_cls 31 | 32 | 33 | registry = Registry() 34 | -------------------------------------------------------------------------------- /expose_text/formats/_docx.py: -------------------------------------------------------------------------------- 1 | import io 2 | import re 3 | import zipfile 4 | 5 | from defusedxml.minidom import parse 6 | 7 | from expose_text.formats._utils import apply_buffer_to_text 8 | from expose_text.formats.base import Format 9 | from expose_text.formats.markup.utils import MarkupModifier, Mapper 10 | 11 | 12 | class DocxFormat(Format): 13 | _docx_container = None 14 | _text = "" 15 | _xml_modifier = None 16 | 17 | def 
load(self, bytes_): 18 | self._docx_container = DocxContainer(bytes_) 19 | 20 | mapper = DocxMapper(self._docx_container.document_xml) 21 | self._text, mapping = mapper.simultaneous_text_extraction_and_mapping() 22 | 23 | self._xml_modifier = MarkupModifier(self._docx_container.document_xml, mapping) 24 | 25 | @property 26 | def text(self): 27 | return self._text 28 | 29 | @property 30 | def bytes(self): 31 | return self._docx_container.to_bytes() 32 | 33 | def apply_alters(self): 34 | self._text = apply_buffer_to_text(self._buffer, self._text) 35 | self._docx_container.document_xml = self._xml_modifier.apply_buffer(self._buffer) 36 | self._buffer.clear() 37 | 38 | 39 | class DocxContainer: 40 | _docx = None 41 | document_xml = None 42 | 43 | def __init__(self, bytes_): 44 | docx_io = io.BytesIO(bytes_) 45 | self._docx = zipfile.ZipFile(docx_io) 46 | 47 | document_xml_bytes = self._docx.read("word/document.xml") 48 | 49 | document_xml_io = io.BytesIO(document_xml_bytes) 50 | encoding = parse(document_xml_io).encoding 51 | 52 | self.document_xml = document_xml_bytes.decode(encoding) 53 | 54 | def to_bytes(self): 55 | # modifying a zip file is not supported, thus it has to be rebuilt 56 | bytes_io = io.BytesIO() 57 | zout = zipfile.ZipFile(bytes_io, "w") 58 | for zinfo in self._docx.infolist(): 59 | if zinfo.filename == "word/document.xml": 60 | zout.writestr(zinfo, self.document_xml) 61 | continue 62 | 63 | buffer = self._docx.read(zinfo.filename) 64 | zout.writestr(zinfo, buffer) 65 | zout.close() 66 | return bytes_io.getvalue() 67 | 68 | 69 | class DocxMapper(Mapper): 70 | def simultaneous_text_extraction_and_mapping(self): 71 | # get plain text from word/document.xml (everything between and ) 72 | self._remove_pattern(r"\n") # get rid of all newlines from the xml formatting 73 | self._remove_pattern(r"<\/w:p>|]*>", replace_with="\n") # add newlines from paragraph ends and linebreaks 74 | self._remove_pattern(r"<\/w:t>.*?]*>", flags=re.MULTILINE) # delete content 
from text close to open tags 75 | self._remove_pattern(r"^.*]*>", flags=re.MULTILINE) # delete to remaining open tags 76 | self._remove_pattern(r"<\/w:t>.*$", flags=re.MULTILINE) # delete from remaining close tags 77 | self._remove_pattern(r"^.*<.*$", flags=re.MULTILINE) # delete leftover lines with xml content 78 | 79 | # unescape characters 80 | self._remove_pattern(r"&", replace_with="&") 81 | self._remove_pattern(r"<", replace_with="<") 82 | self._remove_pattern(r">", replace_with=">") 83 | self._remove_pattern(r""", replace_with='"') 84 | self._remove_pattern(r"'", replace_with="'") 85 | 86 | # remove leading and trailing newlines 87 | self._remove_pattern(r"^\n+") 88 | self._remove_pattern(r"\n+$") 89 | 90 | return self._text, self._text_to_markup_idx 91 | -------------------------------------------------------------------------------- /expose_text/formats/_html.py: -------------------------------------------------------------------------------- 1 | import html 2 | import re 3 | 4 | from bs4 import UnicodeDammit 5 | 6 | from expose_text.formats._utils import apply_buffer_to_text 7 | from expose_text.formats.base import Format 8 | from expose_text.formats.markup.utils import MarkupModifier, Mapper 9 | 10 | 11 | class HtmlFormat(Format): 12 | _html = "" 13 | _text = "" 14 | _html_modifier = None 15 | 16 | def load(self, bytes_): 17 | self._html = to_unicode(bytes_) 18 | 19 | mapper = HtmlMapper(self._html) 20 | self._text, mapping = mapper.simultaneous_text_extraction_and_mapping() 21 | 22 | self._html_modifier = MarkupModifier(self._html, mapping) 23 | 24 | @property 25 | def text(self): 26 | return self._text 27 | 28 | @property 29 | def bytes(self): 30 | return self._html.encode("UTF-8") 31 | 32 | def apply_alters(self): 33 | self._text = apply_buffer_to_text(self._buffer, self._text) 34 | self._html = self._html_modifier.apply_buffer(self._buffer) 35 | self._buffer.clear() 36 | 37 | 38 | def to_unicode(bytes_): 39 | def unescape_html(html_): 40 | 
unescaped_html = "" 41 | pattern = re.compile(r"&#\d{1,4};|&\w{1,6};") 42 | cur = 0 43 | for m in pattern.finditer(html_): 44 | if m.group(0) in ["<", ">", "&", """, "'"]: 45 | continue 46 | 47 | unescaped_html += html_[cur : m.start()] + html.unescape(m.group(0)) 48 | cur = m.end() 49 | unescaped_html += html_[cur:] 50 | return unescaped_html 51 | 52 | dammit = UnicodeDammit(bytes_) 53 | encoding = dammit.original_encoding 54 | decoded_html = bytes_.decode(encoding) 55 | return unescape_html(decoded_html) 56 | 57 | 58 | class HtmlMapper(Mapper): 59 | def simultaneous_text_extraction_and_mapping(self): 60 | # get rid of everything but body and title 61 | self._remove_pattern(r"^.*]*>", flags=re.DOTALL) # delete everything from beginning to body 62 | self._remove_pattern(r"<\/body>.*$", flags=re.DOTALL) # delete everything from body to end 63 | 64 | # remove html from inside body 65 | self._remove_pattern(r"
", replace_with="\n") # html linebreaks 66 | self._remove_pattern( 67 | r"""]*>.*?<\/script> # remove scripts 68 | |]*>.*?<\/style> # remove styles 69 | |]*>.*?<\/template> # remove templates 70 | |<[^>]+> # remove all tags """, 71 | flags=re.DOTALL | re.VERBOSE, 72 | ) 73 | self._remove_pattern(r"(^[ \xc2\xa0]+)", flags=re.MULTILINE) # leading (non-breaking) whitespace 74 | self._remove_pattern(r"(\n\r?){3,}", replace_with="\n\n") # excess newlines 75 | 76 | # unescape characters 77 | self._remove_pattern(r"&", replace_with="&") 78 | self._remove_pattern(r"<", replace_with="<") 79 | self._remove_pattern(r">", replace_with=">") 80 | self._remove_pattern(r""", replace_with='"') 81 | self._remove_pattern(r"'", replace_with="'") 82 | 83 | # remove leading and trailing newlines 84 | self._remove_pattern(r"^\n+") 85 | self._remove_pattern(r"\n+$") 86 | 87 | return self._text, self._text_to_markup_idx 88 | -------------------------------------------------------------------------------- /expose_text/formats/_pdf.py: -------------------------------------------------------------------------------- 1 | import io 2 | 3 | from pdfrw import PdfReader, PdfDict, PdfWriter 4 | from pdfrw import PdfArray 5 | 6 | from expose_text.formats.base import Format 7 | from expose_text.formats.pdf import pdf_redactor 8 | from expose_text.formats.pdf.pdf_redactor import InlineImage, RedactorOptions 9 | 10 | 11 | class PdfFormat(Format): 12 | """ 13 | 14 | Mostly based on https://github.com/JoshData/pdf-redactor 15 | 16 | # A general-purpose PDF text-layer redaction tool. 
17 | # License: CC0 1.0 Universal 18 | # Source: https://github.com/JoshData/pdf-redactor 19 | 20 | """ 21 | 22 | options = None # type: RedactorOptions 23 | document = None 24 | text_tokens = None 25 | page_tokens = None 26 | 27 | def load(self, bytes_): 28 | self.options = pdf_redactor.RedactorOptions() 29 | self.options.input_stream = bytes_ 30 | 31 | self.document = PdfReader(fdata=bytes_) 32 | self.text_tokens, self.page_tokens = pdf_redactor.build_text_layer(self.document, self.options) 33 | 34 | @property 35 | def text(self): 36 | return "".join(t.value for t in self.text_tokens) 37 | 38 | @property 39 | def bytes(self): 40 | stream = io.BytesIO() 41 | writer = PdfWriter() 42 | writer.trailer = self.document 43 | writer.write(stream) 44 | return stream.getvalue() 45 | 46 | def apply_alters(self): 47 | # Finding all matches... 48 | text_tokens_index = 0 49 | text_tokens_charpos = 0 50 | text_tokens_token_xdiff = 0 51 | text_tokens = self.text_tokens 52 | 53 | # Mostly from update_text_layer 54 | # Pass the matched text to the replacement function to get replaced text. 55 | for start, end, alteration in self._buffer.sort(): 56 | # We got a match at text_content[start_idx:end_idx]. 57 | start_idx = start 58 | end_idx = end 59 | 60 | # Do a text replacement in the tokens that produced this text content. 61 | # It may have been produced by multiple tokens, so loop until we find them all. 62 | while start_idx < end_idx: 63 | # Find the original tokens in the content stream that 64 | # produced the matched text. Start by advancing over any 65 | # tokens that are entirely before this span of text. 
66 | while ( 67 | text_tokens_index < len(text_tokens) 68 | and text_tokens_charpos + len(text_tokens[text_tokens_index].value) - text_tokens_token_xdiff <= start_idx 69 | ): 70 | text_tokens_charpos += len(text_tokens[text_tokens_index].value) - text_tokens_token_xdiff 71 | text_tokens_index += 1 72 | text_tokens_token_xdiff = 0 73 | if text_tokens_index == len(text_tokens): 74 | break 75 | assert text_tokens_charpos <= start_idx 76 | 77 | # The token at text_tokens_index, and possibly subsequent ones, 78 | # are responsible for this text. Replace the matched content 79 | # here with replacement content. 80 | tok = text_tokens[text_tokens_index] 81 | 82 | # Where does this match begin within the token's text content? 83 | mpos = start_idx - text_tokens_charpos 84 | assert mpos >= 0 85 | 86 | # How long is the match within this token? 87 | mlen = min(end_idx - start_idx, len(tok.value) - text_tokens_token_xdiff - mpos) 88 | assert mlen >= 0 89 | 90 | # How much should we replace here? 91 | if mlen < (end_idx - start_idx): 92 | # There will be more replaced later, so take the same number 93 | # of characters from the replacement text. 94 | r = alteration[:mlen] 95 | alteration = alteration[mlen:] 96 | else: 97 | # This is the last token in which we'll replace text, so put 98 | # all of the remaining replacement content here. 99 | r = alteration 100 | alteration = None # sanity 101 | 102 | # Do the replacement. 103 | tok.value = ( 104 | tok.value[: mpos + text_tokens_token_xdiff] + r + tok.value[mpos + mlen + text_tokens_token_xdiff :] 105 | ) 106 | text_tokens_token_xdiff += len(r) - mlen 107 | 108 | # Advance for next iteration. 109 | start_idx += mlen 110 | 111 | # Replace page content streams with updated tokens. 112 | self.apply_updated_text() 113 | 114 | def tok_str(self, tok): 115 | # Replace the page's content stream with our updated tokens. 
116 | # The content stream may have been an array of streams before, 117 | # so replace the whole thing with a single new stream. Unfortunately 118 | # the str on PdfArray and PdfDict doesn't work right. 119 | if isinstance(tok, PdfArray): 120 | return "[ " + " ".join(self.tok_str(x) for x in tok) + "] " 121 | if isinstance(tok, InlineImage): 122 | return ( 123 | "BI " 124 | + " ".join(self.tok_str(x) + " " + self.tok_str(y) for x, y in tok.items()) 125 | + " ID " 126 | + tok.stream 127 | + " EI " 128 | ) 129 | if isinstance(tok, PdfDict): 130 | return "<< " + " ".join(self.tok_str(x) + " " + self.tok_str(y) for x, y in tok.items()) + ">> " 131 | 132 | return str(tok) 133 | 134 | def apply_updated_text(self): 135 | # Create a new content stream for each page by concatenating the 136 | # tokens in the page_tokens lists. 137 | 138 | for i, page in enumerate(self.document.pages): 139 | if page.Contents is None: 140 | continue # nothing was here 141 | 142 | page.Contents = PdfDict() 143 | page.Contents.stream = "\n".join(self.tok_str(tok) for tok in self.page_tokens[i]) 144 | 145 | page.Contents.Length = len(page.Contents.stream) # reset 146 | 147 | self._buffer.clear() 148 | -------------------------------------------------------------------------------- /expose_text/formats/_txt.py: -------------------------------------------------------------------------------- 1 | from expose_text.formats._utils import apply_buffer_to_text 2 | from expose_text.formats.base import Format 3 | 4 | # chardet is LGPL, link it dynamically 5 | try: 6 | import chardet 7 | except ModuleNotFoundError: 8 | chardet = None 9 | 10 | 11 | class TxtFormat(Format): 12 | _encoding = None 13 | _content = "" 14 | 15 | def load(self, bytes_): 16 | if chardet: 17 | self._encoding = chardet.detect(bytes_)["encoding"] 18 | else: 19 | # if the encoding is not detected dynamically, it is assumed to be UTF-8 20 | self._encoding = "UTF-8" 21 | 22 | self._content = bytes_.decode(self._encoding) 23 | 24 | 
@property 25 | def text(self): 26 | return self._content 27 | 28 | @property 29 | def bytes(self): 30 | return self._content.encode(self._encoding) 31 | 32 | def apply_alters(self): 33 | self._content = apply_buffer_to_text(self._buffer, self._content) 34 | self._buffer.clear() 35 | -------------------------------------------------------------------------------- /expose_text/formats/_utils.py: -------------------------------------------------------------------------------- 1 | class AlterationsBuffer: 2 | """This class is used to safely queue alterations. 3 | 4 | Add new alterations to this buffer by using one of the two interfaces. The logic makes sure that no overlapping 5 | alterations are added, i.e. that each part of the original text can only be altered once. 6 | 7 | >>> buffer = AlterationsBuffer() 8 | >>> buffer.add(0, 10, 'new_text') 9 | >>> buffer += (10, 20, 'new_text') 10 | 11 | Access the alterations by using the iterable interface of this class. 12 | """ 13 | 14 | def __init__(self): 15 | self.buffer = [] 16 | 17 | def __iter__(self): 18 | return iter(self.buffer) 19 | 20 | def __iadd__(self, alter): 21 | if not isinstance(alter, tuple) or len(alter) != 3: 22 | raise TypeError("Invalid alteration! 
Valid ones are (start, end, new_text) tuples.") 23 | self.add(*alter) 24 | return self 25 | 26 | def __len__(self): 27 | return len(self.buffer) 28 | 29 | def add(self, start, end, new_text): 30 | if not end > start: 31 | raise ValueError("end should be larger than start!") 32 | 33 | alter = (start, end, new_text) 34 | if self._overlaps_with_existing_alter(alter): 35 | raise ValueError("The given alteration overlaps with an existing one!") 36 | 37 | self.buffer += [alter] 38 | 39 | def sort(self, reverse=False): 40 | self.buffer.sort(key=lambda alter: alter[0], reverse=reverse) 41 | return self 42 | 43 | def clear(self): 44 | self.buffer = [] 45 | 46 | def _overlaps_with_existing_alter(self, new_alter): 47 | new_start = new_alter[0] 48 | new_end = new_alter[1] 49 | 50 | for existing_alter in self.buffer: 51 | existing_start = existing_alter[0] 52 | existing_end = existing_alter[1] 53 | if existing_start <= new_start < existing_end or existing_start < new_end <= existing_end: 54 | return True 55 | 56 | return False 57 | 58 | 59 | def apply_buffer_to_text(buffer, text): 60 | """Apply all alterations from the buffer to the text. 61 | 62 | This replaces the original text at the indices specified in the alterations by the respective altered texts. 
63 | """ 64 | new_text = "" 65 | cur = 0 66 | for start, end, alteration in buffer.sort(): 67 | new_text += text[cur:start] + alteration 68 | cur = end 69 | new_text += text[cur:] 70 | return new_text 71 | -------------------------------------------------------------------------------- /expose_text/formats/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | from ._utils import AlterationsBuffer 4 | 5 | 6 | class Format(ABC): 7 | def __init__(self): 8 | self._buffer = AlterationsBuffer() 9 | 10 | @abstractmethod 11 | def load(self, bytes_): 12 | """Load the file in binary format into the internal representation.""" 13 | pass 14 | 15 | @property 16 | @abstractmethod 17 | def text(self): 18 | """Get the current text content.""" 19 | pass 20 | 21 | @property 22 | @abstractmethod 23 | def bytes(self): 24 | """Get the current file content as binary data.""" 25 | 26 | def add_alter(self, start, end, new_text): 27 | """Queue an alteration of the text. 28 | 29 | The `start` and `end` indices are based on the current `text` content. The `text` and `bytes` content are not 30 | changed by calling this method. To apply the changes call `apply_alters()`. 31 | """ 32 | self._buffer += (start, end, new_text) 33 | 34 | @abstractmethod 35 | def apply_alters(self): 36 | """Apply all queued alterations. 37 | 38 | After calling this method, `text` and `bytes` will be updated. 
class MarkupModifier:
    """This class takes care of altering markup."""

    def __init__(self, markup, mapping):
        """
        :param markup: a string containing content in a markup language
        :param mapping: a mapping from the indices of the contained text to its positions in the markup,
            i.e. `mapping[text_idx] == markup_idx`
        """
        self._markup = markup
        self._text_to_markup_idx = mapping

    def apply_buffer(self, buffer):
        """Apply all queued text alterations to the markup and return it."""
        segments = []
        consumed = 0
        for start, end, new_text in buffer.sort():
            markup_start = self._text_to_markup_idx[start]
            segments.append(self._markup[consumed:markup_start])
            segments.append(html.escape(new_text))

            # inner - 1: get the markup index of last text char, outer + 1: get the next char in markup
            consumed = self._text_to_markup_idx[end - 1] + 1

            # append any markup tags that got skipped (in case end spanned further than the starting element)
            segments.append(self._get_skipped_tags(markup_start, consumed))
        segments.append(self._markup[consumed:])
        self._markup = "".join(segments)
        return self._markup

    def _get_skipped_tags(self, start, end):
        """Return all tags between start and end."""
        return "\n".join(re.findall(r"<[^>]*>", self._markup[start:end]))
62 | 63 | :param regex: the regex to replace 64 | :param replace_with: an optional string to replace matches with 65 | :param flags: optional re compile flags 66 | """ 67 | pattern = re.compile(regex, flags=flags) 68 | while True: 69 | m = re.search(pattern, self._text) 70 | if m is None: 71 | break 72 | 73 | self._replace_content_in_markup(m.start(0), m.end(0), replace_with) 74 | 75 | def _replace_content_in_markup(self, start, end, new_text): 76 | if len(new_text) > end - start: 77 | raise ValueError() 78 | self._text = self._text[:start] + new_text + self._text[end:] 79 | del self._text_to_markup_idx[start + len(new_text) : end] 80 | -------------------------------------------------------------------------------- /expose_text/formats/pdf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openredact/expose-text/c8e21774d0ca2f0103e91b50ea093c87a3738a3e/expose_text/formats/pdf/__init__.py -------------------------------------------------------------------------------- /expose_text/formats/pdf/auto_pdf.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from expose_text.formats import Format 4 | from expose_text.formats._pdf import PdfFormat 5 | from expose_text.formats.pdf.pdf2html2pdf import Pdf2Html2PdfFormat 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class AutoPdfFormat(Format): 11 | """ 12 | Automatically determine what PDF format can be used depending on availability of dependencies (and alters) 13 | """ 14 | 15 | def __init__(self): 16 | super().__init__() 17 | 18 | pdf2html2pdf = Pdf2Html2PdfFormat() 19 | 20 | if pdf2html2pdf.is_installed(): 21 | logger.info("Using pdf2html2pdf (dependencies are installed)") 22 | self.format = pdf2html2pdf 23 | else: 24 | logger.info("Using PdfFormat (dependencies are missing)") 25 | self.format = PdfFormat() 26 | 27 | def load(self, bytes_): 28 | self.format.load(bytes_) 29 
| 30 | @property 31 | def text(self): 32 | return self.format.text 33 | 34 | @property 35 | def bytes(self): 36 | return self.format.bytes 37 | 38 | def add_alter(self, start, end, new_text): 39 | self.format.add_alter(start, end, new_text) 40 | 41 | def apply_alters(self): 42 | self.format.apply_alters() 43 | -------------------------------------------------------------------------------- /expose_text/formats/pdf/pdf2html2pdf.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import collections 3 | import logging 4 | import os 5 | import re 6 | import shutil 7 | import tempfile 8 | from subprocess import run, PIPE 9 | from typing import Dict 10 | 11 | import pdfkit 12 | 13 | from expose_text.exceptions import FormatError 14 | from expose_text.formats._html import HtmlFormat 15 | from expose_text.formats.base import Format 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | class Pdf2Html2PdfFormat(Format): 21 | """ 22 | Use HTML as intermediate format to work with PDFs. 23 | Not loss-free! Layout might be different, but replacements with out-of-vocabulary characters is possible! 
class Pdf2Html2PdfFormat(Format):
    """
    Use HTML as intermediate format to work with PDFs.
    Not loss-free! Layout might be different, but replacements with out-of-vocabulary characters is possible!

    Dependencies:
    - PDF to HTML: poppler-utils
    - HTML to PDF: pdfkit (wrapper for wkhtmltopdf utility to convert HTML to PDF using Webkit)
    """

    html_format = None  # type: HtmlFormat

    def __init__(
        self,
        encoding="utf-8",
        pdf_margin_left="0",
        pdf_margin_right="0",
        pdf_margin_top="0",
        pdf_margin_bottom="0",
        pdf_output_zoom="1.6",
        pdf_input_zoom="1.0",
        pdftohtml_path="pdftohtml",
        wkhtmltopdf_path="wkhtmltopdf",
    ):
        """
        For PDF settings see pdfkit (wkhtmltopdf) documentation
        """
        super().__init__()
        # FIX: page2html was a class-level mutable dict shared by all instances.
        self.page2html = {}
        self.encoding = encoding
        self.html_format = HtmlFormat()
        self.pdf_margin_left = pdf_margin_left
        self.pdf_margin_right = pdf_margin_right
        self.pdf_margin_top = pdf_margin_top
        self.pdf_margin_bottom = pdf_margin_bottom
        self.pdf_output_zoom = pdf_output_zoom
        self.pdf_input_zoom = pdf_input_zoom
        self.pdftohtml_path = pdftohtml_path
        self.wkhtmltopdf_path = wkhtmltopdf_path

    def load(self, bytes_):
        """Convert the PDF to per-page HTML and load page 1 into the HTML wrapper."""
        self.page2html = self.get_html_pages_from_pdf(bytes_)

        # Only the first page is forwarded for now (multi-page support pending).
        logger.info("Loading only a single page")
        first_page_html = self.page2html[1]

        # Replace non-breaking spaces (U+00A0) produced by pdftohtml with plain spaces.
        self.html_format.load(first_page_html.replace("\xa0", " ").encode("utf-8"))

    @property
    def text(self):
        # html to text
        return self.html_format.text

    @property
    def html(self):
        return self.page2html[1]

    @property
    def bytes(self):
        """Generate PDF from HTML bytes with pdfkit (wkhtmltopdf)"""
        html_bytes = self.html_format.bytes

        pdf_bytes = pdfkit.from_string(
            html_bytes.decode(self.encoding),
            False,
            options={
                "load-error-handling": "ignore",
                "load-media-error-handling": "ignore",
                "margin-left": self.pdf_margin_left,
                "margin-right": self.pdf_margin_right,
                "margin-top": self.pdf_margin_top,
                "margin-bottom": self.pdf_margin_bottom,
                "zoom": self.pdf_output_zoom,
                # 'disable-smart-shrinking': '',
            },
        )

        return pdf_bytes

    def add_alter(self, start, end, new_text):
        """Alter only on HTML format"""
        self.html_format.add_alter(start, end, new_text)

    def apply_alters(self):
        """Alter only on HTML format"""
        self.html_format.apply_alters()

    def get_html_pages_from_pdf(self, pdf_bytes) -> Dict[int, str]:
        """
        Converts PDF to HTML with pdftohtml (from poppler-utils: https://poppler.freedesktop.org/)

        :param pdf_bytes: raw PDF content
        :return: Page number => HTML string
        :raises FormatError: if pdftohtml exits with a non-zero status
        """
        page2html = {}
        file_prefix = "pdf"
        tmpdir = tempfile.mkdtemp(prefix="pdftohtml-")

        run_args = [self.pdftohtml_path, "-zoom", self.pdf_input_zoom, "-c", "-", tmpdir + "/" + file_prefix]

        logger.debug(f"Execute poppler-pdftohtml: {run_args}")

        process = run(run_args, stdout=PIPE, input=pdf_bytes)

        if process.returncode != 0:
            raise FormatError("pdftohtml returned error exit code: %s" % process.returncode)

        # Iterate over output files
        tmp_files = os.listdir(tmpdir)
        # FIX: was logger.error - this is routine diagnostic output, not an error.
        logger.debug(tmp_files)

        for fn in tmp_files:
            if fn.startswith(file_prefix + "-") and fn.endswith(".html"):
                # Page file; FIX: log the actual file name, not the constant prefix.
                logger.debug(f"PDF-page file: {fn}")
                page_num = int(fn[len(file_prefix) + 1 : -5])

                with open(os.path.join(tmpdir, fn), "r") as f:
                    html = f.read()

                # Replace body bgcolor + Margin settings
                html = html.replace('bgcolor="#A0A0A0"', 'style="margin: 0; padding: 0;"')

                # Replace image source with base64 encodings so the HTML is self-contained
                def img_src_to_base64(match):
                    img_fn = match.group(1)
                    with open(os.path.join(tmpdir, img_fn), "rb") as image_file:
                        encoded_img = base64.b64encode(image_file.read()).decode("utf-8")

                    return f'src="data:image/png;base64, {encoded_img}"'

                pattern = re.compile(r'src="(.*?)"')  # src="pdf001.png"
                html = pattern.sub(img_src_to_base64, html)

                page2html[page_num] = html

        # Remove temp files
        shutil.rmtree(tmpdir)

        # Ensure page order
        page2html = collections.OrderedDict(sorted(page2html.items()))

        return page2html

    def is_installed(self):
        # Check if the external command-line dependencies are on PATH
        return shutil.which(self.pdftohtml_path) is not None and shutil.which(self.wkhtmltopdf_path) is not None
class RedactorOptions:
    """Redaction and I/O options.

    FIX: the filter containers used to be class attributes, so every
    RedactorOptions instance shared the same dict/lists and mutating one
    instance's filters silently changed all others. They are now created
    per instance in __init__. The attribute names and semantics are unchanged.
    """

    def __init__(self):
        # Input/Output streams for the PDF to redact and the redacted result.
        self.input_stream = None
        self.output_stream = None

        # Metadata filters map names of entries in the PDF Document Information Dictionary
        # (e.g. "Title", "Author", "Subject", "Keywords", "Creator", "Producer", "CreationDate",
        # and "ModDate") to an array of functions to run on the values of those keys.
        #
        # Each function is given the current field value (a pdfrw PdfString, or None if the
        # field is absent) and must return a string, a datetime.datetime (for CreationDate
        # and ModDate), or None to clear the field (unless a later function adds a new value).
        #
        # The functions are run in order; each receives the previous function's return value.
        # A naive datetime must be in UTC (use pytz.timezone.localize for local times).
        #
        # Use "DEFAULT" as a key to apply functions to all metadata fields that have no
        # specific functions defined; use "ALL" to apply functions to all metadata fields,
        # after any field-specific or DEFAULT functions are run.
        self.metadata_filters = {}

        # XMP metadata filters: each is passed the existing XMP data (an
        # xml.etree.Element or None) and returns new XMP metadata of the same
        # type (or None). Called in order, chained like the metadata filters.
        self.xmp_filters = []

        # Controls how XML returned by xmp_filters is serialized. Replace with any
        # function that takes an xml.etree.Element object and returns a (unicode) string.
        self.xmp_serializer = None

        # Content filters run on the combined content streams of the pages and on each
        # annotation's text attributes. Each is a tuple of a compiled regular expression
        # and a function mapping a re.Match to replacement text. Since spaces in PDFs are
        # sometimes encoded as positional offsets, regexes should treat spaces as optional.
        # NOTE: pdfrw doesn't support content stream compression - decompress first (e.g. qpdf).
        self.content_filters = []

        # When replacement text isn't likely to have a glyph stored in the PDF's fonts,
        # replace the character with these other characters (if they don't have the same problem):
        self.content_replacement_glyphs = ["?", "#", "*", " "]

        # Link filters run on link annotations. Each is a function passed the link target
        # (a URI string) and the annotation object; it returns a new URI or None to remove the link.
        self.link_filters = []
107 | update_text_layer(options, *text_layer) 108 | 109 | # Replace page content streams with updated tokens. 110 | apply_updated_text(document, *text_layer) 111 | 112 | # Update annotations. 113 | update_annotations(document, options) 114 | 115 | # Write the PDF back out. 116 | writer = PdfWriter() 117 | writer.trailer = document 118 | writer.write(options.output_stream) 119 | 120 | 121 | def update_metadata(trailer, options): 122 | # Update the PDF's Document Information Dictionary, which contains keys like 123 | # Title, Author, Subject, Keywords, Creator, Producer, CreationDate, and ModDate 124 | # (the latter two containing Date values, the rest strings). 125 | 126 | from pdfrw.objects import PdfString, PdfName 127 | 128 | # Create the metadata dict if it doesn't exist, since the caller may be adding fields. 129 | if not trailer.Info: 130 | trailer.Info = PdfDict() 131 | 132 | # Get a list of all metadata fields that exist in the PDF plus any fields 133 | # that there are metadata filters for (since they may insert field values). 134 | keys = set(str(k)[1:] for k in trailer.Info.keys()) | set( 135 | k for k in options.metadata_filters.keys() if k not in ("DEFAULT", "ALL") 136 | ) 137 | 138 | # Update each metadata field. 139 | for key in keys: 140 | # Get the functions to apply to this field. 141 | functions = options.metadata_filters.get(key) 142 | if functions is None: 143 | # If nothing is defined for this field, use the DEFAULT functions. 144 | functions = options.metadata_filters.get("DEFAULT", []) 145 | 146 | # Append the ALL functions. 147 | functions += options.metadata_filters.get("ALL", []) 148 | 149 | # Run the functions on any existing values. 150 | value = trailer.Info[PdfName(key)] 151 | for f in functions: 152 | # Before passing to the function, convert from a PdfString to a Python string. 153 | if isinstance(value, PdfString): 154 | # decode from PDF's "(...)" syntax. 155 | value = value.decode() 156 | 157 | # Filter the value. 
158 | value = f(value) 159 | 160 | # Convert Python data type to PdfString. 161 | if isinstance(value, str) or (sys.version_info < (3,)): # and isinstance(value, unicode)): 162 | # Convert string to a PdfString instance. 163 | value = PdfString.from_unicode(value) 164 | 165 | elif isinstance(value, datetime): 166 | # Convert datetime into a PDF "D" string format. 167 | value = value.strftime("%Y%m%d%H%M%S%z") 168 | if len(value) == 19: 169 | # If TZ info was included, add an apostrophe between the hour/minutes offsets. 170 | value = value[:17] + "'" + value[17:] 171 | value = PdfString("(D:%s)" % value) 172 | 173 | elif value is None: 174 | # delete the metadata value 175 | pass 176 | 177 | else: 178 | raise ValueError( 179 | "Invalid type of value returned by metadata_filter function. %s was returned by %s." 180 | % (repr(value), f.__name__ or "anonymous function") 181 | ) 182 | 183 | # Replace value. 184 | trailer.Info[PdfName(key)] = value 185 | 186 | 187 | def update_xmp_metadata(trailer, options): 188 | if trailer.Root.Metadata: 189 | # Safely parse the existing XMP data. 190 | from defusedxml.ElementTree import fromstring 191 | 192 | value = fromstring(trailer.Root.Metadata.stream) 193 | else: 194 | # There is no XMP metadata in the document. 195 | value = None 196 | 197 | # Run each filter. 198 | for f in options.xmp_filters: 199 | value = f(value) 200 | 201 | # Set new metadata. 202 | if value is None: 203 | # Clear it. 204 | trailer.Root.Metadata = None 205 | else: 206 | # Serialize the XML and save it into the PDF metadata. 207 | 208 | # Get the serializer. 209 | serializer = options.xmp_serializer 210 | if serializer is None: 211 | # Use a default serializer based on xml.etree.ElementTree.tostring. 212 | def serializer(xml_root): 213 | import xml.etree.ElementTree 214 | 215 | if hasattr(xml.etree.ElementTree, "register_namespace"): 216 | # Beginning with Python 3.2 we can define namespace prefixes. 
class InlineImage(PdfDict):
    def read_data(self, tokens):
        """Consume the raw binary half of an inline image (between ID and EI).

        Scans tokens.fdata from the current position for an "EI" operator that
        is followed by whitespace and plausible content-stream text, and stores
        the bytes before it in self._stream.

        :raises ValueError: if no terminating EI operator is found.
        """
        # "Unless the image uses ASCIIHexDecode or ASCII85Decode as one
        # of its filters, the ID operator should be followed by a
        # single white-space character, and the next character is
        # interpreted as the first byte of image data."
        # Truncate the lexer's current token span accordingly.
        # NOTE(review): relies on pdfrw PdfTokens internals (current/floc/fdata)
        # - confirm against the pdfrw version in use.
        if tokens.current[0][1] > tokens.current[0][0] + 3:
            tokens.current[0] = (tokens.current[0][0], tokens.current[0][0] + 3)

        start = tokens.floc
        end = None  # FIX: initialize so a missing terminator gives a clear error
        state = 0
        whitespace = (" ", "\n", "\r")
        # state 0: image data or trailing whitespace
        # state 1: seen "E"
        # state 2: seen "EI"
        for i in range(start, len(tokens.fdata)):
            if state == 0:
                if tokens.fdata[i] == "E":
                    state = 1
            elif state == 1:
                if tokens.fdata[i] == "I":
                    state = 2
                else:
                    state = 0
            elif state == 2:
                if tokens.fdata[i] in whitespace:
                    # Heuristic: the next few characters after "EI " should be
                    # printable content-stream text; otherwise the "EI" was part
                    # of the binary image data.
                    # FIX: bound the lookahead so it cannot index past the end
                    # of the stream (previously an IndexError near the end).
                    for j in range(i + 1, min(i + 6, len(tokens.fdata))):
                        o = ord(tokens.fdata[j])
                        if o == 0x0A or o == 0x0D:  # \n, \r
                            continue
                        elif o >= 0x20 and o <= 0x7E:  # printable ASCII
                            continue
                        else:
                            state = 0
                            break
                    else:
                        end = i - 3
                        assert tokens.fdata[end] in whitespace
                        break
                else:
                    state = 0

        if end is None:
            # FIX: previously fell through to an UnboundLocalError on `end`.
            raise ValueError("Inline image data is not terminated by an EI operator")

        self._stream = tokens.fdata[start:end]
        tokens.floc = end
303 | content = chunk_pairs(content) 304 | token = constructor(content) 305 | elif token == "BI": 306 | # begins an inline image's dictionary half 307 | stack.append((InlineImage, [])) 308 | continue 309 | elif token == "ID": 310 | # divides an inline image's dictionary half and data half 311 | constructor, content = stack[-1] 312 | content = chunk_pairs(content) 313 | img = constructor(content) 314 | img.read_data(tokens) 315 | stack[-1] = (img, None) 316 | continue 317 | elif token == "EI": 318 | # ends an inline image 319 | token, _ = stack.pop(-1) 320 | 321 | # If we're inside something, add this token to that thing. 322 | if len(stack) > 0: 323 | stack[-1][1].append(token) 324 | continue 325 | 326 | # Yield it. 327 | yield token 328 | 329 | 330 | def build_text_layer(document, options): 331 | # Within each page's content stream, look for text-showing operators to 332 | # find the text content of the page. Construct a string that contains the 333 | # entire text content of the document AND a mapping from characters in the 334 | # text content to tokens in the content streams. That lets us modify the 335 | # tokens in the content streams when we find text that we want to redact. 336 | # 337 | # The text-showing operators are: 338 | # 339 | # (text) Tj -- show a string of text 340 | # (text) ' -- move to next line and show a string of text 341 | # aw ac (text) " -- show a string of text with word/character spacing parameters 342 | # [ ... ] TJ -- show text strings from the array, which are interleaved with spacing parameters 343 | # 344 | # (These operators appear only within BT ... ET so-called "text objects", 345 | # although we don't make use of it.) 346 | # 347 | # But since we don't understand any of the other content stream operators, 348 | # and in particular we don't know how many operands each (non-text) operator 349 | # takes, we can never be sure whether what we see in the content stream is 350 | # an operator or an operand. 
def build_text_layer(document, options):
    """Collect the text content of all pages plus a char->token mapping.

    Returns (text_tokens, page_tokens): text_tokens is the flat list of
    TextToken objects in document order; page_tokens is one token list per
    page from which the content streams can later be rebuilt.
    """
    # Within each page's content stream, look for text-showing operators to
    # find the text content of the page. Construct a string that contains the
    # entire text content of the document AND a mapping from characters in the
    # text content to tokens in the content streams. That lets us modify the
    # tokens in the content streams when we find text that we want to redact.
    #
    # The text-showing operators are:
    #
    #   (text) Tj -- show a string of text
    #   (text) ' -- move to next line and show a string of text
    #   aw ac (text) " -- show a string of text with word/character spacing parameters
    #   [ ... ] TJ -- show text strings from the array, which are interleaved with spacing parameters
    #
    # (These operators appear only within BT ... ET so-called "text objects",
    # although we don't make use of it.)
    #
    # But since we don't understand any of the other content stream operators,
    # and in particular we don't know how many operands each (non-text) operator
    # takes, we can never be sure whether what we see in the content stream is
    # an operator or an operand. If we see a "Tj", maybe it is the operand of
    # some other operator?
    #
    # We'll assume we can get by just fine, however, assuming that whenever we
    # see one of these tokens that it's an operator and not an operand.
    #
    # But TJ remains a little tricky because its operand is an array that preceeds
    # it. Arrays are delimited by square brackets and we need to parse that.
    #
    # We also have to be concerned with the encoding of the text content, which
    # depends on the active font. With a simple font, the text is a string whose
    # bytes are glyph codes. With a composite font, a CMap maps multi-byte
    # character codes to glyphs. In either case, we must map glyphs to unicode
    # characters so that we can pattern match against it.
    #
    # To know the active font, we look for the " Tf" operator.

    from pdfrw import PdfObject, PdfString, PdfArray
    from pdfrw.uncompress import uncompress as uncompress_streams
    from pdfrw.objects.pdfname import BasePdfName

    text_tokens = []
    fontcache = {}

    class TextToken:
        # A mutable string token; `value` may be rewritten by the redactor while
        # `original_value` / `raw_original_value` preserve the input.
        value = None
        font = None

        def __init__(self, value, font):
            self.font = font
            self.raw_original_value = value
            self.original_value = toUnicode(value, font, fontcache)
            self.value = self.original_value

        def __str__(self):
            # __str__ is used for serialization
            if self.value == self.original_value:
                # If unchanged, return the raw original value without decoding/encoding.
                return PdfString.from_bytes(self.raw_original_value)
            else:
                # If the value changed, encode it from Unicode according to the encoding
                # of the font that is active at the location of this token.
                return PdfString.from_bytes(fromUnicode(self.value, self.font, fontcache, options))

        def __repr__(self):
            # __repr__ is used for debugging
            return "Token<%s>" % repr(self.value)

    def process_text(token):
        # Record a text token unless it is empty.
        if token.value == "":
            return
        text_tokens.append(token)

    # For each page...
    page_tokens = []
    for page in document.pages:
        # For each token in the content stream...

        # Remember this page's revised token list.
        token_list = []
        page_tokens.append(token_list)

        if page.Contents is None:
            continue

        prev_token = None
        prev_prev_token = None
        current_font = None

        # The page may have one content stream or an array of content streams.
        # If an array, they are treated as if they are concatenated into a single
        # stream (per the spec).
        if isinstance(page.Contents, PdfArray):
            contents = list(page.Contents)
        else:
            contents = [page.Contents]

        # If a compression Filter is applied, attempt to un-apply it. If an unrecognized
        # filter is present, an error is raised. uncompress_streams expects an array of
        # streams.
        uncompress_streams(contents)

        def make_mutable_string_token(token):
            # NOTE: reads the *current* value of current_font at call time
            # (closure over the loop variable), which is the font active at
            # this point of the stream.
            if isinstance(token, PdfString):
                token = TextToken(token.to_bytes(), current_font)

                # Remember all unicode characters seen in this font so we can
                # avoid inserting characters that the PDF isn't likely to have
                # a glyph for.
                if current_font and current_font.BaseFont:
                    fontcache.setdefault(current_font.BaseFont, set()).update(token.value)
            return token

        # Iterate through the tokens in the page's content streams.
        for token in tokenize_streams(content.stream for content in contents):
            # Replace any string token with our own class that hold a mutable
            # value, which is how we'll rewrite content.
            token = make_mutable_string_token(token)

            # Append the token into a new list that holds all tokens.
            token_list.append(token)

            # If the token is an operator and we're not inside an array...
            if isinstance(token, PdfObject):
                # And it's one that we recognize, process it.
                if token in ("Tj", "'", '"') and isinstance(prev_token, TextToken):
                    # Simple text operators.
                    process_text(prev_token)
                elif token == "TJ" and isinstance(prev_token, PdfArray):
                    # The text array operator.
                    for i in range(len(prev_token)):
                        # (item may not be a string! only the strings are text.)
                        prev_token[i] = make_mutable_string_token(prev_token[i])
                        if isinstance(prev_token[i], TextToken):
                            process_text(prev_token[i])

                elif token == "Tf" and isinstance(prev_prev_token, BasePdfName):
                    # Update the current font.
                    # prev_prev_token holds the font 'name'. The name must be looked up
                    # in the content stream's resource dictionary, which is page.Resources,
                    # plus any resource dictionaries above it in the document hierarchy.
                    current_font = None
                    resources = page.Resources
                    while resources and not current_font:
                        current_font = resources.Font[prev_prev_token]
                        resources = resources.Parent

            # Remember the previously seen token in case the next operator is a text-showing
            # operator -- in which case this was the operand. Remember the token before that
            # because it may be a font name for the Tf operator.
            prev_prev_token = prev_token
            prev_token = token

    return (text_tokens, page_tokens)
480 | prev_prev_token = prev_token 481 | prev_token = token 482 | 483 | return (text_tokens, page_tokens) 484 | 485 | 486 | def chunk_pairs(s): 487 | while len(s) >= 2: 488 | yield (s.pop(0), s.pop(0)) 489 | 490 | 491 | def chunk_triples(s): 492 | while len(s) >= 3: 493 | yield (s.pop(0), s.pop(0), s.pop(0)) 494 | 495 | 496 | class CMap(object): 497 | def __init__(self, cmap): 498 | self.bytes_to_unicode = {} 499 | self.unicode_to_bytes = {} 500 | self.defns = {} 501 | self.usecmap = None 502 | 503 | # Decompress the CMap stream & check that it's not compressed in a way 504 | # we can't understand. 505 | from pdfrw.uncompress import uncompress as uncompress_streams 506 | 507 | uncompress_streams([cmap]) 508 | 509 | # print(cmap.stream, file=sys.stderr) 510 | 511 | # This is based on https://github.com/euske/pdfminer/blob/master/pdfminer/cmapdb.py. 512 | from pdfrw import PdfString, PdfArray 513 | 514 | in_cmap = False 515 | operand_stack = [] 516 | codespacerange = [] 517 | 518 | def code_to_int(code): 519 | # decode hex encoding 520 | code = code.to_bytes() 521 | if sys.version_info < (3,): 522 | code = (ord(c) for c in code) 523 | from functools import reduce 524 | 525 | return reduce(lambda x0, x: x0 * 256 + x, (b for b in code)) 526 | 527 | def add_mapping(code, char, offset=0): 528 | # Is this a mapping for a one-byte or two-byte character code? 529 | width = len(codespacerange[0].to_bytes()) 530 | assert len(codespacerange[1].to_bytes()) == width 531 | if width == 1: 532 | # one-byte entry 533 | if sys.version_info < (3,): 534 | code = chr(code) 535 | else: 536 | code = bytes([code]) 537 | elif width == 2: 538 | if sys.version_info < (3,): 539 | code = chr(code // 256) + chr(code & 255) 540 | else: 541 | code = bytes([code // 256, code & 255]) 542 | else: 543 | raise ValueError("Invalid code space range %s?" % repr(codespacerange)) 544 | 545 | # Some range operands take an array. 
546 | if isinstance(char, PdfArray): 547 | char = char[offset] 548 | 549 | # The Unicode character is given usually as a hex string of one or more 550 | # two-byte Unicode code points. 551 | if isinstance(char, PdfString): 552 | char = char.to_bytes() 553 | if sys.version_info < (3,): 554 | char = (ord(c) for c in char) 555 | 556 | c = "" 557 | for xh, xl in chunk_pairs(list(char)): 558 | c += chr(xh * 256 + xl) 559 | char = c 560 | 561 | if offset > 0: 562 | char = char[0:-1] + chr(ord(char[-1]) + offset) 563 | else: 564 | assert offset == 0 565 | 566 | self.bytes_to_unicode[code] = char 567 | self.unicode_to_bytes[char] = code 568 | 569 | for token in tokenize_streams([cmap.stream]): 570 | if token == "begincmap": 571 | in_cmap = True 572 | operand_stack[:] = [] 573 | continue 574 | elif token == "endcmap": 575 | in_cmap = False 576 | continue 577 | if not in_cmap: 578 | continue 579 | 580 | if token == "def": 581 | name = operand_stack.pop(0) 582 | value = operand_stack.pop(0) 583 | self.defns[name] = value 584 | 585 | elif token == "usecmap": 586 | self.usecmap = self.pop(0) 587 | 588 | elif token == "begincodespacerange": 589 | operand_stack[:] = [] 590 | elif token == "endcodespacerange": 591 | codespacerange = [operand_stack.pop(0), operand_stack.pop(0)] 592 | 593 | elif token in ("begincidrange", "beginbfrange"): 594 | operand_stack[:] = [] 595 | elif token in ("endcidrange", "endbfrange"): 596 | for (code1, code2, cid_or_name1) in chunk_triples(operand_stack): 597 | if not isinstance(code1, PdfString) or not isinstance(code2, PdfString): 598 | continue 599 | code1 = code_to_int(code1) 600 | code2 = code_to_int(code2) 601 | for code in range(code1, code2 + 1): 602 | add_mapping(code, cid_or_name1, code - code1) 603 | operand_stack[:] = [] 604 | 605 | elif token in ("begincidchar", "beginbfchar"): 606 | operand_stack[:] = [] 607 | elif token in ("endcidchar", "endbfchar"): 608 | for (code, char) in chunk_pairs(operand_stack): 609 | if not isinstance(code, 
PdfString): 610 | continue 611 | add_mapping(code_to_int(code), char) 612 | operand_stack[:] = [] 613 | 614 | elif token == "beginnotdefrange": 615 | operand_stack[:] = [] 616 | elif token == "endnotdefrange": 617 | operand_stack[:] = [] 618 | 619 | else: 620 | operand_stack.append(token) 621 | 622 | def dump(self): 623 | for code, char in self.bytes_to_unicode.items(): 624 | print(repr(code), char) 625 | 626 | def decode(self, string): 627 | ret = [] 628 | i = 0 629 | while i < len(string): 630 | if string[i : i + 1] in self.bytes_to_unicode: 631 | # byte matches a single-byte entry 632 | ret.append(self.bytes_to_unicode[string[i : i + 1]]) 633 | i += 1 634 | elif string[i : i + 2] in self.bytes_to_unicode: 635 | # next two bytes matches a multi-byte entry 636 | ret.append(self.bytes_to_unicode[string[i : i + 2]]) 637 | i += 2 638 | else: 639 | ret.append("?") 640 | i += 1 641 | return "".join(ret) 642 | 643 | def encode(self, string): 644 | ret = [] 645 | for c in string: 646 | ret.append(self.unicode_to_bytes.get(c, b"")) 647 | return b"".join(ret) 648 | 649 | 650 | def toUnicode(string, font, fontcache): 651 | # This is hard! 652 | 653 | if not font: 654 | # There is no font for this text. Assume Latin-1. 655 | return string.decode("Latin-1") 656 | elif font.ToUnicode: 657 | # Decompress the CMap stream & check that it's not compressed in a way 658 | # we can't understand. 659 | from pdfrw.uncompress import uncompress as uncompress_streams 660 | 661 | uncompress_streams([font.ToUnicode]) 662 | 663 | # Use the CMap, which maps character codes to Unicode code points. 
664 | if font.ToUnicode.stream not in fontcache: 665 | fontcache[font.ToUnicode.stream] = CMap(font.ToUnicode) 666 | cmap = fontcache[font.ToUnicode.stream] 667 | 668 | string = cmap.decode(string) 669 | # print(string, end='', file=sys.stderr) 670 | # sys.stderr.write(string) 671 | return string 672 | elif font.Encoding == "/WinAnsiEncoding": 673 | return string.decode("cp1252", "replace") 674 | elif font.Encoding == "/MacRomanEncoding": 675 | return string.decode("mac_roman", "replace") 676 | else: 677 | return "?" 678 | # raise ValueError("Don't know how to decode data from font %s." % font) 679 | 680 | 681 | def fromUnicode(string, font, fontcache, options): 682 | # Filter out characters that are not likely to have renderable glyphs 683 | # because the character didn't occur in the original PDF in its font. 684 | # For any character that didn't occur in the original PDF, replace it 685 | # with the first character in options.content_replacement_glyphs that 686 | # did occur in the original PDF. If none occurred, delete the character. 687 | if font and font.BaseFont in fontcache: 688 | char_occurs = fontcache[font.BaseFont] 689 | 690 | def map_char(c): 691 | for cc in [c] + options.content_replacement_glyphs: 692 | if cc in char_occurs: 693 | return cc 694 | return "" # no replacement glyph => omit character 695 | 696 | string = "".join(map_char(c) for c in string) 697 | 698 | # Encode the Unicode string in the same encoding that it was originally 699 | # stored in --- based on the font that was active when the token was 700 | # used in a text-showing operation. 701 | if not font: 702 | # There was no font for this text. Assume Latin-1. 703 | return string.encode("Latin-1") 704 | 705 | elif font.ToUnicode and font.ToUnicode.stream in fontcache: 706 | # Convert the Unicode code points back to one/two-byte CIDs. 707 | cmap = fontcache[font.ToUnicode.stream] 708 | return cmap.encode(string) 709 | 710 | # Convert using a simple encoding. 
711 | elif font.Encoding == "/WinAnsiEncoding": 712 | return string.encode("cp1252") 713 | elif font.Encoding == "/MacRomanEncoding": 714 | return string.encode("mac_roman") 715 | 716 | # Don't know how to handle this sort of font. 717 | else: 718 | raise ValueError("Don't know how to encode data to font %s." % font) 719 | 720 | 721 | def update_text_layer(options, text_tokens, page_tokens): 722 | if len(text_tokens) == 0: 723 | # No text content. 724 | return 725 | 726 | # Apply each regular expression to the text content... 727 | for pattern, function in options.content_filters: 728 | # Finding all matches... 729 | text_tokens_index = 0 730 | text_tokens_charpos = 0 731 | text_tokens_token_xdiff = 0 732 | text_content = "".join(t.value for t in text_tokens) 733 | for m in pattern.finditer(text_content): 734 | # We got a match at text_content[i1:i2]. 735 | i1 = m.start() 736 | i2 = m.end() 737 | 738 | # Pass the matched text to the replacement function to get replaced text. 739 | replacement = function(m) 740 | 741 | # Do a text replacement in the tokens that produced this text content. 742 | # It may have been produced by multiple tokens, so loop until we find them all. 743 | while i1 < i2: 744 | # Find the original tokens in the content stream that 745 | # produced the matched text. Start by advancing over any 746 | # tokens that are entirely before this span of text. 747 | while ( 748 | text_tokens_index < len(text_tokens) 749 | and text_tokens_charpos + len(text_tokens[text_tokens_index].value) - text_tokens_token_xdiff <= i1 750 | ): 751 | text_tokens_charpos += len(text_tokens[text_tokens_index].value) - text_tokens_token_xdiff 752 | text_tokens_index += 1 753 | text_tokens_token_xdiff = 0 754 | if text_tokens_index == len(text_tokens): 755 | break 756 | assert text_tokens_charpos <= i1 757 | 758 | # The token at text_tokens_index, and possibly subsequent ones, 759 | # are responsible for this text. 
Replace the matched content 760 | # here with replacement content. 761 | tok = text_tokens[text_tokens_index] 762 | 763 | # Where does this match begin within the token's text content? 764 | mpos = i1 - text_tokens_charpos 765 | assert mpos >= 0 766 | 767 | # How long is the match within this token? 768 | mlen = min(i2 - i1, len(tok.value) - text_tokens_token_xdiff - mpos) 769 | assert mlen >= 0 770 | 771 | # How much should we replace here? 772 | if mlen < (i2 - i1): 773 | # There will be more replaced later, so take the same number 774 | # of characters from the replacement text. 775 | r = replacement[:mlen] 776 | replacement = replacement[mlen:] 777 | else: 778 | # This is the last token in which we'll replace text, so put 779 | # all of the remaining replacement content here. 780 | r = replacement 781 | replacement = None # sanity 782 | 783 | # Do the replacement. 784 | tok.value = ( 785 | tok.value[: mpos + text_tokens_token_xdiff] + r + tok.value[mpos + mlen + text_tokens_token_xdiff :] 786 | ) 787 | text_tokens_token_xdiff += len(r) - mlen 788 | 789 | # Advance for next iteration. 790 | i1 += mlen 791 | 792 | 793 | def apply_updated_text(document, text_tokens, page_tokens): 794 | # Create a new content stream for each page by concatenating the 795 | # tokens in the page_tokens lists. 796 | from pdfrw import PdfArray 797 | 798 | for i, page in enumerate(document.pages): 799 | if page.Contents is None: 800 | continue # nothing was here 801 | 802 | # Replace the page's content stream with our updated tokens. 803 | # The content stream may have been an array of streams before, 804 | # so replace the whole thing with a single new stream. Unfortunately 805 | # the str on PdfArray and PdfDict doesn't work right. 
806 | def tok_str(tok): 807 | if isinstance(tok, PdfArray): 808 | return "[ " + " ".join(tok_str(x) for x in tok) + "] " 809 | if isinstance(tok, InlineImage): 810 | return "BI " + " ".join(tok_str(x) + " " + tok_str(y) for x, y in tok.items()) + " ID " + tok.stream + " EI " 811 | if isinstance(tok, PdfDict): 812 | return "<< " + " ".join(tok_str(x) + " " + tok_str(y) for x, y in tok.items()) + ">> " 813 | return str(tok) 814 | 815 | page.Contents = PdfDict() 816 | page.Contents.stream = "\n".join(tok_str(tok) for tok in page_tokens[i]) 817 | page.Contents.Length = len(page.Contents.stream) # reset 818 | 819 | 820 | def update_annotations(document, options): 821 | for page in document.pages: 822 | if hasattr(page, "Annots") and isinstance(page.Annots, list): 823 | for annotation in page.Annots: 824 | update_annotation(annotation, options) 825 | 826 | 827 | def update_annotation(annotation, options): 828 | from pdfrw.objects import PdfString 829 | 830 | # Contents holds a plain-text representation of the annotation 831 | # content, such as for accessibility. All annotation types may 832 | # have a Contents. NM holds the "annotation name" which also 833 | # could have redactable text, I suppose. Markup annotations have 834 | # "T" fields that hold a title / text label. Subj holds a 835 | # comment subject. CA, RC, and AC are used in widget annotations. 836 | for string_field in ("Contents", "NM", "T", "Subj", "CA", "RC", "AC"): 837 | if getattr(annotation, string_field): 838 | value = getattr(annotation, string_field).to_unicode() 839 | for pattern, function in options.content_filters: 840 | value = pattern.sub(function, value) 841 | setattr(annotation, string_field, PdfString.from_unicode(value)) 842 | 843 | # A rich-text stream. Not implemented. Bail so that we don't 844 | # accidentally leak something that should be redacted. 
845 | if annotation.RC: 846 | raise ValueError("Annotation rich-text streams (Annot/RC) are not supported.") 847 | 848 | # An action, usually used for links. 849 | if annotation.A: 850 | update_annotation_action(annotation, annotation.A, options) 851 | if annotation.PA: 852 | update_annotation_action(annotation, annotation.PA, options) 853 | 854 | # If set, another annotation. 855 | if annotation.Popup: 856 | update_annotation(annotation.Popup, options) 857 | 858 | 859 | # TODO? Redaction annotations have some other attributes that might 860 | # have text. But since they're intended for redaction... maybe we 861 | # should keep them anyway. 862 | 863 | 864 | def update_annotation_action(annotation, action, options): 865 | from pdfrw.objects import PdfString 866 | 867 | if action.URI and options.link_filters: 868 | value = action.URI.to_unicode() 869 | for func in options.link_filters: 870 | value = func(value, annotation) 871 | if value is None: 872 | # Remove annotation by suppressing the action. 873 | action.URI = None 874 | else: 875 | action.URI = PdfString.from_unicode(value) 876 | 877 | if action.Next: 878 | # May be an Action or array of Actions to execute next. 
879 | next_action = action.Next 880 | if isinstance(action.Next, dict): 881 | next_action = [action.Next] 882 | for a in next_action: 883 | update_annotation_action(annotation, a, options) 884 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 127 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Dependencies 2 | pdfrw==0.4 3 | wkhtmltopdf==0.2 4 | pdfkit==0.6.1 5 | defusedxml==0.6.0 6 | beautifulsoup4==4.9.1 7 | 8 | # Development dependencies 9 | chardet==3.0.4 10 | pre-commit==2.2.0 11 | pytest==5.4.1 12 | pytest-cov==2.8.1 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | from setuptools import find_packages 4 | 5 | setup( 6 | name="expose-text", 7 | version="0.1.6", 8 | url="https://openredact.org/", 9 | author="Jonas Langhabel, Malte Ostendorff", 10 | author_email="hello@openredact.org", 11 | packages=find_packages(exclude=["tests"]), 12 | include_package_data=True, 13 | license="MIT", 14 | description="A Python module that exposes text for modification in multiple file types.", 15 | long_description=open("README.md").read(), 16 | long_description_content_type="text/markdown", 17 | install_requires=["pdfrw==0.4", "defusedxml==0.6.0", "beautifulsoup4==4.9.1", "wkhtmltopdf==0.2", "pdfkit==0.6.1"], 18 | python_requires=">=3.7", 19 | ) 20 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logging.basicConfig( 4 | format="%(asctime)s - 
%(levelname)s - %(name)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, 5 | ) 6 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | 6 | @pytest.fixture 7 | def test_files(): 8 | return Path(__file__).parent / "files" 9 | -------------------------------------------------------------------------------- /tests/files/doctest.txt: -------------------------------------------------------------------------------- 1 | This is the content as string. -------------------------------------------------------------------------------- /tests/files/doctest_altered.txt: -------------------------------------------------------------------------------- 1 | That is the new content as string! -------------------------------------------------------------------------------- /tests/files/foo.bar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openredact/expose-text/c8e21774d0ca2f0103e91b50ea093c87a3738a3e/tests/files/foo.bar -------------------------------------------------------------------------------- /tests/files/pdf/doc.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openredact/expose-text/c8e21774d0ca2f0103e91b50ea093c87a3738a3e/tests/files/pdf/doc.pdf -------------------------------------------------------------------------------- /tests/files/test.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openredact/expose-text/c8e21774d0ca2f0103e91b50ea093c87a3738a3e/tests/files/test.docx -------------------------------------------------------------------------------- /tests/files/test.html: -------------------------------------------------------------------------------- 1 | 2 | 
3 | 4 | 5 | 6 | 7 | 8 | This is a test - The title is not considered by expose-text 9 | 10 | 11 | 12 |
This is some kind of header
13 |
14 |

And now some content…
well not very much.

15 |

This sentence will be replaced.

16 |
17 | 20 |
One more line with an ümlaut.
21 | 22 | 23 | -------------------------------------------------------------------------------- /tests/files/test.txt: -------------------------------------------------------------------------------- 1 | This is a test file. 2 | 3 | With multiple lines. 4 | 5 | See if you can change its content. 6 | -------------------------------------------------------------------------------- /tests/files/test_altered.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openredact/expose-text/c8e21774d0ca2f0103e91b50ea093c87a3738a3e/tests/files/test_altered.docx -------------------------------------------------------------------------------- /tests/files/test_altered.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | This is a test - The title is not considered by expose-text 9 | 10 | 11 | 12 |
This is some kind of header
13 |
14 |

And now some content…
well not very much.

15 |

A new sentence.

16 |
17 | 20 |
One more line with an ümlaut.
21 | 22 | 23 | -------------------------------------------------------------------------------- /tests/files/test_altered.txt: -------------------------------------------------------------------------------- 1 | This is a test file. With a single line. See if you can change its content. 2 | -------------------------------------------------------------------------------- /tests/files/tmp/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openredact/expose-text/c8e21774d0ca2f0103e91b50ea093c87a3738a3e/tests/files/tmp/.gitkeep -------------------------------------------------------------------------------- /tests/test_alterations_buffer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from expose_text.formats._utils import AlterationsBuffer 4 | 5 | 6 | @pytest.fixture() 7 | def buffer(): 8 | return AlterationsBuffer() 9 | 10 | 11 | def test_invalid_type(buffer): 12 | with pytest.raises(TypeError): 13 | buffer += 0 14 | 15 | with pytest.raises(TypeError): 16 | buffer += (1, 2) 17 | 18 | 19 | def test_overlapping_alterations(buffer): 20 | buffer.add(5, 15, "luke") 21 | 22 | with pytest.raises(ValueError): 23 | buffer.add(0, 10, "vader") 24 | 25 | with pytest.raises(ValueError): 26 | buffer.add(10, 20, "obi") 27 | 28 | 29 | def test_non_overlapping_corner_cases(buffer): 30 | buffer.add(5, 15, "anakin") # existing one 31 | 32 | buffer.add(0, 5, "jango") 33 | buffer.add(15, 20, "boba") 34 | assert len(buffer) == 3 35 | 36 | 37 | def test_sorting(buffer): 38 | buffer.add(0, 5, "yoda") 39 | buffer.add(20, 25, "jarjar") 40 | buffer.add(10, 15, "kenobi") 41 | buffer.sort() 42 | assert list(buffer) == [(0, 5, "yoda"), (10, 15, "kenobi"), (20, 25, "jarjar")] 43 | -------------------------------------------------------------------------------- /tests/test_apply_buffer_to_text.py: 
-------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from expose_text.formats._utils import apply_buffer_to_text, AlterationsBuffer 4 | 5 | 6 | @pytest.fixture 7 | def text(): 8 | return "This is the content of a text file.\n\nWith multiple lines.\n\nTry alter me." 9 | 10 | 11 | @pytest.fixture 12 | def buffer(): 13 | return AlterationsBuffer() 14 | 15 | 16 | def test_replace_text(buffer, text): 17 | buffer.add(0, 4, "That") 18 | altered_text = apply_buffer_to_text(buffer, text) 19 | assert altered_text == "That is the content of a text file.\n\nWith multiple lines.\n\nTry alter me." 20 | 21 | 22 | def test_remove_text(buffer, text): 23 | buffer.add(35, 59, " ") 24 | altered_text = apply_buffer_to_text(buffer, text) 25 | assert altered_text == "This is the content of a text file. Try alter me." 26 | -------------------------------------------------------------------------------- /tests/test_auto_pdf_format.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from expose_text import FileWrapper 6 | from expose_text.formats.pdf.pdf2html2pdf import Pdf2Html2PdfFormat 7 | 8 | black_square = u"\u25A0" 9 | 10 | 11 | @pytest.fixture 12 | def tmp_files(): 13 | return Path(__file__).parent / "files" / "tmp" 14 | 15 | 16 | @pytest.fixture 17 | def test_files(): 18 | return Path(__file__).parent / "files" / "pdf" 19 | 20 | 21 | def test_pdf_text(tmp_files, test_files): 22 | """ 23 | 24 | Run this test alone: pytest -s tests/test_pdf_format.py 25 | 26 | """ 27 | input_fp = test_files / "doc.pdf" 28 | output_fp = tmp_files / "doc.altered.pdf" 29 | 30 | fw = FileWrapper(input_fp) 31 | 32 | print(fw.text[:100]) 33 | 34 | fw.add_alter(0, 9, "Deutscher") # replace "Deutscher" 35 | fw.apply_alters() 36 | 37 | print("xxx") 38 | 39 | print(fw.text[:100]) 40 | 41 | # assert "XXXXXXX" == fw.text[0:7] # TODO there is something wrong with 
indexing 42 | 43 | fw.save(output_fp) 44 | 45 | 46 | def test_check_dependencies(): 47 | print(Pdf2Html2PdfFormat().is_installed()) 48 | -------------------------------------------------------------------------------- /tests/test_docx_format.py: -------------------------------------------------------------------------------- 1 | import filecmp 2 | 3 | import pytest 4 | 5 | from expose_text import FileWrapper 6 | from expose_text.formats._docx import DocxFormat 7 | 8 | ENCODING = "UTF-8" 9 | 10 | 11 | @pytest.fixture 12 | def docx_bytes(test_files): 13 | with open(test_files / "test.docx", "rb") as f: 14 | return f.read() 15 | 16 | 17 | @pytest.fixture 18 | def docx_text(): 19 | return """Title 20 | 21 | Some body lines. 22 | 23 | A text in different colors and styles. 24 | 25 | This is a paragraph with a line 26 | break and nasty tags.""" 27 | 28 | 29 | @pytest.fixture 30 | def format_cls(docx_bytes): 31 | format_cls = DocxFormat() 32 | format_cls.load(docx_bytes) 33 | return format_cls 34 | 35 | 36 | @pytest.fixture 37 | def replace(): 38 | def function(string, start, stop, new_content): 39 | return string[:start] + new_content + string[stop:] 40 | 41 | return function 42 | 43 | 44 | def test_text_property(format_cls, docx_text): 45 | assert format_cls.text == docx_text 46 | 47 | 48 | def test_bytes_property(format_cls, docx_text): 49 | format_again = DocxFormat() 50 | format_again.load(format_cls.bytes) 51 | assert format_again.text == docx_text 52 | 53 | 54 | def test_replacing_with_longer_text(format_cls, docx_text, replace): 55 | args = 25, 63, "This is the replaced line." 
56 | format_cls.add_alter(*args) 57 | format_cls.apply_alters() 58 | assert format_cls.text == replace(docx_text, *args) 59 | 60 | 61 | def test_replacing_with_shorter_text(format_cls, docx_text, replace): 62 | args = 7, 23, "XXX" 63 | format_cls.add_alter(*args) 64 | format_cls.apply_alters() 65 | assert format_cls.text == replace(docx_text, *args) 66 | 67 | 68 | def test_removing_text(format_cls, docx_text, replace): 69 | args = 64, 124, "" 70 | format_cls.add_alter(*args) 71 | format_cls.apply_alters() 72 | assert format_cls.text == replace(docx_text, *args) 73 | 74 | 75 | def test_alter_file(test_files, tmp_path): 76 | file_path = test_files / "test.docx" 77 | altered_file_path = test_files / "test_altered.docx" 78 | tmp_out_path = tmp_path / "test_out.docx" 79 | 80 | file_wrapper = FileWrapper(file_path) 81 | file_wrapper.add_alter(7, 23, "XXX") 82 | file_wrapper.add_alter(25, 63, "This is the replaced line.") 83 | file_wrapper.add_alter(64, 124, "") 84 | file_wrapper.apply_alters() 85 | file_wrapper.save(tmp_out_path) 86 | 87 | assert ( 88 | file_wrapper.text 89 | == """Title 90 | 91 | XXX 92 | 93 | This is the replaced line. 
94 | """ 95 | ) 96 | assert filecmp.cmp(altered_file_path, tmp_out_path, shallow=False) 97 | -------------------------------------------------------------------------------- /tests/test_file_wrapper.py: -------------------------------------------------------------------------------- 1 | import filecmp 2 | 3 | import pytest 4 | 5 | from expose_text import FileWrapper, UnsupportedFormat 6 | 7 | 8 | def test_unsupported_format(test_files): 9 | with pytest.raises(UnsupportedFormat): 10 | FileWrapper(test_files / "foo.bar") 11 | 12 | 13 | def test_load_and_save_for_path(test_files, tmp_path): 14 | file_path = test_files / "test.txt" 15 | result_path = tmp_path / "test_out.txt" 16 | 17 | file_wrapper = FileWrapper(file_path) 18 | file_wrapper.save(result_path) 19 | 20 | assert filecmp.cmp(file_path, result_path, shallow=False) 21 | 22 | 23 | def test_load_and_save_for_string(test_files, tmp_path): 24 | file_path = test_files / "test.txt" 25 | result_path = tmp_path / "test_out.txt" 26 | 27 | file_wrapper = FileWrapper(str(file_path)) 28 | file_wrapper.save(str(result_path)) 29 | 30 | assert filecmp.cmp(file_path, result_path, shallow=False) 31 | 32 | 33 | def test_alter_file(test_files, tmp_path): 34 | file_path = test_files / "test.txt" 35 | altered_file_path = test_files / "test_altered.txt" 36 | tmp_out_path = tmp_path / "test_out.txt" 37 | 38 | file_wrapper = FileWrapper(file_path) 39 | file_wrapper.add_alter(20, 44, " With a single line. 
") 40 | file_wrapper.apply_alters() 41 | file_wrapper.save(tmp_out_path) 42 | 43 | assert filecmp.cmp(altered_file_path, tmp_out_path, shallow=False) 44 | -------------------------------------------------------------------------------- /tests/test_html_format.py: -------------------------------------------------------------------------------- 1 | import filecmp 2 | 3 | import pytest 4 | 5 | from expose_text import FileWrapper 6 | from expose_text.formats._html import HtmlFormat 7 | 8 | ENCODING = "UTF-8" 9 | 10 | 11 | @pytest.fixture 12 | def html_snippet(): 13 | return """

German paragraph

\n

1. … macht mich glücklich

""".encode(ENCODING) 14 | 15 | 16 | @pytest.fixture 17 | def format_cls(html_snippet): 18 | format_cls = HtmlFormat() 19 | format_cls.load(html_snippet) 20 | return format_cls 21 | 22 | 23 | def test_text_property(format_cls): 24 | assert format_cls.text == "German paragraph\n1. … macht mich glücklich" 25 | 26 | 27 | def test_bytes_property(format_cls): 28 | assert format_cls.bytes == '

German paragraph

\n' "

1. … macht mich glücklich

".encode( 29 | ENCODING 30 | ) 31 | 32 | 33 | def test_unescaping_html(): 34 | html_bytes = '

<>&

\n

… macht mich glücklich

'.encode(ENCODING) 35 | format_cls = HtmlFormat() 36 | format_cls.load(html_bytes) 37 | assert format_cls.text == "<>&\n… macht mich glücklich" 38 | assert format_cls.bytes == '

<>&

\n' "

… macht mich glücklich

".encode( 39 | ENCODING 40 | ) 41 | 42 | 43 | def test_same_length_replacing(format_cls): 44 | format_cls.add_alter(0, 6, "XXXXXX") 45 | format_cls.apply_alters() 46 | assert format_cls.text == "XXXXXX paragraph\n1. … macht mich glücklich" 47 | assert format_cls.bytes == '

XXXXXX paragraph

\n' "

1. … macht mich glücklich

".encode( 48 | ENCODING 49 | ) 50 | 51 | 52 | def test_replacing_with_longer_text(format_cls): 53 | format_cls.add_alter(0, 6, "XXXXXXXXX") 54 | format_cls.apply_alters() 55 | assert format_cls.text == "XXXXXXXXX paragraph\n1. … macht mich glücklich" 56 | assert ( 57 | format_cls.bytes == '

XXXXXXXXX paragraph

\n' 58 | "

1. … macht mich glücklich

".encode(ENCODING) 59 | ) 60 | 61 | 62 | def test_replacing_with_shorter_text(format_cls): 63 | format_cls.add_alter(0, 6, "XXX") 64 | format_cls.apply_alters() 65 | assert format_cls.text == "XXX paragraph\n1. … macht mich glücklich" 66 | assert format_cls.bytes == '

XXX paragraph

\n' "

1. … macht mich glücklich

".encode( 67 | ENCODING 68 | ) 69 | 70 | 71 | def test_removing_text(format_cls): 72 | format_cls.add_alter(0, 7, "") 73 | format_cls.apply_alters() 74 | assert format_cls.text == "paragraph\n1. … macht mich glücklich" 75 | assert format_cls.bytes == '

paragraph

\n' "

1. … macht mich glücklich

".encode( 76 | ENCODING 77 | ) 78 | 79 | 80 | def test_removing_entire_content_of_element(format_cls): 81 | format_cls.add_alter(0, 16, "") 82 | format_cls.apply_alters() 83 | assert format_cls.text == "\n1. … macht mich glücklich" 84 | assert format_cls.bytes == '

\n' "

1. … macht mich glücklich

".encode(ENCODING) 85 | 86 | 87 | def test_removing_over_element_borders(format_cls): 88 | format_cls.add_alter(0, 20, "") 89 | format_cls.apply_alters() 90 | assert format_cls.text == "… macht mich glücklich" 91 | assert format_cls.bytes == '

\n' "

… macht mich glücklich

".encode(ENCODING) 92 | 93 | 94 | def test_replacing_over_element_borders(format_cls): 95 | format_cls.add_alter(0, 20, "All content goes in the first element. ") 96 | format_cls.apply_alters() 97 | assert format_cls.text == "All content goes in the first element. … macht mich glücklich" 98 | assert ( 99 | format_cls.bytes == '

All content goes in the first element.

\n' 100 | "

… macht mich glücklich

".encode(ENCODING) 101 | ) 102 | 103 | 104 | def test_escaping_html_characters(format_cls): 105 | format_cls.add_alter(0, 6, "") 106 | format_cls.apply_alters() 107 | assert format_cls.text == " paragraph\n1. … macht mich glücklich" 108 | assert ( 109 | format_cls.bytes == '

<Language> paragraph

\n' 110 | "

1. … macht mich glücklich

".encode(ENCODING) 111 | ) 112 | 113 | 114 | def test_umlauts(format_cls): 115 | format_cls.add_alter(20, 21, "Ein Äffchen") 116 | format_cls.apply_alters() 117 | assert format_cls.text == "German paragraph\n1. Ein Äffchen macht mich glücklich" 118 | assert ( 119 | format_cls.bytes == '

German paragraph

\n' 120 | "

1. Ein Äffchen macht mich glücklich

".encode(ENCODING) 121 | ) 122 | 123 | 124 | def test_chained_alterations(format_cls): 125 | format_cls.add_alter(33, 42, "froh") 126 | format_cls.add_alter(7, 19, "Paragraph:") 127 | format_cls.add_alter(0, 6, "Deutscher") 128 | format_cls.add_alter(20, 21, "Essen") 129 | format_cls.apply_alters() 130 | assert format_cls.text == "Deutscher Paragraph: Essen macht mich froh" 131 | assert format_cls.bytes == '

Deutscher Paragraph:

\n' "

Essen macht mich froh

".encode( 132 | ENCODING 133 | ) 134 | 135 | 136 | def test_altering_html_body(): 137 | html_bytes = ( 138 | '

German paragraph

\n

1. … macht mich glücklich

' 139 | ).encode(ENCODING) 140 | format_cls = HtmlFormat() 141 | format_cls.load(html_bytes) 142 | print(format_cls.text) 143 | format_cls.add_alter(0, 6, "Deutscher") 144 | format_cls.add_alter(20, 21, "Essen") 145 | format_cls.apply_alters() 146 | assert format_cls.text == "Deutscher paragraph\n1. Essen macht mich glücklich" 147 | assert ( 148 | format_cls.bytes == '

Deutscher paragraph

\n' 149 | "

1. Essen macht mich glücklich

".encode(ENCODING) 150 | ) 151 | 152 | 153 | def test_altering_html_document(): 154 | html_bytes = """ 155 | 156 | 157 | 158 | 159 | 160 | The title is not considered. 161 | 162 | 163 | 164 | 165 | 166 |
167 |

German paragraph

168 |

1. … macht mich glücklich

169 |
170 | 171 | """.encode( 172 | ENCODING 173 | ) 174 | format_cls = HtmlFormat() 175 | format_cls.load(html_bytes) 176 | format_cls.add_alter(0, 6, "Deutscher") 177 | format_cls.add_alter(20, 21, "Essen") 178 | format_cls.apply_alters() 179 | assert format_cls.text == "Deutscher paragraph\n1. Essen macht mich glücklich" 180 | assert ( 181 | format_cls.bytes 182 | == """ 183 | 184 | 185 | 186 | 187 | 188 | The title is not considered. 189 | 190 | 191 | 192 | 193 | 194 |
195 |

Deutscher paragraph

196 |

1. Essen macht mich glücklich

197 |
198 | 199 | """.encode( 200 | ENCODING 201 | ) 202 | ) 203 | 204 | 205 | def test_alter_file(test_files, tmp_path): 206 | file_path = test_files / "test.html" 207 | altered_file_path = test_files / "test_altered.html" 208 | tmp_out_path = tmp_path / "test_out.html" 209 | 210 | file_wrapper = FileWrapper(file_path) 211 | file_wrapper.add_alter(71, 102, "A new sentence.") 212 | file_wrapper.apply_alters() 213 | file_wrapper.save(tmp_out_path) 214 | 215 | assert filecmp.cmp(altered_file_path, tmp_out_path, shallow=False) 216 | -------------------------------------------------------------------------------- /tests/test_pdf2html2pdf_format.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from expose_text import FileWrapper 6 | from expose_text.formats.pdf.auto_pdf import AutoPdfFormat 7 | 8 | black_square = u"\u25A0" 9 | 10 | 11 | @pytest.fixture 12 | def tmp_files(): 13 | return Path(__file__).parent / "files" / "tmp" 14 | 15 | 16 | @pytest.fixture 17 | def test_files(): 18 | return Path(__file__).parent / "files" / "pdf" 19 | 20 | 21 | def test_pdf_text(tmp_files, test_files): 22 | """ 23 | 24 | Run this test alone: pytest -s tests/test_pdf2html2pdf_format.py 25 | 26 | """ 27 | input_fp = test_files / "doc.pdf" 28 | output_fp = tmp_files / "doc.altered.pdf" 29 | 30 | fw = FileWrapper(input_fp, AutoPdfFormat) 31 | 32 | print("Before: %s" % fw.text[:25]) 33 | 34 | # fw.add_alter(0, 9, "Deutscher") # replace "Deutscher" 35 | fw.add_alter(0, 9, "".join(10 * [black_square])) # replace "Deutscher" 36 | fw.apply_alters() 37 | 38 | print("After: %s" % fw.text[:25]) 39 | 40 | fw.save(output_fp) 41 | 42 | print("Type: %s" % type(fw.file)) 43 | -------------------------------------------------------------------------------- /tests/test_pdf_format.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 
| from expose_text import FileWrapper 6 | from expose_text.formats.pdf.pdf2html2pdf import Pdf2Html2PdfFormat 7 | 8 | black_square = u"\u25A0" 9 | 10 | 11 | @pytest.fixture 12 | def tmp_files(): 13 | return Path(__file__).parent / "files" / "tmp" 14 | 15 | 16 | @pytest.fixture 17 | def test_files(): 18 | return Path(__file__).parent / "files" / "pdf" 19 | 20 | 21 | def test_pdf_text(tmp_files, test_files): 22 | """ 23 | 24 | Run this test alone: pytest -s tests/test_pdf_format.py 25 | 26 | """ 27 | input_fp = test_files / "doc.pdf" 28 | output_fp = tmp_files / "doc.altered.pdf" 29 | 30 | fw = FileWrapper(input_fp) 31 | 32 | print(fw.text[:100]) 33 | 34 | fw.add_alter(0, 9, "Deutscher") # replace "Deutscher" 35 | fw.apply_alters() 36 | 37 | print("xxx") 38 | 39 | print(fw.text[:100]) 40 | 41 | # assert "XXXXXXX" == fw.text[0:7] # TODO there is something wrong with indexing 42 | 43 | fw.save(output_fp) 44 | 45 | 46 | def test_check_dependencies(): 47 | print(Pdf2Html2PdfFormat().is_installed()) 48 | -------------------------------------------------------------------------------- /tests/test_txt_format.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from expose_text.formats._txt import TxtFormat 4 | 5 | ENCODING = "UTF-8" 6 | 7 | 8 | @pytest.fixture 9 | def txt_bytes(): 10 | return b"This is the content of a text file.\n\nWith multiple lines.\n\nTry alter me." 
11 | 12 | 13 | @pytest.fixture() 14 | def format_cls(txt_bytes): 15 | format_cls = TxtFormat() 16 | format_cls.load(txt_bytes) 17 | return format_cls 18 | 19 | 20 | def test_text_property(format_cls, txt_bytes): 21 | assert format_cls.text == txt_bytes.decode(ENCODING) 22 | 23 | 24 | def test_bytes_property(format_cls, txt_bytes): 25 | assert format_cls.bytes == txt_bytes 26 | 27 | 28 | def test_alterations(format_cls): 29 | # all indices are for the original string (in_bytes) 30 | format_cls.add_alter(0, 4, "That") 31 | format_cls.add_alter(35, 59, " ") 32 | format_cls.add_alter(63, 68, "change") 33 | format_cls.apply_alters() 34 | assert format_cls.text == "That is the content of a text file. Try change me." 35 | assert format_cls.bytes == b"That is the content of a text file. Try change me." 36 | 37 | 38 | @pytest.mark.parametrize("encoding", ["utf-8", "utf-16", "latin-1", "windows-1252"]) 39 | def test_encodings(encoding): 40 | encoded_string = "¾ der Mäuse sind weiß. The bread costs 7$.".encode(encoding) 41 | format_cls = TxtFormat() 42 | format_cls.load(encoded_string) 43 | assert format_cls.bytes == encoded_string 44 | assert format_cls.text == encoded_string.decode(encoding) 45 | --------------------------------------------------------------------------------