├── .flake8 ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── README.md ├── docs └── expose-text.png ├── expose_text ├── __init__.py ├── core.py ├── exceptions.py └── formats │ ├── __init__.py │ ├── _docx.py │ ├── _html.py │ ├── _pdf.py │ ├── _txt.py │ ├── _utils.py │ ├── base.py │ ├── markup │ ├── __init__.py │ └── utils.py │ └── pdf │ ├── __init__.py │ ├── auto_pdf.py │ ├── pdf2html2pdf.py │ └── pdf_redactor.py ├── pyproject.toml ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── conftest.py ├── files ├── doctest.txt ├── doctest_altered.txt ├── foo.bar ├── pdf │ └── doc.pdf ├── test.docx ├── test.html ├── test.txt ├── test_altered.docx ├── test_altered.html ├── test_altered.txt └── tmp │ └── .gitkeep ├── test_alterations_buffer.py ├── test_apply_buffer_to_text.py ├── test_auto_pdf_format.py ├── test_docx_format.py ├── test_file_wrapper.py ├── test_html_format.py ├── test_pdf2html2pdf_format.py ├── test_pdf_format.py └── test_txt_format.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E203, W503 3 | max-line-length = 127 4 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Continuous Integration 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | pre-commit: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v2 10 | - uses: actions/setup-python@v2 11 | - uses: pre-commit/action@v2.0.0 12 | test: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: [3.7, 3.8] 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Install OS dependecies 24 | run: | 25 | sudo apt-get 
install -y poppler-utils 26 | wget --quiet https://github.com/wkhtmltopdf/wkhtmltopdf/releases/download/0.12.3/wkhtmltox-0.12.3_linux-generic-amd64.tar.xz && \ 27 | tar vxf wkhtmltox-0.12.3_linux-generic-amd64.tar.xz && \ 28 | sudo cp wkhtmltox/bin/wk* /usr/local/bin/ && \ 29 | rm -rf wkhtmltox 30 | - name: Install Python dependencies 31 | run: | 32 | python -m pip install --upgrade pip 33 | pip install -r requirements.txt 34 | - name: Test with pytest 35 | run: | 36 | pip install pytest 37 | pip install pytest-cov 38 | pytest --doctest-modules --cov-report term --cov=expose_text 39 | publish: 40 | runs-on: ubuntu-latest 41 | needs: [pre-commit, test] 42 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') 43 | steps: 44 | - uses: actions/checkout@v2 45 | - name: Set up Python 46 | uses: actions/setup-python@v2 47 | with: 48 | python-version: '3.x' 49 | - name: Publish package for tags 50 | run: | 51 | python -m pip install --upgrade pip setuptools wheel twine 52 | python setup.py sdist bdist_wheel 53 | python -m twine upload -u ${{ secrets.PYPI_USERNAME }} -p ${{ secrets.PYPI_PASSWORD }} dist/* 54 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # PyCharm 2 | .idea/ 3 | 4 | # Mac 5 | .DS_Store 6 | 7 | # Virtualenv 8 | venv/ 9 | 10 | # Python 11 | expose_text.egg-info/ 12 | 13 | # Tests 14 | .coverage 15 | __pycache__ 16 | tests/files/tmp/* 17 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: stable 4 | hooks: 5 | - id: black 6 | - repo: https://gitlab.com/pycqa/flake8 7 | rev: 3.7.9 8 | hooks: 9 | - id: flake8 10 | -------------------------------------------------------------------------------- /Dockerfile: 
-------------------------------------------------------------------------------- 1 | FROM python:3.7 2 | 3 | # Run tests within Docker 4 | # docker build -t expose-text . 5 | # docker run expose-text 6 | 7 | WORKDIR /app 8 | 9 | # Install PDF depdencies (expose-text) 10 | RUN apt-get update 11 | RUN apt-get install -y cmake autoconf 12 | 13 | # wkhtmltopdf 14 | RUN wget --quiet https://github.com/wkhtmltopdf/wkhtmltopdf/releases/download/0.12.3/wkhtmltox-0.12.3_linux-generic-amd64.tar.xz && \ 15 | tar vxf wkhtmltox-0.12.3_linux-generic-amd64.tar.xz && \ 16 | cp wkhtmltox/bin/wk* /usr/local/bin/ && \ 17 | rm -rf wkhtmltox 18 | 19 | # Uninstall old version (latest version is not available over apt) 20 | RUN apt-get purge -y poppler-utils 21 | 22 | # Install new poppler-utils manually 23 | RUN wget poppler.freedesktop.org/poppler-0.90.1.tar.xz 24 | RUN tar -xvf poppler-0.90.1.tar.xz 25 | RUN cd poppler-0.90.1 && mkdir build && cd build && cmake .. && make && ldconfig 26 | RUN ln -s /usr/local/bin/pdftohtml /usr/bin/pdftohtml 27 | 28 | # Install packages 29 | COPY requirements.txt . 
30 | 31 | RUN pip install --no-cache-dir -r requirements.txt 32 | 33 | RUN pip install pytest pytest-cov 34 | 35 | COPY ./ /app/ 36 | 37 | CMD ["pytest", "--doctest-modules", "--cov-report", "term", "--cov", "expose_text", "-s"] 38 | 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jonas Langhabel, Malte Ostendorff 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ExposeText 2 | 3 | **Expose the text in a document for modification.** 4 | 5 | --- 6 | 7 | [![PyPI version](https://badge.fury.io/py/expose-text.svg)](https://badge.fury.io/py/expose-text) 8 | ![Tests](https://github.com/openredact/expose-text/workflows/Tests/badge.svg?branch=master) 9 | ![Black & Flake8](https://github.com/openredact/expose-text/workflows/Black%20&%20Flake8/badge.svg?branch=master) 10 | [![Code style: Black](https://img.shields.io/badge/code%20style-black-000000.svg?style=flat-square)](https://github.com/ambv/black) 11 | [![MIT license](https://img.shields.io/badge/license-MIT-brightgreen.svg)](http://opensource.org/licenses/MIT) 12 | 13 | _**⚠️ Disclaimer ⚠️:**_ This is a prototype. Do not use for anything critical. 14 | 15 | ## What is ExposeText? 16 | 17 | Dealing with document file formats can be quite painful. 18 | Oftentimes code must be written that’s specific to one file format. 19 | We have written ExposeText with the goal to make modifying documents as simple as changing Python strings. 20 | A slice of the original document can be directly assigned a new content by using the character indices of the extracted text, all while keeping the document's original formatting. 21 | 22 | We published a blog post about ExposeText on [Medium](https://medium.com/@openredact/introducing-exposetext-modify-document-files-as-simply-as-strings-cc5caa5f9c66?source=friends_link&sk=825c8f64dfa4e943b66d1faf351340a2). 
23 | 24 | ![](https://raw.githubusercontent.com/openredact/expose-text/master/docs/expose-text.png "Exposing the plain text content, then modifying it") 25 | 26 | ## Supported Formats 27 | 28 | ExposeText has prototypical support for the following file formats: 29 | 30 | - .txt 31 | - Per default, the encoding is assumed to be UTF-8. 32 | - You can install [chardet](https://github.com/chardet/chardet) (`pip install chardet`), to automatically detect the encoding. 33 | - .html 34 | - You can pass either an HTML snippet, an HTML body or a complete HTML document. If you pass a complete HTML document, every text content outside the body is ignored. 35 | - The output file will always be encoded in UTF-8. 36 | - .docx 37 | - Only text within `` tags (the tags for anything that is text) is exposed. E.g. the mailto link of an e-mail address is not exposed. 38 | - .pdf 39 | - Per default, text in PDFs can only be replaced with characters that occur in the file (fonts are stored economically in PDF files). 40 | - If you install the additional dependencies [Poppler (pdftohtml)](https://poppler.freedesktop.org/) and [wkhtmltopdf](https://wkhtmltopdf.org/), the PDF is rerendered and there is no more restriction on the characters that can be used. 41 | 42 | 43 | ## Usage 44 | 45 | ExposeText supports files as well as binary data objects. 46 | Depending on your use case you can use one of the following interfaces for making modifications. 47 | 48 | ### Installation 49 | 50 | `expose-text` can be installed from PyPi and has to be installed in a virtual environment (venv or conda for instance). 51 | 52 | ```bash 53 | pip install expose-text 54 | ``` 55 | 56 | ### Slicing API 57 | 58 | The slicing API applies each alteration immediately. 59 | 60 | Exposing and modifying text inside a file: 61 | ```python 62 | >>> from expose_text import FileWrapper 63 | >>> 64 | >>> wrapper = FileWrapper("myfile.docx") 65 | >>> wrapper.text 66 | 'This is the content as string.' 
67 | 68 | >>> wrapper[12:19] = "new content" 69 | >>> wrapper.text 70 | 'This is the new content as string.' 71 | 72 | >>> wrapper[33] = "!" # note that you have to use the updated index here 73 | >>> wrapper.text 74 | 'This is the new content as string!' 75 | 76 | >>> wrapper.save("newfile.docx") 77 | ``` 78 | 79 | If you want to work directly with binary data you have to pass the file format: 80 | ```python 81 | >>> from expose_text import BinaryWrapper 82 | >>> 83 | >>> wrapper = BinaryWrapper(my_bytes, ".docx") 84 | >>> wrapper.text 85 | 'This is the content as string.' 86 | 87 | >>> wrapper[12:19] = "new content" 88 | >>> wrapper.text 89 | 'This is the new content as string.' 90 | 91 | >>> wrapper.bytes # get the modified file as bytes 92 | b'...' 93 | ``` 94 | 95 | ### Functional API 96 | 97 | With the functional API, you can queue several alterations based on the initial indices and then apply them together. 98 | ```python 99 | >>> wrapper.text 100 | 'This is the content as string.' 101 | 102 | >>> wrapper.add_alter(12, 19, "new content") 103 | >>> wrapper.add_alter(29, 30, "!") 104 | >>> wrapper.apply_alters() 105 | >>> wrapper.text 106 | 'This is the new content as string!' 107 | ``` 108 | 109 | ## Development 110 | 111 | ### Install requirements 112 | 113 | You can install all (production and development) requirements using: 114 | 115 | ``` 116 | pip install -r requirements.txt 117 | ``` 118 | 119 | ### Install the pre-commit hooks 120 | 121 | This repository uses git hooks to validate code quality and formatting. 
122 | 123 | ``` 124 | pre-commit install 125 | git config --bool flake8.strict true # Makes the commit fail if flake8 reports an error 126 | ``` 127 | 128 | To run the hooks: 129 | ``` 130 | pre-commit run --all-files 131 | ``` 132 | 133 | ### Testing 134 | 135 | The tests can be executed with: 136 | ``` 137 | pytest --doctest-modules --cov-report term --cov=expose_text 138 | ``` 139 | 140 | ### Testing in Docker 141 | 142 | You can run the test as well in a Docker container: 143 | 144 | ```bash 145 | docker build -t expose-text 146 | docker run expose-text 147 | ``` 148 | 149 | ## How to contact us 150 | 151 | For usage questions, bugs, or suggestions please file a Github issue. 152 | If you would like to contribute or have other questions please email hello@openredact.org. 153 | 154 | ## License 155 | 156 | [MIT License](https://github.com/openredact/expose-text/blob/master/LICENSE) 157 | -------------------------------------------------------------------------------- /docs/expose-text.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openredact/expose-text/c8e21774d0ca2f0103e91b50ea093c87a3738a3e/docs/expose-text.png -------------------------------------------------------------------------------- /expose_text/__init__.py: -------------------------------------------------------------------------------- 1 | from expose_text.core import FileWrapper, BinaryWrapper # noqa: F401 2 | from expose_text.exceptions import UnsupportedFormat # noqa: F401 3 | -------------------------------------------------------------------------------- /expose_text/core.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from typing import Union 4 | 5 | from expose_text.formats import registry 6 | 7 | registry.register_formats() 8 | 9 | 10 | class BinaryWrapper: 11 | """A wrapper for binary files in various formats that exposes their text 
content for modification. 12 | 13 | >>> from pathlib import Path 14 | >>> root = Path(__file__).parent.parent 15 | >>> f = open(root / 'tests/files/doctest.txt', 'rb') 16 | >>> bytes_ = f.read() 17 | 18 | Open binary data and inspect the text content. 19 | 20 | >>> bw = BinaryWrapper(bytes_, '.txt') 21 | >>> bw.text 22 | 'This is the content as string.' 23 | 24 | Or access the content using slicing. 25 | 26 | >>> bw[12:19] 27 | 'content' 28 | >>> bw[29] 29 | '.' 30 | 31 | This string provides the indices for the modification of the file. Queue new alterations and when you are done 32 | apply them to change the file. 33 | 34 | >>> bw.add_alter(0, 4, 'That') 35 | >>> bw.apply_alters() 36 | >>> bw.text 37 | 'That is the content as string.' 38 | 39 | The slicing interface lets you make and apply an alteration in a single call. 40 | 41 | >>> bw[12:19] = 'new content' 42 | >>> bw[33] = '!' 43 | >>> bw.text 44 | 'That is the new content as string!' 45 | 46 | Return the content in binary format. 47 | >>> bw.bytes 48 | b'That is the new content as string!' 
49 | """ 50 | 51 | def __init__(self, bytes_: bytes, format_cls_or_str: Union[type, str]): 52 | """ 53 | Constructor 54 | 55 | :param bytes_: Input bytes 56 | :param format_cls_or_str: Explicit Format class or file extension string (Format class will be auto-determined) 57 | """ 58 | if isinstance(format_cls_or_str, str): 59 | format_cls_or_str = registry.find_format(format_cls_or_str) 60 | elif not isinstance(format_cls_or_str, type): 61 | raise ValueError("`format_cls_or_str` must be provided as either Format class or file extension string") 62 | 63 | self.file = format_cls_or_str() 64 | self.file.load(bytes_) 65 | 66 | @property 67 | def text(self) -> str: 68 | """The text content of the file.""" 69 | return self.file.text 70 | 71 | @property 72 | def bytes(self) -> bytes: 73 | """The binary content of the file.""" 74 | return self.file.bytes 75 | 76 | def add_alter(self, start: int, end: int, text: str): 77 | """Queue a new change up for alteration. 78 | 79 | The `start` and `end` indices refer to the current value of the `text` property. Apply the queued alterations 80 | by calling `apply_alters()`. 81 | """ 82 | self.file.add_alter(start, end, text) 83 | 84 | def apply_alters(self): 85 | """Apply all queued alterations.""" 86 | self.file.apply_alters() 87 | 88 | def __getitem__(self, key: Union[slice, int]): 89 | """Get a substring of the contained text using slicing or indexing.""" 90 | return self.file.text.__getitem__(key) 91 | 92 | def __setitem__(self, key: Union[slice, int], value: str): 93 | """Add and apply one alter using the slicing syntax.""" 94 | if isinstance(key, slice): 95 | self.add_alter(key.start, key.stop, value) 96 | else: 97 | self.add_alter(key, key + 1, value) 98 | self.apply_alters() 99 | 100 | 101 | class FileWrapper(BinaryWrapper): 102 | """A wrapper for various file formats that exposes their text content for modification. 
103 | 104 | >>> from pathlib import Path 105 | >>> root = Path(__file__).parent.parent 106 | 107 | Open a file and inspect its text content. 108 | 109 | >>> fw = FileWrapper(root / 'tests/files/doctest.txt') 110 | >>> fw.text 111 | 'This is the content as string.' 112 | 113 | Or access the content using slicing. 114 | 115 | >>> fw[12:19] 116 | 'content' 117 | >>> fw[29] 118 | '.' 119 | 120 | This string provides the indices for the modification of the file. Queue new alterations and when you are done 121 | apply them to change the file. 122 | 123 | >>> fw.add_alter(0, 4, 'That') 124 | >>> fw.apply_alters() 125 | >>> fw.text 126 | 'That is the content as string.' 127 | 128 | The slicing interface lets you make and apply an alteration in a single call. 129 | 130 | >>> fw[12:19] = 'new content' 131 | >>> fw[33] = '!' 132 | >>> fw.text 133 | 'That is the new content as string!' 134 | 135 | Now create a new file that looks like the original one but with the altered content. 136 | >>> fw.save(root / 'tests/files/doctest_altered.txt') 137 | """ 138 | 139 | def __init__(self, file_path: Union[Path, str], format_cls: type = None): 140 | """ 141 | Constructor 142 | 143 | :param file_path: Path to input file 144 | :param format_cls: Specific Format class (if not set, class is determined based on file extension) 145 | """ 146 | _, extension = os.path.splitext(file_path) 147 | 148 | with open(file_path, "rb") as f: 149 | bytes_ = f.read() 150 | 151 | super().__init__(bytes_, format_cls if format_cls else extension) 152 | 153 | def save(self, file_path: Union[Path, str]): 154 | """Save the file to disk.""" 155 | with open(file_path, "wb") as f: 156 | f.write(self.file.bytes) 157 | -------------------------------------------------------------------------------- /expose_text/exceptions.py: -------------------------------------------------------------------------------- 1 | class UnsupportedFormat(NotImplementedError): 2 | """This file format is not supported""" 3 | 4 | 5 | class 
FormatError(ValueError): 6 | pass 7 | -------------------------------------------------------------------------------- /expose_text/formats/__init__.py: -------------------------------------------------------------------------------- 1 | from importlib import import_module 2 | 3 | from expose_text.exceptions import UnsupportedFormat 4 | from expose_text.formats.base import Format 5 | 6 | 7 | class Registry: 8 | """This class registers the supported file formats. 9 | 10 | If you implement a new format, make sure to add it to `register_formats()`. 11 | """ 12 | 13 | _formats = {} 14 | 15 | def find_format(self, key) -> Format: 16 | if key not in self._formats: 17 | raise UnsupportedFormat(f"Format {key} is not supported!") 18 | return self._formats[key] 19 | 20 | def register_formats(self): 21 | self._register(".txt", "expose_text.formats._txt.TxtFormat") 22 | self._register(".html", "expose_text.formats._html.HtmlFormat") 23 | # self._register(".pdf", "expose_text.formats._pdf.PdfFormat") 24 | self._register(".pdf", "expose_text.formats.pdf.auto_pdf.AutoPdfFormat") 25 | self._register(".docx", "expose_text.formats._docx.DocxFormat") 26 | 27 | def _register(self, key, class_path): 28 | module_path, class_name = class_path.rsplit(".", 1) 29 | format_cls = getattr(import_module(module_path), class_name) 30 | self._formats[key] = format_cls 31 | 32 | 33 | registry = Registry() 34 | -------------------------------------------------------------------------------- /expose_text/formats/_docx.py: -------------------------------------------------------------------------------- 1 | import io 2 | import re 3 | import zipfile 4 | 5 | from defusedxml.minidom import parse 6 | 7 | from expose_text.formats._utils import apply_buffer_to_text 8 | from expose_text.formats.base import Format 9 | from expose_text.formats.markup.utils import MarkupModifier, Mapper 10 | 11 | 12 | class DocxFormat(Format): 13 | _docx_container = None 14 | _text = "" 15 | _xml_modifier = None 16 | 17 | def 
load(self, bytes_): 18 | self._docx_container = DocxContainer(bytes_) 19 | 20 | mapper = DocxMapper(self._docx_container.document_xml) 21 | self._text, mapping = mapper.simultaneous_text_extraction_and_mapping() 22 | 23 | self._xml_modifier = MarkupModifier(self._docx_container.document_xml, mapping) 24 | 25 | @property 26 | def text(self): 27 | return self._text 28 | 29 | @property 30 | def bytes(self): 31 | return self._docx_container.to_bytes() 32 | 33 | def apply_alters(self): 34 | self._text = apply_buffer_to_text(self._buffer, self._text) 35 | self._docx_container.document_xml = self._xml_modifier.apply_buffer(self._buffer) 36 | self._buffer.clear() 37 | 38 | 39 | class DocxContainer: 40 | _docx = None 41 | document_xml = None 42 | 43 | def __init__(self, bytes_): 44 | docx_io = io.BytesIO(bytes_) 45 | self._docx = zipfile.ZipFile(docx_io) 46 | 47 | document_xml_bytes = self._docx.read("word/document.xml") 48 | 49 | document_xml_io = io.BytesIO(document_xml_bytes) 50 | encoding = parse(document_xml_io).encoding 51 | 52 | self.document_xml = document_xml_bytes.decode(encoding) 53 | 54 | def to_bytes(self): 55 | # modifying a zip file is not supported, thus it has to be rebuilt 56 | bytes_io = io.BytesIO() 57 | zout = zipfile.ZipFile(bytes_io, "w") 58 | for zinfo in self._docx.infolist(): 59 | if zinfo.filename == "word/document.xml": 60 | zout.writestr(zinfo, self.document_xml) 61 | continue 62 | 63 | buffer = self._docx.read(zinfo.filename) 64 | zout.writestr(zinfo, buffer) 65 | zout.close() 66 | return bytes_io.getvalue() 67 | 68 | 69 | class DocxMapper(Mapper): 70 | def simultaneous_text_extraction_and_mapping(self): 71 | # get plain text from word/document.xml (everything between and ) 72 | self._remove_pattern(r"\n") # get rid of all newlines from the xml formatting 73 | self._remove_pattern(r"<\/w:p>|]*>", replace_with="\n") # add newlines from paragraph ends and linebreaks 74 | self._remove_pattern(r"<\/w:t>.*?]*>", flags=re.MULTILINE) # delete content 
from text close to open tags 75 | self._remove_pattern(r"^.*]*>", flags=re.MULTILINE) # delete to remaining open tags 76 | self._remove_pattern(r"<\/w:t>.*$", flags=re.MULTILINE) # delete from remaining close tags 77 | self._remove_pattern(r"^.*<.*$", flags=re.MULTILINE) # delete leftover lines with xml content 78 | 79 | # unescape characters 80 | self._remove_pattern(r"&", replace_with="&") 81 | self._remove_pattern(r"<", replace_with="<") 82 | self._remove_pattern(r">", replace_with=">") 83 | self._remove_pattern(r""", replace_with='"') 84 | self._remove_pattern(r"'", replace_with="'") 85 | 86 | # remove leading and trailing newlines 87 | self._remove_pattern(r"^\n+") 88 | self._remove_pattern(r"\n+$") 89 | 90 | return self._text, self._text_to_markup_idx 91 | -------------------------------------------------------------------------------- /expose_text/formats/_html.py: -------------------------------------------------------------------------------- 1 | import html 2 | import re 3 | 4 | from bs4 import UnicodeDammit 5 | 6 | from expose_text.formats._utils import apply_buffer_to_text 7 | from expose_text.formats.base import Format 8 | from expose_text.formats.markup.utils import MarkupModifier, Mapper 9 | 10 | 11 | class HtmlFormat(Format): 12 | _html = "" 13 | _text = "" 14 | _html_modifier = None 15 | 16 | def load(self, bytes_): 17 | self._html = to_unicode(bytes_) 18 | 19 | mapper = HtmlMapper(self._html) 20 | self._text, mapping = mapper.simultaneous_text_extraction_and_mapping() 21 | 22 | self._html_modifier = MarkupModifier(self._html, mapping) 23 | 24 | @property 25 | def text(self): 26 | return self._text 27 | 28 | @property 29 | def bytes(self): 30 | return self._html.encode("UTF-8") 31 | 32 | def apply_alters(self): 33 | self._text = apply_buffer_to_text(self._buffer, self._text) 34 | self._html = self._html_modifier.apply_buffer(self._buffer) 35 | self._buffer.clear() 36 | 37 | 38 | def to_unicode(bytes_): 39 | def unescape_html(html_): 40 | 
unescaped_html = "" 41 | pattern = re.compile(r"&#\d{1,4};|&\w{1,6};") 42 | cur = 0 43 | for m in pattern.finditer(html_): 44 | if m.group(0) in ["<", ">", "&", """, "'"]: 45 | continue 46 | 47 | unescaped_html += html_[cur : m.start()] + html.unescape(m.group(0)) 48 | cur = m.end() 49 | unescaped_html += html_[cur:] 50 | return unescaped_html 51 | 52 | dammit = UnicodeDammit(bytes_) 53 | encoding = dammit.original_encoding 54 | decoded_html = bytes_.decode(encoding) 55 | return unescape_html(decoded_html) 56 | 57 | 58 | class HtmlMapper(Mapper): 59 | def simultaneous_text_extraction_and_mapping(self): 60 | # get rid of everything but body and title 61 | self._remove_pattern(r"^.*]*>", flags=re.DOTALL) # delete everything from beginning to body 62 | self._remove_pattern(r"<\/body>.*$", flags=re.DOTALL) # delete everything from body to end 63 | 64 | # remove html from inside body 65 | self._remove_pattern(r"
", replace_with="\n") # html linebreaks 66 | self._remove_pattern( 67 | r"""]*>.*?<\/script> # remove scripts 68 | |]*>.*?<\/style> # remove styles 69 | |]*>.*?<\/template> # remove templates 70 | |<[^>]+> # remove all tags """, 71 | flags=re.DOTALL | re.VERBOSE, 72 | ) 73 | self._remove_pattern(r"(^[ \xc2\xa0]+)", flags=re.MULTILINE) # leading (non-breaking) whitespace 74 | self._remove_pattern(r"(\n\r?){3,}", replace_with="\n\n") # excess newlines 75 | 76 | # unescape characters 77 | self._remove_pattern(r"&", replace_with="&") 78 | self._remove_pattern(r"<", replace_with="<") 79 | self._remove_pattern(r">", replace_with=">") 80 | self._remove_pattern(r""", replace_with='"') 81 | self._remove_pattern(r"'", replace_with="'") 82 | 83 | # remove leading and trailing newlines 84 | self._remove_pattern(r"^\n+") 85 | self._remove_pattern(r"\n+$") 86 | 87 | return self._text, self._text_to_markup_idx 88 | -------------------------------------------------------------------------------- /expose_text/formats/_pdf.py: -------------------------------------------------------------------------------- 1 | import io 2 | 3 | from pdfrw import PdfReader, PdfDict, PdfWriter 4 | from pdfrw import PdfArray 5 | 6 | from expose_text.formats.base import Format 7 | from expose_text.formats.pdf import pdf_redactor 8 | from expose_text.formats.pdf.pdf_redactor import InlineImage, RedactorOptions 9 | 10 | 11 | class PdfFormat(Format): 12 | """ 13 | 14 | Mostly based on https://github.com/JoshData/pdf-redactor 15 | 16 | # A general-purpose PDF text-layer redaction tool. 
17 | # License: CC0 1.0 Universal 18 | # Source: https://github.com/JoshData/pdf-redactor 19 | 20 | """ 21 | 22 | options = None # type: RedactorOptions 23 | document = None 24 | text_tokens = None 25 | page_tokens = None 26 | 27 | def load(self, bytes_): 28 | self.options = pdf_redactor.RedactorOptions() 29 | self.options.input_stream = bytes_ 30 | 31 | self.document = PdfReader(fdata=bytes_) 32 | self.text_tokens, self.page_tokens = pdf_redactor.build_text_layer(self.document, self.options) 33 | 34 | @property 35 | def text(self): 36 | return "".join(t.value for t in self.text_tokens) 37 | 38 | @property 39 | def bytes(self): 40 | stream = io.BytesIO() 41 | writer = PdfWriter() 42 | writer.trailer = self.document 43 | writer.write(stream) 44 | return stream.getvalue() 45 | 46 | def apply_alters(self): 47 | # Finding all matches... 48 | text_tokens_index = 0 49 | text_tokens_charpos = 0 50 | text_tokens_token_xdiff = 0 51 | text_tokens = self.text_tokens 52 | 53 | # Mostly from update_text_layer 54 | # Pass the matched text to the replacement function to get replaced text. 55 | for start, end, alteration in self._buffer.sort(): 56 | # We got a match at text_content[start_idx:end_idx]. 57 | start_idx = start 58 | end_idx = end 59 | 60 | # Do a text replacement in the tokens that produced this text content. 61 | # It may have been produced by multiple tokens, so loop until we find them all. 62 | while start_idx < end_idx: 63 | # Find the original tokens in the content stream that 64 | # produced the matched text. Start by advancing over any 65 | # tokens that are entirely before this span of text. 
66 | while ( 67 | text_tokens_index < len(text_tokens) 68 | and text_tokens_charpos + len(text_tokens[text_tokens_index].value) - text_tokens_token_xdiff <= start_idx 69 | ): 70 | text_tokens_charpos += len(text_tokens[text_tokens_index].value) - text_tokens_token_xdiff 71 | text_tokens_index += 1 72 | text_tokens_token_xdiff = 0 73 | if text_tokens_index == len(text_tokens): 74 | break 75 | assert text_tokens_charpos <= start_idx 76 | 77 | # The token at text_tokens_index, and possibly subsequent ones, 78 | # are responsible for this text. Replace the matched content 79 | # here with replacement content. 80 | tok = text_tokens[text_tokens_index] 81 | 82 | # Where does this match begin within the token's text content? 83 | mpos = start_idx - text_tokens_charpos 84 | assert mpos >= 0 85 | 86 | # How long is the match within this token? 87 | mlen = min(end_idx - start_idx, len(tok.value) - text_tokens_token_xdiff - mpos) 88 | assert mlen >= 0 89 | 90 | # How much should we replace here? 91 | if mlen < (end_idx - start_idx): 92 | # There will be more replaced later, so take the same number 93 | # of characters from the replacement text. 94 | r = alteration[:mlen] 95 | alteration = alteration[mlen:] 96 | else: 97 | # This is the last token in which we'll replace text, so put 98 | # all of the remaining replacement content here. 99 | r = alteration 100 | alteration = None # sanity 101 | 102 | # Do the replacement. 103 | tok.value = ( 104 | tok.value[: mpos + text_tokens_token_xdiff] + r + tok.value[mpos + mlen + text_tokens_token_xdiff :] 105 | ) 106 | text_tokens_token_xdiff += len(r) - mlen 107 | 108 | # Advance for next iteration. 109 | start_idx += mlen 110 | 111 | # Replace page content streams with updated tokens. 112 | self.apply_updated_text() 113 | 114 | def tok_str(self, tok): 115 | # Replace the page's content stream with our updated tokens. 
116 | # The content stream may have been an array of streams before, 117 | # so replace the whole thing with a single new stream. Unfortunately 118 | # the str on PdfArray and PdfDict doesn't work right. 119 | if isinstance(tok, PdfArray): 120 | return "[ " + " ".join(self.tok_str(x) for x in tok) + "] " 121 | if isinstance(tok, InlineImage): 122 | return ( 123 | "BI " 124 | + " ".join(self.tok_str(x) + " " + self.tok_str(y) for x, y in tok.items()) 125 | + " ID " 126 | + tok.stream 127 | + " EI " 128 | ) 129 | if isinstance(tok, PdfDict): 130 | return "<< " + " ".join(self.tok_str(x) + " " + self.tok_str(y) for x, y in tok.items()) + ">> " 131 | 132 | return str(tok) 133 | 134 | def apply_updated_text(self): 135 | # Create a new content stream for each page by concatenating the 136 | # tokens in the page_tokens lists. 137 | 138 | for i, page in enumerate(self.document.pages): 139 | if page.Contents is None: 140 | continue # nothing was here 141 | 142 | page.Contents = PdfDict() 143 | page.Contents.stream = "\n".join(self.tok_str(tok) for tok in self.page_tokens[i]) 144 | 145 | page.Contents.Length = len(page.Contents.stream) # reset 146 | 147 | self._buffer.clear() 148 | -------------------------------------------------------------------------------- /expose_text/formats/_txt.py: -------------------------------------------------------------------------------- 1 | from expose_text.formats._utils import apply_buffer_to_text 2 | from expose_text.formats.base import Format 3 | 4 | # chardet is LGPL, link it dynamically 5 | try: 6 | import chardet 7 | except ModuleNotFoundError: 8 | chardet = None 9 | 10 | 11 | class TxtFormat(Format): 12 | _encoding = None 13 | _content = "" 14 | 15 | def load(self, bytes_): 16 | if chardet: 17 | self._encoding = chardet.detect(bytes_)["encoding"] 18 | else: 19 | # if the encoding is not detected dynamically, it is assumed to be UTF-8 20 | self._encoding = "UTF-8" 21 | 22 | self._content = bytes_.decode(self._encoding) 23 | 24 | 
@property 25 | def text(self): 26 | return self._content 27 | 28 | @property 29 | def bytes(self): 30 | return self._content.encode(self._encoding) 31 | 32 | def apply_alters(self): 33 | self._content = apply_buffer_to_text(self._buffer, self._content) 34 | self._buffer.clear() 35 | -------------------------------------------------------------------------------- /expose_text/formats/_utils.py: -------------------------------------------------------------------------------- 1 | class AlterationsBuffer: 2 | """This class is used to safely queue alterations. 3 | 4 | Add new alterations to this buffer by using one of the two interfaces. The logic makes sure that no overlapping 5 | alterations are added, i.e. that each part of the original text can only be altered once. 6 | 7 | >>> buffer = AlterationsBuffer() 8 | >>> buffer.add(0, 10, 'new_text') 9 | >>> buffer += (10, 20, 'new_text') 10 | 11 | Access the alterations by using the iterable interface of this class. 12 | """ 13 | 14 | def __init__(self): 15 | self.buffer = [] 16 | 17 | def __iter__(self): 18 | return iter(self.buffer) 19 | 20 | def __iadd__(self, alter): 21 | if not isinstance(alter, tuple) or len(alter) != 3: 22 | raise TypeError("Invalid alteration! 
Valid ones are (start, end, new_text) tuples.") 23 | self.add(*alter) 24 | return self 25 | 26 | def __len__(self): 27 | return len(self.buffer) 28 | 29 | def add(self, start, end, new_text): 30 | if not end > start: 31 | raise ValueError("end should be larger than start!") 32 | 33 | alter = (start, end, new_text) 34 | if self._overlaps_with_existing_alter(alter): 35 | raise ValueError("The given alteration overlaps with an existing one!") 36 | 37 | self.buffer += [alter] 38 | 39 | def sort(self, reverse=False): 40 | self.buffer.sort(key=lambda alter: alter[0], reverse=reverse) 41 | return self 42 | 43 | def clear(self): 44 | self.buffer = [] 45 | 46 | def _overlaps_with_existing_alter(self, new_alter): 47 | new_start = new_alter[0] 48 | new_end = new_alter[1] 49 | 50 | for existing_alter in self.buffer: 51 | existing_start = existing_alter[0] 52 | existing_end = existing_alter[1] 53 | if existing_start <= new_start < existing_end or existing_start < new_end <= existing_end: 54 | return True 55 | 56 | return False 57 | 58 | 59 | def apply_buffer_to_text(buffer, text): 60 | """Apply all alterations from the buffer to the text. 61 | 62 | This replaces the original text at the indices specified in the alterations by the respective altered texts. 
63 | """ 64 | new_text = "" 65 | cur = 0 66 | for start, end, alteration in buffer.sort(): 67 | new_text += text[cur:start] + alteration 68 | cur = end 69 | new_text += text[cur:] 70 | return new_text 71 | -------------------------------------------------------------------------------- /expose_text/formats/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | from ._utils import AlterationsBuffer 4 | 5 | 6 | class Format(ABC): 7 | def __init__(self): 8 | self._buffer = AlterationsBuffer() 9 | 10 | @abstractmethod 11 | def load(self, bytes_): 12 | """Load the file in binary format into the internal representation.""" 13 | pass 14 | 15 | @property 16 | @abstractmethod 17 | def text(self): 18 | """Get the current text content.""" 19 | pass 20 | 21 | @property 22 | @abstractmethod 23 | def bytes(self): 24 | """Get the current file content as binary data.""" 25 | 26 | def add_alter(self, start, end, new_text): 27 | """Queue an alteration of the text. 28 | 29 | The `start` and `end` indices are based on the current `text` content. The `text` and `bytes` content are not 30 | changed by calling this method. To apply the changes call `apply_alters()`. 31 | """ 32 | self._buffer += (start, end, new_text) 33 | 34 | @abstractmethod 35 | def apply_alters(self): 36 | """Apply all queued alterations. 37 | 38 | After calling this method, `text` and `bytes` will be updated. 
class MarkupModifier:
    """This class takes care of altering markup."""

    def __init__(self, markup, mapping):
        """
        :param markup: a string containing content in a markup language
        :param mapping: a mapping from the indices of the contained text to its positions in the markup,
            i.e. `mapping[text_idx] == markup_idx`
        """
        self._markup = markup
        self._text_to_markup_idx = mapping

    def apply_buffer(self, buffer):
        """Apply all queued text alterations to the markup and return it."""
        segments = []
        consumed = 0
        for start, end, new_text in buffer.sort():
            markup_start = self._text_to_markup_idx[start]
            segments.append(self._markup[consumed:markup_start])
            segments.append(html.escape(new_text))

            # inner - 1: get the markup index of last text char, outer + 1: get the next char in markup
            consumed = self._text_to_markup_idx[end - 1] + 1

            # append any markup tags that got skipped (in case end spanned further than the starting element)
            segments.append(self._get_skipped_tags(markup_start, consumed))
        segments.append(self._markup[consumed:])
        self._markup = "".join(segments)
        return self._markup

    def _get_skipped_tags(self, start, end):
        """Return all tags between start and end."""
        return "\n".join(re.findall(r"<[^>]*>", self._markup[start:end]))
62 | 63 | :param regex: the regex to replace 64 | :param replace_with: an optional string to replace matches with 65 | :param flags: optional re compile flags 66 | """ 67 | pattern = re.compile(regex, flags=flags) 68 | while True: 69 | m = re.search(pattern, self._text) 70 | if m is None: 71 | break 72 | 73 | self._replace_content_in_markup(m.start(0), m.end(0), replace_with) 74 | 75 | def _replace_content_in_markup(self, start, end, new_text): 76 | if len(new_text) > end - start: 77 | raise ValueError() 78 | self._text = self._text[:start] + new_text + self._text[end:] 79 | del self._text_to_markup_idx[start + len(new_text) : end] 80 | -------------------------------------------------------------------------------- /expose_text/formats/pdf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openredact/expose-text/c8e21774d0ca2f0103e91b50ea093c87a3738a3e/expose_text/formats/pdf/__init__.py -------------------------------------------------------------------------------- /expose_text/formats/pdf/auto_pdf.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from expose_text.formats import Format 4 | from expose_text.formats._pdf import PdfFormat 5 | from expose_text.formats.pdf.pdf2html2pdf import Pdf2Html2PdfFormat 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class AutoPdfFormat(Format): 11 | """ 12 | Automatically determine what PDF format can be used depending on availability of dependencies (and alters) 13 | """ 14 | 15 | def __init__(self): 16 | super().__init__() 17 | 18 | pdf2html2pdf = Pdf2Html2PdfFormat() 19 | 20 | if pdf2html2pdf.is_installed(): 21 | logger.info("Using pdf2html2pdf (dependencies are installed)") 22 | self.format = pdf2html2pdf 23 | else: 24 | logger.info("Using PdfFormat (dependencies are missing)") 25 | self.format = PdfFormat() 26 | 27 | def load(self, bytes_): 28 | self.format.load(bytes_) 29 
| 30 | @property 31 | def text(self): 32 | return self.format.text 33 | 34 | @property 35 | def bytes(self): 36 | return self.format.bytes 37 | 38 | def add_alter(self, start, end, new_text): 39 | self.format.add_alter(start, end, new_text) 40 | 41 | def apply_alters(self): 42 | self.format.apply_alters() 43 | -------------------------------------------------------------------------------- /expose_text/formats/pdf/pdf2html2pdf.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import collections 3 | import logging 4 | import os 5 | import re 6 | import shutil 7 | import tempfile 8 | from subprocess import run, PIPE 9 | from typing import Dict 10 | 11 | import pdfkit 12 | 13 | from expose_text.exceptions import FormatError 14 | from expose_text.formats._html import HtmlFormat 15 | from expose_text.formats.base import Format 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | class Pdf2Html2PdfFormat(Format): 21 | """ 22 | Use HTML as intermediate format to work with PDFs. 23 | Not loss-free! Layout might be different, but replacements with out-of-vocabulary characters is possible! 
class Pdf2Html2PdfFormat(Format):
    """
    Use HTML as intermediate format to work with PDFs.
    Not loss-free! Layout might be different, but replacements with out-of-vocabulary characters is possible!

    Dependencies:
    - PDF to HTML: poppler-utils
    - HTML to PDF: pdfkit (wrapper for wkhtmltopdf utility to convert HTML to PDF using Webkit)
    """

    html_format = None  # type: HtmlFormat

    def __init__(
        self,
        encoding="utf-8",
        pdf_margin_left="0",
        pdf_margin_right="0",
        pdf_margin_top="0",
        pdf_margin_bottom="0",
        pdf_output_zoom="1.6",
        pdf_input_zoom="1.0",
        pdftohtml_path="pdftohtml",
        wkhtmltopdf_path="wkhtmltopdf",
    ):
        """
        For PDF settings see pdfkit (wkhtmltopdf) documentation
        """
        super().__init__()
        # FIX: page2html was a class-level mutable dict shared by all instances.
        self.page2html = {}
        self.encoding = encoding
        self.html_format = HtmlFormat()
        self.pdf_margin_left = pdf_margin_left
        self.pdf_margin_right = pdf_margin_right
        self.pdf_margin_top = pdf_margin_top
        self.pdf_margin_bottom = pdf_margin_bottom
        self.pdf_output_zoom = pdf_output_zoom
        self.pdf_input_zoom = pdf_input_zoom
        self.pdftohtml_path = pdftohtml_path
        self.wkhtmltopdf_path = wkhtmltopdf_path

    def load(self, bytes_):
        """Convert the PDF to per-page HTML and load page 1 into the HTML wrapper."""
        self.page2html = self.get_html_pages_from_pdf(bytes_)

        # Only the first page is forwarded for now (multi-page support pending).
        logger.info("Loading only a single page")
        first_page_html = self.page2html[1]

        # Replace non-breaking spaces (U+00A0) produced by pdftohtml with plain spaces.
        self.html_format.load(first_page_html.replace("\xa0", " ").encode("utf-8"))

    @property
    def text(self):
        # html to text
        return self.html_format.text

    @property
    def html(self):
        return self.page2html[1]

    @property
    def bytes(self):
        """Generate PDF from HTML bytes with pdfkit (wkhtmltopdf)"""
        html_bytes = self.html_format.bytes

        pdf_bytes = pdfkit.from_string(
            html_bytes.decode(self.encoding),
            False,
            options={
                "load-error-handling": "ignore",
                "load-media-error-handling": "ignore",
                "margin-left": self.pdf_margin_left,
                "margin-right": self.pdf_margin_right,
                "margin-top": self.pdf_margin_top,
                "margin-bottom": self.pdf_margin_bottom,
                "zoom": self.pdf_output_zoom,
                # 'disable-smart-shrinking': '',
            },
        )

        return pdf_bytes

    def add_alter(self, start, end, new_text):
        """Alter only on HTML format"""
        self.html_format.add_alter(start, end, new_text)

    def apply_alters(self):
        """Alter only on HTML format"""
        self.html_format.apply_alters()

    def get_html_pages_from_pdf(self, pdf_bytes) -> Dict[int, str]:
        """
        Converts PDF to HTML with pdftohtml (from poppler-utils: https://poppler.freedesktop.org/)

        :param pdf_bytes: raw PDF content
        :return: Page number => HTML string
        :raises FormatError: if pdftohtml exits with a non-zero status
        """
        page2html = {}
        file_prefix = "pdf"
        tmpdir = tempfile.mkdtemp(prefix="pdftohtml-")

        run_args = [self.pdftohtml_path, "-zoom", self.pdf_input_zoom, "-c", "-", tmpdir + "/" + file_prefix]

        logger.debug(f"Execute poppler-pdftohtml: {run_args}")

        process = run(run_args, stdout=PIPE, input=pdf_bytes)

        if process.returncode != 0:
            raise FormatError("pdftohtml returned error exit code: %s" % process.returncode)

        # Iterate over output files
        tmp_files = os.listdir(tmpdir)
        # FIX: was logger.error - this is routine diagnostic output, not an error.
        logger.debug(tmp_files)

        for fn in tmp_files:
            if fn.startswith(file_prefix + "-") and fn.endswith(".html"):
                # Page file; FIX: log the actual file name, not the constant prefix.
                logger.debug(f"PDF-page file: {fn}")
                page_num = int(fn[len(file_prefix) + 1 : -5])

                with open(os.path.join(tmpdir, fn), "r") as f:
                    html = f.read()

                # Replace body bgcolor + Margin settings
                html = html.replace('bgcolor="#A0A0A0"', 'style="margin: 0; padding: 0;"')

                # Replace image source with base64 encodings so the HTML is self-contained
                def img_src_to_base64(match):
                    img_fn = match.group(1)
                    with open(os.path.join(tmpdir, img_fn), "rb") as image_file:
                        encoded_img = base64.b64encode(image_file.read()).decode("utf-8")

                    return f'src="data:image/png;base64, {encoded_img}"'

                pattern = re.compile(r'src="(.*?)"')  # src="pdf001.png"
                html = pattern.sub(img_src_to_base64, html)

                page2html[page_num] = html

        # Remove temp files
        shutil.rmtree(tmpdir)

        # Ensure page order
        page2html = collections.OrderedDict(sorted(page2html.items()))

        return page2html

    def is_installed(self):
        # Check if the external command-line dependencies are on PATH
        return shutil.which(self.pdftohtml_path) is not None and shutil.which(self.wkhtmltopdf_path) is not None
class RedactorOptions:
    """Redaction and I/O options.

    FIX: the filter containers used to be class attributes, so every
    RedactorOptions instance shared the same dict/lists and mutating one
    instance's filters silently changed all others. They are now created
    per instance in __init__. The attribute names and semantics are unchanged.
    """

    def __init__(self):
        # Input/Output streams for the PDF to redact and the redacted result.
        self.input_stream = None
        self.output_stream = None

        # Metadata filters map names of entries in the PDF Document Information Dictionary
        # (e.g. "Title", "Author", "Subject", "Keywords", "Creator", "Producer", "CreationDate",
        # and "ModDate") to an array of functions to run on the values of those keys.
        #
        # Each function is given the current field value (a pdfrw PdfString, or None if the
        # field is absent) and must return a string, a datetime.datetime (for CreationDate
        # and ModDate), or None to clear the field (unless a later function adds a new value).
        #
        # The functions are run in order; each receives the previous function's return value.
        # A naive datetime must be in UTC (use pytz.timezone.localize for local times).
        #
        # Use "DEFAULT" as a key to apply functions to all metadata fields that have no
        # specific functions defined; use "ALL" to apply functions to all metadata fields,
        # after any field-specific or DEFAULT functions are run.
        self.metadata_filters = {}

        # XMP metadata filters: each is passed the existing XMP data (an
        # xml.etree.Element or None) and returns new XMP metadata of the same
        # type (or None). Called in order, chained like the metadata filters.
        self.xmp_filters = []

        # Controls how XML returned by xmp_filters is serialized. Replace with any
        # function that takes an xml.etree.Element object and returns a (unicode) string.
        self.xmp_serializer = None

        # Content filters run on the combined content streams of the pages and on each
        # annotation's text attributes. Each is a tuple of a compiled regular expression
        # and a function mapping a re.Match to replacement text. Since spaces in PDFs are
        # sometimes encoded as positional offsets, regexes should treat spaces as optional.
        # NOTE: pdfrw doesn't support content stream compression - decompress first (e.g. qpdf).
        self.content_filters = []

        # When replacement text isn't likely to have a glyph stored in the PDF's fonts,
        # replace the character with these other characters (if they don't have the same problem):
        self.content_replacement_glyphs = ["?", "#", "*", " "]

        # Link filters run on link annotations. Each is a function passed the link target
        # (a URI string) and the annotation object; it returns a new URI or None to remove the link.
        self.link_filters = []
107 | update_text_layer(options, *text_layer) 108 | 109 | # Replace page content streams with updated tokens. 110 | apply_updated_text(document, *text_layer) 111 | 112 | # Update annotations. 113 | update_annotations(document, options) 114 | 115 | # Write the PDF back out. 116 | writer = PdfWriter() 117 | writer.trailer = document 118 | writer.write(options.output_stream) 119 | 120 | 121 | def update_metadata(trailer, options): 122 | # Update the PDF's Document Information Dictionary, which contains keys like 123 | # Title, Author, Subject, Keywords, Creator, Producer, CreationDate, and ModDate 124 | # (the latter two containing Date values, the rest strings). 125 | 126 | from pdfrw.objects import PdfString, PdfName 127 | 128 | # Create the metadata dict if it doesn't exist, since the caller may be adding fields. 129 | if not trailer.Info: 130 | trailer.Info = PdfDict() 131 | 132 | # Get a list of all metadata fields that exist in the PDF plus any fields 133 | # that there are metadata filters for (since they may insert field values). 134 | keys = set(str(k)[1:] for k in trailer.Info.keys()) | set( 135 | k for k in options.metadata_filters.keys() if k not in ("DEFAULT", "ALL") 136 | ) 137 | 138 | # Update each metadata field. 139 | for key in keys: 140 | # Get the functions to apply to this field. 141 | functions = options.metadata_filters.get(key) 142 | if functions is None: 143 | # If nothing is defined for this field, use the DEFAULT functions. 144 | functions = options.metadata_filters.get("DEFAULT", []) 145 | 146 | # Append the ALL functions. 147 | functions += options.metadata_filters.get("ALL", []) 148 | 149 | # Run the functions on any existing values. 150 | value = trailer.Info[PdfName(key)] 151 | for f in functions: 152 | # Before passing to the function, convert from a PdfString to a Python string. 153 | if isinstance(value, PdfString): 154 | # decode from PDF's "(...)" syntax. 155 | value = value.decode() 156 | 157 | # Filter the value. 
158 | value = f(value) 159 | 160 | # Convert Python data type to PdfString. 161 | if isinstance(value, str) or (sys.version_info < (3,)): # and isinstance(value, unicode)): 162 | # Convert string to a PdfString instance. 163 | value = PdfString.from_unicode(value) 164 | 165 | elif isinstance(value, datetime): 166 | # Convert datetime into a PDF "D" string format. 167 | value = value.strftime("%Y%m%d%H%M%S%z") 168 | if len(value) == 19: 169 | # If TZ info was included, add an apostrophe between the hour/minutes offsets. 170 | value = value[:17] + "'" + value[17:] 171 | value = PdfString("(D:%s)" % value) 172 | 173 | elif value is None: 174 | # delete the metadata value 175 | pass 176 | 177 | else: 178 | raise ValueError( 179 | "Invalid type of value returned by metadata_filter function. %s was returned by %s." 180 | % (repr(value), f.__name__ or "anonymous function") 181 | ) 182 | 183 | # Replace value. 184 | trailer.Info[PdfName(key)] = value 185 | 186 | 187 | def update_xmp_metadata(trailer, options): 188 | if trailer.Root.Metadata: 189 | # Safely parse the existing XMP data. 190 | from defusedxml.ElementTree import fromstring 191 | 192 | value = fromstring(trailer.Root.Metadata.stream) 193 | else: 194 | # There is no XMP metadata in the document. 195 | value = None 196 | 197 | # Run each filter. 198 | for f in options.xmp_filters: 199 | value = f(value) 200 | 201 | # Set new metadata. 202 | if value is None: 203 | # Clear it. 204 | trailer.Root.Metadata = None 205 | else: 206 | # Serialize the XML and save it into the PDF metadata. 207 | 208 | # Get the serializer. 209 | serializer = options.xmp_serializer 210 | if serializer is None: 211 | # Use a default serializer based on xml.etree.ElementTree.tostring. 212 | def serializer(xml_root): 213 | import xml.etree.ElementTree 214 | 215 | if hasattr(xml.etree.ElementTree, "register_namespace"): 216 | # Beginning with Python 3.2 we can define namespace prefixes. 
class InlineImage(PdfDict):
    def read_data(self, tokens):
        """Consume the raw binary half of an inline image (between ID and EI).

        Scans tokens.fdata from the current position for an "EI" operator that
        is followed by whitespace and plausible content-stream text, and stores
        the bytes before it in self._stream.

        :raises ValueError: if no terminating EI operator is found.
        """
        # "Unless the image uses ASCIIHexDecode or ASCII85Decode as one
        # of its filters, the ID operator should be followed by a
        # single white-space character, and the next character is
        # interpreted as the first byte of image data."
        # Truncate the lexer's current token span accordingly.
        # NOTE(review): relies on pdfrw PdfTokens internals (current/floc/fdata)
        # - confirm against the pdfrw version in use.
        if tokens.current[0][1] > tokens.current[0][0] + 3:
            tokens.current[0] = (tokens.current[0][0], tokens.current[0][0] + 3)

        start = tokens.floc
        end = None  # FIX: initialize so a missing terminator gives a clear error
        state = 0
        whitespace = (" ", "\n", "\r")
        # state 0: image data or trailing whitespace
        # state 1: seen "E"
        # state 2: seen "EI"
        for i in range(start, len(tokens.fdata)):
            if state == 0:
                if tokens.fdata[i] == "E":
                    state = 1
            elif state == 1:
                if tokens.fdata[i] == "I":
                    state = 2
                else:
                    state = 0
            elif state == 2:
                if tokens.fdata[i] in whitespace:
                    # Heuristic: the next few characters after "EI " should be
                    # printable content-stream text; otherwise the "EI" was part
                    # of the binary image data.
                    # FIX: bound the lookahead so it cannot index past the end
                    # of the stream (previously an IndexError near the end).
                    for j in range(i + 1, min(i + 6, len(tokens.fdata))):
                        o = ord(tokens.fdata[j])
                        if o == 0x0A or o == 0x0D:  # \n, \r
                            continue
                        elif o >= 0x20 and o <= 0x7E:  # printable ASCII
                            continue
                        else:
                            state = 0
                            break
                    else:
                        end = i - 3
                        assert tokens.fdata[end] in whitespace
                        break
                else:
                    state = 0

        if end is None:
            # FIX: previously fell through to an UnboundLocalError on `end`.
            raise ValueError("Inline image data is not terminated by an EI operator")

        self._stream = tokens.fdata[start:end]
        tokens.floc = end
303 | content = chunk_pairs(content) 304 | token = constructor(content) 305 | elif token == "BI": 306 | # begins an inline image's dictionary half 307 | stack.append((InlineImage, [])) 308 | continue 309 | elif token == "ID": 310 | # divides an inline image's dictionary half and data half 311 | constructor, content = stack[-1] 312 | content = chunk_pairs(content) 313 | img = constructor(content) 314 | img.read_data(tokens) 315 | stack[-1] = (img, None) 316 | continue 317 | elif token == "EI": 318 | # ends an inline image 319 | token, _ = stack.pop(-1) 320 | 321 | # If we're inside something, add this token to that thing. 322 | if len(stack) > 0: 323 | stack[-1][1].append(token) 324 | continue 325 | 326 | # Yield it. 327 | yield token 328 | 329 | 330 | def build_text_layer(document, options): 331 | # Within each page's content stream, look for text-showing operators to 332 | # find the text content of the page. Construct a string that contains the 333 | # entire text content of the document AND a mapping from characters in the 334 | # text content to tokens in the content streams. That lets us modify the 335 | # tokens in the content streams when we find text that we want to redact. 336 | # 337 | # The text-showing operators are: 338 | # 339 | # (text) Tj -- show a string of text 340 | # (text) ' -- move to next line and show a string of text 341 | # aw ac (text) " -- show a string of text with word/character spacing parameters 342 | # [ ... ] TJ -- show text strings from the array, which are interleaved with spacing parameters 343 | # 344 | # (These operators appear only within BT ... ET so-called "text objects", 345 | # although we don't make use of it.) 346 | # 347 | # But since we don't understand any of the other content stream operators, 348 | # and in particular we don't know how many operands each (non-text) operator 349 | # takes, we can never be sure whether what we see in the content stream is 350 | # an operator or an operand. 
def build_text_layer(document, options):
    """Collect the text content of all pages plus a char->token mapping.

    Returns (text_tokens, page_tokens): text_tokens is the flat list of
    TextToken objects in document order; page_tokens is one token list per
    page from which the content streams can later be rebuilt.
    """
    # Within each page's content stream, look for text-showing operators to
    # find the text content of the page. Construct a string that contains the
    # entire text content of the document AND a mapping from characters in the
    # text content to tokens in the content streams. That lets us modify the
    # tokens in the content streams when we find text that we want to redact.
    #
    # The text-showing operators are:
    #
    #   (text) Tj -- show a string of text
    #   (text) ' -- move to next line and show a string of text
    #   aw ac (text) " -- show a string of text with word/character spacing parameters
    #   [ ... ] TJ -- show text strings from the array, which are interleaved with spacing parameters
    #
    # (These operators appear only within BT ... ET so-called "text objects",
    # although we don't make use of it.)
    #
    # But since we don't understand any of the other content stream operators,
    # and in particular we don't know how many operands each (non-text) operator
    # takes, we can never be sure whether what we see in the content stream is
    # an operator or an operand. If we see a "Tj", maybe it is the operand of
    # some other operator?
    #
    # We'll assume we can get by just fine, however, assuming that whenever we
    # see one of these tokens that it's an operator and not an operand.
    #
    # But TJ remains a little tricky because its operand is an array that preceeds
    # it. Arrays are delimited by square brackets and we need to parse that.
    #
    # We also have to be concerned with the encoding of the text content, which
    # depends on the active font. With a simple font, the text is a string whose
    # bytes are glyph codes. With a composite font, a CMap maps multi-byte
    # character codes to glyphs. In either case, we must map glyphs to unicode
    # characters so that we can pattern match against it.
    #
    # To know the active font, we look for the " Tf" operator.

    from pdfrw import PdfObject, PdfString, PdfArray
    from pdfrw.uncompress import uncompress as uncompress_streams
    from pdfrw.objects.pdfname import BasePdfName

    text_tokens = []
    fontcache = {}

    class TextToken:
        # A mutable string token; `value` may be rewritten by the redactor while
        # `original_value` / `raw_original_value` preserve the input.
        value = None
        font = None

        def __init__(self, value, font):
            self.font = font
            self.raw_original_value = value
            self.original_value = toUnicode(value, font, fontcache)
            self.value = self.original_value

        def __str__(self):
            # __str__ is used for serialization
            if self.value == self.original_value:
                # If unchanged, return the raw original value without decoding/encoding.
                return PdfString.from_bytes(self.raw_original_value)
            else:
                # If the value changed, encode it from Unicode according to the encoding
                # of the font that is active at the location of this token.
                return PdfString.from_bytes(fromUnicode(self.value, self.font, fontcache, options))

        def __repr__(self):
            # __repr__ is used for debugging
            return "Token<%s>" % repr(self.value)

    def process_text(token):
        # Record a text token unless it is empty.
        if token.value == "":
            return
        text_tokens.append(token)

    # For each page...
    page_tokens = []
    for page in document.pages:
        # For each token in the content stream...

        # Remember this page's revised token list.
        token_list = []
        page_tokens.append(token_list)

        if page.Contents is None:
            continue

        prev_token = None
        prev_prev_token = None
        current_font = None

        # The page may have one content stream or an array of content streams.
        # If an array, they are treated as if they are concatenated into a single
        # stream (per the spec).
        if isinstance(page.Contents, PdfArray):
            contents = list(page.Contents)
        else:
            contents = [page.Contents]

        # If a compression Filter is applied, attempt to un-apply it. If an unrecognized
        # filter is present, an error is raised. uncompress_streams expects an array of
        # streams.
        uncompress_streams(contents)

        def make_mutable_string_token(token):
            # NOTE: reads the *current* value of current_font at call time
            # (closure over the loop variable), which is the font active at
            # this point of the stream.
            if isinstance(token, PdfString):
                token = TextToken(token.to_bytes(), current_font)

                # Remember all unicode characters seen in this font so we can
                # avoid inserting characters that the PDF isn't likely to have
                # a glyph for.
                if current_font and current_font.BaseFont:
                    fontcache.setdefault(current_font.BaseFont, set()).update(token.value)
            return token

        # Iterate through the tokens in the page's content streams.
        for token in tokenize_streams(content.stream for content in contents):
            # Replace any string token with our own class that hold a mutable
            # value, which is how we'll rewrite content.
            token = make_mutable_string_token(token)

            # Append the token into a new list that holds all tokens.
            token_list.append(token)

            # If the token is an operator and we're not inside an array...
            if isinstance(token, PdfObject):
                # And it's one that we recognize, process it.
                if token in ("Tj", "'", '"') and isinstance(prev_token, TextToken):
                    # Simple text operators.
                    process_text(prev_token)
                elif token == "TJ" and isinstance(prev_token, PdfArray):
                    # The text array operator.
                    for i in range(len(prev_token)):
                        # (item may not be a string! only the strings are text.)
                        prev_token[i] = make_mutable_string_token(prev_token[i])
                        if isinstance(prev_token[i], TextToken):
                            process_text(prev_token[i])

                elif token == "Tf" and isinstance(prev_prev_token, BasePdfName):
                    # Update the current font.
                    # prev_prev_token holds the font 'name'. The name must be looked up
                    # in the content stream's resource dictionary, which is page.Resources,
                    # plus any resource dictionaries above it in the document hierarchy.
                    current_font = None
                    resources = page.Resources
                    while resources and not current_font:
                        current_font = resources.Font[prev_prev_token]
                        resources = resources.Parent

            # Remember the previously seen token in case the next operator is a text-showing
            # operator -- in which case this was the operand. Remember the token before that
            # because it may be a font name for the Tf operator.
            prev_prev_token = prev_token
            prev_token = token

    return (text_tokens, page_tokens)
480 | prev_prev_token = prev_token 481 | prev_token = token 482 | 483 | return (text_tokens, page_tokens) 484 | 485 | 486 | def chunk_pairs(s): 487 | while len(s) >= 2: 488 | yield (s.pop(0), s.pop(0)) 489 | 490 | 491 | def chunk_triples(s): 492 | while len(s) >= 3: 493 | yield (s.pop(0), s.pop(0), s.pop(0)) 494 | 495 | 496 | class CMap(object): 497 | def __init__(self, cmap): 498 | self.bytes_to_unicode = {} 499 | self.unicode_to_bytes = {} 500 | self.defns = {} 501 | self.usecmap = None 502 | 503 | # Decompress the CMap stream & check that it's not compressed in a way 504 | # we can't understand. 505 | from pdfrw.uncompress import uncompress as uncompress_streams 506 | 507 | uncompress_streams([cmap]) 508 | 509 | # print(cmap.stream, file=sys.stderr) 510 | 511 | # This is based on https://github.com/euske/pdfminer/blob/master/pdfminer/cmapdb.py. 512 | from pdfrw import PdfString, PdfArray 513 | 514 | in_cmap = False 515 | operand_stack = [] 516 | codespacerange = [] 517 | 518 | def code_to_int(code): 519 | # decode hex encoding 520 | code = code.to_bytes() 521 | if sys.version_info < (3,): 522 | code = (ord(c) for c in code) 523 | from functools import reduce 524 | 525 | return reduce(lambda x0, x: x0 * 256 + x, (b for b in code)) 526 | 527 | def add_mapping(code, char, offset=0): 528 | # Is this a mapping for a one-byte or two-byte character code? 529 | width = len(codespacerange[0].to_bytes()) 530 | assert len(codespacerange[1].to_bytes()) == width 531 | if width == 1: 532 | # one-byte entry 533 | if sys.version_info < (3,): 534 | code = chr(code) 535 | else: 536 | code = bytes([code]) 537 | elif width == 2: 538 | if sys.version_info < (3,): 539 | code = chr(code // 256) + chr(code & 255) 540 | else: 541 | code = bytes([code // 256, code & 255]) 542 | else: 543 | raise ValueError("Invalid code space range %s?" % repr(codespacerange)) 544 | 545 | # Some range operands take an array. 
546 | if isinstance(char, PdfArray): 547 | char = char[offset] 548 | 549 | # The Unicode character is given usually as a hex string of one or more 550 | # two-byte Unicode code points. 551 | if isinstance(char, PdfString): 552 | char = char.to_bytes() 553 | if sys.version_info < (3,): 554 | char = (ord(c) for c in char) 555 | 556 | c = "" 557 | for xh, xl in chunk_pairs(list(char)): 558 | c += chr(xh * 256 + xl) 559 | char = c 560 | 561 | if offset > 0: 562 | char = char[0:-1] + chr(ord(char[-1]) + offset) 563 | else: 564 | assert offset == 0 565 | 566 | self.bytes_to_unicode[code] = char 567 | self.unicode_to_bytes[char] = code 568 | 569 | for token in tokenize_streams([cmap.stream]): 570 | if token == "begincmap": 571 | in_cmap = True 572 | operand_stack[:] = [] 573 | continue 574 | elif token == "endcmap": 575 | in_cmap = False 576 | continue 577 | if not in_cmap: 578 | continue 579 | 580 | if token == "def": 581 | name = operand_stack.pop(0) 582 | value = operand_stack.pop(0) 583 | self.defns[name] = value 584 | 585 | elif token == "usecmap": 586 | self.usecmap = self.pop(0) 587 | 588 | elif token == "begincodespacerange": 589 | operand_stack[:] = [] 590 | elif token == "endcodespacerange": 591 | codespacerange = [operand_stack.pop(0), operand_stack.pop(0)] 592 | 593 | elif token in ("begincidrange", "beginbfrange"): 594 | operand_stack[:] = [] 595 | elif token in ("endcidrange", "endbfrange"): 596 | for (code1, code2, cid_or_name1) in chunk_triples(operand_stack): 597 | if not isinstance(code1, PdfString) or not isinstance(code2, PdfString): 598 | continue 599 | code1 = code_to_int(code1) 600 | code2 = code_to_int(code2) 601 | for code in range(code1, code2 + 1): 602 | add_mapping(code, cid_or_name1, code - code1) 603 | operand_stack[:] = [] 604 | 605 | elif token in ("begincidchar", "beginbfchar"): 606 | operand_stack[:] = [] 607 | elif token in ("endcidchar", "endbfchar"): 608 | for (code, char) in chunk_pairs(operand_stack): 609 | if not isinstance(code, 
PdfString): 610 | continue 611 | add_mapping(code_to_int(code), char) 612 | operand_stack[:] = [] 613 | 614 | elif token == "beginnotdefrange": 615 | operand_stack[:] = [] 616 | elif token == "endnotdefrange": 617 | operand_stack[:] = [] 618 | 619 | else: 620 | operand_stack.append(token) 621 | 622 | def dump(self): 623 | for code, char in self.bytes_to_unicode.items(): 624 | print(repr(code), char) 625 | 626 | def decode(self, string): 627 | ret = [] 628 | i = 0 629 | while i < len(string): 630 | if string[i : i + 1] in self.bytes_to_unicode: 631 | # byte matches a single-byte entry 632 | ret.append(self.bytes_to_unicode[string[i : i + 1]]) 633 | i += 1 634 | elif string[i : i + 2] in self.bytes_to_unicode: 635 | # next two bytes matches a multi-byte entry 636 | ret.append(self.bytes_to_unicode[string[i : i + 2]]) 637 | i += 2 638 | else: 639 | ret.append("?") 640 | i += 1 641 | return "".join(ret) 642 | 643 | def encode(self, string): 644 | ret = [] 645 | for c in string: 646 | ret.append(self.unicode_to_bytes.get(c, b"")) 647 | return b"".join(ret) 648 | 649 | 650 | def toUnicode(string, font, fontcache): 651 | # This is hard! 652 | 653 | if not font: 654 | # There is no font for this text. Assume Latin-1. 655 | return string.decode("Latin-1") 656 | elif font.ToUnicode: 657 | # Decompress the CMap stream & check that it's not compressed in a way 658 | # we can't understand. 659 | from pdfrw.uncompress import uncompress as uncompress_streams 660 | 661 | uncompress_streams([font.ToUnicode]) 662 | 663 | # Use the CMap, which maps character codes to Unicode code points. 
664 | if font.ToUnicode.stream not in fontcache: 665 | fontcache[font.ToUnicode.stream] = CMap(font.ToUnicode) 666 | cmap = fontcache[font.ToUnicode.stream] 667 | 668 | string = cmap.decode(string) 669 | # print(string, end='', file=sys.stderr) 670 | # sys.stderr.write(string) 671 | return string 672 | elif font.Encoding == "/WinAnsiEncoding": 673 | return string.decode("cp1252", "replace") 674 | elif font.Encoding == "/MacRomanEncoding": 675 | return string.decode("mac_roman", "replace") 676 | else: 677 | return "?" 678 | # raise ValueError("Don't know how to decode data from font %s." % font) 679 | 680 | 681 | def fromUnicode(string, font, fontcache, options): 682 | # Filter out characters that are not likely to have renderable glyphs 683 | # because the character didn't occur in the original PDF in its font. 684 | # For any character that didn't occur in the original PDF, replace it 685 | # with the first character in options.content_replacement_glyphs that 686 | # did occur in the original PDF. If none occurred, delete the character. 687 | if font and font.BaseFont in fontcache: 688 | char_occurs = fontcache[font.BaseFont] 689 | 690 | def map_char(c): 691 | for cc in [c] + options.content_replacement_glyphs: 692 | if cc in char_occurs: 693 | return cc 694 | return "" # no replacement glyph => omit character 695 | 696 | string = "".join(map_char(c) for c in string) 697 | 698 | # Encode the Unicode string in the same encoding that it was originally 699 | # stored in --- based on the font that was active when the token was 700 | # used in a text-showing operation. 701 | if not font: 702 | # There was no font for this text. Assume Latin-1. 703 | return string.encode("Latin-1") 704 | 705 | elif font.ToUnicode and font.ToUnicode.stream in fontcache: 706 | # Convert the Unicode code points back to one/two-byte CIDs. 707 | cmap = fontcache[font.ToUnicode.stream] 708 | return cmap.encode(string) 709 | 710 | # Convert using a simple encoding. 
711 | elif font.Encoding == "/WinAnsiEncoding": 712 | return string.encode("cp1252") 713 | elif font.Encoding == "/MacRomanEncoding": 714 | return string.encode("mac_roman") 715 | 716 | # Don't know how to handle this sort of font. 717 | else: 718 | raise ValueError("Don't know how to encode data to font %s." % font) 719 | 720 | 721 | def update_text_layer(options, text_tokens, page_tokens): 722 | if len(text_tokens) == 0: 723 | # No text content. 724 | return 725 | 726 | # Apply each regular expression to the text content... 727 | for pattern, function in options.content_filters: 728 | # Finding all matches... 729 | text_tokens_index = 0 730 | text_tokens_charpos = 0 731 | text_tokens_token_xdiff = 0 732 | text_content = "".join(t.value for t in text_tokens) 733 | for m in pattern.finditer(text_content): 734 | # We got a match at text_content[i1:i2]. 735 | i1 = m.start() 736 | i2 = m.end() 737 | 738 | # Pass the matched text to the replacement function to get replaced text. 739 | replacement = function(m) 740 | 741 | # Do a text replacement in the tokens that produced this text content. 742 | # It may have been produced by multiple tokens, so loop until we find them all. 743 | while i1 < i2: 744 | # Find the original tokens in the content stream that 745 | # produced the matched text. Start by advancing over any 746 | # tokens that are entirely before this span of text. 747 | while ( 748 | text_tokens_index < len(text_tokens) 749 | and text_tokens_charpos + len(text_tokens[text_tokens_index].value) - text_tokens_token_xdiff <= i1 750 | ): 751 | text_tokens_charpos += len(text_tokens[text_tokens_index].value) - text_tokens_token_xdiff 752 | text_tokens_index += 1 753 | text_tokens_token_xdiff = 0 754 | if text_tokens_index == len(text_tokens): 755 | break 756 | assert text_tokens_charpos <= i1 757 | 758 | # The token at text_tokens_index, and possibly subsequent ones, 759 | # are responsible for this text. 
Replace the matched content 760 | # here with replacement content. 761 | tok = text_tokens[text_tokens_index] 762 | 763 | # Where does this match begin within the token's text content? 764 | mpos = i1 - text_tokens_charpos 765 | assert mpos >= 0 766 | 767 | # How long is the match within this token? 768 | mlen = min(i2 - i1, len(tok.value) - text_tokens_token_xdiff - mpos) 769 | assert mlen >= 0 770 | 771 | # How much should we replace here? 772 | if mlen < (i2 - i1): 773 | # There will be more replaced later, so take the same number 774 | # of characters from the replacement text. 775 | r = replacement[:mlen] 776 | replacement = replacement[mlen:] 777 | else: 778 | # This is the last token in which we'll replace text, so put 779 | # all of the remaining replacement content here. 780 | r = replacement 781 | replacement = None # sanity 782 | 783 | # Do the replacement. 784 | tok.value = ( 785 | tok.value[: mpos + text_tokens_token_xdiff] + r + tok.value[mpos + mlen + text_tokens_token_xdiff :] 786 | ) 787 | text_tokens_token_xdiff += len(r) - mlen 788 | 789 | # Advance for next iteration. 790 | i1 += mlen 791 | 792 | 793 | def apply_updated_text(document, text_tokens, page_tokens): 794 | # Create a new content stream for each page by concatenating the 795 | # tokens in the page_tokens lists. 796 | from pdfrw import PdfArray 797 | 798 | for i, page in enumerate(document.pages): 799 | if page.Contents is None: 800 | continue # nothing was here 801 | 802 | # Replace the page's content stream with our updated tokens. 803 | # The content stream may have been an array of streams before, 804 | # so replace the whole thing with a single new stream. Unfortunately 805 | # the str on PdfArray and PdfDict doesn't work right. 
806 | def tok_str(tok): 807 | if isinstance(tok, PdfArray): 808 | return "[ " + " ".join(tok_str(x) for x in tok) + "] " 809 | if isinstance(tok, InlineImage): 810 | return "BI " + " ".join(tok_str(x) + " " + tok_str(y) for x, y in tok.items()) + " ID " + tok.stream + " EI " 811 | if isinstance(tok, PdfDict): 812 | return "<< " + " ".join(tok_str(x) + " " + tok_str(y) for x, y in tok.items()) + ">> " 813 | return str(tok) 814 | 815 | page.Contents = PdfDict() 816 | page.Contents.stream = "\n".join(tok_str(tok) for tok in page_tokens[i]) 817 | page.Contents.Length = len(page.Contents.stream) # reset 818 | 819 | 820 | def update_annotations(document, options): 821 | for page in document.pages: 822 | if hasattr(page, "Annots") and isinstance(page.Annots, list): 823 | for annotation in page.Annots: 824 | update_annotation(annotation, options) 825 | 826 | 827 | def update_annotation(annotation, options): 828 | from pdfrw.objects import PdfString 829 | 830 | # Contents holds a plain-text representation of the annotation 831 | # content, such as for accessibility. All annotation types may 832 | # have a Contents. NM holds the "annotation name" which also 833 | # could have redactable text, I suppose. Markup annotations have 834 | # "T" fields that hold a title / text label. Subj holds a 835 | # comment subject. CA, RC, and AC are used in widget annotations. 836 | for string_field in ("Contents", "NM", "T", "Subj", "CA", "RC", "AC"): 837 | if getattr(annotation, string_field): 838 | value = getattr(annotation, string_field).to_unicode() 839 | for pattern, function in options.content_filters: 840 | value = pattern.sub(function, value) 841 | setattr(annotation, string_field, PdfString.from_unicode(value)) 842 | 843 | # A rich-text stream. Not implemented. Bail so that we don't 844 | # accidentally leak something that should be redacted. 
845 | if annotation.RC: 846 | raise ValueError("Annotation rich-text streams (Annot/RC) are not supported.") 847 | 848 | # An action, usually used for links. 849 | if annotation.A: 850 | update_annotation_action(annotation, annotation.A, options) 851 | if annotation.PA: 852 | update_annotation_action(annotation, annotation.PA, options) 853 | 854 | # If set, another annotation. 855 | if annotation.Popup: 856 | update_annotation(annotation.Popup, options) 857 | 858 | 859 | # TODO? Redaction annotations have some other attributes that might 860 | # have text. But since they're intended for redaction... maybe we 861 | # should keep them anyway. 862 | 863 | 864 | def update_annotation_action(annotation, action, options): 865 | from pdfrw.objects import PdfString 866 | 867 | if action.URI and options.link_filters: 868 | value = action.URI.to_unicode() 869 | for func in options.link_filters: 870 | value = func(value, annotation) 871 | if value is None: 872 | # Remove annotation by suppressing the action. 873 | action.URI = None 874 | else: 875 | action.URI = PdfString.from_unicode(value) 876 | 877 | if action.Next: 878 | # May be an Action or array of Actions to execute next. 
879 | next_action = action.Next 880 | if isinstance(action.Next, dict): 881 | next_action = [action.Next] 882 | for a in next_action: 883 | update_annotation_action(annotation, a, options) 884 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 127 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Dependencies 2 | pdfrw==0.4 3 | wkhtmltopdf==0.2 4 | pdfkit==0.6.1 5 | defusedxml==0.6.0 6 | beautifulsoup4==4.9.1 7 | 8 | # Development dependencies 9 | chardet==3.0.4 10 | pre-commit==2.2.0 11 | pytest==5.4.1 12 | pytest-cov==2.8.1 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | from setuptools import find_packages 4 | 5 | setup( 6 | name="expose-text", 7 | version="0.1.6", 8 | url="https://openredact.org/", 9 | author="Jonas Langhabel, Malte Ostendorff", 10 | author_email="hello@openredact.org", 11 | packages=find_packages(exclude=["tests"]), 12 | include_package_data=True, 13 | license="MIT", 14 | description="A Python module that exposes text for modification in multiple file types.", 15 | long_description=open("README.md").read(), 16 | long_description_content_type="text/markdown", 17 | install_requires=["pdfrw==0.4", "defusedxml==0.6.0", "beautifulsoup4==4.9.1", "wkhtmltopdf==0.2", "pdfkit==0.6.1"], 18 | python_requires=">=3.7", 19 | ) 20 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logging.basicConfig( 4 | format="%(asctime)s - 
%(levelname)s - %(name)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, 5 | ) 6 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | 6 | @pytest.fixture 7 | def test_files(): 8 | return Path(__file__).parent / "files" 9 | -------------------------------------------------------------------------------- /tests/files/doctest.txt: -------------------------------------------------------------------------------- 1 | This is the content as string. -------------------------------------------------------------------------------- /tests/files/doctest_altered.txt: -------------------------------------------------------------------------------- 1 | That is the new content as string! -------------------------------------------------------------------------------- /tests/files/foo.bar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openredact/expose-text/c8e21774d0ca2f0103e91b50ea093c87a3738a3e/tests/files/foo.bar -------------------------------------------------------------------------------- /tests/files/pdf/doc.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openredact/expose-text/c8e21774d0ca2f0103e91b50ea093c87a3738a3e/tests/files/pdf/doc.pdf -------------------------------------------------------------------------------- /tests/files/test.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openredact/expose-text/c8e21774d0ca2f0103e91b50ea093c87a3738a3e/tests/files/test.docx -------------------------------------------------------------------------------- /tests/files/test.html: -------------------------------------------------------------------------------- 1 | 2 | 
3 | 4 | 5 | 6 | 7 | 8 | This is a test - The title is not considered by expose-text 9 | 10 | 11 | 12 |
This is some kind of header
13 |
14 |

And now some content…
well not very much.

15 |

This sentence will be replaced.

16 |
17 | 20 |
One more line with an ümlaut.
21 | 22 | 23 | -------------------------------------------------------------------------------- /tests/files/test.txt: -------------------------------------------------------------------------------- 1 | This is a test file. 2 | 3 | With multiple lines. 4 | 5 | See if you can change its content. 6 | -------------------------------------------------------------------------------- /tests/files/test_altered.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openredact/expose-text/c8e21774d0ca2f0103e91b50ea093c87a3738a3e/tests/files/test_altered.docx -------------------------------------------------------------------------------- /tests/files/test_altered.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | This is a test - The title is not considered by expose-text 9 | 10 | 11 | 12 |
This is some kind of header
13 |
14 |

And now some content…
well not very much.

15 |

A new sentence.

16 |
17 | 20 |
One more line with an ümlaut.
21 | 22 | 23 | -------------------------------------------------------------------------------- /tests/files/test_altered.txt: -------------------------------------------------------------------------------- 1 | This is a test file. With a single line. See if you can change its content. 2 | -------------------------------------------------------------------------------- /tests/files/tmp/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openredact/expose-text/c8e21774d0ca2f0103e91b50ea093c87a3738a3e/tests/files/tmp/.gitkeep -------------------------------------------------------------------------------- /tests/test_alterations_buffer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from expose_text.formats._utils import AlterationsBuffer 4 | 5 | 6 | @pytest.fixture() 7 | def buffer(): 8 | return AlterationsBuffer() 9 | 10 | 11 | def test_invalid_type(buffer): 12 | with pytest.raises(TypeError): 13 | buffer += 0 14 | 15 | with pytest.raises(TypeError): 16 | buffer += (1, 2) 17 | 18 | 19 | def test_overlapping_alterations(buffer): 20 | buffer.add(5, 15, "luke") 21 | 22 | with pytest.raises(ValueError): 23 | buffer.add(0, 10, "vader") 24 | 25 | with pytest.raises(ValueError): 26 | buffer.add(10, 20, "obi") 27 | 28 | 29 | def test_non_overlapping_corner_cases(buffer): 30 | buffer.add(5, 15, "anakin") # existing one 31 | 32 | buffer.add(0, 5, "jango") 33 | buffer.add(15, 20, "boba") 34 | assert len(buffer) == 3 35 | 36 | 37 | def test_sorting(buffer): 38 | buffer.add(0, 5, "yoda") 39 | buffer.add(20, 25, "jarjar") 40 | buffer.add(10, 15, "kenobi") 41 | buffer.sort() 42 | assert list(buffer) == [(0, 5, "yoda"), (10, 15, "kenobi"), (20, 25, "jarjar")] 43 | -------------------------------------------------------------------------------- /tests/test_apply_buffer_to_text.py: 
-------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from expose_text.formats._utils import apply_buffer_to_text, AlterationsBuffer 4 | 5 | 6 | @pytest.fixture 7 | def text(): 8 | return "This is the content of a text file.\n\nWith multiple lines.\n\nTry alter me." 9 | 10 | 11 | @pytest.fixture 12 | def buffer(): 13 | return AlterationsBuffer() 14 | 15 | 16 | def test_replace_text(buffer, text): 17 | buffer.add(0, 4, "That") 18 | altered_text = apply_buffer_to_text(buffer, text) 19 | assert altered_text == "That is the content of a text file.\n\nWith multiple lines.\n\nTry alter me." 20 | 21 | 22 | def test_remove_text(buffer, text): 23 | buffer.add(35, 59, " ") 24 | altered_text = apply_buffer_to_text(buffer, text) 25 | assert altered_text == "This is the content of a text file. Try alter me." 26 | -------------------------------------------------------------------------------- /tests/test_auto_pdf_format.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from expose_text import FileWrapper 6 | from expose_text.formats.pdf.pdf2html2pdf import Pdf2Html2PdfFormat 7 | 8 | black_square = u"\u25A0" 9 | 10 | 11 | @pytest.fixture 12 | def tmp_files(): 13 | return Path(__file__).parent / "files" / "tmp" 14 | 15 | 16 | @pytest.fixture 17 | def test_files(): 18 | return Path(__file__).parent / "files" / "pdf" 19 | 20 | 21 | def test_pdf_text(tmp_files, test_files): 22 | """ 23 | 24 | Run this test alone: pytest -s tests/test_pdf_format.py 25 | 26 | """ 27 | input_fp = test_files / "doc.pdf" 28 | output_fp = tmp_files / "doc.altered.pdf" 29 | 30 | fw = FileWrapper(input_fp) 31 | 32 | print(fw.text[:100]) 33 | 34 | fw.add_alter(0, 9, "Deutscher") # replace "Deutscher" 35 | fw.apply_alters() 36 | 37 | print("xxx") 38 | 39 | print(fw.text[:100]) 40 | 41 | # assert "XXXXXXX" == fw.text[0:7] # TODO there is something wrong with 
indexing 42 | 43 | fw.save(output_fp) 44 | 45 | 46 | def test_check_dependencies(): 47 | print(Pdf2Html2PdfFormat().is_installed()) 48 | -------------------------------------------------------------------------------- /tests/test_docx_format.py: -------------------------------------------------------------------------------- 1 | import filecmp 2 | 3 | import pytest 4 | 5 | from expose_text import FileWrapper 6 | from expose_text.formats._docx import DocxFormat 7 | 8 | ENCODING = "UTF-8" 9 | 10 | 11 | @pytest.fixture 12 | def docx_bytes(test_files): 13 | with open(test_files / "test.docx", "rb") as f: 14 | return f.read() 15 | 16 | 17 | @pytest.fixture 18 | def docx_text(): 19 | return """Title 20 | 21 | Some body lines. 22 | 23 | A text in different colors and styles. 24 | 25 | This is a paragraph with a line 26 | break and nasty tags.""" 27 | 28 | 29 | @pytest.fixture 30 | def format_cls(docx_bytes): 31 | format_cls = DocxFormat() 32 | format_cls.load(docx_bytes) 33 | return format_cls 34 | 35 | 36 | @pytest.fixture 37 | def replace(): 38 | def function(string, start, stop, new_content): 39 | return string[:start] + new_content + string[stop:] 40 | 41 | return function 42 | 43 | 44 | def test_text_property(format_cls, docx_text): 45 | assert format_cls.text == docx_text 46 | 47 | 48 | def test_bytes_property(format_cls, docx_text): 49 | format_again = DocxFormat() 50 | format_again.load(format_cls.bytes) 51 | assert format_again.text == docx_text 52 | 53 | 54 | def test_replacing_with_longer_text(format_cls, docx_text, replace): 55 | args = 25, 63, "This is the replaced line." 
56 | format_cls.add_alter(*args) 57 | format_cls.apply_alters() 58 | assert format_cls.text == replace(docx_text, *args) 59 | 60 | 61 | def test_replacing_with_shorter_text(format_cls, docx_text, replace): 62 | args = 7, 23, "XXX" 63 | format_cls.add_alter(*args) 64 | format_cls.apply_alters() 65 | assert format_cls.text == replace(docx_text, *args) 66 | 67 | 68 | def test_removing_text(format_cls, docx_text, replace): 69 | args = 64, 124, "" 70 | format_cls.add_alter(*args) 71 | format_cls.apply_alters() 72 | assert format_cls.text == replace(docx_text, *args) 73 | 74 | 75 | def test_alter_file(test_files, tmp_path): 76 | file_path = test_files / "test.docx" 77 | altered_file_path = test_files / "test_altered.docx" 78 | tmp_out_path = tmp_path / "test_out.docx" 79 | 80 | file_wrapper = FileWrapper(file_path) 81 | file_wrapper.add_alter(7, 23, "XXX") 82 | file_wrapper.add_alter(25, 63, "This is the replaced line.") 83 | file_wrapper.add_alter(64, 124, "") 84 | file_wrapper.apply_alters() 85 | file_wrapper.save(tmp_out_path) 86 | 87 | assert ( 88 | file_wrapper.text 89 | == """Title 90 | 91 | XXX 92 | 93 | This is the replaced line. 
94 | """ 95 | ) 96 | assert filecmp.cmp(altered_file_path, tmp_out_path, shallow=False) 97 | -------------------------------------------------------------------------------- /tests/test_file_wrapper.py: -------------------------------------------------------------------------------- 1 | import filecmp 2 | 3 | import pytest 4 | 5 | from expose_text import FileWrapper, UnsupportedFormat 6 | 7 | 8 | def test_unsupported_format(test_files): 9 | with pytest.raises(UnsupportedFormat): 10 | FileWrapper(test_files / "foo.bar") 11 | 12 | 13 | def test_load_and_save_for_path(test_files, tmp_path): 14 | file_path = test_files / "test.txt" 15 | result_path = tmp_path / "test_out.txt" 16 | 17 | file_wrapper = FileWrapper(file_path) 18 | file_wrapper.save(result_path) 19 | 20 | assert filecmp.cmp(file_path, result_path, shallow=False) 21 | 22 | 23 | def test_load_and_save_for_string(test_files, tmp_path): 24 | file_path = test_files / "test.txt" 25 | result_path = tmp_path / "test_out.txt" 26 | 27 | file_wrapper = FileWrapper(str(file_path)) 28 | file_wrapper.save(str(result_path)) 29 | 30 | assert filecmp.cmp(file_path, result_path, shallow=False) 31 | 32 | 33 | def test_alter_file(test_files, tmp_path): 34 | file_path = test_files / "test.txt" 35 | altered_file_path = test_files / "test_altered.txt" 36 | tmp_out_path = tmp_path / "test_out.txt" 37 | 38 | file_wrapper = FileWrapper(file_path) 39 | file_wrapper.add_alter(20, 44, " With a single line. 
") 40 | file_wrapper.apply_alters() 41 | file_wrapper.save(tmp_out_path) 42 | 43 | assert filecmp.cmp(altered_file_path, tmp_out_path, shallow=False) 44 | -------------------------------------------------------------------------------- /tests/test_html_format.py: -------------------------------------------------------------------------------- 1 | import filecmp 2 | 3 | import pytest 4 | 5 | from expose_text import FileWrapper 6 | from expose_text.formats._html import HtmlFormat 7 | 8 | ENCODING = "UTF-8" 9 | 10 | 11 | @pytest.fixture 12 | def html_snippet(): 13 | return """

German paragraph

\n

1. … macht mich glücklich

""".encode(ENCODING) 14 | 15 | 16 | @pytest.fixture 17 | def format_cls(html_snippet): 18 | format_cls = HtmlFormat() 19 | format_cls.load(html_snippet) 20 | return format_cls 21 | 22 | 23 | def test_text_property(format_cls): 24 | assert format_cls.text == "German paragraph\n1. … macht mich glücklich" 25 | 26 | 27 | def test_bytes_property(format_cls): 28 | assert format_cls.bytes == '

German paragraph

\n' "

1. … macht mich glücklich

".encode( 29 | ENCODING 30 | ) 31 | 32 | 33 | def test_unescaping_html(): 34 | html_bytes = '

<>&

\n

… macht mich glücklich

'.encode(ENCODING) 35 | format_cls = HtmlFormat() 36 | format_cls.load(html_bytes) 37 | assert format_cls.text == "<>&\n… macht mich glücklich" 38 | assert format_cls.bytes == '

<>&

\n' "

… macht mich glücklich

".encode( 39 | ENCODING 40 | ) 41 | 42 | 43 | def test_same_length_replacing(format_cls): 44 | format_cls.add_alter(0, 6, "XXXXXX") 45 | format_cls.apply_alters() 46 | assert format_cls.text == "XXXXXX paragraph\n1. … macht mich glücklich" 47 | assert format_cls.bytes == '

XXXXXX paragraph

\n' "

1. … macht mich glücklich

".encode( 48 | ENCODING 49 | ) 50 | 51 | 52 | def test_replacing_with_longer_text(format_cls): 53 | format_cls.add_alter(0, 6, "XXXXXXXXX") 54 | format_cls.apply_alters() 55 | assert format_cls.text == "XXXXXXXXX paragraph\n1. … macht mich glücklich" 56 | assert ( 57 | format_cls.bytes == '

XXXXXXXXX paragraph

\n' 58 | "

1. … macht mich glücklich

".encode(ENCODING) 59 | ) 60 | 61 | 62 | def test_replacing_with_shorter_text(format_cls): 63 | format_cls.add_alter(0, 6, "XXX") 64 | format_cls.apply_alters() 65 | assert format_cls.text == "XXX paragraph\n1. … macht mich glücklich" 66 | assert format_cls.bytes == '

XXX paragraph

\n' "

1. … macht mich glücklich

".encode( 67 | ENCODING 68 | ) 69 | 70 | 71 | def test_removing_text(format_cls): 72 | format_cls.add_alter(0, 7, "") 73 | format_cls.apply_alters() 74 | assert format_cls.text == "paragraph\n1. … macht mich glücklich" 75 | assert format_cls.bytes == '

paragraph

\n' "

1. … macht mich glücklich

".encode( 76 | ENCODING 77 | ) 78 | 79 | 80 | def test_removing_entire_content_of_element(format_cls): 81 | format_cls.add_alter(0, 16, "") 82 | format_cls.apply_alters() 83 | assert format_cls.text == "\n1. … macht mich glücklich" 84 | assert format_cls.bytes == '

\n' "

1. … macht mich glücklich

".encode(ENCODING) 85 | 86 | 87 | def test_removing_over_element_borders(format_cls): 88 | format_cls.add_alter(0, 20, "") 89 | format_cls.apply_alters() 90 | assert format_cls.text == "… macht mich glücklich" 91 | assert format_cls.bytes == '

\n' "

… macht mich glücklich

".encode(ENCODING) 92 | 93 | 94 | def test_replacing_over_element_borders(format_cls): 95 | format_cls.add_alter(0, 20, "All content goes in the first element. ") 96 | format_cls.apply_alters() 97 | assert format_cls.text == "All content goes in the first element. … macht mich glücklich" 98 | assert ( 99 | format_cls.bytes == '

All content goes in the first element.

\n' 100 | "

… macht mich glücklich

".encode(ENCODING) 101 | ) 102 | 103 | 104 | def test_escaping_html_characters(format_cls): 105 | format_cls.add_alter(0, 6, "") 106 | format_cls.apply_alters() 107 | assert format_cls.text == " paragraph\n1. … macht mich glücklich" 108 | assert ( 109 | format_cls.bytes == '

<Language> paragraph

\n' 110 | "

1. … macht mich glücklich

".encode(ENCODING) 111 | ) 112 | 113 | 114 | def test_umlauts(format_cls): 115 | format_cls.add_alter(20, 21, "Ein Äffchen") 116 | format_cls.apply_alters() 117 | assert format_cls.text == "German paragraph\n1. Ein Äffchen macht mich glücklich" 118 | assert ( 119 | format_cls.bytes == '

German paragraph

\n' 120 | "

1. Ein Äffchen macht mich glücklich

".encode(ENCODING) 121 | ) 122 | 123 | 124 | def test_chained_alterations(format_cls): 125 | format_cls.add_alter(33, 42, "froh") 126 | format_cls.add_alter(7, 19, "Paragraph:") 127 | format_cls.add_alter(0, 6, "Deutscher") 128 | format_cls.add_alter(20, 21, "Essen") 129 | format_cls.apply_alters() 130 | assert format_cls.text == "Deutscher Paragraph: Essen macht mich froh" 131 | assert format_cls.bytes == '

Deutscher Paragraph:

\n' "

Essen macht mich froh

".encode( 132 | ENCODING 133 | ) 134 | 135 | 136 | def test_altering_html_body(): 137 | html_bytes = ( 138 | '

German paragraph

\n

1. … macht mich glücklich

' 139 | ).encode(ENCODING) 140 | format_cls = HtmlFormat() 141 | format_cls.load(html_bytes) 142 | print(format_cls.text) 143 | format_cls.add_alter(0, 6, "Deutscher") 144 | format_cls.add_alter(20, 21, "Essen") 145 | format_cls.apply_alters() 146 | assert format_cls.text == "Deutscher paragraph\n1. Essen macht mich glücklich" 147 | assert ( 148 | format_cls.bytes == '

Deutscher paragraph

\n' 149 | "

1. Essen macht mich glücklich

".encode(ENCODING) 150 | ) 151 | 152 | 153 | def test_altering_html_document(): 154 | html_bytes = """ 155 | 156 | 157 | 158 | 159 | 160 | The title is not considered. 161 | 162 | 163 | 164 | 165 | 166 |
167 |

German paragraph

168 |

1. … macht mich glücklich

169 |
170 | 171 | """.encode( 172 | ENCODING 173 | ) 174 | format_cls = HtmlFormat() 175 | format_cls.load(html_bytes) 176 | format_cls.add_alter(0, 6, "Deutscher") 177 | format_cls.add_alter(20, 21, "Essen") 178 | format_cls.apply_alters() 179 | assert format_cls.text == "Deutscher paragraph\n1. Essen macht mich glücklich" 180 | assert ( 181 | format_cls.bytes 182 | == """ 183 | 184 | 185 | 186 | 187 | 188 | The title is not considered. 189 | 190 | 191 | 192 | 193 | 194 |
195 |

Deutscher paragraph

196 |

1. Essen macht mich glücklich

197 |
198 | 199 | """.encode( 200 | ENCODING 201 | ) 202 | ) 203 | 204 | 205 | def test_alter_file(test_files, tmp_path): 206 | file_path = test_files / "test.html" 207 | altered_file_path = test_files / "test_altered.html" 208 | tmp_out_path = tmp_path / "test_out.html" 209 | 210 | file_wrapper = FileWrapper(file_path) 211 | file_wrapper.add_alter(71, 102, "A new sentence.") 212 | file_wrapper.apply_alters() 213 | file_wrapper.save(tmp_out_path) 214 | 215 | assert filecmp.cmp(altered_file_path, tmp_out_path, shallow=False) 216 | -------------------------------------------------------------------------------- /tests/test_pdf2html2pdf_format.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from expose_text import FileWrapper 6 | from expose_text.formats.pdf.auto_pdf import AutoPdfFormat 7 | 8 | black_square = u"\u25A0" 9 | 10 | 11 | @pytest.fixture 12 | def tmp_files(): 13 | return Path(__file__).parent / "files" / "tmp" 14 | 15 | 16 | @pytest.fixture 17 | def test_files(): 18 | return Path(__file__).parent / "files" / "pdf" 19 | 20 | 21 | def test_pdf_text(tmp_files, test_files): 22 | """ 23 | 24 | Run this test alone: pytest -s tests/test_pdf2html2pdf_format.py 25 | 26 | """ 27 | input_fp = test_files / "doc.pdf" 28 | output_fp = tmp_files / "doc.altered.pdf" 29 | 30 | fw = FileWrapper(input_fp, AutoPdfFormat) 31 | 32 | print("Before: %s" % fw.text[:25]) 33 | 34 | # fw.add_alter(0, 9, "Deutscher") # replace "Deutscher" 35 | fw.add_alter(0, 9, "".join(10 * [black_square])) # replace "Deutscher" 36 | fw.apply_alters() 37 | 38 | print("After: %s" % fw.text[:25]) 39 | 40 | fw.save(output_fp) 41 | 42 | print("Type: %s" % type(fw.file)) 43 | -------------------------------------------------------------------------------- /tests/test_pdf_format.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 
| from expose_text import FileWrapper 6 | from expose_text.formats.pdf.pdf2html2pdf import Pdf2Html2PdfFormat 7 | 8 | black_square = u"\u25A0" 9 | 10 | 11 | @pytest.fixture 12 | def tmp_files(): 13 | return Path(__file__).parent / "files" / "tmp" 14 | 15 | 16 | @pytest.fixture 17 | def test_files(): 18 | return Path(__file__).parent / "files" / "pdf" 19 | 20 | 21 | def test_pdf_text(tmp_files, test_files): 22 | """ 23 | 24 | Run this test alone: pytest -s tests/test_pdf_format.py 25 | 26 | """ 27 | input_fp = test_files / "doc.pdf" 28 | output_fp = tmp_files / "doc.altered.pdf" 29 | 30 | fw = FileWrapper(input_fp) 31 | 32 | print(fw.text[:100]) 33 | 34 | fw.add_alter(0, 9, "Deutscher") # replace "Deutscher" 35 | fw.apply_alters() 36 | 37 | print("xxx") 38 | 39 | print(fw.text[:100]) 40 | 41 | # assert "XXXXXXX" == fw.text[0:7] # TODO there is something wrong with indexing 42 | 43 | fw.save(output_fp) 44 | 45 | 46 | def test_check_dependencies(): 47 | print(Pdf2Html2PdfFormat().is_installed()) 48 | -------------------------------------------------------------------------------- /tests/test_txt_format.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from expose_text.formats._txt import TxtFormat 4 | 5 | ENCODING = "UTF-8" 6 | 7 | 8 | @pytest.fixture 9 | def txt_bytes(): 10 | return b"This is the content of a text file.\n\nWith multiple lines.\n\nTry alter me." 
11 | 12 | 13 | @pytest.fixture() 14 | def format_cls(txt_bytes): 15 | format_cls = TxtFormat() 16 | format_cls.load(txt_bytes) 17 | return format_cls 18 | 19 | 20 | def test_text_property(format_cls, txt_bytes): 21 | assert format_cls.text == txt_bytes.decode(ENCODING) 22 | 23 | 24 | def test_bytes_property(format_cls, txt_bytes): 25 | assert format_cls.bytes == txt_bytes 26 | 27 | 28 | def test_alterations(format_cls): 29 | # all indices are for the original string (in_bytes) 30 | format_cls.add_alter(0, 4, "That") 31 | format_cls.add_alter(35, 59, " ") 32 | format_cls.add_alter(63, 68, "change") 33 | format_cls.apply_alters() 34 | assert format_cls.text == "That is the content of a text file. Try change me." 35 | assert format_cls.bytes == b"That is the content of a text file. Try change me." 36 | 37 | 38 | @pytest.mark.parametrize("encoding", ["utf-8", "utf-16", "latin-1", "windows-1252"]) 39 | def test_encodings(encoding): 40 | encoded_string = "¾ der Mäuse sind weiß. The bread costs 7$.".encode(encoding) 41 | format_cls = TxtFormat() 42 | format_cls.load(encoded_string) 43 | assert format_cls.bytes == encoded_string 44 | assert format_cls.text == encoded_string.decode(encoding) 45 | --------------------------------------------------------------------------------