├── .flake8
├── .github
    └── workflows
    │   ├── codeql-analysis.yml
    │   ├── python-checks.yml
    │   └── python-publish.yml
├── .gitignore
├── LICENSE.txt
├── README.md
├── doc
    └── demo.png
├── pdfannots.py
├── pdfannots
    ├── __init__.py
    ├── __main__.py
    ├── cli.py
    ├── printer
    │   ├── __init__.py
    │   ├── json.py
    │   └── markdown.py
    ├── py.typed
    ├── types.py
    └── utils.py
├── pyproject.toml
├── requirements.txt
├── tests.py
└── tests
    ├── FreeText-annotation.pdf
    ├── caret.pdf
    ├── hotos17.pdf
    ├── issue13.pdf
    ├── issue46.pdf
    ├── issue61.pdf
    ├── issue9.pdf
    ├── pr24.pdf
    └── word2column.pdf


/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | ignore = E741,W503
3 | max-line-length = 100
4 | 


--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
 1 | name: "CodeQL"
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ main ]
 6 |   pull_request:
 7 |     # The branches below must be a subset of the branches above
 8 |     branches: [ main ]
 9 |   schedule:
10 |     - cron: '39 7 * * 3'
11 | 
12 | jobs:
13 |   analyze:
14 |     name: Analyze
15 |     runs-on: ubuntu-latest
16 |     permissions:
17 |       actions: read
18 |       contents: read
19 |       security-events: write
20 | 
21 |     strategy:
22 |       fail-fast: false
23 |       matrix:
24 |         language: [ 'python' ]
25 | 
26 |     steps:
27 |     - name: Checkout repository
28 |       uses: actions/checkout@v4
29 | 
30 |     # Initializes the CodeQL tools for scanning.
31 |     - name: Initialize CodeQL
32 |       uses: github/codeql-action/init@v3
33 |       with:
34 |         languages: ${{ matrix.language }}
35 |         # If you wish to specify custom queries, you can do so here or in a config file.
36 |         # By default, queries listed here will override any specified in a config file.
37 |         # Prefix the list here with "+" to use these queries and those in the config file.
38 |         # queries: ./path/to/local/query, your-org/your-repo/queries@main
39 | 
40 |     - name: Perform CodeQL Analysis
41 |       uses: github/codeql-action/analyze@v3
42 | 


--------------------------------------------------------------------------------
/.github/workflows/python-checks.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
 3 | 
 4 | name: Python checks
 5 | 
 6 | on:
 7 |   push:
 8 |     branches: [ main ]
 9 |   pull_request:
10 |     branches: [ main ]
11 | 
12 | jobs:
13 |   build:
14 | 
15 |     runs-on: ubuntu-latest
16 |     strategy:
17 |       matrix:
18 |         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
19 | 
20 |     steps:
21 |     - uses: actions/checkout@v4
22 |     - name: Set up Python ${{ matrix.python-version }}
23 |       uses: actions/setup-python@v5
24 |       with:
25 |         python-version: ${{ matrix.python-version }}
26 |     - name: Install dependencies
27 |       run: |
28 |         python -m pip install --upgrade pip
29 |         python -m pip install flake8 mypy autopep8 pytest
30 |         python -m pip install -r requirements.txt
31 |     - name: Type check with mypy
32 |       run: mypy .
33 |     - name: Lint with flake8
34 |       run: flake8 . --count --show-source --statistics
35 |     - name: Check formatting with autopep8
36 |       run: autopep8 --diff --recursive --exit-code .
37 |     - name: Test with pytest
38 |       run: pytest tests.py
39 | 


--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will build and upload a Python Package using Twine
 2 | name: Upload Python Package to PyPI
 3 | 
 4 | on:
 5 |   # Trigger automatically when a release is published
 6 |   release:
 7 |     types: [published]
 8 | 
 9 |   # Also permit manual dispatch
10 |   workflow_dispatch:
11 | 
12 | jobs:
13 |   deploy:
14 | 
15 |     runs-on: ubuntu-latest
16 | 
17 |     steps:
18 |     - uses: actions/checkout@v4
19 |     - name: Set up Python
20 |       uses: actions/setup-python@v5
21 |       with:
22 |         python-version: '3.x'
23 |     - name: Install dependencies
24 |       run: |
25 |         python -m pip install --upgrade pip
26 |         pip install build twine hatchling
27 |     - name: Extract version
28 |       id: get-version
29 |       run: echo "VERSION=$(python -m hatchling version)" >> "$GITHUB_OUTPUT"
30 |     - name: Build package
31 |       run: python -m build
32 |     - name: Check package
33 |       run: twine check --strict dist/*
34 |     - name: Publish package, only if correctly tagged
35 |       if: github.ref == format('refs/tags/v{0}', steps.get-version.outputs.VERSION)
36 |       run: twine upload --non-interactive --verbose --disable-progress-bar dist/*
37 |       env:
38 |         TWINE_USERNAME: __token__
39 |         TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
40 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | dist/
3 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) Microsoft Corporation (2016-2022) and Andrew Baumann (2022-). All rights reserved.
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining
 6 | a copy of this software and associated documentation files (the
 7 | "Software"), to deal in the Software without restriction, including
 8 | without limitation the rights to use, copy, modify, merge, publish,
 9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ## pdfannots
  2 | 
  3 | [![Build status](https://github.com/0xabu/pdfannots/actions/workflows/python-checks.yml/badge.svg)](https://github.com/0xabu/pdfannots/actions/workflows/python-checks.yml)
  4 | [![PyPI version](https://img.shields.io/pypi/v/pdfannots)](https://pypi.org/project/pdfannots/)
  5 | 
  6 | This program extracts annotations (highlights, comments, etc.) from a PDF file,
  7 | and formats them as Markdown or exports them to JSON. It is primarily intended
  8 | for use in reviewing submissions to scientific conferences/journals.
  9 | 
 10 | ![Sample/demo of pdfannots extracting Markdown from an annotated PDF](doc/demo.png)
 11 | 
 12 | For the default Markdown format, the output is as follows:
 13 | 
 14 |  * Highlights without an attached comment are output first, as
 15 |    "highlights" with just the highlighted text included. Note that
 16 |    these are not typically suitable for use in a review, since they're
 17 |    unlikely to have any meaning to the recipient; they are just meant
 18 |    to serve as a reminder to the reviewer.
 19 | 
 20 |  * Highlights with an attached comment, and text annotations (not
 21 |    attached to any particular text/highlight) are output next, as
 22 |    "detailed comments". Typically most comments on a reviewed paper
 23 |    are of this form.
 24 | 
 25 |  * Underline, strikeout, and squiggly underline annotations are output
 26 |    last, as "Nits", with or without an attached comment. The intention
 27 |    of this is to easily separate formatting or grammatical corrections
 28 |    from more substantial comments about the content of the document.
 29 | 
 30 | For each annotation, the page number is given, along with the associated
 31 | (highlighted/underlined) text, if any. Additionally, if the document embeds
 32 | outlines (aka bookmarks), such as those generated by the LaTeX
 33 | [hyperref](https://ctan.org/pkg/hyperref) package, they are printed to help
 34 | identify to which section in the document the annotation refers.
 35 | 
 36 | 
 37 | ### Installation
 38 | 
 39 | To install the latest released version from PyPI, use a command such as:
 40 | ```
 41 | python3 -m pip install pdfannots
 42 | ```
 43 | 
 44 | 
 45 | ### Usage
 46 | 
 47 | See `pdfannots --help` (in a source tree: `pdfannots.py --help`) for
 48 | options and invocation.
 49 | 
 50 | 
 51 | ### Dependencies
 52 | 
 53 |  * Python >= 3.8
 54 |  * [pdfminer.six](https://github.com/pdfminer/pdfminer.six)
 55 | 
 56 | 
 57 | ### Known issues and limitations
 58 | 
 59 |  * While it is generally reliable, pdfminer (the underlying PDF parser) is
 60 |    not infallible at extracting text from a PDF. It has been known to fail
 61 |    in several different ways:
 62 | 
 63 |     * Sometimes it misses or misplaces individual characters, resulting in
 64 |       annotations with some or all of the text missing (in the latter case,
 65 |       you'll see a warning).
 66 | 
 67 |     * Sometimes the characters are captured, but not spaces between the words.
 68 |       Tweaking the advanced layout analysis parameters (e.g., `--word-margin`)
 69 |       may help with this.
 70 | 
 71 |     * Sometimes it extracts all the text but renders it out of order, for
 72 |       example, reporting that text at the top of a second column comes before
 73 |       text at the end of the first column. This causes pdfannots to return the
 74 |       annotations out of order, or to report the wrong outlines (section
 75 |       headings) for annotations. You can mostly work around this issue by using
 76 |       the `--cols` parameter to force a fixed page layout for the document
 77 |       (e.g. `--cols=2` for a typical 2-column document).
 78 | 
 79 |  * If an annotation (such as a StrikeOut) covers solely whitespace, no text is
 80 |    extracted for the annotation, and it will be skipped (with a warning). This
 81 |    is an artifact of the way pdfminer reports whitespace with only an implicit
 82 |    position defined by surrounding characters.
 83 | 
 84 |  * When extracting text, we remove all hyphens that immediately precede a line
 85 |    break and join the adjacent words. This usually produces the best results
 86 |    with LaTeX multi-column documents (e.g. "soft-`\n`ware" becomes "software"),
 87 |    but sometimes the hyphen needs to stay (e.g. "memory-`\n`mapped", which will be
 88 |    extracted as "memorymapped"), and we can't tell the difference. To disable
 89 |    this behaviour, pass `--keep-hyphens`.
 90 | 
 91 | 
 92 | ### FAQ
 93 | 
 94 |  1. I'd like to change how the output is formatted.
 95 | 
 96 |     Some minor tweaks (e.g., word wrap, skipping or reordering output sections)
 97 |     can be accomplished via command-line arguments.
 98 | 
 99 |     All of the output comes from the relevant `Printer` subclass; more elaborate
100 |     changes can be accomplished there. Pull requests to introduce new output
101 |     formats or variants as printers are welcomed.
102 | 
103 |  2. I think I got a review generated by this tool...
104 | 
105 |     I hope that it was a constructive review, and that the annotations
106 |     helped the reviewer give you more detailed feedback so you can improve
107 |     your paper. This is, after all, just a tool, and it should not be an
108 |     excuse for reviewer sloppiness.
109 | 


--------------------------------------------------------------------------------
/doc/demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0xabu/pdfannots/08f6f0abdb690fb9d8b6cc3f6d244fd2d80c9af6/doc/demo.png


--------------------------------------------------------------------------------
/pdfannots.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | # This script, which is not part of the pdfannots package, allows pdfannots
4 | # to be run directly from a source tree clone.
5 | 
6 | if __name__ == '__main__':
7 |     import pdfannots.cli
8 |     pdfannots.cli.main()
9 | 


--------------------------------------------------------------------------------
/pdfannots/__init__.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Tool to extract and pretty-print PDF annotations for reviewing.
  3 | """
  4 | 
  5 | __version__ = '0.5'
  6 | 
  7 | import bisect
  8 | import collections
  9 | import itertools
 10 | import logging
 11 | import typing as typ
 12 | 
 13 | from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
 14 | from pdfminer.pdfpage import PDFPage
 15 | from pdfminer.layout import (LAParams, LTAnno, LTChar, LTComponent, LTContainer, LTFigure, LTItem,
 16 |                              LTPage, LTTextBox, LTTextLine)
 17 | from pdfminer.converter import PDFLayoutAnalyzer
 18 | from pdfminer.pdfparser import PDFParser
 19 | from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
 20 | from pdfminer.psparser import PSLiteralTable, PSLiteral
 21 | from pdfminer import pdftypes
 22 | import pdfminer.settings
 23 | import pdfminer.utils
 24 | 
 25 | from .types import Page, Outline, AnnotationType, Annotation, Document, RGB
 26 | from .utils import cleanup_text, decode_datetime
 27 | 
 28 | pdfminer.settings.STRICT = False
 29 | 
 30 | logger = logging.getLogger('pdfannots')
 31 | 
 32 | ANNOT_SUBTYPES: typ.Dict[PSLiteral, AnnotationType] = {
 33 |     PSLiteralTable.intern(e.name): e for e in AnnotationType}  # one entry per enumerant
 34 | """Mapping from PSliteral to our own enumerant, for supported annotation types."""
 35 | 
 36 | IGNORED_ANNOT_SUBTYPES = \
 37 |     frozenset(PSLiteralTable.intern(n) for n in (
 38 |         'Link',   # Links are used for internal document links (e.g. to other pages).
 39 |         'Popup',  # Controls the on-screen appearance of other annotations. TODO: we may want to
 40 |                   # check for an optional 'Contents' field for alternative human-readable contents.
 41 |     ))
 42 | """Annotation types that we ignore without issuing a warning."""
 43 | 
 44 | 
 45 | def _mkannotation(
 46 |     pa: typ.Dict[str, typ.Any],
 47 |     page: Page
 48 | ) -> typ.Optional[Annotation]:
 49 |     """
 50 |     Given a PDF annotation, capture relevant fields and construct an Annotation object.
 51 | 
 52 |     Refer to Section 8.4 of the PDF reference (version 1.7).
 53 |     """
 54 | 
 55 |     subtype = pa.get('Subtype')
 56 |     annot_type = None
 57 |     assert isinstance(subtype, PSLiteral)
 58 |     try:
 59 |         annot_type = ANNOT_SUBTYPES[subtype]
 60 |     except KeyError:
 61 |         pass
 62 | 
 63 |     if annot_type is None:
 64 |         if subtype not in IGNORED_ANNOT_SUBTYPES:
 65 |             logger.warning("Unsupported %s annotation ignored on %s", subtype.name, page)
 66 |         return None
 67 | 
 68 |     contents = pa.get('Contents')
 69 |     if contents is not None:
 70 |         # decode as string, normalise line endings, replace special characters
 71 |         contents = cleanup_text(pdfminer.utils.decode_text(contents))
 72 | 
 73 |     rgb: typ.Optional[RGB] = None
 74 |     color = pdftypes.resolve1(pa.get('C'))
 75 |     if color:
 76 |         if (isinstance(color, list)
 77 |                 and len(color) == 3
 78 |                 and all(isinstance(e, (int, float)) and 0 <= e <= 1 for e in color)):
 79 |             rgb = RGB(*color)
 80 |         else:
 81 |             logger.warning("Invalid color %s in annotation on %s", color, page)
 82 | 
 83 |     # Rect defines the location of the annotation on the page
 84 |     rect = pdftypes.resolve1(pa.get('Rect'))
 85 | 
 86 |     # QuadPoints are defined only for "markup" annotations (Highlight, Underline, StrikeOut,
 87 |     # Squiggly, Caret), where they specify the quadrilaterals (boxes) covered by the annotation.
 88 |     quadpoints = pdftypes.resolve1(pa.get('QuadPoints'))
 89 | 
 90 |     author = pdftypes.resolve1(pa.get('T'))
 91 |     if author is not None:
 92 |         author = pdfminer.utils.decode_text(author)
 93 | 
 94 |     name = pdftypes.resolve1(pa.get('NM'))
 95 |     if name is not None:
 96 |         name = pdfminer.utils.decode_text(name)
 97 | 
 98 |     created = None
 99 |     dobj = pa.get('CreationDate')
100 |     # some pdf apps set modification date, but not creation date
101 |     dobj = dobj or pa.get('ModDate')
102 |     # poppler-based apps (e.g. Okular) use 'M' for some reason
103 |     dobj = dobj or pa.get('M')
104 |     createds = pdftypes.resolve1(dobj)
105 |     if createds is not None:
106 |         createds = pdfminer.utils.decode_text(createds)
107 |         created = decode_datetime(createds)
108 | 
109 |     in_reply_to = pa.get('IRT')
110 |     is_group = False
111 |     if in_reply_to is not None:
112 |         reply_type = pa.get('RT')
113 |         if reply_type is PSLiteralTable.intern('Group'):
114 |             is_group = True
115 |         elif not (reply_type is None or reply_type is PSLiteralTable.intern('R')):
116 |             logger.warning("Unexpected RT=%s, treated as R", reply_type)
117 | 
118 |     return Annotation(page, annot_type, quadpoints=quadpoints, rect=rect, name=name,
119 |                       contents=contents, author=author, created=created, color=rgb,
120 |                       in_reply_to_ref=in_reply_to, is_group_child=is_group)
121 | 
122 | 
123 | def _get_outlines(doc: PDFDocument) -> typ.Iterator[Outline]:
124 |     """Retrieve a list of (unresolved) Outline objects for all recognised outlines in the PDF."""
125 | 
126 |     def _resolve_dest(dest: typ.Any) -> typ.Any:
127 |         if isinstance(dest, pdftypes.PDFObjRef):
128 |             dest = pdftypes.resolve1(dest)
129 |         if isinstance(dest, bytes):
130 |             dest = pdftypes.resolve1(doc.get_dest(dest))
131 |         elif isinstance(dest, PSLiteral):
132 |             dest = pdftypes.resolve1(doc.get_dest(dest.name))
133 |         if isinstance(dest, dict):
134 |             dest = dest['D']
135 |         return dest
136 | 
137 |     for (_, title, destname, actionref, _) in doc.get_outlines():
138 |         if destname is None and actionref:
139 |             action = pdftypes.resolve1(actionref)
140 |             if isinstance(action, dict):
141 |                 subtype = action.get('S')
142 |                 if subtype is PSLiteralTable.intern('GoTo'):
143 |                     destname = action.get('D')
144 |         if destname is None:
145 |             continue
146 |         dest = _resolve_dest(destname)
147 | 
148 |         # consider targets of the form [page /XYZ left top zoom]
149 |         if dest[1] is PSLiteralTable.intern('XYZ'):
150 |             (pageref, _, targetx, targety) = dest[:4]
151 | 
152 |             if not isinstance(pageref, (int, pdftypes.PDFObjRef)):
153 |                 logger.warning("Unsupported pageref in outline: %s", pageref)
154 |             else:
155 |                 if targetx is None or targety is None:
156 |                     # Treat as a general reference to the page
157 |                     target = None
158 |                 else:
159 |                     target = (targetx, targety)
160 |                     if not all(isinstance(v, (int, float)) for v in target):
161 |                         logger.warning("Unsupported target in outline: (%r, %r)", targetx, targety)
162 |                         target = None
163 | 
164 |                 yield Outline(title, pageref, target)
165 | 
166 | 
167 | class _PDFProcessor(PDFLayoutAnalyzer):
168 |     """
169 |     PDF processor class.
170 | 
171 |     This class encapsulates our primary interface with pdfminer's page layout logic. It is used
172 |     to define a logical order for the objects we care about (Annotations and Outlines) on a page,
173 |     and to capture the text that annotations may refer to.
174 |     """
175 | 
176 |     CONTEXT_CHARS = 256
177 |     """Maximum number of recent characters to keep as context."""
178 | 
179 |     page: typ.Optional[Page]                # Page being processed.
180 |     charseq: int                            # Character sequence number within the page.
181 |     compseq: int                            # Component sequence number within the page.
182 |     recent_text: typ.Deque[str]             # Rotating buffer of recent text, for context.
183 |     _lasthit: typ.FrozenSet[Annotation]     # Annotations hit by the most recent character.
184 |     _curline: typ.Set[Annotation]           # Annotations hit somewhere on the current line.
185 | 
186 |     # Stores annotations that are subscribed to receive their post-annotation
187 |     # context. The first element of each tuple, on which the list is sorted, is
188 |     # the sequence number of the last character to hit the annotation.
189 |     context_subscribers: typ.List[typ.Tuple[int, Annotation]]
190 | 
191 |     def __init__(self, rsrcmgr: PDFResourceManager, laparams: LAParams):
192 |         super().__init__(rsrcmgr, laparams=laparams)
193 |         self.page = None
194 |         self.recent_text = collections.deque(maxlen=self.CONTEXT_CHARS)
195 |         self.context_subscribers = []
196 |         self.clear()
197 | 
198 |     def clear(self) -> None:
199 |         """Reset our internal per-page state."""
200 |         self.charseq = 0
201 |         self.compseq = 0
202 |         self.recent_text.clear()
203 |         self.context_subscribers.clear()
204 |         self._lasthit = frozenset()
205 |         self._curline = set()
206 | 
207 |     def set_page(self, page: Page) -> None:
208 |         """Prepare to process a new page. Must be called prior to processing."""
209 |         assert self.page is None  # receive_layout() resets this once a page is done
210 |         self.page = page
211 | 
212 |     def receive_layout(self, ltpage: LTPage) -> None:
213 |         """Callback from PDFLayoutAnalyzer superclass. Called once with each laid-out page."""
214 |         assert self.page is not None
215 | 
216 |         # Re-initialise our per-page state
217 |         self.clear()
218 | 
219 |         # Render all the items on the page
220 |         self.render(ltpage)
221 | 
222 |         # The page is over: give annotations still awaiting post-context whatever followed them
223 |         for (charseq, annot) in self.context_subscribers:
224 |             available = self.charseq - charseq
225 |             annot.post_context = ''.join(self.recent_text[n] for n in range(-available, 0))
226 | 
227 |         self.page = None  # done with this page; set_page() may now be called again
228 | 
229 |     def update_pageseq(self, component: LTComponent) -> bool:
230 |         """Assign sequence numbers for objects on the page based on the nearest line of text.
231 |         Returns True if we need to recurse on smaller sub-components (e.g. characters)."""
232 |         assert self.page is not None
233 |         self.compseq += 1
234 | 
235 |         hits = 0  # how many annotations/outlines accepted this sequence number
236 |         for x in itertools.chain(self.page.annots, self.page.outlines):
237 |             if x.update_pageseq(component, self.compseq):
238 |                 hits += 1
239 | 
240 |         # If we have assigned the same sequence number to multiple objects, and there exist smaller
241 |         # sub-components (e.g. characters within a line), we'll recurse on those assigning sequence
242 |         # numbers to sub-components to disambiguate the hits, but first we must forget about the
243 |         # current sequence number.
244 |         # NB: This could be done more efficiently -- we really only need to disambiguate conflicts
245 |         # that still exist after processing *all* the line-level components on the same page, but
246 |         # that would require multiple rendering passes.
247 |         if hits > 1 and isinstance(component, LTContainer) and len(component) > 1:
248 |             for x in itertools.chain(self.page.annots, self.page.outlines):
249 |                 x.discard_pageseq(self.compseq)
250 |             return True
251 | 
252 |         return False
253 | 
254 |     def test_boxes(self, item: LTComponent) -> None:
255 |         """Update the set of annotations whose boxes intersect with the area of the given item."""
256 |         assert self.page is not None
257 |         hits = frozenset(a for a in self.page.annots if a.boxes
258 |                          and any(b.hit_item(item) for b in a.boxes))
259 |         self._lasthit = hits
260 |         self._curline.update(hits)
261 | 
262 |     def capture_context(self, text: str) -> None:
263 |         """Store the character for use as context, and update subscribers if required."""
264 |         self.recent_text.append(text)
265 |         self.charseq += 1
266 | 
267 |         # Notify subscribers for whom this character provides the full post-context.
268 |         while self.context_subscribers:
269 |             (charseq, annot) = self.context_subscribers[0]
270 |             assert charseq < self.charseq
271 |             if charseq == self.charseq - self.CONTEXT_CHARS:  # a full window has passed
272 |                 annot.set_post_context(''.join(self.recent_text))
273 |                 self.context_subscribers.pop(0)
274 |             else:
275 |                 assert charseq > self.charseq - self.CONTEXT_CHARS
276 |                 break
277 | 
278 |     def capture_char(self, text: str) -> None:
279 |         """Capture a character."""
280 |         self.capture_context(text)
281 | 
282 |         if text == '\n':
283 |             # "Broadcast" newlines to _all_ annotations that received any text on the
284 |             # current line, in case they see more text on the next line, even if the
285 |             # most recent character on the line was not covered by their boxes.
286 |             for a in self._curline:
287 |                 a.capture('\n')
288 |             self._curline = set()
289 |         else:
290 |             # Broadcast the character to annotations that include it.
291 |             for a in self._lasthit:
292 |                 last_charseq = a.last_charseq  # read before capture(), which presumably updates it
293 |                 a.capture(text, self.charseq)
294 | 
295 |                 if a.wants_context():
296 |                     if a.has_context():
297 |                         # We already gave the annotation the pre-context, so it is subscribed.
298 |                         # Locate and remove the annotation's existing context subscription.
299 |                         assert last_charseq != 0
300 |                         i = bisect.bisect_left(self.context_subscribers, (last_charseq,))
301 |                         assert 0 <= i < len(self.context_subscribers)
302 |                         while True:
303 |                             (found_charseq, found_annot) = self.context_subscribers[i]
304 |                             assert found_charseq == last_charseq
305 |                             if found_annot is a:
306 |                                 self.context_subscribers.pop(i)
307 |                                 break
308 |                             i += 1
309 |                             assert i < len(self.context_subscribers)
310 | 
311 |                     else:
312 |                         # This is the first hit for the annotation, so set the pre-context.
313 |                         assert last_charseq == 0
314 |                         assert len(a.text) != 0
315 |                         pre_context = ''.join(  # everything buffered except this char
316 |                             self.recent_text[n] for n in range(len(self.recent_text) - 1))
317 |                         a.set_pre_context(pre_context)
318 | 
319 |                     # Subscribe this annotation for post-context.
320 |                     self.context_subscribers.append((self.charseq, a))  # keeps list sorted
321 | 
322 |     def render(self, item: LTItem, pageseq_nested: bool = False) -> None:
323 |         """
324 |         Helper for receive_layout, called recursively for every item on a page, in layout order.
325 | 
326 |         Ref: https://pdfminersix.readthedocs.io/en/latest/topic/converting_pdf_to_text.html
327 |         """
328 |         # Assign sequence numbers to items on the page based on their proximity to lines of text or
329 |         # to figures (which may contain bare LTChar elements).
330 |         if isinstance(item, (LTTextLine, LTFigure)) or (
331 |                 pageseq_nested and isinstance(item, LTComponent)):
332 |             pageseq_nested = self.update_pageseq(item)
333 | 
334 |         # If it's a container, recurse on nested items.
335 |         if isinstance(item, LTContainer):
336 |             for child in item:
337 |                 self.render(child, pageseq_nested)
338 | 
339 |             # After the children of a text box, capture the end of the final
340 |             # line (logic derived from pdfminer.converter.TextConverter).
341 |             if isinstance(item, LTTextBox):
342 |                 self.capture_char('\n')
343 | 
344 |         # Each character is represented by one LTChar, and we must handle
345 |         # individual characters (not higher-level objects like LTTextLine)
346 |         # so that we can capture only those covered by the annotation boxes.
347 |         elif isinstance(item, LTChar):
348 |             self.test_boxes(item)
349 |             self.capture_char(item.get_text())
350 | 
351 |         # LTAnno objects capture whitespace not explicitly encoded in
352 |         # the text. They don't have an (X,Y) position -- we treat them
353 |         # the same as the most recent character.
354 |         elif isinstance(item, LTAnno):
355 |             self.capture_char(item.get_text())  # no test_boxes(): _lasthit is reused
357 | 
def process_file(
    file: typ.BinaryIO,
    *,  # Subsequent arguments are keyword-only
    columns_per_page: typ.Optional[int] = None,
    emit_progress_to: typ.Optional[typ.TextIO] = None,
    laparams: LAParams = LAParams()
) -> Document:
    """
    Process a PDF file, extracting its annotations and outlines.

    Arguments:
        file                Handle to PDF file
        columns_per_page    If set, overrides PDF Miner's layout detect with a fixed page layout
        emit_progress_to    If set, file handle (e.g. sys.stderr) to which progress is reported
        laparams            PDF Miner layout parameters

    Returns:
        A Document holding one Page per page of the PDF. Pages that carry
        annotations or outlines are rendered, and their annotations and
        outlines are sorted and post-processed.
    """

    # Initialise PDFMiner state
    rsrcmgr = PDFResourceManager()
    device = _PDFProcessor(rsrcmgr, laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    parser = PDFParser(file)
    doc = PDFDocument(parser)

    def emit_progress(msg: str) -> None:
        if emit_progress_to is not None:
            emit_progress_to.write(msg)
            emit_progress_to.flush()

    # Not every binary stream has a usable name: in-memory buffers such as
    # io.BytesIO have no `name` attribute, and a file opened from a descriptor
    # reports an int. Fall back to a placeholder and stringify, so that the
    # progress report can never make processing of a valid stream fail.
    emit_progress(str(getattr(file, "name", "<stream>")))

    # Retrieve outlines if present. Each outline refers to a page, using
    # *either* a PDF object ID or an integer page number. These references will
    # be resolved below while rendering pages -- for now we insert them into one
    # of two dicts for later.
    outlines_by_pageno: typ.Dict[object, typ.List[Outline]] = collections.defaultdict(list)
    outlines_by_objid: typ.Dict[object, typ.List[Outline]] = collections.defaultdict(list)

    try:
        for o in _get_outlines(doc):
            if isinstance(o.pageref, pdftypes.PDFObjRef):
                outlines_by_objid[o.pageref.objid].append(o)
            else:
                outlines_by_pageno[o.pageref].append(o)
    except PDFNoOutlines:
        logger.info("Document doesn't include outlines (\"bookmarks\")")
    except Exception as ex:
        # Outlines are optional: report the problem and carry on without them.
        logger.warning("Failed to retrieve outlines: %s", ex)

    # Iterate over all the pages, constructing page objects.
    result = Document()
    for (pageno, pdfpage) in enumerate(PDFPage.create_pages(doc)):
        emit_progress(" %d" % (pageno + 1))

        page = Page(pageno, pdfpage.pageid, pdfpage.label, pdfpage.mediabox, columns_per_page)
        result.pages.append(page)

        # Resolve any outlines referring to this page, and link them to the page.
        # Note that outlines may refer to the page number or ID.
        for o in (outlines_by_objid.pop(page.objid, [])
                  + outlines_by_pageno.pop(pageno, [])):
            o.resolve(page)
            page.outlines.append(o)

        # Dict from object ID (in the ObjRef) to Annotation object
        # This is used while post-processing to resolve inter-annotation references
        annots_by_objid: typ.Dict[int, Annotation] = {}

        # Construct Annotation objects, and append them to the page.
        for pa in pdftypes.resolve1(pdfpage.annots) if pdfpage.annots else []:
            if isinstance(pa, pdftypes.PDFObjRef):
                annot_dict = pdftypes.dict_value(pa)
                if annot_dict:  # Would be empty if pa is a broken ref
                    annot = _mkannotation(annot_dict, page)
                    if annot is not None:
                        page.annots.append(annot)
                        assert pa.objid not in annots_by_objid
                        annots_by_objid[pa.objid] = annot
            else:
                logger.warning("Unknown annotation: %s", pa)

        # If the page has neither outlines nor annotations, skip further processing.
        if not (page.annots or page.outlines):
            continue

        # Render the page. This captures the selected text for any annotations
        # on the page, and updates annotations and outlines with a logical
        # sequence number based on the order of text lines on the page.
        device.set_page(page)
        interpreter.process_page(pdfpage)

        # Now we have their logical order, sort the annotations and outlines.
        page.annots.sort()
        page.outlines.sort()

        # Give the annotations a chance to update their internals
        for a in page.annots:
            a.postprocess(annots_by_objid)

    emit_progress("\n")

    device.close()

    # all outlines should be resolved by now
    assert {} == outlines_by_pageno
    assert {} == outlines_by_objid

    return result
466 | 


--------------------------------------------------------------------------------
/pdfannots/__main__.py:
--------------------------------------------------------------------------------
1 | from .cli import main
2 | 
# Start the command-line interface only when this module is run as the main
# program; a plain import of the module does not call main().
if __name__ == "__main__":
    main()
5 | 


--------------------------------------------------------------------------------
/pdfannots/cli.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import logging
  3 | import sys
  4 | import typing as typ
  5 | 
  6 | from pdfminer.layout import LAParams
  7 | 
  8 | from . import __doc__, __version__, process_file
  9 | from .printer import Printer
 10 | from .printer.markdown import MarkdownPrinter, GroupedMarkdownPrinter
 11 | from .printer.json import JsonPrinter
 12 | 
 13 | 
 14 | MD_FORMAT_ARGS = frozenset([
 15 |     'condense',
 16 |     'group_highlights_by_color',
 17 |     'page_number_offset',
 18 |     'print_filename',
 19 |     'sections',
 20 |     'use_page_labels',
 21 |     'wrap_column',
 22 | ])
 23 | """Names of arguments passed to the markdown printer."""
 24 | 
 25 | 
 26 | def _float_or_disabled(x: str) -> typ.Optional[float]:
 27 |     if x.lower().strip() == "disabled":
 28 |         return None
 29 |     try:
 30 |         return float(x)
 31 |     except ValueError as ex:
 32 |         raise argparse.ArgumentTypeError("invalid float value: {}".format(x)) from ex
 33 | 
 34 | 
 35 | def parse_args() -> typ.Tuple[argparse.Namespace, LAParams]:
 36 |     p = argparse.ArgumentParser(prog='pdfannots', description=__doc__)
 37 | 
 38 |     p.add_argument('--version', action='version',
 39 |                    version='%(prog)s ' + __version__)
 40 | 
 41 |     p.add_argument("input", metavar="INFILE", type=argparse.FileType("rb"),
 42 |                    help="PDF files to process", nargs='+')
 43 | 
 44 |     g = p.add_argument_group('Basic options')
 45 |     g.add_argument("-p", "--progress", default=False, action="store_true",
 46 |                    help="Emit progress information to stderr.")
 47 |     g.add_argument("-o", metavar="OUTFILE", type=argparse.FileType("w", encoding="utf-8"),
 48 |                    dest="output", default=sys.stdout, help="Output file (default is stdout).")
 49 |     g.add_argument("-n", "--cols", default=None, type=int, metavar="COLS", dest="cols",
 50 |                    help="Assume a fixed top-to-bottom left-to-right page layout with this many "
 51 |                         "columns per page. If unset, PDFMiner's layout detection logic is used.")
 52 |     g.add_argument("--keep-hyphens", dest="remove_hyphens", default=True, action="store_false",
 53 |                    help="When capturing text across a line break, don't attempt to remove hyphens.")
 54 |     g.add_argument("-f", "--format", choices=["md", "json"], default="md",
 55 |                    help="Output format (default: markdown).")
 56 | 
 57 |     g = p.add_argument_group('Options controlling markdown output')
 58 |     mutex_group = g.add_mutually_exclusive_group()
 59 |     mutex_group.add_argument(
 60 |         "--no-group",
 61 |         dest="group",
 62 |         default=True, action="store_false",
 63 |         help="Emit annotations in order, don't group into sections."
 64 |     )
 65 |     mutex_group.add_argument(
 66 |         "--group-highlights-by-color",
 67 |         dest="group_highlights_by_color",
 68 |         default=False, action="store_true",
 69 |         help="Group highlights by color in grouped output."
 70 |     )
 71 | 
 72 |     g.add_argument("-s", "--sections", metavar="SEC", nargs="*",
 73 |                    choices=GroupedMarkdownPrinter.ALL_SECTIONS,
 74 |                    default=GroupedMarkdownPrinter.ALL_SECTIONS,
 75 |                    help=("sections to emit (default: %s)" %
 76 |                          ', '.join(GroupedMarkdownPrinter.ALL_SECTIONS)))
 77 |     g.add_argument("--no-condense", dest="condense", default=True, action="store_false",
 78 |                    help="Emit annotations as a blockquote regardless of length.")
 79 |     g.add_argument("--no-page-labels", dest="use_page_labels", default=True, action="store_false",
 80 |                    help="Ignore page labels if present, just print 1-based page numbers.")
 81 |     g.add_argument("--page-number-offset", dest="page_number_offset", default=0, type=int,
 82 |                    help="Increase or decrease page numbers with a fixed offset.")
 83 |     g.add_argument("--print-filename", dest="print_filename", default=False, action="store_true",
 84 |                    help="Print the name of each file with annotations.")
 85 |     g.add_argument("-w", "--wrap", dest="wrap_column", metavar="COLS", type=int,
 86 |                    help="Wrap text at this many output columns.")
 87 | 
 88 |     g = p.add_argument_group(
 89 |         "Advanced options affecting PDFMiner text layout analysis")
 90 |     laparams = LAParams()
 91 |     g.add_argument(
 92 |         "--line-overlap", metavar="REL_HEIGHT", type=float, default=laparams.line_overlap,
 93 |         help="If two characters have more overlap than this they are considered to be "
 94 |              "on the same line. The overlap is specified relative to the minimum height "
 95 |              "of both characters. Default: %s" % laparams.line_overlap)
 96 |     g.add_argument(
 97 |         "--char-margin", metavar="REL_WIDTH", type=float, default=laparams.char_margin,
 98 |         help="If two characters are closer together than this margin they "
 99 |              "are considered to be part of the same line. The margin is "
100 |              "specified relative to the character width. Default: %s" % laparams.char_margin)
101 |     g.add_argument(
102 |         "--word-margin", metavar="REL_WIDTH", type=float, default=laparams.word_margin,
103 |         help="If two characters on the same line are further apart than this "
104 |              "margin then they are considered to be two separate words, and "
105 |              "an intermediate space will be added for readability. The margin "
106 |              "is specified relative to the character width. Default: %s" % laparams.word_margin)
107 |     g.add_argument(
108 |         "--line-margin", metavar="REL_HEIGHT", type=float, default=laparams.line_margin,
109 |         help="If two lines are close together they are considered to "
110 |              "be part of the same paragraph. The margin is specified "
111 |              "relative to the height of a line. Default: %s" % laparams.line_margin)
112 |     g.add_argument(
113 |         "--boxes-flow", type=_float_or_disabled, default=laparams.boxes_flow,
114 |         help="Specifies how much a horizontal and vertical position of a "
115 |              "text matters when determining the order of lines. The value "
116 |              "should be within the range of -1.0 (only horizontal position "
117 |              "matters) to +1.0 (only vertical position matters). You can also "
118 |              "pass 'disabled' to disable advanced layout analysis, and "
119 |              "instead return text based on the position of the bottom left "
120 |              "corner of the text box. Default: %s" % laparams.boxes_flow)
121 | 
122 |     # The next two booleans are described as if they default off, so let's ensure that.
123 |     assert not laparams.detect_vertical
124 |     assert not laparams.all_texts
125 |     g.add_argument(
126 |         "--detect-vertical", default=laparams.detect_vertical,
127 |         action="store_const", const=(not laparams.detect_vertical),
128 |         help="Consider vertical text during layout analysis.")
129 |     g.add_argument(
130 |         "--all-texts", default=laparams.all_texts,
131 |         action="store_const", const=(not laparams.all_texts),
132 |         help="Perform layout analysis on text in figures.")
133 | 
134 |     args = p.parse_args()
135 | 
136 |     # Propagate parsed layout parameters back to LAParams object
137 |     for param in ("line_overlap", "char_margin", "word_margin", "line_margin",
138 |                   "boxes_flow", "detect_vertical", "all_texts"):
139 |         setattr(laparams, param, getattr(args, param))
140 | 
141 |     return args, laparams
142 | 
143 | 
def main() -> None:
    """
    Command-line entry point.

    Parses the arguments, builds the printer for the requested output format,
    then processes every input file in turn and writes the printer's output to
    the selected output stream.
    """
    args, laparams = parse_args()
    logging.basicConfig(level=logging.WARNING,
                        format='%(levelname)s: %(message)s')

    # Select and construct the printer for the requested format.
    printer: Printer
    if args.format == "md":
        printer_cls = GroupedMarkdownPrinter if args.group else MarkdownPrinter
        printer = printer_cls(**{name: getattr(args, name) for name in MD_FORMAT_ARGS})
    elif args.format == "json":
        printer = JsonPrinter(
            remove_hyphens=args.remove_hyphens,
            output_codec=args.output.encoding)

    out = args.output

    # Leading output, if the printer has any.
    preamble = printer.begin()
    if preamble:
        out.write(preamble)

    # Process the input files one by one, streaming the printer's output.
    for infile in args.input:
        progress_stream = sys.stderr if args.progress else None
        document = process_file(
            infile,
            columns_per_page=args.cols,
            emit_progress_to=progress_stream,
            laparams=laparams)
        for chunk in printer.print_file(infile.name, document):
            out.write(chunk)

    # Trailing output, if the printer has any.
    trailer = printer.end()
    if trailer:
        out.write(trailer)
176 | 


--------------------------------------------------------------------------------
/pdfannots/printer/__init__.py:
--------------------------------------------------------------------------------
 1 | import abc
 2 | import typing as typ
 3 | 
 4 | from ..types import Document
 5 | 
 6 | 
 7 | class Printer(abc.ABC):
 8 |     """
 9 |     Base class for pretty-printers.
10 |     """
11 | 
12 |     def begin(self) -> str:
13 |         """Called once prior to print_file call. Returns initial output."""
14 |         return ''
15 | 
16 |     @abc.abstractmethod
17 |     def print_file(
18 |         self,
19 |         filename: str,
20 |         document: Document
21 |     ) -> typ.Iterator[str]:
22 |         """
23 |         Pretty-print a single document.
24 | 
25 |         Pretty-print the extracted annotations, yielding output (incrementally) as strings.
26 |         Called multiple times, once per file.
27 |         """
28 | 
29 |     def end(self) -> str:
30 |         """Called once after the final print_file call. Returns any final additional output."""
31 |         return ''
32 | 


--------------------------------------------------------------------------------
/pdfannots/printer/json.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import typing as typ
 3 | 
 4 | from . import Printer
 5 | from ..types import Annotation, Document
 6 | 
 7 | 
 8 | def annot_to_dict(
 9 |     doc: Document,
10 |     annot: Annotation,
11 |     remove_hyphens: bool
12 | ) -> typ.Dict[str, typ.Any]:
13 |     """Convert an annotation to a dictionary representation suitable for JSON encoding."""
14 |     assert annot.pos
15 | 
16 |     result = {
17 |         "name": annot.name,
18 |         "type": annot.subtype.name,
19 |         "page": annot.pos.page.pageno + 1,
20 |         "page_label": annot.pos.page.label,
21 |         "start_xy": (annot.pos.x, annot.pos.y),
22 |         "prior_outline": getattr(doc.nearest_outline(annot.pos), 'title', None),
23 |         "text": annot.gettext(remove_hyphens),
24 |         "contents": annot.contents,
25 |         "author": annot.author,
26 |         "created": annot.created.strftime('%Y-%m-%dT%H:%M:%S') if annot.created else None,
27 |         "color": ('#' + annot.color.ashex()) if annot.color else None,
28 |         "in_reply_to": annot.in_reply_to.name if annot.in_reply_to else None,
29 |     }
30 | 
31 |     # Remove keys with None values in nested dictionary and return
32 |     return {k: v for k, v in result.items() if v is not None}
33 | 
34 | 
class JsonPrinter(Printer):
    """
    Printer that emits the annotations of a single document as one JSON array.

    Only one file can be printed per instance: a second print_file call raises
    RuntimeError once its result is iterated.
    """

    def __init__(
            self,
            *,
            remove_hyphens: bool,  # Whether to remove hyphens across a line break
            output_codec: str      # Text codec in use for output
    ) -> None:
        import codecs  # only needed here, to normalise the codec name

        self.remove_hyphens = remove_hyphens
        self.seen_first = False

        # Codec names come in many spellings ('UTF-8', 'utf8', 'UTF8', ...), so
        # compare the canonical name reported by the codec registry rather than
        # the raw string. An unknown or missing codec keeps the safe default of
        # escaping everything to ASCII.
        try:
            codec_name = codecs.lookup(output_codec).name
        except (LookupError, TypeError):
            codec_name = ''

        # JSON must be represented as UTF-8, UTF-16, or UTF-32. If the output codec is
        # one of these, we can disable ASCII string escaping in the JSON encoder.
        self.ensure_ascii = codec_name not in ['utf-8', 'utf-16', 'utf-32']

    def end(self) -> str:
        """Return the newline that terminates the JSON output."""
        return '\n'

    def print_file(
        self,
        filename: str,
        document: Document
    ) -> typ.Iterator[str]:
        """
        Yield the JSON encoding of all annotations in the document, replies included.

        Raises RuntimeError if a file has already been printed by this instance.
        """
        if self.seen_first:
            # The flat array format is incompatible with multiple input files
            # TODO: Ideally we'd catch this at invocation time
            raise RuntimeError("The JSON output format does not support multiple files.")
        else:
            self.seen_first = True

        annots = [annot_to_dict(document, a, self.remove_hyphens)
                  for a in document.iter_annots(include_replies=True)]
        yield from json.JSONEncoder(indent=2, ensure_ascii=self.ensure_ascii).iterencode(annots)
67 | 


--------------------------------------------------------------------------------
/pdfannots/printer/markdown.py:
--------------------------------------------------------------------------------
  1 | from collections import defaultdict
  2 | import logging
  3 | import textwrap
  4 | import typing as typ
  5 | 
  6 | from . import Printer
  7 | from ..types import RGB, AnnotationType, Pos, Annotation, Document
  8 | 
  9 | logger = logging.getLogger('pdfannots')
 10 | 
 11 | MAX_CONTEXT_WORDS = 10
 12 | """Maximum number of words returned by trim_context."""
 13 | 
 14 | FALLBACK_CONTEXT_WORDS = 4
 15 | """Number of words returned by trim_context in fallback mode."""
 16 | 
 17 | CONTEXT_BOUNDARIES = [
 18 |     # (separator, keep_on_left, keep_on_right)
 19 |     ('. ', False, True),  # sentence boundary
 20 |     ('! ', False, True),
 21 |     ('? ', False, True),
 22 |     (': ', False, False),
 23 |     ('; ', False, False),
 24 |     ('" ', False, True),   # end of quote
 25 |     (' "', True, False),   # start of quote
 26 |     (') ', False, True),   # end of parenthesis
 27 |     (' (', True, False),   # start of parenthesis
 28 |     ('—', False, False),   # em dash
 29 | ]
 30 | """Rough approximation of natural boundaries in writing, used when searching for context."""
 31 | 
 32 | 
 33 | def trim_context(context: str, keep_right: bool) -> str:
 34 |     """
 35 |     Trim context for presentation.
 36 | 
 37 |     Given a potentially-long string of context preceding or following an annotation, identify
 38 |     a natural boundary at which to trim it, and return the trimmed string.
 39 | 
 40 |     Arguments:
 41 |         context     String of captured context
 42 |         keep_right  Whether to retain text on the right (True) or left (False) end of the string
 43 |     """
 44 |     best = None
 45 | 
 46 |     for (sep, keep_sep_left, keep_sep_right) in CONTEXT_BOUNDARIES:
 47 |         # search for the separator
 48 |         i = context.rfind(sep) if keep_right else context.find(sep)
 49 |         if i < 0:
 50 |             continue
 51 | 
 52 |         # include the separator if desired
 53 |         if (keep_right and not keep_sep_left) or (not keep_right and keep_sep_right):
 54 |             i += len(sep)
 55 | 
 56 |         # extract the candidate string
 57 |         candidate = context[i:] if keep_right else context[:i]
 58 | 
 59 |         if best is None or len(candidate) < len(best):
 60 |             best = candidate
 61 |             if len(candidate.split()) <= 1:
 62 |                 break
 63 | 
 64 |     if best is not None and len(best.split()) <= MAX_CONTEXT_WORDS:
 65 |         return best
 66 | 
 67 |     # Give up and take a few words, whatever they are.
 68 |     if keep_right:
 69 |         fallback = '...' + ' '.join(context.split()[-FALLBACK_CONTEXT_WORDS:])
 70 |         if context[-1].isspace():
 71 |             fallback += context[-1]
 72 |     else:
 73 |         fallback = ' '.join(context.split()[:FALLBACK_CONTEXT_WORDS]) + '...'
 74 |         if context[0].isspace():
 75 |             fallback = context[0] + fallback
 76 | 
 77 |     return fallback
 78 | 
 79 | 
 80 | class MarkdownPrinter(Printer):
 81 |     BULLET_INDENT1 = " * "
 82 |     BULLET_INDENT2 = "   "
 83 |     QUOTE_INDENT = BULLET_INDENT2 + "> "
 84 | 
 85 |     def __init__(
 86 |         self,
 87 |         *,
 88 |         condense: bool = True,                  # Permit use of the condensed format
 89 |         page_number_offset: int = 0,            # Page number offset
 90 |         print_filename: bool = False,           # Whether to print file names
 91 |         remove_hyphens: bool = True,            # Whether to remove hyphens across a line break
 92 |         use_page_labels: bool = True,           # Whether to use page labels
 93 |         wrap_column: typ.Optional[int] = None,  # Column at which output is word-wrapped
 94 |         **kwargs: typ.Any                       # Other args, ignored
 95 |     ) -> None:
 96 |         self.page_number_offset = page_number_offset
 97 |         self.print_filename = print_filename
 98 |         self.remove_hyphens = remove_hyphens
 99 |         self.use_page_labels = use_page_labels
100 |         self.wrap_column = wrap_column
101 |         self.condense = condense
102 | 
103 |         if self.wrap_column:
104 |             # For bullets, we need two text wrappers: one for the leading
105 |             # bullet on the first paragraph, one without.
106 |             self.bullet_tw1 = textwrap.TextWrapper(
107 |                 width=self.wrap_column,
108 |                 initial_indent=self.BULLET_INDENT1,
109 |                 subsequent_indent=self.BULLET_INDENT2)
110 | 
111 |             self.bullet_tw2 = textwrap.TextWrapper(
112 |                 width=self.wrap_column,
113 |                 initial_indent=self.BULLET_INDENT2,
114 |                 subsequent_indent=self.BULLET_INDENT2)
115 | 
116 |             # For blockquotes, each line is prefixed with "> "
117 |             self.quote_tw = textwrap.TextWrapper(
118 |                 width=self.wrap_column,
119 |                 initial_indent=self.QUOTE_INDENT,
120 |                 subsequent_indent=self.QUOTE_INDENT)
121 | 
122 |     def print_file(
123 |         self,
124 |         filename: str,
125 |         document: Document
126 |     ) -> typ.Iterator[str]:
127 |         body_iter = self.emit_body(document)
128 | 
129 |         if self.print_filename:
130 |             # Print the file name, only if there is some output.
131 |             try:
132 |                 first = next(body_iter)
133 |             except StopIteration:
134 |                 pass
135 |             else:
136 |                 yield "# File: '%s'\n\n" % filename
137 |                 yield first
138 | 
139 |         yield from body_iter
140 | 
141 |     @staticmethod
142 |     def format_pos(
143 |         pos: Pos,
144 |         document: Document,
145 |         use_page_label: bool,
146 |         page_number_offset: int
147 |     ) -> str:
148 | 
149 |         result = pos.page.format_name(
150 |             use_label=use_page_label,
151 |             page_number_offset=page_number_offset).title()
152 | 
153 |         o = document.nearest_outline(pos)
154 |         if o:
155 |             result += " (%s)" % o.title
156 | 
157 |         return result
158 | 
159 |     def format_bullet(
160 |         self,
161 |         paras: typ.List[str],
162 |         quote: typ.Optional[typ.Tuple[int, int]] = None
163 |     ) -> str:
164 |         """
165 |         Format a Markdown bullet, wrapped as desired.
166 |         """
167 | 
168 |         if quote is not None:
169 |             (quotepos, quotelen) = quote
170 |             assert quotepos > 0  # first paragraph to format as a block-quote
171 |             assert quotelen > 0  # length of the blockquote in paragraphs
172 |             assert quotepos + quotelen <= len(paras)
173 | 
174 |         # emit the first paragraph with the bullet
175 |         if self.wrap_column:
176 |             ret = self.bullet_tw1.fill(paras[0])
177 |         else:
178 |             ret = self.BULLET_INDENT1 + paras[0]
179 | 
180 |         # emit subsequent paragraphs
181 |         npara = 1
182 |         for para in paras[1:]:
183 |             # are we in a blockquote?
184 |             inquote = quote and npara >= quotepos and npara < quotepos + quotelen
185 | 
186 |             # emit a paragraph break
187 |             # if we're going straight to a quote, we don't need an extra newline
188 |             ret = ret + ('\n' if quote and npara == quotepos else '\n\n')
189 | 
190 |             if self.wrap_column:
191 |                 tw = self.quote_tw if inquote else self.bullet_tw2
192 |                 ret = ret + tw.fill(para)
193 |             else:
194 |                 indent = self.QUOTE_INDENT if inquote else self.BULLET_INDENT2
195 |                 ret = ret + indent + para
196 | 
197 |             npara += 1
198 | 
199 |         return ret
200 | 
201 |     def merge_context(self, annot: Annotation, text: str) -> str:
202 |         """Merge the context for a strikeout or caret annotation into the text."""
203 |         (pre, post) = annot.get_context(self.remove_hyphens)
204 | 
205 |         if pre:
206 |             pre = trim_context(pre, keep_right=True)
207 | 
208 |         if post:
209 |             post = trim_context(post, keep_right=False)
210 | 
211 |         if annot.subtype == AnnotationType.StrikeOut:
212 |             return pre + '~~' + text + '~~' + post
213 |         else:
214 |             assert annot.subtype == AnnotationType.Caret
215 |             assert text.isspace()
216 |             return pre.rstrip(' ') + ' ^ ' + post.lstrip(' ')
217 | 
218 |     def format_annot(
219 |         self,
220 |         annot: Annotation,
221 |         document: Document,
222 |         extra: typ.Optional[str] = None
223 |     ) -> str:
224 |         # Limited support for Caret annotations with a single "reply" of type StrikeOut
225 |         contents = annot.contents
226 |         if annot.subtype == AnnotationType.Caret and annot.group_children:
227 |             child = annot.get_child_by_type(AnnotationType.StrikeOut)
228 |             if child:
229 |                 annot = child
230 |                 if child.contents:
231 |                     logger.warning("Ignored StrikeOut comment: %s", child.contents)
232 | 
233 |         # capture item text and contents (i.e. the comment), and split the latter into paragraphs
234 |         text = annot.gettext(self.remove_hyphens) or ''
235 |         comment = [l for l in contents.splitlines() if l] if contents else []
236 | 
237 |         if annot.has_context():
238 |             text = self.merge_context(annot, text)
239 | 
240 |         # we are either printing: item text and item contents, or one of the two
241 |         # if we see an annotation with neither, something has gone wrong
242 |         if not (text or comment):
243 |             logger.warning('%s annotation at %s has neither text nor a comment; skipped',
244 |                            annot.subtype.name, annot.pos)
245 |             return ''
246 | 
247 |         # compute the formatted position (and extra bit if needed) as a label
248 |         assert annot.pos is not None
249 |         label = self.format_pos(
250 |             annot.pos, document, self.use_page_labels, self.page_number_offset
251 |         ) + (" " + extra if extra else "") + ":"
252 | 
253 |         # If we have short (few words) text with a short or no comment, and the
254 |         # text contains no embedded full stops or quotes, then we'll just put
255 |         # quotation marks around the text and merge the two into a single paragraph.
256 |         if (self.condense
257 |             and text
258 |             and not annot.has_context()
259 |             and len(text.split()) <= 10  # words
260 |             and all([x not in text for x in ['"', '. ']])
261 |                 and (not comment or len(comment) == 1)):
262 |             msg = label + ' "' + text + '"'
263 |             if comment:
264 |                 msg = msg + ' -- ' + comment[0]
265 |             return self.format_bullet([msg]) + "\n\n"
266 | 
267 |         # If there is no text and a single-paragraph comment, it also goes on
268 |         # one line.
269 |         elif comment and not text and len(comment) == 1:
270 |             msg = label + " " + comment[0]
271 |             return self.format_bullet([msg]) + "\n\n"
272 | 
273 |         # Otherwise, text (if any) turns into a blockquote, and the comment (if
274 |         # any) into subsequent paragraphs.
275 |         else:
276 |             msgparas = [label] + [text] + comment
277 |             quotepos = (1, 1) if text else None
278 |             return self.format_bullet(msgparas, quotepos) + "\n\n"
279 | 
280 |     def emit_body(
281 |         self,
282 |         document: Document
283 |     ) -> typ.Iterator[str]:
284 |         for a in document.iter_annots():
285 |             yield self.format_annot(a, document, a.subtype.name)
286 | 
287 | 
class GroupedMarkdownPrinter(MarkdownPrinter):
    """
    Markdown printer that groups annotations into sections.

    Annotations are split into highlights, detailed comments and nits, and
    each non-empty group is emitted under its own header, in the order given
    by `sections`.
    """
    ANNOT_NITS = frozenset({AnnotationType.Caret, AnnotationType.Squiggly,
                            AnnotationType.StrikeOut, AnnotationType.Underline})
    ALL_SECTIONS = ["highlights", "comments", "nits"]

    def __init__(
        self,
        *,
        sections: typ.Sequence[str] = ALL_SECTIONS,  # controls the order of sections output
        group_highlights_by_color: bool = False,     # Whether to group highlights by color
        **kwargs: typ.Any                            # other args -- see superclass
    ) -> None:
        super().__init__(**kwargs)
        self.sections = sections
        self.group_highlights_by_color = group_highlights_by_color
        # Set at the start of every emit_body run; declared here for typing only.
        self._fmt_header_called: bool

    def emit_body(
        self,
        document: Document
    ) -> typ.Iterator[str]:
        """Yield the section headers and formatted annotations for one document."""

        self._fmt_header_called = False

        def fmt_header(name: str, level: int = 2) -> str:
            """
            Build a Markdown header line.

            Parameters:
                name (str): Header text.
                level (int, optional): Number of leading '#' characters. Defaults to 2.

            Returns:
                str: The header line, preceded by a blank separator line for
                every header after the first one of this document.
            """
            separator = '\n' if self._fmt_header_called else ''
            self._fmt_header_called = True
            return f"{separator}{'#' * level} {name}\n"

        # Sort every annotation into at most one bucket.
        nits: typ.List[Annotation] = []
        comments: typ.List[Annotation] = []
        # When grouping by color, this holds only the highlights without a color.
        highlights: typ.List[Annotation] = []
        highlights_by_color: typ.DefaultDict[RGB, typ.List[Annotation]] = defaultdict(list)

        for annot in document.iter_annots():
            if annot.subtype in self.ANNOT_NITS:
                nits.append(annot)
                continue
            if annot.contents:
                comments.append(annot)
                continue
            if annot.subtype == AnnotationType.Highlight:
                if self.group_highlights_by_color and annot.color:
                    highlights_by_color[annot.color].append(annot)
                else:
                    highlights.append(annot)

        # Emit the non-empty sections in the requested order.
        for secname in self.sections:
            if secname == 'highlights':
                if highlights or highlights_by_color:
                    yield fmt_header("Highlights")

                    for color, members in highlights_by_color.items():
                        yield fmt_header(f"Color: {color.ashex()}", level=3)
                        for annot in members:
                            yield self.format_annot(annot, document)

                    if highlights and self.group_highlights_by_color:
                        yield fmt_header("Color: undefined", level=3)

                    for annot in highlights:
                        yield self.format_annot(annot, document)

            elif secname == 'comments':
                if comments:
                    yield fmt_header("Detailed comments")
                    for annot in comments:
                        yield self.format_annot(annot, document)

            elif secname == 'nits':
                if nits:
                    yield fmt_header("Nits")
                    for annot in nits:
                        # Describe what kind of edit the nit suggests, where known.
                        extra: typ.Optional[str]
                        if annot.subtype == AnnotationType.Caret:
                            if annot.get_child_by_type(AnnotationType.StrikeOut):
                                extra = "suggested replacement"
                            else:
                                extra = "suggested insertion"
                        elif annot.subtype == AnnotationType.StrikeOut:
                            extra = "suggested deletion"
                        else:
                            extra = None

                        yield self.format_annot(annot, document, extra)
379 | 


--------------------------------------------------------------------------------
/pdfannots/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0xabu/pdfannots/08f6f0abdb690fb9d8b6cc3f6d244fd2d80c9af6/pdfannots/py.typed


--------------------------------------------------------------------------------
/pdfannots/types.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | 
  3 | import bisect
  4 | import datetime
  5 | import enum
  6 | import functools
  7 | import logging
  8 | import typing as typ
  9 | 
 10 | from pdfminer.layout import LTComponent, LTText
 11 | from pdfminer.pdftypes import PDFObjRef
 12 | 
 13 | from .utils import merge_lines
 14 | 
# Logger named 'pdfannots'; the classes below use it for debug and warning output.
logger = logging.getLogger('pdfannots')

# Type alias for a pair of floats.
Point = typ.Tuple[float, float]
"""An (x, y) point in PDF coordinates, i.e. bottom left is 0,0."""

# Type alias for four floats; Box.from_coords() unpacks them as (x0, y0, x1, y1).
BoxCoords = typ.Tuple[float, float, float, float]
"""The coordinates of a bounding box (x0, y0, x1, y1)."""
 22 | 
 23 | 
 24 | class Box:
 25 |     """
 26 |     Coordinates of a rectangular box.
 27 |     """
 28 | 
 29 |     def __init__(self, x0: float, y0: float, x1: float, y1: float):
 30 |         assert x0 <= x1 and y0 <= y1
 31 |         self.x0 = x0
 32 |         self.x1 = x1
 33 |         self.y0 = y0
 34 |         self.y1 = y1
 35 | 
 36 |     def __repr__(self) -> str:
 37 |         return '<Box (%f,%f) (%f,%f)>' % (self.x0, self.y0, self.x1, self.y1)
 38 | 
 39 |     @staticmethod
 40 |     def from_item(item: LTComponent) -> Box:
 41 |         """Construct a Box from the bounding box of a given PDF component."""
 42 |         return Box(item.x0, item.y0, item.x1, item.y1)
 43 | 
 44 |     @staticmethod
 45 |     def from_coords(coords: BoxCoords) -> Box:
 46 |         """Construct a Box from the given PDF coordinates."""
 47 |         (x0, y0, x1, y1) = coords
 48 |         return Box(x0, y0, x1, y1)
 49 | 
 50 |     def get_coords(self) -> BoxCoords:
 51 |         """Return the PDF coordinates of this box."""
 52 |         return (self.x0, self.y0, self.x1, self.y1)
 53 | 
 54 |     def get_width(self) -> float:
 55 |         """Return the width of the box."""
 56 |         return self.x1 - self.x0
 57 | 
 58 |     def get_height(self) -> float:
 59 |         """Return the height of the box."""
 60 |         return self.y1 - self.y0
 61 | 
 62 |     def get_area(self) -> float:
 63 |         """Return the area of the box."""
 64 |         return self.get_height() * self.get_width()
 65 | 
 66 |     def get_overlap(self, other: Box) -> float:
 67 |         """Compute the overlapping area (if any) with the provided box."""
 68 |         x_overlap = max(0, min(other.x1, self.x1) - max(other.x0, self.x0))
 69 |         y_overlap = max(0, min(other.y1, self.y1) - max(other.y0, self.y0))
 70 |         return x_overlap * y_overlap
 71 | 
 72 |     def hit_item(self, item: LTComponent) -> bool:
 73 |         """Does most of the area of the PDF component overlap this box?"""
 74 |         item_area = float(item.width) * float(item.height)
 75 |         overlap_area = self.get_overlap(Box.from_item(item))
 76 | 
 77 |         if overlap_area != 0:
 78 |             logger.debug(
 79 |                 "Box hit: '%s' %f-%f,%f-%f in %f-%f,%f-%f %2.0f%%",
 80 |                 item.get_text() if isinstance(item, LTText) else '',
 81 |                 item.x0, item.x1, item.y0, item.y1,
 82 |                 self.x0, self.x1, self.y0, self.y1,
 83 |                 100 * overlap_area / item_area)
 84 | 
 85 |         assert overlap_area <= item_area
 86 |         return (item_area != 0) and overlap_area >= (0.5 * item_area)
 87 | 
 88 |     def closest_point(self, point: Point) -> Point:
 89 |         """Compute the closest point in this box to the specified point."""
 90 |         px, py = point
 91 |         return (min(max(self.x0, px), self.x1),
 92 |                 min(max(self.y0, py), self.y1))
 93 | 
 94 |     def square_of_distance_to_closest_point(self, point: Point) -> float:
 95 |         """
 96 |         Compute the distance from the closest point in this box to the specified point, squared.
 97 | 
 98 |         (We avoid calling sqrt for performance reasons, since we just need to compare.)
 99 |         """
100 |         x, y = self.closest_point(point)
101 |         px, py = point
102 |         return abs(px - x)**2 + abs(py - y)**2
103 | 
104 | 
@functools.total_ordering
class Page:
    """
    A single page of the PDF.

    Each page carries a zero-based page number and a PDF object ID, plus the
    list of Annotation objects found on it and the Outline objects that link to
    somewhere on it. Pages compare (equality and ordering) by page number.
    """

    annots: typ.List[Annotation]
    outlines: typ.List[Outline]

    def __init__(
        self,
        pageno: int,
        objid: object,
        label: typ.Optional[str],
        mediabox: BoxCoords,
        fixed_columns: typ.Optional[int] = None
    ):
        assert pageno >= 0
        assert fixed_columns is None or fixed_columns > 0

        # Identity of the page
        self.pageno = pageno
        self.objid = objid
        self.label = label

        # Layout information
        self.mediabox = Box.from_coords(mediabox)
        self.fixed_columns = fixed_columns

        # Filled in later by whoever populates the page
        self.annots = []
        self.outlines = []

    def __repr__(self) -> str:
        # Shows the zero-based page index
        return '<Page #%d>' % self.pageno

    def __str__(self) -> str:
        return self.format_name()

    def format_name(self, use_label: bool = True, page_number_offset: int = 0) -> str:
        """
        Return a printable name for the page.

        The page label is used when present and use_label is true; otherwise the
        one-based page number, shifted by page_number_offset, is used.
        """
        if use_label and self.label:
            return 'page %s' % self.label

        # Convert the zero-based index to a one-based number for display
        display_number = self.pageno + 1 + page_number_offset
        return 'page #%d' % display_number

    def __eq__(self, other: object) -> bool:
        if isinstance(other, Page):
            return self.pageno == other.pageno
        return NotImplemented

    def __lt__(self, other: object) -> bool:
        if isinstance(other, Page):
            return self.pageno < other.pageno
        return NotImplemented
159 | 
160 | 
@functools.total_ordering
class Pos:
    """
    An (x, y) location on a specific page.

    Positions are ordered. Across pages the page order decides. Within a page,
    either the page's fixed column layout is used, or the sequence number
    adopted from a layout component via update_pageseq() (which reflects the
    reading order inferred by pdfminer's text layout detection).
    """

    def __init__(self, page: Page, x: float, y: float):
        self.page = page
        self.x = x
        self.y = y
        # Sequence number of the nearest component (0 = not yet assigned), and
        # the squared distance to that component.
        self._pageseq = 0
        self._pageseq_distance = 0.0

    def __str__(self) -> str:
        return '%s (%.3f,%.3f)' % (self.page, self.x, self.y)

    def __repr__(self) -> str:
        fields = (self.page.pageno, self.x, self.y, self._pageseq)
        return '<Pos pg#%d (%.3f,%.3f) #%d>' % fields

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Pos):
            return NotImplemented
        return (self.page == other.page
                and self.x == other.x
                and self.y == other.y)

    def __lt__(self, other: object) -> bool:
        if not isinstance(other, Pos):
            return NotImplemented

        if not (self.page == other.page):
            # Different pages: page order decides
            return self.page < other.page

        assert self.page is other.page
        page = self.page

        if page.fixed_columns:
            # Fixed layout: assume left-to-right top-to-bottom documents
            bounds = page.mediabox
            (sx, sy) = bounds.closest_point((self.x, self.y))
            (ox, oy) = bounds.closest_point((other.x, other.y))
            colwidth = bounds.get_width() / page.fixed_columns
            self_col = (sx - bounds.x0) // colwidth
            other_col = (ox - bounds.x0) // colwidth
            if self_col != other_col:
                return self_col < other_col
            # Same column: the higher point comes first
            return sy > oy

        # Default layout inferred from pdfminer traversal
        assert self._pageseq != 0
        assert other._pageseq != 0
        if self._pageseq != other._pageseq:
            return self._pageseq < other._pageseq

        # Both positions are on or closest to the same line of text.
        # XXX: assume top-to-bottom left-to-right order
        if self.y == other.y:
            return self.x < other.x
        return self.y > other.y

    def item_hit(self, item: LTComponent) -> bool:
        """Return True if this position lies inside (or on the edge of) the component's box."""
        return (item.x0 <= self.x <= item.x1
                and item.y0 <= self.y <= item.y1)

    def update_pageseq(self, component: LTComponent, pageseq: int) -> bool:
        """
        Adopt the component's sequence number if it is the best candidate so far.

        A component containing this position always wins; otherwise it wins only
        when no sequence number is assigned yet, or when it is strictly closer
        than the currently recorded one. Returns True if the number was adopted.
        """
        assert pageseq > 0

        if self.item_hit(component):
            # This pos is inside the component area
            self._pageseq = pageseq
            self._pageseq_distance = 0
            return True

        here = (self.x, self.y)
        distance = Box.from_item(component).square_of_distance_to_closest_point(here)
        is_better = self._pageseq == 0 or self._pageseq_distance > distance
        if not is_better:
            return False

        self._pageseq = pageseq
        self._pageseq_distance = distance
        return True

    def discard_pageseq(self, pageseq: int) -> None:
        """Reset to the unassigned state if the given sequence number is the one we hold."""
        if self._pageseq != pageseq:
            return
        self._pageseq = 0
        self._pageseq_distance = 0.0
246 | 
247 | 
@functools.total_ordering
class ObjectWithPos:
    """
    Base for objects located at a logical position on the page.

    The position may be absent at construction time and filled in later.
    Ordering compares the positions, which must both be set by then.
    """

    def __init__(self, pos: typ.Optional[Pos] = None):
        self.pos = pos

    def __lt__(self, other: object) -> bool:
        if not isinstance(other, ObjectWithPos):
            return NotImplemented
        assert self.pos is not None
        assert other.pos is not None
        return self.pos < other.pos

    def update_pageseq(self, component: LTComponent, pageseq: int) -> bool:
        """Forward to Pos.update_pageseq; returns False when no position is set."""
        if self.pos is None:
            return False
        return self.pos.update_pageseq(component, pageseq)

    def discard_pageseq(self, pageseq: int) -> None:
        """Forward to Pos.discard_pageseq; a no-op when no position is set."""
        if self.pos is None:
            return
        self.pos.discard_pageseq(pageseq)
270 | 
271 | 
class AnnotationType(enum.Enum):
    """A supported PDF annotation type. Enumerant names match the Subtype names of the PDF spec."""

    # Member values come from enum.auto(), so they follow declaration order;
    # only the names are meaningful.

    # A "sticky note" comment annotation.
    Text = enum.auto()

    # Markup annotations that apply to one or more regions on the page.
    Highlight = enum.auto()
    Squiggly = enum.auto()
    StrikeOut = enum.auto()
    Underline = enum.auto()

    # A caret mark. Together with StrikeOut, this is one of the two types for
    # which Annotation.wants_context() asks for surrounding text.
    Caret = enum.auto()

    # A single rectangle, that is abused by some Apple tools to render custom
    # highlights. We do not attempt to capture the affected text.
    Square = enum.auto()

    # Free-form text written somewhere on the page.
    FreeText = enum.auto()
293 | 
class Annotation(ObjectWithPos):
    """
    A PDF annotation, and its extracted text.

    Attributes:
        author          Author of the annotation
        boxes           Bounding boxes of the annotated regions (one per quadpoint group)
        color           RGB color of the annotation
        contents        Contents of the annotation in the PDF (e.g. comment/description)
        created         Timestamp the annotation was created
        group_children  Annotations grouped together with this one
        in_reply_to     Reference to another annotation on the page that this is "in reply to"
        is_group_child  Is this annotation a member of a parent group?
        last_charseq    Sequence number of the most recent character in text
        name            If present, uniquely identifies this annotation among others on the page
        replies         Annotations replying to this one (reverse of in_reply_to)
        subtype         PDF annotation type
        text            Text in the order captured (use gettext() for a cleaner form)

    Attributes updated for StrikeOut and Caret annotations:
        pre_context  Text captured just prior to the beginning of 'text'
        post_context Text captured just after the end of 'text'
    """

    boxes: typ.List[Box]
    contents: typ.Optional[str]
    group_children: typ.List[Annotation]
    in_reply_to: typ.Optional[Annotation]
    pre_context: typ.Optional[str]
    post_context: typ.Optional[str]
    replies: typ.List[Annotation]
    text: typ.List[str]

    def __init__(
            self,
            page: Page,
            subtype: AnnotationType,
            *,
            author: typ.Optional[str] = None,
            created: typ.Optional[datetime.datetime] = None,
            color: typ.Optional[RGB] = None,
            contents: typ.Optional[str] = None,
            in_reply_to_ref: typ.Optional[PDFObjRef] = None,
            is_group_child: bool = False,
            name: typ.Optional[str] = None,
            quadpoints: typ.Optional[typ.Sequence[float]] = None,
            rect: typ.Optional[BoxCoords] = None):
        """
        Create an annotation on the given page.

        quadpoints, if given, is a flat sequence of coordinates whose length is
        a multiple of 8: each run of eight values is four (x, y) corners, and
        becomes one bounding box. At least one of rect and quadpoints must
        yield a usable area, since it determines the annotation's position.
        A group child must also carry an in_reply_to_ref.
        """

        # Construct boxes from quadpoints. Step through the sequence by index
        # rather than comparing against [], so that any sequence type (e.g. a
        # tuple) is handled and not just a list.
        boxes = []
        if quadpoints is not None:
            assert len(quadpoints) % 8 == 0
            for start in range(0, len(quadpoints), 8):
                (x0, y0, x1, y1, x2, y2, x3, y3) = quadpoints[start:start + 8]
                xvals = [x0, x1, x2, x3]
                yvals = [y0, y1, y2, y3]
                box = Box(min(xvals), min(yvals), max(xvals), max(yvals))
                boxes.append(box)

        # Kludge for Caret annotations that lack quadpoints, but need to capture context
        if quadpoints is None and subtype == AnnotationType.Caret:
            assert rect is not None
            boxes.append(Box.from_coords(rect))

        # Compute a meaningful position of this annotation on the page
        assert rect or boxes
        (x0, y0, x1, y1) = rect if rect else boxes[0].get_coords()
        # XXX: assume left-to-right top-to-bottom text
        pos = Pos(page, min(x0, x1), max(y0, y1))
        super().__init__(pos)

        # Initialise the attributes
        self.author = author
        self.boxes = boxes
        self.color = color
        self.contents = contents if contents else None  # normalise '' to None
        self.created = created
        self.group_children = []
        self.name = name
        self.last_charseq = 0
        self.post_context = None
        self.pre_context = None
        self.replies = []
        self.subtype = subtype
        self.text = []

        # The in_reply_to reference will be resolved in postprocess()
        self.in_reply_to = None
        self._in_reply_to_ref = in_reply_to_ref
        self.is_group_child = is_group_child
        if is_group_child:
            assert in_reply_to_ref

    def __repr__(self) -> str:
        return ('<Annotation %s %r%s%s>' %
                (self.subtype.name, self.pos,
                 " '%s'" % self.contents[:10] if self.contents else '',
                 " '%s'" % ''.join(self.text[:10]) if self.text else ''))

    def capture(self, text: str, charseq: int = 0) -> None:
        """
        Capture text (while rendering the PDF page).

        A non-zero charseq must be greater than any seen before, and is
        recorded as last_charseq.
        """
        self.text.append(text)
        if charseq:
            assert charseq > self.last_charseq
            self.last_charseq = charseq

    def gettext(self, remove_hyphens: bool = False) -> typ.Optional[str]:
        """
        Retrieve cleaned-up text, after rendering.

        Returns None for an annotation without boxes, and an empty string (with
        a logged warning) if there are boxes but nothing was captured for them.
        """
        if self.boxes:
            if self.text:
                captured = ''.join(self.text)
                # Surrounding whitespace is kept when context was captured
                return merge_lines(captured, remove_hyphens, strip_space=(not self.has_context()))
            else:
                # something's strange -- we have boxes but no text for them
                logger.warning('Missing text for %s annotation at %s', self.subtype.name, self.pos)
                return ""
        else:
            return None

    def get_child_by_type(self, child_type: AnnotationType) -> typ.Optional[Annotation]:
        """Return the first group child of the given type, or None if there is none."""
        for c in self.group_children:
            if c.subtype == child_type:
                return c
        return None

    def wants_context(self) -> bool:
        """Returns true if this annotation type should include context."""
        return self.subtype in {AnnotationType.Caret, AnnotationType.StrikeOut}

    def set_pre_context(self, pre_context: str) -> None:
        """Record the text preceding the annotation; may be called only once."""
        assert self.pre_context is None
        self.pre_context = pre_context

    def set_post_context(self, post_context: str) -> None:
        """
        Record the text following the annotation; may be called only once.

        Whitespace at the end of the captured text is moved to the start of the
        post-context, after dropping one trailing newline entry if present.
        """
        assert self.post_context is None

        # If the text ends in a (broadcast) newline, discard it lest it mess up the context below.
        if self.text and self.text[-1] == '\n':
            self.text.pop()

        # If the captured text ends in any (other) space, move it to the context.
        whitespace = []
        while self.text and self.text[-1].isspace():
            whitespace.append(self.text.pop())
        if whitespace:
            post_context = ''.join(whitespace) + post_context

        self.post_context = post_context

    def has_context(self) -> bool:
        """Returns true if this annotation captured context."""
        return self.pre_context is not None or self.post_context is not None

    def get_context(self, remove_hyphens: bool = False) -> typ.Tuple[str, str]:
        """Returns context captured for this annotation, as a tuple (pre, post)."""
        return (merge_lines(self.pre_context or '', remove_hyphens, strip_space=False),
                merge_lines(self.post_context or '', remove_hyphens, strip_space=False))

    def postprocess(self, annots_by_objid: typ.Dict[int, Annotation]) -> None:
        """
        Update internal state once all text and context has been captured.

        annots_by_objid maps PDF object IDs to the annotations on the page; it
        is used to resolve the in-reply-to reference given at construction.
        """
        # Resolve the in_reply_to object reference to its annotation
        if self._in_reply_to_ref is not None:
            assert self.in_reply_to is None  # This should be called once only
            a = annots_by_objid.get(self._in_reply_to_ref.objid)
            if a is None:
                logger.warning("IRT reference (%d) not found in page annotations",
                               self._in_reply_to_ref.objid)
            elif self.is_group_child:
                # Group members attach to their parent rather than becoming replies
                a.group_children.append(self)
            else:
                self.in_reply_to = a
                a.replies.append(self)

        # The Skim PDF reader (https://skim-app.sourceforge.io/) creates annotations whose
        # default initial contents are a copy of the selected text. Unless the user goes to
        # the trouble of editing each annotation, this goes badly for us because we have
        # duplicate text and contents (e.g., for simple highlights and strikeout).
        if self.contents and (text := self.gettext()) and text.strip() == self.contents.strip():
            self.contents = None
475 | 
# Type alias used for Outline.pageref; Outline.resolve() checks a PDFObjRef
# against page.objid and an int against page.pageno.
UnresolvedPage = typ.Union[int, PDFObjRef]
"""A reference to a page that is *either* a page number, or a PDF object ID."""
478 | 
479 | 
class Outline(ObjectWithPos):
    """
    A PDF outline entry (a.k.a. bookmark).

    An outline has a title and points at a location in the document: a page plus
    optional X/Y coordinates. The page starts out as an unresolved reference
    (either a page number or a PDF object ID); resolve() later binds it to a
    Page object and sets the pos attribute.
    """

    def __init__(
        self,
        title: str,
        pageref: UnresolvedPage,
        target: typ.Optional[typ.Tuple[float, float]]
    ):
        # No position yet: it is only known once the page has been resolved.
        super().__init__()
        self.title = title
        self.pageref = pageref
        self.target = target

    def __repr__(self) -> str:
        return "<Outline '%s' %r>" % (self.title, self.pos)

    def resolve(self, page: Page) -> None:
        """Bind this outline to the given page and compute its position (once only)."""
        assert self.pos is None

        # The page handed to us must be the one our reference designates.
        ref = self.pageref
        if isinstance(ref, PDFObjRef):
            assert ref.objid == page.objid
        else:
            assert ref == page.pageno

        if self.target is not None:
            (targetx, targety) = self.target
        else:
            # XXX: "first" point on the page, assuming left-to-right top-to-bottom order
            targetx = page.mediabox.x0
            targety = page.mediabox.y1

        self.pos = Pos(page, targetx, targety)
521 | 
522 | 
class Document:
    """
    The result of extracting a whole PDF.

    Little more than a container of pages with a few lookup helpers.

    Attributes:
        pages   An ordered list of Page objects, indexed by zero-based page number.
    """

    pages: typ.List[Page]

    def __init__(self) -> None:
        self.pages = []

    def iter_annots(self, *, include_replies: bool = False) -> typ.Iterator[Annotation]:
        """
        Yield the annotations of every page, in page order.

        Group children are never yielded (only the primary annotation of a
        group is). Replies are yielded only when include_replies is True.
        """

        for page in self.pages:
            for annot in page.annots:
                if annot.is_group_child:
                    continue
                if annot.in_reply_to and not include_replies:
                    continue
                yield annot

    def nearest_outline(
        self,
        pos: Pos
    ) -> typ.Optional[Outline]:
        """Return the closest outline at or before the given position in reading order, if any."""

        probe = ObjectWithPos(pos)

        # Walk from the page containing pos back towards the first page
        for pageno in reversed(range(pos.page.pageno + 1)):
            page = self.pages[pageno]
            assert page.pageno == pageno

            # Outlines are pre-sorted, so a binary search finds how many of them
            # do not come after pos; the last of those is the nearest one.
            count = bisect.bisect_right(page.outlines, probe)
            if count:
                return page.outlines[count - 1]

        return None
568 | 
569 | 
class RGB(typ.NamedTuple):
    """A color given as three float components, each scaled by 255 when rendered as hex."""

    red: float
    green: float
    blue: float

    def ashex(self) -> str:
        """Return a 6-character string representing the 24-bit hex code for this colour."""
        # Components are truncated (not rounded) to integers before formatting.
        return ''.join(format(int(component * 255), '02x') for component in self)

    def __str__(self) -> str:
        return "RGB(" + self.ashex() + ")"
584 | 


--------------------------------------------------------------------------------
/pdfannots/utils.py:
--------------------------------------------------------------------------------
 1 | import datetime
 2 | import typing as typ
 3 | 
# Single characters that cleanup_text() replaces with a plain ASCII spelling:
# typographic ligatures, curly quotes, and the ellipsis character.
CHARACTER_SUBSTITUTIONS = {
    'ﬀ': 'ff',
    'ﬁ': 'fi',
    'ﬂ': 'fl',
    'ﬃ': 'ffi',
    'ﬄ': 'ffl',
    '‘': "'",
    '’': "'",
    '“': '"',
    '”': '"',
    '…': '...',
}
16 | 
17 | 
def cleanup_text(text: str) -> str:
    """
    Return text with CRLF and lone CR turned into LF, and with every character
    listed in CHARACTER_SUBSTITUTIONS replaced by its plain ASCII equivalent.
    """
    # CRLF must be handled before lone CR, so it yields a single newline
    normalised = text.replace('\r\n', '\n').replace('\r', '\n')
    return ''.join(CHARACTER_SUBSTITUTIONS.get(char, char) for char in normalised)
25 | 
26 | 
def merge_lines(captured_text: str, remove_hyphens: bool = False, strip_space: bool = True) -> str:
    """
    Join the lines of captured text into one cleaned-up string.

    Empty lines are dropped. A line break becomes a single space, unless the
    line already ends in whitespace, the next line starts with whitespace, or
    the line ends in a likely hyphen (a '-' right after a lowercase letter). A
    likely hyphen is removed only if remove_hyphens is set. Since runs of
    newlines collapse this way, the renderer can "broadcast" newlines to all
    active annotations regardless of box hits. (Paragraph detection is left for
    future work.)

    With strip_space the result is stripped; otherwise a space is restored at
    either end where an empty first or last line was dropped, because the
    caller needs it as context.
    """
    lines = captured_text.splitlines()
    last_index = len(lines) - 1
    pieces = []

    for index, line in enumerate(lines):
        if not line:
            continue

        following = lines[index + 1] if index < last_index else None
        likely_hyphen = len(line) >= 2 and line.endswith('-') and line[-2].islower()

        if likely_hyphen:
            # Keep or drop the hyphen, but never add a space after it
            if remove_hyphens:
                line = line[:-1]
        elif following is not None and not line[-1].isspace():
            if following == '' or not following[0].isspace():
                # Insert space to replace the line break
                line += ' '

        pieces.append(cleanup_text(line))

    merged = ''.join(pieces)
    if not merged:
        return merged

    if strip_space:
        return merged.strip()

    # re-insert load-bearing spaces from linebreaks when needed for context
    if lines[0] == '' and not merged[0].isspace():
        merged = ' ' + merged
    if len(lines) > 1 and lines[-1] == '' and not merged[-1].isspace():
        merged += ' '

    return merged
74 | 
75 | 
def decode_datetime(dts: str) -> typ.Optional[datetime.datetime]:
    """
    Parse a PDF date string, returning None if it cannot be understood.

    An optional 'D:' prefix and any apostrophes are removed, and everything
    from the first 'Z' onwards is replaced by a '+0000' offset. The remainder
    is parsed as YYYYMMDDHHMMSS, first with and then without a UTC offset.
    """
    stamp = dts[2:] if dts.startswith('D:') else dts  # prefix seems 'optional but recommended'
    stamp = stamp.replace("'", '')

    z_index = stamp.find('Z')
    if z_index >= 0:  # sometimes it's Z/Z0000
        stamp = stamp[:z_index] + '+0000'

    # dates in PDFs are quite flaky and underspecified, so try the format with a
    # timezone first and fall back to the one without
    for pattern in ('%Y%m%d%H%M%S%z', '%Y%m%d%H%M%S'):
        try:
            return datetime.datetime.strptime(stamp, pattern)
        except ValueError:
            pass
    return None
91 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["hatchling"]
 3 | build-backend = "hatchling.build"
 4 | 
 5 | [project]
 6 | name = "pdfannots"
 7 | dynamic = ["version"]
 8 | requires-python = ">=3.8"
 9 | dependencies = ["pdfminer.six >= 20220319, != 20240706"]
10 | description = "Tool to extract and pretty-print PDF annotations for reviewing"
11 | readme = "README.md"
12 | license = {file = "LICENSE.txt"}
13 | authors = [
14 |     {name = "Andrew Baumann", email = "pdfannots.pypi.org@ab.id.au"},
15 | ]
16 | classifiers = [
17 |     "Intended Audience :: Science/Research",
18 |     "Topic :: Text Processing",
19 |     "License :: OSI Approved :: MIT License",
20 |     "Programming Language :: Python :: 3",
21 |     "Programming Language :: Python :: 3.8",
22 |     "Programming Language :: Python :: 3.9",
23 |     "Programming Language :: Python :: 3.10",
24 |     "Programming Language :: Python :: 3.11",
25 |     "Programming Language :: Python :: 3.12",
26 |     "Programming Language :: Python :: 3.13",
27 | ]
28 | 
29 | [project.scripts]
30 | pdfannots = "pdfannots.cli:main"
31 | 
32 | [project.urls]
33 | Homepage = "https://github.com/0xabu/pdfannots"
34 | 
35 | [tool.hatch.version]
36 | path = "pdfannots/__init__.py"
37 | 
38 | [tool.mypy]
39 | # strict mode
40 | warn_unused_configs = true
41 | disallow_any_generics = true
42 | disallow_subclassing_any = true
43 | disallow_untyped_calls = true
44 | disallow_untyped_defs = true
45 | disallow_incomplete_defs = true
46 | check_untyped_defs = true
47 | disallow_untyped_decorators = true
48 | no_implicit_optional = true
49 | warn_redundant_casts = true
50 | warn_unused_ignores = true
51 | warn_return_any = true
52 | no_implicit_reexport = true
53 | strict_equality = true
54 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # pip requirements for pdfannots
2 | # Use as: pip3 install -r requirements.txt
3 | 
4 | pdfminer.six == 20231228
5 | 


--------------------------------------------------------------------------------
/tests.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | import functools
  4 | import json
  5 | import operator
  6 | import pathlib
  7 | import re
  8 | import typing as typ
  9 | import unittest
 10 | from datetime import datetime, timedelta, timezone
 11 | 
 12 | import pdfminer.layout
 13 | 
 14 | import pdfannots
 15 | import pdfannots.utils
 16 | from pdfannots.types import AnnotationType
 17 | from pdfannots.printer.markdown import MarkdownPrinter, GroupedMarkdownPrinter
 18 | from pdfannots.printer.json import JsonPrinter
 19 | 
 20 | 
 21 | class UnitTests(unittest.TestCase):
 22 |     def test_decode_datetime(self) -> None:
 23 |         datas = [
 24 |             ("D:123456", None),  # defensive on bad datetimes
 25 |             ("D:20190119212926-08'00'",
 26 |              datetime(2019, 1, 19, 21, 29, 26, tzinfo=timezone(-timedelta(hours=8)))),
 27 |             ("20200102030405Z0000",
 28 |              datetime(2020, 1, 2, 3, 4, 5, tzinfo=timezone.utc)),
 29 |             ("D:20101112191817", datetime(2010, 11, 12, 19, 18, 17)),
 30 |         ]
 31 |         for dts, expected in datas:
 32 |             dt = pdfannots.utils.decode_datetime(dts)
 33 |             self.assertEqual(dt, expected)
 34 | 
 35 | 
 36 | class ExtractionTestBase(unittest.TestCase):
 37 |     filename: str
 38 | 
 39 |     # Permit a test to customise the columns_per_page or LAParams
 40 |     columns_per_page: typ.Optional[int] = None
 41 |     laparams = pdfminer.layout.LAParams()
 42 | 
 43 |     def setUp(self) -> None:
 44 |         path = pathlib.Path(__file__).parent / 'tests' / self.filename
 45 |         with path.open('rb') as f:
 46 |             self.doc = pdfannots.process_file(f, columns_per_page=self.columns_per_page,
 47 |                                               laparams=self.laparams)
 48 |             self.annots = [a for p in self.doc.pages for a in p.annots]
 49 |             self.outlines = [o for p in self.doc.pages for o in p.outlines]
 50 | 
 51 |     def assertEndsWith(self, bigstr: str, suffix: str) -> None:
 52 |         self.assertEqual(bigstr[-len(suffix):], suffix)
 53 | 
 54 |     def assertStartsWith(self, bigstr: str, prefix: str) -> None:
 55 |         self.assertEqual(bigstr[:len(prefix)], prefix)
 56 | 
 57 | 
 58 | class ExtractionTests(ExtractionTestBase):
 59 |     filename = 'hotos17.pdf'
 60 |     columns_per_page = 2  # for test_nearest_outline
 61 | 
 62 |     def test_annots(self) -> None:
 63 |         EXPECTED = [
 64 |             (0, AnnotationType.Squiggly, None, 'recent Intel CPUs have introduced'),
 65 |             (0, AnnotationType.Text, 'This is a note with no text attached.', None),
 66 |             (0, AnnotationType.StrikeOut, None, 'e'),
 67 |             (1, AnnotationType.Highlight, None,
 68 |              'TSX launched with "Haswell" in 2013 but was later disabled due to a bug. '
 69 |              '"Broadwell" CPUs with the bug fix shipped in late 2014.'),
 70 |             (1, AnnotationType.Highlight, 'This is lower in column 1',
 71 |              'user-mode access to FS/GS registers, and TLB tags for non-VM address spaces'),
 72 |             (1, AnnotationType.Highlight, None,
 73 |              'segmentation, task switching, and 16-bit modes.'),
 74 |             (1, AnnotationType.Highlight, 'This is at the top of column two',
 75 |              'The jump is due to extensions introduced with the "Skylake" microarchitecture'),
 76 |             (3, AnnotationType.Squiggly, 'This is a nit.',
 77 |              'Control transfer in x86 is already very complex'),
 78 |             (3, AnnotationType.Underline, 'This is a different nit',
 79 |              'Besides modifying semantics of all indirect control transfers'),
 80 |             (3, AnnotationType.StrikeOut, None,
 81 |              'While we may disagree with some of the design choices,')]
 82 | 
 83 |         self.assertEqual(len(self.annots), len(EXPECTED))
 84 |         for a, expected in zip(self.annots, EXPECTED):
 85 |             assert a.pos is not None
 86 |             self.assertEqual(
 87 |                 (a.pos.page.pageno, a.subtype, a.contents, a.gettext(remove_hyphens=True)),
 88 |                 expected)
 89 |         self.assertEqual(self.annots[0].created, datetime(
 90 |             2019, 1, 19, 21, 29, 42, tzinfo=timezone(-timedelta(hours=8))))
 91 | 
 92 |         # test for correct whitespace on the strikeout annot
 93 |         a = self.annots[2]
 94 |         self.assertTrue(a.has_context())
 95 |         (pre, post) = a.get_context()
 96 |         self.assertEndsWith(pre, 'widths, ar')
 97 |         self.assertStartsWith(post, ' counted')
 98 | 
 99 |     def test_outlines(self) -> None:
100 |         EXPECTED = [
101 |             'Introduction',
102 |             'Background: x86 extensions',
103 |             'Case study: SGX',
104 |             'Case study: CET',
105 |             'Implications',
106 |             'Concluding remarks']
107 | 
108 |         self.assertEqual(len(self.outlines), len(EXPECTED))
109 |         for o, expected in zip(self.outlines, EXPECTED):
110 |             self.assertEqual(o.title, expected)
111 | 
112 |     def test_nearest_outline(self) -> None:
113 |         # Page 1 (Introduction) Squiggly: "recent Intel CPUs have introduced"
114 |         a = self.doc.pages[0].annots[0]
115 |         assert a.pos is not None
116 |         o = self.doc.nearest_outline(a.pos)
117 |         assert o is not None
118 |         self.assertEqual(o.title, 'Introduction')
119 | 
120 |         # Page 4 (Case study: CET) Squiggly: "Control transfer in x86 is already very complex"
121 |         # Note: pdfminer gets this wrong as of 20201018; we must set columns_per_page to fix it
122 |         a = self.doc.pages[3].annots[0]
123 |         assert a.pos is not None
124 |         o = self.doc.nearest_outline(a.pos)
125 |         assert o is not None
126 |         self.assertEqual(o.title, 'Case study: CET')
127 | 
128 | 
class Issue9(ExtractionTestBase):
    filename = 'issue9.pdf'

    def test(self) -> None:
        # The fixture holds exactly one annotation, covering the word 'World'.
        self.assertEqual(len(self.annots), 1)
        only = self.annots[0]
        self.assertEqual(only.gettext(), 'World')
136 | 
137 | 
class Issue13(ExtractionTestBase):
    filename = 'issue13.pdf'

    def test(self) -> None:
        # The fixture holds exactly one annotation, covering a full sentence.
        self.assertEqual(len(self.annots), 1)
        only = self.annots[0]
        self.assertEqual(only.gettext(), 'This is a sample statement.')
145 | 
146 | 
class Issue46(ExtractionTestBase):
    filename = 'issue46.pdf'

    def test(self) -> None:
        # Two highlights with a Square annotation between them; the Square
        # yields no extracted text.
        self.assertEqual(len(self.annots), 3)

        self.assertEqual(self.annots[0].subtype, AnnotationType.Highlight)
        self.assertEqual(self.annots[0].gettext(), 'C – Curate')

        self.assertEqual(self.annots[1].subtype, AnnotationType.Square)
        # assertIsNone states the intent directly and reports the unexpected value
        self.assertIsNone(self.annots[1].gettext())

        self.assertEqual(self.annots[2].subtype, AnnotationType.Highlight)
        self.assertEqual(self.annots[2].gettext(), 'This was a novel idea at the time')
161 | 
162 | 
class Issue61(ExtractionTestBase):
    filename = 'issue61.pdf'

    def test(self) -> None:
        # A single Caret annotation carrying inserted text, and it has context.
        self.assertEqual(len(self.annots), 1)
        caret = self.annots[0]
        self.assertEqual(caret.subtype, AnnotationType.Caret)
        self.assertEqual(caret.contents, 'and machine learning')
        self.assertTrue(caret.has_context())
172 | 
173 | 
class Pr24(ExtractionTestBase):
    filename = 'pr24.pdf'

    def test(self) -> None:
        # One row per annotation: (subtype, contents, extracted text)
        rows = [
            (AnnotationType.Highlight, 'long highlight',
             'Heading Link to heading that is working with vim-pandoc. Link to heading that'),
            (AnnotationType.Highlight, 'short highlight', 'not working'),
            (AnnotationType.Text, None, None),
            (AnnotationType.Highlight, None, 'Some more text'),
            (AnnotationType.Text, 'dual\n\npara note', None),
            (AnnotationType.Text, 's', None),
        ]
        self.assertEqual(len(self.annots), len(rows))
        for annot, row in zip(self.annots, rows):
            observed = (annot.subtype, annot.contents, annot.gettext())
            self.assertEqual(observed, row)
189 | 
190 | 
class Landscape2Column(ExtractionTestBase):
    filename = 'word2column.pdf'

    def test(self) -> None:
        # Nine annotations are expected; the first eight are inspected in detail
        # below, the ninth is only counted.
        self.assertEqual(len(self.annots), 9)

        annot = self.annots[0]
        self.assertEqual(annot.subtype, AnnotationType.StrikeOut)
        self.assertEqual(annot.gettext(), 'nostrud exercitation')
        self.assertTrue(annot.has_context())
        before, after = annot.get_context()
        self.assertEndsWith(
            before,
            'Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor '
            'incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis ')
        self.assertStartsWith(
            after,
            ' ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor')

        annot = self.annots[1]
        self.assertEqual(annot.subtype, AnnotationType.StrikeOut)
        self.assertEqual(annot.gettext(), 'Duis')
        self.assertTrue(annot.has_context())
        before, after = annot.get_context()
        self.assertEndsWith(before, 'ullamco laboris nisi ut aliquip ex ea commodo consequat. ')
        self.assertStartsWith(
            after,
            ' aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu '
            'fugiat nulla pariatur.')

        annot = self.annots[2]
        self.assertEqual(annot.subtype, AnnotationType.StrikeOut)
        self.assertEqual(annot.gettext(), 'laborum')
        self.assertTrue(annot.has_context())
        before, after = annot.get_context()
        self.assertEndsWith(before, ', sunt in culpa qui officia deserunt mollit anim id est ')
        self.assertStartsWith(after, '. Heading 2 Sed ut perspiciatis,')

        annot = self.annots[3]
        self.assertEqual(annot.subtype, AnnotationType.Highlight)
        self.assertEqual(
            annot.gettext(),
            'At vero eos et accusamus et iusto odio dignissimos ducimus, qui '
            'blanditiis praesentium voluptatum deleniti atque corrupti,')
        self.assertFalse(annot.has_context())

        annot = self.annots[4]
        self.assertEqual(annot.subtype, AnnotationType.Squiggly)
        self.assertEqual(
            annot.gettext(),
            'Itaque earum rerum hic tenetur a sapiente delectus, ut aut reiciendis '
            'voluptatibus maiores alias consequatur aut perferendis doloribus asperiores repellat.')
        self.assertEqual(annot.contents, 'Nonsense!')
        self.assertFalse(annot.has_context())

        annot = self.annots[5]
        self.assertEqual(annot.subtype, AnnotationType.StrikeOut)
        self.assertEqual(annot.gettext(), 'equal')
        self.assertTrue(annot.has_context())
        before, after = annot.get_context()
        self.assertEndsWith(before, 'the pain and trouble that are bound to ensue; and ')
        self.assertStartsWith(after, ' blame belongs to those who fail in their')  # end of page

        annot = self.annots[6]
        self.assertEqual(annot.subtype, AnnotationType.StrikeOut)
        self.assertEqual(annot.gettext(), 'duty')
        self.assertTrue(annot.has_context())
        before, after = annot.get_context()
        self.assertEqual(before, '')  # start of page
        self.assertStartsWith(after, ' through weakness of will, which')

        annot = self.annots[7]
        self.assertEqual(annot.subtype, AnnotationType.StrikeOut)
        self.assertEqual(annot.gettext(), 'In a free hour,')
        self.assertTrue(annot.has_context())
        before, after = annot.get_context()
        self.assertEndsWith(before, 'These cases are perfectly simple and easy to distinguish. ')
        self.assertStartsWith(after, ' when our power of choice is untrammeled and when nothing')
264 | 
265 | 
class FreeTextAnnotation(ExtractionTestBase):
    filename = 'FreeText-annotation.pdf'

    def test(self) -> None:
        # A single FreeText annotation: it carries contents but no extracted text.
        self.assertEqual(len(self.annots), 1)
        self.assertEqual(self.annots[0].subtype, AnnotationType.FreeText)
        self.assertEqual(self.annots[0].contents, 'Annotation with subtype "FreeText".')
        # assertIsNone states the intent directly and reports the unexpected value
        self.assertIsNone(self.annots[0].gettext())
274 | 
275 | 
class CaretAnnotations(ExtractionTestBase):
    filename = 'caret.pdf'

    def test(self) -> None:
        self.assertEqual(len(self.annots), 5)

        # first annotation: a strikeout that is a group child with no children of its own
        struck = self.annots[0]
        self.assertEqual(struck.subtype, AnnotationType.StrikeOut)
        self.assertEqual(struck.gettext(), 'Adobe Acrobat Reader')
        self.assertTrue(struck.is_group_child)
        self.assertEqual(struck.group_children, [])

        # fourth annotation: the caret whose only group child is that strikeout
        caret = self.annots[3]
        self.assertEqual(caret.subtype, AnnotationType.Caret)
        self.assertEqual(caret.contents, 'Google Chrome')
        self.assertFalse(caret.is_group_child)
        self.assertEqual(caret.group_children, [struck])
        self.assertEqual(caret.get_child_by_type(AnnotationType.StrikeOut), struck)
292 | 
293 | 
class PrinterTestBase(unittest.TestCase):
    # Base fixture for the printer tests: processes `filename` from the tests/
    # directory beside this file with process_file's default arguments.
    filename = 'hotos17.pdf'

    def setUp(self) -> None:
        fixture = pathlib.Path(__file__).parent / 'tests' / self.filename
        with fixture.open('rb') as stream:
            self.doc = pdfannots.process_file(stream)
301 | 
302 | 
class MarkdownPrinterTest(PrinterTestBase):
    # There's not a whole lot of value in testing the precise output format,
    # but let's make sure we produce a non-trivial result and don't crash.

    def _output_size(
        self, printer: typ.Union[MarkdownPrinter, GroupedMarkdownPrinter]
    ) -> typ.Tuple[int, int]:
        """Print self.doc with printer; return (newline count, character count)."""
        linecount = 0
        charcount = 0
        for line in printer.print_file('dummyfile', self.doc):
            linecount += line.count('\n')
            charcount += len(line)
        return linecount, charcount

    def test_flat(self) -> None:
        p = MarkdownPrinter(print_filename=True, remove_hyphens=False)

        linecount, charcount = self._output_size(p)

        self.assertGreater(linecount, 5)
        self.assertGreater(charcount, 500)

    def test_flat_page_number_offset(self) -> None:
        p = MarkdownPrinter(page_number_offset=-1)

        # collect the single digit that follows 'Page #' in each emitted chunk
        page_numbers = []
        for line in p.print_file('dummyfile', self.doc):
            m = re.match(r'.+Page #([0-9])', line)
            if m:
                page_numbers.append(m[1])

        self.assertEqual(page_numbers, ['0', '0', '0', '1', '1', '1', '1', '3', '3', '3'])

    def test_grouped(self) -> None:
        p = GroupedMarkdownPrinter(wrap_column=80)

        linecount, charcount = self._output_size(p)

        self.assertGreater(linecount, 10)
        self.assertGreater(charcount, 900)

    def test_multicolorgrouping(self) -> None:
        p = GroupedMarkdownPrinter(group_highlights_by_color=True)

        linecount, charcount = self._output_size(p)

        self.assertGreater(linecount, 10)
        self.assertGreater(charcount, 900)
352 | 
353 | 
class JsonPrinterTest(PrinterTestBase):
    def test_flat(self) -> None:
        p = JsonPrinter(remove_hyphens=False, output_codec='utf-8')

        # The document must parse as JSON once the printer's begin() and end()
        # strings are wrapped around the concatenated per-file output.
        j = json.loads(
            p.begin()
            + functools.reduce(operator.add, p.print_file('dummyfile', self.doc))
            + p.end())

        # assertIsInstance reports the actual type on failure, unlike assertTrue
        self.assertIsInstance(j, list)
        self.assertEqual(len(j), 10)
365 | 
366 | 
# Run the whole suite when this file is executed directly.
if __name__ == "__main__":
    unittest.main()
369 | 


--------------------------------------------------------------------------------
/tests/FreeText-annotation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0xabu/pdfannots/08f6f0abdb690fb9d8b6cc3f6d244fd2d80c9af6/tests/FreeText-annotation.pdf


--------------------------------------------------------------------------------
/tests/caret.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0xabu/pdfannots/08f6f0abdb690fb9d8b6cc3f6d244fd2d80c9af6/tests/caret.pdf


--------------------------------------------------------------------------------
/tests/hotos17.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0xabu/pdfannots/08f6f0abdb690fb9d8b6cc3f6d244fd2d80c9af6/tests/hotos17.pdf


--------------------------------------------------------------------------------
/tests/issue13.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0xabu/pdfannots/08f6f0abdb690fb9d8b6cc3f6d244fd2d80c9af6/tests/issue13.pdf


--------------------------------------------------------------------------------
/tests/issue46.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0xabu/pdfannots/08f6f0abdb690fb9d8b6cc3f6d244fd2d80c9af6/tests/issue46.pdf


--------------------------------------------------------------------------------
/tests/issue61.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0xabu/pdfannots/08f6f0abdb690fb9d8b6cc3f6d244fd2d80c9af6/tests/issue61.pdf


--------------------------------------------------------------------------------
/tests/issue9.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0xabu/pdfannots/08f6f0abdb690fb9d8b6cc3f6d244fd2d80c9af6/tests/issue9.pdf


--------------------------------------------------------------------------------
/tests/pr24.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0xabu/pdfannots/08f6f0abdb690fb9d8b6cc3f6d244fd2d80c9af6/tests/pr24.pdf


--------------------------------------------------------------------------------
/tests/word2column.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0xabu/pdfannots/08f6f0abdb690fb9d8b6cc3f6d244fd2d80c9af6/tests/word2column.pdf


--------------------------------------------------------------------------------