├── .flake8 ├── .github └── workflows │ ├── codeql-analysis.yml │ ├── python-checks.yml │ └── python-publish.yml ├── .gitignore ├── LICENSE.txt ├── README.md ├── doc └── demo.png ├── pdfannots.py ├── pdfannots ├── __init__.py ├── __main__.py ├── cli.py ├── printer │ ├── __init__.py │ ├── json.py │ └── markdown.py ├── py.typed ├── types.py └── utils.py ├── pyproject.toml ├── requirements.txt ├── tests.py └── tests ├── FreeText-annotation.pdf ├── caret.pdf ├── hotos17.pdf ├── issue13.pdf ├── issue46.pdf ├── issue61.pdf ├── issue9.pdf ├── pr24.pdf └── word2column.pdf /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E741,W503 3 | max-line-length = 100 4 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | # The branches below must be a subset of the branches above 8 | branches: [ main ] 9 | schedule: 10 | - cron: '39 7 * * 3' 11 | 12 | jobs: 13 | analyze: 14 | name: Analyze 15 | runs-on: ubuntu-latest 16 | permissions: 17 | actions: read 18 | contents: read 19 | security-events: write 20 | 21 | strategy: 22 | fail-fast: false 23 | matrix: 24 | language: [ 'python' ] 25 | 26 | steps: 27 | - name: Checkout repository 28 | uses: actions/checkout@v2 29 | 30 | # Initializes the CodeQL tools for scanning. 31 | - name: Initialize CodeQL 32 | uses: github/codeql-action/init@v1 33 | with: 34 | languages: ${{ matrix.language }} 35 | # If you wish to specify custom queries, you can do so here or in a config file. 36 | # By default, queries listed here will override any specified in a config file. 37 | # Prefix the list here with "+" to use these queries and those in the config file. 
38 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 39 | 40 | - name: Perform CodeQL Analysis 41 | uses: github/codeql-action/analyze@v1 42 | -------------------------------------------------------------------------------- /.github/workflows/python-checks.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python checks 5 | 6 | on: 7 | push: 8 | branches: [ main ] 9 | pull_request: 10 | branches: [ main ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] 19 | 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v2 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | python -m pip install flake8 mypy autopep8 pytest 30 | pip install -r requirements.txt 31 | - name: Type check with mypy 32 | run: mypy . 33 | - name: Lint with flake8 34 | run: flake8 . --count --show-source --statistics 35 | - name: Check formatting with autopep8 36 | run: autopep8 --diff --recursive --exit-code . 
37 | - name: Test with pytest 38 | run: pytest tests.py 39 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will build and upload a Python Package using Twine 2 | name: Upload Python Package to PyPI 3 | 4 | on: 5 | # Trigger automatically when a release is published 6 | release: 7 | types: [published] 8 | 9 | # Also permit manual dispatch 10 | workflow_dispatch: 11 | 12 | jobs: 13 | deploy: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up Python 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: '3.x' 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install build twine hatchling 27 | - name: Extract version 28 | id: get-version 29 | run: echo "VERSION=$(python -m hatchling version)" >> "$GITHUB_OUTPUT" 30 | - name: Build package 31 | run: python -m build 32 | - name: Check package 33 | run: twine check --strict dist/* 34 | - name: Publish package, only if correctly tagged 35 | if: github.ref == format('refs/tags/v{0}', steps.get-version.outputs.VERSION) 36 | run: twine upload --non-interactive --verbose --disable-progress-bar dist/* 37 | env: 38 | TWINE_USERNAME: __token__ 39 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | dist/ 3 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation (2016-2022) and Andrew Baumann (2022-). All rights reserved. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## pdfannots 2 | 3 | [![Build status](https://github.com/0xabu/pdfannots/actions/workflows/python-checks.yml/badge.svg)](https://github.com/0xabu/pdfannots/actions/workflows/python-checks.yml) 4 | [![PyPI version](https://img.shields.io/pypi/v/pdfannots)](https://pypi.org/project/pdfannots/) 5 | 6 | This program extracts annotations (highlights, comments, etc.) from a PDF file, 7 | and formats them as Markdown or exports them to JSON. It is primarily intended 8 | for use in reviewing submissions to scientific conferences/journals. 
9 | 10 | ![Sample/demo of pdfannots extracting Markdown from an annotated PDF](doc/demo.png) 11 | 12 | For the default Markdown format, the output is as follows: 13 | 14 | * Highlights without an attached comment are output first, as 15 | "highlights" with just the highlighted text included. Note that 16 | these are not typically suitable for use in a review, since they're 17 | unlikely to have any meaning to the recipient; they are just meant 18 | to serve as a reminder to the reviewer. 19 | 20 | * Highlights with an attached comment, and text annotations (not 21 | attached to any particular text/highlight) are output next, as 22 | "detailed comments". Typically most comments on a reviewed paper 23 | are of this form. 24 | 25 | * Underline, strikeout, and squiggly underline annotations are output 26 | last, as "Nits", with or without an attached comment. The intention 27 | of this is to easily separate formatting or grammatical corrections 28 | from more substantial comments about the content of the document. 29 | 30 | For each annotation, the page number is given, along with the associated 31 | (highlighted/underlined) text, if any. Additionally, if the document embeds 32 | outlines (aka bookmarks), such as those generated by the LaTeX 33 | [hyperref](https://ctan.org/pkg/hyperref) package, they are printed to help 34 | identify to which section in the document the annotation refers. 35 | 36 | 37 | ### Installation 38 | 39 | To install the latest released version from PyPI, use a command such as: 40 | ``` 41 | python3 -m pip install pdfannots 42 | ``` 43 | 44 | 45 | ### Usage 46 | 47 | See `pdfannots --help` (in a source tree: `pdfannots.py --help`) for 48 | options and invocation. 
49 | 50 | 51 | ### Dependencies 52 | 53 | * Python >= 3.8 54 | * [pdfminer.six](https://github.com/pdfminer/pdfminer.six) 55 | 56 | 57 | ### Known issues and limitations 58 | 59 | * While it is generally reliable, pdfminer (the underlying PDF parser) is 60 | not infallible at extracting text from a PDF. It has been known to fail 61 | in several different ways: 62 | 63 | * Sometimes it misses or misplaces individual characters, resulting in 64 | annotations with some or all of the text missing (in the latter case, 65 | you'll see a warning). 66 | 67 | * Sometimes the characters are captured, but not spaces between the words. 68 | Tweaking the advanced layout analysis parameters (e.g., `--word-margin`) 69 | may help with this. 70 | 71 | * Sometimes it extracts all the text but renders it out of order, for 72 | example, reporting that text at the top of a second column comes before 73 | text at the end of the first column. This causes pdfannots to return the 74 | annotations out of order, or to report the wrong outlines (section 75 | headings) for annotations. You can mostly work around this issue by using 76 | the `--cols` parameter to force a fixed page layout for the document 77 | (e.g. `--cols=2` for a typical 2-column document). 78 | 79 | * If an annotation (such as a StrikeOut) covers solely whitespace, no text is 80 | extracted for the annotation, and it will be skipped (with a warning). This 81 | is an artifact of the way pdfminer reports whitespace with only an implicit 82 | position defined by surrounding characters. 83 | 84 | * When extracting text, we remove all hyphens that immediately precede a line 85 | break and join the adjacent words. This usually produces the best results 86 | with LaTeX multi-column documents (e.g. "soft-`\n`ware" becomes "software"), 87 | but sometimes the hyphen needs to stay (e.g. "memory-`\n`mapped", which will be 88 | extracted as "memorymapped"), and we can't tell the difference. 
To disable 89 | this behaviour, pass `--keep-hyphens`. 90 | 91 | 92 | ### FAQ 93 | 94 | 1. I'd like to change how the output is formatted. 95 | 96 | Some minor tweaks (e.g., word wrap, skipping or reordering output sections) 97 | can be accomplished via command-line arguments. 98 | 99 | All of the output comes from the relevant `Printer` subclass; more elaborate 100 | changes can be accomplished there. Pull requests to introduce new output 101 | formats or variants as printers are welcomed. 102 | 103 | 2. I think I got a review generated by this tool... 104 | 105 | I hope that it was a constructive review, and that the annotations 106 | helped the reviewer give you more detailed feedback so you can improve 107 | your paper. This is, after all, just a tool, and it should not be an 108 | excuse for reviewer sloppiness. 109 | -------------------------------------------------------------------------------- /doc/demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xabu/pdfannots/08f6f0abdb690fb9d8b6cc3f6d244fd2d80c9af6/doc/demo.png -------------------------------------------------------------------------------- /pdfannots.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # This script, which is not part of the pdfannots package, allows pdfannots 4 | # to be run directly from a source tree clone. 5 | 6 | if __name__ == '__main__': 7 | import pdfannots.cli 8 | pdfannots.cli.main() 9 | -------------------------------------------------------------------------------- /pdfannots/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tool to extract and pretty-print PDF annotations for reviewing. 
3 | """ 4 | 5 | __version__ = '0.5' 6 | 7 | import bisect 8 | import collections 9 | import itertools 10 | import logging 11 | import typing as typ 12 | 13 | from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter 14 | from pdfminer.pdfpage import PDFPage 15 | from pdfminer.layout import (LAParams, LTAnno, LTChar, LTComponent, LTContainer, LTFigure, LTItem, 16 | LTPage, LTTextBox, LTTextLine) 17 | from pdfminer.converter import PDFLayoutAnalyzer 18 | from pdfminer.pdfparser import PDFParser 19 | from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines 20 | from pdfminer.psparser import PSLiteralTable, PSLiteral 21 | from pdfminer import pdftypes 22 | import pdfminer.settings 23 | import pdfminer.utils 24 | 25 | from .types import Page, Outline, AnnotationType, Annotation, Document, RGB 26 | from .utils import cleanup_text, decode_datetime 27 | 28 | pdfminer.settings.STRICT = False 29 | 30 | logger = logging.getLogger('pdfannots') 31 | 32 | ANNOT_SUBTYPES: typ.Dict[PSLiteral, AnnotationType] = { 33 | PSLiteralTable.intern(e.name): e for e in AnnotationType} 34 | """Mapping from PSliteral to our own enumerant, for supported annotation types.""" 35 | 36 | IGNORED_ANNOT_SUBTYPES = \ 37 | frozenset(PSLiteralTable.intern(n) for n in ( 38 | 'Link', # Links are used for internal document links (e.g. to other pages). 39 | 'Popup', # Controls the on-screen appearance of other annotations. TODO: we may want to 40 | # check for an optional 'Contents' field for alternative human-readable contents. 41 | )) 42 | """Annotation types that we ignore without issuing a warning.""" 43 | 44 | 45 | def _mkannotation( 46 | pa: typ.Dict[str, typ.Any], 47 | page: Page 48 | ) -> typ.Optional[Annotation]: 49 | """ 50 | Given a PDF annotation, capture relevant fields and construct an Annotation object. 51 | 52 | Refer to Section 8.4 of the PDF reference (version 1.7). 
53 | """ 54 | 55 | subtype = pa.get('Subtype') 56 | annot_type = None 57 | assert isinstance(subtype, PSLiteral) 58 | try: 59 | annot_type = ANNOT_SUBTYPES[subtype] 60 | except KeyError: 61 | pass 62 | 63 | if annot_type is None: 64 | if subtype not in IGNORED_ANNOT_SUBTYPES: 65 | logger.warning("Unsupported %s annotation ignored on %s", subtype.name, page) 66 | return None 67 | 68 | contents = pa.get('Contents') 69 | if contents is not None: 70 | # decode as string, normalise line endings, replace special characters 71 | contents = cleanup_text(pdfminer.utils.decode_text(contents)) 72 | 73 | rgb: typ.Optional[RGB] = None 74 | color = pdftypes.resolve1(pa.get('C')) 75 | if color: 76 | if (isinstance(color, list) 77 | and len(color) == 3 78 | and all(isinstance(e, (int, float)) and 0 <= e <= 1 for e in color)): 79 | rgb = RGB(*color) 80 | else: 81 | logger.warning("Invalid color %s in annotation on %s", color, page) 82 | 83 | # Rect defines the location of the annotation on the page 84 | rect = pdftypes.resolve1(pa.get('Rect')) 85 | 86 | # QuadPoints are defined only for "markup" annotations (Highlight, Underline, StrikeOut, 87 | # Squiggly, Caret), where they specify the quadrilaterals (boxes) covered by the annotation. 88 | quadpoints = pdftypes.resolve1(pa.get('QuadPoints')) 89 | 90 | author = pdftypes.resolve1(pa.get('T')) 91 | if author is not None: 92 | author = pdfminer.utils.decode_text(author) 93 | 94 | name = pdftypes.resolve1(pa.get('NM')) 95 | if name is not None: 96 | name = pdfminer.utils.decode_text(name) 97 | 98 | created = None 99 | dobj = pa.get('CreationDate') 100 | # some pdf apps set modification date, but not creation date 101 | dobj = dobj or pa.get('ModDate') 102 | # poppler-based apps (e.g. 
Okular) use 'M' for some reason 103 | dobj = dobj or pa.get('M') 104 | createds = pdftypes.resolve1(dobj) 105 | if createds is not None: 106 | createds = pdfminer.utils.decode_text(createds) 107 | created = decode_datetime(createds) 108 | 109 | in_reply_to = pa.get('IRT') 110 | is_group = False 111 | if in_reply_to is not None: 112 | reply_type = pa.get('RT') 113 | if reply_type is PSLiteralTable.intern('Group'): 114 | is_group = True 115 | elif not (reply_type is None or reply_type is PSLiteralTable.intern('R')): 116 | logger.warning("Unexpected RT=%s, treated as R", reply_type) 117 | 118 | return Annotation(page, annot_type, quadpoints=quadpoints, rect=rect, name=name, 119 | contents=contents, author=author, created=created, color=rgb, 120 | in_reply_to_ref=in_reply_to, is_group_child=is_group) 121 | 122 | 123 | def _get_outlines(doc: PDFDocument) -> typ.Iterator[Outline]: 124 | """Retrieve a list of (unresolved) Outline objects for all recognised outlines in the PDF.""" 125 | 126 | def _resolve_dest(dest: typ.Any) -> typ.Any: 127 | if isinstance(dest, pdftypes.PDFObjRef): 128 | dest = pdftypes.resolve1(dest) 129 | if isinstance(dest, bytes): 130 | dest = pdftypes.resolve1(doc.get_dest(dest)) 131 | elif isinstance(dest, PSLiteral): 132 | dest = pdftypes.resolve1(doc.get_dest(dest.name)) 133 | if isinstance(dest, dict): 134 | dest = dest['D'] 135 | return dest 136 | 137 | for (_, title, destname, actionref, _) in doc.get_outlines(): 138 | if destname is None and actionref: 139 | action = pdftypes.resolve1(actionref) 140 | if isinstance(action, dict): 141 | subtype = action.get('S') 142 | if subtype is PSLiteralTable.intern('GoTo'): 143 | destname = action.get('D') 144 | if destname is None: 145 | continue 146 | dest = _resolve_dest(destname) 147 | 148 | # consider targets of the form [page /XYZ left top zoom] 149 | if dest[1] is PSLiteralTable.intern('XYZ'): 150 | (pageref, _, targetx, targety) = dest[:4] 151 | 152 | if not isinstance(pageref, (int, 
pdftypes.PDFObjRef)): 153 | logger.warning("Unsupported pageref in outline: %s", pageref) 154 | else: 155 | if targetx is None or targety is None: 156 | # Treat as a general reference to the page 157 | target = None 158 | else: 159 | target = (targetx, targety) 160 | if not all(isinstance(v, (int, float)) for v in target): 161 | logger.warning("Unsupported target in outline: (%r, %r)", targetx, targety) 162 | target = None 163 | 164 | yield Outline(title, pageref, target) 165 | 166 | 167 | class _PDFProcessor(PDFLayoutAnalyzer): 168 | """ 169 | PDF processor class. 170 | 171 | This class encapsulates our primary interface with pdfminer's page layout logic. It is used 172 | to define a logical order for the objects we care about (Annotations and Outlines) on a page, 173 | and to capture the text that annotations may refer to. 174 | """ 175 | 176 | CONTEXT_CHARS = 256 177 | """Maximum number of recent characters to keep as context.""" 178 | 179 | page: typ.Optional[Page] # Page being processed. 180 | charseq: int # Character sequence number within the page. 181 | compseq: int # Component sequence number within the page. 182 | recent_text: typ.Deque[str] # Rotating buffer of recent text, for context. 183 | _lasthit: typ.FrozenSet[Annotation] # Annotations hit by the most recent character. 184 | _curline: typ.Set[Annotation] # Annotations hit somewhere on the current line. 185 | 186 | # Stores annotations that are subscribed to receive their post-annotation 187 | # context. The first element of each tuple, on which the list is sorted, is 188 | # the sequence number of the last character to hit the annotation. 
189 | context_subscribers: typ.List[typ.Tuple[int, Annotation]] 190 | 191 | def __init__(self, rsrcmgr: PDFResourceManager, laparams: LAParams): 192 | super().__init__(rsrcmgr, laparams=laparams) 193 | self.page = None 194 | self.recent_text = collections.deque(maxlen=self.CONTEXT_CHARS) 195 | self.context_subscribers = [] 196 | self.clear() 197 | 198 | def clear(self) -> None: 199 | """Reset our internal per-page state.""" 200 | self.charseq = 0 201 | self.compseq = 0 202 | self.recent_text.clear() 203 | self.context_subscribers.clear() 204 | self._lasthit = frozenset() 205 | self._curline = set() 206 | 207 | def set_page(self, page: Page) -> None: 208 | """Prepare to process a new page. Must be called prior to processing.""" 209 | assert self.page is None 210 | self.page = page 211 | 212 | def receive_layout(self, ltpage: LTPage) -> None: 213 | """Callback from PDFLayoutAnalyzer superclass. Called once with each laid-out page.""" 214 | assert self.page is not None 215 | 216 | # Re-initialise our per-page state 217 | self.clear() 218 | 219 | # Render all the items on the page 220 | self.render(ltpage) 221 | 222 | # If we still have annotations needing context, give them whatever we have 223 | for (charseq, annot) in self.context_subscribers: 224 | available = self.charseq - charseq 225 | annot.post_context = ''.join(self.recent_text[n] for n in range(-available, 0)) 226 | 227 | self.page = None 228 | 229 | def update_pageseq(self, component: LTComponent) -> bool: 230 | """Assign sequence numbers for objects on the page based on the nearest line of text. 231 | Returns True if we need to recurse on smaller sub-components (e.g. 
characters).""" 232 | assert self.page is not None 233 | self.compseq += 1 234 | 235 | hits = 0 236 | for x in itertools.chain(self.page.annots, self.page.outlines): 237 | if x.update_pageseq(component, self.compseq): 238 | hits += 1 239 | 240 | # If we have assigned the same sequence number to multiple objects, and there exist smaller 241 | # sub-components (e.g. characters within a line), we'll recurse on those assigning sequence 242 | # numbers to sub-components to disambiguate the hits, but first we must forget about the 243 | # current sequence number. 244 | # NB: This could be done more efficiently -- we really only need to disambiguate conflicts 245 | # that still exist after processing *all* the line-level components on the same page, but 246 | # that would require multiple rendering passes. 247 | if hits > 1 and isinstance(component, LTContainer) and len(component) > 1: 248 | for x in itertools.chain(self.page.annots, self.page.outlines): 249 | x.discard_pageseq(self.compseq) 250 | return True 251 | 252 | return False 253 | 254 | def test_boxes(self, item: LTComponent) -> None: 255 | """Update the set of annotations whose boxes intersect with the area of the given item.""" 256 | assert self.page is not None 257 | hits = frozenset(a for a in self.page.annots if a.boxes 258 | and any(b.hit_item(item) for b in a.boxes)) 259 | self._lasthit = hits 260 | self._curline.update(hits) 261 | 262 | def capture_context(self, text: str) -> None: 263 | """Store the character for use as context, and update subscribers if required.""" 264 | self.recent_text.append(text) 265 | self.charseq += 1 266 | 267 | # Notify subscribers for whom this character provides the full post-context. 
268 | while self.context_subscribers: 269 | (charseq, annot) = self.context_subscribers[0] 270 | assert charseq < self.charseq 271 | if charseq == self.charseq - self.CONTEXT_CHARS: 272 | annot.set_post_context(''.join(self.recent_text)) 273 | self.context_subscribers.pop(0) 274 | else: 275 | assert charseq > self.charseq - self.CONTEXT_CHARS 276 | break 277 | 278 | def capture_char(self, text: str) -> None: 279 | """Capture a character.""" 280 | self.capture_context(text) 281 | 282 | if text == '\n': 283 | # "Broadcast" newlines to _all_ annotations that received any text on the 284 | # current line, in case they see more text on the next line, even if the 285 | # most recent character on the line was not covered by their boxes. 286 | for a in self._curline: 287 | a.capture('\n') 288 | self._curline = set() 289 | else: 290 | # Broadcast the character to annotations that include it. 291 | for a in self._lasthit: 292 | last_charseq = a.last_charseq 293 | a.capture(text, self.charseq) 294 | 295 | if a.wants_context(): 296 | if a.has_context(): 297 | # We already gave the annotation the pre-context, so it is subscribed. 298 | # Locate and remove the annotation's existing context subscription. 299 | assert last_charseq != 0 300 | i = bisect.bisect_left(self.context_subscribers, (last_charseq,)) 301 | assert 0 <= i < len(self.context_subscribers) 302 | while True: 303 | (found_charseq, found_annot) = self.context_subscribers[i] 304 | assert found_charseq == last_charseq 305 | if found_annot is a: 306 | self.context_subscribers.pop(i) 307 | break 308 | i += 1 309 | assert i < len(self.context_subscribers) 310 | 311 | else: 312 | # This is the first hit for the annotation, so set the pre-context. 313 | assert last_charseq == 0 314 | assert len(a.text) != 0 315 | pre_context = ''.join( 316 | self.recent_text[n] for n in range(len(self.recent_text) - 1)) 317 | a.set_pre_context(pre_context) 318 | 319 | # Subscribe this annotation for post-context. 
320 | self.context_subscribers.append((self.charseq, a)) 321 | 322 | def render(self, item: LTItem, pageseq_nested: bool = False) -> None: 323 | """ 324 | Helper for receive_layout, called recursively for every item on a page, in layout order. 325 | 326 | Ref: https://pdfminersix.readthedocs.io/en/latest/topic/converting_pdf_to_text.html 327 | """ 328 | # Assign sequence numbers to items on the page based on their proximity to lines of text or 329 | # to figures (which may contain bare LTChar elements). 330 | if isinstance(item, (LTTextLine, LTFigure)) or ( 331 | pageseq_nested and isinstance(item, LTComponent)): 332 | pageseq_nested = self.update_pageseq(item) 333 | 334 | # If it's a container, recurse on nested items. 335 | if isinstance(item, LTContainer): 336 | for child in item: 337 | self.render(child, pageseq_nested) 338 | 339 | # After the children of a text box, capture the end of the final 340 | # line (logic derived from pdfminer.converter.TextConverter). 341 | if isinstance(item, LTTextBox): 342 | self.capture_char('\n') 343 | 344 | # Each character is represented by one LTChar, and we must handle 345 | # individual characters (not higher-level objects like LTTextLine) 346 | # so that we can capture only those covered by the annotation boxes. 347 | elif isinstance(item, LTChar): 348 | self.test_boxes(item) 349 | self.capture_char(item.get_text()) 350 | 351 | # LTAnno objects capture whitespace not explicitly encoded in 352 | # the text. They don't have an (X,Y) position -- we treat them 353 | # the same as the most recent character. 
354 | elif isinstance(item, LTAnno): 355 | self.capture_char(item.get_text()) 356 | 357 | 358 | def process_file( 359 | file: typ.BinaryIO, 360 | *, # Subsequent arguments are keyword-only 361 | columns_per_page: typ.Optional[int] = None, 362 | emit_progress_to: typ.Optional[typ.TextIO] = None, 363 | laparams: LAParams = LAParams() 364 | ) -> Document: 365 | """ 366 | Process a PDF file, extracting its annotations and outlines. 367 | 368 | Arguments: 369 | file Handle to PDF file 370 | columns_per_page If set, overrides PDF Miner's layout detect with a fixed page layout 371 | emit_progress_to If set, file handle (e.g. sys.stderr) to which progress is reported 372 | laparams PDF Miner layout parameters 373 | """ 374 | 375 | # Initialise PDFMiner state 376 | rsrcmgr = PDFResourceManager() 377 | device = _PDFProcessor(rsrcmgr, laparams) 378 | interpreter = PDFPageInterpreter(rsrcmgr, device) 379 | parser = PDFParser(file) 380 | doc = PDFDocument(parser) 381 | 382 | def emit_progress(msg: str) -> None: 383 | if emit_progress_to is not None: 384 | emit_progress_to.write(msg) 385 | emit_progress_to.flush() 386 | 387 | emit_progress(file.name) 388 | 389 | # Retrieve outlines if present. Each outline refers to a page, using 390 | # *either* a PDF object ID or an integer page number. These references will 391 | # be resolved below while rendering pages -- for now we insert them into one 392 | # of two dicts for later. 
393 | outlines_by_pageno: typ.Dict[object, typ.List[Outline]] = collections.defaultdict(list) 394 | outlines_by_objid: typ.Dict[object, typ.List[Outline]] = collections.defaultdict(list) 395 | 396 | try: 397 | for o in _get_outlines(doc): 398 | if isinstance(o.pageref, pdftypes.PDFObjRef): 399 | outlines_by_objid[o.pageref.objid].append(o) 400 | else: 401 | outlines_by_pageno[o.pageref].append(o) 402 | except PDFNoOutlines: 403 | logger.info("Document doesn't include outlines (\"bookmarks\")") 404 | except Exception as ex: 405 | logger.warning("Failed to retrieve outlines: %s", ex) 406 | 407 | # Iterate over all the pages, constructing page objects. 408 | result = Document() 409 | for (pageno, pdfpage) in enumerate(PDFPage.create_pages(doc)): 410 | emit_progress(" %d" % (pageno + 1)) 411 | 412 | page = Page(pageno, pdfpage.pageid, pdfpage.label, pdfpage.mediabox, columns_per_page) 413 | result.pages.append(page) 414 | 415 | # Resolve any outlines referring to this page, and link them to the page. 416 | # Note that outlines may refer to the page number or ID. 417 | for o in (outlines_by_objid.pop(page.objid, []) 418 | + outlines_by_pageno.pop(pageno, [])): 419 | o.resolve(page) 420 | page.outlines.append(o) 421 | 422 | # Dict from object ID (in the ObjRef) to Annotation object 423 | # This is used while post-processing to resolve inter-annotation references 424 | annots_by_objid: typ.Dict[int, Annotation] = {} 425 | 426 | # Construct Annotation objects, and append them to the page. 
427 | for pa in pdftypes.resolve1(pdfpage.annots) if pdfpage.annots else []: 428 | if isinstance(pa, pdftypes.PDFObjRef): 429 | annot_dict = pdftypes.dict_value(pa) 430 | if annot_dict: # Would be empty if pa is a broken ref 431 | annot = _mkannotation(annot_dict, page) 432 | if annot is not None: 433 | page.annots.append(annot) 434 | assert pa.objid not in annots_by_objid 435 | annots_by_objid[pa.objid] = annot 436 | else: 437 | logger.warning("Unknown annotation: %s", pa) 438 | 439 | # If the page has neither outlines nor annotations, skip further processing. 440 | if not (page.annots or page.outlines): 441 | continue 442 | 443 | # Render the page. This captures the selected text for any annotations 444 | # on the page, and updates annotations and outlines with a logical 445 | # sequence number based on the order of text lines on the page. 446 | device.set_page(page) 447 | interpreter.process_page(pdfpage) 448 | 449 | # Now we have their logical order, sort the annotations and outlines. 450 | page.annots.sort() 451 | page.outlines.sort() 452 | 453 | # Give the annotations a chance to update their internals 454 | for a in page.annots: 455 | a.postprocess(annots_by_objid) 456 | 457 | emit_progress("\n") 458 | 459 | device.close() 460 | 461 | # all outlines should be resolved by now 462 | assert {} == outlines_by_pageno 463 | assert {} == outlines_by_objid 464 | 465 | return result 466 | -------------------------------------------------------------------------------- /pdfannots/__main__.py: -------------------------------------------------------------------------------- 1 | from .cli import main 2 | 3 | if __name__ == "__main__": 4 | main() 5 | -------------------------------------------------------------------------------- /pdfannots/cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | import typing as typ 5 | 6 | from pdfminer.layout import LAParams 7 | 8 | from . 
import __doc__, __version__, process_file 9 | from .printer import Printer 10 | from .printer.markdown import MarkdownPrinter, GroupedMarkdownPrinter 11 | from .printer.json import JsonPrinter 12 | 13 | 14 | MD_FORMAT_ARGS = frozenset([ 15 | 'condense', 16 | 'group_highlights_by_color', 17 | 'page_number_offset', 18 | 'print_filename', 19 | 'sections', 20 | 'use_page_labels', 21 | 'wrap_column', 22 | ]) 23 | """Names of arguments passed to the markdown printer.""" 24 | 25 | 26 | def _float_or_disabled(x: str) -> typ.Optional[float]: 27 | if x.lower().strip() == "disabled": 28 | return None 29 | try: 30 | return float(x) 31 | except ValueError as ex: 32 | raise argparse.ArgumentTypeError("invalid float value: {}".format(x)) from ex 33 | 34 | 35 | def parse_args() -> typ.Tuple[argparse.Namespace, LAParams]: 36 | p = argparse.ArgumentParser(prog='pdfannots', description=__doc__) 37 | 38 | p.add_argument('--version', action='version', 39 | version='%(prog)s ' + __version__) 40 | 41 | p.add_argument("input", metavar="INFILE", type=argparse.FileType("rb"), 42 | help="PDF files to process", nargs='+') 43 | 44 | g = p.add_argument_group('Basic options') 45 | g.add_argument("-p", "--progress", default=False, action="store_true", 46 | help="Emit progress information to stderr.") 47 | g.add_argument("-o", metavar="OUTFILE", type=argparse.FileType("w", encoding="utf-8"), 48 | dest="output", default=sys.stdout, help="Output file (default is stdout).") 49 | g.add_argument("-n", "--cols", default=None, type=int, metavar="COLS", dest="cols", 50 | help="Assume a fixed top-to-bottom left-to-right page layout with this many " 51 | "columns per page. 
If unset, PDFMiner's layout detection logic is used.") 52 | g.add_argument("--keep-hyphens", dest="remove_hyphens", default=True, action="store_false", 53 | help="When capturing text across a line break, don't attempt to remove hyphens.") 54 | g.add_argument("-f", "--format", choices=["md", "json"], default="md", 55 | help="Output format (default: markdown).") 56 | 57 | g = p.add_argument_group('Options controlling markdown output') 58 | mutex_group = g.add_mutually_exclusive_group() 59 | mutex_group.add_argument( 60 | "--no-group", 61 | dest="group", 62 | default=True, action="store_false", 63 | help="Emit annotations in order, don't group into sections." 64 | ) 65 | mutex_group.add_argument( 66 | "--group-highlights-by-color", 67 | dest="group_highlights_by_color", 68 | default=False, action="store_true", 69 | help="Group highlights by color in grouped output." 70 | ) 71 | 72 | g.add_argument("-s", "--sections", metavar="SEC", nargs="*", 73 | choices=GroupedMarkdownPrinter.ALL_SECTIONS, 74 | default=GroupedMarkdownPrinter.ALL_SECTIONS, 75 | help=("sections to emit (default: %s)" % 76 | ', '.join(GroupedMarkdownPrinter.ALL_SECTIONS))) 77 | g.add_argument("--no-condense", dest="condense", default=True, action="store_false", 78 | help="Emit annotations as a blockquote regardless of length.") 79 | g.add_argument("--no-page-labels", dest="use_page_labels", default=True, action="store_false", 80 | help="Ignore page labels if present, just print 1-based page numbers.") 81 | g.add_argument("--page-number-offset", dest="page_number_offset", default=0, type=int, 82 | help="Increase or decrease page numbers with a fixed offset.") 83 | g.add_argument("--print-filename", dest="print_filename", default=False, action="store_true", 84 | help="Print the name of each file with annotations.") 85 | g.add_argument("-w", "--wrap", dest="wrap_column", metavar="COLS", type=int, 86 | help="Wrap text at this many output columns.") 87 | 88 | g = p.add_argument_group( 89 | "Advanced options 
affecting PDFMiner text layout analysis") 90 | laparams = LAParams() 91 | g.add_argument( 92 | "--line-overlap", metavar="REL_HEIGHT", type=float, default=laparams.line_overlap, 93 | help="If two characters have more overlap than this they are considered to be " 94 | "on the same line. The overlap is specified relative to the minimum height " 95 | "of both characters. Default: %s" % laparams.line_overlap) 96 | g.add_argument( 97 | "--char-margin", metavar="REL_WIDTH", type=float, default=laparams.char_margin, 98 | help="If two characters are closer together than this margin they " 99 | "are considered to be part of the same line. The margin is " 100 | "specified relative to the character width. Default: %s" % laparams.char_margin) 101 | g.add_argument( 102 | "--word-margin", metavar="REL_WIDTH", type=float, default=laparams.word_margin, 103 | help="If two characters on the same line are further apart than this " 104 | "margin then they are considered to be two separate words, and " 105 | "an intermediate space will be added for readability. The margin " 106 | "is specified relative to the character width. Default: %s" % laparams.word_margin) 107 | g.add_argument( 108 | "--line-margin", metavar="REL_HEIGHT", type=float, default=laparams.line_margin, 109 | help="If two lines are close together they are considered to " 110 | "be part of the same paragraph. The margin is specified " 111 | "relative to the height of a line. Default: %s" % laparams.line_margin) 112 | g.add_argument( 113 | "--boxes-flow", type=_float_or_disabled, default=laparams.boxes_flow, 114 | help="Specifies how much a horizontal and vertical position of a " 115 | "text matters when determining the order of lines. The value " 116 | "should be within the range of -1.0 (only horizontal position " 117 | "matters) to +1.0 (only vertical position matters). 
def main() -> None:
    """
    Command-line entry point.

    Parses the command line, constructs the printer for the requested output
    format, then processes each input file in turn and writes the printer's
    output (including any begin/end text) to args.output.
    """
    args, laparams = parse_args()
    logging.basicConfig(format='%(levelname)s: %(message)s',
                        level=logging.WARNING)

    # construct appropriate Printer
    printer: Printer
    if args.format == "md":
        mdargs = {k: getattr(args, k) for k in MD_FORMAT_ARGS}
        # --keep-hyphens is parsed into args.remove_hyphens. Forward it explicitly so
        # that the markdown printers honour the option instead of always falling back
        # to their own default; setdefault keeps this harmless should the name ever
        # be added to MD_FORMAT_ARGS as well.
        mdargs.setdefault('remove_hyphens', args.remove_hyphens)
        printer = (GroupedMarkdownPrinter if args.group else MarkdownPrinter)(**mdargs)
    elif args.format == "json":
        printer = JsonPrinter(
            remove_hyphens=args.remove_hyphens,
            output_codec=args.output.encoding)

    def write_if_nonempty(s: str) -> None:
        """Write s to the output, skipping the call entirely for empty strings."""
        if s:
            args.output.write(s)

    write_if_nonempty(printer.begin())

    # iterate over files
    for file in args.input:
        doc = process_file(
            file,
            columns_per_page=args.cols,
            emit_progress_to=(sys.stderr if args.progress else None),
            laparams=laparams)
        # printers yield their output incrementally; stream it straight through
        for line in printer.print_file(file.name, doc):
            args.output.write(line)

    write_if_nonempty(printer.end())
class JsonPrinter(Printer):
    """
    Printer that emits the annotations of a single document as one flat JSON array.

    Because the output is a single array, only one input file is supported;
    a second call to print_file raises RuntimeError.
    """

    def __init__(
        self,
        *,
        remove_hyphens: bool,       # Whether to remove hyphens across a line break
        output_codec: str           # Text codec in use for output
    ) -> None:
        # Function-scope import: the codec registry is only needed for this check.
        import codecs

        self.remove_hyphens = remove_hyphens
        self.seen_first = False

        # JSON must be represented as UTF-8, UTF-16, or UTF-32. If the output codec is
        # one of these, we can disable ASCII string escaping in the JSON encoder.
        # Codec names come in many spellings ("UTF-8", "utf8", "utf_8", ...), so
        # normalise through the codec registry before comparing. An unknown or missing
        # codec name keeps the conservative behaviour (escape everything to ASCII).
        try:
            codec_name: typ.Optional[str] = codecs.lookup(output_codec).name
        except (LookupError, TypeError):
            codec_name = None
        self.ensure_ascii = codec_name not in ['utf-8', 'utf-16', 'utf-32']

    def end(self) -> str:
        """Terminate the output with a trailing newline."""
        return '\n'

    def print_file(
        self,
        filename: str,
        document: Document
    ) -> typ.Iterator[str]:
        """
        Yield the JSON encoding of all annotations (including replies) in the document.

        Raises:
            RuntimeError: if called for a second file (raised when iteration starts,
                          since this is a generator).
        """
        if self.seen_first:
            # The flat array format is incompatible with multiple input files
            # TODO: Ideally we'd catch this at invocation time
            raise RuntimeError("The JSON output format does not support multiple files.")
        else:
            self.seen_first = True

        annots = [annot_to_dict(document, a, self.remove_hyphens)
                  for a in document.iter_annots(include_replies=True)]
        yield from json.JSONEncoder(indent=2, ensure_ascii=self.ensure_ascii).iterencode(annots)
def trim_context(context: str, keep_right: bool) -> str:
    """
    Trim context for presentation.

    Given a potentially-long string of context preceding or following an annotation, identify
    a natural boundary at which to trim it, and return the trimmed string.

    Arguments:
        context     String of captured context
        keep_right  Whether to retain text on the right (True) or left (False) end of the string

    Returns:
        The shortest candidate found at one of the CONTEXT_BOUNDARIES separators, provided it
        has at most MAX_CONTEXT_WORDS words; otherwise FALLBACK_CONTEXT_WORDS words from the
        retained end, marked with an ellipsis. An empty context is returned unchanged.
    """
    if not context:
        # Nothing to trim. Without this guard the fallback below would index into an
        # empty string and raise IndexError.
        return context

    best = None

    for (sep, keep_sep_left, keep_sep_right) in CONTEXT_BOUNDARIES:
        # search for the separator nearest to the end of the string that we keep
        i = context.rfind(sep) if keep_right else context.find(sep)
        if i < 0:
            continue

        # include the separator if desired
        if (keep_right and not keep_sep_left) or (not keep_right and keep_sep_right):
            i += len(sep)

        # extract the candidate string
        candidate = context[i:] if keep_right else context[:i]

        if best is None or len(candidate) < len(best):
            best = candidate
            if len(candidate.split()) <= 1:
                # at most one word left: stop searching for anything shorter
                break

    if best is not None and len(best.split()) <= MAX_CONTEXT_WORDS:
        return best

    # Give up and take a few words, whatever they are.
    if keep_right:
        fallback = '...' + ' '.join(context.split()[-FALLBACK_CONTEXT_WORDS:])
        # split/join dropped any whitespace adjoining the annotation; restore it
        if context[-1].isspace():
            fallback += context[-1]
    else:
        fallback = ' '.join(context.split()[:FALLBACK_CONTEXT_WORDS]) + '...'
        if context[0].isspace():
            fallback = context[0] + fallback

    return fallback
131 | try: 132 | first = next(body_iter) 133 | except StopIteration: 134 | pass 135 | else: 136 | yield "# File: '%s'\n\n" % filename 137 | yield first 138 | 139 | yield from body_iter 140 | 141 | @staticmethod 142 | def format_pos( 143 | pos: Pos, 144 | document: Document, 145 | use_page_label: bool, 146 | page_number_offset: int 147 | ) -> str: 148 | 149 | result = pos.page.format_name( 150 | use_label=use_page_label, 151 | page_number_offset=page_number_offset).title() 152 | 153 | o = document.nearest_outline(pos) 154 | if o: 155 | result += " (%s)" % o.title 156 | 157 | return result 158 | 159 | def format_bullet( 160 | self, 161 | paras: typ.List[str], 162 | quote: typ.Optional[typ.Tuple[int, int]] = None 163 | ) -> str: 164 | """ 165 | Format a Markdown bullet, wrapped as desired. 166 | """ 167 | 168 | if quote is not None: 169 | (quotepos, quotelen) = quote 170 | assert quotepos > 0 # first paragraph to format as a block-quote 171 | assert quotelen > 0 # length of the blockquote in paragraphs 172 | assert quotepos + quotelen <= len(paras) 173 | 174 | # emit the first paragraph with the bullet 175 | if self.wrap_column: 176 | ret = self.bullet_tw1.fill(paras[0]) 177 | else: 178 | ret = self.BULLET_INDENT1 + paras[0] 179 | 180 | # emit subsequent paragraphs 181 | npara = 1 182 | for para in paras[1:]: 183 | # are we in a blockquote? 
184 | inquote = quote and npara >= quotepos and npara < quotepos + quotelen 185 | 186 | # emit a paragraph break 187 | # if we're going straight to a quote, we don't need an extra newline 188 | ret = ret + ('\n' if quote and npara == quotepos else '\n\n') 189 | 190 | if self.wrap_column: 191 | tw = self.quote_tw if inquote else self.bullet_tw2 192 | ret = ret + tw.fill(para) 193 | else: 194 | indent = self.QUOTE_INDENT if inquote else self.BULLET_INDENT2 195 | ret = ret + indent + para 196 | 197 | npara += 1 198 | 199 | return ret 200 | 201 | def merge_context(self, annot: Annotation, text: str) -> str: 202 | """Merge the context for a strikeout or caret annotation into the text.""" 203 | (pre, post) = annot.get_context(self.remove_hyphens) 204 | 205 | if pre: 206 | pre = trim_context(pre, keep_right=True) 207 | 208 | if post: 209 | post = trim_context(post, keep_right=False) 210 | 211 | if annot.subtype == AnnotationType.StrikeOut: 212 | return pre + '~~' + text + '~~' + post 213 | else: 214 | assert annot.subtype == AnnotationType.Caret 215 | assert text.isspace() 216 | return pre.rstrip(' ') + ' ^ ' + post.lstrip(' ') 217 | 218 | def format_annot( 219 | self, 220 | annot: Annotation, 221 | document: Document, 222 | extra: typ.Optional[str] = None 223 | ) -> str: 224 | # Limited support for Caret annotations with a single "reply" of type StrikeOut 225 | contents = annot.contents 226 | if annot.subtype == AnnotationType.Caret and annot.group_children: 227 | child = annot.get_child_by_type(AnnotationType.StrikeOut) 228 | if child: 229 | annot = child 230 | if child.contents: 231 | logger.warning("Ignored StrikeOut comment: %s", child.contents) 232 | 233 | # capture item text and contents (i.e. 
the comment), and split the latter into paragraphs 234 | text = annot.gettext(self.remove_hyphens) or '' 235 | comment = [l for l in contents.splitlines() if l] if contents else [] 236 | 237 | if annot.has_context(): 238 | text = self.merge_context(annot, text) 239 | 240 | # we are either printing: item text and item contents, or one of the two 241 | # if we see an annotation with neither, something has gone wrong 242 | if not (text or comment): 243 | logger.warning('%s annotation at %s has neither text nor a comment; skipped', 244 | annot.subtype.name, annot.pos) 245 | return '' 246 | 247 | # compute the formatted position (and extra bit if needed) as a label 248 | assert annot.pos is not None 249 | label = self.format_pos( 250 | annot.pos, document, self.use_page_labels, self.page_number_offset 251 | ) + (" " + extra if extra else "") + ":" 252 | 253 | # If we have short (few words) text with a short or no comment, and the 254 | # text contains no embedded full stops or quotes, then we'll just put 255 | # quotation marks around the text and merge the two into a single paragraph. 256 | if (self.condense 257 | and text 258 | and not annot.has_context() 259 | and len(text.split()) <= 10 # words 260 | and all([x not in text for x in ['"', '. ']]) 261 | and (not comment or len(comment) == 1)): 262 | msg = label + ' "' + text + '"' 263 | if comment: 264 | msg = msg + ' -- ' + comment[0] 265 | return self.format_bullet([msg]) + "\n\n" 266 | 267 | # If there is no text and a single-paragraph comment, it also goes on 268 | # one line. 269 | elif comment and not text and len(comment) == 1: 270 | msg = label + " " + comment[0] 271 | return self.format_bullet([msg]) + "\n\n" 272 | 273 | # Otherwise, text (if any) turns into a blockquote, and the comment (if 274 | # any) into subsequent paragraphs. 
class GroupedMarkdownPrinter(MarkdownPrinter):
    """
    Markdown printer that groups annotations into sections.

    Annotations are partitioned into highlights, detailed comments and nits, and the
    sections are emitted in the order given by `sections`. Highlights may optionally
    be sub-grouped by their color.
    """

    ANNOT_NITS = frozenset({AnnotationType.Caret, AnnotationType.Squiggly,
                            AnnotationType.StrikeOut, AnnotationType.Underline})
    ALL_SECTIONS = ["highlights", "comments", "nits"]

    def __init__(
        self,
        *,
        sections: typ.Sequence[str] = ALL_SECTIONS,  # controls the order of sections output
        group_highlights_by_color: bool = False,     # Whether to group highlights by color
        **kwargs: typ.Any                            # other args -- see superclass
    ) -> None:
        super().__init__(**kwargs)
        self.sections = sections
        self.group_highlights_by_color = group_highlights_by_color
        # Declared here, assigned at the start of every emit_body call.
        self._fmt_header_called: bool

    def emit_body(
        self,
        document: Document
    ) -> typ.Iterator[str]:
        """Yield the grouped markdown output for one document, section by section."""

        self._fmt_header_called = False

        def fmt_header(name: str, level: int = 2) -> str:
            """
            Format a markdown header line.

            Every header except the first one emitted for this document is preceded
            by a blank separator line.
            """
            separator = '\n' if self._fmt_header_called else ''
            self._fmt_header_called = True
            return separator + ('#' * level) + " " + name + "\n"

        def nit_label(annot: Annotation) -> typ.Optional[str]:
            """Describe what kind of edit a nit suggests, if it suggests one."""
            if annot.subtype == AnnotationType.Caret:
                if annot.get_child_by_type(AnnotationType.StrikeOut):
                    return "suggested replacement"
                return "suggested insertion"
            if annot.subtype == AnnotationType.StrikeOut:
                return "suggested deletion"
            return None

        # Partition annotations into nits, comments, and highlights.
        nit_annots: typ.List[Annotation] = []
        comment_annots: typ.List[Annotation] = []
        # When grouping by color, this list holds only the highlights without a color.
        plain_highlights: typ.List[Annotation] = []
        colored_highlights: typ.DefaultDict[RGB, typ.List[Annotation]] = defaultdict(list)

        for annot in document.iter_annots():
            if annot.subtype in self.ANNOT_NITS:
                nit_annots.append(annot)
            elif annot.contents:
                comment_annots.append(annot)
            elif annot.subtype == AnnotationType.Highlight:
                if self.group_highlights_by_color and annot.color:
                    colored_highlights[annot.color].append(annot)
                else:
                    plain_highlights.append(annot)

        for secname in self.sections:
            if secname == 'highlights':
                if not (plain_highlights or colored_highlights):
                    continue

                yield fmt_header("Highlights")

                for color, members in colored_highlights.items():
                    yield fmt_header(f"Color: {color.ashex()}", level=3)
                    for annot in members:
                        yield self.format_annot(annot, document)

                if plain_highlights and self.group_highlights_by_color:
                    yield fmt_header("Color: undefined", level=3)

                for annot in plain_highlights:
                    yield self.format_annot(annot, document)

            elif secname == 'comments':
                if comment_annots:
                    yield fmt_header("Detailed comments")
                    for annot in comment_annots:
                        yield self.format_annot(annot, document)

            elif secname == 'nits':
                if nit_annots:
                    yield fmt_header("Nits")
                    for annot in nit_annots:
                        yield self.format_annot(annot, document, nit_label(annot))
class Box:
    """
    Coordinates of a rectangular box.
    """

    def __init__(self, x0: float, y0: float, x1: float, y1: float):
        assert x0 <= x1 and y0 <= y1
        self.x0 = x0
        self.x1 = x1
        self.y0 = y0
        self.y1 = y1

    def __repr__(self) -> str:
        # The format string must consume all four coordinates: applying % to an
        # empty string raises TypeError ("not all arguments converted").
        return '<Box (%f,%f) (%f,%f)>' % (self.x0, self.y0, self.x1, self.y1)

    @staticmethod
    def from_item(item: LTComponent) -> Box:
        """Construct a Box from the bounding box of a given PDF component."""
        return Box(item.x0, item.y0, item.x1, item.y1)

    @staticmethod
    def from_coords(coords: BoxCoords) -> Box:
        """Construct a Box from the given PDF coordinates."""
        (x0, y0, x1, y1) = coords
        return Box(x0, y0, x1, y1)

    def get_coords(self) -> BoxCoords:
        """Return the PDF coordinates of this box."""
        return (self.x0, self.y0, self.x1, self.y1)

    def get_width(self) -> float:
        """Return the width of the box."""
        return self.x1 - self.x0

    def get_height(self) -> float:
        """Return the height of the box."""
        return self.y1 - self.y0

    def get_area(self) -> float:
        """Return the area of the box."""
        return self.get_height() * self.get_width()

    def get_overlap(self, other: Box) -> float:
        """Compute the overlapping area (if any) with the provided box."""
        x_overlap = max(0, min(other.x1, self.x1) - max(other.x0, self.x0))
        y_overlap = max(0, min(other.y1, self.y1) - max(other.y0, self.y0))
        return x_overlap * y_overlap

    def hit_item(self, item: LTComponent) -> bool:
        """Does most of the area of the PDF component overlap this box?"""
        item_area = float(item.width) * float(item.height)
        overlap_area = self.get_overlap(Box.from_item(item))

        if overlap_area != 0:
            logger.debug(
                "Box hit: '%s' %f-%f,%f-%f in %f-%f,%f-%f %2.0f%%",
                item.get_text() if isinstance(item, LTText) else '',
                item.x0, item.x1, item.y0, item.y1,
                self.x0, self.x1, self.y0, self.y1,
                100 * overlap_area / item_area)

        assert overlap_area <= item_area
        # zero-area items never count as hit; otherwise at least half must overlap
        return (item_area != 0) and overlap_area >= (0.5 * item_area)

    def closest_point(self, point: Point) -> Point:
        """Compute the closest point in this box to the specified point."""
        px, py = point
        return (min(max(self.x0, px), self.x1),
                min(max(self.y0, py), self.y1))

    def square_of_distance_to_closest_point(self, point: Point) -> float:
        """
        Compute the distance from the closest point in this box to the specified point, squared.

        (We avoid calling sqrt for performance reasons, since we just need to compare.)
        """
        x, y = self.closest_point(point)
        px, py = point
        return abs(px - x)**2 + abs(py - y)**2


@functools.total_ordering
class Page:
    """
    Page.

    A page object uniquely represents a page in the PDF. It is identified by a
    zero-based page number, and a PDF object ID. It holds a list of Annotation
    objects for annotations on the page, and Outline objects for outlines that
    link to somewhere on the page.

    Pages compare (equality and ordering) by page number only.
    """

    annots: typ.List[Annotation]
    outlines: typ.List[Outline]

    def __init__(
        self,
        pageno: int,
        objid: object,
        label: typ.Optional[str],
        mediabox: BoxCoords,
        fixed_columns: typ.Optional[int] = None
    ):
        assert pageno >= 0
        assert fixed_columns is None or fixed_columns > 0
        self.pageno = pageno
        self.objid = objid
        self.label = label
        self.annots = []
        self.outlines = []
        self.mediabox = Box.from_coords(mediabox)
        self.fixed_columns = fixed_columns

    def __repr__(self) -> str:
        # The format string must consume the argument, otherwise % raises TypeError.
        return '<Page #%d>' % self.pageno  # zero-based page index

    def __str__(self) -> str:
        return self.format_name()

    def format_name(self, use_label: bool = True, page_number_offset: int = 0) -> str:
        """
        Return a human-readable name for the page.

        Uses the page label if there is one and use_label is set; otherwise the
        1-based page number, shifted by page_number_offset.
        """
        if self.label and use_label:
            return 'page %s' % self.label
        else:
            # + 1 for 1-based page numbers in normal program output (error messages, etc.)
            return 'page #%d' % (self.pageno + 1 + page_number_offset)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Page):
            return NotImplemented
        return self.pageno == other.pageno

    def __lt__(self, other: object) -> bool:
        if not isinstance(other, Page):
            return NotImplemented
        return self.pageno < other.pageno


@functools.total_ordering
class Pos:
    """
    A position within the document.

    This object represents an x,y point on a particular page. Such positions are
    also comparable, and compare in natural document reading order (as inferred
    by pdfminer's text layout detection).
    """

    def __init__(self, page: Page, x: float, y: float):
        self.page = page
        self.x = x
        self.y = y
        # Sequence number adopted via update_pageseq; 0 means "not yet assigned".
        self._pageseq = 0
        # Squared distance to the component the sequence number was adopted from.
        self._pageseq_distance = 0.0

    def __str__(self) -> str:
        return '%s (%.3f,%.3f)' % (self.page, self.x, self.y)

    def __repr__(self) -> str:
        # The format string must consume all four arguments, otherwise % raises TypeError.
        return '<Pos pg#%d (%.3f,%.3f) #%d>' % (
            self.page.pageno, self.x, self.y, self._pageseq)

    def __eq__(self, other: object) -> bool:
        if isinstance(other, Pos):
            return (self.page == other.page
                    and self.x == other.x
                    and self.y == other.y)
        return NotImplemented

    def __lt__(self, other: object) -> bool:
        if isinstance(other, Pos):
            if self.page == other.page:
                assert self.page is other.page
                if self.page.fixed_columns:
                    # Fixed layout: assume left-to-right top-to-bottom documents
                    (sx, sy) = self.page.mediabox.closest_point((self.x, self.y))
                    (ox, oy) = self.page.mediabox.closest_point((other.x, other.y))
                    colwidth = self.page.mediabox.get_width() / self.page.fixed_columns
                    self_col = (sx - self.page.mediabox.x0) // colwidth
                    other_col = (ox - self.page.mediabox.x0) // colwidth
                    # earlier column first; within a column, larger y (higher up) first
                    return self_col < other_col or (self_col == other_col and sy > oy)
                else:
                    # Default layout inferred from pdfminer traversal
                    assert self._pageseq != 0
                    assert other._pageseq != 0
                    if self._pageseq == other._pageseq:
                        # The positions are on or closest to the same line of text.
                        # XXX: assume top-to-bottom left-to-right order
                        return self.x < other.x if self.y == other.y else self.y > other.y
                    else:
                        return self._pageseq < other._pageseq
            else:
                return self.page < other.page
        else:
            return NotImplemented

    def item_hit(self, item: LTComponent) -> bool:
        """Is this pos within the bounding box of the given PDF component?"""
        return (self.x >= item.x0
                and self.x <= item.x1
                and self.y >= item.y0
                and self.y <= item.y1)

    def update_pageseq(self, component: LTComponent, pageseq: int) -> bool:
        """If close-enough to the given component, adopt its sequence number and return True."""
        assert pageseq > 0
        if self.item_hit(component):
            # This pos is inside the component area
            self._pageseq = pageseq
            self._pageseq_distance = 0
            return True
        else:
            d = Box.from_item(component).square_of_distance_to_closest_point((self.x, self.y))
            # adopt if nothing is assigned yet, or this component is strictly closer
            if self._pageseq == 0 or self._pageseq_distance > d:
                self._pageseq = pageseq
                self._pageseq_distance = d
                return True
            return False

    def discard_pageseq(self, pageseq: int) -> None:
        """If we have been assigned the specified pageseq, forget about it."""
        if self._pageseq == pageseq:
            self._pageseq = 0
            self._pageseq_distance = 0.0
class Annotation(ObjectWithPos):
    """
    A PDF annotation, and its extracted text.

    Attributes:
        author          Author of the annotation
        boxes           Bounding boxes of the annotated regions (one per quadpoints group)
        color           RGB color of the annotation
        contents        Contents of the annotation in the PDF (e.g. comment/description)
        created         Timestamp the annotation was created
        group_children  Annotations grouped together with this one
        in_reply_to     Reference to another annotation on the page that this is "in reply to"
        is_group_child  Is this annotation a member of a parent group?
        last_charseq    Sequence number of the most recent character in text
        name            If present, uniquely identifies this annotation among others on the page
        replies         Annotations replying to this one (reverse of in_reply_to)
        subtype         PDF annotation type
        text            Text in the order captured (use gettext() for a cleaner form)

    Attributes updated for StrikeOut and Caret annotations:
        pre_context     Text captured just prior to the beginning of 'text'
        post_context    Text captured just after the end of 'text'
    """

    boxes: typ.List[Box]
    contents: typ.Optional[str]
    group_children: typ.List[Annotation]
    in_reply_to: typ.Optional[Annotation]
    pre_context: typ.Optional[str]
    post_context: typ.Optional[str]
    replies: typ.List[Annotation]
    text: typ.List[str]

    def __init__(
            self,
            page: Page,
            subtype: AnnotationType,
            *,
            author: typ.Optional[str] = None,
            created: typ.Optional[datetime.datetime] = None,
            color: typ.Optional[RGB] = None,
            contents: typ.Optional[str] = None,
            in_reply_to_ref: typ.Optional[PDFObjRef] = None,
            is_group_child: bool = False,
            name: typ.Optional[str] = None,
            quadpoints: typ.Optional[typ.Sequence[float]] = None,
            rect: typ.Optional[BoxCoords] = None):
        """
        Create an annotation on the given page.

        quadpoints, if given, is a flat sequence whose length is a multiple of 8; each
        group of 8 values is read as four (x, y) points and reduced to their bounding box.
        rect, if given, determines the annotation's position; otherwise the first box does.
        At least one of rect or a non-empty quadpoints must be supplied.
        """

        # Construct boxes from quadpoints. Index in steps of 8 rather than consuming the
        # sequence and comparing it to [], so that any Sequence (e.g. a tuple) works.
        boxes = []
        if quadpoints is not None:
            assert len(quadpoints) % 8 == 0
            for i in range(0, len(quadpoints), 8):
                (x0, y0, x1, y1, x2, y2, x3, y3) = quadpoints[i:i + 8]
                xvals = [x0, x1, x2, x3]
                yvals = [y0, y1, y2, y3]
                box = Box(min(xvals), min(yvals), max(xvals), max(yvals))
                boxes.append(box)

        # Kludge for Caret annotations that lack quadpoints, but need to capture context
        if quadpoints is None and subtype == AnnotationType.Caret:
            assert rect is not None
            boxes.append(Box.from_coords(rect))

        # Compute a meaningful position of this annotation on the page
        assert rect or boxes
        (x0, y0, x1, y1) = rect if rect else boxes[0].get_coords()
        # XXX: assume left-to-right top-to-bottom text
        pos = Pos(page, min(x0, x1), max(y0, y1))
        super().__init__(pos)

        # Initialise the attributes
        self.author = author
        self.boxes = boxes
        self.color = color
        self.contents = contents if contents else None  # normalise '' to None
        self.created = created
        self.group_children = []
        self.name = name
        self.last_charseq = 0
        self.post_context = None
        self.pre_context = None
        self.replies = []
        self.subtype = subtype
        self.text = []

        # The in_reply_to reference will be resolved in postprocess()
        self.in_reply_to = None
        self._in_reply_to_ref = in_reply_to_ref
        self.is_group_child = is_group_child
        if is_group_child:
            assert in_reply_to_ref

    def __repr__(self) -> str:
        """Debug representation: subtype, position, and a prefix of the contents and text."""
        # The format string must consume all four arguments; an empty literal here
        # makes the '%' operator raise TypeError.
        return ('<Annotation %s %r%s%s>' %
                (self.subtype.name, self.pos,
                 " '%s'" % self.contents[:10] if self.contents else '',
                 " '%s'" % ''.join(self.text[:10]) if self.text else ''))

    def capture(self, text: str, charseq: int = 0) -> None:
        """Capture text (while rendering the PDF page)."""
        self.text.append(text)
        if charseq:
            # Character sequence numbers must be strictly increasing
            assert charseq > self.last_charseq
            self.last_charseq = charseq

    def gettext(self, remove_hyphens: bool = False) -> typ.Optional[str]:
        """
        Retrieve cleaned-up text, after rendering.

        Returns None if this annotation has no boxes, and an empty string (with a
        logged warning) if it has boxes but no text was captured for them.
        """
        if self.boxes:
            if self.text:
                captured = ''.join(self.text)
                return merge_lines(captured, remove_hyphens, strip_space=(not self.has_context()))
            else:
                # something's strange -- we have boxes but no text for them
                logger.warning('Missing text for %s annotation at %s', self.subtype.name, self.pos)
                return ""
        else:
            return None

    def get_child_by_type(self, child_type: AnnotationType) -> typ.Optional[Annotation]:
        """Return the first child of the given type."""
        for c in self.group_children:
            if c.subtype == child_type:
                return c
        return None

    def wants_context(self) -> bool:
        """Returns true if this annotation type should include context."""
        return self.subtype in {AnnotationType.Caret, AnnotationType.StrikeOut}

    def set_pre_context(self, pre_context: str) -> None:
        """Record the text preceding this annotation. May be called once only."""
        assert self.pre_context is None
        self.pre_context = pre_context

    def set_post_context(self, post_context: str) -> None:
        """
        Record the text following this annotation. May be called once only.

        Trailing whitespace in the captured text is moved to the start of the context.
        """
        assert self.post_context is None

        # If the text ends in a (broadcast) newline, discard it lest it mess up the context below.
        if self.text and self.text[-1] == '\n':
            self.text.pop()

        # If the captured text ends in any (other) space, move it to the context.
        whitespace = []
        while self.text and self.text[-1].isspace():
            whitespace.append(self.text.pop())
        if whitespace:
            post_context = ''.join(whitespace) + post_context

        self.post_context = post_context

    def has_context(self) -> bool:
        """Returns true if this annotation captured context."""
        return self.pre_context is not None or self.post_context is not None

    def get_context(self, remove_hyphens: bool = False) -> typ.Tuple[str, str]:
        """Returns context captured for this annotation, as a tuple (pre, post)."""
        return (merge_lines(self.pre_context or '', remove_hyphens, strip_space=False),
                merge_lines(self.post_context or '', remove_hyphens, strip_space=False))

    def postprocess(self, annots_by_objid: typ.Dict[int, Annotation]) -> None:
        """Update internal state once all text and context has been captured."""
        # Resolve the in_reply_to object reference to its annotation
        if self._in_reply_to_ref is not None:
            assert self.in_reply_to is None  # This should be called once only
            a = annots_by_objid.get(self._in_reply_to_ref.objid)
            if a is None:
                logger.warning("IRT reference (%d) not found in page annotations",
                               self._in_reply_to_ref.objid)
            elif self.is_group_child:
                a.group_children.append(self)
            else:
                self.in_reply_to = a
                a.replies.append(self)

        # The Skim PDF reader (https://skim-app.sourceforge.io/) creates annotations whose
        # default initial contents are a copy of the selected text. Unless the user goes to
        # the trouble of editing each annotation, this goes badly for us because we have
        # duplicate text and contents (e.g., for simple highlights and strikeout).
        if self.contents and (text := self.gettext()) and text.strip() == self.contents.strip():
            self.contents = None
490 | """ 491 | 492 | def __init__( 493 | self, 494 | title: str, 495 | pageref: UnresolvedPage, 496 | target: typ.Optional[typ.Tuple[float, float]] 497 | ): 498 | super().__init__() 499 | self.title = title 500 | self.pageref = pageref 501 | self.target = target 502 | 503 | def __repr__(self) -> str: 504 | return '' % (self.title, self.pos) 505 | 506 | def resolve(self, page: Page) -> None: 507 | """Resolve our page reference to the given page, and update our position.""" 508 | assert self.pos is None 509 | if isinstance(self.pageref, PDFObjRef): 510 | assert self.pageref.objid == page.objid 511 | else: 512 | assert self.pageref == page.pageno 513 | 514 | if self.target is None: 515 | # XXX: "first" point on the page, assuming left-to-right top-to-bottom order 516 | (targetx, targety) = (page.mediabox.x0, page.mediabox.y1) 517 | else: 518 | (targetx, targety) = self.target 519 | 520 | self.pos = Pos(page, targetx, targety) 521 | 522 | 523 | class Document: 524 | """ 525 | A fully-extracted PDF document. 526 | 527 | This is really just a list of pages and some helpers. 528 | 529 | Attributes: 530 | pages An ordered list of Page objects, indexed by zero-based page number. 531 | """ 532 | 533 | pages: typ.List[Page] 534 | 535 | def __init__(self) -> None: 536 | self.pages = [] 537 | 538 | def iter_annots(self, *, include_replies: bool = False) -> typ.Iterator[Annotation]: 539 | """ 540 | Iterate over all the annotations in the document. 541 | 542 | Only the primary annotation for a group is included. 543 | Replies are included only if include_replies is True. 
# Per-character replacements applied by cleanup_text(). Every key must be a single
# code point, because the lookup is done one character at a time. The keys are written
# as escapes so that text normalisation cannot silently expand the ligatures into
# multi-character keys that would never match.
CHARACTER_SUBSTITUTIONS = {
    '\ufb00': 'ff',    # LATIN SMALL LIGATURE FF
    '\ufb01': 'fi',    # LATIN SMALL LIGATURE FI
    '\ufb02': 'fl',    # LATIN SMALL LIGATURE FL
    '\ufb03': 'ffi',   # LATIN SMALL LIGATURE FFI
    '\ufb04': 'ffl',   # LATIN SMALL LIGATURE FFL
    '\u2018': "'",     # LEFT SINGLE QUOTATION MARK
    '\u2019': "'",     # RIGHT SINGLE QUOTATION MARK
    '\u201c': '"',     # LEFT DOUBLE QUOTATION MARK
    '\u201d': '"',     # RIGHT DOUBLE QUOTATION MARK
    '\u2026': '...',   # HORIZONTAL ELLIPSIS
}


def cleanup_text(text: str) -> str:
    """
    Normalise line endings and replace common special characters with plain ASCII equivalents.

    CRLF and lone CR are both converted to LF. Each character found in
    CHARACTER_SUBSTITUTIONS is then replaced by its mapped string; all other
    characters are passed through unchanged.
    """
    if '\r' in text:
        # Replace CRLF first, so that it does not become two newlines
        text = text.replace('\r\n', '\n').replace('\r', '\n')
    return ''.join([CHARACTER_SUBSTITUTIONS.get(c, c) for c in text])
def decode_datetime(dts: str) -> typ.Optional[datetime.datetime]:
    """
    Parse a PDF date string into a datetime, or return None if it cannot be parsed.

    The expected form is YYYYMMDDHHMMSS, optionally prefixed with 'D:' and optionally
    followed by a timezone: a numeric offset, possibly containing apostrophes, or a 'Z'
    with an optional suffix. The result is timezone-aware only if a timezone was present.
    """
    # The 'D:' prefix seems 'optional but recommended'
    stamp = dts[2:] if dts.startswith('D:') else dts

    # Offsets are written like -08'00'; drop the apostrophes to get -0800
    stamp = stamp.replace("'", '')

    # Sometimes the zone is Z or Z0000; either way, treat it (and anything after) as UTC
    head, zulu, _ = stamp.partition('Z')
    if zulu:
        stamp = head + '+0000'

    # Dates in PDFs are quite flaky and underspecified, so be defensive: try with a
    # timezone first, then without (sometimes the timezone is missing).
    for pattern in ('%Y%m%d%H%M%S%z', '%Y%m%d%H%M%S'):
        try:
            parsed = datetime.datetime.strptime(stamp, pattern)
        except ValueError:
            continue
        return parsed
    return None
class ExtractionTestBase(unittest.TestCase):
    """
    Base class for tests that extract annotations and outlines from one PDF fixture.

    Subclasses set `filename` to the name of a file in the tests/ directory next to
    this script. setUp() processes it and exposes the result as self.doc, with
    flattened self.annots and self.outlines lists in page order.
    """

    filename: str

    # Permit a test to customise the columns_per_page or LAParams
    columns_per_page: typ.Optional[int] = None
    laparams = pdfminer.layout.LAParams()

    def setUp(self) -> None:
        """Process the fixture PDF and collect its annotations and outlines."""
        path = pathlib.Path(__file__).parent / 'tests' / self.filename
        with path.open('rb') as f:
            self.doc = pdfannots.process_file(f, columns_per_page=self.columns_per_page,
                                              laparams=self.laparams)
        self.annots = [a for p in self.doc.pages for a in p.annots]
        self.outlines = [o for p in self.doc.pages for o in p.outlines]

    def assertEndsWith(self, bigstr: str, suffix: str) -> None:
        """Assert that bigstr ends with suffix, reporting the actual tail on failure."""
        # bigstr[-0:] is the whole string, so an empty suffix needs special-casing
        tail = bigstr[-len(suffix):] if suffix else ''
        self.assertEqual(tail, suffix)

    def assertStartsWith(self, bigstr: str, prefix: str) -> None:
        """Assert that bigstr starts with prefix, reporting the actual head on failure."""
        self.assertEqual(bigstr[:len(prefix)], prefix)
' 69 | '"Broadwell" CPUs with the bug fix shipped in late 2014.'), 70 | (1, AnnotationType.Highlight, 'This is lower in column 1', 71 | 'user-mode access to FS/GS registers, and TLB tags for non-VM address spaces'), 72 | (1, AnnotationType.Highlight, None, 73 | 'segmentation, task switching, and 16-bit modes.'), 74 | (1, AnnotationType.Highlight, 'This is at the top of column two', 75 | 'The jump is due to extensions introduced with the "Skylake" microarchitecture'), 76 | (3, AnnotationType.Squiggly, 'This is a nit.', 77 | 'Control transfer in x86 is already very complex'), 78 | (3, AnnotationType.Underline, 'This is a different nit', 79 | 'Besides modifying semantics of all indirect control transfers'), 80 | (3, AnnotationType.StrikeOut, None, 81 | 'While we may disagree with some of the design choices,')] 82 | 83 | self.assertEqual(len(self.annots), len(EXPECTED)) 84 | for a, expected in zip(self.annots, EXPECTED): 85 | assert a.pos is not None 86 | self.assertEqual( 87 | (a.pos.page.pageno, a.subtype, a.contents, a.gettext(remove_hyphens=True)), 88 | expected) 89 | self.assertEqual(self.annots[0].created, datetime( 90 | 2019, 1, 19, 21, 29, 42, tzinfo=timezone(-timedelta(hours=8)))) 91 | 92 | # test for correct whitespace on the strikeout annot 93 | a = self.annots[2] 94 | self.assertTrue(a.has_context()) 95 | (pre, post) = a.get_context() 96 | self.assertEndsWith(pre, 'widths, ar') 97 | self.assertStartsWith(post, ' counted') 98 | 99 | def test_outlines(self) -> None: 100 | EXPECTED = [ 101 | 'Introduction', 102 | 'Background: x86 extensions', 103 | 'Case study: SGX', 104 | 'Case study: CET', 105 | 'Implications', 106 | 'Concluding remarks'] 107 | 108 | self.assertEqual(len(self.outlines), len(EXPECTED)) 109 | for o, expected in zip(self.outlines, EXPECTED): 110 | self.assertEqual(o.title, expected) 111 | 112 | def test_nearest_outline(self) -> None: 113 | # Page 1 (Introduction) Squiggly: "recent Intel CPUs have introduced" 114 | a = 
self.doc.pages[0].annots[0] 115 | assert a.pos is not None 116 | o = self.doc.nearest_outline(a.pos) 117 | assert o is not None 118 | self.assertEqual(o.title, 'Introduction') 119 | 120 | # Page 4 (Case study: CET) Squiggly: "Control transfer in x86 is already very complex" 121 | # Note: pdfminer gets this wrong as of 20201018; we must set columns_per_page to fix it 122 | a = self.doc.pages[3].annots[0] 123 | assert a.pos is not None 124 | o = self.doc.nearest_outline(a.pos) 125 | assert o is not None 126 | self.assertEqual(o.title, 'Case study: CET') 127 | 128 | 129 | class Issue9(ExtractionTestBase): 130 | filename = 'issue9.pdf' 131 | 132 | def test(self) -> None: 133 | self.assertEqual(len(self.annots), 1) 134 | a = self.annots[0] 135 | self.assertEqual(a.gettext(), 'World') 136 | 137 | 138 | class Issue13(ExtractionTestBase): 139 | filename = 'issue13.pdf' 140 | 141 | def test(self) -> None: 142 | self.assertEqual(len(self.annots), 1) 143 | a = self.annots[0] 144 | self.assertEqual(a.gettext(), 'This is a sample statement.') 145 | 146 | 147 | class Issue46(ExtractionTestBase): 148 | filename = 'issue46.pdf' 149 | 150 | def test(self) -> None: 151 | self.assertEqual(len(self.annots), 3) 152 | 153 | self.assertEqual(self.annots[0].subtype, AnnotationType.Highlight) 154 | self.assertEqual(self.annots[0].gettext(), 'C – Curate') 155 | 156 | self.assertEqual(self.annots[1].subtype, AnnotationType.Square) 157 | self.assertEqual(self.annots[1].gettext(), None) 158 | 159 | self.assertEqual(self.annots[2].subtype, AnnotationType.Highlight) 160 | self.assertEqual(self.annots[2].gettext(), 'This was a novel idea at the time') 161 | 162 | 163 | class Issue61(ExtractionTestBase): 164 | filename = 'issue61.pdf' 165 | 166 | def test(self) -> None: 167 | self.assertEqual(len(self.annots), 1) 168 | a = self.annots[0] 169 | self.assertEqual(a.subtype, AnnotationType.Caret) 170 | self.assertEqual(a.contents, 'and machine learning') 171 | self.assertTrue(a.has_context()) 172 | 
173 | 174 | class Pr24(ExtractionTestBase): 175 | filename = 'pr24.pdf' 176 | 177 | def test(self) -> None: 178 | EXPECTED = [ 179 | (AnnotationType.Highlight, 'long highlight', 180 | 'Heading Link to heading that is working with vim-pandoc. Link to heading that'), 181 | (AnnotationType.Highlight, 'short highlight', 'not working'), 182 | (AnnotationType.Text, None, None), 183 | (AnnotationType.Highlight, None, 'Some more text'), 184 | (AnnotationType.Text, 'dual\n\npara note', None), 185 | (AnnotationType.Text, 's', None)] 186 | self.assertEqual(len(self.annots), len(EXPECTED)) 187 | for a, expected in zip(self.annots, EXPECTED): 188 | self.assertEqual((a.subtype, a.contents, a.gettext()), expected) 189 | 190 | 191 | class Landscape2Column(ExtractionTestBase): 192 | filename = 'word2column.pdf' 193 | 194 | def test(self) -> None: 195 | self.assertEqual(len(self.annots), 9) 196 | 197 | a = self.annots[0] 198 | self.assertEqual(a.subtype, AnnotationType.StrikeOut) 199 | self.assertEqual(a.gettext(), 'nostrud exercitation') 200 | self.assertTrue(a.has_context()) 201 | (pre, post) = a.get_context() 202 | self.assertEndsWith( 203 | pre, 'Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor ' 204 | 'incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis ') 205 | self.assertStartsWith( 206 | post, ' ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor') 207 | 208 | a = self.annots[1] 209 | self.assertEqual(a.subtype, AnnotationType.StrikeOut) 210 | self.assertEqual(a.gettext(), 'Duis') 211 | self.assertTrue(a.has_context()) 212 | (pre, post) = a.get_context() 213 | self.assertEndsWith(pre, 'ullamco laboris nisi ut aliquip ex ea commodo consequat. 
') 214 | self.assertStartsWith( 215 | post, ' aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu ' 216 | 'fugiat nulla pariatur.') 217 | 218 | a = self.annots[2] 219 | self.assertEqual(a.subtype, AnnotationType.StrikeOut) 220 | self.assertEqual(a.gettext(), 'laborum') 221 | self.assertTrue(a.has_context()) 222 | (pre, post) = a.get_context() 223 | self.assertEndsWith(pre, ', sunt in culpa qui officia deserunt mollit anim id est ') 224 | self.assertStartsWith(post, '. Heading 2 Sed ut perspiciatis,') 225 | 226 | a = self.annots[3] 227 | self.assertEqual(a.subtype, AnnotationType.Highlight) 228 | self.assertEqual( 229 | a.gettext(), 'At vero eos et accusamus et iusto odio dignissimos ducimus, qui ' 230 | 'blanditiis praesentium voluptatum deleniti atque corrupti,') 231 | self.assertFalse(a.has_context()) 232 | 233 | a = self.annots[4] 234 | self.assertEqual(a.subtype, AnnotationType.Squiggly) 235 | self.assertEqual( 236 | a.gettext(), 'Itaque earum rerum hic tenetur a sapiente delectus, ut aut reiciendis ' 237 | 'voluptatibus maiores alias consequatur aut perferendis doloribus asperiores repellat.') 238 | self.assertEqual(a.contents, 'Nonsense!') 239 | self.assertFalse(a.has_context()) 240 | 241 | a = self.annots[5] 242 | self.assertEqual(a.subtype, AnnotationType.StrikeOut) 243 | self.assertEqual(a.gettext(), 'equal') 244 | self.assertTrue(a.has_context()) 245 | (pre, post) = a.get_context() 246 | self.assertEndsWith(pre, 'the pain and trouble that are bound to ensue; and ') 247 | self.assertStartsWith(post, ' blame belongs to those who fail in their') # end of page 248 | 249 | a = self.annots[6] 250 | self.assertEqual(a.subtype, AnnotationType.StrikeOut) 251 | self.assertEqual(a.gettext(), 'duty') 252 | self.assertTrue(a.has_context()) 253 | (pre, post) = a.get_context() 254 | self.assertEqual(pre, '') # start of page 255 | self.assertStartsWith(post, ' through weakness of will, which') 256 | 257 | a = self.annots[7] 258 | 
self.assertEqual(a.subtype, AnnotationType.StrikeOut) 259 | self.assertEqual(a.gettext(), 'In a free hour,') 260 | self.assertTrue(a.has_context()) 261 | (pre, post) = a.get_context() 262 | self.assertEndsWith(pre, 'These cases are perfectly simple and easy to distinguish. ') 263 | self.assertStartsWith(post, ' when our power of choice is untrammeled and when nothing') 264 | 265 | 266 | class FreeTextAnnotation(ExtractionTestBase): 267 | filename = 'FreeText-annotation.pdf' 268 | 269 | def test(self) -> None: 270 | self.assertEqual(len(self.annots), 1) 271 | self.assertEqual(self.annots[0].subtype, AnnotationType.FreeText) 272 | self.assertEqual(self.annots[0].contents, 'Annotation with subtype "FreeText".') 273 | self.assertEqual(self.annots[0].gettext(), None) 274 | 275 | 276 | class CaretAnnotations(ExtractionTestBase): 277 | filename = 'caret.pdf' 278 | 279 | def test(self) -> None: 280 | self.assertEqual(len(self.annots), 5) 281 | a = self.annots[0] 282 | self.assertEqual(a.subtype, AnnotationType.StrikeOut) 283 | self.assertEqual(a.gettext(), 'Adobe Acrobat Reader') 284 | self.assertTrue(a.is_group_child) 285 | self.assertEqual(a.group_children, []) 286 | g = self.annots[3] 287 | self.assertEqual(g.subtype, AnnotationType.Caret) 288 | self.assertEqual(g.contents, 'Google Chrome') 289 | self.assertFalse(g.is_group_child) 290 | self.assertEqual(g.group_children, [a]) 291 | self.assertEqual(g.get_child_by_type(AnnotationType.StrikeOut), a) 292 | 293 | 294 | class PrinterTestBase(unittest.TestCase): 295 | filename = 'hotos17.pdf' 296 | 297 | def setUp(self) -> None: 298 | path = pathlib.Path(__file__).parent / 'tests' / self.filename 299 | with path.open('rb') as f: 300 | self.doc = pdfannots.process_file(f) 301 | 302 | 303 | class MarkdownPrinterTest(PrinterTestBase): 304 | # There's not a whole lot of value in testing the precise output format, 305 | # but let's make sure we produce a non-trivial result and don't crash. 
306 | def test_flat(self) -> None: 307 | p = MarkdownPrinter(print_filename=True, remove_hyphens=False) 308 | 309 | linecount = 0 310 | charcount = 0 311 | for line in p.print_file('dummyfile', self.doc): 312 | linecount += line.count('\n') 313 | charcount += len(line) 314 | 315 | self.assertGreater(linecount, 5) 316 | self.assertGreater(charcount, 500) 317 | 318 | def test_flat_page_number_offset(self) -> None: 319 | p = MarkdownPrinter(page_number_offset=-1) 320 | 321 | page_numbers = [] 322 | for line in p.print_file('dummyfile', self.doc): 323 | m = re.match(r'.+Page #([0-9])', line) 324 | if m: 325 | page_numbers.append(m[1]) 326 | 327 | self.assertEqual(page_numbers, ['0', '0', '0', '1', '1', '1', '1', '3', '3', '3']) 328 | 329 | def test_grouped(self) -> None: 330 | p = GroupedMarkdownPrinter(wrap_column=80) 331 | 332 | linecount = 0 333 | charcount = 0 334 | for line in p.print_file('dummyfile', self.doc): 335 | linecount += line.count('\n') 336 | charcount += len(line) 337 | 338 | self.assertGreater(linecount, 10) 339 | self.assertGreater(charcount, 900) 340 | 341 | def test_multicolorgrouping(self) -> None: 342 | p = GroupedMarkdownPrinter(group_highlights_by_color=True) 343 | 344 | linecount = 0 345 | charcount = 0 346 | for line in p.print_file('dummyfile', self.doc): 347 | linecount += line.count('\n') 348 | charcount += len(line) 349 | 350 | self.assertGreater(linecount, 10) 351 | self.assertGreater(charcount, 900) 352 | 353 | 354 | class JsonPrinterTest(PrinterTestBase): 355 | def test_flat(self) -> None: 356 | p = JsonPrinter(remove_hyphens=False, output_codec='utf-8') 357 | 358 | j = json.loads( 359 | p.begin() 360 | + functools.reduce(operator.add, p.print_file('dummyfile', self.doc)) 361 | + p.end()) 362 | 363 | self.assertTrue(isinstance(j, list)) 364 | self.assertEqual(len(j), 10) 365 | 366 | 367 | if __name__ == "__main__": 368 | unittest.main() 369 | -------------------------------------------------------------------------------- 
/tests/FreeText-annotation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xabu/pdfannots/08f6f0abdb690fb9d8b6cc3f6d244fd2d80c9af6/tests/FreeText-annotation.pdf -------------------------------------------------------------------------------- /tests/caret.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xabu/pdfannots/08f6f0abdb690fb9d8b6cc3f6d244fd2d80c9af6/tests/caret.pdf -------------------------------------------------------------------------------- /tests/hotos17.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xabu/pdfannots/08f6f0abdb690fb9d8b6cc3f6d244fd2d80c9af6/tests/hotos17.pdf -------------------------------------------------------------------------------- /tests/issue13.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xabu/pdfannots/08f6f0abdb690fb9d8b6cc3f6d244fd2d80c9af6/tests/issue13.pdf -------------------------------------------------------------------------------- /tests/issue46.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xabu/pdfannots/08f6f0abdb690fb9d8b6cc3f6d244fd2d80c9af6/tests/issue46.pdf -------------------------------------------------------------------------------- /tests/issue61.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xabu/pdfannots/08f6f0abdb690fb9d8b6cc3f6d244fd2d80c9af6/tests/issue61.pdf -------------------------------------------------------------------------------- /tests/issue9.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xabu/pdfannots/08f6f0abdb690fb9d8b6cc3f6d244fd2d80c9af6/tests/issue9.pdf 
-------------------------------------------------------------------------------- /tests/pr24.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xabu/pdfannots/08f6f0abdb690fb9d8b6cc3f6d244fd2d80c9af6/tests/pr24.pdf -------------------------------------------------------------------------------- /tests/word2column.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xabu/pdfannots/08f6f0abdb690fb9d8b6cc3f6d244fd2d80c9af6/tests/word2column.pdf --------------------------------------------------------------------------------