├── setup.cfg ├── LICENSE ├── setup.py ├── .gitignore ├── test_pdf_bookmark.py ├── README.md └── pdf_bookmark.py /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Xianghu Zhao 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
#!/usr/bin/env python

'''
The setup file for pdf-bookmark package
'''

import io
import os
import re

# distutils is deprecated (PEP 632) and no longer part of the standard
# library on Python 3.12+; ``entry_points`` and ``tests_require`` are
# setuptools features anyway, so import ``setup`` from setuptools directly.
from setuptools import setup

HERE = os.path.abspath(os.path.dirname(__file__))


def _read(name):
    '''
    Return the text of the file ``name`` located next to this script.

    The file is always decoded as UTF-8: README.md contains non-ASCII
    characters, and relying on the platform default encoding makes the
    installation fail on systems whose locale is not UTF-8.
    '''
    with io.open(os.path.join(HERE, name), encoding='utf-8') as file_obj:
        return file_obj.read()


def find_version():
    '''
    Find version

    Extract the value of the module-level ``VERSION`` assignment from
    ``pdf_bookmark.py`` without importing the module.

    Raises RuntimeError when no such assignment is found.
    '''
    version_match = re.search(
        r"^VERSION\s*=\s*['\"]([^'\"]*)['\"]", _read('pdf_bookmark.py'), re.M)
    if version_match:
        return version_match.group(1)
    raise RuntimeError("Unable to find version string.")


def find_long_description():
    '''
    Find long description

    Return the content of README.md, used as the package long description.
    '''
    return _read('README.md')


setup(
    name='pdf-bookmark',
    version=find_version(),
    description='PDF Bookmark Import and Export',
    long_description=find_long_description(),
    long_description_content_type='text/markdown',
    author='Xianghu Zhao',
    author_email='xianghuzhao@gmail.com',
    url='https://github.com/xianghuzhao/pdf-bookmark',
    license='MIT',

    py_modules=['pdf_bookmark'],
    tests_require=['pytest'],
    entry_points={'console_scripts': ['pdf-bookmark = pdf_bookmark:main']},
)
22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 
# pylint: disable=missing-docstring

# Every table below is fed to its test through ``pytest.mark.parametrize``
# so that each entry is collected and reported as an individual test case.
# With a plain ``for`` loop inside one test function the first failing entry
# aborts the loop and hides the result of all remaining entries.

import pytest

from pdf_bookmark import InvalidRomanNumeralError
from pdf_bookmark import RomanOutOfRangeError
from pdf_bookmark import roman_to_arabic
from pdf_bookmark import arabic_to_roman

from pdf_bookmark import InvalidLettersNumeralError
from pdf_bookmark import letters_to_arabic
from pdf_bookmark import arabic_to_letters


# Strings that must be rejected by roman_to_arabic
INVALID_ROMAN = (
    '',
    'ii',
    'IIIII',
    'ID',
    'XM',
    '12345',
    'jflaiffj',
    '+=_-&^%#!$%#*&)~`,.><',
)

# Integers that must be rejected by arabic_to_roman
INVALID_ROMAN_VALUE = (
    -100000,
    -1,
    5000,
    5001,
    5002,
    10000,
)

# (arabic, roman) pairs that must convert in both directions
ROMAN_PAIRS = (
    (0, 'N'),
    (1, 'I'),
    (2, 'II'),
    (3, 'III'),
    (4, 'IV'),
    (5, 'V'),
    (9, 'IX'),
    (12, 'XII'),
    (16, 'XVI'),
    (29, 'XXIX'),
    (44, 'XLIV'),
    (45, 'XLV'),
    (68, 'LXVIII'),
    (83, 'LXXXIII'),
    (97, 'XCVII'),
    (99, 'XCIX'),
    (400, 'CD'),
    (500, 'D'),
    (501, 'DI'),
    (649, 'DCXLIX'),
    (798, 'DCCXCVIII'),
    (891, 'DCCCXCI'),
    (1000, 'M'),
    (1004, 'MIV'),
    (1006, 'MVI'),
    (1023, 'MXXIII'),
    (2014, 'MMXIV'),
    (3999, 'MMMCMXCIX'),
    (4999, 'MMMMCMXCIX'),
)


@pytest.mark.parametrize('roman', INVALID_ROMAN)
def test_invalid_roman(roman):
    with pytest.raises(InvalidRomanNumeralError):
        roman_to_arabic(roman)


@pytest.mark.parametrize('arabic, roman', ROMAN_PAIRS)
def test_roman_to_arabic(arabic, roman):
    assert roman_to_arabic(roman) == arabic


@pytest.mark.parametrize('arabic', INVALID_ROMAN_VALUE)
def test_out_of_range_roman(arabic):
    with pytest.raises(RomanOutOfRangeError):
        arabic_to_roman(arabic)


@pytest.mark.parametrize('arabic, roman', ROMAN_PAIRS)
def test_arabic_to_roman(arabic, roman):
    assert arabic_to_roman(arabic) == roman


# Strings that must be rejected by letters_to_arabic
INVALID_LETTERS = (
    '0',
    '0342',
    'a',
    'ABC',
    'AAAAAA8',
    '9BBBB',
    '&*-+#',
    '12345',
    'jflaiffj',
    '+=_-&^%#!$%#*&)~`,.><',
)

# (arabic, letters) pairs that must convert in both directions
LETTERS_PAIRS = (
    (0, ''),
    (1, 'A'),
    (2, 'B'),
    (3, 'C'),
    (8, 'H'),
    (26, 'Z'),
    (27, 'AA'),
    (52, 'ZZ'),
    (106, 'BBBBB'),
)


@pytest.mark.parametrize('letters', INVALID_LETTERS)
def test_invalid_letter(letters):
    with pytest.raises(InvalidLettersNumeralError):
        letters_to_arabic(letters)


@pytest.mark.parametrize('arabic, letters', LETTERS_PAIRS)
def test_letters_to_arabic(arabic, letters):
    assert letters_to_arabic(letters) == arabic


@pytest.mark.parametrize('arabic, letters', LETTERS_PAIRS)
def test_arabic_to_letters(arabic, letters):
    assert arabic_to_letters(arabic) == letters
7 | 8 | 9 | ## Installation 10 | 11 | ```shell 12 | $ pip install pdf-bookmark 13 | ``` 14 | 15 | `pdf-bookmark` utilizes 16 | [PDFtk](https://www.pdflabs.com/tools/pdftk-server/) 17 | and [Ghostscript](https://www.ghostscript.com/) 18 | to export and import pdf bookmarks. 19 | They must be installed before running `pdf-bookmark`. 20 | 21 | 22 | ### PDFtk 23 | 24 | PDFtk is used here to export bookmark from pdf file. 25 | The java port [pdftk-java](https://gitlab.com/pdftk-java/pdftk) 26 | may also be OK. 27 | 28 | On Arch Linux, `pdftk-java` could be installed by: 29 | 30 | ```shell 31 | $ sudo pacman -S pdftk java-commons-lang 32 | ``` 33 | 34 | Verify the installation: 35 | 36 | ```shell 37 | $ pdftk --version 38 | ``` 39 | 40 | 41 | ### Ghostscript 42 | 43 | Ghostscript is used here to import bookmark to pdf file. 44 | 45 | On Arch Linux, it could be installed by: 46 | 47 | ```shell 48 | $ sudo pacman -S ghostscript 49 | ``` 50 | 51 | Verify the installation: 52 | 53 | ```shell 54 | $ gs --version 55 | ``` 56 | 57 | 58 | ## bmk format 59 | 60 | The `bmk` format is used to describe the bookmark of a pdf file. 61 | It will be used to import bookmark into a pdf file. 62 | 63 | `bmk` format is easy to write. 64 | It looks quite like the content of a book. 65 | So you can copy the content and modify from it. 66 | 67 | Each line represents a bookmark item. The title and the page number are 68 | separated by at least 4 dots "`.`". 69 | 70 | The level of a bookmark is specified by the indentation of spaces. 71 | The default indentation is 2 spaces, and the number of spaces could be 72 | configured with inline command. 73 | 74 | This is a simple example of a `bmk` file. 
75 | 76 | ``` 77 | 序................1 78 | Chapter 1................4 79 | Chapter 2................5 80 | 2.1 Section 1................6 81 | 2.1.1 SubSection 1................6 82 | 2.1.2 SubSection 2................8 83 | 2.2 Section 2................12 84 | Chapter 3................20 85 | Appendix................36 86 | ``` 87 | 88 | Import the bookmark and create a new pdf file: 89 | 90 | ```shell 91 | $ pdf-bookmark -p input.pdf -b bookmark.bmk -o new.pdf 92 | ``` 93 | 94 | 95 | ### Export bmk format 96 | 97 | The `bmk` format could also be exported from a pdf file with bookmark. 98 | You may also modify the bookmark from the exported one. 99 | 100 | ```shell 101 | $ pdf-bookmark -p input.pdf 102 | ``` 103 | 104 | 105 | ### Inline command 106 | 107 | There could also be inline commands in the file to do more controls 108 | on the bookmark. These commands start with `!!!` and modify some 109 | properties of bookmark. The new property will affect bookmarks after 110 | the line until it is changed again. 111 | 112 | It is normal that the main body of a pdf file does not start from the 113 | first page of pdf, and the page number is not always arabic. 114 | 115 | ``` 116 | !!! collapse_level = 2 117 | 118 | !!! num_style = Roman 119 | Preface................I 120 | Content................IV 121 | 122 | !!! new_index = 12 123 | !!! num_style = Arabic 124 | Introduction................1 125 | Chapter 1................4 126 | Chapter 2................5 127 | 2.1 Section 1................6 128 | 2.2 Section 2................7 129 | Chapter 3................10 130 | Appendix................11 131 | ``` 132 | 133 | With these inline commands, you do not need to recalculate the index 134 | number for each page. 135 | 136 | Here are all supported inline commands: 137 | 138 | * `new_index`. Default: 1. 139 | The following bookmark index will be recalculated from the 140 | new index number (`new_index + page - 1`). 141 | * `num_start`. Default: 1. 
142 | Specify the number of first page if it does not start from 1 143 | (`new_index + page - num_start`). 144 | * `num_style`. Default: `Arabic`. 145 | The page number style. Could be `Arabic`, `Roman` and `Letters`. 146 | * `collapse_level`. Default: 0. 147 | On which level the bookmarks are collapsed. 0 means expand all. 148 | * `level_indent`. Default: 2. 149 | Number of indentation spaces for a new level. 150 | 151 | 152 | ## pdf-bookmark command 153 | 154 | The `pdf-bookmark` command is installed by `pip install pdf-bookmark`. 155 | 156 | ``` 157 | usage: pdf-bookmark [-h] [-f {bmk,none,pdftk,pdfmark,json}] 158 | [-l COLLAPSE_LEVEL] [-b BOOKMARK] [-p PDF] [-o OUTPUT_PDF] 159 | 160 | Import and export PDF bookmark 161 | 162 | optional arguments: 163 | -h, --help show this help message and exit 164 | -f {bmk,none,pdftk,pdfmark,json}, --format {bmk,none,pdftk,pdfmark,json} 165 | the output format of bookmark 166 | -l COLLAPSE_LEVEL, --collapse-level COLLAPSE_LEVEL 167 | the min level to be collapsed, 0 to expand all 168 | -b BOOKMARK, --bookmark BOOKMARK 169 | the bookmark file to be imported 170 | -p PDF, --pdf PDF the input PDF file 171 | -o OUTPUT_PDF, --output-pdf OUTPUT_PDF 172 | the output PDF file 173 | ``` 174 | 175 | 176 | ## Example 177 | 178 | ### Import bookmark 179 | 180 | This will import the `bmk` bookmark into a pdf file: 181 | 182 | ```shell 183 | $ pdf-bookmark -p input.pdf -b bookmark.bmk -o new.pdf 184 | ``` 185 | 186 | If you would like quiet output: 187 | 188 | ```shell 189 | $ pdf-bookmark -p input.pdf -b bookmark.bmk -f none -o new.pdf 190 | ``` 191 | 192 | ### Export bookmark 193 | 194 | This will export the `bmk` bookmark to stdout from a pdf file: 195 | 196 | ```shell 197 | $ pdf-bookmark -p input.pdf 198 | ``` 199 | 200 | The output format could be changed to `pdfmark`, `json`: 201 | 202 | ```shell 203 | $ pdf-bookmark -p input.pdf -f pdfmark 204 | $ pdf-bookmark -p input.pdf -f json 205 | ``` 206 | 207 | ### Change the
#!/usr/bin/env python

# pylint: disable=invalid-name

'''
Import and export PDF bookmark
'''

import os
import sys
import subprocess
import re
import argparse
import json
import tempfile
import codecs


VERSION = '1.1.0'


# Page label style names as they appear in the pdftk dump, mapped to the
# three numbering styles understood by the bmk format.
_NUM_STYLE_MAP = {
    'DecimalArabicNumerals': 'Arabic',
    'UppercaseRomanNumerals': 'Roman',
    'LowercaseRomanNumerals': 'Roman',
    'UppercaseLetters': 'Letters',
    'LowercaseLetters': 'Letters',
}


# Roman digits ordered from the largest to the smallest value; the order is
# what makes the greedy conversion in arabic_to_roman work.
_ROMAN_NUMERAL_PAIR = (
    ('M', 1000),
    ('CM', 900),
    ('D', 500),
    ('CD', 400),
    ('C', 100),
    ('XC', 90),
    ('L', 50),
    ('XL', 40),
    ('X', 10),
    ('IX', 9),
    ('V', 5),
    ('IV', 4),
    ('I', 1),
)

_ROMAN_NUMERAL_MAP = {pair[0]: pair[1] for pair in _ROMAN_NUMERAL_PAIR}

_ROMAN_NUMERAL_PATTERN = re.compile(
    '^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$'
)


# How records of the pdftk dump are recognised and converted:
#   prefix  - common prefix of the dump keys belonging to the record type
#   fields  - dump key (without prefix) -> key used in the bookmark dict
#   handler - bookmark dict key -> converter applied to the raw string value
_BOOKMARK_DESCRIPTION = {
    'bookmark': {
        'prefix': 'Bookmark',
        'fields': {
            'Title': 'title',
            'Level': 'level',
            'PageNumber': 'page',
        },
        'handler': {
            'title': lambda s: _unicode_replace(s) if _UNICODE_REGEXP.search(s) else s,
            'level': int,
            'page': int,
        },
    },
    'page_label': {
        'prefix': 'PageLabel',
        'fields': {
            'NewIndex': 'new_index',
            'Start': 'num_start',
            'NumStyle': 'num_style',
        },
        'handler': {
            'new_index': int,
            'num_start': int,
            'num_style': lambda s: _NUM_STYLE_MAP.get(s, 'Arabic'),
        },
    },
}


# Numeric character reference such as "&#24207;"
_UNICODE_REGEXP = re.compile('&#([0-9]+);')


# Minimum number of consecutive dots separating title and page in a bmk line
_CONTENT_MINIMUM_DOTS = 4

# Spaces per nesting level. import_bmk starts with this value and export_bmk
# writes with it, so an exported file can be imported again unchanged.
_DEFAULT_LEVEL_INDENT = 2

# Inline commands (besides new_index) whose value must be an integer
_INT_COMMANDS = ('num_start', 'collapse_level', 'level_indent')


class CommandError(Exception):
    '''Run command error'''


class InvalidBookmarkSyntaxError(Exception):
    '''Invalid bookmark syntax'''


class InvalidNumeralError(ValueError):
    '''Invalid numeral expression'''


class InvalidRomanNumeralError(InvalidNumeralError):
    '''Invalid roman numeral expression'''


class RomanOutOfRangeError(Exception):
    '''The roman number is out of range'''


class InvalidLettersNumeralError(InvalidNumeralError):
    '''Invalid letters numeral expression'''


class LettersOutOfRangeError(Exception):
    '''The letters number is out of range'''


class PdfMarkError(Exception):
    '''Error dealing with pdfmark'''


def echo(s, nl=True, err=False):
    '''
    Print to stdout

    Write ``s`` to stdout, or to stderr when ``err`` is true, append a
    newline unless ``nl`` is false, and flush the stream.
    '''
    out = sys.stderr if err else sys.stdout

    out.write(s)
    if nl:
        out.write('\n')
    out.flush()


def roman_to_arabic(roman):
    '''
    Convert roman to arabic

    ``roman`` must be an upper case roman numeral in [0, 5000); 'N' stands
    for zero. Raises InvalidRomanNumeralError for anything else.
    '''
    if not roman:
        raise InvalidRomanNumeralError('No input found')

    if roman == 'N':
        return 0

    # fullmatch, not match: "$" also matches just before a trailing newline,
    # so match() would let 'IV\n' through and the lookup below would fail
    # with a KeyError instead of the documented exception.
    if not _ROMAN_NUMERAL_PATTERN.fullmatch(roman):
        raise InvalidRomanNumeralError(
            'Invalid Roman numeral: {}'.format(roman))

    values = [_ROMAN_NUMERAL_MAP[digit] for digit in roman]

    arabic = 0
    # A digit smaller than its right-hand neighbour is subtractive (the I in
    # IV). The last digit is paired with 0 so that it is always added.
    for value, following in zip(values, values[1:] + [0]):
        if value >= following:
            arabic += value
        else:
            arabic -= value

    return arabic


def arabic_to_roman(arabic):
    '''
    Convert arabic to roman

    Raises RomanOutOfRangeError when ``arabic`` is not in [0, 5000).
    Zero is written as 'N'.
    '''
    if arabic < 0 or arabic > 4999:
        raise RomanOutOfRangeError('Roman numeral must in [0, 5000)')

    if arabic == 0:
        return 'N'

    digits = []
    remain = arabic
    for digit, unit in _ROMAN_NUMERAL_PAIR:
        digit_num, remain = divmod(remain, unit)
        digits.append(digit*digit_num)

    return ''.join(digits)


def letters_to_arabic(letters):
    '''
    Convert letters to arabic

    The letters style counts A..Z, AA..ZZ, AAA..ZZZ and so on: one capital
    letter repeated. The empty string is 0. Raises
    InvalidLettersNumeralError for any other input.
    '''
    if not letters:
        return 0

    letter = letters[0]
    if ord(letter) < ord('A') or ord(letter) > ord('Z'):
        raise InvalidLettersNumeralError('Must be capital letter')

    if any(digit != letter for digit in letters[1:]):
        raise InvalidLettersNumeralError('Letters are not identical')

    return len(letters)*26 - 25 + ord(letter) - ord('A')


def arabic_to_letters(arabic):
    '''
    Convert arabic to letters

    Inverse of letters_to_arabic. Raises LettersOutOfRangeError for
    negative numbers.
    '''
    if arabic < 0:
        raise LettersOutOfRangeError('Letters numeral must >= 0')

    if arabic == 0:
        return ''

    return chr(((arabic-1) % 26) + ord('A')) * ((arabic+25) // 26)


def _unicode_replace_match(match):
    # Turn one "&#NNN;" match into the character with that code point
    return chr(int(match.group(1)))


def _unicode_replace(string):
    # Replace every "&#NNN;" reference in the string by its character
    return _UNICODE_REGEXP.sub(_unicode_replace_match, string)


def call(cmd, encoding=None):
    '''
    Run command

    Run ``cmd`` (a list of program and arguments) and return its stdout
    decoded with ``encoding`` (utf-8 when None).

    Raises CommandError when the program is not found or exits with a
    non-zero status; the message then contains the program's stderr.
    '''
    if encoding is None:
        encoding = 'utf-8'

    try:
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
    except FileNotFoundError as error:
        raise CommandError(
            'Command not installed: {}'.format(cmd[0])) from error

    out, err = p.communicate()
    status = p.returncode

    if status != 0:
        # Decode leniently: stderr is only used for the message, and a byte
        # that cannot be decoded must not turn the CommandError into a
        # UnicodeDecodeError.
        raise CommandError(
            'Invoke command {} failed with exit code {}:\n {}'.format(
                cmd, status, err.decode(encoding, errors='replace')))

    if encoding:
        out = out.decode(encoding)
    return out


def import_pdftk(data, collapse_level=0):
    '''
    Convert pdftk output to bookmark

    ``data`` is the text printed by ``pdftk <file> dump_data``. The result
    is a dict with the lists 'bookmark' and 'page_label'. A record is
    appended to its list as soon as all of its fields have been read.

    Every record gets a 'collapse' flag. It is true for bookmarks whose
    level is at least ``collapse_level`` (unless that is 0) and always
    false for page labels.
    '''
    bookmarks = {bm_type: [] for bm_type in _BOOKMARK_DESCRIPTION}
    # The record of each type that is currently being assembled
    pending = {bm_type: {} for bm_type in _BOOKMARK_DESCRIPTION}

    for line in data.splitlines():
        key, separator, value = line.partition(': ')
        if not separator:  # e.g. line == 'InfoBegin'
            continue

        for bm_type, bm_detail in _BOOKMARK_DESCRIPTION.items():
            prefix = bm_detail['prefix']
            if not key.startswith(prefix):
                continue

            field = bm_detail['fields'].get(key[len(prefix):])
            if field is None:
                continue

            handler = bm_detail['handler'].get(field)
            info = pending[bm_type]
            info[field] = handler(value) if handler else value

            if any(name not in info for name in bm_detail['fields'].values()):
                continue

            # Page labels have no 'level'. Looking it up unconditionally
            # raised KeyError for every PDF with page labels as soon as a
            # collapse level was requested.
            info['collapse'] = collapse_level != 0 and \
                'level' in info and info['level'] >= collapse_level

            bookmarks[bm_type].append(info)
            pending[bm_type] = {}

    return bookmarks


def export_bmk(bookmarks):
    '''
    Export to bookmark format

    Return the bmk text for ``bookmarks`` (a dict with the lists
    'bookmark' and 'page_label'). Inline commands are written wherever the
    page label or the collapse state changes. Nesting is written with the
    default indentation, so import_bmk reads the text back unchanged.
    '''
    output = ['!!! # Generated bmk file\n']

    page_labels = bookmarks['page_label']

    current_page_label_index = -1

    current_collapse_level = 0

    for bm in bookmarks['bookmark']:
        # The page label in effect is the last one starting at or before
        # the bookmark's page.
        page_label_index = -1
        for i, pl in enumerate(page_labels):
            if bm['page'] >= pl['new_index']:
                page_label_index = i

        page = bm['page']
        if page_label_index >= 0:
            page_label = page_labels[page_label_index]

            if page_label_index != current_page_label_index:
                output.append('\n')
                for k in ['new_index', 'num_start', 'num_style']:
                    output.append('!!! {} = {}\n'.format(k, page_label[k]))
                output.append('\n')

                current_page_label_index = page_label_index

            page = bm['page'] - page_label['new_index'] + \
                page_label['num_start']

            if page_label['num_style'] == 'Roman':
                page = arabic_to_roman(page)
            elif page_label['num_style'] == 'Letters':
                page = arabic_to_letters(page)

        # True when the commands written so far leave this level expanded.
        # A new command is needed exactly when the bookmark is collapsed but
        # would be read as expanded, or the other way round.
        reads_expanded = current_collapse_level == 0 or \
            current_collapse_level > bm['level']
        if bm['collapse'] == reads_expanded:
            current_collapse_level = bm['level'] if bm['collapse'] else 0
            output.append('!!! collapse_level = {}\n'.format(
                current_collapse_level))

        output.append('{}{}................{}\n'.format(
            ' '*(_DEFAULT_LEVEL_INDENT*(bm['level']-1)), bm['title'], page))

    return ''.join(output)


def _parse_bookmark_command(line):
    # Split a "!!! key = value" line into (key, value).
    # A "!!! # ..." comment line gives ('', '').
    command = line[3:]
    if command.lstrip().startswith('#'):
        return '', ''

    k, separator, v = command.partition('=')
    if not separator:
        raise InvalidBookmarkSyntaxError('Invalid syntax: {}'.format(line))

    return k.strip(), v.strip()


def _parse_command_int(key, value):
    # Integer value of an inline command. A malformed value is reported as
    # a syntax error of the bmk file, like a malformed page number.
    try:
        return int(value)
    except ValueError:
        raise InvalidBookmarkSyntaxError(
            'Invalid value for {}: {}'.format(key, value))


def _parse_level(line, level_indent):
    # Return (level, rest of line). The level is 1 for an unindented line
    # and grows by one for every level_indent leading spaces.
    space_count = len(line) - len(line.lstrip(' '))

    if space_count % level_indent != 0:
        raise InvalidBookmarkSyntaxError(
            'Level indentation error: {}'.format(line))

    return space_count // level_indent + 1, line[space_count:]


def _split_title_page(title_page):
    # Split "title......page" at the first run of at least
    # _CONTENT_MINIMUM_DOTS dots; the whole run belongs to the separator.
    start_pos = title_page.find('.'*_CONTENT_MINIMUM_DOTS)
    if start_pos < 0:
        raise InvalidBookmarkSyntaxError(
            'There must be at least {} "." specified in the line "{}"'.format(_CONTENT_MINIMUM_DOTS,title_page))

    title = title_page[:start_pos]
    page = title_page[start_pos:].lstrip('.')

    return title.strip(), page.strip()


def import_bmk(bookmark_data, collapse_level=0):
    '''
    Import bookmark format

    Parse bmk text and return a dict with the lists 'bookmark' (dicts with
    'level', 'title', 'page', 'collapse') and 'page_label' (dicts with
    'new_index', 'num_start', 'num_style'). ``collapse_level`` is the
    initial collapse level; the file may change it with an inline command.

    Raises InvalidBookmarkSyntaxError for malformed lines, page numbers or
    command values.
    '''
    bookmarks = {}
    bookmarks['bookmark'] = []
    bookmarks['page_label'] = []

    page_config = {
        'new_index': 1,
        'num_start': 1,
        'num_style': 'Arabic',
        'collapse_level': collapse_level,
        'level_indent': _DEFAULT_LEVEL_INDENT,
    }

    # A page label is stored lazily, together with the first bookmark that
    # follows a "new_index" command, so that the num_start and num_style
    # commands written after new_index are part of it.
    page_label_saved = False

    for line in bookmark_data.splitlines():
        if not line.strip():
            continue

        if line.startswith('!!!'):
            k, v = _parse_bookmark_command(line)
            if not k:
                continue
            if k == 'new_index':
                page_config[k] = _parse_command_int(k, v)
                # A new index starts a new page label with default settings
                page_config['num_start'] = 1
                page_config['num_style'] = 'Arabic'
                page_label_saved = False
            elif k in _INT_COMMANDS:
                number = _parse_command_int(k, v)
                if k == 'level_indent' and number < 1:
                    # 0 would end in a ZeroDivisionError in _parse_level
                    raise InvalidBookmarkSyntaxError(
                        'level_indent must be at least 1: {}'.format(line))
                page_config[k] = number
            else:
                page_config[k] = v
            continue

        if not page_label_saved:
            bookmarks['page_label'].append({
                kk: page_config[kk]
                for kk in ('new_index', 'num_start', 'num_style')})
            page_label_saved = True

        level, title_page = _parse_level(line, page_config['level_indent'])

        title, page = _split_title_page(title_page)

        try:
            if page_config['num_style'] == 'Roman':
                page = roman_to_arabic(page.upper())
            elif page_config['num_style'] == 'Letters':
                page = letters_to_arabic(page.upper())
            else:
                page = int(page)
        except ValueError:
            raise InvalidBookmarkSyntaxError(
                'Page number invalid: {}'.format(page))

        # From the number printed on the page to the index in the pdf file
        page = page - page_config['num_start'] + page_config['new_index']

        collapse = page_config['collapse_level'] != 0 and \
            level >= page_config['collapse_level']

        bookmarks['bookmark'].append({
            'level': level,
            'title': title,
            'page': page,
            'collapse': collapse,
        })

    return bookmarks


def _pdfmark_unicode(string):
    r"""
    Encode a string as a pdfmark string object.

    ASCII text becomes a literal string in parentheses with the special
    characters escaped, anything else a hex string of the UTF-16BE bytes
    preceded by the byte order mark.

    >>> _pdfmark_unicode('ascii text with ) paren')
    '(ascii text with \\) paren)'
    >>> _pdfmark_unicode('\u03b1\u03b2\u03b3')
    '<FEFF03B103B203B3>'
    """
    try:
        string.encode('ascii')
    except UnicodeEncodeError:
        b = codecs.BOM_UTF16_BE + string.encode('utf-16-be')
        return '<{}>'.format(''.join('{:02X}'.format(byte) for byte in b))
    else:
        # escape special characters; the backslash has to come first so the
        # backslashes added for the other characters are not doubled
        for a, b in [('\\', '\\\\'), ('(', '\\('), (')', '\\)'),
                     ('\n', '\\n'), ('\t', '\\t')]:
            string = string.replace(a, b)
        return '({})'.format(string)


def _pdfmark_unicode_decode(string):
    r"""
    Decode a hex string written by _pdfmark_unicode.

    Raises PdfMarkError when the string is not of the form '<FEFF...>' and
    ValueError when the part after the byte order mark is not valid hex or
    not valid UTF-16BE.

    >>> _pdfmark_unicode_decode(_pdfmark_unicode('\u03b1\u03b2\u03b3')) == '\u03b1\u03b2\u03b3'
    True
    """
    if not (string.startswith('<FEFF') and string.endswith('>')):
        raise PdfMarkError(
            'Not a UTF-16BE pdfmark hex string: {}'.format(string))

    # Skip '<FEFF' at the start and '>' at the end
    return bytes.fromhex(string[5:-1]).decode('utf-16-be')


def export_pdfmark(bookmarks):
    '''
    Convert bookmark to pdfmark

    Return one "[ ... /OUT pdfmark" line per bookmark. A bookmark with
    children gets a /Count entry with the number of its direct children,
    negative when the bookmark is collapsed.
    '''
    items = bookmarks['bookmark']
    lines = []

    for i, bm in enumerate(items):
        count = 0
        for bmk in items[i+1:]:
            # The subtree of bm ends at the first bookmark that is not
            # deeper than bm. Stopping only at an equal level would walk
            # past a shallower bookmark into an unrelated subtree.
            if bmk['level'] <= bm['level']:
                break
            if bmk['level'] == bm['level'] + 1:
                count += 1

        line = '['
        if count:
            sign = '-' if bm.get('collapse') else ''
            line += '/Count {}{} '.format(sign, count)

        line += '/Title {} /Page {} '.format(
            _pdfmark_unicode(bm['title']), bm['page'])

        line += '/OUT pdfmark\n'
        lines.append(line)

    return ''.join(lines)


def _write_temp_file(prefix, content):
    # Write the bytes to a new temporary file and return its name.
    # The caller is responsible for removing the file.
    fd, filename = tempfile.mkstemp(prefix=prefix, text=True)
    try:
        os.write(fd, content)
    finally:
        os.close(fd)
    return filename


def _write_pdfmark_noop_file():
    # By default, Ghostscript will preserve pdfmarks from the sources PDFs
    # Make `[... /OUT pdfmark` a no-op.
    return _write_temp_file('pdfmark-noop-', b"""
% store the original pdfmark
/originalpdfmark { //pdfmark } bind def

% replace pdfmark with a wrapper that ignores OUT
/pdfmark
{
  { % begin loop

      { counttomark pop }
    stopped
      { /pdfmark errordict /unmatchedmark get exec stop }
    if

    dup type /nametype ne
      { /pdfmark errordict /typecheck get exec stop }
    if

    dup /OUT eq
      { (Skipping OUT pdfmark\n) print cleartomark exit }
    if

    originalpdfmark exit

  } loop
} def
""")


def _write_pdfmark_restore_file():
    # Restore the default `[... /Out pdfmark` behaviour
    return _write_temp_file(
        'pdfmark-restore-', b'/pdfmark { originalpdfmark } bind def\n')


def _write_pdfmark_pagemode():
    # pdfmark that makes the viewer show the outline when opening the file
    return _write_temp_file(
        'pagemode_',
        b'[/PageMode /UseOutlines /View [/FitH] /Page 1 /DOCVIEW pdfmark\n')


def generate_pdf(pdfmark, pdf, output_pdf):
    '''
    Generate pdf from pdfmark and pdf file

    Run Ghostscript on ``pdf`` with the bookmarks already in the file
    ignored and the ones in ``pdfmark`` added, writing ``output_pdf``.
    The temporary files are removed even when Ghostscript fails.

    Raises CommandError when Ghostscript is missing or fails.
    '''
    temp_files = []
    try:
        pdfmark_file = _write_temp_file('pdfmark_', pdfmark.encode('ascii'))
        temp_files.append(pdfmark_file)

        pdfmark_noop = _write_pdfmark_noop_file()
        temp_files.append(pdfmark_noop)
        pdfmark_restore = _write_pdfmark_restore_file()
        temp_files.append(pdfmark_restore)
        pdfmark_pagemode = _write_pdfmark_pagemode()
        temp_files.append(pdfmark_pagemode)

        # The order of the files matters: the no-op wrapper is active only
        # while the source pdf is read.
        call(['gs', '-dBATCH', '-dNOPAUSE', '-sDEVICE=pdfwrite', '-dAutoRotatePages=/None',
              '-sOutputFile={}'.format(output_pdf),
              pdfmark_noop,
              pdf,
              pdfmark_restore,
              pdfmark_file,
              pdfmark_pagemode])
    finally:
        for temp_file in temp_files:
            os.remove(temp_file)


def main():
    '''
    The main process

    Parse the command line, read the bookmarks from the bmk file or from
    the pdf file, print them in the requested format and write the output
    pdf when one is requested. Return the exit status.
    '''
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        '-f', '--format', default='bmk',
        choices=['bmk', 'none', 'pdftk', 'pdfmark', 'json'],
        help='the output format of bookmark')
    parser.add_argument(
        '-l', '--collapse-level', default=0, type=int,
        help='the min level to be collapsed, 0 to expand all')
    parser.add_argument(
        '-b', '--bookmark', help='the bookmark file to be imported')
    parser.add_argument(
        '-p', '--pdf', help='the input PDF file')
    parser.add_argument(
        '-o', '--output-pdf', help='the output PDF file')
    parser.add_argument(
        '-v', '--version', action='store_true', help='version of pdf-bookmark')

    args = parser.parse_args()

    if args.version:
        echo('pdf-bookmark version {}'.format(VERSION))
        return 0

    # There must be a source of bookmarks, and an output pdf can only be
    # written from an input pdf.
    if args.pdf is None and \
            (args.bookmark is None or args.output_pdf is not None):
        parser.print_help(sys.stderr)
        return 1

    if args.bookmark is not None:
        with open(args.bookmark) as f:
            bookmarks = import_bmk(f.read(), args.collapse_level)
    else:
        pdftk_data = call(['pdftk', args.pdf, 'dump_data'], 'ascii')

        if args.format == 'pdftk':
            echo(pdftk_data, nl=False)

        bookmarks = import_pdftk(pdftk_data, args.collapse_level)

    # The check above guarantees an input pdf whenever an output pdf is set
    if args.format == 'pdfmark' or args.output_pdf is not None:
        pdfmark = export_pdfmark(bookmarks)

    if args.format == 'json':
        echo(json.dumps(bookmarks))
    elif args.format == 'bmk':
        echo(export_bmk(bookmarks), nl=False)
    elif args.format == 'pdfmark':
        echo(pdfmark, nl=False)

    if args.output_pdf is not None:
        generate_pdf(pdfmark, args.pdf, args.output_pdf)

    return 0


if __name__ == '__main__':
    sys.exit(main())