├── setup.cfg
├── LICENSE
├── setup.py
├── .gitignore
├── test_pdf_bookmark.py
├── README.md
└── pdf_bookmark.py


/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | universal=1
3 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Xianghu Zhao
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | '''
 4 | The setup file for pdf-bookmark package
 5 | '''
 6 | 
 7 | from distutils.core import setup
 8 | import os
 9 | import re
10 | import setuptools   # pylint: disable=unused-import
11 | 
12 | HERE = os.path.abspath(os.path.dirname(__file__))
13 | 
14 | 
def find_version():
    '''Return the version string declared in pdf_bookmark.py.

    The module is scanned as text for a line of the form
    ``VERSION = '...'``; RuntimeError is raised when none is present.
    '''
    module_path = os.path.join(HERE, 'pdf_bookmark.py')
    with open(module_path) as module_file:
        source = module_file.read()

    pattern = r"^VERSION\s*=\s*['\"]([^'\"]*)['\"]"
    found = re.search(pattern, source, re.M)
    if not found:
        raise RuntimeError("Unable to find version string.")
    return found.group(1)
25 | 
26 | 
def find_long_description():
    '''Return the text of README.md for use as the long description.

    The README contains non-ASCII characters, so it is decoded explicitly
    as UTF-8 instead of relying on the locale's default encoding.
    '''
    # Local import: io.open accepts ``encoding`` on both Python 2 and 3.
    import io

    with io.open(os.path.join(HERE, 'README.md'), encoding='utf-8') as file_ld:
        return file_ld.read()
31 | 
32 | 
# Package metadata.  The version and the long description are read from
# the source tree each time this script runs.
setup(
    name='pdf-bookmark',
    version=find_version(),
    description='PDF Bookmark Import and Export',
    long_description=find_long_description(),
    long_description_content_type='text/markdown',
    author='Xianghu Zhao',
    author_email='xianghuzhao@gmail.com',
    url='https://github.com/xianghuzhao/pdf-bookmark',
    license='MIT',

    # Single-module distribution: everything lives in pdf_bookmark.py.
    py_modules=['pdf_bookmark'],
    tests_require=['pytest'],
    # Installs the ``pdf-bookmark`` command, which calls pdf_bookmark.main.
    entry_points={'console_scripts': ['pdf-bookmark = pdf_bookmark:main']},
)
48 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # celery beat schedule file
 95 | celerybeat-schedule
 96 | 
 97 | # SageMath parsed files
 98 | *.sage.py
 99 | 
100 | # Environments
101 | .env
102 | .venv
103 | env/
104 | venv/
105 | venv2/
106 | ENV/
107 | env.bak/
108 | venv.bak/
109 | 
110 | # Spyder project settings
111 | .spyderproject
112 | .spyproject
113 | 
114 | # Rope project settings
115 | .ropeproject
116 | 
117 | # mkdocs documentation
118 | /site
119 | 
120 | # mypy
121 | .mypy_cache/
122 | .dmypy.json
123 | dmypy.json
124 | 
125 | # Pyre type checker
126 | .pyre/
127 | 
128 | # Test file
129 | *.pdf
130 | *.bmk
131 | 


--------------------------------------------------------------------------------
/test_pdf_bookmark.py:
--------------------------------------------------------------------------------
  1 | # pylint: disable=missing-docstring
  2 | 
  3 | import pytest
  4 | 
  5 | from pdf_bookmark import InvalidRomanNumeralError
  6 | from pdf_bookmark import RomanOutOfRangeError
  7 | from pdf_bookmark import roman_to_arabic
  8 | from pdf_bookmark import arabic_to_roman
  9 | 
 10 | from pdf_bookmark import InvalidLettersNumeralError
 11 | from pdf_bookmark import letters_to_arabic
 12 | from pdf_bookmark import arabic_to_letters
 13 | 
 14 | 
# Strings that roman_to_arabic is expected to reject with
# InvalidRomanNumeralError (see test_invalid_roman).
INVALID_ROMAN = (
    '',
    'ii',
    'IIIII',
    'ID',
    'XM',
    '12345',
    'jflaiffj',
    '+=_-&^%#!$%#*&)~`,.><',
)

# Integers that arabic_to_roman is expected to reject with
# RomanOutOfRangeError (see test_out_of_range_roman).
INVALID_ROMAN_VALUE = (
    -100000,
    -1,
    5000,
    5001,
    5002,
    10000,
)

# (integer, numeral) pairs checked in both conversion directions.
ROMAN_PAIRS = (
    (0, 'N'),
    (1, 'I'),
    (2, 'II'),
    (3, 'III'),
    (4, 'IV'),
    (5, 'V'),
    (9, 'IX'),
    (12, 'XII'),
    (16, 'XVI'),
    (29, 'XXIX'),
    (44, 'XLIV'),
    (45, 'XLV'),
    (68, 'LXVIII'),
    (83, 'LXXXIII'),
    (97, 'XCVII'),
    (99, 'XCIX'),
    (400, 'CD'),
    (500, 'D'),
    (501, 'DI'),
    (649, 'DCXLIX'),
    (798, 'DCCXCVIII'),
    (891, 'DCCCXCI'),
    (1000, 'M'),
    (1004, 'MIV'),
    (1006, 'MVI'),
    (1023, 'MXXIII'),
    (2014, 'MMXIV'),
    (3999, 'MMMCMXCIX'),
    (4999, 'MMMMCMXCIX'),
)
 66 | 
 67 | 
 68 | def test_invalid_roman():
 69 |     for roman in INVALID_ROMAN:
 70 |         with pytest.raises(InvalidRomanNumeralError):
 71 |             roman_to_arabic(roman)
 72 | 
 73 | 
 74 | def test_roman_to_arabic():
 75 |     for arabic, roman in ROMAN_PAIRS:
 76 |         assert roman_to_arabic(roman) == arabic
 77 | 
 78 | 
 79 | def test_out_of_range_roman():
 80 |     for arabic in INVALID_ROMAN_VALUE:
 81 |         with pytest.raises(RomanOutOfRangeError):
 82 |             arabic_to_roman(arabic)
 83 | 
 84 | 
 85 | def test_arabic_to_roman():
 86 |     for arabic, roman in ROMAN_PAIRS:
 87 |         assert arabic_to_roman(arabic) == roman
 88 | 
 89 | 
# Strings that letters_to_arabic is expected to reject with
# InvalidLettersNumeralError (see test_invalid_letter).
INVALID_LETTERS = (
    '0',
    '0342',
    'a',
    'ABC',
    'AAAAAA8',
    '9BBBB',
    '&*-+#',
    '12345',
    'jflaiffj',
    '+=_-&^%#!$%#*&)~`,.><',
)

# (integer, letters) pairs checked in both conversion directions.
LETTERS_PAIRS = (
    (0, ''),
    (1, 'A'),
    (2, 'B'),
    (3, 'C'),
    (8, 'H'),
    (26, 'Z'),
    (27, 'AA'),
    (52, 'ZZ'),
    (106, 'BBBBB'),
)
114 | 
115 | 
def test_invalid_letter():
    # Every malformed letters numeral must be rejected on its own.
    for numeral in INVALID_LETTERS:
        pytest.raises(InvalidLettersNumeralError, letters_to_arabic, numeral)
120 | 
121 | 
def test_letters_to_arabic():
    # Collect the pairs that parse to the wrong integer; none are expected.
    wrong = [(numeral, number) for number, numeral in LETTERS_PAIRS
             if letters_to_arabic(numeral) != number]
    assert not wrong
125 | 
126 | 
def test_arabic_to_letters():
    # Collect the pairs that render to the wrong letters; none are expected.
    wrong = [(number, numeral) for number, numeral in LETTERS_PAIRS
             if arabic_to_letters(number) != numeral]
    assert not wrong
130 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # PDF-Bookmark
  2 | 
  3 | [![PyPI](https://badge.fury.io/py/pdf-bookmark.svg)](https://pypi.org/project/pdf-bookmark/)
  4 | 
  5 | PDF-Bookmark is a tool for importing and exporting PDF bookmarks
  6 | using the `bmk` format.
  7 | 
  8 | 
  9 | ## Installation
 10 | 
 11 | ```shell
 12 | $ pip install pdf-bookmark
 13 | ```
 14 | 
 15 | `pdf-bookmark` utilizes
 16 | [PDFtk](https://www.pdflabs.com/tools/pdftk-server/)
 17 | and [Ghostscript](https://www.ghostscript.com/)
 18 | to export and import pdf bookmarks.
 19 | They must be installed before running `pdf-bookmark`.
 20 | 
 21 | 
 22 | ### PDFtk
 23 | 
 24 | PDFtk is used here to export bookmark from pdf file.
 25 | The java port [pdftk-java](https://gitlab.com/pdftk-java/pdftk)
 26 | may also be OK.
 27 | 
 28 | On Arch Linux, `pdftk-java` could be installed by:
 29 | 
 30 | ```shell
 31 | $ sudo pacman -S pdftk java-commons-lang
 32 | ```
 33 | 
 34 | Verify the installation:
 35 | 
 36 | ```shell
 37 | $ pdftk --version
 38 | ```
 39 | 
 40 | 
 41 | ### Ghostscript
 42 | 
 43 | Ghostscript is used here to import bookmark to pdf file.
 44 | 
 45 | On Arch Linux, it could be installed by:
 46 | 
 47 | ```shell
 48 | $ sudo pacman -S ghostscript
 49 | ```
 50 | 
 51 | Verify the installation:
 52 | 
 53 | ```shell
 54 | $ gs --version
 55 | ```
 56 | 
 57 | 
 58 | ## bmk format
 59 | 
 60 | The `bmk` format is used to describe the bookmark of a pdf file.
 61 | It will be used to import bookmark into a pdf file.
 62 | 
 63 | The `bmk` format is easy to write.
 64 | It looks much like the table of contents of a book,
 65 | so you can copy a table of contents and adapt it.
 66 | 
 67 | Each line represents a bookmark item. The title and the page number are
 68 | separated by at least 4 dots "`.`".
 69 | 
 70 | The level of a bookmark is specified by its indentation with spaces.
 71 | The default indentation is 2 spaces per level, and the number of spaces
 72 | can be configured with an inline command.
 73 | 
 74 | This is a simple example of a `bmk` file.
 75 | 
 76 | ```
 77 | 序................1
 78 | Chapter 1................4
 79 | Chapter 2................5
 80 |   2.1 Section 1................6
 81 |     2.1.1 SubSection 1................6
 82 |     2.1.2 SubSection 2................8
 83 |   2.2 Section 2................12
 84 | Chapter 3................20
 85 | Appendix................36
 86 | ```
 87 | 
 88 | Import the bookmark and create a new pdf file:
 89 | 
 90 | ```shell
 91 | $ pdf-bookmark -p input.pdf -b bookmark.bmk -o new.pdf
 92 | ```
 93 | 
 94 | 
 95 | ### Export bmk format
 96 | 
 97 | The `bmk` format could also be exported from a pdf file with bookmark.
 98 | You may also modify the bookmark from the exported one.
 99 | 
100 | ```shell
101 | $ pdf-bookmark -p input.pdf
102 | ```
103 | 
104 | 
105 | ### Inline command
106 | 
107 | The file may also contain inline commands that give finer control
108 | over the bookmarks. These commands start with `!!!` and modify some
109 | properties of the bookmarks. A new property affects all bookmarks after
110 | that line until it is changed again.
111 | 
112 | It is normal that the main body of a pdf file does not start from the
113 | first page of pdf, and the page number is not always arabic.
114 | 
115 | ```
116 | !!! collapse_level = 2
117 | 
118 | !!! num_style = Roman
119 | Preface................I
120 | Content................IV
121 | 
122 | !!! new_index = 12
123 | !!! num_style = Arabic
124 | Introduction................1
125 | Chapter 1................4
126 | Chapter 2................5
127 |   2.1 Section 1................6
128 |   2.2 Section 2................7
129 | Chapter 3................10
130 | Appendix................11
131 | ```
132 | 
133 | With these inline commands, you do not need to recalculate the index
134 | number for each page.
135 | 
136 | Here are all supported inline commands:
137 | 
138 | * `new_index`. Default: 1.
139 |    The following bookmark index will be recalculated from the
140 |    new index number (`new_index + page - 1`).
141 | * `num_start`. Default: 1.
142 |    Specify the number of first page if it does not start from 1
143 |    (`new_index + page - num_start`).
144 | * `num_style`. Default: `Arabic`.
145 |    The page number style. Could be `Arabic`, `Roman` and `Letters`.
146 | * `collapse_level`. Default: 0.
147 |    On which level the bookmarks are collapsed. 0 means expand all.
148 | * `level_indent`. Default: 2.
149 |    Number of indentation spaces for a new level.
150 | 
151 | 
152 | ## pdf-bookmark command
153 | 
154 | The `pdf-bookmark` command is installed by `pip install pdf-bookmark`.
155 | 
156 | ```
157 | usage: pdf-bookmark [-h] [-f {bmk,none,pdftk,pdfmark,json}]
158 |                     [-l COLLAPSE_LEVEL] [-b BOOKMARK] [-p PDF] [-o OUTPUT_PDF]
159 | 
160 | Import and export PDF bookmark
161 | 
162 | optional arguments:
163 |   -h, --help            show this help message and exit
164 |   -f {bmk,none,pdftk,pdfmark,json}, --format {bmk,none,pdftk,pdfmark,json}
165 |                         the output format of bookmark
166 |   -l COLLAPSE_LEVEL, --collapse-level COLLAPSE_LEVEL
167 |                         the min level to be collapsed, 0 to expand all
168 |   -b BOOKMARK, --bookmark BOOKMARK
169 |                         the bookmark file to be imported
170 |   -p PDF, --pdf PDF     the input PDF file
171 |   -o OUTPUT_PDF, --output-pdf OUTPUT_PDF
172 |                         the output PDF file
173 | ```
174 | 
175 | 
176 | ## Example
177 | 
178 | ### Import bookmark
179 | 
180 | This will import the `bmk` bookmark into a pdf file:
181 | 
182 | ```shell
183 | $ pdf-bookmark -p input.pdf -b bookmark.bmk -o new.pdf
184 | ```
185 | 
186 | If you would like to have a quiet output:
187 | 
188 | ```shell
189 | $ pdf-bookmark -p input.pdf -b bookmark.bmk -f none -o new.pdf
190 | ```
191 | 
192 | ### Export bookmark
193 | 
194 | This will export the `bmk` bookmark to stdout from a pdf file:
195 | 
196 | ```shell
197 | $ pdf-bookmark -p input.pdf
198 | ```
199 | 
200 | The output format could be changed to `pdfmark`, `json`:
201 | 
202 | ```shell
203 | $ pdf-bookmark -p input.pdf -f pdfmark
204 | $ pdf-bookmark -p input.pdf -f json
205 | ```
206 | 
207 | ### Change the collapse level
208 | 
209 | This will only change the collapse level of the pdf.
210 | 
211 | ```shell
212 | $ pdf-bookmark -p input.pdf -l 2 -o new.pdf
213 | ```
214 | 


--------------------------------------------------------------------------------
/pdf_bookmark.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # pylint: disable=invalid-name
  4 | 
  5 | '''
  6 | Import and export PDF bookmark
  7 | '''
  8 | 
  9 | import os
 10 | import sys
 11 | import subprocess
 12 | import re
 13 | import argparse
 14 | import json
 15 | import tempfile
 16 | import codecs
 17 | 
 18 | 
 19 | VERSION = '1.1.0'
 20 | 
 21 | 
# Translates the page label ``NumStyle`` values read from the pdftk dump
# into the ``num_style`` names used in bmk files.  Upper- and lowercase
# variants map to the same name, so letter case is not preserved.
_NUM_STYLE_MAP = {
    'DecimalArabicNumerals': 'Arabic',
    'UppercaseRomanNumerals': 'Roman',
    'LowercaseRomanNumerals': 'Roman',
    'UppercaseLetters': 'Letters',
    'LowercaseLetters': 'Letters',
}
 29 | 
 30 | 
# Roman symbols (including the subtractive pairs) with their values,
# ordered from largest to smallest; arabic_to_roman walks this table in
# order and emits each symbol as many times as it fits.
_ROMAN_NUMERAL_PAIR = (
    ('M', 1000),
    ('CM', 900),
    ('D', 500),
    ('CD', 400),
    ('C', 100),
    ('XC', 90),
    ('L', 50),
    ('XL', 40),
    ('X', 10),
    ('IX', 9),
    ('V', 5),
    ('IV', 4),
    ('I', 1),
)

# Symbol -> value lookup built from the table above.
_ROMAN_NUMERAL_MAP = {pair[0]: pair[1] for pair in _ROMAN_NUMERAL_PAIR}

# Shape of an acceptable numeral.  Up to four leading M's are allowed, so
# the largest numeral that matches is MMMMCMXCIX (4999).
_ROMAN_NUMERAL_PATTERN = re.compile(
    '^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$'
)
 52 | 
 53 | 
# Describes how import_pdftk reads records from the pdftk dump.  For each
# record type: 'prefix' is the leading part of every key of that record,
# 'fields' maps the rest of the key to the name the value is stored under,
# and 'handler' converts the raw string value for that name.
_BOOKMARK_DESCRIPTION = {
    'bookmark': {
        'prefix': 'Bookmark',
        'fields': {
            'Title': 'title',
            'Level': 'level',
            'PageNumber': 'page',
        },
        'handler': {
            # Expand ``&#NNN;`` references only when the title contains one.
            # The helpers are defined further down in the module; they are
            # looked up when the lambda is called, not when this dict is built.
            'title': lambda s: _unicode_replace(s) if _UNICODE_REGEXP.search(s) else s,
            'level': int,
            'page': int,
        },
    },
    'page_label': {
        'prefix': 'PageLabel',
        'fields': {
            'NewIndex': 'new_index',
            'Start': 'num_start',
            'NumStyle': 'num_style',
        },
        'handler': {
            'new_index': int,
            'num_start': int,
            # Styles missing from _NUM_STYLE_MAP fall back to 'Arabic'.
            'num_style': lambda s: _NUM_STYLE_MAP.get(s, 'Arabic'),
        },
    },
}


# Matches a decimal character reference such as ``&#24207;`` and captures
# the digits.
_UNICODE_REGEXP = re.compile('&#([0-9]+);')


# Length of the run of dots that _split_title_page searches for to separate
# the title from the page number in a bmk line.
_CONTENT_MINIMUM_DOTS = 4
 88 | 
 89 | 
class CommandError(Exception):
    '''Raised by call() when a command is missing or exits with non-zero status'''


class InvalidBookmarkSyntaxError(Exception):
    '''Raised when a line of a bmk file cannot be parsed'''


class InvalidNumeralError(ValueError):
    '''Base class for numeral strings that cannot be parsed (a ValueError)'''


class InvalidRomanNumeralError(InvalidNumeralError):
    '''Raised by roman_to_arabic for an empty or malformed roman numeral'''


class RomanOutOfRangeError(Exception):
    '''Raised by arabic_to_roman for a number outside the supported range'''


class InvalidLettersNumeralError(InvalidNumeralError):
    '''Raised by letters_to_arabic for a malformed letters numeral'''


class LettersOutOfRangeError(Exception):
    '''Raised by arabic_to_letters for a negative number'''


class PdfMarkError(Exception):
    '''Error dealing with pdfmark'''
120 | 
121 | 
def echo(s, nl=True, err=False):
    '''
    Write ``s`` to stdout, or to stderr when ``err`` is true.

    A newline is appended unless ``nl`` is false, and the stream is
    flushed before returning.
    '''
    stream = sys.stderr if err else sys.stdout

    stream.write(s)
    if nl:
        stream.write('\n')
    stream.flush()
135 | 
136 | 
def roman_to_arabic(roman):
    '''
    Convert roman to arabic

    ``roman`` is an uppercase roman numeral; ``'N'`` stands for zero.
    Returns the integer value.  Raises InvalidRomanNumeralError when the
    input is empty or is not a well-formed numeral.
    '''
    if not roman:
        raise InvalidRomanNumeralError('No input found')

    if roman == 'N':
        return 0

    # fullmatch instead of match: the trailing ``$`` of the pattern also
    # matches just before a final newline, which would let 'IV\n' through
    # and fail below with a KeyError instead of the documented error.
    if not _ROMAN_NUMERAL_PATTERN.fullmatch(roman):
        raise InvalidRomanNumeralError(
            'Invalid Roman numeral: {}'.format(roman))

    values = [_ROMAN_NUMERAL_MAP[symbol] for symbol in roman]

    arabic = 0
    for i, value in enumerate(values):
        # A symbol smaller than its successor is subtractive (the I in IV).
        if i + 1 < len(values) and value < values[i + 1]:
            arabic -= value
        else:
            arabic += value

    return arabic
159 | 
160 | 
def arabic_to_roman(arabic):
    '''
    Render an integer as a roman numeral.

    Zero is written as ``'N'``.  Values below 0 or above 4999 raise
    RomanOutOfRangeError.
    '''
    if arabic > 4999 or arabic < 0:
        raise RomanOutOfRangeError('Roman numeral must in [0, 5000)')

    if arabic == 0:
        return 'N'

    pieces = []
    rest = arabic
    # Walk the table from the largest symbol down, taking each symbol as
    # many times as it still fits into the remainder.
    for symbol, value in _ROMAN_NUMERAL_PAIR:
        repeat, rest = divmod(rest, value)
        pieces.append(symbol * repeat)

    return ''.join(pieces)
180 | 
181 | 
def letters_to_arabic(letters):
    '''
    Parse a letters numeral (A..Z, AA..ZZ, AAA.. and so on) into an integer.

    The empty string gives 0.  InvalidLettersNumeralError is raised when
    the first character is not a capital letter or when the characters
    are not all the same.
    '''
    if not letters:
        return 0

    head = letters[0]
    if not ord('A') <= ord(head) <= ord('Z'):
        raise InvalidLettersNumeralError('Must be capital letter')

    if any(other != head for other in letters[1:]):
        raise InvalidLettersNumeralError('Letters are not identical')

    # Each extra repetition of the letter adds a further block of 26.
    offset = ord(head) - ord('A')
    return 26 * len(letters) - 25 + offset
198 | 
199 | 
def arabic_to_letters(arabic):
    '''
    Render an integer as a letters numeral (1 -> A, 26 -> Z, 27 -> AA).

    Zero gives the empty string.  Negative values raise
    LettersOutOfRangeError.
    '''
    if arabic < 0:
        raise LettersOutOfRangeError('Letters numeral must >= 0')

    if arabic == 0:
        return ''

    # The quotient selects how often the letter repeats, the remainder
    # selects the letter itself.
    extra_repeats, letter_index = divmod(arabic - 1, 26)
    return chr(ord('A') + letter_index) * (extra_repeats + 1)
211 | 
212 | 
def _unicode_replace_match(match):
    # Group 1 of the match holds the decimal code point as text.
    code_point = int(match.group(1))
    return chr(code_point)
215 | 
216 | 
def _unicode_replace(string):
    # Substitute every ``&#NNN;`` reference with the character it names.
    return re.sub(_UNICODE_REGEXP, _unicode_replace_match, string)
219 | 
220 | 
def call(cmd, encoding=None):
    '''
    Run command

    ``cmd`` is the argument list passed to subprocess.Popen.  The captured
    stdout is returned, decoded with ``encoding`` (UTF-8 when None); a
    falsy, non-None ``encoding`` such as ``''`` returns the raw bytes.

    Raises CommandError when the executable is not found or when the
    command exits with a non-zero status.
    '''
    if encoding is None:
        encoding = 'utf-8'

    try:
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
    except FileNotFoundError:
        raise CommandError('Command not installed: {}'.format(cmd[0]))

    # communicate() only returns once the process has exited.
    out, err = p.communicate()
    status = p.returncode

    if status != 0:
        # Decode stderr leniently: a strict decode (or an empty encoding
        # name in raw-bytes mode) would raise its own exception here and
        # hide the actual command failure.
        message = err.decode(encoding or 'utf-8', errors='replace')
        raise CommandError(
            'Invoke command {} failed with exit code {}:\n {}'.format(
                cmd, status, message))

    if encoding:
        out = out.decode(encoding)
    return out
245 | 
246 | 
def import_pdftk(data, collapse_level=0):
    '''
    Convert pdftk output to bookmark

    ``data`` is the text of a pdftk dump made of ``Key: value`` lines.
    Returns a dict with one list per record type of _BOOKMARK_DESCRIPTION
    ('bookmark' and 'page_label'); a record is appended as soon as all of
    its fields have been seen.  Every record carries a 'collapse' flag,
    which is true only for records with a 'level' at or below
    ``collapse_level`` in the tree (``level >= collapse_level``) when
    ``collapse_level`` is non-zero.
    '''
    bookmarks = {t: [] for t in _BOOKMARK_DESCRIPTION}
    bookmark_info = {t: {} for t in _BOOKMARK_DESCRIPTION}

    for line in data.splitlines():
        try:
            key, value = line.split(': ', 1)
        except ValueError:  # e.g. line == 'InfoBegin'
            continue

        for bm_type, bm_detail in _BOOKMARK_DESCRIPTION.items():
            prefix = bm_detail['prefix']
            if not key.startswith(prefix):
                continue

            raw_field = key[len(prefix):]
            if raw_field not in bm_detail['fields']:
                continue

            field = bm_detail['fields'][raw_field]
            # Fields without a handler keep the raw string instead of
            # reusing whatever value the previous field produced.
            handler = bm_detail['handler'].get(field)
            info = bookmark_info[bm_type]
            info[field] = handler(value) if handler else value

            # Wait until every field of the record has been collected.
            if any(name not in info for name in bm_detail['fields'].values()):
                continue

            # Page labels have no 'level'; looking it up unconditionally
            # raised KeyError whenever collapse_level was non-zero.
            info['collapse'] = collapse_level != 0 and \
                'level' in info and info['level'] >= collapse_level

            bookmarks[bm_type].append(info)
            bookmark_info[bm_type] = {}

    return bookmarks
293 | 
294 | 
def export_bmk(bookmarks):
    '''
    Render the bookmark structure as bmk text.

    ``bookmarks`` holds the lists 'bookmark' and 'page_label'.  Page
    numbers are written relative to the last page label whose
    ``new_index`` is not beyond the bookmark's page, and the inline
    commands for a label are emitted when it first becomes active.
    '''
    labels = bookmarks['page_label']
    chunks = ['!!! # Generated bmk file\n']

    active_label_index = -1
    active_collapse_level = 0

    for bm in bookmarks['bookmark']:
        # The last label starting at or before this page applies.
        matching = [i for i, label in enumerate(labels)
                    if bm['page'] >= label['new_index']]
        label_index = matching[-1] if matching else -1

        if label_index < 0:
            page = bm['page']
        else:
            label = labels[label_index]

            if label_index != active_label_index:
                chunks.append('\n')
                chunks.extend(
                    '!!! {} = {}\n'.format(k, label[k])
                    for k in ('new_index', 'num_start', 'num_style'))
                chunks.append('\n')
                active_label_index = label_index

            page = bm['page'] - label['new_index'] + label['num_start']

            style = label['num_style']
            if style == 'Roman':
                page = arabic_to_roman(page)
            elif style == 'Letters':
                page = arabic_to_letters(page)

        # A command is needed when the bookmark's collapse flag disagrees
        # with what the currently active collapse level implies for it.
        collapsed = bm['collapse']
        implied_expanded = (active_collapse_level == 0
                            or active_collapse_level > bm['level'])
        if collapsed == implied_expanded:
            active_collapse_level = bm['level'] if collapsed else 0
            chunks.append('!!! collapse_level = {}\n'.format(
                active_collapse_level))

        indent = '  ' * (bm['level'] - 1)
        chunks.append('{}{}................{}\n'.format(
            indent, bm['title'], page))

    return ''.join(chunks)
347 | 
348 | 
def _parse_bookmark_command(line):
    # ``line`` starts with the three-character command marker; the rest is
    # either a comment (leading '#') or a ``key = value`` assignment.
    body = line[3:]
    if body.lstrip().startswith('#'):
        return '', ''

    key, separator, value = body.partition('=')
    if not separator:
        raise InvalidBookmarkSyntaxError('Invalid syntax: {}'.format(line))

    return key.strip(), value.strip()
359 | 
360 | 
def _parse_level(line, level_indent):
    # Returns (level, rest of line); the level is derived from the number
    # of leading spaces, which must be a multiple of ``level_indent``.
    remainder = line.lstrip(' ')
    indent = len(line) - len(remainder)

    if indent % level_indent:
        raise InvalidBookmarkSyntaxError(
            'Level indentation error: {}'.format(line))

    level = indent // level_indent + 1
    return level, remainder
373 | 
374 | 
def _split_title_page(title_page):
    # Splits at the first run of at least _CONTENT_MINIMUM_DOTS dots: the
    # text before it is the title, the text after the whole run is the page.
    leader = '.' * _CONTENT_MINIMUM_DOTS
    start_pos = title_page.find(leader)
    if start_pos < 0:
        raise InvalidBookmarkSyntaxError(
            'There must be at least {} "." specified in the line "{}"'.format(_CONTENT_MINIMUM_DOTS,title_page))

    title = title_page[:start_pos]
    # Everything from start_pos on begins with the dot run; drop all of it.
    page = title_page[start_pos:].lstrip('.')

    return title.strip(), page.strip()
391 | 
392 | 
def import_bmk(bookmark_data, collapse_level=0):
    '''
    Parse bmk text into the bookmark structure.

    Blank lines are skipped, ``!!!`` lines update the current settings,
    and every other line becomes one bookmark.  Returns a dict with the
    lists 'bookmark' and 'page_label'.  Raises InvalidBookmarkSyntaxError
    for lines that cannot be parsed.
    '''
    int_commands = ('num_start', 'collapse_level', 'level_indent')
    label_keys = ('new_index', 'num_start', 'num_style')

    settings = {
        'new_index': 1,
        'num_start': 1,
        'num_style': 'Arabic',
        'collapse_level': collapse_level,
        'level_indent': 2,
    }

    entries = []
    labels = []

    # A page label is recorded lazily, right before the first bookmark
    # that follows the start of the file or a ``new_index`` command.
    label_pending = True

    for line in bookmark_data.splitlines():
        if not line.strip():
            continue

        if line.startswith('!!!'):
            name, value = _parse_bookmark_command(line)
            if not name:
                continue
            if name == 'new_index':
                # A new index starts a fresh page label with default
                # numbering until further commands change it.
                label_pending = True
                settings[name] = int(value)
                settings['num_start'] = 1
                settings['num_style'] = 'Arabic'
            elif name in int_commands:
                settings[name] = int(value)
            else:
                settings[name] = value
            continue

        if label_pending:
            labels.append({key: settings[key] for key in label_keys})
            label_pending = False

        level, title_page = _parse_level(line, settings['level_indent'])

        title, page = _split_title_page(title_page)

        style = settings['num_style']
        try:
            if style == 'Roman':
                page = roman_to_arabic(page.upper())
            elif style == 'Letters':
                page = letters_to_arabic(page.upper())
            else:
                page = int(page)
        except ValueError:
            raise InvalidBookmarkSyntaxError(
                'Page number invalid: {}'.format(page))

        # Translate the printed page number into the pdf page index.
        page = page - settings['num_start'] + settings['new_index']

        threshold = settings['collapse_level']
        collapse = threshold != 0 and level >= threshold

        entries.append({
            'level': level,
            'title': title,
            'page': page,
            'collapse': collapse,
        })

    return {'bookmark': entries, 'page_label': labels}
463 | 
464 | 
def _pdfmark_unicode(string):
    r"""
    Encode ``string`` as a pdfmark string operand.

    Pure ASCII text becomes a parenthesised literal with special
    characters escaped; anything else becomes a hex string holding the
    UTF-16BE bytes preceded by the byte order mark.

    >>> _pdfmark_unicode('ascii text with ) paren')
    '(ascii text with \\) paren)'
    >>> _pdfmark_unicode('\u03b1\u03b2\u03b3')
    '<FEFF03B103B203B3>'
    """
    if all(ord(char) < 128 for char in string):
        escaped = string
        # The backslash must be handled first so that the backslashes
        # added by the later replacements are not doubled.
        for raw, quoted in (('\\', '\\\\'), ('(', '\\('), (')', '\\)'),
                            ('\n', '\\n'), ('\t', '\\t')):
            escaped = escaped.replace(raw, quoted)
        return '({})'.format(escaped)

    payload = codecs.BOM_UTF16_BE + string.encode('utf-16-be')
    return '<{}>'.format(payload.hex().upper())
483 | 
484 | 
def _pdfmark_unicode_decode(string):
    r"""
    Decode a pdfmark hex string of the form ``<FEFF....>`` back to text.

    Raises PdfMarkError when ``string`` does not have that form, and
    ValueError when the payload is not valid hexadecimal or not valid
    UTF-16BE.

    >>> _pdfmark_unicode_decode(_pdfmark_unicode('\u03b1\u03b2\u03b3'))
    '\u03b1\u03b2\u03b3'
    """
    if not (string.startswith('<FEFF') and string.endswith('>')):
        raise PdfMarkError(
            'Not a UTF-16BE pdfmark hex string: {}'.format(string))

    # Skip '<FEFF' and the closing '>'.  bytes.fromhex rejects anything
    # that is not a hex digit pair, unlike parsing the pairs as hex floats.
    payload = bytes.fromhex(string[5:-1])
    return payload.decode('utf-16-be')
496 | 
497 | 
def export_pdfmark(bookmarks):
    '''
    Convert bookmark to pdfmark

    Emits one ``[... /OUT pdfmark`` line per entry of
    ``bookmarks['bookmark']``.  An entry with direct children gets a
    ``/Count`` with their number, negated when the entry is collapsed.
    '''
    entries = bookmarks['bookmark']
    lines = []

    for i, bm in enumerate(entries):
        # Count the direct children: entries one level deeper that appear
        # before the subtree of this entry ends.
        count = 0
        for later in entries[i+1:]:
            # The subtree ends at the next entry on the same level or on
            # a shallower one; stopping only on an equal level would count
            # children that belong to a later entry.
            if later['level'] <= bm['level']:
                break
            if later['level'] == bm['level'] + 1:
                count += 1

        parts = ['[']
        if count:
            sign = '-' if bm.get('collapse') else ''
            parts.append('/Count {}{} '.format(sign, count))

        parts.append('/Title {} /Page {} '.format(
            _pdfmark_unicode(bm['title']), bm['page']))
        parts.append('/OUT pdfmark\n')

        lines.append(''.join(parts))

    return ''.join(lines)
523 | 
524 | 
def _write_pdfmark_noop_file():
    """
    Write a PostScript prologue that disables ``/OUT`` pdfmarks.

    The prologue saves the current ``pdfmark`` operator as
    ``originalpdfmark`` and redefines ``pdfmark`` as a wrapper that clears
    the operands when the mark kind is ``/OUT`` and otherwise calls
    ``originalpdfmark``.

    Returns the path of the temporary file that was created; nothing in
    this function deletes it.
    """
    # By default, Ghostscript will preserve pdfmarks from the sources PDFs
    fd, filename = tempfile.mkstemp(prefix='pdfmark-noop-', text=True)
    try:
        # Make `[... /OUT pdfmark` a no-op.
        os.write(fd, b"""
% store the original pdfmark
/originalpdfmark { //pdfmark } bind def

% replace pdfmark with a wrapper that ignores OUT
/pdfmark
{
  {  % begin loop

      { counttomark pop }
    stopped
      { /pdfmark errordict /unmatchedmark get exec stop }
    if

    dup type /nametype ne
      { /pdfmark errordict /typecheck get exec stop }
    if

    dup /OUT eq
      { (Skipping OUT pdfmark\n) print cleartomark exit }
    if

    originalpdfmark exit

  } loop
} def
""")
    finally:
        # Release the descriptor even when the write fails.
        os.close(fd)
    return filename
558 | 
559 | 
def _write_pdfmark_restore_file():
    """
    Write a PostScript snippet that rebinds ``pdfmark`` to
    ``originalpdfmark``.

    Returns the path of the temporary file that was created; nothing in
    this function deletes it.
    """
    fd, filename = tempfile.mkstemp(prefix='pdfmark-restore-', text=True)
    try:
        # Restore the default `[... /OUT pdfmark` behaviour
        os.write(fd, b'/pdfmark { originalpdfmark } bind def\n')
    finally:
        # Release the descriptor even when the write fails.
        os.close(fd)
    return filename
566 | 
567 | 
def _write_pdfmark_pagemode():
    """
    Write a ``/DOCVIEW`` pdfmark that sets ``/PageMode /UseOutlines``.

    Returns the path of the temporary file that was created; nothing in
    this function deletes it.
    """
    fd, filename = tempfile.mkstemp(prefix='pagemode_', text=True)
    try:
        os.write(
            fd, b'[/PageMode /UseOutlines /View [/FitH] /Page 1 /DOCVIEW pdfmark\n')
    finally:
        # Release the descriptor even when the write fails.
        os.close(fd)
    return filename
574 | 
575 | 
def generate_pdf(pdfmark, pdf, output_pdf):
    '''
    Generate pdf from pdfmark and pdf file

    ``pdfmark`` is the pdfmark text to apply, ``pdf`` the path of the input
    PDF and ``output_pdf`` the path Ghostscript writes the result to.

    Ghostscript receives its inputs in this order: the no-op prologue, the
    input PDF, the restore snippet, the new pdfmark file and the page-mode
    pdfmark.  ``/OUT`` pdfmarks are therefore skipped while the input PDF
    is processed and honoured again for the new pdfmark file.

    Raises UnicodeEncodeError when ``pdfmark`` contains non-ASCII text.
    Every temporary file created here is removed again, also when a step
    fails.
    '''
    temp_files = []
    try:
        fd, pdfmark_file = tempfile.mkstemp(prefix='pdfmark_', text=True)
        temp_files.append(pdfmark_file)
        try:
            os.write(fd, pdfmark.encode('ascii'))
        finally:
            # Release the descriptor even when encoding or writing fails.
            os.close(fd)

        pdfmark_noop = _write_pdfmark_noop_file()
        temp_files.append(pdfmark_noop)
        pdfmark_restore = _write_pdfmark_restore_file()
        temp_files.append(pdfmark_restore)
        pdfmark_pagemode = _write_pdfmark_pagemode()
        temp_files.append(pdfmark_pagemode)

        call(['gs', '-dBATCH', '-dNOPAUSE', '-sDEVICE=pdfwrite', '-dAutoRotatePages=/None',
              '-sOutputFile={}'.format(output_pdf),
              pdfmark_noop,
              pdf,
              pdfmark_restore,
              pdfmark_file,
              pdfmark_pagemode])
    finally:
        # Clean up whatever was created, whether or not Ghostscript ran.
        for temp_file in temp_files:
            os.remove(temp_file)
600 | 
601 | 
def main():
    '''
    Command line entry point

    Parses the command line, loads bookmarks either from a bookmark file
    (``--bookmark``) or from the ``pdftk dump_data`` output of the input
    PDF, prints them in the requested format and, when ``--output-pdf`` is
    given, writes a new PDF carrying the bookmarks.

    Returns 0 on success.  Returns 1 after printing the help text to stderr
    when neither a bookmark file nor a PDF is given, or when an output PDF
    is requested without an input PDF.
    '''
    parser = argparse.ArgumentParser(description=__doc__)
    option_table = (
        (('-f', '--format'),
         {'default': 'bmk',
          'choices': ['bmk', 'none', 'pdftk', 'pdfmark', 'json'],
          'help': 'the output format of bookmark'}),
        (('-l', '--collapse-level'),
         {'default': 0,
          'type': int,
          'help': 'the min level to be collapsed, 0 to expand all'}),
        (('-b', '--bookmark'),
         {'help': 'the bookmark file to be imported'}),
        (('-p', '--pdf'),
         {'help': 'the input PDF file'}),
        (('-o', '--output-pdf'),
         {'help': 'the output PDF file'}),
        (('-v', '--version'),
         {'action': 'store_true', 'help': 'version of pdf-bookmark'}),
    )
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)

    opts = parser.parse_args()

    if opts.version:
        echo('pdf-bookmark version {}'.format(VERSION))
        return 0

    has_bookmark = opts.bookmark is not None
    has_pdf = opts.pdf is not None
    wants_output = opts.output_pdf is not None

    # Without an input PDF there must be a bookmark file and no output PDF.
    if not has_pdf and (not has_bookmark or wants_output):
        parser.print_help(sys.stderr)
        return 1

    if has_bookmark:
        with open(opts.bookmark) as bmk_file:
            bookmarks = import_bmk(bmk_file.read(), opts.collapse_level)
    else:
        pdftk_data = call(['pdftk', opts.pdf, 'dump_data'], 'ascii')

        # The raw pdftk dump is only available on this path.
        if opts.format == 'pdftk':
            echo(pdftk_data, nl=False)

        bookmarks = import_pdftk(pdftk_data, opts.collapse_level)

    # The pdfmark text is needed both for printing and for PDF generation.
    if opts.format == 'pdfmark' or (wants_output and has_pdf):
        pdfmark = export_pdfmark(bookmarks)

    if opts.format == 'json':
        echo(json.dumps(bookmarks))
    elif opts.format == 'bmk':
        echo(export_bmk(bookmarks), nl=False)
    elif opts.format == 'pdfmark':
        echo(pdfmark, nl=False)

    if wants_output:
        generate_pdf(pdfmark, opts.pdf, opts.output_pdf)

    return 0
659 | 
660 | 
# Run the command line interface when executed as a script; the return
# value of main() becomes the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())
663 | 


--------------------------------------------------------------------------------