├── requirements.txt
├── bin
    └── boox-hlconvert
├── README.md
├── LICENSE
├── .gitignore
├── helper.py
├── boox_annot_reader.py
├── main.py
└── pdf_text_search.py


/requirements.txt:
--------------------------------------------------------------------------------
1 | pdfrw==0.4
2 | PyMuPDF
3 | colorlog
4 | 


--------------------------------------------------------------------------------
/bin/boox-hlconvert:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # get script location
3 | selfpath="$(dirname $(readlink -f "$0"))"
4 | # activate pyenv virtualenv
5 | eval "$(pyenv init -)" || exit $?
6 | pyenv activate --quiet pdf || exit $?
7 | 
8 | python "$selfpath/../main.py" "$@"
9 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Boox Highlights Converter
 2 | 
 3 | ### Convert highlight annotations produced by Boox neoreader into standard PDF format
 4 | 
 5 | The original format created by neo-reader from [boox eink reader](https://onyxboox.com/) are in txt and the standard export function
 6 | from reader create a non-standard pdf format. So I wrote this script to read in from the txt file produced
 7 | by neo-reader, and re-create those annotations (only highlights and comments for now) with help from 
 8 | `pdfrw` and `PyMuPDF`. You will need a `pyenv` virtualenv named as `pdf` (and install dependencies from `requirements.txt`)
 9 | if you want to use it in your system.
10 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Oscar Tin Yiu Lai
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 


--------------------------------------------------------------------------------
/helper.py:
--------------------------------------------------------------------------------
 1 | import fitz
 2 | from pdfrw import PdfDict, PdfArray, PdfName
 3 | 
 4 | 
 5 | def create_highlight(points, color=(1, 0.92, 0.23), author=None, contents=None):
 6 |     """Given Quad points, create a highligh object in standard pdf format."""
 7 |     new_highlight = PdfDict()
 8 |     new_highlight.F = 4
 9 |     new_highlight.Type = PdfName('Annot')
10 |     new_highlight.Subtype = PdfName('Highlight')
11 |     if author:
12 |         new_highlight.T = author
13 |     new_highlight.C = color
14 |     if contents:
15 |         new_highlight.Contents = contents
16 |     new_highlight.indirect = True
17 | 
18 |     #############################################################
19 |     ### Search for bounding coordinates
20 |     #############################################################
21 |     bot_left_x = float('inf')
22 |     bot_left_y = float('inf')
23 |     top_right_x = 0.0
24 |     top_right_y = 0.0
25 | 
26 |     quad_pts = []
27 |     for (x1, y1, x2, y2) in points:
28 |         # this quadpoints specified PDF definition of rect box
29 |         quad_pts.extend([x1, y2, x2, y2, x1, y1, x2, y1])
30 |         bot_left_x = min(bot_left_x, x1, x2)
31 |         bot_left_y = min(bot_left_y, y1, y2)
32 |         top_right_x = max(top_right_x, x1, x2)
33 |         top_right_y = max(top_right_y, y1, y2)
34 | 
35 |     new_highlight.QuadPoints = PdfArray(quad_pts)
36 |     new_highlight.Rect = PdfArray([bot_left_x, bot_left_y,
37 |                                    top_right_x, top_right_y])
38 |     return new_highlight
39 | 
40 | def add_annot(pdfrw_page, annot):
41 |     """Add annotations to page, create an array if none exists yet"""
42 |     if pdfrw_page.Annots is None:
43 |         pdfrw_page.Annots = PdfArray()
44 |     pdfrw_page.Annots.append(annot)
45 | 
46 | def pdfrw_quadpoint_to_fitz_rect(pts):
47 |     """Convert pdfrw quadpoints into fitz rect format (from one library to another)."""
48 |     origin = 0
49 |     rects = []
50 |     while origin < len(pts):
51 |         (x1, y1, x2, y2) = pts[origin+0], pts[origin+5], pts[origin+6], pts[origin+1]
52 |         rects.append(fitz.Rect(x1, y1, x2, y2))
53 |         origin += 8
54 |     return rects
55 | 


--------------------------------------------------------------------------------
/boox_annot_reader.py:
--------------------------------------------------------------------------------
 1 | """
 2 | For conveting annotation from a boox annotated file.
 3 | """
 4 | import os
 5 | import re
 6 | import logging
 7 | 
 8 | _LOGGER = logging.getLogger()
 9 | 
10 | class Annot:
11 |     """Class that represents an annotation."""
12 |     def __init__(self):
13 |         self.page = None
14 |         self.text = ""
15 |         self.comment = None
16 | 
17 |     def __repr__(self):
18 |         return "<{}: page: {}, text: {}, comment: {}>".format(
19 |             self.__class__.__name__,
20 |             self.page, self.text,
21 |             self.comment)
22 | 
23 | def read_annotations(pdf_path):
24 |     """Read annotations from folder that hold the .txt file, then return the text."""
25 |     path_name = os.path.splitext(pdf_path)[0]
26 |     base_name_with_ext = os.path.basename(pdf_path)
27 |     base_name = os.path.splitext(base_name_with_ext)[0]
28 |     annotation_file_name = os.path.join(path_name, base_name + '-annotation.txt')
29 |     if not os.path.isfile(annotation_file_name):
30 |         _LOGGER.debug("Expected annotation file does not exists.")
31 |         return None
32 |     with open(annotation_file_name, 'r', newline='') as annot_file:
33 |         # the newline parameter stop python from translating \r\n to \n
34 |         begining_anno = True
35 |         ended = False
36 |         annotations = []
37 |         for line in annot_file.readlines():
38 |             ##############################
39 |             ##  REPLACE INVALID TOKENS  ##
40 |             ##############################
41 |             # this indicate some token that the program cannot recognise (as place holder)
42 |             if "\xef\xbf\xbe" in line:
43 |                 # It likely to be a hyphen for word break. Replace it as '-'.
44 |                 line = line.replace("\xef\xbf\xbe", '-\n')
45 |             ###########################
46 |             if begining_anno:
47 |                 ### Page line + Comment
48 |                 annotations.append(Annot())
49 | 
50 |                 begining_anno = False
51 |                 match_obj = re.match(r'(?:Page )([0-9]+)\s{1,2}(.*)?\n', line)
52 | 
53 |                 if not match_obj:
54 |                     raise Exception("Error in parsing first line")
55 | 
56 |                 annotations[-1].page = match_obj.group(1)
57 |                 annotations[-1].comment = match_obj.group(2)
58 |             elif '\x00' in line:
59 |                 ### Last line before End of annotation
60 |                 ended = True
61 |                 line = line.replace('\x00', '')
62 |                 annotations[-1].text += line
63 |             elif '--------------------' in line:
64 |                 ### End of annotation
65 |                 if not ended:
66 |                     raise Exception("Did not detect \\x00 indicating end of line?")
67 |                 begining_anno = True
68 |                 ended = False
69 |                 # fix ups the formatting of each components
70 |                 # NOTE this -1 because in the program index starts at 0
71 |                 annotations[-1].page = int(annotations[-1].page) - 1
72 |                 annotations[-1].text = annotations[-1].text.rstrip()
73 |                 annotations[-1].comment = annotations[-1].comment.rstrip()
74 |                 if not annotations[-1].comment:
75 |                     # remove empty comment
76 |                     annotations[-1].comment = None
77 |             elif '\r\n' in line:
78 |                 ### text (highlighted pdf text)
79 |                 annotations[-1].text += line
80 |             elif '\n' in  line:
81 |                 ### Comment
82 |                 annotations[-1].comment += line
83 |             else:
84 |                 raise Exception("ERROR: The boox annotations txt file contain unrecognisible line")
85 |     return annotations
86 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
  1 | """Main file for boox annotation conversion."""
  2 | import sys
  3 | import os
  4 | import shutil
  5 | import argparse
  6 | import logging
  7 | from colorlog import ColoredFormatter
  8 | 
  9 | from pdfrw import PdfReader, PdfWriter, PdfDict, PdfArray
 10 | from helper import (
 11 |     create_highlight,
 12 |     add_annot,
 13 | )
 14 | from pdf_text_search import(
 15 |     PDFTextSearch,
 16 |     TextNotFoundException,
 17 |     MultipleInstancesException
 18 | )
 19 | from boox_annot_reader import read_annotations
 20 | 
 21 | _LOGGER = logging.getLogger()
 22 | AUTHOR = 'Tin Lai'
 23 | 
 24 | def convert(input_file, use_new_file=False, backup_file=True):
 25 |     """Convert a given file's annotations."""
 26 |     annotations = read_annotations(input_file)
 27 |     if annotations is None:
 28 |         _LOGGER.info("Skipping...")
 29 |         return None
 30 |     if backup_file:
 31 |         backup(input_file)
 32 |     annotations = sorted(annotations, key=lambda x: x.page)
 33 |     if use_new_file:
 34 |         output = 'result.' + os.path.basename(input_file)
 35 |     else:
 36 |         output = os.path.basename(input_file)
 37 |     output = os.path.join(os.path.dirname(input_file), output)
 38 | 
 39 |     trailer = PdfReader(input_file)
 40 |     fitz_pdf = PDFTextSearch(input_file)
 41 |     for i, page in enumerate(trailer.pages):
 42 | 
 43 |         if annotations and i == annotations[0].page:
 44 |             page_num = i+1
 45 |             count = 0
 46 |             while annotations and i == annotations[0].page:
 47 |                 try:
 48 |                     _annot = annotations.pop(0)
 49 |                     text = _annot.text
 50 |                     try:
 51 |                         points = fitz_pdf.get_quadpoints(i, text)
 52 |                     except TextNotFoundException:
 53 |                         # use fall back to try again
 54 |                         _LOGGER.debug("Page %d: Using fall-back mechanism."
 55 |                                       "Might contains mistaken hls.", page_num)
 56 |                         points = fitz_pdf.fallback_get_quadpoints(i, text)
 57 |                     except MultipleInstancesException:
 58 |                         _LOGGER.error("Page %d: The following text found multiple instances,\n\n"
 59 |                                       "  --> \"%s\" <--  \n\n"
 60 |                                       "(Token too short?), please re-highligh it manually.",
 61 |                                       page_num, text)
 62 |                         continue
 63 |                     highlight = create_highlight(points,
 64 |                                                  author=AUTHOR,
 65 |                                                  contents=_annot.comment,
 66 |                                                  color=(1, 1, 0.4))
 67 |                     # check to see if this annotation exists already
 68 |                     if fitz_pdf.annot_exists(page_num=i, annot=highlight):
 69 |                         _LOGGER.debug("Page %d: This annot already exists, skipping...", page_num)
 70 |                     else:
 71 |                         add_annot(page, annot=highlight)
 72 |                         count += 1
 73 |                         # shorten the line by removing all \r or \n, and also remove double spacing.
 74 |                         hightlighted = text.replace('\r', ' ').replace('\n', ' ').replace('  ', ' ')
 75 |                         _LOGGER.info("Page %d: Highlighted:,\n"
 76 |                                       "  --> \"%s\" <--  \n",
 77 |                                       page_num, hightlighted)
 78 |                 except TextNotFoundException:
 79 |                     _LOGGER.error("Page %d: The following text was not found,\n\n"
 80 |                                   "  --> \"%s\" <--  \n\n"
 81 |                                   "please re-highligh it manually.",
 82 |                                   page_num, text)
 83 |             print(">> Page {} successfully converted: {}".format(page_num, count))
 84 | 
 85 |     PdfWriter(output, trailer=trailer).write()
 86 |     return output
 87 | 
 88 | def handle_args():
 89 |     """Handle arguments for argparse."""
 90 |     parser = argparse.ArgumentParser(
 91 |         description="Convert Boox neoreader highlights annotation to standard "
 92 |                     "pdf format.")
 93 |     parser.add_argument(
 94 |         "file",
 95 |         help="File or directory as input. If the given argument is a directory, "
 96 |              "all files within will be the action target. The default action "
 97 |              "(without -c or -r flag) is to perform the annotation conversion "
 98 |              "action. (default: current working directory)",
 99 |         nargs='?',
100 |         default=os.getcwd(),
101 |         metavar="FILE_OR_DIR")
102 |     parser.add_argument(
103 |         "-c",
104 |         "--clean",
105 |         action='store_true',
106 |         default=False,
107 |         help="Cleans up the bak files from current directory.")
108 |     parser.add_argument(
109 |         "--clean-entire-dir",
110 |         action='store_true',
111 |         default=False,
112 |         help="Cleans up the enitre directory so that any annotation directory or "
113 |              "bak files will be deleted; hence, implies --clean."
114 |         )
115 |     parser.add_argument(
116 |         "-r",
117 |         "--restore",
118 |         action='store_true',
119 |         default=False,
120 |         help="Use existing bak file to restore and overwrite the original pdf files.")
121 |     parser.add_argument(
122 |         "-n",
123 |         "--new-file",
124 |         action='store_true',
125 |         default=False,
126 |         help="Create a new file instead of overwriting the input file.")
127 |     parser.add_argument(
128 |         "--no-backup",
129 |         action='store_true',
130 |         default=False,
131 |         help="Do not create a bak file (dangeous if using original file).")
132 |     parser.add_argument(
133 |         '-v',
134 |         "--verbose",
135 |         action='store_true',
136 |         default=False,
137 |         help="Be verbose in the status of conversion progress.")
138 | 
139 |     args = vars(parser.parse_args())
140 |     if args['clean_entire_dir']:
141 |         args['clean'] = True
142 |     if args['verbose']:
143 |         _LOGGER.setLevel(logging.DEBUG)
144 |     else:
145 |         _LOGGER.setLevel(logging.ERROR)
146 |     # logger to stdout
147 |     channel = logging.StreamHandler(sys.stdout)
148 |     LOGFORMAT = '%(log_color)s%(levelname)s: %(message)s%(reset)s'
149 |     formatter = ColoredFormatter(LOGFORMAT)
150 |     channel.setFormatter(formatter)
151 |     _LOGGER.addHandler(channel)
152 |     return args
153 | 
154 | def backup(inpfn):
155 |     """Create a bak file for the given input file."""
156 |     backup_file = '{}.bak'.format(inpfn)
157 |     if os.path.isfile(backup_file):
158 |         _LOGGER.debug('Found backup pdf. Using the bak as input instead.')
159 |         shutil.copyfile(backup_file, inpfn)
160 |     else:
161 |         shutil.copyfile(inpfn, backup_file)
162 | 
163 | 
164 | def clean_up(inpfn):
165 |     """Clean up for the given input file."""
166 |     backup_file = '{}.bak'.format(inpfn)
167 |     if os.path.isfile(backup_file):
168 |         os.remove(backup_file)
169 |         _LOGGER.debug("Deleting %s", backup_file)
170 | 
171 | def restore(inpfn, end_with_bak=False):
172 |     """Restore the given input file."""
173 |     if end_with_bak:
174 |         backup_file = inpfn
175 |         inpfn = inpfn[0:inpfn.rfind('.')]
176 |     else:
177 |         backup_file = '{}.bak'.format(inpfn)
178 |     if os.path.isfile(backup_file):
179 |         os.rename(backup_file, inpfn)
180 |     elif not end_with_bak:
181 |         _LOGGER.info("Bak file for '%s' does not exists.", inpfn)
182 | 
183 | def convert_wrapper(inpfn, args):
184 |     """A wrapper for the convert function, for converting multiple files at once."""
185 |     outfn = convert(input_file=inpfn, use_new_file=args['new_file'],
186 |                     backup_file=(not args['no_backup']))
187 |     if outfn is None:
188 |         return
189 |     # open result file with foxitreader (to re-save the format as it helps to fixes stuff)
190 |     # need abs path becuase using relative path does not seems to mess up saving path
191 |     last_modified_time = os.path.getmtime(outfn)
192 |     import subprocess
193 |     with open(os.devnull, 'w') as file_null:
194 |         _LOGGER.debug('Opening %s', outfn)
195 |         subprocess.call(['foxitreader', outfn], close_fds=True,
196 |                         stdout=file_null, stderr=subprocess.STDOUT)
197 |     # Check if user had saved the file after opening foxitreader
198 |     if os.path.getmtime(outfn) <= last_modified_time:
199 |         _LOGGER.warning("Seems like you did not save the file after opening foxitreader? "
200 |                         "Its best to allow it do works for us on fixing internal PDF structures.")
201 | 
202 | def main():
203 |     """Entry point when this file is run."""
204 |     args = handle_args()
205 |     if args['clean'] == args['restore'] and args['clean']:
206 |         _LOGGER.error("The flag -c and -r are mutually exclusive, cannot be both set!")
207 |         sys.exit(1)
208 |     inpfn = os.path.abspath(args['file'])
209 | 
210 |     # for clean up or restore
211 |     if os.path.isdir(inpfn):
212 |         for file in os.listdir(inpfn):
213 |             # prefix the file name with its directory
214 |             file = os.path.join(inpfn, file)
215 |             if file.endswith(".bak") and args['restore']:
216 |                 restore(file, end_with_bak=True)
217 |             elif file.endswith(".pdf"):
218 |                 if args['clean_entire_dir']:
219 |                     annot_path = os.path.splitext(file)[0]
220 |                     if os.path.isdir(annot_path):
221 |                         _LOGGER.debug("Deleting annot dir %s", annot_path)
222 |                         shutil.rmtree(annot_path)
223 |                 if args['clean']:
224 |                     clean_up(file)
225 |                 elif not args['clean'] and not args['restore']:
226 |                     # Main functionality
227 |                     print('='*80)
228 |                     print(' {}'.format(os.path.basename(file)))
229 |                     print('-'*80)
230 |                     convert_wrapper(file, args)
231 |                     print('')
232 |     else:
233 |         if args['clean']:
234 |             clean_up(inpfn)
235 |         elif args['restore']:
236 |             restore(inpfn)
237 |         else:
238 |             # Main functionality
239 |             convert_wrapper(inpfn, args)
240 | 
241 | 
242 | if __name__ == '__main__':
243 |     main()
244 | 


--------------------------------------------------------------------------------
/pdf_text_search.py:
--------------------------------------------------------------------------------
  1 | """
  2 | For searching text in a pdf file.
  3 | """
  4 | import logging
  5 | import fitz
  6 | from helper import pdfrw_quadpoint_to_fitz_rect
  7 | 
  8 | _LOGGER = logging.getLogger()
  9 | 
 10 | TOKENS_MIN_LENGTH = 2
 11 | SAME_LINE_TOL = 1.5
 12 | 
 13 | class TextNotFoundException(Exception):
 14 |     """Exception for text not found in pdf."""
 15 |     pass
 16 | 
 17 | class MultipleInstancesException(Exception):
 18 |     """Exception for multiple possible instances found in pdf."""
 19 |     pass
 20 | 
 21 | class FallbackFailedException(Exception):
 22 |     """Exception for fallback method of pdf text search fails as well."""
 23 |     pass
 24 | 
 25 | class PossibleErrorException(Exception):
 26 |     """Exception for a possible unforseen error."""
 27 |     pass
 28 | 
 29 | class PDFTextSearch:
 30 |     """Represent a class that search text from a pdf."""
 31 | 
 32 |     def __init__(self, doc_name):
 33 |         self.doc = fitz.open(doc_name)
 34 | 
 35 |     def get_quadpoints(self, page_num, text, hit_max=16, ignore_short_width=4, extract=True):
 36 |         """Search for the given text in the page. Raise exception if more than one result found"""
 37 |         page = self.doc[page_num]
 38 |         rects = page.searchFor(text, hit_max=hit_max)
 39 |         if len(rects) < 1:
 40 |             raise TextNotFoundException("No search result found: {}".format(text))
 41 |         if len(rects) > 1:
 42 |             # We detect error very naively...... But at least it's better than none.
 43 |             # We detect via checking if the results are consecutive lines. If they are
 44 |             # it is most likely it is a single result with multiline spanning. If not,
 45 |             # most likely the searching text is too short and result in many lines having
 46 |             # the same sequence of word.
 47 |             textblock = page.getTextBlocks()
 48 |             consecutive_results = None
 49 |             i = 0
 50 |             for textblock in page.getTextBlocks():
 51 |                 if i >= len(rects):
 52 |                     break  # DONE
 53 |                 textblock_rect = fitz.Rect(textblock[0], textblock[1], textblock[2], textblock[3])
 54 |                 # print(dir(textblock_rect))
 55 |                 # print(type(textblock_rect))
 56 |                 if textblock_rect.includeRect(rects[i]):
 57 |                     while i < len(rects):
 58 |                         if rects[i].width < ignore_short_width:
 59 |                             # Do not include this short line in highlighting
 60 |                             rects.pop(i)
 61 |                         if consecutive_results is None:
 62 |                             consecutive_results = 'started'
 63 |                         elif consecutive_results == 'end':
 64 |                             raise MultipleInstancesException(
 65 |                                 "Possible multiple search results. The results are not consecutive")
 66 |                         # 'FOUNDDDDD!!!!!!
 67 |                         i += 1
 68 |                         if i >= len(rects) or not textblock_rect.includeRect(rects[i]):
 69 |                             break
 70 |                 else:
 71 |                     if consecutive_results == 'started':
 72 |                         consecutive_results = 'end'
 73 | 
 74 |             # if reaching this point, all result must have been matched. If not, error
 75 |             if i < len(rects):
 76 |                 raise PossibleErrorException("ERROR! Not all result been vertified.")
 77 |         if not extract:
 78 |             return rects
 79 |         merged = self.merge_tokens(rects)
 80 |         return self.invert_coordinates(merged, self.page_height(page_num))
 81 | 
 82 | 
 83 |     def fallback_get_quadpoints(self, page_num, text, hit_max=16, ignore_short_width=4):
 84 |         """
 85 |         Search for the given text in the page. Raise exception if more than one result found.
 86 |         This fallback method breaks the entire text into chunks of tokens, and ignore tokens that
 87 |         cannot be recognised. Therefore, it is more robust as it ignores a part of tokens for
 88 |         better finding text, and also does its best to detect error.
 89 |         """
 90 |         tokens = []
 91 |         def add(words):
 92 |             """Helper functino to add w to tokens (and check length before doing so)"""
 93 |             if len(words) <= 2:
 94 |                 _LOGGER.debug("VERY SHORT token: '%s'! Ignoring this token...", words)
 95 |                 return
 96 |             tokens.extend(self.get_quadpoints(page_num, words, hit_max,
 97 |                                               ignore_short_width, extract=False))
 98 |         def get_token(line):
 99 |             """Given line, return the splited sentence before and after the first
100 |             occurance of an escape char"""
101 |             idx, skiped_word = self.unicode_idx(line)
102 |             if idx < 0:
103 |                 return line, ''
104 |             _LOGGER.debug("Ignoring unicode '%s' from: '%s'", skiped_word, line)
105 |             words = line.split(' ')
106 |             return ' '.join(words[:idx]), ' '.join(words[idx+1:])
107 |         def add_remaining_words(line):
108 |             """Add all remaining words into list."""
109 |             while len(line.split(' ')) > TOKENS_MIN_LENGTH:
110 |                 words, line = get_token(line)
111 |                 try:
112 |                     add(words)
113 |                 except TextNotFoundException:
114 |                     _LOGGER.debug("Skipping '%s' as it was not found", words)
115 | 
116 |         for i, line in enumerate(text.split('\n')):
117 |             line = line.rstrip()
118 |             if i == 0:
119 |                 # first few words
120 |                 if self.unicode_idx(line)[0] != -1 and self.unicode_idx(line)[0] <= 3:
121 |                     raise FallbackFailedException("Escaped character too close to beginning tokens")
122 |                 words, line = get_token(line)
123 |                 add(words)
124 |                 add_remaining_words(line)
125 |             else:
126 |                 add_remaining_words(line)
127 |         merged = self.merge_tokens(tokens)
128 |         return self.invert_coordinates(merged, self.page_height(page_num))
129 | 
130 |     def annot_exists(self, page_num, annot):
131 |         """Given an annot in pdfrw, determine if it already exists by utilising fitz."""
132 |         page = self.doc[page_num]
133 |         page_annot = page.firstAnnot
134 |         # need to change pdfrw's rect coor to fits fitz's coordinate *by inverting)
135 |         pending_annots = [fitz.Rect(x) for x in self.invert_coordinates(
136 |             pdfrw_quadpoint_to_fitz_rect(annot.QuadPoints), self.page_height(page_num))]
137 |         """We consider the two given annots are the same if all the sub-parts of the pending
138 |         annots intersects one of the annot that we are checking. (We cannot simply use
139 |         contains because the coordinates data are slightly off and hence unreliable)"""
140 |         while page_annot:
141 |             # inverted
142 |             if all(page_annot.rect.intersects(a) for a in pending_annots):
143 |                 return True
144 |             # else we continue to check next annot
145 |             page_annot = page_annot.next                        # get next annot on page
146 | 
147 |         return False
148 | 
149 |     def page_height(self, page_num):
150 |         """Return the page height of given page."""
151 |         page = self.doc[page_num]
152 |         return page.bound().y1
153 | 
154 |     @staticmethod
155 |     def merge_tokens(annot_tokens):
156 |         """Try to merge the broken tokens together, with full line width"""
157 |         if len(annot_tokens) < 2:
158 |             # no need to merge len = 1
159 |             return annot_tokens
160 |         def sameline(l1, l2):
161 |             """Determine if l1 and l2 are on the same line"""
162 |             tol = SAME_LINE_TOL
163 |             if (abs(l1.y0 - l2.y0) < tol and
164 |                     abs(l1.y1 - l2.y1) < tol):
165 |                 return True
166 |             return False
167 |         def merge_column_tokens(tokens):
168 |             """Loop through to find left most & right most boarder."""
169 |             left_most = float('inf')
170 |             right_most = 0
171 |             for t in tokens:
172 |                 left_most = min(left_most, t.x0, t.x1)
173 |                 right_most = max(right_most, t.x0, t.x1)
174 |             lines = []
175 |             for i, t in enumerate(tokens):
176 |                 if i == 0:
177 |                     lines.append([t])  # first line no need to check previous line
178 |                 else:
179 |                     # determine if it's same line as before
180 |                     if sameline(lines[-1][0], t):
181 |                         # append to previous line
182 |                         lines[-1].append(t)
183 |                     else:
184 |                         # create a new line
185 |                         lines.append([t])
186 |             ###########################
187 |             ## NOW WE DO THE MERGING ##
188 |             ###########################
189 |             new_lines = []
190 |             for i, line in enumerate(lines):
191 |                 bot = float('inf')
192 |                 top = 0
193 |                 for l in line:
194 |                     bot = min(bot, l.y0)
195 |                     top = max(top, l.y1)
196 |                 if i == 0:
197 |                     new_lines.append(fitz.Rect(line[0].x0, bot, right_most, top))
198 |                 elif i == len(lines) - 1:
199 |                     new_lines.append(fitz.Rect(left_most, bot, line[-1].x1, top))
200 |                 else:
201 |                     new_lines.append(fitz.Rect(left_most, bot, right_most, top))
202 |             return new_lines
203 | 
204 |         # detect if the highlights spans a double column
205 |         is_double_column = False
206 |         double_column = [[], []]
207 |         double_column[0].append(annot_tokens[0])
208 | 
209 |         # filter the tokens that belong to different columns, and perform merge for each column
210 |         for i in range(1, len(annot_tokens)):
211 |             if not sameline(annot_tokens[i-1], annot_tokens[i]):
212 |                 if annot_tokens[i-1].y0 > annot_tokens[i].y0:
213 |                     is_double_column = True
214 |             if not is_double_column:
215 |                 double_column[0].append(annot_tokens[i])
216 |             else:
217 |                 double_column[1].append(annot_tokens[i])
218 |         if not is_double_column:
219 |             return merge_column_tokens(annot_tokens)
220 | 
221 |         firstcol_merged_tokens = merge_column_tokens(double_column[0])
222 |         secondcol_merged_tokens = merge_column_tokens(double_column[1])
223 |         firstcol_merged_tokens.extend(secondcol_merged_tokens)
224 |         return firstcol_merged_tokens
225 | 
226 |     @staticmethod
227 |     def invert_coordinates(rects, page_height):
228 |         """
229 |         TO work around the different coordinate system in fitz and pdfrw. The x-axis
230 |         are same but the y-axis are opposite to each other. One starts at top and one starts
231 |         at bottom
232 |         """
233 |         # convert from top left bot right -- to -- bot left top right
234 |         # this is for compliance of convention in PDF
235 |         rects = [(r.x0, r.y1, r.x1, r.y0) for r in rects]
236 |         # the coordinate system in fitz and pdfrw are inverted.
237 |         # Need to invert back with "page_height - y"
238 |         # this is for converting between fitz and pdfrw system
239 |         return [(r[0], page_height - r[1], r[2], page_height - r[3]) for r in rects]
240 | 
241 |     @staticmethod
242 |     def unicode_idx(text):
243 |         """Return the index of word (within the line) that contain escape char \\x"""
244 |         text = repr(text)[1:-1]
245 |         for i, word in enumerate(text.split(' ')):
246 |             if "\\x" in word:
247 |                 return i, word
248 |         return -1, None
249 | 


--------------------------------------------------------------------------------