├── requirements.txt ├── bin └── boox-hlconvert ├── README.md ├── LICENSE ├── .gitignore ├── helper.py ├── boox_annot_reader.py ├── main.py └── pdf_text_search.py /requirements.txt: -------------------------------------------------------------------------------- 1 | pdfrw==0.4 2 | PyMuPDF 3 | colorlog 4 | -------------------------------------------------------------------------------- /bin/boox-hlconvert: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # get script location 3 | selfpath="$(dirname $(readlink -f "$0"))" 4 | # activate pyenv virtualenv 5 | eval "$(pyenv init -)" || exit $? 6 | pyenv activate --quiet pdf || exit $? 7 | 8 | python "$selfpath/../main.py" "$@" 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Boox Highlights Converter 2 | 3 | ### Convert highlight annotations produced by Boox neoreader into standard PDF format 4 | 5 | The original annotations created by neo-reader on a [Boox eink reader](https://onyxboox.com/) are stored in a txt file, and the reader's standard export function 6 | produces a non-standard PDF format. So I wrote this script to read the txt file produced 7 | by neo-reader and re-create those annotations (only highlights and comments for now) with help from 8 | `pdfrw` and `PyMuPDF`. You will need a `pyenv` virtualenv named `pdf` (with dependencies installed from `requirements.txt`) 9 | if you want to use it on your system. 
10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Oscar Tin Yiu Lai 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /helper.py: 
"""Helpers bridging the two PDF libraries used by this project.

``pdfrw`` builds the annotation dictionaries that are written back into the
PDF, while ``fitz`` (PyMuPDF) is used for text search; the conversion helpers
here translate between their representations.
"""
import fitz
from pdfrw import PdfDict, PdfArray, PdfName


def create_highlight(points, color=(1, 0.92, 0.23), author=None, contents=None):
    """Given quad points, create a highlight annotation in standard PDF format.

    :param points: iterable of ``(x1, y1, x2, y2)`` rectangles (PDF
        coordinates, bottom-left origin) that the highlight should cover.
    :param color: RGB highlight colour, each component in ``[0, 1]``.
    :param author: optional author name stored in the annotation's ``T`` field.
    :param contents: optional popup comment stored in ``Contents``.
    :returns: an indirect ``PdfDict`` highlight annotation with ``QuadPoints``
        and a bounding ``Rect`` covering every quad.
    """
    new_highlight = PdfDict()
    new_highlight.F = 4  # annotation flag: print the annotation with the page
    new_highlight.Type = PdfName('Annot')
    new_highlight.Subtype = PdfName('Highlight')
    if author:
        new_highlight.T = author
    new_highlight.C = color
    if contents:
        new_highlight.Contents = contents
    new_highlight.indirect = True

    # Track the bounding box of all quads for the annotation's /Rect entry.
    bot_left_x = float('inf')
    bot_left_y = float('inf')
    top_right_x = 0.0
    top_right_y = 0.0

    quad_pts = []
    for (x1, y1, x2, y2) in points:
        # QuadPoints follow the PDF definition of a rect box:
        # upper-left, upper-right, lower-left, lower-right corner.
        quad_pts.extend([x1, y2, x2, y2, x1, y1, x2, y1])
        bot_left_x = min(bot_left_x, x1, x2)
        bot_left_y = min(bot_left_y, y1, y2)
        top_right_x = max(top_right_x, x1, x2)
        top_right_y = max(top_right_y, y1, y2)

    new_highlight.QuadPoints = PdfArray(quad_pts)
    new_highlight.Rect = PdfArray([bot_left_x, bot_left_y,
                                   top_right_x, top_right_y])
    return new_highlight


def add_annot(pdfrw_page, annot):
    """Add an annotation to the page, creating the /Annots array if none exists yet."""
    if pdfrw_page.Annots is None:
        pdfrw_page.Annots = PdfArray()
    pdfrw_page.Annots.append(annot)


def pdfrw_quadpoint_to_fitz_rect(pts):
    """Convert pdfrw QuadPoints into fitz rect format (from one library to another).

    Each group of eight numbers encodes one quadrilateral in the order written
    by :func:`create_highlight`; the inverse mapping picks the corners back out
    to rebuild one ``fitz.Rect`` per quad.
    """
    rects = []
    # one quad = 8 numbers; walk the flat array with an explicit stride
    for origin in range(0, len(pts), 8):
        x1, y1 = pts[origin + 0], pts[origin + 5]
        x2, y2 = pts[origin + 6], pts[origin + 1]
        rects.append(fitz.Rect(x1, y1, x2, y2))
    return rects
"""
For converting annotations from a Boox-annotated file.
"""
import os
import re
import logging

_LOGGER = logging.getLogger()

# Placeholder characters that neo-reader emits for tokens it cannot recognise.
# Depending on how the file is decoded the placeholder shows up either as the
# single code point U+FFFE, or as its raw UTF-8 byte sequence read through a
# single-byte codec (the original only checked the latter form).
_INVALID_TOKENS = ("\ufffe", "\xef\xbf\xbe")


class Annot:
    """Class that represents an annotation."""

    def __init__(self):
        self.page = None     # zero-based page index once parsing completes
        self.text = ""       # the highlighted text, possibly multi-line
        self.comment = None  # optional user comment; None when empty

    def __repr__(self):
        return "<{}: page: {}, text: {}, comment: {}>".format(
            self.__class__.__name__,
            self.page, self.text,
            self.comment)


def read_annotations(pdf_path):
    """Read annotations from the folder that holds the .txt file and parse them.

    For ``/some/dir/book.pdf`` the annotation file is expected at
    ``/some/dir/book/book-annotation.txt``.

    :param pdf_path: path of the annotated PDF.
    :returns: list of :class:`Annot`, or ``None`` when no annotation file
        exists next to the PDF.
    :raises Exception: when the file does not follow the expected
        neo-reader format.
    """
    path_name = os.path.splitext(pdf_path)[0]
    base_name_with_ext = os.path.basename(pdf_path)
    base_name = os.path.splitext(base_name_with_ext)[0]
    annotation_file_name = os.path.join(path_name, base_name + '-annotation.txt')
    if not os.path.isfile(annotation_file_name):
        _LOGGER.debug("Expected annotation file does not exist.")
        return None
    with open(annotation_file_name, 'r', newline='') as annot_file:
        # the newline parameter stops python from translating \r\n to \n
        begining_anno = True
        ended = False
        annotations = []
        for line in annot_file:
            # Replace placeholder tokens the reader could not recognise; they
            # are most likely hyphens from word breaks, so substitute '-'.
            for marker in _INVALID_TOKENS:
                if marker in line:
                    line = line.replace(marker, '-\n')
            if begining_anno:
                ### Page line + Comment
                annotations.append(Annot())

                begining_anno = False
                match_obj = re.match(r'(?:Page )([0-9]+)\s{1,2}(.*)?\n', line)

                if not match_obj:
                    raise Exception("Error in parsing first line")

                annotations[-1].page = match_obj.group(1)
                annotations[-1].comment = match_obj.group(2)
            elif '\x00' in line:
                ### Last line before End of annotation
                ended = True
                line = line.replace('\x00', '')
                annotations[-1].text += line
            elif '--------------------' in line:
                ### End of annotation
                if not ended:
                    raise Exception("Did not detect \\x00 indicating end of line?")
                begining_anno = True
                ended = False
                # fix up the formatting of each component
                # NOTE the -1 is because in the program index starts at 0
                annotations[-1].page = int(annotations[-1].page) - 1
                annotations[-1].text = annotations[-1].text.rstrip()
                annotations[-1].comment = annotations[-1].comment.rstrip()
                if not annotations[-1].comment:
                    # remove empty comment
                    annotations[-1].comment = None
            elif '\r\n' in line:
                ### text (highlighted pdf text)
                annotations[-1].text += line
            elif '\n' in line:
                ### Comment
                annotations[-1].comment += line
            else:
                raise Exception("ERROR: The boox annotations txt file contain unrecognisible line")
    return annotations
from pdf_text_search import (
    PDFTextSearch,
    TextNotFoundException,
    MultipleInstancesException
)
from boox_annot_reader import read_annotations

_LOGGER = logging.getLogger()
AUTHOR = 'Tin Lai'


def convert(input_file, use_new_file=False, backup_file=True):
    """Convert a given file's annotations.

    Reads the neo-reader txt annotations next to *input_file*, locates each
    highlighted passage via :class:`PDFTextSearch` and writes standard PDF
    highlight annotations back out with pdfrw.

    :param input_file: path of the PDF to convert.
    :param use_new_file: when True, write to ``result.<name>`` instead of
        overwriting the input file.
    :param backup_file: when True, create/refresh a ``.bak`` copy first.
    :returns: the output file name, or ``None`` when no annotation file exists.
    """
    annotations = read_annotations(input_file)
    if annotations is None:
        _LOGGER.info("Skipping...")
        return None
    if backup_file:
        backup(input_file)
    annotations = sorted(annotations, key=lambda x: x.page)
    if use_new_file:
        output = 'result.' + os.path.basename(input_file)
    else:
        output = os.path.basename(input_file)
    output = os.path.join(os.path.dirname(input_file), output)

    trailer = PdfReader(input_file)
    fitz_pdf = PDFTextSearch(input_file)
    for i, page in enumerate(trailer.pages):

        if annotations and i == annotations[0].page:
            page_num = i + 1  # 1-based, for user-facing messages
            count = 0
            # consume every annotation that belongs to this page
            while annotations and i == annotations[0].page:
                try:
                    _annot = annotations.pop(0)
                    text = _annot.text
                    try:
                        points = fitz_pdf.get_quadpoints(i, text)
                    except TextNotFoundException:
                        # use fall back to try again
                        _LOGGER.debug("Page %d: Using fall-back mechanism. "
                                      "Might contain mistaken hls.", page_num)
                        points = fitz_pdf.fallback_get_quadpoints(i, text)
                    except MultipleInstancesException:
                        _LOGGER.error("Page %d: The following text found multiple instances,\n\n"
                                      " --> \"%s\" <-- \n\n"
                                      "(Token too short?), please re-highlight it manually.",
                                      page_num, text)
                        continue
                    highlight = create_highlight(points,
                                                 author=AUTHOR,
                                                 contents=_annot.comment,
                                                 color=(1, 1, 0.4))
                    # check to see if this annotation exists already
                    if fitz_pdf.annot_exists(page_num=i, annot=highlight):
                        _LOGGER.debug("Page %d: This annot already exists, skipping...", page_num)
                    else:
                        add_annot(page, annot=highlight)
                        count += 1
                        # shorten the line by removing all \r or \n, and also remove double spacing.
                        highlighted = text.replace('\r', ' ').replace('\n', ' ').replace('  ', ' ')
                        _LOGGER.info("Page %d: Highlighted:\n"
                                     " --> \"%s\" <-- \n",
                                     page_num, highlighted)
                except TextNotFoundException:
                    _LOGGER.error("Page %d: The following text was not found,\n\n"
                                  " --> \"%s\" <-- \n\n"
                                  "please re-highlight it manually.",
                                  page_num, text)
            print(">> Page {} successfully converted: {}".format(page_num, count))

    PdfWriter(output, trailer=trailer).write()
    return output
def handle_args():
    """Handle arguments for argparse.

    Also configures the root logger's verbosity and colour output based on
    the ``--verbose`` flag.

    :returns: dict of parsed command line arguments.
    """
    parser = argparse.ArgumentParser(
        description="Convert Boox neoreader highlights annotation to standard "
                    "pdf format.")
    parser.add_argument(
        "file",
        help="File or directory as input. If the given argument is a directory, "
             "all files within will be the action target. The default action "
             "(without -c or -r flag) is to perform the annotation conversion "
             "action. (default: current working directory)",
        nargs='?',
        default=os.getcwd(),
        metavar="FILE_OR_DIR")
    parser.add_argument(
        "-c",
        "--clean",
        action='store_true',
        default=False,
        help="Cleans up the bak files from current directory.")
    parser.add_argument(
        "--clean-entire-dir",
        action='store_true',
        default=False,
        help="Cleans up the entire directory so that any annotation directory or "
             "bak files will be deleted; hence, implies --clean.")
    parser.add_argument(
        "-r",
        "--restore",
        action='store_true',
        default=False,
        help="Use existing bak file to restore and overwrite the original pdf files.")
    parser.add_argument(
        "-n",
        "--new-file",
        action='store_true',
        default=False,
        help="Create a new file instead of overwriting the input file.")
    parser.add_argument(
        "--no-backup",
        action='store_true',
        default=False,
        help="Do not create a bak file (dangerous if using original file).")
    parser.add_argument(
        '-v',
        "--verbose",
        action='store_true',
        default=False,
        help="Be verbose in the status of conversion progress.")

    args = vars(parser.parse_args())
    if args['clean_entire_dir']:
        # --clean-entire-dir is a superset of --clean
        args['clean'] = True
    if args['verbose']:
        _LOGGER.setLevel(logging.DEBUG)
    else:
        _LOGGER.setLevel(logging.ERROR)
    # logger to stdout, with colour-coded levels
    channel = logging.StreamHandler(sys.stdout)
    log_format = '%(log_color)s%(levelname)s: %(message)s%(reset)s'
    channel.setFormatter(ColoredFormatter(log_format))
    _LOGGER.addHandler(channel)
    return args
def backup(inpfn):
    """Create a bak file for the given input file.

    If a bak already exists it is treated as the pristine original and copied
    back over the input file (so repeated conversions start from the same
    source); otherwise the input file is copied to ``<name>.bak``.
    """
    backup_file = '{}.bak'.format(inpfn)
    if os.path.isfile(backup_file):
        _LOGGER.debug('Found backup pdf. Using the bak as input instead.')
        shutil.copyfile(backup_file, inpfn)
    else:
        shutil.copyfile(inpfn, backup_file)


def clean_up(inpfn):
    """Remove the bak file for the given input file, if one exists."""
    backup_file = '{}.bak'.format(inpfn)
    if os.path.isfile(backup_file):
        os.remove(backup_file)
        _LOGGER.debug("Deleting %s", backup_file)


def restore(inpfn, end_with_bak=False):
    """Restore the given input file from its bak file.

    :param inpfn: the pdf file name, or the bak file name itself when
        *end_with_bak* is True.
    :param end_with_bak: set True when *inpfn* is already the ``.bak`` path.
    """
    if end_with_bak:
        backup_file = inpfn
        inpfn = inpfn[0:inpfn.rfind('.')]
    else:
        backup_file = '{}.bak'.format(inpfn)
    if os.path.isfile(backup_file):
        os.rename(backup_file, inpfn)
    elif not end_with_bak:
        _LOGGER.info("Bak file for '%s' does not exist.", inpfn)


def convert_wrapper(inpfn, args):
    """A wrapper for the convert function, for converting multiple files at once."""
    outfn = convert(input_file=inpfn, use_new_file=args['new_file'],
                    backup_file=(not args['no_backup']))
    if outfn is None:
        return
    # open result file with foxitreader (to re-save the format as it helps to fix stuff)
    # need abs path because using a relative path seems to mess up the saving path
    last_modified_time = os.path.getmtime(outfn)
    import subprocess
    _LOGGER.debug('Opening %s', outfn)
    # discard foxitreader's own console output
    subprocess.call(['foxitreader', outfn], close_fds=True,
                    stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
    # Check if user had saved the file after opening foxitreader
    if os.path.getmtime(outfn) <= last_modified_time:
        _LOGGER.warning("Seems like you did not save the file after opening foxitreader? "
                        "Its best to allow it do works for us on fixing internal PDF structures.")


def main():
    """Entry point when this file is run."""
    args = handle_args()
    if args['clean'] and args['restore']:
        _LOGGER.error("The flag -c and -r are mutually exclusive, cannot be both set!")
        sys.exit(1)
    inpfn = os.path.abspath(args['file'])

    # directory mode: apply the requested action to every file inside
    if os.path.isdir(inpfn):
        for file in os.listdir(inpfn):
            # prefix the file name with its directory
            file = os.path.join(inpfn, file)
            if file.endswith(".bak") and args['restore']:
                restore(file, end_with_bak=True)
            elif file.endswith(".pdf"):
                if args['clean_entire_dir']:
                    annot_path = os.path.splitext(file)[0]
                    if os.path.isdir(annot_path):
                        _LOGGER.debug("Deleting annot dir %s", annot_path)
                        shutil.rmtree(annot_path)
                if args['clean']:
                    clean_up(file)
                elif not args['clean'] and not args['restore']:
                    # Main functionality
                    print('=' * 80)
                    print(' {}'.format(os.path.basename(file)))
                    print('-' * 80)
                    convert_wrapper(file, args)
                    print('')
    else:
        if args['clean']:
            clean_up(inpfn)
        elif args['restore']:
            restore(inpfn)
        else:
            # Main functionality
            convert_wrapper(inpfn, args)


if __name__ == '__main__':
    main()
"""
For searching text in a pdf file.
"""
import logging
import fitz
from helper import pdfrw_quadpoint_to_fitz_rect

_LOGGER = logging.getLogger()

# Minimum number of words a chunk must exceed before we keep searching it.
TOKENS_MIN_LENGTH = 2
# Vertical tolerance (points) when deciding two rects lie on the same line.
SAME_LINE_TOL = 1.5


class TextNotFoundException(Exception):
    """Exception for text not found in pdf."""
    pass


class MultipleInstancesException(Exception):
    """Exception for multiple possible instances found in pdf."""
    pass


class FallbackFailedException(Exception):
    """Exception for when the fallback method of pdf text search fails as well."""
    pass


class PossibleErrorException(Exception):
    """Exception for a possible unforeseen error."""
    pass


class PDFTextSearch:
    """Represent a class that searches text from a pdf."""

    def __init__(self, doc_name):
        # fitz (PyMuPDF) document handle used for all subsequent searches
        self.doc = fitz.open(doc_name)

    def get_quadpoints(self, page_num, text, hit_max=16, ignore_short_width=4, extract=True):
        """Search for the given text in the page. Raise if the result is ambiguous.

        :param page_num: zero-based page index.
        :param text: the (possibly multi-line) text to locate.
        :param hit_max: maximum number of hits requested from fitz.
        :param ignore_short_width: rects narrower than this are dropped from
            the highlight (line-break slivers).
        :param extract: when True return merged rects converted to pdfrw's
            coordinate system; when False return the raw fitz rects.
        :raises TextNotFoundException: no occurrence of *text* on the page.
        :raises MultipleInstancesException: matches are not consecutive lines,
            i.e. the text probably occurs in several places.
        :raises PossibleErrorException: internal sanity check failed.
        """
        page = self.doc[page_num]
        rects = page.searchFor(text, hit_max=hit_max)
        if len(rects) < 1:
            raise TextNotFoundException("No search result found: {}".format(text))
        if len(rects) > 1:
            # We detect error very naively... But at least it's better than none.
            # We detect via checking if the results are consecutive lines. If
            # they are, it is most likely a single result spanning multiple
            # lines. If not, most likely the searched text is too short and
            # many lines contain the same sequence of words.
            consecutive_results = None
            i = 0
            for textblock in page.getTextBlocks():
                if i >= len(rects):
                    break  # DONE
                textblock_rect = fitz.Rect(textblock[0], textblock[1], textblock[2], textblock[3])
                if textblock_rect.includeRect(rects[i]):
                    # this block contains the current hit: consume every
                    # consecutive hit that falls inside the same block
                    while i < len(rects):
                        if rects[i].width < ignore_short_width:
                            # Do not include this short line in highlighting
                            rects.pop(i)
                        if consecutive_results is None:
                            consecutive_results = 'started'
                        elif consecutive_results == 'end':
                            raise MultipleInstancesException(
                                "Possible multiple search results. The results are not consecutive")
                        i += 1
                        if i >= len(rects) or not textblock_rect.includeRect(rects[i]):
                            break
                else:
                    if consecutive_results == 'started':
                        consecutive_results = 'end'

            # if reaching this point, all results must have been matched. If not, error
            if i < len(rects):
                raise PossibleErrorException("ERROR! Not all result been verified.")
        if not extract:
            return rects
        merged = self.merge_tokens(rects)
        return self.invert_coordinates(merged, self.page_height(page_num))

    def fallback_get_quadpoints(self, page_num, text, hit_max=16, ignore_short_width=4):
        """
        Search for the given text in the page. Raise exception if more than one result found.
        This fallback method breaks the entire text into chunks of tokens, and ignores tokens
        that cannot be recognised. Therefore, it is more robust as it ignores a part of the
        tokens for better finding text, and also does its best to detect error.
        """
        tokens = []
        def add(words):
            """Helper function to add words to tokens (and check length before doing so)"""
            if len(words) <= 2:
                _LOGGER.debug("VERY SHORT token: '%s'! Ignoring this token...", words)
                return
            tokens.extend(self.get_quadpoints(page_num, words, hit_max,
                                              ignore_short_width, extract=False))
        def get_token(line):
            """Given line, return the split sentence before and after the first
            occurrence of an escape char"""
            idx, skiped_word = self.unicode_idx(line)
            if idx < 0:
                return line, ''
            _LOGGER.debug("Ignoring unicode '%s' from: '%s'", skiped_word, line)
            words = line.split(' ')
            return ' '.join(words[:idx]), ' '.join(words[idx+1:])
        def add_remaining_words(line):
            """Add all remaining words into the list."""
            while len(line.split(' ')) > TOKENS_MIN_LENGTH:
                words, line = get_token(line)
                try:
                    add(words)
                except TextNotFoundException:
                    _LOGGER.debug("Skipping '%s' as it was not found", words)

        for i, line in enumerate(text.split('\n')):
            line = line.rstrip()
            if i == 0:
                # first few words
                if self.unicode_idx(line)[0] != -1 and self.unicode_idx(line)[0] <= 3:
                    raise FallbackFailedException("Escaped character too close to beginning tokens")
                words, line = get_token(line)
                add(words)
                add_remaining_words(line)
            else:
                add_remaining_words(line)
        merged = self.merge_tokens(tokens)
        return self.invert_coordinates(merged, self.page_height(page_num))

    def annot_exists(self, page_num, annot):
        """Given an annot in pdfrw, determine if it already exists by utilising fitz.

        We consider the two given annots the same if all the sub-parts of the
        pending annot intersect one of the annots being checked. (We cannot
        simply use ``contains`` because the coordinate data are slightly off
        and hence unreliable.)
        """
        page = self.doc[page_num]
        page_annot = page.firstAnnot
        # need to change pdfrw's rect coords to fit fitz's coordinates (by inverting)
        pending_annots = [fitz.Rect(x) for x in self.invert_coordinates(
            pdfrw_quadpoint_to_fitz_rect(annot.QuadPoints), self.page_height(page_num))]
        while page_annot:
            if all(page_annot.rect.intersects(a) for a in pending_annots):
                return True
            # else we continue to check the next annot on the page
            page_annot = page_annot.next

        return False
(We cannot simply use 139 | contains because the coordinates data are slightly off and hence unreliable)""" 140 | while page_annot: 141 | # inverted 142 | if all(page_annot.rect.intersects(a) for a in pending_annots): 143 | return True 144 | # else we continue to check next annot 145 | page_annot = page_annot.next # get next annot on page 146 | 147 | return False 148 | 149 | def page_height(self, page_num): 150 | """Return the page height of given page.""" 151 | page = self.doc[page_num] 152 | return page.bound().y1 153 | 154 | @staticmethod 155 | def merge_tokens(annot_tokens): 156 | """Try to merge the broken tokens together, with full line width""" 157 | if len(annot_tokens) < 2: 158 | # no need to merge len = 1 159 | return annot_tokens 160 | def sameline(l1, l2): 161 | """Determine if l1 and l2 are on the same line""" 162 | tol = SAME_LINE_TOL 163 | if (abs(l1.y0 - l2.y0) < tol and 164 | abs(l1.y1 - l2.y1) < tol): 165 | return True 166 | return False 167 | def merge_column_tokens(tokens): 168 | """Loop through to find left most & right most boarder.""" 169 | left_most = float('inf') 170 | right_most = 0 171 | for t in tokens: 172 | left_most = min(left_most, t.x0, t.x1) 173 | right_most = max(right_most, t.x0, t.x1) 174 | lines = [] 175 | for i, t in enumerate(tokens): 176 | if i == 0: 177 | lines.append([t]) # first line no need to check previous line 178 | else: 179 | # determine if it's same line as before 180 | if sameline(lines[-1][0], t): 181 | # append to previous line 182 | lines[-1].append(t) 183 | else: 184 | # create a new line 185 | lines.append([t]) 186 | ########################### 187 | ## NOW WE DO THE MERGING ## 188 | ########################### 189 | new_lines = [] 190 | for i, line in enumerate(lines): 191 | bot = float('inf') 192 | top = 0 193 | for l in line: 194 | bot = min(bot, l.y0) 195 | top = max(top, l.y1) 196 | if i == 0: 197 | new_lines.append(fitz.Rect(line[0].x0, bot, right_most, top)) 198 | elif i == len(lines) - 1: 199 | 
new_lines.append(fitz.Rect(left_most, bot, line[-1].x1, top)) 200 | else: 201 | new_lines.append(fitz.Rect(left_most, bot, right_most, top)) 202 | return new_lines 203 | 204 | # detect if the highlights spans a double column 205 | is_double_column = False 206 | double_column = [[], []] 207 | double_column[0].append(annot_tokens[0]) 208 | 209 | # filter the tokens that belong to different columns, and perform merge for each column 210 | for i in range(1, len(annot_tokens)): 211 | if not sameline(annot_tokens[i-1], annot_tokens[i]): 212 | if annot_tokens[i-1].y0 > annot_tokens[i].y0: 213 | is_double_column = True 214 | if not is_double_column: 215 | double_column[0].append(annot_tokens[i]) 216 | else: 217 | double_column[1].append(annot_tokens[i]) 218 | if not is_double_column: 219 | return merge_column_tokens(annot_tokens) 220 | 221 | firstcol_merged_tokens = merge_column_tokens(double_column[0]) 222 | secondcol_merged_tokens = merge_column_tokens(double_column[1]) 223 | firstcol_merged_tokens.extend(secondcol_merged_tokens) 224 | return firstcol_merged_tokens 225 | 226 | @staticmethod 227 | def invert_coordinates(rects, page_height): 228 | """ 229 | TO work around the different coordinate system in fitz and pdfrw. The x-axis 230 | are same but the y-axis are opposite to each other. One starts at top and one starts 231 | at bottom 232 | """ 233 | # convert from top left bot right -- to -- bot left top right 234 | # this is for compliance of convention in PDF 235 | rects = [(r.x0, r.y1, r.x1, r.y0) for r in rects] 236 | # the coordinate system in fitz and pdfrw are inverted. 
237 | # Need to invert back with "page_height - y" 238 | # this is for converting between fitz and pdfrw system 239 | return [(r[0], page_height - r[1], r[2], page_height - r[3]) for r in rects] 240 | 241 | @staticmethod 242 | def unicode_idx(text): 243 | """Return the index of word (within the line) that contain escape char \\x""" 244 | text = repr(text)[1:-1] 245 | for i, word in enumerate(text.split(' ')): 246 | if "\\x" in word: 247 | return i, word 248 | return -1, None 249 | --------------------------------------------------------------------------------