├── .gitignore ├── LICENSE └── hocrRemover.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Fabio Oliveira Costa 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
# (continuation of /LICENSE)
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
# --------------------------------------------------------------------------------
# /hocrRemover.py:
# --------------------------------------------------------------------------------
"""Paint white rectangles over the words listed in an hOCR file."""

from html.parser import HTMLParser
import os.path as os_path

try:
    import cv2
except ImportError:
    # OpenCV is only needed by remove_text(); keeping the import optional lets
    # the hOCR parsing helpers be used on machines where it is not installed.
    cv2 = None


class Parser(HTMLParser):
    """HTML parser that collects the bounding box of every hOCR word.

    After feeding a document, ``boxes`` holds one
    ``((x0, y0), (x1, y1))`` tuple per element whose ``class`` contains
    ``ocrx_word`` and whose ``title`` carries a parsable ``bbox`` property.
    """

    def __init__(self):
        super().__init__()
        self.boxes = []

    @staticmethod
    def _parse_bbox(title):
        """Return ``((x0, y0), (x1, y1))`` from a title string, or None.

        The title is a ``;``-separated property list such as
        ``"bbox 10 20 30 40; x_wconf 93"``.  The first property named
        ``bbox`` is used; None is returned when there is no such property
        or when it does not provide four integers.
        """
        for prop in title.split(';'):
            # split() without arguments tolerates repeated/leading blanks.
            fields = prop.split()
            if not fields or fields[0] != 'bbox':
                continue
            try:
                # Too few values and non-integer values both raise ValueError.
                x0, y0, x1, y1 = map(int, fields[1:5])
            except ValueError:
                return None
            return (x0, y0), (x1, y1)
        return None

    def handle_starttag(self, tag, attrs):
        """Record the bbox of *tag* when it is an ``ocrx_word`` element.

        Args:
            tag (str): Tag name (unused; any element may be a word).
            attrs (list): ``(name, value)`` pairs supplied by HTMLParser.
        """
        is_word = False
        box = None
        for name, value in attrs:
            if value is None:
                # Attribute written without a value, e.g. ``<span title>``.
                continue
            if name == 'class':
                # An element may carry several space-separated classes.
                is_word = is_word or 'ocrx_word' in value.split()
            elif name == 'title':
                parsed = self._parse_bbox(value)
                if parsed is not None:
                    box = parsed
        # A word without a usable bbox cannot be drawn, so it is skipped
        # instead of storing None.
        if is_word and box is not None:
            self.boxes.append(box)


def get_hocr_boxes(file):
    """Get the bounding boxes of all words in an hOCR file.

    Args:
        file (str): Path of the hOCR file, read as UTF-8.

    Returns:
        list: One ``((x0, y0), (x1, y1))`` tuple per word, in document
        order.
    """
    parser = Parser()
    with open(file, 'r', encoding='utf-8') as f:
        parser.feed(f.read())
    # Flush whatever the parser is still buffering.
    parser.close()
    return parser.boxes


def remove_text(hocr, in_, out):
    """Put a filled white rectangle on every hOCR word box of an image.

    Args:
        hocr (str): hOCR file path.
        in_ (str): Input image path (``in`` is a reserved word in Python,
            hence the trailing underscore).
        out (str): Output image path.

    Raises:
        ImportError: If OpenCV (``cv2``) is not installed.
        OSError: If the input image cannot be read or the output image
            cannot be written.
    """
    if cv2 is None:
        raise ImportError("remove_text() requires OpenCV (the 'cv2' module)")
    boxes = get_hocr_boxes(hocr)
    img = cv2.imread(in_)
    if img is None:
        # imread() signals failure by returning None instead of raising.
        raise OSError('could not read image: {!r}'.format(in_))
    for top_left, bottom_right in boxes:
        # A negative thickness asks OpenCV for a filled rectangle.
        cv2.rectangle(img, top_left, bottom_right, (255, 255, 255), -1)
    if not cv2.imwrite(out, img):
        raise OSError('could not write image: {!r}'.format(out))
# --------------------------------------------------------------------------------