├── .gitignore ├── LICENSE ├── README.md └── source ├── __init__.py └── pytessy.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 
92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Boost Software License - Version 1.0 - August 17th, 2003 2 | 3 | Permission is hereby granted, free of charge, to any person or organization 4 | obtaining a copy of the software and accompanying documentation covered by 5 | this license (the "Software") to use, reproduce, display, distribute, 6 | execute, and transmit the Software, and to prepare derivative works of the 7 | Software, and to permit third-parties to whom the Software is furnished to 8 | do so, all subject to the following: 9 | 10 | The copyright notices in the Software and this entire statement, including 11 | the above license grant, this restriction and the following disclaimer, 12 | must be included in all copies of the Software, in whole or in part, and 13 | all derivative works of the Software, unless such copies or derivative 14 | works are solely in the form of machine-executable object code generated by 15 | a source language processor. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. 
IN NO EVENT 20 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 21 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 22 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | DEALINGS IN THE SOFTWARE. 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyTessy - Tesseract-OCR, faster! 2 | 3 | This module allows faster access to Tesseract-OCR from Python scripts. 4 | 5 | ## Why and when is it so fast? 6 | 7 | PyTessy uses direct library-level access to Tesseract-OCR's core library. This makes it especially fast when the image is already in memory or when the image needs to be processed before scanning with Tesseract-OCR. When it only reads and scans existing files, PyTessy is just a bit faster than the usual Tesseract-OCR Python wrappers. 8 | 9 | ## Requirements 10 | 11 | ### Operating system 12 | 13 | PyTessy is operating-system independent if you set the exact location of your Tesseract-OCR library; at present, the library search process is implemented on Windows only. 14 | 15 | ### Python modules 16 | 17 | PyTessy uses only modules from the Standard Library. Python version must be ` >= 3.6 `. 18 | 19 | ### External requirements 20 | 21 | You need an installed or portable version of Tesseract-OCR (at least a working library and ` tessdata `). 22 | 23 | You can download Tesseract-OCR from [here](https://tesseract-ocr.github.io/tessdoc/Downloads). 24 | 25 | ## Installation 26 | 27 | You can install the latest PyTessy version with ` pip install pytessy `, download the wheel from this repository, or build it from the source code. 
#!/usr/bin/python3
r"""
            _                 _
           (_)               | |
     _ __   _  __  __   ___  | |
    | '__| | | \ \/ /  / _ \ | |
    | |    | |  >  <  |  __/ | |
    |_|    |_| /_/\_\  \___| |_|



PyTessy
=======

Tesseract-OCR, faster!

This module allows faster access to Tesseract-OCR from Python scripts.

This module is always faster than common Tesseract-OCR wrappers like pytesseract
because it uses direct access to Tesseract-OCR's core library instead of calling
its executable.

The specification of the connection to the driver is based on the source code
from here: https://github.com/UB-Mannheim/tesseract/blob/master/src/api/capi.cpp

Copyright rixel 2020
Distributed under the Boost Software License, Version 1.0.
See accompanying file LICENSE or a copy at https://www.boost.org/LICENSE_1_0.txt
"""



import __main__
import ctypes
import ctypes.util
from os import chdir, environ, getcwd
from os.path import abspath, dirname, isabs, isdir, isfile, join
from sys import platform



class PyTessyError(Exception):
    """
    PyTessyError class
    ------------------
    Empty subclass of Exception to throw module-specific errors.
    """

    pass



class TesseractHandler(object):
    """
    TesseractHandler class
    ----------------------
    Handles raw Tesseract-OCR calls with limited functionality only.

    The loaded library is stored on the class (shared by every instance),
    the api handle is stored on the instance.
    """

    _lib = None
    _api = None



    class TessBaseAPI(ctypes._Pointer):
        """
        TessBaseAPI
        -----------
        Empty ctypes._Pointer subclass to serve as TessBaseAPI handler pointer.
        """

        _type_ = type('_TessBaseAPI', (ctypes.Structure,), {})



    def __init__(self, lib_path=None, data_path=None, language='eng'):
        """
        Initializes Tesseract-OCR api handler object instance
        -----------------------------------------------------
        @Params: lib_path   (string) [optional] Path to Tesseract-OCR library.
                 data_path  (string) [optional] Path to Tesseract-OCR data files.
                                                None is forwarded as a NULL
                                                pointer to the library.
                 language   (string) [optional] Language code to work with.
        @Raises: PyTessyError   If the library cannot be found or the api
                                cannot be initialized.
        """

        if self._lib is None:
            self.setup_lib(lib_path)
        self._api = self._lib.TessBaseAPICreate()
        # c_char_p accepts None (NULL); calling .encode() on None would crash.
        encoded_path = None if data_path is None else data_path.encode('ascii')
        if self._lib.TessBaseAPIInit3(self._api, encoded_path,
                                      language.encode('ascii')):
            raise PyTessyError('Failed to initalize Tesseract-OCR library.')



    def get_text(self):
        """
        Gets text as utf-8 decoded string
        ---------------------------------
        @Return: (string) Text read by Tesseract-OCR as utf-8 string, or None
                          if nothing (or an empty text) was returned.
        """

        raw_text = self._read_text_bytes()
        if raw_text:
            return raw_text.decode('utf-8')
        return None



    def get_text_raw(self):
        """
        Gets text as raw bytes data
        ---------------------------
        @Return: (bytes) Text read by Tesseract-OCR as raw bytes, or None if
                         the library returned a NULL pointer.
        """

        return self._read_text_bytes()



    def set_image(self, imagedata, width, height, bytes_per_pixel, bytes_per_line,
                  resolution):
        """
        Sets image to read
        ------------------
        @Params: imagedata          (ctypes int array)  Raw imagedata to read.
                 width              (int)               Width of the image.
                 height             (int)               Height of the image.
                 bytes_per_pixel    (int)               Number of bytes that
                                                        represents a pixel.
                 bytes_per_line     (int)               Number of bytes in a line.
                 resolution         (int)               Resolution of the image
                                                        in dpi.
        """

        self._check_setup()
        self._lib.TessBaseAPISetImage(self._api,
                                      imagedata, width, height,
                                      bytes_per_pixel, bytes_per_line)
        self._lib.TessBaseAPISetSourceResolution(self._api, resolution)



    @classmethod
    def setup_lib(cls, lib_path=None):
        """
        Binds Tesseract-OCR library to the handler
        ------------------------------------------
        @Params: lib_path   (string) [optional] Path to, or name of, the
                                                Tesseract-OCR library. An
                                                existing file is loaded directly,
                                                anything else is resolved with
                                                ctypes.util.find_library
                                                ("tesseract" if omitted).
        @Raises: PyTessyError   If ctypes cannot find Tesseract-OCR library.
        """

        if cls._lib is not None:
            return
        if lib_path is not None and isfile(lib_path):
            # find_library resolves names, not full paths, on most platforms.
            resolved = lib_path
        else:
            resolved = ctypes.util.find_library(
                'tesseract' if lib_path is None else lib_path)
        if resolved is None:
            raise PyTessyError('Ctypes couldn\'t find Tesseract-OCR library')
        lib = ctypes.CDLL(resolved)

        lib.TessBaseAPICreate.restype = cls.TessBaseAPI             # handle

        lib.TessBaseAPIDelete.restype = None                        # void
        lib.TessBaseAPIDelete.argtypes = (cls.TessBaseAPI,)         # handle

        lib.TessBaseAPIInit3.restype = ctypes.c_int                 # 0 on success
        lib.TessBaseAPIInit3.argtypes = (cls.TessBaseAPI,           # handle
                                         ctypes.c_char_p,           # datapath
                                         ctypes.c_char_p)           # language

        lib.TessBaseAPISetImage.restype = None                      # void
        lib.TessBaseAPISetImage.argtypes = (cls.TessBaseAPI,        # handle
                                            ctypes.c_void_p,        # imagedata
                                            ctypes.c_int,           # width
                                            ctypes.c_int,           # height
                                            ctypes.c_int,           # bytes_per_pixel
                                            ctypes.c_int)           # bytes_per_line

        # c_void_p (not c_char_p) keeps the address, so the buffer can be
        # handed back to TessDeleteText after its content has been copied.
        lib.TessBaseAPIGetUTF8Text.restype = ctypes.c_void_p        # text
        lib.TessBaseAPIGetUTF8Text.argtypes = (cls.TessBaseAPI, )   # handle

        lib.TessDeleteText.restype = None                           # void
        lib.TessDeleteText.argtypes = (ctypes.c_void_p, )           # text

        lib.TessBaseAPISetSourceResolution.restype = None           # void
        lib.TessBaseAPISetSourceResolution.argtypes = (cls.TessBaseAPI, # handle
                                                       ctypes.c_int)    # ppi

        # Publish the library only when every prototype is configured, so a
        # failure above cannot leave a half-configured library behind.
        cls._lib = lib



    def _read_text_bytes(self):
        """
        Copies the recognized text and releases the library-owned buffer
        ----------------------------------------------------------------
        @Return: (bytes) Copy of the text, or None on NULL pointer.
        @Raises: PyTessyError   If the handler is not set up.
        """

        self._check_setup()
        text_pointer = self._lib.TessBaseAPIGetUTF8Text(self._api)
        if not text_pointer:
            return None
        try:
            return ctypes.string_at(text_pointer)
        finally:
            self._lib.TessDeleteText(text_pointer)



    def _check_setup(self):
        """
        Checks whether Tesseract-OCR is set up or not
        ---------------------------------------------
        @Raises: PyTessyError   If library handler not yet configured.
                 PyTessyError   If api handler not yet configured.
        """

        if not self._lib:
            raise PyTessyError('Tesseract handler library not configured.')
        if not self._api:
            raise PyTessyError('Tesseract handler api not created.')



    def __del__(self):
        """
        Disconnects TessBaseAPI when instance is deleted
        ------------------------------------------------
        """

        if not self._lib or not self._api:
            return
        if not getattr(self, 'closed', False):
            self._lib.TessBaseAPIDelete(self._api)
            self.closed = True



class PyTessy(object):
    """
    PyTessy
    -------
    Provides user-friendly and fast Tesseract-OCR interface.
    """

    DEFAULT_HORIZONTAL_DPI = 96
    TESSDATA_DIRNAME = 'tessdata'
    TESSERACT_DIRNAME = 'Tesseract-OCR'
    TESSERACT_DEFAULT_HORIZONTAL_DPI = 70
    VERSION = '0.0.1'



    def __init__(self, tesseract_path=None, api_version=None, lib_path=None,
                 data_path=None, language='eng', verbose_search=False):
        """
        Initializes PyTessy instance
        ----------------------------
        @Params: tesseract_path (string)    [optional] Path (directory's name)
                                            to Tesseract-OCR library.
                 api_version    (string)    [optional] Api version suffix string
                                            (should be compatible with
                                            Tesseract-OCR 3).
                 lib_path       (string)    [optional] Exact path to the
                                            Tesseract-OCR library.
                 data_path      (string)    [optional] Path (directory's name)
                                            to data directory (usually "tessdata").
                 language       (string)    [optional] Language code to use.
                 verbose_search (boolean)   [optional] Whether to display
                                            library searching process or not.
        @Raises: NotImplementedError    If the operating system is not
                                        implemented yet (linux, macOS).
                                        You can avoid this error by giving
                                        exact path of Tesseract-OCR library.
                 NotImplementedError    If the operating system will be
                                        never implemented.
                                        You can avoid this error by giving
                                        exact path of Tesseract-OCR library.
                 FileNotFoundError      If the given exact library path
                                        doesn't point to existing file.
                 FileNotFoundError      If failed to find library with
                                        search process.
                 FileNotFoundError      If cannot find "tessdata" directory.

        The working directory is switched to the library's directory only
        while the library is loaded and is restored afterwards.
        """

        # __main__ has no __file__ in interactive sessions.
        main_file = getattr(__main__, '__file__', None)
        run_path = dirname(abspath(main_file)) if main_file else getcwd()
        if lib_path is None:
            lib_path = self._search_library(run_path, tesseract_path,
                                            api_version, verbose_search)
        elif not isfile(lib_path):
            raise FileNotFoundError('PyTessy: lib_path: "{}" doesn\'t exist.'
                                    .format(lib_path))
        # Resolve relative paths now: they are used after chdir() below.
        lib_path = abspath(lib_path)
        tess_path = dirname(lib_path)
        data_path = self._locate_tessdata(data_path, run_path, tess_path)
        previous_dir = getcwd()
        chdir(tess_path)
        try:
            self._tess = TesseractHandler(lib_path=lib_path, data_path=data_path,
                                          language=language)
        finally:
            chdir(previous_dir)



    @staticmethod
    def _search_library(run_path, tesseract_path, api_version, verbose_search):
        """
        Searches the Tesseract-OCR library in the usual Windows locations
        -----------------------------------------------------------------
        @Params: run_path       (string)    Directory of the running script.
                 tesseract_path (string)    Extra directory to try first or None.
                 api_version    (string)    Library name suffix or None.
                 verbose_search (boolean)   Whether to print every attempt.
        @Return: (string) Path of the library found.
        @Raises: NotImplementedError    If the platform is not Windows.
                 FileNotFoundError      If no candidate exists.
        """

        if platform.startswith('linux'):
            raise NotImplementedError('PyTessy: Library search on Linux is not implemented yet.')
        if platform.startswith('darwin'):
            raise NotImplementedError('PyTessy: Library search on MacOS is not implemented yet.')
        if not platform.startswith('win'):
            raise NotImplementedError('PyTessy: Library search on this system is not implemented.')
        verbose = print if verbose_search else (lambda *pa, **pk: None)
        verbose('PyTessy v{} on {} searching for Tesseract-OCR library...'
                .format(PyTessy.VERSION, platform))
        if api_version is None:
            lib_name = 'libtesseract-5'
        else:
            lib_name = 'libtesseract{}'.format(api_version)
        verbose('--- Target library name: {}'.format(lib_name))
        candidates = [] if tesseract_path is None else [tesseract_path]
        candidates.extend([run_path, join(run_path, PyTessy.TESSERACT_DIRNAME)])
        for variable in ('PROGRAMFILES', 'PROGRAMFILES(X86)'):
            if variable in environ:
                candidates.append(join(environ[variable],
                                       PyTessy.TESSERACT_DIRNAME))
        for directory in candidates:
            test = join(directory, '{}.dll'.format(lib_name))
            if isfile(test):
                verbose(' {} SUCCESS.'.format(test))
                return test
            verbose(' {} FAILED.'.format(test))
        raise FileNotFoundError('Cannot locate Tesseract-OCR library.')



    @staticmethod
    def _locate_tessdata(data_path, run_path, tess_path):
        """
        Validates the given data directory or searches for "tessdata"
        -------------------------------------------------------------
        @Params: data_path  (string)    Data directory given by the user or None.
                 run_path   (string)    Directory of the running script.
                 tess_path  (string)    Directory of the Tesseract-OCR library.
        @Return: (string) Absolute path of the data directory.
        @Raises: FileNotFoundError      If neither the given directory nor any
                                        searched location exists.
        """

        if data_path is not None and isdir(data_path):
            if not isabs(data_path):
                # join() instead of abspath() keeps a trailing separator.
                data_path = join(getcwd(), data_path)
            return data_path
        for base in (run_path, join(run_path, PyTessy.TESSERACT_DIRNAME),
                     tess_path):
            test_path = join(base, PyTessy.TESSDATA_DIRNAME)
            if isdir(test_path):
                return test_path
        raise FileNotFoundError('PyTessy: Couldn\'t find "tessdata" directory.')



    def justread(self, raw_image_ctypes, width, height, bytes_per_pixel,
                 bytes_per_line, resolution=96):
        """
        Reads text as utf-8 string from raw image data without any check
        ----------------------------------------------------------------
        @Params: raw_image_ctypes   (ctypes int array)  Raw image data.
                 width              (int)               Image width.
                 height             (int)               Image height.
                 bytes_per_pixel    (int)               Number of bytes per pixel.
                 bytes_per_line     (int)               Number of bytes per line.
                 resolution         (int)               [optional] Resolution in
                                                        dpi. Default: 96.
        @Return: (string)                               Text read by Tesseract-OCR
                                                        as utf-8 string.
        """

        self._tess.set_image(raw_image_ctypes, width, height, bytes_per_pixel,
                             bytes_per_line, resolution)
        return self._tess.get_text()



    def justread_raw(self, raw_image_ctypes, width, height, bytes_per_pixel,
                     bytes_per_line, resolution=96):
        """
        Reads text as raw bytes data from raw image data without any check
        ------------------------------------------------------------------
        @Params: raw_image_ctypes   (ctypes int array)  Raw image data.
                 width              (int)               Image width.
                 height             (int)               Image height.
                 bytes_per_pixel    (int)               Number of bytes per pixel.
                 bytes_per_line     (int)               Number of bytes per line.
                 resolution         (int)               [optional] Resolution in
                                                        dpi. Default: 96.
        @Return: (bytes)                                Text read by Tesseract-OCR
                                                        as raw bytes data.
        """

        self._tess.set_image(raw_image_ctypes, width, height, bytes_per_pixel,
                             bytes_per_line, resolution)
        # Raw mode must use the raw getter; get_text() would return a string.
        return self._tess.get_text_raw()



    def read(self, imagedata, width, height, bytes_per_pixel, resolution=96,
             raw=False):
        """
        Reads text from image data
        --------------------------
        @Params: imagedata          (ctypes int array)  Raw image data.
                 width              (int)               Image width.
                 height             (int)               Image height.
                 bytes_per_pixel    (int)               Number of bytes per pixel.
                 resolution         (int)               [optional] Resolution in
                                                        dpi. Default: 96.
                 raw                (boolean)           [optional] Whether to read
                                                        in raw or utf-8 mode.
        @Return: (bytes) or (string)                    Text read by Tesseract-OCR
        """

        # Lines are assumed to be tightly packed (no row padding).
        bytes_per_line = width * bytes_per_pixel
        reader = self.justread_raw if raw else self.justread
        return reader(imagedata, width, height, bytes_per_pixel,
                      bytes_per_line, resolution)



if __name__ == '__main__':
    print('This is a module not a script.')