├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README ├── Vagrantfile ├── docs ├── Makefile ├── conf.py ├── index.rst └── make.bat ├── install.sh ├── setup.py ├── tesserwrap ├── __init__.py ├── core.py └── cpp │ ├── tesseract_ext.cpp │ ├── tesseract_ext.h │ ├── tesseract_wrap.cpp │ └── tesseract_wrap.h ├── tests ├── test_tesserwrap.py └── util.py └── tox.ini /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | tesserwrap.egg-info 3 | *.pyc 4 | *.swp 5 | README.pdf 6 | dist 7 | *.egg 8 | *.egg* 9 | *.o 10 | *.so 11 | .vagrant 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | ** Licensed under the Apache License, Version 2.0 (the "License"); 3 | ** you may not use this file except in compliance with the License. 4 | ** You may obtain a copy of the License at 5 | ** http://www.apache.org/licenses/LICENSE-2.0 6 | ** Unless required by applicable law or agreed to in writing, software 7 | ** distributed under the License is distributed on an "AS IS" BASIS, 8 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | ** See the License for the specific language governing permissions and 10 | ** limitations under the License. 11 | 12 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include tesserwrap *.cpp *.h *.py 2 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Tesserwrap - Basic Tesseract API Wrapper for Python 2 | 3 | Tesserwrap is a project that allows simple bindings to Tesseract's API rather than executing the application manually each time. 
4 | 5 | Docs: https://tesserwrap.readthedocs.org/en/latest/ 6 | IRC: #tesserwrap on Freenode 7 | -------------------------------------------------------------------------------- /Vagrantfile: -------------------------------------------------------------------------------- 1 | # -*- mode: ruby -*- 2 | # vi: set ft=ruby : 3 | 4 | # Vagrantfile API/syntax version. Don't touch unless you know what you're doing! 5 | VAGRANTFILE_API_VERSION = "2" 6 | 7 | Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| 8 | config.vm.box = "hashicorp/precise64" 9 | config.vm.provision :shell, :path => "install.sh" 10 | end 11 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make <target>' where <target> is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " singlehtml to make a single large HTML file" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 31 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 32 | @echo " text to make text files" 33 | @echo " man to make manual pages" 34 | @echo " texinfo to make Texinfo files" 35 | @echo " info to make Texinfo files and run them through makeinfo" 36 | @echo " gettext to make PO message catalogs" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | 41 | clean: 42 | -rm -rf $(BUILDDIR)/* 43 | 44 | html: 45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 48 | 49 | dirhtml: 50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 51 | @echo 52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 53 | 54 | singlehtml: 55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 56 | @echo 57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Tesserwrap.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Tesserwrap.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/Tesserwrap" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Tesserwrap" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 
108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 
154 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Tesserwrap documentation build configuration file, created by 5 | # sphinx-quickstart on Tue Mar 19 21:24:47 2013. 6 | # 7 | # This file is execfile()d with the current directory set to its containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import os 16 | import sys 17 | 18 | sys.path.append(os.path.abspath("..")) 19 | 20 | class Mock(object): 21 | def __init__(self, *args, **kwargs): 22 | pass 23 | 24 | def __call__(self, *args, **kwargs): 25 | return Mock() 26 | 27 | @classmethod 28 | def __getattr__(cls, name): 29 | if name in ('__file__', '__path__'): 30 | return '/dev/null' 31 | elif name[0] == name[0].upper(): 32 | mockType = type(name, (), {}) 33 | mockType.__module__ = __name__ 34 | return mockType 35 | else: 36 | return Mock() 37 | 38 | sys.modules['tesserwrap.core'] = Mock() 39 | 40 | import tesserwrap 41 | 42 | # If extensions (or modules to document with autodoc) are in another directory, 43 | # add these directories to sys.path here. If the directory is relative to the 44 | # documentation root, use os.path.abspath to make it absolute, like shown here. 45 | #sys.path.insert(0, os.path.abspath('.')) 46 | 47 | # -- General configuration ----------------------------------------------------- 48 | 49 | # If your documentation needs a minimal Sphinx version, state it here. 50 | #needs_sphinx = '1.0' 51 | 52 | # Add any Sphinx extension module names here, as strings. They can be extensions 53 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 
54 | extensions = ['sphinx.ext.intersphinx', 'sphinx.ext.autodoc', 'sphinx.ext.coverage'] 55 | 56 | # Add any paths that contain templates here, relative to this directory. 57 | templates_path = ['_templates'] 58 | 59 | # The suffix of source filenames. 60 | source_suffix = '.rst' 61 | 62 | # The encoding of source files. 63 | #source_encoding = 'utf-8-sig' 64 | 65 | # The master toctree document. 66 | master_doc = 'index' 67 | 68 | # General information about the project. 69 | project = 'Tesserwrap' 70 | copyright = '2011 - 2013, Greg Jurman, et al' 71 | 72 | # The version info for the project you're documenting, acts as replacement for 73 | # |version| and |release|, also used in various other places throughout the 74 | # built documents. 75 | # 76 | # The short X.Y version. 77 | version = '0.1.1' 78 | # The full version, including alpha/beta/rc tags. 79 | release = '0.1.1' 80 | 81 | # The language for content autogenerated by Sphinx. Refer to documentation 82 | # for a list of supported languages. 83 | #language = None 84 | 85 | # There are two options for replacing |today|: either, you set today to some 86 | # non-false value, then it is used: 87 | #today = '' 88 | # Else, today_fmt is used as the format for a strftime call. 89 | #today_fmt = '%B %d, %Y' 90 | 91 | # List of patterns, relative to source directory, that match files and 92 | # directories to ignore when looking for source files. 93 | exclude_patterns = ['_build'] 94 | 95 | # The reST default role (used for this markup: `text`) to use for all documents. 96 | #default_role = None 97 | 98 | # If true, '()' will be appended to :func: etc. cross-reference text. 99 | #add_function_parentheses = True 100 | 101 | # If true, the current module name will be prepended to all description 102 | # unit titles (such as .. function::). 103 | #add_module_names = True 104 | 105 | # If true, sectionauthor and moduleauthor directives will be shown in the 106 | # output. They are ignored by default. 
107 | #show_authors = False 108 | 109 | # The name of the Pygments (syntax highlighting) style to use. 110 | pygments_style = 'sphinx' 111 | 112 | # A list of ignored prefixes for module index sorting. 113 | #modindex_common_prefix = [] 114 | 115 | 116 | # -- Options for HTML output --------------------------------------------------- 117 | 118 | # The theme to use for HTML and HTML Help pages. See the documentation for 119 | # a list of builtin themes. 120 | html_theme = 'default' 121 | 122 | # Theme options are theme-specific and customize the look and feel of a theme 123 | # further. For a list of options available for each theme, see the 124 | # documentation. 125 | #html_theme_options = {} 126 | 127 | # Add any paths that contain custom themes here, relative to this directory. 128 | #html_theme_path = [] 129 | 130 | # The name for this set of Sphinx documents. If None, it defaults to 131 | # " v documentation". 132 | #html_title = None 133 | 134 | # A shorter title for the navigation bar. Default is the same as html_title. 135 | #html_short_title = None 136 | 137 | # The name of an image file (relative to this directory) to place at the top 138 | # of the sidebar. 139 | #html_logo = None 140 | 141 | # The name of an image file (within the static path) to use as favicon of the 142 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 143 | # pixels large. 144 | #html_favicon = None 145 | 146 | # Add any paths that contain custom static files (such as style sheets) here, 147 | # relative to this directory. They are copied after the builtin static files, 148 | # so a file named "default.css" will overwrite the builtin "default.css". 149 | html_static_path = ['_static'] 150 | 151 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 152 | # using the given strftime format. 
153 | #html_last_updated_fmt = '%b %d, %Y' 154 | 155 | # If true, SmartyPants will be used to convert quotes and dashes to 156 | # typographically correct entities. 157 | #html_use_smartypants = True 158 | 159 | # Custom sidebar templates, maps document names to template names. 160 | #html_sidebars = {} 161 | 162 | # Additional templates that should be rendered to pages, maps page names to 163 | # template names. 164 | #html_additional_pages = {} 165 | 166 | # If false, no module index is generated. 167 | #html_domain_indices = True 168 | 169 | # If false, no index is generated. 170 | #html_use_index = True 171 | 172 | # If true, the index is split into individual pages for each letter. 173 | #html_split_index = False 174 | 175 | # If true, links to the reST sources are added to the pages. 176 | #html_show_sourcelink = True 177 | 178 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 179 | #html_show_sphinx = True 180 | 181 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 182 | #html_show_copyright = True 183 | 184 | # If true, an OpenSearch description file will be output, and all pages will 185 | # contain a tag referring to it. The value of this option must be the 186 | # base URL from which the finished HTML is served. 187 | #html_use_opensearch = '' 188 | 189 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 190 | #html_file_suffix = None 191 | 192 | # Output file base name for HTML help builder. 193 | htmlhelp_basename = 'Tesserwrapdoc' 194 | 195 | 196 | # -- Options for LaTeX output -------------------------------------------------- 197 | 198 | latex_elements = { 199 | # The paper size ('letterpaper' or 'a4paper'). 200 | #'papersize': 'letterpaper', 201 | 202 | # The font size ('10pt', '11pt' or '12pt'). 203 | #'pointsize': '10pt', 204 | 205 | # Additional stuff for the LaTeX preamble. 206 | #'preamble': '', 207 | } 208 | 209 | # Grouping the document tree into LaTeX files. 
List of tuples 210 | # (source start file, target name, title, author, documentclass [howto/manual]). 211 | latex_documents = [ 212 | ('index', 'Tesserwrap.tex', 'Tesserwrap Documentation', 213 | 'Greg Jurman, et al', 'manual'), 214 | ] 215 | 216 | # The name of an image file (relative to this directory) to place at the top of 217 | # the title page. 218 | #latex_logo = None 219 | 220 | # For "manual" documents, if this is true, then toplevel headings are parts, 221 | # not chapters. 222 | #latex_use_parts = False 223 | 224 | # If true, show page references after internal links. 225 | #latex_show_pagerefs = False 226 | 227 | # If true, show URL addresses after external links. 228 | #latex_show_urls = False 229 | 230 | # Documents to append as an appendix to all manuals. 231 | #latex_appendices = [] 232 | 233 | # If false, no module index is generated. 234 | #latex_domain_indices = True 235 | 236 | 237 | # -- Options for manual page output -------------------------------------------- 238 | 239 | # One entry per manual page. List of tuples 240 | # (source start file, name, description, authors, manual section). 241 | man_pages = [ 242 | ('index', 'tesserwrap', 'Tesserwrap Documentation', 243 | ['Greg Jurman, et al'], 1) 244 | ] 245 | 246 | # If true, show URL addresses after external links. 247 | #man_show_urls = False 248 | 249 | 250 | # -- Options for Texinfo output ------------------------------------------------ 251 | 252 | # Grouping the document tree into Texinfo files. List of tuples 253 | # (source start file, target name, title, author, 254 | # dir menu entry, description, category) 255 | texinfo_documents = [ 256 | ('index', 'Tesserwrap', 'Tesserwrap Documentation', 257 | 'Greg Jurman, et al', 'Tesserwrap', 'One line description of project.', 258 | 'Miscellaneous'), 259 | ] 260 | 261 | # Documents to append as an appendix to all manuals. 262 | #texinfo_appendices = [] 263 | 264 | # If false, no module index is generated. 
265 | #texinfo_domain_indices = True 266 | 267 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 268 | #texinfo_show_urls = 'footnote' 269 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. module:: tesserwrap 2 | 3 | Welcome to Tesserwrap's documentation! 4 | ====================================== 5 | 6 | Tesserwrap is a ctypes/capi wrapper for 7 | `Tesseract OCR `_. 8 | 9 | .. Contents: 10 | 11 | .. .. toctree:: 12 | :maxdepth: 2 13 | 14 | .. autoclass:: Tesseract 15 | :members: 16 | 17 | 18 | Indices and tables 19 | ================== 20 | 21 | * :ref:`genindex` 22 | * :ref:`modindex` 23 | * :ref:`search` 24 | 25 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^<target^>` where ^<target^> is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo.
latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. linkcheck to check all external links for integrity 37 | echo. doctest to run all doctests embedded in the documentation if enabled 38 | goto end 39 | ) 40 | 41 | if "%1" == "clean" ( 42 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 43 | del /q /s %BUILDDIR%\* 44 | goto end 45 | ) 46 | 47 | if "%1" == "html" ( 48 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 49 | if errorlevel 1 exit /b 1 50 | echo. 51 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 52 | goto end 53 | ) 54 | 55 | if "%1" == "dirhtml" ( 56 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 57 | if errorlevel 1 exit /b 1 58 | echo. 59 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 60 | goto end 61 | ) 62 | 63 | if "%1" == "singlehtml" ( 64 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 68 | goto end 69 | ) 70 | 71 | if "%1" == "pickle" ( 72 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished; now you can process the pickle files. 76 | goto end 77 | ) 78 | 79 | if "%1" == "json" ( 80 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished; now you can process the JSON files. 84 | goto end 85 | ) 86 | 87 | if "%1" == "htmlhelp" ( 88 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can run HTML Help Workshop with the ^ 92 | .hhp project file in %BUILDDIR%/htmlhelp. 
93 | goto end 94 | ) 95 | 96 | if "%1" == "qthelp" ( 97 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 98 | if errorlevel 1 exit /b 1 99 | echo. 100 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 101 | .qhcp project file in %BUILDDIR%/qthelp, like this: 102 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\Tesserwrap.qhcp 103 | echo.To view the help file: 104 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Tesserwrap.ghc 105 | goto end 106 | ) 107 | 108 | if "%1" == "devhelp" ( 109 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 110 | if errorlevel 1 exit /b 1 111 | echo. 112 | echo.Build finished. 113 | goto end 114 | ) 115 | 116 | if "%1" == "epub" ( 117 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 118 | if errorlevel 1 exit /b 1 119 | echo. 120 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 121 | goto end 122 | ) 123 | 124 | if "%1" == "latex" ( 125 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 129 | goto end 130 | ) 131 | 132 | if "%1" == "text" ( 133 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The text files are in %BUILDDIR%/text. 137 | goto end 138 | ) 139 | 140 | if "%1" == "man" ( 141 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 145 | goto end 146 | ) 147 | 148 | if "%1" == "texinfo" ( 149 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 150 | if errorlevel 1 exit /b 1 151 | echo. 152 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 153 | goto end 154 | ) 155 | 156 | if "%1" == "gettext" ( 157 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 158 | if errorlevel 1 exit /b 1 159 | echo. 160 | echo.Build finished. 
The message catalogs are in %BUILDDIR%/locale. 161 | goto end 162 | ) 163 | 164 | if "%1" == "changes" ( 165 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 166 | if errorlevel 1 exit /b 1 167 | echo. 168 | echo.The overview file is in %BUILDDIR%/changes. 169 | goto end 170 | ) 171 | 172 | if "%1" == "linkcheck" ( 173 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 174 | if errorlevel 1 exit /b 1 175 | echo. 176 | echo.Link check complete; look for any errors in the above output ^ 177 | or in %BUILDDIR%/linkcheck/output.txt. 178 | goto end 179 | ) 180 | 181 | if "%1" == "doctest" ( 182 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 183 | if errorlevel 1 exit /b 1 184 | echo. 185 | echo.Testing of doctests in the sources finished, look at the ^ 186 | results in %BUILDDIR%/doctest/output.txt. 187 | goto end 188 | ) 189 | 190 | :end 191 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | apt-get update 2 | apt-get install -y make 3 | apt-get install -y g++ 4 | apt-get install -y tesseract-ocr 5 | apt-get install -y libtesseract-dev 6 | # http://stackoverflow.com/questions/4011705/python-the-imagingft-c-module-is-not-installed 7 | apt-get install -y libfreetype6-dev 8 | apt-get install -y python-pip 9 | apt-get install -y python-dev 10 | 11 | pip install pillow nose Sphinx -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from setuptools import setup, Extension 4 | 5 | import multiprocessing 6 | import distutils.util as du_util 7 | 8 | 9 | # Library locator function 10 | # Looks to see which library is available to link against 11 | def check_lib_by_name(lib_name, search_path=None): 12 | s_path = "" 13 | platform_opts = "" 14 | if search_path: 
15 | for path in search_path: 16 | s_path = s_path + "-L%s" % path 17 | 18 | # OSX specific (From: jmel - Tesserwrap: #11) 19 | if "macosx" in du_util.get_platform(): 20 | platform_opts = "-arch x86_64 -execute -macosx_version_min 10.6 -pie -lm -lpthread -lcrt1.o" 21 | 22 | return os.system('ld %s %s -l%s' % (s_path, platform_opts, lib_name)) == 0 23 | 24 | 25 | def find_closest_libname(lib_names, search_path=None): 26 | for lib_name in lib_names: 27 | if check_lib_by_name(lib_name, search_path): 28 | return lib_name 29 | raise Exception( 30 | "Cannot find Tesseract via ldconfig, confirm it is installed.") 31 | 32 | 33 | # Utility function to read the README file. 34 | # Used for the long_description. It's nice, because now 1) we have a top level 35 | # README file and 2) it's easier to type in the README file than to put a raw 36 | # string in below ... 37 | def read(fname): 38 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 39 | 40 | extra_lib_paths = ['/usr/local/lib'] 41 | 42 | tesseract_possible_names = ['tesseract_api', 'tesseract'] 43 | tesseract_lib_name = find_closest_libname( 44 | tesseract_possible_names, 45 | extra_lib_paths) 46 | 47 | tesser_cpp = Extension( 48 | 'libtesserwrap', 49 | include_dirs=['/usr/local/include'], 50 | libraries=[tesseract_lib_name], 51 | library_dirs=extra_lib_paths, 52 | sources=[ 53 | 'tesserwrap/cpp/tesseract_ext.cpp', 54 | 'tesserwrap/cpp/tesseract_wrap.cpp'], 55 | depends=[ 56 | 'tesserwrap/cpp/tesseract_wrap.cpp', 57 | 'tesserwrap/cpp/tesseract_wrap.h', 58 | 'tesserwrap/cpp/tesseract_ext.cpp', 59 | 'tesserwrap/cpp/tesseract_ext.h' 60 | ], 61 | ) 62 | 63 | if os.environ.get('READTHEDOCS', None) == 'True': 64 | extensions = None 65 | else: 66 | extensions = [tesser_cpp] 67 | 68 | setup( 69 | name="tesserwrap", 70 | version="0.1.6", 71 | author="Greg Jurman, and others", 72 | author_email="gdj2214@rit.edu", 73 | description=("Basic python bindings to the Tesseract C++ API"), 74 | license="Apache License 
2.0", 75 | keywords="tesseract ocr cpp", 76 | url="https://github.com/gregjurman/tesserwrap", 77 | packages=['tesserwrap'], 78 | zip_safe=False, 79 | ext_modules=extensions, 80 | long_description=read('README'), 81 | tests_require=['nose', 'Pillow'], 82 | test_suite="nose.collector", 83 | classifiers=[ 84 | "Topic :: Scientific/Engineering :: Image Recognition", 85 | "License :: OSI Approved :: Apache Software License", 86 | "Development Status :: 3 - Alpha", 87 | "Topic :: Software Development :: Libraries :: Python Modules", 88 | 'Programming Language :: Python :: 2.7', 89 | 'Programming Language :: Python :: 3.3', 90 | ], 91 | ) 92 | -------------------------------------------------------------------------------- /tesserwrap/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import tr 2 | from ctypes import c_ulonglong, byref 3 | from collections import namedtuple 4 | import sys 5 | import warnings 6 | 7 | 8 | __all__ = ["PageSegMode", "PageIteratorLevel", "Tesseract"] 9 | 10 | 11 | class PageSegMode(object): 12 | PSM_OSD_ONLY = 0 13 | PSM_AUTO_OSD = 1 14 | PSM_AUTO_ONLY = 2 15 | PSM_AUTO = 3 16 | PSM_SINGLE_COLUMN = 4 17 | PSM_SINGLE_BLOCK_VERT_TEXT = 5 18 | PSM_SINGLE_BLOCK = 6 19 | PSM_SINGLE_LINE = 7 20 | PSM_SINGLE_WORD = 8 21 | PSM_CIRCLE_WORD = 9 22 | PSM_SINGLE_CHAR = 10 23 | 24 | class PageIteratorLevel(object): 25 | RIL_BLOCK = 0 26 | RIL_PARA = 1 27 | RIL_TEXTLINE = 2 28 | RIL_WORD = 3 29 | RIL_SYMBOL = 4 30 | 31 | 32 | class Tesseract(object): 33 | """Tesseract OCR object. 34 | 35 | :param datadir: 36 | Tesseract data-directory with Tesseract training data. 37 | 38 | :param lang: 39 | The language of the image(s) to be OCRed. 
40 | 41 | A simple example:: 42 | 43 | >>> from tesserwrap import Tesseract 44 | >>> from PIL import Image 45 | 46 | >>> img = Image.open("test.png") 47 | >>> tr = Tesseract() 48 | >>> tr.ocr_image(img) 49 | 'The quick brown fox jumps ove\\n\\n' 50 | """ 51 | 52 | def __init__(self, datadir="", lang="eng"): 53 | """Initialize a new Tesseract object 54 | 55 | """ 56 | self.handle = tr.Tesserwrap_Init( 57 | bytes(datadir, "ascii") if sys.version[:3] >= '3.2' else datadir, 58 | bytes(lang, "ascii") if sys.version[:3] >= '3.2' else lang 59 | ) 60 | 61 | def __del__(self): 62 | try: 63 | if self.handle and core: 64 | tr.Tesserwrap_Destroy(self.handle) 65 | self.handle = None 66 | except AttributeError: 67 | print("__del__ without handle release") 68 | pass 69 | 70 | def set_image(self, image): 71 | '''Takes a PIL Image and loads it into Tesseract for further 72 | operations. 73 | 74 | Note:: This function will automatically convert the image to 75 | Grayscale. 76 | 77 | :param image: image 78 | Image to use in tesseract. 79 | ''' 80 | if image.mode != "L": 81 | image = image.convert("L") 82 | 83 | if hasattr(image, "tobytes"): 84 | img_bytes = image.tobytes() 85 | else: 86 | img_bytes = image.tostring() 87 | 88 | tr.Tesserwrap_SetImage( 89 | self.handle, 90 | img_bytes, # Image data 91 | len(img_bytes), # size of buffer 92 | image.size[0], # Width 93 | image.size[1]) # Height 94 | 95 | def get_text(self): 96 | """Get the text of the OCR'd image as a byte-string 97 | """ 98 | return tr.Tesserwrap_GetUTF8Text(self.handle) 99 | 100 | def get_utf8_text(self): 101 | """Get the text of the OCR'd image as a string. 102 | 103 | This function is kept for backwards compatability with the 0.0 104 | version of tesserwrap. 105 | """ 106 | return self.get_text().decode(encoding="UTF-8") 107 | 108 | def ocr_image(self, image): 109 | """OCR an image returning the UTF8 text data. 110 | 111 | :param image: image 112 | Image to be OCR'd by tesseract. 
113 | """ 114 | self.set_image(image) 115 | self.set_page_seg_mode(PageSegMode.PSM_SINGLE_BLOCK) 116 | return self.get_utf8_text() 117 | 118 | def get_rectangle(self): 119 | """Get the bounding rectangle that tesseract is looking at inside 120 | of the image. 121 | """ 122 | left, top = c_ulonglong(), c_ulonglong() 123 | width, height = c_ulonglong(), c_ulonglong() 124 | tr.Tesserwrap_GetRectangle( 125 | self.handle, 126 | byref(left), byref(top), 127 | byref(width), byref(height)) 128 | return ((left.value, top.value), (width.value, height.value)) 129 | 130 | def set_rectangle(self, left, top, width, height): 131 | """Set the OCR detection bounding-box. 132 | 133 | :param left: integer 134 | Pixels offset right from left of the image. 135 | 136 | :param top: integer 137 | Pixels offset down from the top of the image. 138 | 139 | :param width: integer 140 | Width of the bounding-box. 141 | 142 | :param height: integer 143 | Height of the bounding-box. 144 | """ 145 | tr.Tesserwrap_SetRectangle( 146 | self.handle, 147 | left, top, 148 | width, height) 149 | 150 | def get_page_seg_mode(self): 151 | """Returns the page analysis mode from Tesseract""" 152 | return tr.Tesserwrap_GetPageSegMode(self.handle) 153 | 154 | def set_page_seg_mode(self, mode=PageSegMode.PSM_SINGLE_BLOCK): 155 | """Set the page layout analysis mode. 156 | 157 | :param mode: integer 158 | The page layout analysis mode. See PageSegMode class for options 159 | """ 160 | tr.Tesserwrap_SetPageSegMode(self.handle, mode) 161 | 162 | def clear(self): 163 | """Clear the tesseract Image, and clean up any Tesseract run-data.""" 164 | tr.Tesserwrap_Clear(self.handle) 165 | 166 | def set_variable(self, key, value): 167 | """Set an internal Tesseract variable. 168 | 169 | :param key: str 170 | Variable name to change. 171 | 172 | :param value: str 173 | New variable value. 
174 | 175 | """ 176 | tr.Tesserwrap_SetVariable( 177 | self.handle, 178 | bytes(key, "ascii") if sys.version[:3] >= '3.2' else key, 179 | bytes(value, "ascii") if sys.version[:3] >= '3.2' else value 180 | ) 181 | 182 | def get_mean_confidence(self): 183 | """Returns the (average) confidence value between 0 and 100. 184 | """ 185 | return tr.Tesserwrap_MeanTextConf(self.handle) 186 | 187 | def get_all_word_confidences(self): 188 | node = tr.Tesserwrap_AllWordConfidences(self.handle) 189 | result = [] 190 | 191 | while bool(node): 192 | result.append(node.contents.value) 193 | node = node.contents.next 194 | 195 | return result 196 | 197 | def get_result(self, level): 198 | node = tr.Tesserwrap_GetResult(self.handle, level) 199 | result = [] 200 | Item = namedtuple('Item', ['value', 'confidence', 'box']) 201 | 202 | while bool(node): 203 | item = Item( 204 | value=node.contents.value, 205 | confidence=node.contents.confidence, 206 | box = tuple(node.contents.box) 207 | ) 208 | result.append(item) 209 | node = node.contents.next 210 | 211 | return result 212 | 213 | def get_words(self): 214 | """Get a list containing all the words in the OCR'd image. 215 | :returns: A list containing objects with the attributes: 216 | value: the string value of the word 217 | box: left, upper, right, and lower pixel coordinate 218 | confidence: confidence value between 0 and 100 219 | """ 220 | return self.get_result(PageIteratorLevel.RIL_WORD) 221 | 222 | def get_symbols(self): 223 | """Get a list containing all symbols in the OCR'd image. 224 | :returns: A list containing objects with the attributes: 225 | value: the string value of the symbol 226 | box: left, upper, right, and lower pixel coordinate 227 | confidence: confidence value between 0 and 100 228 | """ 229 | return self.get_result(PageIteratorLevel.RIL_SYMBOL) 230 | 231 | def get_textlines(self): 232 | """Get a list containing all lines in the OCR'd image. 
def get_shared_lib_extension(is_python_ext=False):
    """Return the correct file extension for shared libraries.

    Parameters
    ----------
    is_python_ext : bool, optional
        Whether the shared library is a Python extension.  Default is False.

    Returns
    -------
    so_ext : str
        The shared library extension.

    Notes
    -----
    For Python shared libs, `so_ext` will typically be '.so' on Linux and OS X,
    and '.pyd' on Windows.  For Python >= 3.2 `so_ext` has a tag prepended on
    POSIX systems according to PEP 3149; that tag is stripped again unless
    `is_python_ext` is true.

    """
    # Imported locally: ``sysconfig`` replaces ``distutils.sysconfig``, so this
    # function no longer depends on the module-level distutils import.
    import sysconfig

    # 'EXT_SUFFIX' is the current name of this setting; 'SO' is only consulted
    # as a fallback for interpreters that do not define 'EXT_SUFFIX'.
    so_ext = (sysconfig.get_config_var('EXT_SUFFIX')
              or sysconfig.get_config_var('SO')
              or '')

    if not is_python_ext:
        # Strip the PEP 3149 ABI tag.  SOABI may be missing or None, in which
        # case there is nothing to strip.
        soabi = sysconfig.get_config_var('SOABI')
        if soabi:
            so_ext = so_ext.replace('.' + soabi, '', 1)

    return so_ext


def load_library(libname, loader_path):
    """Load a shared library via ctypes from the parent of `loader_path`.

    The ABI-tagged file name (PEP 3149) is tried first when it differs from
    the plain one, then the untagged name.

    Parameters
    ----------
    libname : str
        Library file name without extension.
    loader_path : str
        Directory of the calling package; the library is looked up in the
        directory above it.

    Returns
    -------
    ctypes.CDLL
        The loaded library.

    Raises
    ------
    OSError
        If none of the candidate files could be loaded; the error from the
        last attempt is re-raised.

    """
    plain_ext = get_shared_lib_extension()
    tagged_ext = get_shared_lib_extension(is_python_ext=True)

    candidates = [libname + plain_ext]
    # No interpreter version check is needed: where no tag exists the two
    # extensions are equal and only one candidate is tried.
    if tagged_ext != plain_ext:
        candidates.insert(0, libname + tagged_ext)

    loader_path = os.path.abspath(os.path.join(loader_path, os.pardir))

    # Keep the exception in a separate name; ``as e`` is unbound again when
    # the except block exits on Python 3 (PEP 3110).
    exc = None
    for candidate in candidates:
        try:
            return cdll.LoadLibrary(os.path.join(loader_path, candidate))
        except OSError as e:
            exc = e
    raise exc
[c_void_p] 106 | 107 | tr.Tesserwrap_GetPageSegMode.restype = c_int 108 | tr.Tesserwrap_GetPageSegMode.argtypes = [c_void_p] 109 | 110 | tr.Tesserwrap_SetPageSegMode.restype = None 111 | tr.Tesserwrap_SetPageSegMode.argtypes = [c_void_p, c_int] 112 | 113 | tr.Tesserwrap_Clear.restype = None 114 | tr.Tesserwrap_Clear.argtypes = [c_void_p] 115 | 116 | tr.Tesserwrap_SetVariable.restype = None 117 | tr.Tesserwrap_SetVariable.argtypes = [c_void_p, c_char_p, c_char_p] 118 | 119 | tr.Tesserwrap_MeanTextConf.restype = c_int 120 | tr.Tesserwrap_MeanTextConf.argtypes = [c_void_p] 121 | 122 | class ConfidenceNode(Structure): 123 | pass 124 | 125 | ConfidenceNode._fields_ = [ 126 | ("value", c_int), 127 | ("next", POINTER(ConfidenceNode)) 128 | ] 129 | 130 | tr.Tesserwrap_AllWordConfidences.restype = POINTER(ConfidenceNode) 131 | tr.Tesserwrap_AllWordConfidences.argtypes = [c_void_p] 132 | 133 | class ResultNode(Structure): 134 | pass 135 | 136 | ResultNode._fields_ = [ 137 | ("value", c_char_p), 138 | ("confidence", c_float), 139 | ("box", c_int * 4), 140 | ("next", POINTER(ResultNode)) 141 | ] 142 | 143 | tr.Tesserwrap_GetResult.restype = POINTER(ResultNode) 144 | tr.Tesserwrap_GetResult.argtypes = [c_void_p, c_int] -------------------------------------------------------------------------------- /tesserwrap/cpp/tesseract_ext.cpp: -------------------------------------------------------------------------------- 1 | #include "tesseract_ext.h" 2 | 3 | TessBaseAPIExt::TessBaseAPIExt(void) 4 | :picture(NULL){} 5 | 6 | TessBaseAPIExt::~TessBaseAPIExt(void) 7 | { 8 | if(picture) delete [] picture; 9 | this->End(); 10 | } 11 | 12 | const char* TessBaseAPIExt::TesseractRect(const unsigned char *data, 13 | int bytes_per_pixel, int bytes_per_line, 14 | int left, int top, int width, int height) 15 | { 16 | return super::TesseractRect(data, bytes_per_pixel, bytes_per_line, 17 | left, top, width, height); 18 | 19 | } 20 | 21 | void TessBaseAPIExt::SetImage(const unsigned char *data, 
uint64_t size, 22 | uint64_t width, uint64_t height) 23 | { 24 | if(picture) delete [] picture; 25 | picture = new unsigned char[size]; 26 | std::memcpy(picture, data, size); 27 | super::SetImage(picture, width, height, 1, width); 28 | this->SetRectangle(0, 0, width, height); 29 | } 30 | 31 | void TessBaseAPIExt::GetRectangle(uint64_t *left, uint64_t *top, uint64_t *width, uint64_t *height) 32 | { 33 | (*left) = this->rect_left_; 34 | (*top) = this->rect_top_; 35 | (*width) = this->rect_width_; 36 | (*height) = this->rect_height_; 37 | } -------------------------------------------------------------------------------- /tesserwrap/cpp/tesseract_ext.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | class TessBaseAPIExt : public tesseract::TessBaseAPI 8 | { 9 | private: 10 | unsigned char *picture; 11 | typedef tesseract::TessBaseAPI super; 12 | 13 | public: 14 | TessBaseAPIExt(void); 15 | ~TessBaseAPIExt(void); // Default destructor 16 | const char* TesseractRect(const unsigned char *data, 17 | int bytes_per_pixel, int bytes_per_line, 18 | int left, int top, int width, int height); 19 | void GetRectangle(uint64_t *, uint64_t *, uint64_t *, uint64_t *); 20 | void SetImage(const unsigned char *data, uint64_t size, uint64_t width, uint64_t height); 21 | }; 22 | 23 | typedef TessBaseAPIExt *TessH; 24 | -------------------------------------------------------------------------------- /tesserwrap/cpp/tesseract_wrap.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "tesseract_wrap.h" 4 | 5 | struct ConfidenceNode 6 | { 7 | int value; 8 | struct ConfidenceNode *next; 9 | }; 10 | 11 | struct ResultNode 12 | { 13 | char *value; 14 | float confidence; 15 | int box[4]; 16 | struct ResultNode *next; 17 | }; 18 | 19 | 20 | TESSERWRAP_CAPI TessH Tesserwrap_Init(const char *datadir, const char *lang) 21 | 
// Return the text produced by TessBaseAPI::GetUTF8Text() for the API object
// behind `tesserwrap`.  The handle is used as-is; it is not checked for NULL.
//
// NOTE(review): Tesserwrap_GetResult releases the string it gets from the
// iterator's GetUTF8Text() with delete [].  If this string is heap-allocated
// the same way, nothing in this wrapper frees it and the header declares no
// function a caller could use to do so -- confirm ownership against the
// Tesseract API documentation.
TESSERWRAP_CAPI const char *Tesserwrap_GetUTF8Text(TessH tesserwrap)
{
  TessBaseAPIExt *api = (TessBaseAPIExt*) tesserwrap;
  return api->GetUTF8Text();
}
84 | api->SetVariable(key, value); 85 | } 86 | 87 | TESSERWRAP_CAPI int Tesserwrap_MeanTextConf(TessH tesserwrap) 88 | { 89 | TessBaseAPIExt *api = (TessBaseAPIExt*) tesserwrap; 90 | return api->MeanTextConf(); 91 | } 92 | 93 | TESSERWRAP_CAPI ConfidenceNode *Tesserwrap_AllWordConfidences(TessH tesserwrap) 94 | { 95 | TessBaseAPIExt *api = (TessBaseAPIExt*) tesserwrap; 96 | 97 | ConfidenceNode *first = new ConfidenceNode; 98 | ConfidenceNode *previous = new ConfidenceNode; 99 | 100 | int* confs = api->AllWordConfidences(); 101 | int len, *trav; 102 | for (len = 0, trav = confs; *trav != -1; trav++, len++){ 103 | ConfidenceNode *temp = new ConfidenceNode; 104 | temp->value = *trav; 105 | temp->next = NULL; 106 | 107 | if(len == 0){ 108 | first = temp; 109 | } 110 | else{ 111 | previous->next = temp; 112 | } 113 | previous = temp; 114 | } 115 | free(confs); 116 | if(len == 0){ 117 | return NULL; 118 | } 119 | return first; 120 | } 121 | 122 | TESSERWRAP_CAPI ResultNode *Tesserwrap_GetResult(TessH tesserwrap, int level) 123 | { 124 | TessBaseAPIExt *api = (TessBaseAPIExt*) tesserwrap; 125 | 126 | ResultNode *first = new ResultNode; 127 | ResultNode *previous = new ResultNode; 128 | 129 | tesseract::ResultIterator* ri = api->GetIterator(); 130 | tesseract::PageIteratorLevel lev = static_cast(level); 131 | 132 | int len = 0; 133 | 134 | if(ri != 0) { 135 | do { 136 | char* symbol = ri->GetUTF8Text(lev); 137 | float conf = ri->Confidence(lev); 138 | 139 | int x1, y1, x2, y2; 140 | ri->BoundingBox(lev, &x1, &y1, &x2, &y2); 141 | 142 | if(symbol != 0) { 143 | ResultNode *temp = new ResultNode; 144 | 145 | temp->value = strdup(symbol); 146 | temp->confidence = conf; 147 | temp->next = NULL; 148 | temp->box[0] = x1; 149 | temp->box[1] = y1; 150 | temp->box[2] = x2; 151 | temp->box[3] = y2; 152 | 153 | if(len == 0){ 154 | first = temp; 155 | } 156 | else{ 157 | previous->next = temp; 158 | } 159 | 160 | previous = temp; 161 | len++; 162 | } 163 | delete[] symbol; 164 | } 
while((ri->Next(lev))); 165 | } 166 | 167 | if(len == 0){ 168 | return NULL; 169 | } 170 | return first; 171 | } 172 | 173 | 174 | 175 | -------------------------------------------------------------------------------- /tesserwrap/cpp/tesseract_wrap.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include "tesseract_ext.h" 3 | 4 | #ifdef __cplusplus 5 | # define TR_C_START extern "C" { 6 | # define TR_C_END } 7 | #else 8 | # define TR_C_START 9 | # define TR_C_END 10 | #endif 11 | 12 | #define TESSERWRAP_CAPI 13 | 14 | TR_C_START 15 | 16 | typedef TessBaseAPIExt* TessH; 17 | 18 | struct ConfidenceNode; 19 | struct ResultNode; 20 | 21 | TESSERWRAP_CAPI TessH Tesserwrap_Init(const char *datadir, const char *lang); 22 | TESSERWRAP_CAPI void Tesserwrap_Destroy(TessH tesserwrap); 23 | TESSERWRAP_CAPI void Tesserwrap_GetRectangle(TessH tesserwrap, 24 | uint64_t *left, uint64_t *top, 25 | uint64_t *width, uint64_t *height); 26 | TESSERWRAP_CAPI void Tesserwrap_SetRectangle(TessH tesserwrap, 27 | uint64_t left, uint64_t top, 28 | uint64_t width, uint64_t height); 29 | TESSERWRAP_CAPI void Tesserwrap_SetImage(TessH tesserwrap, 30 | const unsigned char *picture, uint64_t size, uint64_t width, uint64_t height); 31 | TESSERWRAP_CAPI void Tesserwrap_SetPageSegMode(TessH tesserwrap, 32 | tesseract::PageSegMode pageseg); 33 | TESSERWRAP_CAPI tesseract::PageSegMode Tesserwrap_GetPageSegMode(TessH tesserwrap); 34 | TESSERWRAP_CAPI const char *Tesserwrap_GetUTF8Text(TessH tesserwrap); 35 | TESSERWRAP_CAPI void Tesserwrap_Clear(TessH tesserwrap); 36 | TESSERWRAP_CAPI void Tesserwrap_SetVariable(TessH tesserwrap, const char *key, const char *value); 37 | TESSERWRAP_CAPI int Tesserwrap_MeanTextConf(TessH tesserwrap); 38 | TESSERWRAP_CAPI ConfidenceNode *Tesserwrap_AllWordConfidences(TessH tesserwrap); 39 | TESSERWRAP_CAPI ResultNode *Tesserwrap_GetResult(TessH tesserwrap , int level); 40 | 41 | TR_C_END 42 | 
def create_img(text="Quick brown fox", depth="L"):
    """Render *text* in black on a white 710x40 image and return it.

    :param text: the string to draw, starting at pixel (10, 0).
    :param depth: PIL image mode of the created image (e.g. "L" or "RGB").
    :returns: the PIL image with the text drawn on it.
    """
    # FreeSansBold is installed in different directories depending on the
    # distribution.  Use the first location that exists and fall back to the
    # original path so a missing font still fails with the same error.
    candidates = (
        "/usr/share/fonts/gnu-free/FreeSansBold.ttf",
        "/usr/share/fonts/truetype/freefont/FreeSansBold.ttf",
    )
    font = next((p for p in candidates if os.path.exists(p)), candidates[0])

    fnt = ImageFont.truetype(font, 24)
    imgbg = Image.new(depth, (710, 40), "#FFFFFF")
    ImageDraw.Draw(imgbg).text((10, 0), text, font=fnt, fill="#000000")
    return imgbg
"A BBB" 60 | img = create_img("A BBB CCC") 61 | tr = tesserwrap.Tesseract() 62 | tr.set_image(img) 63 | tr.get_text() # run recognizer to get all data set 64 | (l, t), (w, h) = tr.get_rectangle() 65 | eq_(l, 0, "Left attribute incorrect") 66 | eq_(t, 0, "Top attribute incorrect") 67 | eq_(w, 710, "Width attribute incorrect") 68 | eq_(h, 40, "Height attribute incorrect") 69 | 70 | def test_bad_handle(self): 71 | tr = tesserwrap.Tesseract() 72 | del tr.handle 73 | del tr 74 | 75 | def test_clear(self): 76 | tr = tesserwrap.Tesseract() 77 | img = create_img("A BBB CCC") 78 | tr.set_image(img) 79 | tr.clear() 80 | 81 | def test_deprecator(self): 82 | tr = tesserwrap.tesseract() 83 | 84 | def test_mean_confidence(self): 85 | tr = tesserwrap.Tesseract() 86 | img = create_img("Hello World") 87 | tr.set_image(img) 88 | tr.get_text() # run recognizer to get all data set 89 | ok_(tr.get_mean_confidence() >= 0, "Confidence should be positve integer") 90 | 91 | def test_word_confidences(self): 92 | tr = tesserwrap.Tesseract() 93 | img = create_img() 94 | tr.set_image(img) 95 | tr.get_text() # run recognizer to get all data set 96 | res = tr.get_all_word_confidences() 97 | eq_(len(res), 3, "Each word should have one item in result") 98 | eq_(tr.get_mean_confidence(), sum(res)/len(res), "Mean confidence incorrect") 99 | # Empty image 100 | img = create_img("") 101 | tr.set_image(img) 102 | res2 = tr.get_all_word_confidences() 103 | eq_([], res2, "Should be empty result and no crash") 104 | 105 | 106 | def test_get_words(self): 107 | tr = tesserwrap.Tesseract() 108 | img = create_img() 109 | tr.set_image(img) 110 | tr.get_text() 111 | 112 | res = tr.get_words() 113 | eq_(len(res), 3, "Each word should have one item in result") 114 | item = res[0] 115 | eq_(item.value, 'Quick', "%s is not %s" % (item.value, 'Quick')) 116 | eq_(len(item.box), 4, 'Box does not contain 4 items') 117 | 118 | 119 | def test_get_symbols(self): 120 | tr = tesserwrap.Tesseract() 121 | test_text = 'ABCD' 
def tolerant(n=3):
    """Return a decorator that retries a failing test up to *n* times.

    The wrapped function is called until it returns without raising; its
    return value is passed through.  If all *n* attempts raise, the exception
    from the FIRST attempt is re-raised.  With ``n <= 0`` the function is
    never called and the wrapper returns ``None``.

    :param n: maximum number of attempts.
    """
    # Import the helper explicitly instead of relying on ``nose.tools`` being
    # reachable by attribute access, and fall back to functools.wraps when
    # nose is not installed.
    try:
        from nose.tools.nontrivial import make_decorator
    except ImportError:
        from functools import wraps as make_decorator

    def decorate(func):
        def newfunc(*args, **kw):
            original_exception = None
            for _ in range(n):
                try:
                    return func(*args, **kw)
                except Exception as e:
                    if original_exception is None:
                        original_exception = e
            # Every attempt failed.  Raising after the loop replaces the old
            # ``i is n-1`` identity test, which is only true for small cached
            # ints and otherwise let the failure pass silently.
            if original_exception is not None:
                raise original_exception

        return make_decorator(func)(newfunc)
    return decorate