├── .bumpversion.cfg ├── .dockerignore ├── .github └── FUNDING.yaml ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── Dockerfile-pkg ├── Dockerfile-pkg-langs ├── Dockerfile_build.sh ├── LICENSE.md ├── PageXML.cc ├── PageXML.h ├── README.md ├── githook-pre-commit ├── mock_cv.h ├── old ├── Dockerfile_ubuntu-langs ├── Dockerfile_ubuntu16.04-github-master ├── Dockerfile_ubuntu16.04-pkg ├── Dockerfile_ubuntu18.04-github-master └── Dockerfile_ubuntu18.04-pkg ├── tesseract-recognize.cc └── tesseract_recognize_api.py /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 2025.03.31 3 | commit = True 4 | tag = True 5 | tag_name = {new_version} 6 | 7 | [bumpversion:file:tesseract-recognize.cc] 8 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | dockerfiles 2 | Dockerfile* 3 | .git 4 | git* 5 | *.md 6 | CMakeFiles 7 | CMakeCache.txt 8 | cmake_install.cmake 9 | Makefile 10 | -------------------------------------------------------------------------------- /.github/FUNDING.yaml: -------------------------------------------------------------------------------- 1 | github: mauvilsa 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | CMakeCache.txt 2 | CMakeFiles 3 | Makefile 4 | cmake_install.cmake 5 | tesseract-recognize 6 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "pagexml"] 2 | path = pagexml 3 | url = https://github.com/omni-us/pagexml.git 4 | [submodule "CMakeModules"] 5 | path = CMakeModules 6 | url = https://github.com/lbaehren/CMakeModules.git 7 | 
-------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required( VERSION 2.8.12...3.10 ) # policy range: 2.8.12 is still enough to build, and CMake >= 4.0 no longer rejects the pre-3.5 minimum 2 | project( tesseract-recognize ) 3 | set( tool_EXE tesseract-recognize ) 4 | list( APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/CMakeModules" ) 5 | include( FindPackageHandleStandardArgs ) 6 | #find_package( LibMagic ) 7 | find_package( Ghostscript ) 8 | find_package( PkgConfig ) 9 | pkg_check_modules( lept REQUIRED lept ) 10 | pkg_check_modules( tesseract REQUIRED tesseract ) 11 | pkg_check_modules( libxml REQUIRED libxml-2.0>=2.9 ) 12 | pkg_check_modules( libxslt REQUIRED libxslt ) 13 | # Build the tool from every .cc file found in the source directory 14 | file( GLOB tool_SRC "*.cc" ) 15 | add_executable( ${tool_EXE} ${tool_SRC} ) 16 | set_property( TARGET ${tool_EXE} PROPERTY CXX_STANDARD 11 ) 17 | 18 | include_directories( SYSTEM ${tesseract_INCLUDEDIR} ) 19 | 20 | add_definitions( -D__PAGEXML_LEPT__ ) 21 | #add_definitions( -D__PAGEXML_MAGICK__ ) 22 | add_definitions( -D__PAGEXML_GS__ ) # TODO: pdf support is broken, gsRenderPdfPageToPng generates empty png 23 | add_definitions( -D__PAGEXML_SLIM__ ) 24 | 25 | set( CMAKE_REQUIRED_INCLUDES "${CMAKE_REQUIRED_INCLUDES};${GHOSTSCRIPT_INCLUDES}" ) 26 | 27 | string( REPLACE ";" " " CFLAGS_STR "-Wall -W ${lept_CFLAGS} ${tesseract_CFLAGS} ${Magick_CFLAGS} ${libxml_CFLAGS} ${libxslt_CFLAGS}" ) 28 | set_target_properties( ${tool_EXE} PROPERTIES COMPILE_FLAGS "${CFLAGS_STR}" ) 29 | 30 | include_directories( SYSTEM ${Magick_INCLUDEDIR} ) # To suppress system header warnings 31 | 32 | #target_link_libraries( ${tool_EXE} ${lept_LDFLAGS} ${tesseract_LDFLAGS} ${libxml_LDFLAGS} -lOpenCL ) 33 | target_link_libraries( ${tool_EXE} ${lept_LDFLAGS} ${tesseract_LDFLAGS} ${Magick_LDFLAGS} ${GHOSTSCRIPT_LIBRARIES} ${libxml_LDFLAGS} ${libxslt_LDFLAGS} ) 34 | # install puts the binary in <prefix>/bin; install-docker adds the flask API script next to it 35 | install( TARGETS ${tool_EXE} DESTINATION bin ) 36 | add_custom_target( install-docker # copies only the API script: tesseract-recognize-docker is not part of the source tree, so copying it made this target fail 37 | COMMAND
${CMAKE_COMMAND} -E make_directory ${CMAKE_INSTALL_PREFIX}/bin COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_HOME_DIRECTORY}/tesseract_recognize_api.py ${CMAKE_INSTALL_PREFIX}/bin/tesseract_recognize_api.py VERBATIM ) 38 | 39 | add_custom_target( realclean cd ${CMAKE_HOME_DIRECTORY} COMMAND rm -fr ${tool_EXE} ${tool_EXE}.exe ${tool_EXE}.dSYM CMakeFiles CMakeCache.txt cmake_install.cmake install_manifest.txt Makefile ) 40 | -------------------------------------------------------------------------------- /Dockerfile-pkg: -------------------------------------------------------------------------------- 1 | ARG UBUNTU_TAG=24.04 2 | FROM ubuntu:$UBUNTU_TAG 3 | 4 | ENV DEBIAN_FRONTEND=noninteractive 5 | 6 | RUN apt-get update --fix-missing \ 7 | && apt-get install -y --no-install-recommends \ 8 | build-essential \ 9 | cmake \ 10 | ghostscript \ 11 | libgs-dev \ 12 | libleptonica-dev \ 13 | libtesseract-dev \ 14 | libxml2-dev \ 15 | libxslt1-dev \ 16 | pkg-config \ 17 | python3-pip 18 | 19 | COPY CMakeModules /tmp/tesseract-recognize/CMakeModules 20 | COPY pagexml /tmp/tesseract-recognize/pagexml 21 | COPY CMakeLists.txt Dockerfile* PageXML* mock_cv.h tesseract-recognize* /tmp/tesseract-recognize/ 22 | 23 | RUN cd /tmp/tesseract-recognize \ 24 | && cmake -DCMAKE_BUILD_TYPE=Release . 
\ 25 | && make 26 | 27 | 28 | FROM ubuntu:$UBUNTU_TAG 29 | 30 | LABEL maintainer="Mauricio Villegas " 31 | 32 | RUN apt-get update --fix-missing \ 33 | && apt-get install -y --no-install-recommends \ 34 | ghostscript \ 35 | libxslt1.1 \ 36 | tesseract-ocr \ 37 | python3-pip \ 38 | && apt-get autoremove -y \ 39 | && apt-get purge -y $(dpkg -l | awk '{if($1=="rc")print $2}') \ 40 | && apt-get clean \ 41 | && rm -rf /var/lib/apt/lists/* 42 | 43 | COPY --from=0 /tmp/tesseract-recognize/tesseract-recognize /usr/local/bin/ 44 | COPY tesseract_recognize_api.py /usr/local/bin/ 45 | RUN sed -n '/^@requirements /{ s|^@requirements ||; p; }' /usr/local/bin/tesseract_recognize_api.py > /tmp/requirements.txt \ 46 | && pip3 install --break-system-packages -r /tmp/requirements.txt \ 47 | && rm /tmp/requirements.txt 48 | 49 | RUN useradd -m -u 1048 -g 0 tesseract 50 | USER 1048 51 | EXPOSE 5000 52 | ENTRYPOINT ["/usr/local/bin/tesseract_recognize_api.py", "--host", "0.0.0.0"] 53 | -------------------------------------------------------------------------------- /Dockerfile-pkg-langs: -------------------------------------------------------------------------------- 1 | ARG UBUNTU_TAG=24.04 2 | FROM ubuntu:$UBUNTU_TAG 3 | 4 | LABEL maintainer="Mauricio Villegas " 5 | 6 | ENV DEBIAN_FRONTEND=noninteractive 7 | 8 | # Install all language packages 9 | RUN apt-get update --fix-missing \ 10 | && apt-get install -y --no-install-recommends \ 11 | rsync \ 12 | tesseract-ocr-* \ 13 | && apt-get clean \ 14 | && rm -rf /var/lib/apt/lists/* 15 | 16 | CMD ["rsync", "-av", "/usr/share/tesseract-ocr/5/tessdata", "/opt/tesseract-ocr"] 17 | -------------------------------------------------------------------------------- /Dockerfile_build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | UBUNTU="24.04" 4 | VERSION=$(sed -r -n '/^current_version/{ s/.*= //; p; }' .bumpversion.cfg) 5 | 6 | docker build \ 7 | -t 
mauvilsa/tesseract-recognize:$VERSION-ubuntu$UBUNTU-pkg \ 8 | -f Dockerfile-pkg \ 9 | --build-arg UBUNTU_TAG=$UBUNTU \ 10 | . 11 | 12 | docker build \ 13 | -t mauvilsa/tesseract-recognize:$VERSION-ubuntu$UBUNTU-pkg-langs \ 14 | -f Dockerfile-pkg-langs \ 15 | --build-arg UBUNTU_TAG=$UBUNTU \ 16 | . 17 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015-present, Mauricio Villegas 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /PageXML.cc: -------------------------------------------------------------------------------- 1 | pagexml/lib/PageXML.cc -------------------------------------------------------------------------------- /PageXML.h: -------------------------------------------------------------------------------- 1 | pagexml/lib/PageXML.h -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NAME 2 | 3 | tesseract-recognize - A tool that does layout analysis and/or text recognition using tesseract and outputs the result in Page XML format. 4 | 5 | 6 | # Requirements (Ubuntu 18.04 & 20.04 & 22.04 & 24.04) 7 | 8 | ## Build 9 | 10 | - make 11 | - cmake 12 | - g++ 13 | - libtesseract-dev 14 | - libgs-dev 15 | - libxslt1-dev 16 | 17 | ## Runtime 18 | 19 | - tesseract-ocr 20 | - ghostscript 21 | - libxslt1.1 22 | 23 | 24 | # Installation and usage 25 | 26 | To compile from source follow the instructions here. If you only want the tool 27 | it might be simpler to use docker as explained in the next section. 28 | 29 | git clone --recursive https://github.com/mauvilsa/tesseract-recognize 30 | mkdir tesseract-recognize/build 31 | cd tesseract-recognize/build 32 | cmake -DCMAKE_INSTALL_PREFIX:PATH=$HOME .. 33 | make install 34 | 35 | tesseract-recognize --help 36 | tesseract-recognize IMAGE1 IMAGE2 -o OUTPUT.xml 37 | tesseract-recognize INPUT.xml -o OUTPUT.xml 38 | 39 | 40 | # Installation and usage (docker) 41 | 42 | The latest docker images are based on Ubuntu 24.04 and use the version of 43 | tesseract from the default package repositories (see the respective [docker hub 44 | page](https://hub.docker.com/r/mauvilsa/tesseract-recognize/)). 
45 | 46 | To install, first pull the docker image of your choosing, using a command such 47 | as: 48 | 49 | TAG="SELECTED_TAG_HERE" 50 | docker pull mauvilsa/tesseract-recognize:$TAG 51 | 52 | The basic docker image only includes language files for recognition of English, 53 | so for additional languages you need to provide to the docker container the 54 | corresponding tessdata files. There is also an additional docker image that can 55 | be used to create a volume that includes all languages from the tesseract-ocr-* 56 | ubuntu packages. To create this volume run the following: 57 | 58 | docker pull mauvilsa/tesseract-recognize:$TAG-langs 59 | docker run \ 60 | --rm \ 61 | --mount source=tesseract-ocr-tessdata,destination=/opt/tesseract-ocr/tessdata \ 62 | -it mauvilsa/tesseract-recognize:$TAG-langs 63 | 64 | Then there are two possible ways of using the tesseract-recognize docker image, 65 | through a command line interface or through a REST API, as explained in the next 66 | two sections. 67 | 68 | 69 | ## Command line interface 70 | 71 | First download the 72 | [docker-cli](https://github.com/omni-us/docker-command-line-interface) script, put it 73 | in some directory in your path and make it executable, for example: 74 | 75 | wget -O $HOME/.local/bin/docker-cli https://raw.githubusercontent.com/omni-us/docker-command-line-interface/master/docker-cli 76 | chmod +x $HOME/.local/bin/docker-cli 77 | 78 | As an additional step, you could look at `docker-cli --help` and read about how 79 | to configure bash completion. 80 | 81 | After installing docker-cli, the tesseract-recognize tool can be used like any 82 | other command, i.e. 
83 | 84 | docker-cli \ 85 | --ipc=host \ 86 | -- mauvilsa/tesseract-recognize:$TAG \ 87 | tesseract-recognize IMAGE -o OUTPUT.xml 88 | 89 | Recognizing other languages using the tessdata volume mentioned previously can 90 | be done as follows: 91 | 92 | docker-cli \ 93 | --ipc=host \ 94 | --mount source=tesseract-ocr-tessdata,destination=/opt/tesseract-ocr/tessdata \ 95 | --env TESSDATA_PREFIX=/opt/tesseract-ocr/tessdata \ 96 | -- mauvilsa/tesseract-recognize:$TAG \ 97 | tesseract-recognize --lang deu IMAGE -o OUTPUT.xml 98 | 99 | For convenience you could set up an alias, e.g. 100 | 101 | alias tesseract-recognize-docker="docker-cli --ipc=host --mount source=tesseract-ocr-tessdata,destination=/opt/tesseract-ocr/tessdata --env TESSDATA_PREFIX=/opt/tesseract-ocr/tessdata -- mauvilsa/tesseract-recognize:$TAG tesseract-recognize" 102 | tesseract-recognize-docker --help 103 | 104 | 105 | ## API interface 106 | 107 | The API interface uses a python flask server that can be accessed through port 108 | 5000 inside the docker container. For example the server could be started as: 109 | 110 | docker run --rm -t -p 5000:5000 mauvilsa/tesseract-recognize:$TAG 111 | 112 | The API exposes the following endpoints: 113 | 114 | Method | Endpoint | Description | Parameters (form fields) 115 | ------ | --------------------------------- | -------------------------------- | ------------------------ 116 | GET | /tesseract-recognize/version | Returns tool version information | - 117 | GET | /tesseract-recognize/help | Returns tool help | - 118 | GET | /tesseract-recognize/swagger.json | The swagger json | - 119 | POST | /tesseract-recognize/process | Recognize given images or xml | **images (array, required):** Image files with names as in page xml. **pagexml (optional):** Page xml file to recognize. **options (optional):** Array of strings with options for the tesseract-recognize tool. 120 | 121 | For illustration purposes the curl command can be used. 
Processing an input 122 | image with a non-default layout level can be done with a POST such as 123 | 124 | curl -o output.xml -F images=@img.png -F options='["--layout-level", "word"]' http://localhost:5000/tesseract-recognize/process 125 | 126 | To process a page xml file, both the xml and the respective images should be 127 | included in the request, for example: 128 | 129 | curl -o output.xml -F images=@img1.png -F images=@img2.png -F pagexml=@input.xml http://localhost:5000/tesseract-recognize/process 130 | 131 | The API is implemented using Flask-RESTPlus, which means that once the server is 132 | started, you can use a browser to get a more detailed view of the exposed 133 | endpoints by going to http://localhost:5000/tesseract-recognize/swagger. 134 | 135 | 136 | # Viewing results 137 | 138 | The results can be viewed/edited using the Page XML editor available at 139 | https://github.com/mauvilsa/nw-page-editor or using other tools that support 140 | this format such as http://www.primaresearch.org/tools and 141 | https://transkribus.eu/Transkribus/ . 142 | 143 | 144 | # Contributing 145 | 146 | If you intend to contribute, before any commits be sure to first execute 147 | githook-pre-commit to set up (symlink) the pre-commit hook. This hook takes care 148 | of automatically updating the tool version. 149 | -------------------------------------------------------------------------------- /githook-pre-commit: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ### Create pre-commit symlink if unset ### 4 | GITDIR=""; 5 | if [ -d .git ]; then 6 | GITDIR=".git"; 7 | elif [ -f .git ]; then 8 | GITDIR=$(sed -n '/^gitdir:/{ s|.*: ||; p; }' .git); 9 | fi 10 | if [ ! -d "$GITDIR" ]; then 11 | echo "${0##*/}: error: unable to find git directory" 1>&2; 12 | exit 1; 13 | fi 14 | if [ ! 
-h "$GITDIR/hooks/pre-commit" ]; then 15 | if [ $(realpath --help 2>&1 | grep -c relative) != 0 ]; then 16 | HOOK=$(realpath --relative-to="$GITDIR/hooks" ./githook-pre-commit); 17 | else 18 | HOOK=$(readlink -f ./githook-pre-commit); 19 | fi 20 | ln -fs "$HOOK" "$GITDIR/hooks/pre-commit"; 21 | echo "${0##*/}: creating git pre-commit hook symlink" 1>&2; 22 | exit 1; 23 | fi 24 | 25 | 26 | ### Update versions on files ### 27 | FILES=( $(git status --porcelain | sed -r 's|^ |_|; s|^(.) |\1_|;' | grep -E '^([MRA]|.M)') ); 28 | V=$(date -u +%Y.%m.%d); 29 | 30 | check_change_after_staged () { 31 | [ "${2:1:1}" = "M" ] && 32 | echo "${0##*/}: error: aborting due to file change after staged: $1" 1>&2 && 33 | exit 1; 34 | } 35 | 36 | update_file_version () { 37 | echo "${0##*/}: updating version of $1" 1>&2; 38 | sed -r -i 's|([$"])Version:[^$"]*([$"])|\1Version: '"$V"'\2|' "$1"; 39 | git add "$1"; 40 | } 41 | 42 | n=1; 43 | while [ "$n" -lt "${#FILES[@]}" ]; do 44 | check_change_after_staged "${FILES[$n]}" "${FILES[$((n-1))]}"; 45 | case "${FILES[$n]}" in 46 | tesseract-recognize.cc ) 47 | update_file_version "${FILES[$n]}"; 48 | ;; 49 | *.py ) 50 | update_file_version "${FILES[$n]}"; 51 | echo "${0##*/}: pylint ${FILES[$n]}" 1>&2; 52 | pylint --errors-only "${FILES[$n]}"; 53 | ;; 54 | esac 55 | [ "$?" 
!= "0" ] && exit 1; 56 | n=$((n+2)); 57 | done 58 | 59 | exit 0; 60 | -------------------------------------------------------------------------------- /mock_cv.h: -------------------------------------------------------------------------------- 1 | pagexml/lib/mock_cv.h -------------------------------------------------------------------------------- /old/Dockerfile_ubuntu-langs: -------------------------------------------------------------------------------- 1 | ARG TESSREC_TAG 2 | FROM mauvilsa/tesseract-recognize:$TESSREC_TAG 3 | # TESSREC_TAG must be passed with --build-arg; it selects the base image tag in the FROM line above 4 | ### Install all language packages ### 5 | RUN apt-get update --fix-missing \ 6 | && apt-get install -y --no-install-recommends \ 7 | tesseract-ocr-* \ 8 | && apt-get clean \ 9 | && rm -rf /var/lib/apt/lists/* 10 | -------------------------------------------------------------------------------- /old/Dockerfile_ubuntu16.04-github-master: -------------------------------------------------------------------------------- 1 | FROM library/ubuntu:16.04 2 | 3 | MAINTAINER Mauricio Villegas 4 | 5 | ENV DEBIAN_FRONTEND=noninteractive 6 | SHELL ["/bin/bash", "-c"] 7 | 8 | 9 | ### Copy the source code to a temporal directory ### 10 | COPY . 
/tmp/tesseract-recognize/ 11 | 12 | 13 | ### Install build pre-requisites ### 14 | RUN apt-get update --fix-missing \ 15 | && apt-get install -y --no-install-recommends \ 16 | ca-certificates \ 17 | build-essential \ 18 | cmake \ 19 | git \ 20 | libxml2-dev \ 21 | libxslt1-dev \ 22 | libopencv-dev \ 23 | libmagick++-dev \ 24 | libtool \ 25 | automake \ 26 | autoconf \ 27 | autoconf-archive \ 28 | checkinstall \ 29 | 30 | 31 | ### Install leptonica 1.74 from Ubuntu 17.10 by pinning ### 32 | && echo 'deb http://archive.ubuntu.com/ubuntu artful main restricted universe multiverse' > /etc/apt/sources.list.d/ubuntu17.10.list \ 33 | && echo $'Package: *\nPin: release v=16.04, l=Ubuntu\nPin-Priority: 1000\n' > /etc/apt/preferences \ 34 | && echo $'Package: liblept5\nPin: release v=17.10, l=Ubuntu\nPin-Priority: 1001\n' >> /etc/apt/preferences \ 35 | && echo $'Package: libleptonica-dev\nPin: release v=17.10, l=Ubuntu\nPin-Priority: 1001' >> /etc/apt/preferences \ 36 | && apt-get update \ 37 | && apt-get install -y --no-install-recommends \ 38 | liblept5 \ 39 | libleptonica-dev \ 40 | 41 | 42 | ### Compile and install latest tesseract from repo ### 43 | && git clone https://github.com/tesseract-ocr/tesseract.git /tmp/tesseract \ 44 | && cd /tmp/tesseract \ 45 | && v=$(git log --date=iso -1 | sed -n '/^Date:/{s|^Date: *||;s| .*||;s|-|.|g;p;}') \ 46 | && sed -i "s|TESSERACT_VERSION_STR .*|TESSERACT_VERSION_STR \"$v\"|" ccutil/version.h \ 47 | && ./autogen.sh \ 48 | && ./configure --prefix=/usr \ 49 | #&& CFLAGS="-O2 -DUSE_STD_NAMESPACE" ./configure --prefix=/usr \ 50 | && make -j$(nproc) \ 51 | && echo "tesseract-ocr" > description-pak \ 52 | && checkinstall -y --pkgname=tesseract-ocr --maintainer=mauricio_ville@yahoo.com --pkgversion=$v --pkgrelease=0 \ 53 | 54 | 55 | ### Compile and install tesseract-recognize ### 56 | && cd /tmp/tesseract-recognize \ 57 | && cmake -DCMAKE_BUILD_TYPE=Release . 
\ 58 | && make install install-docker \ 59 | 60 | 61 | ### Remove build-only software and install runtime pre-requisites ### 62 | && cd \ 63 | && rm -rf /tmp/tesseract-recognize /tmp/tesseract \ 64 | && apt-get purge -y \ 65 | ca-certificates \ 66 | build-essential \ 67 | cmake \ 68 | git \ 69 | libxml2-dev \ 70 | libxslt1-dev \ 71 | libopencv-dev \ 72 | libmagick++-dev \ 73 | libtool \ 74 | automake \ 75 | autoconf \ 76 | autoconf-archive \ 77 | checkinstall \ 78 | && apt-get autoremove -y \ 79 | && apt-get purge -y $(dpkg -l | awk '{if($1=="rc")print $2}') \ 80 | && apt-get install -y --no-install-recommends \ 81 | ghostscript \ 82 | libxml2 \ 83 | libxslt1.1 \ 84 | libopencv-core2.4v5 \ 85 | libmagick++-6.q16-5v5 \ 86 | libgomp1 \ 87 | python-flask \ 88 | python-six \ 89 | && apt-get clean \ 90 | && rm -rf /var/lib/apt/lists/* 91 | 92 | 93 | ### By default start the flask API server ### 94 | CMD ["/usr/bin/python","/usr/local/bin/tesseract_recognize_api.py"] 95 | EXPOSE 5000 96 | -------------------------------------------------------------------------------- /old/Dockerfile_ubuntu16.04-pkg: -------------------------------------------------------------------------------- 1 | FROM library/ubuntu:16.04 2 | 3 | MAINTAINER Mauricio Villegas 4 | 5 | ENV DEBIAN_FRONTEND=noninteractive 6 | 7 | 8 | ### Copy the source code to a temporal directory ### 9 | COPY . /tmp/tesseract-recognize/ 10 | 11 | 12 | ### Install build pre-requisites ### 13 | RUN apt-get update --fix-missing \ 14 | && apt-get install -y --no-install-recommends \ 15 | build-essential \ 16 | cmake \ 17 | tesseract-ocr-dev \ 18 | libleptonica-dev \ 19 | libxml2-dev \ 20 | libxslt1-dev \ 21 | libopencv-dev \ 22 | libmagick++-dev \ 23 | 24 | 25 | ### Compile and install tesseract-recognize ### 26 | && cd /tmp/tesseract-recognize \ 27 | && cmake -DCMAKE_BUILD_TYPE=Release . 
\ 28 | && make install install-docker \ 29 | 30 | 31 | ### Remove build-only software and install runtime pre-requisites ### 32 | && cd \ 33 | && rm -rf /tmp/tesseract-recognize \ 34 | && apt-get purge -y \ 35 | build-essential \ 36 | cmake \ 37 | tesseract-ocr-dev \ 38 | libleptonica-dev \ 39 | libxml2-dev \ 40 | libxslt1-dev \ 41 | libopencv-dev \ 42 | libmagick++-dev \ 43 | && apt-get autoremove -y \ 44 | && apt-get purge -y $(dpkg -l | awk '{if($1=="rc")print $2}') \ 45 | && apt-get install -y --no-install-recommends \ 46 | tesseract-ocr \ 47 | ghostscript \ 48 | libxml2 \ 49 | libxslt1.1 \ 50 | libopencv-core2.4v5 \ 51 | libmagick++-6.q16-5v5 \ 52 | python-flask \ 53 | python-six \ 54 | && apt-get clean \ 55 | && rm -rf /var/lib/apt/lists/* 56 | 57 | 58 | ### By default start the flask API server ### 59 | CMD ["/usr/bin/python","/usr/local/bin/tesseract_recognize_api.py"] 60 | EXPOSE 5000 61 | -------------------------------------------------------------------------------- /old/Dockerfile_ubuntu18.04-github-master: -------------------------------------------------------------------------------- 1 | FROM library/ubuntu:18.04 2 | 3 | MAINTAINER Mauricio Villegas 4 | 5 | ENV DEBIAN_FRONTEND=noninteractive 6 | SHELL ["/bin/bash", "-c"] 7 | 8 | 9 | ### Copy the source code to a temporal directory ### 10 | COPY . 
/tmp/tesseract-recognize/ 11 | 12 | 13 | ### Install build pre-requisites ### 14 | RUN apt-get update --fix-missing \ 15 | #&& d=$(apt-cache depends libopencv-dev | sed -n '/Depends: libopencv-.*-dev/{ s|.* ||; s|$|-|; p; }' | grep -v libopencv-core-dev | tr '\n' ' ') \ 16 | && apt-get install -y --no-install-recommends \ 17 | ca-certificates \ 18 | build-essential \ 19 | cmake \ 20 | git \ 21 | libxml2-dev \ 22 | libxslt1-dev \ 23 | libopencv-dev \ 24 | libmagick++-dev \ 25 | libleptonica-dev \ 26 | libtool \ 27 | automake \ 28 | autoconf \ 29 | autoconf-archive \ 30 | #checkinstall \ 31 | #&& d=$(apt-cache depends libopencv-dev | sed -n '/Depends: libopencv-.*-dev/{ s|.* ||; p; }' | grep -v libopencv-core-dev | tr '\n' ' ') \ 32 | && dpkg -r --force-depends libtesseract4 \ 33 | 34 | 35 | ### Compile and install latest tesseract from repo ### 36 | && git clone https://github.com/tesseract-ocr/tesseract.git /tmp/tesseract \ 37 | && cd /tmp/tesseract \ 38 | && ./autogen.sh \ 39 | && ./configure --prefix=/usr \ 40 | #&& CFLAGS="-O2 -DUSE_STD_NAMESPACE" ./configure --prefix=/usr \ 41 | #&& v=$(git log --date=iso -1 | sed -n '/^Date:/{s|^Date: *||;s| .*||;s|-|.|g;p;}') \ 42 | #&& sed -i "s|TESSERACT_VERSION_STR .*|TESSERACT_VERSION_STR \"$v\"|" src/api/tess_version.h \ 43 | && make -j$(nproc) \ 44 | && make install \ 45 | # && echo "tesseract-ocr" > description-pak \ 46 | ## && checkinstall -y --pkgname=tesseract-ocr --maintainer=mauricio_ville@yahoo.com --pkgversion=$v --pkgrelease=0 \ 47 | # && checkinstall -y --pkgname=tesseract-ocr --maintainer=mauricio_ville@yahoo.com --pkgrelease=0 \ 48 | 49 | 50 | ### Compile and install tesseract-recognize ### 51 | && cd /tmp/tesseract-recognize \ 52 | && cmake -DCMAKE_BUILD_TYPE=Release . 
\ 53 | && make install install-docker \ 54 | 55 | 56 | ### Remove build-only software and install runtime pre-requisites ### 57 | && cd \ 58 | && rm -rf /tmp/tesseract-recognize /tmp/tesseract \ 59 | && apt --fix-broken -y install \ 60 | && apt-get purge -y --fix-broken \ 61 | ca-certificates \ 62 | build-essential \ 63 | cmake \ 64 | git \ 65 | libxml2-dev \ 66 | libxslt1-dev \ 67 | libopencv-dev \ 68 | libmagick++-dev \ 69 | libleptonica-dev \ 70 | libtool \ 71 | automake \ 72 | autoconf \ 73 | autoconf-archive \ 74 | #checkinstall \ 75 | && apt-get autoremove -y \ 76 | && apt-get purge -y $(dpkg -l | awk '{if($1=="rc")print $2}') \ 77 | && apt-get install -y --no-install-recommends \ 78 | ghostscript \ 79 | libxml2 \ 80 | libxslt1.1 \ 81 | libopencv-core3.2 \ 82 | libmagick++-6.q16-7 \ 83 | liblept5 \ 84 | libgomp1 \ 85 | python-flask \ 86 | python-six \ 87 | && apt-get clean \ 88 | && rm -rf /var/lib/apt/lists/* 89 | 90 | 91 | ### By default start the flask API server ### 92 | CMD ["/usr/bin/python","/usr/local/bin/tesseract_recognize_api.py"] 93 | EXPOSE 5000 94 | -------------------------------------------------------------------------------- /old/Dockerfile_ubuntu18.04-pkg: -------------------------------------------------------------------------------- 1 | FROM library/ubuntu:18.04 2 | 3 | MAINTAINER Mauricio Villegas 4 | 5 | ENV DEBIAN_FRONTEND=noninteractive 6 | 7 | 8 | ### Install runtime requirements ### 9 | RUN apt-get update --fix-missing \ 10 | && apt-get install -y --no-install-recommends \ 11 | tesseract-ocr \ 12 | ghostscript \ 13 | libxml2 \ 14 | libxslt1.1 \ 15 | libopencv-core3.2 \ 16 | libmagick++-6.q16-7 \ 17 | python-flask \ 18 | python-six \ 19 | && apt-get clean \ 20 | && rm -rf /var/lib/apt/lists/* \ 21 | && sed '/ 5 | * @copyright Copyright (c) 2015-present, Mauricio Villegas 6 | * @link https://github.com/mauvilsa/tesseract-recognize 7 | * @license MIT License 8 | */ 9 | 10 | /*** Includes 
*****************************************************************/ 11 | #include 12 | #include 13 | using std::string; 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include <../leptonica/allheaders.h> 21 | #include <../tesseract/baseapi.h> 22 | 23 | #include "PageXML.h" 24 | 25 | /*** Definitions **************************************************************/ 26 | static char tool[] = "tesseract-recognize"; 27 | static char version[] = "Version: 2025.03.31"; 28 | 29 | char gb_page_ns[] = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"; 30 | 31 | char gb_default_lang[] = "eng"; 32 | char gb_default_xpath[] = "//_:TextRegion"; 33 | char gb_default_output[] = "-"; 34 | 35 | char *gb_output = gb_default_output; 36 | char *gb_lang = gb_default_lang; 37 | char *gb_tessdata = NULL; 38 | int gb_psm = tesseract::PSM_AUTO; 39 | int gb_oem = tesseract::OEM_DEFAULT; 40 | bool gb_onlylayout = false; 41 | bool gb_textlevels[] = { false, false, false, false }; 42 | bool gb_textatlayout = true; 43 | char *gb_xpath = gb_default_xpath; 44 | char *gb_image = NULL; 45 | int gb_density = 300; 46 | bool gb_inplace = false; 47 | 48 | bool gb_save_crops = false; 49 | 50 | enum { 51 | LEVEL_REGION = 0, 52 | LEVEL_LINE, 53 | LEVEL_WORD, 54 | LEVEL_GLYPH 55 | }; 56 | 57 | const char* levelStrings[] = { 58 | "region", 59 | "line", 60 | "word", 61 | "glyph" 62 | }; 63 | 64 | inline static int parseLevel( const char* level ) { 65 | int levels = sizeof(levelStrings) / sizeof(levelStrings[0]); 66 | for( int n=0; n= 0x040000 124 | fprintf( stderr, " --oem MODE OCR engine mode (def.=%d)\n", gb_oem ); 125 | #endif 126 | fprintf( stderr, " --layout-level LEVEL Layout output level: region, line, word, glyph (def.=%s)\n", levelStrings[gb_layoutlevel] ); 127 | fprintf( stderr, " --text-levels L1[,L2]+ Text output level(s): region, line, word, glyph (def.=layout-level)\n" ); 128 | fprintf( stderr, " --only-layout Only perform layout analysis, no OCR 
(def.=%s)\n", strbool(gb_onlylayout) ); 129 | fprintf( stderr, " --save-crops Saves cropped images (def.=%s)\n", strbool(gb_save_crops) ); 130 | fprintf( stderr, " --xpath XPATH xpath for selecting elements to process (def.=%s)\n", gb_xpath ); 131 | fprintf( stderr, " --image IMAGE Use given image instead of one in Page XML\n" ); 132 | fprintf( stderr, " --density DENSITY Density in dpi for pdf rendering (def.=%d)\n", gb_density ); 133 | fprintf( stderr, " --inplace Overwrite input XML with result (def.=%s)\n", strbool(gb_inplace) ); 134 | fprintf( stderr, " -o, --output Output page xml file (def.=%s)\n", gb_output ); 135 | fprintf( stderr, " -h, --help Print this usage information and exit\n" ); 136 | fprintf( stderr, " -v, --version Print version and exit\n" ); 137 | fprintf( stderr, "\n" ); 138 | int r = system( "tesseract --help-psm 2>&1 | sed '/^ *[02] /d; s| (Default)||;' 1>&2" ); 139 | if( r != 0 ) 140 | fprintf( stderr, "warning: tesseract command not found in path\n" ); 141 | #if TESSERACT_VERSION >= 0x040000 142 | fprintf( stderr, "\n" ); 143 | r += system( "tesseract --help-oem 1>&2" ); 144 | #endif 145 | fprintf( stderr, "Examples:\n" ); 146 | fprintf( stderr, " %s -o out.xml in1.png in2.png ### Multiple images as input\n", tool ); 147 | fprintf( stderr, " %s -o out.xml in.tiff ### TIFF possibly with multiple frames\n", tool ); 148 | fprintf( stderr, " %s -o out.xml --density 200 in.pdf\n", tool ); 149 | fprintf( stderr, " %s -o out.xml --xpath //_:Page in.xml ### Empty page xml recognize the complete pages\n", tool ); 150 | fprintf( stderr, " %s -o out.xml --psm 1 in.png ### Detect page orientation pages\n", tool ); 151 | fprintf( stderr, " %s -o out.xml --xpath \"//_:TextRegion[@id='r1']\" --layout-level word --only-layout in.xml ### Detect text lines and words only in TextRegion with id=r1\n", tool ); 152 | } 153 | 154 | 155 | void setCoords( tesseract::ResultIterator* iter, tesseract::PageIteratorLevel iter_level, PageXML& page, xmlNodePtr& xelem, 
int x, int y, tesseract::Orientation orientation = tesseract::ORIENTATION_PAGE_UP ) { 156 | int left, top, right, bottom; 157 | int pagenum = page.getPageNumber(xelem); 158 | iter->BoundingBox( iter_level, &left, &top, &right, &bottom ); 159 | std::vector points; 160 | if ( left == 0 && top == 0 && right == (int)page.getPageWidth(pagenum) && bottom == (int)page.getPageHeight(pagenum) ) 161 | points = { cv::Point2f(0,0), cv::Point2f(0,0) }; 162 | else { 163 | cv::Point2f tl(x+left,y+top); 164 | cv::Point2f tr(x+right,y+top); 165 | cv::Point2f br(x+right,y+bottom); 166 | cv::Point2f bl(x+left,y+bottom); 167 | switch( orientation ) { 168 | case tesseract::ORIENTATION_PAGE_UP: points = { tl, tr, br, bl }; break; 169 | case tesseract::ORIENTATION_PAGE_RIGHT: points = { tr, br, bl, tl }; break; 170 | case tesseract::ORIENTATION_PAGE_LEFT: points = { bl, tl, tr, br }; break; 171 | case tesseract::ORIENTATION_PAGE_DOWN: points = { br, bl, tl, tr }; break; 172 | } 173 | } 174 | page.setCoords( xelem, points ); 175 | } 176 | 177 | void setLineCoords( tesseract::ResultIterator* iter, tesseract::PageIteratorLevel iter_level, PageXML& page, xmlNodePtr& xelem, int x, int y, tesseract::Orientation orientation ) { 178 | setCoords( iter, iter_level, page, xelem, x, y, orientation ); 179 | std::vector coords = page.getPoints( xelem ); 180 | int x1, y1, x2, y2; 181 | iter->Baseline( iter_level, &x1, &y1, &x2, &y2 ); 182 | cv::Point2f b_p1(x+x1,y+y1), b_p2(x+x2,y+y2); 183 | cv::Point2f baseline_p1, baseline_p2; 184 | if ( ! page.intersection( b_p1, b_p2, coords[0], coords[3], baseline_p1 ) || 185 | ! 
page.intersection( b_p1, b_p2, coords[1], coords[2], baseline_p2 ) ) { 186 | std::string lid = page.getAttr(xelem,"id"); 187 | fprintf(stderr,"warning: no intersection between baseline and bounding box sides id=%s\n",lid.c_str()); 188 | std::vector baseline = { 189 | cv::Point2f(x+x1,y+y1), 190 | cv::Point2f(x+x2,y+y2) }; 191 | page.setBaseline( xelem, baseline ); 192 | return; 193 | } 194 | std::vector baseline = { baseline_p1, baseline_p2 }; 195 | page.setBaseline( xelem, baseline ); 196 | double up1 = cv::norm( baseline_p1 - coords[0] ); 197 | double up2 = cv::norm( baseline_p2 - coords[1] ); 198 | double down1 = cv::norm( baseline_p1 - coords[3] ); 199 | double down2 = cv::norm( baseline_p2 - coords[2] ); 200 | double height = 0.5*( up1 + up2 + down1 + down2 ); 201 | double offset = height <= 0.0 ? 0.0 : 0.5*( down1 + down2 ) / height; 202 | page.setPolystripe( xelem, height <= 0.0 ? 1.0 : height, offset, false ); 203 | } 204 | 205 | void setTextEquiv( tesseract::ResultIterator* iter, tesseract::PageIteratorLevel iter_level, PageXML& page, xmlNodePtr& xelem ) { 206 | double conf = 0.01*iter->Confidence( iter_level ); 207 | char* text = iter->GetUTF8Text( iter_level ); 208 | std::string stext(text); 209 | stext = std::regex_replace( stext, std::regex("^\\s+|\\s+$"), "$1" ); 210 | page.setTextEquiv( xelem, stext.c_str(), &conf ); 211 | delete[] text; 212 | } 213 | 214 | template 215 | void split( const std::string &s, char delim, Out result ) { 216 | std::stringstream ss(s); 217 | std::string item; 218 | while( std::getline(ss, item, delim) ) 219 | *(result++) = item; 220 | } 221 | 222 | std::set parsePagesSet( std::string range ) { 223 | std::set pages_set; 224 | std::vector parts; 225 | split( range, ',', std::back_inserter(parts) ); 226 | for( auto part : parts ) { 227 | std::string::size_type dash_pos = part.find('-'); 228 | if( dash_pos == std::string::npos ) 229 | pages_set.insert(stoi(part)); 230 | else 231 | for( int num=stoi(part.substr(0, dash_pos)); 
num<=stoi(part.substr(dash_pos+1)); num++ ) 232 | pages_set.insert(num); 233 | } 234 | return pages_set; 235 | } 236 | 237 | 238 | /*** Program ******************************************************************/ 239 | int main( int argc, char *argv[] ) { 240 | 241 | /// Disable debugging and informational messages from Leptonica. /// 242 | setMsgSeverity(L_SEVERITY_ERROR); 243 | 244 | /// Parse input arguments /// 245 | int n,m; 246 | std::stringstream test; 247 | std::string token; 248 | while ( ( n = getopt_long(argc,argv,gb_short_options,gb_long_options,&m) ) != -1 ) 249 | switch ( n ) { 250 | case OPTION_TESSDATA: 251 | gb_tessdata = optarg; 252 | break; 253 | case OPTION_LANG: 254 | gb_lang = optarg; 255 | break; 256 | case OPTION_PSM: 257 | gb_psm = atoi(optarg); 258 | if( gb_psm < tesseract::PSM_AUTO_OSD || gb_psm == tesseract::PSM_AUTO_ONLY || gb_psm >= tesseract::PSM_COUNT ) { 259 | fprintf( stderr, "%s: error: invalid page segmentation mode: %s\n", tool, optarg ); 260 | return 1; 261 | } 262 | break; 263 | #if TESSERACT_VERSION >= 0x040000 264 | case OPTION_OEM: 265 | gb_oem = atoi(optarg); 266 | if( gb_oem < tesseract::OEM_TESSERACT_ONLY || gb_oem >= tesseract::OEM_COUNT ) { 267 | fprintf( stderr, "%s: error: invalid OCR engine mode: %s\n", tool, optarg ); 268 | return 1; 269 | } 270 | break; 271 | #endif 272 | case OPTION_LAYOUTLEVEL: 273 | gb_layoutlevel = parseLevel(optarg); 274 | if( gb_layoutlevel == -1 ) { 275 | fprintf( stderr, "%s: error: invalid level: %s\n", tool, optarg ); 276 | return 1; 277 | } 278 | break; 279 | case OPTION_TEXTLEVELS: 280 | test = std::stringstream(optarg); 281 | while( std::getline(test, token, ',') ) { 282 | int textlevel = parseLevel(token.c_str()); 283 | if( textlevel == -1 ) { 284 | fprintf( stderr, "%s: error: invalid level: %s\n", tool, token.c_str() ); 285 | return 1; 286 | } 287 | gb_textlevels[textlevel] = true; 288 | gb_textatlayout = false; 289 | } 290 | break; 291 | case OPTION_ONLYLAYOUT: 292 | gb_onlylayout 
= true; 293 | break; 294 | case OPTION_SAVECROPS: 295 | gb_save_crops = true; 296 | break; 297 | case OPTION_XPATH: 298 | gb_xpath = optarg; 299 | break; 300 | case OPTION_IMAGE: 301 | gb_image = optarg; 302 | break; 303 | case OPTION_DENSITY: 304 | gb_density = atoi(optarg); 305 | break; 306 | case OPTION_INPLACE: 307 | gb_inplace = true; 308 | break; 309 | case OPTION_OUTPUT: 310 | gb_output = optarg; 311 | break; 312 | case OPTION_HELP: 313 | print_usage(); 314 | return 0; 315 | case OPTION_VERSION: 316 | fprintf( stderr, "%s %s\n", tool, version+9 ); 317 | fprintf( stderr, "compiled against PageXML %s\n", PageXML::version() ); 318 | #ifdef TESSERACT_VERSION_STR 319 | fprintf( stderr, "compiled against tesseract %s, linked with %s\n", TESSERACT_VERSION_STR, tesseract::TessBaseAPI::Version() ); 320 | #else 321 | fprintf( stderr, "linked with tesseract %s\n", tesseract::TessBaseAPI::Version() ); 322 | #endif 323 | return 0; 324 | default: 325 | fprintf( stderr, "%s: error: incorrect input argument: %s\n", tool, argv[optind-1] ); 326 | return 1; 327 | } 328 | 329 | /// Default text level /// 330 | if ( gb_textatlayout ) 331 | gb_textlevels[gb_layoutlevel] = true; 332 | 333 | /// Check that there is at least one non-option argument /// 334 | if ( optind >= argc ) { 335 | fprintf( stderr, "%s: error: at least one input file must be provided, see usage with --help\n", tool ); 336 | return 1; 337 | } 338 | 339 | /// Initialize tesseract just for layout or with given language and tessdata path/// 340 | tesseract::TessBaseAPI *tessApi = new tesseract::TessBaseAPI(); 341 | 342 | if ( gb_onlylayout && gb_psm != tesseract::PSM_AUTO_OSD ) 343 | tessApi->InitForAnalysePage(); 344 | else 345 | #if TESSERACT_VERSION >= 0x040000 346 | if ( tessApi->Init( gb_tessdata, gb_lang, (tesseract::OcrEngineMode)gb_oem ) ) { 347 | #else 348 | if ( tessApi->Init( gb_tessdata, gb_lang) ) { 349 | #endif 350 | fprintf( stderr, "%s: error: could not initialize tesseract\n", tool ); 351 | return 
1; 352 | } 353 | 354 | tessApi->SetPageSegMode( (tesseract::PageSegMode)gb_psm ); 355 | 356 | PageXML page; 357 | int num_pages = 0; 358 | bool pixRelease = false; 359 | std::vector images; 360 | tesseract::ResultIterator* iter = NULL; 361 | 362 | std::regex reIsXml(".+\\.xml$|^-$",std::regex_constants::icase); 363 | std::regex reIsTiff(".+\\.tif{1,2}(|\\[[-, 0-9]+\\])$",std::regex_constants::icase); 364 | std::regex reIsPdf(".+\\.pdf(|\\[[-, 0-9]+\\])$",std::regex_constants::icase); 365 | std::regex reImagePageNum("(.+)\\[([-, 0-9]+)\\]$"); 366 | std::cmatch base_match; 367 | char *input_file = argv[optind]; 368 | bool input_xml = std::regex_match(input_file,base_match,reIsXml); 369 | 370 | /// Inplace only when XML input and output not specified /// 371 | if ( gb_inplace && ( ! input_xml || strcmp(gb_output,"-") ) ) { 372 | fprintf( stderr, "%s: warning: ignoring --inplace option, output to %s\n", tool, gb_output ); 373 | gb_inplace = false; 374 | } 375 | 376 | /// Info for process element /// 377 | char tool_info[128]; 378 | if ( gb_onlylayout ) 379 | snprintf( tool_info, sizeof tool_info, "%s_v%.10s tesseract_v%s", tool, version+9, tesseract::TessBaseAPI::Version() ); 380 | else 381 | snprintf( tool_info, sizeof tool_info, "%s_v%.10s tesseract_v%s lang=%s", tool, version+9, tesseract::TessBaseAPI::Version(), gb_lang ); 382 | 383 | /// Loop through input files /// 384 | for ( ; optind < argc; optind++ ) { 385 | input_file = argv[optind]; 386 | input_xml = std::regex_match(input_file,base_match,reIsXml); 387 | bool input_tiff = std::regex_match(input_file,base_match,reIsTiff); 388 | bool input_pdf = std::regex_match(input_file,base_match,reIsPdf); 389 | 390 | /// Get selected pages for tiff/pdf if given /// 391 | std::set pages_set; 392 | std::string page_sel; 393 | std::string input_file_str = std::string(input_file); 394 | if ( input_tiff || input_pdf ) { 395 | if( std::regex_match(input_file, base_match, reImagePageNum) ) { 396 | pages_set = 
parsePagesSet(base_match[2].str()); 397 | page_sel = std::string(base_match[2].str()); 398 | input_file_str = std::string(base_match[1].str()); 399 | } 400 | } 401 | 402 | /// Input is xml /// 403 | if ( input_xml ) { 404 | if ( num_pages > 0 ) { 405 | fprintf( stderr, "%s: error: only a single page xml allowed as input\n", tool ); 406 | return 1; 407 | } 408 | try { 409 | page.loadXml( input_file ); // if input_file is "-" xml is read from stdin 410 | } catch ( const std::exception& e ) { 411 | fprintf( stderr, "%s: error: problems reading xml file: %s\n%s\n", tool, input_file, e.what() ); 412 | return 1; 413 | } 414 | if ( gb_image != NULL ) { 415 | if ( page.count("//_:Page") > 1 ) { 416 | fprintf( stderr, "%s: error: specifying image with multipage xml input not supported\n", tool ); 417 | return 1; 418 | } 419 | page.loadImage( 0, gb_image ); 420 | } 421 | num_pages += page.count("//_:Page"); 422 | 423 | if ( gb_psm == tesseract::PSM_AUTO_OSD && page.count("//_:ImageOrientation") > 0 ) { 424 | fprintf( stderr, "%s: error: refusing to use OSD on page xml that already contains ImageOrientation elements\n", tool ); 425 | return 1; 426 | } 427 | 428 | std::vector sel = page.select(gb_xpath); 429 | int selPages = 0; 430 | for ( n=0; n<(int)sel.size(); n++ ) 431 | if ( page.nodeIs( sel[n], "Page" ) ) 432 | selPages++; 433 | if ( selPages > 0 && selPages != (int)sel.size() ) { 434 | fprintf( stderr, "%s: error: xpath can select Page or non-Page elements but not a mixture of both: %s\n", tool, gb_xpath ); 435 | return 1; 436 | } 437 | 438 | if ( selPages == 0 ) { 439 | pixRelease = true; 440 | images = page.crop( (std::string(gb_xpath)+"/_:Coords").c_str(), NULL, false ); 441 | page.releaseImages(); 442 | } 443 | else { 444 | for ( n=0; n<(int)sel.size(); n++ ) { 445 | NamedImage namedimage; 446 | namedimage.image = NULL; 447 | namedimage.node = sel[n]; 448 | images.push_back( namedimage ); 449 | num_pages++; 450 | } 451 | } 452 | } 453 | 454 | /// Input is tiff image 
/// 455 | else if ( input_tiff ) { 456 | pixRelease = true; 457 | 458 | /// Read input image /// 459 | PIXA* tiffimage = pixaReadMultipageTiff( input_file_str.c_str() ); 460 | if ( tiffimage == NULL || tiffimage->n == 0 ) { 461 | fprintf( stderr, "%s: error: problems reading tiff image: %s\n", tool, input_file ); 462 | return 1; 463 | } 464 | 465 | if ( pages_set.size() > 0 && tiffimage->n <= *pages_set.rbegin() ) { 466 | fprintf( stderr, "%s: error: invalid page selection (%s) on tiff with %d pages\n", tool, page_sel.c_str(), tiffimage->n+1 ); 467 | return 1; 468 | } 469 | 470 | for ( n=0; nn; n++ ) { 471 | if ( pages_set.size() > 0 && pages_set.find(n) == pages_set.end() ) 472 | continue; 473 | 474 | PageImage image = pixClone(tiffimage->pix[n]); 475 | std::string pagepath = input_file_str+"["+std::to_string(n)+"]"; 476 | NamedImage namedimage; 477 | namedimage.image = image; 478 | if ( num_pages == 0 ) 479 | namedimage.node = page.newXml( tool_info, pagepath.c_str(), pixGetWidth(image), pixGetHeight(image), gb_page_ns ); 480 | else 481 | namedimage.node = page.addPage( pagepath.c_str(), pixGetWidth(image), pixGetHeight(image) ); 482 | images.push_back( namedimage ); 483 | num_pages++; 484 | } 485 | 486 | pixaDestroy(&tiffimage); 487 | } 488 | 489 | /// Input is pdf /// 490 | else if ( input_pdf ) { 491 | std::vector< std::pair > pdf_pages = gsGetPdfPageSizes(input_file_str); 492 | if ( pages_set.size() > 0 && (int)pdf_pages.size() <= *pages_set.rbegin() ) { 493 | fprintf( stderr, "%s: error: invalid page selection (%s) on pdf with %d pages\n", tool, page_sel.c_str(), (int)pdf_pages.size() ); 494 | return 1; 495 | } 496 | 497 | for ( n=0; n<(int)pdf_pages.size(); n++ ) { 498 | if ( pages_set.size() > 0 && pages_set.find(n) == pages_set.end() ) 499 | continue; 500 | 501 | std::string pagepath = input_file_str+"["+std::to_string(n)+"]"; 502 | NamedImage namedimage; 503 | namedimage.image = NULL; 504 | if ( num_pages == 0 ) 505 | namedimage.node = page.newXml( 
tool_info, pagepath.c_str(), (int)(0.5+pdf_pages[n].first), (int)(0.5+pdf_pages[n].second), gb_page_ns ); 506 | else 507 | namedimage.node = page.addPage( pagepath.c_str(), (int)(0.5+pdf_pages[n].first), (int)(0.5+pdf_pages[n].second) ); 508 | images.push_back( namedimage ); 509 | num_pages++; 510 | } 511 | } 512 | 513 | /// Input is image /// 514 | else { 515 | /// Read input image /// 516 | PageImage image = pixRead( input_file ); 517 | if ( image == NULL ) { 518 | fprintf( stderr, "%s: error: problems reading image: %s\n", tool, input_file ); 519 | return 1; 520 | } 521 | 522 | NamedImage namedimage; 523 | namedimage.image = NULL; 524 | if ( num_pages == 0 ) 525 | namedimage.node = page.newXml( tool_info, input_file, pixGetWidth(image), pixGetHeight(image), gb_page_ns ); 526 | else 527 | namedimage.node = page.addPage( input_file, pixGetWidth(image), pixGetHeight(image) ); 528 | num_pages++; 529 | pixDestroy(&image); 530 | images.push_back( namedimage ); 531 | } 532 | } 533 | 534 | page.processStart(tool_info); 535 | 536 | /// Loop through all images to process /// 537 | for ( n=0; n<(int)images.size(); n++ ) { 538 | xmlNodePtr xpg = page.closest( "Page", images[n].node ); 539 | 540 | if ( images[n].image == NULL ) { 541 | try { 542 | page.loadImage(xpg, NULL, true, gb_density ); 543 | images[n].image = page.getPageImage(n); 544 | } catch ( const std::exception& e ) { 545 | fprintf( stderr, "%s: error: problems loading page image: %s :: %s\n", tool, page.getPageImageFilename(n).c_str(), e.what() ); 546 | return 1; 547 | } 548 | } 549 | 550 | tessApi->SetImage( images[n].image ); 551 | if ( gb_save_crops && input_xml ) { 552 | std::string fout = std::string("crop_")+std::to_string(n)+"_"+images[n].id+".png"; 553 | fprintf( stderr, "%s: writing cropped image: %s\n", tool, fout.c_str() ); 554 | pixWriteImpliedFormat( fout.c_str(), images[n].image, 0, 0 ); 555 | } 556 | 557 | /// For xml input setup node level /// 558 | xmlNodePtr node = NULL; 559 | int node_level = 
-1; 560 | if ( input_xml ) { 561 | node = images[n].node->parent; 562 | if ( page.nodeIs( node, "TextRegion" ) ) 563 | node_level = LEVEL_REGION; 564 | else if ( page.nodeIs( node, "TextLine" ) ) { 565 | node_level = LEVEL_LINE; 566 | if ( gb_psm != tesseract::PSM_SINGLE_LINE && gb_psm != tesseract::PSM_RAW_LINE ) { 567 | fprintf( stderr, "%s: error: for xml input selecting text lines, valid page segmentation modes are %d and %d\n", tool, tesseract::PSM_SINGLE_LINE, tesseract::PSM_RAW_LINE ); 568 | return 1; 569 | } 570 | } 571 | else if ( page.nodeIs( node, "Word" ) ) { 572 | node_level = LEVEL_WORD; 573 | if ( gb_psm != tesseract::PSM_SINGLE_WORD && gb_psm != tesseract::PSM_CIRCLE_WORD ) { 574 | fprintf( stderr, "%s: error: for xml input selecting words, valid page segmentation modes are %d and %d\n", tool, tesseract::PSM_SINGLE_WORD, tesseract::PSM_CIRCLE_WORD ); 575 | return 1; 576 | } 577 | } 578 | else if ( page.nodeIs( node, "Glyph" ) ) { 579 | node_level = LEVEL_GLYPH; 580 | if ( gb_psm != tesseract::PSM_SINGLE_CHAR ) { 581 | fprintf( stderr, "%s: error: for xml input selecting glyphs, the only valid page segmentation mode is %d\n", tool, tesseract::PSM_SINGLE_CHAR ); 582 | return 1; 583 | } 584 | } 585 | if ( gb_layoutlevel < node_level ) { 586 | fprintf( stderr, "%s: error: layout level lower than xpath selection level\n", tool ); 587 | return 1; 588 | } 589 | } 590 | 591 | /// Perform layout analysis /// 592 | if ( gb_onlylayout && gb_psm != tesseract::PSM_AUTO_OSD ) 593 | iter = (tesseract::ResultIterator*)( tessApi->AnalyseLayout() ); 594 | 595 | /// Perform recognition /// 596 | else { 597 | tessApi->Recognize( 0 ); 598 | iter = tessApi->GetIterator(); 599 | } 600 | 601 | if ( iter != NULL && ! 
iter->Empty( tesseract::RIL_BLOCK ) ) { 602 | /// Orientation and Script Detection /// 603 | tesseract::Orientation orientation; 604 | tesseract::WritingDirection writing_direction; 605 | tesseract::TextlineOrder textline_order; 606 | float deskew_angle; 607 | iter->Orientation( &orientation, &writing_direction, &textline_order, &deskew_angle ); 608 | 609 | if ( gb_psm == tesseract::PSM_AUTO_OSD ) { 610 | if ( deskew_angle != 0.0 ) 611 | page.setProperty( xpg, "deskewAngle", deskew_angle ); 612 | switch ( orientation ) { 613 | case tesseract::ORIENTATION_PAGE_RIGHT: page.setProperty( xpg, "apply-image-orientation", -90 ); break; 614 | case tesseract::ORIENTATION_PAGE_LEFT: page.setProperty( xpg, "apply-image-orientation", 90 ); break; 615 | case tesseract::ORIENTATION_PAGE_DOWN: page.setProperty( xpg, "apply-image-orientation", 180 ); break; 616 | default: break; 617 | } 618 | switch ( writing_direction ) { 619 | case tesseract::WRITING_DIRECTION_LEFT_TO_RIGHT: page.setProperty( xpg, "readingDirection", "left-to-right" ); break; 620 | case tesseract::WRITING_DIRECTION_RIGHT_TO_LEFT: page.setProperty( xpg, "readingDirection", "right-to-left" ); break; 621 | case tesseract::WRITING_DIRECTION_TOP_TO_BOTTOM: page.setProperty( xpg, "readingDirection", "top-to-bottom" ); break; 622 | } 623 | switch ( textline_order ) { 624 | case tesseract::TEXTLINE_ORDER_LEFT_TO_RIGHT: page.setProperty( xpg, "textLineOrder", "left-to-right" ); break; 625 | case tesseract::TEXTLINE_ORDER_RIGHT_TO_LEFT: page.setProperty( xpg, "textLineOrder", "right-to-left" ); break; 626 | case tesseract::TEXTLINE_ORDER_TOP_TO_BOTTOM: page.setProperty( xpg, "textLineOrder", "top-to-bottom" ); break; 627 | } 628 | } 629 | 630 | /// Loop through blocks /// 631 | int block = 0; 632 | while ( gb_layoutlevel >= LEVEL_REGION ) { 633 | /// Skip non-text blocks /// 634 | /* 635 | 0 PT_UNKNOWN, // Type is not yet known. Keep as the first element. 636 | 1 PT_FLOWING_TEXT, // Text that lives inside a column. 
637 | 2 PT_HEADING_TEXT, // Text that spans more than one column. 638 | 3 PT_PULLOUT_TEXT, // Text that is in a cross-column pull-out region. 639 | 4 PT_EQUATION, // Partition belonging to an equation region. 640 | 5 PT_INLINE_EQUATION, // Partition has inline equation. 641 | 6 PT_TABLE, // Partition belonging to a table region. 642 | 7 PT_VERTICAL_TEXT, // Text-line runs vertically. 643 | 8 PT_CAPTION_TEXT, // Text that belongs to an image. 644 | 9 PT_FLOWING_IMAGE, // Image that lives inside a column. 645 | 10 PT_HEADING_IMAGE, // Image that spans more than one column. 646 | 11 PT_PULLOUT_IMAGE, // Image that is in a cross-column pull-out region. 647 | 12 PT_HORZ_LINE, // Horizontal Line. 648 | 13 PT_VERT_LINE, // Vertical Line. 649 | 14 PT_NOISE, // Lies outside of any column. 650 | */ 651 | if ( iter->BlockType() > tesseract::PT_CAPTION_TEXT ) { 652 | if ( ! iter->Next( tesseract::RIL_BLOCK ) ) 653 | break; 654 | continue; 655 | } 656 | 657 | block++; 658 | 659 | xmlNodePtr xreg = NULL; 660 | std::string rid = "b" + std::to_string(block); 661 | 662 | /// If xml input and region selected, prepend id to rid and set xreg to node /// 663 | if ( node_level == LEVEL_REGION ) { 664 | rid = std::string(images[n].id) + "_" + rid; 665 | xreg = node; 666 | } 667 | 668 | /// If it is multipage, prepend page number to rid /// 669 | if ( num_pages > 1 ) 670 | rid = std::string("pg") + std::to_string(1+page.getPageNumber(xpg)) + "_" + rid; 671 | 672 | /// Otherwise add block as TextRegion element /// 673 | if ( node_level < LEVEL_REGION ) { 674 | xreg = page.addTextRegion( xpg, rid.c_str() ); 675 | 676 | /// Set block bounding box and text /// 677 | setCoords( iter, tesseract::RIL_BLOCK, page, xreg, images[n].x, images[n].y ); 678 | if ( ! 
gb_onlylayout && gb_textlevels[LEVEL_REGION] ) 679 | setTextEquiv( iter, tesseract::RIL_BLOCK, page, xreg ); 680 | } 681 | 682 | /// Set rotation and reading direction /// 683 | /*tesseract::Orientation orientation; 684 | tesseract::WritingDirection writing_direction; 685 | tesseract::TextlineOrder textline_order; 686 | float deskew_angle;*/ 687 | iter->Orientation( &orientation, &writing_direction, &textline_order, &deskew_angle ); 688 | if ( ! input_xml || node_level <= LEVEL_REGION ) { 689 | if ( deskew_angle != 0.0 ) 690 | page.setProperty( xpg, "deskewAngle", deskew_angle ); 691 | PAGEXML_READ_DIRECTION direct = PAGEXML_READ_DIRECTION_LTR; 692 | switch( writing_direction ) { 693 | case tesseract::WRITING_DIRECTION_LEFT_TO_RIGHT: direct = PAGEXML_READ_DIRECTION_LTR; break; 694 | case tesseract::WRITING_DIRECTION_RIGHT_TO_LEFT: direct = PAGEXML_READ_DIRECTION_RTL; break; 695 | case tesseract::WRITING_DIRECTION_TOP_TO_BOTTOM: direct = PAGEXML_READ_DIRECTION_TTB; break; 696 | } 697 | page.setReadingDirection( xreg, direct ); 698 | /*float orient = 0.0; 699 | switch( orientation ) { 700 | case tesseract::ORIENTATION_PAGE_UP: orient = 0.0; break; 701 | case tesseract::ORIENTATION_PAGE_RIGHT: orient = -90.0; break; 702 | case tesseract::ORIENTATION_PAGE_LEFT: orient = 90.0; break; 703 | case tesseract::ORIENTATION_PAGE_DOWN: orient = 180.0; break; 704 | } 705 | page.setRotation( xreg, orient );*/ 706 | } 707 | 708 | /// Loop through paragraphs in current block /// 709 | int para = 0; 710 | while ( gb_layoutlevel >= LEVEL_REGION ) { 711 | para++; 712 | 713 | /// Loop through lines in current paragraph /// 714 | int line = 0; 715 | while ( gb_layoutlevel >= LEVEL_LINE ) { 716 | line++; 717 | 718 | xmlNodePtr xline = NULL; 719 | 720 | /// If xml input and line selected, set xline to node /// 721 | if ( node_level == LEVEL_LINE ) 722 | xline = node; 723 | 724 | /// Otherwise add TextLine element /// 725 | else if ( node_level < LEVEL_LINE ) { 726 | std::string lid = rid 
+ "_p" + std::to_string(para) + "_l" + std::to_string(line); 727 | xline = page.addTextLine( xreg, lid.c_str() ); 728 | } 729 | 730 | /// Set line bounding box, baseline and text /// 731 | if ( xline != NULL ) { 732 | setLineCoords( iter, tesseract::RIL_TEXTLINE, page, xline, images[n].x, images[n].y, orientation ); 733 | if ( ! gb_onlylayout && gb_textlevels[LEVEL_LINE] ) 734 | setTextEquiv( iter, tesseract::RIL_TEXTLINE, page, xline ); 735 | } 736 | 737 | /// Loop through words in current text line /// 738 | while ( gb_layoutlevel >= LEVEL_WORD ) { 739 | xmlNodePtr xword = NULL; 740 | 741 | /// If xml input and word selected, set xword to node /// 742 | if ( node_level == LEVEL_WORD ) 743 | xword = node; 744 | 745 | /// Otherwise add Word element /// 746 | else if ( node_level < LEVEL_WORD ) 747 | xword = page.addWord( xline ); 748 | 749 | /// Set word bounding box and text /// 750 | if ( xword != NULL ) { 751 | setCoords( iter, tesseract::RIL_WORD, page, xword, images[n].x, images[n].y, orientation ); 752 | if ( ! gb_onlylayout && gb_textlevels[LEVEL_WORD] ) 753 | setTextEquiv( iter, tesseract::RIL_WORD, page, xword ); 754 | } 755 | 756 | /// Loop through symbols in current word /// 757 | while ( gb_layoutlevel >= LEVEL_GLYPH ) { 758 | /// Set xglyph to node or add new Glyph element depending on the case /// 759 | xmlNodePtr xglyph = node_level == LEVEL_GLYPH ? node : page.addGlyph( xword ); 760 | 761 | /// Set symbol bounding box and text /// 762 | setCoords( iter, tesseract::RIL_SYMBOL, page, xglyph, images[n].x, images[n].y, orientation ); 763 | if ( ! 
gb_onlylayout && gb_textlevels[LEVEL_GLYPH] ) 764 | setTextEquiv( iter, tesseract::RIL_SYMBOL, page, xglyph ); 765 | 766 | if ( iter->IsAtFinalElement( tesseract::RIL_WORD, tesseract::RIL_SYMBOL ) ) 767 | break; 768 | iter->Next( tesseract::RIL_SYMBOL ); 769 | } // while ( gb_layoutlevel >= LEVEL_GLYPH ) { 770 | 771 | if ( iter->IsAtFinalElement( tesseract::RIL_TEXTLINE, tesseract::RIL_WORD ) ) 772 | break; 773 | iter->Next( tesseract::RIL_WORD ); 774 | } // while ( gb_layoutlevel >= LEVEL_WORD ) { 775 | 776 | if ( iter->IsAtFinalElement( tesseract::RIL_PARA, tesseract::RIL_TEXTLINE ) ) 777 | break; 778 | iter->Next( tesseract::RIL_TEXTLINE ); 779 | } // while ( gb_layoutlevel >= LEVEL_LINE ) { 780 | 781 | if ( iter->IsAtFinalElement( tesseract::RIL_BLOCK, tesseract::RIL_PARA ) ) 782 | break; 783 | iter->Next( tesseract::RIL_PARA ); 784 | } // while ( gb_layoutlevel >= LEVEL_REGION ) { 785 | 786 | if ( ! iter->Next( tesseract::RIL_BLOCK ) ) 787 | break; 788 | } // while ( gb_layoutlevel >= LEVEL_REGION ) { 789 | } // if ( iter != NULL && ! 
iter->Empty( tesseract::RIL_BLOCK ) ) { 790 | page.releaseImage(xpg); 791 | } // for ( n=0; n<(int)images.size(); n++ ) { 792 | 793 | /// Apply image orientations /// 794 | std::vector sel = page.select("//_:Page[_:Property/@key='apply-image-orientation']"); 795 | for ( n=(int)sel.size()-1; n>=0; n-- ) { 796 | int angle = atoi( page.getPropertyValue( sel[n], "apply-image-orientation" ).c_str() ); 797 | if ( angle ) 798 | page.rotatePage( -angle, sel[n], true ); 799 | page.rmElems( page.select("_:Property[@key='apply-image-orientation']", sel[n]) ); 800 | std::vector lines = page.select(".//_:TextLine",sel[n]); 801 | /// Fix image orientation using baselines /// 802 | if ( lines.size() > 0 ) { 803 | double domangle = page.getDominantBaselinesOrientation(lines); 804 | angle = 0; 805 | if ( domangle >= M_PI/4 && domangle < 3*M_PI/4 ) 806 | angle = -90; 807 | else if ( domangle <= -M_PI/4 && domangle > -3*M_PI/4 ) 808 | angle = 90; 809 | else if ( domangle >= 3*M_PI/4 || domangle <= -3*M_PI/4 ) 810 | angle = 180; 811 | if ( angle ) 812 | page.rotatePage(angle, sel[n], true); 813 | } 814 | } 815 | 816 | /// Fill in "0,0 0,0" Word Coords /// 817 | sel = page.select("//_:Word[_:Coords/@points='0,0 0,0']"); 818 | for ( n=(int)sel.size()-1; n>=0; n-- ) { 819 | xmlNodePtr elem = sel[n]; 820 | xmlNodePtr elem_pre = page.selectNth("preceding-sibling::_:Word[_:Coords/@points!='0,0 0,0']", -1, elem); 821 | xmlNodePtr elem_fol = page.selectNth("following-sibling::_:Word[_:Coords/@points!='0,0 0,0']", 0, elem); 822 | if ( elem_pre == NULL && elem_fol == NULL ) { 823 | page.setCoords(elem, page.getPoints(page.parent(elem))); 824 | page.setProperty(elem, "coords-unk-filler"); 825 | continue; 826 | } 827 | std::vector pts_pre = page.getPoints(elem_pre); 828 | std::vector pts_fol = page.getPoints(elem_fol); 829 | std::vector pts; 830 | if ( elem_pre != NULL && elem_fol != NULL ) { 831 | pts.push_back(pts_pre[1]); 832 | pts.push_back(pts_fol[0]); 833 | pts.push_back(pts_fol[3]); 834 | 
pts.push_back(pts_pre[2]); 835 | } 836 | else if ( elem_pre != NULL ) { 837 | cv::Point2f upper = pts_pre[1] - pts_pre[0]; 838 | cv::Point2f lower = pts_pre[2] - pts_pre[3]; 839 | upper = upper/cv::norm(upper) + pts_pre[1]; 840 | lower = lower/cv::norm(lower) + pts_pre[2]; 841 | pts.push_back(pts_pre[1]); 842 | pts.push_back(upper); 843 | pts.push_back(lower); 844 | pts.push_back(pts_pre[2]); 845 | } 846 | else { 847 | cv::Point2f upper = pts_fol[0] - pts_fol[1]; 848 | cv::Point2f lower = pts_fol[3] - pts_fol[2]; 849 | upper = upper/cv::norm(upper) + pts_fol[0]; 850 | lower = lower/cv::norm(lower) + pts_fol[3]; 851 | pts.push_back(upper); 852 | pts.push_back(pts_fol[0]); 853 | pts.push_back(pts_fol[3]); 854 | pts.push_back(lower); 855 | } 856 | page.setCoords(elem, pts); 857 | page.setProperty(elem, "coords-unk-filler"); 858 | } 859 | 860 | /// Try to make imageFilename be a relative path w.r.t. the output XML /// 861 | if ( ! input_xml && ! gb_inplace && strcmp(gb_output,"-") ) 862 | page.relativizeImageFilename(gb_output); 863 | 864 | /// Write resulting XML /// 865 | int bytes = page.write( gb_inplace ? input_file : gb_output ); 866 | if ( bytes <= 0 ) 867 | fprintf( stderr, "%s: error: problems writing to output xml\n", tool ); 868 | 869 | /// Release resources /// 870 | if ( pixRelease ) 871 | for ( n=0; n<(int)images.size(); n++ ) 872 | pixDestroy(&(images[n].image)); 873 | tessApi->End(); 874 | delete tessApi; 875 | delete iter; 876 | 877 | return bytes <= 0 ? 
# NOTE(review): in the collapsed dump this span opened with the last tokens of the
# preceding file and the separator of this one; they are kept here as comments:
#   1 : 0; 878 | } 879 |
#   /tesseract_recognize_api.py:
#!/usr/bin/env python3
"""Command line tool for the tesseract-recognize API server."""

"""
@author Mauricio Villegas
@copyright Copyright(c) 2017-present, Mauricio Villegas

@requirements pagexml-slim>=2022.4.12
@requirements jsonargparse>=4.38.0
@requirements flask-restx>=1.3.0
"""

import os
import re
import sys
import json
import shutil
import queue
import threading
import tempfile
import pagexml
from time import time
from functools import wraps
from subprocess import Popen, PIPE, STDOUT
from jsonargparse import ArgumentParser, ActionYesNo
from flask import Flask, Response, abort
from flask_restx import Api, Resource, reqparse
from werkzeug.datastructures import FileStorage
from werkzeug.exceptions import BadRequest


def get_cli_parser(logger=True):
    """Returns the parser object for the command line tool.

    Args:
        logger: Value forwarded as the ``logger`` argument of the jsonargparse
            ArgumentParser.

    Returns:
        ArgumentParser: Parser with the --cfg, --threads, --prefix, --host,
        --port and --debug options.
    """
    parser = ArgumentParser(
        logger=logger,
        default_env=True,
        description=__doc__)

    parser.add_argument('--cfg',
        action='config',
        help='Path to a yaml configuration file.')
    parser.add_argument('--threads',
        type=int,
        default=4,
        help='Maximum number of tesseract-recognize instances to run in parallel.')
    parser.add_argument('--prefix',
        default='/tesseract-recognize',
        help='Prefix string for all API endpoints. Use "%%s" in string to replace by the API version.')
    parser.add_argument('--host',
        default='127.0.0.1',
        help='Hostname to listen on.')
    parser.add_argument('--port',
        type=int,
        default=5000,
        help='Port for the server.')
    parser.add_argument('--debug',
        action=ActionYesNo,
        default=False,
        help='Whether to run in debugging mode.')

    return parser


def TypePageXML(value):
    """Parse Page XML request type.

    Args:
        value: The raw type value, expected to be a werkzeug FileStorage.

    Returns:
        dict[str, {str,PageXML}]: Dictionary including the page xml 'filename', the 'string' representation and the PageXML 'object'.

    Raises:
        ValueError: If value is not a FileStorage (or its content is not valid
            UTF-8, since UnicodeDecodeError is a ValueError).
    """
    # isinstance instead of an exact type comparison so subclasses are accepted.
    if not isinstance(value, FileStorage):
        raise ValueError('Expected pagexml to be of type FileStorage.')

    spxml = value.read().decode('utf-8')
    pxml = pagexml.PageXML()
    pxml.loadXmlString(spxml)

    return {'filename': value.filename, 'object': pxml, 'string': spxml}


class ParserPageXML(reqparse.RequestParser):
    """Class for parsing requests including a Page XML."""

    def parse_args(self, **kwargs):
        """Extension of parse_args that additionally does some Page XML checks.

        When both a Page XML and images are received, every received image must
        be referenced by the Page XML and every referenced image must have been
        received.

        Raises:
            BadRequest: If the received images do not match the ones referenced.
        """
        req_dict = super().parse_args(**kwargs)

        if req_dict['pagexml'] is not None and req_dict['images'] is not None:
            pxml = req_dict['pagexml']['object']
            # A trailing "[N]" suffix of imageFilename is stripped before comparing.
            images_xml = {re.sub(r'\[[0-9]+]$', '', pxml.getAttr(page, 'imageFilename'))
                          for page in pxml.select('//_:Page')}
            images_received = [os.path.basename(x.filename) for x in req_dict['images']]
            for fname in images_received:
                if fname not in images_xml:
                    raise BadRequest('Received image not referenced in the Page XML: '+fname)
            # Compare as sets: at this point received is a subset of referenced, so
            # any difference means missing images. Comparing lengths of a set and a
            # list would be fooled by duplicated uploads.
            received_set = set(images_received)
            if received_set != images_xml:
                raise BadRequest('Expected to receive all images referenced in the Page XML ('+str(len(images_xml))+') but only got a subset ('+str(len(received_set))+')')

        return req_dict


def write_to_tmpdir(req_dict, prefix='tesseract_recognize_api_tmp_', basedir='/tmp'):
    """Writes images and page xml from a request to a temporary directory.

    Args:
        req_dict (dict): Parsed Page XML request.
        prefix (str): Prefix for temporary directory name.
        basedir (str): Base temporary directory.

    Returns:
        The path to the temporary directory where the files were saved.
    """
    tmpdir = tempfile.mkdtemp(prefix=prefix, dir=basedir)
    if req_dict['pagexml'] is not None:
        fxml = os.path.basename(req_dict['pagexml']['filename'])
        # The string was decoded from UTF-8 in TypePageXML, so write it back as
        # UTF-8 instead of relying on the locale's default encoding.
        with open(os.path.join(tmpdir, fxml), 'w', encoding='utf-8') as f:
            f.write(req_dict['pagexml']['string'])
    if req_dict['images'] is not None:
        for image in req_dict['images']:
            # basename() prevents client file names from escaping tmpdir.
            image.save(os.path.join(tmpdir, os.path.basename(image.filename)))
    return tmpdir


class images_pagexml_request:
    """Decorator class for endpoints receiving images with optionally a page xml and responding with a page xml."""

    def __init__(self,
                 api,
                 images_help='Images with file names as referenced in the Page XML if given.',
                 pagexml_help='Optional valid Page XML file.',
                 options_help='Optional configuration options to be used for processing.',
                 response_help='Resulting Page XML after processing.'):
        """Initializer for images_pagexml_request class.

        Args:
            api (flask_restx.Api): The flask_restx Api instance.
            images_help (str): Help for images field in swagger documentation.
            pagexml_help (str): Help for pagexml field in swagger documentation.
            options_help (str): Help for options field in swagger documentation.
            response_help (str): Help for pagexml response in swagger documentation.
        """
        self.api = api
        self.response_help = response_help

        parser = ParserPageXML(bundle_errors=True)
        parser.add_argument('images',
                            location='files',
                            type=FileStorage,
                            required=True,
                            action='append',
                            help=images_help)
        parser.add_argument('pagexml',
                            location='files',
                            type=TypePageXML,
                            required=False,
                            help=pagexml_help)
        parser.add_argument('options',
                            location='form',
                            type=str,
                            required=False,
                            default=[],
                            action='append',
                            help=options_help)
        self.parser = parser

    def __call__(self, method):
        """Makes a flask_restx.Resource method expect a page xml and/or respond with a page xml.

        The decorated method is called as ``method(resource, req_dict)`` and must
        return an object providing ``toString(bool)``; its result is sent back
        as an XML response.
        """
        method = self.api.expect(self.parser)(method)
        method = self.api.response(200, description=self.response_help)(method)
        method = self.api.produces(['application/xml'])(method)

        @wraps(method)
        def images_pagexml_request_wrapper(resource):
            # 'resource' is the Resource instance (the method's self).
            req_dict = self.parser.parse_args()
            pxml = method(resource, req_dict)
            return Response(
                pxml.toString(True),
                mimetype='application/xml',
                headers={'Content-type': 'application/xml; charset=utf-8'})

        return images_pagexml_request_wrapper


def run_tesseract_recognize(*args):
    """Runs a tesseract-recognize command using given arguments.

    Args:
        *args: Command line arguments appended after 'tesseract-recognize'.

    Returns:
        tuple[int, str]: The return code and the combined stdout/stderr output.
    """
    cmd = ['tesseract-recognize', *args]

    proc = Popen(cmd, shell=False, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
    # communicate() closes the child's stdin and drains stdout until EOF. Reading
    # stdout directly while stdin stays open can block forever if the child
    # waits for input.
    raw_out, _ = proc.communicate()
    # errors='replace' so that unexpected bytes in the output cannot raise here.
    cmd_out = raw_out.decode('utf-8', errors='replace')

    return proc.returncode, cmd_out


if __name__ == '__main__':
    ## Parse config ##
    parser = get_cli_parser(logger=os.path.basename(__file__))
    cfg = parser.parse_args(env=True)

    ## Create a Flask WSGI application ##
    app = Flask(__name__)  # pylint: disable=invalid-name
    app.logger = parser.logger

    ## Create a Flask-RESTX API ##
    api = Api(app,
              doc=cfg.prefix+'/swagger',
              version='2.0',
              prefix=cfg.prefix,
              title='tesseract-recognize API',
              description='An API for running tesseract-recognition jobs.')

    ## Queue of (done_queue, req_dict) jobs consumed by the processor threads ##
    process_queue = queue.Queue()  # type: ignore


    ## Definition of endpoints ##
    @api.route('/version')
    class ServiceVersion(Resource):
        @api.response(200, description='Version of the running service.')
        @api.produces(['text/plain'])
        def get(self):
            """Endpoint to get the version of the running service."""
            rc, out = run_tesseract_recognize('--version')
            if rc != 0:
                abort(500, 'problems getting version from tesseract-recognize command :: '+str(out))
            return Response(out, mimetype='text/plain')


    @api.route('/help')
    class ServiceHelp(Resource):
        @api.response(200, description='Help for the running service.')
        @api.produces(['text/plain'])
        def get(self):
            """Endpoint to get the help for the running service."""
            rc, out = run_tesseract_recognize('--help')
            if rc != 0:
                abort(500, 'problems getting help from tesseract-recognize command :: '+str(out))
            return Response(out, mimetype='text/plain')


    @api.route('/process')
    class ProcessRequest(Resource):
        @images_pagexml_request(api)
        @api.doc(responses={400: 'tesseract-recognize execution failed.'})
        def post(self, req_dict):
            """Endpoint for running tesseract-recognize on given images or page xml file."""
            start_time = time()
            done_queue = queue.Queue()
            process_queue.put((done_queue, req_dict))
            # Block until a processor thread answers; no need to poll with a timeout.
            thread, num_requests, pxml = done_queue.get()
            if isinstance(pxml, Exception):
                app.logger.error('Request '+str(num_requests)+' on thread '+str(thread)+' unsuccessful, '
                                 +('%.4g' % (time()-start_time))+' sec. :: '+str(pxml))
                abort(400, 'processing failed :: '+str(pxml))
            else:
                app.logger.info('Request '+str(num_requests)+' on thread '+str(thread)+' successful, '
                                +('%.4g' % (time()-start_time))+' sec.')
                return pxml


    ## Processor thread function ##
    def start_processing(thread, process_queue):
        """Serves jobs from process_queue forever.

        For each (done_queue, req_dict) job the request files are written to a
        temporary directory, tesseract-recognize is run on them and either the
        resulting PageXML or the raised exception is put on done_queue as
        (thread, num_requests, result).

        Args:
            thread (int): Identifier of this processor thread, used for logging.
            process_queue (queue.Queue): Queue from which jobs are taken.
        """
        num_requests = 0
        while True:
            done_queue, req_dict = process_queue.get()
            num_requests += 1
            tmpdir = None
            try:
                tmpdir = write_to_tmpdir(req_dict)

                # NOTE(review): options come from the client and are passed
                # straight to the command line (no shell is involved) - confirm
                # that every tesseract-recognize option is safe to expose.
                opts = list(req_dict['options'])
                # A single option starting with '[' is interpreted as a JSON list.
                if len(opts) == 1 and opts[0].startswith('['):
                    opts = json.loads(opts[0])
                if req_dict['pagexml'] is not None:
                    opts.append(os.path.join(tmpdir, os.path.basename(req_dict['pagexml']['filename'])))
                elif req_dict['images'] is not None:
                    for image in req_dict['images']:
                        opts.append(os.path.join(tmpdir, os.path.basename(image.filename)))
                else:
                    raise KeyError('No images found in request.')
                opts.extend(['-o', os.path.join(tmpdir, 'output.xml')])

                rc, out = run_tesseract_recognize(*opts)
                if rc != 0:
                    raise RuntimeError('tesseract-recognize execution failed :: opts: '+str(opts)+' :: '+str(out))

                pxml = pagexml.PageXML(os.path.join(tmpdir, 'output.xml'))
                done_queue.put((thread, num_requests, pxml))

            except json.decoder.JSONDecodeError as ex:
                done_queue.put((thread, num_requests, RuntimeError('JSONDecodeError: '+str(ex)+' while parsing '+opts[0])))
            except Exception as ex:
                done_queue.put((thread, num_requests, ex))
            finally:
                # In debug mode the directory is kept for inspection. Cleanup errors
                # are ignored so that they cannot terminate the processor thread.
                if not cfg.debug and tmpdir is not None:
                    shutil.rmtree(tmpdir, ignore_errors=True)


    for thread in range(cfg.threads):
        # Daemon threads so that the process can exit once app.run() returns.
        threading.Thread(target=start_processing,
                         args=(thread+1, process_queue),
                         daemon=True).start()


    app.run(host=cfg.host, port=cfg.port, debug=cfg.debug)