├── .bumpversion.cfg
├── .dockerignore
├── .github
    └── FUNDING.yaml
├── .gitignore
├── .gitmodules
├── CMakeLists.txt
├── Dockerfile-pkg
├── Dockerfile-pkg-langs
├── Dockerfile_build.sh
├── LICENSE.md
├── PageXML.cc
├── PageXML.h
├── README.md
├── githook-pre-commit
├── mock_cv.h
├── old
    ├── Dockerfile_ubuntu-langs
    ├── Dockerfile_ubuntu16.04-github-master
    ├── Dockerfile_ubuntu16.04-pkg
    ├── Dockerfile_ubuntu18.04-github-master
    └── Dockerfile_ubuntu18.04-pkg
├── tesseract-recognize.cc
└── tesseract_recognize_api.py


/.bumpversion.cfg:
--------------------------------------------------------------------------------
1 | [bumpversion]
2 | current_version = 2025.03.31
3 | commit = True
4 | tag = True
5 | tag_name = {new_version}
6 | 
7 | [bumpversion:file:tesseract-recognize.cc]
8 | 


--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
 1 | dockerfiles
 2 | Dockerfile*
 3 | .git
 4 | git*
 5 | *.md
 6 | CMakeFiles
 7 | CMakeCache.txt
 8 | cmake_install.cmake
 9 | Makefile
10 | 


--------------------------------------------------------------------------------
/.github/FUNDING.yaml:
--------------------------------------------------------------------------------
1 | github: mauvilsa
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | CMakeCache.txt
2 | CMakeFiles
3 | Makefile
4 | cmake_install.cmake
5 | tesseract-recognize
6 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "pagexml"]
2 | 	path = pagexml
3 | 	url = https://github.com/omni-us/pagexml.git
4 | [submodule "CMakeModules"]
5 | 	path = CMakeModules
6 | 	url = https://github.com/lbaehren/CMakeModules.git
7 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required( VERSION 2.8.12 )
 2 | project( tesseract-recognize )
 3 | set( tool_EXE tesseract-recognize )
 4 | list( APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/CMakeModules" )
 5 | include( FindPackageHandleStandardArgs )
 6 | #find_package( LibMagic )
 7 | find_package( Ghostscript )
 8 | find_package( PkgConfig )
 9 | pkg_check_modules( lept REQUIRED lept )
10 | pkg_check_modules( tesseract REQUIRED tesseract )
11 | pkg_check_modules( libxml REQUIRED libxml-2.0>=2.9 )
12 | pkg_check_modules( libxslt REQUIRED libxslt )
13 | 
14 | file( GLOB tool_SRC "*.cc" )
15 | add_executable( ${tool_EXE} ${tool_SRC} )
16 | set_property( TARGET ${tool_EXE} PROPERTY CXX_STANDARD 11 )
17 | 
18 | include_directories( SYSTEM ${tesseract_INCLUDEDIR} )
19 | 
20 | add_definitions( -D__PAGEXML_LEPT__ )
21 | #add_definitions( -D__PAGEXML_MAGICK__ )
22 | add_definitions( -D__PAGEXML_GS__ )  # TODO: pdf support is broken, gsRenderPdfPageToPng generates empty png
23 | add_definitions( -D__PAGEXML_SLIM__ )
24 | 
25 | set( CMAKE_REQUIRED_INCLUDES "${CMAKE_REQUIRED_INCLUDES};${GHOSTSCRIPT_INCLUDES}" )
26 | 
27 | string( REPLACE ";" " " CFLAGS_STR "-Wall -W ${lept_CFLAGS} ${tesseract_CFLAGS} ${Magick_CFLAGS} ${libxml_CFLAGS} ${libxslt_CFLAGS}" )
28 | set_target_properties( ${tool_EXE} PROPERTIES COMPILE_FLAGS "${CFLAGS_STR}" )
29 | 
30 | include_directories( SYSTEM ${Magick_INCLUDEDIR} ) # To suppress system header warnings
31 | 
32 | #target_link_libraries( ${tool_EXE} ${lept_LDFLAGS} ${tesseract_LDFLAGS} ${libxml_LDFLAGS} -lOpenCL )
33 | target_link_libraries( ${tool_EXE} ${lept_LDFLAGS} ${tesseract_LDFLAGS} ${Magick_LDFLAGS} ${GHOSTSCRIPT_LIBRARIES} ${libxml_LDFLAGS} ${libxslt_LDFLAGS} )
34 | 
35 | install( TARGETS ${tool_EXE} DESTINATION bin )
36 | add_custom_target( install-docker
37 |   cp ${CMAKE_HOME_DIRECTORY}/tesseract-recognize-docker ${CMAKE_HOME_DIRECTORY}/tesseract_recognize_api.py ${CMAKE_INSTALL_PREFIX}/bin )
38 | 
39 | add_custom_target( realclean cd ${CMAKE_HOME_DIRECTORY} COMMAND rm -fr ${tool_EXE} ${tool_EXE}.exe ${tool_EXE}.dSYM CMakeFiles CMakeCache.txt cmake_install.cmake install_manifest.txt Makefile )
40 | 


--------------------------------------------------------------------------------
/Dockerfile-pkg:
--------------------------------------------------------------------------------
 1 | ARG UBUNTU_TAG=24.04
 2 | FROM ubuntu:$UBUNTU_TAG
 3 | 
 4 | ENV DEBIAN_FRONTEND=noninteractive
 5 | 
 6 | RUN apt-get update --fix-missing \
 7 |  && apt-get install -y --no-install-recommends \
 8 |       build-essential \
 9 |       cmake \
10 |       ghostscript \
11 |       libgs-dev \
12 |       libleptonica-dev \
13 |       libtesseract-dev \
14 |       libxml2-dev \
15 |       libxslt1-dev \
16 |       pkg-config \
17 |       python3-pip
18 | 
19 | COPY CMakeModules /tmp/tesseract-recognize/CMakeModules
20 | COPY pagexml /tmp/tesseract-recognize/pagexml
21 | COPY CMakeLists.txt Dockerfile* PageXML* mock_cv.h tesseract-recognize* /tmp/tesseract-recognize/
22 | 
23 | RUN cd /tmp/tesseract-recognize \
24 |  && cmake -DCMAKE_BUILD_TYPE=Release . \
25 |  && make
26 | 
27 | 
28 | FROM ubuntu:$UBUNTU_TAG
29 | 
30 | LABEL maintainer="Mauricio Villegas <mauricio_ville@yahoo.com>"
31 | 
32 | RUN apt-get update --fix-missing \
33 |  && apt-get install -y --no-install-recommends \
34 |       ghostscript \
35 |       libxslt1.1 \
36 |       tesseract-ocr \
37 |       python3-pip \
38 |  && apt-get autoremove -y \
39 |  && apt-get purge -y $(dpkg -l | awk '{if($1=="rc")print $2}') \
40 |  && apt-get clean \
41 |  && rm -rf /var/lib/apt/lists/*
42 | 
43 | COPY --from=0 /tmp/tesseract-recognize/tesseract-recognize /usr/local/bin/
44 | COPY tesseract_recognize_api.py /usr/local/bin/
45 | RUN sed -n '/^@requirements /{ s|^@requirements ||; p; }' /usr/local/bin/tesseract_recognize_api.py > /tmp/requirements.txt \
46 |  && pip3 install --break-system-packages -r /tmp/requirements.txt \
47 |  && rm /tmp/requirements.txt
48 | 
49 | RUN useradd -m -u 1048 -g 0 tesseract
50 | USER 1048
51 | EXPOSE 5000
52 | ENTRYPOINT ["/usr/local/bin/tesseract_recognize_api.py", "--host", "0.0.0.0"]
53 | 


--------------------------------------------------------------------------------
/Dockerfile-pkg-langs:
--------------------------------------------------------------------------------
 1 | ARG UBUNTU_TAG=24.04
 2 | FROM ubuntu:$UBUNTU_TAG
 3 | 
 4 | LABEL maintainer="Mauricio Villegas <mauricio_ville@yahoo.com>"
 5 | 
 6 | ENV DEBIAN_FRONTEND=noninteractive
 7 | 
 8 | # Install all language packages
 9 | RUN apt-get update --fix-missing \
10 |  && apt-get install -y --no-install-recommends \
11 |       rsync \
12 |       tesseract-ocr-* \
13 |  && apt-get clean \
14 |  && rm -rf /var/lib/apt/lists/*
15 | 
16 |  CMD ["rsync", "-av", "/usr/share/tesseract-ocr/5/tessdata", "/opt/tesseract-ocr"]
17 | 


--------------------------------------------------------------------------------
/Dockerfile_build.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | UBUNTU="24.04"
 4 | VERSION=$(sed -r -n '/^current_version/{ s/.*= //; p; }' .bumpversion.cfg)
 5 | 
 6 | docker build \
 7 |   -t mauvilsa/tesseract-recognize:$VERSION-ubuntu$UBUNTU-pkg \
 8 |   -f Dockerfile-pkg \
 9 |   --build-arg UBUNTU_TAG=$UBUNTU \
10 |   .
11 | 
12 | docker build \
13 |   -t mauvilsa/tesseract-recognize:$VERSION-ubuntu$UBUNTU-pkg-langs \
14 |   -f Dockerfile-pkg-langs \
15 |   --build-arg UBUNTU_TAG=$UBUNTU \
16 |   .
17 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015-present, Mauricio Villegas <maurovill+tesseract@gmail.com>
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/PageXML.cc:
--------------------------------------------------------------------------------
1 | pagexml/lib/PageXML.cc


--------------------------------------------------------------------------------
/PageXML.h:
--------------------------------------------------------------------------------
1 | pagexml/lib/PageXML.h


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # NAME
  2 | 
  3 | tesseract-recognize - A tool that does layout analysis and/or text recognition using tesseract and outputs the result in Page XML format.
  4 | 
  5 | 
  6 | # Requirements (Ubuntu 18.04 & 20.04 & 22.04 & 24.04)
  7 | 
  8 | ## Build
  9 | 
 10 | - make
 11 | - cmake
 12 | - g++
 13 | - libtesseract-dev
 14 | - libgs-dev
 15 | - libxslt1-dev
 16 | 
 17 | ## Runtime
 18 | 
 19 | - tesseract-ocr
 20 | - ghostscript
 21 | - libxslt1.1
 22 | 
 23 | 
 24 | # Installation and usage
 25 | 
 26 | To compile from source follow the instructions here. If you only want the tool
 27 | it might be simpler to use docker as explained in the next section.
 28 | 
 29 |     git clone --recursive https://github.com/mauvilsa/tesseract-recognize
 30 |     mkdir tesseract-recognize/build
 31 |     cd tesseract-recognize/build
 32 |     cmake -DCMAKE_INSTALL_PREFIX:PATH=$HOME ..
 33 |     make install
 34 |     
 35 |     tesseract-recognize --help
 36 |     tesseract-recognize IMAGE1 IMAGE2 -o OUTPUT.xml
 37 |     tesseract-recognize INPUT.xml -o OUTPUT.xml
 38 | 
 39 | 
 40 | # Installation and usage (docker)
 41 | 
 42 | The latest docker images are based on Ubuntu 24.04 and use the version of
 43 | tesseract from the default package repositories (see the respective [docker hub
 44 | page](https://hub.docker.com/r/mauvilsa/tesseract-recognize/)).
 45 | 
 46 | To install first pull the docker image of your choosing, using a command such
 47 | as:
 48 | 
 49 |     TAG="SELECTED_TAG_HERE"
 50 |     docker pull mauvilsa/tesseract-recognize:$TAG
 51 | 
 52 | The basic docker image only includes language files for recognition of English,
 53 | so for additional languages you need to provide to the docker container the
 54 | corresponding tessdata files. There is also an additional docker image that can
 55 | be used to create a volume that includes all languages from the tesseract-ocr-*
 56 | ubuntu packages. To create this volume run the following:
 57 | 
 58 |     docker pull mauvilsa/tesseract-recognize:$TAG-langs
 59 |     docker run \
 60 |       --rm \
 61 |       --mount source=tesseract-ocr-tessdata,destination=/opt/tesseract-ocr/tessdata \
 62 |       -it mauvilsa/tesseract-recognize:$TAG-langs
 63 | 
 64 | Then there are two possible ways of using the tesseract-recognize docker image,
 65 | through a command line interface or through a REST API, as explained in the next
 66 | two sections.
 67 | 
 68 | 
 69 | ## Command line interface
 70 | 
 71 | First download the
 72 | [https://github.com/omni-us/docker-command-line-interface](docker-cli), put it
 73 | in some directory in your path and make it executable, for example:
 74 | 
 75 |     wget -O $HOME/.local/bin https://raw.githubusercontent.com/omni-us/docker-command-line-interface/master/docker-cli
 76 |     chmod +x $HOME/.local/bin/docker-cli
 77 | 
 78 | As an additional step, you could look at `docker-cli --help` and read about how
 79 | to configure bash completion.
 80 | 
 81 | After installing docker-cli, the tesseract-recognize tool can be used like any
 82 | other command, i.e.
 83 | 
 84 |     docker-cli \
 85 |       --ipc=host \
 86 |       -- mauvilsa/tesseract-recognize:$TAG \
 87 |       tesseract-recognize IMAGE -o OUTPUT.xml
 88 | 
 89 | To recognize other languages using the tessdata volume mentioned previously can
 90 | be done as follows
 91 | 
 92 |     docker-cli \
 93 |       --ipc=host \
 94 |       --mount source=tesseract-ocr-tessdata,destination=/opt/tesseract-ocr/tessdata \
 95 |       --env TESSDATA_PREFIX=/opt/tesseract-ocr/tessdata \
 96 |       -- mauvilsa/tesseract-recognize:$TAG \
 97 |       tesseract-recognize --lang deu IMAGE -o OUTPUT.xml
 98 | 
 99 | For convenience you could setup an alias, i.e.
100 | 
101 |     alias tesseract-recognize-docker="docker-cli --ipc=host --mount source=tesseract-ocr-tessdata,destination=/opt/tesseract-ocr/tessdata --env TESSDATA_PREFIX=/opt/tesseract-ocr/tessdata -- mauvilsa/tesseract-recognize:$TAG tesseract-recognize"
102 |     tesseract-recognize-docker --help
103 | 
104 | 
105 | ## API interface
106 | 
107 | The API interface uses a python flask sever that can be accessed through port
108 | 5000 inside the docker container. For example the server could be started as:
109 | 
110 |     docker run --rm -t -p 5000:5000 mauvilsa/tesseract-recognize:$TAG 
111 | 
112 | The API exposes the following endpoints:
113 | 
114 | Method | Endpoint                          | Description                      | Parameters (form fields)
115 | ------ | --------------------------------- | -------------------------------- | ------------------------
116 | GET    | /tesseract-recognize/version      | Returns tool version information | -
117 | GET    | /tesseract-recognize/help         | Returns tool help                | -
118 | GET    | /tesseract-recognize/swagger.json | The swagger json                 | -
119 | POST   | /tesseract-recognize/process      | Recognize given images or xml    | **images (array, required):** Image files with names as in page xml. **pagexml (optional):** Page xml file to recognize. **options (optional):** Array of strings with options for the tesseract-recognize tool.
120 | 
121 | For illustration purposes the curl command can be used. Processing an input
122 | image with a non-default layout level would be using a POST such as
123 | 
124 |     curl -o output.xml -F images=@img.png -F options='["--layout", "word"]' http://localhost:5000/tesseract-recognize/process
125 | 
126 | To process a page xml file, both the xml and the respective images should be
127 | included in the request, that is for example
128 | 
129 |     curl -o output.xml -F images=@img1.png -F images=@img2.png -F pagexml=input.xml http://localhost:5000/tesseract-recognize/process
130 | 
131 | The API is implemented using Flask-RESTPlus which allows that once the server is
132 | started, you can use a browser to get a more detailed view of the exposed
133 | endpoints by going to http://localhost:5000/tesseract-recognize/swagger.
134 | 
135 | 
136 | # Viewing results
137 | 
138 | The results can be viewed/edited using the Page XML editor available at
139 | https://github.com/mauvilsa/nw-page-editor or using other tools that support
140 | this format such as http://www.primaresearch.org/tools and
141 | https://transkribus.eu/Transkribus/ .
142 | 
143 | 
144 | # Contributing
145 | 
146 | If you intend to contribute, before any commits be sure to first execute
147 | githook-pre-commit to setup (symlink) the pre-commit hook. This hook takes care
148 | of automatically updating the tool version.
149 | 


--------------------------------------------------------------------------------
/githook-pre-commit:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | ### Create pre-commit symlink if unset ###
 4 | GITDIR="";
 5 | if [ -d .git ]; then
 6 |   GITDIR=".git";
 7 | elif [ -f .git ]; then
 8 |   GITDIR=$(sed -n '/^gitdir:/{ s|.*: ||; p; }' .git);
 9 | fi
10 | if [ ! -d "$GITDIR" ]; then
11 |   echo "${0##*/}: error: unable to find git directory" 1>&2;
12 |   exit 1;
13 | fi
14 | if [ ! -h "$GITDIR/hooks/pre-commit" ]; then
15 |   if [ $(realpath --help 2>&1 | grep -c relative) != 0 ]; then
16 |     HOOK=$(realpath --relative-to="$GITDIR/hooks" ./githook-pre-commit);
17 |   else
18 |     HOOK=$(readlink -f ./githook-pre-commit);
19 |   fi
20 |   ln -fs "$HOOK" "$GITDIR/hooks/pre-commit";
21 |   echo "${0##*/}: creating git pre-commit hook symlink" 1>&2;
22 |   exit 1;
23 | fi
24 | 
25 | 
26 | ### Update versions on files ###
27 | FILES=( $(git status --porcelain | sed -r 's|^ |_|; s|^(.) |\1_|;' | grep -E '^([MRA]|.M)') );
28 | V=$(date -u +%Y.%m.%d);
29 | 
30 | check_change_after_staged () {
31 |   [ "${2:1:1}" = "M" ] &&
32 |     echo "${0##*/}: error: aborting due to file change after staged: $1" 1>&2 &&
33 |     exit 1;
34 | }
35 | 
36 | update_file_version () {
37 |   echo "${0##*/}: updating version of $1" 1>&2;
38 |   sed -r -i 's|([$"])Version:[^$"]*([$"])|\1Version: '"$V"'\2|' "$1";
39 |   git add "$1";
40 | }
41 | 
42 | n=1;
43 | while [ "$n" -lt "${#FILES[@]}" ]; do
44 |   check_change_after_staged "${FILES[$n]}" "${FILES[$((n-1))]}";
45 |   case "${FILES[$n]}" in
46 |     tesseract-recognize.cc )
47 |       update_file_version "${FILES[$n]}";
48 |       ;;
49 |     *.py )
50 |       update_file_version "${FILES[$n]}";
51 |       echo "${0##*/}: pylint ${FILES[$n]}" 1>&2;
52 |       pylint --errors-only "${FILES[$n]}";
53 |       ;;
54 |   esac
55 |   [ "$?" != "0" ] && exit 1;
56 |   n=$((n+2));
57 | done
58 | 
59 | exit 0;
60 | 


--------------------------------------------------------------------------------
/mock_cv.h:
--------------------------------------------------------------------------------
1 | pagexml/lib/mock_cv.h


--------------------------------------------------------------------------------
/old/Dockerfile_ubuntu-langs:
--------------------------------------------------------------------------------
 1 | ARG TESSREC_TAG
 2 | FROM mauvilsa/tesseract-recognize:TESSREC_TAG
 3 | 
 4 | ### Install all language packages ###
 5 | RUN apt-get update --fix-missing \
 6 |  && apt-get install -y --no-install-recommends \
 7 |       tesseract-ocr-* \
 8 |  && apt-get clean \
 9 |  && rm -rf /var/lib/apt/lists/*
10 | 


--------------------------------------------------------------------------------
/old/Dockerfile_ubuntu16.04-github-master:
--------------------------------------------------------------------------------
 1 | FROM library/ubuntu:16.04
 2 | 
 3 | MAINTAINER Mauricio Villegas <mauricio_ville@yahoo.com>
 4 | 
 5 | ENV DEBIAN_FRONTEND=noninteractive
 6 | SHELL ["/bin/bash", "-c"]
 7 | 
 8 | 
 9 | ### Copy the source code to a temporal directory ###
10 | COPY . /tmp/tesseract-recognize/
11 | 
12 | 
13 | ### Install build pre-requisites ###
14 | RUN apt-get update --fix-missing \
15 |  && apt-get install -y --no-install-recommends \
16 |       ca-certificates \
17 |       build-essential \
18 |       cmake \
19 |       git \
20 |       libxml2-dev \
21 |       libxslt1-dev \
22 |       libopencv-dev \
23 |       libmagick++-dev \
24 |       libtool \
25 |       automake \
26 |       autoconf \
27 |       autoconf-archive \
28 |       checkinstall \
29 | 
30 | 
31 | ### Install leptonica 1.74 from Ubuntu 17.10 by pinning ###
32 |  && echo 'deb http://archive.ubuntu.com/ubuntu artful main restricted universe multiverse' > /etc/apt/sources.list.d/ubuntu17.10.list \
33 |  && echo $'Package: *\nPin: release v=16.04, l=Ubuntu\nPin-Priority: 1000\n' > /etc/apt/preferences \
34 |  && echo $'Package: liblept5\nPin: release v=17.10, l=Ubuntu\nPin-Priority: 1001\n' >> /etc/apt/preferences \
35 |  && echo $'Package: libleptonica-dev\nPin: release v=17.10, l=Ubuntu\nPin-Priority: 1001' >> /etc/apt/preferences \
36 |  && apt-get update \
37 |  && apt-get install -y --no-install-recommends \
38 |       liblept5 \
39 |       libleptonica-dev \
40 | 
41 | 
42 | ### Compile and install latest tesseract from repo ###
43 |  && git clone https://github.com/tesseract-ocr/tesseract.git /tmp/tesseract \
44 |  && cd /tmp/tesseract \
45 |  && v=$(git log --date=iso -1 | sed -n '/^Date:/{s|^Date: *||;s| .*||;s|-|.|g;p;}') \
46 |  && sed -i "s|TESSERACT_VERSION_STR .*|TESSERACT_VERSION_STR \"$v\"|" ccutil/version.h \
47 |  && ./autogen.sh \
48 |  && ./configure --prefix=/usr \
49 |  #&& CFLAGS="-O2 -DUSE_STD_NAMESPACE" ./configure --prefix=/usr \
50 |  && make -j$(nproc) \
51 |  && echo "tesseract-ocr" > description-pak \
52 |  && checkinstall -y --pkgname=tesseract-ocr --maintainer=mauricio_ville@yahoo.com --pkgversion=$v --pkgrelease=0 \
53 | 
54 | 
55 | ### Compile and install tesseract-recognize ###
56 |  && cd /tmp/tesseract-recognize \
57 |  && cmake -DCMAKE_BUILD_TYPE=Release . \
58 |  && make install install-docker \
59 | 
60 | 
61 | ### Remove build-only software and install runtime pre-requisites ###
62 |  && cd \
63 |  && rm -rf /tmp/tesseract-recognize /tmp/tesseract \
64 |  && apt-get purge -y \
65 |       ca-certificates \
66 |       build-essential \
67 |       cmake \
68 |       git \
69 |       libxml2-dev \
70 |       libxslt1-dev \
71 |       libopencv-dev \
72 |       libmagick++-dev \
73 |       libtool \
74 |       automake \
75 |       autoconf \
76 |       autoconf-archive \
77 |       checkinstall \
78 |  && apt-get autoremove -y \
79 |  && apt-get purge -y $(dpkg -l | awk '{if($1=="rc")print $2}') \
80 |  && apt-get install -y --no-install-recommends \
81 |       ghostscript \
82 |       libxml2 \
83 |       libxslt1.1 \
84 |       libopencv-core2.4v5 \
85 |       libmagick++-6.q16-5v5 \
86 |       libgomp1 \
87 |       python-flask \
88 |       python-six \
89 |  && apt-get clean \
90 |  && rm -rf /var/lib/apt/lists/*
91 | 
92 | 
93 | ### By default start the flask API server ###
94 | CMD ["/usr/bin/python","/usr/local/bin/tesseract_recognize_api.py"]
95 | EXPOSE 5000
96 | 


--------------------------------------------------------------------------------
/old/Dockerfile_ubuntu16.04-pkg:
--------------------------------------------------------------------------------
 1 | FROM library/ubuntu:16.04
 2 | 
 3 | MAINTAINER Mauricio Villegas <mauricio_ville@yahoo.com>
 4 | 
 5 | ENV DEBIAN_FRONTEND=noninteractive
 6 | 
 7 | 
 8 | ### Copy the source code to a temporal directory ###
 9 | COPY . /tmp/tesseract-recognize/
10 | 
11 | 
12 | ### Install build pre-requisites ###
13 | RUN apt-get update --fix-missing \
14 |  && apt-get install -y --no-install-recommends \
15 |       build-essential \
16 |       cmake \
17 |       tesseract-ocr-dev \
18 |       libleptonica-dev \
19 |       libxml2-dev \
20 |       libxslt1-dev \
21 |       libopencv-dev \
22 |       libmagick++-dev \
23 | 
24 | 
25 | ### Compile and install tesseract-recognize ###
26 |  && cd /tmp/tesseract-recognize \
27 |  && cmake -DCMAKE_BUILD_TYPE=Release . \
28 |  && make install install-docker \
29 | 
30 | 
31 | ### Remove build-only software and install runtime pre-requisites ###
32 |  && cd \
33 |  && rm -rf /tmp/tesseract-recognize \
34 |  && apt-get purge -y \
35 |       build-essential \
36 |       cmake \
37 |       tesseract-ocr-dev \
38 |       libleptonica-dev \
39 |       libxml2-dev \
40 |       libxslt1-dev \
41 |       libopencv-dev \
42 |       libmagick++-dev \
43 |  && apt-get autoremove -y \
44 |  && apt-get purge -y $(dpkg -l | awk '{if($1=="rc")print $2}') \
45 |  && apt-get install -y --no-install-recommends \
46 |       tesseract-ocr \
47 |       ghostscript \
48 |       libxml2 \
49 |       libxslt1.1 \
50 |       libopencv-core2.4v5 \
51 |       libmagick++-6.q16-5v5 \
52 |       python-flask \
53 |       python-six \
54 |  && apt-get clean \
55 |  && rm -rf /var/lib/apt/lists/*
56 | 
57 | 
58 | ### By default start the flask API server ###
59 | CMD ["/usr/bin/python","/usr/local/bin/tesseract_recognize_api.py"]
60 | EXPOSE 5000
61 | 


--------------------------------------------------------------------------------
/old/Dockerfile_ubuntu18.04-github-master:
--------------------------------------------------------------------------------
 1 | FROM library/ubuntu:18.04
 2 | 
 3 | MAINTAINER Mauricio Villegas <mauricio_ville@yahoo.com>
 4 | 
 5 | ENV DEBIAN_FRONTEND=noninteractive
 6 | SHELL ["/bin/bash", "-c"]
 7 | 
 8 | 
 9 | ### Copy the source code to a temporal directory ###
10 | COPY . /tmp/tesseract-recognize/
11 | 
12 | 
13 | ### Install build pre-requisites ###
14 | RUN apt-get update --fix-missing \
15 |  #&& d=$(apt-cache depends libopencv-dev | sed -n '/Depends: libopencv-.*-dev/{ s|.* ||; s|$|-|; p; }' | grep -v libopencv-core-dev | tr '\n' ' ') \
16 |  && apt-get install -y --no-install-recommends \
17 |       ca-certificates \
18 |       build-essential \
19 |       cmake \
20 |       git \
21 |       libxml2-dev \
22 |       libxslt1-dev \
23 |       libopencv-dev \
24 |       libmagick++-dev \
25 |       libleptonica-dev \
26 |       libtool \
27 |       automake \
28 |       autoconf \
29 |       autoconf-archive \
30 |       #checkinstall \
31 |  #&& d=$(apt-cache depends libopencv-dev | sed -n '/Depends: libopencv-.*-dev/{ s|.* ||; p; }' | grep -v libopencv-core-dev | tr '\n' ' ') \
32 |  && dpkg -r --force-depends libtesseract4 \
33 | 
34 | 
35 | ### Compile and install latest tesseract from repo ###
36 |  && git clone https://github.com/tesseract-ocr/tesseract.git /tmp/tesseract \
37 |  && cd /tmp/tesseract \
38 |  && ./autogen.sh \
39 |  && ./configure --prefix=/usr \
40 |  #&& CFLAGS="-O2 -DUSE_STD_NAMESPACE" ./configure --prefix=/usr \
41 |  #&& v=$(git log --date=iso -1 | sed -n '/^Date:/{s|^Date: *||;s| .*||;s|-|.|g;p;}') \
42 |  #&& sed -i "s|TESSERACT_VERSION_STR .*|TESSERACT_VERSION_STR \"$v\"|" src/api/tess_version.h \
43 |  && make -j$(nproc) \
44 |  && make install \
45 | # && echo "tesseract-ocr" > description-pak \
46 | ## && checkinstall -y --pkgname=tesseract-ocr --maintainer=mauricio_ville@yahoo.com --pkgversion=$v --pkgrelease=0 \
47 | # && checkinstall -y --pkgname=tesseract-ocr --maintainer=mauricio_ville@yahoo.com --pkgrelease=0 \
48 | 
49 | 
50 | ### Compile and install tesseract-recognize ###
51 |  && cd /tmp/tesseract-recognize \
52 |  && cmake -DCMAKE_BUILD_TYPE=Release . \
53 |  && make install install-docker \
54 | 
55 | 
56 | ### Remove build-only software and install runtime pre-requisites ###
57 |  && cd \
58 |  && rm -rf /tmp/tesseract-recognize /tmp/tesseract \
59 |  && apt --fix-broken -y install \
60 |  && apt-get purge -y --fix-broken \
61 |       ca-certificates \
62 |       build-essential \
63 |       cmake \
64 |       git \
65 |       libxml2-dev \
66 |       libxslt1-dev \
67 |       libopencv-dev \
68 |       libmagick++-dev \
69 |       libleptonica-dev \
70 |       libtool \
71 |       automake \
72 |       autoconf \
73 |       autoconf-archive \
74 |       #checkinstall \
75 |  && apt-get autoremove -y \
76 |  && apt-get purge -y $(dpkg -l | awk '{if($1=="rc")print $2}') \
77 |  && apt-get install -y --no-install-recommends \
78 |       ghostscript \
79 |       libxml2 \
80 |       libxslt1.1 \
81 |       libopencv-core3.2 \
82 |       libmagick++-6.q16-7 \
83 |       liblept5 \
84 |       libgomp1 \
85 |       python-flask \
86 |       python-six \
87 |  && apt-get clean \
88 |  && rm -rf /var/lib/apt/lists/*
89 | 
90 | 
91 | ### By default start the flask API server ###
92 | CMD ["/usr/bin/python","/usr/local/bin/tesseract_recognize_api.py"]
93 | EXPOSE 5000
94 | 


--------------------------------------------------------------------------------
/old/Dockerfile_ubuntu18.04-pkg:
--------------------------------------------------------------------------------
 1 | FROM library/ubuntu:18.04
 2 | 
 3 | MAINTAINER Mauricio Villegas <mauricio_ville@yahoo.com>
 4 | 
 5 | ENV DEBIAN_FRONTEND=noninteractive
 6 | 
 7 | 
 8 | ### Install runtime requirements ###
 9 | RUN apt-get update --fix-missing \
10 |  && apt-get install -y --no-install-recommends \
11 |       tesseract-ocr \
12 |       ghostscript \
13 |       libxml2 \
14 |       libxslt1.1 \
15 |       libopencv-core3.2 \
16 |       libmagick++-6.q16-7 \
17 |       python-flask \
18 |       python-six \
19 |  && apt-get clean \
20 |  && rm -rf /var/lib/apt/lists/* \
21 |  && sed '/<policy domain="coder"/d' -i /etc/ImageMagick-6/policy.xml
22 | 
23 | 
24 | ### Copy the source code to a temporal directory ###
25 | COPY . /tmp/tesseract-recognize/
26 | 
27 | 
28 | ### Install build pre-requisites ###
29 | RUN apt-get update --fix-missing \
30 |  && apt-get install -y --no-install-recommends \
31 |       build-essential \
32 |       cmake \
33 |       libtesseract-dev \
34 |       libleptonica-dev \
35 |       libxml2-dev \
36 |       libxslt1-dev \
37 |       libopencv-dev \
38 |       libmagick++-dev \
39 | 
40 | 
41 | ### Compile and install tesseract-recognize ###
42 |  && cd /tmp/tesseract-recognize \
43 |  && cmake -DCMAKE_BUILD_TYPE=Release . \
44 |  && make install install-docker \
45 | 
46 | 
47 | ### Remove build-only software ###
48 |  && cd \
49 |  && rm -rf /tmp/tesseract-recognize \
50 |  && apt-get purge -y \
51 |       build-essential \
52 |       cmake \
53 |       libtesseract-dev \
54 |       libleptonica-dev \
55 |       libxml2-dev \
56 |       libxslt1-dev \
57 |       libopencv-dev \
58 |       libmagick++-dev \
59 |  && apt-get autoremove -y \
60 |  && apt-get purge -y $(dpkg -l | awk '{if($1=="rc")print $2}') \
61 |  && apt-get clean \
62 |  && rm -rf /var/lib/apt/lists/*
63 | 
64 | 
65 | ### By default start the flask API server ###
66 | CMD ["/usr/bin/python","/usr/local/bin/tesseract_recognize_api.py"]
67 | EXPOSE 5000
68 | 


--------------------------------------------------------------------------------
/tesseract-recognize.cc:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Tool that does layout analysis and OCR using tesseract providing results in Page XML format
  3 |  *
  4 |  * @author Mauricio Villegas <maurovill+tesseract@gmail.com>
  5 |  * @copyright Copyright (c) 2015-present, Mauricio Villegas <maurovill+tesseract@gmail.com>
  6 |  * @link https://github.com/mauvilsa/tesseract-recognize
  7 |  * @license MIT License
  8 |  */
  9 | 
 10 | /*** Includes *****************************************************************/
 11 | #include <algorithm>
 12 | #include <string>
 13 | using std::string;
 14 | #include <regex>
 15 | #include <set>
 16 | #include <sstream>
 17 | #include <iterator>
 18 | #include <getopt.h>
 19 | 
 20 | #include <../leptonica/allheaders.h>
 21 | #include <../tesseract/baseapi.h>
 22 | 
 23 | #include "PageXML.h"
 24 | 
 25 | /*** Definitions **************************************************************/
 26 | static char tool[] = "tesseract-recognize";
 27 | static char version[] = "Version: 2025.03.31";
 28 | 
 29 | char gb_page_ns[] = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15";
 30 | 
 31 | char gb_default_lang[] = "eng";
 32 | char gb_default_xpath[] = "//_:TextRegion";
 33 | char gb_default_output[] = "-";
 34 | 
 35 | char *gb_output = gb_default_output;
 36 | char *gb_lang = gb_default_lang;
 37 | char *gb_tessdata = NULL;
 38 | int gb_psm = tesseract::PSM_AUTO;
 39 | int gb_oem = tesseract::OEM_DEFAULT;
 40 | bool gb_onlylayout = false;
 41 | bool gb_textlevels[] = { false, false, false, false };
 42 | bool gb_textatlayout = true;
 43 | char *gb_xpath = gb_default_xpath;
 44 | char *gb_image = NULL;
 45 | int gb_density = 300;
 46 | bool gb_inplace = false;
 47 | 
 48 | bool gb_save_crops = false;
 49 | 
 50 | enum {
 51 |   LEVEL_REGION = 0,
 52 |   LEVEL_LINE,
 53 |   LEVEL_WORD,
 54 |   LEVEL_GLYPH
 55 | };
 56 | 
 57 | const char* levelStrings[] = {
 58 |   "region",
 59 |   "line",
 60 |   "word",
 61 |   "glyph"
 62 | };
 63 | 
 64 | inline static int parseLevel( const char* level ) {
 65 |   int levels = sizeof(levelStrings) / sizeof(levelStrings[0]);
 66 |   for( int n=0; n<levels; n++ )
 67 |     if( ! strcmp(levelStrings[n],level) )
 68 |       return n;
 69 |   return -1;
 70 | }
 71 | 
 72 | int gb_layoutlevel = LEVEL_LINE;
 73 | 
 74 | enum {
 75 |   OPTION_OUTPUT      = 'o',
 76 |   OPTION_HELP        = 'h',
 77 |   OPTION_VERSION     = 'v',
 78 |   OPTION_TESSDATA    = 256,
 79 |   OPTION_LANG             ,
 80 |   OPTION_LAYOUTLEVEL      ,
 81 |   OPTION_TEXTLEVELS       ,
 82 |   OPTION_ONLYLAYOUT       ,
 83 |   OPTION_SAVECROPS        ,
 84 |   OPTION_XPATH            ,
 85 |   OPTION_IMAGE            ,
 86 |   OPTION_DENSITY          ,
 87 |   OPTION_PSM              ,
 88 |   OPTION_OEM              ,
 89 |   OPTION_INPLACE
 90 | };
 91 | 
 92 | static char gb_short_options[] = "o:hv";
 93 | 
 94 | static struct option gb_long_options[] = {
 95 |     { "output",       required_argument, NULL, OPTION_OUTPUT },
 96 |     { "help",         no_argument,       NULL, OPTION_HELP },
 97 |     { "version",      no_argument,       NULL, OPTION_VERSION },
 98 |     { "tessdata",     required_argument, NULL, OPTION_TESSDATA },
 99 |     { "lang",         required_argument, NULL, OPTION_LANG },
100 |     { "psm",          required_argument, NULL, OPTION_PSM },
101 |     { "oem",          required_argument, NULL, OPTION_OEM },
102 |     { "layout-level", required_argument, NULL, OPTION_LAYOUTLEVEL },
103 |     { "text-levels",  required_argument, NULL, OPTION_TEXTLEVELS },
104 |     { "only-layout",  no_argument,       NULL, OPTION_ONLYLAYOUT },
105 |     { "save-crops",   no_argument,       NULL, OPTION_SAVECROPS },
106 |     { "xpath",        required_argument, NULL, OPTION_XPATH },
107 |     { "image",        required_argument, NULL, OPTION_IMAGE },
108 |     { "density",      required_argument, NULL, OPTION_DENSITY },
109 |     { "inplace",      no_argument,       NULL, OPTION_INPLACE },
110 |     { 0, 0, 0, 0 }
111 |   };
112 | 
113 | /*** Functions ****************************************************************/
114 | #define strbool( cond ) ( ( cond ) ? "true" : "false" )
115 | 
116 | void print_usage() {
117 |   fprintf( stderr, "Description: Layout analysis and OCR using tesseract providing results in Page XML format\n" );
118 |   fprintf( stderr, "Usage: %s [OPTIONS] (IMAGE+|PDF+|PAGEXML)\n", tool );
119 |   fprintf( stderr, "Options:\n" );
120 |   fprintf( stderr, " --lang LANG             Language used for OCR (def.=%s)\n", gb_lang );
121 |   fprintf( stderr, " --tessdata PATH         Location of tessdata (def.=%s)\n", gb_tessdata );
122 |   fprintf( stderr, " --psm MODE              Page segmentation mode (def.=%d)\n", gb_psm );
123 | #if TESSERACT_VERSION >= 0x040000
124 |   fprintf( stderr, " --oem MODE              OCR engine mode (def.=%d)\n", gb_oem );
125 | #endif
126 |   fprintf( stderr, " --layout-level LEVEL    Layout output level: region, line, word, glyph (def.=%s)\n", levelStrings[gb_layoutlevel] );
127 |   fprintf( stderr, " --text-levels L1[,L2]+  Text output level(s): region, line, word, glyph (def.=layout-level)\n" );
128 |   fprintf( stderr, " --only-layout           Only perform layout analysis, no OCR (def.=%s)\n", strbool(gb_onlylayout) );
129 |   fprintf( stderr, " --save-crops            Saves cropped images (def.=%s)\n", strbool(gb_save_crops) );
130 |   fprintf( stderr, " --xpath XPATH           xpath for selecting elements to process (def.=%s)\n", gb_xpath );
131 |   fprintf( stderr, " --image IMAGE           Use given image instead of one in Page XML\n" );
132 |   fprintf( stderr, " --density DENSITY       Density in dpi for pdf rendering (def.=%d)\n", gb_density );
133 |   fprintf( stderr, " --inplace               Overwrite input XML with result (def.=%s)\n", strbool(gb_inplace) );
134 |   fprintf( stderr, " -o, --output            Output page xml file (def.=%s)\n", gb_output );
135 |   fprintf( stderr, " -h, --help              Print this usage information and exit\n" );
136 |   fprintf( stderr, " -v, --version           Print version and exit\n" );
137 |   fprintf( stderr, "\n" );
138 |   int r = system( "tesseract --help-psm 2>&1 | sed '/^ *[02] /d; s| (Default)||;' 1>&2" );
139 |   if( r != 0 )
140 |     fprintf( stderr, "warning: tesseract command not found in path\n" );
141 | #if TESSERACT_VERSION >= 0x040000
142 |   fprintf( stderr, "\n" );
143 |   r += system( "tesseract --help-oem 1>&2" );
144 | #endif
145 |   fprintf( stderr, "Examples:\n" );
146 |   fprintf( stderr, "  %s -o out.xml in1.png in2.png  ### Multiple images as input\n", tool );
147 |   fprintf( stderr, "  %s -o out.xml in.tiff  ### TIFF possibly with multiple frames\n", tool );
148 |   fprintf( stderr, "  %s -o out.xml --density 200 in.pdf\n", tool );
149 |   fprintf( stderr, "  %s -o out.xml --xpath //_:Page in.xml  ### Empty page xml recognize the complete pages\n", tool );
150 |   fprintf( stderr, "  %s -o out.xml --psm 1 in.png  ### Detect page orientation pages\n", tool );
151 |   fprintf( stderr, "  %s -o out.xml --xpath \"//_:TextRegion[@id='r1']\" --layout-level word --only-layout in.xml  ### Detect text lines and words only in TextRegion with id=r1\n", tool );
152 | }
153 | 
154 | 
155 | void setCoords( tesseract::ResultIterator* iter, tesseract::PageIteratorLevel iter_level, PageXML& page, xmlNodePtr& xelem, int x, int y, tesseract::Orientation orientation = tesseract::ORIENTATION_PAGE_UP ) {
156 |   int left, top, right, bottom;
157 |   int pagenum = page.getPageNumber(xelem);
158 |   iter->BoundingBox( iter_level, &left, &top, &right, &bottom );
159 |   std::vector<cv::Point2f> points;
160 |   if ( left == 0 && top == 0 && right == (int)page.getPageWidth(pagenum) && bottom == (int)page.getPageHeight(pagenum) )
161 |     points = { cv::Point2f(0,0), cv::Point2f(0,0) };
162 |   else {
163 |     cv::Point2f tl(x+left,y+top);
164 |     cv::Point2f tr(x+right,y+top);
165 |     cv::Point2f br(x+right,y+bottom);
166 |     cv::Point2f bl(x+left,y+bottom);
167 |     switch( orientation ) {
168 |       case tesseract::ORIENTATION_PAGE_UP:    points = { tl, tr, br, bl }; break;
169 |       case tesseract::ORIENTATION_PAGE_RIGHT: points = { tr, br, bl, tl }; break;
170 |       case tesseract::ORIENTATION_PAGE_LEFT:  points = { bl, tl, tr, br }; break;
171 |       case tesseract::ORIENTATION_PAGE_DOWN:  points = { br, bl, tl, tr }; break;
172 |     }
173 |   }
174 |   page.setCoords( xelem, points );
175 | }
176 | 
177 | void setLineCoords( tesseract::ResultIterator* iter, tesseract::PageIteratorLevel iter_level, PageXML& page, xmlNodePtr& xelem, int x, int y, tesseract::Orientation orientation ) {
178 |   setCoords( iter, iter_level, page, xelem, x, y, orientation );
179 |   std::vector<cv::Point2f> coords = page.getPoints( xelem );
180 |   int x1, y1, x2, y2;
181 |   iter->Baseline( iter_level, &x1, &y1, &x2, &y2 );
182 |   cv::Point2f b_p1(x+x1,y+y1), b_p2(x+x2,y+y2);
183 |   cv::Point2f baseline_p1, baseline_p2;
184 |   if ( ! page.intersection( b_p1, b_p2, coords[0], coords[3], baseline_p1 ) ||
185 |        ! page.intersection( b_p1, b_p2, coords[1], coords[2], baseline_p2 ) ) {
186 |     std::string lid = page.getAttr(xelem,"id");
187 |     fprintf(stderr,"warning: no intersection between baseline and bounding box sides id=%s\n",lid.c_str());
188 |     std::vector<cv::Point2f> baseline = {
189 |       cv::Point2f(x+x1,y+y1),
190 |       cv::Point2f(x+x2,y+y2) };
191 |     page.setBaseline( xelem, baseline );
192 |     return;
193 |   }
194 |   std::vector<cv::Point2f> baseline = { baseline_p1, baseline_p2 };
195 |   page.setBaseline( xelem, baseline );
196 |   double up1 = cv::norm( baseline_p1 - coords[0] );
197 |   double up2 = cv::norm( baseline_p2 - coords[1] );
198 |   double down1 = cv::norm( baseline_p1 - coords[3] );
199 |   double down2 = cv::norm( baseline_p2 - coords[2] );
200 |   double height = 0.5*( up1 + up2 + down1 + down2 );
201 |   double offset = height <= 0.0 ? 0.0 : 0.5*( down1 + down2 ) / height;
202 |   page.setPolystripe( xelem, height <= 0.0 ? 1.0 : height, offset, false );
203 | }
204 | 
205 | void setTextEquiv( tesseract::ResultIterator* iter, tesseract::PageIteratorLevel iter_level, PageXML& page, xmlNodePtr& xelem ) {
206 |   double conf = 0.01*iter->Confidence( iter_level );
207 |   char* text = iter->GetUTF8Text( iter_level );
208 |   std::string stext(text);
209 |   stext = std::regex_replace( stext, std::regex("^\\s+|\\s+$"), "$1" );
210 |   page.setTextEquiv( xelem, stext.c_str(), &conf );
211 |   delete[] text;
212 | }
213 | 
214 | template<typename Out>
215 | void split( const std::string &s, char delim, Out result ) {
216 |   std::stringstream ss(s);
217 |   std::string item;
218 |   while( std::getline(ss, item, delim) )
219 |     *(result++) = item;
220 | }
221 | 
222 | std::set<int> parsePagesSet( std::string range ) {
223 |   std::set<int> pages_set;
224 |   std::vector<std::string> parts;
225 |   split( range, ',', std::back_inserter(parts) );
226 |   for( auto part : parts ) {
227 |     std::string::size_type dash_pos = part.find('-');
228 |     if( dash_pos == std::string::npos )
229 |       pages_set.insert(stoi(part));
230 |     else
231 |       for( int num=stoi(part.substr(0, dash_pos)); num<=stoi(part.substr(dash_pos+1)); num++ )
232 |         pages_set.insert(num);
233 |   }
234 |   return pages_set;
235 | }
236 | 
237 | 
238 | /*** Program ******************************************************************/
239 | int main( int argc, char *argv[] ) {
240 | 
241 |   /// Disable debugging and informational messages from Leptonica. ///
242 |   setMsgSeverity(L_SEVERITY_ERROR);
243 | 
244 |   /// Parse input arguments ///
245 |   int n,m;
246 |   std::stringstream test;
247 |   std::string token;
248 |   while ( ( n = getopt_long(argc,argv,gb_short_options,gb_long_options,&m) ) != -1 )
249 |     switch ( n ) {
250 |       case OPTION_TESSDATA:
251 |         gb_tessdata = optarg;
252 |         break;
253 |       case OPTION_LANG:
254 |         gb_lang = optarg;
255 |         break;
256 |       case OPTION_PSM:
257 |         gb_psm = atoi(optarg);
258 |         if( gb_psm < tesseract::PSM_AUTO_OSD || gb_psm == tesseract::PSM_AUTO_ONLY || gb_psm >= tesseract::PSM_COUNT ) {
259 |           fprintf( stderr, "%s: error: invalid page segmentation mode: %s\n", tool, optarg );
260 |           return 1;
261 |         }
262 |         break;
263 | #if TESSERACT_VERSION >= 0x040000
264 |       case OPTION_OEM:
265 |         gb_oem = atoi(optarg);
266 |         if( gb_oem < tesseract::OEM_TESSERACT_ONLY || gb_oem >= tesseract::OEM_COUNT ) {
267 |           fprintf( stderr, "%s: error: invalid OCR engine mode: %s\n", tool, optarg );
268 |           return 1;
269 |         }
270 |         break;
271 | #endif
272 |       case OPTION_LAYOUTLEVEL:
273 |         gb_layoutlevel = parseLevel(optarg);
274 |         if( gb_layoutlevel == -1 ) {
275 |           fprintf( stderr, "%s: error: invalid level: %s\n", tool, optarg );
276 |           return 1;
277 |         }
278 |         break;
279 |       case OPTION_TEXTLEVELS:
280 |         test = std::stringstream(optarg);
281 |         while( std::getline(test, token, ',') ) {
282 |           int textlevel = parseLevel(token.c_str());
283 |           if( textlevel == -1 ) {
284 |             fprintf( stderr, "%s: error: invalid level: %s\n", tool, token.c_str() );
285 |             return 1;
286 |           }
287 |           gb_textlevels[textlevel] = true;
288 |           gb_textatlayout = false;
289 |         }
290 |         break;
291 |       case OPTION_ONLYLAYOUT:
292 |         gb_onlylayout = true;
293 |         break;
294 |       case OPTION_SAVECROPS:
295 |         gb_save_crops = true;
296 |         break;
297 |       case OPTION_XPATH:
298 |         gb_xpath = optarg;
299 |         break;
300 |       case OPTION_IMAGE:
301 |         gb_image = optarg;
302 |         break;
303 |       case OPTION_DENSITY:
304 |         gb_density = atoi(optarg);
305 |         break;
306 |       case OPTION_INPLACE:
307 |         gb_inplace = true;
308 |         break;
309 |       case OPTION_OUTPUT:
310 |         gb_output = optarg;
311 |         break;
312 |       case OPTION_HELP:
313 |         print_usage();
314 |         return 0;
315 |       case OPTION_VERSION:
316 |         fprintf( stderr, "%s %s\n", tool, version+9 );
317 |         fprintf( stderr, "compiled against PageXML %s\n", PageXML::version() );
318 | #ifdef TESSERACT_VERSION_STR
319 |         fprintf( stderr, "compiled against tesseract %s, linked with %s\n", TESSERACT_VERSION_STR, tesseract::TessBaseAPI::Version() );
320 | #else
321 |         fprintf( stderr, "linked with tesseract %s\n", tesseract::TessBaseAPI::Version() );
322 | #endif
323 |         return 0;
324 |       default:
325 |         fprintf( stderr, "%s: error: incorrect input argument: %s\n", tool, argv[optind-1] );
326 |         return 1;
327 |     }
328 | 
329 |   /// Default text level ///
330 |   if ( gb_textatlayout )
331 |     gb_textlevels[gb_layoutlevel] = true;
332 | 
333 |   /// Check that there is at least one non-option argument ///
334 |   if ( optind >= argc ) {
335 |     fprintf( stderr, "%s: error: at least one input file must be provided, see usage with --help\n", tool );
336 |     return 1;
337 |   }
338 | 
339 |   /// Initialize tesseract just for layout or with given language and tessdata path///
340 |   tesseract::TessBaseAPI *tessApi = new tesseract::TessBaseAPI();
341 | 
342 |   if ( gb_onlylayout && gb_psm != tesseract::PSM_AUTO_OSD )
343 |     tessApi->InitForAnalysePage();
344 |   else
345 | #if TESSERACT_VERSION >= 0x040000
346 |   if ( tessApi->Init( gb_tessdata, gb_lang, (tesseract::OcrEngineMode)gb_oem ) ) {
347 | #else
348 |   if ( tessApi->Init( gb_tessdata, gb_lang) ) {
349 | #endif
350 |     fprintf( stderr, "%s: error: could not initialize tesseract\n", tool );
351 |     return 1;
352 |   }
353 | 
354 |   tessApi->SetPageSegMode( (tesseract::PageSegMode)gb_psm );
355 | 
356 |   PageXML page;
357 |   int num_pages = 0;
358 |   bool pixRelease = false;
359 |   std::vector<NamedImage> images;
360 |   tesseract::ResultIterator* iter = NULL;
361 | 
362 |   std::regex reIsXml(".+\\.xml$|^-$",std::regex_constants::icase);
363 |   std::regex reIsTiff(".+\\.tif{1,2}(|\\[[-, 0-9]+\\])$",std::regex_constants::icase);
364 |   std::regex reIsPdf(".+\\.pdf(|\\[[-, 0-9]+\\])$",std::regex_constants::icase);
365 |   std::regex reImagePageNum("(.+)\\[([-, 0-9]+)\\]$");
366 |   std::cmatch base_match;
367 |   char *input_file = argv[optind];
368 |   bool input_xml = std::regex_match(input_file,base_match,reIsXml);
369 | 
370 |   /// Inplace only when XML input and output not specified ///
371 |   if ( gb_inplace && ( ! input_xml || strcmp(gb_output,"-") ) ) {
372 |     fprintf( stderr, "%s: warning: ignoring --inplace option, output to %s\n", tool, gb_output );
373 |     gb_inplace = false;
374 |   }
375 | 
376 |   /// Info for process element ///
377 |   char tool_info[128];
378 |   if ( gb_onlylayout )
379 |     snprintf( tool_info, sizeof tool_info, "%s_v%.10s tesseract_v%s", tool, version+9, tesseract::TessBaseAPI::Version() );
380 |   else
381 |     snprintf( tool_info, sizeof tool_info, "%s_v%.10s tesseract_v%s lang=%s", tool, version+9, tesseract::TessBaseAPI::Version(), gb_lang );
382 | 
383 |   /// Loop through input files ///
384 |   for ( ; optind < argc; optind++ ) {
385 |     input_file = argv[optind];
386 |     input_xml = std::regex_match(input_file,base_match,reIsXml);
387 |     bool input_tiff = std::regex_match(input_file,base_match,reIsTiff);
388 |     bool input_pdf = std::regex_match(input_file,base_match,reIsPdf);
389 | 
390 |     /// Get selected pages for tiff/pdf if given ///
391 |     std::set<int> pages_set;
392 |     std::string page_sel;
393 |     std::string input_file_str = std::string(input_file);
394 |     if ( input_tiff || input_pdf ) {
395 |       if( std::regex_match(input_file, base_match, reImagePageNum) ) {
396 |         pages_set = parsePagesSet(base_match[2].str());
397 |         page_sel = std::string(base_match[2].str());
398 |         input_file_str = std::string(base_match[1].str());
399 |       }
400 |     }
401 | 
402 |     /// Input is xml ///
403 |     if ( input_xml ) {
404 |       if ( num_pages > 0 ) {
405 |         fprintf( stderr, "%s: error: only a single page xml allowed as input\n", tool );
406 |         return 1;
407 |       }
408 |       try {
409 |         page.loadXml( input_file ); // if input_file is "-" xml is read from stdin
410 |       } catch ( const std::exception& e ) {
411 |         fprintf( stderr, "%s: error: problems reading xml file: %s\n%s\n", tool, input_file, e.what() );
412 |         return 1;
413 |       }
414 |       if ( gb_image != NULL ) {
415 |         if ( page.count("//_:Page") > 1 ) {
416 |           fprintf( stderr, "%s: error: specifying image with multipage xml input not supported\n", tool );
417 |           return 1;
418 |         }
419 |         page.loadImage( 0, gb_image );
420 |       }
421 |       num_pages += page.count("//_:Page");
422 | 
423 |       if ( gb_psm == tesseract::PSM_AUTO_OSD && page.count("//_:ImageOrientation") > 0 ) {
424 |         fprintf( stderr, "%s: error: refusing to use OSD on page xml that already contains ImageOrientation elements\n", tool );
425 |         return 1;
426 |       }
427 | 
428 |       std::vector<xmlNodePtr> sel = page.select(gb_xpath);
429 |       int selPages = 0;
430 |       for ( n=0; n<(int)sel.size(); n++ )
431 |         if ( page.nodeIs( sel[n], "Page" ) )
432 |           selPages++;
433 |       if ( selPages > 0 && selPages != (int)sel.size() ) {
434 |         fprintf( stderr, "%s: error: xpath can select Page or non-Page elements but not a mixture of both: %s\n", tool, gb_xpath );
435 |         return 1;
436 |       }
437 | 
438 |       if ( selPages == 0 ) {
439 |         pixRelease = true;
440 |         images = page.crop( (std::string(gb_xpath)+"/_:Coords").c_str(), NULL, false );
441 |         page.releaseImages();
442 |       }
443 |       else {
444 |         for ( n=0; n<(int)sel.size(); n++ ) {
445 |           NamedImage namedimage;
446 |           namedimage.image = NULL;
447 |           namedimage.node = sel[n];
448 |           images.push_back( namedimage );
449 |           num_pages++;
450 |         }
451 |       }
452 |     }
453 | 
454 |     /// Input is tiff image ///
455 |     else if ( input_tiff ) {
456 |       pixRelease = true;
457 | 
458 |       /// Read input image ///
459 |       PIXA* tiffimage = pixaReadMultipageTiff( input_file_str.c_str() );
460 |       if ( tiffimage == NULL || tiffimage->n == 0 ) {
461 |         fprintf( stderr, "%s: error: problems reading tiff image: %s\n", tool, input_file );
462 |         return 1;
463 |       }
464 | 
465 |       if ( pages_set.size() > 0 && tiffimage->n <= *pages_set.rbegin() ) {
466 |         fprintf( stderr, "%s: error: invalid page selection (%s) on tiff with %d pages\n", tool, page_sel.c_str(), tiffimage->n+1 );
467 |         return 1;
468 |       }
469 | 
470 |       for ( n=0; n<tiffimage->n; n++ ) {
471 |         if ( pages_set.size() > 0 && pages_set.find(n) == pages_set.end() )
472 |           continue;
473 | 
474 |         PageImage image = pixClone(tiffimage->pix[n]);
475 |         std::string pagepath = input_file_str+"["+std::to_string(n)+"]";
476 |         NamedImage namedimage;
477 |         namedimage.image = image;
478 |         if ( num_pages == 0 )
479 |           namedimage.node = page.newXml( tool_info, pagepath.c_str(), pixGetWidth(image), pixGetHeight(image), gb_page_ns );
480 |         else
481 |           namedimage.node = page.addPage( pagepath.c_str(), pixGetWidth(image), pixGetHeight(image) );
482 |         images.push_back( namedimage );
483 |         num_pages++;
484 |       }
485 | 
486 |       pixaDestroy(&tiffimage);
487 |     }
488 | 
489 |     /// Input is pdf ///
490 |     else if ( input_pdf ) {
491 |       std::vector< std::pair<double,double> > pdf_pages = gsGetPdfPageSizes(input_file_str);
492 |       if ( pages_set.size() > 0 && (int)pdf_pages.size() <= *pages_set.rbegin() ) {
493 |         fprintf( stderr, "%s: error: invalid page selection (%s) on pdf with %d pages\n", tool, page_sel.c_str(), (int)pdf_pages.size() );
494 |         return 1;
495 |       }
496 | 
497 |       for ( n=0; n<(int)pdf_pages.size(); n++ ) {
498 |         if ( pages_set.size() > 0 && pages_set.find(n) == pages_set.end() )
499 |           continue;
500 | 
501 |         std::string pagepath = input_file_str+"["+std::to_string(n)+"]";
502 |         NamedImage namedimage;
503 |         namedimage.image = NULL;
504 |         if ( num_pages == 0 )
505 |           namedimage.node = page.newXml( tool_info, pagepath.c_str(), (int)(0.5+pdf_pages[n].first), (int)(0.5+pdf_pages[n].second), gb_page_ns );
506 |         else
507 |           namedimage.node = page.addPage( pagepath.c_str(), (int)(0.5+pdf_pages[n].first), (int)(0.5+pdf_pages[n].second) );
508 |         images.push_back( namedimage );
509 |         num_pages++;
510 |       }
511 |     }
512 | 
513 |     /// Input is image ///
514 |     else {
515 |       /// Read input image ///
516 |       PageImage image = pixRead( input_file );
517 |       if ( image == NULL ) {
518 |         fprintf( stderr, "%s: error: problems reading image: %s\n", tool, input_file );
519 |         return 1;
520 |       }
521 | 
522 |       NamedImage namedimage;
523 |       namedimage.image = NULL;
524 |       if ( num_pages == 0 )
525 |         namedimage.node = page.newXml( tool_info, input_file, pixGetWidth(image), pixGetHeight(image), gb_page_ns );
526 |       else
527 |         namedimage.node = page.addPage( input_file, pixGetWidth(image), pixGetHeight(image) );
528 |       num_pages++;
529 |       pixDestroy(&image);
530 |       images.push_back( namedimage );
531 |     }
532 |   }
533 | 
534 |   page.processStart(tool_info);
535 | 
536 |   /// Loop through all images to process ///
537 |   for ( n=0; n<(int)images.size(); n++ ) {
538 |     xmlNodePtr xpg = page.closest( "Page", images[n].node );
539 | 
540 |     if ( images[n].image == NULL ) {
541 |       try {
542 |         page.loadImage(xpg, NULL, true, gb_density );
543 |         images[n].image = page.getPageImage(n);
544 |       } catch ( const std::exception& e ) {
545 |         fprintf( stderr, "%s: error: problems loading page image: %s :: %s\n", tool, page.getPageImageFilename(n).c_str(), e.what() );
546 |         return 1;
547 |       }
548 |     }
549 | 
550 |     tessApi->SetImage( images[n].image );
551 |     if ( gb_save_crops && input_xml ) {
552 |       std::string fout = std::string("crop_")+std::to_string(n)+"_"+images[n].id+".png";
553 |       fprintf( stderr, "%s: writing cropped image: %s\n", tool, fout.c_str() );
554 |       pixWriteImpliedFormat( fout.c_str(), images[n].image, 0, 0 );
555 |     }
556 | 
557 |     /// For xml input setup node level ///
558 |     xmlNodePtr node = NULL;
559 |     int node_level = -1;
560 |     if ( input_xml ) {
561 |       node = images[n].node->parent;
562 |       if ( page.nodeIs( node, "TextRegion" ) )
563 |         node_level = LEVEL_REGION;
564 |       else if ( page.nodeIs( node, "TextLine" ) ) {
565 |         node_level = LEVEL_LINE;
566 |         if ( gb_psm != tesseract::PSM_SINGLE_LINE && gb_psm != tesseract::PSM_RAW_LINE ) {
567 |           fprintf( stderr, "%s: error: for xml input selecting text lines, valid page segmentation modes are %d and %d\n", tool, tesseract::PSM_SINGLE_LINE, tesseract::PSM_RAW_LINE );
568 |           return 1;
569 |         }
570 |       }
571 |       else if ( page.nodeIs( node, "Word" ) ) {
572 |         node_level = LEVEL_WORD;
573 |         if ( gb_psm != tesseract::PSM_SINGLE_WORD && gb_psm != tesseract::PSM_CIRCLE_WORD ) {
574 |           fprintf( stderr, "%s: error: for xml input selecting words, valid page segmentation modes are %d and %d\n", tool, tesseract::PSM_SINGLE_WORD, tesseract::PSM_CIRCLE_WORD );
575 |           return 1;
576 |         }
577 |       }
578 |       else if ( page.nodeIs( node, "Glyph" ) ) {
579 |         node_level = LEVEL_GLYPH;
580 |         if ( gb_psm != tesseract::PSM_SINGLE_CHAR ) {
581 |           fprintf( stderr, "%s: error: for xml input selecting glyphs, the only valid page segmentation mode is %d\n", tool, tesseract::PSM_SINGLE_CHAR );
582 |           return 1;
583 |         }
584 |       }
585 |       if ( gb_layoutlevel < node_level ) {
586 |         fprintf( stderr, "%s: error: layout level lower than xpath selection level\n", tool );
587 |         return 1;
588 |       }
589 |     }
590 | 
591 |     /// Perform layout analysis ///
592 |     if ( gb_onlylayout && gb_psm != tesseract::PSM_AUTO_OSD )
593 |       iter = (tesseract::ResultIterator*)( tessApi->AnalyseLayout() );
594 | 
595 |     /// Perform recognition ///
596 |     else {
597 |       tessApi->Recognize( 0 );
598 |       iter = tessApi->GetIterator();
599 |     }
600 | 
601 |     if ( iter != NULL && ! iter->Empty( tesseract::RIL_BLOCK ) ) {
602 |       /// Orientation and Script Detection ///
603 |       tesseract::Orientation orientation;
604 |       tesseract::WritingDirection writing_direction;
605 |       tesseract::TextlineOrder textline_order;
606 |       float deskew_angle;
607 |       iter->Orientation( &orientation, &writing_direction, &textline_order, &deskew_angle );
608 | 
609 |       if ( gb_psm == tesseract::PSM_AUTO_OSD ) {
610 |         if ( deskew_angle != 0.0 )
611 |           page.setProperty( xpg, "deskewAngle", deskew_angle );
612 |         switch ( orientation ) {
613 |           case tesseract::ORIENTATION_PAGE_RIGHT:          page.setProperty( xpg, "apply-image-orientation", -90 );      break;
614 |           case tesseract::ORIENTATION_PAGE_LEFT:           page.setProperty( xpg, "apply-image-orientation", 90 );       break;
615 |           case tesseract::ORIENTATION_PAGE_DOWN:           page.setProperty( xpg, "apply-image-orientation", 180 );      break;
616 |           default: break;
617 |         }
618 |         switch ( writing_direction ) {
619 |           case tesseract::WRITING_DIRECTION_LEFT_TO_RIGHT: page.setProperty( xpg, "readingDirection", "left-to-right" ); break;
620 |           case tesseract::WRITING_DIRECTION_RIGHT_TO_LEFT: page.setProperty( xpg, "readingDirection", "right-to-left" ); break;
621 |           case tesseract::WRITING_DIRECTION_TOP_TO_BOTTOM: page.setProperty( xpg, "readingDirection", "top-to-bottom" ); break;
622 |         }
623 |         switch ( textline_order ) {
624 |           case tesseract::TEXTLINE_ORDER_LEFT_TO_RIGHT:    page.setProperty( xpg, "textLineOrder", "left-to-right" );    break;
625 |           case tesseract::TEXTLINE_ORDER_RIGHT_TO_LEFT:    page.setProperty( xpg, "textLineOrder", "right-to-left" );    break;
626 |           case tesseract::TEXTLINE_ORDER_TOP_TO_BOTTOM:    page.setProperty( xpg, "textLineOrder", "top-to-bottom" );    break;
627 |         }
628 |       }
629 | 
630 |       /// Loop through blocks ///
631 |       int block = 0;
632 |       while ( gb_layoutlevel >= LEVEL_REGION ) {
633 |         /// Skip non-text blocks ///
634 |         /*
635 |          0 PT_UNKNOWN,        // Type is not yet known. Keep as the first element.
636 |          1 PT_FLOWING_TEXT,   // Text that lives inside a column.
637 |          2 PT_HEADING_TEXT,   // Text that spans more than one column.
638 |          3 PT_PULLOUT_TEXT,   // Text that is in a cross-column pull-out region.
639 |          4 PT_EQUATION,       // Partition belonging to an equation region.
640 |          5 PT_INLINE_EQUATION,  // Partition has inline equation.
641 |          6 PT_TABLE,          // Partition belonging to a table region.
642 |          7 PT_VERTICAL_TEXT,  // Text-line runs vertically.
643 |          8 PT_CAPTION_TEXT,   // Text that belongs to an image.
644 |          9 PT_FLOWING_IMAGE,  // Image that lives inside a column.
645 |          10 PT_HEADING_IMAGE,  // Image that spans more than one column.
646 |          11 PT_PULLOUT_IMAGE,  // Image that is in a cross-column pull-out region.
647 |          12 PT_HORZ_LINE,      // Horizontal Line.
648 |          13 PT_VERT_LINE,      // Vertical Line.
649 |          14 PT_NOISE,          // Lies outside of any column.
650 |         */
651 |         if ( iter->BlockType() > tesseract::PT_CAPTION_TEXT ) {
652 |           if ( ! iter->Next( tesseract::RIL_BLOCK ) )
653 |             break;
654 |           continue;
655 |         }
656 | 
657 |         block++;
658 | 
659 |         xmlNodePtr xreg = NULL;
660 |         std::string rid = "b" + std::to_string(block);
661 | 
662 |         /// If xml input and region selected, prepend id to rid and set xreg to node ///
663 |         if ( node_level == LEVEL_REGION ) {
664 |           rid = std::string(images[n].id) + "_" + rid;
665 |           xreg = node;
666 |         }
667 | 
668 |         /// If it is multipage, prepend page number to rid ///
669 |         if ( num_pages > 1 )
670 |           rid = std::string("pg") + std::to_string(1+page.getPageNumber(xpg)) + "_" + rid;
671 | 
672 |         /// Otherwise add block as TextRegion element ///
673 |         if ( node_level < LEVEL_REGION ) {
674 |           xreg = page.addTextRegion( xpg, rid.c_str() );
675 | 
676 |           /// Set block bounding box and text ///
677 |           setCoords( iter, tesseract::RIL_BLOCK, page, xreg, images[n].x, images[n].y );
678 |           if ( ! gb_onlylayout && gb_textlevels[LEVEL_REGION] )
679 |             setTextEquiv( iter, tesseract::RIL_BLOCK, page, xreg );
680 |         }
681 | 
682 |         /// Set rotation and reading direction ///
683 |         /*tesseract::Orientation orientation;
684 |         tesseract::WritingDirection writing_direction;
685 |         tesseract::TextlineOrder textline_order;
686 |         float deskew_angle;*/
687 |         iter->Orientation( &orientation, &writing_direction, &textline_order, &deskew_angle );
688 |         if ( ! input_xml || node_level <= LEVEL_REGION ) {
689 |           if ( deskew_angle != 0.0 )
690 |             page.setProperty( xpg, "deskewAngle", deskew_angle );
691 |           PAGEXML_READ_DIRECTION direct = PAGEXML_READ_DIRECTION_LTR;
692 |           switch( writing_direction ) {
693 |             case tesseract::WRITING_DIRECTION_LEFT_TO_RIGHT: direct = PAGEXML_READ_DIRECTION_LTR; break;
694 |             case tesseract::WRITING_DIRECTION_RIGHT_TO_LEFT: direct = PAGEXML_READ_DIRECTION_RTL; break;
695 |             case tesseract::WRITING_DIRECTION_TOP_TO_BOTTOM: direct = PAGEXML_READ_DIRECTION_TTB; break;
696 |           }
697 |           page.setReadingDirection( xreg, direct );
698 |           /*float orient = 0.0;
699 |           switch( orientation ) {
700 |             case tesseract::ORIENTATION_PAGE_UP:    orient = 0.0;   break;
701 |             case tesseract::ORIENTATION_PAGE_RIGHT: orient = -90.0; break;
702 |             case tesseract::ORIENTATION_PAGE_LEFT:  orient = 90.0;  break;
703 |             case tesseract::ORIENTATION_PAGE_DOWN:  orient = 180.0; break;
704 |           }
705 |           page.setRotation( xreg, orient );*/
706 |         }
707 | 
708 |         /// Loop through paragraphs in current block ///
709 |         int para = 0;
710 |         while ( gb_layoutlevel >= LEVEL_REGION ) {
711 |           para++;
712 | 
713 |           /// Loop through lines in current paragraph ///
714 |           int line = 0;
715 |           while ( gb_layoutlevel >= LEVEL_LINE ) {
716 |             line++;
717 | 
718 |             xmlNodePtr xline = NULL;
719 | 
720 |             /// If xml input and line selected, set xline to node ///
721 |             if ( node_level == LEVEL_LINE )
722 |               xline = node;
723 | 
724 |             /// Otherwise add TextLine element ///
725 |             else if ( node_level < LEVEL_LINE ) {
726 |               std::string lid = rid + "_p" + std::to_string(para) + "_l" + std::to_string(line);
727 |               xline = page.addTextLine( xreg, lid.c_str() );
728 |             }
729 | 
730 |             /// Set line bounding box, baseline and text ///
731 |             if ( xline != NULL ) {
732 |               setLineCoords( iter, tesseract::RIL_TEXTLINE, page, xline, images[n].x, images[n].y, orientation );
733 |               if ( ! gb_onlylayout && gb_textlevels[LEVEL_LINE] )
734 |                 setTextEquiv( iter, tesseract::RIL_TEXTLINE, page, xline );
735 |             }
736 | 
737 |             /// Loop through words in current text line ///
738 |             while ( gb_layoutlevel >= LEVEL_WORD ) {
739 |               xmlNodePtr xword = NULL;
740 | 
741 |               /// If xml input and word selected, set xword to node ///
742 |               if ( node_level == LEVEL_WORD )
743 |                 xword = node;
744 | 
745 |               /// Otherwise add Word element ///
746 |               else if ( node_level < LEVEL_WORD )
747 |                 xword = page.addWord( xline );
748 | 
749 |               /// Set word bounding box and text ///
750 |               if ( xword != NULL ) {
751 |                 setCoords( iter, tesseract::RIL_WORD, page, xword, images[n].x, images[n].y, orientation );
752 |                 if ( ! gb_onlylayout && gb_textlevels[LEVEL_WORD] )
753 |                   setTextEquiv( iter, tesseract::RIL_WORD, page, xword );
754 |               }
755 | 
756 |               /// Loop through symbols in current word ///
757 |               while ( gb_layoutlevel >= LEVEL_GLYPH ) {
758 |                 /// Set xglyph to node or add new Glyph element depending on the case ///
759 |                 xmlNodePtr xglyph = node_level == LEVEL_GLYPH ? node : page.addGlyph( xword );
760 | 
761 |                 /// Set symbol bounding box and text ///
762 |                 setCoords( iter, tesseract::RIL_SYMBOL, page, xglyph, images[n].x, images[n].y, orientation );
763 |                 if ( ! gb_onlylayout && gb_textlevels[LEVEL_GLYPH] )
764 |                   setTextEquiv( iter, tesseract::RIL_SYMBOL, page, xglyph );
765 | 
766 |                 if ( iter->IsAtFinalElement( tesseract::RIL_WORD, tesseract::RIL_SYMBOL ) )
767 |                   break;
768 |                 iter->Next( tesseract::RIL_SYMBOL );
769 |               } // while ( gb_layoutlevel >= LEVEL_GLYPH ) {
770 | 
771 |               if ( iter->IsAtFinalElement( tesseract::RIL_TEXTLINE, tesseract::RIL_WORD ) )
772 |                 break;
773 |               iter->Next( tesseract::RIL_WORD );
774 |             } // while ( gb_layoutlevel >= LEVEL_WORD ) {
775 | 
776 |             if ( iter->IsAtFinalElement( tesseract::RIL_PARA, tesseract::RIL_TEXTLINE ) )
777 |               break;
778 |             iter->Next( tesseract::RIL_TEXTLINE );
779 |           } // while ( gb_layoutlevel >= LEVEL_LINE ) {
780 | 
781 |           if ( iter->IsAtFinalElement( tesseract::RIL_BLOCK, tesseract::RIL_PARA ) )
782 |             break;
783 |           iter->Next( tesseract::RIL_PARA );
784 |         } // while ( gb_layoutlevel >= LEVEL_REGION ) {
785 | 
786 |         if ( ! iter->Next( tesseract::RIL_BLOCK ) )
787 |           break;
788 |       } // while ( gb_layoutlevel >= LEVEL_REGION ) {
789 |     } // if ( iter != NULL && ! iter->Empty( tesseract::RIL_BLOCK ) ) {
790 |     page.releaseImage(xpg);
791 |   } // for ( n=0; n<(int)images.size(); n++ ) {
792 | 
793 |   /// Apply image orientations ///
794 |   std::vector<xmlNodePtr> sel = page.select("//_:Page[_:Property/@key='apply-image-orientation']");
795 |   for ( n=(int)sel.size()-1; n>=0; n-- ) {
796 |     int angle = atoi( page.getPropertyValue( sel[n], "apply-image-orientation" ).c_str() );
797 |     if ( angle )
798 |       page.rotatePage( -angle, sel[n], true );
799 |     page.rmElems( page.select("_:Property[@key='apply-image-orientation']", sel[n]) );
800 |     std::vector<xmlNodePtr> lines = page.select(".//_:TextLine",sel[n]);
801 |     /// Fix image orientation using baselines ///
802 |     if ( lines.size() > 0 ) {
803 |       double domangle = page.getDominantBaselinesOrientation(lines);
804 |       angle = 0;
805 |       if ( domangle >= M_PI/4 && domangle < 3*M_PI/4 )
806 |         angle = -90;
807 |       else if ( domangle <= -M_PI/4 && domangle > -3*M_PI/4 )
808 |         angle = 90;
809 |       else if ( domangle >= 3*M_PI/4 || domangle <= -3*M_PI/4 )
810 |         angle = 180;
811 |       if ( angle )
812 |         page.rotatePage(angle, sel[n], true);
813 |     }
814 |   }
815 | 
816 |   /// Fill in "0,0 0,0" Word Coords ///
817 |   sel = page.select("//_:Word[_:Coords/@points='0,0 0,0']");
818 |   for ( n=(int)sel.size()-1; n>=0; n-- ) {
819 |     xmlNodePtr elem = sel[n];
820 |     xmlNodePtr elem_pre = page.selectNth("preceding-sibling::_:Word[_:Coords/@points!='0,0 0,0']", -1, elem);
821 |     xmlNodePtr elem_fol = page.selectNth("following-sibling::_:Word[_:Coords/@points!='0,0 0,0']", 0, elem);
822 |     if ( elem_pre == NULL && elem_fol == NULL ) {
823 |       page.setCoords(elem, page.getPoints(page.parent(elem)));
824 |       page.setProperty(elem, "coords-unk-filler");
825 |       continue;
826 |     }
827 |     std::vector<cv::Point2f> pts_pre = page.getPoints(elem_pre);
828 |     std::vector<cv::Point2f> pts_fol = page.getPoints(elem_fol);
829 |     std::vector<cv::Point2f> pts;
830 |     if ( elem_pre != NULL && elem_fol != NULL ) {
831 |       pts.push_back(pts_pre[1]);
832 |       pts.push_back(pts_fol[0]);
833 |       pts.push_back(pts_fol[3]);
834 |       pts.push_back(pts_pre[2]);
835 |     }
836 |     else if ( elem_pre != NULL ) {
837 |       cv::Point2f upper = pts_pre[1] - pts_pre[0];
838 |       cv::Point2f lower = pts_pre[2] - pts_pre[3];
839 |       upper = upper/cv::norm(upper) + pts_pre[1];
840 |       lower = lower/cv::norm(lower) + pts_pre[2];
841 |       pts.push_back(pts_pre[1]);
842 |       pts.push_back(upper);
843 |       pts.push_back(lower);
844 |       pts.push_back(pts_pre[2]);
845 |     }
846 |     else {
847 |       cv::Point2f upper = pts_fol[0] - pts_fol[1];
848 |       cv::Point2f lower = pts_fol[3] - pts_fol[2];
849 |       upper = upper/cv::norm(upper) + pts_fol[0];
850 |       lower = lower/cv::norm(lower) + pts_fol[3];
851 |       pts.push_back(upper);
852 |       pts.push_back(pts_fol[0]);
853 |       pts.push_back(pts_fol[3]);
854 |       pts.push_back(lower);
855 |     }
856 |     page.setCoords(elem, pts);
857 |     page.setProperty(elem, "coords-unk-filler");
858 |   }
859 | 
860 |   /// Try to make imageFilename be a relative path w.r.t. the output XML ///
861 |   if ( ! input_xml && ! gb_inplace && strcmp(gb_output,"-") )
862 |     page.relativizeImageFilename(gb_output);
863 | 
864 |   /// Write resulting XML ///
865 |   int bytes = page.write( gb_inplace ? input_file : gb_output );
866 |   if ( bytes <= 0 )
867 |     fprintf( stderr, "%s: error: problems writing to output xml\n", tool );
868 | 
869 |   /// Release resources ///
870 |   if ( pixRelease )
871 |     for ( n=0; n<(int)images.size(); n++ )
872 |       pixDestroy(&(images[n].image));
873 |   tessApi->End();
874 |   delete tessApi;
875 |   delete iter;
876 | 
877 |   return bytes <= 0 ? 1 : 0;
878 | }
879 | 


--------------------------------------------------------------------------------
/tesseract_recognize_api.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """Command line tool for the tesseract-recognize API server."""
  3 | 
  4 | """
  5 | @author Mauricio Villegas <maurovill+tesseract@gmail.com>
  6 | @copyright Copyright(c) 2017-present, Mauricio Villegas <maurovill+tesseract@gmail.com>
  7 | 
  8 | @requirements pagexml-slim>=2022.4.12
  9 | @requirements jsonargparse>=4.38.0
 10 | @requirements flask-restx>=1.3.0
 11 | """
 12 | 
 13 | import os
 14 | import re
 15 | import sys
 16 | import json
 17 | import shutil
 18 | import queue
 19 | import threading
 20 | import tempfile
 21 | import pagexml
 22 | from time import time
 23 | from functools import wraps
 24 | from subprocess import Popen, PIPE, STDOUT
 25 | from jsonargparse import ArgumentParser, ActionYesNo
 26 | from flask import Flask, Response, abort
 27 | from flask_restx import Api, Resource, reqparse
 28 | from werkzeug.datastructures import FileStorage
 29 | from werkzeug.exceptions import BadRequest
 30 | 
 31 | 
 32 | def get_cli_parser(logger=True):
 33 |     """Returns the parser object for the command line tool."""
 34 |     parser = ArgumentParser(
 35 |         logger=logger,
 36 |         default_env=True,
 37 |         description=__doc__)
 38 | 
 39 |     parser.add_argument('--cfg',
 40 |         action='config',
 41 |         help='Path to a yaml configuration file.')
 42 |     parser.add_argument('--threads',
 43 |         type=int,
 44 |         default=4,
 45 |         help='Maximum number of tesseract-recognize instances to run in parallel.')
 46 |     parser.add_argument('--prefix',
 47 |         default='/tesseract-recognize',
 48 |         help='Prefix string for all API endpoints. Use "%%s" in string to replace by the API version.')
 49 |     parser.add_argument('--host',
 50 |         default='127.0.0.1',
 51 |         help='Hostname to listen on.')
 52 |     parser.add_argument('--port',
 53 |         type=int,
 54 |         default=5000,
 55 |         help='Port for the server.')
 56 |     parser.add_argument('--debug',
 57 |         action=ActionYesNo,
 58 |         default=False,
 59 |         help='Whether to run in debugging mode.')
 60 | 
 61 |     return parser
 62 | 
 63 | 
 64 | def TypePageXML(value):
 65 |     """Parse Page XML request type.
 66 | 
 67 |     Args:
 68 |         value: The raw type value.
 69 | 
 70 |     Returns:
 71 |         dict[str, {str,PageXML}]: Dictionary including the page xml 'filename', the 'string' representation and the PageXML 'object'.
 72 |     """
 73 |     if type(value) != FileStorage:
 74 |         raise ValueError('Expected pagexml to be of type FileStorage.')
 75 | 
 76 |     spxml = value.read().decode('utf-8')
 77 |     pxml = pagexml.PageXML()
 78 |     pxml.loadXmlString(spxml)
 79 | 
 80 |     return {'filename': value.filename, 'object': pxml, 'string': spxml}
 81 | 
 82 | 
 83 | class ParserPageXML(reqparse.RequestParser):
 84 |     """Class for parsing requests including a Page XML."""
 85 | 
 86 |     def parse_args(self, **kwargs):
 87 |         """Extension of parse_args that additionally does some Page XML checks."""
 88 |         req_dict = super().parse_args(**kwargs)
 89 | 
 90 |         if req_dict['pagexml'] is not None and req_dict['images'] is not None:
 91 |             pxml = req_dict['pagexml']['object']
 92 |             images_xml = set()
 93 |             for page in pxml.select('//_:Page'):
 94 |                 fname = re.sub(r'\[[0-9]+]$', '', pxml.getAttr(page, 'imageFilename'))
 95 |                 images_xml.add(fname)
 96 |             images_received = [os.path.basename(x.filename) for x in req_dict['images']]
 97 |             for fname in images_received:
 98 |                 if fname not in images_xml:
 99 |                     raise BadRequest('Received image not referenced in the Page XML: '+fname)
100 |             if len(images_xml) != len(images_received):
101 |                 raise BadRequest('Expected to receive all images referenced in the Page XML ('+str(len(images_xml))+') but only got a subset ('+str(len(images_received))+')')
102 | 
103 |         return req_dict
104 | 
105 | 
106 | def write_to_tmpdir(req_dict, prefix='tesseract_recognize_api_tmp_', basedir='/tmp'):
107 |     """Writes images and page xml from a request to a temporal directory.
108 | 
109 |     Args:
110 |         req_dict (dict):     Parsed Page XML request.
111 |         prefix (str):        Prefix for temporal directory name.
112 |         basedir (str):       Base temporal directory.
113 | 
114 |     Returns:
115 |         The path to the temporal directory where saved.
116 |     """
117 |     tmpdir = tempfile.mkdtemp(prefix=prefix, dir=basedir)
118 |     if req_dict['pagexml'] is not None:
119 |         fxml = os.path.basename(req_dict['pagexml']['filename'])
120 |         with open(os.path.join(tmpdir, fxml), 'w') as f:
121 |             f.write(req_dict['pagexml']['string'])
122 |     if req_dict['images'] is not None:
123 |         for image in req_dict['images']:
124 |             image.save(os.path.join(tmpdir, os.path.basename(image.filename)))
125 |     return tmpdir
126 | 
127 | 
128 | class images_pagexml_request:
129 |     """Decorator class for endpoints receiving images with optionally a page xml and responding with a page xml."""
130 | 
131 |     def __init__(self,
132 |                  api,
133 |                  images_help='Images with file names as referenced in the Page XML if given.',
134 |                  pagexml_help='Optional valid Page XML file.',
135 |                  options_help='Optional configuration options to be used for processing.',
136 |                  response_help='Resulting Page XML after processing.'):
137 |         """Initializer for images_pagexml_request class.
138 | 
139 |         Args:
140 |             api (flask_restplus.Api): The flask_restplus Api instance.
141 |             images_help (str):        Help for images field in swagger documentation.
142 |             pagexml_help (str):       Help for pagexml field in swagger documentation.
143 |             options_help (str):       Help for config field in swagger documentation.
144 |             response_help (str):      Help for pagexml response in swagger documentation.
145 |         """
146 |         self.api = api
147 |         self.response_help = response_help
148 | 
149 |         parser = ParserPageXML(bundle_errors=True)
150 |         parser.add_argument('images',
151 |             location='files',
152 |             type=FileStorage,
153 |             required=True,
154 |             action='append',
155 |             help=images_help)
156 |         parser.add_argument('pagexml',
157 |             location='files',
158 |             type=TypePageXML,
159 |             required=False,
160 |             help=pagexml_help)
161 |         parser.add_argument('options',
162 |             location='form',
163 |             type=str,
164 |             required=False,
165 |             default=[],
166 |             action='append',
167 |             help=options_help)
168 |         self.parser = parser
169 | 
170 |     def __call__(self, method):
171 |         """Makes a flask_restplus.Resource method expect a page xml and/or respond with a page xml."""
172 |         method = self.api.expect(self.parser)(method)
173 |         method = self.api.response(200, description=self.response_help)(method)
174 |         method = self.api.produces(['application/xml'])(method)
175 | 
176 |         @wraps(method)
177 |         def images_pagexml_request_wrapper(func):
178 |             req_dict = self.parser.parse_args()
179 |             pxml = method(func, req_dict)
180 |             return Response(
181 |                 pxml.toString(True),
182 |                 mimetype='application/xml',
183 |                 headers={'Content-type': 'application/xml; charset=utf-8'})
184 | 
185 |         return images_pagexml_request_wrapper
186 | 
187 | 
188 | def run_tesseract_recognize(*args):
189 |     """Runs a tesseract-recognize command using given arguments."""
190 |     cmd = ['tesseract-recognize']
191 |     cmd.extend(list(args))
192 | 
193 |     proc = Popen(cmd, shell=False, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
194 |     cmd_out = proc.stdout.read().decode("utf-8")
195 |     proc.communicate()
196 |     cmd_rc = proc.returncode
197 | 
198 |     return cmd_rc, cmd_out
199 | 
200 | 
201 | if __name__ == '__main__':
202 |     ## Parse config ##
203 |     parser = get_cli_parser(logger=os.path.basename(__file__))
204 |     cfg = parser.parse_args(env=True)
205 | 
206 |     ## Create a Flask WSGI application ##
207 |     app = Flask(__name__)  # pylint: disable=invalid-name
208 |     app.logger = parser.logger
209 | 
210 |     ## Create a Flask-RESTPlus API ##
211 |     api = Api(app,
212 |               doc=cfg.prefix+'/swagger',
213 |               version='2.0',
214 |               prefix=cfg.prefix,
215 |               title='tesseract-recognize API',
216 |               description='An API for running tesseract-recognition jobs.')
217 | 
218 | 
219 |     ## Definition of endpoints ##
220 |     @api.route('/version')
221 |     class ServiceVersion(Resource):
222 |         @api.response(200, description='Version of the running service.')
223 |         @api.produces(['text/plain'])
224 |         def get(self):
225 |             """Endpoint to get the version of the running service."""
226 |             rc, out = run_tesseract_recognize('--version')
227 |             if rc != 0:
228 |                 abort(500, 'problems getting version from tesseract-recognize command :: '+str(out))
229 |             return Response(out, mimetype='text/plain')
230 | 
231 | 
232 |     @api.route('/help')
233 |     class ServiceHelp(Resource):
234 |         @api.response(200, description='Help for the running service.')
235 |         @api.produces(['text/plain'])
236 |         def get(self):
237 |             """Endpoint to get the help for the running service."""
238 |             rc, out = run_tesseract_recognize('--help')
239 |             if rc != 0:
240 |                 abort(500, 'problems getting help from tesseract-recognize command :: '+str(out))
241 |             return Response(out, mimetype='text/plain')
242 | 
243 | 
244 |     num_requests = 0
245 |     @api.route('/process')
246 |     class ProcessRequest(Resource):
247 |         @images_pagexml_request(api)
248 |         @api.doc(responses={400: 'tesseract-recognize execution failed.'})
249 |         def post(self, req_dict):
250 |             """Endpoint for running tesseract-recognize on given images or page xml file."""
251 |             start_time = time()
252 |             done_queue = queue.Queue()
253 |             process_queue.put((done_queue, req_dict))
254 |             while True:
255 |                 try:
256 |                     thread, num_requests, pxml = done_queue.get(True, 0.05)
257 |                     break
258 |                 except queue.Empty:
259 |                     continue
260 |             if isinstance(pxml, Exception):
261 |                 app.logger.error('Request '+str(num_requests)+' on thread '+str(thread)+' unsuccessful, '
262 |                                  +('%.4g' % (time()-start_time))+' sec. :: '+str(pxml))
263 |                 abort(400, 'processing failed :: '+str(pxml))
264 |             else:
265 |                 app.logger.info('Request '+str(num_requests)+' on thread '+str(thread)+' successful, '
266 |                                 +('%.4g' % (time()-start_time))+' sec.')
267 |                 return pxml
268 | 
269 | 
270 |     process_queue = queue.Queue()  # type: ignore
271 | 
272 | 
273 |     ## Processor thread function ##
274 |     def start_processing(thread, process_queue):
275 | 
276 |         num_requests = 0
277 |         tmpdir = None
278 |         while True:
279 |             try:
280 |                 done_queue, req_dict = process_queue.get(True, 0.05)
281 |                 num_requests += 1
282 |                 tmpdir = write_to_tmpdir(req_dict)
283 | 
284 |                 opts = list(req_dict['options'])
285 |                 if len(opts) == 1 and opts[0][0] == '[':
286 |                     opts = json.loads(opts[0])
287 |                 if req_dict['pagexml'] is not None:
288 |                     opts.append(os.path.join(tmpdir, os.path.basename(req_dict['pagexml']['filename'])))
289 |                 elif req_dict['images'] is not None:
290 |                     for image in req_dict['images']:
291 |                         opts.append(os.path.join(tmpdir, os.path.basename(image.filename)))
292 |                 else:
293 |                     raise KeyError('No images found in request.')
294 |                 opts.extend(['-o', os.path.join(tmpdir, 'output.xml')])
295 | 
296 |                 rc, out = run_tesseract_recognize(*opts)
297 |                 if rc != 0:
298 |                     raise RuntimeError('tesseract-recognize execution failed :: opts: '+str(opts)+' :: '+str(out))
299 | 
300 |                 pxml = pagexml.PageXML(os.path.join(tmpdir, 'output.xml'))
301 |                 done_queue.put((thread, num_requests, pxml))
302 | 
303 |             except queue.Empty:
304 |                 continue
305 |             except json.decoder.JSONDecodeError as ex:
306 |                 done_queue.put((thread, num_requests, RuntimeError('JSONDecodeError: '+str(ex)+' while parsing '+opts[0])))
307 |             except Exception as ex:
308 |                 done_queue.put((thread, num_requests, ex))
309 |             finally:
310 |                 if not cfg.debug and tmpdir is not None:
311 |                     shutil.rmtree(tmpdir)
312 |                     tmpdir = None
313 | 
314 | 
315 |     for thread in range(cfg.threads):
316 |         threading.Thread(target=start_processing, args=(thread+1, process_queue)).start()
317 | 
318 | 
319 |     app.run(host=cfg.host, port=cfg.port, debug=cfg.debug)
320 | 


--------------------------------------------------------------------------------