├── .github
    └── FUNDING.yml
├── .gitlab-ci.yml
├── LICENSE
├── README.md
├── alpine
    ├── Dockerfile
    ├── tess.patch
    └── train-lang
├── artifacthub-repo.yml
├── entrypoint
└── ubuntu
    ├── Dockerfile
    └── train-lang


/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | patreon: jitesoft
2 | open_collective: jitesoft-open-source
3 | custom: ['https://sponsus.org/u/jitesoft']
4 |


--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
1 | stages:  # Pipeline stages, listed in execution order.
2 |   - readme
3 |   - pre
4 |   - download
5 |   - build
6 |   - containerize
7 |   - scan
8 |
9 | include:  # Shared templates from jitesoft/gitlab-ci-lib; presumably the source of .readme-check.v2 and .container_scanning - confirm.
10 |   - file: /Scan/trivy.yml
11 |     project: jitesoft/gitlab-ci-lib
12 |   - file: /OCI/push-readme.yml
13 |     project: jitesoft/gitlab-ci-lib
14 |
15 | update-readme:  # Hands README.md and the registry list to the included .readme-check.v2 template.
16 |   stage: pre
17 |   extends: .readme-check.v2
18 |   variables:
19 |     PUSHRM_FILE: "$CI_PROJECT_DIR/README.md"
20 |     GIT_DEPTH: "3"
21 |     REGISTRIES: "quay.io/jitesoft/tesseract,docker.io/jitesoft/tesseract-ocr"
22 |   tags: [ protected ]
23 |
24 | check:  # Scheduled/web pipelines only: looks up the newest upstream releases and triggers a follow-up pipeline with BUILD or SCAN set.
25 |   parallel:
26 |     matrix:
27 |       - { TESS_BRANCH: "5", GIT_STRATEGY: none }
28 |   stage: download
29 |   rules:
30 |     - if: '$CI_PIPELINE_SOURCE == "schedule"'
31 |       when: always
32 |     - if: '$CI_PIPELINE_SOURCE == "web"'
33 |       when: always
34 |     - when: never
35 |   script:
36 |     - apk add --no-cache jq  # NOTE(review): the script also uses curl and `grep -P`; presumably the default image provides them - confirm.
37 |     - touch version${TESS_BRANCH}.txt
38 |     - VERSION=$(wget -qO- https://api.github.com/repos/tesseract-ocr/tesseract/releases | jq -r ".[].tag_name" | grep -oP "[$TESS_BRANCH][.]\d+[.]\d+$" | sort -V -f | tail -n1)  # Highest release tag of the form <TESS_BRANCH>.x.y.
39 |     - LEPT_VERSION=$(wget -qO- https://api.github.com/repos/DanBloomberg/leptonica/releases | jq -r ".[0].tag_name")  # First entry of the Leptonica release list.
40 |     - DONE=$(cat version${TESS_BRANCH}.txt)  # NOTE(review): nothing visible in this file writes or caches this file, so DONE looks to be always empty - confirm.
41 |     - |
42 |       if [ !
-z "${FORCE_BUILD+x}" ]; then
43 |         curl -F token=${CI_JOB_TOKEN} -F ref=master -F "variables[LEPT_VERSION]=$LEPT_VERSION" -F "variables[TESS_BRANCH]=$TESS_BRANCH" -F "variables[VERSION]=${VERSION}" -F "variables[BUILD]=true" https://gitlab.com/api/v4/projects/${CI_PROJECT_ID}/trigger/pipeline
44 |       else
45 |         curl -F token=${CI_JOB_TOKEN} -F ref=master -F "variables[TESS_BRANCH]=$TESS_BRANCH" -F "variables[VERSION]=${VERSION}" -F "variables[SCAN]=true" https://gitlab.com/api/v4/projects/${CI_PROJECT_ID}/trigger/pipeline
46 |       fi
47 |
48 | download:  # BUILD pipelines only: fetches the source tarballs and default traineddata files and publishes them as artifacts.
49 |   rules:
50 |     - if: "$BUILD"
51 |       when: always
52 |     - when: never
53 |   variables:
54 |     GIT_STRATEGY: none
55 |   stage: download
56 |   image: registry.gitlab.com/jitesoft/dockerfiles/alpine:latest
57 |   before_script:
58 |     - apk add --no-cache wget coreutils
59 |   script:
60 |     - wget -O tess.tar.gz https://github.com/tesseract-ocr/tesseract/archive/refs/tags/${VERSION}.tar.gz  # VERSION and LEPT_VERSION come from the trigger request in the check job.
61 |     - wget -O lept.tar.gz https://github.com/DanBloomberg/leptonica/archive/refs/tags/${LEPT_VERSION}.tar.gz
62 |     - wget https://github.com/tesseract-ocr/tessdata/raw/3.04.00/equ.traineddata  # equ and osd are pinned to the 3.04.00 tag of tessdata.
63 |     - wget https://github.com/tesseract-ocr/tessdata/raw/3.04.00/osd.traineddata
64 |     - wget -O eng.traineddata https://github.com/tesseract-ocr/tessdata_fast/raw/main/eng.traineddata  # English comes from the "fast" data set.
65 |   artifacts:
66 |     paths:
67 |       - tess.tar.gz
68 |       - lept.tar.gz
69 |       - osd.traineddata
70 |       - equ.traineddata
71 |       - eng.traineddata
72 |     expire_in: 5 days
73 |
74 | #region UBUNTU
75 |
76 | .build:ubuntu:  # Template extended by build:ubuntu:amd and build:ubuntu:arm; builds Leptonica and Tesseract from the downloaded tarballs.
77 |   rules:
78 |     - if: "$BUILD"
79 |       when: on_success
80 |     - when: never
81 |   stage: build
82 |   image: registry.gitlab.com/jitesoft/dockerfiles/ubuntu:${UBUNTU_VERSION}
83 |   dependencies:
84 |     - download
85 |   before_script:
86 |     - export DEBIAN_FRONTEND=noninteractive
87 |     - if [ !
-d "ccache" ]; then mkdir ccache; fi
88 |     - apt-get update
89 |     - apt-get install -y locales
90 |     - EXTRA_CONFIG_ARGS=""
91 |     - if [ "${UBUNTU_VERSION}" = "20.04" ]; then EXTRA_CONFIG_ARGS="--disable-dependency-tracking"; fi
92 |     - locale-gen en_US.UTF-8
93 |     - dpkg-reconfigure --frontend=noninteractive locales
94 |     - apt-get install -y ccache g++ gcc autoconf automake make libtool pkg-config libpng-dev libturbojpeg0-dev libtiff5-dev zlib1g-dev libwebp-dev libopenjp2-7-dev libgif-dev
95 |     - update-ccache-symlinks
96 |     - export PATH="/usr/lib/ccache:$PATH"  # Put the ccache compiler symlinks first so the builds below go through ccache.
97 |     - ccache -s
98 |   script:  # Builds and installs Leptonica, then Tesseract, into /usr/local and packs /usr/local as the job artifact.
99 |     - mkdir -p src/tess
100 |     - mkdir -p src/lept
101 |     - tar -xhzf lept.tar.gz -C src/lept --strip-components=1
102 |     - tar -xhzf tess.tar.gz -C src/tess --strip-components=1
103 |     - cd ${CI_PROJECT_DIR}/src/lept
104 |     - ./autogen.sh
105 |     - ./configure ${EXTRA_CONFIG_ARGS}
106 |     - make
107 |     - make install
108 |     - cd ${CI_PROJECT_DIR}/src/tess
109 |     - ./autogen.sh
110 |     - ./configure --with-extra-libraries=/usr/local/lib ${EXTRA_CONFIG_ARGS}
111 |     - make
112 |     - make install
113 |     - cd ${CI_PROJECT_DIR}
114 |     - cp entrypoint /usr/local/bin/entrypoint
115 |     - cp ubuntu/train-lang /usr/local/bin/train-lang
116 |     - mkdir -p /usr/local/share/tessdata
117 |     - cp eng.traineddata equ.traineddata osd.traineddata /usr/local/share/tessdata
118 |     - chmod -R +x /usr/local/bin
119 |     - (cd /usr/local && tar -czf ${CI_PROJECT_DIR}/tess-${DOCKER_ARCH}.tar.gz *)  # Plain subshell: "$(...)" would try to execute whatever tar printed as a command.
120 |     - mkdir -p binaries/${UBUNTU_VERSION}
121 |     - mv tess-${DOCKER_ARCH}.tar.gz binaries/${UBUNTU_VERSION}
122 |     - ccache -s
123 |   cache:
124 |     paths:
125 |       - ccache
126 |     key: tess.ubuntu.build.ccache-${DOCKER_ARCH}-${UBUNTU_VERSION}-${TESS_BRANCH}
127 |   artifacts:
128 |     paths:
129 |       - binaries
130 |     when: on_success
131 |     expire_in: 1 day
132 |
133 | build:ubuntu:amd:  # amd64 build, one job per Ubuntu release in the matrix.
134 |   extends: .build:ubuntu
135 |   parallel:
136 |     matrix:
137 |       - { UBUNTU_VERSION: "20.04" }
138 |       - { UBUNTU_VERSION: "22.04" }
139 |       - { UBUNTU_VERSION: "24.04" }
140
|   variables:
141 |     CCACHE_DIR: "${CI_PROJECT_DIR}/ccache"
142 |     DOCKER_ARCH: "amd64"
143 |     CCACHE_COMPRESS: "true"
144 |     CCACHE_MAXSIZE: "128M"
145 |   tags:
146 |     - native-amd64
147 |
148 | build:ubuntu:arm:  # arm64 build, one job per Ubuntu release in the matrix.
149 |   extends: .build:ubuntu
150 |   parallel:
151 |     matrix:
152 |       - { UBUNTU_VERSION: "20.04" }
153 |       - { UBUNTU_VERSION: "22.04" }
154 |       - { UBUNTU_VERSION: "24.04" }
155 |   variables:
156 |     CCACHE_DIR: "${CI_PROJECT_DIR}/ccache"
157 |     DOCKER_ARCH: "arm64"
158 |     CCACHE_COMPRESS: "true"
159 |     CCACHE_MAXSIZE: "128M"
160 |   tags:
161 |     - native-aarch64
162 |
163 | .containerize:ubuntu:  # Template: builds and pushes the multi-arch Ubuntu image from the per-architecture tarballs.
164 |   needs:
165 |     - build:ubuntu:arm
166 |     - build:ubuntu:amd
167 |   stage: containerize
168 |   image: registry.gitlab.com/jitesoft/dockerfiles/misc:latest
169 |   script:  # TESSERACT_VERSION is fed from VERSION, the variable the check job passes to the triggered pipeline; TESS_VERSION is never set in this file.
170 |     - mkdir bin
171 |     - mv binaries/${UBUNTU_VERSION}/* bin/  # ubuntu/Dockerfile bind-mounts ./bin to reach the tarballs.
172 |     - TAGLIST=$(helper taglist "jitesoft/tesseract-ocr,ghcr.io/jitesoft/tesseract,${CI_REGISTRY_IMAGE},quay.io/jitesoft/tesseract" "${TAGS}")
173 |     - docker buildx build --sbom=true --provenance=true --push ${TAGLIST} --platform "linux/amd64,linux/arm64" --build-arg BUILD_TIME="$(date -Iseconds)" --build-arg UBUNTU_VERSION="${UBUNTU_VERSION}" --build-arg TESSERACT_VERSION="${VERSION}" --build-arg LEPTONICA_VERSION="${LEPT_VERSION}" -f ubuntu/Dockerfile .
174 |   tags: [protected, jitesoft, buildx]
175 |
176 | containerize:ubuntu:5:  # Tesseract 5 images: one containerize job per Ubuntu release, each with its own tag list.
177 |   extends: .containerize:ubuntu
178 |   rules:
179 |     - if: '$BUILD && $TESS_BRANCH == "5"'
180 |       when: on_success
181 |     - when: never
182 |   parallel:
183 |     matrix:
184 |       - { UBUNTU_VERSION: "20.04", TAGS: "20.04,focal" }
185 |       - { UBUNTU_VERSION: "22.04", TAGS: "22.04,jammy,lts,latest,5,${VERSION}" }
186 |       - { UBUNTU_VERSION: "24.04", TAGS: "5-24.04,5-noble,5-lts,5-latest,5,5-${VERSION}" }  # NOTE(review): tag "5" is also pushed by the 22.04 entry; confirm which image should own it.
187 |
188 | #endregion
189 |
190 | .build:alpine:  # Template extended by build:alpine:amd and build:alpine:arm; builds Leptonica and Tesseract from the downloaded tarballs.
191 |   rules:
192 |     - if: "$BUILD"
193 |       when: on_success
194 |     - when: never
195 |   stage: build
196 |   dependencies:
197 |     - download
198 |   image: registry.gitlab.com/jitesoft/dockerfiles/misc/buildbase/3.20:latest
199 |   before_script:
200 |     - if [ ! -d "ccache" ]; then mkdir ccache; fi
201 |     - export PATH="/usr/lib/ccache/bin:$PATH"
202 |   script:
203 |     - mkdir -p src/tess
204 |     - mkdir -p src/lept
205 |     - tar -xhzf lept.tar.gz -C src/lept --strip-components=1
206 |     - tar -xhzf tess.tar.gz -C src/tess --strip-components=1
207 |     - apk add --no-cache pango-dev icu-dev cairo-dev libpng-dev libjpeg-turbo-dev tiff-dev libwebp-dev giflib-dev openjpeg-dev patch
208 |     - |
209 |       if [ "$TESS_BRANCH" = "4" ]; then
210 |         patch src/tess/src/ccutil/ocrclass.h < alpine/tess.patch
211 |       fi
212 |     - cd ${CI_PROJECT_DIR}/src/lept
213 |     - ./autogen.sh
214 |     - ./configure
215 |     - make
216 |     - make install
217 |     - cd ${CI_PROJECT_DIR}/src/tess
218 |     - ./autogen.sh
219 |     - ./configure
220 |     - make
221 |     - make install
222 |     - cd ${CI_PROJECT_DIR}
223 |     - cp alpine/train-lang /usr/local/bin
224 |     - mkdir -p /usr/local/share/tessdata
225 |     - cp eng.traineddata equ.traineddata osd.traineddata /usr/local/share/tessdata
226 |     - cp entrypoint /usr/local/bin
227 |     - chmod -R +x /usr/local/bin  # Recursive, as in the Ubuntu build: without -R only the directory itself is changed.
228 |     - (cd /usr/local && tar -czf ${CI_PROJECT_DIR}/tess-${DOCKER_ARCH}.tar.gz *)  # Plain subshell: "$(...)" would try to execute whatever tar printed as a command.
229 |     - mkdir binaries
230 |     - mv tess-${DOCKER_ARCH}.tar.gz binaries/
231 |   cache:
232 |     paths:
233 |       - ccache
234 |     key: tess.build.ccache-${DOCKER_ARCH}-${TESS_BRANCH}
235 |   artifacts:
236 |     paths:
237 |       - binaries
238 |     when: on_success
239 |     expire_in: 1 day
240 |
241 | build:alpine:amd:  # amd64 Alpine build.
242 |   extends: .build:alpine
243 |   variables:
244 |     CCACHE_DIR: "${CI_PROJECT_DIR}/ccache"
245 |     DOCKER_ARCH: "amd64"
246 |   tags:
247 |     - native-amd64
248 |
249 | build:alpine:arm:  # arm64 Alpine build.
250 |   extends: .build:alpine
251 |   variables:
252 |     CCACHE_DIR: "${CI_PROJECT_DIR}/ccache"
253 |     DOCKER_ARCH: "arm64"
254 |   tags:
255 |     - native-aarch64
256 |
257 | containerize:alpine:  # Builds and pushes the multi-arch Alpine image from the per-architecture tarballs in ./binaries.
258 |   rules:
259 |     - if: "$BUILD"
260 |       when: on_success
261 |     - when: never
262 |   needs:
263 |     - build:alpine:arm
264 |     - build:alpine:amd
265 |   variables:
266 |     GL_TAGS: "${VERSION},${TESS_BRANCH}"
267 |     TAGS: "${VERSION}-alpine,${TESS_BRANCH}-alpine"
268 |   stage: containerize
269 |   image: registry.gitlab.com/jitesoft/dockerfiles/misc:latest
270 |   before_script:  # Branch 5 additionally receives the "latest" style tags.
271 |     - |
272 |       if [ "${TESS_BRANCH}" = "5" ]; then
273 |         GL_TAGS="${GL_TAGS},latest"
274 |         TAGS="${TAGS},latest-alpine,alpine"
275 |       fi
276 |   script:  # TESSERACT_VERSION is fed from VERSION, the variable the check job passes to the triggered pipeline; TESS_VERSION is never set in this file.
277 |     - TAGLIST=$(helper taglist "jitesoft/tesseract-ocr,ghcr.io/jitesoft/tesseract,quay.io/jitesoft/tesseract" "${TAGS}")
278 |     - GL_TAGLIST=$(helper taglist "${CI_REGISTRY_IMAGE}/alpine" "${GL_TAGS}")  # No comma between the two arguments: the shell would append it to the image name.
279 |     - docker buildx build --sbom=true --provenance=true --push ${TAGLIST} ${GL_TAGLIST} --platform linux/amd64,linux/arm64 --build-arg BUILD_TIME="$(date -Iseconds)" --build-arg TESSERACT_VERSION="${VERSION}" --build-arg LEPTONICA_VERSION="${LEPT_VERSION}" -f alpine/Dockerfile .
280 |   tags: [protected, jitesoft, buildx]
281 |
282 | scan:ubuntu:5:  # Scans the published Ubuntu images, either right after a build or on its own in a SCAN pipeline.
283 |   extends: .container_scanning
284 |   rules:
285 |     - if: '$SCAN && $TESS_BRANCH == "5"'
286 |       when: always
287 |     - if: '$BUILD && $TESS_BRANCH == "5"'
288 |       when: on_success
289 |     - when: never
290 |   needs:
291 |     - job: containerize:ubuntu:5
292 |       artifacts: false
293 |       optional: true  # The containerize job is excluded from SCAN pipelines; a non-optional need on an absent job makes pipeline creation fail.
294 |   parallel:
295 |     matrix:
296 |       - { SCANNING_IMAGE_NAME: "${CI_REGISTRY_IMAGE}:20.04", GIT_STRATEGY: "none" }
297 |       - { SCANNING_IMAGE_NAME: "${CI_REGISTRY_IMAGE}:22.04", GIT_STRATEGY: "none" }
298 |       - { SCANNING_IMAGE_NAME: "${CI_REGISTRY_IMAGE}:24.04", GIT_STRATEGY: "none" }  # NOTE(review): containerize:ubuntu:5 tags the 24.04 image as 5-24.04, not 24.04 - confirm the tag.
299 |
300 | scan:alpine:5:  # Scans the published Alpine image, either right after a build or on its own in a SCAN pipeline.
301 |   needs:
302 |     - job: containerize:alpine
303 |       artifacts: false
304 |       optional: true  # The containerize job is excluded from SCAN pipelines; a non-optional need on an absent job makes pipeline creation fail.
305 |   extends: .container_scanning
306 |   rules:
307 |     - if: '$SCAN && $TESS_BRANCH == "5"'
308 |       when: always
309 |     - if: '$BUILD && $TESS_BRANCH == "5"'
310 |       when: on_success
311 |     - when: never
312 |   variables:
313 |     GIT_STRATEGY: none
314 |     SCANNING_IMAGE_NAME: "${CI_REGISTRY_IMAGE}/alpine:5"
315 |


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Jitesoft
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tesseract OCR. 2 | 3 | [![Docker Pulls](https://img.shields.io/docker/pulls/jitesoft/tesseract-ocr.svg)](https://hub.docker.com/r/jitesoft/tesseract-ocr) 4 | [![Back project](https://img.shields.io/badge/Open%20Collective-Tip%20the%20devs!-blue.svg)](https://opencollective.com/jitesoft-open-source) 5 | 6 | [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) - Ubuntu and Alpine Linux images. 7 | 8 | Tesseract and Leptonica are both built from source for each platform and distro; 9 | supported platforms are amd64 (x86_64) and arm64 (aarch64). 10 | 11 | ## Tags 12 | 13 | Versions indicate the OS version (or the name in the case of Alpine); images with the `4-` prefix use 14 | tesseract version 4, while images without the prefix use version 5. 15 | 16 | All versions use the same training data. 
17 | 18 | Images can be found at: 19 | 20 | * [Docker hub](https://hub.docker.com/r/jitesoft/tesseract-ocr): `jitesoft/tesseract-ocr` 21 | * [GitLab](https://gitlab.com/jitesoft/dockerfiles/tesseract): `registry.gitlab.com/jitesoft/dockerfiles/tesseract` 22 | * [GitHub](https://github.com/orgs/jitesoft/packages/container/package/tesseract): `ghcr.io/jitesoft/tesseract` 23 | * [Quay](https://quay.io/jitesoft/tesseract): `quay.io/jitesoft/tesseract` 24 | 25 | ## Dockerfile 26 | 27 | The Dockerfiles can be found at [GitLab](https://gitlab.com/jitesoft/dockerfiles/tesseract) or [GitHub](https://github.com/jitesoft/docker-tesseract-ocr). 28 | 29 | ## Training and languages 30 | 31 | The default image has the English training data installed from the start. The training data used is the "fast" data. It parses quicker, but not at the best quality. 32 | It's possible to install the trained data for another language by invoking the `train-lang` script, followed by the language code (ISO 639-2 `eng`, `swe` etc). To use the `fast` or `best` data, add `--fast` or `--best` as an optional parameter after the language code (`train-lang eng --fast`); without the extra argument the standard data is used. 33 | The above could easily be done in a derived image: 34 | 35 | ```dockerfile 36 | FROM jitesoft/tesseract-ocr 37 | RUN train-lang bul --fast 38 | ``` 39 | 40 | The languages are downloaded from the official tesseract tessdata repositories. 41 | 42 | For a full list of supported languages check the following links: 43 | 44 | https://github.com/tesseract-ocr/tessdata 45 | https://github.com/tesseract-ocr/tessdata_best 46 | https://github.com/tesseract-ocr/tessdata_fast 47 | 48 | It is also possible to just copy a traineddata file to the `/usr/local/share/tessdata` directory of the container (the `TESSDATA_PREFIX` path set in both the Ubuntu and the Alpine image). 
49 | 50 | ## Example execution 51 | 52 | ```bash 53 | docker pull jitesoft/tesseract-ocr 54 | docker run -v /path/to/image/img.jpg:/tmp/img.jpg jitesoft/tesseract-ocr /tmp/img.jpg stdout 55 | ``` 56 | 57 | Use a high-DPI image for the best result. A higher DPI does increase the time to run, though. 58 | 59 | ### Image labels 60 | 61 | This image follows the [Jitesoft image label specification 1.0.0](https://gitlab.com/snippets/1866155). 62 | 63 | ## Licenses 64 | 65 | The images and scripts in the repository are released under the [MIT license](https://gitlab.com/jitesoft/dockerfiles/tesseract/blob/master/LICENSE). 66 | Tesseract is released under the [Apache License v2](https://github.com/tesseract-ocr/tesseract/blob/master/LICENSE). 67 | 68 | Notice: The tesseract source has been modified with a patch (`alpine/tess.patch`) to allow for compilation on Alpine Linux. 69 | 70 | 71 | ### Sponsors 72 | 73 | Jitesoft images are built via GitLab CI on runners hosted by the following wonderful organisations: 74 | 75 | 76 | Oregon State University - Open Source Lab 77 | 78 | 79 | _The companies above are not affiliated with Jitesoft or any Jitesoft Projects directly._ 80 | 81 | --- 82 | 83 | Sponsoring is vital for the further development and maintenance of open source. 84 | Questions and sponsoring queries can be made by email. 
85 | If you wish to sponsor our projects, reach out to the email above or visit any of the following sites:
86 |
87 | [Open Collective](https://opencollective.com/jitesoft-open-source)
88 | [GitHub Sponsors](https://github.com/sponsors/jitesoft)
89 | [Patreon](https://www.patreon.com/jitesoft)
90 |


--------------------------------------------------------------------------------
/alpine/Dockerfile:
--------------------------------------------------------------------------------
1 | # syntax=docker/dockerfile:experimental
2 | FROM registry.gitlab.com/jitesoft/dockerfiles/alpine:3.20
3 | ARG TESSERACT_VERSION
4 | ARG LEPTONICA_VERSION
5 | ARG BUILD_TIME
6 | # Image metadata: com.jitesoft.* labels, OCI image labels and Artifact Hub annotations; version and time values come from the build args above.
7 | LABEL maintainer="Johannes Tegnér " \
8 |       maintainer.org="Jitesoft" \
9 |       maintainer.org.uri="https://jitesoft.com" \
10 |       com.jitesoft.project.repo.type="git" \
11 |       com.jitesoft.project.repo.uri="https://gitlab.com/jitesoft/dockerfiles/tesseract" \
12 |       com.jitesoft.project.repo.issues="https://gitlab.com/jitesoft/dockerfiles/tesseract/issues" \
13 |       com.jitesoft.project.registry.uri="registry.gitlab.com/jitesoft/dockerfiles/tesseract" \
14 |       com.jitesoft.app.tesseract.version="${TESSERACT_VERSION}" \
15 |       com.jitesoft.app.leptonica.version="${LEPTONICA_VERSION}" \
16 |       # Open container labels
17 |       org.opencontainers.image.version="${TESSERACT_VERSION}" \
18 |       org.opencontainers.image.created="${BUILD_TIME}" \
19 |       org.opencontainers.image.description="Tesseract OCR on alpine" \
20 |       org.opencontainers.image.vendor="Jitesoft" \
21 |       org.opencontainers.image.source="https://gitlab.com/jitesoft/dockerfiles/tesseract" \
22 |       # Artifact hub annotations
23 |       io.artifacthub.package.alternative-locations="oci://index.docker.io/jitesoft/tesseract-ocr,oci://ghcr.io/jitesoft/tesseract,oci://quay.io/jitesoft/tesseract" \
24 |       io.artifacthub.package.readme-url="https://gitlab.com/jitesoft/dockerfiles/tesseract/-/raw/master/README.md" \
25 |       io.artifacthub.package.logo-url="https://jitesoft.com/favicon-96x96.png"
26 |
27
| ARG TESSERACT_VERSION
28 | ARG LEPTONICA_VERSION
29 | ARG TARGETARCH
30 |
31 | ENV TESSDATA_PREFIX="/usr/local/share/tessdata"
32 |
33 | RUN --mount=type=bind,source=./binaries,target=/tmp/tess \
34 |     apk add --no-cache --virtual .extract tar \
35 |  && addgroup -g 472 -S tesseract 2>&1 \
36 |  && adduser -u 472 -D -S -G tesseract tesseract \
37 |  && tar -xzhf /tmp/tess/tess-${TARGETARCH}.tar.gz -C /usr/local \
38 |  && apk del .extract \
39 |  && apk add --no-cache --virtual .runtime-deps libpng libjpeg-turbo tiff zlib libwebp giflib openjpeg libstdc++ libgomp \
40 |  && chmod -R +x /usr/local/bin \
41 |  && chown -R tesseract:tesseract /usr/local/share \
42 |  && rm -f /usr/local/lib/*.a
43 |
44 | USER tesseract
45 | ENTRYPOINT ["entrypoint"]
46 | CMD ["--version"]
47 |
48 |


--------------------------------------------------------------------------------
/alpine/tess.patch:
--------------------------------------------------------------------------------
1 | --- ocrclass.h 2020-01-10 12:26:33.307224500 +0100
2 | +++ ocrclass.h 2020-01-10 12:26:58.748584300 +0100
3 | @@ -28,6 +28,7 @@
4 |
5 |  #include
6 |  #include
7 | +#include
8 |  #ifdef _WIN32
9 |  #include // for timeval
10 |  #endif
11 |


--------------------------------------------------------------------------------
/alpine/train-lang:
--------------------------------------------------------------------------------
1 | #!/bin/ash
2 | # Download the traineddata file for one language into ${TESSDATA_PREFIX}.
3 | # Usage: train-lang <language-code> [--fast|--best]
4 | # Exits non-zero when the language code is missing or the download fails.
5 |
6 | LANGUAGE_CODE="${1}"
7 | if [ -z "${LANGUAGE_CODE}" ]; then
8 |   echo "Usage: train-lang <language-code> [--fast|--best]" >&2
9 |   exit 1
10 | fi
11 | # Drop the language code so that "$1" now holds the optional data-set flag.
12 | # (Previously the flag check looked at the language code itself, so the
13 | # --fast/--best flag was never honoured.)
14 | shift
15 |
16 | TYPE=""
17 | case "${1:-}" in
18 |   --fast)
19 |     TYPE="_fast"
20 |     echo "Fetching trained data with 'fast' version."
21 |     ;;
22 |   --best)
23 |     TYPE="_best"
24 |     echo "Fetching trained data with 'best' version."
25 |     ;;
26 |   *)
27 |     echo "Fetching standard trained data."
28 |     ;;
29 | esac
30 |
31 | echo "Trying to install tesseract trained data for language with code: ${LANGUAGE_CODE}"
32 | URI="https://github.com/tesseract-ocr/tessdata${TYPE}/raw/main/${LANGUAGE_CODE}.traineddata"
33 | # wget is only needed for the download; it is installed here and removed again below.
34 | apk add --no-cache wget
35 |
36 | STATUS=0
37 | # Probe the URL first so that an unknown language code does not leave an empty file behind.
38 | if wget --spider "${URI}" >/dev/null 2>&1; then
39 |   wget -O "${TESSDATA_PREFIX}/${LANGUAGE_CODE}.traineddata" "${URI}" || STATUS=1
40 | else
41 |   echo "Invalid language code."
42 |   STATUS=1
43 | fi
44 | apk del wget
45 | # Report failure to the caller (e.g. a RUN step in a derived Dockerfile).
46 | exit "${STATUS}"
47 |


--------------------------------------------------------------------------------
/artifacthub-repo.yml:
--------------------------------------------------------------------------------
1 | repositoryID: f65fc590-511e-4b07-bbf5-e3be8d6cc7a5
2 | owners:
3 |   - name: Johannes Tegnér
4 |     email: johannes@jitesoft.com
5 |


--------------------------------------------------------------------------------
/entrypoint:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | if [ "${1}" = "train-lang" ]; then
4 |   exec "$@"
5 | else
6 |   exec tesseract "$@"
7 | fi
8 |


--------------------------------------------------------------------------------
/ubuntu/Dockerfile:
--------------------------------------------------------------------------------
1 | # syntax=docker/dockerfile:experimental
2 | ARG UBUNTU_VERSION
3 | FROM registry.gitlab.com/jitesoft/dockerfiles/ubuntu:${UBUNTU_VERSION}
4 | ARG TESSERACT_VERSION
5 | ARG LEPTONICA_VERSION
6 | ARG BUILD_TIME
7 |
8 | LABEL maintainer="Johannes Tegnér " \
9 |       maintainer.org="Jitesoft" \
10 |       maintainer.org.uri="https://jitesoft.com" \
11 |       com.jitesoft.project.repo.type="git" \
12 |       com.jitesoft.project.repo.uri="https://gitlab.com/jitesoft/dockerfiles/tesseract" \
13 |       com.jitesoft.project.repo.issues="https://gitlab.com/jitesoft/dockerfiles/tesseract/issues" \
14 |       com.jitesoft.project.registry.uri="registry.gitlab.com/jitesoft/dockerfiles/tesseract" \
15 |       com.jitesoft.app.tesseract.version="${TESSERACT_VERSION}" \
16 |
com.jitesoft.app.leptonica.version="${LEPTONICA_VERSION}" \
17 |       # Open container labels
18 |       org.opencontainers.image.version="${TESSERACT_VERSION}" \
19 |       org.opencontainers.image.created="${BUILD_TIME}" \
20 |       org.opencontainers.image.description="Tesseract OCR on ubuntu" \
21 |       org.opencontainers.image.vendor="Jitesoft" \
22 |       org.opencontainers.image.source="https://gitlab.com/jitesoft/dockerfiles/tesseract" \
23 |       # Artifact hub annotations
24 |       io.artifacthub.package.alternative-locations="oci://index.docker.io/jitesoft/tesseract-ocr,oci://ghcr.io/jitesoft/tesseract,oci://quay.io/jitesoft/tesseract" \
25 |       io.artifacthub.package.readme-url="https://gitlab.com/jitesoft/dockerfiles/tesseract/-/raw/master/README.md" \
26 |       io.artifacthub.package.logo-url="https://jitesoft.com/favicon-96x96.png"
27 |
28 | ARG TESSERACT_VERSION
29 | ARG LEPTONICA_VERSION
30 | ARG TARGETARCH
31 | ARG UBUNTU_VERSION
32 |
33 | ENV TESSDATA_PREFIX="/usr/local/share/tessdata"
34 |
35 | RUN --mount=type=bind,source=./bin,target=/tmp/tess \
36 |     tar -xzhf /tmp/tess/tess-${TARGETARCH}.tar.gz -C /usr/local \
37 |  && groupadd -g 472 -r tesseract \
38 |  && useradd -u 472 -r -g tesseract tesseract \
39 |  && apt-get update \
40 |  && if [ "${UBUNTU_VERSION}" = "24.04" ]; then apt-get -y install libgomp1 libgif7 libwebpmux3 libwebp7 libopenjp2-7 libpng16-16 libjpeg9 libtiff6 zlib1g wget; fi \
41 |  && if [ "${UBUNTU_VERSION}" = "22.04" ]; then apt-get -y install libgomp1 libgif7 libwebpmux3 libwebp7 libopenjp2-7 libpng16-16 libjpeg9 libtiff5 zlib1g wget; fi \
42 |  && if [ "${UBUNTU_VERSION}" = "20.04" ]; then apt-get -y install libgomp1 libgif7 libwebpmux3 libwebp6 libopenjp2-7 libpng16-16 libjpeg9 libtiff5 zlib1g wget; fi \
43 |  && chmod -R +x /usr/local/bin \
44 |  && chown -R tesseract:tesseract /usr/local/share \
45 |  && rm -f /usr/local/lib/*.a \
46 |  && apt-get autoremove -y \
47 |  && apt-get clean -y \
48 |  # Drop the package index fetched by "apt-get update"; it is not needed at runtime and only grows the layer.
49 |  && rm -rf /var/lib/apt/lists/*
50 |
51 | USER tesseract
52 | ENTRYPOINT ["entrypoint"]
53 | CMD ["--version"]
54 |
-------------------------------------------------------------------------------- /ubuntu/train-lang: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | LANGUAGE_CODE="${1}" 4 | TYPE="" 5 | while [[ $# -gt 0 ]]; do 6 | case "$1" in 7 | --fast) 8 | TYPE="_fast" 9 | echo "Fetching trained data with 'fast' version." 10 | break;; 11 | --best) 12 | TYPE="_best" 13 | echo "Fetching trained data with 'best' version." 14 | break;; 15 | *) 16 | echo "Fetching standard trained data." 17 | break;; 18 | esac 19 | done 20 | 21 | echo "Trying to install tesseract trained data for language with code: ${LANGUAGE_CODE}" 22 | 23 | URI="https://github.com/tesseract-ocr/tessdata${TYPE}/raw/main/${LANGUAGE_CODE}.traineddata" 24 | if wget --spider "${URI}" >/dev/null 2>&1; then 25 | wget -O "${TESSDATA_PREFIX}/${LANGUAGE_CODE}.traineddata" "${URI}" 26 | else 27 | echo "Invalid language code." 28 | fi 29 | --------------------------------------------------------------------------------