├── .github
└── FUNDING.yml
├── .gitlab-ci.yml
├── LICENSE
├── README.md
├── alpine
├── Dockerfile
├── tess.patch
└── train-lang
├── artifacthub-repo.yml
├── entrypoint
└── ubuntu
├── Dockerfile
└── train-lang
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | patreon: jitesoft
2 | open_collective: jitesoft-open-source
3 | custom: ['https://sponsus.org/u/jitesoft']
4 |
--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
1 | stages:
2 | - readme
3 | - pre
4 | - download
5 | - build
6 | - containerize
7 | - scan
8 |
9 | include:
10 | - file: /Scan/trivy.yml
11 | project: jitesoft/gitlab-ci-lib
12 | - file: /OCI/push-readme.yml
13 | project: jitesoft/gitlab-ci-lib
14 |
15 | update-readme:
16 | stage: pre
17 | extends: .readme-check.v2
18 | variables:
19 | PUSHRM_FILE: "$CI_PROJECT_DIR/README.md"
20 | GIT_DEPTH: "3"
21 | REGISTRIES: "quay.io/jitesoft/tesseract,docker.io/jitesoft/tesseract-ocr"
22 | tags: [ protected ]
23 |
24 | check:
25 | parallel:
26 | matrix:
27 | - { TESS_BRANCH: "5", GIT_STRATEGY: none }
28 | stage: download
29 | rules:
30 | - if: '$CI_PIPELINE_SOURCE == "schedule"'
31 | when: always
32 | - if: '$CI_PIPELINE_SOURCE == "web"'
33 | when: always
34 | - when: never
35 | script:
36 | - apk add --no-cache jq
37 | - touch version${TESS_BRANCH}.txt
38 | - VERSION=$(wget -qO- https://api.github.com/repos/tesseract-ocr/tesseract/releases | jq -r ".[].tag_name" | grep -oP "[$TESS_BRANCH][.]\d+[.]\d+$" | sort -V -f | tail -n1)
39 | - LEPT_VERSION=$(wget -qO- https://api.github.com/repos/DanBloomberg/leptonica/releases | jq -r ".[0].tag_name")
40 | - DONE=$(cat version${TESS_BRANCH}.txt)
41 | - |
42 | if [ ! -z "${FORCE_BUILD+x}" ] || [ "${VERSION}" != "${DONE}" ]; then
43 | curl -F token=${CI_JOB_TOKEN} -F ref=master -F "variables[LEPT_VERSION]=$LEPT_VERSION" -F "variables[TESS_BRANCH]=$TESS_BRANCH" -F "variables[VERSION]=${VERSION}" -F "variables[BUILD]=true" https://gitlab.com/api/v4/projects/${CI_PROJECT_ID}/trigger/pipeline
44 | else
45 | curl -F token=${CI_JOB_TOKEN} -F ref=master -F "variables[TESS_BRANCH]=$TESS_BRANCH" -F "variables[VERSION]=${VERSION}" -F "variables[SCAN]=true" https://gitlab.com/api/v4/projects/${CI_PROJECT_ID}/trigger/pipeline
46 | fi
47 |
48 | download:
49 | rules:
50 | - if: "$BUILD"
51 | when: always
52 | - when: never
53 | variables:
54 | GIT_STRATEGY: none
55 | stage: download
56 | image: registry.gitlab.com/jitesoft/dockerfiles/alpine:latest
57 | before_script:
58 | - apk add --no-cache wget coreutils
59 | script:
60 | - wget -O tess.tar.gz https://github.com/tesseract-ocr/tesseract/archive/refs/tags/${VERSION}.tar.gz
61 | - wget -O lept.tar.gz https://github.com/DanBloomberg/leptonica/archive/refs/tags/${LEPT_VERSION}.tar.gz
62 | - wget https://github.com/tesseract-ocr/tessdata/raw/3.04.00/equ.traineddata
63 | - wget https://github.com/tesseract-ocr/tessdata/raw/3.04.00/osd.traineddata
64 | - wget -O eng.traineddata https://github.com/tesseract-ocr/tessdata_fast/raw/main/eng.traineddata
65 | artifacts:
66 | paths:
67 | - tess.tar.gz
68 | - lept.tar.gz
69 | - osd.traineddata
70 | - equ.traineddata
71 | - eng.traineddata
72 | expire_in: 5 days
73 |
74 | #region UBUNTU
75 |
76 | .build:ubuntu:
77 |   rules:
78 |     - if: "$BUILD"
79 |       when: on_success
80 |     - when: never
81 |   stage: build
82 |   image: registry.gitlab.com/jitesoft/dockerfiles/ubuntu:${UBUNTU_VERSION}
83 |   dependencies:
84 |     - download
85 |   before_script:  # UBUNTU_VERSION and DOCKER_ARCH are supplied by the jobs extending this template.
86 |     - export DEBIAN_FRONTEND=noninteractive
87 |     - if [ ! -d "ccache" ]; then mkdir ccache; fi
88 |     - apt-get update
89 |     - apt-get install -y locales
90 |     - EXTRA_CONFIG_ARGS=""
91 |     - if [ "${UBUNTU_VERSION}" = "20.04" ]; then EXTRA_CONFIG_ARGS="--disable-dependency-tracking"; fi
92 |     - locale-gen en_US.UTF-8
93 |     - dpkg-reconfigure --frontend=noninteractive locales
94 |     - apt-get install -y ccache g++ gcc autoconf automake make libtool pkg-config libpng-dev libturbojpeg0-dev libtiff5-dev zlib1g-dev libwebp-dev libopenjp2-7-dev libgif-dev
95 |     - update-ccache-symlinks
96 |     - export PATH="/usr/lib/ccache:$PATH"
97 |     - ccache -s
98 |   script:  # Builds leptonica, then tesseract, and archives /usr/local as tess-${DOCKER_ARCH}.tar.gz.
99 |     - mkdir -p src/tess
100 |     - mkdir -p src/lept
101 |     - tar -xhzf lept.tar.gz -C src/lept --strip-components=1
102 |     - tar -xhzf tess.tar.gz -C src/tess --strip-components=1
103 |     - cd ${CI_PROJECT_DIR}/src/lept
104 |     - ./autogen.sh
105 |     - ./configure ${EXTRA_CONFIG_ARGS}
106 |     - make
107 |     - make install
108 |     - cd ${CI_PROJECT_DIR}/src/tess
109 |     - ./autogen.sh
110 |     - ./configure --with-extra-libraries=/usr/local/lib ${EXTRA_CONFIG_ARGS}
111 |     - make
112 |     - make install
113 |     - cd ${CI_PROJECT_DIR}
114 |     - cp entrypoint /usr/local/bin/entrypoint
115 |     - cp ubuntu/train-lang /usr/local/bin/train-lang
116 |     - mkdir -p /usr/local/share/tessdata
117 |     - cp eng.traineddata equ.traineddata osd.traineddata /usr/local/share/tessdata
118 |     - chmod -R +x /usr/local/bin
119 |     - (cd /usr/local && tar -czf ${CI_PROJECT_DIR}/tess-${DOCKER_ARCH}.tar.gz *)  # subshell, so tar output is never executed as a command
120 |     - mkdir -p binaries/${UBUNTU_VERSION}
121 |     - mv tess-${DOCKER_ARCH}.tar.gz binaries/${UBUNTU_VERSION}
122 |     - ccache -s
123 |   cache:
124 |     paths:
125 |       - ccache
126 |     key: tess.ubuntu.build.ccache-${DOCKER_ARCH}-${UBUNTU_VERSION}-${TESS_BRANCH}
127 |   artifacts:
128 |     paths:
129 |       - binaries
130 |     when: on_success
131 |     expire_in: 1 day
132 |
133 | build:ubuntu:amd:
134 | extends: .build:ubuntu
135 | parallel:
136 | matrix:
137 | - { UBUNTU_VERSION: "20.04" }
138 | - { UBUNTU_VERSION: "22.04" }
139 | - { UBUNTU_VERSION: "24.04" }
140 | variables:
141 | CCACHE_DIR: "${CI_PROJECT_DIR}/ccache"
142 | DOCKER_ARCH: "amd64"
143 | CCACHE_COMPRESS: "true"
144 | CCACHE_MAXSIZE: "128M"
145 | tags:
146 | - native-amd64
147 |
148 | build:ubuntu:arm:
149 | extends: .build:ubuntu
150 | parallel:
151 | matrix:
152 | - { UBUNTU_VERSION: "20.04" }
153 | - { UBUNTU_VERSION: "22.04" }
154 | - { UBUNTU_VERSION: "24.04" }
155 | variables:
156 | CCACHE_DIR: "${CI_PROJECT_DIR}/ccache"
157 | DOCKER_ARCH: "arm64"
158 | CCACHE_COMPRESS: "true"
159 | CCACHE_MAXSIZE: "128M"
160 | tags:
161 | - native-aarch64
162 |
163 | .containerize:ubuntu:
164 |   needs:
165 |     - build:ubuntu:arm
166 |     - build:ubuntu:amd
167 |   stage: containerize
168 |   image: registry.gitlab.com/jitesoft/dockerfiles/misc:latest
169 |   script:  # VERSION is the pipeline variable passed by the check job's trigger; it feeds the image version labels.
170 |     - mkdir bin  # ubuntu/Dockerfile bind-mounts ./bin and reads tess-${TARGETARCH}.tar.gz from it
171 |     - mv binaries/${UBUNTU_VERSION}/* bin/
172 |     - TAGLIST=$(helper taglist "jitesoft/tesseract-ocr,ghcr.io/jitesoft/tesseract,${CI_REGISTRY_IMAGE},quay.io/jitesoft/tesseract" "${TAGS}")
173 |     - docker buildx build --sbom=true --provenance=true --push ${TAGLIST} --platform "linux/amd64,linux/arm64" --build-arg BUILD_TIME="$(date -Iseconds)" --build-arg UBUNTU_VERSION="${UBUNTU_VERSION}" --build-arg TESSERACT_VERSION="${VERSION}" --build-arg LEPTONICA_VERSION="${LEPT_VERSION}" -f ubuntu/Dockerfile .
174 |   tags: [protected, jitesoft, buildx]
175 |
176 | containerize:ubuntu:5:
177 | extends: .containerize:ubuntu
178 | rules:
179 | - if: '$BUILD && $TESS_BRANCH == "5"'
180 | when: on_success
181 | - when: never
182 | parallel:
183 | matrix:
184 | - { UBUNTU_VERSION: "20.04", TAGS: "20.04,focal" }
185 | - { UBUNTU_VERSION: "22.04", TAGS: "22.04,jammy,lts,latest,5,${VERSION}" }
186 | - { UBUNTU_VERSION: "24.04", TAGS: "5-24.04,5-noble,5-lts,5-latest,5,5-${VERSION}" }
187 |
188 | #endregion
189 |
190 | .build:alpine:
191 |   rules:
192 |     - if: "$BUILD"
193 |       when: on_success
194 |     - when: never
195 |   stage: build
196 |   dependencies:
197 |     - download
198 |   image: registry.gitlab.com/jitesoft/dockerfiles/misc/buildbase/3.20:latest
199 |   before_script:  # DOCKER_ARCH and CCACHE_DIR are supplied by the jobs extending this template.
200 |     - if [ ! -d "ccache" ]; then mkdir ccache; fi
201 |     - export PATH="/usr/lib/ccache/bin:$PATH"
202 |   script:  # Builds leptonica, then tesseract, and archives /usr/local as tess-${DOCKER_ARCH}.tar.gz.
203 |     - mkdir -p src/tess
204 |     - mkdir -p src/lept
205 |     - tar -xhzf lept.tar.gz -C src/lept --strip-components=1
206 |     - tar -xhzf tess.tar.gz -C src/tess --strip-components=1
207 |     - apk add --no-cache pango-dev icu-dev cairo-dev libpng-dev libjpeg-turbo-dev tiff-dev libwebp-dev giflib-dev openjpeg-dev patch
208 |     - |
209 |       if [ "$TESS_BRANCH" = "4" ]; then
210 |         patch src/tess/src/ccutil/ocrclass.h < alpine/tess.patch
211 |       fi
212 |     - cd ${CI_PROJECT_DIR}/src/lept
213 |     - ./autogen.sh
214 |     - ./configure
215 |     - make
216 |     - make install
217 |     - cd ${CI_PROJECT_DIR}/src/tess
218 |     - ./autogen.sh
219 |     - ./configure
220 |     - make
221 |     - make install
222 |     - cd ${CI_PROJECT_DIR}
223 |     - cp alpine/train-lang /usr/local/bin
224 |     - mkdir -p /usr/local/share/tessdata
225 |     - cp eng.traineddata equ.traineddata osd.traineddata /usr/local/share/tessdata
226 |     - cp entrypoint /usr/local/bin
227 |     - chmod -R +x /usr/local/bin  # recursive, matching the ubuntu build; without -R only the directory is changed
228 |     - (cd /usr/local && tar -czf ${CI_PROJECT_DIR}/tess-${DOCKER_ARCH}.tar.gz *)  # subshell, so tar output is never executed as a command
229 |     - mkdir -p binaries
230 |     - mv tess-${DOCKER_ARCH}.tar.gz binaries/
231 |   cache:
232 |     paths:
233 |       - ccache
234 |     key: tess.build.ccache-${DOCKER_ARCH}-${TESS_BRANCH}
235 |   artifacts:
236 |     paths:
237 |       - binaries
238 |     when: on_success
239 |     expire_in: 1 day
240 |
241 | build:alpine:amd:
242 | extends: .build:alpine
243 | variables:
244 | CCACHE_DIR: "${CI_PROJECT_DIR}/ccache"
245 | DOCKER_ARCH: "amd64"
246 | tags:
247 | - native-amd64
248 |
249 | build:alpine:arm:
250 | extends: .build:alpine
251 | variables:
252 | CCACHE_DIR: "${CI_PROJECT_DIR}/ccache"
253 | DOCKER_ARCH: "arm64"
254 | tags:
255 | - native-aarch64
256 |
257 | containerize:alpine:
258 |   rules:
259 |     - if: "$BUILD"
260 |       when: on_success
261 |     - when: never
262 |   needs:
263 |     - build:alpine:arm
264 |     - build:alpine:amd
265 |   variables:  # GL_TAGS go to the GitLab registry's /alpine image, TAGS to the other registries.
266 |     GL_TAGS: "${VERSION},${TESS_BRANCH}"
267 |     TAGS: "${VERSION}-alpine,${TESS_BRANCH}-alpine"
268 |   stage: containerize
269 |   image: registry.gitlab.com/jitesoft/dockerfiles/misc:latest
270 |   before_script:  # Branch 5 additionally receives the floating "latest" style tags.
271 |     - |
272 |       if [ "${TESS_BRANCH}" = "5" ]; then
273 |         GL_TAGS="${GL_TAGS},latest"
274 |         TAGS="${TAGS},latest-alpine,alpine"
275 |       fi
276 |   script:  # VERSION is the pipeline variable passed by the check job's trigger; it feeds the image version labels.
277 |     - TAGLIST=$(helper taglist "jitesoft/tesseract-ocr,ghcr.io/jitesoft/tesseract,quay.io/jitesoft/tesseract" "${TAGS}")
278 |     - GL_TAGLIST=$(helper taglist "${CI_REGISTRY_IMAGE}/alpine" "${GL_TAGS}")
279 |     - docker buildx build --sbom=true --provenance=true --push ${TAGLIST} ${GL_TAGLIST} --platform linux/amd64,linux/arm64 --build-arg BUILD_TIME="$(date -Iseconds)" --build-arg TESSERACT_VERSION="${VERSION}" --build-arg LEPTONICA_VERSION="${LEPT_VERSION}" -f alpine/Dockerfile .
280 |   tags: [protected, jitesoft, buildx]
281 |
282 | scan:ubuntu:5:
283 |   extends: .container_scanning
284 |   rules:
285 |     - if: '$SCAN && $TESS_BRANCH == "5"'
286 |       when: always
287 |     - if: '$BUILD && $TESS_BRANCH == "5"'
288 |       when: on_success
289 |     - when: never
290 |   needs:
291 |     # optional: containerize:ubuntu:5 only exists in $BUILD pipelines, not in $SCAN-only pipelines.
292 |     - { job: "containerize:ubuntu:5", artifacts: false, optional: true }
293 |   parallel:
294 |     matrix:
295 |       - { SCANNING_IMAGE_NAME: "${CI_REGISTRY_IMAGE}:20.04", GIT_STRATEGY: "none" }
296 |       - { SCANNING_IMAGE_NAME: "${CI_REGISTRY_IMAGE}:22.04", GIT_STRATEGY: "none" }
297 |       - { SCANNING_IMAGE_NAME: "${CI_REGISTRY_IMAGE}:5-24.04", GIT_STRATEGY: "none" }  # 24.04 is pushed as 5-24.04 by containerize:ubuntu:5
298 |
299 | scan:alpine:5:
300 |   needs:
301 |     # optional: containerize:alpine only exists in $BUILD pipelines, not in $SCAN-only pipelines.
302 |     - { job: "containerize:alpine", artifacts: false, optional: true }
303 |   extends: .container_scanning
304 |   rules:
305 |     - if: '$SCAN && $TESS_BRANCH == "5"'
306 |       when: always
307 |     - if: '$BUILD && $TESS_BRANCH == "5"'
308 |       when: on_success
309 |     - when: never
310 |   variables:
311 |     GIT_STRATEGY: none
312 |     SCANNING_IMAGE_NAME: "${CI_REGISTRY_IMAGE}/alpine:5"
313 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Jitesoft
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Tesseract OCR.
2 |
3 | [](https://hub.docker.com/r/jitesoft/tesseract-ocr)
4 | [](https://opencollective.com/jitesoft-open-source)
5 |
6 | [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) - Ubuntu and Alpine linux images.
7 |
8 | Tesseract and Leptonica are both built from source for each platform and distro;
9 | supported platforms are amd64 (x86_64) and arm64 (aarch64).
10 |
11 | ## Tags
12 |
13 | Tags indicate the OS version (or the name, in the case of Alpine). Images with the `4-` prefix use
14 | tesseract version 4, while images without the prefix use version 5.
15 |
16 | All versions use the same training data.
17 |
18 | Images can be found at:
19 |
20 | * [Docker hub](https://hub.docker.com/r/jitesoft/tesseract-ocr): `jitesoft/tesseract-ocr`
21 | * [GitLab](https://gitlab.com/jitesoft/dockerfiles/tesseract): `registry.gitlab.com/jitesoft/dockerfiles/tesseract`
22 | * [GitHub](https://github.com/orgs/jitesoft/packages/container/package/tesseract): `ghcr.io/jitesoft/tesseract`
23 | * [Quay](https://quay.io/jitesoft/tesseract): `quay.io/jitesoft/tesseract`
24 |
25 | ## Dockerfile
26 |
27 | Dockerfile can be found at [GitLab](https://gitlab.com/jitesoft/dockerfiles/tesseract) or [GitHub](https://github.com/jitesoft/docker-tesseract-ocr)
28 |
29 | ## Training and languages
30 |
31 | The default image has the English training data installed from the start. The training data used is the "fast" data, which parses quicker but not at the best quality.
32 | It's possible to add another language by invoking the `train-lang` script, followed by the language code (ISO 639-2: `eng`, `swe`, etc.). If you wish to use `fast` or `best`, add that as an optional parameter after the language code (`train-lang eng --fast`); otherwise the standard data is used.
33 | The above could easily be done in a derived image:
34 |
35 | ```dockerfile
36 | FROM jitesoft/tesseract-ocr
37 | RUN train-lang bul --fast
38 | ```
39 |
40 | The languages are downloaded from the official tesseract tessdata repositories.
41 |
42 | For a full list of supported languages check the following links:
43 |
44 | https://github.com/tesseract-ocr/tessdata
45 | https://github.com/tesseract-ocr/tessdata_best
46 | https://github.com/tesseract-ocr/tessdata_fast
47 |
48 | It is also possible to just copy a traineddata file to the `/usr/local/share/tessdata` directory of the container (both the Ubuntu and the Alpine image set `TESSDATA_PREFIX` to this path).
49 |
50 | ## Example execution
51 |
52 | ```bash
53 | docker pull jitesoft/tesseract-ocr
54 | docker run -v /path/to/image/img.jpg:/tmp/img.jpg jitesoft/tesseract-ocr /tmp/img.jpg stdout
55 | ```
56 |
57 | Use a high-DPI image for the best result. A higher DPI does increase the processing time, though.
58 |
59 | ### Image labels
60 |
61 | This image follows the [Jitesoft image label specification 1.0.0](https://gitlab.com/snippets/1866155).
62 |
63 | ## Licenses
64 |
65 | The images and scripts in the repository are released under the [MIT license](https://gitlab.com/jitesoft/dockerfiles/tesseract/blob/master/LICENSE).
66 | Tesseract is released under the [Apache License v2](https://github.com/tesseract-ocr/tesseract/blob/master/LICENSE)
67 |
68 | Notice: The tesseract source has been modified with a patch (`alpine/tess.patch`) to allow compilation on Alpine Linux.
69 |
70 |
71 | ### Sponsors
72 |
73 | Jitesoft images are built via GitLab CI on runners hosted by the following wonderful organisations:
74 |
75 |
76 |
77 |
78 |
79 | _The companies above are not affiliated with Jitesoft or any Jitesoft Projects directly._
80 |
81 | ---
82 |
83 | Sponsoring is vital for the further development and maintaining of open source.
84 | Questions and sponsoring queries can be made by email.
85 | If you wish to sponsor our projects, reach out to the email above or visit any of the following sites:
86 |
87 | [Open Collective](https://opencollective.com/jitesoft-open-source)
88 | [GitHub Sponsors](https://github.com/sponsors/jitesoft)
89 | [Patreon](https://www.patreon.com/jitesoft)
90 |
--------------------------------------------------------------------------------
/alpine/Dockerfile:
--------------------------------------------------------------------------------
1 | # syntax=docker/dockerfile:1
2 | FROM registry.gitlab.com/jitesoft/dockerfiles/alpine:3.20
3 | ARG TESSERACT_VERSION
4 | ARG LEPTONICA_VERSION
5 | ARG BUILD_TIME
6 |
7 | LABEL maintainer="Johannes Tegnér " \
8 |       maintainer.org="Jitesoft" \
9 |       maintainer.org.uri="https://jitesoft.com" \
10 |       com.jitesoft.project.repo.type="git" \
11 |       com.jitesoft.project.repo.uri="https://gitlab.com/jitesoft/dockerfiles/tesseract" \
12 |       com.jitesoft.project.repo.issues="https://gitlab.com/jitesoft/dockerfiles/tesseract/issues" \
13 |       com.jitesoft.project.registry.uri="registry.gitlab.com/jitesoft/dockerfiles/tesseract" \
14 |       com.jitesoft.app.tesseract.version="${TESSERACT_VERSION}" \
15 |       com.jitesoft.app.leptonica.version="${LEPTONICA_VERSION}" \
16 |       # Open container labels
17 |       org.opencontainers.image.version="${TESSERACT_VERSION}" \
18 |       org.opencontainers.image.created="${BUILD_TIME}" \
19 |       org.opencontainers.image.description="Tesseract OCR on alpine" \
20 |       org.opencontainers.image.vendor="Jitesoft" \
21 |       org.opencontainers.image.source="https://gitlab.com/jitesoft/dockerfiles/tesseract" \
22 |       # Artifact hub annotations
23 |       io.artifacthub.package.alternative-locations="oci://index.docker.io/jitesoft/tesseract-ocr,oci://ghcr.io/jitesoft/tesseract,oci://quay.io/jitesoft/tesseract" \
24 |       io.artifacthub.package.readme-url="https://gitlab.com/jitesoft/dockerfiles/tesseract/-/raw/master/README.md" \
25 |       io.artifacthub.package.logo-url="https://jitesoft.com/favicon-96x96.png"
26 |
27 | ARG TESSERACT_VERSION
28 | ARG LEPTONICA_VERSION
29 | ARG TARGETARCH
30 |
31 | ENV TESSDATA_PREFIX="/usr/local/share/tessdata"
32 | # The build context must contain ./binaries/tess-${TARGETARCH}.tar.gz; it is unpacked into /usr/local.
33 | RUN --mount=type=bind,source=./binaries,target=/tmp/tess \
34 |     apk add --no-cache --virtual .extract tar \
35 |  && addgroup -g 472 -S tesseract 2>&1 \
36 |  && adduser -u 472 -D -S -G tesseract tesseract \
37 |  && tar -xzhf /tmp/tess/tess-${TARGETARCH}.tar.gz -C /usr/local \
38 |  && apk del .extract \
39 |  && apk add --no-cache --virtual .runtime-deps libpng libjpeg-turbo tiff zlib libwebp giflib openjpeg libstdc++ libgomp \
40 |  && chmod -R +x /usr/local/bin \
41 |  && chown -R tesseract:tesseract /usr/local/share \
42 |  && rm -f /usr/local/lib/*.a
43 |
44 | USER tesseract
45 | ENTRYPOINT ["entrypoint"]
46 | CMD ["--version"]
47 |
48 |
--------------------------------------------------------------------------------
/alpine/tess.patch:
--------------------------------------------------------------------------------
1 | --- ocrclass.h 2020-01-10 12:26:33.307224500 +0100
2 | +++ ocrclass.h 2020-01-10 12:26:58.748584300 +0100
3 | @@ -28,6 +28,7 @@
4 |
5 | #include
6 | #include
7 | +#include
8 | #ifdef _WIN32
9 | #include // for timeval
10 | #endif
11 |
--------------------------------------------------------------------------------
/alpine/train-lang:
--------------------------------------------------------------------------------
1 | #!/bin/ash
2 |
3 | LANGUAGE_CODE="${1}"
4 | TYPE=""
5 | while [[ $# -gt 0 ]]; do
6 | case "$1" in
7 | --fast)
8 | TYPE="_fast"
9 | echo "Fetching trained data with 'fast' version."
10 | break;;
11 | --best)
12 | TYPE="_best"
13 | echo "Fetching trained data with 'best' version."
14 | break;;
15 | *)
16 | echo "Fetching standard trained data."
17 | break;;
18 | esac
19 | done
20 |
21 | echo "Trying to install tesseract trained data for language with code: ${LANGUAGE_CODE}"
22 | URI="https://github.com/tesseract-ocr/tessdata${TYPE}/raw/main/${LANGUAGE_CODE}.traineddata"
23 | apk add --no-cache wget
24 |
25 | if wget --spider "${URI}" >/dev/null 2>&1; then
26 | wget -O "${TESSDATA_PREFIX}/${LANGUAGE_CODE}.traineddata" "${URI}"
27 | else
28 | echo "Invalid language code."
29 | fi
30 | apk del wget
31 |
--------------------------------------------------------------------------------
/artifacthub-repo.yml:
--------------------------------------------------------------------------------
1 | repositoryID: f65fc590-511e-4b07-bbf5-e3be8d6cc7a5
2 | owners:
3 | - name: Johannes Tegnér
4 | email: johannes@jitesoft.com
5 |
--------------------------------------------------------------------------------
/entrypoint:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | if [ "${1}" = "train-lang" ]; then
4 | exec "$@"
5 | else
6 | exec tesseract "$@"
7 | fi
8 |
--------------------------------------------------------------------------------
/ubuntu/Dockerfile:
--------------------------------------------------------------------------------
1 | # syntax=docker/dockerfile:1
2 | ARG UBUNTU_VERSION
3 | FROM registry.gitlab.com/jitesoft/dockerfiles/ubuntu:${UBUNTU_VERSION}
4 | ARG TESSERACT_VERSION
5 | ARG LEPTONICA_VERSION
6 | ARG BUILD_TIME
7 |
8 | LABEL maintainer="Johannes Tegnér " \
9 |       maintainer.org="Jitesoft" \
10 |       maintainer.org.uri="https://jitesoft.com" \
11 |       com.jitesoft.project.repo.type="git" \
12 |       com.jitesoft.project.repo.uri="https://gitlab.com/jitesoft/dockerfiles/tesseract" \
13 |       com.jitesoft.project.repo.issues="https://gitlab.com/jitesoft/dockerfiles/tesseract/issues" \
14 |       com.jitesoft.project.registry.uri="registry.gitlab.com/jitesoft/dockerfiles/tesseract" \
15 |       com.jitesoft.app.tesseract.version="${TESSERACT_VERSION}" \
16 |       com.jitesoft.app.leptonica.version="${LEPTONICA_VERSION}" \
17 |       # Open container labels
18 |       org.opencontainers.image.version="${TESSERACT_VERSION}" \
19 |       org.opencontainers.image.created="${BUILD_TIME}" \
20 |       org.opencontainers.image.description="Tesseract OCR on ubuntu" \
21 |       org.opencontainers.image.vendor="Jitesoft" \
22 |       org.opencontainers.image.source="https://gitlab.com/jitesoft/dockerfiles/tesseract" \
23 |       # Artifact hub annotations
24 |       io.artifacthub.package.alternative-locations="oci://index.docker.io/jitesoft/tesseract-ocr,oci://ghcr.io/jitesoft/tesseract,oci://quay.io/jitesoft/tesseract" \
25 |       io.artifacthub.package.readme-url="https://gitlab.com/jitesoft/dockerfiles/tesseract/-/raw/master/README.md" \
26 |       io.artifacthub.package.logo-url="https://jitesoft.com/favicon-96x96.png"
27 |
28 | ARG TESSERACT_VERSION
29 | ARG LEPTONICA_VERSION
30 | ARG TARGETARCH
31 | ARG UBUNTU_VERSION
32 |
33 | ENV TESSDATA_PREFIX="/usr/local/share/tessdata"
34 | # The build context must contain ./bin/tess-${TARGETARCH}.tar.gz; runtime library names differ per Ubuntu release.
35 | RUN --mount=type=bind,source=./bin,target=/tmp/tess \
36 |     tar -xzhf /tmp/tess/tess-${TARGETARCH}.tar.gz -C /usr/local \
37 |  && groupadd -g 472 -r tesseract \
38 |  && useradd -u 472 -r -g tesseract tesseract \
39 |  && apt-get update \
40 |  && if [ "${UBUNTU_VERSION}" = "24.04" ]; then apt-get -y install libgomp1 libgif7 libwebpmux3 libwebp7 libopenjp2-7 libpng16-16 libjpeg9 libtiff6 zlib1g wget; fi \
41 |  && if [ "${UBUNTU_VERSION}" = "22.04" ]; then apt-get -y install libgomp1 libgif7 libwebpmux3 libwebp7 libopenjp2-7 libpng16-16 libjpeg9 libtiff5 zlib1g wget; fi \
42 |  && if [ "${UBUNTU_VERSION}" = "20.04" ]; then apt-get -y install libgomp1 libgif7 libwebpmux3 libwebp6 libopenjp2-7 libpng16-16 libjpeg9 libtiff5 zlib1g wget; fi \
43 |  && chmod -R +x /usr/local/bin \
44 |  && chown -R tesseract:tesseract /usr/local/share \
45 |  && rm -f /usr/local/lib/*.a \
46 |  && apt-get autoremove -y \
47 |  && apt-get clean -y \
48 |  && rm -rf /var/lib/apt/lists/*
49 |
50 | USER tesseract
51 | ENTRYPOINT ["entrypoint"]
52 | CMD ["--version"]
--------------------------------------------------------------------------------
/ubuntu/train-lang:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | LANGUAGE_CODE="${1}"
4 | TYPE=""
5 | while [[ $# -gt 0 ]]; do
6 | case "$1" in
7 | --fast)
8 | TYPE="_fast"
9 | echo "Fetching trained data with 'fast' version."
10 | break;;
11 | --best)
12 | TYPE="_best"
13 | echo "Fetching trained data with 'best' version."
14 | break;;
15 | *)
16 | echo "Fetching standard trained data."
17 | break;;
18 | esac
19 | done
20 |
21 | echo "Trying to install tesseract trained data for language with code: ${LANGUAGE_CODE}"
22 |
23 | URI="https://github.com/tesseract-ocr/tessdata${TYPE}/raw/main/${LANGUAGE_CODE}.traineddata"
24 | if wget --spider "${URI}" >/dev/null 2>&1; then
25 | wget -O "${TESSDATA_PREFIX}/${LANGUAGE_CODE}.traineddata" "${URI}"
26 | else
27 | echo "Invalid language code."
28 | fi
29 |
--------------------------------------------------------------------------------