├── .dockerignore ├── .github └── workflows │ ├── docker-image.yml │ └── python-app.yml ├── .gitignore ├── .pylintrc ├── CHANGELOG.md ├── Dockerfile ├── MANIFEST.in ├── Makefile ├── README.md ├── ocrd-tool.json ├── ocrd_detectron2 ├── __init__.py ├── cli.py ├── ocrd-tool.json ├── presets_DocBank_X101.json ├── presets_DocBank_X101_page.json ├── presets_Jambo-sudo_X101.json ├── presets_Math_R50.json ├── presets_NewspaperNavigator_R50.json ├── presets_PRImALayout_R50.json ├── presets_PubLayNet_R101.json ├── presets_PubLayNet_R101_JPLeoRX.json ├── presets_PubLayNet_R50.json ├── presets_PubLayNet_R50_JPLeoRX.json ├── presets_PubLayNet_X101.json ├── presets_TableBank_X152.json ├── presets_TableBank_X152_Psarpei.json └── segment.py ├── pyproject.toml ├── requirements-test.txt └── requirements.txt /.dockerignore: -------------------------------------------------------------------------------- 1 | test 2 | repo 3 | dist 4 | build 5 | *.egg-info 6 | *.whl 7 | -------------------------------------------------------------------------------- /.github/workflows/docker-image.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image CI 2 | 3 | on: 4 | push: 5 | branches: [ "master" ] 6 | workflow_dispatch: 7 | 8 | env: 9 | DOCKER_TAGNAME: ocrd/detectron2 10 | 11 | jobs: 12 | 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | permissions: 17 | packages: write 18 | contents: read 19 | 20 | steps: 21 | - uses: actions/checkout@v3 22 | - # Activate cache export feature to reduce build time of image 23 | name: Set up Docker Buildx 24 | uses: docker/setup-buildx-action@v2 25 | - name: Build the Docker image 26 | run: make docker DOCKER_TAG=${{ env.DOCKER_TAGNAME }} 27 | - name: Login to Dockerhub 28 | uses: docker/login-action@v2 29 | with: 30 | username: ${{ vars.DOCKERHUB_USERNAME }} 31 | password: ${{ secrets.DOCKERHUB_PASSWORD }} 32 | - name: Push image to Dockerhub 33 | run: docker push ${{ env.DOCKER_TAGNAME }} 34 | - name: Alias 
the Docker image for GHCR 35 | run: docker tag ${{ env.DOCKER_TAGNAME }} ghcr.io/bertsky/ocrd_detectron2 36 | - name: Login to GitHub Container Registry 37 | uses: docker/login-action@v2 38 | with: 39 | registry: ghcr.io 40 | username: ${{ github.actor }} 41 | password: ${{ secrets.GITHUB_TOKEN }} 42 | - name: Push image to Github Container Registry 43 | run: docker push ghcr.io/bertsky/ocrd_detectron2 44 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: CLI Tests 5 | 6 | on: 7 | push: 8 | branches: [ "master" ] 9 | pull_request: 10 | workflow_dispatch: 11 | inputs: 12 | upterm-session: 13 | description: 'Run SSH login server for debugging' 14 | default: False 15 | type: boolean 16 | 17 | jobs: 18 | build: 19 | 20 | runs-on: ubuntu-latest 21 | strategy: 22 | matrix: 23 | python-version: ['3.8', '3.9', '3.10'] 24 | 25 | steps: 26 | - uses: actions/checkout@v4 27 | - name: Set up Python 28 | uses: actions/setup-python@v5 29 | with: 30 | python-version: ${{ matrix.python-version }} 31 | - name: Lint with flake8 32 | run: | 33 | pip install flake8 34 | # stop the build if there are Python syntax errors or undefined names 35 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 36 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 37 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 38 | - name: Setup upterm session 39 | # interactive SSH logins for debugging 40 | if: github.event.inputs.upterm-session == 'true' 41 | uses: lhotari/action-upterm@v1 42 | - name: Install dependencies 43 | run: make deps 44 | - name: Install package 45 | run: make install 46 | - name: Cache models 47 | uses: actions/cache@v4 48 | with: 49 | key: detectron-models 50 | path: /home/runner/.local/share/ocrd-resources/ocrd-detectron2-segment/* 51 | - name: Install dependencies for test 52 | # also downloads models, if not already present 53 | run: make deps-test 54 | - name: Run tests 55 | run: make test 56 | - name: Upload test results 57 | if: matrix.python-version == '3.8' 58 | uses: actions/upload-artifact@v4 59 | with: 60 | name: test-results 61 | path: | 62 | ./test/assets/*/data/test-result 63 | ./test/assets/*/data/OCR-D-SEG-*/ 64 | if-no-files-found: error 65 | 66 | publish: 67 | permissions: 68 | # for stefanzweifel/git-auto-commit-action to push code on gh-pages 69 | contents: write 70 | needs: build 71 | runs-on: ubuntu-latest 72 | continue-on-error: true 73 | steps: 74 | - name: Checkout GH Pages 75 | uses: actions/checkout@v4 76 | with: 77 | ref: gh-pages 78 | - name: Download Artifact 79 | uses: actions/download-artifact@v4 80 | with: 81 | name: test-results 82 | path: test-results 83 | - name: Data Ingest 84 | run: bash gen-test-results-table.sh 85 | - name: Commit 86 | uses: stefanzweifel/git-auto-commit-action@v4 87 | with: 88 | commit_message: new test results from ${{ github.sha }} 89 | branch: gh-pages 90 | # file_pattern: test-results* 91 | repository: . 
92 | 93 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Distribution / packaging 7 | .Python 8 | build/ 9 | develop-eggs/ 10 | dist/ 11 | downloads/ 12 | eggs/ 13 | .eggs/ 14 | lib/ 15 | lib64/ 16 | parts/ 17 | sdist/ 18 | var/ 19 | wheels/ 20 | *.egg-info/ 21 | .installed.cfg 22 | *.egg 23 | MANIFEST 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .coverage.* 40 | .cache 41 | nosetests.xml 42 | coverage.xml 43 | *.cover 44 | .hypothesis/ 45 | .pytest_cache/ 46 | 47 | # vim tmp 48 | *.swp 49 | *.swo 50 | 51 | # emacs bkup 52 | *~ 53 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | extension-pkg-whitelist=cv2 3 | 4 | [MESSAGES CONTROL] 5 | disable = 6 | ungrouped-imports, 7 | bad-continuation, 8 | missing-docstring, 9 | no-self-use, 10 | superfluous-parens, 11 | invalid-name, 12 | line-too-long, 13 | too-many-arguments, 14 | too-many-branches, 15 | too-many-statements, 16 | too-many-locals, 17 | too-few-public-methods, 18 | too-many-nested-blocks, 19 | wrong-import-order, 20 | duplicate-code 21 | 22 | # allow non-snake-case identifiers: 23 | good-names=n,i 24 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to 
this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | ## [Unreleased] 8 | 9 | ## [0.2.0] - 2025-04-08 10 | 11 | ### Changed 12 | - updated to OCR-D v3 API 13 | - switched from `setup.py` to `pyproject.toml` 14 | (and `ocrd-tool.json` based versioning) 15 | - updated Dockerfile (base version, variables, labels, `ocrd-all-tool.json`) 16 | - updated CI 17 | 18 | ## [0.1.8] - 2023-06-29 19 | ### Fixed 20 | - workarounds for broken models (DocBank_X101, Jambo-sudo_X101) 21 | - `make deps`: add explicit reqs prior to pip step with Torch index 22 | - set `pc:PcGts/@pcGtsId` from `mets:file/@ID` 23 | 24 | ### Added 25 | - CI for CLI tests (with cached models and stored result artifacts) 26 | 27 | ### Changed 28 | - migrated model URLs from external to Github release assets 29 | 30 | ## [0.1.7] - 2023-03-20 31 | ### Fixed 32 | - adapt to Numpy 1.24 (no `np.bool`) 33 | 34 | ### Added 35 | - model by Jambo-sudo (PubLayNet+custom GT) 36 | - model by LayoutParser (PRImA Layout GT) 37 | - CLI tests 38 | 39 | ## [0.1.6] - 2023-03-10 40 | ### Fixed 41 | - avoid colon in generated region IDs 42 | - `make deps`: add explicit deps for torch 43 | - fix/update file resources 44 | - fix model config base paths on-the-fly 45 | 46 | ### Added 47 | - add Psarpei TD model 48 | 49 | ## [0.1.5] - 2023-01-15 50 | ### Fixed 51 | - param `debug_img`: 1 image per page 52 | - URLs/specs for PubLayNet/JPLeoRX models 53 | 54 | ## [0.1.4] - 2022-12-02 55 | ### Added 56 | - param `postprocessing` (select steps, including `none`) 57 | - param `debug_img` (styles to visualise raw predictions, including `none`) 58 | 59 | ## [0.1.3] - 2022-11-02 60 | ### Fixed 61 | - `make deps`: fall back to Detectron2 src build 62 | 63 | ### Changed 64 | - added various models as file resources 65 | - added corresponding preset files 66 | - 
updated documentation 67 | 68 | ## [0.1.2] - 2022-10-27 69 | ### Fixed 70 | - `make deps`: fix CUDA detection even more 71 | - apply `device` param as passed 72 | 73 | ### Changed 74 | - downscale images to no more than 150 DPI for prediction (for speed) 75 | - add param `operation_level` (default `page`), add `table` mode 76 | 77 | ## [0.1.1] - 2022-02-02 78 | ### Fixed 79 | - `make deps`: fix CUDA detection and allow CPU as fallback 80 | 81 | ### Changed 82 | - instance segmentation postprocessing: use asymmetric overlap 83 | criterion for non-maximum suppression 84 | - skip instances which belong to classes with empty category 85 | - annotate incrementally (by skipping candidates that overlap 86 | with pre-existing top-level regions) 87 | 88 | ## [0.1.0] - 2022-01-21 89 | 90 | 91 | [0.1.0]: ../../compare/aeca7e37...v0.1.0 92 | [0.1.1]: ../../compare/v0.1.0...v0.1.1 93 | [0.1.2]: ../../compare/v0.1.1...v0.1.2 94 | [0.1.3]: ../../compare/v0.1.2...v0.1.3 95 | [0.1.4]: ../../compare/v0.1.3...v0.1.4 96 | [0.1.5]: ../../compare/v0.1.4...v0.1.5 97 | [0.1.6]: ../../compare/v0.1.5...v0.1.6 98 | [0.1.7]: ../../compare/v0.1.6...v0.1.7 99 | [0.1.8]: ../../compare/v0.1.7...v0.1.8 100 | [0.2.0]: ../../compare/v0.1.8...v0.2.0 101 | [unreleased]: ../../compare/v0.2.0...master 102 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG DOCKER_BASE_IMAGE 2 | FROM $DOCKER_BASE_IMAGE 3 | ARG VCS_REF 4 | ARG BUILD_DATE 5 | LABEL \ 6 | maintainer="https://ocr-d.de/en/contact" \ 7 | org.label-schema.vcs-ref=$VCS_REF \ 8 | org.label-schema.vcs-url="https://github.com/bertsky/ocrd_detectron2" \ 9 | org.label-schema.build-date=$BUILD_DATE \ 10 | org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \ 11 | org.opencontainers.image.title="ocrd_detectron2" \ 12 | org.opencontainers.image.description="OCR-D wrapper 
for detectron2 based segmentation models" \ 13 | org.opencontainers.image.source="https://github.com/bertsky/ocrd_detectron2" \ 14 | org.opencontainers.image.documentation="https://github.com/bertsky/ocrd_detectron2/blob/${VCS_REF}/README.md" \ 15 | org.opencontainers.image.revision=$VCS_REF \ 16 | org.opencontainers.image.created=$BUILD_DATE \ 17 | org.opencontainers.image.base.name=ocrd/core-cuda-torch 18 | 19 | ENV DEBIAN_FRONTEND=noninteractive 20 | ENV PYTHONIOENCODING=utf8 21 | ENV LANG=C.UTF-8 22 | ENV LC_ALL=C.UTF-8 23 | 24 | # avoid HOME/.local/share (hard to predict USER here) 25 | # so let XDG_DATA_HOME coincide with fixed system location 26 | # (can still be overridden by derived stages) 27 | ENV XDG_DATA_HOME /usr/local/share 28 | # avoid the need for an extra volume for persistent resource user db 29 | # (i.e. XDG_CONFIG_HOME/ocrd/resources.yml) 30 | ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources 31 | 32 | WORKDIR /build/ocrd_detectron2 33 | 34 | COPY . . 35 | COPY ocrd-tool.json . 
36 | # prepackage ocrd-tool.json as ocrd-all-tool.json 37 | RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json 38 | # prepackage ocrd-all-module-dir.json 39 | RUN ocrd ocrd-tool ocrd-tool.json dump-module-dirs > $(dirname $(ocrd bashlib filename))/ocrd-all-module-dir.json 40 | # install everything and reduce image size 41 | RUN apt-get install -y --no-install-recommends g++ && \ 42 | make deps && \ 43 | make install && \ 44 | rm -rf /build/ocrd_detectron2 && \ 45 | apt-get -y remove --auto-remove g++ 46 | 47 | WORKDIR /data 48 | VOLUME /data 49 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include ocrd-tool.json 2 | include README.md 3 | include requirements.txt 4 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PYTHON = python3 2 | PIP = pip3 3 | PYTHONIOENCODING=utf8 4 | SHELL = /bin/bash 5 | 6 | # Docker container tag 7 | DOCKER_BASE_IMAGE ?= docker.io/ocrd/core-cuda-torch:latest 8 | DOCKER_TAG ?= 'ocrd/detectron2' 9 | DOCKER ?= docker 10 | 11 | help: 12 | @echo 13 | @echo " Targets" 14 | @echo 15 | @echo " deps Install only Python dependencies via pip" 16 | @echo " install Install full Python package via pip" 17 | @echo " install-dev Install full Python package via pip" 18 | @echo " deps-test Install Python dependencies for tests via pip and models via resmgr" 19 | @echo " test Run regression tests" 20 | @echo " build Build Python package as source and wheel distribution" 21 | @echo " clean Remove symlinks in test/assets" 22 | @echo " docker Build Docker image" 23 | @echo 24 | @echo " Variables" 25 | @echo " PYTHON" 26 | @echo " CUDA_VERSION override detection of CUDA runtime version (e.g. 
'11.3' or 'CPU')" 27 | @echo " DOCKER_TAG Docker image tag of result for the docker target" 28 | 29 | # Install Python deps via pip 30 | # There is no prebuilt for detectron2 on PyPI, and the public wheels depend on CUDA and Torch version. 31 | # See https://github.com/facebookresearch/detectron2/blob/main/INSTALL.md#install-pre-built-detectron2 32 | # and https://github.com/facebookresearch/detectron2/issues/969 33 | # While there is a web site which lists them, which works with `pip install -f`, this unfortunately cannot 34 | # be encapsulated via setuptools, see https://github.com/pypa/pip/issues/5898 35 | # and https://stackoverflow.com/questions/3472430/how-can-i-make-setuptools-install-a-package-thats-not-on-pypi 36 | # and https://github.com/pypa/pip/issues/4187 37 | # Detectron2 requires Torch >=1.10 and <1.11, which is quite out of date now. 38 | # Also, the prebuilt versions on https://dl.fbaipublicfiles.com/detectron2/wheels/*/torch1.10/index.html 39 | # are only available for CUDA 10.1, 10.2, 11.1, 11.3 or CPU. 40 | # Moreover, even Torch >=1.10 and <1.11 is not available on https://download.pytorch.org/whl/torch/ 41 | # except for a narrow few CUDA versions. 42 | # To make matters worse, Detectron2 setup fails specifying Torch as build-time and run-time dependency: 43 | # https://github.com/facebookresearch/detectron2/issues/4472 44 | # Therefore, source build of Detectron2 fails unless Torch is already installed before _and_ using 45 | # pip install --no-build-isolation. 46 | # Finally, due to https://github.com/pypa/pip/issues/4321, we cannot even mix -f links and pkgindex (for PyTorch versions) 47 | # because pip will (more or less) randomly pick the one or the other. 48 | # Detectron2 must always have the same version of Torch at runtime which it was compiled against. 
49 | deps: 50 | @$(PIP) install -r <(sed "/torch/d;/detectron2/d" requirements.txt) 51 | @if test -n "$$CUDA_VERSION"; then :; \ 52 | elif test -s /usr/local/cuda/version.txt; then \ 53 | CUDA_VERSION=$$(sed 's/^.* //;s/\([0-9]\+[.][0-9]\).*/\1/' /usr/local/cuda/version.txt); \ 54 | elif command -v nvcc &>/dev/null; then \ 55 | CUDA_VERSION=$$(nvcc --version | sed -n '/^Cuda/{s/.* release //;s/,.*//;p;}'); \ 56 | elif command -v nvidia-smi &>/dev/null; then \ 57 | CUDA_VERSION=$$(nvidia-smi | sed -n '/CUDA Version/{s/.*CUDA Version: //;s/ .*//;p;}'); \ 58 | elif command -v pkg-config &>/dev/null; then \ 59 | CUDA_VERSION=$$(pkg-config --list-all | sed -n '/^cudart/{s/cudart-//;s/ .*//;p;q;}'); \ 60 | fi && \ 61 | if test -z "$$CUDA_VERSION"; then \ 62 | echo "Cannot find CUDA runtime library, assuming CPU-only"; CUDA_VERSION=CPU; \ 63 | fi && echo "Detected CUDA version: $$CUDA_VERSION" && \ 64 | if test "$$CUDA_VERSION" = CPU; then CUDA=cpu; \ 65 | else IFS=. CUDA=($$CUDA_VERSION) && CUDA=cu$${CUDA[0]}$${CUDA[1]}; \ 66 | fi && \ 67 | $(PIP) install -i "https://download.pytorch.org/whl/$$CUDA" \ 68 | -r <(sed -n "/torch/p" requirements.txt) && \ 69 | $(PIP) install --no-build-isolation "git+https://github.com/facebookresearch/detectron2#egg=detectron2" 70 | 71 | # Install Python package via pip 72 | install: deps 73 | $(PIP) install . 74 | 75 | # Install Python package via pip 76 | install-dev: deps 77 | $(PIP) install -e . 78 | 79 | # Install testing python deps via pip 80 | deps-test: models-test 81 | $(PIP) install -r requirements-test.txt 82 | 83 | build: 84 | $(PIP) install build 85 | $(PYTHON) -m build . 
86 | 87 | # Clone OCR-D/assets to ./repo/assets 88 | repo/assets: 89 | @mkdir -p $(@D) 90 | git clone https://github.com/OCR-D/assets $@ 91 | 92 | # Setup test data 93 | test/assets: repo/assets 94 | @mkdir -p $@ 95 | cp -r -t $@ repo/assets/data/* 96 | 97 | # Remove test data copies and intermediate results 98 | clean: 99 | -$(RM) -r test/assets 100 | 101 | # Build docker image 102 | docker: 103 | $(DOCKER) build \ 104 | --build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \ 105 | --build-arg VCS_REF=$$(git rev-parse --short HEAD) \ 106 | --build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ 107 | -t $(DOCKER_TAG) . 108 | 109 | #MODELDIR := $(or $(XDG_DATA_HOME),$(HOME)/.local/share)/ocrd-resources/ocrd-detectron2-segment 110 | 111 | TESTMODEL := TableBank_X152_Psarpei 112 | TESTMODEL += DocBank_X101 113 | TESTMODEL += Jambo-sudo_X101 114 | TESTMODEL += PRImALayout_R50 115 | 116 | TESTBED := gutachten 117 | TESTBED += column-samples 118 | 119 | models-test: $(TESTMODEL:=.yaml) 120 | models-test: $(TESTMODEL:=.pth) 121 | 122 | %.yaml: 123 | ocrd resmgr download ocrd-detectron2-segment $@ 124 | %.pth: 125 | ocrd resmgr download ocrd-detectron2-segment $@ 126 | 127 | test: $(patsubst %,test/assets/%/data/test-result,$(TESTBED)) 128 | @cat $^ 129 | 130 | count-regions := python -c "import sys; from ocrd_models.ocrd_page import parse; print('%s: %d' % (sys.argv[1], len(parse(sys.argv[1], silence=True).get_Page().get_AllRegions())))" 131 | 132 | %/test-result: test/assets 133 | for MODEL in $(TESTMODEL); do \ 134 | $(MAKE) MODEL=$$MODEL $*/OCR-D-SEG-$$MODEL; \ 135 | done 136 | @shopt -s nullglob; { for file in $(TESTMODEL:%=$*/OCR-D-SEG-%/*.xml); do \ 137 | $(count-regions) $$file; \ 138 | done; } > $@ 139 | 140 | %/OCR-D-BIN: 141 | cd $(@D) && ocrd-skimage-binarize -I `grp=(*IMG); basename $$grp` -O $(@F) 142 | 143 | # workaround for OCR-D/core#930: 144 | %/OCR-D-SEG-$(MODEL): PRESET = $(shell ocrd-detectron2-segment -D)/presets_$(MODEL).json 145 | 146 | 
%/OCR-D-SEG-$(MODEL): %/OCR-D-BIN 147 | cd $(@D) && ocrd-detectron2-segment -I $( Use detectron2 to segment each page into regions. 71 | 72 | > Open and deserialize PAGE input files and their respective images. 73 | > Fetch a raw and a binarized image for the page frame (possibly 74 | > cropped and deskewed). 75 | 76 | > Feed the raw image into the detectron2 predictor that has been used 77 | > to load the given model. Then, depending on the model capabilities 78 | > (whether it can do panoptic segmentation or only instance 79 | > segmentation, whether the latter can do masks or only bounding 80 | > boxes), post-process the predictions: 81 | 82 | > - panoptic segmentation: take the provided segment label map, and 83 | > apply the segment to class label map, 84 | > - instance segmentation: find an optimal non-overlapping set (flat 85 | > map) of instances via non-maximum suppression, 86 | > - both: avoid overlapping pre-existing top-level regions (incremental 87 | > segmentation). 88 | 89 | > Then extend / shrink the surviving masks to fully include / exclude 90 | > connected components in the foreground that are on the boundary. 91 | 92 | > (This describes the steps when ``postprocessing`` is `full`. A value 93 | > of `only-nms` will omit the morphological extension/shrinking, while 94 | > `only-morph` will omit the non-maximum suppression, and `none` will 95 | > skip all postprocessing.) 96 | 97 | > Finally, find the convex hull polygon for each region, and map its 98 | > class id to a new PAGE region type (and subtype). 99 | 100 | > (Does not annotate `ReadingOrder` or `TextLine`s or `@orientation`.) 101 | 102 | > Produce a new output file by serialising the resulting hierarchy. 
103 | 104 | Options: 105 | -I, --input-file-grp USE File group(s) used as input 106 | -O, --output-file-grp USE File group(s) used as output 107 | -g, --page-id ID Physical page ID(s) to process 108 | --overwrite Remove existing output pages/images 109 | (with --page-id, remove only those) 110 | --profile Enable profiling 111 | --profile-file Write cProfile stats to this file. Implies --profile 112 | -p, --parameter JSON-PATH Parameters, either verbatim JSON string 113 | or JSON file path 114 | -P, --param-override KEY VAL Override a single JSON object key-value pair, 115 | taking precedence over --parameter 116 | -m, --mets URL-PATH URL or file path of METS to process 117 | -w, --working-dir PATH Working directory of local workspace 118 | -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE] 119 | Log level 120 | -C, --show-resource RESNAME Dump the content of processor resource RESNAME 121 | -L, --list-resources List names of processor resources 122 | -J, --dump-json Dump tool description as JSON and exit 123 | -D, --dump-module-dir Output the 'module' directory with resources for this processor 124 | -h, --help This help message 125 | -V, --version Show version 126 | 127 | Parameters: 128 | "operation_level" [string - "page"] 129 | hierarchy level which to predict and assign regions for 130 | Possible values: ["page", "table"] 131 | "categories" [array - REQUIRED] 132 | maps each category (class index) of the model to a PAGE region 133 | type (and @type or @custom if separated by colon), e.g. 
134 | ['TextRegion:paragraph', 'TextRegion:heading', 135 | 'TextRegion:floating', 'TableRegion', 'ImageRegion'] for PubLayNet; 136 | categories with an empty string will be skipped during prediction 137 | "model_config" [string - REQUIRED] 138 | path name of model config 139 | "model_weights" [string - REQUIRED] 140 | path name of model weights 141 | "min_confidence" [number - 0.5] 142 | confidence threshold for detections 143 | "postprocessing" [string - "full"] 144 | which postprocessing steps to enable: by default, applies a custom 145 | non-maximum suppression (to avoid overlaps) and morphological 146 | operations (using connected component analysis on the binarized 147 | input image to shrink or expand regions) 148 | Possible values: ["full", "only-nms", "only-morph", "none"] 149 | "debug_img" [string - "none"] 150 | paint an AlternativeImage which blends the input image 151 | and all raw decoded region candidates 152 | Possible values: ["none", "instance_colors", "instance_colors_only", "category_colors"] 153 | "device" [string - "cuda"] 154 | select computing device for Torch (e.g. 
cpu or cuda:0); will fall 155 | back to CPU if no GPU is available 156 | ``` 157 | 158 | Example: 159 | 160 | # download one preconfigured model: 161 | ocrd resmgr download ocrd-detectron2-segment TableBank_X152.yaml 162 | ocrd resmgr download ocrd-detectron2-segment TableBank_X152.pth 163 | # run it (setting model_config, model_weights and categories): 164 | ocrd-detectron2-segment -I OCR-D-BIN -O OCR-D-SEG-TAB -P categories '["TableRegion"]' -P model_config TableBank_X152.yaml -P model_weights TableBank_X152.pth -P min_confidence 0.1 165 | # run it (equivalent, with presets file) 166 | ocrd-detectron2-segment -I OCR-D-BIN -O OCR-D-SEG-TAB -p presets_TableBank_X152.json -P min_confidence 0.1 167 | # download all preconfigured models 168 | ocrd resmgr download ocrd-detectron2-segment "*" 169 | 170 | For installation **via Docker**, usage is basically the same as above – with some modifications: 171 | 172 | # For data persistence, decide which host-side directories you want to mount in Docker: 173 | DATADIR=/host-side/path/to/data 174 | MODELDIR=/host-side/path/to/models 175 | # Either you "log in" to a container first: 176 | docker run -v $DATADIR:/data -v $MODELDIR:/usr/local/share/ocrd-resources -it bertsky/ocrd_detectron2 bash 177 | # and then can use the above commands verbatim 178 | ... 179 | # Or you spin up a new container each time, 180 | # which means prefixing the above commands with 181 | docker run -v $DATADIR:/data -v $MODELDIR:/usr/local/share/ocrd-resources bertsky/ocrd_detectron2 ... 182 | 183 | 184 | #### Debugging 185 | 186 | If you mistrust your model, and/or this tool's additional postprocessing, 187 | try playing with the runtime parameters: 188 | 189 | - Set `debug_img` to some value other than `none`, e.g. `instance_colors_only`. 190 | This will generate an image which overlays the raw predictions with the raw image 191 | using Detectron2's internal visualiser. 
The parameter settings correspond to its 192 | [ColorMode](https://detectron2.readthedocs.io/en/latest/modules/utils.html#detectron2.utils.visualizer.ColorMode). 193 | The AlternativeImages will have `@comments="debug"`, and will also be referenced in the METS, 194 | which allows convenient browsing with [OCR-D Browser](https://github.com/hnesk/browse-ocrd). 195 | (For example, open the Page View and Image View side by side, and navigate to your output 196 | fileGrp on each.) 197 | - Selectively disable postprocessing steps: from the default `full` via `only-nms` (first stage) 198 | or `only-morph` (second stage) to `none`. 199 | - Lower `min_confidence` to get more candidates, raise to get fewer. 200 | 201 | ## Models 202 | 203 | Some of the following models have already been registered as known [file resources](https://ocr-d.de/en/spec/cli#processor-resources), along with parameter presets to use them conveniently. 204 | 205 | To get a list of registered models **available for download**, do: 206 | 207 | ocrd resmgr list-available -e ocrd-detectron2-segment 208 | 209 | To get a list of **already installed** models and presets, do: 210 | 211 | ocrd resmgr list-installed -e ocrd-detectron2-segment 212 | 213 | To **download** a registered model (i.e. a config file and the respective weights file), do: 214 | 215 | ocrd resmgr download ocrd-detectron2-segment NAME.yaml 216 | ocrd resmgr download ocrd-detectron2-segment NAME.pth 217 | 218 | To download more models (registered or other), see: 219 | 220 | ocrd resmgr download --help 221 | 222 | To **use** a model, do: 223 | 224 | ocrd-detectron2-segment -P model_config NAME.yaml -P model_weights NAME.pth -P categories '[...]' ... 225 | ocrd-detectron2-segment -p NAME.json ... # equivalent, with presets file 226 | 227 | To add (i.e. 
register) a **new model**, you first have to find: 228 | - the classes it is trained on, so you can then define a mapping to PAGE-XML region (and subregion) types, 229 | - a download link to the model config and model weights file. 230 | Archives (zip/tar) are allowed, but then you must also specify the file paths to extract. 231 | 232 | Assuming you have done so, then proceed as follows: 233 | 234 | # from local file path 235 | ocrd resmgr download -n path/to/model/config.yml ocrd-detectron2-segment NAME.yml 236 | ocrd resmgr download -n path/to/model/weights.pth ocrd-detectron2-segment NAME.pth 237 | # from single file URL 238 | ocrd resmgr download -n https://path.to/model/config.yml ocrd-detectron2-segment NAME.yml 239 | ocrd resmgr download -n https://path.to/model/weights.pth ocrd-detectron2-segment NAME.pth 240 | # from zip file URL 241 | ocrd resmgr download -n https://path.to/model/arch.zip -t archive -P zip-path/to/config.yml ocrd-detectron2-segment NAME.yml 242 | ocrd resmgr download -n https://path.to/model/arch.zip -t archive -P zip-path/to/weights.pth ocrd-detectron2-segment NAME.pth 243 | # create corresponding preset file 244 | echo '{"model_weights": "NAME.pth", "model_config": "NAME.yml", "categories": [...]}' > NAME.json 245 | # install preset file so it can be used everywhere (not just in CWD): 246 | ocrd resmgr download -n NAME.json ocrd-detectron2-segment NAME.json 247 | # now the new model can be used just like the preregistered models 248 | ocrd-detectron2-segment -p NAME.json ... 249 | 250 | 251 | What follows is an **overview** of the **preregistered** models (i.e. available via `resmgr`). 252 | 253 | > **Note**: These are just examples, no exhaustive search was done yet! 254 | 255 | > **Note**: The filename suffix (.pth vs .pkl) of the weight file does matter! 
256 | 257 | ### [TableBank](https://github.com/doc-analysis/TableBank) 258 | 259 | X152-FPN [config](https://layoutlm.blob.core.windows.net/tablebank/model_zoo/detection/All_X152/All_X152.yaml)|[weights](https://layoutlm.blob.core.windows.net/tablebank/model_zoo/detection/All_X152/model_final.pth)|`["TableRegion"]` 260 | 261 | ### [TableBank](https://github.com/Psarpei/Multi-Type-TD-TSR) 262 | 263 | X152-FPN [config](https://drive.google.com/drive/folders/1COTV5f7dEAA4Txmxy3LVfcNHiPSc4Bmp?usp=sharing)|[weights](https://drive.google.com/drive/folders/1COTV5f7dEAA4Txmxy3LVfcNHiPSc4Bmp?usp=sharing)|`["TableRegion"]` 264 | 265 | ### [PubLayNet](https://github.com/hpanwar08/detectron2) 266 | 267 | R50-FPN [config](https://github.com/hpanwar08/detectron2/raw/master/configs/DLA_mask_rcnn_R_50_FPN_3x.yaml)|[weights](https://www.dropbox.com/sh/44ez171b2qaocd2/AAB0huidzzOXeo99QdplZRjua)|`["TextRegion:paragraph", "TextRegion:heading", "TextRegion:floating", "TableRegion", "ImageRegion"]` 268 | 269 | R101-FPN [config](https://github.com/hpanwar08/detectron2/raw/master/configs/DLA_mask_rcnn_R_101_FPN_3x.yaml)|[weights](https://www.dropbox.com/sh/wgt9skz67usliei/AAD9n6qbsyMz1Y3CwpZpHXCpa)|`["TextRegion:paragraph", "TextRegion:heading", "TextRegion:floating", "TableRegion", "ImageRegion"]` 270 | 271 | X101-FPN [config](https://github.com/hpanwar08/detectron2/raw/master/configs/DLA_mask_rcnn_X_101_32x8d_FPN_3x.yaml)|[weights](https://www.dropbox.com/sh/1098ym6vhad4zi6/AABe16eSdY_34KGp52W0ruwha)|`["TextRegion:paragraph", "TextRegion:heading", "TextRegion:floating", "TableRegion", "ImageRegion"]` 272 | 273 | ### [PubLayNet](https://github.com/JPLeoRX/detectron2-publaynet) 274 | 275 | R50-FPN [config](https://github.com/facebookresearch/detectron2/blob/main/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml)|[weights](https://drive.google.com/file/d/1IbxaRd82hIrxPT4a1U61_g2vvE3zcRLO/view?usp=sharing)|`["TextRegion:paragraph", "TextRegion:heading", "TextRegion:floating", 
"TableRegion", "ImageRegion"]` 276 | 277 | R101-FPN [config](https://github.com/facebookresearch/detectron2/blob/main/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml)|[weights](https://drive.google.com/file/d/17MD-FegQtFRNn4GeHqKCLaQZ6FiFrzLg/view?usp=sharing)|`["TextRegion:paragraph", "TextRegion:heading", "TextRegion:floating", "TableRegion", "ImageRegion"]` 278 | 279 | ### [LayoutParser](https://github.com/Layout-Parser/layout-parser/blob/master/src/layoutparser/models/detectron2/catalog.py) 280 | 281 | provides different model variants of various depths for multiple datasets: 282 | - [PubLayNet](https://github.com/ibm-aur-nlp/PubLayNet) (Medical Research Papers) 283 | - [TableBank](https://doc-analysis.github.io/tablebank-page/index.html) (Tables Computer Typesetting) 284 | - [PRImALayout](https://www.primaresearch.org/dataset/) (Various Computer Typesetting) 285 | R50-FPN [config](https://www.dropbox.com/s/yc92x97k50abynt/config.yaml?dl=1)|[weights](https://www.dropbox.com/s/h7th27jfv19rxiy/model_final.pth?dl=1)|`["Background","TextRegion","ImageRegion","TableRegion","MathsRegion","SeparatorRegion","LineDrawingRegion"]` 286 | - [HJDataset](https://dell-research-harvard.github.io/HJDataset/) (Historical Japanese Magazines) 287 | - [NewspaperNavigator](https://news-navigator.labs.loc.gov/) (Historical Newspapers) 288 | - [Math Formula Detection](http://transcriptorium.eu/~htrcontest/MathsICDAR2021/) 289 | 290 | See [here](https://github.com/Layout-Parser/layout-parser/blob/master/docs/notes/modelzoo.md) for an overview, 291 | and [here](https://github.com/Layout-Parser/layout-parser/blob/main/src/layoutparser/models/detectron2/catalog.py) for the model files. 292 | You will have to adapt the label map to conform to [PAGE-XML](https://github.com/PRImA-Research-Lab/PAGE-XML) 293 | region (sub)types accordingly. 
294 | 295 | ### [PubLayNet finetuning](https://github.com/Jambo-sudo/Historical-document-layout-analysis) 296 | 297 | (pre-trained on PubLayNet, fine-tuned on a custom, non-public GT corpus of 500 pages 20th century magazines) 298 | 299 | X101-FPN [config](https://github.com/Jambo-sudo/Historical-document-layout-analysis/raw/main/historical-document-analysis/DLA_mask_rcnn_X_101_32x8d_FPN_3x.yaml)|[weights](https://www.dropbox.com/s/hfhsdpvg7jesd4g/pub_model_final.pth?dl=1)|`["TextRegion:caption","ImageRegion","TextRegion:page-number","TableRegion","TextRegion:heading","TextRegion:paragraph"]` 300 | 301 | ### [DocBank](https://github.com/doc-analysis/DocBank/blob/master/MODEL_ZOO.md) 302 | 303 | X101-FPN [archive](https://layoutlm.blob.core.windows.net/docbank/model_zoo/X101.zip) 304 | 305 | Proposed mappings: 306 | - `["TextRegion:header", "TextRegion:credit", "TextRegion:caption", "TextRegion:other", "MathsRegion", "GraphicRegion", "TextRegion:footer", "TextRegion:floating", "TextRegion:paragraph", "TextRegion:endnote", "TextRegion:heading", "TableRegion", "TextRegion:heading"]` (using only predefined `@type`) 307 | - `["TextRegion:abstract", "TextRegion:author", "TextRegion:caption", "TextRegion:date", "MathsRegion", "GraphicRegion", "TextRegion:footer", "TextRegion:list", "TextRegion:paragraph", "TextRegion:reference", "TextRegion:heading", "TableRegion", "TextRegion:title"]` (using `@custom` as well) 308 | 309 | ## Testing 310 | 311 | To install Python dependencies and download some models: 312 | 313 | make deps-test 314 | 315 | Which is the equivalent of: 316 | 317 | pip install -r requirements-test.txt 318 | make models-test 319 | 320 | To run the tests, then do: 321 | 322 | make test 323 | 324 | You can inspect the results under `test/assets/*/data` under various new `OCR-D-SEG-*` fileGrps. 325 | (Again, it is recommended to use [OCR-D Browser](https://github.com/hnesk/browse-ocrd).) 
326 | 327 | Finally, to remove the test data, do: 328 | 329 | make clean 330 | 331 | ### Test results 332 | 333 | These tests are integrated as a [Github Action](https://github.com/bertsky/ocrd_detectron2/actions/workflows/python-app.yml). Its results can be viewed [here](https://bertsky.github.io/ocrd_detectron2/test-results). 334 | -------------------------------------------------------------------------------- /ocrd-tool.json: -------------------------------------------------------------------------------- 1 | ocrd_detectron2/ocrd-tool.json -------------------------------------------------------------------------------- /ocrd_detectron2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bertsky/ocrd_detectron2/6ff0a12bc552dc1aeaa63ea450d951ad58099f0c/ocrd_detectron2/__init__.py -------------------------------------------------------------------------------- /ocrd_detectron2/cli.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor 4 | from .segment import Detectron2Segment 5 | 6 | @click.command() 7 | @ocrd_cli_options 8 | def ocrd_detectron2_segment(*args, **kwargs): 9 | return ocrd_cli_wrap_processor(Detectron2Segment, *args, **kwargs) 10 | -------------------------------------------------------------------------------- /ocrd_detectron2/ocrd-tool.json: -------------------------------------------------------------------------------- 1 | { 2 | "git_url": "https://github.com/bertsky/ocrd_detectron2", 3 | "dockerhub": "ocrd/detectron2", 4 | "version": "0.2.0", 5 | "tools": { 6 | "ocrd-detectron2-segment": { 7 | "executable": "ocrd-detectron2-segment", 8 | "categories": ["Layout analysis"], 9 | "steps": ["layout/segmentation/region"], 10 | "description": "Detect regions with Detectron2 models", 11 | "input_file_grp_cardinality": 1, 12 | 
"output_file_grp_cardinality": 1, 13 | "parameters": { 14 | "operation_level": { 15 | "type": "string", 16 | "enum": ["page", "table"], 17 | "default": "page", 18 | "description": "hierarchy level which to predict and assign regions for" 19 | }, 20 | "categories": { 21 | "type": "array", 22 | "required": true, 23 | "description": "maps each region category (position) of the model to a PAGE region type (and @type or @custom if separated by colon), e.g. ['TextRegion:paragraph', 'TextRegion:heading', 'TextRegion:floating', 'TableRegion', 'ImageRegion'] for PubLayNet; categories with an empty string will be skipped during prediction" 24 | }, 25 | "model_config": { 26 | "type": "string", 27 | "format": "uri", 28 | "content-type": "text/yaml", 29 | "required": true, 30 | "description": "path name of model config" 31 | }, 32 | "model_weights": { 33 | "type": "string", 34 | "format": "uri", 35 | "content-type": "application/octet-stream", 36 | "required": true, 37 | "description": "path name of model weights" 38 | }, 39 | "min_confidence": { 40 | "type": "number", 41 | "format": "float", 42 | "default": 0.5, 43 | "description": "confidence threshold for detections" 44 | }, 45 | "postprocessing": { 46 | "type": "string", 47 | "enum": ["full", "only-nms", "only-morph", "none"], 48 | "default": "full", 49 | "description": "which postprocessing steps to enable: by default, applies a custom non-maximum suppression (to avoid overlaps) and morphological operations (using connected component analysis on the binarized input image to shrink or expand regions)" 50 | }, 51 | "debug_img": { 52 | "type": "string", 53 | "enum": ["none", "instance_colors", "instance_colors_only", "category_colors"], 54 | "default": "none", 55 | "description": "paint an AlternativeImage which blends the input image and all raw decoded region candidates" 56 | }, 57 | "device": { 58 | "type": "string", 59 | "default": "cuda", 60 | "description": "select computing device for Torch (e.g. 
cpu or cuda:0); will fall back to CPU if no GPU is available" 61 | } 62 | }, 63 | "resources": [ 64 | { 65 | "description": "TableBank via LayoutLM X152-FPN config", 66 | "name": "TableBank_X152.yaml", 67 | "size": 536, 68 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/TableBank_X152.yaml" 69 | }, 70 | { 71 | "description": "TableBank via LayoutLM X152-FPN weights", 72 | "name": "TableBank_X152.pth", 73 | "size": 1103832675, 74 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/TableBank_X152.pth" 75 | }, 76 | { 77 | "description": "TableBank via Psarpei X152-FPN config", 78 | "name": "TableBank_X152_Psarpei.yaml", 79 | "size": 534, 80 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/TableBank_X152_Psarpei.yaml" 81 | }, 82 | { 83 | "description": "TableBank via Psarpei X152-FPN weights", 84 | "name": "TableBank_X152_Psarpei.pth", 85 | "size": 1103832675, 86 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/TableBank_X152_Psarpei.pth" 87 | }, 88 | { 89 | "description": "PubLayNet via hpanwar08 R50-FPN config", 90 | "name": "PubLayNet_R_50_FPN_3x.yaml", 91 | "size": 388, 92 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/PubLayNet_R_50_FPN_3x.yaml" 93 | }, 94 | { 95 | "description": "PubLayNet via hpanwar08 R50-FPN weights", 96 | "name": "PubLayNet_R_50_FPN_3x.pth", 97 | "size": 176249718, 98 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/PubLayNet_R_50_FPN_3x.pth" 99 | }, 100 | { 101 | "description": "PubLayNet via hpanwar08 R101-FPN config", 102 | "name": "PubLayNet_R_101_FPN_3x.yaml", 103 | "size": 392, 104 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/PubLayNet_R_101_FPN_3x.yaml" 105 | }, 106 | { 107 | "description": "PubLayNet via hpanwar08 R101-FPN weights", 108 | "name": "PubLayNet_R_101_FPN_3x.pth", 109 | "size": 503147199, 110 | "url": 
"https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/PubLayNet_R_101_FPN_3x.pth" 111 | }, 112 | { 113 | "description": "PubLayNet via hpanwar08 X101-FPN config", 114 | "name": "PubLayNet_X_101_32x8d_FPN_3x.yaml", 115 | "size": 592, 116 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/PubLayNet_X_101_32x8d_FPN_3x.yaml" 117 | }, 118 | { 119 | "description": "PubLayNet via hpanwar08 X101-FPN weights", 120 | "name": "PubLayNet_X_101_32x8d_FPN_3x.pth", 121 | "size": 429840864, 122 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/PubLayNet_X_101_32x8d_FPN_3x.pth" 123 | }, 124 | { 125 | "description": "PubLayNet via JPLeoRX R50-FPN config", 126 | "name": "PubLayNet_R_50_FPN_3x_JPLeoRX.yaml", 127 | "size": 388, 128 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/PubLayNet_R_50_FPN_3x_JPLeoRX.yaml" 129 | }, 130 | { 131 | "description": "PubLayNet via JPLeoRX R50-FPN weights", 132 | "name": "PubLayNet_R_50_FPN_3x_JPLeoRX.pth", 133 | "size": 176299422, 134 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/PubLayNet_R_50_FPN_3x_JPLeoRX.pth" 135 | }, 136 | { 137 | "description": "PubLayNet via JPLeoRX R101-FPN config", 138 | "name": "PubLayNet_R_101_FPN_3x_JPLeoRX.yaml", 139 | "size": 392, 140 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/PubLayNet_R_101_FPN_3x_JPLeoRX.yaml" 141 | }, 142 | { 143 | "description": "PubLayNet via JPLeoRX R101-FPN weights", 144 | "name": "PubLayNet_R_101_FPN_3x_JPLeoRX.pth", 145 | "size": 252572745, 146 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/PubLayNet_R_101_FPN_3x_JPLeoRX.pth" 147 | }, 148 | { 149 | "description": "Modern Magazines via Jambo-sudo X101-FPN (pre-trained on PubLayNet, fine-tuned on 500 p. 20th cent. 
magazines) config", 150 | "name": "Jambo-sudo_X101.yaml", 151 | "size": 592, 152 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/Jambo-sudo_X101.yaml" 153 | }, 154 | { 155 | "description": "Modern Magazines via Jambo-sudo X101-FPN (pre-trained on PubLayNet, fine-tuned on 500 p. 20th cent. magazines) weights", 156 | "name": "Jambo-sudo_X101.pth", 157 | "size": 856430002, 158 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/Jambo-sudo_X101.pth" 159 | }, 160 | { 161 | "description": "PRImALayout via LayoutLM R50-FPN config", 162 | "name": "PRImALayout_R50.yaml", 163 | "size": 934, 164 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/PRImALayout_R50.yaml" 165 | }, 166 | { 167 | "description": "PRImALayout via LayoutLM R50-FPN weights", 168 | "name": "PRImALayout_R50.pth", 169 | "size": 351229486, 170 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/PRImALayout_R50.pth" 171 | }, 172 | { 173 | "description": "DocBank via LayoutLM X101-FPN config", 174 | "name": "DocBank_X101.yaml", 175 | "size": 523, 176 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/DocBank_X101.yaml" 177 | }, 178 | { 179 | "description": "DocBank via LayoutLM X101-FPN weights", 180 | "name": "DocBank_X101.pth", 181 | "size": 835606605, 182 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/DocBank_X101.pth" 183 | }, 184 | { 185 | "description": "NewspaperNavigator via LayoutParser R50-PanopticFPN config", 186 | "name": "NewspaperNavigator_R_50_PFPN_3x.yaml", 187 | "size": 330226761, 188 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/NewspaperNavigator_R_50_PFPN_3x.yaml" 189 | }, 190 | { 191 | "description": "NewspaperNavigator via LayoutParser R50-PanopticFPN weights", 192 | "name": "NewspaperNavigator_R_50_PFPN_3x.pth", 193 | "size": 330226761, 194 | "url": 
"https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/NewspaperNavigator_R_50_PFPN_3x.pth" 195 | }, 196 | { 197 | "description": "MathFormulaDetection via LayoutParser R50-FPN config", 198 | "name": "Math_R_50_FPN_3x.yaml", 199 | "size": 5632, 200 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/Math_R_50_FPN_3x.yaml" 201 | }, 202 | { 203 | "description": "MathFormulaDetection via LayoutParser R50-FPN weights", 204 | "name": "Math_R_50_FPN_3x.pth", 205 | "size": 330084629, 206 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/Math_R_50_FPN_3x.pth" 207 | } 208 | ] 209 | } 210 | } 211 | } 212 | -------------------------------------------------------------------------------- /ocrd_detectron2/presets_DocBank_X101.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_config": "DocBank_X101.yaml", 3 | "model_weights": "DocBank_X101.pth", 4 | "categories": [ 5 | "TextRegion:abstract", 6 | "TextRegion:author", 7 | "TextRegion:caption", 8 | "TextRegion:date", 9 | "MathsRegion", 10 | "GraphicRegion", 11 | "TextRegion:footer", 12 | "TextRegion:list", 13 | "TextRegion:paragraph", 14 | "TextRegion:reference", 15 | "TextRegion:heading", 16 | "TableRegion", 17 | "TextRegion:title" 18 | ] 19 | } -------------------------------------------------------------------------------- /ocrd_detectron2/presets_DocBank_X101_page.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_config": "DocBank_X101.yaml", 3 | "model_weights": "DocBank_X101.pth", 4 | "categories": [ 5 | "TextRegion:header", 6 | "TextRegion:credit", 7 | "TextRegion:caption", 8 | "TextRegion:other", 9 | "MathsRegion", 10 | "GraphicRegion", 11 | "TextRegion:footer", 12 | "TextRegion:floating", 13 | "TextRegion:paragraph", 14 | "TextRegion:endnote", 15 | "TextRegion:heading", 16 | "TableRegion", 17 | "TextRegion:heading" 18 | ] 19 | } 20 | 
-------------------------------------------------------------------------------- /ocrd_detectron2/presets_Jambo-sudo_X101.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_config": "Jambo-sudo_X101.yaml", 3 | "model_weights": "Jambo-sudo_X101.pth", 4 | "categories": [ 5 | "TextRegion:caption", 6 | "ImageRegion", 7 | "TextRegion:page-number", 8 | "TableRegion", 9 | "TextRegion:heading", 10 | "TextRegion:paragraph" 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /ocrd_detectron2/presets_Math_R50.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_config": "Math_R_50_FPN_3x.yaml", 3 | "model_weights": "Math_R_50_FPN_3x.pth", 4 | "categories": [ 5 | "", 6 | "MathsRegion" 7 | ] 8 | } -------------------------------------------------------------------------------- /ocrd_detectron2/presets_NewspaperNavigator_R50.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_config": "NewspaperNavigator_R_50_PFPN_3x.yaml", 3 | "model_weights": "NewspaperNavigator_R_50_PFPN_3x.pth", 4 | "categories": [ 5 | "ImageRegion:photograph", 6 | "ImageRegion:illustration", 7 | "MapRegion", 8 | "ImageRegion:cartoon", 9 | "ImageRegion:editorial", 10 | "TextRegion:heading", 11 | "AdvertRegion" 12 | ] 13 | } -------------------------------------------------------------------------------- /ocrd_detectron2/presets_PRImALayout_R50.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_config": "PRImALayout_R50.yaml", 3 | "model_weights": "PRImALayout_R50.pth", 4 | "categories": [ 5 | "Background", 6 | "TextRegion", 7 | "ImageRegion", 8 | "TableRegion", 9 | "MathsRegion", 10 | "SeparatorRegion", 11 | "LineDrawingRegion" 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- 
/ocrd_detectron2/presets_PubLayNet_R101.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_config": "PubLayNet_R_101_FPN_3x.yaml", 3 | "model_weights": "PubLayNet_R_101_FPN_3x.pth", 4 | "categories": [ 5 | "TextRegion:paragraph", 6 | "TextRegion:heading", 7 | "TextRegion:floating", 8 | "TableRegion", 9 | "ImageRegion" 10 | ] 11 | } -------------------------------------------------------------------------------- /ocrd_detectron2/presets_PubLayNet_R101_JPLeoRX.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_config": "PubLayNet_R_101_FPN_3x_JPLeoRX.yaml", 3 | "model_weights": "PubLayNet_R_101_FPN_3x_JPLeoRX.pth", 4 | "categories": [ 5 | "TextRegion:paragraph", 6 | "TextRegion:heading", 7 | "TextRegion:floating", 8 | "TableRegion", 9 | "ImageRegion" 10 | ] 11 | } -------------------------------------------------------------------------------- /ocrd_detectron2/presets_PubLayNet_R50.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_config": "PubLayNet_R_50_FPN_3x.yaml", 3 | "model_weights": "PubLayNet_R_50_FPN_3x.pth", 4 | "categories": [ 5 | "TextRegion:paragraph", 6 | "TextRegion:heading", 7 | "TextRegion:floating", 8 | "TableRegion", 9 | "ImageRegion" 10 | ] 11 | } -------------------------------------------------------------------------------- /ocrd_detectron2/presets_PubLayNet_R50_JPLeoRX.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_config": "PubLayNet_R_50_FPN_3x_JPLeoRX.yaml", 3 | "model_weights": "PubLayNet_R_50_FPN_3x_JPLeoRX.pth", 4 | "categories": [ 5 | "TextRegion:paragraph", 6 | "TextRegion:heading", 7 | "TextRegion:floating", 8 | "TableRegion", 9 | "ImageRegion" 10 | ] 11 | } -------------------------------------------------------------------------------- /ocrd_detectron2/presets_PubLayNet_X101.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "model_config": "PubLayNet_X_101_32x8d_FPN_3x.yaml", 3 | "model_weights": "PubLayNet_X_101_32x8d_FPN_3x.pth", 4 | "categories": [ 5 | "TextRegion:paragraph", 6 | "TextRegion:heading", 7 | "TextRegion:floating", 8 | "TableRegion", 9 | "ImageRegion" 10 | ] 11 | } -------------------------------------------------------------------------------- /ocrd_detectron2/presets_TableBank_X152.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_config": "TableBank_X152.yaml", 3 | "model_weights": "TableBank_X152.pth", 4 | "categories": [ 5 | "TableRegion" 6 | ] 7 | } -------------------------------------------------------------------------------- /ocrd_detectron2/presets_TableBank_X152_Psarpei.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_config": "TableBank_X152_Psarpei.yaml", 3 | "model_weights": "TableBank_X152_Psarpei.pth", 4 | "categories": [ 5 | "TableRegion" 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /ocrd_detectron2/segment.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import sys 4 | import os 5 | import tempfile 6 | import fileinput 7 | import shutil 8 | import math 9 | import multiprocessing as mp 10 | import multiprocessing.sharedctypes 11 | import ctypes 12 | from typing import Optional 13 | 14 | import numpy as np 15 | from shapely.geometry import Polygon 16 | from shapely.ops import unary_union 17 | import cv2 18 | from PIL import Image 19 | #from detectron2.utils.logger import setup_logger 20 | from detectron2.engine import DefaultPredictor 21 | from detectron2.utils import visualizer 22 | from detectron2.config import get_cfg 23 | from detectron2.data import MetadataCatalog #, DatasetCatalog 24 | import torch 25 | 26 | from 
ocrd_utils import ( 27 | resource_filename, 28 | getLogger, 29 | pushd_popd, 30 | coordinates_of_segment, 31 | coordinates_for_segment, 32 | crop_image, 33 | points_from_polygon, 34 | polygon_from_points, 35 | ) 36 | from ocrd_models.ocrd_page import ( 37 | OcrdPage, 38 | PageType, 39 | AdvertRegionType, 40 | ChartRegionType, 41 | ChemRegionType, 42 | CustomRegionType, 43 | GraphicRegionType, 44 | ImageRegionType, 45 | LineDrawingRegionType, 46 | MapRegionType, 47 | MathsRegionType, 48 | MusicRegionType, 49 | NoiseRegionType, 50 | SeparatorRegionType, 51 | TableRegionType, 52 | TextRegionType, 53 | UnknownRegionType, 54 | CoordsType, 55 | AlternativeImageType 56 | ) 57 | from ocrd_models.ocrd_page_generateds import ( 58 | ChartTypeSimpleType, 59 | GraphicsTypeSimpleType, 60 | TextTypeSimpleType 61 | ) 62 | from ocrd import Processor, OcrdPageResult, OcrdPageResultImage 63 | 64 | # when doing Numpy postprocessing, enlarge masks via 65 | # outer (convex) instead of inner (concave) hull of 66 | # corresponding connected components 67 | NP_POSTPROCESSING_OUTER = False 68 | # when pruning overlapping detections (in either mode), 69 | # require at least this share of the area to be redundant 70 | RECALL_THRESHOLD = 0.8 71 | # when finalizing contours of detections (in either mode), 72 | # snap to connected components overlapping by this share 73 | # (of component area), i.e. 
include if larger and exclude 74 | # if smaller than this much 75 | IOCC_THRESHOLD = 0.4 76 | # when finalizing contours of detections (in either mode), 77 | # add this many pixels in each direction 78 | FINAL_DILATION = 4 79 | 80 | class Detectron2Segment(Processor): 81 | max_workers = 1 # GPU context sharable across not forks 82 | 83 | @property 84 | def executable(self): 85 | return 'ocrd-detectron2-segment' 86 | 87 | def setup(self): 88 | #setup_logger(name='fvcore') 89 | #mp.set_start_method("spawn", force=True) 90 | # runtime overrides 91 | if self.parameter['device'] == 'cpu' or not torch.cuda.is_available(): 92 | device = "cpu" 93 | else: 94 | device = self.parameter['device'] 95 | self.logger.info("Using compute device %s", device) 96 | model_config = self.resolve_resource(self.parameter['model_config']) 97 | self.logger.info("Loading config '%s'", model_config) 98 | # add project-specific config (e.g., TensorMask) here if you're not running a model in detectron2's core library 99 | with tempfile.TemporaryDirectory() as tmpdir: 100 | # workaround for fvcore/detectron2's stupid decision 101 | # to resolve the relative path for _BASE_ in the config file 102 | # on its dirname instead of the detectron2 distribution's config directory 103 | temp_configs = os.path.join(tmpdir, 'configs') 104 | with resource_filename('detectron2', 'model_zoo/configs') as stock_configs: 105 | shutil.copytree(stock_configs, temp_configs) 106 | temp_config = os.path.join(temp_configs, os.path.basename(model_config)) 107 | shutil.copyfile(model_config, temp_config) 108 | with pushd_popd(tmpdir): 109 | # repair broken config files that make deviating assumptions on model_zoo files 110 | with fileinput.input(temp_config, inplace=True) as temp_config_file: 111 | for line in temp_config_file: 112 | if fileinput.isfirstline(): 113 | PREFIXES = ['/content/', 114 | '../../configs/', 115 | '../configs/', 116 | '../'] 117 | line = next((line.replace(pref, '') for pref in PREFIXES 118 | if 
line.startswith('_BASE_: "' + pref)), line) 119 | if os.path.basename(model_config) == 'Jambo-sudo_X101.yaml' and 'NUM_CLASSES: 5' in line: 120 | # workaround for Jambo-sudo/Historical-document-layout-analysis#1 121 | line = line.replace('NUM_CLASSES: 5', 'NUM_CLASSES: 6') 122 | print(line, end='') 123 | cfg = get_cfg() 124 | cfg.merge_from_file(temp_config) 125 | model_weights = self.resolve_resource(self.parameter['model_weights']) 126 | cfg.merge_from_list([ 127 | # set threshold for this model 128 | "MODEL.ROI_HEADS.SCORE_THRESH_TEST", self.parameter['min_confidence'], 129 | "MODEL.RETINANET.SCORE_THRESH_TEST", self.parameter['min_confidence'], 130 | "MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH", self.parameter['min_confidence'], 131 | # or cfg.MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH ? 132 | "MODEL.DEVICE", device, 133 | "MODEL.WEIGHTS", model_weights, 134 | ]) 135 | cfg.freeze() 136 | assert cfg.MODEL.ROI_HEADS.NUM_CLASSES == len(self.parameter['categories']), \ 137 | "The chosen model's number of classes %d does not match the given list of categories %d " % ( 138 | cfg.MODEL.ROI_HEADS.NUM_CLASSES, len(self.parameter['categories'])) 139 | # instantiate model 140 | self.logger.info("Loading weights '%s'", model_weights) 141 | self.predictor = DefaultPredictor(cfg) 142 | self.categories = self.parameter['categories'] 143 | self.metadatacat = MetadataCatalog.get('runtime') 144 | self.metadatacat.thing_classes = self.categories 145 | 146 | def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: 147 | """Use detectron2 to segment each page into regions. 148 | 149 | Open and deserialize PAGE input files and their respective images, 150 | then iterate over the element hierarchy down to the requested 151 | ``operation_level``. 152 | 153 | Fetch a raw and a binarized image for the page/segment (possibly 154 | cropped and deskewed). 
155 | 156 | Feed the raw image into the detectron2 predictor that has been 157 | used to load the given model. Then, depending on the model capabilities 158 | (whether it can do panoptic segmentation or only instance segmentation, 159 | whether the latter can do masks or only bounding boxes), post-process 160 | the predictions: 161 | 162 | \b 163 | - panoptic segmentation: take the provided segment label map, and 164 | apply the segment to class label map, 165 | - instance segmentation: find an optimal non-overlapping set (flat 166 | map) of instances via non-maximum suppression, 167 | - both: avoid overlapping pre-existing top-level regions (incremental 168 | segmentation). 169 | 170 | Then extend / shrink the surviving masks to fully include / exclude 171 | connected components in the foreground that are on the boundary. 172 | 173 | (This describes the steps when ``postprocessing`` is `full`. A value 174 | of `only-nms` will omit the morphological extension/shrinking, while 175 | `only-morph` will omit the non-maximum suppression, and `none` will 176 | skip all postprocessing.) 177 | 178 | Finally, find the convex hull polygon for each region, and map its 179 | class id to a new PAGE region type (and subtype). 180 | 181 | (Does not annotate `ReadingOrder` or `TextLine`s or `@orientation`.) 182 | 183 | Produce a new output file by serialising the resulting hierarchy. 
184 | """ 185 | pcgts = input_pcgts[0] 186 | result = OcrdPageResult(pcgts) 187 | level = self.parameter['operation_level'] 188 | 189 | page = pcgts.get_Page() 190 | page_image_raw, page_coords, page_image_info = self.workspace.image_from_page( 191 | page, page_id, feature_filter='binarized') 192 | # for morphological post-processing, we will need the binarized image, too 193 | if self.parameter['postprocessing'] != 'none': 194 | page_image_bin, _, _ = self.workspace.image_from_page( 195 | page, page_id, feature_selector='binarized') 196 | page_image_raw, page_image_bin = _ensure_consistent_crops( 197 | page_image_raw, page_image_bin) 198 | else: 199 | page_image_bin = page_image_raw 200 | # determine current zoom and target zoom 201 | if page_image_info.resolution != 1: 202 | dpi = page_image_info.resolution 203 | if page_image_info.resolutionUnit == 'cm': 204 | dpi = round(dpi * 2.54) 205 | zoom = 300.0 / dpi 206 | else: 207 | dpi = None 208 | zoom = 1.0 209 | # todo: if zoom is > 4.0, do something along the lines of eynollah's enhance 210 | if zoom < 2.0: 211 | # actual resampling: see below 212 | zoomed = zoom / 2.0 213 | self.logger.info("scaling %dx%d image by %.2f", page_image_raw.width, page_image_raw.height, zoomed) 214 | else: 215 | zoomed = 1.0 216 | 217 | for segment in ([page] if level == 'page' else 218 | page.get_AllRegions(depth=1, classes=['Table'])): 219 | # regions = segment.get_AllRegions(depth=1) 220 | # FIXME: as long as we don't have get_AllRegions on region level, 221 | # we have to simulate this via parent_object filtering 222 | def at_segment(region): 223 | return region.parent_object_ is segment 224 | regions = list(filter(at_segment, page.get_AllRegions())) 225 | 226 | if isinstance(segment, PageType): 227 | image_raw = page_image_raw 228 | image_bin = page_image_bin 229 | coords = page_coords 230 | else: 231 | image_raw, coords = self.workspace.image_from_segment( 232 | segment, page_image_raw, page_coords, feature_filter='binarized') 
233 | if self.parameter['postprocessing'] != 'none': 234 | image_bin, _ = self.workspace.image_from_segment( 235 | segment, page_image_bin, page_coords) 236 | image_raw, image_bin = _ensure_consistent_crops( 237 | image_raw, image_bin) 238 | else: 239 | image_bin = image_raw 240 | 241 | # ensure RGB (if raw was merely grayscale) 242 | if image_raw.mode == '1': 243 | image_raw = image_raw.convert('L') 244 | image_raw = image_raw.convert(mode='RGB') 245 | image_bin = image_bin.convert(mode='1') 246 | 247 | # reduce resolution to 300 DPI max 248 | if zoomed != 1.0: 249 | image_bin = image_bin.resize( 250 | (int(image_raw.width * zoomed), 251 | int(image_raw.height * zoomed)), 252 | resample=Image.Resampling.BICUBIC) 253 | image_raw = image_raw.resize( 254 | (int(image_raw.width * zoomed), 255 | int(image_raw.height * zoomed)), 256 | resample=Image.Resampling.BICUBIC) 257 | 258 | # convert raw to BGR 259 | array_raw = np.array(image_raw) 260 | array_raw = array_raw[:,:,::-1] 261 | # convert binarized to single-channel negative 262 | array_bin = np.array(image_bin) 263 | array_bin = ~ array_bin 264 | 265 | image = self._process_segment(segment, regions, coords, array_raw, array_bin, zoomed, page_id) 266 | if image: 267 | result.images.append(image) 268 | return result 269 | 270 | def _process_segment(self, segment, ignore, coords, array_raw, array_bin, zoomed, page_id) -> Optional[OcrdPageResultImage]: 271 | self.logger = getLogger('processor.Detectron2Segment') 272 | cpu = torch.device('cpu') 273 | segtype = segment.__class__.__name__[:-4] 274 | # remove existing segmentation (have only detected targets survive) 275 | #page.set_ReadingOrder(None) 276 | #page.set_TextRegion([]) 277 | segment.set_custom('coords=%s' % coords['transform']) 278 | height, width, _ = array_raw.shape 279 | postprocessing = self.parameter['postprocessing'] 280 | scale = 43 281 | if postprocessing in ['full', 'only-morph']: 282 | # get connected components to estimate scale 283 | _, components = 
cv2.connectedComponents(array_bin.astype(np.uint8)) 284 | # estimate glyph scale (roughly) 285 | _, counts = np.unique(components, return_counts=True) 286 | if counts.shape[0] > 1: 287 | counts = np.sqrt(3 * counts) 288 | counts = counts[(5 < counts) & (counts < 100)] 289 | scale = int(np.median(counts)) 290 | self.logger.debug("estimated scale: %d", scale) 291 | # predict 292 | output = self.predictor(array_raw) 293 | if self.parameter['debug_img'] != 'none': 294 | vis = visualizer.Visualizer(array_raw, 295 | metadata=self.metadatacat, 296 | instance_mode={ 297 | 'instance_colors': visualizer.ColorMode.IMAGE, 298 | 'instance_colors_only': visualizer.ColorMode.IMAGE_BW, 299 | 'category_colors': visualizer.ColorMode.SEGMENTATION 300 | }[self.parameter['debug_img']]) 301 | # decoding, cf. https://detectron2.readthedocs.io/en/latest/tutorials/models.html 302 | if 'panoptic_seg' in output: 303 | self.logger.info("decoding from panoptic segmentation results") 304 | segmap, seginfo = output['panoptic_seg'] 305 | if not isinstance(segmap, np.ndarray): 306 | self.logger.debug(str(segmap)) 307 | segmap = segmap.to(cpu) 308 | segmap = segmap.numpy() 309 | if self.parameter['debug_img'] != 'none': 310 | visimg = vis.draw_panoptic_seg(segmap, seginfo) 311 | seglabels = np.unique(segmap) 312 | nseg = len(seglabels) 313 | if not nseg: 314 | self.logger.warning("Detected no regions on %s '%s'", segtype, segment.id) 315 | return None 316 | masks = [] 317 | classes = [] 318 | scores = [] 319 | for label in seglabels: 320 | if label == -1: 321 | continue 322 | if seginfo is None: 323 | class_id = label // self.predictor.metadata.label_divisor 324 | else: 325 | for info in seginfo: 326 | if info['id'] == label: 327 | class_id = info['category_id'] 328 | break 329 | if not self.categories[class_id]: 330 | continue 331 | masks.append(segmap == label) 332 | scores.append(1.0) #scores[i] 333 | classes.append(class_id) 334 | if not len(masks): 335 | self.logger.warning("Detected no 
regions for selected categories on %s '%s'", segtype, segment.id) 336 | return None 337 | elif 'instances' in output: 338 | self.logger.info("decoding from instance segmentation results") 339 | instances = output['instances'] 340 | if not isinstance(instances, dict): 341 | assert instances.image_size == (height, width) 342 | instances = instances.to(cpu) 343 | if self.parameter['debug_img'] != 'none': 344 | visimg = vis.draw_instance_predictions(instances) 345 | instances = instances.get_fields() 346 | classes = instances['pred_classes'] 347 | if not all(self.categories): 348 | # filter out inactive classes 349 | select = np.array([bool(cat) for cat in self.categories]) 350 | select = select[classes] 351 | for key, val in instances.items(): 352 | instances[key] = val[select] 353 | classes = instances['pred_classes'] 354 | scores = instances['scores'] 355 | if not isinstance(scores, np.ndarray): 356 | scores = scores.to(cpu).numpy() 357 | if not scores.shape[0]: 358 | self.logger.warning("Detected no regions on %s '%s'", segtype, segment.id) 359 | return None 360 | if 'pred_masks' in instances: # or pred_masks_rle ? 361 | masks = np.asarray(instances['pred_masks']) 362 | def get_mask(x): 363 | # convert from RLE/polygon/Numpy # or Tensor? 364 | # zzz tensor result would have to use .detach().numpy() ... 
365 | x = visualizer.GenericMask(x, height, width) 366 | return x.mask > 0 367 | masks = np.stack([get_mask(x) for x in masks]) 368 | elif 'pred_boxes' in instances: 369 | self.logger.warning("model has no mask output, only bbox") 370 | boxes = instances['pred_boxes'] 371 | if not isinstance(boxes, np.ndarray): 372 | boxes = boxes.to(cpu).tensor.numpy() 373 | assert boxes.shape[1] == 4 # and not 5 (rotated boxes) 374 | assert boxes.shape[0], "prediction without instances" 375 | masks = np.zeros((len(boxes), height, width), bool) 376 | for i, (x1, y1, x2, y2) in enumerate(boxes): 377 | masks[i, 378 | math.floor(y1):math.ceil(y2), 379 | math.floor(x1):math.ceil(x2)] = True 380 | else: 381 | self.logger.error("Found no suitable output format to decode from") 382 | return None 383 | assert len(scores) == len(classes) == len(masks) 384 | # apply non-maximum suppression between overlapping instances 385 | # (not strictly necessary in case of panoptic segmentation, 386 | # but we can still have overlaps with preexisting regions) 387 | if len(ignore): 388 | scores = np.insert(scores, 0, 1.0, axis=0) 389 | classes = np.insert(classes, 0, -1, axis=0) 390 | masks = np.insert(masks, 0, 0, axis=0) 391 | mask0 = np.zeros(masks.shape[1:], np.uint8) 392 | for i, region in enumerate(ignore): 393 | polygon = coordinates_of_segment(region, _, coords) 394 | if zoomed != 1.0: 395 | polygon = np.round(polygon * zoomed).astype(int) 396 | cv2.fillPoly(mask0, pts=[polygon], color=(255,)) 397 | assert np.count_nonzero(mask0), "existing regions all outside of page frame" 398 | masks[0] |= mask0 > 0 399 | if postprocessing in ['full', 'only-nms']: 400 | scores, classes, masks = postprocess_nms( 401 | scores, classes, masks, array_bin, self.categories, 402 | min_confidence=self.parameter['min_confidence'], nproc=8, logger=self.logger) 403 | if postprocessing in ['full', 'only-morph']: 404 | scores, classes, masks = postprocess_morph( 405 | scores, classes, masks, components, nproc=8, 
logger=self.logger) 406 | if len(ignore): 407 | scores = scores[1:] 408 | classes = classes[1:] 409 | masks = masks[1:] 410 | # convert to polygons and regions 411 | region_no = 0 412 | for mask, class_id, score in zip(masks, classes, scores): 413 | category = self.categories[class_id] 414 | # dilate until we have a single outer contour 415 | invalid = True 416 | for _ in range(10): 417 | contours, _ = cv2.findContours(mask.astype(np.uint8), 418 | cv2.RETR_EXTERNAL, 419 | cv2.CHAIN_APPROX_SIMPLE) 420 | if len(contours) == 1 and len(contours[0]) > 3: 421 | invalid = False 422 | break 423 | mask = cv2.dilate(mask.astype(np.uint8), 424 | np.ones((scale,scale), np.uint8)) > 0 425 | if invalid: 426 | self.logger.warning("Ignoring non-contiguous (%d) region for %s", len(contours), category) 427 | continue 428 | region_polygon = contours[0][:,0,:] # already in x,y order 429 | if zoomed != 1.0: 430 | region_polygon = region_polygon / zoomed 431 | # ensure consistent and valid polygon outline 432 | region_polygon = coordinates_for_segment(region_polygon, _, coords) 433 | region_polygon = polygon_for_parent(region_polygon, segment) 434 | if region_polygon is None: 435 | self.logger.warning("Ignoring extant region for %s", category) 436 | continue 437 | # annotate new region/line 438 | region_coords = CoordsType(points_from_polygon(region_polygon), conf=score) 439 | cat2class = dict([ 440 | ('AdvertRegion', AdvertRegionType), 441 | ('ChartRegion', ChartRegionType), 442 | ('ChemRegion', ChemRegionType), 443 | ('CustomRegion', CustomRegionType), 444 | ('GraphicRegion', GraphicRegionType), 445 | ('ImageRegion', ImageRegionType), 446 | ('LineDrawingRegion', LineDrawingRegionType), 447 | ('MapRegion', MapRegionType), 448 | ('MathsRegion', MathsRegionType), 449 | ('MusicRegion', MusicRegionType), 450 | ('NoiseRegion', NoiseRegionType), 451 | ('SeparatorRegion', SeparatorRegionType), 452 | ('TableRegion', TableRegionType), 453 | ('TextRegion', TextRegionType), 454 | 
('UnknownRegion', UnknownRegionType), 455 | ]) 456 | cat = category.split(':') 457 | try: 458 | regiontype = cat2class[cat[0]] 459 | except KeyError: 460 | raise ValueError("Invalid region type %s (see https://github.com/PRImA-Research-Lab/PAGE-XML)", cat[0]) 461 | region_no += 1 462 | region_id = 'region%04d_%s' % (region_no, cat[0]) 463 | region = regiontype(id=region_id, Coords=region_coords) 464 | if len(cat) > 1: 465 | try: 466 | {TextRegionType: TextTypeSimpleType, 467 | GraphicRegionType: GraphicsTypeSimpleType, 468 | ChartRegionType: ChartTypeSimpleType}[regiontype](cat[1]) 469 | region.set_type(cat[1]) 470 | except (KeyError, ValueError): 471 | region.set_custom(cat[1]) 472 | getattr(segment, 'add_' + cat[0])(region) 473 | self.logger.info("Detected %s region%04d (p=%.2f) on %s '%s'", 474 | category, region_no, score, segtype, segment.id) 475 | if self.parameter['debug_img'] != 'none': 476 | altimg = AlternativeImageType(comments='debug') 477 | segment.add_AlternativeImage(altimg) 478 | return OcrdPageResultImage( 479 | Image.fromarray(visimg.get_image()), 480 | ('' if isinstance(segment, PageType) else '_' + segment.id) + '.IMG-DEBUG', 481 | altimg) 482 | return None 483 | 484 | 485 | def postprocess_nms(scores, classes, masks, page_array_bin, categories, min_confidence=0.5, nproc=8, logger=None): 486 | """Apply geometrical post-processing to raw detections: remove overlapping candidates via non-maximum suppression across classes. 487 | 488 | Implement via Numpy routines. 
489 | """ 490 | if logger is None: 491 | logger = getLogger('ocrd.processor.Detectron2Segment') 492 | # apply IoU-based NMS across classes 493 | assert masks.dtype == bool 494 | instances = np.arange(len(masks)) 495 | instances_i, instances_j = np.meshgrid(instances, instances, indexing='ij') 496 | combinations = list(zip(*np.where(instances_i != instances_j))) 497 | shared_masks = mp.sharedctypes.RawArray(ctypes.c_bool, masks.size) 498 | shared_masks_np = tonumpyarray_with_shape(shared_masks, masks.shape) 499 | np.copyto(shared_masks_np, masks * page_array_bin) 500 | with mp.Pool(processes=nproc, # to be refined via param 501 | initializer=overlapmasks_init, 502 | initargs=(shared_masks, masks.shape)) as pool: 503 | # multiprocessing for different combinations of array slices (pure) 504 | overlapping_combinations = pool.starmap(overlapmasks, combinations) 505 | overlaps = np.zeros((len(masks), len(masks)), bool) 506 | for (i, j), overlapping in zip(combinations, overlapping_combinations): 507 | if overlapping: 508 | overlaps[i, j] = True 509 | # find best-scoring instance per class 510 | bad = np.zeros_like(instances, bool) 511 | for i in np.argsort(-scores): 512 | score = scores[i] 513 | mask = masks[i] 514 | assert mask.shape[:2] == page_array_bin.shape[:2] 515 | ys, xs = mask.nonzero() 516 | assert xs.any() and ys.any(), "instance has empty mask" 517 | bbox = [xs.min(), ys.min(), xs.max(), ys.max()] 518 | class_id = classes[i] 519 | if class_id < 0: 520 | logger.debug("ignoring existing region at %s", str(bbox)) 521 | continue 522 | category = categories[class_id] 523 | if scores[i] < min_confidence: 524 | logger.debug("Ignoring instance for %s with too low score %.2f", category, score) 525 | bad[i] = True 526 | continue 527 | count = np.count_nonzero(mask) 528 | if count < 10: 529 | logger.warning("Ignoring too small (%dpx) region for %s", count, category) 530 | bad[i] = True 531 | continue 532 | worse = score < scores 533 | if np.any(worse & overlaps[i]): 534 
| logger.debug("Ignoring instance for %s with %.2f overlapping better neighbour", 535 | category, score) 536 | bad[i] = True 537 | else: 538 | logger.debug("post-processing prediction for %s at %s area %d score %f", 539 | category, str(bbox), count, score) 540 | # post-process detections morphologically and decode to region polygons 541 | # does not compile (no OpenCV support): 542 | keep = np.nonzero(~ bad)[0] 543 | if not keep.size: 544 | return [], [], [] 545 | keep = sorted(keep, key=lambda i: scores[i], reverse=True) 546 | scores = scores[keep] 547 | classes = classes[keep] 548 | masks = masks[keep] 549 | return scores, classes, masks 550 | 551 | def postprocess_morph(scores, classes, masks, components, nproc=8, logger=None): 552 | """Apply morphological post-processing to raw detections: extend masks to avoid chopping off fg connected components. 553 | 554 | Implement via Numpy routines. 555 | """ 556 | if logger is None: 557 | logger = getLogger('ocrd.processor.Detectron2Segment') 558 | shared_masks = mp.sharedctypes.RawArray(ctypes.c_bool, masks.size) 559 | shared_components = mp.sharedctypes.RawArray(ctypes.c_int32, components.size) 560 | shared_masks_np = tonumpyarray_with_shape(shared_masks, masks.shape) 561 | shared_components_np = tonumpyarray_with_shape(shared_components, components.shape) 562 | np.copyto(shared_components_np, components, casting='equiv') 563 | np.copyto(shared_masks_np, masks) 564 | with mp.Pool(processes=nproc, # to be refined via param 565 | initializer=morphmasks_init, 566 | initargs=(shared_masks, masks.shape, 567 | shared_components, components.shape)) as pool: 568 | # multiprocessing for different slices of array (in-place) 569 | pool.map(morphmasks, range(masks.shape[0])) 570 | masks = tonumpyarray_with_shape(shared_masks, masks.shape) 571 | return scores, classes, masks 572 | 573 | def polygon_for_parent(polygon, parent): 574 | """Clip polygon to parent polygon range. 
575 | 576 | (Should be moved to ocrd_utils.coordinates_for_segment.) 577 | """ 578 | childp = Polygon(polygon) 579 | if isinstance(parent, PageType): 580 | if parent.get_Border(): 581 | parentp = Polygon(polygon_from_points(parent.get_Border().get_Coords().points)) 582 | else: 583 | parentp = Polygon([[0,0], [0,parent.get_imageHeight()], 584 | [parent.get_imageWidth(),parent.get_imageHeight()], 585 | [parent.get_imageWidth(),0]]) 586 | else: 587 | parentp = Polygon(polygon_from_points(parent.get_Coords().points)) 588 | # ensure input coords have valid paths (without self-intersection) 589 | # (this can happen when shapes valid in floating point are rounded) 590 | childp = make_valid(childp) 591 | parentp = make_valid(parentp) 592 | if not childp.is_valid: 593 | return None 594 | if not parentp.is_valid: 595 | return None 596 | # check if clipping is necessary 597 | if childp.within(parentp): 598 | return childp.exterior.coords[:-1] 599 | # clip to parent 600 | interp = childp.intersection(parentp) 601 | # post-process 602 | if interp.is_empty or interp.area == 0.0: 603 | return None 604 | if interp.type == 'GeometryCollection': 605 | # heterogeneous result: filter zero-area shapes (LineString, Point) 606 | interp = unary_union([geom for geom in interp.geoms if geom.area > 0]) 607 | if interp.type == 'MultiPolygon': 608 | # homogeneous result: construct convex hull to connect 609 | # FIXME: construct concave hull / alpha shape 610 | interp = interp.convex_hull 611 | if interp.minimum_clearance < 1.0: 612 | # follow-up calculations will necessarily be integer; 613 | # so anticipate rounding here and then ensure validity 614 | interp = Polygon(np.round(interp.exterior.coords)) 615 | interp = make_valid(interp) 616 | return interp.exterior.coords[:-1] # keep open 617 | 618 | def make_valid(polygon): 619 | for split in range(1, len(polygon.exterior.coords)-1): 620 | if polygon.is_valid or polygon.simplify(polygon.area).is_valid: 621 | break 622 | # simplification may 
not be possible (at all) due to ordering 623 | # in that case, try another starting point 624 | polygon = Polygon(polygon.exterior.coords[-split:]+polygon.exterior.coords[:-split]) 625 | for tolerance in range(1, int(polygon.area)): 626 | if polygon.is_valid: 627 | break 628 | # simplification may require a larger tolerance 629 | polygon = polygon.simplify(tolerance) 630 | return polygon 631 | 632 | def tonumpyarray(mp_arr): 633 | return np.frombuffer(mp_arr, dtype=np.dtype(mp_arr)) 634 | 635 | def tonumpyarray_with_shape(mp_arr, shape): 636 | return np.frombuffer(mp_arr, dtype=np.dtype(mp_arr)).reshape(shape) 637 | 638 | def overlapmasks_init(masks_array, masks_shape): 639 | global shared_masks 640 | global shared_masks_shape 641 | shared_masks = masks_array 642 | shared_masks_shape = masks_shape 643 | 644 | def overlapmasks(i, j): 645 | # is i redundant w.r.t. j (i.e. j already covers most of its area) 646 | masks = np.ctypeslib.as_array(shared_masks).reshape(shared_masks_shape) 647 | imask = masks[i] 648 | jmask = masks[j] 649 | intersection = np.count_nonzero(imask * jmask) 650 | if not intersection: 651 | return False 652 | base = np.count_nonzero(imask) 653 | if intersection / base > RECALL_THRESHOLD: 654 | return True 655 | return False 656 | 657 | def morphmasks_init(masks_array, masks_shape, components_array, components_shape): 658 | global shared_masks 659 | global shared_masks_shape 660 | global shared_components 661 | global shared_components_shape 662 | shared_masks = masks_array 663 | shared_masks_shape = masks_shape 664 | shared_components = components_array 665 | shared_components_shape = components_shape 666 | 667 | def morphmasks(instance): 668 | masks = np.ctypeslib.as_array(shared_masks).reshape(shared_masks_shape) 669 | components = np.ctypeslib.as_array(shared_components).reshape(shared_components_shape) 670 | mask = masks[instance] 671 | # find closure in connected components 672 | complabels = np.unique(mask * components) 673 | left, top, w, 
h = cv2.boundingRect(mask.astype(np.uint8)) 674 | right = left + w 675 | bottom = top + h 676 | if NP_POSTPROCESSING_OUTER: 677 | # overwrite pixel mask from (padded) outer bbox 678 | for label in complabels: 679 | if not label: 680 | continue # bg/white 681 | leftc, topc, wc, hc = cv2.boundingRect((components == label).astype(np.uint8)) 682 | rightc = leftc + wc 683 | bottomc = topc + hc 684 | if wc > 2 * w or hc > 2 * h: 685 | continue # huge (non-text?) component 686 | # intersection over component too small? 687 | if (min(right, rightc) - max(left, leftc)) * \ 688 | (min(bottom, bottomc) - max(top, topc)) < IOCC_THRESHOLD * wc * hc: 689 | continue # too little overlap 690 | newleft = min(left, leftc) 691 | newtop = min(top, topc) 692 | newright = max(right, rightc) 693 | newbottom = max(bottom, bottomc) 694 | if (newright - newleft) > 2 * w or (newbottom - newtop) > 1.5 * h: 695 | continue # 696 | left = newleft 697 | top = newtop 698 | right = newright 699 | bottom = newbottom 700 | w = right - left 701 | h = bottom - top 702 | left = max(0, left - FINAL_DILATION) 703 | top = max(0, top - FINAL_DILATION) 704 | right = min(mask.shape[1], right + FINAL_DILATION) 705 | bottom = min(mask.shape[0], bottom + FINAL_DILATION) 706 | mask[top:bottom, left:right] = True 707 | 708 | else: 709 | # fill pixel mask from (padded) inner bboxes 710 | for label in complabels: 711 | if not label: 712 | continue # bg/white 713 | suppress = False 714 | leftc, topc, wc, hc = cv2.boundingRect((components == label).astype(np.uint8)) 715 | rightc = leftc + wc 716 | bottomc = topc + hc 717 | if wc > 2 * w or hc > 2 * h: 718 | # huge (non-text?) 
component 719 | suppress = True 720 | if (min(right, rightc) - max(left, leftc)) * \ 721 | (min(bottom, bottomc) - max(top, topc)) < IOCC_THRESHOLD * wc * hc: 722 | # intersection over component too small 723 | suppress = True 724 | newleft = min(left, leftc) 725 | newtop = min(top, topc) 726 | newright = max(right, rightc) 727 | newbottom = max(bottom, bottomc) 728 | if (newright - newleft) > 2 * w or (newbottom - newtop) > 1.5 * h: 729 | # huge (non-text?) component 730 | suppress = True 731 | elif (newright - newleft) < 1.1 * w and (newbottom - newtop) < 1.1 * h: 732 | suppress = False 733 | if suppress: 734 | leftc = min(mask.shape[1], leftc + FINAL_DILATION) 735 | topc = min(mask.shape[0], topc + FINAL_DILATION) 736 | rightc = max(0, rightc - FINAL_DILATION) 737 | bottomc = max(0, bottomc - FINAL_DILATION) 738 | mask[topc:bottomc, leftc:rightc] = False 739 | else: 740 | leftc = max(0, leftc - FINAL_DILATION) 741 | topc = max(0, topc - FINAL_DILATION) 742 | rightc = min(mask.shape[1], rightc + FINAL_DILATION) 743 | bottomc = min(mask.shape[0], bottomc + FINAL_DILATION) 744 | mask[topc:bottomc, leftc:rightc] = True 745 | left = newleft 746 | top = newtop 747 | right = newright 748 | bottom = newbottom 749 | w = right - left 750 | h = bottom - top 751 | 752 | def _ensure_consistent_crops(image_raw, image_bin): 753 | # workaround for OCR-D/core#687: 754 | if 0 < abs(image_raw.width - image_bin.width) <= 2: 755 | diff = image_raw.width - image_bin.width 756 | if diff > 0: 757 | image_raw = crop_image( 758 | image_raw, 759 | (int(np.floor(diff / 2)), 0, 760 | image_raw.width - int(np.ceil(diff / 2)), 761 | image_raw.height)) 762 | else: 763 | image_bin = crop_image( 764 | image_bin, 765 | (int(np.floor(-diff / 2)), 0, 766 | image_bin.width - int(np.ceil(-diff / 2)), 767 | image_bin.height)) 768 | if 0 < abs(image_raw.height - image_bin.height) <= 2: 769 | diff = image_raw.height - image_bin.height 770 | if diff > 0: 771 | image_raw = crop_image( 772 | image_raw, 773 
| (0, int(np.floor(diff / 2)), 774 | image_raw.width, 775 | image_raw.height - int(np.ceil(diff / 2)))) 776 | else: 777 | image_bin = crop_image( 778 | image_bin, 779 | (0, int(np.floor(-diff / 2)), 780 | image_bin.width, 781 | image_bin.height - int(np.ceil(-diff / 2)))) 782 | return image_raw, image_bin 783 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0.0", "wheel", "setuptools-ocrd"] 3 | 4 | [project] 5 | name = "ocrd_detectron2" 6 | authors = [ 7 | {name = "Robert Sachunsky", email = "sachunsky@informatik.uni-leipzig.de"}, 8 | {name = "Julian Balling", email = "balling@infai.org"}, 9 | ] 10 | description = "OCR-D wrapper for detectron2 based segmentation models" 11 | readme = "README.md" 12 | license.text = "MIT" 13 | requires-python = ">=3.8" 14 | 15 | dynamic = ["version", "dependencies"] 16 | 17 | # https://pypi.org/classifiers/ 18 | classifiers = [ 19 | "Development Status :: 5 - Production/Stable", 20 | "Environment :: Console", 21 | "Intended Audience :: Science/Research", 22 | "Intended Audience :: Other Audience", 23 | "License :: OSI Approved :: MIT License", 24 | "Programming Language :: Python :: 3", 25 | "Programming Language :: Python :: 3 :: Only", 26 | "Topic :: Text Processing", 27 | ] 28 | 29 | [project.scripts] 30 | ocrd-detectron2-segment = "ocrd_detectron2.cli:ocrd_detectron2_segment" 31 | 32 | [project.urls] 33 | Homepage = "https://github.com/bertsky/ocrd_detectron2" 34 | Repository = "https://github.com/bertsky/ocrd_detectron2.git" 35 | 36 | [tool.setuptools.dynamic] 37 | dependencies = {file = ["requirements.txt"]} 38 | optional-dependencies.test = {file = ["requirements-test.txt"]} 39 | 40 | [tool.setuptools] 41 | packages = ["ocrd_detectron2"] 42 | package-data = {"*" = ["*.json"]} 43 | 44 | [tool.coverage.run] 45 | branch = true 46 | source = 
["ocrd_detectron2"] 47 | -------------------------------------------------------------------------------- /requirements-test.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | ocrd_wrap 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ocrd>=3.3.0 2 | click>=7.0 3 | scipy 4 | numpy>=1.17.0 5 | pillow>=7.1.2 6 | shapely 7 | scikit-image>=0.17.2 8 | typing-extensions # for Torch build 9 | torch>=1.10.0 #,<1.11 10 | torchvision>=0.11.2 11 | detectron2>=0.6 12 | setuptools >= 75.0 # for Detectron build 13 | wheel # for Detectron build 14 | pycocotools # for Detectron 15 | --------------------------------------------------------------------------------