├── .dockerignore ├── .github └── workflows │ ├── docker-image.yml │ └── python-app.yml ├── .gitignore ├── .pylintrc ├── CHANGELOG.md ├── Dockerfile ├── MANIFEST.in ├── Makefile ├── README.md ├── ocrd-tool.json ├── ocrd_detectron2 ├── __init__.py ├── cli.py ├── ocrd-tool.json ├── presets_DocBank_X101.json ├── presets_DocBank_X101_page.json ├── presets_Jambo-sudo_X101.json ├── presets_Math_R50.json ├── presets_NewspaperNavigator_R50.json ├── presets_PRImALayout_R50.json ├── presets_PubLayNet_R101.json ├── presets_PubLayNet_R101_JPLeoRX.json ├── presets_PubLayNet_R50.json ├── presets_PubLayNet_R50_JPLeoRX.json ├── presets_PubLayNet_X101.json ├── presets_TableBank_X152.json ├── presets_TableBank_X152_Psarpei.json └── segment.py ├── pyproject.toml ├── requirements-test.txt └── requirements.txt /.dockerignore: -------------------------------------------------------------------------------- 1 | test 2 | repo 3 | dist 4 | build 5 | *.egg-info 6 | *.whl 7 | -------------------------------------------------------------------------------- /.github/workflows/docker-image.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image CI 2 | 3 | on: 4 | push: 5 | branches: [ "master" ] 6 | workflow_dispatch: 7 | 8 | env: 9 | DOCKER_TAGNAME: ocrd/detectron2 10 | 11 | jobs: 12 | 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | permissions: 17 | packages: write 18 | contents: read 19 | 20 | steps: 21 | - uses: actions/checkout@v3 22 | - # Activate cache export feature to reduce build time of image 23 | name: Set up Docker Buildx 24 | uses: docker/setup-buildx-action@v2 25 | - name: Build the Docker image 26 | run: make docker DOCKER_TAG=${{ env.DOCKER_TAGNAME }} 27 | - name: Login to Dockerhub 28 | uses: docker/login-action@v2 29 | with: 30 | username: ${{ vars.DOCKERHUB_USERNAME }} 31 | password: ${{ secrets.DOCKERHUB_PASSWORD }} 32 | - name: Push image to Dockerhub 33 | run: docker push ${{ env.DOCKER_TAGNAME }} 34 | - name: Alias 
the Docker image for GHCR 35 | run: docker tag ${{ env.DOCKER_TAGNAME }} ghcr.io/bertsky/ocrd_detectron2 36 | - name: Login to GitHub Container Registry 37 | uses: docker/login-action@v2 38 | with: 39 | registry: ghcr.io 40 | username: ${{ github.actor }} 41 | password: ${{ secrets.GITHUB_TOKEN }} 42 | - name: Push image to Github Container Registry 43 | run: docker push ghcr.io/bertsky/ocrd_detectron2 44 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: CLI Tests 5 | 6 | on: 7 | push: 8 | branches: [ "master" ] 9 | pull_request: 10 | workflow_dispatch: 11 | inputs: 12 | upterm-session: 13 | description: 'Run SSH login server for debugging' 14 | default: False 15 | type: boolean 16 | 17 | jobs: 18 | build: 19 | 20 | runs-on: ubuntu-latest 21 | strategy: 22 | matrix: 23 | python-version: ['3.8', '3.9', '3.10'] 24 | 25 | steps: 26 | - uses: actions/checkout@v4 27 | - name: Set up Python 28 | uses: actions/setup-python@v5 29 | with: 30 | python-version: ${{ matrix.python-version }} 31 | - name: Lint with flake8 32 | run: | 33 | pip install flake8 34 | # stop the build if there are Python syntax errors or undefined names 35 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 36 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 37 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 38 | - name: Setup upterm session 39 | # interactive SSH logins for debugging 40 | if: github.event.inputs.upterm-session == 'true' 41 | uses: lhotari/action-upterm@v1 42 | - name: Install dependencies 43 | run: make deps 44 | - name: Install package 45 | run: make install 46 | - name: Cache models 47 | uses: actions/cache@v4 48 | with: 49 | key: detectron-models 50 | path: /home/runner/.local/share/ocrd-resources/ocrd-detectron2-segment/* 51 | - name: Install dependencies for test 52 | # also downloads models, if not already present 53 | run: make deps-test 54 | - name: Run tests 55 | run: make test 56 | - name: Upload test results 57 | if: matrix.python-version == '3.8' 58 | uses: actions/upload-artifact@v4 59 | with: 60 | name: test-results 61 | path: | 62 | ./test/assets/*/data/test-result 63 | ./test/assets/*/data/OCR-D-SEG-*/ 64 | if-no-files-found: error 65 | 66 | publish: 67 | permissions: 68 | # for stefanzweifel/git-auto-commit-action to push code on gh-pages 69 | contents: write 70 | needs: build 71 | runs-on: ubuntu-latest 72 | continue-on-error: true 73 | steps: 74 | - name: Checkout GH Pages 75 | uses: actions/checkout@v4 76 | with: 77 | ref: gh-pages 78 | - name: Download Artifact 79 | uses: actions/download-artifact@v4 80 | with: 81 | name: test-results 82 | path: test-results 83 | - name: Data Ingest 84 | run: bash gen-test-results-table.sh 85 | - name: Commit 86 | uses: stefanzweifel/git-auto-commit-action@v4 87 | with: 88 | commit_message: new test results from ${{ github.sha }} 89 | branch: gh-pages 90 | # file_pattern: test-results* 91 | repository: . 
92 | 93 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Distribution / packaging 7 | .Python 8 | build/ 9 | develop-eggs/ 10 | dist/ 11 | downloads/ 12 | eggs/ 13 | .eggs/ 14 | lib/ 15 | lib64/ 16 | parts/ 17 | sdist/ 18 | var/ 19 | wheels/ 20 | *.egg-info/ 21 | .installed.cfg 22 | *.egg 23 | MANIFEST 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .coverage.* 40 | .cache 41 | nosetests.xml 42 | coverage.xml 43 | *.cover 44 | .hypothesis/ 45 | .pytest_cache/ 46 | 47 | # vim tmp 48 | *.swp 49 | *.swo 50 | 51 | # emacs bkup 52 | *~ 53 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | extension-pkg-whitelist=cv2 3 | 4 | [MESSAGES CONTROL] 5 | disable = 6 | ungrouped-imports, 7 | bad-continuation, 8 | missing-docstring, 9 | no-self-use, 10 | superfluous-parens, 11 | invalid-name, 12 | line-too-long, 13 | too-many-arguments, 14 | too-many-branches, 15 | too-many-statements, 16 | too-many-locals, 17 | too-few-public-methods, 18 | too-many-nested-blocks, 19 | wrong-import-order, 20 | duplicate-code 21 | 22 | # allow non-snake-case identifiers: 23 | good-names=n,i 24 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to 
this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | ## [Unreleased] 8 | 9 | ## [0.2.0] - 2025-04-08 10 | 11 | ### Changed 12 | - updated to OCR-D v3 API 13 | - switched from `setup.py` to `pyproject.toml` 14 | (and `ocrd-tool.json` based versioning) 15 | - updated Dockerfile (base version, variables, labels, `ocrd-all-tool.json`) 16 | - updated CI 17 | 18 | ## [0.1.8] - 2023-06-29 19 | ### Fixed 20 | - workarounds for broken models (DocBank_X101, Jambo-sudo_X101) 21 | - `make deps`: add explicit reqs prior to pip step with Torch index 22 | - set `pc:PcGts/@pcGtsId` from `mets:file/@ID` 23 | 24 | ### Added 25 | - CI for CLI tests (with cached models and stored result artifacts) 26 | 27 | ### Changed 28 | - migrated model URLs from external to Github release assets 29 | 30 | ## [0.1.7] - 2023-03-20 31 | ### Fixed 32 | - adapt to Numpy 1.24 (no `np.bool`) 33 | 34 | ### Added 35 | - model by Jambo-sudo (PubLayNet+custom GT) 36 | - model by LayoutParser (PRImA Layout GT) 37 | - CLI tests 38 | 39 | ## [0.1.6] - 2023-03-10 40 | ### Fixed 41 | - avoid colon in generated region IDs 42 | - `make deps`: add explicit deps for torch 43 | - fix/update file resources 44 | - fix model config base paths on-the-fly 45 | 46 | ### Added 47 | - add Psarpei TD model 48 | 49 | ## [0.1.5] - 2023-01-15 50 | ### Fixed 51 | - param `debug_img`: 1 image per page 52 | - URLs/specs for PubLayNet/JPLeoRX models 53 | 54 | ## [0.1.4] - 2022-12-02 55 | ### Added 56 | - param `postprocessing` (select steps, including `none`) 57 | - param `debug_img` (styles to visualise raw predictions, including `none`) 58 | 59 | ## [0.1.3] - 2022-11-02 60 | ### Fixed 61 | - `make deps`: fall back to Detectron2 src build 62 | 63 | ### Changed 64 | - added various models as file resources 65 | - added corresponding preset files 66 | - 
updated documentation 67 | 68 | ## [0.1.2] - 2022-10-27 69 | ### Fixed 70 | - `make deps`: fix CUDA detection even more 71 | - apply `device` param as passed 72 | 73 | ### Changed 74 | - downscale images to no more than 150 DPI for prediction (for speed) 75 | - add param `operation_level` (default `page`), add `table` mode 76 | 77 | ## [0.1.1] - 2022-02-02 78 | ### Fixed 79 | - `make deps`: fix CUDA detection and allow CPU as fallback 80 | 81 | ### Changed 82 | - instance segmentation postprocessing: use asymmetric overlap 83 | criterion for non-maximum suppression 84 | - skip instances which belong to classes with empty category 85 | - annotate incrementally (by skipping candidates that overlap 86 | with pre-existing top-level regions) 87 | 88 | ## [0.1.0] - 2022-01-21 89 | 90 | 91 | [0.1.0]: ../../compare/aeca7e37...v0.1.0 92 | [0.1.1]: ../../compare/v0.1.0...v0.1.1 93 | [0.1.2]: ../../compare/v0.1.1...v0.1.2 94 | [0.1.3]: ../../compare/v0.1.2...v0.1.3 95 | [0.1.4]: ../../compare/v0.1.3...v0.1.4 96 | [0.1.5]: ../../compare/v0.1.4...v0.1.5 97 | [0.1.6]: ../../compare/v0.1.5...v0.1.6 98 | [0.1.7]: ../../compare/v0.1.6...v0.1.7 99 | [0.1.8]: ../../compare/v0.1.7...v0.1.8 100 | [0.2.0]: ../../compare/v0.1.8...v0.2.0 101 | [unreleased]: ../../compare/v0.2.0...master 102 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG DOCKER_BASE_IMAGE 2 | FROM $DOCKER_BASE_IMAGE 3 | ARG VCS_REF 4 | ARG BUILD_DATE 5 | LABEL \ 6 | maintainer="https://ocr-d.de/en/contact" \ 7 | org.label-schema.vcs-ref=$VCS_REF \ 8 | org.label-schema.vcs-url="https://github.com/bertsky/ocrd_detectron2" \ 9 | org.label-schema.build-date=$BUILD_DATE \ 10 | org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \ 11 | org.opencontainers.image.title="ocrd_detectron2" \ 12 | org.opencontainers.image.description="OCR-D wrapper 
for detectron2 based segmentation models" \ 13 | org.opencontainers.image.source="https://github.com/bertsky/ocrd_detectron2" \ 14 | org.opencontainers.image.documentation="https://github.com/bertsky/ocrd_detectron2/blob/${VCS_REF}/README.md" \ 15 | org.opencontainers.image.revision=$VCS_REF \ 16 | org.opencontainers.image.created=$BUILD_DATE \ 17 | org.opencontainers.image.base.name=ocrd/core-cuda-torch 18 | 19 | ENV DEBIAN_FRONTEND=noninteractive 20 | ENV PYTHONIOENCODING=utf8 21 | ENV LANG=C.UTF-8 22 | ENV LC_ALL=C.UTF-8 23 | 24 | # avoid HOME/.local/share (hard to predict USER here) 25 | # so let XDG_DATA_HOME coincide with fixed system location 26 | # (can still be overridden by derived stages) 27 | ENV XDG_DATA_HOME /usr/local/share 28 | # avoid the need for an extra volume for persistent resource user db 29 | # (i.e. XDG_CONFIG_HOME/ocrd/resources.yml) 30 | ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources 31 | 32 | WORKDIR /build/ocrd_detectron2 33 | 34 | COPY . . 35 | COPY ocrd-tool.json . 
36 | # prepackage ocrd-tool.json as ocrd-all-tool.json 37 | RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json 38 | # prepackage ocrd-all-module-dir.json 39 | RUN ocrd ocrd-tool ocrd-tool.json dump-module-dirs > $(dirname $(ocrd bashlib filename))/ocrd-all-module-dir.json 40 | # install everything and reduce image size 41 | RUN apt-get install -y --no-install-recommends g++ && \ 42 | make deps && \ 43 | make install && \ 44 | rm -rf /build/ocrd_detectron2 && \ 45 | apt-get -y remove --auto-remove g++ 46 | 47 | WORKDIR /data 48 | VOLUME /data 49 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include ocrd-tool.json 2 | include README.md 3 | include requirements.txt 4 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PYTHON = python3 2 | PIP = pip3 3 | PYTHONIOENCODING=utf8 4 | SHELL = /bin/bash 5 | 6 | # Docker container tag 7 | DOCKER_BASE_IMAGE ?= docker.io/ocrd/core-cuda-torch:latest 8 | DOCKER_TAG ?= 'ocrd/detectron2' 9 | DOCKER ?= docker 10 | 11 | help: 12 | @echo 13 | @echo " Targets" 14 | @echo 15 | @echo " deps Install only Python dependencies via pip" 16 | @echo " install Install full Python package via pip" 17 | @echo " install-dev Install full Python package via pip" 18 | @echo " deps-test Install Python dependencies for tests via pip and models via resmgr" 19 | @echo " test Run regression tests" 20 | @echo " build Build Python package as source and wheel distribution" 21 | @echo " clean Remove symlinks in test/assets" 22 | @echo " docker Build Docker image" 23 | @echo 24 | @echo " Variables" 25 | @echo " PYTHON" 26 | @echo " CUDA_VERSION override detection of CUDA runtime version (e.g. 
'11.3' or 'CPU')" 27 | @echo " DOCKER_TAG Docker image tag of result for the docker target" 28 | 29 | # Install Python deps via pip 30 | # There is no prebuilt for detectron2 on PyPI, and the public wheels depend on CUDA and Torch version. 31 | # See https://github.com/facebookresearch/detectron2/blob/main/INSTALL.md#install-pre-built-detectron2 32 | # and https://github.com/facebookresearch/detectron2/issues/969 33 | # While there is a web site which lists them, which works with `pip install -f`, this unfortunately cannot 34 | # be encapsulated via setuptools, see https://github.com/pypa/pip/issues/5898 35 | # and https://stackoverflow.com/questions/3472430/how-can-i-make-setuptools-install-a-package-thats-not-on-pypi 36 | # and https://github.com/pypa/pip/issues/4187 37 | # Detectron2 requires Torch >=1.10 and <1.11, which is quite out of date now. 38 | # Also, the prebuilt versions on https://dl.fbaipublicfiles.com/detectron2/wheels/*/torch1.10/index.html 39 | # are only available for CUDA 10.1, 10.2, 11.1, 11.3 or CPU. 40 | # Moreover, even Torch >=1.10 and <1.11 is not available on https://download.pytorch.org/whl/torch/ 41 | # except for a narrow few CUDA versions. 42 | # To make matters worse, Detectron2 setup fails specifying Torch as build-time and run-time dependency: 43 | # https://github.com/facebookresearch/detectron2/issues/4472 44 | # Therefore, source build of Detectron2 fails unless Torch is already installed before _and_ using 45 | # pip install --no-build-isolation. 46 | # Finally, due to https://github.com/pypa/pip/issues/4321, we cannot even mix -f links and pkgindex (for PyTorch versions) 47 | # because pip will (more or less) randomly pick the one or the other. 48 | # Detectron2 must always have the same version of Torch at runtime which it was compiled against. 
49 | deps: 50 | @$(PIP) install -r <(sed "/torch/d;/detectron2/d" requirements.txt) 51 | @if test -n "$$CUDA_VERSION"; then :; \ 52 | elif test -s /usr/local/cuda/version.txt; then \ 53 | CUDA_VERSION=$$(sed 's/^.* //;s/\([0-9]\+[.][0-9]\).*/\1/' /usr/local/cuda/version.txt); \ 54 | elif command -v nvcc &>/dev/null; then \ 55 | CUDA_VERSION=$$(nvcc --version | sed -n '/^Cuda/{s/.* release //;s/,.*//;p;}'); \ 56 | elif command -v nvidia-smi &>/dev/null; then \ 57 | CUDA_VERSION=$$(nvidia-smi | sed -n '/CUDA Version/{s/.*CUDA Version: //;s/ .*//;p;}'); \ 58 | elif command -v pkg-config &>/dev/null; then \ 59 | CUDA_VERSION=$$(pkg-config --list-all | sed -n '/^cudart/{s/cudart-//;s/ .*//;p;q;}'); \ 60 | fi && \ 61 | if test -z "$$CUDA_VERSION"; then \ 62 | echo "Cannot find CUDA runtime library, assuming CPU-only"; CUDA_VERSION=CPU; \ 63 | fi && echo "Detected CUDA version: $$CUDA_VERSION" && \ 64 | if test "$$CUDA_VERSION" = CPU; then CUDA=cpu; \ 65 | else IFS=. CUDA=($$CUDA_VERSION) && CUDA=cu$${CUDA[0]}$${CUDA[1]}; \ 66 | fi && \ 67 | $(PIP) install -i "https://download.pytorch.org/whl/$$CUDA" \ 68 | -r <(sed -n "/torch/p" requirements.txt) && \ 69 | $(PIP) install --no-build-isolation "git+https://github.com/facebookresearch/detectron2#egg=detectron2" 70 | 71 | # Install Python package via pip 72 | install: deps 73 | $(PIP) install . 74 | 75 | # Install Python package via pip 76 | install-dev: deps 77 | $(PIP) install -e . 78 | 79 | # Install testing python deps via pip 80 | deps-test: models-test 81 | $(PIP) install -r requirements-test.txt 82 | 83 | build: 84 | $(PIP) install build 85 | $(PYTHON) -m build . 
86 | 87 | # Clone OCR-D/assets to ./repo/assets 88 | repo/assets: 89 | @mkdir -p $(@D) 90 | git clone https://github.com/OCR-D/assets $@ 91 | 92 | # Setup test data 93 | test/assets: repo/assets 94 | @mkdir -p $@ 95 | cp -r -t $@ repo/assets/data/* 96 | 97 | # Remove test data copies and intermediate results 98 | clean: 99 | -$(RM) -r test/assets 100 | 101 | # Build docker image 102 | docker: 103 | $(DOCKER) build \ 104 | --build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \ 105 | --build-arg VCS_REF=$$(git rev-parse --short HEAD) \ 106 | --build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ 107 | -t $(DOCKER_TAG) . 108 | 109 | #MODELDIR := $(or $(XDG_DATA_HOME),$(HOME)/.local/share)/ocrd-resources/ocrd-detectron2-segment 110 | 111 | TESTMODEL := TableBank_X152_Psarpei 112 | TESTMODEL += DocBank_X101 113 | TESTMODEL += Jambo-sudo_X101 114 | TESTMODEL += PRImALayout_R50 115 | 116 | TESTBED := gutachten 117 | TESTBED += column-samples 118 | 119 | models-test: $(TESTMODEL:=.yaml) 120 | models-test: $(TESTMODEL:=.pth) 121 | 122 | %.yaml: 123 | ocrd resmgr download ocrd-detectron2-segment $@ 124 | %.pth: 125 | ocrd resmgr download ocrd-detectron2-segment $@ 126 | 127 | test: $(patsubst %,test/assets/%/data/test-result,$(TESTBED)) 128 | @cat $^ 129 | 130 | count-regions := python -c "import sys; from ocrd_models.ocrd_page import parse; print('%s: %d' % (sys.argv[1], len(parse(sys.argv[1], silence=True).get_Page().get_AllRegions())))" 131 | 132 | %/test-result: test/assets 133 | for MODEL in $(TESTMODEL); do \ 134 | $(MAKE) MODEL=$$MODEL $*/OCR-D-SEG-$$MODEL; \ 135 | done 136 | @shopt -s nullglob; { for file in $(TESTMODEL:%=$*/OCR-D-SEG-%/*.xml); do \ 137 | $(count-regions) $$file; \ 138 | done; } > $@ 139 | 140 | %/OCR-D-BIN: 141 | cd $(@D) && ocrd-skimage-binarize -I `grp=(*IMG); basename $$grp` -O $(@F) 142 | 143 | # workaround for OCR-D/core#930: 144 | %/OCR-D-SEG-$(MODEL): PRESET = $(shell ocrd-detectron2-segment -D)/presets_$(MODEL).json 145 | 146 | 
%/OCR-D-SEG-$(MODEL): %/OCR-D-BIN 147 | cd $(@D) && ocrd-detectron2-segment -I $( Use detectron2 to segment each page into regions. 71 | 72 | > Open and deserialize PAGE input files and their respective images. 73 | > Fetch a raw and a binarized image for the page frame (possibly 74 | > cropped and deskewed). 75 | 76 | > Feed the raw image into the detectron2 predictor that has been used 77 | > to load the given model. Then, depending on the model capabilities 78 | > (whether it can do panoptic segmentation or only instance 79 | > segmentation, whether the latter can do masks or only bounding 80 | > boxes), post-process the predictions: 81 | 82 | > - panoptic segmentation: take the provided segment label map, and 83 | > apply the segment to class label map, 84 | > - instance segmentation: find an optimal non-overlapping set (flat 85 | > map) of instances via non-maximum suppression, 86 | > - both: avoid overlapping pre-existing top-level regions (incremental 87 | > segmentation). 88 | 89 | > Then extend / shrink the surviving masks to fully include / exclude 90 | > connected components in the foreground that are on the boundary. 91 | 92 | > (This describes the steps when ``postprocessing`` is `full`. A value 93 | > of `only-nms` will omit the morphological extension/shrinking, while 94 | > `only-morph` will omit the non-maximum suppression, and `none` will 95 | > skip all postprocessing.) 96 | 97 | > Finally, find the convex hull polygon for each region, and map its 98 | > class id to a new PAGE region type (and subtype). 99 | 100 | > (Does not annotate `ReadingOrder` or `TextLine`s or `@orientation`.) 101 | 102 | > Produce a new output file by serialising the resulting hierarchy. 
103 | 104 | Options: 105 | -I, --input-file-grp USE File group(s) used as input 106 | -O, --output-file-grp USE File group(s) used as output 107 | -g, --page-id ID Physical page ID(s) to process 108 | --overwrite Remove existing output pages/images 109 | (with --page-id, remove only those) 110 | --profile Enable profiling 111 | --profile-file Write cProfile stats to this file. Implies --profile 112 | -p, --parameter JSON-PATH Parameters, either verbatim JSON string 113 | or JSON file path 114 | -P, --param-override KEY VAL Override a single JSON object key-value pair, 115 | taking precedence over --parameter 116 | -m, --mets URL-PATH URL or file path of METS to process 117 | -w, --working-dir PATH Working directory of local workspace 118 | -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE] 119 | Log level 120 | -C, --show-resource RESNAME Dump the content of processor resource RESNAME 121 | -L, --list-resources List names of processor resources 122 | -J, --dump-json Dump tool description as JSON and exit 123 | -D, --dump-module-dir Output the 'module' directory with resources for this processor 124 | -h, --help This help message 125 | -V, --version Show version 126 | 127 | Parameters: 128 | "operation_level" [string - "page"] 129 | hierarchy level which to predict and assign regions for 130 | Possible values: ["page", "table"] 131 | "categories" [array - REQUIRED] 132 | maps each category (class index) of the model to a PAGE region 133 | type (and @type or @custom if separated by colon), e.g. 
134 | ['TextRegion:paragraph', 'TextRegion:heading', 135 | 'TextRegion:floating', 'TableRegion', 'ImageRegion'] for PubLayNet; 136 | categories with an empty string will be skipped during prediction 137 | "model_config" [string - REQUIRED] 138 | path name of model config 139 | "model_weights" [string - REQUIRED] 140 | path name of model weights 141 | "min_confidence" [number - 0.5] 142 | confidence threshold for detections 143 | "postprocessing" [string - "full"] 144 | which postprocessing steps to enable: by default, applies a custom 145 | non-maximum suppression (to avoid overlaps) and morphological 146 | operations (using connected component analysis on the binarized 147 | input image to shrink or expand regions) 148 | Possible values: ["full", "only-nms", "only-morph", "none"] 149 | "debug_img" [string - "none"] 150 | paint an AlternativeImage which blends the input image 151 | and all raw decoded region candidates 152 | Possible values: ["none", "instance_colors", "instance_colors_only", "category_colors"] 153 | "device" [string - "cuda"] 154 | select computing device for Torch (e.g. 
cpu or cuda:0); will fall 155 | back to CPU if no GPU is available 156 | ``` 157 | 158 | Example: 159 | 160 | # download one preconfigured model: 161 | ocrd resmgr download ocrd-detectron2-segment TableBank_X152.yaml 162 | ocrd resmgr download ocrd-detectron2-segment TableBank_X152.pth 163 | # run it (setting model_config, model_weights and categories): 164 | ocrd-detectron2-segment -I OCR-D-BIN -O OCR-D-SEG-TAB -P categories '["TableRegion"]' -P model_config TableBank_X152.yaml -P model_weights TableBank_X152.pth -P min_confidence 0.1 165 | # run it (equivalent, with presets file) 166 | ocrd-detectron2-segment -I OCR-D-BIN -O OCR-D-SEG-TAB -p presets_TableBank_X152.json -P min_confidence 0.1 167 | # download all preconfigured models 168 | ocrd resmgr download ocrd-detectron2-segment "*" 169 | 170 | For installation **via Docker**, usage is basically the same as above – with some modifications: 171 | 172 | # For data persistence, decide which host-side directories you want to mount in Docker: 173 | DATADIR=/host-side/path/to/data 174 | MODELDIR=/host-side/path/to/models 175 | # Either you "log in" to a container first: 176 | docker run -v $DATADIR:/data -v $MODELDIR:/usr/local/share/ocrd-resources -it bertsky/ocrd_detectron2 bash 177 | # and then can use the above commands verbatim 178 | ... 179 | # Or you spin up a new container each time, 180 | # which means prefixing the above commands with 181 | docker run -v $DATADIR:/data -v $MODELDIR:/usr/local/share/ocrd-resources bertsky/ocrd_detectron2 ... 182 | 183 | 184 | #### Debugging 185 | 186 | If you mistrust your model, and/or this tool's additional postprocessing, 187 | try playing with the runtime parameters: 188 | 189 | - Set `debug_img` to some value other than `none`, e.g. `instance_colors_only`. 190 | This will generate an image which overlays the raw predictions with the raw image 191 | using Detectron2's internal visualiser. 
The parameter settings correspond to its 192 | [ColorMode](https://detectron2.readthedocs.io/en/latest/modules/utils.html#detectron2.utils.visualizer.ColorMode). 193 | The AlternativeImages will have `@comments="debug"`, and will also be referenced in the METS, 194 | which allows convenient browsing with [OCR-D Browser](https://github.com/hnesk/browse-ocrd). 195 | (For example, open the Page View and Image View side by side, and navigate to your output 196 | fileGrp on each.) 197 | - Selectively disable postprocessing steps: from the default `full` via `only-nms` (first stage) 198 | or `only-morph` (second stage) to `none`. 199 | - Lower `min_confidence` to get more candidates, raise to get fewer. 200 | 201 | ## Models 202 | 203 | Some of the following models have already been registered as known [file resources](https://ocr-d.de/en/spec/cli#processor-resources), along with parameter presets to use them conveniently. 204 | 205 | To get a list of registered models **available for download**, do: 206 | 207 | ocrd resmgr list-available -e ocrd-detectron2-segment 208 | 209 | To get a list of **already installed** models and presets, do: 210 | 211 | ocrd resmgr list-installed -e ocrd-detectron2-segment 212 | 213 | To **download** a registered model (i.e. a config file and the respective weights file), do: 214 | 215 | ocrd resmgr download ocrd-detectron2-segment NAME.yaml 216 | ocrd resmgr download ocrd-detectron2-segment NAME.pth 217 | 218 | To download more models (registered or other), see: 219 | 220 | ocrd resmgr download --help 221 | 222 | To **use** a model, do: 223 | 224 | ocrd-detectron2-segment -P model_config NAME.yaml -P model_weights NAME.pth -P categories '[...]' ... 225 | ocrd-detectron2-segment -p NAME.json ... # equivalent, with presets file 226 | 227 | To add (i.e. 
register) a **new model**, you first have to find: 228 | - the classes it is trained on, so you can then define a mapping to PAGE-XML region (and subregion) types, 229 | - a download link to the model config and model weights file. 230 | Archives (zip/tar) are allowed, but then you must also specify the file paths to extract. 231 | 232 | Assuming you have done so, then proceed as follows: 233 | 234 | # from local file path 235 | ocrd resmgr download -n path/to/model/config.yml ocrd-detectron2-segment NAME.yml 236 | ocrd resmgr download -n path/to/model/weights.pth ocrd-detectron2-segment NAME.pth 237 | # from single file URL 238 | ocrd resmgr download -n https://path.to/model/config.yml ocrd-detectron2-segment NAME.yml 239 | ocrd resmgr download -n https://path.to/model/weights.pth ocrd-detectron2-segment NAME.pth 240 | # from zip file URL 241 | ocrd resmgr download -n https://path.to/model/arch.zip -t archive -P zip-path/to/config.yml ocrd-detectron2-segment NAME.yml 242 | ocrd resmgr download -n https://path.to/model/arch.zip -t archive -P zip-path/to/weights.pth ocrd-detectron2-segment NAME.pth 243 | # create corresponding preset file 244 | echo '{"model_weights": "NAME.pth", "model_config": "NAME.yml", "categories": [...]}' > NAME.json 245 | # install preset file so it can be used everywhere (not just in CWD): 246 | ocrd resmgr download -n NAME.json ocrd-detectron2-segment NAME.json 247 | # now the new model can be used just like the preregistered models 248 | ocrd-detectron2-segment -p NAME.json ... 249 | 250 | 251 | What follows is an **overview** of the **preregistered** models (i.e. available via `resmgr`). 252 | 253 | > **Note**: These are just examples, no exhaustive search was done yet! 254 | 255 | > **Note**: The filename suffix (.pth vs .pkl) of the weight file does matter! 
256 | 257 | ### [TableBank](https://github.com/doc-analysis/TableBank) 258 | 259 | X152-FPN [config](https://layoutlm.blob.core.windows.net/tablebank/model_zoo/detection/All_X152/All_X152.yaml)|[weights](https://layoutlm.blob.core.windows.net/tablebank/model_zoo/detection/All_X152/model_final.pth)|`["TableRegion"]` 260 | 261 | ### [TableBank](https://github.com/Psarpei/Multi-Type-TD-TSR) 262 | 263 | X152-FPN [config](https://drive.google.com/drive/folders/1COTV5f7dEAA4Txmxy3LVfcNHiPSc4Bmp?usp=sharing)|[weights](https://drive.google.com/drive/folders/1COTV5f7dEAA4Txmxy3LVfcNHiPSc4Bmp?usp=sharing)|`["TableRegion"]` 264 | 265 | ### [PubLayNet](https://github.com/hpanwar08/detectron2) 266 | 267 | R50-FPN [config](https://github.com/hpanwar08/detectron2/raw/master/configs/DLA_mask_rcnn_R_50_FPN_3x.yaml)|[weights](https://www.dropbox.com/sh/44ez171b2qaocd2/AAB0huidzzOXeo99QdplZRjua)|`["TextRegion:paragraph", "TextRegion:heading", "TextRegion:floating", "TableRegion", "ImageRegion"]` 268 | 269 | R101-FPN [config](https://github.com/hpanwar08/detectron2/raw/master/configs/DLA_mask_rcnn_R_101_FPN_3x.yaml)|[weights](https://www.dropbox.com/sh/wgt9skz67usliei/AAD9n6qbsyMz1Y3CwpZpHXCpa)|`["TextRegion:paragraph", "TextRegion:heading", "TextRegion:floating", "TableRegion", "ImageRegion"]` 270 | 271 | X101-FPN [config](https://github.com/hpanwar08/detectron2/raw/master/configs/DLA_mask_rcnn_X_101_32x8d_FPN_3x.yaml)|[weights](https://www.dropbox.com/sh/1098ym6vhad4zi6/AABe16eSdY_34KGp52W0ruwha)|`["TextRegion:paragraph", "TextRegion:heading", "TextRegion:floating", "TableRegion", "ImageRegion"]` 272 | 273 | ### [PubLayNet](https://github.com/JPLeoRX/detectron2-publaynet) 274 | 275 | R50-FPN [config](https://github.com/facebookresearch/detectron2/blob/main/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml)|[weights](https://drive.google.com/file/d/1IbxaRd82hIrxPT4a1U61_g2vvE3zcRLO/view?usp=sharing)|`["TextRegion:paragraph", "TextRegion:heading", "TextRegion:floating", 
"TableRegion", "ImageRegion"]` 276 | 277 | R101-FPN [config](https://github.com/facebookresearch/detectron2/blob/main/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml)|[weights](https://drive.google.com/file/d/17MD-FegQtFRNn4GeHqKCLaQZ6FiFrzLg/view?usp=sharing)|`["TextRegion:paragraph", "TextRegion:heading", "TextRegion:floating", "TableRegion", "ImageRegion"]` 278 | 279 | ### [LayoutParser](https://github.com/Layout-Parser/layout-parser/blob/master/src/layoutparser/models/detectron2/catalog.py) 280 | 281 | provides different model variants of various depths for multiple datasets: 282 | - [PubLayNet](https://github.com/ibm-aur-nlp/PubLayNet) (Medical Research Papers) 283 | - [TableBank](https://doc-analysis.github.io/tablebank-page/index.html) (Tables Computer Typesetting) 284 | - [PRImALayout](https://www.primaresearch.org/dataset/) (Various Computer Typesetting) 285 | R50-FPN [config](https://www.dropbox.com/s/yc92x97k50abynt/config.yaml?dl=1)|[weights](https://www.dropbox.com/s/h7th27jfv19rxiy/model_final.pth?dl=1)|`["Background","TextRegion","ImageRegion","TableRegion","MathsRegion","SeparatorRegion","LineDrawingRegion"]` 286 | - [HJDataset](https://dell-research-harvard.github.io/HJDataset/) (Historical Japanese Magazines) 287 | - [NewspaperNavigator](https://news-navigator.labs.loc.gov/) (Historical Newspapers) 288 | - [Math Formula Detection](http://transcriptorium.eu/~htrcontest/MathsICDAR2021/) 289 | 290 | See [here](https://github.com/Layout-Parser/layout-parser/blob/master/docs/notes/modelzoo.md) for an overview, 291 | and [here](https://github.com/Layout-Parser/layout-parser/blob/main/src/layoutparser/models/detectron2/catalog.py) for the model files. 292 | You will have to adapt the label map to conform to [PAGE-XML](https://github.com/PRImA-Research-Lab/PAGE-XML) 293 | region (sub)types accordingly. 
294 | 295 | ### [PubLayNet finetuning](https://github.com/Jambo-sudo/Historical-document-layout-analysis) 296 | 297 | (pre-trained on PubLayNet, fine-tuned on a custom, non-public GT corpus of 500 pages 20th century magazines) 298 | 299 | X101-FPN [config](https://github.com/Jambo-sudo/Historical-document-layout-analysis/raw/main/historical-document-analysis/DLA_mask_rcnn_X_101_32x8d_FPN_3x.yaml)|[weights](https://www.dropbox.com/s/hfhsdpvg7jesd4g/pub_model_final.pth?dl=1)|`["TextRegion:caption","ImageRegion","TextRegion:page-number","TableRegion","TextRegion:heading","TextRegion:paragraph"]` 300 | 301 | ### [DocBank](https://github.com/doc-analysis/DocBank/blob/master/MODEL_ZOO.md) 302 | 303 | X101-FPN [archive](https://layoutlm.blob.core.windows.net/docbank/model_zoo/X101.zip) 304 | 305 | Proposed mappings: 306 | - `["TextRegion:header", "TextRegion:credit", "TextRegion:caption", "TextRegion:other", "MathsRegion", "GraphicRegion", "TextRegion:footer", "TextRegion:floating", "TextRegion:paragraph", "TextRegion:endnote", "TextRegion:heading", "TableRegion", "TextRegion:heading"]` (using only predefined `@type`) 307 | - `["TextRegion:abstract", "TextRegion:author", "TextRegion:caption", "TextRegion:date", "MathsRegion", "GraphicRegion", "TextRegion:footer", "TextRegion:list", "TextRegion:paragraph", "TextRegion:reference", "TextRegion:heading", "TableRegion", "TextRegion:title"]` (using `@custom` as well) 308 | 309 | ## Testing 310 | 311 | To install Python dependencies and download some models: 312 | 313 | make deps-test 314 | 315 | Which is the equivalent of: 316 | 317 | pip install -r requirements-test.txt 318 | make models-test 319 | 320 | To run the tests, then do: 321 | 322 | make test 323 | 324 | You can inspect the results under `test/assets/*/data` under various new `OCR-D-SEG-*` fileGrps. 325 | (Again, it is recommended to use [OCR-D Browser](https://github.com/hnesk/browse-ocrd).) 
326 | 327 | Finally, to remove the test data, do: 328 | 329 | make clean 330 | 331 | ### Test results 332 | 333 | These tests are integrated as a [Github Action](https://github.com/bertsky/ocrd_detectron2/actions/workflows/python-app.yml). Its results can be viewed [here](https://bertsky.github.io/ocrd_detectron2/test-results). 334 | -------------------------------------------------------------------------------- /ocrd-tool.json: -------------------------------------------------------------------------------- 1 | ocrd_detectron2/ocrd-tool.json -------------------------------------------------------------------------------- /ocrd_detectron2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bertsky/ocrd_detectron2/6ff0a12bc552dc1aeaa63ea450d951ad58099f0c/ocrd_detectron2/__init__.py -------------------------------------------------------------------------------- /ocrd_detectron2/cli.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor 4 | from .segment import Detectron2Segment 5 | 6 | @click.command() 7 | @ocrd_cli_options 8 | def ocrd_detectron2_segment(*args, **kwargs): 9 | return ocrd_cli_wrap_processor(Detectron2Segment, *args, **kwargs) 10 | -------------------------------------------------------------------------------- /ocrd_detectron2/ocrd-tool.json: -------------------------------------------------------------------------------- 1 | { 2 | "git_url": "https://github.com/bertsky/ocrd_detectron2", 3 | "dockerhub": "ocrd/detectron2", 4 | "version": "0.2.0", 5 | "tools": { 6 | "ocrd-detectron2-segment": { 7 | "executable": "ocrd-detectron2-segment", 8 | "categories": ["Layout analysis"], 9 | "steps": ["layout/segmentation/region"], 10 | "description": "Detect regions with Detectron2 models", 11 | "input_file_grp_cardinality": 1, 12 | 
"output_file_grp_cardinality": 1, 13 | "parameters": { 14 | "operation_level": { 15 | "type": "string", 16 | "enum": ["page", "table"], 17 | "default": "page", 18 | "description": "hierarchy level which to predict and assign regions for" 19 | }, 20 | "categories": { 21 | "type": "array", 22 | "required": true, 23 | "description": "maps each region category (position) of the model to a PAGE region type (and @type or @custom if separated by colon), e.g. ['TextRegion:paragraph', 'TextRegion:heading', 'TextRegion:floating', 'TableRegion', 'ImageRegion'] for PubLayNet; categories with an empty string will be skipped during prediction" 24 | }, 25 | "model_config": { 26 | "type": "string", 27 | "format": "uri", 28 | "content-type": "text/yaml", 29 | "required": true, 30 | "description": "path name of model config" 31 | }, 32 | "model_weights": { 33 | "type": "string", 34 | "format": "uri", 35 | "content-type": "application/octet-stream", 36 | "required": true, 37 | "description": "path name of model weights" 38 | }, 39 | "min_confidence": { 40 | "type": "number", 41 | "format": "float", 42 | "default": 0.5, 43 | "description": "confidence threshold for detections" 44 | }, 45 | "postprocessing": { 46 | "type": "string", 47 | "enum": ["full", "only-nms", "only-morph", "none"], 48 | "default": "full", 49 | "description": "which postprocessing steps to enable: by default, applies a custom non-maximum suppression (to avoid overlaps) and morphological operations (using connected component analysis on the binarized input image to shrink or expand regions)" 50 | }, 51 | "debug_img": { 52 | "type": "string", 53 | "enum": ["none", "instance_colors", "instance_colors_only", "category_colors"], 54 | "default": "none", 55 | "description": "paint an AlternativeImage which blends the input image and all raw decoded region candidates" 56 | }, 57 | "device": { 58 | "type": "string", 59 | "default": "cuda", 60 | "description": "select computing device for Torch (e.g. 
cpu or cuda:0); will fall back to CPU if no GPU is available" 61 | } 62 | }, 63 | "resources": [ 64 | { 65 | "description": "TableBank via LayoutLM X152-FPN config", 66 | "name": "TableBank_X152.yaml", 67 | "size": 536, 68 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/TableBank_X152.yaml" 69 | }, 70 | { 71 | "description": "TableBank via LayoutLM X152-FPN weights", 72 | "name": "TableBank_X152.pth", 73 | "size": 1103832675, 74 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/TableBank_X152.pth" 75 | }, 76 | { 77 | "description": "TableBank via Psarpei X152-FPN config", 78 | "name": "TableBank_X152_Psarpei.yaml", 79 | "size": 534, 80 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/TableBank_X152_Psarpei.yaml" 81 | }, 82 | { 83 | "description": "TableBank via Psarpei X152-FPN weights", 84 | "name": "TableBank_X152_Psarpei.pth", 85 | "size": 1103832675, 86 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/TableBank_X152_Psarpei.pth" 87 | }, 88 | { 89 | "description": "PubLayNet via hpanwar08 R50-FPN config", 90 | "name": "PubLayNet_R_50_FPN_3x.yaml", 91 | "size": 388, 92 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/PubLayNet_R_50_FPN_3x.yaml" 93 | }, 94 | { 95 | "description": "PubLayNet via hpanwar08 R50-FPN weights", 96 | "name": "PubLayNet_R_50_FPN_3x.pth", 97 | "size": 176249718, 98 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/PubLayNet_R_50_FPN_3x.pth" 99 | }, 100 | { 101 | "description": "PubLayNet via hpanwar08 R101-FPN config", 102 | "name": "PubLayNet_R_101_FPN_3x.yaml", 103 | "size": 392, 104 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/PubLayNet_R_101_FPN_3x.yaml" 105 | }, 106 | { 107 | "description": "PubLayNet via hpanwar08 R101-FPN weights", 108 | "name": "PubLayNet_R_101_FPN_3x.pth", 109 | "size": 503147199, 110 | "url": 
"https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/PubLayNet_R_101_FPN_3x.pth" 111 | }, 112 | { 113 | "description": "PubLayNet via hpanwar08 X101-FPN config", 114 | "name": "PubLayNet_X_101_32x8d_FPN_3x.yaml", 115 | "size": 592, 116 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/PubLayNet_X_101_32x8d_FPN_3x.yaml" 117 | }, 118 | { 119 | "description": "PubLayNet via hpanwar08 X101-FPN weights", 120 | "name": "PubLayNet_X_101_32x8d_FPN_3x.pth", 121 | "size": 429840864, 122 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/PubLayNet_X_101_32x8d_FPN_3x.pth" 123 | }, 124 | { 125 | "description": "PubLayNet via JPLeoRX R50-FPN config", 126 | "name": "PubLayNet_R_50_FPN_3x_JPLeoRX.yaml", 127 | "size": 388, 128 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/PubLayNet_R_50_FPN_3x_JPLeoRX.yaml" 129 | }, 130 | { 131 | "description": "PubLayNet via JPLeoRX R50-FPN weights", 132 | "name": "PubLayNet_R_50_FPN_3x_JPLeoRX.pth", 133 | "size": 176299422, 134 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/PubLayNet_R_50_FPN_3x_JPLeoRX.pth" 135 | }, 136 | { 137 | "description": "PubLayNet via JPLeoRX R101-FPN config", 138 | "name": "PubLayNet_R_101_FPN_3x_JPLeoRX.yaml", 139 | "size": 392, 140 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/PubLayNet_R_101_FPN_3x_JPLeoRX.yaml" 141 | }, 142 | { 143 | "description": "PubLayNet via JPLeoRX R101-FPN weights", 144 | "name": "PubLayNet_R_101_FPN_3x_JPLeoRX.pth", 145 | "size": 252572745, 146 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/PubLayNet_R_101_FPN_3x_JPLeoRX.pth" 147 | }, 148 | { 149 | "description": "Modern Magazines via Jambo-sudo X101-FPN (pre-trained on PubLayNet, fine-tuned on 500 p. 20th cent. 
magazines) config", 150 | "name": "Jambo-sudo_X101.yaml", 151 | "size": 592, 152 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/Jambo-sudo_X101.yaml" 153 | }, 154 | { 155 | "description": "Modern Magazines via Jambo-sudo X101-FPN (pre-trained on PubLayNet, fine-tuned on 500 p. 20th cent. magazines) weights", 156 | "name": "Jambo-sudo_X101.pth", 157 | "size": 856430002, 158 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/Jambo-sudo_X101.pth" 159 | }, 160 | { 161 | "description": "PRImALayout via LayoutLM R50-FPN config", 162 | "name": "PRImALayout_R50.yaml", 163 | "size": 934, 164 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/PRImALayout_R50.yaml" 165 | }, 166 | { 167 | "description": "PRImALayout via LayoutLM R50-FPN weights", 168 | "name": "PRImALayout_R50.pth", 169 | "size": 351229486, 170 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/PRImALayout_R50.pth" 171 | }, 172 | { 173 | "description": "DocBank via LayoutLM X101-FPN config", 174 | "name": "DocBank_X101.yaml", 175 | "size": 523, 176 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/DocBank_X101.yaml" 177 | }, 178 | { 179 | "description": "DocBank via LayoutLM X101-FPN weights", 180 | "name": "DocBank_X101.pth", 181 | "size": 835606605, 182 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/DocBank_X101.pth" 183 | }, 184 | { 185 | "description": "NewspaperNavigator via LayoutParser R50-PanopticFPN config", 186 | "name": "NewspaperNavigator_R_50_PFPN_3x.yaml", 187 | "size": 330226761, 188 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/NewspaperNavigator_R_50_PFPN_3x.yaml" 189 | }, 190 | { 191 | "description": "NewspaperNavigator via LayoutParser R50-PanopticFPN weights", 192 | "name": "NewspaperNavigator_R_50_PFPN_3x.pth", 193 | "size": 330226761, 194 | "url": 
"https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/NewspaperNavigator_R_50_PFPN_3x.pth" 195 | }, 196 | { 197 | "description": "MathFormulaDetection via LayoutParser R50-FPN config", 198 | "name": "Math_R_50_FPN_3x.yaml", 199 | "size": 5632, 200 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/Math_R_50_FPN_3x.yaml" 201 | }, 202 | { 203 | "description": "MathFormulaDetection via LayoutParser R50-FPN weights", 204 | "name": "Math_R_50_FPN_3x.pth", 205 | "size": 330084629, 206 | "url": "https://github.com/bertsky/ocrd_detectron2/releases/download/v0.1.7/Math_R_50_FPN_3x.pth" 207 | } 208 | ] 209 | } 210 | } 211 | } 212 | -------------------------------------------------------------------------------- /ocrd_detectron2/presets_DocBank_X101.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_config": "DocBank_X101.yaml", 3 | "model_weights": "DocBank_X101.pth", 4 | "categories": [ 5 | "TextRegion:abstract", 6 | "TextRegion:author", 7 | "TextRegion:caption", 8 | "TextRegion:date", 9 | "MathsRegion", 10 | "GraphicRegion", 11 | "TextRegion:footer", 12 | "TextRegion:list", 13 | "TextRegion:paragraph", 14 | "TextRegion:reference", 15 | "TextRegion:heading", 16 | "TableRegion", 17 | "TextRegion:title" 18 | ] 19 | } -------------------------------------------------------------------------------- /ocrd_detectron2/presets_DocBank_X101_page.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_config": "DocBank_X101.yaml", 3 | "model_weights": "DocBank_X101.pth", 4 | "categories": [ 5 | "TextRegion:header", 6 | "TextRegion:credit", 7 | "TextRegion:caption", 8 | "TextRegion:other", 9 | "MathsRegion", 10 | "GraphicRegion", 11 | "TextRegion:footer", 12 | "TextRegion:floating", 13 | "TextRegion:paragraph", 14 | "TextRegion:endnote", 15 | "TextRegion:heading", 16 | "TableRegion", 17 | "TextRegion:heading" 18 | ] 19 | } 20 | 
-------------------------------------------------------------------------------- /ocrd_detectron2/presets_Jambo-sudo_X101.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_config": "Jambo-sudo_X101.yaml", 3 | "model_weights": "Jambo-sudo_X101.pth", 4 | "categories": [ 5 | "TextRegion:caption", 6 | "ImageRegion", 7 | "TextRegion:page-number", 8 | "TableRegion", 9 | "TextRegion:heading", 10 | "TextRegion:paragraph" 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /ocrd_detectron2/presets_Math_R50.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_config": "Math_R_50_FPN_3x.yaml", 3 | "model_weights": "Math_R_50_FPN_3x.pth", 4 | "categories": [ 5 | "", 6 | "MathsRegion" 7 | ] 8 | } -------------------------------------------------------------------------------- /ocrd_detectron2/presets_NewspaperNavigator_R50.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_config": "NewspaperNavigator_R_50_PFPN_3x.yaml", 3 | "model_weights": "NewspaperNavigator_R_50_PFPN_3x.pth", 4 | "categories": [ 5 | "ImageRegion:photograph", 6 | "ImageRegion:illustration", 7 | "MapRegion", 8 | "ImageRegion:cartoon", 9 | "ImageRegion:editorial", 10 | "TextRegion:heading", 11 | "AdvertRegion" 12 | ] 13 | } -------------------------------------------------------------------------------- /ocrd_detectron2/presets_PRImALayout_R50.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_config": "PRImALayout_R50.yaml", 3 | "model_weights": "PRImALayout_R50.pth", 4 | "categories": [ 5 | "Background", 6 | "TextRegion", 7 | "ImageRegion", 8 | "TableRegion", 9 | "MathsRegion", 10 | "SeparatorRegion", 11 | "LineDrawingRegion" 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- 
/ocrd_detectron2/presets_PubLayNet_R101.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_config": "PubLayNet_R_101_FPN_3x.yaml", 3 | "model_weights": "PubLayNet_R_101_FPN_3x.pth", 4 | "categories": [ 5 | "TextRegion:paragraph", 6 | "TextRegion:heading", 7 | "TextRegion:floating", 8 | "TableRegion", 9 | "ImageRegion" 10 | ] 11 | } -------------------------------------------------------------------------------- /ocrd_detectron2/presets_PubLayNet_R101_JPLeoRX.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_config": "PubLayNet_R_101_FPN_3x_JPLeoRX.yaml", 3 | "model_weights": "PubLayNet_R_101_FPN_3x_JPLeoRX.pth", 4 | "categories": [ 5 | "TextRegion:paragraph", 6 | "TextRegion:heading", 7 | "TextRegion:floating", 8 | "TableRegion", 9 | "ImageRegion" 10 | ] 11 | } -------------------------------------------------------------------------------- /ocrd_detectron2/presets_PubLayNet_R50.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_config": "PubLayNet_R_50_FPN_3x.yaml", 3 | "model_weights": "PubLayNet_R_50_FPN_3x.pth", 4 | "categories": [ 5 | "TextRegion:paragraph", 6 | "TextRegion:heading", 7 | "TextRegion:floating", 8 | "TableRegion", 9 | "ImageRegion" 10 | ] 11 | } -------------------------------------------------------------------------------- /ocrd_detectron2/presets_PubLayNet_R50_JPLeoRX.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_config": "PubLayNet_R_50_FPN_3x_JPLeoRX.yaml", 3 | "model_weights": "PubLayNet_R_50_FPN_3x_JPLeoRX.pth", 4 | "categories": [ 5 | "TextRegion:paragraph", 6 | "TextRegion:heading", 7 | "TextRegion:floating", 8 | "TableRegion", 9 | "ImageRegion" 10 | ] 11 | } -------------------------------------------------------------------------------- /ocrd_detectron2/presets_PubLayNet_X101.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "model_config": "PubLayNet_X_101_32x8d_FPN_3x.yaml", 3 | "model_weights": "PubLayNet_X_101_32x8d_FPN_3x.pth", 4 | "categories": [ 5 | "TextRegion:paragraph", 6 | "TextRegion:heading", 7 | "TextRegion:floating", 8 | "TableRegion", 9 | "ImageRegion" 10 | ] 11 | } -------------------------------------------------------------------------------- /ocrd_detectron2/presets_TableBank_X152.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_config": "TableBank_X152.yaml", 3 | "model_weights": "TableBank_X152.pth", 4 | "categories": [ 5 | "TableRegion" 6 | ] 7 | } -------------------------------------------------------------------------------- /ocrd_detectron2/presets_TableBank_X152_Psarpei.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_config": "TableBank_X152_Psarpei.yaml", 3 | "model_weights": "TableBank_X152_Psarpei.pth", 4 | "categories": [ 5 | "TableRegion" 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /ocrd_detectron2/segment.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import sys 4 | import os 5 | import tempfile 6 | import fileinput 7 | import shutil 8 | import math 9 | import multiprocessing as mp 10 | import multiprocessing.sharedctypes 11 | import ctypes 12 | from typing import Optional 13 | 14 | import numpy as np 15 | from shapely.geometry import Polygon 16 | from shapely.ops import unary_union 17 | import cv2 18 | from PIL import Image 19 | #from detectron2.utils.logger import setup_logger 20 | from detectron2.engine import DefaultPredictor 21 | from detectron2.utils import visualizer 22 | from detectron2.config import get_cfg 23 | from detectron2.data import MetadataCatalog #, DatasetCatalog 24 | import torch 25 | 26 | from 
ocrd_utils import ( 27 | resource_filename, 28 | getLogger, 29 | pushd_popd, 30 | coordinates_of_segment, 31 | coordinates_for_segment, 32 | crop_image, 33 | points_from_polygon, 34 | polygon_from_points, 35 | ) 36 | from ocrd_models.ocrd_page import ( 37 | OcrdPage, 38 | PageType, 39 | AdvertRegionType, 40 | ChartRegionType, 41 | ChemRegionType, 42 | CustomRegionType, 43 | GraphicRegionType, 44 | ImageRegionType, 45 | LineDrawingRegionType, 46 | MapRegionType, 47 | MathsRegionType, 48 | MusicRegionType, 49 | NoiseRegionType, 50 | SeparatorRegionType, 51 | TableRegionType, 52 | TextRegionType, 53 | UnknownRegionType, 54 | CoordsType, 55 | AlternativeImageType 56 | ) 57 | from ocrd_models.ocrd_page_generateds import ( 58 | ChartTypeSimpleType, 59 | GraphicsTypeSimpleType, 60 | TextTypeSimpleType 61 | ) 62 | from ocrd import Processor, OcrdPageResult, OcrdPageResultImage 63 | 64 | # when doing Numpy postprocessing, enlarge masks via 65 | # outer (convex) instead of inner (concave) hull of 66 | # corresponding connected components 67 | NP_POSTPROCESSING_OUTER = False 68 | # when pruning overlapping detections (in either mode), 69 | # require at least this share of the area to be redundant 70 | RECALL_THRESHOLD = 0.8 71 | # when finalizing contours of detections (in either mode), 72 | # snap to connected components overlapping by this share 73 | # (of component area), i.e. 
include if larger and exclude 74 | # if smaller than this much 75 | IOCC_THRESHOLD = 0.4 76 | # when finalizing contours of detections (in either mode), 77 | # add this many pixels in each direction 78 | FINAL_DILATION = 4 79 | 80 | class Detectron2Segment(Processor): 81 | max_workers = 1 # GPU context sharable across not forks 82 | 83 | @property 84 | def executable(self): 85 | return 'ocrd-detectron2-segment' 86 | 87 | def setup(self): 88 | #setup_logger(name='fvcore') 89 | #mp.set_start_method("spawn", force=True) 90 | # runtime overrides 91 | if self.parameter['device'] == 'cpu' or not torch.cuda.is_available(): 92 | device = "cpu" 93 | else: 94 | device = self.parameter['device'] 95 | self.logger.info("Using compute device %s", device) 96 | model_config = self.resolve_resource(self.parameter['model_config']) 97 | self.logger.info("Loading config '%s'", model_config) 98 | # add project-specific config (e.g., TensorMask) here if you're not running a model in detectron2's core library 99 | with tempfile.TemporaryDirectory() as tmpdir: 100 | # workaround for fvcore/detectron2's stupid decision 101 | # to resolve the relative path for _BASE_ in the config file 102 | # on its dirname instead of the detectron2 distribution's config directory 103 | temp_configs = os.path.join(tmpdir, 'configs') 104 | with resource_filename('detectron2', 'model_zoo/configs') as stock_configs: 105 | shutil.copytree(stock_configs, temp_configs) 106 | temp_config = os.path.join(temp_configs, os.path.basename(model_config)) 107 | shutil.copyfile(model_config, temp_config) 108 | with pushd_popd(tmpdir): 109 | # repair broken config files that make deviating assumptions on model_zoo files 110 | with fileinput.input(temp_config, inplace=True) as temp_config_file: 111 | for line in temp_config_file: 112 | if fileinput.isfirstline(): 113 | PREFIXES = ['/content/', 114 | '../../configs/', 115 | '../configs/', 116 | '../'] 117 | line = next((line.replace(pref, '') for pref in PREFIXES 118 | if 
line.startswith('_BASE_: "' + pref)), line) 119 | if os.path.basename(model_config) == 'Jambo-sudo_X101.yaml' and 'NUM_CLASSES: 5' in line: 120 | # workaround for Jambo-sudo/Historical-document-layout-analysis#1 121 | line = line.replace('NUM_CLASSES: 5', 'NUM_CLASSES: 6') 122 | print(line, end='') 123 | cfg = get_cfg() 124 | cfg.merge_from_file(temp_config) 125 | model_weights = self.resolve_resource(self.parameter['model_weights']) 126 | cfg.merge_from_list([ 127 | # set threshold for this model 128 | "MODEL.ROI_HEADS.SCORE_THRESH_TEST", self.parameter['min_confidence'], 129 | "MODEL.RETINANET.SCORE_THRESH_TEST", self.parameter['min_confidence'], 130 | "MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH", self.parameter['min_confidence'], 131 | # or cfg.MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH ? 132 | "MODEL.DEVICE", device, 133 | "MODEL.WEIGHTS", model_weights, 134 | ]) 135 | cfg.freeze() 136 | assert cfg.MODEL.ROI_HEADS.NUM_CLASSES == len(self.parameter['categories']), \ 137 | "The chosen model's number of classes %d does not match the given list of categories %d " % ( 138 | cfg.MODEL.ROI_HEADS.NUM_CLASSES, len(self.parameter['categories'])) 139 | # instantiate model 140 | self.logger.info("Loading weights '%s'", model_weights) 141 | self.predictor = DefaultPredictor(cfg) 142 | self.categories = self.parameter['categories'] 143 | self.metadatacat = MetadataCatalog.get('runtime') 144 | self.metadatacat.thing_classes = self.categories 145 | 146 | def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: 147 | """Use detectron2 to segment each page into regions. 148 | 149 | Open and deserialize PAGE input files and their respective images, 150 | then iterate over the element hierarchy down to the requested 151 | ``operation_level``. 152 | 153 | Fetch a raw and a binarized image for the page/segment (possibly 154 | cropped and deskewed). 
155 | 156 | Feed the raw image into the detectron2 predictor that has been 157 | used to load the given model. Then, depending on the model capabilities 158 | (whether it can do panoptic segmentation or only instance segmentation, 159 | whether the latter can do masks or only bounding boxes), post-process 160 | the predictions: 161 | 162 | \b 163 | - panoptic segmentation: take the provided segment label map, and 164 | apply the segment to class label map, 165 | - instance segmentation: find an optimal non-overlapping set (flat 166 | map) of instances via non-maximum suppression, 167 | - both: avoid overlapping pre-existing top-level regions (incremental 168 | segmentation). 169 | 170 | Then extend / shrink the surviving masks to fully include / exclude 171 | connected components in the foreground that are on the boundary. 172 | 173 | (This describes the steps when ``postprocessing`` is `full`. A value 174 | of `only-nms` will omit the morphological extension/shrinking, while 175 | `only-morph` will omit the non-maximum suppression, and `none` will 176 | skip all postprocessing.) 177 | 178 | Finally, find the convex hull polygon for each region, and map its 179 | class id to a new PAGE region type (and subtype). 180 | 181 | (Does not annotate `ReadingOrder` or `TextLine`s or `@orientation`.) 182 | 183 | Produce a new output file by serialising the resulting hierarchy. 
184 | """ 185 | pcgts = input_pcgts[0] 186 | result = OcrdPageResult(pcgts) 187 | level = self.parameter['operation_level'] 188 | 189 | page = pcgts.get_Page() 190 | page_image_raw, page_coords, page_image_info = self.workspace.image_from_page( 191 | page, page_id, feature_filter='binarized') 192 | # for morphological post-processing, we will need the binarized image, too 193 | if self.parameter['postprocessing'] != 'none': 194 | page_image_bin, _, _ = self.workspace.image_from_page( 195 | page, page_id, feature_selector='binarized') 196 | page_image_raw, page_image_bin = _ensure_consistent_crops( 197 | page_image_raw, page_image_bin) 198 | else: 199 | page_image_bin = page_image_raw 200 | # determine current zoom and target zoom 201 | if page_image_info.resolution != 1: 202 | dpi = page_image_info.resolution 203 | if page_image_info.resolutionUnit == 'cm': 204 | dpi = round(dpi * 2.54) 205 | zoom = 300.0 / dpi 206 | else: 207 | dpi = None 208 | zoom = 1.0 209 | # todo: if zoom is > 4.0, do something along the lines of eynollah's enhance 210 | if zoom < 2.0: 211 | # actual resampling: see below 212 | zoomed = zoom / 2.0 213 | self.logger.info("scaling %dx%d image by %.2f", page_image_raw.width, page_image_raw.height, zoomed) 214 | else: 215 | zoomed = 1.0 216 | 217 | for segment in ([page] if level == 'page' else 218 | page.get_AllRegions(depth=1, classes=['Table'])): 219 | # regions = segment.get_AllRegions(depth=1) 220 | # FIXME: as long as we don't have get_AllRegions on region level, 221 | # we have to simulate this via parent_object filtering 222 | def at_segment(region): 223 | return region.parent_object_ is segment 224 | regions = list(filter(at_segment, page.get_AllRegions())) 225 | 226 | if isinstance(segment, PageType): 227 | image_raw = page_image_raw 228 | image_bin = page_image_bin 229 | coords = page_coords 230 | else: 231 | image_raw, coords = self.workspace.image_from_segment( 232 | segment, page_image_raw, page_coords, feature_filter='binarized') 
233 | if self.parameter['postprocessing'] != 'none': 234 | image_bin, _ = self.workspace.image_from_segment( 235 | segment, page_image_bin, page_coords) 236 | image_raw, image_bin = _ensure_consistent_crops( 237 | image_raw, image_bin) 238 | else: 239 | image_bin = image_raw 240 | 241 | # ensure RGB (if raw was merely grayscale) 242 | if image_raw.mode == '1': 243 | image_raw = image_raw.convert('L') 244 | image_raw = image_raw.convert(mode='RGB') 245 | image_bin = image_bin.convert(mode='1') 246 | 247 | # reduce resolution to 300 DPI max 248 | if zoomed != 1.0: 249 | image_bin = image_bin.resize( 250 | (int(image_raw.width * zoomed), 251 | int(image_raw.height * zoomed)), 252 | resample=Image.Resampling.BICUBIC) 253 | image_raw = image_raw.resize( 254 | (int(image_raw.width * zoomed), 255 | int(image_raw.height * zoomed)), 256 | resample=Image.Resampling.BICUBIC) 257 | 258 | # convert raw to BGR 259 | array_raw = np.array(image_raw) 260 | array_raw = array_raw[:,:,::-1] 261 | # convert binarized to single-channel negative 262 | array_bin = np.array(image_bin) 263 | array_bin = ~ array_bin 264 | 265 | image = self._process_segment(segment, regions, coords, array_raw, array_bin, zoomed, page_id) 266 | if image: 267 | result.images.append(image) 268 | return result 269 | 270 | def _process_segment(self, segment, ignore, coords, array_raw, array_bin, zoomed, page_id) -> Optional[OcrdPageResultImage]: 271 | self.logger = getLogger('processor.Detectron2Segment') 272 | cpu = torch.device('cpu') 273 | segtype = segment.__class__.__name__[:-4] 274 | # remove existing segmentation (have only detected targets survive) 275 | #page.set_ReadingOrder(None) 276 | #page.set_TextRegion([]) 277 | segment.set_custom('coords=%s' % coords['transform']) 278 | height, width, _ = array_raw.shape 279 | postprocessing = self.parameter['postprocessing'] 280 | scale = 43 281 | if postprocessing in ['full', 'only-morph']: 282 | # get connected components to estimate scale 283 | _, components = 
cv2.connectedComponents(array_bin.astype(np.uint8)) 284 | # estimate glyph scale (roughly) 285 | _, counts = np.unique(components, return_counts=True) 286 | if counts.shape[0] > 1: 287 | counts = np.sqrt(3 * counts) 288 | counts = counts[(5 < counts) & (counts < 100)] 289 | scale = int(np.median(counts)) 290 | self.logger.debug("estimated scale: %d", scale) 291 | # predict 292 | output = self.predictor(array_raw) 293 | if self.parameter['debug_img'] != 'none': 294 | vis = visualizer.Visualizer(array_raw, 295 | metadata=self.metadatacat, 296 | instance_mode={ 297 | 'instance_colors': visualizer.ColorMode.IMAGE, 298 | 'instance_colors_only': visualizer.ColorMode.IMAGE_BW, 299 | 'category_colors': visualizer.ColorMode.SEGMENTATION 300 | }[self.parameter['debug_img']]) 301 | # decoding, cf. https://detectron2.readthedocs.io/en/latest/tutorials/models.html 302 | if 'panoptic_seg' in output: 303 | self.logger.info("decoding from panoptic segmentation results") 304 | segmap, seginfo = output['panoptic_seg'] 305 | if not isinstance(segmap, np.ndarray): 306 | self.logger.debug(str(segmap)) 307 | segmap = segmap.to(cpu) 308 | segmap = segmap.numpy() 309 | if self.parameter['debug_img'] != 'none': 310 | visimg = vis.draw_panoptic_seg(segmap, seginfo) 311 | seglabels = np.unique(segmap) 312 | nseg = len(seglabels) 313 | if not nseg: 314 | self.logger.warning("Detected no regions on %s '%s'", segtype, segment.id) 315 | return None 316 | masks = [] 317 | classes = [] 318 | scores = [] 319 | for label in seglabels: 320 | if label == -1: 321 | continue 322 | if seginfo is None: 323 | class_id = label // self.predictor.metadata.label_divisor 324 | else: 325 | for info in seginfo: 326 | if info['id'] == label: 327 | class_id = info['category_id'] 328 | break 329 | if not self.categories[class_id]: 330 | continue 331 | masks.append(segmap == label) 332 | scores.append(1.0) #scores[i] 333 | classes.append(class_id) 334 | if not len(masks): 335 | self.logger.warning("Detected no 
regions for selected categories on %s '%s'", segtype, segment.id) 336 | return None 337 | elif 'instances' in output: 338 | self.logger.info("decoding from instance segmentation results") 339 | instances = output['instances'] 340 | if not isinstance(instances, dict): 341 | assert instances.image_size == (height, width) 342 | instances = instances.to(cpu) 343 | if self.parameter['debug_img'] != 'none': 344 | visimg = vis.draw_instance_predictions(instances) 345 | instances = instances.get_fields() 346 | classes = instances['pred_classes'] 347 | if not all(self.categories): 348 | # filter out inactive classes 349 | select = np.array([bool(cat) for cat in self.categories]) 350 | select = select[classes] 351 | for key, val in instances.items(): 352 | instances[key] = val[select] 353 | classes = instances['pred_classes'] 354 | scores = instances['scores'] 355 | if not isinstance(scores, np.ndarray): 356 | scores = scores.to(cpu).numpy() 357 | if not scores.shape[0]: 358 | self.logger.warning("Detected no regions on %s '%s'", segtype, segment.id) 359 | return None 360 | if 'pred_masks' in instances: # or pred_masks_rle ? 361 | masks = np.asarray(instances['pred_masks']) 362 | def get_mask(x): 363 | # convert from RLE/polygon/Numpy # or Tensor? 364 | # zzz tensor result would have to use .detach().numpy() ... 
365 | x = visualizer.GenericMask(x, height, width) 366 | return x.mask > 0 367 | masks = np.stack([get_mask(x) for x in masks]) 368 | elif 'pred_boxes' in instances: 369 | self.logger.warning("model has no mask output, only bbox") 370 | boxes = instances['pred_boxes'] 371 | if not isinstance(boxes, np.ndarray): 372 | boxes = boxes.to(cpu).tensor.numpy() 373 | assert boxes.shape[1] == 4 # and not 5 (rotated boxes) 374 | assert boxes.shape[0], "prediction without instances" 375 | masks = np.zeros((len(boxes), height, width), bool) 376 | for i, (x1, y1, x2, y2) in enumerate(boxes): 377 | masks[i, 378 | math.floor(y1):math.ceil(y2), 379 | math.floor(x1):math.ceil(x2)] = True 380 | else: 381 | self.logger.error("Found no suitable output format to decode from") 382 | return None 383 | assert len(scores) == len(classes) == len(masks) 384 | # apply non-maximum suppression between overlapping instances 385 | # (not strictly necessary in case of panoptic segmentation, 386 | # but we can still have overlaps with preexisting regions) 387 | if len(ignore): 388 | scores = np.insert(scores, 0, 1.0, axis=0) 389 | classes = np.insert(classes, 0, -1, axis=0) 390 | masks = np.insert(masks, 0, 0, axis=0) 391 | mask0 = np.zeros(masks.shape[1:], np.uint8) 392 | for i, region in enumerate(ignore): 393 | polygon = coordinates_of_segment(region, _, coords) 394 | if zoomed != 1.0: 395 | polygon = np.round(polygon * zoomed).astype(int) 396 | cv2.fillPoly(mask0, pts=[polygon], color=(255,)) 397 | assert np.count_nonzero(mask0), "existing regions all outside of page frame" 398 | masks[0] |= mask0 > 0 399 | if postprocessing in ['full', 'only-nms']: 400 | scores, classes, masks = postprocess_nms( 401 | scores, classes, masks, array_bin, self.categories, 402 | min_confidence=self.parameter['min_confidence'], nproc=8, logger=self.logger) 403 | if postprocessing in ['full', 'only-morph']: 404 | scores, classes, masks = postprocess_morph( 405 | scores, classes, masks, components, nproc=8, 
logger=self.logger) 406 | if len(ignore): 407 | scores = scores[1:] 408 | classes = classes[1:] 409 | masks = masks[1:] 410 | # convert to polygons and regions 411 | region_no = 0 412 | for mask, class_id, score in zip(masks, classes, scores): 413 | category = self.categories[class_id] 414 | # dilate until we have a single outer contour 415 | invalid = True 416 | for _ in range(10): 417 | contours, _ = cv2.findContours(mask.astype(np.uint8), 418 | cv2.RETR_EXTERNAL, 419 | cv2.CHAIN_APPROX_SIMPLE) 420 | if len(contours) == 1 and len(contours[0]) > 3: 421 | invalid = False 422 | break 423 | mask = cv2.dilate(mask.astype(np.uint8), 424 | np.ones((scale,scale), np.uint8)) > 0 425 | if invalid: 426 | self.logger.warning("Ignoring non-contiguous (%d) region for %s", len(contours), category) 427 | continue 428 | region_polygon = contours[0][:,0,:] # already in x,y order 429 | if zoomed != 1.0: 430 | region_polygon = region_polygon / zoomed 431 | # ensure consistent and valid polygon outline 432 | region_polygon = coordinates_for_segment(region_polygon, _, coords) 433 | region_polygon = polygon_for_parent(region_polygon, segment) 434 | if region_polygon is None: 435 | self.logger.warning("Ignoring extant region for %s", category) 436 | continue 437 | # annotate new region/line 438 | region_coords = CoordsType(points_from_polygon(region_polygon), conf=score) 439 | cat2class = dict([ 440 | ('AdvertRegion', AdvertRegionType), 441 | ('ChartRegion', ChartRegionType), 442 | ('ChemRegion', ChemRegionType), 443 | ('CustomRegion', CustomRegionType), 444 | ('GraphicRegion', GraphicRegionType), 445 | ('ImageRegion', ImageRegionType), 446 | ('LineDrawingRegion', LineDrawingRegionType), 447 | ('MapRegion', MapRegionType), 448 | ('MathsRegion', MathsRegionType), 449 | ('MusicRegion', MusicRegionType), 450 | ('NoiseRegion', NoiseRegionType), 451 | ('SeparatorRegion', SeparatorRegionType), 452 | ('TableRegion', TableRegionType), 453 | ('TextRegion', TextRegionType), 454 | 
('UnknownRegion', UnknownRegionType), 455 | ]) 456 | cat = category.split(':') 457 | try: 458 | regiontype = cat2class[cat[0]] 459 | except KeyError: 460 | raise ValueError("Invalid region type %s (see https://github.com/PRImA-Research-Lab/PAGE-XML)", cat[0]) 461 | region_no += 1 462 | region_id = 'region%04d_%s' % (region_no, cat[0]) 463 | region = regiontype(id=region_id, Coords=region_coords) 464 | if len(cat) > 1: 465 | try: 466 | {TextRegionType: TextTypeSimpleType, 467 | GraphicRegionType: GraphicsTypeSimpleType, 468 | ChartRegionType: ChartTypeSimpleType}[regiontype](cat[1]) 469 | region.set_type(cat[1]) 470 | except (KeyError, ValueError): 471 | region.set_custom(cat[1]) 472 | getattr(segment, 'add_' + cat[0])(region) 473 | self.logger.info("Detected %s region%04d (p=%.2f) on %s '%s'", 474 | category, region_no, score, segtype, segment.id) 475 | if self.parameter['debug_img'] != 'none': 476 | altimg = AlternativeImageType(comments='debug') 477 | segment.add_AlternativeImage(altimg) 478 | return OcrdPageResultImage( 479 | Image.fromarray(visimg.get_image()), 480 | ('' if isinstance(segment, PageType) else '_' + segment.id) + '.IMG-DEBUG', 481 | altimg) 482 | return None 483 | 484 | 485 | def postprocess_nms(scores, classes, masks, page_array_bin, categories, min_confidence=0.5, nproc=8, logger=None): 486 | """Apply geometrical post-processing to raw detections: remove overlapping candidates via non-maximum suppression across classes. 487 | 488 | Implement via Numpy routines. 
489 | """ 490 | if logger is None: 491 | logger = getLogger('ocrd.processor.Detectron2Segment') 492 | # apply IoU-based NMS across classes 493 | assert masks.dtype == bool 494 | instances = np.arange(len(masks)) 495 | instances_i, instances_j = np.meshgrid(instances, instances, indexing='ij') 496 | combinations = list(zip(*np.where(instances_i != instances_j))) 497 | shared_masks = mp.sharedctypes.RawArray(ctypes.c_bool, masks.size) 498 | shared_masks_np = tonumpyarray_with_shape(shared_masks, masks.shape) 499 | np.copyto(shared_masks_np, masks * page_array_bin) 500 | with mp.Pool(processes=nproc, # to be refined via param 501 | initializer=overlapmasks_init, 502 | initargs=(shared_masks, masks.shape)) as pool: 503 | # multiprocessing for different combinations of array slices (pure) 504 | overlapping_combinations = pool.starmap(overlapmasks, combinations) 505 | overlaps = np.zeros((len(masks), len(masks)), bool) 506 | for (i, j), overlapping in zip(combinations, overlapping_combinations): 507 | if overlapping: 508 | overlaps[i, j] = True 509 | # find best-scoring instance per class 510 | bad = np.zeros_like(instances, bool) 511 | for i in np.argsort(-scores): 512 | score = scores[i] 513 | mask = masks[i] 514 | assert mask.shape[:2] == page_array_bin.shape[:2] 515 | ys, xs = mask.nonzero() 516 | assert xs.any() and ys.any(), "instance has empty mask" 517 | bbox = [xs.min(), ys.min(), xs.max(), ys.max()] 518 | class_id = classes[i] 519 | if class_id < 0: 520 | logger.debug("ignoring existing region at %s", str(bbox)) 521 | continue 522 | category = categories[class_id] 523 | if scores[i] < min_confidence: 524 | logger.debug("Ignoring instance for %s with too low score %.2f", category, score) 525 | bad[i] = True 526 | continue 527 | count = np.count_nonzero(mask) 528 | if count < 10: 529 | logger.warning("Ignoring too small (%dpx) region for %s", count, category) 530 | bad[i] = True 531 | continue 532 | worse = score < scores 533 | if np.any(worse & overlaps[i]): 534 
| logger.debug("Ignoring instance for %s with %.2f overlapping better neighbour", 535 | category, score) 536 | bad[i] = True 537 | else: 538 | logger.debug("post-processing prediction for %s at %s area %d score %f", 539 | category, str(bbox), count, score) 540 | # post-process detections morphologically and decode to region polygons 541 | # does not compile (no OpenCV support): 542 | keep = np.nonzero(~ bad)[0] 543 | if not keep.size: 544 | return [], [], [] 545 | keep = sorted(keep, key=lambda i: scores[i], reverse=True) 546 | scores = scores[keep] 547 | classes = classes[keep] 548 | masks = masks[keep] 549 | return scores, classes, masks 550 | 551 | def postprocess_morph(scores, classes, masks, components, nproc=8, logger=None): 552 | """Apply morphological post-processing to raw detections: extend masks to avoid chopping off fg connected components. 553 | 554 | Implement via Numpy routines. 555 | """ 556 | if logger is None: 557 | logger = getLogger('ocrd.processor.Detectron2Segment') 558 | shared_masks = mp.sharedctypes.RawArray(ctypes.c_bool, masks.size) 559 | shared_components = mp.sharedctypes.RawArray(ctypes.c_int32, components.size) 560 | shared_masks_np = tonumpyarray_with_shape(shared_masks, masks.shape) 561 | shared_components_np = tonumpyarray_with_shape(shared_components, components.shape) 562 | np.copyto(shared_components_np, components, casting='equiv') 563 | np.copyto(shared_masks_np, masks) 564 | with mp.Pool(processes=nproc, # to be refined via param 565 | initializer=morphmasks_init, 566 | initargs=(shared_masks, masks.shape, 567 | shared_components, components.shape)) as pool: 568 | # multiprocessing for different slices of array (in-place) 569 | pool.map(morphmasks, range(masks.shape[0])) 570 | masks = tonumpyarray_with_shape(shared_masks, masks.shape) 571 | return scores, classes, masks 572 | 573 | def polygon_for_parent(polygon, parent): 574 | """Clip polygon to parent polygon range. 
575 | 576 | (Should be moved to ocrd_utils.coordinates_for_segment.) 577 | """ 578 | childp = Polygon(polygon) 579 | if isinstance(parent, PageType): 580 | if parent.get_Border(): 581 | parentp = Polygon(polygon_from_points(parent.get_Border().get_Coords().points)) 582 | else: 583 | parentp = Polygon([[0,0], [0,parent.get_imageHeight()], 584 | [parent.get_imageWidth(),parent.get_imageHeight()], 585 | [parent.get_imageWidth(),0]]) 586 | else: 587 | parentp = Polygon(polygon_from_points(parent.get_Coords().points)) 588 | # ensure input coords have valid paths (without self-intersection) 589 | # (this can happen when shapes valid in floating point are rounded) 590 | childp = make_valid(childp) 591 | parentp = make_valid(parentp) 592 | if not childp.is_valid: 593 | return None 594 | if not parentp.is_valid: 595 | return None 596 | # check if clipping is necessary 597 | if childp.within(parentp): 598 | return childp.exterior.coords[:-1] 599 | # clip to parent 600 | interp = childp.intersection(parentp) 601 | # post-process 602 | if interp.is_empty or interp.area == 0.0: 603 | return None 604 | if interp.type == 'GeometryCollection': 605 | # heterogeneous result: filter zero-area shapes (LineString, Point) 606 | interp = unary_union([geom for geom in interp.geoms if geom.area > 0]) 607 | if interp.type == 'MultiPolygon': 608 | # homogeneous result: construct convex hull to connect 609 | # FIXME: construct concave hull / alpha shape 610 | interp = interp.convex_hull 611 | if interp.minimum_clearance < 1.0: 612 | # follow-up calculations will necessarily be integer; 613 | # so anticipate rounding here and then ensure validity 614 | interp = Polygon(np.round(interp.exterior.coords)) 615 | interp = make_valid(interp) 616 | return interp.exterior.coords[:-1] # keep open 617 | 618 | def make_valid(polygon): 619 | for split in range(1, len(polygon.exterior.coords)-1): 620 | if polygon.is_valid or polygon.simplify(polygon.area).is_valid: 621 | break 622 | # simplification may 
not be possible (at all) due to ordering 623 | # in that case, try another starting point 624 | polygon = Polygon(polygon.exterior.coords[-split:]+polygon.exterior.coords[:-split]) 625 | for tolerance in range(1, int(polygon.area)): 626 | if polygon.is_valid: 627 | break 628 | # simplification may require a larger tolerance 629 | polygon = polygon.simplify(tolerance) 630 | return polygon 631 | 632 | def tonumpyarray(mp_arr): 633 | return np.frombuffer(mp_arr, dtype=np.dtype(mp_arr)) 634 | 635 | def tonumpyarray_with_shape(mp_arr, shape): 636 | return np.frombuffer(mp_arr, dtype=np.dtype(mp_arr)).reshape(shape) 637 | 638 | def overlapmasks_init(masks_array, masks_shape): 639 | global shared_masks 640 | global shared_masks_shape 641 | shared_masks = masks_array 642 | shared_masks_shape = masks_shape 643 | 644 | def overlapmasks(i, j): 645 | # is i redundant w.r.t. j (i.e. j already covers most of its area) 646 | masks = np.ctypeslib.as_array(shared_masks).reshape(shared_masks_shape) 647 | imask = masks[i] 648 | jmask = masks[j] 649 | intersection = np.count_nonzero(imask * jmask) 650 | if not intersection: 651 | return False 652 | base = np.count_nonzero(imask) 653 | if intersection / base > RECALL_THRESHOLD: 654 | return True 655 | return False 656 | 657 | def morphmasks_init(masks_array, masks_shape, components_array, components_shape): 658 | global shared_masks 659 | global shared_masks_shape 660 | global shared_components 661 | global shared_components_shape 662 | shared_masks = masks_array 663 | shared_masks_shape = masks_shape 664 | shared_components = components_array 665 | shared_components_shape = components_shape 666 | 667 | def morphmasks(instance): 668 | masks = np.ctypeslib.as_array(shared_masks).reshape(shared_masks_shape) 669 | components = np.ctypeslib.as_array(shared_components).reshape(shared_components_shape) 670 | mask = masks[instance] 671 | # find closure in connected components 672 | complabels = np.unique(mask * components) 673 | left, top, w, 
h = cv2.boundingRect(mask.astype(np.uint8)) 674 | right = left + w 675 | bottom = top + h 676 | if NP_POSTPROCESSING_OUTER: 677 | # overwrite pixel mask from (padded) outer bbox 678 | for label in complabels: 679 | if not label: 680 | continue # bg/white 681 | leftc, topc, wc, hc = cv2.boundingRect((components == label).astype(np.uint8)) 682 | rightc = leftc + wc 683 | bottomc = topc + hc 684 | if wc > 2 * w or hc > 2 * h: 685 | continue # huge (non-text?) component 686 | # intersection over component too small? 687 | if (min(right, rightc) - max(left, leftc)) * \ 688 | (min(bottom, bottomc) - max(top, topc)) < IOCC_THRESHOLD * wc * hc: 689 | continue # too little overlap 690 | newleft = min(left, leftc) 691 | newtop = min(top, topc) 692 | newright = max(right, rightc) 693 | newbottom = max(bottom, bottomc) 694 | if (newright - newleft) > 2 * w or (newbottom - newtop) > 1.5 * h: 695 | continue # 696 | left = newleft 697 | top = newtop 698 | right = newright 699 | bottom = newbottom 700 | w = right - left 701 | h = bottom - top 702 | left = max(0, left - FINAL_DILATION) 703 | top = max(0, top - FINAL_DILATION) 704 | right = min(mask.shape[1], right + FINAL_DILATION) 705 | bottom = min(mask.shape[0], bottom + FINAL_DILATION) 706 | mask[top:bottom, left:right] = True 707 | 708 | else: 709 | # fill pixel mask from (padded) inner bboxes 710 | for label in complabels: 711 | if not label: 712 | continue # bg/white 713 | suppress = False 714 | leftc, topc, wc, hc = cv2.boundingRect((components == label).astype(np.uint8)) 715 | rightc = leftc + wc 716 | bottomc = topc + hc 717 | if wc > 2 * w or hc > 2 * h: 718 | # huge (non-text?) 
component 719 | suppress = True 720 | if (min(right, rightc) - max(left, leftc)) * \ 721 | (min(bottom, bottomc) - max(top, topc)) < IOCC_THRESHOLD * wc * hc: 722 | # intersection over component too small 723 | suppress = True 724 | newleft = min(left, leftc) 725 | newtop = min(top, topc) 726 | newright = max(right, rightc) 727 | newbottom = max(bottom, bottomc) 728 | if (newright - newleft) > 2 * w or (newbottom - newtop) > 1.5 * h: 729 | # huge (non-text?) component 730 | suppress = True 731 | elif (newright - newleft) < 1.1 * w and (newbottom - newtop) < 1.1 * h: 732 | suppress = False 733 | if suppress: 734 | leftc = min(mask.shape[1], leftc + FINAL_DILATION) 735 | topc = min(mask.shape[0], topc + FINAL_DILATION) 736 | rightc = max(0, rightc - FINAL_DILATION) 737 | bottomc = max(0, bottomc - FINAL_DILATION) 738 | mask[topc:bottomc, leftc:rightc] = False 739 | else: 740 | leftc = max(0, leftc - FINAL_DILATION) 741 | topc = max(0, topc - FINAL_DILATION) 742 | rightc = min(mask.shape[1], rightc + FINAL_DILATION) 743 | bottomc = min(mask.shape[0], bottomc + FINAL_DILATION) 744 | mask[topc:bottomc, leftc:rightc] = True 745 | left = newleft 746 | top = newtop 747 | right = newright 748 | bottom = newbottom 749 | w = right - left 750 | h = bottom - top 751 | 752 | def _ensure_consistent_crops(image_raw, image_bin): 753 | # workaround for OCR-D/core#687: 754 | if 0 < abs(image_raw.width - image_bin.width) <= 2: 755 | diff = image_raw.width - image_bin.width 756 | if diff > 0: 757 | image_raw = crop_image( 758 | image_raw, 759 | (int(np.floor(diff / 2)), 0, 760 | image_raw.width - int(np.ceil(diff / 2)), 761 | image_raw.height)) 762 | else: 763 | image_bin = crop_image( 764 | image_bin, 765 | (int(np.floor(-diff / 2)), 0, 766 | image_bin.width - int(np.ceil(-diff / 2)), 767 | image_bin.height)) 768 | if 0 < abs(image_raw.height - image_bin.height) <= 2: 769 | diff = image_raw.height - image_bin.height 770 | if diff > 0: 771 | image_raw = crop_image( 772 | image_raw, 773 
| (0, int(np.floor(diff / 2)), 774 | image_raw.width, 775 | image_raw.height - int(np.ceil(diff / 2)))) 776 | else: 777 | image_bin = crop_image( 778 | image_bin, 779 | (0, int(np.floor(-diff / 2)), 780 | image_bin.width, 781 | image_bin.height - int(np.ceil(-diff / 2)))) 782 | return image_raw, image_bin 783 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0.0", "wheel", "setuptools-ocrd"] 3 | 4 | [project] 5 | name = "ocrd_detectron2" 6 | authors = [ 7 | {name = "Robert Sachunsky", email = "sachunsky@informatik.uni-leipzig.de"}, 8 | {name = "Julian Balling", email = "balling@infai.org"}, 9 | ] 10 | description = "OCR-D wrapper for detectron2 based segmentation models" 11 | readme = "README.md" 12 | license.text = "MIT" 13 | requires-python = ">=3.8" 14 | 15 | dynamic = ["version", "dependencies"] 16 | 17 | # https://pypi.org/classifiers/ 18 | classifiers = [ 19 | "Development Status :: 5 - Production/Stable", 20 | "Environment :: Console", 21 | "Intended Audience :: Science/Research", 22 | "Intended Audience :: Other Audience", 23 | "License :: OSI Approved :: MIT License", 24 | "Programming Language :: Python :: 3", 25 | "Programming Language :: Python :: 3 :: Only", 26 | "Topic :: Text Processing", 27 | ] 28 | 29 | [project.scripts] 30 | ocrd-detectron2-segment = "ocrd_detectron2.cli:ocrd_detectron2_segment" 31 | 32 | [project.urls] 33 | Homepage = "https://github.com/bertsky/ocrd_detectron2" 34 | Repository = "https://github.com/bertsky/ocrd_detectron2.git" 35 | 36 | [tool.setuptools.dynamic] 37 | dependencies = {file = ["requirements.txt"]} 38 | optional-dependencies.test = {file = ["requirements-test.txt"]} 39 | 40 | [tool.setuptools] 41 | packages = ["ocrd_detectron2"] 42 | package-data = {"*" = ["*.json"]} 43 | 44 | [tool.coverage.run] 45 | branch = true 46 | source = 
["ocrd_detectron2"] 47 | -------------------------------------------------------------------------------- /requirements-test.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | ocrd_wrap 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ocrd>=3.3.0 2 | click>=7.0 3 | scipy 4 | numpy>=1.17.0 5 | pillow>=7.1.2 6 | shapely 7 | scikit-image>=0.17.2 8 | typing-extensions # for Torch build 9 | torch>=1.10.0 #,<1.11 10 | torchvision>=0.11.2 11 | detectron2>=0.6 12 | setuptools >= 75.0 # for Detectron build 13 | wheel # for Detectron build 14 | pycocotools # for Detectron 15 | --------------------------------------------------------------------------------