├── .gitignore ├── src ├── test │ ├── resources │ │ ├── solr │ │ │ ├── collection1 │ │ │ ├── alldata │ │ │ │ └── conf │ │ │ │ │ ├── solrconfig.xml │ │ │ │ │ └── schema.xml │ │ │ ├── minimal │ │ │ │ └── conf │ │ │ │ │ ├── solrconfig.xml │ │ │ │ │ └── schema.xml │ │ │ └── min_absolute │ │ │ │ └── conf │ │ │ │ ├── solrconfig.xml │ │ │ │ └── schema.xml │ │ └── data │ │ │ └── ocrtext_full.txt │ └── java │ │ └── de │ │ └── digitalcollections │ │ ├── lucene │ │ └── analysis │ │ │ └── payloads │ │ │ ├── TestUtils.java │ │ │ ├── OcrInfoEncoderTest.java │ │ │ ├── PayloadHelperTest.java │ │ │ └── OcrInfoTest.java │ │ └── solr │ │ └── plugin │ │ └── components │ │ └── ocrhighlighting │ │ ├── AbsoluteHighlightingTest.java │ │ ├── MinimalHighlightingTest.java │ │ ├── DistributedOcrHighlightingTest.java │ │ └── OcrHighlightingTest.java └── main │ └── java │ └── de │ └── digitalcollections │ ├── lucene │ └── analysis │ │ ├── payloads │ │ ├── OcrInfoEncoder.java │ │ ├── OcrPayloadHelper.java │ │ └── OcrInfo.java │ │ └── util │ │ └── DelimitedOcrInfoPayloadTokenFilterFactory.java │ └── solr │ └── plugin │ └── components │ └── ocrhighlighting │ └── OcrHighlighting.java ├── example ├── docker-compose.yml ├── solr │ ├── Dockerfile │ └── ocrtest │ │ └── conf │ │ ├── solrconfig.xml │ │ └── schema.xml ├── hocr2solr ├── index_google1000 └── README.md ├── settings.xml ├── CHANGELOG.md ├── LICENSE ├── .travis.yml ├── pom.xml └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | /.idea/ 2 | target 3 | -------------------------------------------------------------------------------- /src/test/resources/solr/collection1: -------------------------------------------------------------------------------- 1 | alldata -------------------------------------------------------------------------------- /example/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | 
solr: 4 | build: solr 5 | ports: 6 | - "8983:8983" 7 | - "18983:18983" 8 | - "8849:8849" 9 | volumes: 10 | - data-solr:/opt/solr/server/solr/ocrtest 11 | volumes: 12 | data-solr: 13 | -------------------------------------------------------------------------------- /src/test/java/de/digitalcollections/lucene/analysis/payloads/TestUtils.java: -------------------------------------------------------------------------------- 1 | package de.digitalcollections.lucene.analysis.payloads; 2 | 3 | class TestUtils { 4 | public static char[] toChars(String input) { 5 | char[] buf = new char[input.length()]; 6 | input.getChars(0, input.length(), buf, 0); 7 | return buf; 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | ossrh-snapshots 8 | ${env.SONATYPE_USERNAME} 9 | ${env.SONATYPE_PASSWORD} 10 | 11 | 12 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 0.2 2 | - Support for absolute coordinates 3 | - Docker-based example setup 4 | - Storing Term Vectors is no longer needed, which reduces the index size 5 | significantly (~50%) and speeds up the highlighting 6 | 7 | # 0.1 (initial release) 8 | - Changes compared to the in-house development version: 9 | - Made the number of bits for page, line and word indices configurable 10 | - Much more detailed documentation 11 | - Updated all docstrings 12 | - Tests for various usage scenarios 13 | 14 | -------------------------------------------------------------------------------- /example/solr/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM solr:7.3.1-alpine 2 | 3 | COPY ocrtest /opt/solr/server/solr/ocrtest 4 | 5 | USER root 6 | RUN chown -R $SOLR_USER:$SOLR_USER 
/opt/solr/server/solr/ocrtest 7 | 8 | USER solr 9 | RUN mkdir -p /opt/solr/server/solr/ocrtest/lib &&\ 10 | wget https://github.com/dbmdz/solr-ocrpayload-plugin/releases/download/0.2/solr-ocrpayload-plugin-0.2.jar -P/opt/solr/server/solr/ocrtest/lib/ &&\ 11 | bin/solr start -v &&\ 12 | bin/solr create_core -c ocrtest &&\ 13 | bin/solr stop 14 | 15 | USER solr 16 | -------------------------------------------------------------------------------- /src/test/resources/solr/alldata/conf/solrconfig.xml: -------------------------------------------------------------------------------- 1 | 2 | ${tests.luceneMatchVersion:LUCENE_CURRENT} 3 | ${solr.data.dir:} 4 | 5 | 6 | 7 | 8 | 9 | ocr_highlight 10 | 11 | 12 | 13 | 16 | 17 | -------------------------------------------------------------------------------- /src/test/resources/solr/minimal/conf/solrconfig.xml: -------------------------------------------------------------------------------- 1 | 2 | ${tests.luceneMatchVersion:LUCENE_CURRENT} 3 | ${solr.data.dir:} 4 | 5 | 6 | 7 | 8 | 9 | ocr_highlight 10 | 11 | 12 | 13 | 16 | 17 | -------------------------------------------------------------------------------- /src/test/resources/solr/min_absolute/conf/solrconfig.xml: -------------------------------------------------------------------------------- 1 | 2 | ${tests.luceneMatchVersion:LUCENE_CURRENT} 3 | ${solr.data.dir:} 4 | 5 | 6 | 7 | 8 | 9 | ocr_highlight 10 | 11 | 12 | 13 | 16 | 17 | -------------------------------------------------------------------------------- /example/solr/ocrtest/conf/solrconfig.xml: -------------------------------------------------------------------------------- 1 | 2 | 7.3 3 | ${solr.data.dir:} 4 | 5 | 6 | 7 | 9 | 13 | 14 | 15 | 16 | explicit 17 | 10 18 | 19 | 20 | ocr_highlight 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | 
Copyright (c) 2018 Munich Digitization Center/Bavarian State Library 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /src/test/resources/solr/minimal/conf/schema.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | id 24 | 25 | -------------------------------------------------------------------------------- /src/test/resources/solr/alldata/conf/schema.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | id 24 | 25 | -------------------------------------------------------------------------------- /src/test/resources/solr/min_absolute/conf/schema.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | id 24 | 25 | -------------------------------------------------------------------------------- /example/solr/ocrtest/conf/schema.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | id 28 | 29 | -------------------------------------------------------------------------------- /example/hocr2solr: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | from collections import namedtuple 4 | 5 | import lxml.etree as ET 6 | 7 | PAGE_PATH = './/div[@class="ocr_page"]' 8 | LINE_PATH = './/span[@class="ocr_line"]' 9 | WORD_PATH = './span[@class="ocr_cinfo"]' 10 | 11 | OcrBox = namedtuple('OcrBox', ('page_idx', 'line_idx', 'word_idx', 12 | 'x', 'y', 'width', 'height', 'word')) 13 | parser = ET.HTMLParser() 14 | 15 | 16 | def make_solr_token(ocrbox): 17 | payload = 
("p:{page_idx},l:{line_idx},n:{word_idx},x:{x},y:{y}," 18 | "w:{width},h:{height}").format(**ocrbox._asdict()) 19 | return "{word}☛{payload}".format(word=ocrbox.word, payload=payload) 20 | 21 | 22 | def parse_hocr(hocr_path): 23 | tree = ET.parse(hocr_path, parser=parser) 24 | for page_idx, page_elem in enumerate(tree.findall(PAGE_PATH)): 25 | for line_idx, line_elem in enumerate(page_elem.findall(LINE_PATH)): 26 | for word_idx, word_elem in enumerate(line_elem.findall(WORD_PATH)): 27 | bbox = next( 28 | p.strip() for p in word_elem.attrib['title'].split(';') 29 | if p.strip().startswith('bbox')) 30 | x, y, x1, y1 = tuple(int(p) for p in bbox.split(" ")[1:]) 31 | yield OcrBox(page_idx=page_idx, line_idx=line_idx, 32 | word_idx=word_idx, x=x, y=y, width=x1-x, height=y1-y, 33 | word=word_elem.text) 34 | 35 | 36 | if __name__ == '__main__': 37 | hocr_path = sys.argv[1] 38 | for idx, box in enumerate(parse_hocr(hocr_path)): 39 | if idx > 0: 40 | sys.stdout.write(" ") 41 | sys.stdout.write(make_solr_token(box)) 42 | -------------------------------------------------------------------------------- /src/test/java/de/digitalcollections/solr/plugin/components/ocrhighlighting/AbsoluteHighlightingTest.java: -------------------------------------------------------------------------------- 1 | package de.digitalcollections.solr.plugin.components.ocrhighlighting; 2 | 3 | import com.jayway.jsonpath.DocumentContext; 4 | import com.jayway.jsonpath.JsonPath; 5 | import com.revinate.assertj.json.JsonPathAssert; 6 | import org.apache.solr.SolrTestCaseJ4; 7 | import org.junit.BeforeClass; 8 | import org.junit.Test; 9 | 10 | /** Test that configuring the plugin with absolute coordinates works as expected. 
**/ 11 | public class AbsoluteHighlightingTest extends SolrTestCaseJ4 { 12 | @BeforeClass 13 | public static void beforeClass() throws Exception { 14 | initCore("conf/solrconfig.xml", "conf/schema.xml", "src/test/resources/solr", "min_absolute"); 15 | assertU(adoc("ocr_text", "two☛x:12300,y:432,w:543,h:654, one☛x:654,y:543,w:432,h:321,", "id", "101")); 16 | assertU(adoc("ocr_text", "three☛x:127,y:4820,w:5490,h:654, two☛x:654,y:54337,w:431,h:341 five☛x:0,y:0,w:0,h:0, " 17 | + "four☛x:111,y:111,w:111,h:111,", "id", "102")); 18 | 19 | assertU(commit()); 20 | } 21 | 22 | @Test 23 | public void testMinimal() throws Exception { 24 | String json = JQ(req( 25 | "q", "two", "sort", "id asc", "ocr_hl", "true", "ocr_hl.fields", "ocr_text", "df", "ocr_text")); 26 | DocumentContext ctx = JsonPath.parse(json); 27 | JsonPathAssert.assertThat(ctx).jsonPathAsInteger("ocr_highlighting.101.ocr_text[0].x") 28 | .isEqualTo(12300); 29 | JsonPathAssert.assertThat(ctx).jsonPathAsInteger("ocr_highlighting.101.ocr_text.length()").isEqualTo(1); 30 | JsonPathAssert.assertThat(ctx).jsonPathAsInteger("ocr_highlighting.102.ocr_text[0].y") 31 | .isEqualTo(54337); 32 | JsonPathAssert.assertThat(ctx).jsonPathAsInteger("ocr_highlighting.102.ocr_text.length()").isEqualTo(1); // document 102 contains the query term exactly once 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /example/index_google1000: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import re 3 | import sys 4 | import tarfile 5 | from pathlib import Path 6 | 7 | import requests 8 | 9 | 10 | OCRTEXT_URL = 'https://zvdd-ng.de/files/google1000_solr.tgz' 11 | SOLR_HOST = 'localhost:8983' 12 | SOLR_CORE = 'ocrtest' 13 | 14 | 15 | class SolrException(Exception): 16 | def __init__(self, resp, payload): 17 | self.message = resp 18 | self.payload = payload 19 | 20 | 21 | def index_documents(docs): 22 | resp = requests.post( 23 | "http://{}/solr/{}/update".format(SOLR_HOST, 
SOLR_CORE), 24 | json=docs, params=dict(softCommit="true")) 25 | if not resp: 26 | raise SolrException(resp.json(), docs) 27 | 28 | 29 | def fetch_ocrtext(): 30 | with requests.get(OCRTEXT_URL, stream=True) as resp: 31 | tf = tarfile.open(fileobj=resp.raw, mode="r|gz") 32 | for ti in tf: 33 | if not ti.isfile() or not ti.name.endswith('.txt'): 34 | continue 35 | ident = int(re.findall('\d{4}', ti.name)[0]) 36 | yield ident, tf.extractfile(ti).read().decode('utf8') 37 | 38 | 39 | def load_ocrtext(base_dir): 40 | base_dir = Path(base_dir) 41 | for idx, txt in enumerate(sorted(base_dir.glob("./*.txt"))): 42 | with txt.open("rt") as fp: 43 | yield idx, fp.read() 44 | 45 | 46 | if __name__ == '__main__': 47 | if len(sys.argv) > 1: 48 | txt_iter = load_ocrtext(sys.argv[1]) 49 | else: 50 | txt_iter = fetch_ocrtext() 51 | batch = [] 52 | for ident, text in txt_iter: 53 | doc = dict(id=ident, ocr_text=text) 54 | batch.append(doc) 55 | if len(batch) == 50: 56 | print("Indexing batch of 50 documents...") 57 | index_documents(batch) 58 | batch = [] 59 | if batch: 60 | index_documents(batch) 61 | -------------------------------------------------------------------------------- /src/test/java/de/digitalcollections/solr/plugin/components/ocrhighlighting/MinimalHighlightingTest.java: -------------------------------------------------------------------------------- 1 | package de.digitalcollections.solr.plugin.components.ocrhighlighting; 2 | 3 | import com.jayway.jsonpath.DocumentContext; 4 | import com.jayway.jsonpath.JsonPath; 5 | import com.revinate.assertj.json.JsonPathAssert; 6 | import org.apache.solr.SolrTestCaseJ4; 7 | import org.junit.BeforeClass; 8 | import org.junit.Test; 9 | 10 | import java.math.BigDecimal; 11 | 12 | /** Test that configuring the plugin without page/line/word indices works as expected. 
**/ 13 | public class MinimalHighlightingTest extends SolrTestCaseJ4 { 14 | @BeforeClass 15 | public static void beforeClass() throws Exception { 16 | initCore("conf/solrconfig.xml", "conf/schema.xml", "src/test/resources/solr", "minimal"); 17 | assertU(adoc("ocr_text", "two|x:12.3,y:43.2,w:54.3,h:65.4, one|x:65.4,y:54.3,w:43.2,h:32.1,", "id", "101")); 18 | assertU(adoc("ocr_text", "three|x:12.7,y:48.2,w:54.9,h:65.4, two|x:65.4,y:54.3,w:43.1,h:34.1, five|x:0,y:0,w:0,h:0, " 19 | + "four|x:11.1,y:11.1,w:11.1,h:11.1,", "id", "102")); 20 | 21 | assertU(commit()); 22 | } 23 | 24 | @Test 25 | public void testMinimal() throws Exception { 26 | String json = JQ(req( 27 | "q", "two", "sort", "id asc", "ocr_hl", "true", "ocr_hl.fields", "ocr_text", "df", "ocr_text")); 28 | DocumentContext ctx = JsonPath.parse(json); 29 | JsonPathAssert.assertThat(ctx).jsonPathAsBigDecimal("ocr_highlighting.101.ocr_text[0].x") 30 | .isBetween(BigDecimal.valueOf(0.1230), BigDecimal.valueOf(0.1239)); 31 | JsonPathAssert.assertThat(ctx).jsonPathAsInteger("ocr_highlighting.101.ocr_text.length()").isEqualTo(1); 32 | JsonPathAssert.assertThat(ctx).jsonPathAsBigDecimal("ocr_highlighting.102.ocr_text[0].x") 33 | .isBetween(BigDecimal.valueOf(0.6540), BigDecimal.valueOf(0.6549)); 34 | JsonPathAssert.assertThat(ctx).jsonPathAsInteger("ocr_highlighting.102.ocr_text.length()").isEqualTo(1); // document 102 contains the query term exactly once 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /example/README.md: -------------------------------------------------------------------------------- 1 | # Example Setup 2 | 3 | ## Configuration 4 | The index is configured to store the OCR bounding boxes with the following parameters: 5 | 6 | - Coordinates are stored as **absolute pixel values** 7 | - The payload needs **11 bytes** per token 8 | (14 bits per coordinate, 12 bits for page-, 11 bits for line- and 9 bits for word-indices) 9 | 10 | 11 | ## Dataset 12 | This demo creates an index of the [Google 1000 Books ICDAR 
2007 dataset](http://commondatastorage.googleapis.com/books/icdar2007/README.txt). 13 | It consists of 103,672,774 tokens split across 1000 OCRed books taken from the Google Books project. 14 | The resulting index is 1.1GiB in size, compared to 4.4GiB for the uncompressed input documents. 15 | 16 | 17 | ## Running the demo 18 | - Launch the Docker container: `docker-compose up` 19 | - Index the pre-converted OCR volumes with `./index_google1000` 20 | - **Search!** `curl 'http://localhost:8983/solr/ocrtest/select?q=ocr_text:harvard&ocr_hl=true&ocr_hl.fields=ocr_text'` 21 | 22 | 23 | ## Converting the hOCR to the input format manually 24 | The instructions above fetch an archive with the hOCRs from the dataset pre-converted 25 | (https://zvdd-ng.de/files/google1000_solr.tgz). If you want to do this yourself, follow these steps: 26 | 27 | - Obtain the dataset by downloading the individual books, ideally with a newer version of bash or zsh: 28 | ```sh 29 | $ wget http://commondatastorage.googleapis.com/books/icdar2007/Volume_{0000..0999}.zip 30 | $ for zip in *.zip; do unzip $zip; done 31 | ``` 32 | - Convert the individual hOCR files to the format needed by the Solr configuration 33 | (`{word}☛p:{page},l:{line},n:{word_idx},x:{x},y:{y},w:{width},h:{height}`): 34 | ```sh 35 | $ for hocr in Volume_*/hOCR.html; do ./hocr2solr $hocr > $(echo $hocr |sed 's/.html/.txt/'); done 36 | ``` 37 | - Index the books by passing the directory with the `.txt`-files as the first parameter: 38 | ```sh 39 | $ ./index_google1000 /path/to/txt-files 40 | ``` 41 | -------------------------------------------------------------------------------- /src/test/java/de/digitalcollections/solr/plugin/components/ocrhighlighting/DistributedOcrHighlightingTest.java: -------------------------------------------------------------------------------- 1 | package de.digitalcollections.solr.plugin.components.ocrhighlighting; 2 | 3 | import org.apache.solr.BaseDistributedSearchTestCase; 4 | import org.apache.solr.handler.component.SearchComponent; 5 | import 
org.junit.BeforeClass; 6 | import org.junit.Test; 7 | 8 | public class DistributedOcrHighlightingTest extends BaseDistributedSearchTestCase { 9 | 10 | @BeforeClass 11 | public static void beforeClass() throws Exception { 12 | System.setProperty("managed.schema.mutable", "true"); 13 | 14 | initCore("conf/solrconfig.xml", "conf/schema.xml", "src/test/resources/solr", "alldata"); 15 | 16 | // The highlighting component should be active 17 | SearchComponent highlighter = h.getCore().getSearchComponent("ocr_highlight"); 18 | assertTrue("wrong highlighter: " + highlighter.getClass(), 19 | highlighter instanceof OcrHighlighting); 20 | 21 | assertU(adoc("ocr_text", "contains|p:20,l:3,n:5,x:11.1,y:22.2,w:33.3,h:44.4, position|p:20,l:4,n:6,x:55.5,y:66.6,w:77.7,h:88.8,", "id", "105")); 22 | 23 | assertU(BaseDistributedSearchTestCase.commit()); 24 | } 25 | 26 | @Test 27 | @ShardsRepeat(max=5) 28 | public void testWithPageNumberAndPosition() { 29 | assertQ( 30 | "terms with both page number and word position", 31 | req("q", "contains position", "sort", "id asc", "ocr_hl", "true","ocr_hl.fields", "ocr_text", "df", "ocr_text"), 32 | "count(//lst[@name='ocr_highlighting']/*)=1", 33 | "count(//lst[@name='ocr_highlighting']/lst[@name='105']/arr[@name='ocr_text']/lst)=2", 34 | "(//lst[@name='ocr_highlighting']/lst[@name='105']/arr[@name='ocr_text']/lst)[1]/int[@name='page']='20'", 35 | "(//lst[@name='ocr_highlighting']/lst[@name='105']/arr[@name='ocr_text']/lst)[1]/int[@name='line']='3'", 36 | "(//lst[@name='ocr_highlighting']/lst[@name='105']/arr[@name='ocr_text']/lst)[1]/int[@name='word']='5'", 37 | "(//lst[@name='ocr_highlighting']/lst[@name='105']/arr[@name='ocr_text']/lst)[2]/int[@name='page']='20'", 38 | "(//lst[@name='ocr_highlighting']/lst[@name='105']/arr[@name='ocr_text']/lst)[2]/int[@name='line']='4'", 39 | "(//lst[@name='ocr_highlighting']/lst[@name='105']/arr[@name='ocr_text']/lst)[2]/int[@name='word']='6'"); 40 | } 41 | } 42 | 
-------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | jdk: 3 | - openjdk8 4 | - oraclejdk8 5 | - openjdk11 6 | addons: 7 | apt: 8 | packages: 9 | - libxml2-utils 10 | before_script: 11 | - PROJECT_VERSION=$(xmllint --xpath '/*[local-name()="project"]/*[local-name()="version"]/text()' pom.xml) 12 | - if [ ! -z "$TRAVIS_TAG" ]; then mvn versions:set -DnewVersion=$TRAVIS_TAG; fi 13 | script: 14 | - mvn clean install 15 | after_success: 16 | - bash <(curl -s https://codecov.io/bash) 17 | - if [[ "$PROJECT_VERSION" == *SNAPSHOT ]]; then mvn deploy -B -DskipTests=true --settings settings.xml; fi 18 | deploy: 19 | provider: releases 20 | api_key: 21 | secure: LJjZzy9i0OmNJrav71LbyP7kP9oWzkUXgwCnG95b2bzrDnVw0OQVDpU3nQXB9w0AKgF4CCsHMLo6k383n9LQ+f/AGO91a3urya3Rns+Y7Y6ptk3ZRXA39U7/5Eod8lWApd18l+EWvansZEzkukHH0xXGK+SWuTo5taH6PKmrEQKjw9zxMI07+9lIJcN7vddQzdFanCaincz4pW/EgLIhJXcecyprDjnYQnbEVczYsp7W2+XbLn/rhE7vxK/ZCU1nJojUD3YwMrQgBj5MaiAtAEYSwvuq5N9jpR1glEIqlmhw5kdhLIt+R3FlMicWPhKPW/7t7RmCFAxDclkBaJT+RBF/438wZwUXLZ4KbeAcgB3zRkgK1qVpxx/jJF0Q1zHzrRF1tsJSILI7yJBpE+JPp/881JwLJnUgbuAAWy282m9Kd5G3DL7yhkWlG+Jrau6iN89kh+Nko7KtZ3bosvyzne9cgC+AubTY3pF+BebKA0ZQarolhY24U9yOKDmo1gvllEbnlSsV5N3ga7zhZ6rHMpN/X+3qqn9awffHvXsG0dJpXk968c2FzSgwMk7ibcN1vFcSf0MkByOE5iiZDjOpzS4FOi6j70XLfX+8NI/QcP63jXO7oXzNBBWhQaXM/VrHt0oNP2u56UZTKDmgW+9BSIQtIKRCd3FxKfNtr/ES1ms= 22 | file_glob: true 23 | file: 24 | - target/*.jar 25 | skip_cleanup: true 26 | on: 27 | tags: true 28 | env: 29 | global: 30 | - secure: 
WGtLFUeolKk0vLZnMnAdkirAVjHN/PbZPSATSMDdVsBZO36JypwDq8OLcHejjuNT0y4O0QAgN4m5iz6qR5iZjf9qcTSp/eZVd5lgt1sLaWIWfNdccS2cwOLcb6RprKAbrfzhIUVoBbJUt6LvSXE/tJM+lHMcIqGdEsW/u3NbfMxFzZdJTjvbjyN/kszRq4CeuLaou/8NhR1RdN26CvPr9RZkcqNsyKgZql5Tny5Bvt1xbxaMeju+2ZKZVyTXgEntiQM92uYVooyyni6Zl0pfXHwo57EoBwiveVaCRtdQDN2AAl+2B8jV5bsIZipUiGEZDs3E9vYjl0/9YUCCKqqiRk2xMi0lqUmLOenTKVHS28WlKEaPCbxTqTtA+ZmQi9tAgt7YqTOnfIUT7RFEZfTMs5DnedwNdOI055ft71Vbjh612nuFWqkoaxqp8gHmBFRF55R6c0Ixq5HRShiERi9x5U4CAeLnkh25QzvZD+PRfVn52fdQ+qaVkpJwBplzr3/m6RjxXjW6l414y7eSBope+dk5BTLPrC1R1KcU3yGSXugJUr8hVrqDVvhoN5JOwL6j5iPThdkDEebifm6nsH4Hm3sP0KlBFOq7P71UbHOhR8Atj6CWKgYePyPaQ+I8KcARxu4wmjh8FShQeVcRRAKkAd2vYa+qQRAhit+EijL8J5M= 31 | - secure: aQfHA0oyixmU2KM5H33u5cKYSnn/ToSgNMrriCWdY0zKHwIGQuy8Bw8EV3Zp75Gy+1gEh6JrbM4mpdoxvxxPJd45Eywr+ZYrOlw1viZD3XuxZON842iAtp9GbyzMvpGtRK0MGCMWgPjg1sVGhEM5BgFFR4DABq8xKoj2sp1kNKy+hEZSPOfBiGyCO45zcBcU+9JoKyTez7fKQvOK2Sxv52owQbzrpbAChXUQkZvH8zbGnX8SvA534m1X6VaK3SvxTOo/TIQA3b4iBO2SlHNB7p15D9s2P6WxYP/zBlMqhNSA1A3sz/jqVvYZNbMEPp0ZX+qpzympj6RqI0Oqll20LTFQW1HvPMC+U0aVB8C4N5gGyuTir2+pkGAkRiCtmwg189fjvSArfkDTXTknN8nZMvG+hzT0YhJf7+n/NHR0mcHnoCA+vcs06c3GZq7mxMAJj7rRsqpnOUu5Um9J/vZdqg+UurmjroVqWyzAEAi/OK9dad+aPQME8xEpymx55bhBFxR5vH0cjkqGDTBRjz9x/1Km+MIu1Se3H66OT2jl/8CsWiL7Tw2Tcyz6mTXenYktFil6wsYLyTrzR5rgV+fbhveajG635iRol1/TdwHj8LcGhbG8hNCSJwDLafRrh7mHASNo0qNthQkU3SLBND3qTlDlZrVQtqkf0qFMEqaZBdc= 32 | -------------------------------------------------------------------------------- /src/test/java/de/digitalcollections/lucene/analysis/payloads/OcrInfoEncoderTest.java: -------------------------------------------------------------------------------- 1 | package de.digitalcollections.lucene.analysis.payloads; 2 | 3 | import com.google.common.collect.ImmutableMap; 4 | import org.apache.lucene.util.BytesRef; 5 | import org.junit.jupiter.params.ParameterizedTest; 6 | import org.junit.jupiter.params.provider.Arguments; 7 | import org.junit.jupiter.params.provider.MethodSource; 8 | 9 | import java.util.Map; 
10 | import java.util.stream.Stream; 11 | 12 | import static de.digitalcollections.lucene.analysis.payloads.TestUtils.toChars; 13 | import static org.assertj.core.api.Assertions.assertThat; 14 | 15 | public class OcrInfoEncoderTest { 16 | 17 | public static Stream data() { 18 | byte[] withPage = {(byte) 0x1b, (byte) 0x21, (byte) 0xa1, (byte) 0xce, (byte) 0x10, (byte) 0x36}; 19 | byte[] withoutPage = {(byte) 0x21, (byte) 0xa1, (byte) 0xce, (byte) 0x10, (byte) 0x36}; 20 | byte[] withPosition = {(byte) 0x20, (byte) 0x21, (byte) 0xa1, (byte) 0xce, (byte) 0x10, (byte) 0x36}; 21 | byte[] withPositionAndPage = {(byte) 0x36, (byte) 0x20, (byte) 0x21, (byte) 0xa1, (byte) 0xce, (byte) 0x10, (byte) 0x36}; 22 | byte[] withPageLineWord = {(byte) 0x01, (byte) 0xb0, (byte) 0x54, (byte) 0x0c, (byte) 0x1f, (byte) 0x8f, (byte) 0x05, (byte) 0x85, (byte) 0xd3}; 23 | byte[] withPageLineWordAbsolute = {(byte) 0x01, (byte) 0x60, (byte) 0x42, (byte) 0x2c, (byte) 0x03, (byte) 0x0a, (byte) 0x08, (byte) 0x90, (byte) 0x0f, (byte) 0xa0, (byte) 0x03, (byte) 0x70}; 24 | Map params = ImmutableMap.builder() 25 | .put("withPageooo|p:27,x:13.1,y:52.7,w:87.9,h:5.3,", withPage) 26 | .put("withoutPage|x:13.1,y:52.7,w:87.9,h:5.3,", withoutPage) 27 | .put("withWordoooo|n:32,x:13.1,y:52.7,w:87.9,h:5.3,", withPosition) 28 | .put("withPageWord|p:27,n:32,x:13.1,y:52.7,w:87.9,h:5.3,", withPositionAndPage) 29 | .put("withPageLineWord|p:27,l:42,n:12,x:12.3,y:23.4,w:34.5,h:45.6,", withPageLineWord) 30 | .put("withPageLineWordAbsolute|p:22,l:33,n:44,x:778,y:2192,w:4000,h:880", withPageLineWordAbsolute) 31 | .build(); 32 | return params.entrySet().stream().map(e -> Arguments.of(e.getKey(), e.getValue())); 33 | } 34 | 35 | @ParameterizedTest 36 | @MethodSource("data") 37 | public void encode(String tokenFixture, byte[] bytesFixture) { 38 | OcrInfoEncoder encoder = new OcrInfoEncoder( 39 | tokenFixture.contains("Absolute") ? 16 : 10, 40 | tokenFixture.contains("Word") ? 9 : 0, 41 | tokenFixture.contains("Line") ? 
11 : 0, 42 | tokenFixture.contains("withPage") ? 12 : 0, 43 | tokenFixture.contains("Absolute")); 44 | 45 | BytesRef bytes = encoder.encode( 46 | toChars(tokenFixture), 47 | tokenFixture.indexOf("|") + 1, 48 | tokenFixture.length() - tokenFixture.indexOf("|") -1 ); 49 | assertThat(bytes.bytes).isEqualTo(bytesFixture); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/java/de/digitalcollections/lucene/analysis/payloads/OcrInfoEncoder.java: -------------------------------------------------------------------------------- 1 | package de.digitalcollections.lucene.analysis.payloads; 2 | 3 | import org.apache.lucene.analysis.payloads.AbstractEncoder; 4 | import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory; 5 | import org.apache.lucene.analysis.payloads.PayloadEncoder; 6 | import org.apache.lucene.util.BytesRef; 7 | 8 | /** 9 | * Encode an OCR information string as a {@link BytesRef}. 10 | * 11 | * Not intended to be used directly with {@link DelimitedPayloadTokenFilterFactory}. 12 | * Use {@link de.digitalcollections.lucene.analysis.util.DelimitedOcrInfoPayloadTokenFilterFactory} instead. 13 | * For information on the expected format of the payload string, see {@link OcrInfo#parse(char[], int, int, int, int, int, int, boolean)} 14 | * 15 | * To use it, configure the {@link de.digitalcollections.lucene.analysis.util.DelimitedOcrInfoPayloadTokenFilterFactory}: 16 | * 17 | * ```xml 18 | *
{@code
19 |  * 
21 |  * }
22 | * ``` 23 | */ 24 | public class OcrInfoEncoder extends AbstractEncoder implements PayloadEncoder { 25 | 26 | private final int coordBits; 27 | private final int wordBits; 28 | private final int lineBits; 29 | private final int pageBits; 30 | private final boolean absoluteCoordinates; 31 | 32 | /** 33 | * Configure a new OcrInfoEncoder. 34 | * 35 | * The sum of coordBits*4, wordBits, lineBits and pageBits should be divisible by 8, so as not to waste any space in the 36 | * index. 37 | * 38 | * @param coordBits Number of bits to use for storing the OCR coordinates in the index, must be an even number. 39 | * @param wordBits Number of bits to use for storing the word index (0 to disable) 40 | * @param lineBits Number of bits to use for storing the line index (0 to disable) 41 | * @param pageBits Number of bits to use for storing the page index (0 to disable) 42 | * @param absoluteCoordinates Whether the coordinates are stored as absolute (integral position) or relative (percentage position) 43 | */ 44 | public OcrInfoEncoder(int coordBits, int wordBits, int lineBits, int pageBits, boolean absoluteCoordinates) { 45 | this.coordBits = coordBits; 46 | this.wordBits = wordBits; 47 | this.lineBits = lineBits; 48 | this.pageBits = pageBits; 49 | this.absoluteCoordinates = absoluteCoordinates; 50 | } 51 | 52 | /** 53 | * Default constructor that encodes with 12bit for the coordinates and doesn't store any indices. 54 | */ 55 | public OcrInfoEncoder() { 56 | this(12, 0, 0, 0, false); 57 | } 58 | 59 | /** 60 | * Encode the OCR payload (see {@link OcrInfo#parse(char[], int, int, int, int, int, int, boolean)} 61 | * for how it should be formatted) to a space-efficient binary representation. 
62 | */ 63 | @Override 64 | public BytesRef encode(char[] chars, int offset, int length) { 65 | OcrInfo info = OcrInfo.parse(chars, offset, length, wordBits, lineBits, pageBits, coordBits, absoluteCoordinates); 66 | byte[] data = OcrPayloadHelper.encodeOcrInfo(info, coordBits, wordBits, lineBits, pageBits); 67 | return new BytesRef(data); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/test/java/de/digitalcollections/lucene/analysis/payloads/PayloadHelperTest.java: -------------------------------------------------------------------------------- 1 | package de.digitalcollections.lucene.analysis.payloads; 2 | 3 | import org.apache.lucene.util.BytesRef; 4 | import org.assertj.core.data.Offset; 5 | import org.junit.jupiter.params.ParameterizedTest; 6 | import org.junit.jupiter.params.provider.Arguments; 7 | import org.junit.jupiter.params.provider.MethodSource; 8 | 9 | import java.util.stream.Stream; 10 | 11 | import static org.assertj.core.api.Assertions.assertThat; 12 | 13 | 14 | public class PayloadHelperTest { 15 | public static Stream fixtureProvider() { 16 | byte[] withPage = {(byte) 0x1b, (byte) 0x21, (byte) 0xa1, (byte) 0xce, (byte) 0x10, (byte) 0x36}; 17 | byte[] withoutPage = {(byte)0x23, (byte)0x1e, (byte)0x9f, (byte)0xa8, (byte)0x36}; 18 | byte[] withPosition = {(byte) 0x20, (byte) 0x21, (byte) 0xa1, (byte) 0xce, (byte) 0x10, (byte) 0x36}; 19 | byte[] withPositionAndPage = {(byte) 0x0, (byte) 0xd8, (byte) 0x20, (byte) 0x21, (byte) 0xa1, (byte) 0xce, (byte) 0x10, (byte) 0x36}; 20 | byte[] withPageLineWordAbsolute = {(byte) 0x01, (byte) 0x60, (byte) 0x42, (byte) 0x2c, (byte) 0x30, (byte) 0xa8, (byte) 0x90, (byte) 0x19, (byte) 0x03, (byte) 0x70}; 21 | return Stream.of( 22 | Arguments.of(new OcrInfo(27, .131f, .527f, .879f, .053f), withPage), 23 | Arguments.of(new OcrInfo(.1368f, .4779f, .9782f, .0532f), withoutPage), 24 | Arguments.of(new OcrInfo(-1, 32, .131f, .527f, .879f, .053f), withPosition), 25 | 
Arguments.of(new OcrInfo(27, 32, .131f, .527f, .879f, .053f), withPositionAndPage), 26 | Arguments.of(new OcrInfo(22, 33, 44, 778, 2192, 400, 880), 27 | withPageLineWordAbsolute)); 28 | } 29 | 30 | private void assertAreAboutEqual(OcrInfo a, OcrInfo b) { 31 | assertThat(a.getHorizontalOffset()).isCloseTo(b.getHorizontalOffset(), Offset.offset(0.09f)); 32 | assertThat(a.getVerticalOffset()).isCloseTo(b.getVerticalOffset(), Offset.offset(0.09f)); 33 | assertThat(a.getWidth()).isCloseTo(b.getWidth(), Offset.offset(0.09f)); 34 | assertThat(a.getHeight()).isCloseTo(b.getHeight(), Offset.offset(0.09f)); 35 | } 36 | 37 | @ParameterizedTest 38 | @MethodSource("fixtureProvider") 39 | public void encodeOcrInfo(OcrInfo ocrInfo, byte[] payload) { 40 | byte[] encodedInfo = OcrPayloadHelper.encodeOcrInfo( 41 | ocrInfo, 42 | ocrInfo.getHasAbsoluteCoordinates() ? 12 : 10, 43 | ocrInfo.getWordIndex() >= 0 ? 9 : 0, 44 | ocrInfo.getLineIndex() >= 0 ? 11 : 0, 45 | ocrInfo.getPageIndex() >= 0 ? 12 : 0); 46 | assertThat(encodedInfo).isEqualTo(payload); 47 | } 48 | 49 | @ParameterizedTest 50 | @MethodSource("fixtureProvider") 51 | public void decodeOcrInfo(OcrInfo ocrInfo, byte[] payload) { 52 | OcrInfo decodedInfo = OcrPayloadHelper.decodeOcrInfo( 53 | new BytesRef(payload), 54 | ocrInfo.getHasAbsoluteCoordinates() ? 12: 10, 55 | ocrInfo.getWordIndex() >= 0 ? 9 : 0, 56 | ocrInfo.getLineIndex() >= 0 ? 11 : 0, 57 | ocrInfo.getPageIndex() >= 0 ? 
12 : 0, 58 | ocrInfo.getHasAbsoluteCoordinates()); 59 | if (ocrInfo.getHasAbsoluteCoordinates()) { 60 | assertThat(decodedInfo).isEqualToComparingFieldByField(ocrInfo); 61 | } else { 62 | assertAreAboutEqual(decodedInfo, ocrInfo); 63 | } 64 | } 65 | 66 | @ParameterizedTest 67 | @MethodSource("fixtureProvider") 68 | public void doesNotDegradeAccuracy(OcrInfo ocrInfo, byte[] payload) { 69 | if (ocrInfo.getHasAbsoluteCoordinates()) { 70 | // NOP, there's no risk of degradation with integers 71 | return; 72 | } 73 | byte[] encodedInfo; 74 | OcrInfo decodedInfo = ocrInfo; 75 | for (int i=0; i < 100; i++) { 76 | encodedInfo = OcrPayloadHelper.encodeOcrInfo(decodedInfo, 10, 9, 11, 12); 77 | decodedInfo = OcrPayloadHelper.decodeOcrInfo(new BytesRef(encodedInfo), 10, 78 | ocrInfo.getWordIndex() >= 0 ? 9 : 0, ocrInfo.getLineIndex() >= 0 ? 11 : 0, ocrInfo.getPageIndex() >= 0 ? 12 : 0, false); 79 | assertAreAboutEqual(decodedInfo, ocrInfo); 80 | } assertAreAboutEqual(decodedInfo, ocrInfo); 81 | } 82 | } -------------------------------------------------------------------------------- /src/main/java/de/digitalcollections/lucene/analysis/util/DelimitedOcrInfoPayloadTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package de.digitalcollections.lucene.analysis.util; 2 | 3 | import de.digitalcollections.lucene.analysis.payloads.OcrInfo; 4 | import de.digitalcollections.lucene.analysis.payloads.OcrInfoEncoder; 5 | import de.digitalcollections.lucene.analysis.payloads.OcrPayloadHelper; 6 | import java.util.Map; 7 | import org.apache.lucene.analysis.TokenStream; 8 | import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter; 9 | import org.apache.lucene.analysis.util.TokenFilterFactory; 10 | import org.slf4j.Logger; 11 | import org.slf4j.LoggerFactory; 12 | 13 | /** 14 | * Filter factory for space-efficiently encoding OCR information in token payloads. 
15 | * 16 | * For information on the expected format of the payload string, see 17 | * {@link OcrInfo#parse(char[], int, int, int, int, int, int, boolean)} 18 | * 19 | * Takes the following configuration parameters: 20 | * 21 | * `coordinateBits` 22 | * : Number of bits to use for encoding a coordinate value. 10 bits is recommended (and set by default), 23 | * which yields a precision to approximately three decimal places 24 | * 25 | * `delimiter` 26 | * : Delimiting character. If not provided, the pipe symbol (`|`) is used 27 | * 28 | * `pageBits` 29 | * : Number of bits to use for encoding the page index. 0 will disable page indices (default). 30 | * 31 | * `lineBits` 32 | * : Number of bits to use for encoding the line index. 0 will disable line indices (default). 33 | * 34 | * `wordBits` 35 | * : Number of bits to use for encoding the word index. 0 will disable word indices (default). 36 | * 37 | * Here is a sample configuration with page indices enabled: 38 | * ``` 39 | *
{@code
40 |  * 
42 |  * }
43 | * ``` 44 | */ 45 | public class DelimitedOcrInfoPayloadTokenFilterFactory extends TokenFilterFactory { 46 | 47 | private static final Logger LOGGER = LoggerFactory.getLogger(OcrPayloadHelper.class); 48 | 49 | private static final String COORD_BITS_ATTR = "coordinateBits"; 50 | private static final String DELIMITER_ATTR = "delimiter"; 51 | private static final String PAGE_BITS_ATTR = "pageBits"; 52 | private static final String LINE_BITS_ATTR = "lineBits"; 53 | private static final String WORD_BITS_ATTR = "wordBits"; 54 | private static final String ABSOLUTE_COORDS_ATTR = "absoluteCoordinates"; 55 | 56 | /** Delimiter to use for splitting OCR information from the tokens **/ 57 | private final char delimiter; 58 | 59 | private OcrInfoEncoder encoder; 60 | 61 | public DelimitedOcrInfoPayloadTokenFilterFactory(Map args) { 62 | super(args); 63 | delimiter = getChar(args, DELIMITER_ATTR, '|'); 64 | 65 | /* Number of bits to use for encoding position information */ 66 | final int coordinateBits = getInt(args, COORD_BITS_ATTR, 10); 67 | final int pageBits = getInt(args, PAGE_BITS_ATTR, 0); 68 | final int lineBits = getInt(args, LINE_BITS_ATTR, 0); 69 | final int wordBits = getInt(args, WORD_BITS_ATTR, 0); 70 | final boolean absoluteCoordinates = getBoolean(args, ABSOLUTE_COORDS_ATTR, false); 71 | 72 | int coordWidth = coordinateBits * 4; 73 | int remainder = coordWidth % 8; 74 | if (remainder != 0) { 75 | throw new IllegalArgumentException("coordinateBits must be an even number."); 76 | } 77 | int bitSum = coordWidth + pageBits + lineBits + wordBits; 78 | remainder = bitSum % 8; 79 | if (remainder != 0) { 80 | LOGGER.warn("Final payload size {} is not divisible by 8, will be padded. 
This is wasting {} bits, try playing " 81 | + "with the wordBits, lineBits and/or pageBits options.", bitSum, remainder); 82 | } 83 | encoder = new OcrInfoEncoder(coordinateBits, wordBits, lineBits, pageBits, absoluteCoordinates); 84 | if (!args.isEmpty()) { 85 | throw new IllegalArgumentException("Unknown parameters: " + args); 86 | } 87 | } 88 | 89 | @Override 90 | public TokenStream create(TokenStream input) { 91 | return new DelimitedPayloadTokenFilter(input, delimiter, encoder); 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/test/java/de/digitalcollections/lucene/analysis/payloads/OcrInfoTest.java: -------------------------------------------------------------------------------- 1 | package de.digitalcollections.lucene.analysis.payloads; 2 | 3 | import org.junit.jupiter.api.Test; 4 | import org.junit.jupiter.params.ParameterizedTest; 5 | import org.junit.jupiter.params.provider.Arguments; 6 | import org.junit.jupiter.params.provider.MethodSource; 7 | 8 | import java.util.stream.Stream; 9 | 10 | import static de.digitalcollections.lucene.analysis.payloads.TestUtils.toChars; 11 | import static org.assertj.core.api.Assertions.assertThat; 12 | import static org.assertj.core.api.Assertions.assertThatThrownBy; 13 | 14 | public class OcrInfoTest { 15 | public static Stream fixtureProvider() { 16 | return Stream.of( 17 | Arguments.of(new OcrInfo(27, .131f, .527f, .879f, .053f), "p:27,x:13.1,y:52.7,w:87.9,h:5.3"), 18 | Arguments.of(new OcrInfo(.131f, .527f, .879f, .053f), "x:13.1,y:52.7,w:87.9,h:5.3"), 19 | Arguments.of(new OcrInfo(-1, 50, .123f, .456f, .789f, .091f), "l:50,x:12.3,y:45.6,w:78.9,h:9.1"), 20 | Arguments.of(new OcrInfo(123,456,.123f,.456f, .234f, .456f), "p:123,l:456,x:12.3,y:45.6,w:23.4,h:45.6"), 21 | Arguments.of(new OcrInfo( 123, 456, 511, .123f, .234f, .345f, .456f), "p:123,l:456,n:511,x:12.3,y:23.4,w:34.5,h:45.6"), 22 | Arguments.of(new OcrInfo(123, -1, 456, .123f, .234f, .345f, .456f), 
"p:123,n:456,x:12.3,y:23.4,w:34.5,h:45.6"), 23 | Arguments.of(new OcrInfo( 123, 456, 511, .1234f, .2345f, .3456f, .4567f), "p:123,l:456,n:511,x:12.34,y:23.45,w:34.56,h:45.67"), 24 | Arguments.of(new OcrInfo(123, 456, 511, 768, 1024, 2048, 4095), 25 | "p:123,l:456,n:511,x:768,y:1024,w:2048,h:4095") 26 | ); 27 | } 28 | 29 | @ParameterizedTest 30 | @MethodSource("fixtureProvider") 31 | public void parseFromBeginning(OcrInfo info, String payload) { 32 | char[] buf = toChars(payload); 33 | OcrInfo parsed = OcrInfo.parse( 34 | buf, 0, payload.length(), 35 | info.getWordIndex() > 0 ? 9 : 0, info.getLineIndex() > 0 ? 11 : 0, info.getPageIndex() > 0 ? 12 : 0, 36 | 12, 37 | info.getHasAbsoluteCoordinates()); 38 | assertThat(parsed).isEqualToComparingFieldByField(info); 39 | } 40 | 41 | @ParameterizedTest 42 | @MethodSource("fixtureProvider") 43 | public void parseFromPosition(OcrInfo info, String payload) { 44 | String padding = "someToken|"; 45 | String padded = padding + payload; 46 | char[] buf = toChars(padded); 47 | OcrInfo parsed = OcrInfo.parse( 48 | buf, padding.length(), payload.length(), 49 | info.getWordIndex() > 0 ? 9 : 0, info.getLineIndex() > 0 ? 11 : 0, info.getPageIndex() > 0 ? 
12 : 0, 50 | 12, 51 | info.getHasAbsoluteCoordinates()); 52 | assertThat(parsed).isEqualToComparingFieldByField(info); 53 | } 54 | 55 | @Test 56 | public void keysMustNotBeUsedMultipleTimes() { 57 | String payload = "p:12,x:34.5,n:56,x:78.9,y:87.6,w:54.3,h:21"; 58 | assertThatThrownBy(() -> OcrInfo.parse(toChars(payload), 0, payload.length(), 9, 11, 12, 12, false)) 59 | .isInstanceOf(IllegalArgumentException.class) 60 | .hasMessageContaining("Invalid payload p:12,x:34.5,n:56,x:78.9,y:87.6,w:54.3,h:21: duplicate key 'x'"); 61 | } 62 | 63 | @Test 64 | public void catchOverFlow() { 65 | String idxOverflow = "p:12,l:34,n:512,x:78.9,y:87.6,w:54.3,h:2.1"; 66 | assertThatThrownBy(() -> OcrInfo.parse(toChars(idxOverflow), 0, idxOverflow.length(), 9, 11, 12, 12, false)) 67 | .isInstanceOf(IllegalArgumentException.class) 68 | .hasMessageContaining("512 for word needs more than 9 bits (valid values range from 0 to 511). Payload=p:12,l:34,n:512,x:78.9,y:87.6,w:54.3,h:2.1"); 69 | String coordOverFlow = "p:1,l:2,n:3,x:4096,y:2048,w:1024,h:512"; 70 | assertThatThrownBy(() -> OcrInfo.parse(toChars(coordOverFlow), 0, coordOverFlow.length(), 9, 11, 12, 12, true)) 71 | .isInstanceOf(IllegalArgumentException.class) 72 | .hasMessageContaining("4096 for x needs more than 12 bits (valid values range from 0 to 4095). 
Payload=p:1,l:2,n:3,x:4096,y:2048,w:1024,h:512"); 73 | } 74 | 75 | @Test 76 | public void missingParametersAreCaught() { 77 | String missingLine = "p:12,n:56,x:78.9,y:87.6,w:54.3,h:2.1"; 78 | assertThatThrownBy(() -> OcrInfo.parse(toChars(missingLine), 0, missingLine.length(), 9, 11, 12, 12, false)) 79 | .isInstanceOf(IllegalArgumentException.class) 80 | .hasMessageContaining("fix payload or set the 'lineBits' option to 0."); 81 | String missingWord = "p:12,l:34,x:78.9,y:87.6,w:54.3,h:2.1"; 82 | assertThatThrownBy(() -> OcrInfo.parse(toChars(missingWord), 0, missingWord.length(), 9, 11, 12, 12, false)) 83 | .isInstanceOf(IllegalArgumentException.class) 84 | .hasMessageContaining("fix payload or set the 'wordBits' option to 0."); 85 | String missingPage = "l:34,n:56,x:78.9,y:87.6,w:54.3,h:2.1"; 86 | assertThatThrownBy(() -> OcrInfo.parse(toChars(missingPage), 0, missingPage.length(), 9, 11, 12, 12, false)) 87 | .isInstanceOf(IllegalArgumentException.class) 88 | .hasMessageContaining("fix payload or set the 'pageBits' option to 0."); 89 | String missingCoord = "p:12,l:34,n:56,x:78.9,y:87.6,w:54.3"; 90 | assertThatThrownBy(() -> OcrInfo.parse(toChars(missingCoord), 0, missingCoord.length(), 9, 11, 12, 12, false)) 91 | .isInstanceOf(IllegalArgumentException.class) 92 | .hasMessageContaining("coordinates are missing from payload "); 93 | } 94 | } -------------------------------------------------------------------------------- /src/test/java/de/digitalcollections/solr/plugin/components/ocrhighlighting/OcrHighlightingTest.java: -------------------------------------------------------------------------------- 1 | package de.digitalcollections.solr.plugin.components.ocrhighlighting; 2 | 3 | import java.nio.file.Files; 4 | import java.nio.file.Paths; 5 | import org.apache.solr.SolrTestCaseJ4; 6 | import org.apache.solr.handler.component.SearchComponent; 7 | import org.junit.BeforeClass; 8 | import org.junit.Test; 9 | 10 | public class OcrHighlightingTest extends 
/**
 * Integration tests for the OCR highlighting search component, run against the `alldata` test core.
 *
 * Documents 101, 102 and 106 are small hand-written fixtures; document 103 is the full OCR text from
 * `/data/ocrtext_full.txt`.
 */
public class OcrHighlightingTest extends SolrTestCaseJ4 {
  /** Boot the test core, check that the highlighter is registered and index the fixture documents. */
  @BeforeClass
  public static void beforeClass() throws Exception {
    initCore("solrconfig.xml", "schema.xml", "src/test/resources/solr", "alldata");

    // The highlighting component should be active
    SearchComponent highlighter = h.getCore().getSearchComponent("ocr_highlight");
    assertTrue("wrong highlighter: " + highlighter.getClass(),
        highlighter instanceof OcrHighlighting);

    // The fixture file is joined into a single field value, lines separated by spaces
    String ocrText = String.join(" ", Files
        .readAllLines(Paths.get(OcrHighlighting.class.getResource("/data/ocrtext_full.txt").toURI())));
    assertU(adoc("ocr_text", "two|p:27,l:13,n:24,x:12.3,y:43.2,w:54.3,h:65.4, one|p:28,l:27,n:64,x:65.4,y:54.3,w:43.2,h:32.1", "id", "101"));
    assertU(adoc("ocr_text", "three|p:28,l:14,n:25,x:12.7,y:48.2,w:54.9,h:65.4, two|p:29,l:27,n:64,x:65.4,y:54.3,w:43.1,h:34.1, five|p:30,l:17,n:80,x:0,y:0,w:0,h:0, "
        + "four|p:31,l:32,n:33,x:11.1,y:11.1,w:11.1,h:11.1", "id", "102"));
    assertU(adoc("ocr_text", ocrText, "id", "103"));

    // Test with a dynamic field
    assertU(adoc("body_ocr", "one|p:42,l:13,n:55,x:11.1,y:22.2,w:33.3,h:44.4, two|p:42,l:13,n:66,x:55.5,y:66.6,w:77.7,h:88.8", "id", "106"));

    assertU(commit());
  }

  /** "two" occurs once each in documents 101 (page 27) and 102 (page 29). */
  @Test
  public void testSingleQueryTerm() {
    assertQ(
        "single query term",
        req("q", "two", "sort", "id asc", "ocr_hl", "true","ocr_hl.fields", "ocr_text", "df", "ocr_text"),
        "count(//lst[@name='ocr_highlighting']/*)=2",
        "//lst[@name='ocr_highlighting']/lst[@name='101']/arr[@name='ocr_text']/lst[1]/int[@name='page']='27'",
        "count(//lst[@name='ocr_highlighting']/lst[@name='101']/arr[@name='ocr_text']/lst)=number('1')",
        "//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst[1]/int[@name='page']='29'",
        "count(//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst)=number('1')");
  }

  /** Both query terms only occur in document 102, on pages 30 and 31. */
  @Test
  public void testMultipleQueryTerms() {
    assertQ(
        "multiple query terms",
        req("q", "five four", "sort", "id asc", "ocr_hl", "true","ocr_hl.fields", "ocr_text", "df", "ocr_text"),
        "count(//lst[@name='ocr_highlighting']/*)=1",
        "count(//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst)=number('2')",
        "//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst[1]/int[@name='page']='30'",
        "//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst[2]/int[@name='page']='31'");

  }

  /**
   * The query terms "fives"/"fours" differ from the indexed "five"/"four"; the reported `term` must be the
   * indexed form.
   * NOTE(review): the match presumably relies on the analysis chain configured in the test schema, which is
   * not visible here -- confirm.
   */
  @Test
  public void testMultipleFuzzyQueryTerms() {
    assertQ(
        "multiple fuzzy query terms",
        req("q", "fives fours", "sort", "id asc", "ocr_hl", "true","ocr_hl.fields", "ocr_text", "df", "ocr_text"),
        "count(//lst[@name='ocr_highlighting']/*)=1",
        "count(//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst)=number('2')",
        "//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst[1]/int[@name='page']='30'",
        "//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst[1]/str[@name='term']='five'",
        "//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst[2]/int[@name='page']='31'",
        "//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst[2]/str[@name='term']='four'");
  }

  /** `ocr_hl.maxPerDoc` caps the number of highlights returned for document 103. */
  @Test
  public void testLimitHighlightsPerDoc() {
    assertQ(
        "limit number of highlights per document",
        req("q", "und", "sort", "id asc", "ocr_hl", "true","ocr_hl.fields", "ocr_text", "ocr_hl.maxPerDoc", "5", "df",
            "ocr_text"),
        "count(//lst[@name='ocr_highlighting']/lst[@name='103']/arr[@name='ocr_text']/lst)=number('5')");
  }

  /** `ocr_hl.maxPerPage` caps the number of highlights on a single page (checked for page 183). */
  @Test
  public void testLimitHighlightsPerPage() {
    assertQ(
        "limit number of highlights per page",
        req("q", "und", "sort", "id asc", "ocr_hl", "true","ocr_hl.fields", "ocr_text", "ocr_hl.maxPerPage", "5", "df",
            "ocr_text"),
        "count(//lst[@name='ocr_highlighting']/lst[@name='103']/arr[@name='ocr_text']/lst[int[@name='page']='183'])=number('5')");
  }

  /** Highlighting on the `body_ocr` field (document 106) reports page, line and word indices. */
  @Test
  public void testDynamicField() {
    assertQ(
        "Dynamic field contains term with page number and word position",
        req("q", "one two", "sort", "id asc", "ocr_hl", "true", "ocr_hl.fields", "body_ocr", "df", "body_ocr"),
        "count(//lst[@name='ocr_highlighting']/*)=1",
        "count(//lst[@name='ocr_highlighting']/lst[@name='106']/arr[@name='body_ocr']/lst)=number('2')",
        "(//lst[@name='ocr_highlighting']/lst[@name='106']/arr[@name='body_ocr']/lst)[1]/int[@name='page']='42'",
        "(//lst[@name='ocr_highlighting']/lst[@name='106']/arr[@name='body_ocr']/lst)[1]/int[@name='word']='55'",
        "(//lst[@name='ocr_highlighting']/lst[@name='106']/arr[@name='body_ocr']/lst)[1]/int[@name='line']='13'",
        "(//lst[@name='ocr_highlighting']/lst[@name='106']/arr[@name='body_ocr']/lst)[2]/int[@name='page']='42'",
        "(//lst[@name='ocr_highlighting']/lst[@name='106']/arr[@name='body_ocr']/lst)[2]/int[@name='line']='13'",
        "(//lst[@name='ocr_highlighting']/lst[@name='106']/arr[@name='body_ocr']/lst)[2]/int[@name='word']='66'"
    );
  }
}
**/ 9 | public class OcrPayloadHelper { 10 | 11 | private OcrPayloadHelper() { 12 | // Cannot be instantiated, is only here for the static methods 13 | } 14 | 15 | /** 16 | * Encode a {@link OcrInfo} object into a byte array. 17 | * 18 | * If the coordinates are set to be stored as relative (i.e. percentage values), we first scale the bounding box 19 | * coordinates according to `coordBits`. We then pack the complete information into `coordBits * 4` bits. The bit 20 | * packing is done to save as much space as possible, while scaling is used to still maintain as much precision as 21 | * possible. 22 | * 23 | * Optionally, we also store word, line and page indices if the corresponding option 24 | * (`wordBits`, `lineBits`, `pageBits`) is non-zero. 25 | * 26 | * Here is an example with page, line and word indices, relative coordinates and 10 bits per coordinate value: 27 | * 28 | * **Input:** 29 | * ``` 30 | * info = OcrInfo(pageIndex=837, lineIndex=13, wordIndex=20, horizontalOffset=0.136838387, 31 | * verticalOffset=0.477909823, width=0.978231258, height=0.532390081), 32 | * coordBits = 10 33 | * wordBits = 9 34 | * lineBits = 11 35 | * pageBits = 12 36 | * absoluteCoordinates = false 37 | * ``` 38 | * 39 | * Since we are using 10 bits for each of the four coordinates, 9 bits for the word index, 11 for the line index and 40 | * 12 for the page index, the resulting binary representation will have 72 bits (`4 * 10 + 9 + 11 + 12`) or 9 bytes. 41 | * This is very space-efficient compared to a string-based encoding, e.g. `x136y478w978h532n20l13p837`, which is 42 | * 26 bytes. 43 | * 44 | * 45 | * **Output:** 46 | * ``` 47 | *
{@code
 48 |    * field     | width  |        scaled value        | binary representation
 49 |    * ========================================================================
 50 |    * pageIndex | 12bit  |                        837 | 001101000100
 51 |    * lineIndex | 11bit  |                         13 |  00000001101
 52 |    * wordIndex |  9bit  |                         20 |    000010100
 53 |    * x         | 10bit  | 0.136838387 * 2^10 ~>  140 |   0010001100
 54 |    * y         | 10bit  | 0.477909823 * 2^10 ~>  489 |   0111101001
 55 |    * width     | 10bit  | 0.978231258 * 2^10 ~> 1002 |   1111101010
 56 |    * height    | 10bit  | 0.532390081 * 2^10 ~>  545 |   1000100001
 57 |    * }
58 | * ```` 59 | * 60 | * The resulting byte sequence is as follows (bytes are separated by whitespace): 61 | * ``` 62 | * pageIndex | lineIndex | wordIndex| x | y | w | h 63 | * 00110100 0100|0000 0001101|0 00010100| 00100011 00|011110 1001|1111 101010|10 00100001 64 | * 0x34 0x40 0x1A 0x14 0x23 0x1E 0x9F 0xAA 0x21 65 | * ``` 66 | * 67 | * @param info The {@link OcrInfo} to encode 68 | * @param coordBits The number of bits to encode each OCR coordinate value into 69 | * @param wordBits The number of bits to encode the word index into 70 | * @param lineBits The number of bits to encode the line index into 71 | * @param pageBits The number of bits to encode the page index into 72 | * @return The resulting byte payload 73 | */ 74 | public static byte[] encodeOcrInfo(OcrInfo info, int coordBits, int wordBits, int lineBits, int pageBits) { 75 | // To make bit-fiddling easier, we encode all the values into an arbitrary-length BigInteger 76 | int numBitsTotal = getOutputSize(coordBits, wordBits, lineBits, pageBits); 77 | int outSize = (int) Math.ceil((double) numBitsTotal / 8.0); 78 | BigInteger encoded = new BigInteger(new byte[outSize]); 79 | 80 | if (pageBits > 0) { 81 | encoded = encoded.or(BigInteger.valueOf(info.getPageIndex())); 82 | } 83 | if (lineBits > 0) { 84 | encoded = encoded.shiftLeft(lineBits) 85 | .or(BigInteger.valueOf(info.getLineIndex())); 86 | } 87 | if (wordBits > 0) { 88 | encoded = encoded.shiftLeft(wordBits) 89 | .or(BigInteger.valueOf(info.getWordIndex())); 90 | } 91 | if (info.getHasAbsoluteCoordinates()) { 92 | encoded = encoded 93 | .shiftLeft(coordBits) 94 | .or(BigInteger.valueOf(verifyAbsoluteValue((int) info.getHorizontalOffset(), coordBits))) 95 | .shiftLeft(coordBits) 96 | .or(BigInteger.valueOf(verifyAbsoluteValue((int) info.getVerticalOffset(), coordBits))) 97 | .shiftLeft(coordBits) 98 | .or(BigInteger.valueOf(verifyAbsoluteValue((int) info.getWidth(), coordBits))) 99 | .shiftLeft(coordBits) 100 | 
.or(BigInteger.valueOf(verifyAbsoluteValue((int) info.getHeight(), coordBits))); 101 | 102 | } else { 103 | encoded = encoded 104 | .shiftLeft(coordBits) 105 | .or(BigInteger.valueOf(encodeValue(info.getHorizontalOffset(), coordBits))) 106 | .shiftLeft(coordBits) 107 | .or(BigInteger.valueOf(encodeValue(info.getVerticalOffset(), coordBits))) 108 | .shiftLeft(coordBits) 109 | .or(BigInteger.valueOf(encodeValue(info.getWidth(), coordBits))) 110 | .shiftLeft(coordBits) 111 | .or(BigInteger.valueOf(encodeValue(info.getHeight(), coordBits))); 112 | } 113 | 114 | byte[] out = encoded.toByteArray(); 115 | 116 | // FIXME: This should only strip as many leading zeroes as out.length - outSize 117 | // Strip extra leading null-bytes 118 | if (out.length > outSize) { 119 | byte[] trimmed = new byte[outSize]; 120 | int trimmedIdx = 0; 121 | boolean prefix = true; 122 | for (byte anOut : out) { 123 | if (anOut != 0 || !prefix) { 124 | prefix = false; 125 | trimmed[trimmedIdx] = anOut; 126 | trimmedIdx += 1; 127 | } 128 | } 129 | out = trimmed; 130 | } 131 | return out; 132 | } 133 | 134 | private static int verifyAbsoluteValue(int value, int coordBits) { 135 | if (value >= IntMath.pow(2, coordBits)) { 136 | throw new IllegalArgumentException(String.format( 137 | "Value %d exceeds legal range of %d bits (0 to %d).", value, coordBits, IntMath.pow(2, coordBits) - 1)); 138 | } 139 | return value; 140 | } 141 | 142 | /** Calculate the size of the payload resulting from the parameters **/ 143 | private static int getOutputSize(int coordBits, int wordBits, int lineBits, int pageBits) { 144 | int outSize = coordBits * 4; 145 | if (pageBits > 0) { 146 | outSize += pageBits; 147 | } 148 | if (lineBits > 0) { 149 | outSize += lineBits; 150 | } 151 | if (wordBits > 0) { 152 | outSize += wordBits; 153 | } 154 | return outSize; 155 | } 156 | 157 | /** 158 | * Encode a given floating point value (between 0 and 1) to an integer with the given number of bits. 
159 | **/ 160 | private static int encodeValue(float source, int numBits) { 161 | return (int) Math.round(source * Math.pow(2, numBits)); 162 | } 163 | 164 | /** 165 | * Decode a given integer (encoded with a certain number of bits) to a floating point value. 166 | **/ 167 | private static float decodeValue(long source, int numBits) { 168 | return (float) (source / Math.pow(2, numBits)); 169 | } 170 | 171 | /** 172 | * Create a bit mask to mask out a given number of bits 173 | */ 174 | private static BigInteger makeBitMask(int numBits) { 175 | return BigInteger.valueOf(IntMath.pow(2, numBits) - 1); 176 | } 177 | 178 | /** 179 | * Decode an {@link OcrInfo} instance from the encoded byte array. 180 | * 181 | * @param data Buffer with encoded binary OCR information 182 | * @param coordBits Number of bits the OCR information was encoded with 183 | * @param wordBits Number of bits the word index was encoded with 184 | * @param lineBits Number of bits the line index was encoded with 185 | * @param pageBits Number of bits the page index was encoded with 186 | * @param absoluteCoordinates Whether the coordinates are stored absolute or relative (percent-values) 187 | * @return The decoded {@link OcrInfo} instance 188 | */ 189 | public static OcrInfo decodeOcrInfo(BytesRef data, int coordBits, int wordBits, int lineBits, int pageBits, 190 | boolean absoluteCoordinates) { 191 | int coordMask = IntMath.pow(2, coordBits) - 1; 192 | OcrInfo info = new OcrInfo(); 193 | info.setHasAbsoluteCoordinates(absoluteCoordinates); 194 | BigInteger encoded = new BigInteger(Arrays.copyOfRange(data.bytes, data.offset, data.offset + data.length)); 195 | 196 | if (absoluteCoordinates) { 197 | info.setHeight(encoded.and(BigInteger.valueOf(coordMask)).intValue()); 198 | info.setWidth(encoded.shiftRight(coordBits) 199 | .and(BigInteger.valueOf(coordMask)).intValue()); 200 | info.setVerticalOffset(encoded.shiftRight(coordBits * 2) 201 | .and(BigInteger.valueOf(coordMask)).intValue()); 202 | 
info.setHorizontalOffset(encoded.shiftRight(coordBits * 3) 203 | .and(BigInteger.valueOf(coordMask)).intValue()); 204 | } else { 205 | info.setHeight(OcrPayloadHelper.decodeValue( 206 | encoded.and(BigInteger.valueOf(coordMask)).intValue(), coordBits)); 207 | info.setWidth(OcrPayloadHelper.decodeValue( 208 | encoded.shiftRight(coordBits) 209 | .and(BigInteger.valueOf(coordMask)).intValue(), coordBits)); 210 | info.setVerticalOffset(OcrPayloadHelper.decodeValue( 211 | encoded.shiftRight(coordBits * 2) 212 | .and(BigInteger.valueOf(coordMask)).intValue(), coordBits)); 213 | info.setHorizontalOffset(OcrPayloadHelper.decodeValue( 214 | encoded.shiftRight(coordBits * 3) 215 | .and(BigInteger.valueOf(coordMask)).intValue(), coordBits)); 216 | } 217 | 218 | int shift = coordBits * 4; 219 | if (wordBits > 0) { 220 | info.setWordIndex(encoded.shiftRight(shift).and(makeBitMask(wordBits)).intValue()); 221 | shift += wordBits; 222 | } 223 | if (lineBits > 0) { 224 | info.setLineIndex(encoded.shiftRight(shift).and(makeBitMask(lineBits)).intValue()); 225 | shift += lineBits; 226 | } 227 | if (pageBits > 0) { 228 | info.setPageIndex(encoded.shiftRight(shift).intValue()); 229 | } 230 | 231 | return info; 232 | } 233 | } 234 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | 5 | solr-ocrpayload-plugin 6 | de.digitalcollections.search 7 | 0.2.2-SNAPSHOT 8 | jar 9 | Solr OCR Coordinate Payload Plugin 10 | 11 | Efficient indexing and bounding-box "highlighting" for OCR text 12 | 13 | https://github.com/dbmdz/solr-ocrpayload-plugin 14 | 15 | 16 | MIT License 17 | https://github.com/dbmdz/solr-ocrpayload-plugin/blob/master/LICENSE 18 | repo 19 | 20 | 21 | 22 | 23 | 24 | Johannes Baiter 25 | johannes.baiter@bsb-muenchen.de 26 | jbaiter 27 | 28 | 29 | Christoph Lorenz 30 | christoph.lorenz@bsb-muenchen.de 31 | clorenz 32 | 33 | 34 | 35 | 36 | 
https://travis-ci.org/dbmdz/solr-ocrpayload-plugin 37 | Travis CI 38 | 39 | 40 | 41 | https://github.com/dbmdz/solr-ocrpayload-plugin/issues 42 | GitHub Issues 43 | 44 | 45 | 46 | https://github.com/dbmdz/solr-ocrpayload-plugin.git 47 | git@github.com:dbmdz/solr-ocrpayload-plugin.git 48 | https://github.com/dbmdz/solr-ocrpayload-plugin 49 | 50 | 51 | 52 | 1.8 53 | 1.8 54 | 1.8 55 | UTF-8 56 | 57 | 3.12.1 58 | 1.2.0 59 | 5.3.2 60 | 2.11.1 61 | 1.7.25 62 | 7.5.0 63 | 64 | 0.8.3 65 | 3.0.0 66 | 3.8.0 67 | 3.1.1 68 | 3.1.0 69 | 3.0.1 70 | 2.22.1 71 | 1.6.8 72 | 73 | 74 | 75 | 76 | com.revinate 77 | assertj-json 78 | ${version.assertj-json} 79 | test 80 | 81 | 82 | org.apache.logging.log4j 83 | log4j-core 84 | ${version.log4j} 85 | 86 | 87 | org.apache.solr 88 | solr-core 89 | ${version.solr} 90 | compile 91 | 92 | 93 | org.apache.solr 94 | solr-test-framework 95 | ${version.solr} 96 | test 97 | 98 | 99 | org.assertj 100 | assertj-core 101 | ${version.assertj} 102 | test 103 | 104 | 105 | org.junit.jupiter 106 | junit-jupiter-api 107 | ${version.junit} 108 | test 109 | 110 | 111 | org.junit.jupiter 112 | junit-jupiter-engine 113 | ${version.junit} 114 | test 115 | 116 | 117 | org.junit.jupiter 118 | junit-jupiter-params 119 | ${version.junit} 120 | test 121 | 122 | 123 | org.junit.vintage 124 | junit-vintage-engine 125 | ${version.junit} 126 | 127 | 128 | org.slf4j 129 | slf4j-api 130 | ${version.slf4j} 131 | 132 | 133 | org.slf4j 134 | slf4j-nop 135 | ${version.slf4j} 136 | test 137 | 138 | 139 | 140 | 141 | 142 | 143 | org.apache.maven.plugins 144 | maven-checkstyle-plugin 145 | ${version.maven-checkstyle-plugin} 146 | 147 | 148 | validate 149 | validate 150 | 151 | https://raw.githubusercontent.com/dbmdz/development/master/code-quality/checkstyle.xml 152 | UTF-8 153 | true 154 | true 155 | false 156 | 157 | 158 | check 159 | 160 | 161 | 162 | 163 | 164 | org.apache.maven.plugins 165 | maven-compiler-plugin 166 | ${version.maven-compiler-plugin} 167 | 168 | 1.8 169 | 
1.8 170 | 171 | 172 | 173 | org.apache.maven.plugins 174 | maven-jar-plugin 175 | ${version.maven-jar-plugin} 176 | 177 | 178 | 179 | true 180 | true 181 | 182 | 183 | 184 | 185 | 186 | org.apache.maven.plugins 187 | maven-javadoc-plugin 188 | ${version.maven-javadoc-plugin} 189 | 190 | 8 191 | 192 | 193 | 194 | attach-javadocs 195 | 196 | jar 197 | 198 | 199 | 200 | 201 | 202 | org.apache.maven.plugins 203 | maven-source-plugin 204 | ${version.maven-source-plugin} 205 | 206 | 207 | attach-sources 208 | 209 | jar-no-fork 210 | 211 | 212 | 213 | 214 | 215 | org.apache.maven.plugins 216 | maven-surefire-plugin 217 | ${version.maven-surefire-plugin} 218 | 219 | 220 | file:/dev/./urandom 221 | 222 | 223 | 224 | 225 | org.jacoco 226 | jacoco-maven-plugin 227 | ${version.jacoco-maven-plugin} 228 | 229 | 230 | pre-unit-test 231 | 232 | prepare-agent 233 | 234 | 235 | 236 | test 237 | 238 | report 239 | 240 | 241 | 242 | 243 | 244 | org.sonatype.plugins 245 | nexus-staging-maven-plugin 246 | ${version.nexus-staging-maven-plugin} 247 | true 248 | 249 | ossrh 250 | https://oss.sonatype.org/ 251 | true 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | org.jacoco 261 | jacoco-maven-plugin 262 | ${version.jacoco-maven-plugin} 263 | 264 | 265 | 266 | 267 | 268 | 269 | deploy 270 | 271 | 272 | 273 | org.apache.maven.plugins 274 | maven-gpg-plugin 275 | 1.6 276 | 277 | 278 | sign-artifacts 279 | verify 280 | 281 | sign 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | ossrh-snapshots 294 | Sonatype Nexus Snapshots 295 | https://oss.sonatype.org/content/repositories/snapshots 296 | 297 | 298 | 299 | 300 | 301 | ossrh-snapshots 302 | Sonatype Nexus Snapshots 303 | https://oss.sonatype.org/content/repositories/snapshots 304 | 305 | true 306 | 307 | 308 | false 309 | 310 | 311 | 312 | 313 | -------------------------------------------------------------------------------- /src/main/java/de/digitalcollections/lucene/analysis/payloads/OcrInfo.java: 
-------------------------------------------------------------------------------- 1 | package de.digitalcollections.lucene.analysis.payloads; 2 | 3 | import com.google.common.math.IntMath; 4 | import java.util.Comparator; 5 | import java.util.HashSet; 6 | import java.util.Set; 7 | import java.util.regex.Matcher; 8 | import java.util.regex.Pattern; 9 | 10 | public class OcrInfo implements Comparable { 11 | 12 | private static final Pattern PAYLOAD_PAT = Pattern.compile("(\\D+):([0-9.]+),?"); 13 | 14 | private boolean hasAbsoluteCoordinates = false; 15 | private float horizontalOffset = -1.0f; 16 | private float verticalOffset = -1.0f; 17 | private float width = -1.0f; 18 | private float height = -1.0f; 19 | private int pageIndex = -1; 20 | private int lineIndex = -1; 21 | private int wordIndex = -1; 22 | 23 | private String term; // optional, only when returning search results 24 | 25 | OcrInfo() { 26 | // NOP 27 | } 28 | 29 | public OcrInfo(int horizontalOffset, int verticalOffset, int width, int height) { 30 | this(-1, horizontalOffset, verticalOffset, width, height); 31 | this.setHasAbsoluteCoordinates(true); 32 | } 33 | 34 | public OcrInfo(int pageIndex, int horizontalOffset, int verticalOffset, int width, int height) { 35 | this(pageIndex, -1, -1, horizontalOffset, verticalOffset, width, height); 36 | this.setHasAbsoluteCoordinates(true); 37 | } 38 | 39 | public OcrInfo(int pageIndex, int lineIndex, int horizontalOffset, int verticalOffset, int width, int height) { 40 | this(pageIndex, lineIndex, -1, horizontalOffset, verticalOffset, width, height); 41 | this.setHasAbsoluteCoordinates(true); 42 | } 43 | 44 | public OcrInfo(int pageIndex, int lineIndex, int wordIndex, int horizontalOffset, int verticalOffset, int width, int height) { 45 | this.setHasAbsoluteCoordinates(true); 46 | this.setHorizontalOffset(horizontalOffset); 47 | this.setVerticalOffset(verticalOffset); 48 | this.setWidth(width); 49 | this.setHeight(height); 50 | this.setPageIndex(pageIndex); 51 | 
this.setLineIndex(lineIndex); 52 | this.setWordIndex(wordIndex); 53 | } 54 | 55 | public OcrInfo(float horizontalOffset, float verticalOffset, float width, float height) { 56 | this(-1, horizontalOffset, verticalOffset, width, height); 57 | } 58 | 59 | public OcrInfo(int pageIndex, float horizontalOffset, float verticalOffset, float width, float height) { 60 | this.setHorizontalOffset(horizontalOffset); 61 | this.setVerticalOffset(verticalOffset); 62 | this.setWidth(width); 63 | this.setHeight(height); 64 | this.setPageIndex(pageIndex); 65 | } 66 | 67 | public OcrInfo(int pageIndex, int lineIndex, float horizontalOffset, float verticalOffset, float width, float height) { 68 | this(pageIndex, horizontalOffset, verticalOffset, width, height); 69 | this.lineIndex = lineIndex; 70 | } 71 | 72 | public OcrInfo(int pageIndex, int lineIndex, int wordIndex, float horizontalOffset, float verticalOffset, float width, float height) { 73 | this(pageIndex, lineIndex, horizontalOffset, verticalOffset, width, height); 74 | this.wordIndex = wordIndex; 75 | } 76 | 77 | /** 78 | * Parse an {@link OcrInfo} object from a character buffer. 79 | * 80 | * The string contains comma-separated pairs of single-character keys and numerical 81 | * values, e.g. `x:13.37`. 
82 | * 83 | * Valid keys are: 84 | * - **p**: Page index, ranging from 0 to 2^pageBits (optional) 85 | * - **l**: Line index, ranging from 0 to 2^lineBits (optional) 86 | * - **n**: Word index, ranging from 0 to 2^wordBits (optional) 87 | * - **x**: Horizontal offset as floating point percentage in range [0...100] 88 | * OR absolute position as unsigned integer in range [0...2^coordBits] (mandatory) 89 | * - **y**: Vertical offset as floating point percentage in range [0...100] 90 | * OR absolute position as unsigned integer in range [0...2^coordBits] (mandatory) 91 | * - **w**: Width as floating point percentage in range [0...100] 92 | * OR absolute position as unsigned integer in range [0...2^coordBits] (mandatory) 93 | * - **h**: Height as floating point percentage in range [0...100] 94 | * OR absolute position as unsigned integer in range [0...2^coordBits] (mandatory) 95 | * 96 | * Here es an example: `p:27,l:50,n:13,x:13.1,y:52.7,w:87.9,h:5.3` 97 | * or, with integral (absolute) coordinate 98 | * 99 | * @param buffer Input character buffer 100 | * @param offset Offset of the encoded character information 101 | * @param length Length of the encoded character information 102 | * @param wordBits Number of bits used for encoding the word index 103 | * @param lineBits Number of bits used for encoding the line index 104 | * @param pageBits Number of bits used for encoding the page index 105 | * @param coordBits Number of bits used for encoding the coordinates 106 | * @param absoluteCoordinates Whether the coordinates are stored absolute or relative (percent-values) 107 | * @return The decoded {@link OcrInfo} instance 108 | */ 109 | public static OcrInfo parse(char[] buffer, int offset, int length, int wordBits, int lineBits, int pageBits, 110 | int coordBits, boolean absoluteCoordinates) { 111 | OcrInfo info = new OcrInfo(); 112 | info.setHasAbsoluteCoordinates(absoluteCoordinates); 113 | 114 | String payload = new String(buffer, offset, length).toLowerCase(); 115 | 
Matcher m = PAYLOAD_PAT.matcher(payload); 116 | Set seenKeys = new HashSet<>(); 117 | while (m.find()) { 118 | char key = m.group(1).charAt(0); 119 | if (seenKeys.contains(key)) { 120 | throw new IllegalArgumentException(String.format("Invalid payload %s: duplicate key '%c'", payload, key)); 121 | } else { 122 | seenKeys.add(key); 123 | } 124 | String value = m.group(2); 125 | switch (key) { 126 | case 'p': 127 | info.setPageIndex(parseIntValue(value, pageBits, "page", payload)); 128 | break; 129 | case 'l': 130 | info.setLineIndex(parseIntValue(value, lineBits, "line", payload)); 131 | break; 132 | case 'n': 133 | info.setWordIndex(parseIntValue(value, wordBits, "word", payload)); 134 | break; 135 | case 'x': 136 | if (absoluteCoordinates) { 137 | info.setHorizontalOffset(parseIntValue(value, coordBits, "x", payload)); 138 | } else { 139 | info.setHorizontalOffset(Float.parseFloat(value) / 100f); 140 | } 141 | break; 142 | case 'y': 143 | if (absoluteCoordinates) { 144 | info.setVerticalOffset(parseIntValue(value, coordBits, "y", payload)); 145 | } else { 146 | info.setVerticalOffset(Float.parseFloat(value) / 100f); 147 | } 148 | break; 149 | case 'w': 150 | if (absoluteCoordinates) { 151 | info.setWidth(parseIntValue(value, coordBits, "w", payload)); 152 | } else { 153 | info.setWidth(Float.parseFloat(value) / 100f); 154 | } 155 | break; 156 | case 'h': 157 | if (absoluteCoordinates) { 158 | info.setHeight(parseIntValue(value, coordBits, "h", payload)); 159 | } else { 160 | info.setHeight(Float.parseFloat(value) / 100f); 161 | } 162 | break; 163 | default: 164 | throw new IllegalArgumentException(String.format( 165 | "Could not parse OCR bounding box information, string was %s, invalid character was %c", 166 | new String(buffer, offset, length), key)); 167 | } 168 | } 169 | if (info.getHorizontalOffset() < 0 || info.getHorizontalOffset() < 0 || info.getWidth() < 0 || info.getHeight() < 0) { 170 | throw new IllegalArgumentException(String.format( 171 | "One or 
more coordinates are missing from payload (was %s), make sure you have 'x', 'y', 'w' and 'h' set!", 172 | payload)); 173 | } 174 | if (pageBits > 0 && info.getPageIndex() < 0) { 175 | throw new IllegalArgumentException(String.format( 176 | "Page index is missing from payload (was: '%s'), fix payload or set the 'pageBits' option to 0.", payload)); 177 | } 178 | if (lineBits > 0 && info.getLineIndex() < 0) { 179 | throw new IllegalArgumentException(String.format( 180 | "Line index is missing from payload (was: '%s'), fix payload or set the 'lineBits' option to 0.", payload)); 181 | } 182 | if (wordBits > 0 && info.getWordIndex() < 0) { 183 | throw new IllegalArgumentException(String.format( 184 | "Word index is missing from payload (was: '%s'), fix payload or set the 'wordBits' option to 0.", payload)); 185 | } 186 | return info; 187 | } 188 | 189 | private static int parseIntValue(String value, int numBits, String type, String payload) { 190 | int index = Integer.parseInt(value); 191 | if (index >= IntMath.pow(2, numBits)) { 192 | throw new IllegalArgumentException(String.format("Value %d for %s needs more than %d bits (valid values range from 0 to %d). 
Payload=%s", 193 | index, type, numBits, IntMath.pow(2, numBits) - 1, payload)); 194 | } 195 | return index; 196 | } 197 | 198 | public float getHorizontalOffset() { 199 | return horizontalOffset; 200 | } 201 | 202 | public void setHorizontalOffset(float horizontalOffset) { 203 | this.horizontalOffset = horizontalOffset; 204 | } 205 | 206 | private void checkCoordinate(float coordinate) { 207 | if (coordinate > 1) { 208 | throw new IllegalArgumentException(String.format("Coordinates can at most be 100, was %1f!", coordinate * 100)); 209 | } 210 | } 211 | 212 | public float getVerticalOffset() { 213 | return verticalOffset; 214 | } 215 | 216 | public void setVerticalOffset(float verticalOffset) { 217 | if (!hasAbsoluteCoordinates) { 218 | checkCoordinate(verticalOffset); 219 | } 220 | this.verticalOffset = verticalOffset; 221 | } 222 | 223 | public float getWidth() { 224 | return width; 225 | } 226 | 227 | public void setWidth(float width) { 228 | if (!hasAbsoluteCoordinates) { 229 | checkCoordinate(width); 230 | } 231 | this.width = width; 232 | } 233 | 234 | public float getHeight() { 235 | return height; 236 | } 237 | 238 | public void setHeight(float height) { 239 | if (!hasAbsoluteCoordinates) { 240 | checkCoordinate(height); 241 | } 242 | this.height = height; 243 | } 244 | 245 | public int getPageIndex() { 246 | return pageIndex; 247 | } 248 | 249 | public void setPageIndex(int pageIndex) { 250 | this.pageIndex = pageIndex; 251 | } 252 | 253 | public String getTerm() { 254 | return term; 255 | } 256 | 257 | public void setTerm(String term) { 258 | this.term = term; 259 | } 260 | 261 | public int getLineIndex() { 262 | return lineIndex; 263 | } 264 | 265 | public void setLineIndex(int lineIndex) { 266 | this.lineIndex = lineIndex; 267 | } 268 | 269 | public int getWordIndex() { 270 | return wordIndex; 271 | } 272 | 273 | public void setWordIndex(int wordIndex) { 274 | this.wordIndex = wordIndex; 275 | } 276 | 277 | @Override 278 | public String toString() { 
279 | return "OcrInfo{" 280 | + "horizontalOffset=" + horizontalOffset 281 | + ", verticalOffset=" + verticalOffset 282 | + ", width=" + width 283 | + ", height=" + height 284 | + ", pageIndex=" + pageIndex 285 | + ", lineIndex=" + lineIndex 286 | + ", wordIndex=" + wordIndex 287 | + ", term='" + term + '\'' 288 | + '}'; 289 | } 290 | 291 | @Override 292 | public int compareTo(OcrInfo other) { 293 | return Comparator 294 | .comparing(OcrInfo::getPageIndex) 295 | .thenComparing(OcrInfo::getLineIndex) 296 | .thenComparing(OcrInfo::getWordIndex) 297 | .thenComparing(OcrInfo::getHorizontalOffset) 298 | .thenComparing(OcrInfo::getVerticalOffset) 299 | .compare(this, other); 300 | } 301 | 302 | public boolean getHasAbsoluteCoordinates() { 303 | return hasAbsoluteCoordinates; 304 | } 305 | 306 | public void setHasAbsoluteCoordinates(boolean hasAbsoluteCoordinates) { 307 | this.hasAbsoluteCoordinates = hasAbsoluteCoordinates; 308 | } 309 | } 310 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # :construction: Deprecated in favor of [solr-ocrhighlighting](https://github.com/dbmdz/solr-ocrhighlighting) 2 | 3 | # Solr OCR Coordinate Payload Plugin 4 | 5 | [![Javadocs](https://javadoc.io/badge/de.digitalcollections.search/solr-ocrpayload-plugin.svg)](https://javadoc.io/doc/de.digitalcollections.search/solr-ocrpayload-plugin) 6 | [![Build Status](https://img.shields.io/travis/dbmdz/solr-ocrpayload-plugin/master.svg)](https://travis-ci.org/dbmdz/solr-ocrpayload-plugin) 7 | [![Codecov](https://img.shields.io/codecov/c/github/dbmdz/solr-ocrpayload-plugin/master.svg)](https://codecov.io/gh/dbmdz/solr-ocrpayload-plugin) 8 | [![MIT License](https://img.shields.io/github/license/dbmdz/solr-ocrpayload-plugin.svg)](LICENSE) 9 | [![GitHub 
release](https://img.shields.io/github/release/dbmdz/solr-ocrpayload-plugin.svg)](https://github.com/dbmdz/solr-ocrpayload-plugin/releases) 10 | [![Maven Central](https://img.shields.io/maven-central/v/de.digitalcollections.search/solr-ocrpayload-plugin.svg)](https://search.maven.org/search?q=a:solr-ocrpayload-plugin) 11 | 12 | *Efficient indexing and bounding-box "highlighting" for OCR text* 13 | 14 | ## tl;dr 15 | 16 | - Store OCR bounding box information and token position directly in the Solr index in a space-efficient manner 17 | - Retrieve bounding box and token position directly in your Solr query results, no additional parsing necessary 18 | 19 | **Indexing**: 20 | 21 | The OCR information is appended after each token as a concatenated list of `<key>:<value>` pairs, see further down 22 | for a detailed description of available keys. 23 | 24 | `POST /solr/mycore/update` 25 | 26 | ```json 27 | [{ "id": "test_document", 28 | "ocr_text": "this|p:13,l:5,n:6,x:11.1,y:22.2,w:33.3,h:44.4 is|p:13,l:5,n:7,x:22.2,y:33.3,w:44.4,h:55.5 a|p:13,l:5,n:8,x:33.3,y:33.3,w:44.4,h:55.5 test|p:13,l:5,n:9,x:44.4,y:33.3,w:44.4,h:55.5" }] 29 | ``` 30 | 31 | **Querying**: 32 | 33 | The plugin adds a new top-level key (`ocr_highlight` in this case) that contains the OCR information for 34 | each matching token as a structured object. 35 | 36 | `GET /solr/mycore/select?ocr_hl=true&ocr_hl.fields=ocr_text&indent=true&wt=json&q=test` 37 | 38 | ```json 39 | { 40 | "responseHeader": "...", 41 | "response": { 42 | "numFound": 1, 43 | "docs": [{"id": "test_document"}] 44 | }, 45 | "ocr_highlight":{ 46 | "test_document":{ 47 | "ocr_text":[{ 48 | "term":"test", 49 | "page":13, 50 | "line": 5, 51 | "word": 9, 52 | "x":0.444, 53 | "y":0.333, 54 | "width":0.444, 55 | "height":0.555}] 56 | } 57 | } 58 | } 59 | ``` 60 | 61 | ## Use Case 62 | At the Bavarian State Library, we try to provide full-text search over all of our OCRed content. 
In addition 63 | to obtaining matching documents, the user should also get a small snippet of the corresponding part of the 64 | page image, with the matching words highlighted, similar to what e.g. Google Books provides. 65 | 66 | 67 | ## Approaches 68 | For this to work, we need some way of mapping matching tokens to their corresponding location in the underlying 69 | OCR text. A common approach used by a number of libraries is to **use a secondary microservice for this** that takes 70 | as input a document identifier and a text snippet and will return all coordinates of matching text snippets on 71 | the page. While this approach generally works okay, it has several drawbacks: 72 | 73 | - **Performance:** Every snippet requires a query to the OCR service, which itself has to do a linear scan 74 | through the OCR document. For e.g. a result set of 100 snippets, this will result in 101 queries (initial 75 | Solr query and 100 snippet queries). Of course this can be optimized by batching and having a good index 76 | structure for the coordinate lookup, but it's still less than ideal. 77 | - **Storage:** To reliably be able to map text matches to the base text, you have to store a copy of the 78 | full text in the index, alongside the regular index. This blows up the index size significantly. 79 | Foregoing storing the text and only using the normalized terms from the index for matching will 80 | break the mapping to OCR, since depending on the analyzer configuration, Lucene will perform stemming, etc. 81 | 82 | Alternatively, you could also **store the coordinates directly as strings in the index**. This works by e.g. 83 | indexing each token as `<token>|<coordinates>` and telling Lucene to ignore everything after the pipe during 84 | analysis. As the full text of the document is stored, you will get back a series of these annotated tokens 85 | as query results and can then parse the coordinates from your highlighting information. 
This solves the 86 | *Performance* part of the above approach, but worsens the *Storage* problem: For every token, we now not only 87 | have to store the token itself, but an expensive coordinate string as well. 88 | 89 | ## Our Approach 90 | 91 | This plugin uses a similar approach to the above, but solves the *Storage* problem by using an efficient binary 92 | format to store the OCR coordinate information in the index: We use bit-packing to combine a number of OCR 93 | coordinate parameters into a **byte payload**, which is not stored in the field itself, but as an associated 94 | [Lucene Payload](https://lucidworks.com/2017/09/14/solr-payloads/): 95 | 96 | - `x`, `y`, `w`, `h`: Coordinates of the bounding box on the page as either: 97 | - **absolute** unsigned integer offsets between 0 and `2^coordinateBits - 1` (see below) 98 | - **relative** floating point percentages between 0 and 100 (e.g. `x:42.3` for a horizontal offset of 42.3%) 99 | - `pageIndex`: Unsigned integer that stores the page index of a token (optional) 100 | - `lineIndex`: Unsigned integer that stores the line index of a token (optional) 101 | - `wordIndex`: Unsigned integer that stores the word index of a token (optional) 102 | 103 | For each of these values, you can configure the number of bits the plugin should use to store them, or disable 104 | certain parameters entirely. This allows you to fine-tune the settings to your needs. In our case, for example, we 105 | use these values: `4 * 12 bits (coordinates) + 9 bits (word index) + 11 bits (line index) + 12 bits (page index)`, 106 | resulting in an 80 bit or 10 byte payload per token. A comparable string representation `p0l0n0x000y000w000h000` 107 | would have at least 22 bytes, so we save >50% for every token. 108 | 109 | At query time, we then retrieve the payload for each matching token and put the decoded information into the 110 | `ocr_highlight` result key that can be directly used without having to do any additional parsing. 
111 | 112 | ## Usage 113 | ### Installation 114 | 115 | Download the [latest release from GitHub](https://github.com/dbmdz/solr-ocrpayload-plugin/releases) and put the JAR into your `$SOLR_HOME/$SOLR_CORE/lib/` directory. 116 | 117 | ### Indexing configuration 118 | 119 | To use it, first add the `DelimitedOcrInfoPayloadTokenFilterFactory`☕ filter to your analyzer chain (e.g. for a `ocr_text` field type): 120 | 121 | ```xml 122 | 123 | 124 | 125 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | ``` 134 | 135 | The filter takes the following parameters: 136 | 137 | - `delimiter`: Character used for delimiting the payload from the token in the input document (default: `|`) 138 | - `absoluteCoordinates`: `true` or `false` to configure whether the stored coordinates are absolute 139 | - `coordinateBits`: Number of bits to use for encoding OCR coordinates in the index. (mandatory)
140 | A value of `10` (default) is recommended, resulting in a precision of approximately two decimal places. 141 | - `wordBits`: Number of bits to use for encoding the word index.
142 | Set to 0 (default) to disable storage of the word index. 143 | - `lineBits`: Number of bits to use for encoding the line index.
144 | Set to 0 (default) to disable storage of the line index. 145 | - `pageBits`: Number of bits to use for encoding the page index.
146 | Set to 0 (default) to disable storage of the page index. 147 | 148 | The filter expects an input payload after the configured `delimiter` in the input stream, with the payload being a 149 | pseudo-JSON structure (e.g. `k1:1,k2:3`) with the following keys: 150 | 151 | - `p`: Page index (if `pageBits` > 0) 152 | - `l`: Line index (if `lineBits` > 0) 153 | - `n`: Word index (if `wordBits` > 0) 154 | - `x`, `y`, `w`, `h`: Coordinates of the OCR box as floating point percentages or integers (if `absoluteCoordinates`) 155 | 156 | As an example, consider the token `foobar` with an OCR box of `(0.50712, 0.31432, 0.87148, 0.05089)` 157 | (i.e. with `absoluteCoordinates="false"`), the configured delimiter `☞` and storage of indices for the word (`30`), 158 | line (`12`) and page (`13`): 159 | `foobar☞p:13,l:12,n:30,x:50.7,y:31.4,w:87.1,h:5.1`. 160 | 161 | Alternatively, with `absoluteCoordinates="true"`, an OCR box of `(512, 1024, 3192, 256)` and otherwise the same 162 | settings: 163 | `foobar☞p:13,l:12,n:30,x:512,y:1024,w:3192,h:256`. 164 | 165 | Finally, you just have to configure your schema to use the field type defined above. 
Storing the content is **not** 166 | recommended, since it significantly increases the index size and is not used at all for querying and highlighting: 167 | 168 | ```xml 169 | 170 | ``` 171 | 172 | ### Highlighting configuration 173 | 174 | To enable highlighting using the OCR payloads, add the `OcrHighlighting` component to your Solr 175 | configuration, configure it with the same `absoluteCoordinates`, `coordinateBits`, `wordBits`, `lineBits` and `pageBits` 176 | values that were used for the filter in the analyzer chain: 177 | 178 | ```xml 179 | 180 | 183 | 184 | 185 | 186 | ocr_highlight 187 | 188 | 189 | 190 | ``` 191 | 192 | Now at query time, you can just set the `ocr_hl=true` parameter, specify the fields you want highlighted via 193 | `ocr_hl.fields=myfield,myotherfield` and retrieve highlighted matches with their OCR coordinates: 194 | 195 | `GET /solr/mycore/select?ocr_hl=true&ocr_hl.fields=ocr_text&indent=true&q=augsburg&wt=json` 196 | 197 | ```json 198 | { 199 | "responseHeader":{ 200 | "status":0, 201 | "QTime":158}, 202 | "response":{"numFound":526,"start":0,"docs":[ 203 | { 204 | "id":"bsb10502835"}, 205 | { 206 | "id":"bsb11032147"}, 207 | { 208 | "id":"bsb10485243"}, 209 | ... 210 | }, 211 | "ocr_highlight":{ 212 | "bsb10502835":{ 213 | "ocr_text":[{ 214 | "page":7, 215 | "position":9, 216 | "term":"augsburg", 217 | "x":0.111, 218 | "y":0.062, 219 | "width":0.075, 220 | "height":0.013}, 221 | { 222 | "page":7, 223 | "position":264, 224 | "term":"augsburg", 225 | "x":0.320, 226 | "y":0.670, 227 | "width":0.099, 228 | "height":0.012}, 229 | ...]}}, 230 | ... 231 | } 232 | } 233 | } 234 | ``` 235 | 236 | 237 | ## FAQ 238 | 239 | - **How does highlighting work with phrase queries?** 240 | 241 | You will receive a bounding box object for every individual matching term in the phrase. 
242 | 243 | - **What are the performance and storage implications of using this plugin?** 244 | 245 | *Performance*: With an Intel Xeon E5-1620@3.5GHz on a single core, we measured (with JMH): 246 | 247 | - Encoding the Payload: 1,484,443.200 Payloads/Second or ~14.2MiB/s with an 80bit payload 248 | - Decoding the Payload: 1,593,036.372 Payloads/Second or ~15.2MiB/s with an 80bit payload 249 | 250 | *Storage*: This depends on your configuration. With our sample configuration of an 80 bit payload 251 | (see above), the payload overhead is 10 bytes per token. That is, for a corpus size of 10 Million Tokens, 252 | you will need approximately 95MiB to store the payloads. 253 | The actual storage required might be lower, since Lucene compresses the payloads with LZ4. 254 | 255 | - **Does this work with SolrCloud?** 256 | 257 | It does! We're running it with SolrCloud ourselves. 258 | -------------------------------------------------------------------------------- /src/main/java/de/digitalcollections/solr/plugin/components/ocrhighlighting/OcrHighlighting.java: -------------------------------------------------------------------------------- 1 | package de.digitalcollections.solr.plugin.components.ocrhighlighting; 2 | 3 | import de.digitalcollections.lucene.analysis.payloads.OcrInfo; 4 | import de.digitalcollections.lucene.analysis.payloads.OcrPayloadHelper; 5 | import java.io.IOException; 6 | import java.util.ArrayList; 7 | import java.util.Arrays; 8 | import java.util.Collections; 9 | import java.util.HashMap; 10 | import java.util.List; 11 | import java.util.Map; 12 | import java.util.Set; 13 | import java.util.TreeSet; 14 | import org.apache.lucene.document.Document; 15 | import org.apache.lucene.index.IndexReader; 16 | import org.apache.lucene.index.LeafReader; 17 | import org.apache.lucene.index.LeafReaderContext; 18 | import org.apache.lucene.index.MultiReader; 19 | import org.apache.lucene.index.PostingsEnum; 20 | import org.apache.lucene.index.ReaderUtil; 21 | 
import org.apache.lucene.index.Term; 22 | import org.apache.lucene.index.Terms; 23 | import org.apache.lucene.index.TermsEnum; 24 | import org.apache.lucene.search.IndexSearcher; 25 | import org.apache.lucene.search.Query; 26 | import org.apache.lucene.util.BytesRef; 27 | import org.apache.solr.common.params.SolrParams; 28 | import org.apache.solr.common.util.NamedList; 29 | import org.apache.solr.common.util.SimpleOrderedMap; 30 | import org.apache.solr.core.PluginInfo; 31 | import org.apache.solr.handler.component.ResponseBuilder; 32 | import org.apache.solr.handler.component.SearchComponent; 33 | import org.apache.solr.handler.component.ShardRequest; 34 | import org.apache.solr.request.SolrQueryRequest; 35 | import org.apache.solr.schema.IndexSchema; 36 | import org.apache.solr.schema.SchemaField; 37 | import org.apache.solr.search.DocIterator; 38 | import org.apache.solr.search.DocList; 39 | import org.apache.solr.search.SolrIndexSearcher; 40 | import org.apache.solr.util.SolrPluginUtils; 41 | import org.apache.solr.util.plugin.PluginInfoInitialized; 42 | 43 | public class OcrHighlighting extends SearchComponent implements PluginInfoInitialized { 44 | 45 | private static final IndexSearcher EMPTY_INDEXSEARCHER; 46 | 47 | static { 48 | try { 49 | IndexReader emptyReader = new MultiReader(); 50 | EMPTY_INDEXSEARCHER = new IndexSearcher(emptyReader); 51 | EMPTY_INDEXSEARCHER.setQueryCache(null); 52 | } catch (IOException bogus) { 53 | throw new RuntimeException(bogus); 54 | } 55 | } 56 | 57 | private int coordBits; 58 | private int wordBits; 59 | private int lineBits; 60 | private int pageBits; 61 | private boolean absoluteCoordinates; 62 | 63 | @Override 64 | public void prepare(ResponseBuilder rb) { 65 | // NOP 66 | } 67 | 68 | @Override 69 | public void process(ResponseBuilder rb) throws IOException { 70 | if (rb.req.getParams().getBool("ocr_hl", false)) { 71 | NamedList highlighting = doHighlighting(rb.getResults().docList, rb.getQuery(), rb.req); 72 | 
rb.rsp.add("ocr_highlighting", highlighting); 73 | } 74 | } 75 | 76 | // Adapted from solr's own HighlightComponent 77 | @Override 78 | public void modifyRequest(ResponseBuilder rb, SearchComponent who, ShardRequest sreq) { 79 | if (!(rb.req.getParams().getBool("ocr_hl", false))) { 80 | return; 81 | } 82 | 83 | // Turn on highlighting only only when retrieving fields 84 | if ((sreq.purpose & ShardRequest.PURPOSE_GET_FIELDS) != 0) { 85 | sreq.purpose |= ShardRequest.PURPOSE_GET_HIGHLIGHTS; 86 | // should already be true... 87 | sreq.params.set("ocr_hl", "true"); // TODO: Maybe set hl_params? 88 | } else { 89 | sreq.params.set("ocr_hl", "false"); 90 | } 91 | } 92 | 93 | // Adapted from solr's own HighlightComponent 94 | @SuppressWarnings("unchecked") 95 | @Override 96 | public void finishStage(ResponseBuilder rb) { 97 | if (!rb.req.getParams().getBool("ocr_hl", false) || rb.stage != ResponseBuilder.STAGE_GET_FIELDS) { 98 | return; 99 | } 100 | 101 | NamedList.NamedListEntry[] arr = new NamedList.NamedListEntry[rb.resultIds.size()]; 102 | rb.finished.stream() 103 | .filter(sreq -> (sreq.purpose & ShardRequest.PURPOSE_GET_HIGHLIGHTS) != 0) 104 | .flatMap(sreq -> sreq.responses.stream()) 105 | // can't expect the highlight content if there was an exception for this request 106 | // this should only happen when using shards.tolerant=true 107 | .filter(resp -> resp.getException() == null) 108 | .map(resp -> (NamedList) resp.getSolrResponse().getResponse().get("ocr_highlighting")) 109 | .forEach(hl -> SolrPluginUtils.copyNamedListIntoArrayByDocPosInResponse(hl, rb.resultIds, arr)); 110 | 111 | // remove nulls in case not all docs were able to be retrieved 112 | rb.rsp.add("ocr_highlighting", SolrPluginUtils.removeNulls(arr, new SimpleOrderedMap<>())); 113 | } 114 | 115 | @Override 116 | public String getDescription() { 117 | return null; 118 | } 119 | 120 | @Override 121 | public void init(PluginInfo info) { 122 | this.coordBits = 
Integer.parseInt(info.attributes.getOrDefault("coordinateBits", "12")); 123 | this.pageBits = Integer.parseInt(info.attributes.getOrDefault("pageBits", "0")); 124 | this.lineBits = Integer.parseInt(info.attributes.getOrDefault("lineBits", "0")); 125 | this.wordBits = Integer.parseInt(info.attributes.getOrDefault("wordBits", "0")); 126 | this.absoluteCoordinates = Boolean.parseBoolean(info.attributes.getOrDefault("absoluteCoordinates", "false")); 127 | } 128 | 129 | private Set getTerms(Query query, String fieldName) throws IOException { 130 | Set terms = new TreeSet<>(); 131 | Set extractPosInsensitiveTermsTarget = new TreeSet() { 132 | @Override 133 | public boolean add(Term term) { 134 | if (term.field().equals(fieldName)) { 135 | return terms.add(term.bytes()); 136 | } 137 | return false; 138 | } 139 | }; 140 | query.createWeight(EMPTY_INDEXSEARCHER, false, 1.0f) 141 | .extractTerms(extractPosInsensitiveTermsTarget); 142 | return terms; 143 | } 144 | 145 | /** 146 | * Generates a list of highlighted query term coordinates for each item in a list of documents, or returns null if highlighting is disabled. 147 | * 148 | * @param docs query results 149 | * @param query the query 150 | * @param req the current request 151 | * @return NamedList containing a {@link NamedList} for each document, 152 | * which in turns contains `({@link String} field, {@link OcrInfo} coordinates)` pairs. 
153 | */ 154 | private NamedList doHighlighting(DocList docs, Query query, SolrQueryRequest req) throws IOException { 155 | SolrParams params = req.getParams(); 156 | int maxHighlightsPerDoc = params.getInt("ocr_hl.maxPerDoc", -1); 157 | int maxHighlightsPerPage = params.getInt("ocr_hl.maxPerPage", -1); 158 | IndexReader reader = req.getSearcher().getIndexReader(); 159 | 160 | int[] docIds = toDocIDs(docs); 161 | String[] keys = getUniqueKeys(req.getSearcher(), docIds); 162 | String[] fieldNames = params.getParams("ocr_hl.fields"); 163 | 164 | // For each document, obtain a mapping from field names to their matching OCR boxes 165 | List> boxes = new ArrayList<>(); 166 | for (int docId : docIds) { 167 | Map docBoxes = new HashMap<>(); 168 | for (String fieldName : fieldNames) { 169 | // We grab the terms in their UTF-8 encoded form to avoid costly decoding operations 170 | // when checking for term equality down the line 171 | Set termSet = getTerms(query, fieldName); 172 | OcrInfo[] ocrInfos = getOcrInfos(reader, docId, fieldName, termSet, maxHighlightsPerDoc, maxHighlightsPerPage); 173 | docBoxes.put(fieldName, ocrInfos); 174 | } 175 | boxes.add(docBoxes); 176 | } 177 | return encodeSnippets(keys, fieldNames, boxes); 178 | } 179 | 180 | /** 181 | * Retrieve unique keys for matching documents. 
182 | */ 183 | private String[] getUniqueKeys(SolrIndexSearcher searcher, int[] docIds) throws IOException { 184 | IndexSchema schema = searcher.getSchema(); 185 | SchemaField keyField = schema.getUniqueKeyField(); 186 | if (keyField != null) { 187 | Set selector = Collections.singleton(keyField.getName()); 188 | String[] uniqueKeys = new String[docIds.length]; 189 | for (int i = 0; i < docIds.length; i++) { 190 | int docId = docIds[i]; 191 | Document doc = searcher.doc(docId, selector); 192 | String id = schema.printableUniqueKey(doc); 193 | uniqueKeys[i] = id; 194 | } 195 | return uniqueKeys; 196 | } else { 197 | return new String[docIds.length]; 198 | } 199 | } 200 | 201 | /** 202 | * Retrieve Document IDs from the list of matching documents. 203 | */ 204 | private int[] toDocIDs(DocList docs) { 205 | int[] ids = new int[docs.size()]; 206 | DocIterator iterator = docs.iterator(); 207 | for (int i = 0; i < ids.length; i++) { 208 | if (!iterator.hasNext()) { 209 | throw new AssertionError(); 210 | } 211 | ids[i] = iterator.nextDoc(); 212 | } 213 | if (iterator.hasNext()) { 214 | throw new AssertionError(); 215 | } 216 | return ids; 217 | } 218 | 219 | /** 220 | * Retrieve all {@link OcrInfo}s for matching terms from a given field in a document. 221 | * 222 | * This takes a lot of inspiration from the {@link org.apache.lucene.search.uhighlight.UnifiedHighlighter}, thanks 223 | * to David Smiley (@dsmiley) for pointing out that term vectors are not necessary for this highlighter. 
224 | * 225 | * @param reader A reader into the search index 226 | * @param docId Identifier of the matching document 227 | * @param fieldName Field to obtain OCR information from 228 | * @param termSet Set of matching terms 229 | * @param maxHighlightsPerDoc Maximum number of OCR terms per document 230 | * @param maxHighlightsPerPage Maximum number of OCR terms per page 231 | * @return All OCR information for matching terms on all positions in the field 232 | * @throws IOException Error during retrieval from index 233 | */ 234 | private OcrInfo[] getOcrInfos(IndexReader reader, int docId, String fieldName, Set termSet, 235 | int maxHighlightsPerDoc, int maxHighlightsPerPage) throws IOException { 236 | List ocrList = new ArrayList<>(); 237 | 238 | final LeafReader leafReader; 239 | if (reader instanceof LeafReader) { 240 | leafReader = (LeafReader) reader; 241 | } else { 242 | List leaves = reader.leaves(); 243 | LeafReaderContext leafReaderContext = leaves.get(ReaderUtil.subIndex(docId, leaves)); 244 | leafReader = leafReaderContext.reader(); 245 | docId -= leafReaderContext.docBase; // adjust 'doc' to be within this leaf reader 246 | } 247 | 248 | final Terms terms = leafReader.terms(fieldName); 249 | if (terms == null || !terms.hasPositions() || !terms.hasPayloads()) { 250 | return new OcrInfo[]{}; 251 | } 252 | 253 | final TermsEnum termsEnum = terms.iterator(); 254 | int currentPage = -1; 255 | int matchesOnCurrentPage = 0; 256 | 257 | for (BytesRef term : termSet) { 258 | if (!termsEnum.seekExact(term)) { 259 | continue; 260 | } 261 | PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.POSITIONS | PostingsEnum.PAYLOADS); 262 | if (postingsEnum == null) { 263 | // no offsets or positions available 264 | throw new IllegalArgumentException("field '" + fieldName + "' was indexed without offsets, cannot highlight"); 265 | } 266 | if (docId != postingsEnum.advance(docId)) { 267 | continue; 268 | } 269 | 270 | final int freq = postingsEnum.freq(); 271 
| for (int i = 0; i < freq && (maxHighlightsPerDoc < 0 || ocrList.size() < maxHighlightsPerDoc); i++) { 272 | postingsEnum.nextPosition(); 273 | BytesRef payload = postingsEnum.getPayload(); 274 | OcrInfo info = OcrPayloadHelper.decodeOcrInfo(payload, coordBits, wordBits, lineBits, pageBits, absoluteCoordinates); 275 | if (info.getPageIndex() != currentPage) { // Are we on a new page? 276 | matchesOnCurrentPage = 0; 277 | currentPage = info.getPageIndex(); 278 | } 279 | if (maxHighlightsPerPage < 0 || matchesOnCurrentPage < maxHighlightsPerPage) { // Limit matches per page? 280 | info.setTerm(term.utf8ToString()); 281 | ocrList.add(info); 282 | matchesOnCurrentPage++; 283 | } 284 | } 285 | } 286 | return ocrList.stream().sorted().toArray(OcrInfo[]::new); 287 | } 288 | 289 | private NamedList encodeOcrInfo(OcrInfo info) { 290 | NamedList encoded = new SimpleOrderedMap<>(); 291 | if (info.getPageIndex() >= 0) { 292 | encoded.add("page", info.getPageIndex()); 293 | } 294 | if (info.getLineIndex() >= 0) { 295 | encoded.add("line", info.getLineIndex()); 296 | } 297 | if (info.getWordIndex() >= 0) { 298 | encoded.add("word", info.getWordIndex()); 299 | } 300 | encoded.add("term", info.getTerm()); 301 | 302 | if (absoluteCoordinates) { 303 | encoded.add("x", (int) info.getHorizontalOffset()); 304 | encoded.add("y", (int) info.getVerticalOffset()); 305 | encoded.add("width", (int) info.getWidth()); 306 | encoded.add("height", (int) info.getHeight()); 307 | } else { 308 | encoded.add("x", info.getHorizontalOffset()); 309 | encoded.add("y", info.getVerticalOffset()); 310 | encoded.add("width", info.getWidth()); 311 | encoded.add("height", info.getHeight()); 312 | } 313 | return encoded; 314 | } 315 | 316 | /** 317 | * Encode the highlighting result into a format that can be used by upstream users. 
318 | */ 319 | private NamedList encodeSnippets(String[] keys, String[] fieldNames, List> ocrInfos) { 320 | NamedList list = new SimpleOrderedMap<>(); 321 | for (int i = 0; i < keys.length; i++) { 322 | NamedList summary = new SimpleOrderedMap<>(); 323 | Map docBoxes = ocrInfos.get(i); 324 | for (String field : fieldNames) { 325 | summary.add(field, 326 | Arrays.stream(docBoxes.get(field)).sorted().map(this::encodeOcrInfo).toArray()); 327 | } 328 | list.add(keys[i], summary); 329 | } 330 | return list; 331 | } 332 | } 333 | -------------------------------------------------------------------------------- /src/test/resources/data/ocrtext_full.txt: -------------------------------------------------------------------------------- 1 | Ein|p:183,l:0,n:0,x:22.31,y:08.77,w:28.24,h:11.5 4 und|p:183,l:0,n:1,x:29.52,y:08.77,w:35.23,h:11.54 zwanzigfies|p:183,l:0,n:2,x:36.36,y:08.77,w:53.94,h:11.54 Gutachten, 2 | |p:183,l:0,n:3,x:55.14,y:08.77,w:72.42,h:11.54 179|p:183,l:0,n:4,x:77.00,y:08.77,w:83.24,h:11.54 geworfen|p:183,l:1,n:0,x:12.39,y:12.91,w:23.81,h:15.42 haben 3 | .|p:183,l:1,n:1,x:26.29,y:12.91,w:34.86,h:15.42 Sein|p:183,l:1,n:2,x:38.16,y:12.91,w:44.62,h:15.42 Medicus|p:183,l:1,n:3,x:46.80,y:12.91,w:58.45,h:15.42 ordinarius,|p:183,l:1,n:4,x:61.23,y:12.91,w:76.93,h:15.42 der|p:183,l:1,n:5,x:79.26,y:12.91,w:83.24,h:15.42 hiefige|p:183,l:2,n:0,x:12.39,y:15.91,w:20.28,h:18.20 Doftor|p:183,l:2,n:1,x:22.91,y:15.91,w:31.70,h:18.20 Dolf,|p:183,l:2,n:2,x:34.25,y:15.91,w:42.82,h:18.20 bezeuget|p:183,l:2,n:3,x:45.67,y:15.91,w:55.89,h:18.20 in|p:183,l:2,n:4,x:58.30,y:15.91,w:60.55,h:18.20 einem|p:183,l:2,n:5,x:62.96,y:15.91,w:70.02,h:18.20 mir|p:183,l:2,n:6,x:72.42,y:15.91,w:76.85,h:18.20 auf|p:183,l:2,n:7,x:78.96,y:15.91,w:83.39,h:18.20 mein|p:183,l:3,n:0,x:12.32,y:18.59,w:18.48,h:20.80 Berlangen|p:183,l:3,n:1,x:20.21,y:18.59,w:33.28,h:20.80 überſchicften|p:183,l:3,n:2,x:34.86,y:18.59,w:50.18,h:20.80 Privatattefi,|p:183,l:3,n:3,x:51.99,y:18.59,w:67.84,h:20.80 
„daß|p:183,l:3,n:4,x:70.24,y:18.59,w:77.23,h:20.80 der|p:183,l:3,n:5,x:78.96,y:18.59,w:83.02,h:20.80 „Tobias|p:183,l:4,n:0,x:12.32,y:21.24,w:24.11,h:23.57 D.|p:183,l:4,n:1,x:25.92,y:21.24,w:29.07,h:23.57 mit|p:183,l:4,n:2,x:30.42,y:21.24,w:34.56,h:23.57 hämorrhoidaliſchen|p:183,l:4,n:3,x:36.13,y:21.24,w:59.72,h:23.57 Sufällen,|p:183,l:4,n:4,x:61.45,y:21.24,w:72.65,h:23.57 welche|p:183,l:4,n:5,x:75.05,y:21.24,w:83.02,h:23.57 „ihm|p:183,l:5,n:0,x:12.24,y:24.01,w:19.30,h:26.35 starte|p:183,l:5,n:1,x:20.43,y:24.01,w:26.97,h:26.35 Eongestiones|p:183,l:5,n:2,x:27.72,y:24.01,w:43.57,h:26.35 nach|p:183,l:5,n:3,x:44.47,y:24.01,w:49.88,h:26.35 der|p:183,l:5,n:4,x:51.01,y:24.01,w:54.84,h:26.35 Bruf|p:183,l:5,n:5,x:55.59,y:24.01,w:62.58,h:26.35 c.|p:183,l:5,n:6,x:64.23,y:24.01,w:66.49,h:26.35 verurfachen,|p:183,l:5,n:7,x:67.61,y:24.01,w:82.87,h:26.35 „behaftet|p:183,l:6,n:0,x:12.39,y:26.70,w:24.56,h:28.82 fey.|p:183,l:6,n:1,x:26.07,y:26.70,w:30.57,h:28.82 “|p:183,l:6,n:2,x:31.55,y:26.70,w:33.20,h:28.82 –|p:183,l:6,n:3,x:34.63,y:26.70,w:38.39,h:28.82 Seinen|p:183,l:6,n:4,x:40.72,y:26.70,w:49.66,h:28.82 Puls|p:183,l:6,n:5,x:51.38,y:26.70,w:57.25,h:28.82 fand|p:183,l:6,n:6,x:59.65,y:26.70,w:65.13,h:28.82 ich|p:183,l:6,n:7,x:67.54,y:26.70,w:70.77,h:28.82 fchwach,|p:183,l:6,n:8,x:72.42,y:27.01,w:82.79,h:29.13 flein|p:183,l:7,n:0,x:12.32,y:29.44,w:17.88,h:31.46 und|p:183,l:7,n:1,x:19.23,y:29.44,w:23.74,h:31.46 intermittirend,|p:183,l:7,n:2,x:25.61,y:29.44,w:43.50,h:31.46 ihn|p:183,l:7,n:3,x:45.45,y:29.44,w:49.13,h:31.46 felbſt|p:183,l:7,n:4,x:50.93,y:29.44,w:57.17,h:31.46 ſchwach|p:183,l:7,n:5,x:58.90,y:29.44,w:67.99,h:31.46 und|p:183,l:7,n:6,x:69.79,y:29.44,w:74.22,h:31.46 matt.|p:183,l:7,n:7,x:76.03,y:29.44,w:82.64,h:31.46 Angenemmen|p:183,l:8,n:0,x:18.18,y:34.90,w:34.93,h:37.02 nun|p:183,l:8,n:1,x:36.81,y:34.90,w:41.32,h:37.02 auch,|p:183,l:8,n:2,x:43.12,y:34.90,w:49.88,h:37.02 daß|p:183,l:8,n:3,x:52.06,y:34.90,w:56.64,h:37.02 dieſer|p:183,l:8,n:4,x:58.82,y:34.90,w:65.58,h:37.02 
Menſch|p:183,l:8,n:5,x:67.39,y:34.90,w:76.48,h:37.02 aus|p:183,l:8,n:6,x:78.06,y:34.90,w:82.64,h:37.02 Furcht|p:183,l:9,n:0,x:12.02,y:37.54,w:20.66,h:39.79 vor|p:183,l:9,n:1,x:22.31,y:37.54,w:26.59,h:39.79 der|p:183,l:9,n:2,x:28.17,y:37.54,w:32.08,h:39.79 Strafe|p:183,l:9,n:3,x:33.88,y:37.54,w:42.52,h:39.79 te,|p:183,l:9,n:4,x:44.32,y:37.54,w:47.03,h:39.79 feine|p:183,l:9,n:5,x:49.58,y:37.54,w:55.22,h:39.79 Beſchwerden|p:183,l:9,n:6,x:57.02,y:37.54,w:72.50,h:39.79 größer|p:183,l:9,n:7,x:74.53,y:37.54,w:82.87,h:39.79 angegeben|p:183,l:10,n:0,x:12.09,y:40.14,w:24.86,h:42.52 hätte,|p:183,l:10,n:1,x:26.59,y:40.14,w:33.80,h:42.52 als|p:183,l:10,n:2,x:35.83,y:40.14,w:39.59,h:42.52 fie|p:183,l:10,n:3,x:41.62,y:40.14,w:44.47,h:42.52 in|p:183,l:10,n:4,x:46.65,y:40.14,w:49.13,h:42.52 der|p:183,l:10,n:5,x:50.93,y:40.14,w:54.84,h:42.52 That|p:183,l:10,n:6,x:57.09,y:40.14,w:62.65,h:42.52 find,|p:183,l:10,n:7,x:64.08,y:40.14,w:69.94,h:42.52 fo|p:183,l:10,n:8,x:72.20,y:40.14,w:74.53,h:42.52 gehet|p:183,l:10,n:9,x:76.10,y:40.14,w:82.71,h:42.52 doch|p:183,l:11,n:0,x:11.87,y:42.79,w:17.65,h:45.26 ſo|p:183,l:11,n:1,x:19.53,y:42.79,w:21.71,h:45.26 viel|p:183,l:11,n:2,x:23.51,y:42.79,w:28.02,h:45.26 aus|p:183,l:11,n:3,x:29.75,y:42.79,w:34.18,h:45.26 dieſer|p:183,l:11,n:4,x:36.06,y:42.79,w:42.90,h:45.26 unterſuchung|p:183,l:11,n:5,x:44.62,y:42.79,w:60.70,h:45.26 und|p:183,l:11,n:6,x:62.65,y:42.79,w:67.09,h:45.26 dem|p:183,l:11,n:7,x:68.97,y:42.79,w:73.85,h:45.26 Beuge|p:183,l:11,n:8,x:75.65,y:42.79,w:82.71,h:45.26 niß|p:183,l:12,n:0,x:11.94,y:45.52,w:16.15,h:47.73 des|p:183,l:12,n:1,x:18.33,y:45.52,w:22.46,h:47.73 Deftor|p:183,l:12,n:2,x:24.79,y:45.52,w:33.50,h:47.73 Dolf|p:183,l:12,n:3,x:36.13,y:45.52,w:43.65,h:47.73 herfür,|p:183,l:12,n:4,x:45.37,y:45.52,w:54.24,h:47.73 daß|p:183,l:12,n:5,x:57.25,y:45.52,w:61.75,h:47.73 derfelbe|p:183,l:12,n:6,x:64.16,y:45.52,w:73.47,h:47.73 wårfe|p:183,l:12,n:7,x:75.73,y:45.52,w:82.79,h:47.73 lich|p:183,l:13,n:0,x:12.02,y:48.30,w:16.30,h:50.41 
nur|p:183,l:13,n:1,x:18.18,y:48.30,w:22.38,h:50.41 ſchwächlich,|p:183,l:13,n:2,x:24.26,y:48.30,w:38.84,h:50.41 hånorrhoidaliſchen|p:183,l:13,n:3,x:40.57,y:48.30,w:64.16,h:50.41 Sufällen|p:183,l:13,n:4,x:65.96,y:48.30,w:75.88,h:50.41 und|p:183,l:13,n:5,x:77.91,y:48.30,w:82.41,h:50.41 daher|p:183,l:14,n:0,x:12.02,y:51.12,w:19.00,h:53.10 entſpringenden|p:183,l:14,n:1,x:20.81,y:51.12,w:38.69,h:53.10 heftigen|p:183,l:14,n:2,x:40.64,y:51.12,w:50.18,h:53.10 Gongestionen|p:183,l:14,n:3,x:51.99,y:51.07,w:67.91,h:53.63 des|p:183,l:14,n:4,x:69.72,y:51.16,w:73.85,h:53.01 Bluts|p:183,l:14,n:5,x:75.05,y:51.16,w:82.64,h:53.01 nach|p:183,l:15,n:0,x:11.87,y:53.81,w:17.50,h:55.97 der|p:183,l:15,n:1,x:19.30,y:53.81,w:23.06,h:55.97 Bruft|p:183,l:15,n:2,x:24.94,y:53.81,w:32.00,h:55.97 unterworfen|p:183,l:15,n:3,x:33.95,y:53.81,w:49.13,h:55.97 fey|p:183,l:15,n:4,x:50.93,y:53.81,w:54.54,h:55.97 und|p:183,l:15,n:5,x:56.72,y:53.81,w:61.30,h:55.97 vorzüglich|p:183,l:15,n:6,x:63.48,y:53.81,w:75.80,h:55.97 eine|p:183,l:15,n:7,x:77.68,y:53.81,w:82.64,h:55.97 fchwache|p:183,l:16,n:0,x:11.94,y:56.45,w:22.68,h:58.66 Bruft|p:183,l:16,n:1,x:24.41,y:56.45,w:31.48,h:58.66 habe.|p:183,l:16,n:2,x:33.20,y:56.45,w:39.66,h:58.66 Da|p:183,l:17,n:0,x:17.58,y:61.83,w:21.78,h:63.94 nun|p:183,l:17,n:1,x:23.66,y:61.83,w:28.17,h:63.94 aber|p:183,l:17,n:2,x:29.75,y:61.83,w:35.08,h:63.94 das|p:183,l:17,n:3,x:37.26,y:61.83,w:41.62,h:63.94 Tragen|p:183,l:17,n:4,x:43.57,y:61.83,w:52.44,h:63.94 des|p:183,l:17,n:5,x:54.39,y:61.83,w:58.52,h:63.94 fogenannten|p:183,l:17,n:6,x:60.70,y:61.83,w:75.50,h:63.94 ſpå»|p:183,l:17,n:7,x:77.68,y:61.83,w:82.64,h:63.94 niſchen|p:183,l:18,n:0,x:11.94,y:64.69,w:20.43,h:66.90 Mantels,|p:183,l:18,n:1,x:22.23,y:64.69,w:34.33,h:66.90 welcher|p:183,l:18,n:2,x:36.13,y:64.69,w:45.00,h:66.90 nach|p:183,l:18,n:3,x:46.80,y:64.69,w:52.21,h:66.90 dem|p:183,l:18,n:4,x:53.86,y:64.69,w:58.90,h:66.90 zu|p:183,l:18,n:5,x:60.70,y:64.69,w:63.18,h:66.90 
urtheilen,|p:183,l:18,n:6,x:64.83,y:64.69,w:76.78,h:66.90 der|p:183,l:18,n:7,x:78.58,y:64.69,w:82.56,h:66.90 auf|p:183,l:19,n:0,x:12.02,y:67.38,w:16.45,h:69.36 der|p:183,l:19,n:1,x:17.95,y:67.38,w:21.86,h:69.36 Hausvogtey|p:183,l:19,n:2,x:24.19,y:67.38,w:38.84,h:69.36 zu|p:183,l:19,n:3,x:40.72,y:67.38,w:43.12,h:69.36 dergleichen|p:183,l:19,n:4,x:44.70,y:67.38,w:58.07,h:69.36 Behuf|p:183,l:19,n:5,x:59.87,y:67.38,w:67.91,h:69.36 aufbewahe|p:183,l:19,n:6,x:69.42,y:67.38,w:82.49,h:69.36 ret|p:183,l:20,n:0,x:11.94,y:69.94,w:15.47,h:72.01 wird,|p:183,l:20,n:1,x:17.20,y:69.94,w:24.34,h:72.01 an|p:183,l:20,n:2,x:26.82,y:69.94,w:29.75,h:72.01 die|p:183,l:20,n:3,x:31.93,y:69.94,w:35.38,h:72.01 fiebenzig|p:183,l:20,n:4,x:36.88,y:69.94,w:47.18,h:72.01 Pfund|p:183,l:20,n:5,x:49.73,y:69.94,w:56.94,h:72.01 ſchwer|p:183,l:20,n:6,x:59.50,y:69.94,w:67.24,h:72.01 iſt,|p:183,l:20,n:7,x:69.04,y:69.94,w:73.02,h:72.01 einen|p:183,l:20,n:8,x:76.18,y:69.94,w:82.49,h:72.01 farfen|p:183,l:21,n:0,x:11.87,y:72.58,w:20.06,h:74.74 Rörper|p:183,l:21,n:1,x:22.16,y:72.58,w:30.80,h:74.74 und|p:183,l:21,n:2,x:32.68,y:72.58,w:37.11,h:74.74 heftige,|p:183,l:21,n:3,x:38.99,y:72.58,w:47.93,h:74.74 Stunden|p:183,l:21,n:4,x:49.81,y:72.58,w:60.78,h:74.74 lang|p:183,l:21,n:5,x:62.28,y:72.58,w:67.61,h:74.74 anhaltende|p:183,l:21,n:6,x:69.42,y:72.58,w:82.56,h:74.74 Anfirengung|p:183,l:22,n:0,x:12.17,y:75.31,w:27.19,h:77.52 aller|p:183,l:22,n:1,x:28.92,y:75.31,w:34.25,h:77.52 sträfte,|p:183,l:22,n:2,x:36.06,y:75.31,w:45.22,h:77.52 vorzüglich|p:183,l:22,n:3,x:47.10,y:75.31,w:59.42,h:77.52 aber,|p:183,l:22,n:4,x:61.15,y:75.31,w:67.76,h:77.52 da|p:183,l:22,n:5,x:69.34,y:75.31,w:72.20,h:77.52 er|p:183,l:22,n:6,x:74.07,y:75.31,w:76.40,h:77.52 auf|p:183,l:22,n:7,x:78.13,y:75.31,w:82.64,h:77.52 den|p:183,l:23,n:0,x:11.87,y:77.91,w:16.15,h:79.99 Schultern|p:183,l:23,n:1,x:17.88,y:77.91,w:30.35,h:79.99 liegt,|p:183,l:23,n:2,x:32.30,y:77.91,w:38.91,h:79.99 der|p:183,l:23,n:3,x:40.79,y:77.91,w:44.62,h:79.99 
Bruft|p:183,l:23,n:4,x:46.50,y:77.91,w:53.56,h:79.99 e|p:183,l:23,n:5,x:54.62,y:77.91,w:55.67,h:79.99 Hals|p:183,l:23,n:6,x:57.55,y:77.91,w:63.26,h:79.99 -|p:183,l:23,n:7,x:64.31,y:77.91,w:65.13,h:79.99 und|p:183,l:23,n:8,x:67.01,y:77.91,w:71.45,h:79.99 Rücfen,|p:183,l:23,n:9,x:72.65,y:77.91,w:82.34,h:79.99 muffeln|p:183,l:24,n:0,x:11.94,y:80.69,w:21.78,h:82.76 erfordert,|p:183,l:24,n:1,x:24.56,y:80.69,w:36.73,h:82.76 hierdurch|p:183,l:24,n:2,x:39.66,y:80.69,w:51.01,h:82.76 aber|p:183,l:24,n:3,x:53.86,y:80.69,w:59.12,h:82.76 nicht|p:183,l:24,n:4,x:61.75,y:80.69,w:67.54,h:82.76 nur|p:183,l:24,n:5,x:70.62,y:80.69,w:74.90,h:82.76 das|p:183,l:24,n:6,x:77.91,y:80.69,w:82.26,h:82.76 Blut|p:183,l:25,n:0,x:11.87,y:83.34,w:18.03,h:85.45 mit|p:183,l:25,n:1,x:20.58,y:83.34,w:24.79,h:85.45 fortdaurender|p:183,l:25,n:2,x:27.49,y:83.34,w:44.40,h:85.45 Ộeftigfeit|p:183,l:25,n:3,x:47.18,y:83.34,w:58.75,h:85.45 nach|p:183,l:25,n:4,x:61.38,y:83.34,w:66.71,h:85.45 Ropf|p:183,l:25,n:5,x:69.19,y:83.34,w:75.50,h:85.45 und|p:183,l:25,n:6,x:77.61,y:83.34,w:82.26,h:85.45 Bruft|p:183,l:26,n:0,x:11.87,y:85.98,w:19.08,h:88.23 zu|p:183,l:26,n:1,x:20.58,y:85.98,w:23.06,h:88.23 getrieben|p:183,l:26,n:2,x:24.79,y:85.98,w:35.76,h:88.23 wird,|p:183,l:26,n:3,x:37.34,y:85.98,w:43.87,h:88.23 und|p:183,l:26,n:4,x:45.83,y:85.98,w:50.26,h:88.23 leștere|p:183,l:26,n:5,x:52.14,y:85.98,w:60.25,h:88.23 vorzůglich|p:183,l:26,n:6,x:61.98,y:85.98,w:74.30,h:88.23 lange|p:183,l:26,n:7,x:75.65,y:85.98,w:82.41,h:88.23 und|p:183,l:27,n:0,x:11.79,y:88.67,w:16.45,h:90.92 anhaltend|p:183,l:27,n:1,x:17.73,y:88.67,w:29.45,h:90.92 viel|p:183,l:27,n:2,x:30.57,y:88.67,w:35.08,h:90.92 dabey|p:183,l:27,n:3,x:36.13,y:88.67,w:43.20,h:90.92 leidet:|p:183,l:27,n:4,x:44.77,y:88.67,w:52.66,h:90.92 fo|p:183,l:27,n:5,x:54.39,y:88.67,w:56.64,h:90.92 fann|p:183,l:27,n:6,x:57.70,y:88.67,w:63.33,h:90.92 ich|p:183,l:27,n:7,x:64.46,y:88.67,w:67.69,h:90.92 meiner|p:183,l:27,n:8,x:69.04,y:88.67,w:77.38,h:90.92 
ge|p:183,l:27,n:9,x:78.51,y:88.67,w:82.26,h:90.92 ಋRa|p:183,l:28,n:0,x:46.05,y:91.27,w:53.04,h:93.03 Jſº|p:183,l:28,n:1,x:77.00,y:91.49,w:82.26,h:93.74 18o|p:184,l:0,n:0,x:16.96,y:08.07,w:23.52,h:10.70 Ein|p:184,l:0,n:1,x:29.20,y:08.07,w:35.28,h:10.70 und|p:184,l:0,n:2,x:37.01,y:08.07,w:43.09,h:10.70 zwanzigſtes|p:184,l:0,n:3,x:44.19,y:08.07,w:62.58,h:10.70 Gutachten.|p:184,l:0,n:4,x:63.69,y:08.07,w:82.16,h:10.70 wifenhaften|p:184,l:1,n:0,x:16.73,y:12.63,w:32.59,h:14.73 lleberzeugung|p:184,l:1,n:1,x:34.64,y:12.63,w:52.24,h:14.73 nach|p:184,l:1,n:2,x:54.53,y:12.63,w:60.14,h:14.73 nicht|p:184,l:1,n:3,x:62.66,y:12.63,w:68.74,h:14.73 anderð|p:184,l:1,n:4,x:71.34,y:12.63,w:80.11,h:14.73 urtheis|p:184,l:1,n:5,x:82.79,y:12.63,w:91.79,h:14.73 len,|p:184,l:2,n:0,x:16.89,y:15.61,w:22.17,h:17.50 als|p:184,l:2,n:1,x:25.17,y:15.61,w:29.20,h:17.50 *|p:184,l:2,n:2,x:46.80,y:16.40,w:47.75,h:17.01 daß|p:184,l:3,n:0,x:24.62,y:19.25,w:29.59,h:21.40 dieſe|p:184,l:3,n:1,x:31.33,y:19.25,w:37.17,h:21.40 Strafe|p:184,l:3,n:2,x:38.91,y:19.25,w:48.06,h:21.40 an|p:184,l:3,n:3,x:50.03,y:19.25,w:53.03,h:21.40 dem|p:184,l:3,n:4,x:54.53,y:19.25,w:59.74,h:21.40 Suculpaten|p:184,l:3,n:5,x:61.01,y:19.25,w:75.84,h:21.40 nicht|p:184,l:3,n:6,x:77.97,y:19.25,w:84.13,h:21.40 ohne|p:184,l:3,n:7,x:85.63,y:19.25,w:91.39,h:21.40 zu|p:184,l:4,n:0,x:24.70,y:22.01,w:27.62,h:24.25 befürchtenden|p:184,l:4,n:1,x:30.22,y:22.01,w:47.90,h:24.25 großen|p:184,l:4,n:2,x:50.59,y:22.01,w:59.51,h:24.25 Machtheil|p:184,l:4,n:3,x:62.19,y:22.01,w:74.50,h:24.25 feiner|p:184,l:4,n:4,x:77.03,y:22.01,w:84.37,h:24.25 See|p:184,l:4,n:5,x:86.97,y:22.01,w:91.39,h:24.25 fundheit|p:184,l:5,n:0,x:24.94,y:24.73,w:35.59,h:26.92 vollzogen|p:184,l:5,n:1,x:37.33,y:24.73,w:49.32,h:26.92 werden|p:184,l:5,n:2,x:50.82,y:24.73,w:60.14,h:26.92 fönne;|p:184,l:5,n:3,x:62.03,y:24.73,w:71.03,h:26.92 da|p:184,l:6,n:0,x:17.12,y:28.72,w:20.52,h:30.74 von|p:184,l:6,n:1,x:22.33,y:28.72,w:26.99,h:30.74 dem|p:184,l:6,n:2,x:28.96,y:28.72,w:34.17,h:30.74 
anhaltenden|p:184,l:6,n:3,x:36.30,y:28.72,w:51.53,h:30.74 heftigen|p:184,l:6,n:4,x:53.59,y:28.72,w:63.69,h:30.74 Antrieb|p:184,l:6,n:5,x:66.29,y:28.72,w:75.84,h:30.74 des|p:184,l:6,n:6,x:77.90,y:28.72,w:82.24,h:30.74 Bluts|p:184,l:6,n:7,x:84.13,y:28.72,w:91.31,h:30.74 nach|p:184,l:7,n:0,x:17.20,y:31.49,w:23.12,h:33.68 der|p:184,l:7,n:1,x:25.49,y:31.49,w:29.67,h:33.68 Bruft|p:184,l:7,n:2,x:31.88,y:31.49,w:39.30,h:33.68 und|p:184,l:7,n:3,x:42.22,y:31.49,w:46.88,h:33.68 ſtarfen|p:184,l:7,n:4,x:49.88,y:31.49,w:58.56,h:33.68 Anfrengung|p:184,l:7,n:5,x:61.56,y:31.49,w:77.50,h:33.68 derfelben|p:184,l:7,n:6,x:80.42,y:31.49,w:91.39,h:33.68 fehr|p:184,l:8,n:0,x:17.36,y:34.21,w:22.41,h:36.44 leicht|p:184,l:8,n:1,x:24.07,y:34.21,w:30.86,h:36.44 eine|p:184,l:8,n:2,x:32.35,y:34.21,w:37.56,h:36.44 3erreißung|p:184,l:8,n:3,x:39.30,y:34.21,w:53.74,h:36.44 der|p:184,l:8,n:4,x:55.64,y:34.21,w:59.66,h:36.44 Gefäße|p:184,l:8,n:5,x:61.48,y:34.07,w:70.71,h:36.09 und|p:184,l:8,n:6,x:73.32,y:34.07,w:78.13,h:36.09 unheilbae|p:184,l:8,n:7,x:80.03,y:34.07,w:91.39,h:36.09 res|p:184,l:9,n:0,x:17.60,y:36.88,w:21.62,h:39.03 Blutſpeyen|p:184,l:9,n:1,x:23.59,y:36.88,w:37.96,h:39.03 entſtehen|p:184,l:9,n:2,x:39.93,y:36.88,w:51.22,h:39.03 fann.|p:184,l:9,n:3,x:53.03,y:36.88,w:59.98,h:39.03 Diefes|p:184,l:10,n:0,x:23.91,y:41.44,w:32.12,h:43.64 iſt|p:184,l:10,n:1,x:34.49,y:41.44,w:37.25,h:43.64 mein|p:184,l:10,n:2,x:39.70,y:41.44,w:45.85,h:43.64 pflichtmäßiges|p:184,l:10,n:3,x:48.93,y:41.44,w:67.64,h:43.64 Gutachten|p:184,l:10,n:4,x:70.08,y:41.44,w:83.89,h:43.64 hier:|p:184,l:10,n:5,x:85.95,y:41.44,w:91.23,h:43.64 åber,|p:184,l:11,n:0,x:17.67,y:44.21,w:24.94,h:46.49 welches|p:184,l:11,n:1,x:27.15,y:44.21,w:37.01,h:46.49 ich|p:184,l:11,n:2,x:39.38,y:44.21,w:42.85,h:46.49 mit|p:184,l:11,n:3,x:45.22,y:44.21,w:49.72,h:46.49 meines|p:184,l:11,n:4,x:51.93,y:44.21,w:60.85,h:46.49 Rahmens|p:184,l:11,n:5,x:62.66,y:44.21,w:75.45,h:46.49 unterſchrift|p:184,l:11,n:6,x:77.66,y:44.21,w:91.63,h:46.49 
und|p:184,l:12,n:0,x:17.83,y:47.01,w:22.80,h:49.12 vorgedrucften|p:184,l:12,n:1,x:24.70,y:47.01,w:43.25,h:49.12 Whyffatsfiegel|p:184,l:12,n:2,x:45.77,y:47.01,w:64.16,h:49.12 beurfunde,|p:184,l:12,n:3,x:66.29,y:47.01,w:80.34,h:49.12 Berlin,|p:184,l:13,n:0,x:23.67,y:51.66,w:33.70,h:53.85 den|p:184,l:13,n:1,x:35.67,y:51.66,w:40.09,h:53.85 Iften|p:184,l:13,n:2,x:42.54,y:51.66,w:48.77,h:53.85 März,|p:184,l:13,n:3,x:50.82,y:51.66,w:59.66,h:53.85 1789.|p:184,l:13,n:4,x:62.19,y:51.66,w:70.71,h:53.85 pyl.|p:184,l:14,n:0,x:77.66,y:57.36,w:85.00,h:59.73 »**|p:184,l:15,n:0,x:20.20,y:91.14,w:23.67,h:92.50 -|p:184,l:15,n:1,x:32.83,y:93.55,w:33.22,h:93.77 @rit;|p:184,l:15,n:2,x:82.95,y:90.57,w:91.23,h:92.93, --------------------------------------------------------------------------------