├── .gitignore
├── src
    ├── test
    │   ├── resources
    │   │   ├── solr
    │   │   │   ├── collection1
    │   │   │   ├── alldata
    │   │   │   │   └── conf
    │   │   │   │   │   ├── solrconfig.xml
    │   │   │   │   │   └── schema.xml
    │   │   │   ├── minimal
    │   │   │   │   └── conf
    │   │   │   │   │   ├── solrconfig.xml
    │   │   │   │   │   └── schema.xml
    │   │   │   └── min_absolute
    │   │   │   │   └── conf
    │   │   │   │       ├── solrconfig.xml
    │   │   │   │       └── schema.xml
    │   │   └── data
    │   │   │   └── ocrtext_full.txt
    │   └── java
    │   │   └── de
    │   │       └── digitalcollections
    │   │           ├── lucene
    │   │               └── analysis
    │   │               │   └── payloads
    │   │               │       ├── TestUtils.java
    │   │               │       ├── OcrInfoEncoderTest.java
    │   │               │       ├── PayloadHelperTest.java
    │   │               │       └── OcrInfoTest.java
    │   │           └── solr
    │   │               └── plugin
    │   │                   └── components
    │   │                       └── ocrhighlighting
    │   │                           ├── AbsoluteHighlightingTest.java
    │   │                           ├── MinimalHighlightingTest.java
    │   │                           ├── DistributedOcrHighlightingTest.java
    │   │                           └── OcrHighlightingTest.java
    └── main
    │   └── java
    │       └── de
    │           └── digitalcollections
    │               ├── lucene
    │                   └── analysis
    │                   │   ├── payloads
    │                   │       ├── OcrInfoEncoder.java
    │                   │       ├── OcrPayloadHelper.java
    │                   │       └── OcrInfo.java
    │                   │   └── util
    │                   │       └── DelimitedOcrInfoPayloadTokenFilterFactory.java
    │               └── solr
    │                   └── plugin
    │                       └── components
    │                           └── ocrhighlighting
    │                               └── OcrHighlighting.java
├── example
    ├── docker-compose.yml
    ├── solr
    │   ├── Dockerfile
    │   └── ocrtest
    │   │   └── conf
    │   │       ├── solrconfig.xml
    │   │       └── schema.xml
    ├── hocr2solr
    ├── index_google1000
    └── README.md
├── settings.xml
├── CHANGELOG.md
├── LICENSE
├── .travis.yml
├── pom.xml
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | /.idea/
2 | target
3 | 


--------------------------------------------------------------------------------
/src/test/resources/solr/collection1:
--------------------------------------------------------------------------------
1 | alldata


--------------------------------------------------------------------------------
/example/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: '2'
 2 | services:
 3 |   solr:
 4 |     build: solr
 5 |     ports:
 6 |      - "8983:8983"
 7 |      - "18983:18983"
 8 |      - "8849:8849"
 9 |     volumes:
10 |       - data-solr:/opt/solr/server/solr/ocrtest
11 | volumes:
12 |   data-solr:
13 | 


--------------------------------------------------------------------------------
/src/test/java/de/digitalcollections/lucene/analysis/payloads/TestUtils.java:
--------------------------------------------------------------------------------
 1 | package de.digitalcollections.lucene.analysis.payloads;
 2 | 
 3 | class TestUtils {
 4 |   /** Returns a freshly allocated {@code char[]} holding the characters of {@code input}. */
 5 |   public static char[] toChars(String input) {
 6 |     // toCharArray() copies all input.length() characters into a new array.
 7 |     return input.toCharArray();
 8 |   }
 9 | }
10 | 


--------------------------------------------------------------------------------
/settings.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <settings xmlns="http://maven.apache.org/SETTINGS/1.0.0"
 3 |   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 4 |   xsi:schemaLocation="http://maven.apache.org/SETTINGS/1.0.0 http://maven.apache.org/xsd/settings-1.0.0.xsd">
 5 |   <servers>
 6 |     <server>
 7 |       <id>ossrh-snapshots</id>
 8 |       <username>${env.SONATYPE_USERNAME}</username>
 9 |       <password>${env.SONATYPE_PASSWORD}</password>
10 |     </server>
11 |   </servers>
12 | </settings>


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # 0.2
 2 | - Support for absolute coordinates
 3 | - Docker-based example setup
 4 | - Storing Term Vectors is no longer needed, which reduces the index size
 5 |   significantly (~50%) and speeds up the highlighting
 6 | 
 7 | # 0.1 (initial release)
 8 | -  Changes compared to the in-house development version:
 9 |   - Made the number of bits for page, line and word indices configurable
10 |   - Much more detailed documentation
11 |   - Updated all docstrings
12 |   - Tests for various usage scenarios
13 | 
14 | 


--------------------------------------------------------------------------------
/example/solr/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM solr:7.3.1-alpine
 2 | 
 3 | COPY ocrtest /opt/solr/server/solr/ocrtest
 4 | 
 5 | USER root
 6 | RUN chown -R $SOLR_USER:$SOLR_USER /opt/solr/server/solr/ocrtest
 7 | 
 8 | USER solr
 9 | RUN mkdir -p /opt/solr/server/solr/ocrtest/lib &&\
10 |     wget https://github.com/dbmdz/solr-ocrpayload-plugin/releases/download/0.2/solr-ocrpayload-plugin-0.2.jar -P/opt/solr/server/solr/ocrtest/lib/ &&\
11 |     bin/solr start -v &&\
12 |     bin/solr create_core -c ocrtest &&\
13 |     bin/solr stop
14 | 
15 | USER solr
16 | 


--------------------------------------------------------------------------------
/src/test/resources/solr/alldata/conf/solrconfig.xml:
--------------------------------------------------------------------------------
 1 | <config>
 2 |   <luceneMatchVersion>${tests.luceneMatchVersion:LUCENE_CURRENT}</luceneMatchVersion>
 3 |   <dataDir>${solr.data.dir:}</dataDir>
 4 |   <directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.RAMDirectoryFactory}"/>
 5 |   <schemaFactory class="ClassicIndexSchemaFactory"/>
 6 | 
 7 |   <requestHandler name="standard" class="solr.StandardRequestHandler">
 8 |     <arr name="last-components">
 9 |       <str>ocr_highlight</str>
10 |     </arr>
11 |   </requestHandler>
12 | 
13 |   <searchComponent name="ocr_highlight"
14 |                    class="de.digitalcollections.solr.plugin.components.ocrhighlighting.OcrHighlighting"
15 |                    coordinateBits="10" pageBits="12" lineBits="11" wordBits="9" />
16 | </config>
17 | 


--------------------------------------------------------------------------------
/src/test/resources/solr/minimal/conf/solrconfig.xml:
--------------------------------------------------------------------------------
 1 | <config>
 2 |   <luceneMatchVersion>${tests.luceneMatchVersion:LUCENE_CURRENT}</luceneMatchVersion>
 3 |   <dataDir>${solr.data.dir:}</dataDir>
 4 |   <directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.RAMDirectoryFactory}"/>
 5 |   <schemaFactory class="ClassicIndexSchemaFactory"/>
 6 | 
 7 |   <requestHandler name="standard" class="solr.StandardRequestHandler">
 8 |     <arr name="last-components">
 9 |       <str>ocr_highlight</str>
10 |     </arr>
11 |   </requestHandler>
12 | 
13 |   <searchComponent name="ocr_highlight"
14 |                    class="de.digitalcollections.solr.plugin.components.ocrhighlighting.OcrHighlighting"
15 |                    coordinateBits="10" pageBits="0" lineBits="0" wordBits="0" />
16 | </config>
17 | 


--------------------------------------------------------------------------------
/src/test/resources/solr/min_absolute/conf/solrconfig.xml:
--------------------------------------------------------------------------------
 1 | <config>
 2 |   <luceneMatchVersion>${tests.luceneMatchVersion:LUCENE_CURRENT}</luceneMatchVersion>
 3 |   <dataDir>${solr.data.dir:}</dataDir>
 4 |   <directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.RAMDirectoryFactory}"/>
 5 |   <schemaFactory class="ClassicIndexSchemaFactory"/>
 6 | 
 7 |   <requestHandler name="standard" class="solr.StandardRequestHandler">
 8 |     <arr name="last-components">
 9 |       <str>ocr_highlight</str>
10 |     </arr>
11 |   </requestHandler>
12 | 
13 |   <searchComponent name="ocr_highlight"
14 |                    class="de.digitalcollections.solr.plugin.components.ocrhighlighting.OcrHighlighting"
15 |                    coordinateBits="16" absoluteCoordinates="true" pageBits="0" lineBits="0" wordBits="0" />
16 | </config>
17 | 


--------------------------------------------------------------------------------
/example/solr/ocrtest/conf/solrconfig.xml:
--------------------------------------------------------------------------------
 1 | <config>
 2 |   <luceneMatchVersion>7.3</luceneMatchVersion>
 3 |   <dataDir>${solr.data.dir:}</dataDir>
 4 |   <directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.StandardDirectoryFactory}"/>
 5 |   <schemaFactory class="ClassicIndexSchemaFactory"/>
 6 | 
 7 |   <!-- The highlighting component needs to have the same parameter values as the
 8 |        filter component. -->
 9 |   <searchComponent name="ocr_highlight"
10 |                    class="de.digitalcollections.solr.plugin.components.ocrhighlighting.OcrHighlighting"
11 |                    coordinateBits="14" pageBits="12" lineBits="11" wordBits="9"
12 |                    absoluteCoordinates="true" />
13 | 
14 |   <requestHandler name="/select" class="solr.SearchHandler">
15 |     <lst name="defaults">
16 |       <str name="echoParams">explicit</str>
17 |       <int name="rows">10</int>
18 |     </lst>
19 |     <arr name="last-components">
20 |       <str>ocr_highlight</str>
21 |     </arr>
22 |   </requestHandler>
23 | </config>
24 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2018 Munich Digitization Center/Bavarian State Library
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/src/test/resources/solr/minimal/conf/schema.xml:
--------------------------------------------------------------------------------
 1 | <schema name="coordinateHighlight" version="1.0">
 2 |   <types>
 3 |     <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
 4 |     <fieldtype name="text_ocr" class="solr.TextField" omitTermFreqAndPositions="false">
 5 | 
 6 |       <analyzer>
 7 |         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
 8 |         <filter class="de.digitalcollections.lucene.analysis.util.DelimitedOcrInfoPayloadTokenFilterFactory"
 9 |           delimiter="|" coordinateBits="10" pageBits="0" lineBits="0" wordBits="0"/>
10 |         <filter class="solr.StandardFilterFactory"/>
11 |         <filter class="solr.LowerCaseFilterFactory"/>
12 |         <filter class="solr.StopFilterFactory"/>
13 |         <filter class="solr.PorterStemFilterFactory"/>
14 |       </analyzer>
15 |     </fieldtype>
16 |   </types>
17 | 
18 |   <fields>
19 |     <field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
20 |     <field name="ocr_text" type="text_ocr" indexed="true" stored="false" />
21 |     <dynamicField name="*_ocr" type="text_ocr" indexed="true" stored="false" />
22 |   </fields>
23 |   <uniqueKey>id</uniqueKey>
24 | </schema>
25 | 


--------------------------------------------------------------------------------
/src/test/resources/solr/alldata/conf/schema.xml:
--------------------------------------------------------------------------------
 1 | <schema name="coordinateHighlight" version="1.0">
 2 |   <types>
 3 |     <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
 4 |     <fieldtype name="text_ocr" class="solr.TextField" omitTermFreqAndPositions="false">
 5 | 
 6 |       <analyzer>
 7 |         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
 8 |         <filter class="de.digitalcollections.lucene.analysis.util.DelimitedOcrInfoPayloadTokenFilterFactory"
 9 |           delimiter="|" coordinateBits="10" pageBits="12" lineBits="11" wordBits="9"/>
10 |         <filter class="solr.StandardFilterFactory"/>
11 |         <filter class="solr.LowerCaseFilterFactory"/>
12 |         <filter class="solr.StopFilterFactory"/>
13 |         <filter class="solr.PorterStemFilterFactory"/>
14 |       </analyzer>
15 |     </fieldtype>
16 |   </types>
17 | 
18 |   <fields>
19 |     <field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
20 |     <field name="ocr_text" type="text_ocr" indexed="true" stored="false" />
21 |     <dynamicField name="*_ocr" type="text_ocr" indexed="true" stored="false" />
22 |   </fields>
23 |   <uniqueKey>id</uniqueKey>
24 | </schema>
25 | 


--------------------------------------------------------------------------------
/src/test/resources/solr/min_absolute/conf/schema.xml:
--------------------------------------------------------------------------------
 1 | <schema name="coordinateHighlight" version="1.0">
 2 |   <types>
 3 |     <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
 4 |     <fieldtype name="text_ocr" class="solr.TextField" omitTermFreqAndPositions="false">
 5 | 
 6 |       <analyzer>
 7 |         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
 8 |         <filter class="de.digitalcollections.lucene.analysis.util.DelimitedOcrInfoPayloadTokenFilterFactory"
 9 |           delimiter="☛" coordinateBits="16" absoluteCoordinates="true" pageBits="0" lineBits="0" wordBits="0"/>
10 |         <filter class="solr.StandardFilterFactory"/>
11 |         <filter class="solr.LowerCaseFilterFactory"/>
12 |         <filter class="solr.StopFilterFactory"/>
13 |         <filter class="solr.PorterStemFilterFactory"/>
14 |       </analyzer>
15 |     </fieldtype>
16 |   </types>
17 | 
18 |   <fields>
19 |     <field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
20 |     <field name="ocr_text" type="text_ocr" indexed="true" stored="false" />
21 |     <dynamicField name="*_ocr" type="text_ocr" indexed="true" stored="false" />
22 |   </fields>
23 |   <uniqueKey>id</uniqueKey>
24 | </schema>
25 | 


--------------------------------------------------------------------------------
/example/solr/ocrtest/conf/schema.xml:
--------------------------------------------------------------------------------
 1 | <schema name="coordinateHighlight" version="1.0">
 2 |   <types>
 3 |     <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
 4 |     <fieldtype name="text_ocr" class="solr.TextField" omitTermFreqAndPositions="false">
 5 | 
 6 |       <analyzer>
 7 |         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
 8 | 
 9 |         <!-- We store word, line and page offsets alongside the coordinates -->
10 |         <filter class="de.digitalcollections.lucene.analysis.util.DelimitedOcrInfoPayloadTokenFilterFactory"
11 |           delimiter="☛" coordinateBits="14" pageBits="12" lineBits="11" wordBits="9"
12 |           absoluteCoordinates="true" />
13 | 
14 |         <filter class="solr.StandardFilterFactory"/>
15 |         <filter class="solr.LowerCaseFilterFactory"/>
16 |         <filter class="solr.StopFilterFactory"/>
17 |         <filter class="solr.PorterStemFilterFactory"/>
18 |       </analyzer>
19 |     </fieldtype>
20 |   </types>
21 | 
22 |   <fields>
23 |     <field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
24 |     <field name="ocr_text" type="text_ocr" indexed="true" stored="false" />
25 |     <dynamicField name="*_ocr" type="text_ocr" indexed="true" stored="false" />
26 |   </fields>
27 |   <uniqueKey>id</uniqueKey>
28 | </schema>
29 | 


--------------------------------------------------------------------------------
/example/hocr2solr:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | import sys
 3 | from collections import namedtuple
 4 | 
 5 | import lxml.etree as ET
 6 | 
 7 | PAGE_PATH = './/div[@class="ocr_page"]'  # page <div>s at any depth below the document root
 8 | LINE_PATH = './/span[@class="ocr_line"]'  # line <span>s at any depth below a page element
 9 | WORD_PATH = './span[@class="ocr_cinfo"]'  # direct child <span>s of a line; each becomes one token
10 | 
11 | OcrBox = namedtuple('OcrBox', ('page_idx', 'line_idx', 'word_idx',
12 |                                'x', 'y', 'width', 'height', 'word'))
13 | parser = ET.HTMLParser()  # module-level parser instance handed to ET.parse() in parse_hocr
14 | 
15 | 
16 | def make_solr_token(ocrbox):
17 |     payload = ("p:{page_idx},l:{line_idx},n:{word_idx},x:{x},y:{y},"
18 |                "w:{width},h:{height}").format(**ocrbox._asdict())
19 |     return "{word}☛{payload}".format(word=ocrbox.word, payload=payload)
20 | 
21 | 
22 | def parse_hocr(hocr_path):
23 |     tree = ET.parse(hocr_path, parser=parser)
24 |     for page_idx, page_elem in enumerate(tree.findall(PAGE_PATH)):
25 |         for line_idx, line_elem in enumerate(page_elem.findall(LINE_PATH)):
26 |             for word_idx, word_elem in enumerate(line_elem.findall(WORD_PATH)):
27 |                 bbox = next(
28 |                         p.strip() for p in word_elem.attrib['title'].split(';')
29 |                         if p.strip().startswith('bbox'))
30 |                 x, y, x1, y1 = tuple(int(p) for p in bbox.split(" ")[1:])
31 |                 yield OcrBox(page_idx=page_idx, line_idx=line_idx,
32 |                         word_idx=word_idx, x=x, y=y, width=x1-x, height=y1-y,
33 |                         word=word_elem.text)
34 | 
35 | 
36 | if __name__ == '__main__':
37 |     hocr_path = sys.argv[1]
38 |     for idx, box in enumerate(parse_hocr(hocr_path)):
39 |         if idx > 0:
40 |             sys.stdout.write(" ")
41 |         sys.stdout.write(make_solr_token(box))
42 | 


--------------------------------------------------------------------------------
/src/test/java/de/digitalcollections/solr/plugin/components/ocrhighlighting/AbsoluteHighlightingTest.java:
--------------------------------------------------------------------------------
 1 | package de.digitalcollections.solr.plugin.components.ocrhighlighting;
 2 | 
 3 | import com.jayway.jsonpath.DocumentContext;
 4 | import com.jayway.jsonpath.JsonPath;
 5 | import com.revinate.assertj.json.JsonPathAssert;
 6 | import org.apache.solr.SolrTestCaseJ4;
 7 | import org.junit.BeforeClass;
 8 | import org.junit.Test;
 9 | 
10 | /** Test that absolute coordinates are returned as the integer values that were indexed. **/
11 | public class AbsoluteHighlightingTest extends SolrTestCaseJ4 {
12 |   @BeforeClass
13 |   public static void beforeClass() throws Exception {
14 |     initCore("conf/solrconfig.xml", "conf/schema.xml", "src/test/resources/solr", "min_absolute");
15 |     assertU(adoc("ocr_text", "two☛x:12300,y:432,w:543,h:654, one☛x:654,y:543,w:432,h:321,", "id", "101"));
16 |     assertU(adoc("ocr_text", "three☛x:127,y:4820,w:5490,h:654, two☛x:654,y:54337,w:431,h:341 five☛x:0,y:0,w:0,h:0, "
17 |         + "four☛x:111,y:111,w:111,h:111,", "id", "102"));
18 | 
19 |     assertU(commit());
20 |   }
21 | 
22 |   @Test
23 |   public void testMinimal() throws Exception {
24 |     String json = JQ(req(
25 |         "q", "two", "sort", "id asc", "ocr_hl", "true", "ocr_hl.fields", "ocr_text", "df", "ocr_text"));
26 |     DocumentContext ctx = JsonPath.parse(json);
27 |     // Document 101 contains "two" once; its x value must come back exactly as indexed.
28 |     JsonPathAssert.assertThat(ctx).jsonPathAsInteger("ocr_highlighting.101.ocr_text[0].x").isEqualTo(12300);
29 |     JsonPathAssert.assertThat(ctx).jsonPathAsInteger("ocr_highlighting.101.ocr_text.length()").isEqualTo(1);
30 |     // Document 102 also contains "two" once; check its own hit count, not 101's a second time.
31 |     JsonPathAssert.assertThat(ctx).jsonPathAsInteger("ocr_highlighting.102.ocr_text[0].y").isEqualTo(54337);
32 |     JsonPathAssert.assertThat(ctx).jsonPathAsInteger("ocr_highlighting.102.ocr_text.length()").isEqualTo(1);
33 |   }
34 | }
35 | 


--------------------------------------------------------------------------------
/example/index_google1000:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | import re
 3 | import sys
 4 | import tarfile
 5 | from pathlib import Path
 6 | 
 7 | import requests
 8 | 
 9 | 
10 | OCRTEXT_URL = 'https://zvdd-ng.de/files/google1000_solr.tgz'  # archive streamed by fetch_ocrtext
11 | SOLR_HOST = 'localhost:8983'  # host:port part of the update URL built in index_documents
12 | SOLR_CORE = 'ocrtest'  # core name part of the update URL built in index_documents
13 | 
14 | 
15 | class SolrException(Exception):
16 |     def __init__(self, resp, payload):
17 |         self.message = resp
18 |         self.payload = payload
19 | 
20 | 
21 | def index_documents(docs):
22 |     resp = requests.post(
23 |         "http://{}/solr/{}/update".format(SOLR_HOST, SOLR_CORE),
24 |         json=docs, params=dict(softCommit="true"))
25 |     if not resp:
26 |         raise SolrException(resp.json(), docs)
27 | 
28 | 
29 | def fetch_ocrtext():
30 |     with requests.get(OCRTEXT_URL, stream=True) as resp:
31 |         tf = tarfile.open(fileobj=resp.raw, mode="r|gz")
32 |         for ti in tf:
33 |             if not ti.isfile() or not ti.name.endswith('.txt'):
34 |                 continue
35 |             ident = int(re.findall('\d{4}', ti.name)[0])
36 |             yield ident, tf.extractfile(ti).read().decode('utf8')
37 | 
38 | 
39 | def load_ocrtext(base_dir):
40 |     base_dir = Path(base_dir)
41 |     for idx, txt in enumerate(sorted(base_dir.glob("./*.txt"))):
42 |         with txt.open("rt") as fp:
43 |             yield idx, fp.read()
44 | 
45 | 
46 | if __name__ == '__main__':
47 |     if len(sys.argv) > 1:
48 |         txt_iter = load_ocrtext(sys.argv[1])
49 |     else:
50 |         txt_iter = fetch_ocrtext()
51 |     batch = []
52 |     for ident, text in txt_iter:
53 |         doc = dict(id=ident, ocr_text=text)
54 |         batch.append(doc)
55 |         if len(batch) == 50:
56 |             print("Indexing batch of 50 documents...")
57 |             index_documents(batch)
58 |             batch = []
59 |     if batch:
60 |         index_documents(batch)
61 | 


--------------------------------------------------------------------------------
/src/test/java/de/digitalcollections/solr/plugin/components/ocrhighlighting/MinimalHighlightingTest.java:
--------------------------------------------------------------------------------
 1 | package de.digitalcollections.solr.plugin.components.ocrhighlighting;
 2 | 
 3 | import com.jayway.jsonpath.DocumentContext;
 4 | import com.jayway.jsonpath.JsonPath;
 5 | import com.revinate.assertj.json.JsonPathAssert;
 6 | import org.apache.solr.SolrTestCaseJ4;
 7 | import org.junit.BeforeClass;
 8 | import org.junit.Test;
 9 | 
10 | import java.math.BigDecimal;
11 | 
12 | /** Test that configuring the plugin without page/line/word indices works as expected. **/
13 | public class MinimalHighlightingTest extends SolrTestCaseJ4 {
14 |   @BeforeClass
15 |   public static void beforeClass() throws Exception {
16 |     initCore("conf/solrconfig.xml", "conf/schema.xml", "src/test/resources/solr", "minimal");
17 |     assertU(adoc("ocr_text", "two|x:12.3,y:43.2,w:54.3,h:65.4, one|x:65.4,y:54.3,w:43.2,h:32.1,", "id", "101"));
18 |     assertU(adoc("ocr_text", "three|x:12.7,y:48.2,w:54.9,h:65.4, two|x:65.4,y:54.3,w:43.1,h:34.1, five|x:0,y:0,w:0,h:0, "
19 |         + "four|x:11.1,y:11.1,w:11.1,h:11.1,", "id", "102"));
20 | 
21 |     assertU(commit());
22 |   }
23 | 
24 |   @Test
25 |   public void testMinimal() throws Exception {
26 |     String json = JQ(req(
27 |         "q", "two", "sort", "id asc", "ocr_hl", "true", "ocr_hl.fields", "ocr_text", "df", "ocr_text"));
28 |     DocumentContext ctx = JsonPath.parse(json);
29 |     JsonPathAssert.assertThat(ctx).jsonPathAsBigDecimal("ocr_highlighting.101.ocr_text[0].x")
30 |         .isBetween(BigDecimal.valueOf(0.1230), BigDecimal.valueOf(0.1239));  // indexed as x:12.3
31 |     JsonPathAssert.assertThat(ctx).jsonPathAsInteger("ocr_highlighting.101.ocr_text.length()").isEqualTo(1);
32 |     JsonPathAssert.assertThat(ctx).jsonPathAsBigDecimal("ocr_highlighting.102.ocr_text[0].x")
33 |         .isBetween(BigDecimal.valueOf(0.6540), BigDecimal.valueOf(0.6549));  // indexed as x:65.4
34 |     JsonPathAssert.assertThat(ctx).jsonPathAsInteger("ocr_highlighting.102.ocr_text.length()").isEqualTo(1);  // 102's own hit count
35 |   }
36 | }
37 | 


--------------------------------------------------------------------------------
/example/README.md:
--------------------------------------------------------------------------------
 1 | # Example Setup
 2 | 
 3 | ## Configuration
 4 | The index is configured to store the OCR bounding boxes with the following parameters:
 5 | 
 6 | - Coordinates are stored as **absolute pixel values**
 7 | - The payload needs **11 bytes** per token
 8 |   (14 bits per coordinate, 12 bit for page-, 11 bit for line- and 9 bit for word-indices)
 9 | 
10 | 
11 | ## Dataset
12 | This demo creates an index of the [Google 1000 Books ICDAR 2007 dataset](http://commondatastorage.googleapis.com/books/icdar2007/README.txt).
13 | It consists of 103,672,774 tokens split across 1000 OCRed books taken from the Google Books project.
14 | The resulting index is 1.1GiB in size, compared to 4.4GiB for the uncompressed input documents.
15 | 
16 | 
17 | ## Running the demo
18 | - Launch the Docker container: `docker-compose up`
19 | - Index the pre-converted OCR volumes with `./index_google1000`
20 | - **Search!** `curl 'http://localhost:8983/solr/ocrtest/select?q=ocr_text:harvard&ocr_hl=true&ocr_hl.fields=ocr_text'`
21 | 
22 | 
23 | ## Converting the hOCR to the input format manually
24 | The instructions above fetch an archive with the hOCRs from the dataset pre-converted
25 | (https://zvdd-ng.de/files/google1000_solr.tgz). If you want to do this yourself, follow these steps:
26 | 
27 | - Obtain the dataset by downloading the individual books, ideally with a newer version of bash or zsh:
28 |   ```sh
29 |   $ wget http://commondatastorage.googleapis.com/books/icdar2007/Volume_{0000..0999}.zip
30 |   $ for zip in *.zip; do unzip $zip; done
31 |   ```
32 | - Convert the individual hOCR files to the format needed by the Solr configuration
33 |   (`<word>☛p:<pageNo>,l:<lineNo>,n:<wordNo>,x:<xOffset>,y:<yOffset>,w:<width>,h:<height>`):
34 |   ```sh
35 |   $ for hocr in Volume_*/hOCR.html; do ./hocr2solr $hocr > $(echo $hocr |sed 's/.html/.txt/'); done
36 |   ```
37 | - Index the books by passing the directory with the `.txt`-files as the first parameter:
38 |   ```sh
39 |   $ ./index_google1000 <txt-dir>
40 |   ```
41 | 


--------------------------------------------------------------------------------
/src/test/java/de/digitalcollections/solr/plugin/components/ocrhighlighting/DistributedOcrHighlightingTest.java:
--------------------------------------------------------------------------------
 1 | package de.digitalcollections.solr.plugin.components.ocrhighlighting;
 2 | 
 3 | import org.apache.solr.BaseDistributedSearchTestCase;
 4 | import org.apache.solr.handler.component.SearchComponent;
 5 | import org.junit.BeforeClass;
 6 | import org.junit.Test;
 7 | 
 8 | /** Checks the page, line and word indices reported by the "ocr_highlight" component. */
 9 | public class DistributedOcrHighlightingTest extends BaseDistributedSearchTestCase {
10 |   @BeforeClass
11 |   public static void beforeClass() throws Exception {
12 |     System.setProperty("managed.schema.mutable", "true");
13 |     initCore("conf/solrconfig.xml", "conf/schema.xml", "src/test/resources/solr", "alldata");
14 | 
15 |     // Fail early if the core did not register our component under "ocr_highlight".
16 |     SearchComponent component = h.getCore().getSearchComponent("ocr_highlight");
17 |     assertTrue("wrong highlighter: " + component.getClass(), component instanceof OcrHighlighting);
18 | 
19 |     // One document whose two tokens both carry page, line and word indices.
20 |     String ocrText = "contains|p:20,l:3,n:5,x:11.1,y:22.2,w:33.3,h:44.4, "
21 |         + "position|p:20,l:4,n:6,x:55.5,y:66.6,w:77.7,h:88.8,";
22 |     assertU(adoc("ocr_text", ocrText, "id", "105"));
23 |     assertU(BaseDistributedSearchTestCase.commit());
24 |   }
25 | 
26 |   @Test
27 |   @ShardsRepeat(max=5)
28 |   public void testWithPageNumberAndPosition() {
29 |     // Shared XPath fragments: the highlighting section, and the hit list of document 105.
30 |     String all = "//lst[@name='ocr_highlighting']";
31 |     String hits = "(" + all + "/lst[@name='105']/arr[@name='ocr_text']/lst)";
32 |     assertQ(
33 |         "terms with both page number and word position",
34 |         req("q", "contains position", "sort", "id asc", "ocr_hl", "true","ocr_hl.fields", "ocr_text", "df", "ocr_text"),
35 |         "count(" + all + "/*)=1",
36 |         "count" + hits + "=2",
37 |         hits + "[1]/int[@name='page']='20'", hits + "[1]/int[@name='line']='3'",
38 |         hits + "[1]/int[@name='word']='5'", hits + "[2]/int[@name='page']='20'",
39 |         hits + "[2]/int[@name='line']='4'", hits + "[2]/int[@name='word']='6'");
40 |   }
41 | }
42 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: java
 2 | jdk:
 3 | - openjdk8
 4 | - oraclejdk8
 5 | - openjdk11
 6 | addons:
 7 |   apt:
 8 |     packages:
 9 |     - libxml2-utils
10 | before_script:
11 | - PROJECT_VERSION=$(xmllint --xpath '/*[local-name()="project"]/*[local-name()="version"]/text()' pom.xml)
12 | - if [ ! -z "$TRAVIS_TAG" ]; then mvn versions:set -DnewVersion=$TRAVIS_TAG; fi
13 | script:
14 | - mvn clean install
15 | after_success:
16 | - bash <(curl -s https://codecov.io/bash)
17 | - if [[ "$PROJECT_VERSION" == *SNAPSHOT ]]; then mvn deploy -B -DskipTests=true --settings settings.xml; fi
18 | deploy:
19 |   provider: releases
20 |   api_key:
21 |     secure: LJjZzy9i0OmNJrav71LbyP7kP9oWzkUXgwCnG95b2bzrDnVw0OQVDpU3nQXB9w0AKgF4CCsHMLo6k383n9LQ+f/AGO91a3urya3Rns+Y7Y6ptk3ZRXA39U7/5Eod8lWApd18l+EWvansZEzkukHH0xXGK+SWuTo5taH6PKmrEQKjw9zxMI07+9lIJcN7vddQzdFanCaincz4pW/EgLIhJXcecyprDjnYQnbEVczYsp7W2+XbLn/rhE7vxK/ZCU1nJojUD3YwMrQgBj5MaiAtAEYSwvuq5N9jpR1glEIqlmhw5kdhLIt+R3FlMicWPhKPW/7t7RmCFAxDclkBaJT+RBF/438wZwUXLZ4KbeAcgB3zRkgK1qVpxx/jJF0Q1zHzrRF1tsJSILI7yJBpE+JPp/881JwLJnUgbuAAWy282m9Kd5G3DL7yhkWlG+Jrau6iN89kh+Nko7KtZ3bosvyzne9cgC+AubTY3pF+BebKA0ZQarolhY24U9yOKDmo1gvllEbnlSsV5N3ga7zhZ6rHMpN/X+3qqn9awffHvXsG0dJpXk968c2FzSgwMk7ibcN1vFcSf0MkByOE5iiZDjOpzS4FOi6j70XLfX+8NI/QcP63jXO7oXzNBBWhQaXM/VrHt0oNP2u56UZTKDmgW+9BSIQtIKRCd3FxKfNtr/ES1ms=
22 |   file_glob: true
23 |   file:
24 |   - target/*.jar
25 |   skip_cleanup: true
26 |   on:
27 |     tags: true
28 | env:
29 |   global:
30 |   - secure: WGtLFUeolKk0vLZnMnAdkirAVjHN/PbZPSATSMDdVsBZO36JypwDq8OLcHejjuNT0y4O0QAgN4m5iz6qR5iZjf9qcTSp/eZVd5lgt1sLaWIWfNdccS2cwOLcb6RprKAbrfzhIUVoBbJUt6LvSXE/tJM+lHMcIqGdEsW/u3NbfMxFzZdJTjvbjyN/kszRq4CeuLaou/8NhR1RdN26CvPr9RZkcqNsyKgZql5Tny5Bvt1xbxaMeju+2ZKZVyTXgEntiQM92uYVooyyni6Zl0pfXHwo57EoBwiveVaCRtdQDN2AAl+2B8jV5bsIZipUiGEZDs3E9vYjl0/9YUCCKqqiRk2xMi0lqUmLOenTKVHS28WlKEaPCbxTqTtA+ZmQi9tAgt7YqTOnfIUT7RFEZfTMs5DnedwNdOI055ft71Vbjh612nuFWqkoaxqp8gHmBFRF55R6c0Ixq5HRShiERi9x5U4CAeLnkh25QzvZD+PRfVn52fdQ+qaVkpJwBplzr3/m6RjxXjW6l414y7eSBope+dk5BTLPrC1R1KcU3yGSXugJUr8hVrqDVvhoN5JOwL6j5iPThdkDEebifm6nsH4Hm3sP0KlBFOq7P71UbHOhR8Atj6CWKgYePyPaQ+I8KcARxu4wmjh8FShQeVcRRAKkAd2vYa+qQRAhit+EijL8J5M=
31 |   - secure: aQfHA0oyixmU2KM5H33u5cKYSnn/ToSgNMrriCWdY0zKHwIGQuy8Bw8EV3Zp75Gy+1gEh6JrbM4mpdoxvxxPJd45Eywr+ZYrOlw1viZD3XuxZON842iAtp9GbyzMvpGtRK0MGCMWgPjg1sVGhEM5BgFFR4DABq8xKoj2sp1kNKy+hEZSPOfBiGyCO45zcBcU+9JoKyTez7fKQvOK2Sxv52owQbzrpbAChXUQkZvH8zbGnX8SvA534m1X6VaK3SvxTOo/TIQA3b4iBO2SlHNB7p15D9s2P6WxYP/zBlMqhNSA1A3sz/jqVvYZNbMEPp0ZX+qpzympj6RqI0Oqll20LTFQW1HvPMC+U0aVB8C4N5gGyuTir2+pkGAkRiCtmwg189fjvSArfkDTXTknN8nZMvG+hzT0YhJf7+n/NHR0mcHnoCA+vcs06c3GZq7mxMAJj7rRsqpnOUu5Um9J/vZdqg+UurmjroVqWyzAEAi/OK9dad+aPQME8xEpymx55bhBFxR5vH0cjkqGDTBRjz9x/1Km+MIu1Se3H66OT2jl/8CsWiL7Tw2Tcyz6mTXenYktFil6wsYLyTrzR5rgV+fbhveajG635iRol1/TdwHj8LcGhbG8hNCSJwDLafRrh7mHASNo0qNthQkU3SLBND3qTlDlZrVQtqkf0qFMEqaZBdc=
32 | 


--------------------------------------------------------------------------------
/src/test/java/de/digitalcollections/lucene/analysis/payloads/OcrInfoEncoderTest.java:
--------------------------------------------------------------------------------
 1 | package de.digitalcollections.lucene.analysis.payloads;
 2 | 
 3 | import com.google.common.collect.ImmutableMap;
 4 | import org.apache.lucene.util.BytesRef;
 5 | import org.junit.jupiter.params.ParameterizedTest;
 6 | import org.junit.jupiter.params.provider.Arguments;
 7 | import org.junit.jupiter.params.provider.MethodSource;
 8 | 
 9 | import java.util.Map;
10 | import java.util.stream.Stream;
11 | 
12 | import static de.digitalcollections.lucene.analysis.payloads.TestUtils.toChars;
13 | import static org.assertj.core.api.Assertions.assertThat;
14 | 
15 | public class OcrInfoEncoderTest {
16 | 
17 |   public static Stream<Arguments> data() {
18 |     byte[] withPage = {(byte) 0x1b, (byte) 0x21, (byte) 0xa1, (byte) 0xce, (byte) 0x10, (byte) 0x36};
19 |     byte[] withoutPage = {(byte) 0x21, (byte) 0xa1, (byte) 0xce, (byte) 0x10, (byte) 0x36};
20 |     byte[] withPosition = {(byte) 0x20, (byte) 0x21, (byte) 0xa1, (byte) 0xce, (byte) 0x10, (byte) 0x36};
21 |     byte[] withPositionAndPage = {(byte) 0x36, (byte) 0x20, (byte) 0x21, (byte) 0xa1, (byte) 0xce, (byte) 0x10, (byte) 0x36};
22 |     byte[] withPageLineWord = {(byte) 0x01, (byte) 0xb0, (byte) 0x54, (byte) 0x0c, (byte) 0x1f, (byte) 0x8f, (byte) 0x05, (byte) 0x85, (byte) 0xd3};
23 |     byte[] withPageLineWordAbsolute = {(byte) 0x01, (byte) 0x60, (byte) 0x42, (byte) 0x2c, (byte) 0x03, (byte) 0x0a, (byte) 0x08, (byte) 0x90, (byte) 0x0f, (byte) 0xa0, (byte) 0x03, (byte) 0x70};
24 |     Map<String, byte[]> params = ImmutableMap.<String, byte[]>builder()
25 |             .put("withPageooo|p:27,x:13.1,y:52.7,w:87.9,h:5.3,", withPage)
26 |             .put("withoutPage|x:13.1,y:52.7,w:87.9,h:5.3,", withoutPage)
27 |             .put("withWordoooo|n:32,x:13.1,y:52.7,w:87.9,h:5.3,", withPosition)
28 |             .put("withPageWord|p:27,n:32,x:13.1,y:52.7,w:87.9,h:5.3,", withPositionAndPage)
29 |             .put("withPageLineWord|p:27,l:42,n:12,x:12.3,y:23.4,w:34.5,h:45.6,", withPageLineWord)
30 |             .put("withPageLineWordAbsolute|p:22,l:33,n:44,x:778,y:2192,w:4000,h:880", withPageLineWordAbsolute)
31 |             .build();
32 |     return params.entrySet().stream().map(e -> Arguments.of(e.getKey(), e.getValue()));
33 |   }
34 | 
35 |   @ParameterizedTest
36 |   @MethodSource("data")
37 |   public void encode(String tokenFixture, byte[] bytesFixture) {
38 |     OcrInfoEncoder encoder = new OcrInfoEncoder(
39 |         tokenFixture.contains("Absolute") ? 16 : 10,
40 |         tokenFixture.contains("Word") ? 9 : 0,
41 |         tokenFixture.contains("Line") ? 11 : 0,
42 |         tokenFixture.contains("withPage") ? 12 : 0,
43 |         tokenFixture.contains("Absolute"));
44 | 
45 |     BytesRef bytes = encoder.encode(
46 |             toChars(tokenFixture),
47 |             tokenFixture.indexOf("|") + 1,
48 |             tokenFixture.length() - tokenFixture.indexOf("|") -1 );
49 |     assertThat(bytes.bytes).isEqualTo(bytesFixture);
50 |   }
51 | }
52 | 


--------------------------------------------------------------------------------
/src/main/java/de/digitalcollections/lucene/analysis/payloads/OcrInfoEncoder.java:
--------------------------------------------------------------------------------
 1 | package de.digitalcollections.lucene.analysis.payloads;
 2 | 
 3 | import org.apache.lucene.analysis.payloads.AbstractEncoder;
 4 | import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory;
 5 | import org.apache.lucene.analysis.payloads.PayloadEncoder;
 6 | import org.apache.lucene.util.BytesRef;
 7 | 
 8 | /**
 9 |  * Encode an OCR information string as a {@link BytesRef}.
10 |  *
11 |  * Not intended to be used directly with {@link DelimitedPayloadTokenFilterFactory}.
12 |  * Use {@link de.digitalcollections.lucene.analysis.util.DelimitedOcrInfoPayloadTokenFilterFactory} instead.
13 |  * For information on the expected format of the payload string, see {@link OcrInfo#parse(char[], int, int, int, int, int, int, boolean)}
14 |  *
15 |  * To use it, configure the {@link de.digitalcollections.lucene.analysis.util.DelimitedOcrInfoPayloadTokenFilterFactory}:
16 |  *
17 |  * ```xml
18 |  * <pre>{@code
19 |  * <filter class="de.digitalcollections.lucene.analysis.util.DelimitedOcrInfoPayloadTokenFilterFactory"
20 |  *         coordinateBits="12" wordBits="9" lineBits="11" pageBits="12" absoluteCoordinates="false" />
21 |  * }</pre>
22 |  * ```
23 |  */
24 | public class OcrInfoEncoder extends AbstractEncoder implements PayloadEncoder {
25 | 
26 |   private final int coordBits;
27 |   private final int wordBits;
28 |   private final int lineBits;
29 |   private final int pageBits;
30 |   private final boolean absoluteCoordinates;
31 | 
32 |   /**
33 |    * Configure a new OcrInfoEncoder.
34 |    *
35 |    * The sum of coordBits*4, wordBits, lineBits and pageBits should be divisible by 8, as not to waste any space in the
36 |    * index.
37 |    *
38 |    * @param coordBits       Number of bits to use for storing the OCR coordinates in the index, must be an even number.
39 |    * @param wordBits        Number of bits to use for storing the word index (0 to disable)
40 |    * @param lineBits        Number of bits to use for storing the line index (0 to disable)
41 |    * @param pageBits        Number of bits to use for storing the page index (0 to disable)
42 |    * @param absoluteCoordinates Whether the coordinates are stored as absolute (integral position) or relative (percentage position)
43 |    */
44 |   public OcrInfoEncoder(int coordBits, int wordBits, int lineBits, int pageBits, boolean absoluteCoordinates) {
45 |     this.coordBits = coordBits;
46 |     this.wordBits = wordBits;
47 |     this.lineBits = lineBits;
48 |     this.pageBits = pageBits;
49 |     this.absoluteCoordinates = absoluteCoordinates;
50 |   }
51 | 
52 |   /**
53 |    * Default constructor that encodes with 12bit for the coordinates and doesn't store any indices.
54 |    */
55 |   public OcrInfoEncoder() {
56 |     this(12, 0, 0, 0, false);
57 |   }
58 | 
59 |   /**
60 |    * Encode the OCR payload (see {@link OcrInfo#parse(char[], int, int, int, int, int, int, boolean)}
61 |    * for how it must be formatted) into a space-efficient binary representation.
62 |    */
63 |   @Override
64 |   public BytesRef encode(char[] chars, int offset, int length) {
65 |     OcrInfo info = OcrInfo.parse(chars, offset, length, wordBits, lineBits, pageBits, coordBits, absoluteCoordinates);
66 |     byte[] data = OcrPayloadHelper.encodeOcrInfo(info, coordBits, wordBits, lineBits, pageBits);
67 |     return new BytesRef(data);
68 |   }
69 | }
70 | 


--------------------------------------------------------------------------------
/src/test/java/de/digitalcollections/lucene/analysis/payloads/PayloadHelperTest.java:
--------------------------------------------------------------------------------
 1 | package de.digitalcollections.lucene.analysis.payloads;
 2 | 
 3 | import org.apache.lucene.util.BytesRef;
 4 | import org.assertj.core.data.Offset;
 5 | import org.junit.jupiter.params.ParameterizedTest;
 6 | import org.junit.jupiter.params.provider.Arguments;
 7 | import org.junit.jupiter.params.provider.MethodSource;
 8 | 
 9 | import java.util.stream.Stream;
10 | 
11 | import static org.assertj.core.api.Assertions.assertThat;
12 | 
13 | 
14 | public class PayloadHelperTest {
15 |   public static Stream<Arguments> fixtureProvider() {
16 |     byte[] withPage = {(byte) 0x1b, (byte) 0x21, (byte) 0xa1, (byte) 0xce, (byte) 0x10, (byte) 0x36};
17 |     byte[] withoutPage = {(byte)0x23, (byte)0x1e, (byte)0x9f, (byte)0xa8, (byte)0x36};
18 |     byte[] withPosition = {(byte) 0x20, (byte) 0x21, (byte) 0xa1, (byte) 0xce, (byte) 0x10, (byte) 0x36};
19 |     byte[] withPositionAndPage = {(byte) 0x0, (byte) 0xd8, (byte) 0x20, (byte) 0x21, (byte) 0xa1, (byte) 0xce, (byte) 0x10, (byte) 0x36};
20 |     byte[] withPageLineWordAbsolute = {(byte) 0x01, (byte) 0x60, (byte) 0x42, (byte) 0x2c, (byte) 0x30, (byte) 0xa8, (byte) 0x90, (byte) 0x19, (byte) 0x03, (byte) 0x70};
21 |     return Stream.of(
22 |         Arguments.of(new OcrInfo(27, .131f, .527f, .879f, .053f), withPage),
23 |         Arguments.of(new OcrInfo(.1368f, .4779f, .9782f, .0532f), withoutPage),
24 |         Arguments.of(new OcrInfo(-1, 32, .131f, .527f, .879f, .053f), withPosition),
25 |         Arguments.of(new OcrInfo(27, 32, .131f, .527f, .879f, .053f), withPositionAndPage),
26 |         Arguments.of(new OcrInfo(22, 33, 44, 778, 2192, 400, 880),
27 |                      withPageLineWordAbsolute));
28 |   }
29 | 
30 |   private void assertAreAboutEqual(OcrInfo a, OcrInfo b) {
31 |     assertThat(a.getHorizontalOffset()).isCloseTo(b.getHorizontalOffset(), Offset.offset(0.09f));
32 |     assertThat(a.getVerticalOffset()).isCloseTo(b.getVerticalOffset(), Offset.offset(0.09f));
33 |     assertThat(a.getWidth()).isCloseTo(b.getWidth(), Offset.offset(0.09f));
34 |     assertThat(a.getHeight()).isCloseTo(b.getHeight(), Offset.offset(0.09f));
35 |   }
36 | 
37 |   @ParameterizedTest
38 |   @MethodSource("fixtureProvider")
39 |   public void encodeOcrInfo(OcrInfo ocrInfo, byte[] payload) {
40 |     byte[] encodedInfo = OcrPayloadHelper.encodeOcrInfo(
41 |         ocrInfo,
42 |         ocrInfo.getHasAbsoluteCoordinates() ? 12 : 10,
43 |         ocrInfo.getWordIndex() >= 0 ? 9 : 0,
44 |         ocrInfo.getLineIndex() >= 0 ? 11 : 0,
45 |         ocrInfo.getPageIndex() >= 0 ? 12 : 0);
46 |     assertThat(encodedInfo).isEqualTo(payload);
47 |   }
48 | 
49 |   @ParameterizedTest
50 |   @MethodSource("fixtureProvider")
51 |   public void decodeOcrInfo(OcrInfo ocrInfo, byte[] payload) {
52 |     OcrInfo decodedInfo = OcrPayloadHelper.decodeOcrInfo(
53 |         new BytesRef(payload),
54 |         ocrInfo.getHasAbsoluteCoordinates() ? 12: 10,
55 |         ocrInfo.getWordIndex() >= 0 ? 9 : 0,
56 |         ocrInfo.getLineIndex() >= 0 ? 11 : 0,
57 |         ocrInfo.getPageIndex() >= 0 ? 12 : 0,
58 |         ocrInfo.getHasAbsoluteCoordinates());
59 |     if (ocrInfo.getHasAbsoluteCoordinates()) {
60 |       assertThat(decodedInfo).isEqualToComparingFieldByField(ocrInfo);
61 |     } else {
62 |       assertAreAboutEqual(decodedInfo, ocrInfo);
63 |     }
64 |   }
65 | 
66 |   @ParameterizedTest
67 |   @MethodSource("fixtureProvider")
68 |   public void doesNotDegradeAccuracy(OcrInfo ocrInfo, byte[] payload) {
69 |     if (ocrInfo.getHasAbsoluteCoordinates()) {
70 |       // NOP, there's no risk of degradation with integers
71 |       return;
72 |     }
73 |     byte[] encodedInfo;
74 |     OcrInfo decodedInfo = ocrInfo;
75 |     for (int i=0; i < 100; i++) {
76 |       encodedInfo = OcrPayloadHelper.encodeOcrInfo(decodedInfo, 10, 9, 11, 12);
77 |       decodedInfo = OcrPayloadHelper.decodeOcrInfo(new BytesRef(encodedInfo), 10,
78 |            ocrInfo.getWordIndex() >= 0 ? 9 : 0,  ocrInfo.getLineIndex() >= 0 ? 11 : 0, ocrInfo.getPageIndex() >= 0 ? 12 : 0, false);
79 |       assertAreAboutEqual(decodedInfo, ocrInfo);
80 |     }      assertAreAboutEqual(decodedInfo, ocrInfo);
81 |   }
82 | }


--------------------------------------------------------------------------------
/src/main/java/de/digitalcollections/lucene/analysis/util/DelimitedOcrInfoPayloadTokenFilterFactory.java:
--------------------------------------------------------------------------------
 1 | package de.digitalcollections.lucene.analysis.util;
 2 | 
 3 | import de.digitalcollections.lucene.analysis.payloads.OcrInfo;
 4 | import de.digitalcollections.lucene.analysis.payloads.OcrInfoEncoder;
 5 | import de.digitalcollections.lucene.analysis.payloads.OcrPayloadHelper;
 6 | import java.util.Map;
 7 | import org.apache.lucene.analysis.TokenStream;
 8 | import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
 9 | import org.apache.lucene.analysis.util.TokenFilterFactory;
10 | import org.slf4j.Logger;
11 | import org.slf4j.LoggerFactory;
12 | 
13 | /**
14 |  * Filter factory for space-efficiently encoding OCR information in token payloads.
15 |  *
16 |  * For information on the expected format of the payload string, see
17 |  * {@link OcrInfo#parse(char[], int, int, int, int, int, int, boolean)}
18 |  *
19 |  * Takes the following configuration parameters:
20 |  *
21 |  * `coordinateBits`
22 |  * : Number of bits to use for encoding a coordinate value. 10 bits is recommended (and set by default),
23 |  *   which yields a precision to approximately three decimal places
24 |  *
25 |  * `delimiter`
26 |  * : Delimiting character. If not provided, the pipe symbol (`|`) is used
27 |  *
28 |  * `pageBits`
29 |  * : Number of bits to use for encoding the page index. 0 will disable page indices (default).
30 |  *
31 |  * `lineBits`
32 |  * : Number of bits to use for encoding the line index. 0 will disable line indices (default).
33 |  *
34 |  * `wordBits`
35 |  * : Number of bits to use for encoding the word index. 0 will disable word indices (default).
36 |  *
37 |  * Here is a sample configuration with page indices enabled:
38 |  * ```
39 |  * <pre>{@code
40 |  * <filter class="de.digitalcollections.lucene.analysis.util.DelimitedOcrInfoPayloadTokenFilterFactory"
41 |  *         coordinateBits="10" wordBits="0" lineBits="0" pageBits="12" absoluteCoordinates="false" />
42 |  * }</pre>
43 |  * ```
44 |  */
45 | public class DelimitedOcrInfoPayloadTokenFilterFactory extends TokenFilterFactory {
46 | 
47 |   private static final Logger LOGGER = LoggerFactory.getLogger(DelimitedOcrInfoPayloadTokenFilterFactory.class);
48 | 
49 |   private static final String COORD_BITS_ATTR = "coordinateBits";
50 |   private static final String DELIMITER_ATTR = "delimiter";
51 |   private static final String PAGE_BITS_ATTR = "pageBits";
52 |   private static final String LINE_BITS_ATTR = "lineBits";
53 |   private static final String WORD_BITS_ATTR = "wordBits";
54 |   private static final String ABSOLUTE_COORDS_ATTR = "absoluteCoordinates";
55 | 
56 |   /** Delimiter to use for splitting OCR information from the tokens **/
57 |   private final char delimiter;
58 |   /** Encoder that turns the delimited OCR information string into the binary payload **/
59 |   private final OcrInfoEncoder encoder;
60 |   /** Reads the filter options from {@code args}; rejects an odd coordinateBits value and unknown parameters. **/
61 |   public DelimitedOcrInfoPayloadTokenFilterFactory(Map<String, String> args) {
62 |     super(args);
63 |     delimiter = getChar(args, DELIMITER_ATTR, '|');
64 | 
65 |     /* Number of bits to use for encoding position information */
66 |     final int coordinateBits = getInt(args, COORD_BITS_ATTR, 10);
67 |     final int pageBits = getInt(args, PAGE_BITS_ATTR, 0);
68 |     final int lineBits = getInt(args, LINE_BITS_ATTR, 0);
69 |     final int wordBits = getInt(args, WORD_BITS_ATTR, 0);
70 |     final boolean absoluteCoordinates = getBoolean(args, ABSOLUTE_COORDS_ATTR, false);
71 |     // Four coordinates are stored; their combined width is a multiple of 8 exactly when coordinateBits is even
72 |     int coordWidth = coordinateBits * 4;
73 |     int remainder = coordWidth % 8;
74 |     if (remainder != 0) {
75 |       throw new IllegalArgumentException("coordinateBits must be an even number.");
76 |     }
77 |     int bitSum = coordWidth + pageBits + lineBits + wordBits;
78 |     remainder = bitSum % 8;
79 |     if (remainder != 0) {
80 |       LOGGER.warn("Final payload size {} is not divisible by 8, will be padded. This is wasting {} bits, try playing "
81 |               + "with the wordBits, lineBits and/or pageBits options.", bitSum, 8 - remainder);
82 |     }
83 |     encoder = new OcrInfoEncoder(coordinateBits, wordBits, lineBits, pageBits, absoluteCoordinates);
84 |     if (!args.isEmpty()) {
85 |       throw new IllegalArgumentException("Unknown parameters: " + args);
86 |     }
87 |   }
88 |   /** Wraps {@code input} in a {@link DelimitedPayloadTokenFilter} using the configured delimiter and encoder. **/
89 |   @Override
90 |   public TokenStream create(TokenStream input) {
91 |     return new DelimitedPayloadTokenFilter(input, delimiter, encoder);
92 |   }
93 | }
94 | 


--------------------------------------------------------------------------------
/src/test/java/de/digitalcollections/lucene/analysis/payloads/OcrInfoTest.java:
--------------------------------------------------------------------------------
 1 | package de.digitalcollections.lucene.analysis.payloads;
 2 | 
 3 | import org.junit.jupiter.api.Test;
 4 | import org.junit.jupiter.params.ParameterizedTest;
 5 | import org.junit.jupiter.params.provider.Arguments;
 6 | import org.junit.jupiter.params.provider.MethodSource;
 7 | 
 8 | import java.util.stream.Stream;
 9 | 
10 | import static de.digitalcollections.lucene.analysis.payloads.TestUtils.toChars;
11 | import static org.assertj.core.api.Assertions.assertThat;
12 | import static org.assertj.core.api.Assertions.assertThatThrownBy;
13 | 
14 | public class OcrInfoTest {
15 |   public static Stream<Arguments> fixtureProvider() {
16 |     return Stream.of(
17 |         Arguments.of(new OcrInfo(27, .131f, .527f, .879f, .053f), "p:27,x:13.1,y:52.7,w:87.9,h:5.3"),
18 |         Arguments.of(new OcrInfo(.131f, .527f, .879f, .053f), "x:13.1,y:52.7,w:87.9,h:5.3"),
19 |         Arguments.of(new OcrInfo(-1, 50, .123f, .456f, .789f, .091f), "l:50,x:12.3,y:45.6,w:78.9,h:9.1"),
20 |         Arguments.of(new OcrInfo(123,456,.123f,.456f, .234f, .456f), "p:123,l:456,x:12.3,y:45.6,w:23.4,h:45.6"),
21 |         Arguments.of(new OcrInfo( 123, 456, 511, .123f, .234f, .345f, .456f), "p:123,l:456,n:511,x:12.3,y:23.4,w:34.5,h:45.6"),
22 |         Arguments.of(new OcrInfo(123, -1, 456, .123f, .234f, .345f, .456f), "p:123,n:456,x:12.3,y:23.4,w:34.5,h:45.6"),
23 |         Arguments.of(new OcrInfo( 123, 456, 511, .1234f, .2345f, .3456f, .4567f), "p:123,l:456,n:511,x:12.34,y:23.45,w:34.56,h:45.67"),
24 |         Arguments.of(new OcrInfo(123, 456, 511, 768, 1024, 2048, 4095),
25 |                      "p:123,l:456,n:511,x:768,y:1024,w:2048,h:4095")
26 |     );
27 |   }
28 | 
29 |   @ParameterizedTest
30 |   @MethodSource("fixtureProvider")
31 |   public void parseFromBeginning(OcrInfo info, String payload) {
32 |     char[] buf = toChars(payload);
33 |     OcrInfo parsed = OcrInfo.parse(
34 |         buf, 0, payload.length(),
35 |         info.getWordIndex() > 0 ? 9 : 0, info.getLineIndex() > 0 ? 11 : 0, info.getPageIndex() > 0 ? 12 : 0,
36 |         12,
37 |         info.getHasAbsoluteCoordinates());
38 |     assertThat(parsed).isEqualToComparingFieldByField(info);
39 |   }
40 | 
41 |   @ParameterizedTest
42 |   @MethodSource("fixtureProvider")
43 |   public void parseFromPosition(OcrInfo info, String payload) {
44 |     String padding = "someToken|";
45 |     String padded = padding + payload;
46 |     char[] buf = toChars(padded);
47 |     OcrInfo parsed = OcrInfo.parse(
48 |         buf, padding.length(), payload.length(),
49 |         info.getWordIndex() > 0 ? 9 : 0, info.getLineIndex() > 0 ? 11 : 0, info.getPageIndex() > 0 ? 12 : 0,
50 |         12,
51 |         info.getHasAbsoluteCoordinates());
52 |     assertThat(parsed).isEqualToComparingFieldByField(info);
53 |   }
54 | 
55 |   @Test
56 |   public void keysMustNotBeUsedMultipleTimes() {
57 |     String payload = "p:12,x:34.5,n:56,x:78.9,y:87.6,w:54.3,h:21";
58 |     assertThatThrownBy(() -> OcrInfo.parse(toChars(payload), 0, payload.length(), 9, 11, 12, 12, false))
59 |         .isInstanceOf(IllegalArgumentException.class)
60 |         .hasMessageContaining("Invalid payload p:12,x:34.5,n:56,x:78.9,y:87.6,w:54.3,h:21: duplicate key 'x'");
61 |   }
62 | 
63 |   @Test
64 |   public void catchOverFlow() {
65 |     String idxOverflow = "p:12,l:34,n:512,x:78.9,y:87.6,w:54.3,h:2.1";
66 |     assertThatThrownBy(() -> OcrInfo.parse(toChars(idxOverflow), 0, idxOverflow.length(), 9, 11, 12, 12, false))
67 |         .isInstanceOf(IllegalArgumentException.class)
68 |         .hasMessageContaining("512 for word needs more than 9 bits (valid values range from 0 to 511). Payload=p:12,l:34,n:512,x:78.9,y:87.6,w:54.3,h:2.1");
69 |     String coordOverFlow = "p:1,l:2,n:3,x:4096,y:2048,w:1024,h:512";
70 |     assertThatThrownBy(() -> OcrInfo.parse(toChars(coordOverFlow), 0, coordOverFlow.length(), 9, 11, 12, 12, true))
71 |         .isInstanceOf(IllegalArgumentException.class)
72 |         .hasMessageContaining("4096 for x needs more than 12 bits (valid values range from 0 to 4095). Payload=p:1,l:2,n:3,x:4096,y:2048,w:1024,h:512");
73 |   }
74 | 
75 |   @Test
76 |   public void missingParametersAreCaught() {
77 |     String missingLine = "p:12,n:56,x:78.9,y:87.6,w:54.3,h:2.1";
78 |     assertThatThrownBy(() -> OcrInfo.parse(toChars(missingLine), 0, missingLine.length(), 9, 11, 12, 12, false))
79 |         .isInstanceOf(IllegalArgumentException.class)
80 |         .hasMessageContaining("fix payload or set the 'lineBits' option to 0.");
81 |     String missingWord = "p:12,l:34,x:78.9,y:87.6,w:54.3,h:2.1";
82 |     assertThatThrownBy(() -> OcrInfo.parse(toChars(missingWord), 0, missingWord.length(), 9, 11, 12, 12, false))
83 |         .isInstanceOf(IllegalArgumentException.class)
84 |         .hasMessageContaining("fix payload or set the 'wordBits' option to 0.");
85 |     String missingPage = "l:34,n:56,x:78.9,y:87.6,w:54.3,h:2.1";
86 |     assertThatThrownBy(() -> OcrInfo.parse(toChars(missingPage), 0, missingPage.length(), 9, 11, 12, 12, false))
87 |         .isInstanceOf(IllegalArgumentException.class)
88 |         .hasMessageContaining("fix payload or set the 'pageBits' option to 0.");
89 |     String missingCoord = "p:12,l:34,n:56,x:78.9,y:87.6,w:54.3";
90 |     assertThatThrownBy(() -> OcrInfo.parse(toChars(missingCoord), 0, missingCoord.length(), 9, 11, 12, 12, false))
91 |         .isInstanceOf(IllegalArgumentException.class)
92 |         .hasMessageContaining("coordinates are missing from payload ");
93 |   }
94 | }


--------------------------------------------------------------------------------
/src/test/java/de/digitalcollections/solr/plugin/components/ocrhighlighting/OcrHighlightingTest.java:
--------------------------------------------------------------------------------
  1 | package de.digitalcollections.solr.plugin.components.ocrhighlighting;
  2 | 
  3 | import java.nio.file.Files;
  4 | import java.nio.file.Paths;
  5 | import org.apache.solr.SolrTestCaseJ4;
  6 | import org.apache.solr.handler.component.SearchComponent;
  7 | import org.junit.BeforeClass;
  8 | import org.junit.Test;
  9 | 
 10 | public class OcrHighlightingTest extends SolrTestCaseJ4 {
 11 |   @BeforeClass
 12 |   public static void beforeClass() throws Exception {
 13 |     initCore("solrconfig.xml", "schema.xml", "src/test/resources/solr", "alldata");
 14 | 
 15 |     // The highlighting component should be active
 16 |     SearchComponent highlighter = h.getCore().getSearchComponent("ocr_highlight");
 17 |     assertTrue("wrong highlighter: " + highlighter.getClass(),
 18 |         highlighter instanceof OcrHighlighting);
 19 | 
 20 |     String ocrText = String.join(" ", Files
 21 |         .readAllLines(Paths.get(OcrHighlighting.class.getResource("/data/ocrtext_full.txt").toURI())));
 22 |     assertU(adoc("ocr_text", "two|p:27,l:13,n:24,x:12.3,y:43.2,w:54.3,h:65.4, one|p:28,l:27,n:64,x:65.4,y:54.3,w:43.2,h:32.1", "id", "101"));
 23 |     assertU(adoc("ocr_text", "three|p:28,l:14,n:25,x:12.7,y:48.2,w:54.9,h:65.4, two|p:29,l:27,n:64,x:65.4,y:54.3,w:43.1,h:34.1, five|p:30,l:17,n:80,x:0,y:0,w:0,h:0, "
 24 |         + "four|p:31,l:32,n:33,x:11.1,y:11.1,w:11.1,h:11.1", "id", "102"));
 25 |     assertU(adoc("ocr_text", ocrText, "id", "103"));
 26 | 
 27 |     // Test with a dynamic field
 28 |     assertU(adoc("body_ocr", "one|p:42,l:13,n:55,x:11.1,y:22.2,w:33.3,h:44.4, two|p:42,l:13,n:66,x:55.5,y:66.6,w:77.7,h:88.8", "id", "106"));
 29 | 
 30 |     assertU(commit());
 31 |   }
 32 | 
 33 |   @Test
 34 |   public void testSingleQueryTerm() {
 35 |     assertQ(
 36 |         "single query term",
 37 |         req("q", "two", "sort", "id asc", "ocr_hl", "true","ocr_hl.fields", "ocr_text", "df", "ocr_text"),
 38 |         "count(//lst[@name='ocr_highlighting']/*)=2",
 39 |         "//lst[@name='ocr_highlighting']/lst[@name='101']/arr[@name='ocr_text']/lst[1]/int[@name='page']='27'",
 40 |         "count(//lst[@name='ocr_highlighting']/lst[@name='101']/arr[@name='ocr_text']/lst)=number('1')",
 41 |         "//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst[1]/int[@name='page']='29'",
 42 |         "count(//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst)=number('1')");
 43 |   }
 44 | 
 45 |   @Test
 46 |   public void testMultipleQueryTerms() {
 47 |     assertQ(
 48 |         "multiple query terms",
 49 |         req("q", "five four", "sort", "id asc", "ocr_hl", "true","ocr_hl.fields", "ocr_text", "df", "ocr_text"),
 50 |         "count(//lst[@name='ocr_highlighting']/*)=1",
 51 |         "count(//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst)=number('2')",
 52 |         "//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst[1]/int[@name='page']='30'",
 53 |         "//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst[2]/int[@name='page']='31'");
 54 | 
 55 |   }
 56 | 
 57 |   @Test
 58 |   public void testMultipleFuzzyQueryTerms() {
 59 |     assertQ(
 60 |         "multiple fuzzy query terms",
 61 |         req("q", "fives fours", "sort", "id asc", "ocr_hl", "true","ocr_hl.fields", "ocr_text", "df", "ocr_text"),
 62 |         "count(//lst[@name='ocr_highlighting']/*)=1",
 63 |         "count(//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst)=number('2')",
 64 |         "//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst[1]/int[@name='page']='30'",
 65 |         "//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst[1]/str[@name='term']='five'",
 66 |         "//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst[2]/int[@name='page']='31'",
 67 |         "//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst[2]/str[@name='term']='four'");
 68 |   }
 69 | 
 70 |   @Test
 71 |   public void testLimitHighlightsPerDoc() {
 72 |     assertQ(
 73 |         "limit number of highlights per document",
 74 |         req("q", "und", "sort", "id asc", "ocr_hl", "true","ocr_hl.fields", "ocr_text", "ocr_hl.maxPerDoc", "5", "df",
 75 |             "ocr_text"),
 76 |         "count(//lst[@name='ocr_highlighting']/lst[@name='103']/arr[@name='ocr_text']/lst)=number('5')");
 77 |   }
 78 | 
 79 |   @Test
 80 |   public void testLimitHighlightsPerPage() {
 81 |     assertQ(
 82 |         "limit number of highlights per page",
 83 |         req("q", "und", "sort", "id asc", "ocr_hl", "true","ocr_hl.fields", "ocr_text", "ocr_hl.maxPerPage", "5", "df",
 84 |             "ocr_text"),
 85 |         "count(//lst[@name='ocr_highlighting']/lst[@name='103']/arr[@name='ocr_text']/lst[int[@name='page']='183'])=number('5')");
 86 |   }
 87 | 
 88 |   @Test
 89 |   public void testDynamicField() {
 90 |     assertQ(
 91 |       "Dynamic field contains term with page number and word position",
 92 |       req("q", "one two", "sort", "id asc", "ocr_hl", "true", "ocr_hl.fields", "body_ocr", "df", "body_ocr"),
 93 |         "count(//lst[@name='ocr_highlighting']/*)=1",
 94 |         "count(//lst[@name='ocr_highlighting']/lst[@name='106']/arr[@name='body_ocr']/lst)=number('2')",
 95 |         "(//lst[@name='ocr_highlighting']/lst[@name='106']/arr[@name='body_ocr']/lst)[1]/int[@name='page']='42'",
 96 |         "(//lst[@name='ocr_highlighting']/lst[@name='106']/arr[@name='body_ocr']/lst)[1]/int[@name='word']='55'",
 97 |         "(//lst[@name='ocr_highlighting']/lst[@name='106']/arr[@name='body_ocr']/lst)[1]/int[@name='line']='13'",
 98 |         "(//lst[@name='ocr_highlighting']/lst[@name='106']/arr[@name='body_ocr']/lst)[2]/int[@name='page']='42'",
 99 |         "(//lst[@name='ocr_highlighting']/lst[@name='106']/arr[@name='body_ocr']/lst)[2]/int[@name='line']='13'",
100 |         "(//lst[@name='ocr_highlighting']/lst[@name='106']/arr[@name='body_ocr']/lst)[2]/int[@name='word']='66'"
101 |     );
102 |   }
103 | }
104 | 


--------------------------------------------------------------------------------
/src/main/java/de/digitalcollections/lucene/analysis/payloads/OcrPayloadHelper.java:
--------------------------------------------------------------------------------
  1 | package de.digitalcollections.lucene.analysis.payloads;
  2 | 
  3 | import com.google.common.math.IntMath;
  4 | import java.math.BigInteger;
  5 | import java.util.Arrays;
  6 | import org.apache.lucene.util.BytesRef;
  7 | 
  8 | /** Helper class to decode and encode OCR information from/into an efficient binary representation. **/
  9 | public class OcrPayloadHelper {
 10 | 
 11 |   private OcrPayloadHelper() {
 12 |     // Cannot be instantiated, is only here for the static methods
 13 |   }
 14 | 
 15 |   /**
 16 |    * Encode a {@link OcrInfo} object into a byte array.
 17 |    *
 18 |    * If the coordinates are set to be stored as relative (i.e. percentage values), we first scale the bounding box
 19 |    * coordinates according to `precision`. We then pack the complete information into `coordBits * 4` bits. The bit
 20 |    * packing is done to save as much space as possible, while scaling is used to still maintain as much precision as
 21 |    * possible.
 22 |    *
 23 |    * Optionally, we also store word, line and page indices if the corresponding option
 24 |    * (`wordBits`, `lineBits`, `pageBits`) is non-zero.
 25 |    *
 26 |    * Here an example with page, line and word indices, relative coordinates and 10 bits per coordinate value:
 27 |    *
 28 |    * **Input:**
 29 |    * ```
 30 |    * info = OcrInfo(pageIndex=837, lineIndex=13, wordIndex=20, horizontalOffset=0.136838387,
 31 |    *                verticalOffset=0.477909823, width=0.978231258, height=0.532390081),
 32 |    * coordBits = 10
 33 |    * wordBits  = 9
 34 |    * lineBits  = 11
 35 |    * pageBits  = 12
 36 |    * absoluteCoordinates = false
 37 |    * ```
 38 |    *
 39 |    * Since we are using 10 bits for each of the four coordinates, 9 bits for the word index, 11 for the line index and
 40 |    * 12 for the page index, the resulting binary representation will have 72 bits (`4 * 10 + 9 + 11 + 12`) or 9 bytes.
 41 |    * This is very space-efficient compared to a string-based encoding, e.g. `x136y478w978h532n20l13p837`, which is
 42 |    * 26 bytes.
 43 |    *
 44 |    *
 45 |    * **Output:**
 46 |    * ```
 47 |    * <pre>{@code
 48 |    * field     | width  |        scaled value        | binary representation
 49 |    * ========================================================================
 50 |    * pageIndex | 12bit  |                        837 | 001101000101
 51 |    * lineIndex | 11bit  |                         13 |  00000001101
 52 |    * wordIndex |  9bit  |                         20 |    000010100
 53 |    * x         | 10bit  | 0.136838387 * 2^10 ~>  140 |   0010001100
 54 |    * y         | 10bit  | 0.477909823 * 2^10 ~>  489 |   0111101001
 55 |    * width     | 10bit  | 0.978231258 * 2^10 ~> 1002 |   1111101010
 56 |    * height    | 10bit  | 0.532390081 * 2^10 ~>  545 |   1000100001
 57 |    * }</pre>
 58 |    * ```
 59 |    *
 60 |    * The resulting byte sequence is as follows (bytes are separated by whitespace):
 61 |    * ```
 62 |    *    pageIndex | lineIndex  | wordIndex|      x     |     y     |     w     |    h
 63 |    * 00110100 0101|0000 0001101|0 00010100| 00100011 00|011110 1001|1111 101010|10 00100001
 64 |    *     0x34      0x50      0x1A     0x14      0x23      0x1E      0x9F      0xAA     0x21
 65 |    * ```
 66 |    *
 67 |    * @param info                The {@link OcrInfo} to encode
 68 |    * @param coordBits The number of bits to encode each OCR coordinate value into
 69 |    * @param wordBits  The number of bits to encode the word index into
 70 |    * @param lineBits  The number of bits to encode the line index into
 71 |    * @param pageBits  The number of bits to encode the page index into
 72 |    * @return                    The resulting byte payload
 73 |    */
 74 |   public static byte[] encodeOcrInfo(OcrInfo info, int coordBits, int wordBits, int lineBits, int pageBits) {
 75 |     // To make bit-fiddling easier, all values are packed into a BigInteger, most significant field (page) first
 76 |     int numBitsTotal = getOutputSize(coordBits, wordBits, lineBits, pageBits);
 77 |     int outSize = (int) Math.ceil((double) numBitsTotal / 8.0);
 78 |     BigInteger encoded = new BigInteger(new byte[outSize]);
 79 | 
 80 |     if (pageBits > 0) {
 81 |       encoded = encoded.or(BigInteger.valueOf(info.getPageIndex()));
 82 |     }
 83 |     if (lineBits > 0) {
 84 |       encoded = encoded.shiftLeft(lineBits)
 85 |               .or(BigInteger.valueOf(info.getLineIndex()));
 86 |     }
 87 |     if (wordBits > 0) {
 88 |       encoded = encoded.shiftLeft(wordBits)
 89 |               .or(BigInteger.valueOf(info.getWordIndex()));
 90 |     }
 91 |     if (info.getHasAbsoluteCoordinates()) {
 92 |       // Absolute coordinates are truncated to integers and stored as they are, after a range check
 93 |       encoded = encoded
 94 |               .shiftLeft(coordBits)
 95 |               .or(BigInteger.valueOf(verifyAbsoluteValue((int) info.getHorizontalOffset(), coordBits)))
 96 |               .shiftLeft(coordBits)
 97 |               .or(BigInteger.valueOf(verifyAbsoluteValue((int) info.getVerticalOffset(), coordBits)))
 98 |               .shiftLeft(coordBits)
 99 |               .or(BigInteger.valueOf(verifyAbsoluteValue((int) info.getWidth(), coordBits)))
100 |               .shiftLeft(coordBits)
101 |               .or(BigInteger.valueOf(verifyAbsoluteValue((int) info.getHeight(), coordBits)));
102 |     } else {
103 |       // Relative coordinates (fractions between 0 and 1) are scaled to integers of `coordBits` bits
104 |       encoded = encoded
105 |               .shiftLeft(coordBits)
106 |               .or(BigInteger.valueOf(encodeValue(info.getHorizontalOffset(), coordBits)))
107 |               .shiftLeft(coordBits)
108 |               .or(BigInteger.valueOf(encodeValue(info.getVerticalOffset(), coordBits)))
109 |               .shiftLeft(coordBits)
110 |               .or(BigInteger.valueOf(encodeValue(info.getWidth(), coordBits)))
111 |               .shiftLeft(coordBits)
112 |               .or(BigInteger.valueOf(encodeValue(info.getHeight(), coordBits)));
113 |     }
114 |     byte[] out = encoded.toByteArray();
115 | 
116 |     // BigInteger#toByteArray emits a two's-complement array, which gains an extra leading sign byte as soon as
117 |     // the topmost payload bit is set. Strip exactly the surplus bytes, but never any that carry data.
118 |     if (out.length > outSize) {
119 |       int surplus = out.length - outSize;
120 |       for (int i = 0; i < surplus; i++) {
121 |         if (out[i] != 0) {
122 |           // Only happens if a value is negative or too large for its configured bit width
123 |           throw new IllegalArgumentException(String.format(
124 |                   "Encoded OCR information does not fit into %d bits, check the values and bit widths.",
125 |                   numBitsTotal));
126 |         }
127 |       }
128 |       // What remains are the low-order `outSize` bytes, which hold the complete payload
129 |       out = Arrays.copyOfRange(out, surplus, out.length);
130 |     }
131 |     return out;
132 |   }
133 | 
134 |   private static int verifyAbsoluteValue(int value, int coordBits) {
135 |     if (value < 0 || value >= IntMath.pow(2, coordBits)) { // negative values would corrupt the other bit fields
136 |       throw new IllegalArgumentException(String.format(
137 |               "Value %d exceeds legal range of %d bits (0 to %d).", value, coordBits, IntMath.pow(2, coordBits) - 1));
138 |     }
139 |     return value;
140 |   }
141 | 
142 |   /** Compute the total number of payload bits required for the given bit widths. **/
143 |   private static int getOutputSize(int coordBits, int wordBits, int lineBits, int pageBits) {
144 |     // The four coordinate values (x, y, width, height) are always part of the payload
145 |     int totalBits = 4 * coordBits;
146 | 
147 |     // An index only takes up space if it is enabled, i.e. if its bit width is positive;
148 |     // clamping at zero ignores non-positive widths
149 |     totalBits += Math.max(pageBits, 0);
150 |     totalBits += Math.max(lineBits, 0);
151 |     totalBits += Math.max(wordBits, 0);
152 | 
153 |     // Note that this is a size in bits, callers have to round up to whole bytes themselves
154 |     return totalBits;
155 |   }
156 | 
157 |   /** Encode a floating point value (between 0 and 1) to an integer that fits into `numBits` bits. **/
158 |   private static int encodeValue(float source, int numBits) {
159 |     // Values close to 1.0 round up to 2^numBits, which needs one bit too many and would spill into the
160 |     // neighbouring field, so the result is capped at the largest value that fits (2^numBits - 1)
161 |     return (int) Math.min(Math.round(source * Math.pow(2, numBits)), (1L << numBits) - 1);
162 |   }
163 | 
164 |   /** Map an integer that was encoded with `numBits` bits back to a floating point value by dividing
165 |    *  it by 2^numBits, which undoes the scaling applied during encoding. **/
166 |   private static float decodeValue(long source, int numBits) {
167 |     double scale = Math.pow(2, numBits);
168 |     return (float) (source / scale);
169 |   }
170 | 
171 |   /** Build a mask with the `numBits` least significant bits set (i.e. 2^numBits - 1), which is used
172 |    *  to isolate a single bit field from the packed payload. **/
173 |   private static BigInteger makeBitMask(int numBits) {
174 |     int lowBitsSet = IntMath.pow(2, numBits) - 1;
175 |     return BigInteger.valueOf(lowBitsSet);
176 |   }
177 | 
178 |   /**
179 |    * Decode an {@link OcrInfo} instance from the encoded byte array.
180 |    *
181 |    * The bytes are read as an unsigned big-endian integer, the bit fields are then extracted from low to high.
182 |    *
183 |    * @param data                Buffer with encoded binary OCR information
184 |    * @param coordBits           Number of bits the OCR information was encoded with
185 |    * @param wordBits            Number of bits the word index was encoded with
186 |    * @param lineBits            Number of bits the line index was encoded with
187 |    * @param pageBits            Number of bits the page index was encoded with
188 |    * @param absoluteCoordinates Whether the coordinates are stored absolute or relative (percent-values)
189 |    * @return The decoded {@link OcrInfo} instance
190 |    */
191 |   public static OcrInfo decodeOcrInfo(BytesRef data, int coordBits, int wordBits, int lineBits, int pageBits,
192 |           boolean absoluteCoordinates) {
193 |     OcrInfo info = new OcrInfo();
194 |     info.setHasAbsoluteCoordinates(absoluteCoordinates);
195 |     // The encoder strips the sign byte, so the payload must be read as a plain magnitude (signum 1). Otherwise
196 |     // a payload starting with a set bit would be negative and the unmasked page index would come out negative.
197 |     byte[] payload = Arrays.copyOfRange(data.bytes, data.offset, data.offset + data.length);
198 |     BigInteger encoded = new BigInteger(1, payload);
199 |     BigInteger coordMask = BigInteger.valueOf(IntMath.pow(2, coordBits) - 1);
200 |     // The coordinates occupy the lowest bits, the value that was encoded last (height) comes first
201 |     int height = encoded.and(coordMask).intValue();
202 |     int width = encoded.shiftRight(coordBits).and(coordMask).intValue();
203 |     int verticalOffset = encoded.shiftRight(coordBits * 2).and(coordMask).intValue();
204 |     int horizontalOffset = encoded.shiftRight(coordBits * 3).and(coordMask).intValue();
205 |     if (absoluteCoordinates) {
206 |       info.setHeight(height);
207 |       info.setWidth(width);
208 |       info.setVerticalOffset(verticalOffset);
209 |       info.setHorizontalOffset(horizontalOffset);
210 |     } else {
211 |       info.setHeight(decodeValue(height, coordBits));
212 |       info.setWidth(decodeValue(width, coordBits));
213 |       info.setVerticalOffset(decodeValue(verticalOffset, coordBits));
214 |       info.setHorizontalOffset(decodeValue(horizontalOffset, coordBits));
215 |     }
216 | 
217 |     // The enabled indices sit above the coordinates, in the order word, line, page
218 |     int shift = coordBits * 4;
219 |     if (wordBits > 0) {
220 |       info.setWordIndex(encoded.shiftRight(shift).and(makeBitMask(wordBits)).intValue());
221 |       shift += wordBits;
222 |     }
223 |     if (lineBits > 0) {
224 |       info.setLineIndex(encoded.shiftRight(shift).and(makeBitMask(lineBits)).intValue());
225 |       shift += lineBits;
226 |     }
227 |     if (pageBits > 0) {
228 |       info.setPageIndex(encoded.shiftRight(shift).intValue());
229 |     }
230 | 
231 |     return info;
232 |   }
233 | }
234 | 


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <?xml version='1.0' encoding='UTF-8'?>
  2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  3 |   <modelVersion>4.0.0</modelVersion>
  4 | 
  5 |   <artifactId>solr-ocrpayload-plugin</artifactId>
  6 |   <groupId>de.digitalcollections.search</groupId>
  7 |   <version>0.2.2-SNAPSHOT</version>
  8 |   <packaging>jar</packaging>
  9 |   <name>Solr OCR Coordinate Payload Plugin</name>
 10 |   <description>
 11 |     Efficient indexing and bounding-box "highlighting" for OCR text
 12 |   </description>
 13 |   <url>https://github.com/dbmdz/solr-ocrpayload-plugin</url>
 14 |   <licenses>
 15 |     <license>
 16 |       <name>MIT License</name>
 17 |       <url>https://github.com/dbmdz/solr-ocrpayload-plugin/blob/master/LICENSE</url>
 18 |       <distribution>repo</distribution>
 19 |     </license>
 20 |   </licenses>
 21 | 
 22 |   <developers>
 23 |     <developer>
 24 |       <name>Johannes Baiter</name>
 25 |       <email>johannes.baiter@bsb-muenchen.de</email>
 26 |       <id>jbaiter</id>
 27 |     </developer>
 28 |     <developer>
 29 |       <name>Christoph Lorenz</name>
 30 |       <email>christoph.lorenz@bsb-muenchen.de</email>
 31 |       <id>clorenz</id>
 32 |     </developer>
 33 |   </developers>
 34 | 
 35 |   <ciManagement>
 36 |     <url>https://travis-ci.org/dbmdz/solr-ocrpayload-plugin</url>
 37 |     <system>Travis CI</system>
 38 |   </ciManagement>
 39 | 
 40 |   <issueManagement>
 41 |     <url>https://github.com/dbmdz/solr-ocrpayload-plugin/issues</url>
 42 |     <system>GitHub Issues</system>
 43 |   </issueManagement>
 44 | 
 45 |   <scm>
 46 |     <connection>https://github.com/dbmdz/solr-ocrpayload-plugin.git</connection>
 47 |     <developerConnection>git@github.com:dbmdz/solr-ocrpayload-plugin.git</developerConnection>
 48 |     <url>https://github.com/dbmdz/solr-ocrpayload-plugin</url>
 49 |   </scm>
 50 | 
 51 |   <properties>
 52 |     <java.version>1.8</java.version>
 53 |     <maven.compiler.source>1.8</maven.compiler.source>
 54 |     <maven.compiler.target>1.8</maven.compiler.target>
 55 |     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 56 | 
 57 |     <version.assertj>3.12.1</version.assertj>
 58 |     <version.assertj-json>1.2.0</version.assertj-json>
 59 |     <version.junit>5.3.2</version.junit>
 60 |     <version.log4j>2.11.1</version.log4j>
 61 |     <version.slf4j>1.7.25</version.slf4j>
 62 |     <version.solr>7.5.0</version.solr>
 63 | 
 64 |     <version.jacoco-maven-plugin>0.8.3</version.jacoco-maven-plugin>
 65 |     <version.maven-checkstyle-plugin>3.0.0</version.maven-checkstyle-plugin>
 66 |     <version.maven-compiler-plugin>3.8.0</version.maven-compiler-plugin>
 67 |     <version.maven-jar-plugin>3.1.1</version.maven-jar-plugin>
 68 |     <version.maven-javadoc-plugin>3.1.0</version.maven-javadoc-plugin>
 69 |     <version.maven-source-plugin>3.0.1</version.maven-source-plugin>
 70 |     <version.maven-surefire-plugin>2.22.1</version.maven-surefire-plugin>
 71 |     <version.nexus-staging-maven-plugin>1.6.8</version.nexus-staging-maven-plugin>
 72 |   </properties>
 73 | 
 74 |   <dependencies>
 75 |     <dependency>
 76 |       <groupId>com.revinate</groupId>
 77 |       <artifactId>assertj-json</artifactId>
 78 |       <version>${version.assertj-json}</version>
 79 |       <scope>test</scope>
 80 |     </dependency>
 81 |     <dependency>
 82 |       <groupId>org.apache.logging.log4j</groupId>
 83 |       <artifactId>log4j-core</artifactId>
 84 |       <version>${version.log4j}</version>
 85 |     </dependency>
 86 |     <dependency>
 87 |       <groupId>org.apache.solr</groupId>
 88 |       <artifactId>solr-core</artifactId>
 89 |       <version>${version.solr}</version>
 90 |       <scope>compile</scope>
 91 |     </dependency>
 92 |     <dependency>
 93 |       <groupId>org.apache.solr</groupId>
 94 |       <artifactId>solr-test-framework</artifactId>
 95 |       <version>${version.solr}</version>
 96 |       <scope>test</scope>
 97 |     </dependency>
 98 |     <dependency>
 99 |       <groupId>org.assertj</groupId>
100 |       <artifactId>assertj-core</artifactId>
101 |       <version>${version.assertj}</version>
102 |       <scope>test</scope>
103 |     </dependency>
104 |     <dependency>
105 |       <groupId>org.junit.jupiter</groupId>
106 |       <artifactId>junit-jupiter-api</artifactId>
107 |       <version>${version.junit}</version>
108 |       <scope>test</scope>
109 |     </dependency>
110 |     <dependency>
111 |       <groupId>org.junit.jupiter</groupId>
112 |       <artifactId>junit-jupiter-engine</artifactId>
113 |       <version>${version.junit}</version>
114 |       <scope>test</scope>
115 |     </dependency>
116 |     <dependency>
117 |       <groupId>org.junit.jupiter</groupId>
118 |       <artifactId>junit-jupiter-params</artifactId>
119 |       <version>${version.junit}</version>
120 |       <scope>test</scope>
121 |     </dependency>
122 |     <dependency>
123 |       <groupId>org.junit.vintage</groupId>
124 |       <artifactId>junit-vintage-engine</artifactId>
125 |       <version>${version.junit}</version>
126 |     </dependency>
127 |     <dependency>
128 |       <groupId>org.slf4j</groupId>
129 |       <artifactId>slf4j-api</artifactId>
130 |       <version>${version.slf4j}</version>
131 |     </dependency>
132 |     <dependency>
133 |       <groupId>org.slf4j</groupId>
134 |       <artifactId>slf4j-nop</artifactId>
135 |       <version>${version.slf4j}</version>
136 |       <scope>test</scope>
137 |     </dependency>
138 |   </dependencies>
139 | 
140 |   <build>
141 |     <plugins>
142 |       <plugin>
143 |         <groupId>org.apache.maven.plugins</groupId>
144 |         <artifactId>maven-checkstyle-plugin</artifactId>
145 |         <version>${version.maven-checkstyle-plugin}</version>
146 |         <executions>
147 |           <execution>
148 |             <id>validate</id>
149 |             <phase>validate</phase>
150 |             <configuration>
151 |               <configLocation>https://raw.githubusercontent.com/dbmdz/development/master/code-quality/checkstyle.xml</configLocation>
152 |               <encoding>UTF-8</encoding>
153 |               <consoleOutput>true</consoleOutput>
154 |               <failsOnError>true</failsOnError>
155 |               <linkXRef>false</linkXRef>
156 |             </configuration>
157 |             <goals>
158 |               <goal>check</goal>
159 |             </goals>
160 |           </execution>
161 |         </executions>
162 |       </plugin>
163 |       <plugin>
164 |         <groupId>org.apache.maven.plugins</groupId>
165 |         <artifactId>maven-compiler-plugin</artifactId>
166 |         <version>${version.maven-compiler-plugin}</version>
167 |         <configuration>
168 |           <source>1.8</source>
169 |           <target>1.8</target>
170 |         </configuration>
171 |       </plugin>
172 |       <plugin>
173 |         <groupId>org.apache.maven.plugins</groupId>
174 |         <artifactId>maven-jar-plugin</artifactId>
175 |         <version>${version.maven-jar-plugin}</version>
176 |         <configuration>
177 |           <archive>
178 |             <manifest>
179 |               <addDefaultImplementationEntries>true</addDefaultImplementationEntries>
180 |               <addDefaultSpecificationEntries>true</addDefaultSpecificationEntries>
181 |             </manifest>
182 |           </archive>
183 |         </configuration>
184 |       </plugin>
185 |       <plugin>
186 |         <groupId>org.apache.maven.plugins</groupId>
187 |         <artifactId>maven-javadoc-plugin</artifactId>
188 |         <version>${version.maven-javadoc-plugin}</version>
189 |         <configuration>
190 |           <source>8</source>
191 |         </configuration>
192 |         <executions>
193 |           <execution>
194 |             <id>attach-javadocs</id>
195 |             <goals>
196 |               <goal>jar</goal>
197 |             </goals>
198 |           </execution>
199 |         </executions>
200 |       </plugin>
201 |       <plugin>
202 |         <groupId>org.apache.maven.plugins</groupId>
203 |         <artifactId>maven-source-plugin</artifactId>
204 |         <version>${version.maven-source-plugin}</version>
205 |         <executions>
206 |           <execution>
207 |             <id>attach-sources</id>
208 |             <goals>
209 |               <goal>jar-no-fork</goal>
210 |             </goals>
211 |           </execution>
212 |         </executions>
213 |       </plugin>
214 |       <plugin>
215 |         <groupId>org.apache.maven.plugins</groupId>
216 |         <artifactId>maven-surefire-plugin</artifactId>
217 |         <version>${version.maven-surefire-plugin}</version>
218 |         <configuration>
219 |           <systemPropertyVariables>
220 |             <java.security.egd>file:/dev/./urandom</java.security.egd>
221 |           </systemPropertyVariables>
222 |         </configuration>
223 |       </plugin>
224 |       <plugin>
225 |         <groupId>org.jacoco</groupId>
226 |         <artifactId>jacoco-maven-plugin</artifactId>
227 |         <version>${version.jacoco-maven-plugin}</version>
228 |         <executions>
229 |           <execution>
230 |             <id>pre-unit-test</id>
231 |             <goals>
232 |               <goal>prepare-agent</goal>
233 |             </goals>
234 |           </execution>
235 |           <execution>
236 |             <phase>test</phase>
237 |             <goals>
238 |               <goal>report</goal>
239 |             </goals>
240 |           </execution>
241 |         </executions>
242 |       </plugin>
243 |       <plugin>
244 |         <groupId>org.sonatype.plugins</groupId>
245 |         <artifactId>nexus-staging-maven-plugin</artifactId>
246 |         <version>${version.nexus-staging-maven-plugin}</version>
247 |         <extensions>true</extensions>
248 |         <configuration>
249 |           <serverId>ossrh</serverId>
250 |           <nexusUrl>https://oss.sonatype.org/</nexusUrl>
251 |           <autoReleaseAfterClose>true</autoReleaseAfterClose>
252 |         </configuration>
253 |       </plugin>
254 |     </plugins>
255 |   </build>
256 | 
257 |   <reporting>
258 |     <plugins>
259 |       <plugin>
260 |         <groupId>org.jacoco</groupId>
261 |         <artifactId>jacoco-maven-plugin</artifactId>
262 |         <version>${version.jacoco-maven-plugin}</version>
263 |       </plugin>
264 |     </plugins>
265 |   </reporting>
266 | 
267 |   <profiles>
268 |     <profile>
269 |       <id>deploy</id>
270 |       <build>
271 |         <plugins>
272 |           <plugin>
273 |             <groupId>org.apache.maven.plugins</groupId>
274 |             <artifactId>maven-gpg-plugin</artifactId>
275 |             <version>1.6</version>
276 |             <executions>
277 |               <execution>
278 |                 <id>sign-artifacts</id>
279 |                 <phase>verify</phase>
280 |                 <goals>
281 |                   <goal>sign</goal>
282 |                 </goals>
283 |               </execution>
284 |             </executions>
285 |           </plugin>
286 |         </plugins>
287 |       </build>
288 |     </profile>
289 |   </profiles>
290 | 
291 |   <distributionManagement>
292 |     <snapshotRepository>
293 |       <id>ossrh-snapshots</id>
294 |       <name>Sonatype Nexus Snapshots</name>
295 |       <url>https://oss.sonatype.org/content/repositories/snapshots</url>
296 |     </snapshotRepository>
297 |   </distributionManagement>
298 | 
299 |   <repositories>
300 |     <repository>
301 |       <id>ossrh-snapshots</id>
302 |       <name>Sonatype Nexus Snapshots</name>
303 |       <url>https://oss.sonatype.org/content/repositories/snapshots</url>
304 |       <snapshots>
305 |         <enabled>true</enabled>
306 |       </snapshots>
307 |       <releases>
308 |         <enabled>false</enabled>
309 |       </releases>
310 |     </repository>
311 |   </repositories>
312 | </project>
313 | 


--------------------------------------------------------------------------------
/src/main/java/de/digitalcollections/lucene/analysis/payloads/OcrInfo.java:
--------------------------------------------------------------------------------
  1 | package de.digitalcollections.lucene.analysis.payloads;
  2 | 
  3 | import com.google.common.math.IntMath;
  4 | import java.util.Comparator;
  5 | import java.util.HashSet;
  6 | import java.util.Set;
  7 | import java.util.regex.Matcher;
  8 | import java.util.regex.Pattern;
  9 | 
 10 | public class OcrInfo implements Comparable<OcrInfo> {
 11 | 
 12 |   private static final Pattern PAYLOAD_PAT = Pattern.compile("(\\D+):([0-9.]+),?");
 13 | 
 14 |   private boolean hasAbsoluteCoordinates = false;
 15 |   private float horizontalOffset = -1.0f;
 16 |   private float verticalOffset = -1.0f;
 17 |   private float width = -1.0f;
 18 |   private float height = -1.0f;
 19 |   private int pageIndex = -1;
 20 |   private int lineIndex = -1;
 21 |   private int wordIndex = -1;
 22 | 
 23 |   private String term; // optional, only when returning search results
 24 | 
 25 |   OcrInfo() {
 26 |     // Intentionally empty: all fields keep their defaults (-1 for coordinates and indices, relative coordinates)
 27 |   }
 28 | 
 29 |   public OcrInfo(int horizontalOffset, int verticalOffset, int width, int height) {
 30 |     this(-1, horizontalOffset, verticalOffset, width, height); // -1 leaves the page index unset
 31 |     this.setHasAbsoluteCoordinates(true);
 32 |   }
 33 | 
 34 |   public OcrInfo(int pageIndex, int horizontalOffset, int verticalOffset, int width, int height) {
 35 |     this(pageIndex, -1, -1, horizontalOffset, verticalOffset, width, height); // no line and word index
 36 |     this.setHasAbsoluteCoordinates(true);
 37 |   }
 38 | 
 39 |   public OcrInfo(int pageIndex, int lineIndex, int horizontalOffset, int verticalOffset, int width, int height) {
 40 |     this(pageIndex, lineIndex, -1, horizontalOffset, verticalOffset, width, height); // no word index
 41 |     this.setHasAbsoluteCoordinates(true);
 42 |   }
 43 | 
 44 |   public OcrInfo(int pageIndex, int lineIndex, int wordIndex, int horizontalOffset, int verticalOffset, int width, int height) {
 45 |     this.setHasAbsoluteCoordinates(true); // set first, so that the setters skip the check for relative values
 46 |     this.setHorizontalOffset(horizontalOffset);
 47 |     this.setVerticalOffset(verticalOffset);
 48 |     this.setWidth(width);
 49 |     this.setHeight(height);
 50 |     this.setPageIndex(pageIndex);
 51 |     this.setLineIndex(lineIndex);
 52 |     this.setWordIndex(wordIndex);
 53 |   }
 54 | 
 55 |   public OcrInfo(float horizontalOffset, float verticalOffset, float width, float height) {
 56 |     this(-1, horizontalOffset, verticalOffset, width, height); // -1 leaves the page index unset
 57 |   }
 58 | 
 59 |   public OcrInfo(int pageIndex, float horizontalOffset, float verticalOffset, float width, float height) {
 60 |     this.setHorizontalOffset(horizontalOffset);
 61 |     this.setVerticalOffset(verticalOffset); // the absolute flag is still false here, so values above 1 are rejected
 62 |     this.setWidth(width);
 63 |     this.setHeight(height);
 64 |     this.setPageIndex(pageIndex);
 65 |   }
 66 | 
 67 |   public OcrInfo(int pageIndex, int lineIndex, float horizontalOffset, float verticalOffset, float width, float height) {
 68 |     this(pageIndex, horizontalOffset, verticalOffset, width, height);
 69 |     this.lineIndex = lineIndex; // assigned directly, without going through the setter
 70 |   }
 71 | 
 72 |   public OcrInfo(int pageIndex, int lineIndex, int wordIndex, float horizontalOffset, float verticalOffset, float width, float height) {
 73 |     this(pageIndex, lineIndex, horizontalOffset, verticalOffset, width, height);
 74 |     this.wordIndex = wordIndex; // assigned directly, without going through the setter
 75 |   }
 76 | 
 77 |   /**
 78 |    * Parse an {@link OcrInfo} object from a character buffer.
 79 |    *
 80 |    * The string contains comma-separated pairs of single-character keys and numerical
 81 |    * values, e.g. `x:13.37`.
 82 |    *
 83 |    * Valid keys are:
 84 |    * - **p**: Page index, ranging from 0 to 2^pageBits - 1 (mandatory if `pageBits` is greater than 0)
 85 |    * - **l**: Line index, ranging from 0 to 2^lineBits - 1 (mandatory if `lineBits` is greater than 0)
 86 |    * - **n**: Word index, ranging from 0 to 2^wordBits - 1 (mandatory if `wordBits` is greater than 0)
 87 |    * - **x**: Horizontal offset as floating point percentage in range [0...100]
 88 |    *          OR absolute position as unsigned integer in range [0...2^coordBits - 1] (mandatory)
 89 |    * - **y**: Vertical offset as floating point percentage in range [0...100]
 90 |    *          OR absolute position as unsigned integer in range [0...2^coordBits - 1] (mandatory)
 91 |    * - **w**: Width as floating point percentage in range [0...100]
 92 |    *          OR absolute width as unsigned integer in range [0...2^coordBits - 1] (mandatory)
 93 |    * - **h**: Height as floating point percentage in range [0...100]
 94 |    *          OR absolute height as unsigned integer in range [0...2^coordBits - 1] (mandatory)
 95 |    *
 96 |    * Here is an example: `p:27,l:50,n:13,x:13.1,y:52.7,w:87.9,h:5.3`
 97 |    * or, with integral (absolute) coordinates: `p:27,l:50,n:13,x:131,y:527,w:879,h:53`
 98 |    *
 99 |    * @param buffer Input character buffer
100 |    * @param offset Offset of the encoded character information
101 |    * @param length Length of the encoded character information
102 |    * @param wordBits Number of bits used for encoding the word index
103 |    * @param lineBits Number of bits used for encoding the line index
104 |    * @param pageBits Number of bits used for encoding the page index
105 |    * @param coordBits Number of bits used for encoding the coordinates
106 |    * @param absoluteCoordinates Whether the coordinates are stored absolute or relative (percent-values)
107 |    * @return The decoded {@link OcrInfo} instance
108 |    */
109 |   public static OcrInfo parse(char[] buffer, int offset, int length, int wordBits, int lineBits, int pageBits,
110 |           int coordBits, boolean absoluteCoordinates) {
111 |     OcrInfo info = new OcrInfo();
112 |     info.setHasAbsoluteCoordinates(absoluteCoordinates);
113 | 
114 |     String payload = new String(buffer, offset, length).toLowerCase();
115 |     Matcher m = PAYLOAD_PAT.matcher(payload);
116 |     Set<Character> seenKeys = new HashSet<>();
117 |     while (m.find()) {
118 |       char key = m.group(1).charAt(0);
119 |       // Set#add returns false if the key was already present, i.e. if it occurs twice in the payload
120 |       if (!seenKeys.add(key)) {
121 |         throw new IllegalArgumentException(String.format("Invalid payload %s: duplicate key '%c'", payload, key));
122 |       }
123 |       String value = m.group(2);
124 |       switch (key) {
125 |         case 'p':
126 |           info.setPageIndex(parseIntValue(value, pageBits, "page", payload));
127 |           break;
128 |         case 'l':
129 |           info.setLineIndex(parseIntValue(value, lineBits, "line", payload));
130 |           break;
131 |         case 'n':
132 |           info.setWordIndex(parseIntValue(value, wordBits, "word", payload));
133 |           break;
134 |         case 'x':
135 |           if (absoluteCoordinates) {
136 |             info.setHorizontalOffset(parseIntValue(value, coordBits, "x", payload));
137 |           } else {
138 |             info.setHorizontalOffset(Float.parseFloat(value) / 100f);
139 |           }
140 |           break;
141 |         case 'y':
142 |           if (absoluteCoordinates) {
143 |             info.setVerticalOffset(parseIntValue(value, coordBits, "y", payload));
144 |           } else {
145 |             info.setVerticalOffset(Float.parseFloat(value) / 100f);
146 |           }
147 |           break;
148 |         case 'w':
149 |           if (absoluteCoordinates) {
150 |             info.setWidth(parseIntValue(value, coordBits, "w", payload));
151 |           } else {
152 |             info.setWidth(Float.parseFloat(value) / 100f);
153 |           }
154 |           break;
155 |         case 'h':
156 |           if (absoluteCoordinates) {
157 |             info.setHeight(parseIntValue(value, coordBits, "h", payload));
158 |           } else {
159 |             info.setHeight(Float.parseFloat(value) / 100f);
160 |           }
161 |           break;
162 |         default:
163 |           throw new IllegalArgumentException(String.format(
164 |                   "Could not parse OCR bounding box information, string was %s, invalid character was %c",
165 |                   new String(buffer, offset, length), key));
166 |       }
167 |     }
168 |     // All coordinates default to -1, so a negative value means that the key was missing from the payload
169 |     if (info.getHorizontalOffset() < 0 || info.getVerticalOffset() < 0 || info.getWidth() < 0 || info.getHeight() < 0) {
170 |       throw new IllegalArgumentException(String.format(
171 |               "One or more coordinates are missing from payload (was %s), make sure you have 'x', 'y', 'w' and 'h' set!",
172 |               payload));
173 |     }
174 |     if (pageBits > 0 && info.getPageIndex() < 0) {
175 |       throw new IllegalArgumentException(String.format(
176 |               "Page index is missing from payload (was: '%s'), fix payload or set the 'pageBits' option to 0.", payload));
177 |     }
178 |     if (lineBits > 0 && info.getLineIndex() < 0) {
179 |       throw new IllegalArgumentException(String.format(
180 |               "Line index is missing from payload (was: '%s'), fix payload or set the 'lineBits' option to 0.", payload));
181 |     }
182 |     if (wordBits > 0 && info.getWordIndex() < 0) {
183 |       throw new IllegalArgumentException(String.format(
184 |               "Word index is missing from payload (was: '%s'), fix payload or set the 'wordBits' option to 0.", payload));
185 |     }
186 |     return info;
187 |   }
188 | 
189 |   private static int parseIntValue(String value, int numBits, String type, String payload) {
190 |     int parsed = Integer.parseInt(value);
191 |     if (parsed < IntMath.pow(2, numBits)) {
192 |       return parsed;
193 |     }
194 |     throw new IllegalArgumentException(String.format("Value %d for %s needs more than %d bits (valid values range from 0 to %d). Payload=%s",
195 |             parsed, type, numBits, IntMath.pow(2, numBits) - 1, payload));
196 |   }
197 | 
198 |   public float getHorizontalOffset() {
199 |     return horizontalOffset;
200 |   }
201 | 
202 |   public void setHorizontalOffset(float horizontalOffset) {
203 |     this.horizontalOffset = horizontalOffset; // NOTE(review): no checkCoordinate() call, unlike the other setters
204 |   }
205 | 
206 |   private void checkCoordinate(float coordinate) {
207 |     if (coordinate > 1) { // values are fractions: 1 is reported as 100 below; there is no lower bound check
208 |       throw new IllegalArgumentException(String.format("Coordinates can at most be 100, was %1f!", coordinate * 100));
209 |     }
210 |   }
211 | 
212 |   public float getVerticalOffset() {
213 |     return verticalOffset;
214 |   }
215 | 
216 |   public void setVerticalOffset(float verticalOffset) {
217 |     if (!hasAbsoluteCoordinates) {
218 |       checkCoordinate(verticalOffset);
219 |     }
220 |     this.verticalOffset = verticalOffset;
221 |   }
222 | 
223 |   public float getWidth() {  // width of the OCR box, as stored by setWidth
224 |     return width;
225 |   }
226 | 
227 |   public void setWidth(float width) {
228 |     if (!hasAbsoluteCoordinates) {  // the range check uses the flag's value at the time of this call
229 |       checkCoordinate(width);  // throws IllegalArgumentException for values above 1
230 |     }
231 |     this.width = width;
232 |   }
233 | 
234 |   public float getHeight() {  // height of the OCR box, as stored by setHeight
235 |     return height;
236 |   }
237 | 
238 |   public void setHeight(float height) {
239 |     if (!hasAbsoluteCoordinates) {  // the range check uses the flag's value at the time of this call
240 |       checkCoordinate(height);  // throws IllegalArgumentException for values above 1
241 |     }
242 |     this.height = height;
243 |   }
244 | 
245 |   public int getPageIndex() {  // page index; compared first in compareTo
246 |     return pageIndex;
247 |   }
248 | 
249 |   public void setPageIndex(int pageIndex) {  // stored without validation
250 |     this.pageIndex = pageIndex;
251 |   }
252 | 
253 |   public String getTerm() {  // term text attached to this OCR box (null unless setTerm was called with a value)
254 |     return term;
255 |   }
256 | 
257 |   public void setTerm(String term) {  // stored as given; shown by toString, not part of compareTo
258 |     this.term = term;
259 |   }
260 | 
261 |   public int getLineIndex() {  // line index; the payload parsing above treats a negative value as "missing"
262 |     return lineIndex;
263 |   }
264 | 
265 |   public void setLineIndex(int lineIndex) {  // stored without validation
266 |     this.lineIndex = lineIndex;
267 |   }
268 | 
269 |   public int getWordIndex() {  // word index; the payload parsing above treats a negative value as "missing"
270 |     return wordIndex;
271 |   }
272 | 
273 |   public void setWordIndex(int wordIndex) {  // stored without validation
274 |     this.wordIndex = wordIndex;
275 |   }
276 | 
277 |   @Override
278 |   public String toString() {
279 |     StringBuilder text = new StringBuilder("OcrInfo{");  // same field order and separators as before
280 |     text.append("horizontalOffset=").append(horizontalOffset);
281 |     text.append(", verticalOffset=").append(verticalOffset);
282 |     text.append(", width=").append(width);
283 |     text.append(", height=").append(height);
284 |     text.append(", pageIndex=").append(pageIndex);
285 |     text.append(", lineIndex=").append(lineIndex);
286 |     text.append(", wordIndex=").append(wordIndex);
287 |     text.append(", term='").append(term).append('\'');
288 |     return text.append('}').toString();
289 |   }
290 | 
291 |   @Override
292 |   public int compareTo(OcrInfo other) {
293 |     return Comparator
294 |             .comparing(OcrInfo::getPageIndex)
295 |             .thenComparing(OcrInfo::getLineIndex)
296 |             .thenComparing(OcrInfo::getWordIndex)
297 |             .thenComparing(OcrInfo::getHorizontalOffset)
298 |             .thenComparing(OcrInfo::getVerticalOffset)
299 |             .compare(this, other);
300 |   }
301 | 
302 |   public boolean getHasAbsoluteCoordinates() {  // true: the setters skip the range check in checkCoordinate
303 |     return hasAbsoluteCoordinates;
304 |   }
305 | 
306 |   public void setHasAbsoluteCoordinates(boolean hasAbsoluteCoordinates) {  // only affects later setter calls; stored values are not re-checked
307 |     this.hasAbsoluteCoordinates = hasAbsoluteCoordinates;
308 |   }
309 | }
310 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # :construction: Deprecated in favor of [solr-ocrhighlighting](https://github.com/dbmdz/solr-ocrhighlighting)
  2 | 
  3 | # Solr OCR Coordinate Payload Plugin
  4 | 
  5 | [![Javadocs](https://javadoc.io/badge/de.digitalcollections.search/solr-ocrpayload-plugin.svg)](https://javadoc.io/doc/de.digitalcollections.search/solr-ocrpayload-plugin)
  6 | [![Build Status](https://img.shields.io/travis/dbmdz/solr-ocrpayload-plugin/master.svg)](https://travis-ci.org/dbmdz/solr-ocrpayload-plugin)
  7 | [![Codecov](https://img.shields.io/codecov/c/github/dbmdz/solr-ocrpayload-plugin/master.svg)](https://codecov.io/gh/dbmdz/solr-ocrpayload-plugin)
  8 | [![MIT License](https://img.shields.io/github/license/dbmdz/solr-ocrpayload-plugin.svg)](LICENSE)
  9 | [![GitHub release](https://img.shields.io/github/release/dbmdz/solr-ocrpayload-plugin.svg)](https://github.com/dbmdz/solr-ocrpayload-plugin/releases)
 10 | [![Maven Central](https://img.shields.io/maven-central/v/de.digitalcollections.search/solr-ocrpayload-plugin.svg)](https://search.maven.org/search?q=a:solr-ocrpayload-plugin)
 11 | 
 12 | *Efficient indexing and bounding-box "highlighting" for OCR text*
 13 | 
 14 | ## tl;dr
 15 | 
 16 | - Store OCR bounding box information and token position directly in the Solr index in a space-efficient manner
 17 | - Retrieve bounding box and token position directly in your Solr query results, no additional parsing necessary
 18 | 
 19 | **Indexing**:
 20 | 
 21 | The OCR information is appended after each token as a concatenated list of `<key>:<val>` pairs, see further down
 22 | for a detailed description of available keys.
 23 | 
 24 | `POST /solr/mycore/update`
 25 | 
 26 | ```json
 27 | [{ "id": "test_document",
 28 |    "ocr_text": "this|p:13,l:5,n:6,x:11.1,y:22.2,w:33.3,h:44.4 is|p:13,l:5,n:7,x:22.2,y:33.3,w:44.4,h:55.5 a|p:13,l:5,n:8,x:33.3,y:33.3,w:44.4,h:55.5 test|p:13,l:5,n:9,x:44.4,y:33.3,w:44.4,h:55.5" }]
 29 | ```
 30 | 
 31 | **Querying**:
 32 | 
 33 | The plugin adds a new top-level key (`ocr_highlighting`) to the response that contains the OCR information for
 34 | each matching token as a structured object.
 35 | 
 36 | `GET /solr/mycore/select?ocr_hl=true&ocr_hl.fields=ocr_text&indent=true&wt=json&q=test`
 37 | 
 38 | ```json
 39 | {
 40 |   "responseHeader": "...",
 41 |   "response": {
 42 |     "numFound": 1,
 43 |     "docs": [{"id": "test_document"}]
 44 |   },
 45 |   "ocr_highlighting":{
 46 |     "test_document":{
 47 |       "ocr_text":[{
 48 |           "term":"test",
 49 |           "page":13,
 50 |           "line": 5,
 51 |           "word": 9,
 52 |           "x":0.444,
 53 |           "y":0.333,
 54 |           "width":0.444,
 55 |           "height":0.555}]
 56 |     }
 57 |   }
 58 | }
 59 | ```
 60 | 
 61 | ## Use Case
 62 | At the Bavarian State Library, we try to provide full-text search over all of our OCRed content. In addition
 63 | to obtaining matching documents, the user should also get a small snippet of the corresponding part of the
 64 | page image, with the matching words highlighted, similar to what e.g. Google Books provides.
 65 | 
 66 | 
 67 | ## Approaches
 68 | For this to work, we need some way of mapping matching tokens to their corresponding location in the underlying
 69 | OCR text. A common approach used by a number of libraries is to **use a secondary microservice for this** that takes
 70 | as input a document identifier and a text snippet and will return all coordinates of matching text snippets on
 71 | the page. While this approach generally works okay, it has several drawbacks:
 72 | 
 73 | - **Performance:** Every snippet requires a query to the OCR service, which itself has to do a linear scan
 74 |   through the OCR document. For e.g. a result set of 100 snippets, this will result in 101 queries (initial
 75 |   Solr query and 100 snippet queries). Of course this can be optimized by batching and having a good index
 76 |   structure for the coordinate lookup, but it's still less than ideal.
 77 | - **Storage:** To reliably be able to map text matches to the base text, you have to store a copy of the
 78 |   full text in the index, alongside the regular index. This blows up the index size significantly.
 79 |   Foregoing storing the text and only using the normalized terms from the index for matching will
 80 |   break the mapping to OCR, since depending on the analyzer configuration, Lucene will perform stemming, etc.
 81 |   
 82 | Alternatively, you could also **store the coordinates directly as strings in the index**. This works by e.g.
 83 | indexing each token as `<token>|<coordinates>` and telling Lucene to ignore everything after the pipe during
 84 | analysis. As the full text of the document is stored, you will get back a series of these annotated tokens
 85 | as query results and can then parse the coordinates from your highlighting information. This solves the
 86 | *Performance* part of the above approach, but worsens the *Storage* problem: For every token, we now not only
 87 | have to store the token itself, but an expensive coordinate string as well.
 88 | 
 89 | ## Our Approach
 90 | 
 91 | This plugin uses a similar approach to the above, but solves the *Storage* problem by using an efficient binary
 92 | format to store the OCR coordinate information in the index: We use bit-packing to combine a number of OCR
 93 | coordinate parameters into a **byte payload**, which is not stored in the field itself, but as an associated
 94 | [Lucene Payload](https://lucidworks.com/2017/09/14/solr-payloads/):
 95 | 
 96 | - `x`, `y`, `w`, `h`: Coordinates of the bounding box on the page as either:
 97 |     - **absolute** unsigned integer offsets between 0 and `2^coordinateBits` (see below)
 98 |     - **relative** floating point percentages between 0 and 100 (e.g. `x:42.3` for a horizontal offset of 42.3%)
 99 | - `pageIndex`: Unsigned integer that stores the page index of a token (optional)
100 | - `lineIndex`: Unsigned integer that stores the line index of a token (optional)
101 | - `wordIndex`: Unsigned integer that stores the word index of a token (optional)
102 | 
103 | For each of these values, you can configure the number of bits the plugin should use to store them, or disable
104 | certain parameters entirely. This allows you to fine-tune the settings to your needs. In our case, for example, we
105 | use these values: `4 * 12 bits (coordinates) + 9 bits (word index) + 11 bits (line index) + 12 bits (page index)`,
106 | resulting in an 80 bit or 10 byte payload per token. A comparable string representation `p0l0n0x000y000w000h000`
107 | would have at least 22 bytes, so we save >50% for every token.
108 | 
109 | At query time, we then retrieve the payload for each matching token and put the decoded information into the
110 | `ocr_highlighting` result key that can be directly used without having to do any additional parsing.
111 | 
112 | ## Usage
113 | ### Installation
114 | 
115 | Download the [latest release from GitHub](https://github.com/dbmdz/solr-ocrpayload-plugin/releases) and put the JAR into your `$SOLR_HOME/$SOLR_CORE/lib/` directory.
116 | 
117 | ### Indexing configuration
118 | 
119 | To use it, first add the `DelimitedOcrInfoPayloadTokenFilterFactory`☕ filter to your analyzer chain (e.g. for a `text_ocr` field type):
120 | 
121 | ```xml
122 | <fieldtype name="text_ocr" class="solr.TextField" omitTermFreqAndPositions="false">
123 |   <analyzer>
124 |     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
125 |     <filter class="de.digitalcollections.lucene.analysis.util.DelimitedOcrInfoPayloadTokenFilterFactory"
126 |             delimiter="☞" absoluteCoordinates="false" coordinateBits="10" wordBits="0" lineBits="0" pageBits="12" />
127 |     <filter class="solr.StandardFilterFactory"/>
128 |     <filter class="solr.LowerCaseFilterFactory"/>
129 |     <filter class="solr.StopFilterFactory"/>
130 |     <filter class="solr.PorterStemFilterFactory"/>
131 |   </analyzer>
132 | </fieldtype>
133 | ```
134 | 
135 | The filter takes the following parameters:
136 | 
137 | - `delimiter`: Character used for delimiting the payload from the token in the input document (default: `|`)
138 | - `absoluteCoordinates`: `true` or `false` to configure whether the stored coordinates are absolute
139 | - `coordinateBits`:  Number of bits to use for encoding OCR coordinates in the index. (mandatory)<br/>
140 |    A value of `10` (default) is recommended, resulting in coordinates that are accurate to approximately two decimal places.
141 | - `wordBits`: Number of bits to use for encoding the word index.<br/>
142 |    Set to 0 (default) to disable storage of the word index.
143 | - `lineBits`: Number of bits to use for encoding the line index.<br/>
144 |    Set to 0 (default) to disable storage of the line index.
145 | - `pageBits`: Number of bits to use for encoding the page index.<br/>
146 |    Set to 0 (default) to disable storage of the page index.
147 | 
148 | The filter expects an input payload after the configured `delimiter` in the input stream, with the payload being a
149 | pseudo-JSON structure (e.g. `k1:1,k2:3`) with the following keys:
150 | 
151 | - `p`: Page index (if `pageBits` > 0)
152 | - `l`: Line index  (if `lineBits` > 0)
153 | - `n`: Word index (if `wordBits` > 0)
154 | - `x`, `y`, `w`, `h`: Coordinates of the OCR box as floating point percentages or integers (if `absoluteCoordinates`)
155 | 
156 | As an example, consider the token `foobar` with an OCR box of `(0.50712, 0.31432, 0.87148, 0.05089)`
157 | (i.e. with `absoluteCoordinates="false"`), the configured delimiter `☞` and storage of indices for the word (`30`),
158 | line (`12`) and page (`13`):
159 | `foobar☞p:13,l:12,n:30,x:50.7,y:31.4,w:87.1,h:5.1`.
160 | 
161 | Alternatively, with `absoluteCoordinates="true"`, an OCR box of `(512, 1024, 3192, 256)` and otherwise the same
162 | settings:
163 | `foobar☞p:13,l:12,n:30,x:512,y:1024,w:3192,h:256`.
164 | 
165 | Finally, you just have to configure your schema to use the field type defined above. Storing the content is **not**
166 | recommended, since it significantly increases the index size and is not used at all for querying and highlighting:
167 | 
168 | ```xml
169 | <field name="ocr_text" type="text_ocr" indexed="true" stored="false" />
170 | ```
171 | 
172 | ### Highlighting configuration
173 | 
174 | To enable highlighting using the OCR payloads, add the `OcrHighlighting` component to your Solr
175 | configuration and configure it with the same `absoluteCoordinates`, `coordinateBits`, `wordBits`, `lineBits` and `pageBits`
176 | values that were used for the filter in the analyzer chain:
177 | 
178 | ```xml
179 | <config>
180 |   <searchComponent name="ocr_highlight"
181 |                    class="de.digitalcollections.solr.plugin.components.ocrhighlighting.OcrHighlighting"
182 |                    absoluteCoordinates="false" coordinateBits="10" wordBits="0" lineBits="0" pageBits="12" />
183 |                    
184 |   <requestHandler name="standard" class="solr.StandardRequestHandler">
185 |     <arr name="last-components">
186 |       <str>ocr_highlight</str>
187 |     </arr>
188 |   </requestHandler>
189 | </config>
190 | ```
191 | 
192 | Now at query time, you can just set the `ocr_hl=true` parameter, specify the fields you want highlighted via
193 | `ocr_hl.fields=myfield,myotherfield` and retrieve highlighted matches with their OCR coordinates:
194 | 
195 | `GET /solr/mycore/select?ocr_hl=true&ocr_hl.fields=ocr_text&indent=true&q=augsburg&wt=json`
196 | 
197 | ```json
198 | {
199 |   "responseHeader":{
200 |     "status":0,
201 |     "QTime":158},
202 |   "response":{"numFound":526,"start":0,"docs":[
203 |       {
204 |         "id":"bsb10502835"},
205 |       {
206 |         "id":"bsb11032147"},
207 |       {
208 |         "id":"bsb10485243"},
209 |       ...
210 |   ]},
211 |   "ocr_highlighting":{
212 |     "bsb10502835":{
213 |       "ocr_text":[{
214 |           "page":7,
215 |           "position":9,
216 |           "term":"augsburg",
217 |           "x":0.111,
218 |           "y":0.062,
219 |           "width":0.075,
220 |           "height":0.013},
221 |         {
222 |           "page":7,
223 |           "position":264,
224 |           "term":"augsburg",
225 |           "x":0.320,
226 |           "y":0.670,
227 |           "width":0.099,
228 |           "height":0.012},
229 |         ...]}},
230 |        ...
231 |     }
232 |   }
233 | }
234 | ```
235 | 
236 | 
237 | ## FAQ
238 | 
239 | - **How does highlighting work with phrase queries?**
240 |   
241 |   You will receive a bounding box object for every individual matching term in the phrase.
242 | 
243 | - **What are the performance and storage implications of using this plugin?**
244 | 
245 |   *Performance*: With an Intel Xeon E5-1620@3.5GHz on a single core, we measured (with JMH):
246 |   
247 |   - Encoding the Payload: 1,484,443.200 Payloads/Second or ~14.2MiB/s with an 80bit payload
248 |   - Decoding the Payload: 1,593,036.372 Payloads/Second or ~15.2MiB/s with an 80bit payload
249 |   
250 |   *Storage*: This depends on your configuration. With our sample configuration of an 80 bit payload
251 |   (see above), the payload overhead is 10 bytes per token. That is, for a corpus size of 10 Million Tokens,
252 |   you will need approximately 95MiB to store the payloads.
253 |   The actual storage required might be lower, since Lucene compresses the payloads with LZ4.
254 |   
255 | - **Does this work with SolrCloud?**
256 | 
257 |   It does! We're running it with SolrCloud ourselves.
258 | 


--------------------------------------------------------------------------------
/src/main/java/de/digitalcollections/solr/plugin/components/ocrhighlighting/OcrHighlighting.java:
--------------------------------------------------------------------------------
  1 | package de.digitalcollections.solr.plugin.components.ocrhighlighting;
  2 | 
  3 | import de.digitalcollections.lucene.analysis.payloads.OcrInfo;
  4 | import de.digitalcollections.lucene.analysis.payloads.OcrPayloadHelper;
  5 | import java.io.IOException;
  6 | import java.util.ArrayList;
  7 | import java.util.Arrays;
  8 | import java.util.Collections;
  9 | import java.util.HashMap;
 10 | import java.util.List;
 11 | import java.util.Map;
 12 | import java.util.Set;
 13 | import java.util.TreeSet;
 14 | import org.apache.lucene.document.Document;
 15 | import org.apache.lucene.index.IndexReader;
 16 | import org.apache.lucene.index.LeafReader;
 17 | import org.apache.lucene.index.LeafReaderContext;
 18 | import org.apache.lucene.index.MultiReader;
 19 | import org.apache.lucene.index.PostingsEnum;
 20 | import org.apache.lucene.index.ReaderUtil;
 21 | import org.apache.lucene.index.Term;
 22 | import org.apache.lucene.index.Terms;
 23 | import org.apache.lucene.index.TermsEnum;
 24 | import org.apache.lucene.search.IndexSearcher;
 25 | import org.apache.lucene.search.Query;
 26 | import org.apache.lucene.util.BytesRef;
 27 | import org.apache.solr.common.params.SolrParams;
 28 | import org.apache.solr.common.util.NamedList;
 29 | import org.apache.solr.common.util.SimpleOrderedMap;
 30 | import org.apache.solr.core.PluginInfo;
 31 | import org.apache.solr.handler.component.ResponseBuilder;
 32 | import org.apache.solr.handler.component.SearchComponent;
 33 | import org.apache.solr.handler.component.ShardRequest;
 34 | import org.apache.solr.request.SolrQueryRequest;
 35 | import org.apache.solr.schema.IndexSchema;
 36 | import org.apache.solr.schema.SchemaField;
 37 | import org.apache.solr.search.DocIterator;
 38 | import org.apache.solr.search.DocList;
 39 | import org.apache.solr.search.SolrIndexSearcher;
 40 | import org.apache.solr.util.SolrPluginUtils;
 41 | import org.apache.solr.util.plugin.PluginInfoInitialized;
 42 | 
 43 | public class OcrHighlighting extends SearchComponent implements PluginInfoInitialized {
 44 | 
 45 |   private static final IndexSearcher EMPTY_INDEXSEARCHER;  // searcher over a reader without sub-readers; getTerms() passes it to Query#createWeight
 46 | 
 47 |   static {
 48 |     try {
 49 |       IndexReader emptyReader = new MultiReader();  // no sub-readers are passed in
 50 |       EMPTY_INDEXSEARCHER = new IndexSearcher(emptyReader);
 51 |       EMPTY_INDEXSEARCHER.setQueryCache(null);  // this searcher gets no query cache
 52 |     } catch (IOException bogus) {
 53 |       throw new RuntimeException(bogus);  // rethrown unchecked, a static initializer cannot throw IOException
 54 |     }
 55 |   }
 56 | 
 57 |   private int coordBits;  // "coordinateBits" attribute, see init(); passed to OcrPayloadHelper.decodeOcrInfo
 58 |   private int wordBits;  // "wordBits" attribute, see init()
 59 |   private int lineBits;  // "lineBits" attribute, see init()
 60 |   private int pageBits;  // "pageBits" attribute, see init()
 61 |   private boolean absoluteCoordinates;  // "absoluteCoordinates" attribute; selects int vs. float output in encodeOcrInfo
 62 | 
 63 |   @Override
 64 |   public void prepare(ResponseBuilder rb) {
 65 |     // NOP: nothing is prepared here, the highlighting is added to the response in process()
 66 |   }
 67 | 
 68 |   @Override
 69 |   public void process(ResponseBuilder rb) throws IOException {
 70 |     if (!rb.req.getParams().getBool("ocr_hl", false)) {
 71 |       return;  // highlighting is opt-in via the ocr_hl request parameter
 72 |     }
 73 |     rb.rsp.add("ocr_highlighting", doHighlighting(rb.getResults().docList, rb.getQuery(), rb.req));
 74 |   }
 75 | 
 76 |   // Adapted from solr's own HighlightComponent
 77 |   @Override
 78 |   public void modifyRequest(ResponseBuilder rb, SearchComponent who, ShardRequest sreq) {
 79 |     if (!(rb.req.getParams().getBool("ocr_hl", false))) {
 80 |       return;
 81 |     }
 82 | 
 83 |     // Turn on highlighting only only when retrieving fields
 84 |     if ((sreq.purpose & ShardRequest.PURPOSE_GET_FIELDS) != 0) {
 85 |       sreq.purpose |= ShardRequest.PURPOSE_GET_HIGHLIGHTS;
 86 |       // should already be true...
 87 |       sreq.params.set("ocr_hl", "true");     // TODO: Maybe set hl_params?
 88 |     } else {
 89 |       sreq.params.set("ocr_hl", "false");
 90 |     }
 91 |   }
 92 | 
 93 |   // Adapted from solr's own HighlightComponent: merges the "ocr_highlighting" sections of the shard responses
 94 |   @SuppressWarnings("unchecked")
 95 |   @Override
 96 |   public void finishStage(ResponseBuilder rb) {
 97 |     if (!rb.req.getParams().getBool("ocr_hl", false) || rb.stage != ResponseBuilder.STAGE_GET_FIELDS) {
 98 |       return;  // only act when highlighting is enabled and the field retrieval stage has finished
 99 |     }
100 | 
101 |     NamedList.NamedListEntry[] arr = new NamedList.NamedListEntry[rb.resultIds.size()];  // one slot per result id
102 |     rb.finished.stream()
103 |             .filter(sreq -> (sreq.purpose & ShardRequest.PURPOSE_GET_HIGHLIGHTS) != 0)  // requests flagged in modifyRequest
104 |             .flatMap(sreq -> sreq.responses.stream())
105 |             // can't expect the highlight content if there was an exception for this request
106 |             // this should only happen when using shards.tolerant=true
107 |             .filter(resp -> resp.getException() == null)
108 |             .map(resp -> (NamedList) resp.getSolrResponse().getResponse().get("ocr_highlighting"))  // same key as used in process()
109 |             .forEach(hl -> SolrPluginUtils.copyNamedListIntoArrayByDocPosInResponse(hl, rb.resultIds, arr));
110 | 
111 |     // remove nulls in case not all docs were able to be retrieved
112 |     rb.rsp.add("ocr_highlighting", SolrPluginUtils.removeNulls(arr, new SimpleOrderedMap<>()));
113 |   }
114 | 
115 |   @Override
116 |   public String getDescription() {
117 |     return "Highlights query matches with OCR coordinates decoded from token payloads";  // a readable description instead of null
118 |   }
119 | 
120 |   @Override
121 |   public void init(PluginInfo info) {  // reads the payload layout from the plugin's attributes, with fallbacks for absent ones
122 |     this.coordBits = Integer.parseInt(info.attributes.getOrDefault("coordinateBits", "12"));  // NOTE(review): README names 10 as the default for the filter, fallback here is 12 - confirm
123 |     this.pageBits = Integer.parseInt(info.attributes.getOrDefault("pageBits", "0"));
124 |     this.lineBits = Integer.parseInt(info.attributes.getOrDefault("lineBits", "0"));
125 |     this.wordBits = Integer.parseInt(info.attributes.getOrDefault("wordBits", "0"));
126 |     this.absoluteCoordinates = Boolean.parseBoolean(info.attributes.getOrDefault("absoluteCoordinates", "false"));  // anything but "true" (ignoring case) yields false
127 |   }
128 | 
129 |   private Set<BytesRef> getTerms(Query query, String fieldName) throws IOException {  // collects the bytes of the query's terms that belong to fieldName
130 |     Set<BytesRef> terms = new TreeSet<>();
131 |     Set<Term> extractPosInsensitiveTermsTarget = new TreeSet<Term>() {  // adapter handed to extractTerms; it stays empty itself and forwards into 'terms'
132 |       @Override
133 |       public boolean add(Term term) {
134 |         if (term.field().equals(fieldName)) {
135 |           return terms.add(term.bytes());  // kept as BytesRef, not decoded to a String
136 |         }
137 |         return false;  // terms of other fields are dropped
138 |       }
139 |     };
140 |     query.createWeight(EMPTY_INDEXSEARCHER, false, 1.0f)
141 |             .extractTerms(extractPosInsensitiveTermsTarget);
142 |     return terms;
143 |   }
144 | 
145 |   /**
146 |    * Generates a list of highlighted query term coordinates for each item in a list of documents.
147 |    * Fields come from {@code ocr_hl.fields} (repeated and/or comma-separated); without any field the per-document entries are empty.
148 |    * @param docs          query results
149 |    * @param query         the query
150 |    * @param req           the current request
151 |    * @return              NamedList containing a {@link NamedList} for each document,
152 |    *                      which in turn contains `({@link String} field, {@link OcrInfo} coordinates)` pairs.
153 |    */
154 |   private NamedList<Object> doHighlighting(DocList docs, Query query, SolrQueryRequest req) throws IOException {
155 |     SolrParams params = req.getParams();
156 |     int maxHighlightsPerDoc = params.getInt("ocr_hl.maxPerDoc", -1);
157 |     int maxHighlightsPerPage = params.getInt("ocr_hl.maxPerPage", -1);
158 |     IndexReader reader = req.getSearcher().getIndexReader();
159 |     int[] docIds = toDocIDs(docs);
160 |     String[] keys = getUniqueKeys(req.getSearcher(), docIds);
161 |     String[] requested = params.getParams("ocr_hl.fields");  // null when the parameter is absent
162 |     String[] fieldNames = Arrays.stream(requested == null ? new String[0] : requested)
163 |             .flatMap(value -> Arrays.stream(value.split(","))).map(String::trim)
164 |             .filter(name -> !name.isEmpty()).distinct().toArray(String[]::new);
165 |     // For each document, obtain a mapping from field names to their matching OCR boxes
166 |     List<Map<String, OcrInfo[]>> boxes = new ArrayList<>();
167 |     for (int docId : docIds) {
168 |       Map<String, OcrInfo[]> docBoxes = new HashMap<>();
169 |       for (String fieldName : fieldNames) {
170 |         // Terms are kept in their UTF-8 encoded form to avoid costly decoding when checking for term equality
171 |         Set<BytesRef> termSet = getTerms(query, fieldName);
172 |         OcrInfo[] ocrInfos = getOcrInfos(reader, docId, fieldName, termSet, maxHighlightsPerDoc, maxHighlightsPerPage);
173 |         docBoxes.put(fieldName, ocrInfos);
174 |       }
175 |       boxes.add(docBoxes);
176 |     }
177 |     return encodeSnippets(keys, fieldNames, boxes);
178 |   }
179 | 
180 |   /**
181 |    * Retrieve unique keys for matching documents.
182 |    */
183 |   private String[] getUniqueKeys(SolrIndexSearcher searcher, int[] docIds) throws IOException {
184 |     IndexSchema schema = searcher.getSchema();
185 |     SchemaField keyField = schema.getUniqueKeyField();
186 |     if (keyField != null) {
187 |       Set<String> selector = Collections.singleton(keyField.getName());
188 |       String[] uniqueKeys = new String[docIds.length];
189 |       for (int i = 0; i < docIds.length; i++) {
190 |         int docId = docIds[i];
191 |         Document doc = searcher.doc(docId, selector);
192 |         String id = schema.printableUniqueKey(doc);
193 |         uniqueKeys[i] = id;
194 |       }
195 |       return uniqueKeys;
196 |     } else {
197 |       return new String[docIds.length];
198 |     }
199 |   }
200 | 
201 |   /**
202 |    * Retrieve Document IDs from the list of matching documents.
203 |    */
204 |   private int[] toDocIDs(DocList docs) {
205 |     int[] ids = new int[docs.size()];
206 |     DocIterator iterator = docs.iterator();
207 |     for (int i = 0; i < ids.length; i++) {
208 |       if (!iterator.hasNext()) {
209 |         throw new AssertionError();
210 |       }
211 |       ids[i] = iterator.nextDoc();
212 |     }
213 |     if (iterator.hasNext()) {
214 |       throw new AssertionError();
215 |     }
216 |     return ids;
217 |   }
218 | 
219 |   /**
220 |    * Retrieve all {@link OcrInfo}s for matching terms from a given field in a document.
221 |    *
222 |    * This takes a lot of inspiration from the {@link org.apache.lucene.search.uhighlight.UnifiedHighlighter}, thanks
223 |    * to David Smiley (@dsmiley) for pointing out that term vectors are not necessary for this highlighter.
224 |    *
225 |    * @param reader A reader into the search index
226 |    * @param docId Identifier of the matching document (relative to {@code reader}, rebased onto its leaf internally)
227 |    * @param fieldName Field to obtain OCR information from
228 |    * @param termSet Set of matching terms
229 |    * @param maxHighlightsPerDoc Maximum number of OCR terms per document for this field (negative: unlimited)
230 |    * @param maxHighlightsPerPage Maximum number of OCR terms per page (negative: unlimited)
231 |    * @return OCR information for the matching terms in sorted order; empty if the field has no terms, positions or payloads
232 |    * @throws IOException Error during retrieval from index
233 |    */
234 |   private OcrInfo[] getOcrInfos(IndexReader reader, int docId, String fieldName, Set<BytesRef> termSet,
235 |           int maxHighlightsPerDoc, int maxHighlightsPerPage) throws IOException {
236 |     List<OcrInfo> ocrList = new ArrayList<>();
237 | 
238 |     final LeafReader leafReader;
239 |     if (reader instanceof LeafReader) {
240 |       leafReader = (LeafReader) reader;
241 |     } else {
242 |       List<LeafReaderContext> leaves = reader.leaves();
243 |       LeafReaderContext leafReaderContext = leaves.get(ReaderUtil.subIndex(docId, leaves));  // leaf that contains docId
244 |       leafReader = leafReaderContext.reader();
245 |       docId -= leafReaderContext.docBase; // adjust 'doc' to be within this leaf reader
246 |     }
247 | 
248 |     final Terms terms = leafReader.terms(fieldName);
249 |     if (terms == null || !terms.hasPositions() || !terms.hasPayloads()) {
250 |       return new OcrInfo[]{};  // nothing to decode without positions and payloads
251 |     }
252 | 
253 |     final TermsEnum termsEnum = terms.iterator();
254 |     int currentPage = -1;  // NOTE(review): page and counter are not reset between terms, so the per-page limit is tracked per run of same-page postings - confirm intended
255 |     int matchesOnCurrentPage = 0;
256 | 
257 |     for (BytesRef term : termSet) {
258 |       if (!termsEnum.seekExact(term)) {
259 |         continue;  // term does not occur in this leaf
260 |       }
261 |       PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.POSITIONS | PostingsEnum.PAYLOADS);
262 |       if (postingsEnum == null) {
263 |         // no positions or payloads available (the message below speaks of offsets)
264 |         throw new IllegalArgumentException("field '" + fieldName + "' was indexed without offsets, cannot highlight");
265 |       }
266 |       if (docId != postingsEnum.advance(docId)) {
267 |         continue;  // term does not occur in this document
268 |       }
269 | 
270 |       final int freq = postingsEnum.freq();
271 |       for (int i = 0; i < freq && (maxHighlightsPerDoc < 0 || ocrList.size() < maxHighlightsPerDoc); i++) {
272 |         postingsEnum.nextPosition();  // must be advanced before the payload of the position can be read
273 |         BytesRef payload = postingsEnum.getPayload();
274 |         OcrInfo info = OcrPayloadHelper.decodeOcrInfo(payload, coordBits, wordBits, lineBits, pageBits, absoluteCoordinates);
275 |         if (info.getPageIndex() != currentPage) {  // Are we on a new page?
276 |           matchesOnCurrentPage = 0;
277 |           currentPage = info.getPageIndex();
278 |         }
279 |         if (maxHighlightsPerPage < 0 || matchesOnCurrentPage < maxHighlightsPerPage) {  // Limit matches per page?
280 |           info.setTerm(term.utf8ToString());
281 |           ocrList.add(info);
282 |           matchesOnCurrentPage++;
283 |         }
284 |       }
285 |     }
286 |     return ocrList.stream().sorted().toArray(OcrInfo[]::new);  // natural order as defined by OcrInfo#compareTo
287 |   }
288 | 
289 |   private NamedList<Object> encodeOcrInfo(OcrInfo info) {
290 |     NamedList<Object> encoded = new SimpleOrderedMap<>();
291 |     if (info.getPageIndex() >= 0) {
292 |       encoded.add("page", info.getPageIndex());
293 |     }
294 |     if (info.getLineIndex() >= 0) {
295 |       encoded.add("line", info.getLineIndex());
296 |     }
297 |     if (info.getWordIndex() >= 0) {
298 |       encoded.add("word", info.getWordIndex());
299 |     }
300 |     encoded.add("term", info.getTerm());
301 | 
302 |     if (absoluteCoordinates) {
303 |       encoded.add("x", (int) info.getHorizontalOffset());
304 |       encoded.add("y", (int) info.getVerticalOffset());
305 |       encoded.add("width", (int) info.getWidth());
306 |       encoded.add("height", (int) info.getHeight());
307 |     } else {
308 |       encoded.add("x", info.getHorizontalOffset());
309 |       encoded.add("y", info.getVerticalOffset());
310 |       encoded.add("width", info.getWidth());
311 |       encoded.add("height", info.getHeight());
312 |     }
313 |     return encoded;
314 |   }
315 | 
316 |   /**
317 |    * Encode the highlighting result into a format that can be used by upstream users.
318 |    */
319 |   private NamedList<Object> encodeSnippets(String[] keys, String[] fieldNames, List<Map<String, OcrInfo[]>> ocrInfos) {
320 |     NamedList<Object> list = new SimpleOrderedMap<>();
321 |     for (int i = 0; i < keys.length; i++) {
322 |       NamedList<Object> summary = new SimpleOrderedMap<>();
323 |       Map<String, OcrInfo[]> docBoxes = ocrInfos.get(i);
324 |       for (String field : fieldNames) {
325 |         summary.add(field,
326 |                 Arrays.stream(docBoxes.get(field)).sorted().map(this::encodeOcrInfo).toArray());
327 |       }
328 |       list.add(keys[i], summary);
329 |     }
330 |     return list;
331 |   }
332 | }
333 | 


--------------------------------------------------------------------------------
/src/test/resources/data/ocrtext_full.txt:
--------------------------------------------------------------------------------
1 | Ein|p:183,l:0,n:0,x:22.31,y:08.77,w:28.24,h:11.5 4 und|p:183,l:0,n:1,x:29.52,y:08.77,w:35.23,h:11.54 zwanzigfies|p:183,l:0,n:2,x:36.36,y:08.77,w:53.94,h:11.54 Gutachten,
2 | |p:183,l:0,n:3,x:55.14,y:08.77,w:72.42,h:11.54 179|p:183,l:0,n:4,x:77.00,y:08.77,w:83.24,h:11.54 geworfen|p:183,l:1,n:0,x:12.39,y:12.91,w:23.81,h:15.42 haben
3 | .|p:183,l:1,n:1,x:26.29,y:12.91,w:34.86,h:15.42 Sein|p:183,l:1,n:2,x:38.16,y:12.91,w:44.62,h:15.42 Medicus|p:183,l:1,n:3,x:46.80,y:12.91,w:58.45,h:15.42 ordinarius,|p:183,l:1,n:4,x:61.23,y:12.91,w:76.93,h:15.42 der|p:183,l:1,n:5,x:79.26,y:12.91,w:83.24,h:15.42 hiefige|p:183,l:2,n:0,x:12.39,y:15.91,w:20.28,h:18.20 Doftor|p:183,l:2,n:1,x:22.91,y:15.91,w:31.70,h:18.20 Dolf,|p:183,l:2,n:2,x:34.25,y:15.91,w:42.82,h:18.20 bezeuget|p:183,l:2,n:3,x:45.67,y:15.91,w:55.89,h:18.20 in|p:183,l:2,n:4,x:58.30,y:15.91,w:60.55,h:18.20 einem|p:183,l:2,n:5,x:62.96,y:15.91,w:70.02,h:18.20 mir|p:183,l:2,n:6,x:72.42,y:15.91,w:76.85,h:18.20 auf|p:183,l:2,n:7,x:78.96,y:15.91,w:83.39,h:18.20 mein|p:183,l:3,n:0,x:12.32,y:18.59,w:18.48,h:20.80 Berlangen|p:183,l:3,n:1,x:20.21,y:18.59,w:33.28,h:20.80 überſchicften|p:183,l:3,n:2,x:34.86,y:18.59,w:50.18,h:20.80 Privatattefi,|p:183,l:3,n:3,x:51.99,y:18.59,w:67.84,h:20.80 „daß|p:183,l:3,n:4,x:70.24,y:18.59,w:77.23,h:20.80 der|p:183,l:3,n:5,x:78.96,y:18.59,w:83.02,h:20.80 „Tobias|p:183,l:4,n:0,x:12.32,y:21.24,w:24.11,h:23.57 D.|p:183,l:4,n:1,x:25.92,y:21.24,w:29.07,h:23.57 mit|p:183,l:4,n:2,x:30.42,y:21.24,w:34.56,h:23.57 hämorrhoidaliſchen|p:183,l:4,n:3,x:36.13,y:21.24,w:59.72,h:23.57 Sufällen,|p:183,l:4,n:4,x:61.45,y:21.24,w:72.65,h:23.57 welche|p:183,l:4,n:5,x:75.05,y:21.24,w:83.02,h:23.57 „ihm|p:183,l:5,n:0,x:12.24,y:24.01,w:19.30,h:26.35 starte|p:183,l:5,n:1,x:20.43,y:24.01,w:26.97,h:26.35 Eongestiones|p:183,l:5,n:2,x:27.72,y:24.01,w:43.57,h:26.35 nach|p:183,l:5,n:3,x:44.47,y:24.01,w:49.88,h:26.35 der|p:183,l:5,n:4,x:51.01,y:24.01,w:54.84,h:26.35 Bruf|p:183,l:5,n:5,x:55.59,y:24.01,w:62.58,h:26.35 c.|p:183,l:5,n:6,x:64.23,y:24.01,w:66.49,h:26.35 verurfachen,|p:183,l:5,n:7,x:67.61,y:24.01,w:82.87,h:26.35 „behaftet|p:183,l:6,n:0,x:12.39,y:26.70,w:24.56,h:28.82 fey.|p:183,l:6,n:1,x:26.07,y:26.70,w:30.57,h:28.82 “|p:183,l:6,n:2,x:31.55,y:26.70,w:33.20,h:28.82 –|p:183,l:6,n:3,x:34.63,y:26.70,w:38.39,h:28.82 
Seinen|p:183,l:6,n:4,x:40.72,y:26.70,w:49.66,h:28.82 Puls|p:183,l:6,n:5,x:51.38,y:26.70,w:57.25,h:28.82 fand|p:183,l:6,n:6,x:59.65,y:26.70,w:65.13,h:28.82 ich|p:183,l:6,n:7,x:67.54,y:26.70,w:70.77,h:28.82 fchwach,|p:183,l:6,n:8,x:72.42,y:27.01,w:82.79,h:29.13 flein|p:183,l:7,n:0,x:12.32,y:29.44,w:17.88,h:31.46 und|p:183,l:7,n:1,x:19.23,y:29.44,w:23.74,h:31.46 intermittirend,|p:183,l:7,n:2,x:25.61,y:29.44,w:43.50,h:31.46 ihn|p:183,l:7,n:3,x:45.45,y:29.44,w:49.13,h:31.46 felbſt|p:183,l:7,n:4,x:50.93,y:29.44,w:57.17,h:31.46 ſchwach|p:183,l:7,n:5,x:58.90,y:29.44,w:67.99,h:31.46 und|p:183,l:7,n:6,x:69.79,y:29.44,w:74.22,h:31.46 matt.|p:183,l:7,n:7,x:76.03,y:29.44,w:82.64,h:31.46 Angenemmen|p:183,l:8,n:0,x:18.18,y:34.90,w:34.93,h:37.02 nun|p:183,l:8,n:1,x:36.81,y:34.90,w:41.32,h:37.02 auch,|p:183,l:8,n:2,x:43.12,y:34.90,w:49.88,h:37.02 daß|p:183,l:8,n:3,x:52.06,y:34.90,w:56.64,h:37.02 dieſer|p:183,l:8,n:4,x:58.82,y:34.90,w:65.58,h:37.02 Menſch|p:183,l:8,n:5,x:67.39,y:34.90,w:76.48,h:37.02 aus|p:183,l:8,n:6,x:78.06,y:34.90,w:82.64,h:37.02 Furcht|p:183,l:9,n:0,x:12.02,y:37.54,w:20.66,h:39.79 vor|p:183,l:9,n:1,x:22.31,y:37.54,w:26.59,h:39.79 der|p:183,l:9,n:2,x:28.17,y:37.54,w:32.08,h:39.79 Strafe|p:183,l:9,n:3,x:33.88,y:37.54,w:42.52,h:39.79 te,|p:183,l:9,n:4,x:44.32,y:37.54,w:47.03,h:39.79 feine|p:183,l:9,n:5,x:49.58,y:37.54,w:55.22,h:39.79 Beſchwerden|p:183,l:9,n:6,x:57.02,y:37.54,w:72.50,h:39.79 größer|p:183,l:9,n:7,x:74.53,y:37.54,w:82.87,h:39.79 angegeben|p:183,l:10,n:0,x:12.09,y:40.14,w:24.86,h:42.52 hätte,|p:183,l:10,n:1,x:26.59,y:40.14,w:33.80,h:42.52 als|p:183,l:10,n:2,x:35.83,y:40.14,w:39.59,h:42.52 fie|p:183,l:10,n:3,x:41.62,y:40.14,w:44.47,h:42.52 in|p:183,l:10,n:4,x:46.65,y:40.14,w:49.13,h:42.52 der|p:183,l:10,n:5,x:50.93,y:40.14,w:54.84,h:42.52 That|p:183,l:10,n:6,x:57.09,y:40.14,w:62.65,h:42.52 find,|p:183,l:10,n:7,x:64.08,y:40.14,w:69.94,h:42.52 fo|p:183,l:10,n:8,x:72.20,y:40.14,w:74.53,h:42.52 gehet|p:183,l:10,n:9,x:76.10,y:40.14,w:82.71,h:42.52 
doch|p:183,l:11,n:0,x:11.87,y:42.79,w:17.65,h:45.26 ſo|p:183,l:11,n:1,x:19.53,y:42.79,w:21.71,h:45.26 viel|p:183,l:11,n:2,x:23.51,y:42.79,w:28.02,h:45.26 aus|p:183,l:11,n:3,x:29.75,y:42.79,w:34.18,h:45.26 dieſer|p:183,l:11,n:4,x:36.06,y:42.79,w:42.90,h:45.26 unterſuchung|p:183,l:11,n:5,x:44.62,y:42.79,w:60.70,h:45.26 und|p:183,l:11,n:6,x:62.65,y:42.79,w:67.09,h:45.26 dem|p:183,l:11,n:7,x:68.97,y:42.79,w:73.85,h:45.26 Beuge|p:183,l:11,n:8,x:75.65,y:42.79,w:82.71,h:45.26 niß|p:183,l:12,n:0,x:11.94,y:45.52,w:16.15,h:47.73 des|p:183,l:12,n:1,x:18.33,y:45.52,w:22.46,h:47.73 Deftor|p:183,l:12,n:2,x:24.79,y:45.52,w:33.50,h:47.73 Dolf|p:183,l:12,n:3,x:36.13,y:45.52,w:43.65,h:47.73 herfür,|p:183,l:12,n:4,x:45.37,y:45.52,w:54.24,h:47.73 daß|p:183,l:12,n:5,x:57.25,y:45.52,w:61.75,h:47.73 derfelbe|p:183,l:12,n:6,x:64.16,y:45.52,w:73.47,h:47.73 wårfe|p:183,l:12,n:7,x:75.73,y:45.52,w:82.79,h:47.73 lich|p:183,l:13,n:0,x:12.02,y:48.30,w:16.30,h:50.41 nur|p:183,l:13,n:1,x:18.18,y:48.30,w:22.38,h:50.41 ſchwächlich,|p:183,l:13,n:2,x:24.26,y:48.30,w:38.84,h:50.41 hånorrhoidaliſchen|p:183,l:13,n:3,x:40.57,y:48.30,w:64.16,h:50.41 Sufällen|p:183,l:13,n:4,x:65.96,y:48.30,w:75.88,h:50.41 und|p:183,l:13,n:5,x:77.91,y:48.30,w:82.41,h:50.41 daher|p:183,l:14,n:0,x:12.02,y:51.12,w:19.00,h:53.10 entſpringenden|p:183,l:14,n:1,x:20.81,y:51.12,w:38.69,h:53.10 heftigen|p:183,l:14,n:2,x:40.64,y:51.12,w:50.18,h:53.10 Gongestionen|p:183,l:14,n:3,x:51.99,y:51.07,w:67.91,h:53.63 des|p:183,l:14,n:4,x:69.72,y:51.16,w:73.85,h:53.01 Bluts|p:183,l:14,n:5,x:75.05,y:51.16,w:82.64,h:53.01 nach|p:183,l:15,n:0,x:11.87,y:53.81,w:17.50,h:55.97 der|p:183,l:15,n:1,x:19.30,y:53.81,w:23.06,h:55.97 Bruft|p:183,l:15,n:2,x:24.94,y:53.81,w:32.00,h:55.97 unterworfen|p:183,l:15,n:3,x:33.95,y:53.81,w:49.13,h:55.97 fey|p:183,l:15,n:4,x:50.93,y:53.81,w:54.54,h:55.97 und|p:183,l:15,n:5,x:56.72,y:53.81,w:61.30,h:55.97 vorzüglich|p:183,l:15,n:6,x:63.48,y:53.81,w:75.80,h:55.97 eine|p:183,l:15,n:7,x:77.68,y:53.81,w:82.64,h:55.97 
fchwache|p:183,l:16,n:0,x:11.94,y:56.45,w:22.68,h:58.66 Bruft|p:183,l:16,n:1,x:24.41,y:56.45,w:31.48,h:58.66 habe.|p:183,l:16,n:2,x:33.20,y:56.45,w:39.66,h:58.66 Da|p:183,l:17,n:0,x:17.58,y:61.83,w:21.78,h:63.94 nun|p:183,l:17,n:1,x:23.66,y:61.83,w:28.17,h:63.94 aber|p:183,l:17,n:2,x:29.75,y:61.83,w:35.08,h:63.94 das|p:183,l:17,n:3,x:37.26,y:61.83,w:41.62,h:63.94 Tragen|p:183,l:17,n:4,x:43.57,y:61.83,w:52.44,h:63.94 des|p:183,l:17,n:5,x:54.39,y:61.83,w:58.52,h:63.94 fogenannten|p:183,l:17,n:6,x:60.70,y:61.83,w:75.50,h:63.94 ſpå»|p:183,l:17,n:7,x:77.68,y:61.83,w:82.64,h:63.94 niſchen|p:183,l:18,n:0,x:11.94,y:64.69,w:20.43,h:66.90 Mantels,|p:183,l:18,n:1,x:22.23,y:64.69,w:34.33,h:66.90 welcher|p:183,l:18,n:2,x:36.13,y:64.69,w:45.00,h:66.90 nach|p:183,l:18,n:3,x:46.80,y:64.69,w:52.21,h:66.90 dem|p:183,l:18,n:4,x:53.86,y:64.69,w:58.90,h:66.90 zu|p:183,l:18,n:5,x:60.70,y:64.69,w:63.18,h:66.90 urtheilen,|p:183,l:18,n:6,x:64.83,y:64.69,w:76.78,h:66.90 der|p:183,l:18,n:7,x:78.58,y:64.69,w:82.56,h:66.90 auf|p:183,l:19,n:0,x:12.02,y:67.38,w:16.45,h:69.36 der|p:183,l:19,n:1,x:17.95,y:67.38,w:21.86,h:69.36 Hausvogtey|p:183,l:19,n:2,x:24.19,y:67.38,w:38.84,h:69.36 zu|p:183,l:19,n:3,x:40.72,y:67.38,w:43.12,h:69.36 dergleichen|p:183,l:19,n:4,x:44.70,y:67.38,w:58.07,h:69.36 Behuf|p:183,l:19,n:5,x:59.87,y:67.38,w:67.91,h:69.36 aufbewahe|p:183,l:19,n:6,x:69.42,y:67.38,w:82.49,h:69.36 ret|p:183,l:20,n:0,x:11.94,y:69.94,w:15.47,h:72.01 wird,|p:183,l:20,n:1,x:17.20,y:69.94,w:24.34,h:72.01 an|p:183,l:20,n:2,x:26.82,y:69.94,w:29.75,h:72.01 die|p:183,l:20,n:3,x:31.93,y:69.94,w:35.38,h:72.01 fiebenzig|p:183,l:20,n:4,x:36.88,y:69.94,w:47.18,h:72.01 Pfund|p:183,l:20,n:5,x:49.73,y:69.94,w:56.94,h:72.01 ſchwer|p:183,l:20,n:6,x:59.50,y:69.94,w:67.24,h:72.01 iſt,|p:183,l:20,n:7,x:69.04,y:69.94,w:73.02,h:72.01 einen|p:183,l:20,n:8,x:76.18,y:69.94,w:82.49,h:72.01 farfen|p:183,l:21,n:0,x:11.87,y:72.58,w:20.06,h:74.74 Rörper|p:183,l:21,n:1,x:22.16,y:72.58,w:30.80,h:74.74 
und|p:183,l:21,n:2,x:32.68,y:72.58,w:37.11,h:74.74 heftige,|p:183,l:21,n:3,x:38.99,y:72.58,w:47.93,h:74.74 Stunden|p:183,l:21,n:4,x:49.81,y:72.58,w:60.78,h:74.74 lang|p:183,l:21,n:5,x:62.28,y:72.58,w:67.61,h:74.74 anhaltende|p:183,l:21,n:6,x:69.42,y:72.58,w:82.56,h:74.74 Anfirengung|p:183,l:22,n:0,x:12.17,y:75.31,w:27.19,h:77.52 aller|p:183,l:22,n:1,x:28.92,y:75.31,w:34.25,h:77.52 sträfte,|p:183,l:22,n:2,x:36.06,y:75.31,w:45.22,h:77.52 vorzüglich|p:183,l:22,n:3,x:47.10,y:75.31,w:59.42,h:77.52 aber,|p:183,l:22,n:4,x:61.15,y:75.31,w:67.76,h:77.52 da|p:183,l:22,n:5,x:69.34,y:75.31,w:72.20,h:77.52 er|p:183,l:22,n:6,x:74.07,y:75.31,w:76.40,h:77.52 auf|p:183,l:22,n:7,x:78.13,y:75.31,w:82.64,h:77.52 den|p:183,l:23,n:0,x:11.87,y:77.91,w:16.15,h:79.99 Schultern|p:183,l:23,n:1,x:17.88,y:77.91,w:30.35,h:79.99 liegt,|p:183,l:23,n:2,x:32.30,y:77.91,w:38.91,h:79.99 der|p:183,l:23,n:3,x:40.79,y:77.91,w:44.62,h:79.99 Bruft|p:183,l:23,n:4,x:46.50,y:77.91,w:53.56,h:79.99 e|p:183,l:23,n:5,x:54.62,y:77.91,w:55.67,h:79.99 Hals|p:183,l:23,n:6,x:57.55,y:77.91,w:63.26,h:79.99 -|p:183,l:23,n:7,x:64.31,y:77.91,w:65.13,h:79.99 und|p:183,l:23,n:8,x:67.01,y:77.91,w:71.45,h:79.99 Rücfen,|p:183,l:23,n:9,x:72.65,y:77.91,w:82.34,h:79.99 muffeln|p:183,l:24,n:0,x:11.94,y:80.69,w:21.78,h:82.76 erfordert,|p:183,l:24,n:1,x:24.56,y:80.69,w:36.73,h:82.76 hierdurch|p:183,l:24,n:2,x:39.66,y:80.69,w:51.01,h:82.76 aber|p:183,l:24,n:3,x:53.86,y:80.69,w:59.12,h:82.76 nicht|p:183,l:24,n:4,x:61.75,y:80.69,w:67.54,h:82.76 nur|p:183,l:24,n:5,x:70.62,y:80.69,w:74.90,h:82.76 das|p:183,l:24,n:6,x:77.91,y:80.69,w:82.26,h:82.76 Blut|p:183,l:25,n:0,x:11.87,y:83.34,w:18.03,h:85.45 mit|p:183,l:25,n:1,x:20.58,y:83.34,w:24.79,h:85.45 fortdaurender|p:183,l:25,n:2,x:27.49,y:83.34,w:44.40,h:85.45 Ộeftigfeit|p:183,l:25,n:3,x:47.18,y:83.34,w:58.75,h:85.45 nach|p:183,l:25,n:4,x:61.38,y:83.34,w:66.71,h:85.45 Ropf|p:183,l:25,n:5,x:69.19,y:83.34,w:75.50,h:85.45 und|p:183,l:25,n:6,x:77.61,y:83.34,w:82.26,h:85.45 
Bruft|p:183,l:26,n:0,x:11.87,y:85.98,w:19.08,h:88.23 zu|p:183,l:26,n:1,x:20.58,y:85.98,w:23.06,h:88.23 getrieben|p:183,l:26,n:2,x:24.79,y:85.98,w:35.76,h:88.23 wird,|p:183,l:26,n:3,x:37.34,y:85.98,w:43.87,h:88.23 und|p:183,l:26,n:4,x:45.83,y:85.98,w:50.26,h:88.23 leștere|p:183,l:26,n:5,x:52.14,y:85.98,w:60.25,h:88.23 vorzůglich|p:183,l:26,n:6,x:61.98,y:85.98,w:74.30,h:88.23 lange|p:183,l:26,n:7,x:75.65,y:85.98,w:82.41,h:88.23 und|p:183,l:27,n:0,x:11.79,y:88.67,w:16.45,h:90.92 anhaltend|p:183,l:27,n:1,x:17.73,y:88.67,w:29.45,h:90.92 viel|p:183,l:27,n:2,x:30.57,y:88.67,w:35.08,h:90.92 dabey|p:183,l:27,n:3,x:36.13,y:88.67,w:43.20,h:90.92 leidet:|p:183,l:27,n:4,x:44.77,y:88.67,w:52.66,h:90.92 fo|p:183,l:27,n:5,x:54.39,y:88.67,w:56.64,h:90.92 fann|p:183,l:27,n:6,x:57.70,y:88.67,w:63.33,h:90.92 ich|p:183,l:27,n:7,x:64.46,y:88.67,w:67.69,h:90.92 meiner|p:183,l:27,n:8,x:69.04,y:88.67,w:77.38,h:90.92 ge|p:183,l:27,n:9,x:78.51,y:88.67,w:82.26,h:90.92 ಋRa|p:183,l:28,n:0,x:46.05,y:91.27,w:53.04,h:93.03 Jſº|p:183,l:28,n:1,x:77.00,y:91.49,w:82.26,h:93.74 18o|p:184,l:0,n:0,x:16.96,y:08.07,w:23.52,h:10.70 Ein|p:184,l:0,n:1,x:29.20,y:08.07,w:35.28,h:10.70 und|p:184,l:0,n:2,x:37.01,y:08.07,w:43.09,h:10.70 zwanzigſtes|p:184,l:0,n:3,x:44.19,y:08.07,w:62.58,h:10.70 Gutachten.|p:184,l:0,n:4,x:63.69,y:08.07,w:82.16,h:10.70 wifenhaften|p:184,l:1,n:0,x:16.73,y:12.63,w:32.59,h:14.73 lleberzeugung|p:184,l:1,n:1,x:34.64,y:12.63,w:52.24,h:14.73 nach|p:184,l:1,n:2,x:54.53,y:12.63,w:60.14,h:14.73 nicht|p:184,l:1,n:3,x:62.66,y:12.63,w:68.74,h:14.73 anderð|p:184,l:1,n:4,x:71.34,y:12.63,w:80.11,h:14.73 urtheis|p:184,l:1,n:5,x:82.79,y:12.63,w:91.79,h:14.73 len,|p:184,l:2,n:0,x:16.89,y:15.61,w:22.17,h:17.50 als|p:184,l:2,n:1,x:25.17,y:15.61,w:29.20,h:17.50 *|p:184,l:2,n:2,x:46.80,y:16.40,w:47.75,h:17.01 daß|p:184,l:3,n:0,x:24.62,y:19.25,w:29.59,h:21.40 dieſe|p:184,l:3,n:1,x:31.33,y:19.25,w:37.17,h:21.40 Strafe|p:184,l:3,n:2,x:38.91,y:19.25,w:48.06,h:21.40 
an|p:184,l:3,n:3,x:50.03,y:19.25,w:53.03,h:21.40 dem|p:184,l:3,n:4,x:54.53,y:19.25,w:59.74,h:21.40 Suculpaten|p:184,l:3,n:5,x:61.01,y:19.25,w:75.84,h:21.40 nicht|p:184,l:3,n:6,x:77.97,y:19.25,w:84.13,h:21.40 ohne|p:184,l:3,n:7,x:85.63,y:19.25,w:91.39,h:21.40 zu|p:184,l:4,n:0,x:24.70,y:22.01,w:27.62,h:24.25 befürchtenden|p:184,l:4,n:1,x:30.22,y:22.01,w:47.90,h:24.25 großen|p:184,l:4,n:2,x:50.59,y:22.01,w:59.51,h:24.25 Machtheil|p:184,l:4,n:3,x:62.19,y:22.01,w:74.50,h:24.25 feiner|p:184,l:4,n:4,x:77.03,y:22.01,w:84.37,h:24.25 See|p:184,l:4,n:5,x:86.97,y:22.01,w:91.39,h:24.25 fundheit|p:184,l:5,n:0,x:24.94,y:24.73,w:35.59,h:26.92 vollzogen|p:184,l:5,n:1,x:37.33,y:24.73,w:49.32,h:26.92 werden|p:184,l:5,n:2,x:50.82,y:24.73,w:60.14,h:26.92 fönne;|p:184,l:5,n:3,x:62.03,y:24.73,w:71.03,h:26.92 da|p:184,l:6,n:0,x:17.12,y:28.72,w:20.52,h:30.74 von|p:184,l:6,n:1,x:22.33,y:28.72,w:26.99,h:30.74 dem|p:184,l:6,n:2,x:28.96,y:28.72,w:34.17,h:30.74 anhaltenden|p:184,l:6,n:3,x:36.30,y:28.72,w:51.53,h:30.74 heftigen|p:184,l:6,n:4,x:53.59,y:28.72,w:63.69,h:30.74 Antrieb|p:184,l:6,n:5,x:66.29,y:28.72,w:75.84,h:30.74 des|p:184,l:6,n:6,x:77.90,y:28.72,w:82.24,h:30.74 Bluts|p:184,l:6,n:7,x:84.13,y:28.72,w:91.31,h:30.74 nach|p:184,l:7,n:0,x:17.20,y:31.49,w:23.12,h:33.68 der|p:184,l:7,n:1,x:25.49,y:31.49,w:29.67,h:33.68 Bruft|p:184,l:7,n:2,x:31.88,y:31.49,w:39.30,h:33.68 und|p:184,l:7,n:3,x:42.22,y:31.49,w:46.88,h:33.68 ſtarfen|p:184,l:7,n:4,x:49.88,y:31.49,w:58.56,h:33.68 Anfrengung|p:184,l:7,n:5,x:61.56,y:31.49,w:77.50,h:33.68 derfelben|p:184,l:7,n:6,x:80.42,y:31.49,w:91.39,h:33.68 fehr|p:184,l:8,n:0,x:17.36,y:34.21,w:22.41,h:36.44 leicht|p:184,l:8,n:1,x:24.07,y:34.21,w:30.86,h:36.44 eine|p:184,l:8,n:2,x:32.35,y:34.21,w:37.56,h:36.44 3erreißung|p:184,l:8,n:3,x:39.30,y:34.21,w:53.74,h:36.44 der|p:184,l:8,n:4,x:55.64,y:34.21,w:59.66,h:36.44 Gefäße|p:184,l:8,n:5,x:61.48,y:34.07,w:70.71,h:36.09 und|p:184,l:8,n:6,x:73.32,y:34.07,w:78.13,h:36.09 
unheilbae|p:184,l:8,n:7,x:80.03,y:34.07,w:91.39,h:36.09 res|p:184,l:9,n:0,x:17.60,y:36.88,w:21.62,h:39.03 Blutſpeyen|p:184,l:9,n:1,x:23.59,y:36.88,w:37.96,h:39.03 entſtehen|p:184,l:9,n:2,x:39.93,y:36.88,w:51.22,h:39.03 fann.|p:184,l:9,n:3,x:53.03,y:36.88,w:59.98,h:39.03 Diefes|p:184,l:10,n:0,x:23.91,y:41.44,w:32.12,h:43.64 iſt|p:184,l:10,n:1,x:34.49,y:41.44,w:37.25,h:43.64 mein|p:184,l:10,n:2,x:39.70,y:41.44,w:45.85,h:43.64 pflichtmäßiges|p:184,l:10,n:3,x:48.93,y:41.44,w:67.64,h:43.64 Gutachten|p:184,l:10,n:4,x:70.08,y:41.44,w:83.89,h:43.64 hier:|p:184,l:10,n:5,x:85.95,y:41.44,w:91.23,h:43.64 åber,|p:184,l:11,n:0,x:17.67,y:44.21,w:24.94,h:46.49 welches|p:184,l:11,n:1,x:27.15,y:44.21,w:37.01,h:46.49 ich|p:184,l:11,n:2,x:39.38,y:44.21,w:42.85,h:46.49 mit|p:184,l:11,n:3,x:45.22,y:44.21,w:49.72,h:46.49 meines|p:184,l:11,n:4,x:51.93,y:44.21,w:60.85,h:46.49 Rahmens|p:184,l:11,n:5,x:62.66,y:44.21,w:75.45,h:46.49 unterſchrift|p:184,l:11,n:6,x:77.66,y:44.21,w:91.63,h:46.49 und|p:184,l:12,n:0,x:17.83,y:47.01,w:22.80,h:49.12 vorgedrucften|p:184,l:12,n:1,x:24.70,y:47.01,w:43.25,h:49.12 Whyffatsfiegel|p:184,l:12,n:2,x:45.77,y:47.01,w:64.16,h:49.12 beurfunde,|p:184,l:12,n:3,x:66.29,y:47.01,w:80.34,h:49.12 Berlin,|p:184,l:13,n:0,x:23.67,y:51.66,w:33.70,h:53.85 den|p:184,l:13,n:1,x:35.67,y:51.66,w:40.09,h:53.85 Iften|p:184,l:13,n:2,x:42.54,y:51.66,w:48.77,h:53.85 März,|p:184,l:13,n:3,x:50.82,y:51.66,w:59.66,h:53.85 1789.|p:184,l:13,n:4,x:62.19,y:51.66,w:70.71,h:53.85 pyl.|p:184,l:14,n:0,x:77.66,y:57.36,w:85.00,h:59.73 »**|p:184,l:15,n:0,x:20.20,y:91.14,w:23.67,h:92.50 -|p:184,l:15,n:1,x:32.83,y:93.55,w:33.22,h:93.77 @rit;|p:184,l:15,n:2,x:82.95,y:90.57,w:91.23,h:92.93,


--------------------------------------------------------------------------------